1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/epoc32/include/collate.h Tue Mar 16 16:12:26 2010 +0000
1.3 @@ -0,0 +1,362 @@
1.4 +// Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies).
1.5 +// All rights reserved.
1.6 +// This component and the accompanying materials are made available
1.7 +// under the terms of the License "Symbian Foundation License v1.0" to Symbian Foundation members and "Symbian Foundation End User License Agreement v1.0" to non-members
1.8 +// which accompanies this distribution, and is available
1.9 +// at the URL "http://www.symbianfoundation.org/legal/licencesv10.html".
1.10 +//
1.11 +// Initial Contributors:
1.12 +// Nokia Corporation - initial contribution.
1.13 +//
1.14 +// Contributors:
1.15 +//
1.16 +// Description:
1.17 +// e32\include\collate.h
1.18 +// Definitions needed for Unicode collation.
1.19 +// Collation is the comparison of two Unicode strings to produce an ordering
1.20 +// that may be used in a dictionary or other list.
1.21 +// Collation is implemented using the Standard Unicode Collation algorithm. There
1.22 +// are four levels of comparison:
1.23 +// primary: basic character identity
1.24 +// secondary: accents and diacritics
1.25 +// tertiary: upper and lower case, and other minor attributes
1.26 +// quaternary: Unicode character value
1.27 +// Punctuation is normally ignored but can optionally be taken into account.
1.28 +// Strings are fully expanded using the standard Unicode canonical expansions before
1.29 +// they are compared. Thai and Lao vowels are swapped with the following character
1.30 +// if any.
1.31 +// EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values
1.32 +// to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus
1.33 +// the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after
1.34 +// all the characters for which keys are defined, and ordered by their Unicode values.
1.35 +// Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard
1.36 +// method. This is done by using the standard table as the main key table (signalled by placing NULL in
1.37 +// TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable).
1.38 +// Locale-specific collation data resides in ELOCL.
1.39 +//
1.40 +//
1.41 +
1.42 +
1.43 +
1.44 +#ifndef __COLLATE_H__
1.45 +#define __COLLATE_H__
1.46 +
1.47 +#ifdef __KERNEL_MODE__
1.48 +#include <e32cmn.h>
1.49 +#else
1.50 +#include <e32std.h>
1.51 +#endif
1.52 +
1.53 +//This material is used in the Unicode build only.
1.54 +#ifdef _UNICODE
1.55 +
1.56 +/**
1.57 +Collation key table structure.
1.58 +@publishedPartner
1.59 +*/
1.60 +struct TCollationKeyTable
1.61 + {
1.62 +public:
1.63 + /**
1.64 + Masks for the various parts of the elements of the iKey array.
1.65 + */
1.66 + enum
1.67 + {
1.68 + ELevel0Mask = 0xFFFF0000, // primary key - basic character identity
1.69 + ELevel1Mask = 0x0000FF00, // secondary key - accents and diacritics
1.70 + ELevel2Mask = 0x000000FC, // tertiary key - case, etc.
1.71 + EIgnoreFlag = 0x2, // if set, this key is normally ignored
1.72 + EStopFlag = 0x1 // if set, this key is the last in a sequence representing a Unicode value or values
1.73 + };
1.74 +
1.75 + /**
1.76 + An array containing all of the keys and strings of keys concatenated
1.77 + together. Each key has EStopFlag set only if it is the last key in its
1.78 + string. Each key contains the keys for levels 0, 1 and 2, and a flag
1.79 + EIgnoreFlag if the key is usually ignored (for punctuation & spaces
1.80 + etc.).
1.81 + */
1.82 + const TUint32* iKey;
1.83 + /**
1.84 + An array of indices into the iKey array. Each element has its high 16
1.85 + bits indicating a Unicode value and its low 16 bits indicating an index
1.86 + into the iKey array at which its key starts. The elements are sorted by
1.87 + Unicode value.
1.88 + */
1.89 + const TUint32* iIndex;
1.90 + /**
1.91 + The size of the iIndex array.
1.92 + */
1.93 + TInt iIndices;
1.94 + /**
1.95 + Concatenated Unicode strings. Each is a string that is to be converted
1.96 + to keys differently from how it would be if each letter were converted
1.97 + independently. An example is "ch" in Spanish, which sorts as though it
1.98 + were a single letter. Each Unicode string is preceded by a 16-bit value
1.99 + indicating the string's length. The end of the string is not delimited.
1.100 + */
1.101 + const TUint16* iString;
1.102 + /**
1.103 + An array of elements mapping elements of iString to elements of iIndex.
1.104 + Each element has its high 16 bits indicating the index of the start of
1.105 + an element of iString, and its low 16 bits indicating the corresponding
1.106 + element in iIndex. This array is sorted on the string index.
1.107 + */
1.108 + const TUint32* iStringIndex;
1.109 + /**
1.110 + The size of the iStringIndex array.
1.111 + */
1.112 + TInt iStringIndices;
1.113 + };
1.114 +
1.115 +/**
1.116 +Defines a collation method.
1.117 +
1.118 +Collation means sorting pieces of text. It needs to take into account characters,
1.119 +accents and case; spaces and punctuation are usually ignored. It differs from
1.120 +ordinary methods of sorting in that it is locale-dependent - different
1.121 +languages use different ordering methods. Additionally, multiple collation
1.122 +methods may exist within the same locale.
1.123 +
1.124 +A collation method provides the collation keys and other data needed to customise
1.125 +collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC())
1.126 +perform the collation. Note that these functions use the standard collation
1.127 +method for the current locale - you only need to specify an object of class
1.128 +TCollationMethod to customise this collation scheme. Collation methods can
1.129 +be retrieved using member functions of the Mem class. Each one has a unique
1.130 +identifier.
1.131 +
1.132 +A collation method specifies a main table of collation keys, and optionally
1.133 +an overriding table that contains keys for which the values in the main table
1.134 +are overridden. A collation key table (TCollationKeyTable) is the set of collation
1.135 +keys: primary (basic character identity), secondary (accents and diacritics)
1.136 +and tertiary (case). The quaternary key is the Unicode character values themselves.
1.137 +
1.138 +The simplest way to customise a collation method is to create a local copy
1.139 +of the standard collation method and change it. For example, you could use
1.140 +the standard method, but not ignore punctuation and spaces:
1.141 +
1.142 +@code
1.143 +TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method
1.144 +m.iFlags |= TCollationMethod::EIgnoreNone; // do not ignore punctuation and spaces
1.145 +@endcode
1.146 +
1.147 +@publishedPartner
1.148 +*/
1.149 +struct TCollationMethod
1.150 + {
1.151 + public:
1.152 + /**
1.153 + The UID of this collation method.
1.154 + */
1.155 + TUint iId;
1.156 +
1.157 + /**
1.158 + The main collation key table; if NULL, use the standard table.
1.159 + */
1.160 + const TCollationKeyTable* iMainTable;
1.161 +
1.162 + /**
1.163 + If non-NULL, tailoring for collation keys.
1.164 + */
1.165 + const TCollationKeyTable* iOverrideTable;
1.166 + enum // flag bits that may be combined in iFlags
1.167 + {
1.168 + /**
1.169 + Don't ignore any keys (punctuation, etc. is normally ignored).
1.170 + */
1.171 + EIgnoreNone = 1,
1.172 +
1.173 + /**
1.174 + Reverse the normal order for characters differing only in case.
1.175 + */
1.176 + ESwapCase = 2,
1.177 +
1.178 + /**
1.179 + Compare secondary keys which represent accents in reverse
1.180 + order (from right to left); this is needed for French when comparing
1.181 + words that differ only in accents.
1.182 + */
1.183 + EAccentsBackwards = 4,
1.184 +
1.185 + /**
1.186 + Reverse the normal order for characters differing only in whether they
1.187 + are katakana or hiragana.
1.188 + */
1.189 + ESwapKana = 8,
1.190 +
1.191 + /**
1.192 + Fold all characters to lower case before extracting keys; needed for
1.193 + comparison of filenames, for which case is ignored but other
1.194 + tertiary (level-2) distinctions are not.
1.195 + */
1.196 + EFoldCase = 16,
1.197 +
1.198 + /** Flag to indicate a collation method for matching purposes.
1.199 + This flag is only needed if we wish to specify a particular collation method
1.200 + to be used for matching purposes.
1.201 + */
1.202 + EMatchingTable = 32,
1.203 +
1.204 + /** Ignore the check for adjacent combining characters. A combining
1.205 + character effectively changes the character it combines with to something
1.206 + else and so a match doesn't occur. Setting this flag will allow character
1.207 + matching regardless of any combining characters.
1.208 + */
1.209 + EIgnoreCombining = 64
1.210 + };
1.211 +
1.212 + /**
1.213 + Flags; a bitwise OR of the values of the anonymous enum above.
1.214 +
1.215 + @see TCollationMethod::EIgnoreNone
1.216 + @see TCollationMethod::ESwapCase
1.217 + @see TCollationMethod::EAccentsBackwards
1.218 + @see TCollationMethod::ESwapKana
1.219 + @see TCollationMethod::EFoldCase
1.220 + */
1.221 + TUint iFlags;
1.222 + };
1.223 +
1.224 +/**
1.225 +A collation data set provides any collation methods needed by a locale.
1.226 +@publishedPartner
1.227 +*/
1.228 +struct TCollationDataSet
1.229 + {
1.230 + public:
1.231 + const TCollationMethod* iMethod; // presumably the start of an array of collation methods - TODO confirm
1.232 + TInt iMethods; // presumably the number of entries in the iMethod array - TODO confirm
1.233 + };
1.234 +
1.235 +// Collation method IDs
1.236 +
1.237 +/**
1.238 +UID identifying the basic collation method.
1.239 +@internalTechnology
1.240 +@released
1.241 +*/
1.242 +const TUint KUidBasicCollationMethod = 0x10004F4E;
1.243 +
1.244 +/**
1.245 +UID identifying the standard Unicode collation method.
1.246 +@internalTechnology
1.247 +@released
1.248 +*/
1.249 +const TUint KUidStandardUnicodeCollationMethod = 0x10004E96;
1.250 +
1.251 +#ifndef __KERNEL_MODE__
1.252 +
1.253 +//Forward declarations
1.254 +class TUTF32Iterator;
1.255 +struct LCharSet;
1.256 +
1.257 +/**
1.258 +Provides low-level collation functions.
1.259 +@internalComponent
1.260 +*/
1.261 +class TCollate
1.262 + {
1.263 +public:
1.264 + /**
1.265 + Construct a TCollate object based on the collation method specified
1.266 + within aCharSet, if any. If there is none, or aCharSet is null, the
1.267 + standard collation method will be used. aMask and aFlags provide a
1.268 + method for overriding the flags in the collation method: Each flag set
1.269 + to 1 in aMask is a flag that will be overridden and set to the
1.270 + corresponding flag value in aFlags. Ownership of aCharSet is not passed.
1.271 + */
1.272 + TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF);
1.273 + /**
1.274 + Construct a TCollate object based on an already constructed
1.275 + TCollationMethod specified in aMethod. Ownership is not passed.
1.276 + */
1.277 + TCollate(const TCollationMethod& aMethod);
1.278 +
1.279 + enum TComparisonResult
1.280 + {
1.281 + ELeftComparesLessAndIsNotPrefix = -2,
1.282 + ELeftIsPrefixOfRight = -1,
1.283 + EStringsIdentical = 0,
1.284 + ERightIsPrefixOfLeft = 1,
1.285 + ERightComparesLessAndIsNotPrefix = 2
1.286 + };
1.287 +
1.288 + /**
1.289 + Compare the string beginning at aString1 of length aLength1 against the
1.290 + string beginning at aString2 of length aLength2.
1.291 + aMaxLevel determines the tightness of the collation. At level 0, only
1.292 + character identities are distinguished. At level 1 accents are
1.293 + distinguished as well. At level 2 case is distinguished as well. At
1.294 + level 3 all valid different Unicode characters are considered different.
1.295 + */
1.296 + TComparisonResult Compare(const TUint16* aString1,TInt aLength1,
1.297 + const TUint16* aString2,TInt aLength2,
1.298 + TInt aMaxLevel = 3) const;
1.299 + /**
1.300 + Find the string beginning at aString2 of length aLength2 in the string
1.301 + beginning at aString1 of length aLength1. aMaxLevel determines
1.302 + the tightness of the collation, see Compare for details.
1.303 + */
1.304 + TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
1.305 + TInt aMaxLevel,TUint aString2WildChar = 0) const;
1.306 + // Overload of Find taking an additional TInt& aLengthFound; presumably it receives the length of the match - TODO confirm.
1.307 + TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
1.308 + TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const;
1.309 +
1.310 + /**
1.311 + Test if the string beginning at aSearchTerm of length aSearchTermLength
1.312 + matches the string beginning at aCandidate of length aCandidateLength.
1.313 + aMaxLevel determines the tightness of the collation, see
1.314 + Compare for details. The search term may have wild card characters as
1.315 + specified by aWildChar (for matching a single grapheme- i.e. character
1.316 + and any characters that combine with it, such as accents) and
1.317 + aWildSequenceChar (for matching any sequence of whole graphemes). The
1.318 + return value is KErrNotFound iff the search term does not match the
1.319 + candidate string exactly. To find a match within the candidate string,
1.320 + the search term must begin and end with a wild sequence character. If
1.321 + the search term does match the candidate string, 0 will be returned,
1.322 + unless the first character of the search term is a wild sequence
1.323 + character in which case the value returned will be the index into
1.324 + aCandidate at which the first non-wild sequence character matched.
1.325 + aWildSequenceChar must be a valid (non-surrogate) Unicode character
1.326 + below FFFE.
1.327 + */
1.328 + TInt Match(const TUint16 *aCandidate, TInt aCandidateLength,
1.329 + const TUint16 *aSearchTerm,TInt aSearchTermLength,
1.330 + TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const;
1.331 +
1.332 +private:
1.333 + /**
1.334 + Compare values output from the iterators. After the comparison, if
1.335 + ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and
1.336 + aRight will be pointing at the next key (at MaxLevel) after the match.
1.337 + If right is shown to be a prefix of left, this means that it has been
1.338 + checked at all requested levels. If it is reported that the right is a
1.339 + prefix of the left, then this will mean also that there are no unmatched
1.340 + combining characters on the left.
1.341 + */
1.342 + TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight,
1.343 + TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const;
1.344 + /**
1.345 + Finds search term inside candidate string. Returns KErrNotFound if there
1.346 + is no match, returns the offset into the candidate string at which the
1.347 + search term was found (note that this is the offset from the start of
1.348 + the iteration, not from where the iteration was when the function was
1.349 + called). If a string was found, the search term iterator is left
1.350 + pointing at the end of the search term, and the candidate iterator is
1.351 + left pointing just after the matched keys. NOTE(review): this comment
1.352 + originally described an aMatchPos parameter, which the signature lacks; aLengthFound presumably receives the matched length - confirm.
1.353 + */
1.354 + TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm,
1.355 + TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const;
1.356 +
1.357 +private:
1.358 + TCollationMethod iMethod;
1.359 + };
1.360 +
1.361 +#endif // __KERNEL_MODE__
1.362 +
1.363 +#endif // _UNICODE
1.364 +
1.365 +#endif // __COLLATE_H__