// Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies).
// All rights reserved.
// This component and the accompanying materials are made available
// under the terms of the License "Eclipse Public License v1.0"
// which accompanies this distribution, and is available
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
//
// Initial Contributors:
// Nokia Corporation - initial contribution.
//
// Contributors:
//
// Description:
// e32\include\collate.h
// Definitions needed for Unicode collation.
// Collation is the comparison of two Unicode strings to produce an ordering
// that may be used in a dictionary or other list.
// Collation is implemented using the Standard Unicode Collation algorithm. There
// are four levels of comparison:
// primary: basic character identity
// secondary: accents and diacritics
// tertiary: upper and lower case, and other minor attributes
// quaternary: Unicode character value
// Punctuation is normally ignored but can optionally be taken into account.
// Strings are fully expanded using the standard Unicode canonical expansions before
// they are compared. Thai and Lao vowels are swapped with the following character
// if any.
// EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values
// to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus
// the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after
// all the characters for which keys are defined, and ordered by their Unicode values.
// Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard
// method.
This is done by using the standard table as the main key table (signalled by placing NULL in sl@0: // TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable). sl@0: // Locale-specific collation data resides in ELOCL. sl@0: // sl@0: // WARNING: This file contains some APIs which are internal and are subject sl@0: // to change without notice. Such APIs should therefore not be used sl@0: // outside the Kernel and Hardware Services package. sl@0: // sl@0: sl@0: #ifndef __COLLATE_H__ sl@0: #define __COLLATE_H__ sl@0: sl@0: #ifdef __KERNEL_MODE__ sl@0: #include sl@0: #else sl@0: #include sl@0: #endif sl@0: sl@0: //This material is used in the Unicode build only. sl@0: #ifdef _UNICODE sl@0: sl@0: /** sl@0: Collation key table structure. sl@0: @publishedPartner sl@0: @released sl@0: */ sl@0: struct TCollationKeyTable sl@0: { sl@0: public: sl@0: /** sl@0: Masks for the various parts of the elements of the iKey array. sl@0: */ sl@0: enum sl@0: { sl@0: ELevel0Mask = 0xFFFF0000, // primary key - basic character identity sl@0: ELevel1Mask = 0x0000FF00, // secondary key - accents and diacritics sl@0: ELevel2Mask = 0x000000FC, // tertiary key - case, etc. sl@0: EIgnoreFlag = 0x2, // if set, this key is normally ignored sl@0: EStopFlag = 0x1 // if set, this key is the last in a sequence representing a Unicode value or values sl@0: }; sl@0: sl@0: /** sl@0: An array containing all of the keys and strings of keys concatenated sl@0: together. Each key has EStopFlag set only if it is the last key in its sl@0: string. Eack key contains the keys for levels 0, 1 and 2, and a flag sl@0: EIgnoreFlag if the key is usually ignored (for punctuation & spaces sl@0: etc.). sl@0: */ sl@0: const TUint32* iKey; sl@0: /** sl@0: An array of indices into the iKey array. Each element has its high 16 sl@0: bits indicating a Unicode value and its low 16 bits indicating an index sl@0: into the iKey array at which its key starts. 
For surrogate pairs, high sl@0: surrogate code is in index[i]:16-31, and low surrogate code is in sl@0: index[i+1]:16-31. These two elements are combined to represent a surrogate sl@0: pair. The elements are sorted by Unicode value. sl@0: */ sl@0: const TUint32* iIndex; sl@0: /** sl@0: The size of the iIndex array. sl@0: */ sl@0: TInt iIndices; sl@0: /** sl@0: Concatenated Unicode strings. Each is a strings that is to be converted sl@0: to keys differently from how it would be if each letter were converted sl@0: independently. An example is "ch" in Spanish, which sorts as though it sl@0: were a single letter. Each Unicode string is preceeded by a 16-bit value sl@0: indicating the string's length (in 16-bit). The end of the string is not sl@0: delimited. A surrogate pair is represented by two ajacent 16-bit values. sl@0: */ sl@0: const TUint16* iString; sl@0: /** sl@0: An array of elements mapping elements of iString to elements of iIndex. sl@0: Each element has its high 16 bits indicating the index of the start of sl@0: an element of iString, and its low 16 bits indicating the corresponding sl@0: element in iIndex. This array is sorted on the string index. sl@0: */ sl@0: const TUint32* iStringIndex; sl@0: /** sl@0: The size of the iStringIndex array. sl@0: */ sl@0: TInt iStringIndices; sl@0: }; sl@0: sl@0: /** sl@0: Defines a collation method. sl@0: sl@0: Collation means sorting pieces of text. It needs to take into account characters, sl@0: accents and case; spaces and punctuation are usually ignored. It differs from sl@0: ordinary methods of sorting in that it is locale-dependent - different sl@0: languages use different ordering methods. Additionally, multiple collation sl@0: methods may exist within the same locale. sl@0: sl@0: A collation method provides the collation keys and other data needed to customise sl@0: collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC()) sl@0: perform the collation. 
Note that these functions use the standard collation sl@0: method for the current locale - you only need to specify an object of class sl@0: TCollationMethod to customise this collation scheme. Collation methods can sl@0: be retrieved using member functions of the Mem class. Each one has a unique sl@0: identifier. sl@0: sl@0: A collation method specifies a main table of collation keys, and optionally sl@0: an overriding table that contains keys for which the values in the main table sl@0: are overridden. A collation key table (TCollationKeyTable) is the set of collation sl@0: keys: primary (basic character identity), secondary (accents and diacritics) sl@0: and tertiary (case). The quaternary key is the Unicode character values themselves. sl@0: sl@0: The simplest way to customise a collation method is to create a local copy sl@0: of the standard collation method and change it. For example, you could use sl@0: the standard method, but not ignore punctuation and spaces: sl@0: sl@0: @code sl@0: TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method sl@0: m.iFlags |= TCollationMethod::EIgnoreNone; // dont ignore punctuation and spaces sl@0: @endcode sl@0: sl@0: @publishedPartner sl@0: @released sl@0: */ sl@0: struct TCollationMethod sl@0: { sl@0: public: sl@0: /** sl@0: The UID of this collation method. sl@0: */ sl@0: TUint iId; sl@0: sl@0: /** sl@0: The main collation key table; if NULL, use the standard table. sl@0: */ sl@0: const TCollationKeyTable* iMainTable; sl@0: sl@0: /** sl@0: If non-NULL, tailoring for collation keys. sl@0: */ sl@0: const TCollationKeyTable* iOverrideTable; sl@0: enum sl@0: { sl@0: /** sl@0: Don't ignore any keys (punctuation, etc. is normally ignored). 
sl@0: */ sl@0: EIgnoreNone = 1, sl@0: sl@0: /** sl@0: Reverse the normal order for characters differing only in case sl@0: */ sl@0: ESwapCase = 2, sl@0: sl@0: /** sl@0: Compare secondary keys which represent accents in reverse sl@0: order (from right to left); this is needed for French when comparing sl@0: words that differ only in accents. sl@0: */ sl@0: EAccentsBackwards = 4, sl@0: sl@0: /** sl@0: Reverse the normal order for characters differing only in whether they sl@0: are katakana or hiragana. sl@0: */ sl@0: ESwapKana = 8, sl@0: sl@0: /** sl@0: Fold all characters to lower case before extracting keys; needed for sl@0: comparison of filenames, for which case is ignored but other sl@0: tertiary (level-2) distinctions are not. sl@0: */ sl@0: EFoldCase = 16, sl@0: sl@0: /** Flag to indicate a collation method for matching purpose sl@0: This flag is only needed if we wish to specify a particular collation method sl@0: to be used for matching purpose. sl@0: */ sl@0: EMatchingTable = 32, sl@0: sl@0: /** Ignore the check for adjacent combining characters. A combining sl@0: character effectively changes the character it combines with to something sl@0: else and so a match doesn't occur. Setting this flag will allow character sl@0: matching regardless of any combining characters. sl@0: */ sl@0: EIgnoreCombining = 64 sl@0: }; sl@0: sl@0: /** sl@0: Flags. sl@0: sl@0: @see TCollationMethod::EIgnoreNone sl@0: @see TCollationMethod::ESwapCase sl@0: @see TCollationMethod::EAccentsBackwards sl@0: @see TCollationMethod::ESwapKana sl@0: @see TCollationMethod::EFoldCase sl@0: */ sl@0: TUint iFlags; sl@0: }; sl@0: sl@0: /** sl@0: A collation data set provides any collation methods needed by a locale. 
sl@0: @publishedPartner sl@0: @released sl@0: */ sl@0: struct TCollationDataSet sl@0: { sl@0: public: sl@0: const TCollationMethod* iMethod; sl@0: TInt iMethods; sl@0: }; sl@0: sl@0: // Collation method IDs sl@0: sl@0: /** sl@0: A collation data set provides any collation methods needed by a locale. sl@0: @internalTechnology sl@0: @released sl@0: */ sl@0: const TUint KUidBasicCollationMethod = 0x10004F4E; sl@0: sl@0: /** sl@0: A collation data set provides any collation methods needed by a locale. sl@0: @internalTechnology sl@0: @released sl@0: */ sl@0: const TUint KUidStandardUnicodeCollationMethod = 0x10004E96; sl@0: sl@0: #ifndef __KERNEL_MODE__ sl@0: sl@0: //Forward declarations sl@0: class TUTF32Iterator; sl@0: struct LCharSet; sl@0: sl@0: /** sl@0: Provides low-level collation functions. sl@0: @internalComponent sl@0: @released sl@0: */ sl@0: class TCollate sl@0: { sl@0: public: sl@0: /** sl@0: Construct a TCollate object based on the collation method specified sl@0: within aCharSet, if any. If there is none, or aCharSet is null, the sl@0: standard collation method will be used. aMask and aFlags provide a sl@0: method for overriding the flags in the collation method: Each flag set sl@0: to 1 in aMask is a flag that will be overridden and set to the sl@0: corresponding flag value in aFlags. Ownership of aCharSet is not passed. sl@0: */ sl@0: TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF); sl@0: /** sl@0: Construct a TCollate object based on an already constructed sl@0: TCollationMethod specified in aMethod. Ownership is not passed. 
sl@0: */ sl@0: TCollate(const TCollationMethod& aMethod); sl@0: sl@0: enum TComparisonResult sl@0: { sl@0: ELeftComparesLessAndIsNotPrefix = -2, sl@0: ELeftIsPrefixOfRight = -1, sl@0: EStringsIdentical = 0, sl@0: ERightIsPrefixOfLeft = 1, sl@0: ERightComparesLessAndIsNotPrefix = 2 sl@0: }; sl@0: sl@0: /** sl@0: Compare the string beginning at aString1 of length aLength1 against the sl@0: string beginning at aString2 of length aLength2. sl@0: aMaxLevel determines the tightness of the collation. At level 0, only sl@0: character identities are distinguished. At level 1 accents are sl@0: distinguished as well. At level 2 case is distinguishes as well. At sl@0: level 3 all valid different Unicode characters are considered different. sl@0: */ sl@0: TComparisonResult Compare(const TUint16* aString1,TInt aLength1, sl@0: const TUint16* aString2,TInt aLength2, sl@0: TInt aMaxLevel = 3) const; sl@0: /** sl@0: Find the string beginning at aString2 of length aLength2 in the string sl@0: beginning at aString1 of length aLength1. aMaxLevel determines sl@0: the tightness of the collation, see Compare for details. sl@0: */ sl@0: TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2, sl@0: TInt aMaxLevel,TUint aString2WildChar = 0) const; sl@0: sl@0: TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2, sl@0: TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const; sl@0: sl@0: /** sl@0: Test if the string beginning at aSearchTerm of length aSearchTermLength sl@0: matches the string beginning at aCandidate of length aCandidateLength. sl@0: aMaxLevel determines the tightness of the collation, see sl@0: Compare for details. The search term may have wild card characters as sl@0: specified by aWildChar (for matching a single grapheme- i.e. character sl@0: and any characters that combine with it, such as accents) and sl@0: aWildSequenceChar (for matching any sequence of whole graphemes). 
The sl@0: return value is KErrNotFound iff the search term does not match the sl@0: candidate string exactly. To find a match within the candidate string, sl@0: the search term must begin and end with a wild sequence character. If sl@0: the search term does match the candidate string, 0 will be returned, sl@0: unless the first character of the search term is a wild sequence sl@0: character in which case the value returned will be the index into sl@0: aCandidate at which the first non-wild sequence character matched. sl@0: aWildSequenceChar must be a valid (non-surrogate) Unicode character sl@0: below FFFE. sl@0: */ sl@0: TInt Match(const TUint16 *aCandidate, TInt aCandidateLength, sl@0: const TUint16 *aSearchTerm,TInt aSearchTermLength, sl@0: TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const; sl@0: sl@0: private: sl@0: /** sl@0: Compare values output from the iterators. After the comparison, if sl@0: ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and sl@0: aRight will be pointing at the next key (at MaxLevel) after the match. sl@0: If right is shown to be a prefix of left, this means that it has been sl@0: checked at all requested levels. If it is reported that the right is a sl@0: prefix of the left, then this will mean also that there are no unmatched sl@0: combining characters on the left. sl@0: */ sl@0: TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight, sl@0: TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const; sl@0: /** sl@0: Finds search term inside candidate string. Returns KErrNotFound if there sl@0: is no match, returns the offset into the candidate string at which the sl@0: search term was found (note that this is the offset from the start of sl@0: the iteration, not from where the iteration was when the function was sl@0: called). 
If a string was found, the search term iterator is left sl@0: pointing at the end of the search term, and the candidate iterator is sl@0: left pointing just after the matched keys. aMatchPos returns where in sl@0: the candidate string the match was found. sl@0: */ sl@0: TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm, sl@0: TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const; sl@0: sl@0: private: sl@0: TCollationMethod iMethod; sl@0: }; sl@0: sl@0: #endif // __KERNEL_MODE__ sl@0: sl@0: #endif // _UNICODE sl@0: sl@0: #endif // __COLLATE_H__