williamr@2: // Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies). williamr@2: // All rights reserved. williamr@2: // This component and the accompanying materials are made available williamr@2: // under the terms of the License "Symbian Foundation License v1.0" to Symbian Foundation members and "Symbian Foundation End User License Agreement v1.0" to non-members williamr@2: // which accompanies this distribution, and is available williamr@2: // at the URL "http://www.symbianfoundation.org/legal/licencesv10.html". williamr@2: // williamr@2: // Initial Contributors: williamr@2: // Nokia Corporation - initial contribution. williamr@2: // williamr@2: // Contributors: williamr@2: // williamr@2: // Description: williamr@2: // e32\include\collate.h williamr@2: // Definitions needed for Unicode collation. williamr@2: // Collation is the comparison of two Unicode strings to produce an ordering williamr@2: // that may be used in a dictionary or other list. williamr@2: // Collation is implemented using the Standard Unicode Collation algorithm. There williamr@2: // are four levels of comparison: williamr@2: // primary: basic character identity williamr@2: // secondary: accents and diacritics williamr@2: // tertiary: upper and lower case, and other minor attributes williamr@2: // quaternary: Unicode character value williamr@2: // Punctuation is normally ignored but can optionally be taken into account. williamr@2: // Strings are fully expanded using the standard Unicode canonical expansions before williamr@2: // they are compared. Thai and Lao vowels are swapped with the following character williamr@2: // if any. williamr@2: // EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values williamr@2: // to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus williamr@2: // the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after williamr@2: // all the characters for which keys are defined, and ordered by their Unicode values. williamr@2: // Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard williamr@2: // method. This is done by using the standard table as the main key table (signalled by placing NULL in williamr@2: // TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable). williamr@2: // Locale-specific collation data resides in ELOCL. williamr@2: // williamr@2: // williamr@2: williamr@2: williamr@2: williamr@2: #ifndef __COLLATE_H__ williamr@2: #define __COLLATE_H__ williamr@2: williamr@2: #ifdef __KERNEL_MODE__ williamr@2: #include williamr@2: #else williamr@2: #include williamr@2: #endif williamr@2: williamr@2: //This material is used in the Unicode build only. williamr@2: #ifdef _UNICODE williamr@2: williamr@2: /** williamr@2: Collation key table structure. williamr@2: @publishedPartner williamr@2: */ williamr@2: struct TCollationKeyTable williamr@2: { williamr@2: public: williamr@2: /** williamr@2: Masks for the various parts of the elements of the iKey array. williamr@2: */ williamr@2: enum williamr@2: { williamr@2: ELevel0Mask = 0xFFFF0000, // primary key - basic character identity williamr@2: ELevel1Mask = 0x0000FF00, // secondary key - accents and diacritics williamr@2: ELevel2Mask = 0x000000FC, // tertiary key - case, etc. williamr@2: EIgnoreFlag = 0x2, // if set, this key is normally ignored williamr@2: EStopFlag = 0x1 // if set, this key is the last in a sequence representing a Unicode value or values williamr@2: }; williamr@2: williamr@2: /** williamr@2: An array containing all of the keys and strings of keys concatenated williamr@2: together. Each key has EStopFlag set only if it is the last key in its williamr@2: string. Eack key contains the keys for levels 0, 1 and 2, and a flag williamr@2: EIgnoreFlag if the key is usually ignored (for punctuation & spaces williamr@2: etc.). williamr@2: */ williamr@2: const TUint32* iKey; williamr@2: /** williamr@2: An array of indices into the iKey array. Each element has its high 16 williamr@2: bits indicating a Unicode value and its low 16 bits indicating an index williamr@2: into the iKey array at which its key starts. The elements are sorted by williamr@2: Unicode value. williamr@2: */ williamr@2: const TUint32* iIndex; williamr@2: /** williamr@2: The size of the iIndex array. williamr@2: */ williamr@2: TInt iIndices; williamr@2: /** williamr@2: Concatenated Unicode strings. Each is a strings that is to be converted williamr@2: to keys differently from how it would be if each letter were converted williamr@2: independently. An example is "ch" in Spanish, which sorts as though it williamr@2: were a single letter. Each Unicode string is preceeded by a 16-bit value williamr@2: indicating the string's length. The end of the string is not delimited. williamr@2: */ williamr@2: const TUint16* iString; williamr@2: /** williamr@2: An array of elements mapping elements of iString to elements of iIndex. williamr@2: Each element has its high 16 bits indicating the index of the start of williamr@2: an element of iString, and its low 16 bits indicating the corresponding williamr@2: element in iIndex. This array is sorted on the string index. williamr@2: */ williamr@2: const TUint32* iStringIndex; williamr@2: /** williamr@2: The size of the iStringIndex array. williamr@2: */ williamr@2: TInt iStringIndices; williamr@2: }; williamr@2: williamr@2: /** williamr@2: Defines a collation method. williamr@2: williamr@2: Collation means sorting pieces of text. It needs to take into account characters, williamr@2: accents and case; spaces and punctuation are usually ignored. It differs from williamr@2: ordinary methods of sorting in that it is locale-dependent - different williamr@2: languages use different ordering methods. Additionally, multiple collation williamr@2: methods may exist within the same locale. williamr@2: williamr@2: A collation method provides the collation keys and other data needed to customise williamr@2: collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC()) williamr@2: perform the collation. Note that these functions use the standard collation williamr@2: method for the current locale - you only need to specify an object of class williamr@2: TCollationMethod to customise this collation scheme. Collation methods can williamr@2: be retrieved using member functions of the Mem class. Each one has a unique williamr@2: identifier. williamr@2: williamr@2: A collation method specifies a main table of collation keys, and optionally williamr@2: an overriding table that contains keys for which the values in the main table williamr@2: are overridden. A collation key table (TCollationKeyTable) is the set of collation williamr@2: keys: primary (basic character identity), secondary (accents and diacritics) williamr@2: and tertiary (case). The quaternary key is the Unicode character values themselves. williamr@2: williamr@2: The simplest way to customise a collation method is to create a local copy williamr@2: of the standard collation method and change it. For example, you could use williamr@2: the standard method, but not ignore punctuation and spaces: williamr@2: williamr@2: @code williamr@2: TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method williamr@2: m.iFlags |= TCollationMethod::EIgnoreNone; // dont ignore punctuation and spaces williamr@2: @endcode williamr@2: williamr@2: @publishedPartner williamr@2: */ williamr@2: struct TCollationMethod williamr@2: { williamr@2: public: williamr@2: /** williamr@2: The UID of this collation method. williamr@2: */ williamr@2: TUint iId; williamr@2: williamr@2: /** williamr@2: The main collation key table; if NULL, use the standard table. williamr@2: */ williamr@2: const TCollationKeyTable* iMainTable; williamr@2: williamr@2: /** williamr@2: If non-NULL, tailoring for collation keys. williamr@2: */ williamr@2: const TCollationKeyTable* iOverrideTable; williamr@2: enum williamr@2: { williamr@2: /** williamr@2: Don't ignore any keys (punctuation, etc. is normally ignored). williamr@2: */ williamr@2: EIgnoreNone = 1, williamr@2: williamr@2: /** williamr@2: Reverse the normal order for characters differing only in case williamr@2: */ williamr@2: ESwapCase = 2, williamr@2: williamr@2: /** williamr@2: Compare secondary keys which represent accents in reverse williamr@2: order (from right to left); this is needed for French when comparing williamr@2: words that differ only in accents. williamr@2: */ williamr@2: EAccentsBackwards = 4, williamr@2: williamr@2: /** williamr@2: Reverse the normal order for characters differing only in whether they williamr@2: are katakana or hiragana. williamr@2: */ williamr@2: ESwapKana = 8, williamr@2: williamr@2: /** williamr@2: Fold all characters to lower case before extracting keys; needed for williamr@2: comparison of filenames, for which case is ignored but other williamr@2: tertiary (level-2) distinctions are not. williamr@2: */ williamr@2: EFoldCase = 16, williamr@2: williamr@2: /** Flag to indicate a collation method for matching purpose williamr@2: This flag is only needed if we wish to specify a particular collation method williamr@2: to be used for matching purpose. williamr@2: */ williamr@2: EMatchingTable = 32, williamr@2: williamr@2: /** Ignore the check for adjacent combining characters. A combining williamr@2: character effectively changes the character it combines with to something williamr@2: else and so a match doesn't occur. Setting this flag will allow character williamr@2: matching regardless of any combining characters. williamr@2: */ williamr@2: EIgnoreCombining = 64 williamr@2: }; williamr@2: williamr@2: /** williamr@2: Flags. williamr@2: williamr@2: @see TCollationMethod::EIgnoreNone williamr@2: @see TCollationMethod::ESwapCase williamr@2: @see TCollationMethod::EAccentsBackwards williamr@2: @see TCollationMethod::ESwapKana williamr@2: @see TCollationMethod::EFoldCase williamr@2: */ williamr@2: TUint iFlags; williamr@2: }; williamr@2: williamr@2: /** williamr@2: A collation data set provides any collation methods needed by a locale. williamr@2: @publishedPartner williamr@2: */ williamr@2: struct TCollationDataSet williamr@2: { williamr@2: public: williamr@2: const TCollationMethod* iMethod; williamr@2: TInt iMethods; williamr@2: }; williamr@2: williamr@2: // Collation method IDs williamr@2: williamr@2: /** williamr@2: A collation data set provides any collation methods needed by a locale. williamr@2: @internalTechnology williamr@2: @released williamr@2: */ williamr@2: const TUint KUidBasicCollationMethod = 0x10004F4E; williamr@2: williamr@2: /** williamr@2: A collation data set provides any collation methods needed by a locale. williamr@2: @internalTechnology williamr@2: @released williamr@2: */ williamr@2: const TUint KUidStandardUnicodeCollationMethod = 0x10004E96; williamr@2: williamr@2: #ifndef __KERNEL_MODE__ williamr@2: williamr@2: //Forward declarations williamr@2: class TUTF32Iterator; williamr@2: struct LCharSet; williamr@2: williamr@2: /** williamr@2: Provides low-level collation functions. williamr@2: @internalComponent williamr@2: */ williamr@2: class TCollate williamr@2: { williamr@2: public: williamr@2: /** williamr@2: Construct a TCollate object based on the collation method specified williamr@2: within aCharSet, if any. If there is none, or aCharSet is null, the williamr@2: standard collation method will be used. aMask and aFlags provide a williamr@2: method for overriding the flags in the collation method: Each flag set williamr@2: to 1 in aMask is a flag that will be overridden and set to the williamr@2: corresponding flag value in aFlags. Ownership of aCharSet is not passed. williamr@2: */ williamr@2: TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF); williamr@2: /** williamr@2: Construct a TCollate object based on an already constructed williamr@2: TCollationMethod specified in aMethod. Ownership is not passed. williamr@2: */ williamr@2: TCollate(const TCollationMethod& aMethod); williamr@2: williamr@2: enum TComparisonResult williamr@2: { williamr@2: ELeftComparesLessAndIsNotPrefix = -2, williamr@2: ELeftIsPrefixOfRight = -1, williamr@2: EStringsIdentical = 0, williamr@2: ERightIsPrefixOfLeft = 1, williamr@2: ERightComparesLessAndIsNotPrefix = 2 williamr@2: }; williamr@2: williamr@2: /** williamr@2: Compare the string beginning at aString1 of length aLength1 against the williamr@2: string beginning at aString2 of length aLength2. williamr@2: aMaxLevel determines the tightness of the collation. At level 0, only williamr@2: character identities are distinguished. At level 1 accents are williamr@2: distinguished as well. At level 2 case is distinguishes as well. At williamr@2: level 3 all valid different Unicode characters are considered different. williamr@2: */ williamr@2: TComparisonResult Compare(const TUint16* aString1,TInt aLength1, williamr@2: const TUint16* aString2,TInt aLength2, williamr@2: TInt aMaxLevel = 3) const; williamr@2: /** williamr@2: Find the string beginning at aString2 of length aLength2 in the string williamr@2: beginning at aString1 of length aLength1. aMaxLevel determines williamr@2: the tightness of the collation, see Compare for details. williamr@2: */ williamr@2: TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2, williamr@2: TInt aMaxLevel,TUint aString2WildChar = 0) const; williamr@2: williamr@2: TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2, williamr@2: TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const; williamr@2: williamr@2: /** williamr@2: Test if the string beginning at aSearchTerm of length aSearchTermLength williamr@2: matches the string beginning at aCandidate of length aCandidateLength. williamr@2: aMaxLevel determines the tightness of the collation, see williamr@2: Compare for details. The search term may have wild card characters as williamr@2: specified by aWildChar (for matching a single grapheme- i.e. character williamr@2: and any characters that combine with it, such as accents) and williamr@2: aWildSequenceChar (for matching any sequence of whole graphemes). The williamr@2: return value is KErrNotFound iff the search term does not match the williamr@2: candidate string exactly. To find a match within the candidate string, williamr@2: the search term must begin and end with a wild sequence character. If williamr@2: the search term does match the candidate string, 0 will be returned, williamr@2: unless the first character of the search term is a wild sequence williamr@2: character in which case the value returned will be the index into williamr@2: aCandidate at which the first non-wild sequence character matched. williamr@2: aWildSequenceChar must be a valid (non-surrogate) Unicode character williamr@2: below FFFE. williamr@2: */ williamr@2: TInt Match(const TUint16 *aCandidate, TInt aCandidateLength, williamr@2: const TUint16 *aSearchTerm,TInt aSearchTermLength, williamr@2: TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const; williamr@2: williamr@2: private: williamr@2: /** williamr@2: Compare values output from the iterators. After the comparison, if williamr@2: ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and williamr@2: aRight will be pointing at the next key (at MaxLevel) after the match. williamr@2: If right is shown to be a prefix of left, this means that it has been williamr@2: checked at all requested levels. If it is reported that the right is a williamr@2: prefix of the left, then this will mean also that there are no unmatched williamr@2: combining characters on the left. williamr@2: */ williamr@2: TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight, williamr@2: TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const; williamr@2: /** williamr@2: Finds search term inside candidate string. Returns KErrNotFound if there williamr@2: is no match, returns the offset into the candidate string at which the williamr@2: search term was found (note that this is the offset from the start of williamr@2: the iteration, not from where the iteration was when the function was williamr@2: called). If a string was found, the search term iterator is left williamr@2: pointing at the end of the search term, and the candidate iterator is williamr@2: left pointing just after the matched keys. aMatchPos returns where in williamr@2: the candidate string the match was found. williamr@2: */ williamr@2: TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm, williamr@2: TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const; williamr@2: williamr@2: private: williamr@2: TCollationMethod iMethod; williamr@2: }; williamr@2: williamr@2: #endif // __KERNEL_MODE__ williamr@2: williamr@2: #endif // _UNICODE williamr@2: williamr@2: #endif // __COLLATE_H__