epoc32/include/collate.h
author William Roberts <williamr@symbian.org>
Tue, 16 Mar 2010 16:12:26 +0000
branchSymbian2
changeset 2 2fe1408b6811
permissions -rw-r--r--
Final list of Symbian^2 public API header files
     1 // Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies).
     2 // All rights reserved.
     3 // This component and the accompanying materials are made available
     4 // under the terms of the License "Symbian Foundation License v1.0" to Symbian Foundation members and "Symbian Foundation End User License Agreement v1.0" to non-members
     5 // which accompanies this distribution, and is available
     6 // at the URL "http://www.symbianfoundation.org/legal/licencesv10.html".
     7 //
     8 // Initial Contributors:
     9 // Nokia Corporation - initial contribution.
    10 //
    11 // Contributors:
    12 //
    13 // Description:
    14 // e32\include\collate.h
    15 // Definitions needed for Unicode collation.
    16 // Collation is the comparison of two Unicode strings to produce an ordering
    17 // that may be used in a dictionary or other list.
    18 // Collation is implemented using the Standard Unicode Collation algorithm. There
    19 // are four levels of comparison:
    20 // primary: basic character identity
    21 // secondary: accents and diacritics
    22 // tertiary: upper and lower case, and other minor attributes
    23 // quaternary: Unicode character value
    24 // Punctuation is normally ignored but can optionally be taken into account.
    25 // Strings are fully expanded using the standard Unicode canonical expansions before
    26 // they are compared. Thai and Lao vowels are swapped with the following character
    27 // if any.
    28 // EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values
    29 // to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus
    30 // the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after
    31 // all the characters for which keys are defined, and ordered by their Unicode values.
    32 // Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard
    33 // method. This is done by using the standard table as the main key table (signalled by placing NULL in
    34 // TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable).
    35 // Locale-specific collation data resides in ELOCL.
    36 // 
    37 //
    38 
    39 
    40 
    41 #ifndef __COLLATE_H__
    42 #define __COLLATE_H__
    43 
    44 #ifdef __KERNEL_MODE__
    45 #include <e32cmn.h>
    46 #else
    47 #include <e32std.h>
    48 #endif
    49 
    50 //This material is used in the Unicode build only.
    51 #ifdef _UNICODE
    52 
    53 /**
    54 Collation key table structure.
    55 @publishedPartner
    56 */
    57 struct TCollationKeyTable
    58 	{
    59 public:
    60 	/**
    61 	Masks for the various parts of the elements of the iKey array. Together the three level masks and the two flags cover all 32 bits of a key.
    62 	*/
    63 	enum
    64 		{
    65 		ELevel0Mask = 0xFFFF0000,	// primary key (bits 16-31) - basic character identity
    66 		ELevel1Mask = 0x0000FF00,	// secondary key (bits 8-15) - accents and diacritics
    67 		ELevel2Mask = 0x000000FC,	// tertiary key (bits 2-7) - case, etc.
    68 		EIgnoreFlag = 0x2,			// bit 1: if set, this key is normally ignored
    69 		EStopFlag = 0x1				// bit 0: if set, this key is the last in a sequence representing a Unicode value or values
    70 		};
    71 
    72 	/**
    73 	An array containing all of the keys and strings of keys concatenated
    74 	together. Each key has EStopFlag set only if it is the last key in its
    75 	string. Each key contains the keys for levels 0, 1 and 2, and a flag
    76 	EIgnoreFlag if the key is usually ignored (for punctuation & spaces
    77 	etc.).
    78 	*/
    79 	const TUint32* iKey;
    80 	/**
    81 	An array of indices into the iKey array. Each element has its high 16
    82 	bits indicating a Unicode value and its low 16 bits indicating an index
    83 	into the iKey array at which its key starts. The elements are sorted by
    84 	Unicode value.
    85 	*/
    86 	const TUint32* iIndex;
    87 	/**
    88 	The size of the iIndex array.
    89 	*/
    90 	TInt iIndices;
    91 	/**
    92 	Concatenated Unicode strings. Each is a string that is to be converted
    93 	to keys differently from how it would be if each letter were converted
    94 	independently. An example is "ch" in Spanish, which sorts as though it
    95 	were a single letter. Each Unicode string is preceded by a 16-bit value
    96 	indicating the string's length. The end of the string is not delimited.
    97 	*/
    98 	const TUint16* iString;
    99 	/**
   100 	An array of elements mapping elements of iString to elements of iIndex.
   101 	Each element has its high 16 bits indicating the index of the start of
   102 	an element of iString, and its low 16 bits indicating the corresponding
   103 	element in iIndex. This array is sorted on the string index.
   104 	*/
   105 	const TUint32* iStringIndex;
   106 	/**
   107 	The size of the iStringIndex array.
   108 	*/
   109 	TInt iStringIndices;
   110 	};
   111 
   112 /**
   113 Defines a collation method. 
   114 
   115 Collation means sorting pieces of text. It needs to take into account characters, 
   116 accents and case; spaces and punctuation are usually ignored. It differs from 
   117 ordinary methods of sorting in that it is locale-dependent - different 
   118 languages use different ordering methods. Additionally, multiple collation 
   119 methods may exist within the same locale.
   120 
   121 A collation method provides the collation keys and other data needed to customise 
   122 collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC()) 
   123 perform the collation. Note that these functions use the standard collation 
   124 method for the current locale - you only need to specify an object of class 
   125 TCollationMethod to customise this collation scheme. Collation methods can 
   126 be retrieved using member functions of the Mem class. Each one has a unique 
   127 identifier.
   128 
   129 A collation method specifies a main table of collation keys, and optionally 
   130 an overriding table that contains keys for which the values in the main table 
   131 are overridden. A collation key table (TCollationKeyTable) is the set of collation 
   132 keys: primary (basic character identity), secondary (accents and diacritics) 
   133 and tertiary (case). The quaternary key is the Unicode character values themselves.
   134 
   135 The simplest way to customise a collation method is to create a local copy 
   136 of the standard collation method and change it. For example, you could use 
   137 the standard method, but not ignore punctuation and spaces:
   138 
   139 @code
   140 TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method
   141 m.iFlags |= TCollationMethod::EIgnoreNone; // don't ignore punctuation and spaces
   142 @endcode
   143 
   144 @publishedPartner
   145 */
   146 struct TCollationMethod
   147 	{
   148 	public:
   149 	/**
   150 	The UID of this collation method.
   151 	*/
   152 	TUint iId;
   153 	
   154 	/**
   155 	The main collation key table; if NULL, use the standard table.
   156 	*/
   157 	const TCollationKeyTable* iMainTable;
   158 	
   159 	/**
   160 	If non-NULL, tailoring for collation keys.
   161 	*/
   162 	const TCollationKeyTable* iOverrideTable;
   163 	enum	// bit values that may be combined in iFlags
   164 		{
   165 		/**
   166 		Don't ignore any keys (punctuation, etc. is normally ignored).
   167 		*/
   168 		EIgnoreNone = 1,
   169 		
   170 		/**
   171 		Reverse the normal order for characters differing only in case.
   172 		*/
   173 		ESwapCase = 2,
   174 		
   175 		/**
   176 		Compare secondary keys which represent accents in reverse
   177 		order (from right to left); this is needed for French when comparing
   178 		words that differ only in accents.
   179 		*/
   180 		EAccentsBackwards = 4,	
   181 		
   182 		/**
   183 		Reverse the normal order for characters differing only in whether they
   184 		are katakana or hiragana.
   185 		*/
   186 		ESwapKana = 8,
   187 		
   188 		/**
   189 		Fold all characters to lower case before extracting keys; needed for
   190 		comparison of filenames, for which case is ignored but other
   191 		tertiary (level-2) distinctions are not.
   192 		*/
   193 		EFoldCase = 16,
   194 		
   195 		/** Flag to indicate a collation method for matching purposes.
   196 		This flag is only needed if we wish to specify a particular collation method
   197 		to be used for matching purposes.
   198 		*/
   199 		EMatchingTable = 32,
   200 		
   201 		/** Ignore the check for adjacent combining characters.  A combining
   202 		character effectively changes the character it combines with to something
   203 		else and so a match doesn't occur.  Setting this flag will allow character
   204 		matching regardless of any combining characters.
   205 		*/
   206 		EIgnoreCombining = 64
   207 		};
   208 		
   209 	/**
   210 	Flags: a bitwise combination of the enumerators declared above.
   211 	
   212 	@see TCollationMethod::EIgnoreNone
   213 	@see TCollationMethod::ESwapCase
   214 	@see TCollationMethod::EAccentsBackwards
   215 	@see TCollationMethod::ESwapKana
   216 	@see TCollationMethod::EFoldCase, TCollationMethod::EMatchingTable, TCollationMethod::EIgnoreCombining
   217 	*/
   218 	TUint iFlags;
   219 	};
   220 
   221 /**
   222 A collation data set provides any collation methods needed by a locale.
   223 @publishedPartner
   224 */
   225 struct TCollationDataSet
   226 	{
   227 	public:
   228 	const TCollationMethod* iMethod;	// the collation method(s) of this data set; NOTE(review): presumably the first element of an array - confirm
   229 	TInt iMethods;	// NOTE(review): presumably the number of methods reachable through iMethod, by analogy with TCollationKeyTable::iIndices - confirm
   230 	};
   231 
   232 // Collation method IDs
   233 
   234 /**
   235 UID identifying the basic collation method.
   236 @internalTechnology
   237 @released
   238 */
   239 const TUint KUidBasicCollationMethod = 0x10004F4E;
   240 
   241 /**
   242 UID identifying the standard Unicode collation method.
   243 @internalTechnology
   244 @released
   245 */
   246 const TUint KUidStandardUnicodeCollationMethod = 0x10004E96;
   247 
   248 #ifndef __KERNEL_MODE__
   249 
   250 //Forward declarations
   251 class TUTF32Iterator;
   252 struct LCharSet;
   253 
   254 /**
   255 Provides low-level collation functions.
   256 @internalComponent
   257 */
   258 class TCollate
   259 	{
   260 public:
   261 	/**
   262 	Construct a TCollate object based on the collation method specified
   263 	within aCharSet, if any. If there is none, or aCharSet is null, the
   264 	standard collation method will be used. aMask and aFlags provide a
   265 	method for overriding the flags in the collation method: Each flag set
   266 	to 1 in aMask is a flag that will be overridden and set to the
   267 	corresponding flag value in aFlags. Ownership of aCharSet is not passed.
   268 	*/
   269 	TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF);
   270 	/**
   271 	Construct a TCollate object based on an already constructed
   272 	TCollationMethod specified in aMethod. Ownership is not passed.
   273 	*/
   274 	TCollate(const TCollationMethod& aMethod);
   275 
   276 	enum TComparisonResult	// negative: left is less than, or a prefix of, right; zero: identical; positive: the mirror cases
   277 		{
   278 		ELeftComparesLessAndIsNotPrefix = -2,
   279 		ELeftIsPrefixOfRight = -1,
   280 		EStringsIdentical = 0,
   281 		ERightIsPrefixOfLeft = 1,
   282 		ERightComparesLessAndIsNotPrefix = 2
   283 		};
   284 
   285 	/**
   286 	Compare the string beginning at aString1 of length aLength1 against the
   287 	string beginning at aString2 of length aLength2.
   288 	aMaxLevel determines the tightness of the collation. At level 0, only
   289 	character identities are distinguished. At level 1 accents are
   290 	distinguished as well. At level 2 case is distinguished as well. At
   291 	level 3 all valid different Unicode characters are considered different.
   292 	*/
   293 	TComparisonResult Compare(const TUint16* aString1,TInt aLength1,
   294 							  const TUint16* aString2,TInt aLength2,
   295 							  TInt aMaxLevel = 3) const;
   296 	/**
   297 	Find the string beginning at aString2 of length aLength2 in the string
   298 	beginning at aString1 of length aLength1. aMaxLevel determines
   299 	the tightness of the collation, see Compare for details.
   300 	*/
   301 	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
   302 			  TInt aMaxLevel,TUint aString2WildChar = 0) const;
   303 	/** Overload of Find() taking an additional aLengthFound reference parameter. NOTE(review): presumably receives the length of the matched text in aString1 - confirm against the implementation. */
   304 	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
   305 		      TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const;
   306 		      
   307 	/**
   308 	Test if the string beginning at aSearchTerm of length aSearchTermLength
   309 	matches the string beginning at aCandidate of length aCandidateLength.
   310 	aMaxLevel determines the tightness of the collation, see
   311 	Compare for details. The search term may have wild card characters as
   312 	specified by aWildChar (for matching a single grapheme - i.e. character
   313 	and any characters that combine with it, such as accents) and
   314 	aWildSequenceChar (for matching any sequence of whole graphemes). The
   315 	return value is KErrNotFound iff the search term does not match the
   316 	candidate string exactly. To find a match within the candidate string,
   317 	the search term must begin and end with a wild sequence character. If
   318 	the search term does match the candidate string, 0 will be returned,
   319 	unless the first character of the search term is a wild sequence
   320 	character in which case the value returned will be the index into
   321 	aCandidate at which the first non-wild sequence character matched.
   322 	aWildSequenceChar must be a valid (non-surrogate) Unicode character
   323 	below FFFE. NOTE(review): aEscapeChar is not described here; presumably it escapes wild card characters in the search term, with the default 0 meaning none - confirm.
   324 	*/
   325 	TInt Match(const TUint16 *aCandidate, TInt aCandidateLength,
   326 			   const TUint16 *aSearchTerm,TInt aSearchTermLength,
   327 			   TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const;
   328 
   329 private:
   330 	/**
   331 	Compare values output from the iterators. After the comparison, if
   332 	ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and
   333 	aRight will be pointing at the next key (at MaxLevel) after the match.
   334 	If right is shown to be a prefix of left, this means that it has been
   335 	checked at all requested levels. If it is reported that the right is a
   336 	prefix of the left, then this will mean also that there are no unmatched
   337 	combining characters on the left.
   338 	*/
   339 	TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight,
   340 										  TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const;
   341 	/**
   342 	Finds search term inside candidate string. Returns KErrNotFound if there
   343 	is no match, returns the offset into the candidate string at which the
   344 	search term was found (note that this is the offset from the start of
   345 	the iteration, not from where the iteration was when the function was
   346 	called). If a string was found, the search term iterator is left
   347 	pointing at the end of the search term, and the candidate iterator is
   348 	left pointing just after the matched keys. aMatchPos returns where in
   349 	the candidate string the match was found. NOTE(review): the signature has no aMatchPos parameter; its only output parameter is aLengthFound - confirm which is meant.
   350 	*/
   351 	TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm,
   352 						 TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const;
   353 
   354 private:
   355 	TCollationMethod iMethod;	// the collation method held by this object
   356 	};
   357 
   358 #endif	// __KERNEL_MODE__
   359 
   360 #endif // _UNICODE
   361 
   362 #endif // __COLLATE_H__