os/kernelhwsrv/kernel/eka/include/collate.h
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 // Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies).
     2 // All rights reserved.
     3 // This component and the accompanying materials are made available
     4 // under the terms of the License "Eclipse Public License v1.0"
     5 // which accompanies this distribution, and is available
     6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
     7 //
     8 // Initial Contributors:
     9 // Nokia Corporation - initial contribution.
    10 //
    11 // Contributors:
    12 //
    13 // Description:
    14 // e32\include\collate.h
    15 // Definitions needed for Unicode collation.
    16 // Collation is the comparison of two Unicode strings to produce an ordering
    17 // that may be used in a dictionary or other list.
    18 // Collation is implemented using the Standard Unicode Collation algorithm. There
    19 // are four levels of comparison:
    20 // primary: basic character identity
    21 // secondary: accents and diacritics
    22 // tertiary: upper and lower case, and other minor attributes
    23 // quaternary: Unicode character value
    24 // Punctuation is normally ignored but can optionally be taken into account.
    25 // Strings are fully expanded using the standard Unicode canonical expansions before
    26 // they are compared. Thai and Lao vowels are swapped with the following character
    27 // if any.
    28 // EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values
    29 // to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus
    30 // the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after
    31 // all the characters for which keys are defined, and ordered by their Unicode values.
    32 // Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard
    33 // method. This is done by using the standard table as the main key table (signalled by placing NULL in
    34 // TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable).
    35 // Locale-specific collation data resides in ELOCL.
    36 // 
    37 // WARNING: This file contains some APIs which are internal and are subject
    38 //          to change without notice. Such APIs should therefore not be used
    39 //          outside the Kernel and Hardware Services package.
    40 //
    41 
    42 #ifndef __COLLATE_H__
    43 #define __COLLATE_H__
    44 
    45 #ifdef __KERNEL_MODE__
    46 #include <e32cmn.h>
    47 #else
    48 #include <e32std.h>
    49 #endif
    50 
    51 //This material is used in the Unicode build only.
    52 #ifdef _UNICODE
    53 
    54 /**
    55 Collation key table structure.
    56 @publishedPartner
    57 @released
    58 */
    59 struct TCollationKeyTable
    60 	{
    61 public:
    62 	/**
    63 	Masks for the various parts of the elements of the iKey array.
    64 	*/
    65 	enum
    66 		{
    67 		ELevel0Mask = 0xFFFF0000,	// bits 16-31: primary key - basic character identity
    68 		ELevel1Mask = 0x0000FF00,	// bits 8-15: secondary key - accents and diacritics
    69 		ELevel2Mask = 0x000000FC,	// bits 2-7: tertiary key - case, etc.
    70 		EIgnoreFlag = 0x2,			// bit 1: if set, this key is normally ignored
    71 		EStopFlag = 0x1				// bit 0: if set, this key is the last in a sequence representing a Unicode value or values
    72 		};
    73 
    74 	/**
    75 	An array containing all of the keys and strings of keys concatenated
    76 	together. Each key has EStopFlag set only if it is the last key in its
    77 	string. Each key contains the keys for levels 0, 1 and 2, and the flag
    78 	EIgnoreFlag if the key is usually ignored (for punctuation, spaces,
    79 	etc.).
    80 	*/
    81 	const TUint32* iKey;
    82 	/**
    83 	An array of indices into the iKey array. Each element has its high 16
    84 	bits indicating a Unicode value and its low 16 bits indicating the index
    85 	into the iKey array at which its key starts. For a surrogate pair, the high
    86 	surrogate code is in bits 16-31 of iIndex[i], and the low surrogate code is
    87 	in bits 16-31 of iIndex[i+1]; the two elements together represent the
    88 	surrogate pair. The elements are sorted by Unicode value.
    89 	*/
    90 	const TUint32* iIndex;
    91 	/**
    92 	The size of the iIndex array.
    93 	*/
    94 	TInt iIndices;
    95 	/**
    96 	Concatenated Unicode strings. Each is a string that is to be converted
    97 	to keys differently from how it would be if each letter were converted
    98 	independently. An example is "ch" in Spanish, which sorts as though it
    99 	were a single letter. Each Unicode string is preceded by a 16-bit value
   100 	giving the string's length (in 16-bit units). The end of the string is not
   101 	delimited. A surrogate pair is represented by two adjacent 16-bit values.
   102 	*/
   103 	const TUint16* iString;
   104 	/**
   105 	An array of elements mapping elements of iString to elements of iIndex.
   106 	Each element has its high 16 bits indicating the index of the start of
   107 	an element of iString, and its low 16 bits indicating the corresponding
   108 	element in iIndex. This array is sorted on the string index.
   109 	*/
   110 	const TUint32* iStringIndex;
   111 	/**
   112 	The size of the iStringIndex array.
   113 	*/
   114 	TInt iStringIndices;
   115 	};
   116 
   117 /**
   118 Defines a collation method. 
   119 
   120 Collation means sorting pieces of text. It needs to take into account characters, 
   121 accents and case; spaces and punctuation are usually ignored. It differs from 
   122 ordinary methods of sorting in that it is locale-dependent - different 
   123 languages use different ordering methods. Additionally, multiple collation 
   124 methods may exist within the same locale.
   125 
   126 A collation method provides the collation keys and other data needed to customise 
   127 collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC()) 
   128 perform the collation. Note that these functions use the standard collation 
   129 method for the current locale - you only need to specify an object of class 
   130 TCollationMethod to customise this collation scheme. Collation methods can 
   131 be retrieved using member functions of the Mem class. Each one has a unique 
   132 identifier.
   133 
   134 A collation method specifies a main table of collation keys, and optionally 
   135 an overriding table that contains keys for which the values in the main table 
   136 are overridden. A collation key table (TCollationKeyTable) is the set of collation 
   137 keys: primary (basic character identity), secondary (accents and diacritics) 
   138 and tertiary (case). The quaternary key is the Unicode character values themselves.
   139 
   140 The simplest way to customise a collation method is to create a local copy 
   141 of the standard collation method and change it. For example, you could use 
   142 the standard method, but not ignore punctuation and spaces:
   143 
   144 @code
   145 TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method
   146 m.iFlags |= TCollationMethod::EIgnoreNone; // don't ignore punctuation and spaces
   147 @endcode
   148 
   149 @publishedPartner
   150 @released
   151 */
   152 struct TCollationMethod
   153 	{
   154 	public:
   155 	/**
   156 	The UID of this collation method.
   157 	*/
   158 	TUint iId;
   159 	
   160 	/**
   161 	The main collation key table; if NULL, use the standard table.
   162 	*/
   163 	const TCollationKeyTable* iMainTable;
   164 	
   165 	/**
   166 	If non-NULL, a table of keys that override (tailor) those in the main table.
   167 	*/
   168 	const TCollationKeyTable* iOverrideTable;
   169 	enum	// bit flags; each value is a distinct bit so that they can be combined in iFlags
   170 		{
   171 		/**
   172 		Don't ignore any keys (punctuation, etc. is normally ignored).
   173 		*/
   174 		EIgnoreNone = 1,
   175 		
   176 		/**
   177 		Reverse the normal order for characters differing only in case.
   178 		*/
   179 		ESwapCase = 2,
   180 		
   181 		/**
   182 		Compare secondary keys, which represent accents, in reverse
   183 		order (from right to left); this is needed for French when comparing
   184 		words that differ only in accents.
   185 		*/
   186 		EAccentsBackwards = 4,	
   187 		
   188 		/**
   189 		Reverse the normal order for characters differing only in whether they
   190 		are katakana or hiragana.
   191 		*/
   192 		ESwapKana = 8,
   193 		
   194 		/**
   195 		Fold all characters to lower case before extracting keys; needed for
   196 		comparison of filenames, for which case is ignored but other
   197 		tertiary (level-2) distinctions are not.
   198 		*/
   199 		EFoldCase = 16,
   200 		
   201 		/** Flag to indicate a collation method for matching purposes.
   202 		This flag is only needed if we wish to specify a particular collation method
   203 		to be used for matching purposes.
   204 		*/
   205 		EMatchingTable = 32,
   206 		
   207 		/** Ignore the check for adjacent combining characters.  A combining
   208 		character effectively changes the character it combines with to something
   209 		else and so a match doesn't occur.  Setting this flag will allow character
   210 		matching regardless of any combining characters.
   211 		*/
   212 		EIgnoreCombining = 64
   213 		};
   214 		
   215 	/**
   216 	Flags: a combination of the enum values declared above (EMatchingTable and EIgnoreCombining are further flag values not listed below).
   217 	
   218 	@see TCollationMethod::EIgnoreNone
   219 	@see TCollationMethod::ESwapCase
   220 	@see TCollationMethod::EAccentsBackwards
   221 	@see TCollationMethod::ESwapKana
   222 	@see TCollationMethod::EFoldCase
   223 	*/
   224 	TUint iFlags;
   225 	};
   226 
   227 /**
   228 A collation data set provides any collation methods needed by a locale.
   229 @publishedPartner
   230 @released
   231 */
   232 struct TCollationDataSet
   233 	{
   234 	public:
   235 	const TCollationMethod* iMethod;	// the collation method(s) supplied by this data set
   236 	TInt iMethods;	// presumably the number of TCollationMethod entries reachable through iMethod - TODO confirm
   237 	};
   238 
   239 // Collation method IDs
   240 
   241 /**
   242 Identifier (see TCollationMethod::iId) of the basic collation method.
   243 @internalTechnology
   244 @released
   245 */
   246 const TUint KUidBasicCollationMethod = 0x10004F4E;
   247 
   248 /**
   249 Identifier (see TCollationMethod::iId) of the standard Unicode collation method.
   250 @internalTechnology
   251 @released
   252 */
   253 const TUint KUidStandardUnicodeCollationMethod = 0x10004E96;
   254 
   255 #ifndef __KERNEL_MODE__
   256 
   257 //Forward declarations
   258 class TUTF32Iterator;
   259 struct LCharSet;
   260 
   261 /**
   262 Provides low-level collation functions.
   263 @internalComponent
   264 @released
   265 */
   266 class TCollate
   267 	{
   268 public:
   269 	/**
   270 	Construct a TCollate object based on the collation method specified
   271 	within aCharSet, if any. If there is none, or aCharSet is null, the
   272 	standard collation method will be used. aMask and aFlags provide a
   273 	way of overriding the flags in the collation method: each flag set
   274 	to 1 in aMask is a flag that will be overridden and set to the
   275 	corresponding flag value in aFlags. Ownership of aCharSet is not passed.
   276 	*/
   277 	TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF);
   278 	/**
   279 	Construct a TCollate object based on an already constructed
   280 	TCollationMethod specified in aMethod. Ownership is not passed.
   281 	*/
   282 	TCollate(const TCollationMethod& aMethod);
   283 	// Outcome of a comparison, as returned by Compare() and CompareKeySequences().
   284 	enum TComparisonResult
   285 		{
   286 		ELeftComparesLessAndIsNotPrefix = -2,
   287 		ELeftIsPrefixOfRight = -1,
   288 		EStringsIdentical = 0,
   289 		ERightIsPrefixOfLeft = 1,
   290 		ERightComparesLessAndIsNotPrefix = 2
   291 		};
   292 
   293 	/**
   294 	Compare the string beginning at aString1 of length aLength1 against the
   295 	string beginning at aString2 of length aLength2.
   296 	aMaxLevel determines the tightness of the collation. At level 0, only
   297 	character identities are distinguished. At level 1 accents are
   298 	distinguished as well. At level 2 case is distinguished as well. At
   299 	level 3 all valid different Unicode characters are considered different.
   300 	*/
   301 	TComparisonResult Compare(const TUint16* aString1,TInt aLength1,
   302 							  const TUint16* aString2,TInt aLength2,
   303 							  TInt aMaxLevel = 3) const;
   304 	/**
   305 	Find the string beginning at aString2 of length aLength2 in the string
   306 	beginning at aString1 of length aLength1. aMaxLevel determines
   307 	the tightness of the collation, see Compare for details.
   308 	*/
   309 	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
   310 			  TInt aMaxLevel,TUint aString2WildChar = 0) const;
   311 	// NOTE(review): the overload below is undocumented; presumably it behaves like Find() above and also reports the length of the match in aLengthFound - confirm.
   312 	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
   313 		      TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const;
   314 		      
   315 	/**
   316 	Test if the string beginning at aSearchTerm of length aSearchTermLength
   317 	matches the string beginning at aCandidate of length aCandidateLength.
   318 	aMaxLevel determines the tightness of the collation, see
   319 	Compare for details. The search term may have wild card characters as
   320 	specified by aWildChar (for matching a single grapheme - i.e. a character
   321 	and any characters that combine with it, such as accents) and
   322 	aWildSequenceChar (for matching any sequence of whole graphemes). The
   323 	return value is KErrNotFound iff the search term does not match the
   324 	candidate string exactly. To find a match within the candidate string,
   325 	the search term must begin and end with a wild sequence character. If
   326 	the search term does match the candidate string, 0 will be returned,
   327 	unless the first character of the search term is a wild sequence
   328 	character in which case the value returned will be the index into
   329 	aCandidate at which the first non-wild sequence character matched.
   330 	aWildSequenceChar must be a valid (non-surrogate) Unicode character
   331 	below FFFE. NOTE(review): aEscapeChar is not described here; presumably it escapes wild card characters in the search term - confirm.
   332 	*/
   333 	TInt Match(const TUint16 *aCandidate, TInt aCandidateLength,
   334 			   const TUint16 *aSearchTerm,TInt aSearchTermLength,
   335 			   TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const;
   336 
   337 private:
   338 	/**
   339 	Compare values output from the iterators. After the comparison, if
   340 	ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and
   341 	aRight will be pointing at the next key (at aMaxLevel) after the match.
   342 	If right is shown to be a prefix of left, this means that it has been
   343 	checked at all requested levels. If it is reported that the right is a
   344 	prefix of the left, then this will mean also that there are no unmatched
   345 	combining characters on the left.
   346 	*/
   347 	TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight,
   348 										  TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const;
   349 	/**
   350 	Finds the search term inside the candidate string. Returns KErrNotFound if
   351 	there is no match; otherwise returns the offset into the candidate string at
   352 	which the search term was found (note that this is the offset from the start of
   353 	the iteration, not from where the iteration was when the function was
   354 	called). If a string was found, the search term iterator is left
   355 	pointing at the end of the search term, and the candidate iterator is
   356 	left pointing just after the matched keys. aMatchPos returns where in
   357 	the candidate string the match was found. NOTE(review): the signature has no aMatchPos parameter (it has aLengthFound) - confirm what is meant.
   358 	*/
   359 	TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm,
   360 						 TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const;
   361 
   362 private:
   363 	TCollationMethod iMethod;	// held by value; presumably the collation method set up by the constructors - confirm
   364 	};
   365 
   366 #endif	// __KERNEL_MODE__
   367 
   368 #endif // _UNICODE
   369 
   370 #endif // __COLLATE_H__