1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/epoc32/include/collate.h	Tue Mar 16 16:12:26 2010 +0000
     1.3 @@ -0,0 +1,362 @@
     1.4 +// Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.5 +// All rights reserved.
     1.6 +// This component and the accompanying materials are made available
     1.7 +// under the terms of the License "Symbian Foundation License v1.0" to Symbian Foundation members and "Symbian Foundation End User License Agreement v1.0" to non-members
     1.8 +// which accompanies this distribution, and is available
     1.9 +// at the URL "http://www.symbianfoundation.org/legal/licencesv10.html".
    1.10 +//
    1.11 +// Initial Contributors:
    1.12 +// Nokia Corporation - initial contribution.
    1.13 +//
    1.14 +// Contributors:
    1.15 +//
    1.16 +// Description:
    1.17 +// e32\include\collate.h
    1.18 +// Definitions needed for Unicode collation.
    1.19 +// Collation is the comparison of two Unicode strings to produce an ordering
    1.20 +// that may be used in a dictionary or other list.
    1.21 +// Collation is implemented using the Standard Unicode Collation algorithm. There
    1.22 +// are four levels of comparison:
    1.23 +// primary: basic character identity
    1.24 +// secondary: accents and diacritics
    1.25 +// tertiary: upper and lower case, and other minor attributes
    1.26 +// quaternary: Unicode character value
    1.27 +// Punctuation is normally ignored but can optionally be taken into account.
    1.28 +// Strings are fully expanded using the standard Unicode canonical expansions before
    1.29 +// they are compared. Thai and Lao vowels are swapped with the following character
    1.30 +// if any.
    1.31 +// EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values
    1.32 +// to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus
    1.33 +// the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after
    1.34 +// all the characters for which keys are defined, and ordered by their Unicode values.
    1.35 +// Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard
    1.36 +// method. This is done by using the standard table as the main key table (signalled by placing NULL in
    1.37 +// TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable).
    1.38 +// Locale-specific collation data resides in ELOCL.
    1.39 +// 
    1.40 +//
    1.41 +
    1.42 +
    1.43 +
    1.44 +#ifndef __COLLATE_H__
    1.45 +#define __COLLATE_H__
    1.46 +
    1.47 +#ifdef __KERNEL_MODE__
    1.48 +#include <e32cmn.h>
    1.49 +#else
    1.50 +#include <e32std.h>
    1.51 +#endif
    1.52 +
    1.53 +//This material is used in the Unicode build only.
    1.54 +#ifdef _UNICODE
    1.55 +
    1.56 +/**
    1.57 +Collation key table structure: the key array (iKey), the per-character index into it (iIndex) and the tables for strings that collate as a unit (iString, iStringIndex).
    1.58 +@publishedPartner
    1.59 +*/
    1.60 +struct TCollationKeyTable
    1.61 +	{
    1.62 +public:
    1.63 +	/**
    1.64 +	Masks for the various parts of the elements of the iKey array.
    1.65 +	*/
    1.66 +	enum
    1.67 +		{
    1.68 +		ELevel0Mask = 0xFFFF0000,	// primary key - basic character identity
    1.69 +		ELevel1Mask = 0x0000FF00,	// secondary key - accents and diacritics
    1.70 +		ELevel2Mask = 0x000000FC,	// tertiary key - case, etc.
    1.71 +		EIgnoreFlag = 0x2,			// if set, this key is normally ignored
    1.72 +		EStopFlag = 0x1				// if set, this key is the last in a sequence representing a Unicode value or values
    1.73 +		};
    1.74 +
    1.75 +	/**
    1.76 +	An array containing all of the keys and strings of keys concatenated
    1.77 +	together. Each key has EStopFlag set only if it is the last key in its
    1.78 +	string. Each key contains the keys for levels 0, 1 and 2, and a flag
    1.79 +	EIgnoreFlag if the key is usually ignored (for punctuation & spaces
    1.80 +	etc.).
    1.81 +	*/
    1.82 +	const TUint32* iKey;
    1.83 +	/**
    1.84 +	An array of indices into the iKey array. Each element has its high 16
    1.85 +	bits indicating a Unicode value and its low 16 bits indicating an index
    1.86 +	into the iKey array at which its key starts. The elements are sorted by
    1.87 +	Unicode value.
    1.88 +	*/
    1.89 +	const TUint32* iIndex;
    1.90 +	/**
    1.91 +	The size of the iIndex array.
    1.92 +	*/
    1.93 +	TInt iIndices;
    1.94 +	/**
    1.95 +	Concatenated Unicode strings. Each is a string that is to be converted
    1.96 +	to keys differently from how it would be if each letter were converted
    1.97 +	independently. An example is "ch" in Spanish, which sorts as though it
    1.98 +	were a single letter. Each Unicode string is preceded by a 16-bit value
    1.99 +	indicating the string's length. The end of the string is not delimited.
   1.100 +	*/
   1.101 +	const TUint16* iString;
   1.102 +	/**
   1.103 +	An array of elements mapping elements of iString to elements of iIndex.
   1.104 +	Each element has its high 16 bits indicating the index of the start of
   1.105 +	an element of iString, and its low 16 bits indicating the corresponding
   1.106 +	element in iIndex. This array is sorted on the string index.
   1.107 +	*/
   1.108 +	const TUint32* iStringIndex;
   1.109 +	/**
   1.110 +	The size of the iStringIndex array.
   1.111 +	*/
   1.112 +	TInt iStringIndices;
   1.113 +	};
   1.114 +
   1.115 +/**
   1.116 +Defines a collation method.
   1.117 +
   1.118 +Collation means sorting pieces of text. It needs to take into account characters,
   1.119 +accents and case; spaces and punctuation are usually ignored. It differs from
   1.120 +ordinary methods of sorting in that it is locale-dependent - different
   1.121 +languages use different ordering methods. Additionally, multiple collation
   1.122 +methods may exist within the same locale.
   1.123 +
   1.124 +A collation method provides the collation keys and other data needed to customise
   1.125 +collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC())
   1.126 +perform the collation. Note that these functions use the standard collation
   1.127 +method for the current locale - you only need to specify an object of class
   1.128 +TCollationMethod to customise this collation scheme. Collation methods can
   1.129 +be retrieved using member functions of the Mem class. Each one has a unique
   1.130 +identifier.
   1.131 +
   1.132 +A collation method specifies a main table of collation keys, and optionally
   1.133 +an overriding table that contains keys for which the values in the main table
   1.134 +are overridden. A collation key table (TCollationKeyTable) is the set of collation
   1.135 +keys: primary (basic character identity), secondary (accents and diacritics)
   1.136 +and tertiary (case). The quaternary key is the Unicode character values themselves.
   1.137 +
   1.138 +The simplest way to customise a collation method is to create a local copy
   1.139 +of the standard collation method and change it. For example, you could use
   1.140 +the standard method, but not ignore punctuation and spaces:
   1.141 +
   1.142 +@code
   1.143 +TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method
   1.144 +m.iFlags |= TCollationMethod::EIgnoreNone; // don't ignore punctuation and spaces
   1.145 +@endcode
   1.146 +
   1.147 +@publishedPartner
   1.148 +*/
   1.149 +struct TCollationMethod
   1.150 +	{
   1.151 +	public:
   1.152 +	/**
   1.153 +	The UID of this collation method.
   1.154 +	*/
   1.155 +	TUint iId;
   1.156 +	
   1.157 +	/**
   1.158 +	The main collation key table; if NULL, use the standard table.
   1.159 +	*/
   1.160 +	const TCollationKeyTable* iMainTable;
   1.161 +	
   1.162 +	/**
   1.163 +	If non-NULL, tailoring for collation keys.
   1.164 +	*/
   1.165 +	const TCollationKeyTable* iOverrideTable;
   1.166 +	enum
   1.167 +		{
   1.168 +		/**
   1.169 +		Don't ignore any keys (punctuation, etc. is normally ignored).
   1.170 +		*/
   1.171 +		EIgnoreNone = 1,
   1.172 +		
   1.173 +		/**
   1.174 +		Reverse the normal order for characters differing only in case.
   1.175 +		*/
   1.176 +		ESwapCase = 2,
   1.177 +		
   1.178 +		/**
   1.179 +		Compare secondary keys which represent accents in reverse
   1.180 +		order (from right to left); this is needed for French when comparing
   1.181 +		words that differ only in accents.
   1.182 +		*/
   1.183 +		EAccentsBackwards = 4,	
   1.184 +		
   1.185 +		/**
   1.186 +		Reverse the normal order for characters differing only in whether they
   1.187 +		are katakana or hiragana.
   1.188 +		*/
   1.189 +		ESwapKana = 8,
   1.190 +		
   1.191 +		/**
   1.192 +		Fold all characters to lower case before extracting keys; needed for
   1.193 +		comparison of filenames, for which case is ignored but other
   1.194 +		tertiary (level-2) distinctions are not.
   1.195 +		*/
   1.196 +		EFoldCase = 16,
   1.197 +		
   1.198 +		/** Flag to indicate a collation method for matching purposes.
   1.199 +		This flag is only needed if we wish to specify a particular collation method
   1.200 +		to be used for matching purposes.
   1.201 +		*/
   1.202 +		EMatchingTable = 32,
   1.203 +		
   1.204 +		/** Ignore the check for adjacent combining characters.  A combining
   1.205 +		character effectively changes the character it combines with to something
   1.206 +		else and so a match doesn't occur.  Setting this flag will allow character
   1.207 +		matching regardless of any combining characters.
   1.208 +		*/
   1.209 +		EIgnoreCombining = 64
   1.210 +		};
   1.211 +		
   1.212 +	/**
   1.213 +	Flags: a bitwise OR of values from the enumeration above.
   1.214 +	@see TCollationMethod::EIgnoreNone
   1.215 +	@see TCollationMethod::ESwapCase
   1.216 +	@see TCollationMethod::EAccentsBackwards
   1.217 +	@see TCollationMethod::ESwapKana
   1.218 +	@see TCollationMethod::EFoldCase
   1.219 +	@see TCollationMethod::EMatchingTable, TCollationMethod::EIgnoreCombining
   1.220 +	*/
   1.221 +	TUint iFlags;
   1.222 +	};
   1.223 +
   1.224 +/**
   1.225 +A collation data set provides any collation methods needed by a locale.
   1.226 +@publishedPartner
   1.227 +*/
   1.228 +struct TCollationDataSet
   1.229 +	{
   1.230 +	public:
   1.231 +	const TCollationMethod* iMethod;	// the collation method(s) provided; presumably an array of iMethods elements - TODO confirm
   1.232 +	TInt iMethods;	// presumably the number of methods reachable through iMethod - TODO confirm
   1.233 +	};
   1.234 +
   1.235 +// Collation method IDs
   1.236 +
   1.237 +/**
   1.238 +UID of the basic collation method.
   1.239 +@internalTechnology
   1.240 +@released
   1.241 +*/
   1.242 +const TUint KUidBasicCollationMethod = 0x10004F4E;
   1.243 +
   1.244 +/**
   1.245 +UID of the standard Unicode collation method.
   1.246 +@internalTechnology
   1.247 +@released
   1.248 +*/
   1.249 +const TUint KUidStandardUnicodeCollationMethod = 0x10004E96;
   1.250 +
   1.251 +#ifndef __KERNEL_MODE__
   1.252 +
   1.253 +//Forward declarations
   1.254 +class TUTF32Iterator;
   1.255 +struct LCharSet;
   1.256 +
   1.257 +/**
   1.258 +Provides low-level collation functions.
   1.259 +@internalComponent
   1.260 +*/
   1.261 +class TCollate
   1.262 +	{
   1.263 +public:
   1.264 +	/**
   1.265 +	Construct a TCollate object based on the collation method specified
   1.266 +	within aCharSet, if any. If there is none, or aCharSet is null, the
   1.267 +	standard collation method will be used. aMask and aFlags provide a
   1.268 +	method for overriding the flags in the collation method: Each flag set
   1.269 +	to 1 in aMask is a flag that will be overridden and set to the
   1.270 +	corresponding flag value in aFlags. Ownership of aCharSet is not passed.
   1.271 +	*/
   1.272 +	TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF);
   1.273 +	/**
   1.274 +	Construct a TCollate object based on an already constructed
   1.275 +	TCollationMethod specified in aMethod. Ownership is not passed.
   1.276 +	*/
   1.277 +	TCollate(const TCollationMethod& aMethod);
   1.278 +
   1.279 +	enum TComparisonResult
   1.280 +		{
   1.281 +		ELeftComparesLessAndIsNotPrefix = -2,
   1.282 +		ELeftIsPrefixOfRight = -1,
   1.283 +		EStringsIdentical = 0,
   1.284 +		ERightIsPrefixOfLeft = 1,
   1.285 +		ERightComparesLessAndIsNotPrefix = 2
   1.286 +		};
   1.287 +
   1.288 +	/**
   1.289 +	Compare the string beginning at aString1 of length aLength1 against the
   1.290 +	string beginning at aString2 of length aLength2.
   1.291 +	aMaxLevel determines the tightness of the collation. At level 0, only
   1.292 +	character identities are distinguished. At level 1 accents are
   1.293 +	distinguished as well. At level 2 case is distinguished as well. At
   1.294 +	level 3 all valid different Unicode characters are considered different.
   1.295 +	*/
   1.296 +	TComparisonResult Compare(const TUint16* aString1,TInt aLength1,
   1.297 +							  const TUint16* aString2,TInt aLength2,
   1.298 +							  TInt aMaxLevel = 3) const;
   1.299 +	/**
   1.300 +	Find the string beginning at aString2 of length aLength2 in the string
   1.301 +	beginning at aString1 of length aLength1. aMaxLevel determines
   1.302 +	the tightness of the collation, see Compare for details.
   1.303 +	*/
   1.304 +	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
   1.305 +			  TInt aMaxLevel,TUint aString2WildChar = 0) const;
   1.306 +	/** Overload of Find() taking an extra reference parameter aLengthFound. NOTE(review): presumably receives the length of the matched text - confirm against the implementation. */
   1.307 +	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
   1.308 +		      TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const;
   1.309 +		      
   1.310 +	/**
   1.311 +	Test if the string beginning at aSearchTerm of length aSearchTermLength
   1.312 +	matches the string beginning at aCandidate of length aCandidateLength.
   1.313 +	aMaxLevel determines the tightness of the collation, see
   1.314 +	Compare for details. The search term may have wild card characters as
   1.315 +	specified by aWildChar (for matching a single grapheme- i.e. character
   1.316 +	and any characters that combine with it, such as accents) and
   1.317 +	aWildSequenceChar (for matching any sequence of whole graphemes). The
   1.318 +	return value is KErrNotFound iff the search term does not match the
   1.319 +	candidate string exactly. To find a match within the candidate string,
   1.320 +	the search term must begin and end with a wild sequence character. If
   1.321 +	the search term does match the candidate string, 0 will be returned,
   1.322 +	unless the first character of the search term is a wild sequence
   1.323 +	character in which case the value returned will be the index into
   1.324 +	aCandidate at which the first non-wild sequence character matched.
   1.325 +	aWildSequenceChar must be a valid (non-surrogate) Unicode character
   1.326 +	below FFFE.
   1.327 +	*/
   1.328 +	TInt Match(const TUint16 *aCandidate, TInt aCandidateLength,
   1.329 +			   const TUint16 *aSearchTerm,TInt aSearchTermLength,
   1.330 +			   TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const;
   1.331 +
   1.332 +private:
   1.333 +	/**
   1.334 +	Compare values output from the iterators. After the comparison, if
   1.335 +	ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and
   1.336 +	aRight will be pointing at the next key (at MaxLevel) after the match.
   1.337 +	If right is shown to be a prefix of left, this means that it has been
   1.338 +	checked at all requested levels. If it is reported that the right is a
   1.339 +	prefix of the left, then this will mean also that there are no unmatched
   1.340 +	combining characters on the left.
   1.341 +	*/
   1.342 +	TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight,
   1.343 +										  TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const;
   1.344 +	/**
   1.345 +	Finds search term inside candidate string. Returns KErrNotFound if there
   1.346 +	is no match, returns the offset into the candidate string at which the
   1.347 +	search term was found (note that this is the offset from the start of
   1.348 +	the iteration, not from where the iteration was when the function was
   1.349 +	called). If a string was found, the search term iterator is left
   1.350 +	pointing at the end of the search term, and the candidate iterator is
   1.351 +	left pointing just after the matched keys. NOTE(review): earlier text here referred to an output parameter "aMatchPos", which is not in the
   1.352 +	parameter list; the only output parameter is aLengthFound - presumably the length of the match in the candidate string; confirm against the implementation.
   1.353 +	*/
   1.354 +	TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm,
   1.355 +						 TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const;
   1.356 +
   1.357 +private:
   1.358 +	TCollationMethod iMethod;
   1.359 +	};
   1.360 +
   1.361 +#endif	// __KERNEL_MODE__
   1.362 +
   1.363 +#endif // _UNICODE
   1.364 +
   1.365 +#endif // __COLLATE_H__