williamr@2: // Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies).
williamr@2: // All rights reserved.
williamr@2: // This component and the accompanying materials are made available
williamr@2: // under the terms of the License "Symbian Foundation License v1.0" to Symbian Foundation members and "Symbian Foundation End User License Agreement v1.0" to non-members
williamr@2: // which accompanies this distribution, and is available
williamr@2: // at the URL "http://www.symbianfoundation.org/legal/licencesv10.html".
williamr@2: //
williamr@2: // Initial Contributors:
williamr@2: // Nokia Corporation - initial contribution.
williamr@2: //
williamr@2: // Contributors:
williamr@2: //
williamr@2: // Description:
williamr@2: // e32\include\collate.h
williamr@2: // Definitions needed for Unicode collation.
williamr@2: // Collation is the comparison of two Unicode strings to produce an ordering
williamr@2: // that may be used in a dictionary or other list.
williamr@2: // Collation is implemented using the Standard Unicode Collation algorithm. There
williamr@2: // are four levels of comparison:
williamr@2: // primary: basic character identity
williamr@2: // secondary: accents and diacritics
williamr@2: // tertiary: upper and lower case, and other minor attributes
williamr@2: // quaternary: Unicode character value
williamr@2: // Punctuation is normally ignored but can optionally be taken into account.
williamr@2: // Strings are fully expanded using the standard Unicode canonical expansions before
williamr@2: // they are compared. Thai and Lao vowels are swapped with the following character
williamr@2: // if any.
williamr@2: // EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values
williamr@2: // to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus
williamr@2: // the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after
williamr@2: // all the characters for which keys are defined, and ordered by their Unicode values.
williamr@2: // Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard
williamr@2: // method. This is done by using the standard table as the main key table (signalled by placing NULL in
williamr@2: // TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable).
williamr@2: // Locale-specific collation data resides in ELOCL.
williamr@2: // 
williamr@2: //
williamr@2: 
williamr@2: 
williamr@2: 
williamr@2: #ifndef __COLLATE_H__
williamr@2: #define __COLLATE_H__
williamr@2: 
williamr@2: #ifdef __KERNEL_MODE__
williamr@2: #include <e32cmn.h>
williamr@2: #else
williamr@2: #include <e32std.h>
williamr@2: #endif
williamr@2: 
williamr@2: //This material is used in the Unicode build only.
williamr@2: #ifdef _UNICODE
williamr@2: 
williamr@2: /**
williamr@2: Collation key table structure.
williamr@2: 
williamr@2: Holds the key data for a set of Unicode values: a flat array of 32-bit keys
williamr@2: (iKey), a sorted index from Unicode value to key position (iIndex), and a
williamr@2: set of multi-character strings with their own index (iString, iStringIndex).
williamr@2: @publishedPartner
williamr@2: */
williamr@2: struct TCollationKeyTable
williamr@2: 	{
williamr@2: public:
williamr@2: 	/**
williamr@2: 	Masks for the various parts of the elements of the iKey array.
williamr@2: 	The three level masks and the two flags together cover all 32 bits of a
williamr@2: 	key without overlapping.
williamr@2: 	*/
williamr@2: 	enum
williamr@2: 		{
williamr@2: 		ELevel0Mask = 0xFFFF0000,	// primary key - basic character identity
williamr@2: 		ELevel1Mask = 0x0000FF00,	// secondary key - accents and diacritics
williamr@2: 		ELevel2Mask = 0x000000FC,	// tertiary key - case, etc.
williamr@2: 		EIgnoreFlag = 0x2,			// if set, this key is normally ignored
williamr@2: 		EStopFlag = 0x1				// if set, this key is the last in a sequence representing a Unicode value or values
williamr@2: 		};
williamr@2: 
williamr@2: 	/**
williamr@2: 	An array containing all of the keys and strings of keys concatenated
williamr@2: 	together. Each key has EStopFlag set only if it is the last key in its
williamr@2: 	string. Each key contains the keys for levels 0, 1 and 2, and a flag
williamr@2: 	EIgnoreFlag if the key is usually ignored (for punctuation, spaces,
williamr@2: 	etc.).
williamr@2: 	*/
williamr@2: 	const TUint32* iKey;
williamr@2: 	/**
williamr@2: 	An array of indices into the iKey array. Each element has its high 16
williamr@2: 	bits indicating a Unicode value and its low 16 bits indicating an index
williamr@2: 	into the iKey array at which its key starts. The elements are sorted by
williamr@2: 	Unicode value.
williamr@2: 	*/
williamr@2: 	const TUint32* iIndex;
williamr@2: 	/**
williamr@2: 	The size of the iIndex array.
williamr@2: 	*/
williamr@2: 	TInt iIndices;
williamr@2: 	/**
williamr@2: 	Concatenated Unicode strings. Each is a string that is to be converted
williamr@2: 	to keys differently from how it would be if each letter were converted
williamr@2: 	independently. An example is "ch" in Spanish, which sorts as though it
williamr@2: 	were a single letter. Each Unicode string is preceded by a 16-bit value
w​illiamr@2: 	indicating the string's length. The end of the string is not delimited.
williamr@2: 	*/
williamr@2: 	const TUint16* iString;
williamr@2: 	/**
williamr@2: 	An array of elements mapping elements of iString to elements of iIndex.
williamr@2: 	Each element has its high 16 bits indicating the index of the start of
williamr@2: 	an element of iString, and its low 16 bits indicating the corresponding
williamr@2: 	element in iIndex. This array is sorted on the string index.
williamr@2: 	*/
williamr@2: 	const TUint32* iStringIndex;
williamr@2: 	/**
williamr@2: 	The size of the iStringIndex array.
williamr@2: 	*/
williamr@2: 	TInt iStringIndices;
williamr@2: 	};
williamr@2: 
williamr@2: /**
williamr@2: Defines a collation method. 
williamr@2: 
williamr@2: Collation means sorting pieces of text. It needs to take into account characters, 
williamr@2: accents and case; spaces and punctuation are usually ignored. It differs from 
williamr@2: ordinary methods of sorting in that it is locale-dependent - different 
williamr@2: languages use different ordering methods. Additionally, multiple collation 
williamr@2: methods may exist within the same locale.
williamr@2: 
williamr@2: A collation method provides the collation keys and other data needed to customise 
williamr@2: collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC()) 
williamr@2: perform the collation. Note that these functions use the standard collation 
williamr@2: method for the current locale - you only need to specify an object of class 
williamr@2: TCollationMethod to customise this collation scheme. Collation methods can 
williamr@2: be retrieved using member functions of the Mem class. Each one has a unique 
williamr@2: identifier.
williamr@2: 
williamr@2: A collation method specifies a main table of collation keys, and optionally 
williamr@2: an overriding table that contains keys for which the values in the main table 
williamr@2: are overridden. A collation key table (TCollationKeyTable) is the set of collation 
williamr@2: keys: primary (basic character identity), secondary (accents and diacritics) 
williamr@2: and tertiary (case). The quaternary key is the Unicode character values themselves.
williamr@2: 
williamr@2: The simplest way to customise a collation method is to create a local copy 
williamr@2: of the standard collation method and change it. For example, you could use 
williamr@2: the standard method, but not ignore punctuation and spaces:
williamr@2: 
williamr@2: @code
williamr@2: TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method
williamr@2: m.iFlags |= TCollationMethod::EIgnoreNone; // don't ignore punctuation and spaces
williamr@2: @endcode
williamr@2: 
williamr@2: @publishedPartner
williamr@2: */
williamr@2: struct TCollationMethod
williamr@2: 	{
williamr@2: 	public:
williamr@2: 	/**
williamr@2: 	The UID of this collation method.
williamr@2: 	*/
williamr@2: 	TUint iId;
williamr@2: 	
williamr@2: 	/**
williamr@2: 	The main collation key table; if NULL, use the standard table.
williamr@2: 	*/
williamr@2: 	const TCollationKeyTable* iMainTable;
williamr@2: 	
williamr@2: 	/**
williamr@2: 	If non-NULL, tailoring for collation keys.
williamr@2: 	*/
williamr@2: 	const TCollationKeyTable* iOverrideTable;
williamr@2: 	enum	// bit flags for iFlags; each value is a distinct power of two so they can be OR-ed together
williamr@2: 		{
williamr@2: 		/**
williamr@2: 		Don't ignore any keys (punctuation, etc. is normally ignored).
williamr@2: 		*/
williamr@2: 		EIgnoreNone = 1,
williamr@2: 		
williamr@2: 		/**
williamr@2: 		Reverse the normal order for characters differing only in case.
williamr@2: 		*/
williamr@2: 		ESwapCase = 2,
williamr@2: 		
williamr@2: 		/**
williamr@2: 		Compare secondary keys which represent accents in reverse
williamr@2: 		order (from right to left); this is needed for French when comparing
williamr@2: 		words that differ only in accents.
williamr@2: 		*/
williamr@2: 		EAccentsBackwards = 4,	
williamr@2: 		
williamr@2: 		/**
williamr@2: 		Reverse the normal order for characters differing only in whether they
williamr@2: 		are katakana or hiragana.
williamr@2: 		*/
williamr@2: 		ESwapKana = 8,
williamr@2: 		
williamr@2: 		/**
williamr@2: 		Fold all characters to lower case before extracting keys; needed for
williamr@2: 		comparison of filenames, for which case is ignored but other
williamr@2: 		tertiary (level-2) distinctions are not.
williamr@2: 		*/
williamr@2: 		EFoldCase = 16,
williamr@2: 		
williamr@2: 		/** Flag to indicate a collation method for matching purposes.
williamr@2: 		This flag is only needed if we wish to specify a particular collation method
williamr@2: 		to be used for matching purposes.
williamr@2: 		*/
williamr@2: 		EMatchingTable = 32,
williamr@2: 		
williamr@2: 		/** Ignore the check for adjacent combining characters.  A combining
williamr@2: 		character effectively changes the character it combines with to something
williamr@2: 		else and so a match doesn't occur.  Setting this flag will allow character
williamr@2: 		matching regardless of any combining characters.
williamr@2: 		*/
williamr@2: 		EIgnoreCombining = 64
williamr@2: 		};
williamr@2: 		
williamr@2: 	/**
williamr@2: 	Flags: a bitwise OR of zero or more of the enumerators declared in
williamr@2: 	this struct.
williamr@2: 	
williamr@2: 	@see TCollationMethod::EIgnoreNone
williamr@2: 	@see TCollationMethod::ESwapCase
williamr@2: 	@see TCollationMethod::EAccentsBackwards
williamr@2: 	@see TCollationMethod::ESwapKana
williamr@2: 	@see TCollationMethod::EFoldCase
williamr@2: 	@see TCollationMethod::EMatchingTable
williamr@2: 	@see TCollationMethod::EIgnoreCombining
williamr@2: 	*/
williamr@2: 	TUint iFlags;
williamr@2: 	};
williamr@2: 
williamr@2: /**
williamr@2: A collation data set provides any collation methods needed by a locale.
williamr@2: @publishedPartner
williamr@2: */
williamr@2: struct TCollationDataSet
williamr@2: 	{
williamr@2: 	public:
williamr@2: 	const TCollationMethod* iMethod;	// the collation method(s) of this data set; presumably the first element of an array - TODO confirm
williamr@2: 	TInt iMethods;						// presumably the number of entries reachable through iMethod - TODO confirm
williamr@2: 	};
williamr@2: 
williamr@2: // Collation method IDs
williamr@2: 
williamr@2: /**
williamr@2: Collation method ID (see TCollationMethod::iId) for the basic collation
williamr@2: method.
williamr@2: @internalTechnology
williamr@2: @released
williamr@2: */
williamr@2: const TUint KUidBasicCollationMethod = 0x10004F4E;
williamr@2: 
williamr@2: /**
williamr@2: Collation method ID (see TCollationMethod::iId) for the standard Unicode
williamr@2: collation method.
williamr@2: @internalTechnology
williamr@2: @released
williamr@2: */
williamr@2: const TUint KUidStandardUnicodeCollationMethod = 0x10004E96;
williamr@2: 
williamr@2: #ifndef __KERNEL_MODE__
williamr@2: 
williamr@2: // Forward declarations: this header only uses these types by pointer or reference, so their full definitions are not needed here.
williamr@2: class TUTF32Iterator;
williamr@2: struct LCharSet;
williamr@2: 
williamr@2: /**
williamr@2: Provides low-level collation functions.
williamr@2: 
williamr@2: A TCollate object holds its own copy of a TCollationMethod (iMethod) and
williamr@2: offers comparison, search and wildcard matching over UTF-16 strings given
williamr@2: as pointer/length pairs.
williamr@2: @internalComponent
williamr@2: */
williamr@2: class TCollate
williamr@2: 	{
williamr@2: public:
williamr@2: 	/**
williamr@2: 	Construct a TCollate object based on the collation method specified
williamr@2: 	within aCharSet, if any. If there is none, or aCharSet is null, the
williamr@2: 	standard collation method will be used. aMask and aFlags provide a
williamr@2: 	method for overriding the flags in the collation method: Each flag set
williamr@2: 	to 1 in aMask is a flag that will be overridden and set to the
williamr@2: 	corresponding flag value in aFlags. With the default aMask of 0 no
williamr@2: 	flags are overridden. Ownership of aCharSet is not passed.
williamr@2: 	*/
williamr@2: 	TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF);
williamr@2: 	/**
williamr@2: 	Construct a TCollate object based on an already constructed
williamr@2: 	TCollationMethod specified in aMethod. Ownership is not passed.
williamr@2: 	*/
williamr@2: 	TCollate(const TCollationMethod& aMethod);
williamr@2: 
williamr@2: 	enum TComparisonResult	// result of Compare(): negative = left compares less, 0 = identical, positive = right compares less
williamr@2: 		{
williamr@2: 		ELeftComparesLessAndIsNotPrefix = -2,
williamr@2: 		ELeftIsPrefixOfRight = -1,
williamr@2: 		EStringsIdentical = 0,
williamr@2: 		ERightIsPrefixOfLeft = 1,
williamr@2: 		ERightComparesLessAndIsNotPrefix = 2
williamr@2: 		};
williamr@2: 
williamr@2: 	/**
williamr@2: 	Compare the string beginning at aString1 of length aLength1 against the
williamr@2: 	string beginning at aString2 of length aLength2.
williamr@2: 	aMaxLevel determines the tightness of the collation. At level 0, only
williamr@2: 	character identities are distinguished. At level 1 accents are
williamr@2: 	distinguished as well. At level 2 case is distinguished as well. At
williamr@2: 	level 3 all valid different Unicode characters are considered different.
williamr@2: 	The default level is 3. The result is one of the TComparisonResult
williamr@2: 	values; presumably "left" refers to aString1 and "right" to aString2 -
williamr@2: 	TODO confirm against the implementation.
williamr@2: 	*/
williamr@2: 	TComparisonResult Compare(const TUint16* aString1,TInt aLength1,
williamr@2: 							  const TUint16* aString2,TInt aLength2,
williamr@2: 							  TInt aMaxLevel = 3) const;
williamr@2: 	/**
williamr@2: 	Find the string beginning at aString2 of length aLength2 in the string
williamr@2: 	beginning at aString1 of length aLength1. aMaxLevel determines
williamr@2: 	the tightness of the collation, see Compare for details.
williamr@2: 	aString2WildChar is not described by the original documentation;
williamr@2: 	presumably it is a wildcard character recognised in aString2, with the
williamr@2: 	default 0 meaning none - TODO confirm.
williamr@2: 	A second overload additionally takes aLengthFound, an output reference
williamr@2: 	that presumably receives the length of the match - TODO confirm.
williamr@2: 	*/
williamr@2: 	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
williamr@2: 			  TInt aMaxLevel,TUint aString2WildChar = 0) const;
williamr@2: 			  
williamr@2: 	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
williamr@2: 		      TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const;
williamr@2: 		      
williamr@2: 	/**
williamr@2: 	Test if the string beginning at aSearchTerm of length aSearchTermLength
williamr@2: 	matches the string beginning at aCandidate of length aCandidateLength.
williamr@2: 	aMaxLevel determines the tightness of the collation, see
williamr@2: 	Compare for details. The search term may have wild card characters as
williamr@2: 	specified by aWildChar (for matching a single grapheme- i.e. character
williamr@2: 	and any characters that combine with it, such as accents) and
williamr@2: 	aWildSequenceChar (for matching any sequence of whole graphemes). The
williamr@2: 	return value is KErrNotFound iff the search term does not match the
williamr@2: 	candidate string exactly. To find a match within the candidate string,
williamr@2: 	the search term must begin and end with a wild sequence character. If
williamr@2: 	the search term does match the candidate string, 0 will be returned,
williamr@2: 	unless the first character of the search term is a wild sequence
williamr@2: 	character in which case the value returned will be the index into
williamr@2: 	aCandidate at which the first non-wild sequence character matched.
williamr@2: 	aWildSequenceChar must be a valid (non-surrogate) Unicode character
williamr@2: 	below FFFE.
williamr@2: 	aEscapeChar is not described by the original documentation; presumably
williamr@2: 	it is a character that escapes the wildcards, with the default 0
williamr@2: 	meaning none - TODO confirm.
williamr@2: 	*/
williamr@2: 	TInt Match(const TUint16 *aCandidate, TInt aCandidateLength,
williamr@2: 			   const TUint16 *aSearchTerm,TInt aSearchTermLength,
williamr@2: 			   TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const;
williamr@2: 
williamr@2: private:
williamr@2: 	/**
williamr@2: 	Compare values output from the iterators. After the comparison, if
williamr@2: 	ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and
williamr@2: 	aRight will be pointing at the next key (at MaxLevel) after the match.
williamr@2: 	If right is shown to be a prefix of left, this means that it has been
williamr@2: 	checked at all requested levels. If it is reported that the right is a
williamr@2: 	prefix of the left, then this will mean also that there are no unmatched
williamr@2: 	combining characters on the left.
williamr@2: 	*/
williamr@2: 	TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight,
williamr@2: 										  TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const;
williamr@2: 	/**
williamr@2: 	Finds search term inside candidate string. Returns KErrNotFound if there
williamr@2: 	is no match, returns the offset into the candidate string at which the
williamr@2: 	search term was found (note that this is the offset from the start of
williamr@2: 	the iteration, not from where the iteration was when the function was
williamr@2: 	called). If a string was found, the search term iterator is left
williamr@2: 	pointing at the end of the search term, and the candidate iterator is
williamr@2: 	left pointing just after the matched keys. aMatchPos returns where in
williamr@2: 	the candidate string the match was found.
williamr@2: 	NOTE(review): the text above mentions aMatchPos, but the declaration
williamr@2: 	has no such parameter; the only output parameter besides the iterators
williamr@2: 	is aLengthFound. Confirm against the implementation what is returned
williamr@2: 	through it.
williamr@2: 	*/
williamr@2: 	TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm,
williamr@2: 						 TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const;
williamr@2: 
williamr@2: private:
williamr@2: 	TCollationMethod iMethod;	// the collation method used by this object, held by value
williamr@2: 	};
williamr@2: 
williamr@2: #endif	// __KERNEL_MODE__
williamr@2: 
williamr@2: #endif // _UNICODE
williamr@2: 
williamr@2: #endif // __COLLATE_H__