test: epoc32/include/collate.h@2fe1408b6811 (annotated)

williamr@2	1	// Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies).
williamr@2	2	// All rights reserved.
williamr@2	3	// This component and the accompanying materials are made available
williamr@2	4	// under the terms of the License "Symbian Foundation License v1.0" to Symbian Foundation members and "Symbian Foundation End User License Agreement v1.0" to non-members
williamr@2	5	// which accompanies this distribution, and is available
williamr@2	6	// at the URL "http://www.symbianfoundation.org/legal/licencesv10.html".
williamr@2	7	//
williamr@2	8	// Initial Contributors:
williamr@2	9	// Nokia Corporation - initial contribution.
williamr@2	10	//
williamr@2	11	// Contributors:
williamr@2	12	//
williamr@2	13	// Description:
williamr@2	14	// e32\include\collate.h
williamr@2	15	// Definitions needed for Unicode collation.
williamr@2	16	// Collation is the comparison of two Unicode strings to produce an ordering
williamr@2	17	// that may be used in a dictionary or other list.
williamr@2	18	// Collation is implemented using the Standard Unicode Collation algorithm. There
williamr@2	19	// are four levels of comparison:
williamr@2	20	// primary: basic character identity
williamr@2	21	// secondary: accents and diacritics
williamr@2	22	// tertiary: upper and lower case, and other minor attributes
williamr@2	23	// quaternary: Unicode character value
williamr@2	24	// Punctuation is normally ignored but can optionally be taken into account.
williamr@2	25	// Strings are fully expanded using the standard Unicode canonical expansions before
williamr@2	26	// they are compared. Thai and Lao vowels are swapped with the following character
williamr@2	27	// if any.
williamr@2	28	// EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values
williamr@2	29	// to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus
williamr@2	30	// the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after
williamr@2	31	// all the characters for which keys are defined, and ordered by their Unicode values.
williamr@2	32	// Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard
williamr@2	33	// method. This is done by using the standard table as the main key table (signalled by placing NULL in
williamr@2	34	// TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable).
williamr@2	35	// Locale-specific collation data resides in ELOCL.
williamr@2	36	//
williamr@2	37	//
williamr@2	38
williamr@2	39
williamr@2	40
williamr@2	41	#ifndef __COLLATE_H__
williamr@2	42	#define __COLLATE_H__
williamr@2	43
williamr@2	44	#ifdef __KERNEL_MODE__
williamr@2	45	#include <e32cmn.h>
williamr@2	46	#else
williamr@2	47	#include <e32std.h>
williamr@2	48	#endif
williamr@2	49
williamr@2	50	//This material is used in the Unicode build only.
williamr@2	51	#ifdef _UNICODE
williamr@2	52
williamr@2	53	/**
williamr@2	54	Collation key table: key data (iKey), a Unicode-value index into it (iIndex), and tables for multi-character strings (iString, iStringIndex).
williamr@2	55	@publishedPartner
williamr@2	56	*/
williamr@2	57	struct TCollationKeyTable
williamr@2	58	{
williamr@2	59	public:
williamr@2	60	/**
williamr@2	61	Masks for the various parts of each 32-bit element of the iKey array.
williamr@2	62	*/
williamr@2	63	enum
williamr@2	64	{
williamr@2	65	ELevel0Mask = 0xFFFF0000, // bits 16-31: primary key - basic character identity
williamr@2	66	ELevel1Mask = 0x0000FF00, // bits 8-15: secondary key - accents and diacritics
williamr@2	67	ELevel2Mask = 0x000000FC, // bits 2-7: tertiary key - case, etc.
williamr@2	68	EIgnoreFlag = 0x2, // bit 1: if set, this key is normally ignored
williamr@2	69	EStopFlag = 0x1 // bit 0: if set, this key is the last in a sequence representing a Unicode value or values
williamr@2	70	};
williamr@2	71
williamr@2	72	/**
williamr@2	73	An array containing all of the keys and strings of keys concatenated
williamr@2	74	together. Each key has EStopFlag set only if it is the last key in its
williamr@2	75	string. Each key contains the keys for levels 0, 1 and 2, and a flag
williamr@2	76	EIgnoreFlag if the key is usually ignored (for punctuation, spaces,
williamr@2	77	etc.).
williamr@2	78	*/
williamr@2	79	const TUint32* iKey;
williamr@2	80	/**
williamr@2	81	An array of indices into the iKey array. Each element has its high 16
williamr@2	82	bits indicating a Unicode value and its low 16 bits indicating an index
williamr@2	83	into the iKey array at which its key starts. The elements are sorted by
williamr@2	84	Unicode value.
williamr@2	85	*/
williamr@2	86	const TUint32* iIndex;
williamr@2	87	/**
williamr@2	88	The number of elements in the iIndex array.
williamr@2	89	*/
williamr@2	90	TInt iIndices;
williamr@2	91	/**
williamr@2	92	Concatenated Unicode strings. Each is a string that is to be converted
williamr@2	93	to keys differently from how it would be if each letter were converted
williamr@2	94	independently. An example is "ch" in Spanish, which sorts as though it
williamr@2	95	were a single letter. Each Unicode string is preceded by a 16-bit value
williamr@2	96	indicating the string's length. The end of the string is not delimited.
williamr@2	97	*/
williamr@2	98	const TUint16* iString;
williamr@2	99	/**
williamr@2	100	An array of elements mapping elements of iString to elements of iIndex.
williamr@2	101	Each element has its high 16 bits indicating the index of the start of
williamr@2	102	an element of iString, and its low 16 bits indicating the corresponding
williamr@2	103	element in iIndex. This array is sorted on the string index.
williamr@2	104	*/
williamr@2	105	const TUint32* iStringIndex;
williamr@2	106	/**
williamr@2	107	The number of elements in the iStringIndex array.
williamr@2	108	*/
williamr@2	109	TInt iStringIndices;
williamr@2	110	};
williamr@2	111
williamr@2	112	/**
williamr@2	113	Defines a collation method.
williamr@2	114
williamr@2	115	Collation means sorting pieces of text. It needs to take into account characters,
williamr@2	116	accents and case; spaces and punctuation are usually ignored. It differs from
williamr@2	117	ordinary methods of sorting in that it is locale-dependent - different
williamr@2	118	languages use different ordering methods. Additionally, multiple collation
williamr@2	119	methods may exist within the same locale.
williamr@2	120
williamr@2	121	A collation method provides the collation keys and other data needed to customise
williamr@2	122	collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC())
williamr@2	123	perform the collation. Note that these functions use the standard collation
williamr@2	124	method for the current locale - you only need to specify an object of class
williamr@2	125	TCollationMethod to customise this collation scheme. Collation methods can
williamr@2	126	be retrieved using member functions of the Mem class. Each one has a unique
williamr@2	127	identifier.
williamr@2	128
williamr@2	129	A collation method specifies a main table of collation keys, and optionally
williamr@2	130	an overriding table that contains keys for which the values in the main table
williamr@2	131	are overridden. A collation key table (TCollationKeyTable) is the set of collation
williamr@2	132	keys: primary (basic character identity), secondary (accents and diacritics)
williamr@2	133	and tertiary (case). The quaternary key is the Unicode character value itself.
williamr@2	134
williamr@2	135	The simplest way to customise a collation method is to create a local copy
williamr@2	136	of the standard collation method and change it. For example, you could use
williamr@2	137	the standard method, but not ignore punctuation and spaces:
williamr@2	138
williamr@2	139	@code
williamr@2	140	TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method
williamr@2	141	m.iFlags |= TCollationMethod::EIgnoreNone; // don't ignore punctuation and spaces
williamr@2	142	@endcode
williamr@2	143
williamr@2	144	@publishedPartner
williamr@2	145	*/
williamr@2	146	struct TCollationMethod
williamr@2	147	{
williamr@2	148	public:
williamr@2	149	/**
williamr@2	150	The UID of this collation method.
williamr@2	151	*/
williamr@2	152	TUint iId;
williamr@2	153
williamr@2	154	/**
williamr@2	155	The main collation key table; if NULL, use the standard table.
williamr@2	156	*/
williamr@2	157	const TCollationKeyTable* iMainTable;
williamr@2	158
williamr@2	159	/**
williamr@2	160	If non-NULL, tailoring for collation keys.
williamr@2	161	*/
williamr@2	162	const TCollationKeyTable* iOverrideTable;
williamr@2	163	enum
williamr@2	164	{
williamr@2	165	/**
williamr@2	166	Don't ignore any keys (punctuation, etc. is normally ignored).
williamr@2	167	*/
williamr@2	168	EIgnoreNone = 1,
williamr@2	169
williamr@2	170	/**
williamr@2	171	Reverse the normal order for characters differing only in case.
williamr@2	172	*/
williamr@2	173	ESwapCase = 2,
williamr@2	174
williamr@2	175	/**
williamr@2	176	Compare secondary keys which represent accents in reverse
williamr@2	177	order (from right to left); this is needed for French when comparing
williamr@2	178	words that differ only in accents.
williamr@2	179	*/
williamr@2	180	EAccentsBackwards = 4,
williamr@2	181
williamr@2	182	/**
williamr@2	183	Reverse the normal order for characters differing only in whether they
williamr@2	184	are katakana or hiragana.
williamr@2	185	*/
williamr@2	186	ESwapKana = 8,
williamr@2	187
williamr@2	188	/**
williamr@2	189	Fold all characters to lower case before extracting keys; needed for
williamr@2	190	comparison of filenames, for which case is ignored but other
williamr@2	191	tertiary (level-2) distinctions are not.
williamr@2	192	*/
williamr@2	193	EFoldCase = 16,
williamr@2	194
williamr@2	195	/** Flag indicating a collation method to be used for matching purposes.
williamr@2	196	This flag is only needed if we wish to specify a particular collation method
williamr@2	197	to be used for matching.
williamr@2	198	*/
williamr@2	199	EMatchingTable = 32,
williamr@2	200
williamr@2	201	/** Ignore the check for adjacent combining characters. A combining
williamr@2	202	character effectively changes the character it combines with to something
williamr@2	203	else and so a match doesn't occur. Setting this flag will allow character
williamr@2	204	matching regardless of any combining characters.
williamr@2	205	*/
williamr@2	206	EIgnoreCombining = 64
williamr@2	207	};
williamr@2	208
williamr@2	209	/**
williamr@2	210	Flags: a bitwise OR of the enum values declared in this struct (EMatchingTable and EIgnoreCombining included).
williamr@2	211
williamr@2	212	@see TCollationMethod::EIgnoreNone
williamr@2	213	@see TCollationMethod::ESwapCase
williamr@2	214	@see TCollationMethod::EAccentsBackwards
williamr@2	215	@see TCollationMethod::ESwapKana
williamr@2	216	@see TCollationMethod::EFoldCase
williamr@2	217	*/
williamr@2	218	TUint iFlags;
williamr@2	219	};
williamr@2	220
williamr@2	221	/**
williamr@2	222	A collation data set provides any collation methods needed by a locale.
williamr@2	223	@publishedPartner
williamr@2	224	*/
williamr@2	225	struct TCollationDataSet
williamr@2	226	{
williamr@2	227	public:
williamr@2	228	const TCollationMethod* iMethod; // the collation method(s); presumably an array of iMethods entries - TODO confirm
williamr@2	229	TInt iMethods; // presumably the number of entries in iMethod (cf. iIndex/iIndices in TCollationKeyTable) - TODO confirm
williamr@2	230	};
williamr@2	231
williamr@2	232	// Collation method IDs
williamr@2	233
williamr@2	234	/**
williamr@2	235	Collation method ID for the basic collation method.
williamr@2	236	@internalTechnology
williamr@2	237	@released
williamr@2	238	*/
williamr@2	239	const TUint KUidBasicCollationMethod = 0x10004F4E;
williamr@2	240
williamr@2	241	/**
williamr@2	242	Collation method ID for the standard Unicode collation method.
williamr@2	243	@internalTechnology
williamr@2	244	@released
williamr@2	245	*/
williamr@2	246	const TUint KUidStandardUnicodeCollationMethod = 0x10004E96;
williamr@2	247
williamr@2	248	#ifndef __KERNEL_MODE__
williamr@2	249
williamr@2	250	//Forward declarations
williamr@2	251	class TUTF32Iterator;
williamr@2	252	struct LCharSet;
williamr@2	253
williamr@2	254	/**
williamr@2	255	Provides low-level collation functions.
williamr@2	256	@internalComponent
williamr@2	257	*/
williamr@2	258	class TCollate
williamr@2	259	{
williamr@2	260	public:
williamr@2	261	/**
williamr@2	262	Construct a TCollate object based on the collation method specified
williamr@2	263	within aCharSet, if any. If there is none, or aCharSet is null, the
williamr@2	264	standard collation method will be used. aMask and aFlags provide a
williamr@2	265	method for overriding the flags in the collation method: Each flag set
williamr@2	266	to 1 in aMask is a flag that will be overridden and set to the
williamr@2	267	corresponding flag value in aFlags. Ownership of aCharSet is not passed.
williamr@2	268	*/
williamr@2	269	TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF);
williamr@2	270	/**
williamr@2	271	Construct a TCollate object based on an already constructed
williamr@2	272	TCollationMethod specified in aMethod. Ownership is not passed.
williamr@2	273	*/
williamr@2	274	TCollate(const TCollationMethod& aMethod);
williamr@2	275	/** Result of Compare() and CompareKeySequences(): negative when left is less or a prefix, positive when right is; magnitude 1 marks a prefix relationship. */
williamr@2	276	enum TComparisonResult
williamr@2	277	{
williamr@2	278	ELeftComparesLessAndIsNotPrefix = -2,
williamr@2	279	ELeftIsPrefixOfRight = -1,
williamr@2	280	EStringsIdentical = 0,
williamr@2	281	ERightIsPrefixOfLeft = 1,
williamr@2	282	ERightComparesLessAndIsNotPrefix = 2
williamr@2	283	};
williamr@2	284
williamr@2	285	/**
williamr@2	286	Compare the string beginning at aString1 of length aLength1 against the
williamr@2	287	string beginning at aString2 of length aLength2.
williamr@2	288	aMaxLevel determines the tightness of the collation. At level 0, only
williamr@2	289	character identities are distinguished. At level 1 accents are
williamr@2	290	distinguished as well. At level 2 case is distinguished as well. At
williamr@2	291	level 3 all valid different Unicode characters are considered different.
williamr@2	292	*/
williamr@2	293	TComparisonResult Compare(const TUint16* aString1,TInt aLength1,
williamr@2	294	const TUint16* aString2,TInt aLength2,
williamr@2	295	TInt aMaxLevel = 3) const;
williamr@2	296	/**
williamr@2	297	Find the string beginning at aString2 of length aLength2 in the string
williamr@2	298	beginning at aString1 of length aLength1. aMaxLevel determines
williamr@2	299	the tightness of the collation, see Compare for details.
williamr@2	300	*/
williamr@2	301	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
williamr@2	302	TInt aMaxLevel,TUint aString2WildChar = 0) const;
williamr@2	303	/** As Find() above, with an extra output reference aLengthFound; presumably it receives the length of the match - TODO confirm. */
williamr@2	304	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
williamr@2	305	TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const;
williamr@2	306
williamr@2	307	/**
williamr@2	308	Test if the string beginning at aSearchTerm of length aSearchTermLength
williamr@2	309	matches the string beginning at aCandidate of length aCandidateLength.
williamr@2	310	aMaxLevel determines the tightness of the collation, see
williamr@2	311	Compare for details. The search term may have wild card characters as
williamr@2	312	specified by aWildChar (for matching a single grapheme - i.e. character
williamr@2	313	and any characters that combine with it, such as accents) and
williamr@2	314	aWildSequenceChar (for matching any sequence of whole graphemes). The
williamr@2	315	return value is KErrNotFound iff the search term does not match the
williamr@2	316	candidate string exactly. To find a match within the candidate string,
williamr@2	317	the search term must begin and end with a wild sequence character. If
williamr@2	318	the search term does match the candidate string, 0 will be returned,
williamr@2	319	unless the first character of the search term is a wild sequence
williamr@2	320	character in which case the value returned will be the index into
williamr@2	321	aCandidate at which the first non-wild sequence character matched.
williamr@2	322	aWildSequenceChar must be a valid (non-surrogate) Unicode character
williamr@2	323	below FFFE. aEscapeChar defaults to 0; its handling is not visible in this header.
williamr@2	324	*/
williamr@2	325	TInt Match(const TUint16 *aCandidate, TInt aCandidateLength,
williamr@2	326	const TUint16 *aSearchTerm,TInt aSearchTermLength,
williamr@2	327	TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const;
williamr@2	328
williamr@2	329	private:
williamr@2	330	/**
williamr@2	331	Compare values output from the iterators. After the comparison, if
williamr@2	332	ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and
williamr@2	333	aRight will be pointing at the next key (at MaxLevel) after the match.
williamr@2	334	If right is shown to be a prefix of left, this means that it has been
williamr@2	335	checked at all requested levels. If it is reported that the right is a
williamr@2	336	prefix of the left, then this will mean also that there are no unmatched
williamr@2	337	combining characters on the left.
williamr@2	338	*/
williamr@2	339	TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight,
williamr@2	340	TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const;
williamr@2	341	/**
williamr@2	342	Finds search term inside candidate string. Returns KErrNotFound if there
williamr@2	343	is no match, returns the offset into the candidate string at which the
williamr@2	344	search term was found (note that this is the offset from the start of
williamr@2	345	the iteration, not from where the iteration was when the function was
williamr@2	346	called). If a string was found, the search term iterator is left
williamr@2	347	pointing at the end of the search term, and the candidate iterator is
williamr@2	348	left pointing just after the matched keys. aLengthFound is an output reference, presumably the
williamr@2	349	length of the match - TODO confirm (this comment formerly named a non-existent aMatchPos parameter).
williamr@2	350	*/
williamr@2	351	TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm,
williamr@2	352	TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const;
williamr@2	353
williamr@2	354	private:
williamr@2	355	TCollationMethod iMethod;
williamr@2	356	};
williamr@2	357
williamr@2	358	#endif // __KERNEL_MODE__
williamr@2	359
williamr@2	360	#endif // _UNICODE
williamr@2	361
williamr@2	362	#endif // __COLLATE_H__

author	William Roberts <williamr@symbian.org>
	Tue, 16 Mar 2010 16:12:26 +0000
branch	Symbian2
changeset 2	2fe1408b6811
permissions	-rw-r--r--