Symaptic: os/kernelhwsrv/kernel/eka/include/collate.h@bde4ae8d615e (annotated)

sl@0	1	// Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0	2	// All rights reserved.
sl@0	3	// This component and the accompanying materials are made available
sl@0	4	// under the terms of the License "Eclipse Public License v1.0"
sl@0	5	// which accompanies this distribution, and is available
sl@0	6	// at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0	7	//
sl@0	8	// Initial Contributors:
sl@0	9	// Nokia Corporation - initial contribution.
sl@0	10	//
sl@0	11	// Contributors:
sl@0	12	//
sl@0	13	// Description:
sl@0	14	// e32\include\collate.h
sl@0	15	// Definitions needed for Unicode collation.
sl@0	16	// Collation is the comparison of two Unicode strings to produce an ordering
sl@0	17	// that may be used in a dictionary or other list.
sl@0	18	// Collation is implemented using the Standard Unicode Collation algorithm. There
sl@0	19	// are four levels of comparison:
sl@0	20	// primary: basic character identity
sl@0	21	// secondary: accents and diacritics
sl@0	22	// tertiary: upper and lower case, and other minor attributes
sl@0	23	// quaternary: Unicode character value
sl@0	24	// Punctuation is normally ignored but can optionally be taken into account.
sl@0	25	// Strings are fully expanded using the standard Unicode canonical expansions before
sl@0	26	// they are compared. Thai and Lao vowels are swapped with the following character
sl@0	27	// if any.
sl@0	28	// EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values
sl@0	29	// to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus
sl@0	30	// the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after
sl@0	31	// all the characters for which keys are defined, and ordered by their Unicode values.
sl@0	32	// Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard
sl@0	33	// method. This is done by using the standard table as the main key table (signalled by placing NULL in
sl@0	34	// TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable).
sl@0	35	// Locale-specific collation data resides in ELOCL.
sl@0	36	//
sl@0	37	// WARNING: This file contains some APIs which are internal and are subject
sl@0	38	// to change without notice. Such APIs should therefore not be used
sl@0	39	// outside the Kernel and Hardware Services package.
sl@0	40	//
sl@0	41
sl@0	42	#ifndef __COLLATE_H__
sl@0	43	#define __COLLATE_H__
sl@0	44
sl@0	45	#ifdef __KERNEL_MODE__
sl@0	46	#include <e32cmn.h>
sl@0	47	#else
sl@0	48	#include <e32std.h>
sl@0	49	#endif
sl@0	50
sl@0	51	//This material is used in the Unicode build only.
sl@0	52	#ifdef _UNICODE
sl@0	53
sl@0	54	/**
sl@0	55	Collation key table structure.
sl@0	56	@publishedPartner
sl@0	57	@released
sl@0	58	*/
sl@0	59	struct TCollationKeyTable
sl@0	60	{
sl@0	61	public:
sl@0	62	/**
sl@0	63	Masks for the various parts of the elements of the iKey array.
sl@0	64	*/
sl@0	65	enum
sl@0	66	{
sl@0	67	ELevel0Mask = 0xFFFF0000, // primary key - basic character identity
sl@0	68	ELevel1Mask = 0x0000FF00, // secondary key - accents and diacritics
sl@0	69	ELevel2Mask = 0x000000FC, // tertiary key - case, etc.
sl@0	70	EIgnoreFlag = 0x2, // if set, this key is normally ignored
sl@0	71	EStopFlag = 0x1 // if set, this key is the last in a sequence representing a Unicode value or values
sl@0	72	};
sl@0	73
sl@0	74	/**
sl@0	75	An array containing all of the keys and strings of keys concatenated
sl@0	76	together. Each key has EStopFlag set only if it is the last key in its
sl@0	77	string. Each key contains the keys for levels 0, 1 and 2, and a flag
sl@0	78	EIgnoreFlag if the key is usually ignored (for punctuation & spaces
sl@0	79	etc.).
sl@0	80	*/
sl@0	81	const TUint32* iKey;
sl@0	82	/**
sl@0	83	An array of indices into the iKey array. Each element has its high 16
sl@0	84	bits indicating a Unicode value and its low 16 bits indicating an index
sl@0	85	into the iKey array at which its key starts. For surrogate pairs, high
sl@0	86	surrogate code is in index[i]:16-31, and low surrogate code is in
sl@0	87	index[i+1]:16-31. These two elements are combined to represent a surrogate
sl@0	88	pair. The elements are sorted by Unicode value.
sl@0	89	*/
sl@0	90	const TUint32* iIndex;
sl@0	91	/**
sl@0	92	The size of the iIndex array.
sl@0	93	*/
sl@0	94	TInt iIndices;
sl@0	95	/**
sl@0	96	Concatenated Unicode strings. Each is a string that is to be converted
sl@0	97	to keys differently from how it would be if each letter were converted
sl@0	98	independently. An example is "ch" in Spanish, which sorts as though it
sl@0	99	were a single letter. Each Unicode string is preceded by a 16-bit value
sl@0	100	indicating the string's length (in 16-bit units). The end of the string is not
sl@0	101	delimited. A surrogate pair is represented by two adjacent 16-bit values.
sl@0	102	*/
sl@0	103	const TUint16* iString;
sl@0	104	/**
sl@0	105	An array of elements mapping elements of iString to elements of iIndex.
sl@0	106	Each element has its high 16 bits indicating the index of the start of
sl@0	107	an element of iString, and its low 16 bits indicating the corresponding
sl@0	108	element in iIndex. This array is sorted on the string index.
sl@0	109	*/
sl@0	110	const TUint32* iStringIndex;
sl@0	111	/**
sl@0	112	The size of the iStringIndex array.
sl@0	113	*/
sl@0	114	TInt iStringIndices;
sl@0	115	};
sl@0	116
sl@0	117	/**
sl@0	118	Defines a collation method.
sl@0	119
sl@0	120	Collation means sorting pieces of text. It needs to take into account characters,
sl@0	121	accents and case; spaces and punctuation are usually ignored. It differs from
sl@0	122	ordinary methods of sorting in that it is locale-dependent - different
sl@0	123	languages use different ordering methods. Additionally, multiple collation
sl@0	124	methods may exist within the same locale.
sl@0	125
sl@0	126	A collation method provides the collation keys and other data needed to customise
sl@0	127	collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC())
sl@0	128	perform the collation. Note that these functions use the standard collation
sl@0	129	method for the current locale - you only need to specify an object of class
sl@0	130	TCollationMethod to customise this collation scheme. Collation methods can
sl@0	131	be retrieved using member functions of the Mem class. Each one has a unique
sl@0	132	identifier.
sl@0	133
sl@0	134	A collation method specifies a main table of collation keys, and optionally
sl@0	135	an overriding table that contains keys for which the values in the main table
sl@0	136	are overridden. A collation key table (TCollationKeyTable) is the set of collation
sl@0	137	keys: primary (basic character identity), secondary (accents and diacritics)
sl@0	138	and tertiary (case). The quaternary key is the Unicode character values themselves.
sl@0	139
sl@0	140	The simplest way to customise a collation method is to create a local copy
sl@0	141	of the standard collation method and change it. For example, you could use
sl@0	142	the standard method, but not ignore punctuation and spaces:
sl@0	143
sl@0	144	@code
sl@0	145	TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method
sl@0	146	m.iFlags |= TCollationMethod::EIgnoreNone; // don't ignore punctuation and spaces
sl@0	147	@endcode
sl@0	148
sl@0	149	@publishedPartner
sl@0	150	@released
sl@0	151	*/
sl@0	152	struct TCollationMethod
sl@0	153	{
sl@0	154	public:
sl@0	155	/**
sl@0	156	The UID of this collation method.
sl@0	157	*/
sl@0	158	TUint iId;
sl@0	159
sl@0	160	/**
sl@0	161	The main collation key table; if NULL, use the standard table.
sl@0	162	*/
sl@0	163	const TCollationKeyTable* iMainTable;
sl@0	164
sl@0	165	/**
sl@0	166	If non-NULL, tailoring for collation keys.
sl@0	167	*/
sl@0	168	const TCollationKeyTable* iOverrideTable;
sl@0	169	enum
sl@0	170	{
sl@0	171	/**
sl@0	172	Don't ignore any keys (punctuation, etc. is normally ignored).
sl@0	173	*/
sl@0	174	EIgnoreNone = 1,
sl@0	175
sl@0	176	/**
sl@0	177	Reverse the normal order for characters differing only in case.
sl@0	178	*/
sl@0	179	ESwapCase = 2,
sl@0	180
sl@0	181	/**
sl@0	182	Compare secondary keys which represent accents in reverse
sl@0	183	order (from right to left); this is needed for French when comparing
sl@0	184	words that differ only in accents.
sl@0	185	*/
sl@0	186	EAccentsBackwards = 4,
sl@0	187
sl@0	188	/**
sl@0	189	Reverse the normal order for characters differing only in whether they
sl@0	190	are katakana or hiragana.
sl@0	191	*/
sl@0	192	ESwapKana = 8,
sl@0	193
sl@0	194	/**
sl@0	195	Fold all characters to lower case before extracting keys; needed for
sl@0	196	comparison of filenames, for which case is ignored but other
sl@0	197	tertiary (level-2) distinctions are not.
sl@0	198	*/
sl@0	199	EFoldCase = 16,
sl@0	200
sl@0	201	/** Flag to indicate a collation method for matching purposes.
sl@0	202	This flag is only needed if we wish to specify a particular collation method
sl@0	203	to be used for matching purposes.
sl@0	204	*/
sl@0	205	EMatchingTable = 32,
sl@0	206
sl@0	207	/** Ignore the check for adjacent combining characters. A combining
sl@0	208	character effectively changes the character it combines with to something
sl@0	209	else and so a match doesn't occur. Setting this flag will allow character
sl@0	210	matching regardless of any combining characters.
sl@0	211	*/
sl@0	212	EIgnoreCombining = 64
sl@0	213	};
sl@0	214
sl@0	215	/**
sl@0	216	Flags: a bitwise OR of the flag values enumerated in this struct.
sl@0	217
sl@0	218	@see TCollationMethod::EIgnoreNone
sl@0	219	@see TCollationMethod::ESwapCase
sl@0	220	@see TCollationMethod::EAccentsBackwards
sl@0	221	@see TCollationMethod::ESwapKana
sl@0	222	@see TCollationMethod::EFoldCase
sl@0	223	*/
sl@0	224	TUint iFlags;
sl@0	225	};
sl@0	226
sl@0	227	/**
sl@0	228	A collation data set provides any collation methods needed by a locale.
sl@0	229	@publishedPartner
sl@0	230	@released
sl@0	231	*/
sl@0	232	struct TCollationDataSet
sl@0	233	{
sl@0	234	public:
sl@0	235	const TCollationMethod* iMethod; // NOTE(review): presumably the first element of an array of collation methods - confirm
sl@0	236	TInt iMethods; // NOTE(review): presumably the number of entries reachable through iMethod - confirm
sl@0	237	};
sl@0	238
sl@0	239	// Collation method IDs
sl@0	240
sl@0	241	/**
sl@0	242	UID identifying the basic collation method (see TCollationMethod::iId).
sl@0	243	@internalTechnology
sl@0	244	@released
sl@0	245	*/
sl@0	246	const TUint KUidBasicCollationMethod = 0x10004F4E;
sl@0	247
sl@0	248	/**
sl@0	249	UID identifying the standard Unicode collation method (see TCollationMethod::iId).
sl@0	250	@internalTechnology
sl@0	251	@released
sl@0	252	*/
sl@0	253	const TUint KUidStandardUnicodeCollationMethod = 0x10004E96;
sl@0	254
sl@0	255	#ifndef __KERNEL_MODE__
sl@0	256
sl@0	257	//Forward declarations
sl@0	258	class TUTF32Iterator;
sl@0	259	struct LCharSet;
sl@0	260
sl@0	261	/**
sl@0	262	Provides low-level collation functions.
sl@0	263	@internalComponent
sl@0	264	@released
sl@0	265	*/
sl@0	266	class TCollate
sl@0	267	{
sl@0	268	public:
sl@0	269	/**
sl@0	270	Construct a TCollate object based on the collation method specified
sl@0	271	within aCharSet, if any. If there is none, or aCharSet is null, the
sl@0	272	standard collation method will be used. aMask and aFlags provide a
sl@0	273	method for overriding the flags in the collation method: Each flag set
sl@0	274	to 1 in aMask is a flag that will be overridden and set to the
sl@0	275	corresponding flag value in aFlags. Ownership of aCharSet is not passed.
sl@0	276	*/
sl@0	277	TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF);
sl@0	278	/**
sl@0	279	Construct a TCollate object based on an already constructed
sl@0	280	TCollationMethod specified in aMethod. Ownership is not passed.
sl@0	281	*/
sl@0	282	TCollate(const TCollationMethod& aMethod);
sl@0	283
sl@0	284	enum TComparisonResult
sl@0	285	{
sl@0	286	ELeftComparesLessAndIsNotPrefix = -2,
sl@0	287	ELeftIsPrefixOfRight = -1,
sl@0	288	EStringsIdentical = 0,
sl@0	289	ERightIsPrefixOfLeft = 1,
sl@0	290	ERightComparesLessAndIsNotPrefix = 2
sl@0	291	};
sl@0	292
sl@0	293	/**
sl@0	294	Compare the string beginning at aString1 of length aLength1 against the
sl@0	295	string beginning at aString2 of length aLength2.
sl@0	296	aMaxLevel determines the tightness of the collation. At level 0, only
sl@0	297	character identities are distinguished. At level 1 accents are
sl@0	298	distinguished as well. At level 2 case is distinguished as well. At
sl@0	299	level 3 all valid different Unicode characters are considered different.
sl@0	300	*/
sl@0	301	TComparisonResult Compare(const TUint16* aString1,TInt aLength1,
sl@0	302	const TUint16* aString2,TInt aLength2,
sl@0	303	TInt aMaxLevel = 3) const;
sl@0	304	/**
sl@0	305	Find the string beginning at aString2 of length aLength2 in the string
sl@0	306	beginning at aString1 of length aLength1. aMaxLevel determines
sl@0	307	the tightness of the collation, see Compare for details.
sl@0	308	*/
sl@0	309	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
sl@0	310	TInt aMaxLevel,TUint aString2WildChar = 0) const;
sl@0	311
sl@0	312	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
sl@0	313	TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const;
sl@0	314
sl@0	315	/**
sl@0	316	Test if the string beginning at aSearchTerm of length aSearchTermLength
sl@0	317	matches the string beginning at aCandidate of length aCandidateLength.
sl@0	318	aMaxLevel determines the tightness of the collation, see
sl@0	319	Compare for details. The search term may have wild card characters as
sl@0	320	specified by aWildChar (for matching a single grapheme- i.e. character
sl@0	321	and any characters that combine with it, such as accents) and
sl@0	322	aWildSequenceChar (for matching any sequence of whole graphemes). The
sl@0	323	return value is KErrNotFound iff the search term does not match the
sl@0	324	candidate string exactly. To find a match within the candidate string,
sl@0	325	the search term must begin and end with a wild sequence character. If
sl@0	326	the search term does match the candidate string, 0 will be returned,
sl@0	327	unless the first character of the search term is a wild sequence
sl@0	328	character in which case the value returned will be the index into
sl@0	329	aCandidate at which the first non-wild sequence character matched.
sl@0	330	aWildSequenceChar must be a valid (non-surrogate) Unicode character
sl@0	331	below FFFE.
sl@0	332	*/
sl@0	333	TInt Match(const TUint16 *aCandidate, TInt aCandidateLength,
sl@0	334	const TUint16 *aSearchTerm,TInt aSearchTermLength,
sl@0	335	TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const;
sl@0	336
sl@0	337	private:
sl@0	338	/**
sl@0	339	Compare values output from the iterators. After the comparison, if
sl@0	340	ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and
sl@0	341	aRight will be pointing at the next key (at MaxLevel) after the match.
sl@0	342	If right is shown to be a prefix of left, this means that it has been
sl@0	343	checked at all requested levels. If it is reported that the right is a
sl@0	344	prefix of the left, then this will mean also that there are no unmatched
sl@0	345	combining characters on the left.
sl@0	346	*/
sl@0	347	TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight,
sl@0	348	TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const;
sl@0	349	/**
sl@0	350	Finds search term inside candidate string. Returns KErrNotFound if there
sl@0	351	is no match, returns the offset into the candidate string at which the
sl@0	352	search term was found (note that this is the offset from the start of
sl@0	353	the iteration, not from where the iteration was when the function was
sl@0	354	called). If a string was found, the search term iterator is left
sl@0	355	pointing at the end of the search term, and the candidate iterator is
sl@0	356	left pointing just after the matched keys. aMatchPos returns where in
sl@0	357	the candidate string the match was found. NOTE(review): the declaration has no aMatchPos parameter; presumably aLengthFound is the related output - confirm.
sl@0	358	*/
sl@0	359	TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm,
sl@0	360	TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const;
sl@0	361
sl@0	362	private:
sl@0	363	TCollationMethod iMethod;
sl@0	364	};
sl@0	365
sl@0	366	#endif // __KERNEL_MODE__
sl@0	367
sl@0	368	#endif // _UNICODE
sl@0	369
sl@0	370	#endif // __COLLATE_H__

author	sl@SLION-WIN7.fritz.box
	Fri, 15 Jun 2012 03:10:57 +0200
changeset 0	bde4ae8d615e
permissions	-rw-r--r--