Symaptic: os/kernelhwsrv/kernel/eka/euser/unicode/unicode.cpp@bde4ae8d615e

     1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).

     2 // All rights reserved.

     3 // This component and the accompanying materials are made available

     4 // under the terms of the License "Eclipse Public License v1.0"

     5 // which accompanies this distribution, and is available

     6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".

     7 //

     8 // Initial Contributors:

     9 // Nokia Corporation - initial contribution.

    10 //

    11 // Contributors:

    12 //

    13 // Description:

    14 // e32\euser\unicode\unicode.cpp

    15 // The implementation of the base-level Unicode character classification functions. These are members of

    16 // a class called TUnicode that contains a Unicode value.

    17 //

    18 //

    20 #include <unicode.h>

    21 #include "CompareImp.h"

    23 static const TUnicodeData TheDefaultUnicodeData =

    24 	{ TChar::ECnCategory, TChar::EOtherNeutral, 0, 0, 0, TUnicodeData::ENonNumeric };

    27 // Declarations for tables held in unitable.cpp and used by unicode.cpp.

    28 #ifndef __KERNEL_MODE__

    29 extern const TStandardUnicodeDataSet TheStandardUnicodeDataSet[];

    30 extern const TUnicodePlane ThePlanes[17];

    31 #endif

    34 // Fill in a TChar::TCharInfo structure with category information about the character.

    35 void TUnicode::GetInfo(TChar::TCharInfo& aInfo,const TUnicodeDataSet *aOverridingDataSet) const

    36 	{

    37 	const TUnicodeData& data = GetData(aOverridingDataSet);

    38 	aInfo.iCategory = (TChar::TCategory)data.iCategory;

    39 	aInfo.iBdCategory = (TChar::TBdCategory)data.iBdCategory;

    40 	aInfo.iCombiningClass = data.iCombiningClass;

    41 	aInfo.iLowerCase = iCode;

    42 	aInfo.iUpperCase = iCode;

    43 	aInfo.iTitleCase = iCode;

    44 	if (data.iFlags & TUnicodeData::EHasLowerCase)

    45 		aInfo.iLowerCase = GetLowerCase(data);

    46 	if (data.iFlags & TUnicodeData::EHasUpperCase)

    47 		aInfo.iUpperCase = GetUpperCase(data);

    48 	if (data.iFlags & TUnicodeData::EHasTitleCase)

    49 		aInfo.iTitleCase = GetTitleCase(data);

    50 	aInfo.iMirrored = data.iFlags & TUnicodeData::EMirrored;

    51 	if (data.iFlags & TUnicodeData::ENumericFlags)

    52 		aInfo.iNumericValue = GetNumericValue(data);

    53 	else

    54 		aInfo.iNumericValue = -1;

    55 	}

    57 /*

    58 Get the data describing a character. If "aOverridingDataSet" is non-null, look in that

    59 data set before searching the standard data set.

    60 */

    61 const TUnicodeData& TUnicode::GetData(const TUnicodeDataSet *aOverridingDataSet) const

    62 	{

    63 	const TUnicodeData *result = NULL;

    64 	if (aOverridingDataSet)

    65 		result = GetDataFromDataSet(*aOverridingDataSet);

    66 	if (result == NULL)

    67 		{

    68 		if (0xFFFF >= iCode)

    69 			{

    70 			// optimize for BMP characters (plane 0)

    71 			TInt index = TheStandardUnicodeDataSet[0].iIndex1[iCode >> 4];

    72 			if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index

    73 				index &= ~0x8000;

    74 			else

    75 				index = TheStandardUnicodeDataSet[0].iIndex2[index + (iCode & 0x000F)];

    76 			return TheStandardUnicodeDataSet[0].iData[index];

    77 			}

    78 		else

    79 			{

    80 			// for non-BMP characters (plane 1-16)

    81 			TInt plane = (iCode >> 16);

    82 			if (plane > 16)

    83 				{

    84 				// for now we have no data for values above U+10FFFF

    85 				return TheDefaultUnicodeData;

    86 				}

    87 			TInt codesPerBlock = ThePlanes[plane].iCodesPerBlock;

    88 			TInt maskForCodePoint = ThePlanes[plane].iMaskForCodePoint;

    90 			TInt low16bit = (iCode & 0xFFFF);

    91 			TInt index = TheStandardUnicodeDataSet[plane].iIndex1[low16bit >> codesPerBlock];

    92 			if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index

    93 				index &= ~0x8000;

    94 			else

    95 				index = TheStandardUnicodeDataSet[plane].iIndex2[index + (low16bit & maskForCodePoint)];

    96 			return TheStandardUnicodeDataSet[plane].iData[index];

    97 			}

    98 		}

   100 	return *result;

   101 	}

   103 /*

   104 Given a character data set, get the data referring to this character.

   105 Return NULL if no data is available in this data set.

   106 */

   107 const TUnicodeData *TUnicode::GetDataFromDataSet(const TUnicodeDataSet& aDataSet) const

   108 	{

   109 	// Perform a binary chop to find the range containing this character.

   110 	TInt n = aDataSet.iRanges;

   111 	const TUnicodeDataRange *base = aDataSet.iRange;

   112 	const TUnicodeDataRange *last = base + n - 1;

   113 	const TUnicodeDataRange *r = base;

   115 	while (n > 1)

   116 		{

   117 		TInt pivot = n / 2;

   118 		r += pivot;

   119 		if (iCode < r->iRangeStart)									// it's before this range

   120 			n = pivot;

   121 		else if (r < last && iCode >= r[1].iRangeStart)				// it's after this range

   122 			{

   123 			base = r + 1;

   124 			n -= pivot + 1;

   125 			}

   126 		else														// it's in this range

   127 			break;

   128 		r = base;

   129 		}

   131 	if (r->iIndex >= 0)

   132 		return &aDataSet.iData[r->iIndex];		// index >= 0: data available

   133 	else

   134 		return NULL;							// index < 0: no data available

   135 	}

   137 EXPORT_C TChar::TCategory TUnicode::GetCategory(const TUnicodeDataSet *aOverridingDataSet) const

   138 	{

   139 	return (TChar::TCategory)GetData(aOverridingDataSet).iCategory;

   140 	}

   142 TChar::TBdCategory TUnicode::GetBdCategory(const TUnicodeDataSet *aOverridingDataSet) const

   143 	{

   144 	return (TChar::TBdCategory)GetData(aOverridingDataSet).iBdCategory;

   145 	}

   147 TInt TUnicode::GetCombiningClass(const TUnicodeDataSet *aOverridingDataSet) const

   148 	{

   149 	return GetData(aOverridingDataSet).iCombiningClass;

   150 	}

   152 EXPORT_C TUint TUnicode::GetLowerCase(const TUnicodeDataSet *aOverridingDataSet) const

   153 	{

   154 	return GetLowerCase(GetData(aOverridingDataSet));

   155 	}

   157 EXPORT_C TUint TUnicode::GetUpperCase(const TUnicodeDataSet *aOverridingDataSet) const

   158 	{

   159 	return GetUpperCase(GetData(aOverridingDataSet));

   160 	}

   162 TUint TUnicode::GetLowerCase(const TUnicodeData& aData) const

   163 	{

   164 	if (aData.iFlags & TUnicodeData::EHasLowerCase)

   165 		return iCode + aData.iCaseOffset;

   166 	else

   167 		return iCode;

   168 	}

   170 TUint TUnicode::GetUpperCase(const TUnicodeData& aData) const

   171 	{

   172 	if (aData.iFlags & TUnicodeData::EHasUpperCase)

   173 		return iCode - aData.iCaseOffset;

   174 	else

   175 		return iCode;

   176 	}

   178 TUint TUnicode::GetTitleCase(const TUnicodeDataSet *aOverridingDataSet) const

   179 	{

   180 	return GetTitleCase(GetData(aOverridingDataSet));

   181 	}

   183 TUint TUnicode::GetTitleCase(const TUnicodeData& aData) const

   184 	{

   185 	// Handle the very few characters with distinct title case variants.

   186 	if (aData.iFlags & TUnicodeData::EHasTitleCase)

   187 		{

   188 		// If the character has no upper case variant add one to get the title case form.

   189 		if (!(aData.iFlags & TUnicodeData::EHasUpperCase))

   190 			return iCode + 1;

   191 		// If the character has no lower case variant subtract one to get the title case form.

   192 		if (!(aData.iFlags & TUnicodeData::EHasLowerCase))

   193 			return iCode - 1;

   194 		// Both upper and lower case forms exist so the character itself must be title case.

   195 		return iCode;

   196 		}

   198 	// All other characters have title case forms that are the same as their upper case forms.

   199 	return GetUpperCase(aData);

   200 	}

   202 TBool TUnicode::IsMirrored(const TUnicodeDataSet *aOverridingDataSet) const

   203 	{

   204 	return GetData(aOverridingDataSet).iFlags & TUnicodeData::EMirrored;

   205 	}

   207 TInt TUnicode::GetNumericValue(const TUnicodeDataSet *aOverridingDataSet) const

   208 	{

   209 	return GetNumericValue(GetData(aOverridingDataSet));

   210 	}

   212 /*

   213 Return the integer numeric value of this character.

   214 Return -1 if the character is not numeric, or -2 if it has a fractional value.

   215 */

   216 TInt TUnicode::GetNumericValue(const TUnicodeData& aData) const

   217 	{

   218 	switch (aData.iFlags & TUnicodeData::ENumericFlags)

   219 		{

   220 		case TUnicodeData::ENonNumeric: return -1;

   221 		case TUnicodeData::ESmallNumeric: return (iCode + aData.iDigitOffset) & 0xFF;

   222 		case TUnicodeData::EFiveHundred: return 500;

   223 		case TUnicodeData::EOneThousand: return 1000;

   224 		case TUnicodeData::EFiveThousand: return 5000;

   225 		case TUnicodeData::ETenThousand: return 10000;

   226 		case TUnicodeData::EHundredThousand: return 100000;

   227 		case TUnicodeData::EFraction: return -2;

   228 		default: return -1; // we should never come here

   229 		}

   230 	}

   232 struct TWidthInfo

   233 	{

   234 	TUint iStart;

   235 	TUint iEnd;

   236 	TChar::TCjkWidth iWidth;

   237 	};

   239 static const TWidthInfo TheWidthInfoTable[] =

   240 	{

   241 	{ 0x0020, 0x007F, TChar::ENarrow },

   242 	{ 0x00A2, 0x00A4, TChar::ENarrow },

   243 	{ 0x00A5, 0x00A7, TChar::ENarrow },

   244 	{ 0x00AF, 0x00B0, TChar::ENarrow },

   245 	{ 0x00B1, 0x1100, TChar::ENeutralWidth },

   246 	{ 0x1100, 0x1160, TChar::EWide },

   247 	{ 0x1160, 0x2E80, TChar::ENeutralWidth },

   248 	{ 0x2E80, 0xD7A4, TChar::EWide },

   249 	{ 0xF900, 0xFA2E, TChar::EWide },

   250 	{ 0xFE30, 0xFE6C, TChar::EWide },

   251 	{ 0xFF01, 0xFF5F, TChar::EFullWidth },

   252 	{ 0xFF61, 0xFFDD, TChar::EHalfWidth },

   253 	{ 0xFFE0, 0xFFE7, TChar::EFullWidth },

   254 	{ 0xFFE8, 0xFFEF, TChar::EHalfWidth },

   255 	{ 0x20000, 0x2A6DF, TChar::EWide },		// CJK Unified Ideographs Extension B

   256 	{ 0x2F800, 0x2FA1F, TChar::EWide },		// CJK Unified Ideographs Supplement

   257 	};

   259 const TInt TheWidthInfos = sizeof(TheWidthInfoTable) / sizeof(TheWidthInfoTable[0]);

   261 /*

   262 Get the notional width used by East Asian encoding systems. No check is made that the character is assigned.

   263 No separate 'ambiguous width' is returned; ambiguous characters are treated as neutral except for those

   264 in the CJK range, which are treated as wide. This is a big simplification, but the cost of an exhaustive table

   265 is too great to justify at the moment.

   266 */

   267 TChar::TCjkWidth TUnicode::GetCjkWidth() const

   268 	{

   269 	const TWidthInfo* w = TheWidthInfoTable;

   270 	for (TInt i = 0; i < TheWidthInfos; i++, w++)

   271 		if (iCode >= w->iStart && iCode < w->iEnd)

   272 			return w->iWidth;

   273 	return TChar::ENeutralWidth;

   274 	}

   276 /*

   277 Convert a Unicode character into a form most likely to be equal to another character, while

   278 still preserving the essential meaning of the character. Possible folding operations include

   279 converting to lower case (TChar::EFoldCase), stripping accents (TChar::EFoldAccents) and others.

   280 The flag value has a default, TChar::EFoldStandard, which performs the folding operations done

   281 by calling Fold functions with no flags argument, and there is also TChar::EFoldAll,

   282 which performs all possible folding operations.

   284 Note that the difference between folding and collation is that folding is

   285 	*	character-based

   286 	*	biased towards yielding equality where possible

   287 while collation is

   288 	*	string-based

   289 	*	designed to yield a non-equal ordering

   291 Typically, folding will be used when searching for a match, while collation will be used when

   292 sorting a list.

   293 */

   294 EXPORT_C TUint TUnicode::Fold(TInt aFlags,const TUnicodeDataSet *aOverridingDataSet) const

   295 	{

   296 	TUint result = iCode;

   298 	/*

   299 	Fold CJK width variants. This only applies to characters 0xFF00 and above so we can use

   300 	a built-in table.

   301 	*/

   302 	if (result >= 0xFF00 && (aFlags & TChar::EFoldWidth))

   303 		result = CjkWidthFoldTable[result & 0xFF];

   305 	/*

   306 	If the character is <= 0x00FF and the flags include folding case and stripping accents,

   307 	and there is no overriding character data, we can use the built-in fold table.

   308 	*/

   309 	const TUnicodeData* data = NULL;

   310 	if (aOverridingDataSet)

   311 		data = GetDataFromDataSet(*aOverridingDataSet);

   312 	if (data == NULL && result < 256 &&

   313 		(aFlags & (TChar::EFoldCase | TChar::EFoldAccents)) == (TChar::EFoldCase | TChar::EFoldAccents))

   314 		return FoldTable[result];

   316 	/*

   317 	Other characters have to be dealt with laboriously.

   318 	The first operations are those that, if successful, tell us that nothing more

   319 	need be done. If a value is folded to a space or a digit or converted to Katakana

   320 	it cannot have anything else done to it.

   321 	*/

   322 	if (aFlags & TChar::EFoldKana)

   323 		{

   324 		if ((result >= 0x3041 && result <= 0x3094) || result == 0x309D || result == 0x309E)

   325 			return result += 0x0060;

   326 		}

   327 	if (data == NULL)

   328 		data = &GetData(NULL);

   329 	if (aFlags & TChar::EFoldSpaces)

   330 		{

   331 		if (data->iCategory == TChar::EZsCategory)

   332 			return 0x0020;

   333 		}

   334 	if (aFlags & TChar::EFoldDigits)

   335 		{

   336 		TInt n = GetNumericValue(*data);

   337 		if (n >= 0 && n <= 9)

   338 			return 0x0030 + n;

   339 		}

   341 	/*

   342 	The final operations are the relatively rare and expensive ones (after the special

   343 	case dealt with above) of accent removal and case conversion.

   344 	*/

   345 	if ((aFlags & TChar::EFoldAccents) && (result < 0x2000))

   346 		{

   347 		/*

   348 		Throw away characters other than the first if all are accents. For the moment these

   349 		are defined as characters in the range 0x0300..0x0361. This definition may need

   350 		to be modified; or I may decide to store a flag in the decomposition table indicating

   351 		whether or not the decomposition consists of base + accent(s).

   352 		*/

   353 		TPtrC16 decomposition;

   354 		if (::DecomposeChar(iCode, decomposition))

   355 			{

   356 			TBool all_accents = TRUE;

   357 			for (TInt i = 1; all_accents && i < decomposition.Length(); ++i)

   358 				{

   359 				if (decomposition[i] < 0x0300 || decomposition[i] > 0x0361)

   360 					all_accents = FALSE;

   361 				}

   362 			if (all_accents)

   363 				result = decomposition[0];

   364 			}

   365 		}

   367 	if (aFlags & TChar::EFoldCase)

   368 		{

   369 		if (aOverridingDataSet == NULL && result < 256)

   370 			result = FoldTable[result];

   371 		else

   372 			result = TUnicode(result).GetLowerCase(aOverridingDataSet);

   373 		}

   375 	return result;

   376 	}

   378 /*

   379 Compare two Unicode strings naively by Unicode value. This is NOT the same as a comparison

   380 of null-terminated strings; the strings can contain null characters (Unicode 0x0000) and they

   381 compare greater than no character. This means that the string { 0x0001 0x0000 } always comes

   382 after the string { 0x0001 }.

   384 This function exists to make it easier to search tables of Unicode strings (like the composition

   385 buffer) using the binary chop method. It is also used by READTYPE when sorting the compose table.

   387 The return values are: 0 for equality, < 0 if aString1 < aString2, > 0 if aString1 > aString2.

   388 */

   389 TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2)

   390 	{

   391 	for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)

   392 		{

   393 		TInt x = i < aLength1 ? *aString1 : -1;

   394 		TInt y = i < aLength2 ? *aString2 : -1;

   395 		if (x != y)

   396 			return x - y;

   397 		}

   398 	return 0;

   399 	}

author	sl@SLION-WIN7.fritz.box
	Fri, 15 Jun 2012 03:10:57 +0200
changeset 0	bde4ae8d615e
permissions	-rw-r--r--