os/kernelhwsrv/kernel/eka/euser/unicode/unicode.cpp
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
     1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     2 // All rights reserved.
     3 // This component and the accompanying materials are made available
     4 // under the terms of the License "Eclipse Public License v1.0"
     5 // which accompanies this distribution, and is available
     6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
     7 //
     8 // Initial Contributors:
     9 // Nokia Corporation - initial contribution.
    10 //
    11 // Contributors:
    12 //
    13 // Description:
    14 // e32\euser\unicode\unicode.cpp
    15 // The implementation of the base-level Unicode character classification functions. These are members of
    16 // a class called TUnicode that contains a Unicode value.
    17 // 
    18 //
    19 
    20 #include <unicode.h>
    21 #include "CompareImp.h"
    22 
    23 static const TUnicodeData TheDefaultUnicodeData =
    24 	{ TChar::ECnCategory, TChar::EOtherNeutral, 0, 0, 0, TUnicodeData::ENonNumeric };
    25 
    26 
    27 // Declarations for tables held in unitable.cpp and used by unicode.cpp.
    28 #ifndef __KERNEL_MODE__
    29 extern const TStandardUnicodeDataSet TheStandardUnicodeDataSet[];
    30 extern const TUnicodePlane ThePlanes[17];
    31 #endif
    32 
    33 
    34 // Fill in a TChar::TCharInfo structure with category information about the character.
    35 void TUnicode::GetInfo(TChar::TCharInfo& aInfo,const TUnicodeDataSet *aOverridingDataSet) const
    36 	{
    37 	const TUnicodeData& data = GetData(aOverridingDataSet);
    38 	aInfo.iCategory = (TChar::TCategory)data.iCategory;
    39 	aInfo.iBdCategory = (TChar::TBdCategory)data.iBdCategory;
    40 	aInfo.iCombiningClass = data.iCombiningClass;
    41 	aInfo.iLowerCase = iCode;
    42 	aInfo.iUpperCase = iCode;
    43 	aInfo.iTitleCase = iCode;
    44 	if (data.iFlags & TUnicodeData::EHasLowerCase)
    45 		aInfo.iLowerCase = GetLowerCase(data);
    46 	if (data.iFlags & TUnicodeData::EHasUpperCase)
    47 		aInfo.iUpperCase = GetUpperCase(data);
    48 	if (data.iFlags & TUnicodeData::EHasTitleCase)
    49 		aInfo.iTitleCase = GetTitleCase(data);
    50 	aInfo.iMirrored = data.iFlags & TUnicodeData::EMirrored;
    51 	if (data.iFlags & TUnicodeData::ENumericFlags)
    52 		aInfo.iNumericValue = GetNumericValue(data);
    53 	else
    54 		aInfo.iNumericValue = -1;
    55 	}
    56 
    57 /*
    58 Get the data describing a character. If "aOverridingDataSet" is non-null, look in that
    59 data set before searching the standard data set.
    60 */
    61 const TUnicodeData& TUnicode::GetData(const TUnicodeDataSet *aOverridingDataSet) const
    62 	{
    63 	const TUnicodeData *result = NULL;
    64 	if (aOverridingDataSet)
    65 		result = GetDataFromDataSet(*aOverridingDataSet);
    66 	if (result == NULL)
    67 		{
    68 		if (0xFFFF >= iCode)
    69 			{
    70 			// optimize for BMP characters (plane 0)
    71 			TInt index = TheStandardUnicodeDataSet[0].iIndex1[iCode >> 4];
    72 			if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index
    73 				index &= ~0x8000;
    74 			else
    75 				index = TheStandardUnicodeDataSet[0].iIndex2[index + (iCode & 0x000F)];
    76 			return TheStandardUnicodeDataSet[0].iData[index];
    77 			}
    78 		else
    79 			{
    80 			// for non-BMP characters (plane 1-16)
    81 			TInt plane = (iCode >> 16);
    82 			if (plane > 16)
    83 				{
    84 				// for now we have no data for values above U+10FFFF
    85 				return TheDefaultUnicodeData;
    86 				}
    87 			TInt codesPerBlock = ThePlanes[plane].iCodesPerBlock;
    88 			TInt maskForCodePoint = ThePlanes[plane].iMaskForCodePoint;
    89 			
    90 			TInt low16bit = (iCode & 0xFFFF);
    91 			TInt index = TheStandardUnicodeDataSet[plane].iIndex1[low16bit >> codesPerBlock];
    92 			if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index
    93 				index &= ~0x8000;
    94 			else
    95 				index = TheStandardUnicodeDataSet[plane].iIndex2[index + (low16bit & maskForCodePoint)];
    96 			return TheStandardUnicodeDataSet[plane].iData[index];
    97 			}
    98 		}
    99 
   100 	return *result;
   101 	}
   102 
   103 /*
   104 Given a character data set, get the data referring to this character.
   105 Return NULL if no data is available in this data set.
   106 */
   107 const TUnicodeData *TUnicode::GetDataFromDataSet(const TUnicodeDataSet& aDataSet) const
   108 	{
   109 	// Perform a binary chop to find the range containing this character.
   110 	TInt n = aDataSet.iRanges;
   111 	const TUnicodeDataRange *base = aDataSet.iRange;
   112 	const TUnicodeDataRange *last = base + n - 1;
   113 	const TUnicodeDataRange *r = base;
   114 
   115 	while (n > 1)
   116 		{
   117 		TInt pivot = n / 2;
   118 		r += pivot;
   119 		if (iCode < r->iRangeStart)									// it's before this range
   120 			n = pivot;
   121 		else if (r < last && iCode >= r[1].iRangeStart)				// it's after this range
   122 			{
   123 			base = r + 1;
   124 			n -= pivot + 1;
   125 			}
   126 		else														// it's in this range
   127 			break;
   128 		r = base;
   129 		}
   130 
   131 	if (r->iIndex >= 0)
   132 		return &aDataSet.iData[r->iIndex];		// index >= 0: data available
   133 	else
   134 		return NULL;							// index < 0: no data available
   135 	}
   136 
   137 EXPORT_C TChar::TCategory TUnicode::GetCategory(const TUnicodeDataSet *aOverridingDataSet) const
   138 	{
   139 	return (TChar::TCategory)GetData(aOverridingDataSet).iCategory;
   140 	}
   141 
   142 TChar::TBdCategory TUnicode::GetBdCategory(const TUnicodeDataSet *aOverridingDataSet) const
   143 	{
   144 	return (TChar::TBdCategory)GetData(aOverridingDataSet).iBdCategory;
   145 	}
   146 
   147 TInt TUnicode::GetCombiningClass(const TUnicodeDataSet *aOverridingDataSet) const
   148 	{
   149 	return GetData(aOverridingDataSet).iCombiningClass;
   150 	}
   151 
   152 EXPORT_C TUint TUnicode::GetLowerCase(const TUnicodeDataSet *aOverridingDataSet) const
   153 	{
   154 	return GetLowerCase(GetData(aOverridingDataSet));
   155 	}
   156 
   157 EXPORT_C TUint TUnicode::GetUpperCase(const TUnicodeDataSet *aOverridingDataSet) const
   158 	{
   159 	return GetUpperCase(GetData(aOverridingDataSet));
   160 	}
   161 
   162 TUint TUnicode::GetLowerCase(const TUnicodeData& aData) const
   163 	{
   164 	if (aData.iFlags & TUnicodeData::EHasLowerCase)
   165 		return iCode + aData.iCaseOffset;
   166 	else
   167 		return iCode;
   168 	}
   169 
   170 TUint TUnicode::GetUpperCase(const TUnicodeData& aData) const
   171 	{
   172 	if (aData.iFlags & TUnicodeData::EHasUpperCase)
   173 		return iCode - aData.iCaseOffset;
   174 	else
   175 		return iCode;
   176 	}
   177 
   178 TUint TUnicode::GetTitleCase(const TUnicodeDataSet *aOverridingDataSet) const
   179 	{
   180 	return GetTitleCase(GetData(aOverridingDataSet));
   181 	}
   182 
   183 TUint TUnicode::GetTitleCase(const TUnicodeData& aData) const
   184 	{
   185 	// Handle the very few characters with distinct title case variants.
   186 	if (aData.iFlags & TUnicodeData::EHasTitleCase)
   187 		{
   188 		// If the character has no upper case variant add one to get the title case form.
   189 		if (!(aData.iFlags & TUnicodeData::EHasUpperCase))
   190 			return iCode + 1;
   191 		// If the character has no lower case variant subtract one to get the title case form.
   192 		if (!(aData.iFlags & TUnicodeData::EHasLowerCase))
   193 			return iCode - 1;
   194 		// Both upper and lower case forms exist so the character itself must be title case.
   195 		return iCode;
   196 		}
   197 
   198 	// All other characters have title case forms that are the same as their upper case forms.
   199 	return GetUpperCase(aData);
   200 	}
   201 
   202 TBool TUnicode::IsMirrored(const TUnicodeDataSet *aOverridingDataSet) const
   203 	{
   204 	return GetData(aOverridingDataSet).iFlags & TUnicodeData::EMirrored;
   205 	}
   206 
   207 TInt TUnicode::GetNumericValue(const TUnicodeDataSet *aOverridingDataSet) const
   208 	{
   209 	return GetNumericValue(GetData(aOverridingDataSet));
   210 	}
   211 
   212 /*
   213 Return the integer numeric value of this character.
   214 Return -1 if the character is not numeric, or -2 if it has a fractional value.
   215 */
   216 TInt TUnicode::GetNumericValue(const TUnicodeData& aData) const
   217 	{
   218 	switch (aData.iFlags & TUnicodeData::ENumericFlags)
   219 		{
   220 		case TUnicodeData::ENonNumeric: return -1;
   221 		case TUnicodeData::ESmallNumeric: return (iCode + aData.iDigitOffset) & 0xFF;
   222 		case TUnicodeData::EFiveHundred: return 500;
   223 		case TUnicodeData::EOneThousand: return 1000;
   224 		case TUnicodeData::EFiveThousand: return 5000;
   225 		case TUnicodeData::ETenThousand: return 10000;
   226 		case TUnicodeData::EHundredThousand: return 100000;
   227 		case TUnicodeData::EFraction: return -2;
   228 		default: return -1; // we should never come here
   229 		}
   230 	}
   231 
   232 struct TWidthInfo
   233 	{
   234 	TUint iStart;
   235 	TUint iEnd;
   236 	TChar::TCjkWidth iWidth;
   237 	};
   238 
   239 static const TWidthInfo TheWidthInfoTable[] =
   240 	{
   241 	{ 0x0020, 0x007F, TChar::ENarrow },
   242 	{ 0x00A2, 0x00A4, TChar::ENarrow },
   243 	{ 0x00A5, 0x00A7, TChar::ENarrow },
   244 	{ 0x00AF, 0x00B0, TChar::ENarrow },
   245 	{ 0x00B1, 0x1100, TChar::ENeutralWidth },
   246 	{ 0x1100, 0x1160, TChar::EWide },
   247 	{ 0x1160, 0x2E80, TChar::ENeutralWidth },
   248 	{ 0x2E80, 0xD7A4, TChar::EWide },
   249 	{ 0xF900, 0xFA2E, TChar::EWide },
   250 	{ 0xFE30, 0xFE6C, TChar::EWide },
   251 	{ 0xFF01, 0xFF5F, TChar::EFullWidth },
   252 	{ 0xFF61, 0xFFDD, TChar::EHalfWidth },
   253 	{ 0xFFE0, 0xFFE7, TChar::EFullWidth },
   254 	{ 0xFFE8, 0xFFEF, TChar::EHalfWidth },
   255 	{ 0x20000, 0x2A6DF, TChar::EWide },		// CJK Unified Ideographs Extension B
   256 	{ 0x2F800, 0x2FA1F, TChar::EWide },		// CJK Unified Ideographs Supplement
   257 	};
   258 
   259 const TInt TheWidthInfos = sizeof(TheWidthInfoTable) / sizeof(TheWidthInfoTable[0]);
   260 
   261 /*
   262 Get the notional width used by East Asian encoding systems. No check is made that the character is assigned.
   263 No separate 'ambiguous width' is returned; ambiguous characters are treated as neutral except for those
   264 in the CJK range, which are treated as wide. This is a big simplification, but the cost of an exhaustive table
   265 is too great to justify at the moment.
   266 */
   267 TChar::TCjkWidth TUnicode::GetCjkWidth() const
   268 	{
   269 	const TWidthInfo* w = TheWidthInfoTable;
   270 	for (TInt i = 0; i < TheWidthInfos; i++, w++)
   271 		if (iCode >= w->iStart && iCode < w->iEnd)
   272 			return w->iWidth;
   273 	return TChar::ENeutralWidth;
   274 	}
   275 
   276 /*
   277 Convert a Unicode character into a form most likely to be equal to another character, while
   278 still preserving the essential meaning of the character. Possible folding operations include
   279 converting to lower case (TChar::EFoldCase), stripping accents (TChar::EFoldAccents) and others.
   280 The flag value has a default, TChar::EFoldStandard, which performs the folding operations done
   281 by calling Fold functions with no flags argument, and there is also TChar::EFoldAll,
   282 which performs all possible folding operations.
   283 
   284 Note that the difference between folding and collation is that folding is
   285 	*	character-based
   286 	*	biased towards yielding equality where possible
   287 while collation is
   288 	*	string-based
   289 	*	designed to yield a non-equal ordering
   290 
   291 Typically, folding will be used when searching for a match, while collation will be used when
   292 sorting a list.
   293 */
   294 EXPORT_C TUint TUnicode::Fold(TInt aFlags,const TUnicodeDataSet *aOverridingDataSet) const
   295 	{
   296 	TUint result = iCode;
   297 
   298 	/*
   299 	Fold CJK width variants. This only applies to characters 0xFF00 and above so we can use
   300 	a built-in table.
   301 	*/
   302 	if (result >= 0xFF00 && (aFlags & TChar::EFoldWidth))
   303 		result = CjkWidthFoldTable[result & 0xFF];
   304 
   305 	/*
   306 	If the character is <= 0x00FF and the flags include folding case and stripping accents,
   307 	and there is no overriding character data, we can use the built-in fold table.
   308 	*/
   309 	const TUnicodeData* data = NULL;
   310 	if (aOverridingDataSet)
   311 		data = GetDataFromDataSet(*aOverridingDataSet);
   312 	if (data == NULL && result < 256 &&
   313 		(aFlags & (TChar::EFoldCase | TChar::EFoldAccents)) == (TChar::EFoldCase | TChar::EFoldAccents))
   314 		return FoldTable[result];
   315 
   316 	/*
   317 	Other characters have to be dealt with laboriously.
   318 	The first operations are those that, if successful, tell us that nothing more
   319 	need be done. If a value is folded to a space or a digit or converted to Katakana
   320 	it cannot have anything else done to it.
   321 	*/
   322 	if (aFlags & TChar::EFoldKana)
   323 		{
   324 		if ((result >= 0x3041 && result <= 0x3094) || result == 0x309D || result == 0x309E)
   325 			return result += 0x0060;
   326 		}
   327 	if (data == NULL)
   328 		data = &GetData(NULL);
   329 	if (aFlags & TChar::EFoldSpaces)
   330 		{
   331 		if (data->iCategory == TChar::EZsCategory)
   332 			return 0x0020;
   333 		}
   334 	if (aFlags & TChar::EFoldDigits)
   335 		{
   336 		TInt n = GetNumericValue(*data);
   337 		if (n >= 0 && n <= 9)
   338 			return 0x0030 + n;
   339 		}
   340 
   341 	/*
   342 	The final operations are the relatively rare and expensive ones (after the special
   343 	case dealt with above) of accent removal and case conversion.
   344 	*/
   345 	if ((aFlags & TChar::EFoldAccents) && (result < 0x2000))
   346 		{
   347 		/*
   348 		Throw away characters other than the first if all are accents. For the moment these
   349 		are defined as characters in the range 0x0300..0x0361. This definition may need
   350 		to be modified; or I may decide to store a flag in the decomposition table indicating
   351 		whether or not the decomposition consists of base + accent(s).
   352 		*/
   353 		TPtrC16 decomposition;
   354 		if (::DecomposeChar(iCode, decomposition))
   355 			{
   356 			TBool all_accents = TRUE;			
   357 			for (TInt i = 1; all_accents && i < decomposition.Length(); ++i)
   358 				{
   359 				if (decomposition[i] < 0x0300 || decomposition[i] > 0x0361)
   360 					all_accents = FALSE;
   361 				}
   362 			if (all_accents)
   363 				result = decomposition[0];
   364 			}
   365 		}
   366 
   367 	if (aFlags & TChar::EFoldCase)
   368 		{
   369 		if (aOverridingDataSet == NULL && result < 256)
   370 			result = FoldTable[result];
   371 		else
   372 			result = TUnicode(result).GetLowerCase(aOverridingDataSet);
   373 		}
   374 	
   375 	return result;
   376 	}
   377 
   378 /*
   379 Compare two Unicode strings naively by Unicode value. This is NOT the same as a comparison
   380 of null-terminated strings; the strings can contain null characters (Unicode 0x0000) and they
   381 compare greater than no character. This means that the string { 0x0001 0x0000 } always comes
   382 after the string { 0x0001 }.
   383 
   384 This function exists to make it easier to search tables of Unicode strings (like the composition
   385 buffer) using the binary chop method. It is also used by READTYPE when sorting the compose table.
   386 
   387 The return values are: 0 for equality, < 0 if aString1 < aString2, > 0 if aString1 > aString2.
   388 */
   389 TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2)
   390 	{
   391 	for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
   392 		{
   393 		TInt x = i < aLength1 ? *aString1 : -1;
   394 		TInt y = i < aLength2 ? *aString2 : -1;
   395 		if (x != y)
   396 			return x - y;
   397 		}
   398 	return 0;
   399 	}
   400