First public contribution.
1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
2 // All rights reserved.
3 // This component and the accompanying materials are made available
4 // under the terms of the License "Eclipse Public License v1.0"
5 // which accompanies this distribution, and is available
6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
8 // Initial Contributors:
9 // Nokia Corporation - initial contribution.
14 // e32\euser\unicode\unicode.cpp
15 // The implementation of the base-level Unicode character classification functions. These are members of
16 // a class called TUnicode that contains a Unicode value.
21 #include "CompareImp.h"
// Fallback character record: category ECnCategory, bidirectional category EOtherNeutral,
// flags ENonNumeric. GetData returns it where the standard tables hold no data for the
// code point.
23 static const TUnicodeData TheDefaultUnicodeData =
24 { TChar::ECnCategory, TChar::EOtherNeutral, 0, 0, 0, TUnicodeData::ENonNumeric };
27 // Declarations for tables held in unitable.cpp and used by unicode.cpp.
// Both tables are indexed by plane number (iCode >> 16) in TUnicode::GetData; ThePlanes
// has 17 entries. The declarations are compiled only when __KERNEL_MODE__ is not defined.
28 #ifndef __KERNEL_MODE__
29 extern const TStandardUnicodeDataSet TheStandardUnicodeDataSet[];
30 extern const TUnicodePlane ThePlanes[17];
34 // Fill in a TChar::TCharInfo structure with category information about the character.
// aInfo receives the category, bidirectional category, combining class, the three case
// forms, the mirrored flag and the numeric value. aOverridingDataSet is forwarded to
// GetData to select the character's data record.
35 void TUnicode::GetInfo(TChar::TCharInfo& aInfo,const TUnicodeDataSet *aOverridingDataSet) const
37 const TUnicodeData& data = GetData(aOverridingDataSet);
38 aInfo.iCategory = (TChar::TCategory)data.iCategory;
39 aInfo.iBdCategory = (TChar::TBdCategory)data.iBdCategory;
40 aInfo.iCombiningClass = data.iCombiningClass;
// Each case form defaults to the character itself and is replaced only when the
// corresponding flag is set in the data record.
41 aInfo.iLowerCase = iCode;
42 aInfo.iUpperCase = iCode;
43 aInfo.iTitleCase = iCode;
44 if (data.iFlags & TUnicodeData::EHasLowerCase)
45 aInfo.iLowerCase = GetLowerCase(data);
46 if (data.iFlags & TUnicodeData::EHasUpperCase)
47 aInfo.iUpperCase = GetUpperCase(data);
48 if (data.iFlags & TUnicodeData::EHasTitleCase)
49 aInfo.iTitleCase = GetTitleCase(data);
// Stores the masked flag bits as they are, not a normalised 0/1 value.
50 aInfo.iMirrored = data.iFlags & TUnicodeData::EMirrored;
51 if (data.iFlags & TUnicodeData::ENumericFlags)
52 aInfo.iNumericValue = GetNumericValue(data);
// NOTE(review): the line between these two assignments is not visible in this listing;
// presumably the -1 below is the else branch, taken when no numeric flag bits are set.
54 aInfo.iNumericValue = -1;
58 Get the data describing a character. If "aOverridingDataSet" is non-null, look in that
59 data set before searching the standard data set.
// Returns a reference to a record in the overriding data set, in the standard tables, or
// TheDefaultUnicodeData when the standard tables have no data for the code point.
61 const TUnicodeData& TUnicode::GetData(const TUnicodeDataSet *aOverridingDataSet) const
63 const TUnicodeData *result = NULL;
64 if (aOverridingDataSet)
65 result = GetDataFromDataSet(*aOverridingDataSet);
// NOTE(review): the statements that return 'result' when it is non-null and that select
// between the plane-0 and non-plane-0 paths are not visible in this listing.
70 // optimize for BMP characters (plane 0)
// Plane 0 uses blocks of 16 codes: iIndex1 is indexed by iCode >> 4 and the low four bits
// of iCode select the entry inside the block via iIndex2.
71 TInt index = TheStandardUnicodeDataSet[0].iIndex1[iCode >> 4];
72 if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index
75 index = TheStandardUnicodeDataSet[0].iIndex2[index + (iCode & 0x000F)];
76 return TheStandardUnicodeDataSet[0].iData[index];
80 // for non-BMP characters (plane 1-16)
81 TInt plane = (iCode >> 16);
84 // for now we have no data for values above U+10FFFF
85 return TheDefaultUnicodeData;
// Other planes take their block geometry from ThePlanes[plane].
// NOTE(review): despite its name, iCodesPerBlock is used below as a shift count applied to
// the low 16 bits of the code, not as a number of codes.
87 TInt codesPerBlock = ThePlanes[plane].iCodesPerBlock;
88 TInt maskForCodePoint = ThePlanes[plane].iMaskForCodePoint;
90 TInt low16bit = (iCode & 0xFFFF);
91 TInt index = TheStandardUnicodeDataSet[plane].iIndex1[low16bit >> codesPerBlock];
92 if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index
95 index = TheStandardUnicodeDataSet[plane].iIndex2[index + (low16bit & maskForCodePoint)];
96 return TheStandardUnicodeDataSet[plane].iData[index];
104 Given a character data set, get the data referring to this character.
105 Return NULL if no data is available in this data set.
107 const TUnicodeData *TUnicode::GetDataFromDataSet(const TUnicodeDataSet& aDataSet) const
109 // Perform a binary chop to find the range containing this character.
// aDataSet.iRange holds aDataSet.iRanges entries. A range covers the codes from its own
// iRangeStart up to, but not including, the next entry's iRangeStart; the final entry has
// no upper bound tested here.
110 TInt n = aDataSet.iRanges;
111 const TUnicodeDataRange *base = aDataSet.iRange;
112 const TUnicodeDataRange *last = base + n - 1;
113 const TUnicodeDataRange *r = base;
// NOTE(review): the loop header and the statements that move 'r' in each branch are not
// visible in this listing, so termination and empty-set handling cannot be confirmed here.
119 if (iCode < r->iRangeStart) // it's before this range
121 else if (r < last && iCode >= r[1].iRangeStart) // it's after this range
126 else // it's in this range
// The range's iIndex selects the record in aDataSet.iData; a negative index marks a range
// with no data.
132 return &aDataSet.iData[r->iIndex]; // index >= 0: data available
134 return NULL; // index < 0: no data available
// Exported: return the iCategory field of this character's data record as a
// TChar::TCategory. aOverridingDataSet is forwarded to GetData.
137 EXPORT_C TChar::TCategory TUnicode::GetCategory(const TUnicodeDataSet *aOverridingDataSet) const
139 return (TChar::TCategory)GetData(aOverridingDataSet).iCategory;
// Return the iBdCategory field of this character's data record as a TChar::TBdCategory.
// aOverridingDataSet is forwarded to GetData.
142 TChar::TBdCategory TUnicode::GetBdCategory(const TUnicodeDataSet *aOverridingDataSet) const
144 return (TChar::TBdCategory)GetData(aOverridingDataSet).iBdCategory;
// Return the iCombiningClass field of this character's data record.
// aOverridingDataSet is forwarded to GetData.
147 TInt TUnicode::GetCombiningClass(const TUnicodeDataSet *aOverridingDataSet) const
149 return GetData(aOverridingDataSet).iCombiningClass;
// Exported: look up this character's data record (aOverridingDataSet is forwarded to
// GetData) and return the lower case form computed from it.
152 EXPORT_C TUint TUnicode::GetLowerCase(const TUnicodeDataSet *aOverridingDataSet) const
154 return GetLowerCase(GetData(aOverridingDataSet));
// Exported: look up this character's data record (aOverridingDataSet is forwarded to
// GetData) and return the upper case form computed from it.
157 EXPORT_C TUint TUnicode::GetUpperCase(const TUnicodeDataSet *aOverridingDataSet) const
159 return GetUpperCase(GetData(aOverridingDataSet));
// Compute the lower case form from an already fetched data record: when the record has
// the EHasLowerCase flag the result is iCode plus the record's iCaseOffset.
// NOTE(review): the fall-through return is not visible in this listing; presumably the
// character is returned unchanged when the flag is clear.
162 TUint TUnicode::GetLowerCase(const TUnicodeData& aData) const
164 if (aData.iFlags & TUnicodeData::EHasLowerCase)
165 return iCode + aData.iCaseOffset;
// Compute the upper case form from an already fetched data record: when the record has
// the EHasUpperCase flag the result is iCode minus the record's iCaseOffset (the same
// offset field that GetLowerCase adds).
// NOTE(review): the fall-through return is not visible in this listing; presumably the
// character is returned unchanged when the flag is clear.
170 TUint TUnicode::GetUpperCase(const TUnicodeData& aData) const
172 if (aData.iFlags & TUnicodeData::EHasUpperCase)
173 return iCode - aData.iCaseOffset;
// Look up this character's data record (aOverridingDataSet is forwarded to GetData) and
// return the title case form computed from it.
178 TUint TUnicode::GetTitleCase(const TUnicodeDataSet *aOverridingDataSet) const
180 return GetTitleCase(GetData(aOverridingDataSet));
// Compute the title case form from an already fetched data record. Records flagged
// EHasTitleCase are handled by the three cases commented below; every other character
// uses its upper case form.
// NOTE(review): the return statements for the three EHasTitleCase cases are not visible in
// this listing; the existing comments describe them as iCode + 1, iCode - 1 and iCode.
183 TUint TUnicode::GetTitleCase(const TUnicodeData& aData) const
185 // Handle the very few characters with distinct title case variants.
186 if (aData.iFlags & TUnicodeData::EHasTitleCase)
188 // If the character has no upper case variant add one to get the title case form.
189 if (!(aData.iFlags & TUnicodeData::EHasUpperCase))
191 // If the character has no lower case variant subtract one to get the title case form.
192 if (!(aData.iFlags & TUnicodeData::EHasLowerCase))
194 // Both upper and lower case forms exist so the character itself must be title case.
198 // All other characters have title case forms that are the same as their upper case forms.
199 return GetUpperCase(aData);
// Report whether the EMirrored flag is set in this character's data record.
// The masked flag bits are returned as they are, so a true result is non-zero but not
// necessarily 1. aOverridingDataSet is forwarded to GetData.
202 TBool TUnicode::IsMirrored(const TUnicodeDataSet *aOverridingDataSet) const
204 return GetData(aOverridingDataSet).iFlags & TUnicodeData::EMirrored;
// Look up this character's data record (aOverridingDataSet is forwarded to GetData) and
// return the numeric value computed from it.
207 TInt TUnicode::GetNumericValue(const TUnicodeDataSet *aOverridingDataSet) const
209 return GetNumericValue(GetData(aOverridingDataSet));
213 Return the integer numeric value of this character.
214 Return -1 if the character is not numeric, or -2 if it has a fractional value.
216 TInt TUnicode::GetNumericValue(const TUnicodeData& aData) const
// The numeric kind is the ENumericFlags portion of the record's iFlags.
218 switch (aData.iFlags & TUnicodeData::ENumericFlags)
220 case TUnicodeData::ENonNumeric: return -1;
// Small values are derived from the code itself: code plus the record's iDigitOffset,
// truncated to the low eight bits (so the result lies in 0..255).
221 case TUnicodeData::ESmallNumeric: return (iCode + aData.iDigitOffset) & 0xFF;
// Larger values have one dedicated flag value each.
222 case TUnicodeData::EFiveHundred: return 500;
223 case TUnicodeData::EOneThousand: return 1000;
224 case TUnicodeData::EFiveThousand: return 5000;
225 case TUnicodeData::ETenThousand: return 10000;
226 case TUnicodeData::EHundredThousand: return 100000;
227 case TUnicodeData::EFraction: return -2;
228 default: return -1; // we should never come here
// Width class reported by GetCjkWidth for codes inside the entry's range.
// NOTE(review): the rest of this declaration (the range fields read as iStart and iEnd in
// GetCjkWidth) is not visible in this listing.
236 TChar::TCjkWidth iWidth;
// Code ranges with their East Asian width class, in ascending code order.
// GetCjkWidth scans the entries in order and tests start <= code < end, so the second
// value of each entry is an exclusive upper bound. Codes that match no entry are treated
// as neutral width by GetCjkWidth.
239 static const TWidthInfo TheWidthInfoTable[] =
241 { 0x0020, 0x007F, TChar::ENarrow },
242 { 0x00A2, 0x00A4, TChar::ENarrow },
243 { 0x00A5, 0x00A7, TChar::ENarrow },
244 { 0x00AF, 0x00B0, TChar::ENarrow },
245 { 0x00B1, 0x1100, TChar::ENeutralWidth },
246 { 0x1100, 0x1160, TChar::EWide },
247 { 0x1160, 0x2E80, TChar::ENeutralWidth },
248 { 0x2E80, 0xD7A4, TChar::EWide },
249 { 0xF900, 0xFA2E, TChar::EWide },
250 { 0xFE30, 0xFE6C, TChar::EWide },
251 { 0xFF01, 0xFF5F, TChar::EFullWidth },
252 { 0xFF61, 0xFFDD, TChar::EHalfWidth },
253 { 0xFFE0, 0xFFE7, TChar::EFullWidth },
254 { 0xFFE8, 0xFFEF, TChar::EHalfWidth },
// NOTE(review): with the exclusive upper bound, 0x2A6DF and 0x2FA1F themselves do not match
// the two entries below - confirm that this is intended.
255 { 0x20000, 0x2A6DF, TChar::EWide }, // CJK Unified Ideographs Extension B
256 { 0x2F800, 0x2FA1F, TChar::EWide }, // CJK Compatibility Ideographs Supplement
// Number of entries in TheWidthInfoTable, computed from the array size at compile time.
259 const TInt TheWidthInfos = sizeof(TheWidthInfoTable) / sizeof(TheWidthInfoTable[0]);
262 Get the notional width used by East Asian encoding systems. No check is made that the character is assigned.
263 No separate 'ambiguous width' is returned; ambiguous characters are treated as neutral except for those
264 in the CJK range, which are treated as wide. This is a big simplification, but the cost of an exhaustive table
265 is too great to justify at the moment.
267 TChar::TCjkWidth TUnicode::GetCjkWidth() const
// Linear scan of TheWidthInfoTable; a code matches an entry when iStart <= iCode < iEnd.
269 const TWidthInfo* w = TheWidthInfoTable;
270 for (TInt i = 0; i < TheWidthInfos; i++, w++)
271 if (iCode >= w->iStart && iCode < w->iEnd)
// NOTE(review): the statement executed on a match is not visible in this listing;
// presumably it returns the matching entry's iWidth.
// No entry matched: treat the character as neutral width.
273 return TChar::ENeutralWidth;
277 Convert a Unicode character into a form most likely to be equal to another character, while
278 still preserving the essential meaning of the character. Possible folding operations include
279 converting to lower case (TChar::EFoldCase), stripping accents (TChar::EFoldAccents) and others.
280 The flag value has a default, TChar::EFoldStandard, which performs the folding operations done
281 by calling Fold functions with no flags argument, and there is also TChar::EFoldAll,
282 which performs all possible folding operations.
284 Note that the difference between folding and collation is that folding is
286 * biased towards yielding equality where possible
289 * designed to yield a non-equal ordering
291 Typically, folding will be used when searching for a match, while collation will be used when
// aFlags is a combination of TChar::EFold... bits selecting the folds to apply.
// aOverridingDataSet, if non-null, is consulted for character data and case conversion.
// The folded code is built up in 'result', starting from iCode.
294 EXPORT_C TUint TUnicode::Fold(TInt aFlags,const TUnicodeDataSet *aOverridingDataSet) const
296 TUint result = iCode;
299 Fold CJK width variants. This only applies to characters 0xFF00 and above so we can use
// CjkWidthFoldTable is indexed by the low byte of the code.
// NOTE(review): the test below has no upper bound, so codes above 0xFFFF also take this
// path and are replaced by a table entry - confirm that callers never pass such codes.
302 if (result >= 0xFF00 && (aFlags & TChar::EFoldWidth))
303 result = CjkWidthFoldTable[result & 0xFF];
306 If the character is <= 0x00FF and the flags include folding case and stripping accents,
307 and there is no overriding character data, we can use the built-in fold table.
309 const TUnicodeData* data = NULL;
310 if (aOverridingDataSet)
311 data = GetDataFromDataSet(*aOverridingDataSet);
// Fast path: FoldTable gives the final answer directly, so return immediately.
312 if (data == NULL && result < 256 &&
313 (aFlags & (TChar::EFoldCase | TChar::EFoldAccents)) == (TChar::EFoldCase | TChar::EFoldAccents))
314 return FoldTable[result];
317 Other characters have to be dealt with laboriously.
318 The first operations are those that, if successful, tell us that nothing more
319 need be done. If a value is folded to a space or a digit or converted to Katakana
320 it cannot have anything else done to it.
322 if (aFlags & TChar::EFoldKana)
// Codes 0x3041..0x3094, 0x309D and 0x309E are shifted up by 0x60 and returned at once.
324 if ((result >= 0x3041 && result <= 0x3094) || result == 0x309D || result == 0x309E)
325 return result += 0x0060;
// NOTE(review): the line before this assignment is not visible in this listing; presumably
// the standard data is fetched only when the overriding data set supplied no record.
328 data = &GetData(NULL);
329 if (aFlags & TChar::EFoldSpaces)
// NOTE(review): the action taken for EZsCategory characters is not visible in this listing.
331 if (data->iCategory == TChar::EZsCategory)
334 if (aFlags & TChar::EFoldDigits)
336 TInt n = GetNumericValue(*data);
// NOTE(review): the action taken for numeric values 0..9 is not visible in this listing.
337 if (n >= 0 && n <= 9)
342 The final operations are the relatively rare and expensive ones (after the special
343 case dealt with above) of accent removal and case conversion.
// Accent stripping is attempted only while the folded code is below 0x2000.
345 if ((aFlags & TChar::EFoldAccents) && (result < 0x2000))
348 Throw away characters other than the first if all are accents. For the moment these
349 are defined as characters in the range 0x0300..0x0361. This definition may need
350 to be modified; or I may decide to store a flag in the decomposition table indicating
351 whether or not the decomposition consists of base + accent(s).
353 TPtrC16 decomposition;
// NOTE(review): the decomposition is requested for iCode, not for 'result', so a code
// already changed by the width fold above is decomposed in its original form - confirm.
354 if (::DecomposeChar(iCode, decomposition))
356 TBool all_accents = TRUE;
// Check every character after the first; the loop stops at the first non-accent.
357 for (TInt i = 1; all_accents && i < decomposition.Length(); ++i)
359 if (decomposition[i] < 0x0300 || decomposition[i] > 0x0361)
// NOTE(review): the condition guarding this assignment is not visible in this listing;
// presumably the base character is kept only when all_accents is still true.
363 result = decomposition[0];
367 if (aFlags & TChar::EFoldCase)
// Codes below 256 with no overriding data use FoldTable; otherwise the lower case form is
// taken from the character data.
369 if (aOverridingDataSet == NULL && result < 256)
370 result = FoldTable[result];
372 result = TUnicode(result).GetLowerCase(aOverridingDataSet);
379 Compare two Unicode strings naively by Unicode value. This is NOT the same as a comparison
380 of null-terminated strings; the strings can contain null characters (Unicode 0x0000) and they
381 compare greater than no character. This means that the string { 0x0001 0x0000 } always comes
382 after the string { 0x0001 }.
384 This function exists to make it easier to search tables of Unicode strings (like the composition
385 buffer) using the binary chop method. It is also used by READTYPE when sorting the compose table.
387 The return values are: 0 for equality, < 0 if aString1 < aString2, > 0 if aString1 > aString2.
389 TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2)
391 for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
393 TInt x = i < aLength1 ? *aString1 : -1;
394 TInt y = i < aLength2 ? *aString2 : -1;