os/textandloc/charconvfw/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp
First public contribution.
2 * Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies).
4 * This component and the accompanying materials are made available
5 * under the terms of "Eclipse Public License v1.0"
6 * which accompanies this distribution, and is available
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
9 * Initial Contributors:
10 * Nokia Corporation - initial contribution.
17 // There are 2 reasons for not using the existing unicodeconv.cpp:
18 // 1) "unicode->foreign" in the existing unicodeconv.cpp is quite slow, especially
19 // for huge code pages (e.g., Asian code pages). See INC127598.
21 // 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle.
23 // The algorithm of this special version unicodeconv.cpp is straightforward:
24 // 1) foreign->unicode:
25 // 1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in
26 // "cp54936_2byte_tounicode.cpp", which is generated with command
27 // "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt".
29 // 1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then
30 // search into the mapping table in "cp54936_4byte_tounicode.cpp",
31 // which is generated with command
32 // "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt".
34 // 1.3) 4 byte->unicode non-bmp: calculate with formula in this file.
36 // 2) unicode->foreign:
37 // 2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp"
38 // can map directly, which is generated with command
39 // "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt".
41 // 2.2) unicode non-bmp->4 byte: calculate with formula in this file.
43 // The function cp54936_2byte_tounicode.cpp::TConvDataStruct::
44 // ConvertSingleUnicode() is not used anymore. It is kept only to avoid
45 // changing the tool FatConversionTable.pl.
47 // About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt":
48 // 1) All Private Use Area (PUA) code points are reserved.
49 // 2) All GB18030 code points that map to undefined Unicode are reserved.
52 // About the formula for non-bmp calculation:
53 // 1) All code points from 0x10000 to 0x10FFFF are supported.
54 // 2) The mapping for code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF is
55 // extrapolated from the GB18030 standard, since the standard does not define
56 // the mapping for code points outside 0x20000-0x2FFFF.
62 #include "unicodeconv.h"
69 E4ByteIndexOutOfRange,
73 void Panic(TFccPanic aPanic)
76 User::Panic(_L("FatCharsetConv"),aPanic);
80 //replacement character to be used when unicode cannot be converted
81 const TUint8 KForeignReplacement = 0x5F;
83 const TUint8 KU10000Byte1 = 0x90;
84 const TUint8 KU10000Byte2 = 0x30;
85 const TUint8 KU10000Byte3 = 0x81;
86 const TUint8 KU10000Byte4 = 0x30;
88 inline TBool IsSupplementary(TUint aChar)
90 @param aChar The 32-bit code point value of a Unicode character.
92 @return True, if aChar is supplementary character; false, otherwise.
95 return (aChar > 0xFFFF);
98 inline TBool IsSurrogate(TText16 aInt16)
100 @return True, if aText16 is high surrogate or low surrogate; false, otherwise.
103 return (aInt16 & 0xF800) == 0xD800;
106 inline TBool IsHighSurrogate(TText16 aInt16)
108 @return True, if aText16 is high surrogate; false, otherwise.
111 return (aInt16 & 0xFC00) == 0xD800;
114 inline TBool IsLowSurrogate(TText16 aInt16)
116 @return True, if aText16 is low surrogate; false, otherwise.
119 return (aInt16 & 0xFC00) == 0xDC00;
122 inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate)
124 Combine a high surrogate and a low surrogate into a supplementary character.
126 @return The 32-bit code point value of the generated Unicode supplementary
130 return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate;
133 inline TText16 GetHighSurrogate(TUint aChar)
135 Retrieve the high surrogate of a supplementary character.
137 @param aChar The 32-bit code point value of a Unicode character.
139 @return High surrogate of aChar, if aChar is a supplementary character;
140 aChar itself, if aChar is not a supplementary character.
143 return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10));
146 inline TText16 GetLowSurrogate(TUint aChar)
148 Retrieve the low surrogate of a supplementary character.
150 @param aChar The 32-bit code point value of a Unicode character.
152 @return Low surrogate of aChar, if aChar is a supplementary character;
153 zero, if aChar is not a supplementary character.
156 return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF));
159 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
160 EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode)
162 UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue);
165 //This function converts Unicode characters to foreign characters and adds them to a descriptor
166 EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow)
168 const TInt length = aUnicode.Length();
169 const TUint16* unicode = aUnicode.Ptr();
170 const TUint16* guard = unicode + length;
172 TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr());
173 TUint8* foreignguard = foreign + aForeign.MaxLength();
175 //loop going through the character of the unicode descriptor
176 while (unicode < guard)
178 TUint32 unicodeChar = *unicode++;
179 if (IsHighSurrogate(unicodeChar))
181 if (unicode >= guard || !IsLowSurrogate(*unicode))
183 if (foreign >= foreignguard)
185 aForeign.SetLength(foreign-aForeign.Ptr());
186 if (leaveWhenOverflow)
187 User::Leave(KErrOverflow);
191 *foreign++ = KForeignReplacement;
194 unicodeChar = JoinSurrogate(unicodeChar, *unicode++);
196 if (IsLowSurrogate(unicodeChar))
198 if (foreign >= foreignguard)
200 aForeign.SetLength(foreign-aForeign.Ptr());
201 if (leaveWhenOverflow)
202 User::Leave(KErrOverflow);
206 *foreign++ = KForeignReplacement;
210 TUint8 b1, b2, b3, b4; // byte 1,2,3,4 of result GB18030 code.
211 TInt count; // byte count of result GB18030 code; can be 1, 2 or 4.
213 // unicode to cp54936
214 if (IsSupplementary(unicodeChar))
216 unicodeChar -= 0x10000;
217 b4 = unicodeChar % 10 + KU10000Byte4;
219 b3 = unicodeChar % 126 + KU10000Byte3;
221 b2 = unicodeChar % 10 + KU10000Byte2;
222 b1 = unicodeChar / 10 + KU10000Byte1;
228 foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar];
229 b1 = ((foreignChar >> 24) & 0xFF);
230 b2 = ((foreignChar >> 16) & 0xFF);
231 b3 = ((foreignChar >> 8) & 0xFF);
232 b4 = (foreignChar & 0xFF);
240 __ASSERT_DEBUG(b2==0, Panic(EBadForeignCode));
248 if (foreign + count > foreignguard)
250 aForeign.SetLength(foreign-aForeign.Ptr());
251 if (leaveWhenOverflow)
252 User::Leave(KErrOverflow);
265 aForeign.SetLength(foreign-aForeign.Ptr());
270 //This function converts from foreign characters into unicode and adds them into a descriptor
271 EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign)
273 UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue);
276 //This function converts foreign characters to Unicode and adds them to a descriptor
277 EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow)
279 const TInt foreignLength = aForeign.Length();
280 const TUint8* foreign = aForeign.Ptr();
281 const TUint8* guard = foreign + foreignLength;
283 TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr());
284 TUint16* unicodeguard = unicode + aUnicode.MaxLength();
286 TUint8 b1, b2, b3, b4;
298 //loop going through the characters of the foreign descriptor
299 while (foreign < guard)
301 // roughly, detect which area the foreign code belongs to
305 else if (b1 == 0x80 || b1 > 0xFE)
307 else if (foreign >= guard)
312 if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F)
314 else if (b2 < 0x30 || b2 > 0x39)
316 else if (foreign+1 >= guard)
321 if (b3 < 0x81 || b3 > 0xFE)
326 if (b4 < 0x30 || b4 > 0x39)
328 else if (b1 >= 0x81 && b1 <= 0x84) // 0x81308130-0x8439FE39
329 codetype = E4ByteBmp;
330 else if (b1 >= 0x90 && b1 <= 0xE3) // 0x90308130-0xE339FE39
331 codetype = E4ByteSupplementary;
333 codetype = EError; // others are reserved
338 // cp54936 to unicode
339 if (codetype == E1Byte)
343 else if (codetype == E2Byte)
345 // conventional algorithm used in FatCharsetConv
346 const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80);
347 if (structPtr->iUnicodeIfSingle)
348 unicodeChar = structPtr->iUnicodeIfSingle;
349 else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte)
350 unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)];
352 unicodeChar = 0xFFFD;
354 else if (codetype == E4ByteBmp)
356 TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30);
357 __ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange));
358 unicodeChar = KMappingTable4ByteBmp2Unicode[index];
360 else if (codetype == E4ByteSupplementary)
362 unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 +
363 (b2 - KU10000Byte2) * 1260 +
364 (b3 - KU10000Byte3) * 10 +
366 __ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue));
370 unicodeChar = 0xFFFD;
373 // append to output buffer
374 if (IsSupplementary(unicodeChar))
376 if (unicode + 1 >= unicodeguard)
378 aUnicode.SetLength(unicode-aUnicode.Ptr());
379 if (leaveWhenOverflow)
380 User::Leave(KErrOverflow);
384 *unicode++ = GetHighSurrogate(unicodeChar);
385 *unicode++ = GetLowSurrogate(unicodeChar);
389 if (unicode >= unicodeguard)
391 aUnicode.SetLength(unicode-aUnicode.Ptr());
392 if (leaveWhenOverflow)
393 User::Leave(KErrOverflow);
397 *unicode++ = unicodeChar;
400 aUnicode.SetLength(unicode-aUnicode.Ptr());
404 EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter)
406 //1. aCharacter >= 0x0080
407 if (aCharacter>=0x0080)
409 // Since all Unicode characters can be mapped to GB18030, there is no need
410 // to test the conversion.
411 if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter))
417 // For most common cases:
418 // Note: lower case characters are considered legal DOS char here.
419 if ((aCharacter>='a' && aCharacter<='z') ||
420 (aCharacter>='A' && aCharacter<='Z') ||
421 (aCharacter>='0' && aCharacter<='9'))
425 // Checking for illegal chars:
426 // 2. aCharacter <= 0x20
427 // Note: leading 0x05 byte should be guarded by callers of this function
428 // as the information of the position of the character is required.
429 if (aCharacter < 0x20)
431 // Space (' ') is not considered as a legal DOS char here.
432 if (aCharacter == 0x20)
435 // 3. 0x20 < aCharacter < 0x80
436 // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name":
443 //case 0x2E: // '.' // Although '.' is not allowed in any bytes of DIR_Name, it
444 // is a valid character in short file names.