sl@0: /* sl@0: * Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). sl@0: * All rights reserved. sl@0: * This component and the accompanying materials are made available sl@0: * under the terms of "Eclipse Public License v1.0" sl@0: * which accompanies this distribution, and is available sl@0: * at the URL "http://www.eclipse.org/legal/epl-v10.html". sl@0: * sl@0: * Initial Contributors: sl@0: * Nokia Corporation - initial contribution. sl@0: * sl@0: * Contributors: sl@0: * sl@0: * Description: sl@0: * sl@0: */ sl@0: // There are 2 reasons why not use existing unicodeconv.cpp: sl@0: // 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially sl@0: // for huge code pages (e.g, Asia code pages). See INC127598. sl@0: // sl@0: // 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle. sl@0: // sl@0: // The algorithm of this special version unicodeconv.cpp is straightforward: sl@0: // 1) foreign->unicode: sl@0: // 1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in sl@0: // "cp54936_2byte_tounicode.cpp", which is generated with command sl@0: // "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt". sl@0: // sl@0: // 1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then sl@0: // search into the mapping table in "cp54936_4byte_tounicode.cpp", sl@0: // which is generated with command sl@0: // "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt". sl@0: // sl@0: // 1.3) 4 byte->unicode non-bmp: calculate with formula in this file. sl@0: // sl@0: // 2) unicode->foreign: sl@0: // 2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp" sl@0: // can map directly, which is generated with command sl@0: // "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt". sl@0: // sl@0: // 2.2) unicode non-bmp->4 byte: calculate with formula in this file. sl@0: // sl@0: // The function cp54936_2byte_tounicode.cpp::TConvDataStruct:: sl@0: // ConvertSingleUnicode() is not used anymore. It's reserved just because not sl@0: // changing the tool FatConversionTable.pl. sl@0: // sl@0: // About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt": sl@0: // 1) All Private Used Area (PUA) code points are reserved. sl@0: // 2) All GB18030 code points that mapping to undefined Unicode are reserved. sl@0: // sl@0: // sl@0: // About the formula for non-bmp calculation: sl@0: // 1) All code points from 0x10000 to 0x10FFFF are supported. sl@0: // 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are summarized from sl@0: // the GB18030 standard, since the standard does not define the mapping for sl@0: // code points out of 0x20000-0x2FFFF. sl@0: sl@0: sl@0: #include sl@0: #include sl@0: #include sl@0: #include "unicodeconv.h" sl@0: #include "cp54936.h" sl@0: sl@0: sl@0: enum TFccPanic sl@0: { sl@0: EBadForeignCode = 0, sl@0: E4ByteIndexOutOfRange, sl@0: EPanicBadIndices1, sl@0: EInavlidUnicodeValue sl@0: }; sl@0: void Panic(TFccPanic aPanic) sl@0: { sl@0: sl@0: User::Panic(_L("FatCharsetConv"),aPanic); sl@0: } sl@0: sl@0: sl@0: //replacement character to be used when unicode cannot be converted sl@0: const TUint8 KForeignReplacement = 0x5F; sl@0: sl@0: const TUint8 KU10000Byte1 = 0x90; sl@0: const TUint8 KU10000Byte2 = 0x30; sl@0: const TUint8 KU10000Byte3 = 0x81; sl@0: const TUint8 KU10000Byte4 = 0x30; sl@0: sl@0: inline TBool IsSupplementary(TUint aChar) sl@0: /** sl@0: @param aChar The 32-bit code point value of a Unicode character. sl@0: sl@0: @return True, if aChar is supplementary character; false, otherwise. sl@0: */ sl@0: { sl@0: return (aChar > 0xFFFF); sl@0: } sl@0: sl@0: inline TBool IsSurrogate(TText16 aInt16) sl@0: /** sl@0: @return True, if aText16 is high surrogate or low surrogate; false, otherwise. sl@0: */ sl@0: { sl@0: return (aInt16 & 0xF800) == 0xD800; sl@0: } sl@0: sl@0: inline TBool IsHighSurrogate(TText16 aInt16) sl@0: /** sl@0: @return True, if aText16 is high surrogate; false, otherwise. sl@0: */ sl@0: { sl@0: return (aInt16 & 0xFC00) == 0xD800; sl@0: } sl@0: sl@0: inline TBool IsLowSurrogate(TText16 aInt16) sl@0: /** sl@0: @return True, if aText16 is low surrogate; false, otherwise. sl@0: */ sl@0: { sl@0: return (aInt16 & 0xFC00) == 0xDC00; sl@0: } sl@0: sl@0: inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate) sl@0: /** sl@0: Combine a high surrogate and a low surrogate into a supplementary character. sl@0: sl@0: @return The 32-bit code point value of the generated Unicode supplementary sl@0: character. sl@0: */ sl@0: { sl@0: return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate; sl@0: } sl@0: sl@0: inline TText16 GetHighSurrogate(TUint aChar) sl@0: /** sl@0: Retrieve the high surrogate of a supplementary character. sl@0: sl@0: @param aChar The 32-bit code point value of a Unicode character. sl@0: sl@0: @return High surrogate of aChar, if aChar is a supplementary character; sl@0: aChar itself, if aChar is not a supplementary character. sl@0: */ sl@0: { sl@0: return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10)); sl@0: } sl@0: sl@0: inline TText16 GetLowSurrogate(TUint aChar) sl@0: /** sl@0: Retrieve the low surrogate of a supplementary character. sl@0: sl@0: @param aChar The 32-bit code point value of a Unicode character. sl@0: sl@0: @return Low surrogate of aChar, if aChar is a supplementary character; sl@0: zero, if aChar is not a supplementary character. sl@0: */ sl@0: { sl@0: return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF)); sl@0: } sl@0: sl@0: //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor sl@0: EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode) sl@0: { sl@0: UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue); sl@0: } sl@0: sl@0: //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor sl@0: EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow) sl@0: { sl@0: const TInt length = aUnicode.Length(); sl@0: const TUint16* unicode = aUnicode.Ptr(); sl@0: const TUint16* guard = unicode + length; sl@0: sl@0: TUint8* foreign = const_cast(aForeign.Ptr()); sl@0: TUint8* foreignguard = foreign + aForeign.MaxLength(); sl@0: sl@0: //loop going through the character of the unicode descriptor sl@0: while (unicode < guard) sl@0: { sl@0: TUint32 unicodeChar = *unicode++; sl@0: if (IsHighSurrogate(unicodeChar)) sl@0: { sl@0: if (unicode >= guard || !IsLowSurrogate(*unicode)) sl@0: { sl@0: if (foreign >= foreignguard) sl@0: { sl@0: aForeign.SetLength(foreign-aForeign.Ptr()); sl@0: if (leaveWhenOverflow) sl@0: User::Leave(KErrOverflow); sl@0: else sl@0: return KErrOverflow; sl@0: } sl@0: *foreign++ = KForeignReplacement; sl@0: continue; sl@0: } sl@0: unicodeChar = JoinSurrogate(unicodeChar, *unicode++); sl@0: } sl@0: if (IsLowSurrogate(unicodeChar)) sl@0: { sl@0: if (foreign >= foreignguard) sl@0: { sl@0: aForeign.SetLength(foreign-aForeign.Ptr()); sl@0: if (leaveWhenOverflow) sl@0: User::Leave(KErrOverflow); sl@0: else sl@0: return KErrOverflow; sl@0: } sl@0: *foreign++ = KForeignReplacement; sl@0: continue; sl@0: } sl@0: sl@0: TUint8 b1, b2, b3, b4; // byte 1,2,3,4 of result GB18030 code. sl@0: TInt count; // byte count of result GB18030 code; can be 1, 2 or 4. sl@0: sl@0: // unicode to cp54936 sl@0: if (IsSupplementary(unicodeChar)) sl@0: { sl@0: unicodeChar -= 0x10000; sl@0: b4 = unicodeChar % 10 + KU10000Byte4; sl@0: unicodeChar /= 10; sl@0: b3 = unicodeChar % 126 + KU10000Byte3; sl@0: unicodeChar /= 126; sl@0: b2 = unicodeChar % 10 + KU10000Byte2; sl@0: b1 = unicodeChar / 10 + KU10000Byte1; sl@0: count = 4; sl@0: } sl@0: else sl@0: { sl@0: TUint32 foreignChar; sl@0: foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar]; sl@0: b1 = ((foreignChar >> 24) & 0xFF); sl@0: b2 = ((foreignChar >> 16) & 0xFF); sl@0: b3 = ((foreignChar >> 8) & 0xFF); sl@0: b4 = (foreignChar & 0xFF); sl@0: count = 1; sl@0: if (b1) sl@0: { sl@0: count = 4; sl@0: } sl@0: else sl@0: { sl@0: __ASSERT_DEBUG(b2==0, Panic(EBadForeignCode)); sl@0: if (b3) sl@0: { sl@0: count = 2; sl@0: } sl@0: } sl@0: } sl@0: sl@0: if (foreign + count > foreignguard) sl@0: { sl@0: aForeign.SetLength(foreign-aForeign.Ptr()); sl@0: if (leaveWhenOverflow) sl@0: User::Leave(KErrOverflow); sl@0: else sl@0: return KErrOverflow; sl@0: } sl@0: if (count == 4) sl@0: { sl@0: *foreign++ = b1; sl@0: *foreign++ = b2; sl@0: } sl@0: if (count >= 2) sl@0: *foreign++ = b3; sl@0: *foreign++ = b4; sl@0: } sl@0: aForeign.SetLength(foreign-aForeign.Ptr()); sl@0: return KErrNone; sl@0: } sl@0: sl@0: sl@0: //This function converts from foreign characters into unicode and adds them into a descriptor sl@0: EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign) sl@0: { sl@0: UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue); sl@0: } sl@0: sl@0: //This function converts from foreign characters into unicode and adds them into a descriptor sl@0: EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow) sl@0: { sl@0: const TInt foreignLength = aForeign.Length(); sl@0: const TUint8* foreign = aForeign.Ptr(); sl@0: const TUint8* guard = foreign + foreignLength; sl@0: sl@0: TUint16* unicode = const_cast(aUnicode.Ptr()); sl@0: TUint16* unicodeguard = unicode + aUnicode.MaxLength(); sl@0: sl@0: TUint8 b1, b2, b3, b4; sl@0: enum TCodeType sl@0: { sl@0: E1Byte = 0, sl@0: E2Byte, sl@0: E4ByteBmp, sl@0: E4ByteSupplementary, sl@0: EError, sl@0: }; sl@0: TCodeType codetype; sl@0: TUint32 unicodeChar; sl@0: sl@0: //loop going through the characters of the foreign descriptor sl@0: while (foreign < guard) sl@0: { sl@0: // roughly, detect which area the foreign code belongs to sl@0: b1 = *foreign++; sl@0: if (b1 <= 0x7F) sl@0: codetype = E1Byte; sl@0: else if (b1 == 0x80 || b1 > 0xFE) sl@0: codetype = EError; sl@0: else if (foreign >= guard) sl@0: codetype = EError; sl@0: else sl@0: { sl@0: b2 = *foreign++; sl@0: if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F) sl@0: codetype = E2Byte; sl@0: else if (b2 < 0x30 || b2 > 0x39) sl@0: codetype = EError; sl@0: else if (foreign+1 >= guard) sl@0: codetype = EError; sl@0: else sl@0: { sl@0: b3 = *foreign++; sl@0: if (b3 < 0x81 || b3 > 0xFE) sl@0: codetype = EError; sl@0: else sl@0: { sl@0: b4 = *foreign++; sl@0: if (b4 < 0x30 || b4 > 0x39) sl@0: codetype = EError; sl@0: else if (b1 >= 0x81 && b1 <= 0x84) // 0x81308130-0x8439FE39 sl@0: codetype = E4ByteBmp; sl@0: else if (b1 >= 0x90 && b1 <= 0xE3) // 0x90308130-0xE339FE39 sl@0: codetype = E4ByteSupplementary; sl@0: else sl@0: codetype = EError; // others are reserved sl@0: } sl@0: } sl@0: } sl@0: sl@0: // cp54936 to unicode sl@0: if (codetype == E1Byte) sl@0: { sl@0: unicodeChar = b1; sl@0: } sl@0: else if (codetype == E2Byte) sl@0: { sl@0: // conventional algorithm used in FatCharsetConv sl@0: const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80); sl@0: if (structPtr->iUnicodeIfSingle) sl@0: unicodeChar = structPtr->iUnicodeIfSingle; sl@0: else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte) sl@0: unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)]; sl@0: else sl@0: unicodeChar = 0xFFFD; sl@0: } sl@0: else if (codetype == E4ByteBmp) sl@0: { sl@0: TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30); sl@0: __ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange)); sl@0: unicodeChar = KMappingTable4ByteBmp2Unicode[index]; sl@0: } sl@0: else if (codetype == E4ByteSupplementary) sl@0: { sl@0: unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 + sl@0: (b2 - KU10000Byte2) * 1260 + sl@0: (b3 - KU10000Byte3) * 10 + sl@0: (b4 - KU10000Byte4); sl@0: __ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue)); sl@0: } sl@0: else sl@0: { sl@0: unicodeChar = 0xFFFD; sl@0: } sl@0: sl@0: // append to output buffer sl@0: if (IsSupplementary(unicodeChar)) sl@0: { sl@0: if (unicode + 1 >= unicodeguard) sl@0: { sl@0: aUnicode.SetLength(unicode-aUnicode.Ptr()); sl@0: if (leaveWhenOverflow) sl@0: User::Leave(KErrOverflow); sl@0: else sl@0: return KErrOverflow; sl@0: } sl@0: *unicode++ = GetHighSurrogate(unicodeChar); sl@0: *unicode++ = GetLowSurrogate(unicodeChar); sl@0: } sl@0: else sl@0: { sl@0: if (unicode >= unicodeguard) sl@0: { sl@0: aUnicode.SetLength(unicode-aUnicode.Ptr()); sl@0: if (leaveWhenOverflow) sl@0: User::Leave(KErrOverflow); sl@0: else sl@0: return KErrOverflow; sl@0: } sl@0: *unicode++ = unicodeChar; sl@0: } sl@0: } sl@0: aUnicode.SetLength(unicode-aUnicode.Ptr()); sl@0: return KErrNone; sl@0: } sl@0: sl@0: EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter) sl@0: { sl@0: //1. aCharacter >= 0x0080 sl@0: if (aCharacter>=0x0080) sl@0: { sl@0: // Since all Unicode characters can be mapped to GB18030, so no need to sl@0: // test the converting. sl@0: if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter)) sl@0: return ETrue; sl@0: else sl@0: return EFalse; sl@0: } sl@0: sl@0: // For most common cases: sl@0: // Note: lower case characters are considered legal DOS char here. sl@0: if ((aCharacter>='a' && aCharacter<='z') || sl@0: (aCharacter>='A' && aCharacter<='Z') || sl@0: (aCharacter>='0' && aCharacter<='9')) sl@0: { sl@0: return ETrue; sl@0: } sl@0: // Checking for illegal chars: sl@0: // 2. aCharacter <= 0x20 sl@0: // Note: leading 0x05 byte should be guarded by callers of this function sl@0: // as the information of the position of the character is required. sl@0: if (aCharacter < 0x20) sl@0: return EFalse; sl@0: // Space (' ') is not considered as a legal DOS char here. sl@0: if (aCharacter == 0x20) sl@0: return EFalse; sl@0: sl@0: // 3. 0x20 < aCharacter < 0x80 sl@0: // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name": sl@0: switch (aCharacter) sl@0: { sl@0: case 0x22: // '"' sl@0: case 0x2A: // '*' sl@0: case 0x2B: // '+' sl@0: case 0x2C: // ',' sl@0: //case 0x2E: // '.' // Although '.' is not allowed in any bytes of DIR_Name, it sl@0: // is a valid character in short file names. sl@0: case 0x2F: // '/' sl@0: case 0x3A: // ':' sl@0: case 0x3B: // ';' sl@0: case 0x3C: // '<' sl@0: case 0x3D: // '=' sl@0: case 0x3E: // '>' sl@0: case 0x3F: // '?' sl@0: case 0x5B: // '[' sl@0: case 0x5C: // '\' sl@0: case 0x5D: // ']' sl@0: case 0x7C: // '|' sl@0: return EFalse; sl@0: default: sl@0: return ETrue; sl@0: } sl@0: } sl@0: