os/textandloc/charconvfw/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/textandloc/charconvfw/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,461 @@
1.4 +/*
1.5 +* Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies).
1.6 +* All rights reserved.
1.7 +* This component and the accompanying materials are made available
1.8 +* under the terms of "Eclipse Public License v1.0"
1.9 +* which accompanies this distribution, and is available
1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
1.11 +*
1.12 +* Initial Contributors:
1.13 +* Nokia Corporation - initial contribution.
1.14 +*
1.15 +* Contributors:
1.16 +*
1.17 +* Description:
1.18 +*
1.19 +*/
1.20 +// There are two reasons for not using the existing unicodeconv.cpp:
1.21 +// 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially
1.22 +// for huge code pages (e.g, Asia code pages). See INC127598.
1.23 +//
1.24 +// 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle.
1.25 +//
1.26 +// The algorithm of this special version unicodeconv.cpp is straightforward:
1.27 +// 1) foreign->unicode:
1.28 +// 1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in
1.29 +// "cp54936_2byte_tounicode.cpp", which is generated with command
1.30 +// "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt".
1.31 +//
1.32 +// 1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then
1.33 +// search into the mapping table in "cp54936_4byte_tounicode.cpp",
1.34 +// which is generated with command
1.35 +// "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt".
1.36 +//
1.37 +// 1.3) 4 byte->unicode non-bmp: calculate with formula in this file.
1.38 +//
1.39 +// 2) unicode->foreign:
1.40 +// 2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp"
1.41 +// can map directly, which is generated with command
1.42 +// "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt".
1.43 +//
1.44 +// 2.2) unicode non-bmp->4 byte: calculate with formula in this file.
1.45 +//
1.46 +// The function cp54936_2byte_tounicode.cpp::TConvDataStruct::
1.47 +// ConvertSingleUnicode() is no longer used. It is retained only so that the
1.48 +// tool FatConversionTable.pl does not have to be changed.
1.49 +//
1.50 +// About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt":
1.51 +// 1) All Private Use Area (PUA) code points are reserved.
1.52 +// 2) All GB18030 code points that map to undefined Unicode code points are reserved.
1.53 +//
1.54 +//
1.55 +// About the formula for non-bmp calculation:
1.56 +// 1) All code points from 0x10000 to 0x10FFFF are supported.
1.57 +// 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are summarized from
1.58 +// the GB18030 standard, since the standard does not define the mapping for
1.59 +// code points out of 0x20000-0x2FFFF.
1.60 +
1.61 +
1.62 +#include <e32std.h>
1.63 +#include <e32def.h>
1.64 +#include <e32des8.h>
1.65 +#include "unicodeconv.h"
1.66 +#include "cp54936.h"
1.67 +
1.68 +
// Reason codes passed to Panic() by this converter.
// The numeric values are spelled out so that they stay stable if entries are
// ever reordered. The spelling "EInavlidUnicodeValue" is kept as-is because
// code outside this enum refers to the enumerator by that name.
enum TFccPanic
	{
	EBadForeignCode       = 0,
	E4ByteIndexOutOfRange = 1,
	EPanicBadIndices1     = 2,
	EInavlidUnicodeValue  = 3
	};
1.76 +void Panic(TFccPanic aPanic)
1.77 + {
1.78 +
1.79 + User::Panic(_L("FatCharsetConv"),aPanic);
1.80 + }
1.81 +
1.82 +
1.83 +//replacement character to be used when unicode cannot be converted
1.84 +const TUint8 KForeignReplacement = 0x5F;
1.85 +
1.86 +const TUint8 KU10000Byte1 = 0x90;
1.87 +const TUint8 KU10000Byte2 = 0x30;
1.88 +const TUint8 KU10000Byte3 = 0x81;
1.89 +const TUint8 KU10000Byte4 = 0x30;
1.90 +
1.91 +inline TBool IsSupplementary(TUint aChar)
1.92 +/**
1.93 +@param aChar The 32-bit code point value of a Unicode character.
1.94 +
1.95 +@return True, if aChar is supplementary character; false, otherwise.
1.96 +*/
1.97 + {
1.98 + return (aChar > 0xFFFF);
1.99 + }
1.100 +
1.101 +inline TBool IsSurrogate(TText16 aInt16)
1.102 +/**
1.103 +@return True, if aText16 is high surrogate or low surrogate; false, otherwise.
1.104 +*/
1.105 + {
1.106 + return (aInt16 & 0xF800) == 0xD800;
1.107 + }
1.108 +
1.109 +inline TBool IsHighSurrogate(TText16 aInt16)
1.110 +/**
1.111 +@return True, if aText16 is high surrogate; false, otherwise.
1.112 +*/
1.113 + {
1.114 + return (aInt16 & 0xFC00) == 0xD800;
1.115 + }
1.116 +
1.117 +inline TBool IsLowSurrogate(TText16 aInt16)
1.118 +/**
1.119 +@return True, if aText16 is low surrogate; false, otherwise.
1.120 +*/
1.121 + {
1.122 + return (aInt16 & 0xFC00) == 0xDC00;
1.123 + }
1.124 +
1.125 +inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate)
1.126 +/**
1.127 +Combine a high surrogate and a low surrogate into a supplementary character.
1.128 +
1.129 +@return The 32-bit code point value of the generated Unicode supplementary
1.130 + character.
1.131 +*/
1.132 + {
1.133 + return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate;
1.134 + }
1.135 +
1.136 +inline TText16 GetHighSurrogate(TUint aChar)
1.137 +/**
1.138 +Retrieve the high surrogate of a supplementary character.
1.139 +
1.140 +@param aChar The 32-bit code point value of a Unicode character.
1.141 +
1.142 +@return High surrogate of aChar, if aChar is a supplementary character;
1.143 + aChar itself, if aChar is not a supplementary character.
1.144 +*/
1.145 + {
1.146 + return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10));
1.147 + }
1.148 +
1.149 +inline TText16 GetLowSurrogate(TUint aChar)
1.150 +/**
1.151 +Retrieve the low surrogate of a supplementary character.
1.152 +
1.153 +@param aChar The 32-bit code point value of a Unicode character.
1.154 +
1.155 +@return Low surrogate of aChar, if aChar is a supplementary character;
1.156 + zero, if aChar is not a supplementary character.
1.157 +*/
1.158 + {
1.159 + return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF));
1.160 + }
1.161 +
1.162 +//This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
1.163 +EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode)
1.164 + {
1.165 + UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue);
1.166 + }
1.167 +
1.168 +//This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
1.169 +EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow)
1.170 + {
1.171 + const TInt length = aUnicode.Length();
1.172 + const TUint16* unicode = aUnicode.Ptr();
1.173 + const TUint16* guard = unicode + length;
1.174 +
1.175 + TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr());
1.176 + TUint8* foreignguard = foreign + aForeign.MaxLength();
1.177 +
1.178 + //loop going through the character of the unicode descriptor
1.179 + while (unicode < guard)
1.180 + {
1.181 + TUint32 unicodeChar = *unicode++;
1.182 + if (IsHighSurrogate(unicodeChar))
1.183 + {
1.184 + if (unicode >= guard || !IsLowSurrogate(*unicode))
1.185 + {
1.186 + if (foreign >= foreignguard)
1.187 + {
1.188 + aForeign.SetLength(foreign-aForeign.Ptr());
1.189 + if (leaveWhenOverflow)
1.190 + User::Leave(KErrOverflow);
1.191 + else
1.192 + return KErrOverflow;
1.193 + }
1.194 + *foreign++ = KForeignReplacement;
1.195 + continue;
1.196 + }
1.197 + unicodeChar = JoinSurrogate(unicodeChar, *unicode++);
1.198 + }
1.199 + if (IsLowSurrogate(unicodeChar))
1.200 + {
1.201 + if (foreign >= foreignguard)
1.202 + {
1.203 + aForeign.SetLength(foreign-aForeign.Ptr());
1.204 + if (leaveWhenOverflow)
1.205 + User::Leave(KErrOverflow);
1.206 + else
1.207 + return KErrOverflow;
1.208 + }
1.209 + *foreign++ = KForeignReplacement;
1.210 + continue;
1.211 + }
1.212 +
1.213 + TUint8 b1, b2, b3, b4; // byte 1,2,3,4 of result GB18030 code.
1.214 + TInt count; // byte count of result GB18030 code; can be 1, 2 or 4.
1.215 +
1.216 + // unicode to cp54936
1.217 + if (IsSupplementary(unicodeChar))
1.218 + {
1.219 + unicodeChar -= 0x10000;
1.220 + b4 = unicodeChar % 10 + KU10000Byte4;
1.221 + unicodeChar /= 10;
1.222 + b3 = unicodeChar % 126 + KU10000Byte3;
1.223 + unicodeChar /= 126;
1.224 + b2 = unicodeChar % 10 + KU10000Byte2;
1.225 + b1 = unicodeChar / 10 + KU10000Byte1;
1.226 + count = 4;
1.227 + }
1.228 + else
1.229 + {
1.230 + TUint32 foreignChar;
1.231 + foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar];
1.232 + b1 = ((foreignChar >> 24) & 0xFF);
1.233 + b2 = ((foreignChar >> 16) & 0xFF);
1.234 + b3 = ((foreignChar >> 8) & 0xFF);
1.235 + b4 = (foreignChar & 0xFF);
1.236 + count = 1;
1.237 + if (b1)
1.238 + {
1.239 + count = 4;
1.240 + }
1.241 + else
1.242 + {
1.243 + __ASSERT_DEBUG(b2==0, Panic(EBadForeignCode));
1.244 + if (b3)
1.245 + {
1.246 + count = 2;
1.247 + }
1.248 + }
1.249 + }
1.250 +
1.251 + if (foreign + count > foreignguard)
1.252 + {
1.253 + aForeign.SetLength(foreign-aForeign.Ptr());
1.254 + if (leaveWhenOverflow)
1.255 + User::Leave(KErrOverflow);
1.256 + else
1.257 + return KErrOverflow;
1.258 + }
1.259 + if (count == 4)
1.260 + {
1.261 + *foreign++ = b1;
1.262 + *foreign++ = b2;
1.263 + }
1.264 + if (count >= 2)
1.265 + *foreign++ = b3;
1.266 + *foreign++ = b4;
1.267 + }
1.268 + aForeign.SetLength(foreign-aForeign.Ptr());
1.269 + return KErrNone;
1.270 + }
1.271 +
1.272 +
1.273 +//This function converts from foreign characters into unicode and adds them into a descriptor
1.274 +EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign)
1.275 + {
1.276 + UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue);
1.277 + }
1.278 +
1.279 +//This function converts from foreign characters into unicode and adds them into a descriptor
1.280 +EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow)
1.281 + {
1.282 + const TInt foreignLength = aForeign.Length();
1.283 + const TUint8* foreign = aForeign.Ptr();
1.284 + const TUint8* guard = foreign + foreignLength;
1.285 +
1.286 + TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr());
1.287 + TUint16* unicodeguard = unicode + aUnicode.MaxLength();
1.288 +
1.289 + TUint8 b1, b2, b3, b4;
1.290 + enum TCodeType
1.291 + {
1.292 + E1Byte = 0,
1.293 + E2Byte,
1.294 + E4ByteBmp,
1.295 + E4ByteSupplementary,
1.296 + EError,
1.297 + };
1.298 + TCodeType codetype;
1.299 + TUint32 unicodeChar;
1.300 +
1.301 + //loop going through the characters of the foreign descriptor
1.302 + while (foreign < guard)
1.303 + {
1.304 + // roughly, detect which area the foreign code belongs to
1.305 + b1 = *foreign++;
1.306 + if (b1 <= 0x7F)
1.307 + codetype = E1Byte;
1.308 + else if (b1 == 0x80 || b1 > 0xFE)
1.309 + codetype = EError;
1.310 + else if (foreign >= guard)
1.311 + codetype = EError;
1.312 + else
1.313 + {
1.314 + b2 = *foreign++;
1.315 + if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F)
1.316 + codetype = E2Byte;
1.317 + else if (b2 < 0x30 || b2 > 0x39)
1.318 + codetype = EError;
1.319 + else if (foreign+1 >= guard)
1.320 + codetype = EError;
1.321 + else
1.322 + {
1.323 + b3 = *foreign++;
1.324 + if (b3 < 0x81 || b3 > 0xFE)
1.325 + codetype = EError;
1.326 + else
1.327 + {
1.328 + b4 = *foreign++;
1.329 + if (b4 < 0x30 || b4 > 0x39)
1.330 + codetype = EError;
1.331 + else if (b1 >= 0x81 && b1 <= 0x84) // 0x81308130-0x8439FE39
1.332 + codetype = E4ByteBmp;
1.333 + else if (b1 >= 0x90 && b1 <= 0xE3) // 0x90308130-0xE339FE39
1.334 + codetype = E4ByteSupplementary;
1.335 + else
1.336 + codetype = EError; // others are reserved
1.337 + }
1.338 + }
1.339 + }
1.340 +
1.341 + // cp54936 to unicode
1.342 + if (codetype == E1Byte)
1.343 + {
1.344 + unicodeChar = b1;
1.345 + }
1.346 + else if (codetype == E2Byte)
1.347 + {
1.348 + // conventional algorithm used in FatCharsetConv
1.349 + const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80);
1.350 + if (structPtr->iUnicodeIfSingle)
1.351 + unicodeChar = structPtr->iUnicodeIfSingle;
1.352 + else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte)
1.353 + unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)];
1.354 + else
1.355 + unicodeChar = 0xFFFD;
1.356 + }
1.357 + else if (codetype == E4ByteBmp)
1.358 + {
1.359 + TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30);
1.360 + __ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange));
1.361 + unicodeChar = KMappingTable4ByteBmp2Unicode[index];
1.362 + }
1.363 + else if (codetype == E4ByteSupplementary)
1.364 + {
1.365 + unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 +
1.366 + (b2 - KU10000Byte2) * 1260 +
1.367 + (b3 - KU10000Byte3) * 10 +
1.368 + (b4 - KU10000Byte4);
1.369 + __ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue));
1.370 + }
1.371 + else
1.372 + {
1.373 + unicodeChar = 0xFFFD;
1.374 + }
1.375 +
1.376 + // append to output buffer
1.377 + if (IsSupplementary(unicodeChar))
1.378 + {
1.379 + if (unicode + 1 >= unicodeguard)
1.380 + {
1.381 + aUnicode.SetLength(unicode-aUnicode.Ptr());
1.382 + if (leaveWhenOverflow)
1.383 + User::Leave(KErrOverflow);
1.384 + else
1.385 + return KErrOverflow;
1.386 + }
1.387 + *unicode++ = GetHighSurrogate(unicodeChar);
1.388 + *unicode++ = GetLowSurrogate(unicodeChar);
1.389 + }
1.390 + else
1.391 + {
1.392 + if (unicode >= unicodeguard)
1.393 + {
1.394 + aUnicode.SetLength(unicode-aUnicode.Ptr());
1.395 + if (leaveWhenOverflow)
1.396 + User::Leave(KErrOverflow);
1.397 + else
1.398 + return KErrOverflow;
1.399 + }
1.400 + *unicode++ = unicodeChar;
1.401 + }
1.402 + }
1.403 + aUnicode.SetLength(unicode-aUnicode.Ptr());
1.404 + return KErrNone;
1.405 + }
1.406 +
1.407 +EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter)
1.408 + {
1.409 + //1. aCharacter >= 0x0080
1.410 + if (aCharacter>=0x0080)
1.411 + {
1.412 + // Since all Unicode characters can be mapped to GB18030, so no need to
1.413 + // test the converting.
1.414 + if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter))
1.415 + return ETrue;
1.416 + else
1.417 + return EFalse;
1.418 + }
1.419 +
1.420 + // For most common cases:
1.421 + // Note: lower case characters are considered legal DOS char here.
1.422 + if ((aCharacter>='a' && aCharacter<='z') ||
1.423 + (aCharacter>='A' && aCharacter<='Z') ||
1.424 + (aCharacter>='0' && aCharacter<='9'))
1.425 + {
1.426 + return ETrue;
1.427 + }
1.428 + // Checking for illegal chars:
1.429 + // 2. aCharacter <= 0x20
1.430 + // Note: leading 0x05 byte should be guarded by callers of this function
1.431 + // as the information of the position of the character is required.
1.432 + if (aCharacter < 0x20)
1.433 + return EFalse;
1.434 + // Space (' ') is not considered as a legal DOS char here.
1.435 + if (aCharacter == 0x20)
1.436 + return EFalse;
1.437 +
1.438 + // 3. 0x20 < aCharacter < 0x80
1.439 + // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name":
1.440 + switch (aCharacter)
1.441 + {
1.442 + case 0x22: // '"'
1.443 + case 0x2A: // '*'
1.444 + case 0x2B: // '+'
1.445 + case 0x2C: // ','
1.446 + //case 0x2E: // '.' // Although '.' is not allowed in any bytes of DIR_Name, it
1.447 + // is a valid character in short file names.
1.448 + case 0x2F: // '/'
1.449 + case 0x3A: // ':'
1.450 + case 0x3B: // ';'
1.451 + case 0x3C: // '<'
1.452 + case 0x3D: // '='
1.453 + case 0x3E: // '>'
1.454 + case 0x3F: // '?'
1.455 + case 0x5B: // '['
1.456 + case 0x5C: // '\'
1.457 + case 0x5D: // ']'
1.458 + case 0x7C: // '|'
1.459 + return EFalse;
1.460 + default:
1.461 + return ETrue;
1.462 + }
1.463 + }
1.464 +