os/textandloc/charconvfw/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/textandloc/charconvfw/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,461 @@
     1.4 +/*
     1.5 +* Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.6 +* All rights reserved.
     1.7 +* This component and the accompanying materials are made available
     1.8 +* under the terms of "Eclipse Public License v1.0"
     1.9 +* which accompanies this distribution, and is available
    1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.11 +*
    1.12 +* Initial Contributors:
    1.13 +* Nokia Corporation - initial contribution.
    1.14 +*
    1.15 +* Contributors:
    1.16 +*
    1.17 +* Description: 
    1.18 +*
    1.19 +*/
    1.20 +// There are 2 reasons for not using the existing unicodeconv.cpp:
    1.21 +// 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially
    1.22 +//    for huge code pages (e.g, Asia code pages). See INC127598.
    1.23 +//
    1.24 +// 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle.
    1.25 +//
    1.26 +// The algorithm of this special version unicodeconv.cpp is straightforward:
    1.27 +// 1) foreign->unicode:
    1.28 +//    1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in
    1.29 +//              "cp54936_2byte_tounicode.cpp", which is generated with command
    1.30 +//              "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt".
    1.31 +//
    1.32 +//    1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then
    1.33 +//              search into the mapping table in "cp54936_4byte_tounicode.cpp",
    1.34 +//              which is generated with command
    1.35 +//              "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt".
    1.36 +//
    1.37 +//    1.3) 4 byte->unicode non-bmp: calculate with formula in this file.
    1.38 +//
    1.39 +// 2) unicode->foreign:
    1.40 +//    2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp"
    1.41 +//              can map directly, which is generated with command
    1.42 +//              "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt".
    1.43 +//
    1.44 +//    2.2) unicode non-bmp->4 byte: calculate with formula in this file.
    1.45 +//
    1.46 +// The function cp54936_2byte_tounicode.cpp::TConvDataStruct::
    1.47 +// ConvertSingleUnicode() is no longer used. It is retained only to avoid
    1.48 +// changing the tool FatConversionTable.pl.
    1.49 +//
    1.50 +// About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt":
    1.51 +// 1) All Private Use Area (PUA) code points are reserved.
    1.52 +// 2) All GB18030 code points that map to undefined Unicode code points are reserved.
    1.53 +//
    1.54 +//
    1.55 +// About the formula for non-bmp calculation:
    1.56 +// 1) All code points from 0x10000 to 0x10FFFF are supported.
    1.57 +// 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are summarized from
    1.58 +//    the GB18030 standard, since the standard does not define the mapping for
    1.59 +//    code points out of 0x20000-0x2FFFF.
    1.60 +
    1.61 +
    1.62 +#include <e32std.h>
    1.63 +#include <e32def.h>
    1.64 +#include <e32des8.h> 
    1.65 +#include "unicodeconv.h"
    1.66 +#include "cp54936.h"
    1.67 +
    1.68 +
    1.69 +enum TFccPanic
    1.70 +	{
    1.71 +	EBadForeignCode = 0,
    1.72 +	E4ByteIndexOutOfRange,
    1.73 +	EPanicBadIndices1,
    1.74 +	EInavlidUnicodeValue
    1.75 +	};
    1.76 +void Panic(TFccPanic aPanic)
    1.77 +	{
    1.78 +
    1.79 +	User::Panic(_L("FatCharsetConv"),aPanic);
    1.80 +	}
    1.81 +
    1.82 +
    1.83 +//replacement character to be used when unicode cannot be converted
    1.84 +const TUint8 KForeignReplacement = 0x5F;
    1.85 +
    1.86 +const TUint8 KU10000Byte1 = 0x90;
    1.87 +const TUint8 KU10000Byte2 = 0x30;
    1.88 +const TUint8 KU10000Byte3 = 0x81;
    1.89 +const TUint8 KU10000Byte4 = 0x30;
    1.90 +
    1.91 +inline TBool IsSupplementary(TUint aChar)
    1.92 +/**
    1.93 +@param aChar The 32-bit code point value of a Unicode character.
    1.94 +
    1.95 +@return True, if aChar is supplementary character; false, otherwise.
    1.96 +*/
    1.97 +	{
    1.98 +	return (aChar > 0xFFFF);
    1.99 +	}
   1.100 +
   1.101 +inline TBool IsSurrogate(TText16 aInt16)
   1.102 +/**
   1.103 +@return True, if aText16 is high surrogate or low surrogate; false, otherwise.
   1.104 +*/
   1.105 +	{
   1.106 +	return (aInt16 & 0xF800) == 0xD800;
   1.107 +	}
   1.108 +
   1.109 +inline TBool IsHighSurrogate(TText16 aInt16)
   1.110 +/**
   1.111 +@return True, if aText16 is high surrogate; false, otherwise.
   1.112 +*/
   1.113 +	{
   1.114 +	return (aInt16 & 0xFC00) == 0xD800;
   1.115 +	}
   1.116 +
   1.117 +inline TBool IsLowSurrogate(TText16 aInt16)
   1.118 +/**
   1.119 +@return True, if aText16 is low surrogate; false, otherwise.
   1.120 +*/
   1.121 +	{
   1.122 +	return (aInt16 & 0xFC00) == 0xDC00;
   1.123 +	}
   1.124 +
   1.125 +inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate)
   1.126 +/**
   1.127 +Combine a high surrogate and a low surrogate into a supplementary character.
   1.128 +
   1.129 +@return The 32-bit code point value of the generated Unicode supplementary
   1.130 +        character.
   1.131 +*/
   1.132 +	{
   1.133 +	return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate;
   1.134 +	}
   1.135 +
   1.136 +inline TText16 GetHighSurrogate(TUint aChar)
   1.137 +/**
   1.138 +Retrieve the high surrogate of a supplementary character.
   1.139 +
   1.140 +@param aChar The 32-bit code point value of a Unicode character.
   1.141 +
   1.142 +@return High surrogate of aChar, if aChar is a supplementary character; 
   1.143 +        aChar itself, if aChar is not a supplementary character.
   1.144 +*/
   1.145 +	{
   1.146 +	return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10));
   1.147 +	}
   1.148 +
   1.149 +inline TText16 GetLowSurrogate(TUint aChar)
   1.150 +/**
   1.151 +Retrieve the low surrogate of a supplementary character.
   1.152 +
   1.153 +@param aChar The 32-bit code point value of a Unicode character.
   1.154 +
   1.155 +@return Low surrogate of aChar, if aChar is a supplementary character; 
   1.156 +        zero, if aChar is not a supplementary character.
   1.157 +*/
   1.158 +	{
   1.159 +	return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF));
   1.160 +	}
   1.161 +
   1.162 +//This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
   1.163 +EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode)
   1.164 +	{
   1.165 +    UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue);
   1.166 +    }
   1.167 +
   1.168 +//This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
   1.169 +EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow)
   1.170 +	{
   1.171 +	const TInt length = aUnicode.Length();
   1.172 +	const TUint16* unicode = aUnicode.Ptr();
   1.173 +	const TUint16* guard = unicode + length;
   1.174 +	
   1.175 +	TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr());
   1.176 +	TUint8* foreignguard = foreign + aForeign.MaxLength();
   1.177 +	
   1.178 +	//loop going through the character of the unicode descriptor
   1.179 +	while (unicode < guard)
   1.180 +		{
   1.181 +		TUint32 unicodeChar = *unicode++;
   1.182 +		if (IsHighSurrogate(unicodeChar))
   1.183 +			{
   1.184 +			if (unicode >= guard || !IsLowSurrogate(*unicode))
   1.185 +				{
   1.186 +				if (foreign >= foreignguard)
   1.187 +					{
   1.188 +                    aForeign.SetLength(foreign-aForeign.Ptr());
   1.189 +					if (leaveWhenOverflow)
   1.190 +						User::Leave(KErrOverflow);
   1.191 +                    else
   1.192 +                    	return KErrOverflow;
   1.193 +					}
   1.194 +				*foreign++ = KForeignReplacement;
   1.195 +				continue;
   1.196 +				}
   1.197 +			unicodeChar = JoinSurrogate(unicodeChar, *unicode++);
   1.198 +			}
   1.199 +		if (IsLowSurrogate(unicodeChar))
   1.200 +			{
   1.201 +			if (foreign >= foreignguard)
   1.202 +				{
   1.203 +				aForeign.SetLength(foreign-aForeign.Ptr());
   1.204 +				if (leaveWhenOverflow)
   1.205 +					User::Leave(KErrOverflow);
   1.206 +				else
   1.207 +					return KErrOverflow;
   1.208 +				}
   1.209 +			*foreign++ = KForeignReplacement;
   1.210 +			continue;
   1.211 +			}
   1.212 +		
   1.213 +		TUint8 b1, b2, b3, b4;		// byte 1,2,3,4 of result GB18030 code.
   1.214 +		TInt count;					// byte count of result GB18030 code; can be 1, 2 or 4.
   1.215 +		
   1.216 +		// unicode to cp54936
   1.217 +		if (IsSupplementary(unicodeChar))
   1.218 +			{
   1.219 +			unicodeChar -= 0x10000;
   1.220 +			b4 = unicodeChar % 10 + KU10000Byte4;
   1.221 +			unicodeChar /= 10;
   1.222 +			b3 = unicodeChar % 126 + KU10000Byte3;
   1.223 +			unicodeChar /= 126;
   1.224 +			b2 = unicodeChar % 10 + KU10000Byte2;
   1.225 +			b1 = unicodeChar / 10 + KU10000Byte1;
   1.226 +			count = 4;
   1.227 +			}
   1.228 +		else
   1.229 +			{
   1.230 +			TUint32 foreignChar;
   1.231 +			foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar];
   1.232 +			b1 = ((foreignChar >> 24) & 0xFF);
   1.233 +			b2 = ((foreignChar >> 16) & 0xFF);
   1.234 +			b3 = ((foreignChar >> 8) & 0xFF);
   1.235 +			b4 = (foreignChar & 0xFF);
   1.236 +			count = 1;
   1.237 +			if (b1)
   1.238 +				{
   1.239 +				count = 4;
   1.240 +				}
   1.241 +			else
   1.242 +				{
   1.243 +				__ASSERT_DEBUG(b2==0, Panic(EBadForeignCode));
   1.244 +				if (b3)
   1.245 +					{
   1.246 +					count = 2;
   1.247 +					}
   1.248 +				}
   1.249 +			}
   1.250 +		
   1.251 +		if (foreign + count > foreignguard)
   1.252 +			{
   1.253 +			aForeign.SetLength(foreign-aForeign.Ptr());
   1.254 +            if (leaveWhenOverflow)
   1.255 +            	User::Leave(KErrOverflow);
   1.256 +            else
   1.257 +            	return KErrOverflow;
   1.258 +			}
   1.259 +		if (count == 4)
   1.260 +			{
   1.261 +			*foreign++ = b1;
   1.262 +			*foreign++ = b2;
   1.263 +			}
   1.264 +		if (count >= 2)
   1.265 +			*foreign++ = b3;
   1.266 +		*foreign++ = b4;
   1.267 +		}
   1.268 +	aForeign.SetLength(foreign-aForeign.Ptr());
   1.269 +	return KErrNone;
   1.270 +	}
   1.271 +
   1.272 +
   1.273 +//This function converts from foreign characters into unicode and adds them into a descriptor
   1.274 +EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign)
   1.275 +	{
   1.276 +    UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue);
   1.277 +    }
   1.278 +
   1.279 +//This function converts from foreign characters into unicode and adds them into a descriptor
   1.280 +EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow)
   1.281 +	{
   1.282 +	const TInt foreignLength = aForeign.Length();
   1.283 +	const TUint8* foreign = aForeign.Ptr();
   1.284 +	const TUint8* guard = foreign + foreignLength;
   1.285 +	
   1.286 +	TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr());
   1.287 +	TUint16* unicodeguard = unicode + aUnicode.MaxLength();
   1.288 +	
   1.289 +	TUint8 b1, b2, b3, b4;
   1.290 +	enum TCodeType
   1.291 +	{
   1.292 +	E1Byte = 0,
   1.293 +	E2Byte,
   1.294 +	E4ByteBmp,
   1.295 +	E4ByteSupplementary,
   1.296 +	EError,
   1.297 +	};
   1.298 +	TCodeType codetype;
   1.299 +	TUint32 unicodeChar;
   1.300 +
   1.301 +	//loop going through the characters of the foreign descriptor
   1.302 +	while (foreign < guard)
   1.303 +		{
   1.304 +		// roughly, detect which area the foreign code belongs to
   1.305 +		b1 = *foreign++;
   1.306 +		if (b1 <= 0x7F)
   1.307 +			codetype = E1Byte;
   1.308 +		else if (b1 == 0x80 || b1 > 0xFE)
   1.309 +			codetype = EError;
   1.310 +		else if (foreign >= guard)
   1.311 +			codetype = EError;
   1.312 +		else
   1.313 +			{
   1.314 +			b2 = *foreign++;
   1.315 +			if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F)
   1.316 +				codetype = E2Byte;
   1.317 +			else if (b2 < 0x30 || b2 > 0x39)
   1.318 +				codetype = EError;
   1.319 +			else if (foreign+1 >= guard)
   1.320 +				codetype = EError;
   1.321 +			else
   1.322 +				{
   1.323 +				b3 = *foreign++;
   1.324 +				if (b3 < 0x81 || b3 > 0xFE)
   1.325 +					codetype = EError;
   1.326 +				else
   1.327 +					{
   1.328 +					b4 = *foreign++;
   1.329 +					if (b4 < 0x30 || b4 > 0x39)
   1.330 +						codetype = EError;
   1.331 +					else if (b1 >= 0x81 && b1 <= 0x84)		// 0x81308130-0x8439FE39
   1.332 +						codetype = E4ByteBmp;
   1.333 +					else if (b1 >= 0x90 && b1 <= 0xE3)		// 0x90308130-0xE339FE39
   1.334 +						codetype = E4ByteSupplementary;
   1.335 +					else
   1.336 +						codetype = EError;					// others are reserved
   1.337 +					}
   1.338 +				}
   1.339 +			}
   1.340 +		
   1.341 +		// cp54936 to unicode
   1.342 +		if (codetype == E1Byte)
   1.343 +			{
   1.344 +			unicodeChar = b1;
   1.345 +			}
   1.346 +		else if (codetype == E2Byte)
   1.347 +			{
   1.348 +			// conventional algorithm used in FatCharsetConv
   1.349 +			const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80);
   1.350 +			if (structPtr->iUnicodeIfSingle)
   1.351 +				unicodeChar = structPtr->iUnicodeIfSingle;
   1.352 +			else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte)
   1.353 +				unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)];
   1.354 +			else
   1.355 +				unicodeChar = 0xFFFD;
   1.356 +			}
   1.357 +		else if (codetype == E4ByteBmp)
   1.358 +			{
   1.359 +			TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30);
   1.360 +			__ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange));
   1.361 +			unicodeChar = KMappingTable4ByteBmp2Unicode[index];
   1.362 +			}
   1.363 +		else if (codetype == E4ByteSupplementary)
   1.364 +			{
   1.365 +			unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 +
   1.366 +									(b2 - KU10000Byte2) * 1260 +
   1.367 +									(b3 - KU10000Byte3) * 10 +
   1.368 +									(b4 - KU10000Byte4);
   1.369 +			__ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue));
   1.370 +			}
   1.371 +		else
   1.372 +			{
   1.373 +			unicodeChar = 0xFFFD;
   1.374 +			}
   1.375 +		
   1.376 +		// append to output buffer
   1.377 +		if (IsSupplementary(unicodeChar))
   1.378 +			{
   1.379 +			if (unicode + 1 >= unicodeguard)
   1.380 +				{
   1.381 +				aUnicode.SetLength(unicode-aUnicode.Ptr());
   1.382 +				if (leaveWhenOverflow)
   1.383 +					User::Leave(KErrOverflow);
   1.384 +				else
   1.385 +					return KErrOverflow;
   1.386 +				}
   1.387 +			*unicode++ = GetHighSurrogate(unicodeChar);
   1.388 +			*unicode++ = GetLowSurrogate(unicodeChar);
   1.389 +			}
   1.390 +		else
   1.391 +			{
   1.392 +			if (unicode >= unicodeguard)
   1.393 +				{
   1.394 +				aUnicode.SetLength(unicode-aUnicode.Ptr());
   1.395 +                if (leaveWhenOverflow)
   1.396 +                	User::Leave(KErrOverflow);
   1.397 +                else
   1.398 +                	return KErrOverflow;
   1.399 +				}
   1.400 +			*unicode++ = unicodeChar;
   1.401 +			}
   1.402 +		}
   1.403 +	aUnicode.SetLength(unicode-aUnicode.Ptr());
   1.404 +	return KErrNone;
   1.405 +	}
   1.406 +
   1.407 +EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter)
   1.408 +	{
   1.409 +	//1. aCharacter >= 0x0080 
   1.410 +	if (aCharacter>=0x0080)
   1.411 +		{
   1.412 +		// Since all Unicode characters can be mapped to GB18030, so no need to
   1.413 +		// test the converting.
   1.414 +		if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter))
   1.415 +			return ETrue;
   1.416 +		else
   1.417 +			return EFalse;
   1.418 +		}
   1.419 +
   1.420 +    // For most common cases: 
   1.421 +    // Note: lower case characters are considered legal DOS char here. 
   1.422 +	if ((aCharacter>='a' && aCharacter<='z') || 
   1.423 +	    (aCharacter>='A' && aCharacter<='Z') || 
   1.424 +	    (aCharacter>='0' && aCharacter<='9'))
   1.425 +			{
   1.426 +			return ETrue;
   1.427 +			}
   1.428 +    // Checking for illegal chars: 
   1.429 +    // 2. aCharacter <= 0x20 
   1.430 +    // Note: leading 0x05 byte should be guarded by callers of this function 
   1.431 +    //  as the information of the position of the character is required. 
   1.432 +	if (aCharacter < 0x20)
   1.433 +		return EFalse;
   1.434 +	// Space (' ') is not considered as a legal DOS char here.
   1.435 +	if (aCharacter == 0x20)
   1.436 +		return EFalse;
   1.437 +	
   1.438 +	// 3. 0x20 < aCharacter < 0x80 
   1.439 +    // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name": 
   1.440 +    switch (aCharacter) 
   1.441 +            { 
   1.442 +            case 0x22:        // '"' 
   1.443 +            case 0x2A:        // '*' 
   1.444 +            case 0x2B:        // '+' 
   1.445 +            case 0x2C:        // ',' 
   1.446 +            //case 0x2E:        // '.'   // Although '.' is not allowed in any bytes of DIR_Name, it 
   1.447 +                                         // is a valid character in short file names. 
   1.448 +            case 0x2F:        // '/' 
   1.449 +            case 0x3A:        // ':' 
   1.450 +            case 0x3B:        // ';' 
   1.451 +            case 0x3C:        // '<' 
   1.452 +            case 0x3D:        // '=' 
   1.453 +            case 0x3E:        // '>' 
   1.454 +            case 0x3F:        // '?' 
   1.455 +            case 0x5B:        // '[' 
   1.456 +            case 0x5C:        // '\' 
   1.457 +            case 0x5D:        // ']' 
   1.458 +            case 0x7C:        // '|' 
   1.459 +            	return EFalse; 
   1.460 +            default: 
   1.461 +            	return ETrue; 
   1.462 +            } 
   1.463 +	}		
   1.464 +