Symaptic: os/textandloc/charconvfw/fatfilenameconversionplugins/src/cp54936

     1 /*

     2 * Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies).

     3 * All rights reserved.

     4 * This component and the accompanying materials are made available

     5 * under the terms of "Eclipse Public License v1.0"

     6 * which accompanies this distribution, and is available

     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".

8 *

     9 * Initial Contributors:

    10 * Nokia Corporation - initial contribution.

    11 *

    12 * Contributors:

    13 *

    14 * Description:

    15 *

    16 */

    17 // There are 2 reasons for not using the existing unicodeconv.cpp:

    18 // 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially

    19 //    for huge code pages (e.g, Asia code pages). See INC127598.

    20 //

    21 // 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle.

    22 //

    23 // The algorithm of this special version unicodeconv.cpp is straightforward:

    24 // 1) foreign->unicode:

    25 //    1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in

    26 //              "cp54936_2byte_tounicode.cpp", which is generated with command

    27 //              "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt".

    28 //

    29 //    1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then

    30 //              search into the mapping table in "cp54936_4byte_tounicode.cpp",

    31 //              which is generated with command

    32 //              "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt".

    33 //

    34 //    1.3) 4 byte->unicode non-bmp: calculate with formula in this file.

    35 //

    36 // 2) unicode->foreign:

    37 //    2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp"

    38 //              can map directly, which is generated with command

    39 //              "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt".

    40 //

    41 //    2.2) unicode non-bmp->4 byte: calculate with formula in this file.

    42 //

    43 // The function cp54936_2byte_tounicode.cpp::TConvDataStruct::

    44 // ConvertSingleUnicode() is no longer used. It is retained only so that

    45 // the tool FatConversionTable.pl does not need to be changed.

    46 //

    47 // About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt":

    48 // 1) All Private Use Area (PUA) code points are reserved.

    49 // 2) All GB18030 code points that map to undefined Unicode code points are reserved.

    50 //

    51 //

    52 // About the formula for non-bmp calculation:

    53 // 1) All code points from 0x10000 to 0x10FFFF are supported.

    54 // 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are summarized from

    55 //    the GB18030 standard, since the standard does not define the mapping for

    56 //    code points out of 0x20000-0x2FFFF.

    59 #include <e32std.h>

    60 #include <e32def.h>

    61 #include <e32des8.h>

    62 #include "unicodeconv.h"

    63 #include "cp54936.h"

    66 enum TFccPanic

    67 	{

    68 	EBadForeignCode = 0,

    69 	E4ByteIndexOutOfRange,

    70 	EPanicBadIndices1,

    71 	EInavlidUnicodeValue

    72 	};

    73 void Panic(TFccPanic aPanic)

    74 	{

    76 	User::Panic(_L("FatCharsetConv"),aPanic);

    77 	}

    80 // Replacement byte written to the foreign output when a Unicode unit cannot be converted (0x5F is '_' in ASCII).

    81 const TUint8 KForeignReplacement = 0x5F;

// The four bytes of the GB18030 code 0x90308130. The conversion formulas in
// this file use them as the base of the 4-byte codes for code points from
// 0x10000 upward.
    83 const TUint8 KU10000Byte1 = 0x90;

    84 const TUint8 KU10000Byte2 = 0x30;

    85 const TUint8 KU10000Byte3 = 0x81;

    86 const TUint8 KU10000Byte4 = 0x30;

    88 inline TBool IsSupplementary(TUint aChar)

    89 /**

    90 @param aChar The 32-bit code point value of a Unicode character.

    92 @return True, if aChar is supplementary character; false, otherwise.

    93 */

    94 	{

    95 	return (aChar > 0xFFFF);

    96 	}

    98 inline TBool IsSurrogate(TText16 aInt16)

    99 /**

   100 @return True, if aText16 is high surrogate or low surrogate; false, otherwise.

   101 */

   102 	{

   103 	return (aInt16 & 0xF800) == 0xD800;

   104 	}

   106 inline TBool IsHighSurrogate(TText16 aInt16)

   107 /**

   108 @return True, if aText16 is high surrogate; false, otherwise.

   109 */

   110 	{

   111 	return (aInt16 & 0xFC00) == 0xD800;

   112 	}

   114 inline TBool IsLowSurrogate(TText16 aInt16)

   115 /**

   116 @return True, if aText16 is low surrogate; false, otherwise.

   117 */

   118 	{

   119 	return (aInt16 & 0xFC00) == 0xDC00;

   120 	}

   122 inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate)

   123 /**

   124 Combine a high surrogate and a low surrogate into a supplementary character.

   126 @return The 32-bit code point value of the generated Unicode supplementary

   127         character.

   128 */

   129 	{

   130 	return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate;

   131 	}

   133 inline TText16 GetHighSurrogate(TUint aChar)

   134 /**

   135 Retrieve the high surrogate of a supplementary character.

   137 @param aChar The 32-bit code point value of a Unicode character.

   139 @return High surrogate of aChar, if aChar is a supplementary character;

   140         aChar itself, if aChar is not a supplementary character.

   141 */

   142 	{

   143 	return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10));

   144 	}

   146 inline TText16 GetLowSurrogate(TUint aChar)

   147 /**

   148 Retrieve the low surrogate of a supplementary character.

   150 @param aChar The 32-bit code point value of a Unicode character.

   152 @return Low surrogate of aChar, if aChar is a supplementary character;

   153         zero, if aChar is not a supplementary character.

   154 */

   155 	{

   156 	return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF));

   157 	}

   159 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor

   160 EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode)

   161 	{

   162     UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue);

   163     }

   165 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor

   166 EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow)

   167 	{

   168 	const TInt length = aUnicode.Length();

   169 	const TUint16* unicode = aUnicode.Ptr();

   170 	const TUint16* guard = unicode + length;

   172 	TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr());

   173 	TUint8* foreignguard = foreign + aForeign.MaxLength();

   175 	//loop going through the character of the unicode descriptor

   176 	while (unicode < guard)

   177 		{

   178 		TUint32 unicodeChar = *unicode++;

   179 		if (IsHighSurrogate(unicodeChar))

   180 			{

   181 			if (unicode >= guard || !IsLowSurrogate(*unicode))

   182 				{

   183 				if (foreign >= foreignguard)

   184 					{

   185                     aForeign.SetLength(foreign-aForeign.Ptr());

   186 					if (leaveWhenOverflow)

   187 						User::Leave(KErrOverflow);

   188                     else

   189                     	return KErrOverflow;

   190 					}

   191 				*foreign++ = KForeignReplacement;

   192 				continue;

   193 				}

   194 			unicodeChar = JoinSurrogate(unicodeChar, *unicode++);

   195 			}

   196 		if (IsLowSurrogate(unicodeChar))

   197 			{

   198 			if (foreign >= foreignguard)

   199 				{

   200 				aForeign.SetLength(foreign-aForeign.Ptr());

   201 				if (leaveWhenOverflow)

   202 					User::Leave(KErrOverflow);

   203 				else

   204 					return KErrOverflow;

   205 				}

   206 			*foreign++ = KForeignReplacement;

   207 			continue;

   208 			}

   210 		TUint8 b1, b2, b3, b4;		// byte 1,2,3,4 of result GB18030 code.

   211 		TInt count;					// byte count of result GB18030 code; can be 1, 2 or 4.

   213 		// unicode to cp54936

   214 		if (IsSupplementary(unicodeChar))

   215 			{

   216 			unicodeChar -= 0x10000;

   217 			b4 = unicodeChar % 10 + KU10000Byte4;

   218 			unicodeChar /= 10;

   219 			b3 = unicodeChar % 126 + KU10000Byte3;

   220 			unicodeChar /= 126;

   221 			b2 = unicodeChar % 10 + KU10000Byte2;

   222 			b1 = unicodeChar / 10 + KU10000Byte1;

   223 			count = 4;

   224 			}

   225 		else

   226 			{

   227 			TUint32 foreignChar;

   228 			foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar];

   229 			b1 = ((foreignChar >> 24) & 0xFF);

   230 			b2 = ((foreignChar >> 16) & 0xFF);

   231 			b3 = ((foreignChar >> 8) & 0xFF);

   232 			b4 = (foreignChar & 0xFF);

   233 			count = 1;

   234 			if (b1)

   235 				{

   236 				count = 4;

   237 				}

   238 			else

   239 				{

   240 				__ASSERT_DEBUG(b2==0, Panic(EBadForeignCode));

   241 				if (b3)

   242 					{

   243 					count = 2;

   244 					}

   245 				}

   246 			}

   248 		if (foreign + count > foreignguard)

   249 			{

   250 			aForeign.SetLength(foreign-aForeign.Ptr());

   251             if (leaveWhenOverflow)

   252             	User::Leave(KErrOverflow);

   253             else

   254             	return KErrOverflow;

   255 			}

   256 		if (count == 4)

   257 			{

   258 			*foreign++ = b1;

   259 			*foreign++ = b2;

   260 			}

   261 		if (count >= 2)

   262 			*foreign++ = b3;

   263 		*foreign++ = b4;

   264 		}

   265 	aForeign.SetLength(foreign-aForeign.Ptr());

   266 	return KErrNone;

   267 	}

   270 //This function converts from foreign characters into unicode and adds them into a descriptor

   271 EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign)

   272 	{

   273     UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue);

   274     }

   276 //This function converts from foreign characters into unicode and adds them into a descriptor

   277 EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow)

   278 	{

   279 	const TInt foreignLength = aForeign.Length();

   280 	const TUint8* foreign = aForeign.Ptr();

   281 	const TUint8* guard = foreign + foreignLength;

   283 	TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr());

   284 	TUint16* unicodeguard = unicode + aUnicode.MaxLength();

   286 	TUint8 b1, b2, b3, b4;

   287 	enum TCodeType

   288 	{

   289 	E1Byte = 0,

   290 	E2Byte,

   291 	E4ByteBmp,

   292 	E4ByteSupplementary,

   293 	EError,

   294 	};

   295 	TCodeType codetype;

   296 	TUint32 unicodeChar;

   298 	//loop going through the characters of the foreign descriptor

   299 	while (foreign < guard)

   300 		{

   301 		// roughly, detect which area the foreign code belongs to

   302 		b1 = *foreign++;

   303 		if (b1 <= 0x7F)

   304 			codetype = E1Byte;

   305 		else if (b1 == 0x80 || b1 > 0xFE)

   306 			codetype = EError;

   307 		else if (foreign >= guard)

   308 			codetype = EError;

   309 		else

   310 			{

   311 			b2 = *foreign++;

   312 			if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F)

   313 				codetype = E2Byte;

   314 			else if (b2 < 0x30 || b2 > 0x39)

   315 				codetype = EError;

   316 			else if (foreign+1 >= guard)

   317 				codetype = EError;

   318 			else

   319 				{

   320 				b3 = *foreign++;

   321 				if (b3 < 0x81 || b3 > 0xFE)

   322 					codetype = EError;

   323 				else

   324 					{

   325 					b4 = *foreign++;

   326 					if (b4 < 0x30 || b4 > 0x39)

   327 						codetype = EError;

   328 					else if (b1 >= 0x81 && b1 <= 0x84)		// 0x81308130-0x8439FE39

   329 						codetype = E4ByteBmp;

   330 					else if (b1 >= 0x90 && b1 <= 0xE3)		// 0x90308130-0xE339FE39

   331 						codetype = E4ByteSupplementary;

   332 					else

   333 						codetype = EError;					// others are reserved

   334 					}

   335 				}

   336 			}

   338 		// cp54936 to unicode

   339 		if (codetype == E1Byte)

   340 			{

   341 			unicodeChar = b1;

   342 			}

   343 		else if (codetype == E2Byte)

   344 			{

   345 			// conventional algorithm used in FatCharsetConv

   346 			const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80);

   347 			if (structPtr->iUnicodeIfSingle)

   348 				unicodeChar = structPtr->iUnicodeIfSingle;

   349 			else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte)

   350 				unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)];

   351 			else

   352 				unicodeChar = 0xFFFD;

   353 			}

   354 		else if (codetype == E4ByteBmp)

   355 			{

   356 			TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30);

   357 			__ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange));

   358 			unicodeChar = KMappingTable4ByteBmp2Unicode[index];

   359 			}

   360 		else if (codetype == E4ByteSupplementary)

   361 			{

   362 			unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 +

   363 									(b2 - KU10000Byte2) * 1260 +

   364 									(b3 - KU10000Byte3) * 10 +

   365 									(b4 - KU10000Byte4);

   366 			__ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue));

   367 			}

   368 		else

   369 			{

   370 			unicodeChar = 0xFFFD;

   371 			}

   373 		// append to output buffer

   374 		if (IsSupplementary(unicodeChar))

   375 			{

   376 			if (unicode + 1 >= unicodeguard)

   377 				{

   378 				aUnicode.SetLength(unicode-aUnicode.Ptr());

   379 				if (leaveWhenOverflow)

   380 					User::Leave(KErrOverflow);

   381 				else

   382 					return KErrOverflow;

   383 				}

   384 			*unicode++ = GetHighSurrogate(unicodeChar);

   385 			*unicode++ = GetLowSurrogate(unicodeChar);

   386 			}

   387 		else

   388 			{

   389 			if (unicode >= unicodeguard)

   390 				{

   391 				aUnicode.SetLength(unicode-aUnicode.Ptr());

   392                 if (leaveWhenOverflow)

   393                 	User::Leave(KErrOverflow);

   394                 else

   395                 	return KErrOverflow;

   396 				}

   397 			*unicode++ = unicodeChar;

   398 			}

   399 		}

   400 	aUnicode.SetLength(unicode-aUnicode.Ptr());

   401 	return KErrNone;

   402 	}

   404 EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter)

   405 	{

   406 	//1. aCharacter >= 0x0080

   407 	if (aCharacter>=0x0080)

   408 		{

   409 		// Since all Unicode characters can be mapped to GB18030, so no need to

   410 		// test the converting.

   411 		if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter))

   412 			return ETrue;

   413 		else

   414 			return EFalse;

   415 		}

   417     // For most common cases:

   418     // Note: lower case characters are considered legal DOS char here.

   419 	if ((aCharacter>='a' && aCharacter<='z') ||

   420 	    (aCharacter>='A' && aCharacter<='Z') ||

   421 	    (aCharacter>='0' && aCharacter<='9'))

   422 			{

   423 			return ETrue;

   424 			}

   425     // Checking for illegal chars:

   426     // 2. aCharacter <= 0x20

   427     // Note: leading 0x05 byte should be guarded by callers of this function

   428     //  as the information of the position of the character is required.

   429 	if (aCharacter < 0x20)

   430 		return EFalse;

   431 	// Space (' ') is not considered as a legal DOS char here.

   432 	if (aCharacter == 0x20)

   433 		return EFalse;

   435 	// 3. 0x20 < aCharacter < 0x80

   436     // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name":

   437     switch (aCharacter)

   438             {

   439             case 0x22:        // '"'

   440             case 0x2A:        // '*'

   441             case 0x2B:        // '+'

   442             case 0x2C:        // ','

   443             //case 0x2E:        // '.'   // Although '.' is not allowed in any bytes of DIR_Name, it

   444                                          // is a valid character in short file names.

   445             case 0x2F:        // '/'

   446             case 0x3A:        // ':'

   447             case 0x3B:        // ';'

   448             case 0x3C:        // '<'

   449             case 0x3D:        // '='

   450             case 0x3E:        // '>'

   451             case 0x3F:        // '?'

   452             case 0x5B:        // '['

   453             case 0x5C:        // '\'

   454             case 0x5D:        // ']'

   455             case 0x7C:        // '|'

   456             	return EFalse;

   457             default:

   458             	return ETrue;

   459             }

   460 	}

author	sl@SLION-WIN7.fritz.box
	Fri, 15 Jun 2012 03:10:57 +0200
changeset 0	bde4ae8d615e
permissions	-rw-r--r--