os/textandloc/charconvfw/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
     1 /*
     2 * Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: 
    15 *
    16 */
     17 // There are 2 reasons for not using the existing unicodeconv.cpp:
    18 // 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially
    19 //    for huge code pages (e.g, Asia code pages). See INC127598.
    20 //
    21 // 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle.
    22 //
    23 // The algorithm of this special version unicodeconv.cpp is straightforward:
    24 // 1) foreign->unicode:
    25 //    1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in
    26 //              "cp54936_2byte_tounicode.cpp", which is generated with command
    27 //              "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt".
    28 //
    29 //    1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then
    30 //              search into the mapping table in "cp54936_4byte_tounicode.cpp",
    31 //              which is generated with command
    32 //              "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt".
    33 //
    34 //    1.3) 4 byte->unicode non-bmp: calculate with formula in this file.
    35 //
    36 // 2) unicode->foreign:
    37 //    2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp"
    38 //              can map directly, which is generated with command
    39 //              "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt".
    40 //
    41 //    2.2) unicode non-bmp->4 byte: calculate with formula in this file.
    42 //
     43 // The function cp54936_2byte_tounicode.cpp::TConvDataStruct::
     44 // ConvertSingleUnicode() is not used anymore. It is kept only to avoid
     45 // changing the tool FatConversionTable.pl.
    46 //
    47 // About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt":
     48 // 1) All Private Use Area (PUA) code points are reserved.
     49 // 2) All GB18030 code points that map to undefined Unicode are reserved.
    50 //
    51 //
    52 // About the formula for non-bmp calculation:
    53 // 1) All code points from 0x10000 to 0x10FFFF are supported.
     54 // 2) The mappings for code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are
     55 //    derived from the GB18030 standard, since the standard does not define
     56 //    the mapping for code points outside 0x20000-0x2FFFF.
    57 
    58 
    59 #include <e32std.h>
    60 #include <e32def.h>
    61 #include <e32des8.h> 
    62 #include "unicodeconv.h"
    63 #include "cp54936.h"
    64 
    65 
    66 enum TFccPanic
    67 	{
    68 	EBadForeignCode = 0,
    69 	E4ByteIndexOutOfRange,
    70 	EPanicBadIndices1,
    71 	EInavlidUnicodeValue
    72 	};
    73 void Panic(TFccPanic aPanic)
    74 	{
    75 
    76 	User::Panic(_L("FatCharsetConv"),aPanic);
    77 	}
    78 
    79 
    80 //replacement character to be used when unicode cannot be converted
    81 const TUint8 KForeignReplacement = 0x5F;
    82 
    83 const TUint8 KU10000Byte1 = 0x90;
    84 const TUint8 KU10000Byte2 = 0x30;
    85 const TUint8 KU10000Byte3 = 0x81;
    86 const TUint8 KU10000Byte4 = 0x30;
    87 
    88 inline TBool IsSupplementary(TUint aChar)
    89 /**
    90 @param aChar The 32-bit code point value of a Unicode character.
    91 
    92 @return True, if aChar is supplementary character; false, otherwise.
    93 */
    94 	{
    95 	return (aChar > 0xFFFF);
    96 	}
    97 
    98 inline TBool IsSurrogate(TText16 aInt16)
    99 /**
   100 @return True, if aText16 is high surrogate or low surrogate; false, otherwise.
   101 */
   102 	{
   103 	return (aInt16 & 0xF800) == 0xD800;
   104 	}
   105 
   106 inline TBool IsHighSurrogate(TText16 aInt16)
   107 /**
   108 @return True, if aText16 is high surrogate; false, otherwise.
   109 */
   110 	{
   111 	return (aInt16 & 0xFC00) == 0xD800;
   112 	}
   113 
   114 inline TBool IsLowSurrogate(TText16 aInt16)
   115 /**
   116 @return True, if aText16 is low surrogate; false, otherwise.
   117 */
   118 	{
   119 	return (aInt16 & 0xFC00) == 0xDC00;
   120 	}
   121 
   122 inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate)
   123 /**
   124 Combine a high surrogate and a low surrogate into a supplementary character.
   125 
   126 @return The 32-bit code point value of the generated Unicode supplementary
   127         character.
   128 */
   129 	{
   130 	return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate;
   131 	}
   132 
   133 inline TText16 GetHighSurrogate(TUint aChar)
   134 /**
   135 Retrieve the high surrogate of a supplementary character.
   136 
   137 @param aChar The 32-bit code point value of a Unicode character.
   138 
   139 @return High surrogate of aChar, if aChar is a supplementary character; 
   140         aChar itself, if aChar is not a supplementary character.
   141 */
   142 	{
   143 	return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10));
   144 	}
   145 
   146 inline TText16 GetLowSurrogate(TUint aChar)
   147 /**
   148 Retrieve the low surrogate of a supplementary character.
   149 
   150 @param aChar The 32-bit code point value of a Unicode character.
   151 
   152 @return Low surrogate of aChar, if aChar is a supplementary character; 
   153         zero, if aChar is not a supplementary character.
   154 */
   155 	{
   156 	return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF));
   157 	}
   158 
   159 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
   160 EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode)
   161 	{
   162     UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue);
   163     }
   164 
   165 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
   166 EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow)
   167 	{
   168 	const TInt length = aUnicode.Length();
   169 	const TUint16* unicode = aUnicode.Ptr();
   170 	const TUint16* guard = unicode + length;
   171 	
   172 	TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr());
   173 	TUint8* foreignguard = foreign + aForeign.MaxLength();
   174 	
   175 	//loop going through the character of the unicode descriptor
   176 	while (unicode < guard)
   177 		{
   178 		TUint32 unicodeChar = *unicode++;
   179 		if (IsHighSurrogate(unicodeChar))
   180 			{
   181 			if (unicode >= guard || !IsLowSurrogate(*unicode))
   182 				{
   183 				if (foreign >= foreignguard)
   184 					{
   185                     aForeign.SetLength(foreign-aForeign.Ptr());
   186 					if (leaveWhenOverflow)
   187 						User::Leave(KErrOverflow);
   188                     else
   189                     	return KErrOverflow;
   190 					}
   191 				*foreign++ = KForeignReplacement;
   192 				continue;
   193 				}
   194 			unicodeChar = JoinSurrogate(unicodeChar, *unicode++);
   195 			}
   196 		if (IsLowSurrogate(unicodeChar))
   197 			{
   198 			if (foreign >= foreignguard)
   199 				{
   200 				aForeign.SetLength(foreign-aForeign.Ptr());
   201 				if (leaveWhenOverflow)
   202 					User::Leave(KErrOverflow);
   203 				else
   204 					return KErrOverflow;
   205 				}
   206 			*foreign++ = KForeignReplacement;
   207 			continue;
   208 			}
   209 		
   210 		TUint8 b1, b2, b3, b4;		// byte 1,2,3,4 of result GB18030 code.
   211 		TInt count;					// byte count of result GB18030 code; can be 1, 2 or 4.
   212 		
   213 		// unicode to cp54936
   214 		if (IsSupplementary(unicodeChar))
   215 			{
   216 			unicodeChar -= 0x10000;
   217 			b4 = unicodeChar % 10 + KU10000Byte4;
   218 			unicodeChar /= 10;
   219 			b3 = unicodeChar % 126 + KU10000Byte3;
   220 			unicodeChar /= 126;
   221 			b2 = unicodeChar % 10 + KU10000Byte2;
   222 			b1 = unicodeChar / 10 + KU10000Byte1;
   223 			count = 4;
   224 			}
   225 		else
   226 			{
   227 			TUint32 foreignChar;
   228 			foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar];
   229 			b1 = ((foreignChar >> 24) & 0xFF);
   230 			b2 = ((foreignChar >> 16) & 0xFF);
   231 			b3 = ((foreignChar >> 8) & 0xFF);
   232 			b4 = (foreignChar & 0xFF);
   233 			count = 1;
   234 			if (b1)
   235 				{
   236 				count = 4;
   237 				}
   238 			else
   239 				{
   240 				__ASSERT_DEBUG(b2==0, Panic(EBadForeignCode));
   241 				if (b3)
   242 					{
   243 					count = 2;
   244 					}
   245 				}
   246 			}
   247 		
   248 		if (foreign + count > foreignguard)
   249 			{
   250 			aForeign.SetLength(foreign-aForeign.Ptr());
   251             if (leaveWhenOverflow)
   252             	User::Leave(KErrOverflow);
   253             else
   254             	return KErrOverflow;
   255 			}
   256 		if (count == 4)
   257 			{
   258 			*foreign++ = b1;
   259 			*foreign++ = b2;
   260 			}
   261 		if (count >= 2)
   262 			*foreign++ = b3;
   263 		*foreign++ = b4;
   264 		}
   265 	aForeign.SetLength(foreign-aForeign.Ptr());
   266 	return KErrNone;
   267 	}
   268 
   269 
   270 //This function converts from foreign characters into unicode and adds them into a descriptor
   271 EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign)
   272 	{
   273     UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue);
   274     }
   275 
   276 //This function converts from foreign characters into unicode and adds them into a descriptor
   277 EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow)
   278 	{
   279 	const TInt foreignLength = aForeign.Length();
   280 	const TUint8* foreign = aForeign.Ptr();
   281 	const TUint8* guard = foreign + foreignLength;
   282 	
   283 	TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr());
   284 	TUint16* unicodeguard = unicode + aUnicode.MaxLength();
   285 	
   286 	TUint8 b1, b2, b3, b4;
   287 	enum TCodeType
   288 	{
   289 	E1Byte = 0,
   290 	E2Byte,
   291 	E4ByteBmp,
   292 	E4ByteSupplementary,
   293 	EError,
   294 	};
   295 	TCodeType codetype;
   296 	TUint32 unicodeChar;
   297 
   298 	//loop going through the characters of the foreign descriptor
   299 	while (foreign < guard)
   300 		{
   301 		// roughly, detect which area the foreign code belongs to
   302 		b1 = *foreign++;
   303 		if (b1 <= 0x7F)
   304 			codetype = E1Byte;
   305 		else if (b1 == 0x80 || b1 > 0xFE)
   306 			codetype = EError;
   307 		else if (foreign >= guard)
   308 			codetype = EError;
   309 		else
   310 			{
   311 			b2 = *foreign++;
   312 			if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F)
   313 				codetype = E2Byte;
   314 			else if (b2 < 0x30 || b2 > 0x39)
   315 				codetype = EError;
   316 			else if (foreign+1 >= guard)
   317 				codetype = EError;
   318 			else
   319 				{
   320 				b3 = *foreign++;
   321 				if (b3 < 0x81 || b3 > 0xFE)
   322 					codetype = EError;
   323 				else
   324 					{
   325 					b4 = *foreign++;
   326 					if (b4 < 0x30 || b4 > 0x39)
   327 						codetype = EError;
   328 					else if (b1 >= 0x81 && b1 <= 0x84)		// 0x81308130-0x8439FE39
   329 						codetype = E4ByteBmp;
   330 					else if (b1 >= 0x90 && b1 <= 0xE3)		// 0x90308130-0xE339FE39
   331 						codetype = E4ByteSupplementary;
   332 					else
   333 						codetype = EError;					// others are reserved
   334 					}
   335 				}
   336 			}
   337 		
   338 		// cp54936 to unicode
   339 		if (codetype == E1Byte)
   340 			{
   341 			unicodeChar = b1;
   342 			}
   343 		else if (codetype == E2Byte)
   344 			{
   345 			// conventional algorithm used in FatCharsetConv
   346 			const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80);
   347 			if (structPtr->iUnicodeIfSingle)
   348 				unicodeChar = structPtr->iUnicodeIfSingle;
   349 			else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte)
   350 				unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)];
   351 			else
   352 				unicodeChar = 0xFFFD;
   353 			}
   354 		else if (codetype == E4ByteBmp)
   355 			{
   356 			TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30);
   357 			__ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange));
   358 			unicodeChar = KMappingTable4ByteBmp2Unicode[index];
   359 			}
   360 		else if (codetype == E4ByteSupplementary)
   361 			{
   362 			unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 +
   363 									(b2 - KU10000Byte2) * 1260 +
   364 									(b3 - KU10000Byte3) * 10 +
   365 									(b4 - KU10000Byte4);
   366 			__ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue));
   367 			}
   368 		else
   369 			{
   370 			unicodeChar = 0xFFFD;
   371 			}
   372 		
   373 		// append to output buffer
   374 		if (IsSupplementary(unicodeChar))
   375 			{
   376 			if (unicode + 1 >= unicodeguard)
   377 				{
   378 				aUnicode.SetLength(unicode-aUnicode.Ptr());
   379 				if (leaveWhenOverflow)
   380 					User::Leave(KErrOverflow);
   381 				else
   382 					return KErrOverflow;
   383 				}
   384 			*unicode++ = GetHighSurrogate(unicodeChar);
   385 			*unicode++ = GetLowSurrogate(unicodeChar);
   386 			}
   387 		else
   388 			{
   389 			if (unicode >= unicodeguard)
   390 				{
   391 				aUnicode.SetLength(unicode-aUnicode.Ptr());
   392                 if (leaveWhenOverflow)
   393                 	User::Leave(KErrOverflow);
   394                 else
   395                 	return KErrOverflow;
   396 				}
   397 			*unicode++ = unicodeChar;
   398 			}
   399 		}
   400 	aUnicode.SetLength(unicode-aUnicode.Ptr());
   401 	return KErrNone;
   402 	}
   403 
   404 EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter)
   405 	{
   406 	//1. aCharacter >= 0x0080 
   407 	if (aCharacter>=0x0080)
   408 		{
   409 		// Since all Unicode characters can be mapped to GB18030, so no need to
   410 		// test the converting.
   411 		if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter))
   412 			return ETrue;
   413 		else
   414 			return EFalse;
   415 		}
   416 
   417     // For most common cases: 
   418     // Note: lower case characters are considered legal DOS char here. 
   419 	if ((aCharacter>='a' && aCharacter<='z') || 
   420 	    (aCharacter>='A' && aCharacter<='Z') || 
   421 	    (aCharacter>='0' && aCharacter<='9'))
   422 			{
   423 			return ETrue;
   424 			}
   425     // Checking for illegal chars: 
   426     // 2. aCharacter <= 0x20 
   427     // Note: leading 0x05 byte should be guarded by callers of this function 
   428     //  as the information of the position of the character is required. 
   429 	if (aCharacter < 0x20)
   430 		return EFalse;
   431 	// Space (' ') is not considered as a legal DOS char here.
   432 	if (aCharacter == 0x20)
   433 		return EFalse;
   434 	
   435 	// 3. 0x20 < aCharacter < 0x80 
   436     // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name": 
   437     switch (aCharacter) 
   438             { 
   439             case 0x22:        // '"' 
   440             case 0x2A:        // '*' 
   441             case 0x2B:        // '+' 
   442             case 0x2C:        // ',' 
   443             //case 0x2E:        // '.'   // Although '.' is not allowed in any bytes of DIR_Name, it 
   444                                          // is a valid character in short file names. 
   445             case 0x2F:        // '/' 
   446             case 0x3A:        // ':' 
   447             case 0x3B:        // ';' 
   448             case 0x3C:        // '<' 
   449             case 0x3D:        // '=' 
   450             case 0x3E:        // '>' 
   451             case 0x3F:        // '?' 
   452             case 0x5B:        // '[' 
   453             case 0x5C:        // '\' 
   454             case 0x5D:        // ']' 
   455             case 0x7C:        // '|' 
   456             	return EFalse; 
   457             default: 
   458             	return ETrue; 
   459             } 
   460 	}		
   461