Symaptic: os/security/securityanddataprivacytools/securitytools/certapp/store--/utf.cpp@260cb5ec6c19

     1 /*

     2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).

     3 * All rights reserved.

     4 * This component and the accompanying materials are made available

     5 * under the terms of the License "Eclipse Public License v1.0"

     6 * which accompanies this distribution, and is available

     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".

8 *

     9 * Initial Contributors:

    10 * Nokia Corporation - initial contribution.

    11 *

    12 * Contributors:

    13 *

    14 * Description:

    15 *

    16 */

    19 #include <e32std.h>

    20 #include <e32base.h>

    21 #include <utf.h>

    // Function-style spellings of the C++ named casts; the code in this file
    // writes its casts through these macros rather than using the keywords directly.
    #define STATIC_CAST(t,v) static_cast<t>(v)
    #define CONST_CAST(t,v) const_cast<t>(v)
    // Loop header with no exit condition of its own: a FOREVER loop can only be
    // left through an explicit break or return inside its body.
    #define FOREVER for(;;)

    // Judging by its name, a sentinel meaning "this character is not in the Base64
    // alphabet". NOTE(review): nothing in the visible part of this file reads it --
    // confirm whether any code still depends on it before removing.
    const TUint KNotInBase64Alphabet=KMaxTUint;

    /** Reason codes handed to Panic() by the debug assertions in this file.

    The numbering starts at 1 and every later enumerator takes the next value, so
    the order of the enumerators below must not change: it fixes the numeric
    reason reported with each panic (EPanicBad6BitNumber is 1,
    EPanicOutOfSyncBase64Decoding is 53). */
    enum TPanic
    	{
    	// 1
    	EPanicBad6BitNumber=1,

    	// 2..12
    	EPanicBadUtf7Pointers1, EPanicBadUtf7Pointers2, EPanicBadUtf7Pointers3,
    	EPanicBadUtf7Pointers4, EPanicBadUtf7Pointers5, EPanicBadUtf7Pointers6,
    	EPanicBadUtf7Pointers7, EPanicBadUtf7Pointers8, EPanicBadUtf7Pointers9,
    	EPanicBadUtf7Pointers10, EPanicBadUtf7Pointers11,

    	// 13
    	EPanicNotInBase64Block,

    	// 14..23
    	EPanicBadUnicodePointers1, EPanicBadUnicodePointers2, EPanicBadUnicodePointers3,
    	EPanicBadUnicodePointers4, EPanicBadUnicodePointers5, EPanicBadUnicodePointers6,
    	EPanicBadUnicodePointers7, EPanicBadUnicodePointers8, EPanicBadUnicodePointers9,
    	EPanicBadUnicodePointers10,

    	// 24..40
    	EPanicBadBitBufferState1, EPanicBadBitBufferState2, EPanicBadBitBufferState3,
    	EPanicBadBitBufferState4, EPanicBadBitBufferState5, EPanicBadBitBufferState6,
    	EPanicBadBitBufferState7, EPanicBadBitBufferState8, EPanicBadBitBufferState9,
    	EPanicBadBitBufferState10, EPanicBadBitBufferState11, EPanicBadBitBufferState12,
    	EPanicBadBitBufferState13, EPanicBadBitBufferState14, EPanicBadBitBufferState15,
    	EPanicBadBitBufferState16, EPanicBadBitBufferState17,

    	// 41..43
    	EPanicUnexpectedNumberOfLoopIterations,
    	EPanicInitialEscapeCharacterButNoBase64,
    	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,

    	// 44..50
    	EPanicBadUtf8Pointers1, EPanicBadUtf8Pointers2, EPanicBadUtf8Pointers3,
    	EPanicBadUtf8Pointers4, EPanicBadUtf8Pointers5, EPanicBadUtf8Pointers6,
    	EPanicBadUtf8Pointers7,

    	// 51..53
    	EPanicOutOfSyncUtf7Byte1,
    	EPanicOutOfSyncUtf7Byte2,
    	EPanicOutOfSyncBase64Decoding
    	};

    86 _LIT(KLitPanicText, "CHARCONV-UTF");

    88 LOCAL_C void Panic(TPanic aPanic)

    89 	{

    90 	User::Panic(KLitPanicText, aPanic);

    91 	}

    93 inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';}

    95 inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)

    96 	{

    97 	return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0;

    98 	}

   107 /** Converts Unicode text into UTF-8 encoding.

   109 @param aUtf8 On return, contains the UTF-8 encoded output string.

   110 @param aUnicode The Unicode-encoded input string.

   111 @return The number of unconverted characters left at the end of the input

   112 descriptor, or one of the error values defined in TError. */

   113 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)

   114 	{

   115 	return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);

   116 	}

   120 /** Converts Unicode text into UTF-8 encoding.

   122 Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.

   124 The variant of UTF-8 used internally by Java differs slightly from standard

   125 UTF-8. The TBool argument controls the UTF-8 variant generated by this function.

   127 @param aUtf8 On return, contains the UTF-8 encoded output string.

   128 @param aUnicode A UCS-2 encoded input string.

   129 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java

   130 UTF-8. The default is EFalse.

   131 @return The number of unconverted characters left at the end of the input descriptor,

   132 or one of the error values defined in TError. */

   133 TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8,

   134 											   const TDesC16& aUnicode,

   135 											   TBool aGenerateJavaConformantUtf8)

   136 	{

   137 	if (aUnicode.Length() == 0)

   138 		{

   139 		aUtf8.SetLength(0);

   140 		return 0;

   141 		}

   142 	if (aUtf8.MaxLength() == 0)

   143 		{

   144 		return aUnicode.Length();

   145 		}

   147 	TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());

   148 	const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);

   149 	TBool inputIsTruncated = EFalse;

   150 	const TUint16* pUnicode = aUnicode.Ptr();

   151 	const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);

   153 	FOREVER

   154 		{

   155 		__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));

   156 		__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));

   158 		if (pUnicode[0] < 0x80)

   159 			{

   160 			// ascii - 1 byte

   162 			// internally java is different since the \x0000 character is

   163 			// translated into \xC0 \x80.

   165 			if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))

   166 				{

   167 				if (pUtf8 == pointerToLastUtf8Byte)

   168 					{

   169 					pUtf8--;

   170 					pUnicode--;

   171 					break;

   172 					}

   173 				*pUtf8++ = STATIC_CAST(TUint8, 0xc0);

   174 				*pUtf8   = STATIC_CAST(TUint8, 0x80);

   175 				}

   176 			else

   177 				{

   178 				*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);

   179 				}

   180 			}

   181 		else if (pUnicode[0] < 0x800)

   182 			{

   183 			// U+0080..U+07FF - 2 bytes

   185 			if (pUtf8 == pointerToLastUtf8Byte)

   186 				{

   187 				pUtf8--;

   188 				pUnicode--;

   189 				break;

   190 				}

   192 			*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));

   193 			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));

   195 			}

   197 		// check to see if we have a surrogate in the stream, surrogates encode code points outside

   198 		// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.

   200 		else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)

   201 			{

   202 			// surrogate pair - 4 bytes in utf-8

   203 			// U+10000..U+10FFFF

   205 			__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));

   206 			// is there enough space to hold the character

   207 			if ((pointerToLastUtf8Byte - pUtf8) < 3)

   208 				{

   209 				pUtf8--;

   210 				pUnicode--;

   211 				break;  // no go to the exit condition

   212 				}

   214 			__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));

   215 			if (pUnicode >= pointerToLastUnicodeCharacter)

   216 				{

   217 				pUtf8--;

   218 				pUnicode--;

   219 				inputIsTruncated = ETrue;

   220 				break; // middle of a surrogate pair. go to end condition

   221 				}

   223 			if ((pUnicode[1] & 0xfc00) != 0xdc00)

   224 				{

   225 				return EErrorIllFormedInput;

   226 				}

   228 			// convert utf-16 surrogate to utf-32

   229 			TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;

   231 			// convert utf-32 to utf-8

   232             *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));

   233             *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));

   234             *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));

   235             *pUtf8   = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));

   237             // we consumed 2 utf-16 values, move this pointer

   238 			pUnicode++;

   239 			}

   240 		else

   241 			{

   242 			// 3 byte - utf-8, U+800..U+FFFF rest of BMP.

   244 			if (pointerToLastUtf8Byte - pUtf8 < 2)

   245 				{

   246 				pUtf8--;

   247 				pUnicode--;

   248 				break;

   249 				}

   250 			*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));

   251 			*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));

   252 			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));

   253 			}

   255 		if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))

   256 			{

   257 			break;

   258 			}

   260 		pUtf8++;

   261 		pUnicode++;

   263 		}

   265 	if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)

   266 		{

   267 		return EErrorIllFormedInput;

   268 		}

   270 	aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);

   271 	return pointerToLastUnicodeCharacter-pUnicode;

   272 	}

   284 /** Converts text encoded using the Unicode transformation format UTF-8 into the

   285 Unicode UCS-2 character set.

   287 @param aUnicode On return, contains the Unicode encoded output string.

   288 @param aUtf8 The UTF-8 encoded input string

   289 @return The number of unconverted bytes left at the end of the input descriptor,

   290 or one of the error values defined in TError. */

   291 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)

   292 	{

   293 	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);

   294 	}

   296 static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,

   297 		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)

   298 	{

   299 	if (aNumberOfUnconvertibleCharacters<=0)

   300 		{

   301 		aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;

   302 		}

   303 	++aNumberOfUnconvertibleCharacters;

   304 	}

   306 /** Converts text encoded using the Unicode transformation format UTF-8 into the

   307 Unicode UCS-2 character set.

   309 @param aUnicode On return, contains the Unicode encoded output string.

   310 @param aUtf8 The UTF-8 encoded input string

   311 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java

   312 @return The number of unconverted bytes left at the end of the input descriptor,

   313 or one of the error values defined in TError. */

   314 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)

   315 	{

   316 	TInt dummyUnconverted, dummyUnconvertedIndex;

   317 	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);

   318 	}

   320 /** Converts text encoded using the Unicode transformation format UTF-8 into the

   321 Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.

   323 The variant of UTF-8 used internally by Java differs slightly from standard

   324 UTF-8. The TBool argument controls the UTF-8 variant generated by this function.

   326 @param aUnicode On return, contains the Unicode encoded output string.

   327 @param aUtf8 The UTF-8 encoded input string

   328 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java

   329 UTF-8. The default is EFalse.

   330 @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes

   331 which were not converted.

   332 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index

   333 of the first byte of the first unconvertible character. For instance if the

   334 first character in the input descriptor (aForeign) could not be converted,

   335 then this parameter is set to the first byte of that character, i.e. zero.

   336 A negative value is returned if all the characters were converted.

   337 @return The number of unconverted bytes left at the end of the input descriptor,

   338 or one of the error values defined in TError. */

   340 /* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7

   341  * Well formed UTF-8 Byte Sequences, full table.

   342  * +----------------------------------------------------------------+

   343  * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |

   344  * +--------------------+----------+----------+----------+----------+

   345  * | U+0000..U+007F     | 00..7D   |          |          |          |  1 byte, ascii

   346  * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2

   347  * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0

   348  * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal

   349  * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F

   350  * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal

   351  * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90

   352  * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal

   353  * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F

   354  * +--------------------+----------+----------+----------+----------+

   355  *

   356  * As a consequence of the well-formedness conditions specified in table 3-7,

   357  * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.

   358  */

   359 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,

   360 		TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)

   361 	{

   362 	aUnicode.SetLength(0);

   364 	if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))

   365 		{

   366 		return aUtf8.Length();

   367 		}

   369 	TUint16*           pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());

   370 	const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);

   371 	const TUint8*         pUtf8 = aUtf8.Ptr();

   372 	const TUint8*     pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);

   373 	const TUint16 replacementcharacter = 0xFFFD;

   374 	TUint currentUnicodeCharacter;

   375 	TUint sequenceLength;

   378 	FOREVER

   379 		{

   380 		TBool illFormed=EFalse;

   382 		__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));

   383 		__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));

   385 		sequenceLength = 1;

   387 		// ascii - optimisation (i.e. it isn't a sequence)

   388 		if (pUtf8[0] < 0x80)

   389 			{

   390 			currentUnicodeCharacter = pUtf8[0];

   391 			}

   392 		else

   393 			{

   394 			// see if well formed utf-8, use table above for reference

   395 			if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))

   396 				{

   397 				// 0xc1-0xc2 are not valid bytes

   398 				sequenceLength = 2;

   399 				}

   400 			else if ((pUtf8[0] & 0xf0) == 0xe0)

   401 				{

   402 				sequenceLength = 3;

   403 				}

   404 			else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))

   405 				{

   406 				// 0xf5-0xff, are not valid bytes

   407 				sequenceLength = 4;

   408 				}

   409 			else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)

   410 				{

   411 				if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))

   412 					{

   413 					// either we've split the 0xc0 0x80 (i.e. 0xc0 is

   414 					// the last character in the string) or we've

   415 					// discovered a valid 0xc0 0x80 sequence.

   416 					sequenceLength = 2;

   417 					}

   418 				}

   420 			/* checking to see if we got a valid sequence */

   421 			if (sequenceLength == 1)

   422 				{

   423 				// bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example

   424 				currentUnicodeCharacter = replacementcharacter;

   425 				UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,

   426 						aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());

   427 				}

   428 			else

   429 				{

   430 				// this is a check to see if the sequence goes beyond the input

   431 				// stream.  if its not the first and only character in the input

   432 				// stream this isn't an error, otherwise it is.

   433 				if ((pUtf8 + sequenceLength - 1) >  pLastUtf8)

   434 					{

   435 					// check to see if this sequence was the first character

   436 					if ((pUnicode - aUnicode.Ptr()) == 0)

   437 						{

   438 						return EErrorIllFormedInput;

   439 						}

   440 					break;

   441 					}

   443 				currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);

   445 				/* check the trailing bytes, they should begin with 10 */

   446 				TUint i = 1;

   448 				do

   449 					{

   450 					if ((pUtf8[i] & 0xc0) == 0x80)

   451 						{

   452 						// add the trailing 6 bits to the current unicode char

   453 						currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);

   454 						}

   455 					else

   456 						{

   457 						// ill formed character (doesn't have a lead 10)

   458 						currentUnicodeCharacter = replacementcharacter;

   459 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,

   460 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());

   461 						illFormed=ETrue;

   462 						break;

   463 						}

   464 					i++;

   465 					}

   466 				while (i < sequenceLength);

   467 				}

   469 			/* conformance check.  bits of above table for reference.

   470 			 * +----------------------------------------------------------------+

   471 			 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |

   472 			 * +--------------------+----------+----------+----------+----------+

   473 			 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, 2nd < 0xA0

   474 			 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, 2nd > 0x9F

   475 			 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, 2nd < 0x90

   476 			 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, 2nd > 0x8F

   477 			 * +--------------------+----------+----------+----------+----------+

   478 			 */

   480 			if (currentUnicodeCharacter != replacementcharacter)

   481 				{

   482 				if (sequenceLength == 3)

   483 					{

   484 					if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))

   485 						{

   486 						currentUnicodeCharacter = replacementcharacter;

   487 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,

   488 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());

   489 						illFormed=ETrue;

   490 						}

   491 					else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))

   492 						{

   493 						currentUnicodeCharacter = replacementcharacter;

   494 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,

   495 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());

   496 						illFormed=ETrue;

   497 						}

   498 					}

   499 				else if (sequenceLength == 4)

   500 					{

   501 					if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))

   502 						{

   503 						currentUnicodeCharacter = replacementcharacter;

   504 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,

   505 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());

   506 						illFormed=ETrue;

   507 						}

   508 					else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))

   509 						{

   510 						currentUnicodeCharacter = replacementcharacter;

   511 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,

   512 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());

   513 						illFormed=ETrue;

   514 						}

   515 					}

   518 				/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points

   519 				 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code

   520 				 * points D800..DFFF is ill formed */

   522 				if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))

   523 					{

   524 					currentUnicodeCharacter = replacementcharacter;

   525 					UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,

   526 							aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());

   527 					illFormed=ETrue;

   528 					}

   529 				}

   530 				// end conformance check

   531 			}

   533 		// would this character generate a surrogate pair in UTF-16?

   534 		if (currentUnicodeCharacter > 0xFFFF)

   535 			{

   536 			// is there enough space to hold a surrogate pair in the output?

   537 			if (pUnicode >= pLastUnicode)

   538 				{

   539 				break; // no, end processing.

   540 				}

   542 			TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;

   543 			*pUnicode++ = STATIC_CAST(TUint16, surrogate);

   545 			surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;

   546 			*pUnicode++ = STATIC_CAST(TUint16, surrogate);

   547 			}

   548 		else

   549 			{

   550 			*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);

   551 			}

   553 		// move the input pointer

   554 		if (currentUnicodeCharacter != replacementcharacter)

   555 			{

   556 			pUtf8 += sequenceLength;

   557 			}

   558 		else if(illFormed == EFalse)

   559 			{

   560 			pUtf8 += (sequenceLength);

   561 			}

   562 		else

   563 			{

   564 			// we had a character we didn't recognize (i.e. it was invalid)

   565 			// so move to the next character in the input

   566 			pUtf8++;

   567 			}

   569 		if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))

   570 			{

   571 			break;  // we've either reached the end of the input or the end of output

   572 			}

   573 		}

   575 	aUnicode.SetLength(pUnicode - aUnicode.Ptr());

   576 	return (pLastUtf8 - pUtf8 + 1);

   577 	}

   579 /** Given a sample text this function attempts to determine whether or not

   580  *  the same text is encoded using the UTF-8 standard encoding scheme.

   582 @param TInt a confidence level, given at certain value.  if the given sample

   583 			is UTF-8 this value will not be changed (unless > 100) then its

   584 			set to 100.  Otherwise if the same isn't UTF-8, its set to 0.

   585 @param TDesC8 sample text.

   586 UTF-8. The default is EFalse.

   587 @return void

   588  */

   590 /* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7

   591  * Well formed UTF-8 Byte Sequences, full table.

   592  * +----------------------------------------------------------------+

   593  * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |

   594  * +--------------------+----------+----------+----------+----------+

   595  * | U+0000..U+007F     | 00..7D   |          |          |          |  1 byte, ascii

   596  * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2

   597  * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0

   598  * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal

   599  * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F

   600  * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal

   601  * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90

   602  * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal

   603  * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F

   604  * +--------------------+----------+----------+----------+----------+

   605  *

   606  * As a consequence of the well-formedness conditions specified in table 3-7,

   607  * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.

   608  *

   609  * Code Rules:

   610  *   R1: If the string contains any non-UTF-8 characters the returned confidence

   611  *       is 0.  Valid UTF-8 combinations are listed in the above table.

   612  *   R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in

   613  *       the (see ) the returned confidence is 95.

   614  *   R3: Otherwise the confidence returned is based upon the sample string

   615  *       length.

   616  *   R4: If the sample string is under 75 characters, the confidence is set to

   617  *       75.

   618  */

   619 void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)

   620 	{

   622 	TInt sampleLength = aSample.Length();

   624 	if (sampleLength == 0)

   625 		{

   626 		aConfidenceLevel = 89;

   627 		return;

   628 		}

   629 	TInt bytesRemaining  = 0;

   630 	TUint sequenceLength  = 0;

   632 	aConfidenceLevel = sampleLength;

   634 	const TUint8* buffer = &aSample[0];

   636 	if (sampleLength < 95)

   637 		{

   638 		// check for the BOM

   639 		if ((sampleLength >= 3) &&

   640 			((buffer[0] == 0xEF) &&

   641 			 (buffer[1] == 0xBB) &&

   642 			 (buffer[2] == 0xBF))

   643 			)

   644 			{

   645 			aConfidenceLevel = 95;

   646 			}

   647 		else if (sampleLength < 75)

   648 			{

   649 			aConfidenceLevel = 75;

   650 			}

   651 		}

   653 	for (TInt index = 0;index != sampleLength;index++)

   654 		{

   656 		if (bytesRemaining > 0)

   657 			{

   658 			// bytesRemaining > 0, means that a byte representing the start of a

   659 			// multibyte sequence was encountered and the bytesRemaining is the

   660 			// number of bytes to follow.

   662 			if ((buffer[index] & 0xc0) == 0x80)

   663 				{

   664 				// need to check for ill-formed sequences -- all are in the 2nd byte

   666 				if ((sequenceLength == 3) && (bytesRemaining == 2))

   667 					{

   668 					if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))

   669 						{

   670 						aConfidenceLevel = 0;

   671 						break;

   672 						}

   673 					else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))

   674 						{

   675 						aConfidenceLevel = 0;

   676 						break;

   677 						}

   678 					}

   679 				else if ((sequenceLength == 4) && (bytesRemaining == 3))

   680 					{

   681 					if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))

   682 						{

   683 						aConfidenceLevel = 0;

   684 						break;

   685 						}

   686 					else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))

   687 						{

   688 						aConfidenceLevel = 0;

   689 						break;

   690 						}

   691 					}

   693 				--bytesRemaining;

   694 				continue;

   695 				}

   696 			else

   697 				{

   698 				aConfidenceLevel = 0;

   699 				break;

   700 				}

   701 			}

   703 		if (bytesRemaining == 0)

   704 			{

   705 			if (buffer[index] < 0x80)

   706 				{

   707 				// The value of aSample[index] is in the range 0x00-0x7f

   708 				//UTF8 maintains ASCII transparency. So it's a valid

   709 				//UTF8. Do nothing, check next value.

   710 				continue;

   711 				}

   712 			else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))

   713 				{

   714 				// valid start of a 2 byte sequence (see conformance note)

   715 				sequenceLength = 2;

   716 				bytesRemaining = 1;

   717 				}

   718 			else if ((buffer[index] & 0xf0) == 0xe0)

   719 				{

   720 				// valid start of a 3 byte sequence

   721 				sequenceLength = 3;

   722 				bytesRemaining = 2;

   723 				}

   724 			else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))

   725 				{

   726 				// valid start of a 4 byte sequence (see conformance note)

   727 				sequenceLength = 4;

   728 				bytesRemaining = 3;

   729 				}

   730 			else

   731 				{

   732 				// wasn't anything expected so must be an illegal/irregular UTF8 coded value

   733 				aConfidenceLevel = 0;

   734 				break;

   735 				}

   736 			}

   737 		} // for

   739 	aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;

   740 	}

   742 // End of file

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--