os/security/securityanddataprivacytools/securitytools/certapp/store--/utf.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of the License "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: 
    15 *
    16 */
    17 
    18 
    19 #include <e32std.h>
    20 #include <e32base.h>
    21 #include <utf.h>
    22 
    23 #define STATIC_CAST(t,v) static_cast<t>(v)
    24 #define CONST_CAST(t,v) const_cast<t>(v)
    25 #define FOREVER for(;;)
    26 
    27 const TUint KNotInBase64Alphabet=KMaxTUint;
    28 
    29 enum TPanic
    30 	{
    31 	EPanicBad6BitNumber=1,
    32 	EPanicBadUtf7Pointers1,
    33 	EPanicBadUtf7Pointers2,
    34 	EPanicBadUtf7Pointers3,
    35 	EPanicBadUtf7Pointers4,
    36 	EPanicBadUtf7Pointers5,
    37 	EPanicBadUtf7Pointers6,
    38 	EPanicBadUtf7Pointers7,
    39 	EPanicBadUtf7Pointers8,
    40 	EPanicBadUtf7Pointers9,
    41 	EPanicBadUtf7Pointers10,
    42 	EPanicBadUtf7Pointers11,
    43 	EPanicNotInBase64Block,
    44 	EPanicBadUnicodePointers1,
    45 	EPanicBadUnicodePointers2,
    46 	EPanicBadUnicodePointers3,
    47 	EPanicBadUnicodePointers4,
    48 	EPanicBadUnicodePointers5,
    49 	EPanicBadUnicodePointers6,
    50 	EPanicBadUnicodePointers7,
    51 	EPanicBadUnicodePointers8,
    52 	EPanicBadUnicodePointers9,
    53 	EPanicBadUnicodePointers10,
    54 	EPanicBadBitBufferState1,
    55 	EPanicBadBitBufferState2,
    56 	EPanicBadBitBufferState3,
    57 	EPanicBadBitBufferState4,
    58 	EPanicBadBitBufferState5,
    59 	EPanicBadBitBufferState6,
    60 	EPanicBadBitBufferState7,
    61 	EPanicBadBitBufferState8,
    62 	EPanicBadBitBufferState9,
    63 	EPanicBadBitBufferState10,
    64 	EPanicBadBitBufferState11,
    65 	EPanicBadBitBufferState12,
    66 	EPanicBadBitBufferState13,
    67 	EPanicBadBitBufferState14,
    68 	EPanicBadBitBufferState15,
    69 	EPanicBadBitBufferState16,
    70 	EPanicBadBitBufferState17,
    71 	EPanicUnexpectedNumberOfLoopIterations,
    72 	EPanicInitialEscapeCharacterButNoBase64,
    73 	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
    74 	EPanicBadUtf8Pointers1,
    75 	EPanicBadUtf8Pointers2,
    76 	EPanicBadUtf8Pointers3,
    77 	EPanicBadUtf8Pointers4,
    78 	EPanicBadUtf8Pointers5,
    79 	EPanicBadUtf8Pointers6,
    80 	EPanicBadUtf8Pointers7,
    81 	EPanicOutOfSyncUtf7Byte1,
    82 	EPanicOutOfSyncUtf7Byte2,
    83 	EPanicOutOfSyncBase64Decoding
    84 	};
    85 
    86 _LIT(KLitPanicText, "CHARCONV-UTF");
    87 
    88 LOCAL_C void Panic(TPanic aPanic)
    89 	{
    90 	User::Panic(KLitPanicText, aPanic);
    91 	}
    92 
    93 inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';}
    94 
    95 inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
    96 	{
    97 	return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0;
    98 	}
    99 
   100 
   101 
   102 
   103 
   104 
   105  
   106 
   107 /** Converts Unicode text into UTF-8 encoding.
   108 
   109 @param aUtf8 On return, contains the UTF-8 encoded output string.
   110 @param aUnicode The Unicode-encoded input string.
   111 @return The number of unconverted characters left at the end of the input 
   112 descriptor, or one of the error values defined in TError. */
   113 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
   114 	{
   115 	return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
   116 	}
   117 
   118 
   119 
   120 /** Converts Unicode text into UTF-8 encoding. 
   121 
   122 Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.
   123 
   124 The variant of UTF-8 used internally by Java differs slightly from standard 
   125 UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
   126 
   127 @param aUtf8 On return, contains the UTF-8 encoded output string.
   128 @param aUnicode A UCS-2 encoded input string.
   129 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
   130 UTF-8. The default is EFalse.
   131 @return The number of unconverted characters left at the end of the input descriptor, 
   132 or one of the error values defined in TError. */
   133 TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, 
   134 											   const TDesC16& aUnicode, 
   135 											   TBool aGenerateJavaConformantUtf8)
   136 	{
   137 	if (aUnicode.Length() == 0)
   138 		{
   139 		aUtf8.SetLength(0);
   140 		return 0;
   141 		}
   142 	if (aUtf8.MaxLength() == 0)
   143 		{
   144 		return aUnicode.Length();
   145 		}
   146 	
   147 	TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
   148 	const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
   149 	TBool inputIsTruncated = EFalse;
   150 	const TUint16* pUnicode = aUnicode.Ptr();
   151 	const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
   152 	
   153 	FOREVER
   154 		{
   155 		__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
   156 		__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
   157 	
   158 		if (pUnicode[0] < 0x80)
   159 			{
   160 			// ascii - 1 byte
   161 			
   162 			// internally java is different since the \x0000 character is 
   163 			// translated into \xC0 \x80.
   164 			
   165 			if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
   166 				{
   167 				if (pUtf8 == pointerToLastUtf8Byte)
   168 					{
   169 					pUtf8--;
   170 					pUnicode--;
   171 					break;			
   172 					}
   173 				*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
   174 				*pUtf8   = STATIC_CAST(TUint8, 0x80);	
   175 				}
   176 			else
   177 				{
   178 				*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
   179 				}
   180 			}
   181 		else if (pUnicode[0] < 0x800)
   182 			{
   183 			// U+0080..U+07FF - 2 bytes
   184 			
   185 			if (pUtf8 == pointerToLastUtf8Byte)
   186 				{
   187 				pUtf8--;
   188 				pUnicode--;
   189 				break;
   190 				}
   191 			
   192 			*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
   193 			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
   194 			
   195 			}
   196 
   197 		// check to see if we have a surrogate in the stream, surrogates encode code points outside
   198 		// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.
   199 
   200 		else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
   201 			{
   202 			// surrogate pair - 4 bytes in utf-8
   203 			// U+10000..U+10FFFF
   204 			
   205 			__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
   206 			// is there enough space to hold the character
   207 			if ((pointerToLastUtf8Byte - pUtf8) < 3)
   208 				{
   209 				pUtf8--;
   210 				pUnicode--;
   211 				break;  // no go to the exit condition
   212 				}
   213 			
   214 			__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
   215 			if (pUnicode >= pointerToLastUnicodeCharacter)
   216 				{
   217 				pUtf8--;
   218 				pUnicode--;
   219 				inputIsTruncated = ETrue;
   220 				break; // middle of a surrogate pair. go to end condition
   221 				}
   222 			
   223 			if ((pUnicode[1] & 0xfc00) != 0xdc00)
   224 				{
   225 				return EErrorIllFormedInput;
   226 				}
   227 			
   228 			// convert utf-16 surrogate to utf-32
   229 			TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
   230 			
   231 			// convert utf-32 to utf-8
   232             *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));   
   233             *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
   234             *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
   235             *pUtf8   = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
   236 			
   237             // we consumed 2 utf-16 values, move this pointer
   238 			pUnicode++;
   239 			}		
   240 		else
   241 			{
   242 			// 3 byte - utf-8, U+800..U+FFFF rest of BMP.
   243 			
   244 			if (pointerToLastUtf8Byte - pUtf8 < 2)
   245 				{
   246 				pUtf8--;
   247 				pUnicode--;
   248 				break;
   249 				}
   250 			*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
   251 			*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
   252 			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
   253 			}
   254 		
   255 		if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
   256 			{
   257 			break;
   258 			}
   259 		
   260 		pUtf8++;
   261 		pUnicode++;
   262 		
   263 		}
   264 	
   265 	if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
   266 		{
   267 		return EErrorIllFormedInput;
   268 		}
   269 	
   270 	aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
   271 	return pointerToLastUnicodeCharacter-pUnicode;
   272 	}
   273 
   274 
   275 
   276 
   277 
   278  
   279 
   280 
   281 
   282 
   283 
   284 /** Converts text encoded using the Unicode transformation format UTF-8 into the 
   285 Unicode UCS-2 character set.
   286 
   287 @param aUnicode On return, contains the Unicode encoded output string.
   288 @param aUtf8 The UTF-8 encoded input string
   289 @return The number of unconverted bytes left at the end of the input descriptor, 
   290 or one of the error values defined in TError. */
   291 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
   292 	{
   293 	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
   294 	}
   295 
   296 static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
   297 		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
   298 	{
   299 	if (aNumberOfUnconvertibleCharacters<=0)
   300 		{
   301 		aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
   302 		}
   303 	++aNumberOfUnconvertibleCharacters;
   304 	}
   305 
   306 /** Converts text encoded using the Unicode transformation format UTF-8 into the 
   307 Unicode UCS-2 character set.
   308 
   309 @param aUnicode On return, contains the Unicode encoded output string.
   310 @param aUtf8 The UTF-8 encoded input string
   311 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
   312 @return The number of unconverted bytes left at the end of the input descriptor, 
   313 or one of the error values defined in TError. */
   314 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
   315 	{
   316 	TInt dummyUnconverted, dummyUnconvertedIndex;
   317 	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
   318 	}
   319 
   320 /** Converts text encoded using the Unicode transformation format UTF-8 into the 
   321 Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
   322 
   323 The variant of UTF-8 used internally by Java differs slightly from standard 
   324 UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
   325 
   326 @param aUnicode On return, contains the Unicode encoded output string.
   327 @param aUtf8 The UTF-8 encoded input string
   328 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
   329 UTF-8. The default is EFalse.
   330 @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes 
   331 which were not converted.
   332 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index 
   333 of the first byte of the first unconvertible character. For instance if the 
   334 first character in the input descriptor (aForeign) could not be converted, 
   335 then this parameter is set to the first byte of that character, i.e. zero. 
   336 A negative value is returned if all the characters were converted.
   337 @return The number of unconverted bytes left at the end of the input descriptor, 
   338 or one of the error values defined in TError. */
   339 
   340 /* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
   341  * Well formed UTF-8 Byte Sequences, full table.
   342  * +----------------------------------------------------------------+
   343  * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
   344  * +--------------------+----------+----------+----------+----------+
   345  * | U+0000..U+007F     | 00..7D   |          |          |          |  1 byte, ascii
   346  * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
   347  * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
   348  * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
   349  * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
   350  * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
   351  * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
   352  * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
   353  * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
   354  * +--------------------+----------+----------+----------+----------+
   355  * 
   356  * As a consequence of the well-formedness conditions specified in table 3-7,
   357  * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
   358  */
   359 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
   360 		TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
   361 	{	
   362 	aUnicode.SetLength(0);
   363 	
   364 	if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
   365 		{
   366 		return aUtf8.Length();
   367 		}
   368 
   369 	TUint16*           pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
   370 	const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
   371 	const TUint8*         pUtf8 = aUtf8.Ptr();   
   372 	const TUint8*     pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
   373 	const TUint16 replacementcharacter = 0xFFFD;
   374 	TUint currentUnicodeCharacter;
   375 	TUint sequenceLength;
   376 
   377 	
   378 	FOREVER
   379 		{
   380 		TBool illFormed=EFalse;
   381 		
   382 		__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
   383 		__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
   384 		
   385 		sequenceLength = 1;
   386 		
   387 		// ascii - optimisation (i.e. it isn't a sequence)
   388 		if (pUtf8[0] < 0x80)
   389 			{
   390 			currentUnicodeCharacter = pUtf8[0];
   391 			}
   392 		else
   393 			{
   394 			// see if well formed utf-8, use table above for reference	
   395 			if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
   396 				{
   397 				// 0xc1-0xc2 are not valid bytes
   398 				sequenceLength = 2;
   399 				}
   400 			else if ((pUtf8[0] & 0xf0) == 0xe0)
   401 				{
   402 				sequenceLength = 3;
   403 				}
   404 			else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
   405 				{
   406 				// 0xf5-0xff, are not valid bytes
   407 				sequenceLength = 4;
   408 				}
   409 			else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
   410 				{
   411 				if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
   412 					{
   413 					// either we've split the 0xc0 0x80 (i.e. 0xc0 is
   414 					// the last character in the string) or we've
   415 					// discovered a valid 0xc0 0x80 sequence.  
   416 					sequenceLength = 2;
   417 					}
   418 				}
   419 			
   420 			/* checking to see if we got a valid sequence */
   421 			if (sequenceLength == 1)
   422 				{
   423 				// bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example
   424 				currentUnicodeCharacter = replacementcharacter;
   425 				UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
   426 						aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
   427 				}
   428 			else
   429 				{
   430 				// this is a check to see if the sequence goes beyond the input 
   431 				// stream.  if its not the first and only character in the input
   432 				// stream this isn't an error, otherwise it is.
   433 				if ((pUtf8 + sequenceLength - 1) >  pLastUtf8)
   434 					{
   435 					// check to see if this sequence was the first character
   436 					if ((pUnicode - aUnicode.Ptr()) == 0)
   437 						{
   438 						return EErrorIllFormedInput;
   439 						}
   440 					break;
   441 					}			
   442 				
   443 				currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
   444 			
   445 				/* check the trailing bytes, they should begin with 10 */
   446 				TUint i = 1;
   447 
   448 				do
   449 					{
   450 					if ((pUtf8[i] & 0xc0) == 0x80)
   451 						{
   452 						// add the trailing 6 bits to the current unicode char
   453 						currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
   454 						}
   455 					else
   456 						{
   457 						// ill formed character (doesn't have a lead 10)
   458 						currentUnicodeCharacter = replacementcharacter;
   459 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
   460 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
   461 						illFormed=ETrue;
   462 						break; 
   463 						}
   464 					i++;
   465 					}
   466 				while (i < sequenceLength);
   467 				}
   468 				
   469 			/* conformance check.  bits of above table for reference.
   470 			 * +----------------------------------------------------------------+
   471 			 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
   472 			 * +--------------------+----------+----------+----------+----------+
   473 			 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, 2nd < 0xA0
   474 			 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, 2nd > 0x9F
   475 			 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, 2nd < 0x90
   476 			 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, 2nd > 0x8F
   477 			 * +--------------------+----------+----------+----------+----------+
   478 			 */
   479 			
   480 			if (currentUnicodeCharacter != replacementcharacter)
   481 				{
   482 				if (sequenceLength == 3)
   483 					{
   484 					if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
   485 						{
   486 						currentUnicodeCharacter = replacementcharacter;
   487 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
   488 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
   489 						illFormed=ETrue;
   490 						}
   491 					else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
   492 						{
   493 						currentUnicodeCharacter = replacementcharacter;
   494 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
   495 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
   496 						illFormed=ETrue;
   497 						}
   498 					}
   499 				else if (sequenceLength == 4)
   500 					{
   501 					if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
   502 						{
   503 						currentUnicodeCharacter = replacementcharacter;
   504 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
   505 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
   506 						illFormed=ETrue;
   507 						}
   508 					else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
   509 						{
   510 						currentUnicodeCharacter = replacementcharacter;
   511 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
   512 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
   513 						illFormed=ETrue;
   514 						}
   515 					}
   516 				
   517 				
   518 				/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
   519 				 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code 
   520 				 * points D800..DFFF is ill formed */
   521 				
   522 				if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
   523 					{
   524 					currentUnicodeCharacter = replacementcharacter;
   525 					UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
   526 							aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
   527 					illFormed=ETrue;
   528 					}	
   529 				}
   530 				// end conformance check
   531 			}
   532 
   533 		// would this character generate a surrogate pair in UTF-16?
   534 		if (currentUnicodeCharacter > 0xFFFF)
   535 			{
   536 			// is there enough space to hold a surrogate pair in the output?
   537 			if (pUnicode >= pLastUnicode)
   538 				{
   539 				break; // no, end processing.
   540 				}
   541 			
   542 			TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
   543 			*pUnicode++ = STATIC_CAST(TUint16, surrogate);
   544 					
   545 			surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
   546 			*pUnicode++ = STATIC_CAST(TUint16, surrogate);			
   547 			}
   548 		else
   549 			{
   550 			*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
   551 			}
   552 		
   553 		// move the input pointer
   554 		if (currentUnicodeCharacter != replacementcharacter)
   555 			{
   556 			pUtf8 += sequenceLength;
   557 			}
   558 		else if(illFormed == EFalse)
   559 			{
   560 			pUtf8 += (sequenceLength);
   561 			}
   562 		else
   563 			{
   564 			// we had a character we didn't recognize (i.e. it was invalid)
   565 			// so move to the next character in the input
   566 			pUtf8++;
   567 			}
   568 		
   569 		if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
   570 			{ 
   571 			break;  // we've either reached the end of the input or the end of output
   572 			}
   573 		}
   574 
   575 	aUnicode.SetLength(pUnicode - aUnicode.Ptr());
   576 	return (pLastUtf8 - pUtf8 + 1);
   577 	}
   578 
   579 /** Given a sample text this function attempts to determine whether or not
   580  *  the same text is encoded using the UTF-8 standard encoding scheme.
   581 
   582 @param TInt a confidence level, given at certain value.  if the given sample
   583 			is UTF-8 this value will not be changed (unless > 100) then its
   584 			set to 100.  Otherwise if the same isn't UTF-8, its set to 0.
   585 @param TDesC8 sample text.
   586 UTF-8. The default is EFalse.
   587 @return void
   588  */
   589 
   590 /* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
   591  * Well formed UTF-8 Byte Sequences, full table.
   592  * +----------------------------------------------------------------+
   593  * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
   594  * +--------------------+----------+----------+----------+----------+
   595  * | U+0000..U+007F     | 00..7D   |          |          |          |  1 byte, ascii
   596  * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
   597  * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
   598  * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
   599  * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
   600  * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
   601  * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
   602  * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
   603  * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
   604  * +--------------------+----------+----------+----------+----------+
   605  * 
   606  * As a consequence of the well-formedness conditions specified in table 3-7,
   607  * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
   608  * 
   609  * Code Rules:
   610  *   R1: If the string contains any non-UTF-8 characters the returned confidence
   611  *       is 0.  Valid UTF-8 combinations are listed in the above table.
   612  *   R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in  
   613  *       the (see ) the returned confidence is 95.
   614  *   R3: Otherwise the confidence returned is based upon the sample string 
   615  *       length.
   616  *   R4: If the sample string is under 75 characters, the confidence is set to 
   617  *       75.
   618  */
   619 void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
   620 	{
   621 
   622 	TInt sampleLength = aSample.Length();
   623 	
   624 	if (sampleLength == 0)
   625 		{
   626 		aConfidenceLevel = 89;
   627 		return;
   628 		}
   629 	TInt bytesRemaining  = 0;
   630 	TUint sequenceLength  = 0;
   631 	
   632 	aConfidenceLevel = sampleLength;
   633 
   634 	const TUint8* buffer = &aSample[0];
   635 
   636 	if (sampleLength < 95)
   637 		{
   638 		// check for the BOM
   639 		if ((sampleLength >= 3) && 
   640 			((buffer[0] == 0xEF) &&
   641 			 (buffer[1] == 0xBB) &&
   642 			 (buffer[2] == 0xBF)) 
   643 			) 
   644 			{
   645 			aConfidenceLevel = 95;
   646 			}
   647 		else if (sampleLength < 75)
   648 			{
   649 			aConfidenceLevel = 75;
   650 			}
   651 		}
   652 	
   653 	for (TInt index = 0;index != sampleLength;index++)
   654 		{
   655 		
   656 		if (bytesRemaining > 0)
   657 			{
   658 			// bytesRemaining > 0, means that a byte representing the start of a 
   659 			// multibyte sequence was encountered and the bytesRemaining is the 
   660 			// number of bytes to follow. 
   661 			
   662 			if ((buffer[index] & 0xc0) == 0x80) 
   663 				{
   664 				// need to check for ill-formed sequences -- all are in the 2nd byte
   665 				
   666 				if ((sequenceLength == 3) && (bytesRemaining == 2))
   667 					{
   668 					if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
   669 						{
   670 						aConfidenceLevel = 0;
   671 						break;
   672 						}
   673 					else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
   674 						{
   675 						aConfidenceLevel = 0;
   676 						break;
   677 						}
   678 					}
   679 				else if ((sequenceLength == 4) && (bytesRemaining == 3))
   680 					{
   681 					if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
   682 						{
   683 						aConfidenceLevel = 0;
   684 						break;
   685 						}
   686 					else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
   687 						{
   688 						aConfidenceLevel = 0;
   689 						break;
   690 						}
   691 					}
   692 				
   693 				--bytesRemaining;
   694 				continue;
   695 				}
   696 			else
   697 				{
   698 				aConfidenceLevel = 0;
   699 				break;
   700 				}
   701 			}
   702 		
   703 		if (bytesRemaining == 0)
   704 			{
   705 			if (buffer[index] < 0x80)
   706 				{
   707 				// The value of aSample[index] is in the range 0x00-0x7f
   708 				//UTF8 maintains ASCII transparency. So it's a valid
   709 				//UTF8. Do nothing, check next value.
   710 				continue;
   711 				}
   712 			else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
   713 				{
   714 				// valid start of a 2 byte sequence (see conformance note)
   715 				sequenceLength = 2;
   716 				bytesRemaining = 1;
   717 				}
   718 			else if ((buffer[index] & 0xf0) == 0xe0)
   719 				{
   720 				// valid start of a 3 byte sequence
   721 				sequenceLength = 3;
   722 				bytesRemaining = 2;
   723 				}
   724 			else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
   725 				{
   726 				// valid start of a 4 byte sequence (see conformance note)
   727 				sequenceLength = 4;
   728 				bytesRemaining = 3;
   729 				}	
   730 			else
   731 				{
   732 				// wasn't anything expected so must be an illegal/irregular UTF8 coded value
   733 				aConfidenceLevel = 0;
   734 				break;
   735 				}
   736 			}
   737 		} // for 
   738 	
   739 	aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
   740 	}
   741 
   742 // End of file