os/security/securityanddataprivacytools/securitytools/certapp/store--/utf.cpp
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/security/securityanddataprivacytools/securitytools/certapp/store--/utf.cpp	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,742 @@
     1.4 +/*
     1.5 +* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.6 +* All rights reserved.
     1.7 +* This component and the accompanying materials are made available
     1.8 +* under the terms of the License "Eclipse Public License v1.0"
     1.9 +* which accompanies this distribution, and is available
    1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.11 +*
    1.12 +* Initial Contributors:
    1.13 +* Nokia Corporation - initial contribution.
    1.14 +*
    1.15 +* Contributors:
    1.16 +*
    1.17 +* Description: 
    1.18 +*
    1.19 +*/
    1.20 +
    1.21 +
    1.22 +#include <e32std.h>
    1.23 +#include <e32base.h>
    1.24 +#include <utf.h>
    1.25 +
// Thin macro spellings of the built-in C++ cast operators.
#define STATIC_CAST(t,v) static_cast<t>(v)
#define CONST_CAST(t,v) const_cast<t>(v)
// An unconditional loop; the body has to leave it with break or return.
#define FOREVER for(;;)

// NOTE(review): presumably the sentinel for "this byte is not a base64 digit";
// nothing in the visible part of this file refers to it - confirm before removing.
const TUint KNotInBase64Alphabet=KMaxTUint;
    1.31 +
// Reason codes handed to Panic() below.
// Only the first enumerator has an explicit value (1); every other value is
// implied by its position, so inserting, removing or reordering entries
// changes the numeric panic codes that follow.
enum TPanic
	{
	EPanicBad6BitNumber=1,
	// UTF-7 pointer checks
	EPanicBadUtf7Pointers1,
	EPanicBadUtf7Pointers2,
	EPanicBadUtf7Pointers3,
	EPanicBadUtf7Pointers4,
	EPanicBadUtf7Pointers5,
	EPanicBadUtf7Pointers6,
	EPanicBadUtf7Pointers7,
	EPanicBadUtf7Pointers8,
	EPanicBadUtf7Pointers9,
	EPanicBadUtf7Pointers10,
	EPanicBadUtf7Pointers11,
	EPanicNotInBase64Block,
	// Unicode (UTF-16) pointer checks
	EPanicBadUnicodePointers1,
	EPanicBadUnicodePointers2,
	EPanicBadUnicodePointers3,
	EPanicBadUnicodePointers4,
	EPanicBadUnicodePointers5,
	EPanicBadUnicodePointers6,
	EPanicBadUnicodePointers7,
	EPanicBadUnicodePointers8,
	EPanicBadUnicodePointers9,
	EPanicBadUnicodePointers10,
	// bit-buffer state checks
	EPanicBadBitBufferState1,
	EPanicBadBitBufferState2,
	EPanicBadBitBufferState3,
	EPanicBadBitBufferState4,
	EPanicBadBitBufferState5,
	EPanicBadBitBufferState6,
	EPanicBadBitBufferState7,
	EPanicBadBitBufferState8,
	EPanicBadBitBufferState9,
	EPanicBadBitBufferState10,
	EPanicBadBitBufferState11,
	EPanicBadBitBufferState12,
	EPanicBadBitBufferState13,
	EPanicBadBitBufferState14,
	EPanicBadBitBufferState15,
	EPanicBadBitBufferState16,
	EPanicBadBitBufferState17,
	EPanicUnexpectedNumberOfLoopIterations,
	EPanicInitialEscapeCharacterButNoBase64,
	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
	// UTF-8 pointer checks
	EPanicBadUtf8Pointers1,
	EPanicBadUtf8Pointers2,
	EPanicBadUtf8Pointers3,
	EPanicBadUtf8Pointers4,
	EPanicBadUtf8Pointers5,
	EPanicBadUtf8Pointers6,
	EPanicBadUtf8Pointers7,
	EPanicOutOfSyncUtf7Byte1,
	EPanicOutOfSyncUtf7Byte2,
	EPanicOutOfSyncBase64Decoding
	};
    1.88 +
// Category text passed to User::Panic() for every panic raised from this file.
_LIT(KLitPanicText, "CHARCONV-UTF");

/** Raises a panic in category KLitPanicText.

@param aPanic The reason code; passed to User::Panic() as the panic number. */
LOCAL_C void Panic(TPanic aPanic)
	{
	User::Panic(KLitPanicText, aPanic);
	}
    1.95 +
    1.96 +inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';}
    1.97 +
    1.98 +inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
    1.99 +	{
   1.100 +	return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0;
   1.101 +	}
   1.102 +
   1.103 +
   1.104 +
   1.105 +
   1.106 +
   1.107 +
   1.108 + 
   1.109 +
/** Converts Unicode text into UTF-8 encoding.

Convenience overload: forwards to the three-argument overload with the
Java-conformant flag set to EFalse.

@param aUtf8 On return, contains the UTF-8 encoded output string.
@param aUnicode The Unicode-encoded input string.
@return The number of unconverted characters left at the end of the input 
descriptor, or one of the error values defined in TError. */
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
	{
	return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
	}
   1.120 +
   1.121 +
   1.122 +
   1.123 +/** Converts Unicode text into UTF-8 encoding. 
   1.124 +
   1.125 +Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.
   1.126 +
   1.127 +The variant of UTF-8 used internally by Java differs slightly from standard 
   1.128 +UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
   1.129 +
   1.130 +@param aUtf8 On return, contains the UTF-8 encoded output string.
   1.131 +@param aUnicode A UCS-2 encoded input string.
   1.132 +@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
   1.133 +UTF-8. The default is EFalse.
   1.134 +@return The number of unconverted characters left at the end of the input descriptor, 
   1.135 +or one of the error values defined in TError. */
   1.136 +TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, 
   1.137 +											   const TDesC16& aUnicode, 
   1.138 +											   TBool aGenerateJavaConformantUtf8)
   1.139 +	{
   1.140 +	if (aUnicode.Length() == 0)
   1.141 +		{
   1.142 +		aUtf8.SetLength(0);
   1.143 +		return 0;
   1.144 +		}
   1.145 +	if (aUtf8.MaxLength() == 0)
   1.146 +		{
   1.147 +		return aUnicode.Length();
   1.148 +		}
   1.149 +	
   1.150 +	TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
   1.151 +	const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
   1.152 +	TBool inputIsTruncated = EFalse;
   1.153 +	const TUint16* pUnicode = aUnicode.Ptr();
   1.154 +	const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
   1.155 +	
   1.156 +	FOREVER
   1.157 +		{
   1.158 +		__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
   1.159 +		__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
   1.160 +	
   1.161 +		if (pUnicode[0] < 0x80)
   1.162 +			{
   1.163 +			// ascii - 1 byte
   1.164 +			
   1.165 +			// internally java is different since the \x0000 character is 
   1.166 +			// translated into \xC0 \x80.
   1.167 +			
   1.168 +			if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
   1.169 +				{
   1.170 +				if (pUtf8 == pointerToLastUtf8Byte)
   1.171 +					{
   1.172 +					pUtf8--;
   1.173 +					pUnicode--;
   1.174 +					break;			
   1.175 +					}
   1.176 +				*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
   1.177 +				*pUtf8   = STATIC_CAST(TUint8, 0x80);	
   1.178 +				}
   1.179 +			else
   1.180 +				{
   1.181 +				*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
   1.182 +				}
   1.183 +			}
   1.184 +		else if (pUnicode[0] < 0x800)
   1.185 +			{
   1.186 +			// U+0080..U+07FF - 2 bytes
   1.187 +			
   1.188 +			if (pUtf8 == pointerToLastUtf8Byte)
   1.189 +				{
   1.190 +				pUtf8--;
   1.191 +				pUnicode--;
   1.192 +				break;
   1.193 +				}
   1.194 +			
   1.195 +			*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
   1.196 +			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
   1.197 +			
   1.198 +			}
   1.199 +
   1.200 +		// check to see if we have a surrogate in the stream, surrogates encode code points outside
   1.201 +		// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.
   1.202 +
   1.203 +		else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
   1.204 +			{
   1.205 +			// surrogate pair - 4 bytes in utf-8
   1.206 +			// U+10000..U+10FFFF
   1.207 +			
   1.208 +			__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
   1.209 +			// is there enough space to hold the character
   1.210 +			if ((pointerToLastUtf8Byte - pUtf8) < 3)
   1.211 +				{
   1.212 +				pUtf8--;
   1.213 +				pUnicode--;
   1.214 +				break;  // no go to the exit condition
   1.215 +				}
   1.216 +			
   1.217 +			__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
   1.218 +			if (pUnicode >= pointerToLastUnicodeCharacter)
   1.219 +				{
   1.220 +				pUtf8--;
   1.221 +				pUnicode--;
   1.222 +				inputIsTruncated = ETrue;
   1.223 +				break; // middle of a surrogate pair. go to end condition
   1.224 +				}
   1.225 +			
   1.226 +			if ((pUnicode[1] & 0xfc00) != 0xdc00)
   1.227 +				{
   1.228 +				return EErrorIllFormedInput;
   1.229 +				}
   1.230 +			
   1.231 +			// convert utf-16 surrogate to utf-32
   1.232 +			TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
   1.233 +			
   1.234 +			// convert utf-32 to utf-8
   1.235 +            *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));   
   1.236 +            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
   1.237 +            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
   1.238 +            *pUtf8   = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
   1.239 +			
   1.240 +            // we consumed 2 utf-16 values, move this pointer
   1.241 +			pUnicode++;
   1.242 +			}		
   1.243 +		else
   1.244 +			{
   1.245 +			// 3 byte - utf-8, U+800..U+FFFF rest of BMP.
   1.246 +			
   1.247 +			if (pointerToLastUtf8Byte - pUtf8 < 2)
   1.248 +				{
   1.249 +				pUtf8--;
   1.250 +				pUnicode--;
   1.251 +				break;
   1.252 +				}
   1.253 +			*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
   1.254 +			*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
   1.255 +			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
   1.256 +			}
   1.257 +		
   1.258 +		if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
   1.259 +			{
   1.260 +			break;
   1.261 +			}
   1.262 +		
   1.263 +		pUtf8++;
   1.264 +		pUnicode++;
   1.265 +		
   1.266 +		}
   1.267 +	
   1.268 +	if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
   1.269 +		{
   1.270 +		return EErrorIllFormedInput;
   1.271 +		}
   1.272 +	
   1.273 +	aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
   1.274 +	return pointerToLastUnicodeCharacter-pUnicode;
   1.275 +	}
   1.276 +
   1.277 +
   1.278 +
   1.279 +
   1.280 +
   1.281 + 
   1.282 +
   1.283 +
   1.284 +
   1.285 +
   1.286 +
/** Converts text encoded using the Unicode transformation format UTF-8 into the 
Unicode UCS-2 character set.

Convenience overload: forwards to the three-argument overload with the
Java-conformant flag set to EFalse.

@param aUnicode On return, contains the Unicode encoded output string.
@param aUtf8 The UTF-8 encoded input string
@return The number of unconverted bytes left at the end of the input descriptor, 
or one of the error values defined in TError. */
EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
	{
	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
	}
   1.298 +
   1.299 +static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
   1.300 +		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
   1.301 +	{
   1.302 +	if (aNumberOfUnconvertibleCharacters<=0)
   1.303 +		{
   1.304 +		aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
   1.305 +		}
   1.306 +	++aNumberOfUnconvertibleCharacters;
   1.307 +	}
   1.308 +
   1.309 +/** Converts text encoded using the Unicode transformation format UTF-8 into the 
   1.310 +Unicode UCS-2 character set.
   1.311 +
   1.312 +@param aUnicode On return, contains the Unicode encoded output string.
   1.313 +@param aUtf8 The UTF-8 encoded input string
   1.314 +@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
   1.315 +@return The number of unconverted bytes left at the end of the input descriptor, 
   1.316 +or one of the error values defined in TError. */
   1.317 +TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
   1.318 +	{
   1.319 +	TInt dummyUnconverted, dummyUnconvertedIndex;
   1.320 +	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
   1.321 +	}
   1.322 +
/** Converts text encoded using the Unicode transformation format UTF-8 into the 
Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.

The variant of UTF-8 used internally by Java differs slightly from standard 
UTF-8. The TBool argument controls the UTF-8 variant accepted by this function:
when it is set, the two byte sequence 0xC0 0x80 is accepted and decoded as U+0000.

Every ill-formed sequence is replaced in the output by U+FFFD and reported
through the two "unconvertible" parameters.

@param aUnicode On return, contains the Unicode encoded output string.
@param aUtf8 The UTF-8 encoded input string
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
UTF-8. The default is EFalse.
@param aNumberOfUnconvertibleCharacters Incremented once for every ill-formed 
sequence that was replaced by U+FFFD. This function never resets it, so the 
caller has to initialise it (normally to zero) before the call.
@param aIndexOfFirstByteOfFirstUnconvertibleCharacter Receives the index 
of the first byte of the first unconvertible character. For instance if the 
first character in the input descriptor (aUtf8) could not be converted, 
then this parameter is set to the first byte of that character, i.e. zero. 
It is only written when an unconvertible character is found while the count
above is still zero or negative; to be able to detect "all characters were 
converted" the caller has to initialise it to a negative value.
@return The number of unconverted bytes left at the end of the input descriptor, 
or one of the error values defined in TError. */

/* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
 * Well formed UTF-8 Byte Sequences, full table.
 * +----------------------------------------------------------------+
 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
 * +--------------------+----------+----------+----------+----------+
 * | U+0000..U+007F     | 00..7F   |          |          |          |  1 byte, ascii
 * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
 * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
 * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
 * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
 * +--------------------+----------+----------+----------+----------+
 * 
 * As a consequence of the well-formedness conditions specified in table 3-7,
 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
 */
TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
		TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
	{	
	aUnicode.SetLength(0);
	
	// nothing to read or nowhere to write: everything is left unconverted
	if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
		{
		return aUtf8.Length();
		}

	// pUnicode/pUtf8 are the write/read cursors; the pLast... pointers address
	// the last usable element of each buffer (not one past the end)
	TUint16*           pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
	const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
	const TUint8*         pUtf8 = aUtf8.Ptr();   
	const TUint8*     pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
	const TUint16 replacementcharacter = 0xFFFD;
	TUint currentUnicodeCharacter;
	TUint sequenceLength;

	
	FOREVER
		{
		// set when a multi-byte sequence was rejected after its lead byte had
		// been accepted; in that case only the lead byte is skipped (see the
		// end of the loop)
		TBool illFormed=EFalse;
		
		__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
		__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
		
		// stays 1 for ascii and for an invalid lead byte
		sequenceLength = 1;
		
		// ascii - optimisation (i.e. it isn't a sequence)
		if (pUtf8[0] < 0x80)
			{
			currentUnicodeCharacter = pUtf8[0];
			}
		else
			{
			// see if well formed utf-8, use table above for reference	
			if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
				{
				// 0xc0-0xc1 are not valid bytes
				sequenceLength = 2;
				}
			else if ((pUtf8[0] & 0xf0) == 0xe0)
				{
				sequenceLength = 3;
				}
			else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
				{
				// 0xf5-0xff, are not valid bytes
				sequenceLength = 4;
				}
			else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
				{
				if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
					{
					// either we've split the 0xc0 0x80 (i.e. 0xc0 is
					// the last character in the string) or we've
					// discovered a valid 0xc0 0x80 sequence.  
					sequenceLength = 2;
					}
				}
			
			/* checking to see if we got a valid sequence */
			if (sequenceLength == 1)
				{
				// bad value in the leading byte, 0xc0-0xc1,0xf5-0xff for example
				// (a stray continuation byte 0x80-0xbf also ends up here)
				currentUnicodeCharacter = replacementcharacter;
				UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
						aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
				}
			else
				{
				// this is a check to see if the sequence goes beyond the input 
				// stream.  if its not the first and only character in the input
				// stream this isn't an error, otherwise it is.
				if ((pUtf8 + sequenceLength - 1) >  pLastUtf8)
					{
					// check to see if this sequence was the first character
					if ((pUnicode - aUnicode.Ptr()) == 0)
						{
						return EErrorIllFormedInput;
						}
					// otherwise stop here; the incomplete sequence is
					// reported through the return value as unconverted bytes
					break;
					}			
				
				// payload bits of the lead byte: 5, 4 or 3 bits for a
				// sequence length of 2, 3 or 4
				currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
			
				/* check the trailing bytes, they should begin with 10 */
				TUint i = 1;

				do
					{
					if ((pUtf8[i] & 0xc0) == 0x80)
						{
						// add the trailing 6 bits to the current unicode char
						currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
						}
					else
						{
						// ill formed character (doesn't have a lead 10)
						currentUnicodeCharacter = replacementcharacter;
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
						illFormed=ETrue;
						break; 
						}
					i++;
					}
				while (i < sequenceLength);
				}
				
			/* conformance check.  bits of above table for reference.
			 * +----------------------------------------------------------------+
			 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
			 * +--------------------+----------+----------+----------+----------+
			 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, 2nd < 0xA0
			 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, 2nd > 0x9F
			 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, 2nd < 0x90
			 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, 2nd > 0x8F
			 * +--------------------+----------+----------+----------+----------+
			 */
			
			// skipped when the character already is U+FFFD - either because it
			// was rejected above or because the input really encodes U+FFFD
			if (currentUnicodeCharacter != replacementcharacter)
				{
				if (sequenceLength == 3)
					{
					if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
						{
						currentUnicodeCharacter = replacementcharacter;
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
						illFormed=ETrue;
						}
					else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
						{
						currentUnicodeCharacter = replacementcharacter;
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
						illFormed=ETrue;
						}
					}
				else if (sequenceLength == 4)
					{
					if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
						{
						currentUnicodeCharacter = replacementcharacter;
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
						illFormed=ETrue;
						}
					else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
						{
						currentUnicodeCharacter = replacementcharacter;
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
						illFormed=ETrue;
						}
					}
				
				
				/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
				 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code 
				 * points D800..DFFF is ill formed */
				
				if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
					{
					currentUnicodeCharacter = replacementcharacter;
					UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
							aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
					illFormed=ETrue;
					}	
				}
				// end conformance check
			}

		// would this character generate a surrogate pair in UTF-16?
		if (currentUnicodeCharacter > 0xFFFF)
			{
			// is there enough space to hold a surrogate pair in the output?
			if (pUnicode >= pLastUnicode)
				{
				break; // no, end processing.
				}
			
			// 0xD7C0 == 0xD800 - (0x10000 >> 10): subtracts the 0x10000 offset
			// and adds the high surrogate base in one step
			TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
			*pUnicode++ = STATIC_CAST(TUint16, surrogate);
					
			surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
			*pUnicode++ = STATIC_CAST(TUint16, surrogate);			
			}
		else
			{
			*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
			}
		
		// move the input pointer
		if (currentUnicodeCharacter != replacementcharacter)
			{
			pUtf8 += sequenceLength;
			}
		else if(illFormed == EFalse)
			{
			// U+FFFD without illFormed: either a genuine encoded U+FFFD or an
			// invalid lead byte (sequenceLength is 1 in that case)
			pUtf8 += (sequenceLength);
			}
		else
			{
			// we had a character we didn't recognize (i.e. it was invalid)
			// so move to the next character in the input
			pUtf8++;
			}
		
		if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
			{ 
			break;  // we've either reached the end of the input or the end of output
			}
		}

	aUnicode.SetLength(pUnicode - aUnicode.Ptr());
	// pUtf8 points at the first byte that was not consumed
	return (pLastUtf8 - pUtf8 + 1);
	}
   1.581 +
   1.582 +/** Given a sample text this function attempts to determine whether or not
   1.583 + *  the same text is encoded using the UTF-8 standard encoding scheme.
   1.584 +
   1.585 +@param TInt a confidence level, given at certain value.  if the given sample
   1.586 +			is UTF-8 this value will not be changed (unless > 100) then its
   1.587 +			set to 100.  Otherwise if the same isn't UTF-8, its set to 0.
   1.588 +@param TDesC8 sample text.
   1.589 +UTF-8. The default is EFalse.
   1.590 +@return void
   1.591 + */
   1.592 +
   1.593 +/* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
   1.594 + * Well formed UTF-8 Byte Sequences, full table.
   1.595 + * +----------------------------------------------------------------+
   1.596 + * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
   1.597 + * +--------------------+----------+----------+----------+----------+
   1.598 + * | U+0000..U+007F     | 00..7D   |          |          |          |  1 byte, ascii
   1.599 + * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
   1.600 + * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
   1.601 + * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
   1.602 + * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
   1.603 + * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
   1.604 + * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
   1.605 + * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
   1.606 + * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
   1.607 + * +--------------------+----------+----------+----------+----------+
   1.608 + * 
   1.609 + * As a consequence of the well-formedness conditions specified in table 3-7,
   1.610 + * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
   1.611 + * 
   1.612 + * Code Rules:
   1.613 + *   R1: If the string contains any non-UTF-8 characters the returned confidence
   1.614 + *       is 0.  Valid UTF-8 combinations are listed in the above table.
   1.615 + *   R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in  
   1.616 + *       the (see ) the returned confidence is 95.
   1.617 + *   R3: Otherwise the confidence returned is based upon the sample string 
   1.618 + *       length.
   1.619 + *   R4: If the sample string is under 75 characters, the confidence is set to 
   1.620 + *       75.
   1.621 + */
   1.622 +void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
   1.623 +	{
   1.624 +
   1.625 +	TInt sampleLength = aSample.Length();
   1.626 +	
   1.627 +	if (sampleLength == 0)
   1.628 +		{
   1.629 +		aConfidenceLevel = 89;
   1.630 +		return;
   1.631 +		}
   1.632 +	TInt bytesRemaining  = 0;
   1.633 +	TUint sequenceLength  = 0;
   1.634 +	
   1.635 +	aConfidenceLevel = sampleLength;
   1.636 +
   1.637 +	const TUint8* buffer = &aSample[0];
   1.638 +
   1.639 +	if (sampleLength < 95)
   1.640 +		{
   1.641 +		// check for the BOM
   1.642 +		if ((sampleLength >= 3) && 
   1.643 +			((buffer[0] == 0xEF) &&
   1.644 +			 (buffer[1] == 0xBB) &&
   1.645 +			 (buffer[2] == 0xBF)) 
   1.646 +			) 
   1.647 +			{
   1.648 +			aConfidenceLevel = 95;
   1.649 +			}
   1.650 +		else if (sampleLength < 75)
   1.651 +			{
   1.652 +			aConfidenceLevel = 75;
   1.653 +			}
   1.654 +		}
   1.655 +	
   1.656 +	for (TInt index = 0;index != sampleLength;index++)
   1.657 +		{
   1.658 +		
   1.659 +		if (bytesRemaining > 0)
   1.660 +			{
   1.661 +			// bytesRemaining > 0, means that a byte representing the start of a 
   1.662 +			// multibyte sequence was encountered and the bytesRemaining is the 
   1.663 +			// number of bytes to follow. 
   1.664 +			
   1.665 +			if ((buffer[index] & 0xc0) == 0x80) 
   1.666 +				{
   1.667 +				// need to check for ill-formed sequences -- all are in the 2nd byte
   1.668 +				
   1.669 +				if ((sequenceLength == 3) && (bytesRemaining == 2))
   1.670 +					{
   1.671 +					if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
   1.672 +						{
   1.673 +						aConfidenceLevel = 0;
   1.674 +						break;
   1.675 +						}
   1.676 +					else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
   1.677 +						{
   1.678 +						aConfidenceLevel = 0;
   1.679 +						break;
   1.680 +						}
   1.681 +					}
   1.682 +				else if ((sequenceLength == 4) && (bytesRemaining == 3))
   1.683 +					{
   1.684 +					if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
   1.685 +						{
   1.686 +						aConfidenceLevel = 0;
   1.687 +						break;
   1.688 +						}
   1.689 +					else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
   1.690 +						{
   1.691 +						aConfidenceLevel = 0;
   1.692 +						break;
   1.693 +						}
   1.694 +					}
   1.695 +				
   1.696 +				--bytesRemaining;
   1.697 +				continue;
   1.698 +				}
   1.699 +			else
   1.700 +				{
   1.701 +				aConfidenceLevel = 0;
   1.702 +				break;
   1.703 +				}
   1.704 +			}
   1.705 +		
   1.706 +		if (bytesRemaining == 0)
   1.707 +			{
   1.708 +			if (buffer[index] < 0x80)
   1.709 +				{
   1.710 +				// The value of aSample[index] is in the range 0x00-0x7f
   1.711 +				//UTF8 maintains ASCII transparency. So it's a valid
   1.712 +				//UTF8. Do nothing, check next value.
   1.713 +				continue;
   1.714 +				}
   1.715 +			else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
   1.716 +				{
   1.717 +				// valid start of a 2 byte sequence (see conformance note)
   1.718 +				sequenceLength = 2;
   1.719 +				bytesRemaining = 1;
   1.720 +				}
   1.721 +			else if ((buffer[index] & 0xf0) == 0xe0)
   1.722 +				{
   1.723 +				// valid start of a 3 byte sequence
   1.724 +				sequenceLength = 3;
   1.725 +				bytesRemaining = 2;
   1.726 +				}
   1.727 +			else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
   1.728 +				{
   1.729 +				// valid start of a 4 byte sequence (see conformance note)
   1.730 +				sequenceLength = 4;
   1.731 +				bytesRemaining = 3;
   1.732 +				}	
   1.733 +			else
   1.734 +				{
   1.735 +				// wasn't anything expected so must be an illegal/irregular UTF8 coded value
   1.736 +				aConfidenceLevel = 0;
   1.737 +				break;
   1.738 +				}
   1.739 +			}
   1.740 +		} // for 
   1.741 +	
   1.742 +	aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
   1.743 +	}
   1.744 +
   1.745 +// End of file