1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/security/securityanddataprivacytools/securitytools/certapp/store--/utf.cpp Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,742 @@
1.4 +/*
1.5 +* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
1.6 +* All rights reserved.
1.7 +* This component and the accompanying materials are made available
1.8 +* under the terms of the License "Eclipse Public License v1.0"
1.9 +* which accompanies this distribution, and is available
1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
1.11 +*
1.12 +* Initial Contributors:
1.13 +* Nokia Corporation - initial contribution.
1.14 +*
1.15 +* Contributors:
1.16 +*
1.17 +* Description:
1.18 +*
1.19 +*/
1.20 +
1.21 +
1.22 +#include <e32std.h>
1.23 +#include <e32base.h>
1.24 +#include <utf.h>
1.25 +
1.26 +#define STATIC_CAST(t,v) static_cast<t>(v) // shorthand: static_cast of v to type t
1.27 +#define CONST_CAST(t,v) const_cast<t>(v) // shorthand: const_cast of v to type t
1.28 +#define FOREVER for(;;) // endless loop; every use below leaves via break or return
1.29 +
1.30 +const TUint KNotInBase64Alphabet=KMaxTUint; // not referenced in the visible part of this file
1.31 +
1.32 +enum TPanic // reason codes handed to Panic() below; values run consecutively from 1
1.33 + {
1.34 + EPanicBad6BitNumber=1,
1.35 + EPanicBadUtf7Pointers1,
1.36 + EPanicBadUtf7Pointers2,
1.37 + EPanicBadUtf7Pointers3,
1.38 + EPanicBadUtf7Pointers4,
1.39 + EPanicBadUtf7Pointers5,
1.40 + EPanicBadUtf7Pointers6,
1.41 + EPanicBadUtf7Pointers7,
1.42 + EPanicBadUtf7Pointers8,
1.43 + EPanicBadUtf7Pointers9,
1.44 + EPanicBadUtf7Pointers10,
1.45 + EPanicBadUtf7Pointers11,
1.46 + EPanicNotInBase64Block,
1.47 + EPanicBadUnicodePointers1,
1.48 + EPanicBadUnicodePointers2,
1.49 + EPanicBadUnicodePointers3, // used by ConvertFromUnicodeToUtf8 (input cursor check)
1.50 + EPanicBadUnicodePointers4, // used by ConvertFromUnicodeToUtf8 (surrogate branch)
1.51 + EPanicBadUnicodePointers5,
1.52 + EPanicBadUnicodePointers6,
1.53 + EPanicBadUnicodePointers7,
1.54 + EPanicBadUnicodePointers8, // used by ConvertToUnicodeFromUtf8 (output cursor check)
1.55 + EPanicBadUnicodePointers9,
1.56 + EPanicBadUnicodePointers10,
1.57 + EPanicBadBitBufferState1,
1.58 + EPanicBadBitBufferState2,
1.59 + EPanicBadBitBufferState3,
1.60 + EPanicBadBitBufferState4,
1.61 + EPanicBadBitBufferState5,
1.62 + EPanicBadBitBufferState6,
1.63 + EPanicBadBitBufferState7,
1.64 + EPanicBadBitBufferState8,
1.65 + EPanicBadBitBufferState9,
1.66 + EPanicBadBitBufferState10,
1.67 + EPanicBadBitBufferState11,
1.68 + EPanicBadBitBufferState12,
1.69 + EPanicBadBitBufferState13,
1.70 + EPanicBadBitBufferState14,
1.71 + EPanicBadBitBufferState15,
1.72 + EPanicBadBitBufferState16,
1.73 + EPanicBadBitBufferState17,
1.74 + EPanicUnexpectedNumberOfLoopIterations,
1.75 + EPanicInitialEscapeCharacterButNoBase64,
1.76 + EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
1.77 + EPanicBadUtf8Pointers1, // used by ConvertFromUnicodeToUtf8 (output cursor check)
1.78 + EPanicBadUtf8Pointers2, // used by ConvertFromUnicodeToUtf8 (surrogate branch)
1.79 + EPanicBadUtf8Pointers3, // used by ConvertToUnicodeFromUtf8 (input cursor check)
1.80 + EPanicBadUtf8Pointers4,
1.81 + EPanicBadUtf8Pointers5,
1.82 + EPanicBadUtf8Pointers6,
1.83 + EPanicBadUtf8Pointers7,
1.84 + EPanicOutOfSyncUtf7Byte1,
1.85 + EPanicOutOfSyncUtf7Byte2,
1.86 + EPanicOutOfSyncBase64Decoding
1.87 + }; // the remaining codes are not referenced in the visible part of this file
1.88 +
1.89 +_LIT(KLitPanicText, "CHARCONV-UTF"); // category text passed to User::Panic by Panic()
1.90 +
1.91 +LOCAL_C void Panic(TPanic aPanic) // file-local helper used by the __ASSERT_DEBUG checks below
1.92 + {
1.93 + User::Panic(KLitPanicText, aPanic); // category "CHARCONV-UTF", reason = the TPanic value
1.94 + }
1.95 +
1.96 +inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';} // '&' when aIsImapUtf7 is true, otherwise '+'
1.97 +
1.98 +inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
1.99 + { // true if any of the lowest aNumberOfBitsInBuffer bits of aBitBuffer is set
1.100 + return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0; // (1<<n)-1 is a mask of the n low bits
1.101 + }
1.102 +
1.103 +
1.104 +
1.105 +
1.106 +
1.107 +
1.108 +
1.109 +
1.110 +/** Converts Unicode text into standard (non-Java) UTF-8 encoding.
1.111 +Forwards to the three-argument overload with aGenerateJavaConformantUtf8 set to EFalse.
1.112 +@param aUtf8 On return, contains the UTF-8 encoded output string.
1.113 +@param aUnicode The Unicode-encoded input string.
1.114 +@return The number of unconverted characters left at the end of the input
1.115 +descriptor, or one of the error values defined in TError. */
1.116 +EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
1.117 + {
1.118 + return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse); // EFalse selects orthodox UTF-8
1.119 + }
1.120 +
1.121 +
1.122 +
1.123 +/** Converts Unicode (UTF-16) text into UTF-8 encoding.
1.124 +
1.125 +In standard mode a high/low surrogate pair is combined into one 4 byte UTF-8 sequence.
1.126 +In Java mode U+0000 is written as 0xC0 0x80 and surrogates are not paired: each
1.127 +surrogate code unit is written as its own 3 byte sequence. Conversion stops at the
1.128 +first character that does not fit into the remaining space of aUtf8.
1.129 +
1.130 +@param aUtf8 On return, contains the UTF-8 encoded output string.
1.131 +@param aUnicode The UTF-16 encoded input string.
1.132 +@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
1.133 +UTF-8. The default is EFalse.
1.134 +@return The number of unconverted characters left at the end of the input descriptor,
1.135 +or EErrorIllFormedInput for an ill-formed surrogate sequence (standard mode only). */
1.136 +TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8,
1.137 + const TDesC16& aUnicode,
1.138 + TBool aGenerateJavaConformantUtf8)
1.139 + {
1.140 + if (aUnicode.Length() == 0)
1.141 + {
1.142 + aUtf8.SetLength(0); // nothing to convert: empty output, nothing left over
1.143 + return 0;
1.144 + }
1.145 + if (aUtf8.MaxLength() == 0)
1.146 + {
1.147 + return aUnicode.Length(); // no room at all: everything is unconverted (aUtf8 is left untouched)
1.148 + }
1.149 +
1.150 + TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr()); // write cursor; at the top of each pass it is the next free byte
1.151 + const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1); // last writable byte of the output
1.152 + TBool inputIsTruncated = EFalse; // set when the input ends in the middle of a surrogate pair
1.153 + const TUint16* pUnicode = aUnicode.Ptr(); // read cursor; at the top of each pass it is the next character to convert
1.154 + const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
1.155 +
1.156 + FOREVER
1.157 + {
1.158 + __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
1.159 + __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
1.160 +
1.161 + if (pUnicode[0] < 0x80)
1.162 + {
1.163 + // ascii - 1 byte
1.164 +
1.165 + // internally java is different since the \x0000 character is
1.166 + // translated into \xC0 \x80.
1.167 +
1.168 + if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
1.169 + {
1.170 + if (pUtf8 == pointerToLastUtf8Byte) // only one byte free but two are needed
1.171 + {
1.172 + pUtf8--; // step both cursors back to the last byte written /
1.173 + pUnicode--; // last character converted, as the exit code below expects
1.174 + break;
1.175 + }
1.176 + *pUtf8++ = STATIC_CAST(TUint8, 0xc0);
1.177 + *pUtf8 = STATIC_CAST(TUint8, 0x80); // cursor is left on the last byte written
1.178 + }
1.179 + else
1.180 + {
1.181 + *pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
1.182 + }
1.183 + }
1.184 + else if (pUnicode[0] < 0x800)
1.185 + {
1.186 + // U+0080..U+07FF - 2 bytes
1.187 +
1.188 + if (pUtf8 == pointerToLastUtf8Byte) // only one byte free but two are needed
1.189 + {
1.190 + pUtf8--;
1.191 + pUnicode--;
1.192 + break;
1.193 + }
1.194 +
1.195 + *pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
1.196 + *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
1.197 +
1.198 + }
1.199 +
1.200 + // check to see if we have a high surrogate in the stream; in standard mode a surrogate pair
1.201 + // encodes a code point outside the BMP as 4 utf-8 bytes, anything else here is 3 utf-8 bytes.
1.202 +
1.203 + else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
1.204 + {
1.205 + // surrogate pair - 4 bytes in utf-8
1.206 + // U+10000..U+10FFFF
1.207 +
1.208 + __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
1.209 + // is there enough space to hold the character (4 bytes, i.e. 3 beyond pUtf8)
1.210 + if ((pointerToLastUtf8Byte - pUtf8) < 3)
1.211 + {
1.212 + pUtf8--;
1.213 + pUnicode--;
1.214 + break; // no: go to the exit condition
1.215 + }
1.216 +
1.217 + __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
1.218 + if (pUnicode >= pointerToLastUnicodeCharacter) // high surrogate is the last input character
1.219 + {
1.220 + pUtf8--;
1.221 + pUnicode--;
1.222 + inputIsTruncated = ETrue;
1.223 + break; // middle of a surrogate pair. go to end condition
1.224 + }
1.225 +
1.226 + if ((pUnicode[1] & 0xfc00) != 0xdc00) // high surrogate not followed by a low surrogate
1.227 + {
1.228 + return EErrorIllFormedInput; // note: returns without updating aUtf8's length
1.229 + }
1.230 +
1.231 + // convert utf-16 surrogate to utf-32
1.232 + TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
1.233 +
1.234 + // convert utf-32 to utf-8
1.235 + *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));
1.236 + *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
1.237 + *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
1.238 + *pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
1.239 +
1.240 + // we consumed 2 utf-16 values, move this pointer
1.241 + pUnicode++;
1.242 + }
1.243 + else
1.244 + {
1.245 + // 3 byte - utf-8, U+800..U+FFFF rest of BMP (also reached by lone low surrogates and, in Java mode, by every surrogate).
1.246 +
1.247 + if (pointerToLastUtf8Byte - pUtf8 < 2) // fewer than three bytes free
1.248 + {
1.249 + pUtf8--;
1.250 + pUnicode--;
1.251 + break;
1.252 + }
1.253 + *pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
1.254 + *pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
1.255 + *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
1.256 + }
1.257 +
1.258 + if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte)) // input exhausted or output full
1.259 + {
1.260 + break;
1.261 + }
1.262 +
1.263 + pUtf8++; // advance to the next free output byte
1.264 + pUnicode++; // advance to the next input character
1.265 +
1.266 + }
1.267 +
1.268 + if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated) // the whole input was a single unpaired high surrogate
1.269 + {
1.270 + return EErrorIllFormedInput;
1.271 + }
1.272 +
1.273 + aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1); // pUtf8 is on the last byte written (one before the buffer if none)
1.274 + return pointerToLastUnicodeCharacter-pUnicode; // characters after the last one converted
1.275 + }
1.276 +
1.277 +
1.278 +
1.279 +
1.280 +
1.281 +
1.282 +
1.283 +
1.284 +
1.285 +
1.286 +
1.287 +/** Converts text encoded using the Unicode transformation format UTF-8 into the
1.288 +Unicode UCS-2 character set, treating the input as standard (non-Java) UTF-8.
1.289 +Forwards to the three-argument overload with aGenerateJavaConformantUtf8 set to EFalse.
1.290 +@param aUnicode On return, contains the Unicode encoded output string.
1.291 +@param aUtf8 The UTF-8 encoded input string
1.292 +@return The number of unconverted bytes left at the end of the input descriptor,
1.293 +or one of the error values defined in TError. */
1.294 +EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
1.295 + {
1.296 + return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse); // EFalse selects orthodox UTF-8
1.297 + }
1.298 +
1.299 +static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters, // running count, incremented on every call
1.300 + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TInt aIndex) // aIndex: byte offset of the offending sequence; TInt (not TUint8) so offsets above 255 are not truncated
1.301 + { // Records one unconvertible character found at byte offset aIndex.
1.302 + if (aNumberOfUnconvertibleCharacters<=0) // first one seen so far: remember where it starts
1.303 + {
1.304 + aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
1.305 + }
1.306 + ++aNumberOfUnconvertibleCharacters;
1.307 + }
1.308 +
1.309 +/** Converts text encoded using the Unicode transformation format UTF-8 into the
1.310 +Unicode UCS-2 character set. Forwards to the five-argument overload and discards the
1.311 +count and position of unconvertible characters that it reports.
1.312 +@param aUnicode On return, contains the Unicode encoded output string.
1.313 +@param aUtf8 The UTF-8 encoded input string
1.314 +@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java UTF-8.
1.315 +@return The number of unconverted bytes left at the end of the input descriptor,
1.316 +or one of the error values defined in TError. */
1.317 +TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
1.318 + {
1.319 + TInt dummyUnconverted = 0, dummyUnconvertedIndex = -1; // must be initialised: the callee reads the count before writing it
1.320 + return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
1.321 + }
1.322 +
1.323 +/** Converts text encoded using the Unicode transformation format UTF-8 into the
1.324 +Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
1.325 +
1.326 +The variant of UTF-8 used internally by Java differs slightly from standard
1.327 +UTF-8. The TBool argument controls the UTF-8 variant accepted by this function.
1.328 +
1.329 +@param aUnicode On return, contains the Unicode encoded output string.
1.330 +@param aUtf8 The UTF-8 encoded input string
1.331 +@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
1.332 +UTF-8 (the two byte sequence 0xC0 0x80 is then accepted and decoded as U+0000).
1.333 +@param aNumberOfUnconvertibleCharacters Incremented once for every ill-formed
1.334 +sequence that was replaced by U+FFFD in the output.
1.335 +@param aIndexOfFirstByteOfFirstUnconvertibleCharacter Set to the index
1.336 +of the first byte of the first unconvertible character. For instance if the
1.337 +first character in the input descriptor (aUtf8) could not be converted,
1.338 +then this parameter is set to the first byte of that character, i.e. zero.
1.339 +Neither output is reset here; they only change when an ill-formed sequence is found.
1.340 +@return The number of unconverted bytes left at the end of the input descriptor,
1.341 +or one of the error values defined in TError. */
1.342 +
1.343 +/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
1.344 + * Well formed UTF-8 Byte Sequences, full table.
1.345 + * +----------------------------------------------------------------+
1.346 + * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
1.347 + * +--------------------+----------+----------+----------+----------+
1.348 + * | U+0000..U+007F | 00..7F | | | | 1 byte, ascii
1.349 + * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2
1.350 + * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
1.351 + * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal
1.352 + * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
1.353 + * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal
1.354 + * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
1.355 + * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal
1.356 + * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
1.357 + * +--------------------+----------+----------+----------+----------+
1.358 + *
1.359 + * As a consequence of the well-formedness conditions specified in table 3-7,
1.360 + * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
1.361 + */
1.362 +TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
1.363 + TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
1.364 + {
1.365 + aUnicode.SetLength(0);
1.366 +
1.367 + if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
1.368 + {
1.369 + return aUtf8.Length(); // nothing to read or nowhere to write: all input bytes are unconverted
1.370 + }
1.371 +
1.372 + TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr()); // write cursor: next free output slot
1.373 + const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1); // last writable output slot
1.374 + const TUint8* pUtf8 = aUtf8.Ptr(); // read cursor: first byte of the next sequence
1.375 + const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1); // last input byte
1.376 + const TUint16 replacementcharacter = 0xFFFD; // U+FFFD, written for every ill-formed sequence
1.377 + TUint currentUnicodeCharacter;
1.378 + TUint sequenceLength;
1.379 +
1.380 +
1.381 + FOREVER
1.382 + {
1.383 + TBool illFormed=EFalse; // set when a multi-byte sequence is rejected; the cursor then skips one byte only
1.384 +
1.385 + __ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
1.386 + __ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
1.387 +
1.388 + sequenceLength = 1;
1.389 +
1.390 + // ascii - optimisation (i.e. it isn't a sequence)
1.391 + if (pUtf8[0] < 0x80)
1.392 + {
1.393 + currentUnicodeCharacter = pUtf8[0];
1.394 + }
1.395 + else
1.396 + {
1.397 + // see if well formed utf-8, use table above for reference
1.398 + if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
1.399 + {
1.400 + // 0xc0-0xc1 are not valid lead bytes
1.401 + sequenceLength = 2;
1.402 + }
1.403 + else if ((pUtf8[0] & 0xf0) == 0xe0)
1.404 + {
1.405 + sequenceLength = 3;
1.406 + }
1.407 + else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
1.408 + {
1.409 + // 0xf5-0xff, are not valid bytes
1.410 + sequenceLength = 4;
1.411 + }
1.412 + else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
1.413 + {
1.414 + if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80)) // the first test also keeps pUtf8[1] inside the input
1.415 + {
1.416 + // either we've split the 0xc0 0x80 (i.e. 0xc0 is
1.417 + // the last character in the string) or we've
1.418 + // discovered a valid 0xc0 0x80 sequence.
1.419 + sequenceLength = 2;
1.420 + }
1.421 + }
1.422 +
1.423 + /* checking to see if we got a valid sequence */
1.424 + if (sequenceLength == 1)
1.425 + {
1.426 + // bad value in the leading byte, 0x80-0xc1,0xf5-0xff for example
1.427 + currentUnicodeCharacter = replacementcharacter;
1.428 + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1.429 + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1.430 + }
1.431 + else
1.432 + {
1.433 + // this is a check to see if the sequence goes beyond the input
1.434 + // stream. if output has already been produced this isn't an error
1.435 + // (the tail is reported as unconverted), otherwise it is.
1.436 + if ((pUtf8 + sequenceLength - 1) > pLastUtf8)
1.437 + {
1.438 + // check to see if this sequence was the first character
1.439 + if ((pUnicode - aUnicode.Ptr()) == 0)
1.440 + {
1.441 + return EErrorIllFormedInput;
1.442 + }
1.443 + break;
1.444 + }
1.445 +
1.446 + currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength); // payload bits of the lead byte: mask 0x1F, 0x0F or 0x07
1.447 +
1.448 + /* check the trailing bytes, they should begin with 10 */
1.449 + TUint i = 1;
1.450 +
1.451 + do
1.452 + {
1.453 + if ((pUtf8[i] & 0xc0) == 0x80)
1.454 + {
1.455 + // add the trailing 6 bits to the current unicode char
1.456 + currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
1.457 + }
1.458 + else
1.459 + {
1.460 + // ill formed character (doesn't have a lead 10)
1.461 + currentUnicodeCharacter = replacementcharacter;
1.462 + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1.463 + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1.464 + illFormed=ETrue;
1.465 + break;
1.466 + }
1.467 + i++;
1.468 + }
1.469 + while (i < sequenceLength);
1.470 + }
1.471 +
1.472 + /* conformance check. bits of above table for reference.
1.473 + * +----------------------------------------------------------------+
1.474 + * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
1.475 + * +--------------------+----------+----------+----------+----------+
1.476 + * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0
1.477 + * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F
1.478 + * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90
1.479 + * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F
1.480 + * +--------------------+----------+----------+----------+----------+
1.481 + */
1.482 +
1.483 + if (currentUnicodeCharacter != replacementcharacter) // skipped for anything already rejected (and for a genuine U+FFFD)
1.484 + {
1.485 + if (sequenceLength == 3)
1.486 + {
1.487 + if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
1.488 + {
1.489 + currentUnicodeCharacter = replacementcharacter;
1.490 + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1.491 + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1.492 + illFormed=ETrue;
1.493 + }
1.494 + else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
1.495 + {
1.496 + currentUnicodeCharacter = replacementcharacter;
1.497 + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1.498 + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1.499 + illFormed=ETrue;
1.500 + }
1.501 + }
1.502 + else if (sequenceLength == 4)
1.503 + {
1.504 + if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
1.505 + {
1.506 + currentUnicodeCharacter = replacementcharacter;
1.507 + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1.508 + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1.509 + illFormed=ETrue;
1.510 + }
1.511 + else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
1.512 + {
1.513 + currentUnicodeCharacter = replacementcharacter;
1.514 + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1.515 + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1.516 + illFormed=ETrue;
1.517 + }
1.518 + }
1.519 +
1.520 +
1.521 + /* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
1.522 + * are not Unicode scalar values, any UTF-8 byte sequence that would map to code
1.523 + * points D800..DFFF is ill formed */
1.524 +
1.525 + if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
1.526 + {
1.527 + currentUnicodeCharacter = replacementcharacter;
1.528 + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1.529 + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1.530 + illFormed=ETrue;
1.531 + }
1.532 + }
1.533 + // end conformance check
1.534 + }
1.535 +
1.536 + // would this character generate a surrogate pair in UTF-16?
1.537 + if (currentUnicodeCharacter > 0xFFFF)
1.538 + {
1.539 + // is there enough space to hold a surrogate pair in the output?
1.540 + if (pUnicode >= pLastUnicode)
1.541 + {
1.542 + break; // no, end processing.
1.543 + }
1.544 +
1.545 + TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0; // high surrogate: 0xD7C0 == 0xD800 - (0x10000>>10)
1.546 + *pUnicode++ = STATIC_CAST(TUint16, surrogate);
1.547 +
1.548 + surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00; // low surrogate: low 10 bits
1.549 + *pUnicode++ = STATIC_CAST(TUint16, surrogate);
1.550 + }
1.551 + else
1.552 + {
1.553 + *pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
1.554 + }
1.555 +
1.556 + // move the input pointer
1.557 + if (currentUnicodeCharacter != replacementcharacter)
1.558 + {
1.559 + pUtf8 += sequenceLength;
1.560 + }
1.561 + else if(illFormed == EFalse) // a genuine U+FFFD in the input, or a bad lead byte (sequenceLength is 1)
1.562 + {
1.563 + pUtf8 += (sequenceLength);
1.564 + }
1.565 + else
1.566 + {
1.567 + // we had a character we didn't recognize (i.e. it was invalid)
1.568 + // so move to the next byte in the input and try again from there
1.569 + pUtf8++;
1.570 + }
1.571 +
1.572 + if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
1.573 + {
1.574 + break; // we've either reached the end of the input or the end of output
1.575 + }
1.576 + }
1.577 +
1.578 + aUnicode.SetLength(pUnicode - aUnicode.Ptr()); // pUnicode is one past the last slot written
1.579 + return (pLastUtf8 - pUtf8 + 1); // bytes from the read cursor to the end of the input
1.580 + }
1.581 +
1.582 +/** Given a sample text this function attempts to determine whether or not
1.583 + * the sample text is encoded using the UTF-8 standard encoding scheme.
1.584 +
1.585 +@param aConfidenceLevel Output only: the incoming value is overwritten. On return
1.586 + it holds a confidence between 0 and 100 that the sample is UTF-8;
1.587 + 0 means an ill-formed UTF-8 sequence was found (see the code rules below).
1.588 +@param aSample The sample text to examine. A multi-byte sequence cut off at the
1.589 + end of the sample is not treated as an error.
1.590 +@return void
1.591 + */
1.592 +
1.593 +/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
1.594 + * Well formed UTF-8 Byte Sequences, full table.
1.595 + * +----------------------------------------------------------------+
1.596 + * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
1.597 + * +--------------------+----------+----------+----------+----------+
1.598 + * | U+0000..U+007F | 00..7F | | | | 1 byte, ascii
1.599 + * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2
1.600 + * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
1.601 + * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal
1.602 + * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
1.603 + * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal
1.604 + * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
1.605 + * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal
1.606 + * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
1.607 + * +--------------------+----------+----------+----------+----------+
1.608 + *
1.609 + * As a consequence of the well-formedness conditions specified in table 3-7,
1.610 + * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
1.611 + *
1.612 + * Code Rules:
1.613 + * R1: If the string contains any non-UTF-8 characters the returned confidence
1.614 + * is 0. Valid UTF-8 combinations are listed in the above table.
1.615 + * R2: Otherwise if the string is shorter than 95 bytes and starts with the
1.616 + * UTF-8 BOM (byte order mark, EF BB BF) the returned confidence is 95.
1.617 + * R3: Otherwise the confidence returned is the sample length in bytes,
1.618 + * capped at 100; an empty sample returns 89.
1.619 + * R4: If the sample string is under 75 bytes (and has no BOM), the
1.620 + * confidence is set to 75.
1.621 + */
1.622 +void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
1.623 + {
1.624 +
1.625 + TInt sampleLength = aSample.Length();
1.626 +
1.627 + if (sampleLength == 0)
1.628 + {
1.629 + aConfidenceLevel = 89; // fixed confidence for an empty sample
1.630 + return;
1.631 + }
1.632 + TInt bytesRemaining = 0; // continuation bytes still expected for the current sequence
1.633 + TUint sequenceLength = 0; // total length of the sequence currently being checked
1.634 +
1.635 + aConfidenceLevel = sampleLength; // starting value; replaced for short samples and capped at 100 at the end
1.636 +
1.637 + const TUint8* buffer = &aSample[0];
1.638 +
1.639 + if (sampleLength < 95)
1.640 + {
1.641 + // check for the BOM
1.642 + if ((sampleLength >= 3) &&
1.643 + ((buffer[0] == 0xEF) &&
1.644 + (buffer[1] == 0xBB) &&
1.645 + (buffer[2] == 0xBF))
1.646 + )
1.647 + {
1.648 + aConfidenceLevel = 95;
1.649 + }
1.650 + else if (sampleLength < 75)
1.651 + {
1.652 + aConfidenceLevel = 75;
1.653 + }
1.654 + }
1.655 +
1.656 + for (TInt index = 0;index != sampleLength;index++)
1.657 + {
1.658 +
1.659 + if (bytesRemaining > 0)
1.660 + {
1.661 + // bytesRemaining > 0, means that a byte representing the start of a
1.662 + // multibyte sequence was encountered and the bytesRemaining is the
1.663 + // number of bytes to follow.
1.664 +
1.665 + if ((buffer[index] & 0xc0) == 0x80)
1.666 + {
1.667 + // need to check for ill-formed sequences -- all are in the 2nd byte
1.668 + // (bytesRemaining == sequenceLength - 1 means buffer[index - 1] is the lead byte)
1.669 + if ((sequenceLength == 3) && (bytesRemaining == 2))
1.670 + {
1.671 + if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
1.672 + {
1.673 + aConfidenceLevel = 0;
1.674 + break;
1.675 + }
1.676 + else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
1.677 + {
1.678 + aConfidenceLevel = 0;
1.679 + break;
1.680 + }
1.681 + }
1.682 + else if ((sequenceLength == 4) && (bytesRemaining == 3))
1.683 + {
1.684 + if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
1.685 + {
1.686 + aConfidenceLevel = 0;
1.687 + break;
1.688 + }
1.689 + else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
1.690 + {
1.691 + aConfidenceLevel = 0;
1.692 + break;
1.693 + }
1.694 + }
1.695 +
1.696 + --bytesRemaining;
1.697 + continue;
1.698 + }
1.699 + else
1.700 + {
1.701 + aConfidenceLevel = 0; // expected a continuation byte (10xxxxxx) but found something else
1.702 + break;
1.703 + }
1.704 + }
1.705 +
1.706 + if (bytesRemaining == 0) // always true here: the branch above either continues or breaks
1.707 + {
1.708 + if (buffer[index] < 0x80)
1.709 + {
1.710 + // The value of aSample[index] is in the range 0x00-0x7f
1.711 + //UTF8 maintains ASCII transparency. So it's a valid
1.712 + //UTF8. Do nothing, check next value.
1.713 + continue;
1.714 + }
1.715 + else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
1.716 + {
1.717 + // valid start of a 2 byte sequence (see conformance note)
1.718 + sequenceLength = 2;
1.719 + bytesRemaining = 1;
1.720 + }
1.721 + else if ((buffer[index] & 0xf0) == 0xe0)
1.722 + {
1.723 + // valid start of a 3 byte sequence
1.724 + sequenceLength = 3;
1.725 + bytesRemaining = 2;
1.726 + }
1.727 + else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
1.728 + {
1.729 + // valid start of a 4 byte sequence (see conformance note)
1.730 + sequenceLength = 4;
1.731 + bytesRemaining = 3;
1.732 + }
1.733 + else
1.734 + {
1.735 + // wasn't anything expected so must be an illegal/irregular UTF8 coded value
1.736 + aConfidenceLevel = 0;
1.737 + break;
1.738 + }
1.739 + }
1.740 + } // for
1.741 +
1.742 + aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; // clamp the result to 0..100
1.743 + }
1.744 +
1.745 +// End of file