sl@0: /* sl@0: * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). sl@0: * All rights reserved. sl@0: * This component and the accompanying materials are made available sl@0: * under the terms of the License "Eclipse Public License v1.0" sl@0: * which accompanies this distribution, and is available sl@0: * at the URL "http://www.eclipse.org/legal/epl-v10.html". sl@0: * sl@0: * Initial Contributors: sl@0: * Nokia Corporation - initial contribution. sl@0: * sl@0: * Contributors: sl@0: * sl@0: * Description: sl@0: * sl@0: */ sl@0: sl@0: sl@0: #include sl@0: #include sl@0: #include sl@0: sl@0: #define STATIC_CAST(t,v) static_cast(v) sl@0: #define CONST_CAST(t,v) const_cast(v) sl@0: #define FOREVER for(;;) sl@0: sl@0: const TUint KNotInBase64Alphabet=KMaxTUint; sl@0: sl@0: enum TPanic sl@0: { sl@0: EPanicBad6BitNumber=1, sl@0: EPanicBadUtf7Pointers1, sl@0: EPanicBadUtf7Pointers2, sl@0: EPanicBadUtf7Pointers3, sl@0: EPanicBadUtf7Pointers4, sl@0: EPanicBadUtf7Pointers5, sl@0: EPanicBadUtf7Pointers6, sl@0: EPanicBadUtf7Pointers7, sl@0: EPanicBadUtf7Pointers8, sl@0: EPanicBadUtf7Pointers9, sl@0: EPanicBadUtf7Pointers10, sl@0: EPanicBadUtf7Pointers11, sl@0: EPanicNotInBase64Block, sl@0: EPanicBadUnicodePointers1, sl@0: EPanicBadUnicodePointers2, sl@0: EPanicBadUnicodePointers3, sl@0: EPanicBadUnicodePointers4, sl@0: EPanicBadUnicodePointers5, sl@0: EPanicBadUnicodePointers6, sl@0: EPanicBadUnicodePointers7, sl@0: EPanicBadUnicodePointers8, sl@0: EPanicBadUnicodePointers9, sl@0: EPanicBadUnicodePointers10, sl@0: EPanicBadBitBufferState1, sl@0: EPanicBadBitBufferState2, sl@0: EPanicBadBitBufferState3, sl@0: EPanicBadBitBufferState4, sl@0: EPanicBadBitBufferState5, sl@0: EPanicBadBitBufferState6, sl@0: EPanicBadBitBufferState7, sl@0: EPanicBadBitBufferState8, sl@0: EPanicBadBitBufferState9, sl@0: EPanicBadBitBufferState10, sl@0: EPanicBadBitBufferState11, sl@0: EPanicBadBitBufferState12, sl@0: EPanicBadBitBufferState13, sl@0: EPanicBadBitBufferState14, sl@0: EPanicBadBitBufferState15, sl@0: EPanicBadBitBufferState16, sl@0: EPanicBadBitBufferState17, sl@0: EPanicUnexpectedNumberOfLoopIterations, sl@0: EPanicInitialEscapeCharacterButNoBase64, sl@0: EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary, sl@0: EPanicBadUtf8Pointers1, sl@0: EPanicBadUtf8Pointers2, sl@0: EPanicBadUtf8Pointers3, sl@0: EPanicBadUtf8Pointers4, sl@0: EPanicBadUtf8Pointers5, sl@0: EPanicBadUtf8Pointers6, sl@0: EPanicBadUtf8Pointers7, sl@0: EPanicOutOfSyncUtf7Byte1, sl@0: EPanicOutOfSyncUtf7Byte2, sl@0: EPanicOutOfSyncBase64Decoding sl@0: }; sl@0: sl@0: _LIT(KLitPanicText, "CHARCONV-UTF"); sl@0: sl@0: LOCAL_C void Panic(TPanic aPanic) sl@0: { sl@0: User::Panic(KLitPanicText, aPanic); sl@0: } sl@0: sl@0: inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';} sl@0: sl@0: inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer) sl@0: { sl@0: return (aBitBuffer&((1<>6)); sl@0: *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f)); sl@0: sl@0: } sl@0: sl@0: // check to see if we have a surrogate in the stream, surrogates encode code points outside sl@0: // the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars. sl@0: sl@0: else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8) sl@0: { sl@0: // surrogate pair - 4 bytes in utf-8 sl@0: // U+10000..U+10FFFF sl@0: sl@0: __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2)); sl@0: // is there enough space to hold the character sl@0: if ((pointerToLastUtf8Byte - pUtf8) < 3) sl@0: { sl@0: pUtf8--; sl@0: pUnicode--; sl@0: break; // no go to the exit condition sl@0: } sl@0: sl@0: __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4)); sl@0: if (pUnicode >= pointerToLastUnicodeCharacter) sl@0: { sl@0: pUtf8--; sl@0: pUnicode--; sl@0: inputIsTruncated = ETrue; sl@0: break; // middle of a surrogate pair. go to end condition sl@0: } sl@0: sl@0: if ((pUnicode[1] & 0xfc00) != 0xdc00) sl@0: { sl@0: return EErrorIllFormedInput; sl@0: } sl@0: sl@0: // convert utf-16 surrogate to utf-32 sl@0: TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000; sl@0: sl@0: // convert utf-32 to utf-8 sl@0: *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18)); sl@0: *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f)); sl@0: *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f)); sl@0: *pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f)); sl@0: sl@0: // we consumed 2 utf-16 values, move this pointer sl@0: pUnicode++; sl@0: } sl@0: else sl@0: { sl@0: // 3 byte - utf-8, U+800..U+FFFF rest of BMP. sl@0: sl@0: if (pointerToLastUtf8Byte - pUtf8 < 2) sl@0: { sl@0: pUtf8--; sl@0: pUnicode--; sl@0: break; sl@0: } sl@0: *pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12)); sl@0: *pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f)); sl@0: *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f)); sl@0: } sl@0: sl@0: if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte)) sl@0: { sl@0: break; sl@0: } sl@0: sl@0: pUtf8++; sl@0: pUnicode++; sl@0: sl@0: } sl@0: sl@0: if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated) sl@0: { sl@0: return EErrorIllFormedInput; sl@0: } sl@0: sl@0: aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1); sl@0: return pointerToLastUnicodeCharacter-pUnicode; sl@0: } sl@0: sl@0: sl@0: sl@0: sl@0: sl@0: sl@0: sl@0: sl@0: sl@0: sl@0: sl@0: /** Converts text encoded using the Unicode transformation format UTF-8 into the sl@0: Unicode UCS-2 character set. sl@0: sl@0: @param aUnicode On return, contains the Unicode encoded output string. sl@0: @param aUtf8 The UTF-8 encoded input string sl@0: @return The number of unconverted bytes left at the end of the input descriptor, sl@0: or one of the error values defined in TError. */ sl@0: EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8) sl@0: { sl@0: return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse); sl@0: } sl@0: sl@0: static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters, sl@0: TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex) sl@0: { sl@0: if (aNumberOfUnconvertibleCharacters<=0) sl@0: { sl@0: aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex; sl@0: } sl@0: ++aNumberOfUnconvertibleCharacters; sl@0: } sl@0: sl@0: /** Converts text encoded using the Unicode transformation format UTF-8 into the sl@0: Unicode UCS-2 character set. sl@0: sl@0: @param aUnicode On return, contains the Unicode encoded output string. sl@0: @param aUtf8 The UTF-8 encoded input string sl@0: @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java sl@0: @return The number of unconverted bytes left at the end of the input descriptor, sl@0: or one of the error values defined in TError. */ sl@0: TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8) sl@0: { sl@0: TInt dummyUnconverted, dummyUnconvertedIndex; sl@0: return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex); sl@0: } sl@0: sl@0: /** Converts text encoded using the Unicode transformation format UTF-8 into the sl@0: Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input. sl@0: sl@0: The variant of UTF-8 used internally by Java differs slightly from standard sl@0: UTF-8. The TBool argument controls the UTF-8 variant generated by this function. sl@0: sl@0: @param aUnicode On return, contains the Unicode encoded output string. sl@0: @param aUtf8 The UTF-8 encoded input string sl@0: @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java sl@0: UTF-8. The default is EFalse. sl@0: @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes sl@0: which were not converted. sl@0: @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index sl@0: of the first byte of the first unconvertible character. For instance if the sl@0: first character in the input descriptor (aForeign) could not be converted, sl@0: then this parameter is set to the first byte of that character, i.e. zero. sl@0: A negative value is returned if all the characters were converted. sl@0: @return The number of unconverted bytes left at the end of the input descriptor, sl@0: or one of the error values defined in TError. */ sl@0: sl@0: /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 sl@0: * Well formed UTF-8 Byte Sequences, full table. sl@0: * +----------------------------------------------------------------+ sl@0: * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | sl@0: * +--------------------+----------+----------+----------+----------+ sl@0: * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii sl@0: * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 sl@0: * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 sl@0: * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal sl@0: * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F sl@0: * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal sl@0: * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 sl@0: * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal sl@0: * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F sl@0: * +--------------------+----------+----------+----------+----------+ sl@0: * sl@0: * As a consequence of the well-formedness conditions specified in table 3-7, sl@0: * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. sl@0: */ sl@0: TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8, sl@0: TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) sl@0: { sl@0: aUnicode.SetLength(0); sl@0: sl@0: if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0)) sl@0: { sl@0: return aUtf8.Length(); sl@0: } sl@0: sl@0: TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr()); sl@0: const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1); sl@0: const TUint8* pUtf8 = aUtf8.Ptr(); sl@0: const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1); sl@0: const TUint16 replacementcharacter = 0xFFFD; sl@0: TUint currentUnicodeCharacter; sl@0: TUint sequenceLength; sl@0: sl@0: sl@0: FOREVER sl@0: { sl@0: TBool illFormed=EFalse; sl@0: sl@0: __ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8)); sl@0: __ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3)); sl@0: sl@0: sequenceLength = 1; sl@0: sl@0: // ascii - optimisation (i.e. it isn't a sequence) sl@0: if (pUtf8[0] < 0x80) sl@0: { sl@0: currentUnicodeCharacter = pUtf8[0]; sl@0: } sl@0: else sl@0: { sl@0: // see if well formed utf-8, use table above for reference sl@0: if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf)) sl@0: { sl@0: // 0xc1-0xc2 are not valid bytes sl@0: sequenceLength = 2; sl@0: } sl@0: else if ((pUtf8[0] & 0xf0) == 0xe0) sl@0: { sl@0: sequenceLength = 3; sl@0: } sl@0: else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5)) sl@0: { sl@0: // 0xf5-0xff, are not valid bytes sl@0: sequenceLength = 4; sl@0: } sl@0: else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8) sl@0: { sl@0: if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80)) sl@0: { sl@0: // either we've split the 0xc0 0x80 (i.e. 0xc0 is sl@0: // the last character in the string) or we've sl@0: // discovered a valid 0xc0 0x80 sequence. sl@0: sequenceLength = 2; sl@0: } sl@0: } sl@0: sl@0: /* checking to see if we got a valid sequence */ sl@0: if (sequenceLength == 1) sl@0: { sl@0: // bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example sl@0: currentUnicodeCharacter = replacementcharacter; sl@0: UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, sl@0: aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); sl@0: } sl@0: else sl@0: { sl@0: // this is a check to see if the sequence goes beyond the input sl@0: // stream. if its not the first and only character in the input sl@0: // stream this isn't an error, otherwise it is. sl@0: if ((pUtf8 + sequenceLength - 1) > pLastUtf8) sl@0: { sl@0: // check to see if this sequence was the first character sl@0: if ((pUnicode - aUnicode.Ptr()) == 0) sl@0: { sl@0: return EErrorIllFormedInput; sl@0: } sl@0: break; sl@0: } sl@0: sl@0: currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength); sl@0: sl@0: /* check the trailing bytes, they should begin with 10 */ sl@0: TUint i = 1; sl@0: sl@0: do sl@0: { sl@0: if ((pUtf8[i] & 0xc0) == 0x80) sl@0: { sl@0: // add the trailing 6 bits to the current unicode char sl@0: currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F); sl@0: } sl@0: else sl@0: { sl@0: // ill formed character (doesn't have a lead 10) sl@0: currentUnicodeCharacter = replacementcharacter; sl@0: UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, sl@0: aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); sl@0: illFormed=ETrue; sl@0: break; sl@0: } sl@0: i++; sl@0: } sl@0: while (i < sequenceLength); sl@0: } sl@0: sl@0: /* conformance check. bits of above table for reference. sl@0: * +----------------------------------------------------------------+ sl@0: * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | sl@0: * +--------------------+----------+----------+----------+----------+ sl@0: * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0 sl@0: * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F sl@0: * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90 sl@0: * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F sl@0: * +--------------------+----------+----------+----------+----------+ sl@0: */ sl@0: sl@0: if (currentUnicodeCharacter != replacementcharacter) sl@0: { sl@0: if (sequenceLength == 3) sl@0: { sl@0: if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0)) sl@0: { sl@0: currentUnicodeCharacter = replacementcharacter; sl@0: UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, sl@0: aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); sl@0: illFormed=ETrue; sl@0: } sl@0: else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F)) sl@0: { sl@0: currentUnicodeCharacter = replacementcharacter; sl@0: UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, sl@0: aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); sl@0: illFormed=ETrue; sl@0: } sl@0: } sl@0: else if (sequenceLength == 4) sl@0: { sl@0: if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90)) sl@0: { sl@0: currentUnicodeCharacter = replacementcharacter; sl@0: UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, sl@0: aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); sl@0: illFormed=ETrue; sl@0: } sl@0: else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F)) sl@0: { sl@0: currentUnicodeCharacter = replacementcharacter; sl@0: UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, sl@0: aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); sl@0: illFormed=ETrue; sl@0: } sl@0: } sl@0: sl@0: sl@0: /* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points sl@0: * are not Unicode scalar values, any UTF-8 byte sequence that would map to code sl@0: * points D800..DFFF is ill formed */ sl@0: sl@0: if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF)) sl@0: { sl@0: currentUnicodeCharacter = replacementcharacter; sl@0: UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, sl@0: aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); sl@0: illFormed=ETrue; sl@0: } sl@0: } sl@0: // end conformance check sl@0: } sl@0: sl@0: // would this character generate a surrogate pair in UTF-16? sl@0: if (currentUnicodeCharacter > 0xFFFF) sl@0: { sl@0: // is there enough space to hold a surrogate pair in the output? sl@0: if (pUnicode >= pLastUnicode) sl@0: { sl@0: break; // no, end processing. sl@0: } sl@0: sl@0: TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0; sl@0: *pUnicode++ = STATIC_CAST(TUint16, surrogate); sl@0: sl@0: surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00; sl@0: *pUnicode++ = STATIC_CAST(TUint16, surrogate); sl@0: } sl@0: else sl@0: { sl@0: *pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter); sl@0: } sl@0: sl@0: // move the input pointer sl@0: if (currentUnicodeCharacter != replacementcharacter) sl@0: { sl@0: pUtf8 += sequenceLength; sl@0: } sl@0: else if(illFormed == EFalse) sl@0: { sl@0: pUtf8 += (sequenceLength); sl@0: } sl@0: else sl@0: { sl@0: // we had a character we didn't recognize (i.e. it was invalid) sl@0: // so move to the next character in the input sl@0: pUtf8++; sl@0: } sl@0: sl@0: if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode)) sl@0: { sl@0: break; // we've either reached the end of the input or the end of output sl@0: } sl@0: } sl@0: sl@0: aUnicode.SetLength(pUnicode - aUnicode.Ptr()); sl@0: return (pLastUtf8 - pUtf8 + 1); sl@0: } sl@0: sl@0: /** Given a sample text this function attempts to determine whether or not sl@0: * the same text is encoded using the UTF-8 standard encoding scheme. sl@0: sl@0: @param TInt a confidence level, given at certain value. if the given sample sl@0: is UTF-8 this value will not be changed (unless > 100) then its sl@0: set to 100. Otherwise if the same isn't UTF-8, its set to 0. sl@0: @param TDesC8 sample text. sl@0: UTF-8. The default is EFalse. sl@0: @return void sl@0: */ sl@0: sl@0: /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 sl@0: * Well formed UTF-8 Byte Sequences, full table. sl@0: * +----------------------------------------------------------------+ sl@0: * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | sl@0: * +--------------------+----------+----------+----------+----------+ sl@0: * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii sl@0: * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 sl@0: * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 sl@0: * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal sl@0: * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F sl@0: * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal sl@0: * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 sl@0: * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal sl@0: * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F sl@0: * +--------------------+----------+----------+----------+----------+ sl@0: * sl@0: * As a consequence of the well-formedness conditions specified in table 3-7, sl@0: * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. sl@0: * sl@0: * Code Rules: sl@0: * R1: If the string contains any non-UTF-8 characters the returned confidence sl@0: * is 0. Valid UTF-8 combinations are listed in the above table. sl@0: * R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in sl@0: * the (see ) the returned confidence is 95. sl@0: * R3: Otherwise the confidence returned is based upon the sample string sl@0: * length. sl@0: * R4: If the sample string is under 75 characters, the confidence is set to sl@0: * 75. sl@0: */ sl@0: void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample) sl@0: { sl@0: sl@0: TInt sampleLength = aSample.Length(); sl@0: sl@0: if (sampleLength == 0) sl@0: { sl@0: aConfidenceLevel = 89; sl@0: return; sl@0: } sl@0: TInt bytesRemaining = 0; sl@0: TUint sequenceLength = 0; sl@0: sl@0: aConfidenceLevel = sampleLength; sl@0: sl@0: const TUint8* buffer = &aSample[0]; sl@0: sl@0: if (sampleLength < 95) sl@0: { sl@0: // check for the BOM sl@0: if ((sampleLength >= 3) && sl@0: ((buffer[0] == 0xEF) && sl@0: (buffer[1] == 0xBB) && sl@0: (buffer[2] == 0xBF)) sl@0: ) sl@0: { sl@0: aConfidenceLevel = 95; sl@0: } sl@0: else if (sampleLength < 75) sl@0: { sl@0: aConfidenceLevel = 75; sl@0: } sl@0: } sl@0: sl@0: for (TInt index = 0;index != sampleLength;index++) sl@0: { sl@0: sl@0: if (bytesRemaining > 0) sl@0: { sl@0: // bytesRemaining > 0, means that a byte representing the start of a sl@0: // multibyte sequence was encountered and the bytesRemaining is the sl@0: // number of bytes to follow. sl@0: sl@0: if ((buffer[index] & 0xc0) == 0x80) sl@0: { sl@0: // need to check for ill-formed sequences -- all are in the 2nd byte sl@0: sl@0: if ((sequenceLength == 3) && (bytesRemaining == 2)) sl@0: { sl@0: if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0)) sl@0: { sl@0: aConfidenceLevel = 0; sl@0: break; sl@0: } sl@0: else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f)) sl@0: { sl@0: aConfidenceLevel = 0; sl@0: break; sl@0: } sl@0: } sl@0: else if ((sequenceLength == 4) && (bytesRemaining == 3)) sl@0: { sl@0: if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90)) sl@0: { sl@0: aConfidenceLevel = 0; sl@0: break; sl@0: } sl@0: else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f)) sl@0: { sl@0: aConfidenceLevel = 0; sl@0: break; sl@0: } sl@0: } sl@0: sl@0: --bytesRemaining; sl@0: continue; sl@0: } sl@0: else sl@0: { sl@0: aConfidenceLevel = 0; sl@0: break; sl@0: } sl@0: } sl@0: sl@0: if (bytesRemaining == 0) sl@0: { sl@0: if (buffer[index] < 0x80) sl@0: { sl@0: // The value of aSample[index] is in the range 0x00-0x7f sl@0: //UTF8 maintains ASCII transparency. So it's a valid sl@0: //UTF8. Do nothing, check next value. sl@0: continue; sl@0: } sl@0: else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0)) sl@0: { sl@0: // valid start of a 2 byte sequence (see conformance note) sl@0: sequenceLength = 2; sl@0: bytesRemaining = 1; sl@0: } sl@0: else if ((buffer[index] & 0xf0) == 0xe0) sl@0: { sl@0: // valid start of a 3 byte sequence sl@0: sequenceLength = 3; sl@0: bytesRemaining = 2; sl@0: } sl@0: else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5)) sl@0: { sl@0: // valid start of a 4 byte sequence (see conformance note) sl@0: sequenceLength = 4; sl@0: bytesRemaining = 3; sl@0: } sl@0: else sl@0: { sl@0: // wasn't anything expected so must be an illegal/irregular UTF8 coded value sl@0: aConfidenceLevel = 0; sl@0: break; sl@0: } sl@0: } sl@0: } // for sl@0: sl@0: aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; sl@0: } sl@0: sl@0: // End of file