Update contrib.
2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
4 * This component and the accompanying materials are made available
5 * under the terms of the License "Eclipse Public License v1.0"
6 * which accompanies this distribution, and is available
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
9 * Initial Contributors:
10 * Nokia Corporation - initial contribution.
// Spell the C++ named casts as macros taking (type, value).
23 #define STATIC_CAST(t,v) static_cast<t>(v)
24 #define CONST_CAST(t,v) const_cast<t>(v)
// Loop header with no termination condition; the loop body has to leave explicitly.
25 #define FOREVER for(;;)
// Sentinel set to KMaxTUint; going by its name it flags a value that is not in the base64 alphabet.
27 const TUint KNotInBase64Alphabet=KMaxTUint;
// Panic reason codes used in this file. They are handed to Panic(), for
// example from the __ASSERT_DEBUG checks in the converters. The first
// enumerator is 1 and each later one takes the next value, so inserting or
// reordering entries changes the numbers that get reported.
31 EPanicBad6BitNumber=1,
32 EPanicBadUtf7Pointers1,
33 EPanicBadUtf7Pointers2,
34 EPanicBadUtf7Pointers3,
35 EPanicBadUtf7Pointers4,
36 EPanicBadUtf7Pointers5,
37 EPanicBadUtf7Pointers6,
38 EPanicBadUtf7Pointers7,
39 EPanicBadUtf7Pointers8,
40 EPanicBadUtf7Pointers9,
41 EPanicBadUtf7Pointers10,
42 EPanicBadUtf7Pointers11,
43 EPanicNotInBase64Block,
44 EPanicBadUnicodePointers1,
45 EPanicBadUnicodePointers2,
46 EPanicBadUnicodePointers3,
47 EPanicBadUnicodePointers4,
48 EPanicBadUnicodePointers5,
49 EPanicBadUnicodePointers6,
50 EPanicBadUnicodePointers7,
51 EPanicBadUnicodePointers8,
52 EPanicBadUnicodePointers9,
53 EPanicBadUnicodePointers10,
54 EPanicBadBitBufferState1,
55 EPanicBadBitBufferState2,
56 EPanicBadBitBufferState3,
57 EPanicBadBitBufferState4,
58 EPanicBadBitBufferState5,
59 EPanicBadBitBufferState6,
60 EPanicBadBitBufferState7,
61 EPanicBadBitBufferState8,
62 EPanicBadBitBufferState9,
63 EPanicBadBitBufferState10,
64 EPanicBadBitBufferState11,
65 EPanicBadBitBufferState12,
66 EPanicBadBitBufferState13,
67 EPanicBadBitBufferState14,
68 EPanicBadBitBufferState15,
69 EPanicBadBitBufferState16,
70 EPanicBadBitBufferState17,
71 EPanicUnexpectedNumberOfLoopIterations,
72 EPanicInitialEscapeCharacterButNoBase64,
73 EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
74 EPanicBadUtf8Pointers1,
75 EPanicBadUtf8Pointers2,
76 EPanicBadUtf8Pointers3,
77 EPanicBadUtf8Pointers4,
78 EPanicBadUtf8Pointers5,
79 EPanicBadUtf8Pointers6,
80 EPanicBadUtf8Pointers7,
81 EPanicOutOfSyncUtf7Byte1,
82 EPanicOutOfSyncUtf7Byte2,
83 EPanicOutOfSyncBase64Decoding
86 _LIT(KLitPanicText, "CHARCONV-UTF");
88 LOCAL_C void Panic(TPanic aPanic)
90 User::Panic(KLitPanicText, aPanic);
93 inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';}
95 inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
97 return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0;
107 /** Converts Unicode text into UTF-8 encoding.
109 @param aUtf8 On return, contains the UTF-8 encoded output string.
110 @param aUnicode The Unicode-encoded input string.
111 @return The number of unconverted characters left at the end of the input
112 descriptor, or one of the error values defined in TError. */
113 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
115 return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
120 /** Converts Unicode text into UTF-8 encoding.
122 Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.
124 The variant of UTF-8 used internally by Java differs slightly from standard
125 UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
127 @param aUtf8 On return, contains the UTF-8 encoded output string.
128 @param aUnicode A UCS-2 encoded input string.
129 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
130 UTF-8. The default is EFalse.
131 @return The number of unconverted characters left at the end of the input descriptor,
132 or one of the error values defined in TError. */
// Encodes the 16-bit units of aUnicode as UTF-8 into aUtf8. Returns the number
// of input units left unconverted, or EErrorIllFormedInput.
133 TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8,
134 const TDesC16& aUnicode,
135 TBool aGenerateJavaConformantUtf8)
// empty input
137 if (aUnicode.Length() == 0)
// output descriptor has no capacity: the whole input is left unconverted
142 if (aUtf8.MaxLength() == 0)
144 return aUnicode.Length();
// pUtf8 is the write cursor. Both "pointerToLast..." values are inclusive
// bounds: they address the last usable element, not one past the end.
147 TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
148 const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
149 TBool inputIsTruncated = EFalse;
150 const TUint16* pUnicode = aUnicode.Ptr();
151 const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
155 __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
156 __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
// U+0000..U+007F - 1 byte, except U+0000 in Java mode (2 bytes)
158 if (pUnicode[0] < 0x80)
162 // internally java is different since the \x0000 character is
163 // translated into \xC0 \x80.
165 if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
// two bytes are needed; this tests for exactly one byte of room remaining
167 if (pUtf8 == pointerToLastUtf8Byte)
173 *pUtf8++ = STATIC_CAST(TUint8, 0xc0);
174 *pUtf8 = STATIC_CAST(TUint8, 0x80);
178 *pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
181 else if (pUnicode[0] < 0x800)
183 // U+0080..U+07FF - 2 bytes
// two bytes are needed; this tests for exactly one byte of room remaining
185 if (pUtf8 == pointerToLastUtf8Byte)
// lead byte carries the top 5 bits, trail byte the low 6 bits
192 *pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
193 *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
197 // check to see if we have a surrogate in the stream, surrogates encode code points outside
198 // the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.
// (x & 0xfc00) == 0xd800 matches high surrogates 0xD800..0xDBFF only
200 else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
202 // surrogate pair - 4 bytes in utf-8
205 __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
206 // is there enough space to hold the character
// four bytes are written, so at least three more must follow pUtf8
207 if ((pointerToLastUtf8Byte - pUtf8) < 3)
211 break; // no go to the exit condition
214 __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
// a high surrogate in the last input position has no partner to pair with
215 if (pUnicode >= pointerToLastUnicodeCharacter)
219 inputIsTruncated = ETrue;
220 break; // middle of a surrogate pair. go to end condition
// the unit after a high surrogate must be a low surrogate (0xDC00..0xDFFF)
223 if ((pUnicode[1] & 0xfc00) != 0xdc00)
225 return EErrorIllFormedInput;
228 // convert utf-16 surrogate to utf-32
229 TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
231 // convert utf-32 to utf-8
232 *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));
233 *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
234 *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
235 *pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
237 // we consumed 2 utf-16 values, move this pointer
242 // 3 byte - utf-8, U+800..U+FFFF rest of BMP.
// NOTE(review): this branch is also taken for a lone low surrogate
// (0xDC00..0xDFFF), and for every surrogate when aGenerateJavaConformantUtf8
// is set, because the test above only matches high surrogates in non-Java
// mode. Confirm that encoding those units as three bytes is intended.
244 if (pointerToLastUtf8Byte - pUtf8 < 2)
250 *pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
251 *pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
252 *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
// stop once the last input unit or the last output byte has been used
255 if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
// nothing was converted and the input ended inside a surrogate pair
265 if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
267 return EErrorIllFormedInput;
// each branch above leaves pUtf8 on the last byte it wrote, hence the +1
270 aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
// presumably pUnicode addresses the last unit consumed here, making this the
// count of units still unconverted - the pointer updates are not visible
271 return pointerToLastUnicodeCharacter-pUnicode;
284 /** Converts text encoded using the Unicode transformation format UTF-8 into the
285 Unicode UCS-2 character set.
287 @param aUnicode On return, contains the Unicode encoded output string.
288 @param aUtf8 The UTF-8 encoded input string
289 @return The number of unconverted bytes left at the end of the input descriptor,
290 or one of the error values defined in TError. */
291 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
293 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
296 static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
297 TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
299 if (aNumberOfUnconvertibleCharacters<=0)
301 aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
303 ++aNumberOfUnconvertibleCharacters;
306 /** Converts text encoded using the Unicode transformation format UTF-8 into the
307 Unicode UCS-2 character set.
309 @param aUnicode On return, contains the Unicode encoded output string.
310 @param aUtf8 The UTF-8 encoded input string
311 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
312 @return The number of unconverted bytes left at the end of the input descriptor,
313 or one of the error values defined in TError. */
314 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
316 TInt dummyUnconverted, dummyUnconvertedIndex;
317 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
320 /** Converts text encoded using the Unicode transformation format UTF-8 into the
321 Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
323 The variant of UTF-8 used internally by Java differs slightly from standard
324 UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
326 @param aUnicode On return, contains the Unicode encoded output string.
327 @param aUtf8 The UTF-8 encoded input string
328 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
329 UTF-8. The default is EFalse.
330 @param aNumberOfUnconvertibleCharacters On return, contains the number of
331 characters which could not be converted.
332 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index
333 of the first byte of the first unconvertible character. For instance if the
334 first character in the input descriptor (aUtf8) could not be converted,
335 then this parameter is set to the first byte of that character, i.e. zero.
336 A negative value is returned if all the characters were converted.
337 @return The number of unconverted bytes left at the end of the input descriptor,
338 or one of the error values defined in TError. */
340 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
341 * Well formed UTF-8 Byte Sequences, full table.
342 * +----------------------------------------------------------------+
343 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
344 * +--------------------+----------+----------+----------+----------+
345 * | U+0000..U+007F | 00..7F | | | | 1 byte, ascii
346 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2
347 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
348 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal
349 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
350 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal
351 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
352 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal
353 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
354 * +--------------------+----------+----------+----------+----------+
356 * As a consequence of the well-formedness conditions specified in table 3-7,
357 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
// Decodes the UTF-8 bytes of aUtf8 into 16-bit units in aUnicode. Ill-formed
// input is replaced by 0xFFFD and reported through the two reference
// parameters. Returns the number of input bytes left unconverted, or
// EErrorIllFormedInput.
359 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
360 TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
362 aUnicode.SetLength(0);
// Empty input or zero-capacity output: every input byte is left unconverted.
// NOTE(review): no assignment to the two reference parameters is visible
// before this return - confirm callers do not read them on this path.
364 if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
366 return aUtf8.Length();
// pUnicode is the write cursor and pUtf8 the read cursor. The two "pLast..."
// pointers are inclusive bounds (last usable element, not one past the end).
369 TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
370 const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
371 const TUint8* pUtf8 = aUtf8.Ptr();
372 const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
// substituted for anything that cannot be decoded
373 const TUint16 replacementcharacter = 0xFFFD;
374 TUint currentUnicodeCharacter;
375 TUint sequenceLength;
380 TBool illFormed=EFalse;
382 __ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
383 __ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
387 // ascii - optimisation (i.e. it isn't a sequence)
390 currentUnicodeCharacter = pUtf8[0];
394 // see if well formed utf-8, use table above for reference
// lead byte of a 2 byte sequence
395 if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
397 // 0xc0-0xc1 are not valid bytes
// lead byte of a 3 byte sequence (0xe0..0xef)
400 else if ((pUtf8[0] & 0xf0) == 0xe0)
// lead byte of a 4 byte sequence (0xf0..0xf4)
404 else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
406 // 0xf5-0xff, are not valid bytes
// Java mode only: 0xc0 is accepted as a lead byte so that 0xc0 0x80 can be read
409 else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
411 if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
413 // either we've split the 0xc0 0x80 (i.e. 0xc0 is
414 // the last character in the string) or we've
415 // discovered a valid 0xc0 0x80 sequence.
420 /* checking to see if we got a valid sequence */
// presumably sequenceLength is still 1 here when none of the lead-byte tests
// above matched - the assignments themselves are not visible
421 if (sequenceLength == 1)
423 // bad value in the leading byte, 0xc0-0xc1,0xf5-0xff for example
424 currentUnicodeCharacter = replacementcharacter;
425 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
426 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
430 // this is a check to see if the sequence goes beyond the input
431 // stream. if its not the first and only character in the input
432 // stream this isn't an error, otherwise it is.
433 if ((pUtf8 + sequenceLength - 1) > pLastUtf8)
435 // check to see if this sequence was the first character
436 if ((pUnicode - aUnicode.Ptr()) == 0)
438 return EErrorIllFormedInput;
// keep only the low (7 - sequenceLength) bits of the lead byte as payload
443 currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
445 /* check the trailing bytes, they should begin with 10 */
450 if ((pUtf8[i] & 0xc0) == 0x80)
452 // add the trailing 6 bits to the current unicode char
453 currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
457 // ill formed character (doesn't have a lead 10)
458 currentUnicodeCharacter = replacementcharacter;
459 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
460 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
466 while (i < sequenceLength);
469 /* conformance check. bits of above table for reference.
470 * +----------------------------------------------------------------+
471 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
472 * +--------------------+----------+----------+----------+----------+
473 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0
474 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F
475 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90
476 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F
477 * +--------------------+----------+----------+----------+----------+
480 if (currentUnicodeCharacter != replacementcharacter)
482 if (sequenceLength == 3)
484 if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
486 currentUnicodeCharacter = replacementcharacter;
487 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
488 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
491 else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
493 currentUnicodeCharacter = replacementcharacter;
494 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
495 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
499 else if (sequenceLength == 4)
501 if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
503 currentUnicodeCharacter = replacementcharacter;
504 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
505 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
508 else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
510 currentUnicodeCharacter = replacementcharacter;
511 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
512 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
518 /* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
519 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code
520 * points D800..DFFF is ill formed */
522 if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
524 currentUnicodeCharacter = replacementcharacter;
525 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
526 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
530 // end conformance check
533 // would this character generate a surrogate pair in UTF-16?
534 if (currentUnicodeCharacter > 0xFFFF)
536 // is there enough space to hold a surrogate pair in the output?
// two slots are needed, so the cursor must be strictly before the last slot
537 if (pUnicode >= pLastUnicode)
539 break; // no, end processing.
// 0xD7C0 is 0xD800 - (0x10000 >> 10): subtracts the 0x10000 offset and adds
// the high surrogate base in a single step
542 TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
543 *pUnicode++ = STATIC_CAST(TUint16, surrogate);
// low 10 bits go into the low surrogate
545 surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
546 *pUnicode++ = STATIC_CAST(TUint16, surrogate);
550 *pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
553 // move the input pointer
// NOTE(review): the two visible branches below advance by the same amount
// (sequenceLength); only the branch for illFormed input, not visible here,
// can differ.
554 if (currentUnicodeCharacter != replacementcharacter)
556 pUtf8 += sequenceLength;
558 else if(illFormed == EFalse)
560 pUtf8 += (sequenceLength);
564 // we had a character we didn't recognize (i.e. it was invalid)
565 // so move to the next character in the input
569 if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
571 break; // we've either reached the end of the input or the end of output
// pUnicode is one past the last unit written, so the difference is the length
575 aUnicode.SetLength(pUnicode - aUnicode.Ptr());
// bytes from pUtf8 up to and including pLastUtf8 were not consumed
576 return (pLastUtf8 - pUtf8 + 1);
579 /** Given a sample text this function attempts to determine whether or not
580 * the sample text is encoded using the UTF-8 standard encoding scheme.
582 @param aConfidenceLevel On return, the confidence (0 to 100) that the sample
583 is UTF-8. The value passed in is not used. If the sample is not
584 well-formed UTF-8, it is set to 0.
585 @param aSample The sample text.
590 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
591 * Well formed UTF-8 Byte Sequences, full table.
592 * +----------------------------------------------------------------+
593 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
594 * +--------------------+----------+----------+----------+----------+
595 * | U+0000..U+007F | 00..7F | | | | 1 byte, ascii
596 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2
597 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
598 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal
599 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
600 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal
601 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
602 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal
603 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
604 * +--------------------+----------+----------+----------+----------+
606 * As a consequence of the well-formedness conditions specified in table 3-7,
607 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
610 * R1: If the string contains any non-UTF-8 characters the returned confidence
611 * is 0. Valid UTF-8 combinations are listed in the above table.
612 * R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark,
613 * bytes EF BB BF) the returned confidence is 95.
614 * R3: Otherwise the confidence returned is based upon the sample string length.
616 * R4: If the sample string is under 75 characters, the confidence is set to 75.
// Scans aSample byte by byte and writes to aConfidenceLevel an estimate of how
// likely the sample is UTF-8. The incoming value of aConfidenceLevel is
// overwritten, not read.
619 void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
622 TInt sampleLength = aSample.Length();
// an empty sample gets a fixed confidence of 89
624 if (sampleLength == 0)
626 aConfidenceLevel = 89;
// bytesRemaining: continuation bytes still expected for the current sequence
// sequenceLength: total length of the sequence being checked
629 TInt bytesRemaining = 0;
630 TUint sequenceLength = 0;
// starting value is the sample length; the last line clamps it to 0..100
632 aConfidenceLevel = sampleLength;
634 const TUint8* buffer = &aSample[0];
636 if (sampleLength < 95)
// a sample that opens with the byte order mark (0xEF 0xBB ...) scores 95
639 if ((sampleLength >= 3) &&
640 ((buffer[0] == 0xEF) &&
641 (buffer[1] == 0xBB) &&
645 aConfidenceLevel = 95;
// short samples without the mark are raised to 75
647 else if (sampleLength < 75)
649 aConfidenceLevel = 75;
653 for (TInt index = 0;index != sampleLength;index++)
656 if (bytesRemaining > 0)
658 // bytesRemaining > 0, means that a byte representing the start of a
659 // multibyte sequence was encountered and the bytesRemaining is the
660 // number of bytes to follow.
// continuation bytes have the form 10xxxxxx
662 if ((buffer[index] & 0xc0) == 0x80)
664 // need to check for ill-formed sequences -- all are in the 2nd byte
// bytesRemaining == sequenceLength - 1 identifies the 2nd byte of the
// sequence, so buffer[index - 1] is its lead byte
666 if ((sequenceLength == 3) && (bytesRemaining == 2))
// 0xe0 followed by less than 0xa0 would be an overlong encoding
668 if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
670 aConfidenceLevel = 0;
// 0xed followed by more than 0x9f would encode a surrogate code point
673 else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
675 aConfidenceLevel = 0;
679 else if ((sequenceLength == 4) && (bytesRemaining == 3))
// 0xf0 followed by less than 0x90 would be an overlong encoding
681 if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
683 aConfidenceLevel = 0;
// 0xf4 followed by more than 0x8f would exceed U+10FFFF
686 else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
688 aConfidenceLevel = 0;
// a continuation byte was expected but something else was found
698 aConfidenceLevel = 0;
703 if (bytesRemaining == 0)
705 if (buffer[index] < 0x80)
707 // The value of aSample[index] is in the range 0x00-0x7f
708 //UTF8 maintains ASCII transparency. So it's a valid
709 //UTF8. Do nothing, check next value.
712 else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
714 // valid start of a 2 byte sequence (see conformance note)
718 else if ((buffer[index] & 0xf0) == 0xe0)
720 // valid start of a 3 byte sequence
724 else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
726 // valid start of a 4 byte sequence (see conformance note)
732 // wasn't anything expected so must be an illegal/irregular UTF8 coded value
733 aConfidenceLevel = 0;
// NOTE(review): no check is visible that bytesRemaining is zero once the loop
// has finished, so a sample that ends in the middle of a multibyte sequence
// may not be scored 0 - confirm against the full source.
// clamp the result to the range 0..100
739 aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;