Update contrib.
2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
4 * This component and the accompanying materials are made available
5 * under the terms of "Eclipse Public License v1.0"
6 * which accompanies this distribution, and is available
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
9 * Initial Contributors:
10 * Nokia Corporation - initial contribution.
// Value returned by Base64Decoding() for a byte that is not in the base-64 alphabet.
23 const TUint KNotInBase64Alphabet=KMaxTUint;
// Panic reason codes used by the assertions in this file (they are passed to Panic()).
// The first enumerator is explicitly 1 and the others carry no initialiser, so
// inserting or reordering entries changes the reason numbers that get reported.
// NOTE(review): the enum header itself is not visible in this view.
27 EPanicBad6BitNumber=1,
28 EPanicBadUtf7Pointers1,
29 EPanicBadUtf7Pointers2,
30 EPanicBadUtf7Pointers3,
31 EPanicBadUtf7Pointers4,
32 EPanicBadUtf7Pointers5,
33 EPanicBadUtf7Pointers6,
34 EPanicBadUtf7Pointers7,
35 EPanicBadUtf7Pointers8,
36 EPanicBadUtf7Pointers9,
37 EPanicBadUtf7Pointers10,
38 EPanicBadUtf7Pointers11,
39 EPanicNotInBase64Block,
40 EPanicBadUnicodePointers1,
41 EPanicBadUnicodePointers2,
42 EPanicBadUnicodePointers3,
43 EPanicBadUnicodePointers4,
44 EPanicBadUnicodePointers5,
45 EPanicBadUnicodePointers6,
46 EPanicBadUnicodePointers7,
47 EPanicBadUnicodePointers8,
48 EPanicBadUnicodePointers9,
49 EPanicBadUnicodePointers10,
50 EPanicBadBitBufferState1,
51 EPanicBadBitBufferState2,
52 EPanicBadBitBufferState3,
53 EPanicBadBitBufferState4,
54 EPanicBadBitBufferState5,
55 EPanicBadBitBufferState6,
56 EPanicBadBitBufferState7,
57 EPanicBadBitBufferState8,
58 EPanicBadBitBufferState9,
59 EPanicBadBitBufferState10,
60 EPanicBadBitBufferState11,
61 EPanicBadBitBufferState12,
62 EPanicBadBitBufferState13,
63 EPanicBadBitBufferState14,
64 EPanicBadBitBufferState15,
65 EPanicBadBitBufferState16,
66 EPanicBadBitBufferState17,
67 EPanicUnexpectedNumberOfLoopIterations,
68 EPanicInitialEscapeCharacterButNoBase64,
69 EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
70 EPanicBadUtf8Pointers1,
71 EPanicBadUtf8Pointers2,
72 EPanicBadUtf8Pointers3,
73 EPanicBadUtf8Pointers4,
74 EPanicBadUtf8Pointers5,
75 EPanicBadUtf8Pointers6,
76 EPanicBadUtf8Pointers7,
77 EPanicOutOfSyncUtf7Byte1,
78 EPanicOutOfSyncUtf7Byte2,
79 EPanicOutOfSyncBase64Decoding
// Category text that Panic() hands to User::Panic().
82 _LIT(KLitPanicText, "CHARCONV-UTF");
84 LOCAL_C void Panic(TPanic aPanic)
86 User::Panic(KLitPanicText, aPanic);
89 inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';}
91 LOCAL_C TUint Base64Decoding(TUint aMemberOfBase64Alphabet, TBool aIsImapUtf7)
93 if ((aMemberOfBase64Alphabet>='A') && (aMemberOfBase64Alphabet<='Z'))
95 return aMemberOfBase64Alphabet-'A';
97 if ((aMemberOfBase64Alphabet>='a') && (aMemberOfBase64Alphabet<='z'))
99 return aMemberOfBase64Alphabet-('a'-26);
101 if ((aMemberOfBase64Alphabet>='0') && (aMemberOfBase64Alphabet<='9'))
103 return aMemberOfBase64Alphabet+((26*2)-'0');
105 if (aMemberOfBase64Alphabet=='+')
109 if (aMemberOfBase64Alphabet==STATIC_CAST(TUint, aIsImapUtf7? ',': '/'))
113 return KNotInBase64Alphabet;
116 LOCAL_C TUint Base64Encoding(TUint a6BitNumber, TBool aIsImapUtf7)
118 __ASSERT_DEBUG(a6BitNumber<64, Panic(EPanicBad6BitNumber));
119 if ((a6BitNumber==63) && aIsImapUtf7)
123 static const TUint8 base64Alphabet[64]={'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'};
124 return base64Alphabet[a6BitNumber];
127 LOCAL_C TUint8* PointerToEscapeCharacterStartingBase64Block(TUint8* aPointerToUtf7Byte, const TUint8* aPointerToFirstUtf7Byte, TBool aIsImapUtf7)
129 __ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers1));
130 TUint8* pointerToCandidateEscapeCharacter=NULL;
133 const TUint utf7Byte=*aPointerToUtf7Byte;
134 if (utf7Byte==EscapeCharacterForStartingBase64Block(aIsImapUtf7))
136 pointerToCandidateEscapeCharacter=aPointerToUtf7Byte;
138 else if (Base64Decoding(utf7Byte, aIsImapUtf7)==KNotInBase64Alphabet)
142 __ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers2));
143 if (aPointerToUtf7Byte<=aPointerToFirstUtf7Byte)
147 --aPointerToUtf7Byte;
149 __ASSERT_DEBUG(pointerToCandidateEscapeCharacter!=NULL, Panic(EPanicNotInBase64Block));
150 return pointerToCandidateEscapeCharacter;
153 LOCAL_C TBool EncodeInUtf7Directly(TUint aUnicodeCharacter, TBool aIsImapUtf7, TBool aEncodeOptionalDirectCharactersInBase64)
157 return (aUnicodeCharacter>=0x0020) && (aUnicodeCharacter<=0x007e);
159 if ((aUnicodeCharacter>=0x0021) && (aUnicodeCharacter<=0x007d))
161 if (aEncodeOptionalDirectCharactersInBase64)
163 return (((aUnicodeCharacter>=0x0041) && (aUnicodeCharacter<=0x005a)) ||
164 ((aUnicodeCharacter>=0x0061) && (aUnicodeCharacter<=0x007a)) ||
165 ((aUnicodeCharacter>=0x0027) && (aUnicodeCharacter<=0x0029)) ||
166 ((aUnicodeCharacter>=0x002b) && (aUnicodeCharacter<=0x003a)) ||
167 (aUnicodeCharacter==0x003f));
169 return aUnicodeCharacter!=0x005c;
171 return (aUnicodeCharacter==0x0020) || (aUnicodeCharacter==0x0009) || (aUnicodeCharacter==0x000d) || (aUnicodeCharacter==0x000a);
174 inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
176 return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0;
181 /** Converts Unicode text into UTF-7 encoding. The function leaves with
182 KErrCorrupt if the input string is corrupt.
184 @param aUnicode A UCS-2 encoded input string.
185 @param aEncodeOptionalDirectCharactersInBase64 If ETrue then
186 characters from UTF-7 set O (optional direct characters) are encoded in
187 Modified Base64. If EFalse the characters are encoded directly,
188 as their ASCII equivalents.
189 @return A descriptor containing the UTF-7 encoded output string. */
// Leaving variant: converts the whole of aUnicode to UTF-7 in a heap buffer,
// working through the input in chunks and growing the result as required.
190 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf7L(
191 const TDesC16& aUnicode,
192 TBool aEncodeOptionalDirectCharactersInBase64)
194 // If aUnicode is Null string, return an empty HBufC
195 if (aUnicode.Length() == 0)
197 HBufC8* hBuf8 = HBufC8::NewL(1);
201 // Otherwise, convert and store result in a buffer, reallocating that buffer if needed.
202 TInt length = aUnicode.Length();
// NOTE(review): presumably the capacity of the chunk buffer `buf` used below; its declaration is not visible in this view.
203 const TInt bufsize = 100;
// Non-owning view of the input that is narrowed to the unconverted tail after each chunk.
205 TPtrC16 unicode (aUnicode);
// Initial allocation of one output byte per input character; grown below whenever a chunk would not fit.
207 HBufC8* hBuf8 = HBufC8::NewLC(length);
208 TPtr8 utf7 = hBuf8->Des();
// Convert one chunk; the result is the number of input characters still unconverted, or a negative error.
212 TInt unconverted = ConvertFromUnicodeToUtf7(buf, unicode, aEncodeOptionalDirectCharactersInBase64);
// Every negative result, EErrorIllFormedInput included, is reported as a KErrCorrupt leave.
213 if( unconverted == EErrorIllFormedInput || unconverted < 0)
214 User::Leave(KErrCorrupt);
216 if (utf7.Length() + buf.Length() > utf7.MaxLength())
218 // Reallocate the hBuf8
219 hBuf8 = hBuf8->ReAllocL(utf7.Length() + buf.Length());
// ReAllocL returns the buffer's new address, so the cleanup stack entry and the utf7 descriptor are both refreshed.
221 CleanupStack::PushL(hBuf8);
222 utf7.Set(hBuf8->Des());
// Carry on with only the unconverted tail of the input.
227 unicode.Set(unicode.Right(unconverted));
234 /** Converts Unicode text into UTF-7 encoding.
236 @param aUtf7 On return, contains the UTF-7 encoded output string.
237 @param aUnicode A UCS-2 encoded input string.
238 @param aEncodeOptionalDirectCharactersInBase64 If ETrue then characters from
239 UTF-7 set O (optional direct characters) are encoded in Modified Base64. If
240 EFalse the characters are encoded directly, as their ASCII equivalents.
241 @return The number of unconverted characters left at the end of the input
242 descriptor, or one of the error values defined in TError. */
243 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(
245 const TDesC16& aUnicode,
246 TBool aEncodeOptionalDirectCharactersInBase64)
248 return ConvertFromUnicodeToUtf7(aUtf7, aUnicode, EFalse, aEncodeOptionalDirectCharactersInBase64);
// Core Unicode -> UTF-7 encoder. Writes into aUtf7 until the input is used up or the
// output is full, sets aUtf7's length to the bytes written and returns the number of
// Unicode characters that were not consumed.
// NOTE(review): aIsImapUtf7 is used throughout the body but its declaration is not
// visible in the parameter list shown here.
251 TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(TDes8& aUtf7,
252 const TDesC16& aUnicode,
254 TBool aEncodeOptionalDirectCharactersInBase64)
256 if (aUnicode.Length()==0)
// With no output capacity nothing can be written: report every input character as unconverted.
261 if (aUtf7.MaxLength()==0)
263 return aUnicode.Length();
265 const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
// Both cursors follow a "pointer to previous element" convention: they start one element
// before the data and are pre-incremented immediately before each write or consume.
266 TUint8* pointerToPreviousUtf7Byte=CONST_CAST(TUint8*, aUtf7.Ptr()-1);
267 const TUint8* const pointerToLastUtf7Byte=pointerToPreviousUtf7Byte+aUtf7.MaxLength();
268 const TUint16* pointerToPreviousUnicodeCharacter=aUnicode.Ptr()-1;
269 const TUint16* const pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.Length();
// Flag held in the top bit of bitBuffer while a base-64 block is open; the low bits hold payload not yet emitted.
270 const TUint KIsInBase64Block=0x80000000u;
272 TInt numberOfBitsInBuffer=0;
275 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers3));
276 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers1));
// At end of input a placeholder of 0 is used; the end-of-input test below is evaluated before the placeholder is classified.
277 TUint currentUnicodeCharacter=(pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)? 0: *(pointerToPreviousUnicodeCharacter+1);
// Direct path (also taken at end of input): any base-64 block that is still open is closed first.
278 if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || EncodeInUtf7Directly(currentUnicodeCharacter, aIsImapUtf7, aEncodeOptionalDirectCharactersInBase64))
280 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState1));
281 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState2));
282 if (bitBuffer&KIsInBase64Block)
284 if (numberOfBitsInBuffer!=0)
286 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<2) // make sure there is enough space for the trailing '-' as well as the remains of the bitBuffer as the KIsInBase64Block flag is about to turned off, thus the trailing '-' may never get written
290 ++pointerToPreviousUtf7Byte;
// The 2 or 4 leftover bits are left-aligned within a final base-64 character.
291 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
295 if (pointerToPreviousUtf7Byte==pointerToLastUtf7Byte)
300 ++pointerToPreviousUtf7Byte;
301 *pointerToPreviousUtf7Byte='-';
303 numberOfBitsInBuffer=0;
305 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers2));
306 if (pointerToPreviousUnicodeCharacter>=pointerToLastUnicodeCharacter)
310 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers4));
// The escape character needs two bytes of output because it is written followed by '-'.
311 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<((currentUnicodeCharacter==escapeCharacterForStartingBase64Block)? 2: 1))
315 ++pointerToPreviousUtf7Byte;
316 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, currentUnicodeCharacter);
317 ++pointerToPreviousUnicodeCharacter;
318 if (currentUnicodeCharacter==escapeCharacterForStartingBase64Block)
320 ++pointerToPreviousUtf7Byte;
321 *pointerToPreviousUtf7Byte='-';
// Base-64 path for characters that cannot be written directly.
// NOTE(review): the branch keyword separating this from the direct path is not visible in this view.
327 TInt numberOfUtf7BytesRequired=(numberOfBitsInBuffer+16)/6; // "(numberOfBitsInBuffer+16)/6" is the number of iterations that will happen in the while loop below
328 if (~bitBuffer&KIsInBase64Block)
330 ++numberOfUtf7BytesRequired; // for the initial escapeCharacterForStartingBase64Block
332 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<numberOfUtf7BytesRequired)
337 if (~bitBuffer&KIsInBase64Block)
339 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers5));
340 ++pointerToPreviousUtf7Byte;
341 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, escapeCharacterForStartingBase64Block);
// Add the 16-bit character to the bit buffer, then emit every complete 6-bit group.
// NOTE(review): no shift of the bits already in the buffer is visible before this OR - confirm against the full source.
344 bitBuffer|=currentUnicodeCharacter;
345 numberOfBitsInBuffer+=16;
346 ++pointerToPreviousUnicodeCharacter;
347 __ASSERT_DEBUG(numberOfBitsInBuffer<=20, Panic(EPanicBadBitBufferState3));
348 while (numberOfBitsInBuffer>=6)
350 numberOfBitsInBuffer-=6;
351 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers6));
352 ++pointerToPreviousUtf7Byte;
353 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer>>numberOfBitsInBuffer)&0x3f, aIsImapUtf7));
355 bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - not strictly necessary but it leaves the buffer in a cleaner state
356 bitBuffer|=KIsInBase64Block;
// Finishing up: a base-64 block that is still open has to be terminated within the output space that remains.
// NOTE(review): how control reaches this point from the code above is not visible in this view.
359 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState4));
360 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState5));
361 if (bitBuffer&KIsInBase64Block)
364 TInt numberOfLoopIterations=1;
366 FOREVER // there should never be more than 2 iterations of this loop - the first "if" should always succeed the second time if it doesn't succeed the first time
368 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers7));
369 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState6));
370 __ASSERT_DEBUG(numberOfLoopIterations<=2, Panic(EPanicUnexpectedNumberOfLoopIterations));
372 ++numberOfLoopIterations;
374 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte>=((numberOfBitsInBuffer==0)? 1: 2)) // if there's room to finish off the base-64 sequence by (i) flushing the bit-buffer and (ii) appending the trailing '-'
376 if (numberOfBitsInBuffer!=0)
378 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers8));
379 ++pointerToPreviousUtf7Byte;
380 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
382 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers9));
383 ++pointerToPreviousUtf7Byte;
384 *pointerToPreviousUtf7Byte='-';
387 // it is now necessary to move back pointerToPreviousUtf7Byte so that the base-64 sequence can be terminated - note it must be terminated on a Unicode character boundary hence the reason why pointerToPreviousUnicodeCharacter may be moved back too
388 TUint8* pointerToEscapeCharacterStartingBase64Block=PointerToEscapeCharacterStartingBase64Block(pointerToPreviousUtf7Byte, aUtf7.Ptr(), aIsImapUtf7);
389 const TInt oldNumberOfBase64Characters=pointerToPreviousUtf7Byte-pointerToEscapeCharacterStartingBase64Block;
390 __ASSERT_DEBUG(oldNumberOfBase64Characters>0, Panic(EPanicInitialEscapeCharacterButNoBase64));
391 __ASSERT_DEBUG(((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)%16==0, Panic(EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary));
392 pointerToPreviousUnicodeCharacter-=((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)/16; // move back pointerToPreviousUnicodeCharacter to before the equivalent of the base-64 sequence
393 pointerToPreviousUtf7Byte=pointerToEscapeCharacterStartingBase64Block;
394 __ASSERT_DEBUG(*pointerToPreviousUtf7Byte==escapeCharacterForStartingBase64Block, Panic(EPanicBadUtf7Pointers10));
395 if (oldNumberOfBase64Characters<4) // if the new base-64 sequence will be so short that it won't even be able to contain the UTF-7 encoding of a single Unicode character
397 --pointerToPreviousUtf7Byte; // move back pointerToPreviousUtf7Byte to before the escapeCharacterForStartingBase64Block
// Number of whole 16-bit characters that fit in a base-64 run one character shorter than before
// (6 bits per base-64 character, hence *3/8) - presumably the freed byte is for the trailing '-'.
400 const TInt newNumberOfUnicodeCharacters=((oldNumberOfBase64Characters-1)*3)/8;
401 pointerToPreviousUnicodeCharacter+=newNumberOfUnicodeCharacters;
402 pointerToPreviousUtf7Byte+=((newNumberOfUnicodeCharacters*8)+2)/3;
403 const TInt numberOfBitsToBeZeroedInLastBase64Character=(newNumberOfUnicodeCharacters%3)*2;
404 if (numberOfBitsToBeZeroedInLastBase64Character!=0)
// Clear the low bits of the new final base-64 character, which belonged to a Unicode character that is no longer part of the run.
406 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding(Base64Decoding(*pointerToPreviousUtf7Byte, aIsImapUtf7)&0x3f&~((1<<numberOfBitsToBeZeroedInLastBase64Character)-1), aIsImapUtf7));
408 bitBuffer=KIsInBase64Block;
409 numberOfBitsInBuffer=0;
// pointerToPreviousUtf7Byte addresses the last byte written, hence the +1.
412 aUtf7.SetLength((pointerToPreviousUtf7Byte-aUtf7.Ptr())+1);
413 return pointerToLastUnicodeCharacter-pointerToPreviousUnicodeCharacter;
418 /** Converts Unicode text into UTF-8 encoding.
420 @param aUtf8 On return, contains the UTF-8 encoded output string.
421 @param aUnicode The Unicode-encoded input string.
422 @return The number of unconverted characters left at the end of the input
423 descriptor, or one of the error values defined in TError. */
424 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
426 return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
430 /** Converts Unicode text into UTF-8 encoding.
432 The variant of UTF-8 used internally by Java differs slightly from
433 standard UTF-8. This function has no argument to select the variant;
434 it always generates standard UTF-8. This function leaves with
435 KErrCorrupt if the input string is corrupt.
437 @param aUnicode A UCS-2 encoded input string.
438 @return A pointer to an HBufC8 containing the converted UTF8. */
// Leaving variant: converts the whole of aUnicode to UTF-8 in a heap buffer,
// working through the input in chunks and growing the result as required.
439 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf8L(const TDesC16& aUnicode)
441 // If aUnicode is Null string, return an empty HBufC
442 if (aUnicode.Length() == 0)
444 HBufC8* hBuf8 = HBufC8::NewL(1);
448 // Otherwise, convert and store result in a buffer, reallocating that buffer if needed.
449 const TInt length = aUnicode.Length();
// NOTE(review): presumably the capacity of the chunk buffer `buf` used below; its declaration is not visible in this view.
450 const TInt bufsize = 100;
// Non-owning view of the input that is narrowed to the unconverted tail after each chunk.
452 TPtrC16 unicode (aUnicode);
// Initial allocation of one output byte per input character; grown below whenever a chunk would not fit.
454 HBufC8* hBuf8 = HBufC8::NewLC(length);
455 TPtr8 utf8 = hBuf8->Des();
// Convert one chunk with the two-argument overload, i.e. as standard (non-Java) UTF-8.
459 TInt unconverted = ConvertFromUnicodeToUtf8(buf, unicode);
// Every negative result, EErrorIllFormedInput included, is reported as a KErrCorrupt leave.
460 if( unconverted == EErrorIllFormedInput || unconverted < 0)
461 User::Leave(KErrCorrupt);
463 if (utf8.Length() + buf.Length() > utf8.MaxLength())
465 // Reallocate the hBuf8
466 hBuf8 = hBuf8->ReAllocL(utf8.Length() + buf.Length());
// ReAllocL returns the buffer's new address, so the cleanup stack entry and the utf8 descriptor are both refreshed.
468 CleanupStack::PushL(hBuf8);
469 utf8.Set(hBuf8->Des());
// Carry on with only the unconverted tail of the input.
474 unicode.Set(unicode.Right(unconverted));
480 /** Converts Unicode text into UTF-8 encoding.
482 Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.
484 The variant of UTF-8 used internally by Java differs slightly from standard
485 UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
487 @param aUtf8 On return, contains the UTF-8 encoded output string.
488 @param aUnicode A UCS-2 encoded input string.
489 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
490 UTF-8. The default is EFalse.
491 @return The number of unconverted characters left at the end of the input descriptor,
492 or one of the error values defined in TError. */
// Core Unicode -> UTF-8 encoder. Writes into aUtf8 until the input is used up or the next
// character no longer fits, sets aUtf8's length to the bytes written and returns the number
// of unconverted Unicode characters, or EErrorIllFormedInput.
493 TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8,
494 const TDesC16& aUnicode,
495 TBool aGenerateJavaConformantUtf8)
497 if (aUnicode.Length() == 0)
// With no output capacity nothing can be written: report every input character as unconverted.
502 if (aUtf8.MaxLength() == 0)
504 return aUnicode.Length();
// pUtf8 / pUnicode address the current output byte / input character; the two
// "pointerToLast..." values address the final valid element of each buffer.
507 TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
508 const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
509 TBool inputIsTruncated = EFalse;
510 const TUint16* pUnicode = aUnicode.Ptr();
511 const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
515 __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
516 __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
// U+0000..U+007F - 1 byte, except U+0000 in Java-conformant mode, which takes 2 bytes
518 if (pUnicode[0] < 0x80)
522 // internally java is different since the \x0000 character is
523 // translated into \xC0 \x80.
525 if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
// Two bytes are needed, so being on the last output byte means there is no room.
527 if (pUtf8 == pointerToLastUtf8Byte)
533 *pUtf8++ = STATIC_CAST(TUint8, 0xc0);
534 *pUtf8 = STATIC_CAST(TUint8, 0x80);
538 *pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
541 else if (pUnicode[0] < 0x800)
543 // U+0080..U+07FF - 2 bytes
545 if (pUtf8 == pointerToLastUtf8Byte)
552 *pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
553 *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
557 // check to see if we have a surrogate in the stream, surrogates encode code points outside
558 // the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.
// Only a high surrogate (0xD800..0xDBFF) selects this branch, and only when not generating Java-conformant UTF-8.
560 else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
562 // surrogate pair - 4 bytes in utf-8
565 __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
566 // is there enough space to hold the character
567 if ((pointerToLastUtf8Byte - pUtf8) < 3)
571 break; // no go to the exit condition
574 __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
575 if (pUnicode >= pointerToLastUnicodeCharacter)
579 inputIsTruncated = ETrue;
580 break; // middle of a surrogate pair. go to end condition
// The high surrogate must be followed by a low surrogate (0xDC00..0xDFFF).
583 if ((pUnicode[1] & 0xfc00) != 0xdc00)
585 return EErrorIllFormedInput;
588 // convert utf-16 surrogate to utf-32
589 TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
591 // convert utf-32 to utf-8
592 *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));
593 *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
594 *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
595 *pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
597 // we consumed 2 utf-16 values, move this pointer
602 // 3 byte - utf-8, U+800..U+FFFF rest of BMP.
604 if (pointerToLastUtf8Byte - pUtf8 < 2)
610 *pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
611 *pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
612 *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
// Stop once the last input character has been consumed or the last output byte has been written.
615 if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
// NOTE(review): presumably true only when the input held nothing but the first half of a
// surrogate pair; the pointer adjustments that make this condition reachable are not visible here.
625 if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
627 return EErrorIllFormedInput;
// pUtf8 addresses the last byte written, hence the +1.
630 aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
631 return pointerToLastUnicodeCharacter-pUnicode;
636 /** Converts text encoded using the Unicode transformation format UTF-7
637 into the Unicode UCS-2 character set.
639 @param aUtf7 The UTF-7 encoded input string.
640 @return A pointer to an HBufC16 containing the converted Unicode string */
// Leaving variant: converts the whole of aUtf7 to Unicode in a heap buffer,
// working through the input in chunks and growing the result as required.
641 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf7L(const TDesC8& aUtf7)
643 // If aUtf7 is an empty string return
644 if (aUtf7.Length()==0)
646 HBufC16* hBuf = HBufC16::NewL(1);
650 // else convert aUtf7 to Unicode storing the result in a buffer, reallocating that buffer if needed
652 TInt length = aUtf7.Length();
// NOTE(review): presumably the capacity of the chunk buffer `buf` used below; its declaration is not visible in this view.
653 const TInt bufsize = 100;
// Conversion state threaded through successive chunk conversions so that a base-64 block may span chunks.
654 TInt state = KStateDefault;
// Initial allocation of one Unicode character per input byte; grown below whenever a chunk would not fit.
658 HBufC16* hBuf = HBufC16::NewLC(length);
659 TPtr unicode = hBuf->Des();
// Convert one chunk; the result is the number of input bytes still unconverted, or a negative error.
663 TInt unconverted = ConvertToUnicodeFromUtf7(buf, utf7, state);
// Every negative result, EErrorIllFormedInput included, is reported as a KErrCorrupt leave.
664 if( unconverted == EErrorIllFormedInput || unconverted < 0)
665 User::Leave(KErrCorrupt);
667 if (unicode.Length() + buf.Length() > unicode.MaxLength())
670 hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length());
// ReAllocL returns the buffer's new address, so the cleanup stack entry and the unicode descriptor are both refreshed.
672 CleanupStack::PushL(hBuf);
673 unicode.Set(hBuf->Des());
// Carry on with only the unconverted tail of the input.
678 utf7.Set(utf7.Right(unconverted));
686 /** Converts text encoded using the Unicode transformation format UTF-7 into the
687 Unicode UCS-2 character set.
689 If the conversion is achieved using a series of calls to this function, where
690 each call starts off where the previous call reached in the input descriptor,
691 the state of the conversion is stored. The initial value of the state variable
692 should be set as KStateDefault when the conversion is started, and afterwards
693 simply passed unchanged into each function call.
695 @param aUnicode On return, contains the Unicode encoded output string.
696 @param aUtf7 The UTF-7 encoded input string.
697 @param aState For the first call of the function set to KStateDefault. For
698 subsequent calls, pass in the variable unchanged.
699 @return The number of unconverted bytes left at the end of the input descriptor,
700 or one of the error values defined in TError. */
701 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode,
705 return ConvertToUnicodeFromUtf7(aUnicode, aUtf7, EFalse, aState);
// Core UTF-7 -> Unicode decoder. Writes into aUnicode until the input is used up or the
// output is full, and returns the number of unconverted input bytes or EErrorIllFormedInput.
// An unfinished base-64 block is carried between calls in aState.
// NOTE(review): aUtf7, aIsImapUtf7 and aState are used below but their declarations are not
// visible in the parameter list shown here.
708 TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode,
713 if (aUtf7.Length()==0)
715 aUnicode.SetLength(0);
// With no output capacity nothing can be written: report every input byte as unconverted.
718 if (aUnicode.MaxLength()==0)
720 return aUtf7.Length();
722 const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
// The output cursor follows a "pointer to previous element" convention (pre-incremented before
// each write); the input cursor addresses the byte currently being examined.
723 TUint16* pointerToPreviousUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()-1);
724 const TUint16* pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.MaxLength();
725 const TUint8* pointerToCurrentUtf7Byte=aUtf7.Ptr();
726 const TUint8* pointerToLastUtf7Byte=pointerToCurrentUtf7Byte+(aUtf7.Length()-1);
727 TUint currentUtf7Byte=*pointerToCurrentUtf7Byte;
728 const TUint KIsInBase64Block=0x80000000u;
// aState packs three things: the in-base-64 flag (top bit), the number of buffered bits
// (bits 4-7) and the buffered bits themselves (bits 0-3).
729 TUint bitBuffer=STATIC_CAST(TUint, aState);
730 TInt numberOfBitsInBuffer=((bitBuffer&0xf0)>>4);
731 bitBuffer&=~0xf0; // turn off the bits that stored numberOfBitsInBuffer
// The incoming state is validated with __ASSERT_ALWAYS, so a corrupt aState panics in release builds as well.
732 if (bitBuffer&KIsInBase64Block)
734 __ASSERT_ALWAYS((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4) || ((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)), Panic(EPanicBadBitBufferState7));
735 __ASSERT_ALWAYS((bitBuffer&~(KIsInBase64Block|0x0000000f))==0, Panic(EPanicBadBitBufferState8));
739 __ASSERT_ALWAYS(bitBuffer==0, Panic(EPanicBadBitBufferState9));
740 __ASSERT_ALWAYS(numberOfBitsInBuffer==0, Panic(EPanicBadBitBufferState10));
// The caller's state is reset here and only re-packed at the end if a base-64 block is still open.
742 aState=KStateDefault;
// When resuming inside a base-64 block, currentUtf7Byte holds the decoded 6-bit value rather than the raw byte.
743 if (bitBuffer&KIsInBase64Block)
745 currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7);
747 TBool inputIsTruncated=EFalse;
750 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers5));
751 __ASSERT_DEBUG(pointerToCurrentUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers11));
752 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (currentUtf7Byte==*pointerToCurrentUtf7Byte), Panic(EPanicOutOfSyncUtf7Byte1));
753 __ASSERT_DEBUG((~bitBuffer&KIsInBase64Block) || (currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7)), Panic(EPanicOutOfSyncUtf7Byte2));
754 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || ((bitBuffer==0) && (numberOfBitsInBuffer==0)), Panic(EPanicBadBitBufferState11));
// Escape character met outside a base-64 block: followed by '-' it stands for a literal escape
// character, otherwise the next byte must be a base-64 character that starts a block.
755 if ((~bitBuffer&KIsInBase64Block) && (currentUtf7Byte==escapeCharacterForStartingBase64Block))
// An escape character as the very last input byte cannot be interpreted yet: step back so that it is reported as unconverted.
757 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
759 --pointerToCurrentUtf7Byte;
760 inputIsTruncated=ETrue;
763 ++pointerToCurrentUtf7Byte;
764 currentUtf7Byte=*pointerToCurrentUtf7Byte;
765 if (currentUtf7Byte=='-')
767 currentUtf7Byte=escapeCharacterForStartingBase64Block;
771 currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7);
772 if (currentUtf7Byte==KNotInBase64Alphabet)
774 return EErrorIllFormedInput;
776 bitBuffer=KIsInBase64Block;
779 if (bitBuffer&KIsInBase64Block)
783 __ASSERT_DEBUG(currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7), Panic(EPanicOutOfSyncBase64Decoding));
784 __ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState12));
// A byte outside the base-64 alphabet ends the block; non-zero leftover bits at that point make the input ill-formed.
785 if (currentUtf7Byte==KNotInBase64Alphabet)
787 if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer))
789 return EErrorIllFormedInput;
792 numberOfBitsInBuffer=0;
793 currentUtf7Byte=*pointerToCurrentUtf7Byte;
// A '-' that terminates the block is skipped rather than copied to the output.
794 if (currentUtf7Byte=='-')
796 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
800 ++pointerToCurrentUtf7Byte;
801 currentUtf7Byte=*pointerToCurrentUtf7Byte;
// Add the 6 decoded bits to the bit buffer.
// NOTE(review): no shift of the bits already in the buffer is visible before this OR - confirm against the full source.
806 bitBuffer|=currentUtf7Byte;
807 bitBuffer|=KIsInBase64Block;
808 numberOfBitsInBuffer+=6;
809 // only flush the buffer if it contains a whole Unicode character and the remainder is either all zero-bits (hence would be a legal point to end the base-64 sequence) or at least 6 bits long (therefore would leave at least one UTF-7 byte unconverted at the end of the input descriptor)
810 if ((numberOfBitsInBuffer>=16+6) || ((numberOfBitsInBuffer>=16) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16)))
812 numberOfBitsInBuffer-=16;
813 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers6));
814 ++pointerToPreviousUnicodeCharacter;
815 *pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, bitBuffer>>numberOfBitsInBuffer);
816 bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - must be done as bitBuffer is stored along with numberOfBitsInBuffer in aState if the output descriptor runs out of space or if the input descriptor was truncated
817 bitBuffer|=KIsInBase64Block; // turn it back on as the line above turned it off
818 if (pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)
823 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
825 inputIsTruncated=ETrue;
828 ++pointerToCurrentUtf7Byte;
829 currentUtf7Byte=Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7);
// Outside a base-64 block the byte is copied through as a single Unicode character.
834 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers7));
835 ++pointerToPreviousUnicodeCharacter;
836 *pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, currentUtf7Byte);
837 if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte))
841 ++pointerToCurrentUtf7Byte;
842 currentUtf7Byte=*pointerToCurrentUtf7Byte;
// If conversion stopped inside a base-64 block, the partial state is packed into aState for the next call.
846 if (bitBuffer&KIsInBase64Block)
848 __ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState13));
849 if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer))
851 // rewind how far we've got in the UTF-7 descriptor to indicate to the user (by returning a value greater than zero) that not all of the input could be converted as it ended with a truncated base-64 sequence
852 __ASSERT_DEBUG(numberOfBitsInBuffer>=6, Panic(EPanicBadBitBufferState14));
853 pointerToCurrentUtf7Byte-=numberOfBitsInBuffer/6;
854 const TInt newNumberOfBitsInBuffer=numberOfBitsInBuffer%6;
855 bitBuffer&=~KIsInBase64Block; // temporarily turn off the KIsInBase64Block for the right-shift
856 bitBuffer>>=(numberOfBitsInBuffer-newNumberOfBitsInBuffer);
857 bitBuffer|=KIsInBase64Block; // must be turned back on again as the bit-buffer is packed into aState
858 numberOfBitsInBuffer=newNumberOfBitsInBuffer;
859 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState15));
861 __ASSERT_DEBUG((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0), Panic(EPanicBadBitBufferState16));
// The bit count goes into bits 4-7, mirroring the unpacking at the top of the function.
862 aState=STATIC_CAST(TInt, bitBuffer);
863 aState|=(numberOfBitsInBuffer<<4);
864 __ASSERT_DEBUG(aState&KIsInBase64Block, Panic(EPanicBadBitBufferState17));
866 numberOfBitsInBuffer=0;
// The input cursor can only be before the start of the input if it was stepped back from the
// first byte, i.e. the input was a lone escape character; that is reported as ill-formed.
868 if ((pointerToCurrentUtf7Byte<aUtf7.Ptr()) && inputIsTruncated)
870 return EErrorIllFormedInput;
// pointerToPreviousUnicodeCharacter addresses the last character written, hence the +1.
872 aUnicode.SetLength((pointerToPreviousUnicodeCharacter+1)-aUnicode.Ptr());
873 return pointerToLastUtf7Byte-pointerToCurrentUtf7Byte;
878 /** Converts text encoded using the Unicode transformation format UTF-8
879 into the Unicode UCS-2 character set. This function leaves with an
880 error code if the input string is corrupted.
882 @param aUtf8 The UTF-8 encoded input string
883 @return A pointer to an HBufC16 with the converted Unicode string. */
// Leaving variant: converts the whole of aUtf8 to Unicode in a heap buffer,
// working through the input in chunks and growing the result as required.
884 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf8L(const TDesC8& aUtf8)
886 // If aUtf8 is an empty string return
887 if (aUtf8.Length()==0)
889 HBufC16* hBuf = HBufC16::NewL(1);
893 // else convert aUtf8 to Unicode storing the result in a buffer, reallocating that buffer if needed
895 TInt length = aUtf8.Length();
// NOTE(review): presumably the capacity of the chunk buffer `buf` used below; its declaration is not visible in this view.
896 const TInt bufsize = 100;
// Initial allocation of one Unicode character per input byte; grown below whenever a chunk would not fit.
900 HBufC16* hBuf = HBufC16::NewLC(length);
901 TPtr unicode = hBuf->Des();
// Convert one chunk; the result is the number of input bytes still unconverted, or a negative error.
905 TInt unconverted = ConvertToUnicodeFromUtf8(buf, utf8);
// Every negative result, EErrorIllFormedInput included, is reported as a KErrCorrupt leave.
906 if( unconverted == EErrorIllFormedInput || unconverted < 0)
907 User::Leave(KErrCorrupt);
909 if (unicode.Length() + buf.Length() > unicode.MaxLength())
912 hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length());
// ReAllocL returns the buffer's new address, so the cleanup stack entry and the unicode descriptor are both refreshed.
914 CleanupStack::PushL(hBuf);
915 unicode.Set(hBuf->Des());
// Carry on with only the unconverted tail of the input.
920 utf8.Set(utf8.Right(unconverted));
926 /** Converts text encoded using the Unicode transformation format UTF-8 into the
927 Unicode UCS-2 character set.
929 @param aUnicode On return, contains the Unicode encoded output string.
930 @param aUtf8 The UTF-8 encoded input string
931 @return The number of unconverted bytes left at the end of the input descriptor,
932 or one of the error values defined in TError. */
933 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
935 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
938 static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
939 TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
941 if (aNumberOfUnconvertibleCharacters<=0)
943 aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
945 ++aNumberOfUnconvertibleCharacters;
948 /** Converts text encoded using the Unicode transformation format UTF-8 into the
949 Unicode UCS-2 character set.
951 @param aUnicode On return, contains the Unicode encoded output string.
952 @param aUtf8 The UTF-8 encoded input string
953 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java UTF-8.
954 @return The number of unconverted bytes left at the end of the input descriptor,
955 or one of the error values defined in TError. */
956 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
958 TInt dummyUnconverted, dummyUnconvertedIndex;
959 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
962 /** Converts text encoded using the Unicode transformation format UTF-8 into the
963 Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
965 The variant of UTF-8 used internally by Java differs slightly from standard
966 UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
968 @param aUnicode On return, contains the Unicode encoded output string.
969 @param aUtf8 The UTF-8 encoded input string
970 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
971 UTF-8. The default is EFalse.
972 @param aNumberOfUnconvertibleCharacters On return, contains the number of characters
973 which could not be converted.
974 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index
975 of the first byte of the first unconvertible character. For instance if the
976 first character in the input descriptor (aUtf8) could not be converted,
977 then this parameter is set to the first byte of that character, i.e. zero.
978 A negative value is returned if all the characters were converted.
979 @return The number of unconverted bytes left at the end of the input descriptor,
980 or one of the error values defined in TError. */
982 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
983 * Well formed UTF-8 Byte Sequences, full table.
984 * +----------------------------------------------------------------+
985 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
986 * +--------------------+----------+----------+----------+----------+
987 * | U+0000..U+007F | 00..7F | | | | 1 byte, ascii
988 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2
989 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
990 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal
991 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
992 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal
993 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
994 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal
995 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
996 * +--------------------+----------+----------+----------+----------+
998 * As a consequence of the well-formedness conditions specified in table 3-7,
999 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
// Core UTF-8 decoder reached from the shorter ConvertToUnicodeFromUtf8 overloads.
// Each pass decodes one UTF-8 sequence starting at pUtf8, validates it, and writes
// either the decoded character (as a surrogate pair when above U+FFFF) or the
// replacement character U+FFFD to the output. Returns the number of input bytes
// left unconverted, or EErrorIllFormedInput.
// NOTE(review): no visible statement resets aNumberOfUnconvertibleCharacters or the
// index parameter; callers presumably initialise them - confirm.
1001 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
1002 TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
// Start from an empty result so every return path leaves a consistent length.
1004 aUnicode.SetLength(0);
// Nothing to read or nowhere to write: report the whole input as unconverted.
1006 if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
1008 return aUtf8.Length();
// Write cursor into the output descriptor's storage.
1011 TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
// pLastUnicode and pLastUtf8 address the LAST valid element (inclusive bounds),
// not one-past-the-end; the comparisons below rely on that.
1012 const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
1013 const TUint8* pUtf8 = aUtf8.Ptr();
1014 const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
// U+FFFD is written in place of every ill-formed sequence; it also doubles as the
// "this character was rejected" marker in the checks further down.
1015 const TUint16 replacementcharacter = 0xFFFD;
1016 TUint currentUnicodeCharacter;
1017 TInt sequenceLength;
// NOTE(review): presumably set when a trailing byte fails its check; the assignment
// is not among the visible lines - confirm.
1022 TBool illFormed=EFalse;
1024 __ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
1025 __ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
1029 // ascii - optimisation (i.e. it isn't a sequence)
1030 if (pUtf8[0] < 0x80)
1032 currentUnicodeCharacter = pUtf8[0];
// Classify the lead byte.
// NOTE(review): each accepted class presumably sets sequenceLength to 2, 3 or 4;
// those assignments are not among the visible lines - confirm.
1036 // see if well formed utf-8, use table above for reference
1037 if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
1039 // 0xc0-0xc1 are not valid bytes
1042 else if ((pUtf8[0] & 0xf0) == 0xe0)
1046 else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
1048 // 0xf5-0xff, are not valid bytes
// Java-conformant mode additionally accepts the two-byte form 0xc0 0x80.
1051 else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
// The pointer test comes first so pUtf8[1] is never read past the end of the input.
1053 if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
1055 // either we've split the 0xc0 0x80 (i.e. 0xc0 is
1056 // the last character in the string) or we've
1057 // discovered a valid 0xc0 0x80 sequence.
1062 /* checking to see if we got a valid sequence */
// sequenceLength still 1 for a non-ASCII byte means no lead-byte class matched.
1063 if (sequenceLength == 1)
1065 // bad value in the leading byte, 0xc0-0xc1,0xf5-0xff for example
1066 currentUnicodeCharacter = replacementcharacter;
1067 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1068 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1072 // this is a check to see if the sequence goes beyond the input
1073 // stream. if its not the first and only character in the input
1074 // stream this isn't an error, otherwise it is.
// Address of the sequence's final byte compared with the last valid input byte.
1075 if ((pUtf8 + sequenceLength - 1) > pLastUtf8)
1077 // check to see if this sequence was the first character
// Nothing has been written yet, so the whole call fails.
// NOTE(review): the non-error path is not visible; presumably decoding stops and
// the truncated tail is reported as unconverted - confirm.
1078 if ((pUnicode - aUnicode.Ptr()) == 0)
1080 return EErrorIllFormedInput;
// Keep only the payload bits of the lead byte: 0x7F>>sequenceLength yields
// 0x1F, 0x0F and 0x07 for 2-, 3- and 4-byte sequences respectively.
1085 currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
1087 /* check the trailing bytes, they should begin with 10 */
1092 if ((pUtf8[i] & 0xc0) == 0x80)
1094 // add the trailing 6 bits to the current unicode char
1095 currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
1099 // ill formed character (doesn't have a lead 10)
1100 currentUnicodeCharacter = replacementcharacter;
1101 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1102 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1108 while (i < sequenceLength);
1111 /* conformance check. bits of above table for reference.
1112 * +----------------------------------------------------------------+
1113 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
1114 * +--------------------+----------+----------+----------+----------+
1115 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0
1116 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F
1117 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90
1118 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F
1119 * +--------------------+----------+----------+----------+----------+
// Only sequences that passed the trailing-byte test are range-checked.
1122 if (currentUnicodeCharacter != replacementcharacter)
1124 if (sequenceLength == 3)
// E0 followed by a second byte below A0 (per the table row for E0).
1126 if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
1128 currentUnicodeCharacter = replacementcharacter;
1129 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1130 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
// ED followed by a second byte above 9F (per the table row for ED).
1133 else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
1135 currentUnicodeCharacter = replacementcharacter;
1136 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1137 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1141 else if (sequenceLength == 4)
// F0 followed by a second byte below 90 (per the table row for F0).
1143 if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
1145 currentUnicodeCharacter = replacementcharacter;
1146 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1147 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
// F4 followed by a second byte above 8F (per the table row for F4).
1150 else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
1152 currentUnicodeCharacter = replacementcharacter;
1153 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1154 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1160 /* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
1161 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code
1162 * points D800..DFFF is ill formed */
1164 if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
1166 currentUnicodeCharacter = replacementcharacter;
1167 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
1168 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
1172 // end conformance check
1175 // would this character generate a surrogate pair in UTF-16?
1176 if (currentUnicodeCharacter > 0xFFFF)
1178 // is there enough space to hold a surrogate pair in the output?
// pLastUnicode is inclusive, so >= means fewer than two free slots remain.
1179 if (pUnicode >= pLastUnicode)
1181 break; // no, end processing.
// High surrogate: 0xD7C0 is 0xD800 - (0x10000>>10), i.e. the subtraction of
// 0x10000 from the code point is folded into the constant.
1184 TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
1185 *pUnicode++ = STATIC_CAST(TUint16, surrogate);
// Low surrogate: the bottom ten bits of the code point on top of 0xDC00.
1187 surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
1188 *pUnicode++ = STATIC_CAST(TUint16, surrogate);
// Everything else (including U+FFFD) fits in a single 16-bit unit.
1192 *pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
1195 // move the input pointer
// Both visible branches skip the whole sequence.
// NOTE(review): the remaining branch is not visible; per the comment below it
// presumably advances by a single byte - confirm.
1196 if (currentUnicodeCharacter != replacementcharacter)
1198 pUtf8 += sequenceLength;
1200 else if(illFormed == EFalse)
1202 pUtf8 += (sequenceLength);
1206 // we had a character we didn't recognize (i.e. it was invalid)
1207 // so move to the next character in the input
1211 if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
1213 break; // we've either reached the end of the input or the end of output
// Publish the number of UTF-16 units written.
1217 aUnicode.SetLength(pUnicode - aUnicode.Ptr());
// pLastUtf8 is inclusive, hence the +1: the result is the count of input bytes not consumed.
1218 return (pLastUtf8 - pUtf8 + 1);
1221 /** Given a sample text this function attempts to determine whether or not
1222 * the sample text is encoded using the UTF-8 standard encoding scheme.
1224 @param TInt a confidence level, given at a certain value. If the given sample
1225 is UTF-8 this value will not be changed (unless it is > 100, in which case it is
1226 set to 100). Otherwise, if the sample isn't UTF-8, it is set to 0.
1227 @param TDesC8 sample text.
1228 UTF-8. The default is EFalse.
1232 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
1233 * Well formed UTF-8 Byte Sequences, full table.
1234 * +----------------------------------------------------------------+
1235 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
1236 * +--------------------+----------+----------+----------+----------+
1237 * | U+0000..U+007F | 00..7F | | | | 1 byte, ascii
1238 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2
1239 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
1240 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal
1241 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
1242 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal
1243 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
1244 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal
1245 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
1246 * +--------------------+----------+----------+----------+----------+
1248 * As a consequence of the well-formedness conditions specified in table 3-7,
1249 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
1252 * R1: If the string contains any non-UTF-8 characters the returned confidence
1253 * is 0. Valid UTF-8 combinations are listed in the above table.
1254 * R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark,
1255 * bytes EF BB BF) the returned confidence is 95.
1256 * R3: Otherwise the confidence returned is based upon the sample string length.
1258 * R4: If the sample string is under 75 characters, the confidence is set to 75.
// Estimates how likely aSample is to be UTF-8 and writes the result (0..100)
// to aConfidenceLevel. The sample is scanned byte by byte; any byte that cannot
// appear at its position in a well-formed UTF-8 sequence sets the confidence to 0.
1261 GLREF_C void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
1264 TInt sampleLength = aSample.Length();
// An empty sample gets a fixed confidence of 89.
1266 if (sampleLength == 0)
1268 aConfidenceLevel = 89;
// bytesRemaining: continuation bytes still expected for the current sequence.
// sequenceLength: total length of the sequence currently being checked.
// NOTE(review): the statements that set and decrement these are not among the
// visible lines - confirm against the lead-byte branches below.
1271 TInt bytesRemaining = 0;
1272 TInt sequenceLength = 0;
// Base confidence is the sample length in bytes; it is clamped to 100 at the end.
1274 aConfidenceLevel = sampleLength;
1276 const TUint8* buffer = &aSample[0];
// Short samples (under 95 bytes) get a fixed confidence instead.
1278 if (sampleLength < 95)
1280 // check for the BOM
1281 if ((sampleLength >= 3) &&
1282 ((buffer[0] == 0xEF) &&
1283 (buffer[1] == 0xBB) &&
1284 (buffer[2] == 0xBF))
1287 aConfidenceLevel = 95;
// No BOM and fewer than 75 bytes: raise the confidence to a floor of 75.
1289 else if (sampleLength < 75)
1291 aConfidenceLevel = 75;
1295 for (TInt index = 0;index != sampleLength;index++)
1298 if (bytesRemaining > 0)
1300 // bytesRemaining > 0, means that a byte representing the start of a
1301 // multibyte sequence was encountered and the bytesRemaining is the
1302 // number of bytes to follow.
// A continuation byte must have the bit pattern 10xxxxxx.
1304 if ((buffer[index] & 0xc0) == 0x80)
1306 // need to check for ill-formed sequences -- all are in the 2nd byte
// bytesRemaining == sequenceLength-1 identifies the 2nd byte of the sequence,
// so buffer[index - 1] is its lead byte.
1308 if ((sequenceLength == 3) && (bytesRemaining == 2))
1310 if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
1312 aConfidenceLevel = 0;
1315 else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
1317 aConfidenceLevel = 0;
1321 else if ((sequenceLength == 4) && (bytesRemaining == 3))
1323 if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
1325 aConfidenceLevel = 0;
1328 else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
1330 aConfidenceLevel = 0;
// NOTE(review): presumably the branch for a byte that is not a continuation byte
// although one was expected - confirm.
1340 aConfidenceLevel = 0;
// Not inside a sequence: this byte must be ASCII or a valid lead byte.
1345 if (bytesRemaining == 0)
1347 if (buffer[index] < 0x80)
1349 // The value of aSample[index] is in the range 0x00-0x7f
1350 //UTF8 maintains ASCII transparency. So it's a valid
1351 //UTF8. Do nothing, check next value.
1354 else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
1356 // valid start of a 2 byte sequence (see conformance note)
1360 else if ((buffer[index] & 0xf0) == 0xe0)
1362 // valid start of a 3 byte sequence
1366 else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
1368 // valid start of a 4 byte sequence (see conformance note)
1374 // wasn't anything expected so must be an illegal/irregular UTF8 coded value
1375 aConfidenceLevel = 0;
// Clamp the final value into the range 0..100.
1381 aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
// Estimates how likely aSample is to be UTF-7 and writes the result (0..100)
// to aConfidenceLevel. Starts from a baseline and adjusts it per byte.
1384 GLREF_C void IsCharacterSetUTF7(TInt& aConfidenceLevel, const TDesC8& aSample)
1386 TInt sampleLength = aSample.Length();
// Baseline confidence before any evidence is seen.
1387 aConfidenceLevel = 70;
1388 for (TInt i=0; i<sampleLength; ++i)
1390 // UTF-7 value ranges only 7 bits
// Any byte with the top bit set rules UTF-7 out.
1391 if((aSample[i]&0x80)!=0x00)
1393 aConfidenceLevel= 0;
1397 // there is no "~" in UTF-7 encoding, so if one is found it's not UTF-7
1398 else if (char(aSample[i])=='~')
1400 aConfidenceLevel = 0;
1404 // The SMS7Bit escape char value is 0x1b. Reduce confidence if it follows the following format
// The i < sampleLength-1 test guarantees there is a following byte to inspect.
1405 else if ( (aSample[i]==0x1b) && (i <sampleLength-1) )
// Byte values that may follow the 0x1b escape (presumably the SMS 7-bit
// extension-table codes, going by the name - confirm).
1407 static const TInt smsExtensionTable[11] =
1408 {0x0a, 0x14, 0x1b, 0x28, 0x29, 0x2f, 0x3c, 0x3d, 0x3e, 0x40, 0x65};
1409 TInt increment1 = i+1;
// Already implied by the enclosing condition; kept as a defensive bounds check.
1410 if (increment1>= sampleLength)
// Each table match lowers the confidence by 10.
1412 for (TInt j=0; j < 11; ++j)
1414 if (aSample[increment1] == smsExtensionTable[j])
1416 aConfidenceLevel-=10;
1420 // The UTF-7 escape char is 0x2b. The values that follow the escape sequence
1421 // the values following the escape char value must belong to the modified base64
1422 // or '-' else it is an ill-formed sequence, so probably not UTF-7
1423 else if ( (aSample[i]==0x2b) && (i <sampleLength-1) )
1425 TInt increment1 = i+1;
// Accepted followers: '+' (0x2b), '-' (0x2d), '/' (0x2f), 'A'..'Z', 'a'..'z'.
// NOTE(review): the digits '0'..'9' are not accepted here although the comment
// above refers to the base64 alphabet - confirm whether that is intended.
1426 if ((aSample[increment1] == 0x2b) || (aSample[increment1] == 0x2d) || (aSample[increment1] == 0x2f) ||
1427 ((aSample[increment1] >= 0x41) && (aSample[increment1] <= 0x5a)) ||
1428 ((aSample[increment1] >= 0x61) && (aSample[increment1] <= 0x7a)))
1430 aConfidenceLevel+=5;
1434 aConfidenceLevel-=15;
// Step past the byte that was just examined as the escape's follower.
1436 i++; // should this be here or up in the if loop ??
// Clamp the final value into the range 0..100.
1439 aConfidenceLevel =(aConfidenceLevel >0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;