sl@0
|
1 |
/*
|
sl@0
|
2 |
* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
|
sl@0
|
3 |
* All rights reserved.
|
sl@0
|
4 |
* This component and the accompanying materials are made available
|
sl@0
|
5 |
* under the terms of "Eclipse Public License v1.0"
|
sl@0
|
6 |
* which accompanies this distribution, and is available
|
sl@0
|
7 |
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
|
sl@0
|
8 |
*
|
sl@0
|
9 |
* Initial Contributors:
|
sl@0
|
10 |
* Nokia Corporation - initial contribution.
|
sl@0
|
11 |
*
|
sl@0
|
12 |
* Contributors:
|
sl@0
|
13 |
*
|
sl@0
|
14 |
* Description:
|
sl@0
|
15 |
*
|
sl@0
|
16 |
*/
|
sl@0
|
17 |
|
sl@0
|
18 |
|
sl@0
|
19 |
#include <e32std.h>
|
sl@0
|
20 |
#include <e32base.h>
|
sl@0
|
21 |
#include <utf.h>
|
sl@0
|
22 |
|
sl@0
|
23 |
// Sentinel returned by Base64Decoding() for a byte that is not a member of the base-64 alphabet.
const TUint KNotInBase64Alphabet=KMaxTUint;
|
sl@0
|
24 |
|
sl@0
|
25 |
/** Reason codes handed to Panic() by the debug assertions in this file.
Values are assigned sequentially starting at 1, so the order below must not change. */
enum TPanic
	{
	// Base-64 encoder was given a value that is not a 6-bit number
	EPanicBad6BitNumber=1,

	// UTF-7 byte pointer consistency checks
	EPanicBadUtf7Pointers1,
	EPanicBadUtf7Pointers2,
	EPanicBadUtf7Pointers3,
	EPanicBadUtf7Pointers4,
	EPanicBadUtf7Pointers5,
	EPanicBadUtf7Pointers6,
	EPanicBadUtf7Pointers7,
	EPanicBadUtf7Pointers8,
	EPanicBadUtf7Pointers9,
	EPanicBadUtf7Pointers10,
	EPanicBadUtf7Pointers11,
	EPanicNotInBase64Block,

	// Unicode character pointer consistency checks
	EPanicBadUnicodePointers1,
	EPanicBadUnicodePointers2,
	EPanicBadUnicodePointers3,
	EPanicBadUnicodePointers4,
	EPanicBadUnicodePointers5,
	EPanicBadUnicodePointers6,
	EPanicBadUnicodePointers7,
	EPanicBadUnicodePointers8,
	EPanicBadUnicodePointers9,
	EPanicBadUnicodePointers10,

	// Bit-buffer state checks
	EPanicBadBitBufferState1,
	EPanicBadBitBufferState2,
	EPanicBadBitBufferState3,
	EPanicBadBitBufferState4,
	EPanicBadBitBufferState5,
	EPanicBadBitBufferState6,
	EPanicBadBitBufferState7,
	EPanicBadBitBufferState8,
	EPanicBadBitBufferState9,
	EPanicBadBitBufferState10,
	EPanicBadBitBufferState11,
	EPanicBadBitBufferState12,
	EPanicBadBitBufferState13,
	EPanicBadBitBufferState14,
	EPanicBadBitBufferState15,
	EPanicBadBitBufferState16,
	EPanicBadBitBufferState17,

	// Base-64 block roll-back checks
	EPanicUnexpectedNumberOfLoopIterations,
	EPanicInitialEscapeCharacterButNoBase64,
	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,

	// UTF-8 byte pointer consistency checks
	EPanicBadUtf8Pointers1,
	EPanicBadUtf8Pointers2,
	EPanicBadUtf8Pointers3,
	EPanicBadUtf8Pointers4,
	EPanicBadUtf8Pointers5,
	EPanicBadUtf8Pointers6,
	EPanicBadUtf8Pointers7,

	// Synchronisation checks
	EPanicOutOfSyncUtf7Byte1,
	EPanicOutOfSyncUtf7Byte2,
	EPanicOutOfSyncBase64Decoding
	};
|
sl@0
|
81 |
|
sl@0
|
82 |
// Panic category text passed to User::Panic() by Panic().
_LIT(KLitPanicText, "CHARCONV-UTF");
|
sl@0
|
83 |
|
sl@0
|
84 |
/** Calls User::Panic() with the category KLitPanicText.

@param aPanic The reason code to report. */
LOCAL_C void Panic(TPanic aPanic)
	{
	const TInt reason=aPanic;
	User::Panic(KLitPanicText, reason);
	}
|
sl@0
|
88 |
|
sl@0
|
89 |
/** Returns the character that introduces a base-64 block.

@param aIsImapUtf7 If true the result is '&', otherwise it is '+'.
@return The escape character. */
inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7)
	{
	if (aIsImapUtf7)
		{
		return '&';
		}
	return '+';
	}
|
sl@0
|
90 |
|
sl@0
|
91 |
/** Maps a member of the base-64 alphabet to its 6-bit value.

'A'-'Z' give 0-25, 'a'-'z' give 26-51, '0'-'9' give 52-61 and '+' gives 62.
The character for 63 is ',' when aIsImapUtf7 is true and '/' otherwise.

@param aMemberOfBase64Alphabet The character to decode.
@param aIsImapUtf7 Selects which character stands for the value 63.
@return The 6-bit value, or KNotInBase64Alphabet if the character is not in
the alphabet. */
LOCAL_C TUint Base64Decoding(TUint aMemberOfBase64Alphabet, TBool aIsImapUtf7)
	{
	const TUint ch=aMemberOfBase64Alphabet;
	const TUint characterFor63=aIsImapUtf7? ',': '/';
	TUint value=KNotInBase64Alphabet;
	if ((ch>='A') && (ch<='Z'))
		{
		value=ch-'A';
		}
	else if ((ch>='a') && (ch<='z'))
		{
		value=(ch-'a')+26;
		}
	else if ((ch>='0') && (ch<='9'))
		{
		value=(ch-'0')+(26*2);
		}
	else if (ch=='+')
		{
		value=62;
		}
	else if (ch==characterFor63)
		{
		value=63;
		}
	return value;
	}
|
sl@0
|
115 |
|
sl@0
|
116 |
/** Maps a 6-bit value to the base-64 character that represents it.

@param a6BitNumber The value to encode; debug builds panic with
EPanicBad6BitNumber if it is not below 64.
@param aIsImapUtf7 If true, the value 63 is encoded as ',' rather than '/'.
@return The base-64 character. */
LOCAL_C TUint Base64Encoding(TUint a6BitNumber, TBool aIsImapUtf7)
	{
	__ASSERT_DEBUG(a6BitNumber<64, Panic(EPanicBad6BitNumber));
	// The 64 alphabet characters in value order (the literal's terminating zero is never indexed for valid input).
	static const TUint8 KBase64Alphabet[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	if (aIsImapUtf7 && (a6BitNumber==63))
		{
		return ',';
		}
	const TUint encoded=KBase64Alphabet[a6BitNumber];
	return encoded;
	}
|
sl@0
|
126 |
|
sl@0
|
127 |
/** Scans backwards from aPointerToUtf7Byte looking for the escape character
that opened the base-64 block the byte belongs to.

The scan stops at the first byte that is neither the escape character nor a
member of the base-64 alphabet, or when aPointerToFirstUtf7Byte has been
examined. Every escape character met on the way replaces the previous
candidate, so the one nearest the start of the buffer is returned.

@param aPointerToUtf7Byte Where the backwards scan starts.
@param aPointerToFirstUtf7Byte The first byte of the buffer; the scan never
goes below it.
@param aIsImapUtf7 Selects the escape character and the base-64 alphabet.
@return Pointer to the escape character found. Debug builds panic with
EPanicNotInBase64Block if none was found. */
LOCAL_C TUint8* PointerToEscapeCharacterStartingBase64Block(TUint8* aPointerToUtf7Byte, const TUint8* aPointerToFirstUtf7Byte, TBool aIsImapUtf7)
	{
	__ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers1));
	TUint8* candidate=NULL;
	for (TUint8* cursor=aPointerToUtf7Byte; ; --cursor)
		{
		const TUint currentByte=*cursor;
		if (currentByte==EscapeCharacterForStartingBase64Block(aIsImapUtf7))
			{
			candidate=cursor;
			}
		else if (Base64Decoding(currentByte, aIsImapUtf7)==KNotInBase64Alphabet)
			{
			break; // left the run of base-64 characters
			}
		__ASSERT_DEBUG(cursor>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers2));
		if (cursor<=aPointerToFirstUtf7Byte)
			{
			break; // reached the start of the buffer
			}
		}
	__ASSERT_DEBUG(candidate!=NULL, Panic(EPanicNotInBase64Block));
	return candidate;
	}
|
sl@0
|
152 |
|
sl@0
|
153 |
/** Decides whether a Unicode character is written to the UTF-7 output as a
single byte rather than inside a base-64 block.

@param aUnicodeCharacter The character to classify.
@param aIsImapUtf7 If true, every character from 0x0020 to 0x007e is direct.
@param aEncodeOptionalDirectCharactersInBase64 Only used when aIsImapUtf7 is
false. If true, within 0x0021-0x007d only letters, digits and the characters
' ( ) + , - . / : ? are direct; if false, everything in that range except
0x005c is direct.
@return Non-zero if the character is encoded directly. */
LOCAL_C TBool EncodeInUtf7Directly(TUint aUnicodeCharacter, TBool aIsImapUtf7, TBool aEncodeOptionalDirectCharactersInBase64)
	{
	const TUint ch=aUnicodeCharacter;
	if (aIsImapUtf7)
		{
		return (ch>=0x0020) && (ch<=0x007e);
		}
	if ((ch<0x0021) || (ch>0x007d))
		{
		// outside 0x0021-0x007d only space, tab, carriage return and line feed are direct
		return (ch==0x0020) || (ch==0x0009) || (ch==0x000d) || (ch==0x000a);
		}
	if (!aEncodeOptionalDirectCharactersInBase64)
		{
		return ch!=0x005c;
		}
	const TBool isUpperCaseLetter=(ch>=0x0041) && (ch<=0x005a);
	const TBool isLowerCaseLetter=(ch>=0x0061) && (ch<=0x007a);
	const TBool isApostropheOrBracket=(ch>=0x0027) && (ch<=0x0029);
	const TBool isPlusToColon=(ch>=0x002b) && (ch<=0x003a); // includes the digits
	const TBool isQuestionMark=(ch==0x003f);
	return isUpperCaseLetter || isLowerCaseLetter || isApostropheOrBracket || isPlusToColon || isQuestionMark;
	}
|
sl@0
|
173 |
|
sl@0
|
174 |
/** Tests whether any of the lowest aNumberOfBitsInBuffer bits of aBitBuffer
is set.

@param aBitBuffer The bit buffer to inspect.
@param aNumberOfBitsInBuffer How many low-order bits to examine.
@return Non-zero if at least one examined bit is set. */
inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
	{
	const TUint lowBitsMask=(1<<aNumberOfBitsInBuffer)-1;
	const TUint bitsInUse=aBitBuffer&lowBitsMask;
	return bitsInUse!=0;
	}
|
sl@0
|
178 |
|
sl@0
|
179 |
|
sl@0
|
180 |
|
sl@0
|
181 |
/** Converts Unicode text into UTF-7 encoding. The fucntion leaves with
|
sl@0
|
182 |
KErrCorrupt if the input string is corrupt.
|
sl@0
|
183 |
|
sl@0
|
184 |
@param aUnicode A UCS-2 encoded input string.
|
sl@0
|
185 |
@param aEncodeOptionalDirectCharactersInBase64 If ETrue then
|
sl@0
|
186 |
characters from UTF-7 set O (optional direct characters) are encoded in
|
sl@0
|
187 |
Modified Base64. If EFalse the characters are encoded directly,
|
sl@0
|
188 |
as their ASCII equivalents.
|
sl@0
|
189 |
@return A descriptor containing the UTF-7 encoded output string. */
|
sl@0
|
190 |
EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf7L(
|
sl@0
|
191 |
const TDesC16& aUnicode,
|
sl@0
|
192 |
TBool aEncodeOptionalDirectCharactersInBase64)
|
sl@0
|
193 |
{
|
sl@0
|
194 |
// If aUnicode is Null string, return an empty HBufC
|
sl@0
|
195 |
if (aUnicode.Length() == 0)
|
sl@0
|
196 |
{
|
sl@0
|
197 |
HBufC8* hBuf8 = HBufC8::NewL(1);
|
sl@0
|
198 |
return hBuf8;
|
sl@0
|
199 |
}
|
sl@0
|
200 |
|
sl@0
|
201 |
// Otherwise, convert and store result in a buffer, reallocating that buffer if needed.
|
sl@0
|
202 |
TInt length = aUnicode.Length();
|
sl@0
|
203 |
const TInt bufsize = 100;
|
sl@0
|
204 |
|
sl@0
|
205 |
TPtrC16 unicode (aUnicode);
|
sl@0
|
206 |
TBuf8<bufsize> buf;
|
sl@0
|
207 |
HBufC8* hBuf8 = HBufC8::NewLC(length);
|
sl@0
|
208 |
TPtr8 utf7 = hBuf8->Des();
|
sl@0
|
209 |
|
sl@0
|
210 |
FOREVER
|
sl@0
|
211 |
{
|
sl@0
|
212 |
TInt unconverted = ConvertFromUnicodeToUtf7(buf, unicode, aEncodeOptionalDirectCharactersInBase64);
|
sl@0
|
213 |
if( unconverted == EErrorIllFormedInput || unconverted < 0)
|
sl@0
|
214 |
User::Leave(KErrCorrupt);
|
sl@0
|
215 |
|
sl@0
|
216 |
if (utf7.Length() + buf.Length() > utf7.MaxLength())
|
sl@0
|
217 |
{
|
sl@0
|
218 |
// Reallocate the hBuf8
|
sl@0
|
219 |
hBuf8 = hBuf8->ReAllocL(utf7.Length() + buf.Length());
|
sl@0
|
220 |
CleanupStack::Pop();
|
sl@0
|
221 |
CleanupStack::PushL(hBuf8);
|
sl@0
|
222 |
utf7.Set(hBuf8->Des());
|
sl@0
|
223 |
}
|
sl@0
|
224 |
utf7.Append(buf);
|
sl@0
|
225 |
if (unconverted ==0)
|
sl@0
|
226 |
break;
|
sl@0
|
227 |
unicode.Set(unicode.Right(unconverted));
|
sl@0
|
228 |
}
|
sl@0
|
229 |
CleanupStack::Pop();
|
sl@0
|
230 |
return hBuf8;
|
sl@0
|
231 |
|
sl@0
|
232 |
}
|
sl@0
|
233 |
|
sl@0
|
234 |
/** Converts Unicode text into UTF-7 encoding.
|
sl@0
|
235 |
|
sl@0
|
236 |
@param aUtf7 On return, contains the UTF-7 encoded output string.
|
sl@0
|
237 |
@param aUnicode A UCS-2 encoded input string.
|
sl@0
|
238 |
@param aEncodeOptionalDirectCharactersInBase64 If ETrue then characters from
|
sl@0
|
239 |
UTF-7 set O (optional direct characters) are encoded in Modified Base64. If
|
sl@0
|
240 |
EFalse the characters are encoded directly, as their ASCII equivalents.
|
sl@0
|
241 |
@return The number of unconverted characters left at the end of the input
|
sl@0
|
242 |
descriptor, or one of the error values defined in TError. */
|
sl@0
|
243 |
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(
|
sl@0
|
244 |
TDes8& aUtf7,
|
sl@0
|
245 |
const TDesC16& aUnicode,
|
sl@0
|
246 |
TBool aEncodeOptionalDirectCharactersInBase64)
|
sl@0
|
247 |
{
|
sl@0
|
248 |
return ConvertFromUnicodeToUtf7(aUtf7, aUnicode, EFalse, aEncodeOptionalDirectCharactersInBase64);
|
sl@0
|
249 |
}
|
sl@0
|
250 |
|
sl@0
|
251 |
TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(TDes8& aUtf7,
|
sl@0
|
252 |
const TDesC16& aUnicode,
|
sl@0
|
253 |
TBool aIsImapUtf7,
|
sl@0
|
254 |
TBool aEncodeOptionalDirectCharactersInBase64)
|
sl@0
|
255 |
{
|
sl@0
|
256 |
if (aUnicode.Length()==0)
|
sl@0
|
257 |
{
|
sl@0
|
258 |
aUtf7.SetLength(0);
|
sl@0
|
259 |
return 0;
|
sl@0
|
260 |
}
|
sl@0
|
261 |
if (aUtf7.MaxLength()==0)
|
sl@0
|
262 |
{
|
sl@0
|
263 |
return aUnicode.Length();
|
sl@0
|
264 |
}
|
sl@0
|
265 |
const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
|
sl@0
|
266 |
TUint8* pointerToPreviousUtf7Byte=CONST_CAST(TUint8*, aUtf7.Ptr()-1);
|
sl@0
|
267 |
const TUint8* const pointerToLastUtf7Byte=pointerToPreviousUtf7Byte+aUtf7.MaxLength();
|
sl@0
|
268 |
const TUint16* pointerToPreviousUnicodeCharacter=aUnicode.Ptr()-1;
|
sl@0
|
269 |
const TUint16* const pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.Length();
|
sl@0
|
270 |
const TUint KIsInBase64Block=0x80000000u;
|
sl@0
|
271 |
TUint bitBuffer=0;
|
sl@0
|
272 |
TInt numberOfBitsInBuffer=0;
|
sl@0
|
273 |
FOREVER
|
sl@0
|
274 |
{
|
sl@0
|
275 |
__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers3));
|
sl@0
|
276 |
__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers1));
|
sl@0
|
277 |
TUint currentUnicodeCharacter=(pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)? 0: *(pointerToPreviousUnicodeCharacter+1);
|
sl@0
|
278 |
if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || EncodeInUtf7Directly(currentUnicodeCharacter, aIsImapUtf7, aEncodeOptionalDirectCharactersInBase64))
|
sl@0
|
279 |
{
|
sl@0
|
280 |
__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState1));
|
sl@0
|
281 |
__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState2));
|
sl@0
|
282 |
if (bitBuffer&KIsInBase64Block)
|
sl@0
|
283 |
{
|
sl@0
|
284 |
if (numberOfBitsInBuffer!=0)
|
sl@0
|
285 |
{
|
sl@0
|
286 |
if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<2) // make sure there is enough space for the trailing '-' as well as the remains of the bitBuffer as the KIsInBase64Block flag is about to turned off, thus the trailing '-' may never get written
|
sl@0
|
287 |
{
|
sl@0
|
288 |
break;
|
sl@0
|
289 |
}
|
sl@0
|
290 |
++pointerToPreviousUtf7Byte;
|
sl@0
|
291 |
*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
|
sl@0
|
292 |
}
|
sl@0
|
293 |
else
|
sl@0
|
294 |
{
|
sl@0
|
295 |
if (pointerToPreviousUtf7Byte==pointerToLastUtf7Byte)
|
sl@0
|
296 |
{
|
sl@0
|
297 |
break;
|
sl@0
|
298 |
}
|
sl@0
|
299 |
}
|
sl@0
|
300 |
++pointerToPreviousUtf7Byte;
|
sl@0
|
301 |
*pointerToPreviousUtf7Byte='-';
|
sl@0
|
302 |
bitBuffer=0;
|
sl@0
|
303 |
numberOfBitsInBuffer=0;
|
sl@0
|
304 |
}
|
sl@0
|
305 |
__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers2));
|
sl@0
|
306 |
if (pointerToPreviousUnicodeCharacter>=pointerToLastUnicodeCharacter)
|
sl@0
|
307 |
{
|
sl@0
|
308 |
break;
|
sl@0
|
309 |
}
|
sl@0
|
310 |
__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers4));
|
sl@0
|
311 |
if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<((currentUnicodeCharacter==escapeCharacterForStartingBase64Block)? 2: 1))
|
sl@0
|
312 |
{
|
sl@0
|
313 |
break;
|
sl@0
|
314 |
}
|
sl@0
|
315 |
++pointerToPreviousUtf7Byte;
|
sl@0
|
316 |
*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, currentUnicodeCharacter);
|
sl@0
|
317 |
++pointerToPreviousUnicodeCharacter;
|
sl@0
|
318 |
if (currentUnicodeCharacter==escapeCharacterForStartingBase64Block)
|
sl@0
|
319 |
{
|
sl@0
|
320 |
++pointerToPreviousUtf7Byte;
|
sl@0
|
321 |
*pointerToPreviousUtf7Byte='-';
|
sl@0
|
322 |
}
|
sl@0
|
323 |
}
|
sl@0
|
324 |
else
|
sl@0
|
325 |
{
|
sl@0
|
326 |
{
|
sl@0
|
327 |
TInt numberOfUtf7BytesRequired=(numberOfBitsInBuffer+16)/6; // "(numberOfBitsInBuffer+16)/6" is the number of iterations that will happen in the while loop below
|
sl@0
|
328 |
if (~bitBuffer&KIsInBase64Block)
|
sl@0
|
329 |
{
|
sl@0
|
330 |
++numberOfUtf7BytesRequired; // for the initial escapeCharacterForStartingBase64Block
|
sl@0
|
331 |
}
|
sl@0
|
332 |
if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<numberOfUtf7BytesRequired)
|
sl@0
|
333 |
{
|
sl@0
|
334 |
break;
|
sl@0
|
335 |
}
|
sl@0
|
336 |
}
|
sl@0
|
337 |
if (~bitBuffer&KIsInBase64Block)
|
sl@0
|
338 |
{
|
sl@0
|
339 |
__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers5));
|
sl@0
|
340 |
++pointerToPreviousUtf7Byte;
|
sl@0
|
341 |
*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, escapeCharacterForStartingBase64Block);
|
sl@0
|
342 |
}
|
sl@0
|
343 |
bitBuffer<<=16;
|
sl@0
|
344 |
bitBuffer|=currentUnicodeCharacter;
|
sl@0
|
345 |
numberOfBitsInBuffer+=16;
|
sl@0
|
346 |
++pointerToPreviousUnicodeCharacter;
|
sl@0
|
347 |
__ASSERT_DEBUG(numberOfBitsInBuffer<=20, Panic(EPanicBadBitBufferState3));
|
sl@0
|
348 |
while (numberOfBitsInBuffer>=6)
|
sl@0
|
349 |
{
|
sl@0
|
350 |
numberOfBitsInBuffer-=6;
|
sl@0
|
351 |
__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers6));
|
sl@0
|
352 |
++pointerToPreviousUtf7Byte;
|
sl@0
|
353 |
*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer>>numberOfBitsInBuffer)&0x3f, aIsImapUtf7));
|
sl@0
|
354 |
}
|
sl@0
|
355 |
bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - not strictly necessary but it leaves the buffer in a cleaner state
|
sl@0
|
356 |
bitBuffer|=KIsInBase64Block;
|
sl@0
|
357 |
}
|
sl@0
|
358 |
}
|
sl@0
|
359 |
__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState4));
|
sl@0
|
360 |
__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState5));
|
sl@0
|
361 |
if (bitBuffer&KIsInBase64Block)
|
sl@0
|
362 |
{
|
sl@0
|
363 |
#if defined(_DEBUG)
|
sl@0
|
364 |
TInt numberOfLoopIterations=1;
|
sl@0
|
365 |
#endif
|
sl@0
|
366 |
FOREVER // there should never be more than 2 iterations of this loop - the first "if" should always succeed the second time if it doesn't succeed the first time
|
sl@0
|
367 |
{
|
sl@0
|
368 |
__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers7));
|
sl@0
|
369 |
__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState6));
|
sl@0
|
370 |
__ASSERT_DEBUG(numberOfLoopIterations<=2, Panic(EPanicUnexpectedNumberOfLoopIterations));
|
sl@0
|
371 |
#if defined(_DEBUG)
|
sl@0
|
372 |
++numberOfLoopIterations;
|
sl@0
|
373 |
#endif
|
sl@0
|
374 |
if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte>=((numberOfBitsInBuffer==0)? 1: 2)) // if there's room to finish off the base-64 sequence by (i) flushing the bit-buffer and (ii) appending the trailing '-'
|
sl@0
|
375 |
{
|
sl@0
|
376 |
if (numberOfBitsInBuffer!=0)
|
sl@0
|
377 |
{
|
sl@0
|
378 |
__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers8));
|
sl@0
|
379 |
++pointerToPreviousUtf7Byte;
|
sl@0
|
380 |
*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
|
sl@0
|
381 |
}
|
sl@0
|
382 |
__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers9));
|
sl@0
|
383 |
++pointerToPreviousUtf7Byte;
|
sl@0
|
384 |
*pointerToPreviousUtf7Byte='-';
|
sl@0
|
385 |
break;
|
sl@0
|
386 |
}
|
sl@0
|
387 |
// it is now necessary to move back pointerToPreviousUtf7Byte so that the base-64 sequence can be terminated - note it must be terminated on a Unicode character boundary hence the reason why pointerToPreviousUnicodeCharacter may be moved back too
|
sl@0
|
388 |
TUint8* pointerToEscapeCharacterStartingBase64Block=PointerToEscapeCharacterStartingBase64Block(pointerToPreviousUtf7Byte, aUtf7.Ptr(), aIsImapUtf7);
|
sl@0
|
389 |
const TInt oldNumberOfBase64Characters=pointerToPreviousUtf7Byte-pointerToEscapeCharacterStartingBase64Block;
|
sl@0
|
390 |
__ASSERT_DEBUG(oldNumberOfBase64Characters>0, Panic(EPanicInitialEscapeCharacterButNoBase64));
|
sl@0
|
391 |
__ASSERT_DEBUG(((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)%16==0, Panic(EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary));
|
sl@0
|
392 |
pointerToPreviousUnicodeCharacter-=((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)/16; // move back pointerToPreviousUnicodeCharacter to before the equivalent of the base-64 sequence
|
sl@0
|
393 |
pointerToPreviousUtf7Byte=pointerToEscapeCharacterStartingBase64Block;
|
sl@0
|
394 |
__ASSERT_DEBUG(*pointerToPreviousUtf7Byte==escapeCharacterForStartingBase64Block, Panic(EPanicBadUtf7Pointers10));
|
sl@0
|
395 |
if (oldNumberOfBase64Characters<4) // if the new base-64 sequence will be so short that it won't even be able to contain the UTF-7 encoding of a single Unicode character
|
sl@0
|
396 |
{
|
sl@0
|
397 |
--pointerToPreviousUtf7Byte; // move back pointerToPreviousUtf7Byte to before the escapeCharacterForStartingBase64Block
|
sl@0
|
398 |
break;
|
sl@0
|
399 |
}
|
sl@0
|
400 |
const TInt newNumberOfUnicodeCharacters=((oldNumberOfBase64Characters-1)*3)/8;
|
sl@0
|
401 |
pointerToPreviousUnicodeCharacter+=newNumberOfUnicodeCharacters;
|
sl@0
|
402 |
pointerToPreviousUtf7Byte+=((newNumberOfUnicodeCharacters*8)+2)/3;
|
sl@0
|
403 |
const TInt numberOfBitsToBeZeroedInLastBase64Character=(newNumberOfUnicodeCharacters%3)*2;
|
sl@0
|
404 |
if (numberOfBitsToBeZeroedInLastBase64Character!=0)
|
sl@0
|
405 |
{
|
sl@0
|
406 |
*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding(Base64Decoding(*pointerToPreviousUtf7Byte, aIsImapUtf7)&0x3f&~((1<<numberOfBitsToBeZeroedInLastBase64Character)-1), aIsImapUtf7));
|
sl@0
|
407 |
}
|
sl@0
|
408 |
bitBuffer=KIsInBase64Block;
|
sl@0
|
409 |
numberOfBitsInBuffer=0;
|
sl@0
|
410 |
}
|
sl@0
|
411 |
}
|
sl@0
|
412 |
aUtf7.SetLength((pointerToPreviousUtf7Byte-aUtf7.Ptr())+1);
|
sl@0
|
413 |
return pointerToLastUnicodeCharacter-pointerToPreviousUnicodeCharacter;
|
sl@0
|
414 |
}
|
sl@0
|
415 |
|
sl@0
|
416 |
|
sl@0
|
417 |
|
sl@0
|
418 |
/** Converts Unicode text into UTF-8 encoding.
|
sl@0
|
419 |
|
sl@0
|
420 |
@param aUtf8 On return, contains the UTF-8 encoded output string.
|
sl@0
|
421 |
@param aUnicode The Unicode-encoded input string.
|
sl@0
|
422 |
@return The number of unconverted characters left at the end of the input
|
sl@0
|
423 |
descriptor, or one of the error values defined in TError. */
|
sl@0
|
424 |
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
|
sl@0
|
425 |
{
|
sl@0
|
426 |
return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
|
sl@0
|
427 |
}
|
sl@0
|
428 |
|
sl@0
|
429 |
|
sl@0
|
430 |
/** Converts Unicode text into UTF-8 encoding.
|
sl@0
|
431 |
|
sl@0
|
432 |
The variant of UTF-8 used internally by Java differs slightly from
|
sl@0
|
433 |
standard UTF-8. The TBool argument controls the UTF-8
|
sl@0
|
434 |
variant generated by this function. This function leaves with a
|
sl@0
|
435 |
KErrCorrupt if the input string is corrupt.
|
sl@0
|
436 |
|
sl@0
|
437 |
@param aUnicode A UCS-2 encoded input string.
|
sl@0
|
438 |
@return A pointer to an HBufC8 containing the converted UTF8. */
|
sl@0
|
439 |
EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf8L(const TDesC16& aUnicode)
|
sl@0
|
440 |
{
|
sl@0
|
441 |
// If aUnicode is Null string, return an empty HBufC
|
sl@0
|
442 |
if (aUnicode.Length() == 0)
|
sl@0
|
443 |
{
|
sl@0
|
444 |
HBufC8* hBuf8 = HBufC8::NewL(1);
|
sl@0
|
445 |
return hBuf8;
|
sl@0
|
446 |
}
|
sl@0
|
447 |
|
sl@0
|
448 |
// Otherwise, convert and store result in a buffer, reallocating that buffer if needed.
|
sl@0
|
449 |
const TInt length = aUnicode.Length();
|
sl@0
|
450 |
const TInt bufsize = 100;
|
sl@0
|
451 |
|
sl@0
|
452 |
TPtrC16 unicode (aUnicode);
|
sl@0
|
453 |
TBuf8<bufsize> buf;
|
sl@0
|
454 |
HBufC8* hBuf8 = HBufC8::NewLC(length);
|
sl@0
|
455 |
TPtr8 utf8 = hBuf8->Des();
|
sl@0
|
456 |
|
sl@0
|
457 |
FOREVER
|
sl@0
|
458 |
{
|
sl@0
|
459 |
TInt unconverted = ConvertFromUnicodeToUtf8(buf, unicode);
|
sl@0
|
460 |
if( unconverted == EErrorIllFormedInput || unconverted < 0)
|
sl@0
|
461 |
User::Leave(KErrCorrupt);
|
sl@0
|
462 |
|
sl@0
|
463 |
if (utf8.Length() + buf.Length() > utf8.MaxLength())
|
sl@0
|
464 |
{
|
sl@0
|
465 |
// Reallocate the hBuf8
|
sl@0
|
466 |
hBuf8 = hBuf8->ReAllocL(utf8.Length() + buf.Length());
|
sl@0
|
467 |
CleanupStack::Pop();
|
sl@0
|
468 |
CleanupStack::PushL(hBuf8);
|
sl@0
|
469 |
utf8.Set(hBuf8->Des());
|
sl@0
|
470 |
}
|
sl@0
|
471 |
utf8.Append(buf);
|
sl@0
|
472 |
if (unconverted ==0)
|
sl@0
|
473 |
break;
|
sl@0
|
474 |
unicode.Set(unicode.Right(unconverted));
|
sl@0
|
475 |
}
|
sl@0
|
476 |
CleanupStack::Pop();
|
sl@0
|
477 |
return hBuf8;
|
sl@0
|
478 |
}
|
sl@0
|
479 |
|
sl@0
|
480 |
/** Converts Unicode text into UTF-8 encoding.
|
sl@0
|
481 |
|
sl@0
|
482 |
Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.
|
sl@0
|
483 |
|
sl@0
|
484 |
The variant of UTF-8 used internally by Java differs slightly from standard
|
sl@0
|
485 |
UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
|
sl@0
|
486 |
|
sl@0
|
487 |
@param aUtf8 On return, contains the UTF-8 encoded output string.
|
sl@0
|
488 |
@param aUnicode A UCS-2 encoded input string.
|
sl@0
|
489 |
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
|
sl@0
|
490 |
UTF-8. The default is EFalse.
|
sl@0
|
491 |
@return The number of unconverted characters left at the end of the input descriptor,
|
sl@0
|
492 |
or one of the error values defined in TError. */
|
sl@0
|
493 |
TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8,
|
sl@0
|
494 |
const TDesC16& aUnicode,
|
sl@0
|
495 |
TBool aGenerateJavaConformantUtf8)
|
sl@0
|
496 |
{
|
sl@0
|
497 |
if (aUnicode.Length() == 0)
|
sl@0
|
498 |
{
|
sl@0
|
499 |
aUtf8.SetLength(0);
|
sl@0
|
500 |
return 0;
|
sl@0
|
501 |
}
|
sl@0
|
502 |
if (aUtf8.MaxLength() == 0)
|
sl@0
|
503 |
{
|
sl@0
|
504 |
return aUnicode.Length();
|
sl@0
|
505 |
}
|
sl@0
|
506 |
|
sl@0
|
507 |
TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
|
sl@0
|
508 |
const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
|
sl@0
|
509 |
TBool inputIsTruncated = EFalse;
|
sl@0
|
510 |
const TUint16* pUnicode = aUnicode.Ptr();
|
sl@0
|
511 |
const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
|
sl@0
|
512 |
|
sl@0
|
513 |
FOREVER
|
sl@0
|
514 |
{
|
sl@0
|
515 |
__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
|
sl@0
|
516 |
__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
|
sl@0
|
517 |
|
sl@0
|
518 |
if (pUnicode[0] < 0x80)
|
sl@0
|
519 |
{
|
sl@0
|
520 |
// ascii - 1 byte
|
sl@0
|
521 |
|
sl@0
|
522 |
// internally java is different since the \x0000 character is
|
sl@0
|
523 |
// translated into \xC0 \x80.
|
sl@0
|
524 |
|
sl@0
|
525 |
if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
|
sl@0
|
526 |
{
|
sl@0
|
527 |
if (pUtf8 == pointerToLastUtf8Byte)
|
sl@0
|
528 |
{
|
sl@0
|
529 |
pUtf8--;
|
sl@0
|
530 |
pUnicode--;
|
sl@0
|
531 |
break;
|
sl@0
|
532 |
}
|
sl@0
|
533 |
*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
|
sl@0
|
534 |
*pUtf8 = STATIC_CAST(TUint8, 0x80);
|
sl@0
|
535 |
}
|
sl@0
|
536 |
else
|
sl@0
|
537 |
{
|
sl@0
|
538 |
*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
|
sl@0
|
539 |
}
|
sl@0
|
540 |
}
|
sl@0
|
541 |
else if (pUnicode[0] < 0x800)
|
sl@0
|
542 |
{
|
sl@0
|
543 |
// U+0080..U+07FF - 2 bytes
|
sl@0
|
544 |
|
sl@0
|
545 |
if (pUtf8 == pointerToLastUtf8Byte)
|
sl@0
|
546 |
{
|
sl@0
|
547 |
pUtf8--;
|
sl@0
|
548 |
pUnicode--;
|
sl@0
|
549 |
break;
|
sl@0
|
550 |
}
|
sl@0
|
551 |
|
sl@0
|
552 |
*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
|
sl@0
|
553 |
*pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
|
sl@0
|
554 |
|
sl@0
|
555 |
}
|
sl@0
|
556 |
|
sl@0
|
557 |
// check to see if we have a surrogate in the stream, surrogates encode code points outside
|
sl@0
|
558 |
// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.
|
sl@0
|
559 |
|
sl@0
|
560 |
else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
|
sl@0
|
561 |
{
|
sl@0
|
562 |
// surrogate pair - 4 bytes in utf-8
|
sl@0
|
563 |
// U+10000..U+10FFFF
|
sl@0
|
564 |
|
sl@0
|
565 |
__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
|
sl@0
|
566 |
// is there enough space to hold the character
|
sl@0
|
567 |
if ((pointerToLastUtf8Byte - pUtf8) < 3)
|
sl@0
|
568 |
{
|
sl@0
|
569 |
pUtf8--;
|
sl@0
|
570 |
pUnicode--;
|
sl@0
|
571 |
break; // no go to the exit condition
|
sl@0
|
572 |
}
|
sl@0
|
573 |
|
sl@0
|
574 |
__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
|
sl@0
|
575 |
if (pUnicode >= pointerToLastUnicodeCharacter)
|
sl@0
|
576 |
{
|
sl@0
|
577 |
pUtf8--;
|
sl@0
|
578 |
pUnicode--;
|
sl@0
|
579 |
inputIsTruncated = ETrue;
|
sl@0
|
580 |
break; // middle of a surrogate pair. go to end condition
|
sl@0
|
581 |
}
|
sl@0
|
582 |
|
sl@0
|
583 |
if ((pUnicode[1] & 0xfc00) != 0xdc00)
|
sl@0
|
584 |
{
|
sl@0
|
585 |
return EErrorIllFormedInput;
|
sl@0
|
586 |
}
|
sl@0
|
587 |
|
sl@0
|
588 |
// convert utf-16 surrogate to utf-32
|
sl@0
|
589 |
TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
|
sl@0
|
590 |
|
sl@0
|
591 |
// convert utf-32 to utf-8
|
sl@0
|
592 |
*pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));
|
sl@0
|
593 |
*pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
|
sl@0
|
594 |
*pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
|
sl@0
|
595 |
*pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
|
sl@0
|
596 |
|
sl@0
|
597 |
// we consumed 2 utf-16 values, move this pointer
|
sl@0
|
598 |
pUnicode++;
|
sl@0
|
599 |
}
|
sl@0
|
600 |
else
|
sl@0
|
601 |
{
|
sl@0
|
602 |
// 3 byte - utf-8, U+800..U+FFFF rest of BMP.
|
sl@0
|
603 |
|
sl@0
|
604 |
if (pointerToLastUtf8Byte - pUtf8 < 2)
|
sl@0
|
605 |
{
|
sl@0
|
606 |
pUtf8--;
|
sl@0
|
607 |
pUnicode--;
|
sl@0
|
608 |
break;
|
sl@0
|
609 |
}
|
sl@0
|
610 |
*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
|
sl@0
|
611 |
*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
|
sl@0
|
612 |
*pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
|
sl@0
|
613 |
}
|
sl@0
|
614 |
|
sl@0
|
615 |
if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
|
sl@0
|
616 |
{
|
sl@0
|
617 |
break;
|
sl@0
|
618 |
}
|
sl@0
|
619 |
|
sl@0
|
620 |
pUtf8++;
|
sl@0
|
621 |
pUnicode++;
|
sl@0
|
622 |
|
sl@0
|
623 |
}
|
sl@0
|
624 |
|
sl@0
|
625 |
if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
|
sl@0
|
626 |
{
|
sl@0
|
627 |
return EErrorIllFormedInput;
|
sl@0
|
628 |
}
|
sl@0
|
629 |
|
sl@0
|
630 |
aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
|
sl@0
|
631 |
return pointerToLastUnicodeCharacter-pUnicode;
|
sl@0
|
632 |
}
|
sl@0
|
633 |
|
sl@0
|
634 |
|
sl@0
|
635 |
|
sl@0
|
636 |
/** Converts text encoded using the Unicode transformation format UTF-7
|
sl@0
|
637 |
into the Unicode UCS-2 character set.
|
sl@0
|
638 |
|
sl@0
|
639 |
@param aUtf7 The UTF-7 encoded input string.
|
sl@0
|
640 |
@return A pointer to an HBufC16 containing the converted Unicode string */
|
sl@0
|
641 |
EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf7L(const TDesC8& aUtf7)
|
sl@0
|
642 |
{
|
sl@0
|
643 |
// If aUtf8 is an empty string return
|
sl@0
|
644 |
if (aUtf7.Length()==0)
|
sl@0
|
645 |
{
|
sl@0
|
646 |
HBufC16* hBuf = HBufC16::NewL(1);
|
sl@0
|
647 |
return hBuf;
|
sl@0
|
648 |
}
|
sl@0
|
649 |
|
sl@0
|
650 |
// else convert aUtf8 to Unicode storing the result in a buffer, reallocating
|
sl@0
|
651 |
// it when needed.
|
sl@0
|
652 |
TInt length = aUtf7.Length();
|
sl@0
|
653 |
const TInt bufsize = 100;
|
sl@0
|
654 |
TInt state = KStateDefault;
|
sl@0
|
655 |
|
sl@0
|
656 |
TPtrC8 utf7 (aUtf7);
|
sl@0
|
657 |
TBuf<bufsize> buf;
|
sl@0
|
658 |
HBufC16* hBuf = HBufC16::NewLC(length);
|
sl@0
|
659 |
TPtr unicode = hBuf->Des();
|
sl@0
|
660 |
|
sl@0
|
661 |
FOREVER
|
sl@0
|
662 |
{
|
sl@0
|
663 |
TInt unconverted = ConvertToUnicodeFromUtf7(buf, utf7, state);
|
sl@0
|
664 |
if( unconverted == EErrorIllFormedInput || unconverted < 0)
|
sl@0
|
665 |
User::Leave(KErrCorrupt);
|
sl@0
|
666 |
|
sl@0
|
667 |
if (unicode.Length() + buf.Length() > unicode.MaxLength())
|
sl@0
|
668 |
{
|
sl@0
|
669 |
// Reallocate hBuf
|
sl@0
|
670 |
hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length());
|
sl@0
|
671 |
CleanupStack::Pop();
|
sl@0
|
672 |
CleanupStack::PushL(hBuf);
|
sl@0
|
673 |
unicode.Set(hBuf->Des());
|
sl@0
|
674 |
}
|
sl@0
|
675 |
unicode.Append(buf);
|
sl@0
|
676 |
if (unconverted ==0)
|
sl@0
|
677 |
break;
|
sl@0
|
678 |
utf7.Set(utf7.Right(unconverted));
|
sl@0
|
679 |
}
|
sl@0
|
680 |
CleanupStack::Pop();
|
sl@0
|
681 |
return hBuf;
|
sl@0
|
682 |
}
|
sl@0
|
683 |
|
sl@0
|
684 |
|
sl@0
|
685 |
|
sl@0
|
686 |
/** Converts text encoded using the Unicode transformation format UTF-7 into the
|
sl@0
|
687 |
Unicode UCS-2 character set.
|
sl@0
|
688 |
|
sl@0
|
689 |
If the conversion is achieved using a series of calls to this function, where
|
sl@0
|
690 |
each call starts off where the previous call reached in the input descriptor,
|
sl@0
|
691 |
the state of the conversion is stored. The initial value of the state variable
|
sl@0
|
692 |
should be set as KStateDefault when the conversion is started, and afterwards
|
sl@0
|
693 |
simply passed unchanged into each function call.
|
sl@0
|
694 |
|
sl@0
|
695 |
@param aUnicode On return, contains the Unicode encoded output string.
|
sl@0
|
696 |
@param aUtf7 The UTF-7 encoded input string.
|
sl@0
|
697 |
@param aState For the first call of the function set to KStateDefault. For
|
sl@0
|
698 |
subsequent calls, pass in the variable unchanged.
|
sl@0
|
699 |
@return The number of unconverted bytes left at the end of the input descriptor,
|
sl@0
|
700 |
or one of the error values defined in TError. */
|
sl@0
|
701 |
EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode,
|
sl@0
|
702 |
const TDesC8& aUtf7,
|
sl@0
|
703 |
TInt& aState)
|
sl@0
|
704 |
{
|
sl@0
|
705 |
return ConvertToUnicodeFromUtf7(aUnicode, aUtf7, EFalse, aState);
|
sl@0
|
706 |
}
|
sl@0
|
707 |
|
sl@0
|
708 |
TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode,
|
sl@0
|
709 |
const TDesC8& aUtf7,
|
sl@0
|
710 |
TBool aIsImapUtf7,
|
sl@0
|
711 |
TInt& aState)
|
sl@0
|
712 |
{
|
sl@0
|
713 |
if (aUtf7.Length()==0)
|
sl@0
|
714 |
{
|
sl@0
|
715 |
aUnicode.SetLength(0);
|
sl@0
|
716 |
return 0;
|
sl@0
|
717 |
}
|
sl@0
|
718 |
if (aUnicode.MaxLength()==0)
|
sl@0
|
719 |
{
|
sl@0
|
720 |
return aUtf7.Length();
|
sl@0
|
721 |
}
|
sl@0
|
722 |
const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
|
sl@0
|
723 |
TUint16* pointerToPreviousUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()-1);
|
sl@0
|
724 |
const TUint16* pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.MaxLength();
|
sl@0
|
725 |
const TUint8* pointerToCurrentUtf7Byte=aUtf7.Ptr();
|
sl@0
|
726 |
const TUint8* pointerToLastUtf7Byte=pointerToCurrentUtf7Byte+(aUtf7.Length()-1);
|
sl@0
|
727 |
TUint currentUtf7Byte=*pointerToCurrentUtf7Byte;
|
sl@0
|
728 |
const TUint KIsInBase64Block=0x80000000u;
|
sl@0
|
729 |
TUint bitBuffer=STATIC_CAST(TUint, aState);
|
sl@0
|
730 |
TInt numberOfBitsInBuffer=((bitBuffer&0xf0)>>4);
|
sl@0
|
731 |
bitBuffer&=~0xf0; // turn off the bits that stored numberOfBitsInBuffer
|
sl@0
|
732 |
if (bitBuffer&KIsInBase64Block)
|
sl@0
|
733 |
{
|
sl@0
|
734 |
__ASSERT_ALWAYS((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4) || ((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)), Panic(EPanicBadBitBufferState7));
|
sl@0
|
735 |
__ASSERT_ALWAYS((bitBuffer&~(KIsInBase64Block|0x0000000f))==0, Panic(EPanicBadBitBufferState8));
|
sl@0
|
736 |
}
|
sl@0
|
737 |
else
|
sl@0
|
738 |
{
|
sl@0
|
739 |
__ASSERT_ALWAYS(bitBuffer==0, Panic(EPanicBadBitBufferState9));
|
sl@0
|
740 |
__ASSERT_ALWAYS(numberOfBitsInBuffer==0, Panic(EPanicBadBitBufferState10));
|
sl@0
|
741 |
}
|
sl@0
|
742 |
aState=KStateDefault;
|
sl@0
|
743 |
if (bitBuffer&KIsInBase64Block)
|
sl@0
|
744 |
{
|
sl@0
|
745 |
currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7);
|
sl@0
|
746 |
}
|
sl@0
|
747 |
TBool inputIsTruncated=EFalse;
|
sl@0
|
748 |
FOREVER
|
sl@0
|
749 |
{
|
sl@0
|
750 |
__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers5));
|
sl@0
|
751 |
__ASSERT_DEBUG(pointerToCurrentUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers11));
|
sl@0
|
752 |
__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (currentUtf7Byte==*pointerToCurrentUtf7Byte), Panic(EPanicOutOfSyncUtf7Byte1));
|
sl@0
|
753 |
__ASSERT_DEBUG((~bitBuffer&KIsInBase64Block) || (currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7)), Panic(EPanicOutOfSyncUtf7Byte2));
|
sl@0
|
754 |
__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || ((bitBuffer==0) && (numberOfBitsInBuffer==0)), Panic(EPanicBadBitBufferState11));
|
sl@0
|
755 |
if ((~bitBuffer&KIsInBase64Block) && (currentUtf7Byte==escapeCharacterForStartingBase64Block))
|
sl@0
|
756 |
{
|
sl@0
|
757 |
if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
|
sl@0
|
758 |
{
|
sl@0
|
759 |
--pointerToCurrentUtf7Byte;
|
sl@0
|
760 |
inputIsTruncated=ETrue;
|
sl@0
|
761 |
goto end;
|
sl@0
|
762 |
}
|
sl@0
|
763 |
++pointerToCurrentUtf7Byte;
|
sl@0
|
764 |
currentUtf7Byte=*pointerToCurrentUtf7Byte;
|
sl@0
|
765 |
if (currentUtf7Byte=='-')
|
sl@0
|
766 |
{
|
sl@0
|
767 |
currentUtf7Byte=escapeCharacterForStartingBase64Block;
|
sl@0
|
768 |
}
|
sl@0
|
769 |
else
|
sl@0
|
770 |
{
|
sl@0
|
771 |
currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7);
|
sl@0
|
772 |
if (currentUtf7Byte==KNotInBase64Alphabet)
|
sl@0
|
773 |
{
|
sl@0
|
774 |
return EErrorIllFormedInput;
|
sl@0
|
775 |
}
|
sl@0
|
776 |
bitBuffer=KIsInBase64Block;
|
sl@0
|
777 |
}
|
sl@0
|
778 |
}
|
sl@0
|
779 |
if (bitBuffer&KIsInBase64Block)
|
sl@0
|
780 |
{
|
sl@0
|
781 |
FOREVER
|
sl@0
|
782 |
{
|
sl@0
|
783 |
__ASSERT_DEBUG(currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7), Panic(EPanicOutOfSyncBase64Decoding));
|
sl@0
|
784 |
__ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState12));
|
sl@0
|
785 |
if (currentUtf7Byte==KNotInBase64Alphabet)
|
sl@0
|
786 |
{
|
sl@0
|
787 |
if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer))
|
sl@0
|
788 |
{
|
sl@0
|
789 |
return EErrorIllFormedInput;
|
sl@0
|
790 |
}
|
sl@0
|
791 |
bitBuffer=0;
|
sl@0
|
792 |
numberOfBitsInBuffer=0;
|
sl@0
|
793 |
currentUtf7Byte=*pointerToCurrentUtf7Byte;
|
sl@0
|
794 |
if (currentUtf7Byte=='-')
|
sl@0
|
795 |
{
|
sl@0
|
796 |
if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
|
sl@0
|
797 |
{
|
sl@0
|
798 |
goto end;
|
sl@0
|
799 |
}
|
sl@0
|
800 |
++pointerToCurrentUtf7Byte;
|
sl@0
|
801 |
currentUtf7Byte=*pointerToCurrentUtf7Byte;
|
sl@0
|
802 |
}
|
sl@0
|
803 |
break;
|
sl@0
|
804 |
}
|
sl@0
|
805 |
bitBuffer<<=6;
|
sl@0
|
806 |
bitBuffer|=currentUtf7Byte;
|
sl@0
|
807 |
bitBuffer|=KIsInBase64Block;
|
sl@0
|
808 |
numberOfBitsInBuffer+=6;
|
sl@0
|
809 |
// only flush the buffer if it contains a whole Unicode character and the remainder is either all zero-bits (hence would be a legal point to end the base-64 sequence) or at least 6 bits long (therefore would leave at least one UTF-7 byte unconverted at the end of the input descriptor)
|
sl@0
|
810 |
if ((numberOfBitsInBuffer>=16+6) || ((numberOfBitsInBuffer>=16) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16)))
|
sl@0
|
811 |
{
|
sl@0
|
812 |
numberOfBitsInBuffer-=16;
|
sl@0
|
813 |
__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers6));
|
sl@0
|
814 |
++pointerToPreviousUnicodeCharacter;
|
sl@0
|
815 |
*pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, bitBuffer>>numberOfBitsInBuffer);
|
sl@0
|
816 |
bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - must be done as bitBuffer is stored along with numberOfBitsInBuffer in aState if the output descriptor runs out of space or if the input descriptor was truncated
|
sl@0
|
817 |
bitBuffer|=KIsInBase64Block; // turn it back on as the line above turned it off
|
sl@0
|
818 |
if (pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)
|
sl@0
|
819 |
{
|
sl@0
|
820 |
goto end;
|
sl@0
|
821 |
}
|
sl@0
|
822 |
}
|
sl@0
|
823 |
if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
|
sl@0
|
824 |
{
|
sl@0
|
825 |
inputIsTruncated=ETrue;
|
sl@0
|
826 |
goto end;
|
sl@0
|
827 |
}
|
sl@0
|
828 |
++pointerToCurrentUtf7Byte;
|
sl@0
|
829 |
currentUtf7Byte=Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7);
|
sl@0
|
830 |
}
|
sl@0
|
831 |
}
|
sl@0
|
832 |
else
|
sl@0
|
833 |
{
|
sl@0
|
834 |
__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers7));
|
sl@0
|
835 |
++pointerToPreviousUnicodeCharacter;
|
sl@0
|
836 |
*pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, currentUtf7Byte);
|
sl@0
|
837 |
if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte))
|
sl@0
|
838 |
{
|
sl@0
|
839 |
goto end;
|
sl@0
|
840 |
}
|
sl@0
|
841 |
++pointerToCurrentUtf7Byte;
|
sl@0
|
842 |
currentUtf7Byte=*pointerToCurrentUtf7Byte;
|
sl@0
|
843 |
}
|
sl@0
|
844 |
}
|
sl@0
|
845 |
end:
|
sl@0
|
846 |
if (bitBuffer&KIsInBase64Block)
|
sl@0
|
847 |
{
|
sl@0
|
848 |
__ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState13));
|
sl@0
|
849 |
if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer))
|
sl@0
|
850 |
{
|
sl@0
|
851 |
// rewind how far we've got in the UTF-7 descriptor to indicate to the user (by returning a value greater than zero) that not all of the input could be converted as it ended with a truncated base-64 sequence
|
sl@0
|
852 |
__ASSERT_DEBUG(numberOfBitsInBuffer>=6, Panic(EPanicBadBitBufferState14));
|
sl@0
|
853 |
pointerToCurrentUtf7Byte-=numberOfBitsInBuffer/6;
|
sl@0
|
854 |
const TInt newNumberOfBitsInBuffer=numberOfBitsInBuffer%6;
|
sl@0
|
855 |
bitBuffer&=~KIsInBase64Block; // temporarily turn off the KIsInBase64Block for the right-shift
|
sl@0
|
856 |
bitBuffer>>=(numberOfBitsInBuffer-newNumberOfBitsInBuffer);
|
sl@0
|
857 |
bitBuffer|=KIsInBase64Block; // must be turned back on again as the bit-buffer is packed into aState
|
sl@0
|
858 |
numberOfBitsInBuffer=newNumberOfBitsInBuffer;
|
sl@0
|
859 |
__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState15));
|
sl@0
|
860 |
}
|
sl@0
|
861 |
__ASSERT_DEBUG((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0), Panic(EPanicBadBitBufferState16));
|
sl@0
|
862 |
aState=STATIC_CAST(TInt, bitBuffer);
|
sl@0
|
863 |
aState|=(numberOfBitsInBuffer<<4);
|
sl@0
|
864 |
__ASSERT_DEBUG(aState&KIsInBase64Block, Panic(EPanicBadBitBufferState17));
|
sl@0
|
865 |
bitBuffer=0;
|
sl@0
|
866 |
numberOfBitsInBuffer=0;
|
sl@0
|
867 |
}
|
sl@0
|
868 |
if ((pointerToCurrentUtf7Byte<aUtf7.Ptr()) && inputIsTruncated)
|
sl@0
|
869 |
{
|
sl@0
|
870 |
return EErrorIllFormedInput;
|
sl@0
|
871 |
}
|
sl@0
|
872 |
aUnicode.SetLength((pointerToPreviousUnicodeCharacter+1)-aUnicode.Ptr());
|
sl@0
|
873 |
return pointerToLastUtf7Byte-pointerToCurrentUtf7Byte;
|
sl@0
|
874 |
}
|
sl@0
|
875 |
|
sl@0
|
876 |
|
sl@0
|
877 |
|
sl@0
|
878 |
/** Converts text encoded using the Unicode transformation format UTF-8
|
sl@0
|
879 |
into the Unicode UCS-2 character set. This function leaves with an
|
sl@0
|
880 |
error code of the input string is corrupted.
|
sl@0
|
881 |
|
sl@0
|
882 |
@param aUtf8 The UTF-8 encoded input string
|
sl@0
|
883 |
@return A pointer to an HBufC16 with the converted Unicode string. */
|
sl@0
|
884 |
EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf8L(const TDesC8& aUtf8)
|
sl@0
|
885 |
{
|
sl@0
|
886 |
// If aUtf8 is an empty string return
|
sl@0
|
887 |
if (aUtf8.Length()==0)
|
sl@0
|
888 |
{
|
sl@0
|
889 |
HBufC16* hBuf = HBufC16::NewL(1);
|
sl@0
|
890 |
return hBuf;
|
sl@0
|
891 |
}
|
sl@0
|
892 |
|
sl@0
|
893 |
// else convert aUtf8 to Unicode storing the result in a buffer, reallocating
|
sl@0
|
894 |
// it when needed.
|
sl@0
|
895 |
TInt length = aUtf8.Length();
|
sl@0
|
896 |
const TInt bufsize = 100;
|
sl@0
|
897 |
|
sl@0
|
898 |
TPtrC8 utf8 (aUtf8);
|
sl@0
|
899 |
TBuf<bufsize> buf;
|
sl@0
|
900 |
HBufC16* hBuf = HBufC16::NewLC(length);
|
sl@0
|
901 |
TPtr unicode = hBuf->Des();
|
sl@0
|
902 |
|
sl@0
|
903 |
FOREVER
|
sl@0
|
904 |
{
|
sl@0
|
905 |
TInt unconverted = ConvertToUnicodeFromUtf8(buf, utf8);
|
sl@0
|
906 |
if( unconverted == EErrorIllFormedInput || unconverted < 0)
|
sl@0
|
907 |
User::Leave(KErrCorrupt);
|
sl@0
|
908 |
|
sl@0
|
909 |
if (unicode.Length() + buf.Length() > unicode.MaxLength())
|
sl@0
|
910 |
{
|
sl@0
|
911 |
// Reallocate hBuf
|
sl@0
|
912 |
hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length());
|
sl@0
|
913 |
CleanupStack::Pop();
|
sl@0
|
914 |
CleanupStack::PushL(hBuf);
|
sl@0
|
915 |
unicode.Set(hBuf->Des());
|
sl@0
|
916 |
}
|
sl@0
|
917 |
unicode.Append(buf);
|
sl@0
|
918 |
if (unconverted ==0)
|
sl@0
|
919 |
break;
|
sl@0
|
920 |
utf8.Set(utf8.Right(unconverted));
|
sl@0
|
921 |
}
|
sl@0
|
922 |
CleanupStack::Pop();
|
sl@0
|
923 |
return hBuf;
|
sl@0
|
924 |
}
|
sl@0
|
925 |
|
sl@0
|
926 |
/** Converts text encoded using the Unicode transformation format UTF-8 into the
|
sl@0
|
927 |
Unicode UCS-2 character set.
|
sl@0
|
928 |
|
sl@0
|
929 |
@param aUnicode On return, contains the Unicode encoded output string.
|
sl@0
|
930 |
@param aUtf8 The UTF-8 encoded input string
|
sl@0
|
931 |
@return The number of unconverted bytes left at the end of the input descriptor,
|
sl@0
|
932 |
or one of the error values defined in TError. */
|
sl@0
|
933 |
EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
|
sl@0
|
934 |
{
|
sl@0
|
935 |
return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
|
sl@0
|
936 |
}
|
sl@0
|
937 |
|
sl@0
|
938 |
static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
|
sl@0
|
939 |
TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
|
sl@0
|
940 |
{
|
sl@0
|
941 |
if (aNumberOfUnconvertibleCharacters<=0)
|
sl@0
|
942 |
{
|
sl@0
|
943 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
|
sl@0
|
944 |
}
|
sl@0
|
945 |
++aNumberOfUnconvertibleCharacters;
|
sl@0
|
946 |
}
|
sl@0
|
947 |
|
sl@0
|
948 |
/** Converts text encoded using the Unicode transformation format UTF-8 into the
|
sl@0
|
949 |
Unicode UCS-2 character set.
|
sl@0
|
950 |
|
sl@0
|
951 |
@param aUnicode On return, contains the Unicode encoded output string.
|
sl@0
|
952 |
@param aUtf8 The UTF-8 encoded input string
|
sl@0
|
953 |
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
|
sl@0
|
954 |
@return The number of unconverted bytes left at the end of the input descriptor,
|
sl@0
|
955 |
or one of the error values defined in TError. */
|
sl@0
|
956 |
TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
|
sl@0
|
957 |
{
|
sl@0
|
958 |
TInt dummyUnconverted, dummyUnconvertedIndex;
|
sl@0
|
959 |
return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
|
sl@0
|
960 |
}
|
sl@0
|
961 |
|
sl@0
|
962 |
/** Converts text encoded using the Unicode transformation format UTF-8 into the
|
sl@0
|
963 |
Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
|
sl@0
|
964 |
|
sl@0
|
965 |
The variant of UTF-8 used internally by Java differs slightly from standard
|
sl@0
|
966 |
UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
|
sl@0
|
967 |
|
sl@0
|
968 |
@param aUnicode On return, contains the Unicode encoded output string.
|
sl@0
|
969 |
@param aUtf8 The UTF-8 encoded input string
|
sl@0
|
970 |
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
|
sl@0
|
971 |
UTF-8. The default is EFalse.
|
sl@0
|
972 |
@param aNumberOfUnconvertibleCharacters On return, contains the number of bytes
|
sl@0
|
973 |
which were not converted.
|
sl@0
|
974 |
@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index
|
sl@0
|
975 |
of the first byte of the first unconvertible character. For instance if the
|
sl@0
|
976 |
first character in the input descriptor (aForeign) could not be converted,
|
sl@0
|
977 |
then this parameter is set to the first byte of that character, i.e. zero.
|
sl@0
|
978 |
A negative value is returned if all the characters were converted.
|
sl@0
|
979 |
@return The number of unconverted bytes left at the end of the input descriptor,
|
sl@0
|
980 |
or one of the error values defined in TError. */
|
sl@0
|
981 |
|
sl@0
|
982 |
/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
|
sl@0
|
983 |
* Well formed UTF-8 Byte Sequences, full table.
|
sl@0
|
984 |
* +----------------------------------------------------------------+
|
sl@0
|
985 |
* | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
|
sl@0
|
986 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
987 |
* | U+0000..U+007F | 00..7D | | | | 1 byte, ascii
|
sl@0
|
988 |
* | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2
|
sl@0
|
989 |
* | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
|
sl@0
|
990 |
* | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal
|
sl@0
|
991 |
* | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
|
sl@0
|
992 |
* | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal
|
sl@0
|
993 |
* | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
|
sl@0
|
994 |
* | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal
|
sl@0
|
995 |
* | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
|
sl@0
|
996 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
997 |
*
|
sl@0
|
998 |
* As a consequence of the well-formedness conditions specified in table 3-7,
|
sl@0
|
999 |
* the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
|
sl@0
|
1000 |
*/
|
sl@0
|
1001 |
TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
|
sl@0
|
1002 |
TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
|
sl@0
|
1003 |
{
|
sl@0
|
1004 |
aUnicode.SetLength(0);
|
sl@0
|
1005 |
|
sl@0
|
1006 |
if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
|
sl@0
|
1007 |
{
|
sl@0
|
1008 |
return aUtf8.Length();
|
sl@0
|
1009 |
}
|
sl@0
|
1010 |
|
sl@0
|
1011 |
TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
|
sl@0
|
1012 |
const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
|
sl@0
|
1013 |
const TUint8* pUtf8 = aUtf8.Ptr();
|
sl@0
|
1014 |
const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
|
sl@0
|
1015 |
const TUint16 replacementcharacter = 0xFFFD;
|
sl@0
|
1016 |
TUint currentUnicodeCharacter;
|
sl@0
|
1017 |
TInt sequenceLength;
|
sl@0
|
1018 |
|
sl@0
|
1019 |
|
sl@0
|
1020 |
FOREVER
|
sl@0
|
1021 |
{
|
sl@0
|
1022 |
TBool illFormed=EFalse;
|
sl@0
|
1023 |
|
sl@0
|
1024 |
__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
|
sl@0
|
1025 |
__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
|
sl@0
|
1026 |
|
sl@0
|
1027 |
sequenceLength = 1;
|
sl@0
|
1028 |
|
sl@0
|
1029 |
// ascii - optimisation (i.e. it isn't a sequence)
|
sl@0
|
1030 |
if (pUtf8[0] < 0x80)
|
sl@0
|
1031 |
{
|
sl@0
|
1032 |
currentUnicodeCharacter = pUtf8[0];
|
sl@0
|
1033 |
}
|
sl@0
|
1034 |
else
|
sl@0
|
1035 |
{
|
sl@0
|
1036 |
// see if well formed utf-8, use table above for reference
|
sl@0
|
1037 |
if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
|
sl@0
|
1038 |
{
|
sl@0
|
1039 |
// 0xc1-0xc2 are not valid bytes
|
sl@0
|
1040 |
sequenceLength = 2;
|
sl@0
|
1041 |
}
|
sl@0
|
1042 |
else if ((pUtf8[0] & 0xf0) == 0xe0)
|
sl@0
|
1043 |
{
|
sl@0
|
1044 |
sequenceLength = 3;
|
sl@0
|
1045 |
}
|
sl@0
|
1046 |
else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
|
sl@0
|
1047 |
{
|
sl@0
|
1048 |
// 0xf5-0xff, are not valid bytes
|
sl@0
|
1049 |
sequenceLength = 4;
|
sl@0
|
1050 |
}
|
sl@0
|
1051 |
else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
|
sl@0
|
1052 |
{
|
sl@0
|
1053 |
if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
|
sl@0
|
1054 |
{
|
sl@0
|
1055 |
// either we've split the 0xc0 0x80 (i.e. 0xc0 is
|
sl@0
|
1056 |
// the last character in the string) or we've
|
sl@0
|
1057 |
// discovered a valid 0xc0 0x80 sequence.
|
sl@0
|
1058 |
sequenceLength = 2;
|
sl@0
|
1059 |
}
|
sl@0
|
1060 |
}
|
sl@0
|
1061 |
|
sl@0
|
1062 |
/* checking to see if we got a valid sequence */
|
sl@0
|
1063 |
if (sequenceLength == 1)
|
sl@0
|
1064 |
{
|
sl@0
|
1065 |
// bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example
|
sl@0
|
1066 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
1067 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
1068 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
1069 |
}
|
sl@0
|
1070 |
else
|
sl@0
|
1071 |
{
|
sl@0
|
1072 |
// this is a check to see if the sequence goes beyond the input
|
sl@0
|
1073 |
// stream. if its not the first and only character in the input
|
sl@0
|
1074 |
// stream this isn't an error, otherwise it is.
|
sl@0
|
1075 |
if ((pUtf8 + sequenceLength - 1) > pLastUtf8)
|
sl@0
|
1076 |
{
|
sl@0
|
1077 |
// check to see if this sequence was the first character
|
sl@0
|
1078 |
if ((pUnicode - aUnicode.Ptr()) == 0)
|
sl@0
|
1079 |
{
|
sl@0
|
1080 |
return EErrorIllFormedInput;
|
sl@0
|
1081 |
}
|
sl@0
|
1082 |
break;
|
sl@0
|
1083 |
}
|
sl@0
|
1084 |
|
sl@0
|
1085 |
currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
|
sl@0
|
1086 |
|
sl@0
|
1087 |
/* check the trailing bytes, they should begin with 10 */
|
sl@0
|
1088 |
TUint i = 1;
|
sl@0
|
1089 |
|
sl@0
|
1090 |
do
|
sl@0
|
1091 |
{
|
sl@0
|
1092 |
if ((pUtf8[i] & 0xc0) == 0x80)
|
sl@0
|
1093 |
{
|
sl@0
|
1094 |
// add the trailing 6 bits to the current unicode char
|
sl@0
|
1095 |
currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
|
sl@0
|
1096 |
}
|
sl@0
|
1097 |
else
|
sl@0
|
1098 |
{
|
sl@0
|
1099 |
// ill formed character (doesn't have a lead 10)
|
sl@0
|
1100 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
1101 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
1102 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
1103 |
illFormed=ETrue;
|
sl@0
|
1104 |
break;
|
sl@0
|
1105 |
}
|
sl@0
|
1106 |
i++;
|
sl@0
|
1107 |
}
|
sl@0
|
1108 |
while (i < sequenceLength);
|
sl@0
|
1109 |
}
|
sl@0
|
1110 |
|
sl@0
|
1111 |
/* conformance check. bits of above table for reference.
|
sl@0
|
1112 |
* +----------------------------------------------------------------+
|
sl@0
|
1113 |
* | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
|
sl@0
|
1114 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
1115 |
* | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0
|
sl@0
|
1116 |
* | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F
|
sl@0
|
1117 |
* | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90
|
sl@0
|
1118 |
* | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F
|
sl@0
|
1119 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
1120 |
*/
|
sl@0
|
1121 |
|
sl@0
|
1122 |
if (currentUnicodeCharacter != replacementcharacter)
|
sl@0
|
1123 |
{
|
sl@0
|
1124 |
if (sequenceLength == 3)
|
sl@0
|
1125 |
{
|
sl@0
|
1126 |
if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
|
sl@0
|
1127 |
{
|
sl@0
|
1128 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
1129 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
1130 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
1131 |
illFormed=ETrue;
|
sl@0
|
1132 |
}
|
sl@0
|
1133 |
else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
|
sl@0
|
1134 |
{
|
sl@0
|
1135 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
1136 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
1137 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
1138 |
illFormed=ETrue;
|
sl@0
|
1139 |
}
|
sl@0
|
1140 |
}
|
sl@0
|
1141 |
else if (sequenceLength == 4)
|
sl@0
|
1142 |
{
|
sl@0
|
1143 |
if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
|
sl@0
|
1144 |
{
|
sl@0
|
1145 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
1146 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
1147 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
1148 |
illFormed=ETrue;
|
sl@0
|
1149 |
}
|
sl@0
|
1150 |
else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
|
sl@0
|
1151 |
{
|
sl@0
|
1152 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
1153 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
1154 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
1155 |
illFormed=ETrue;
|
sl@0
|
1156 |
}
|
sl@0
|
1157 |
}
|
sl@0
|
1158 |
|
sl@0
|
1159 |
|
sl@0
|
1160 |
/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
|
sl@0
|
1161 |
* are not Unicode scalar values, any UTF-8 byte sequence that would map to code
|
sl@0
|
1162 |
* points D800..DFFF is ill formed */
|
sl@0
|
1163 |
|
sl@0
|
1164 |
if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
|
sl@0
|
1165 |
{
|
sl@0
|
1166 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
1167 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
1168 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
1169 |
illFormed=ETrue;
|
sl@0
|
1170 |
}
|
sl@0
|
1171 |
}
|
sl@0
|
1172 |
// end conformance check
|
sl@0
|
1173 |
}
|
sl@0
|
1174 |
|
sl@0
|
1175 |
// would this character generate a surrogate pair in UTF-16?
|
sl@0
|
1176 |
if (currentUnicodeCharacter > 0xFFFF)
|
sl@0
|
1177 |
{
|
sl@0
|
1178 |
// is there enough space to hold a surrogate pair in the output?
|
sl@0
|
1179 |
if (pUnicode >= pLastUnicode)
|
sl@0
|
1180 |
{
|
sl@0
|
1181 |
break; // no, end processing.
|
sl@0
|
1182 |
}
|
sl@0
|
1183 |
|
sl@0
|
1184 |
TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
|
sl@0
|
1185 |
*pUnicode++ = STATIC_CAST(TUint16, surrogate);
|
sl@0
|
1186 |
|
sl@0
|
1187 |
surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
|
sl@0
|
1188 |
*pUnicode++ = STATIC_CAST(TUint16, surrogate);
|
sl@0
|
1189 |
}
|
sl@0
|
1190 |
else
|
sl@0
|
1191 |
{
|
sl@0
|
1192 |
*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
|
sl@0
|
1193 |
}
|
sl@0
|
1194 |
|
sl@0
|
1195 |
// move the input pointer
|
sl@0
|
1196 |
if (currentUnicodeCharacter != replacementcharacter)
|
sl@0
|
1197 |
{
|
sl@0
|
1198 |
pUtf8 += sequenceLength;
|
sl@0
|
1199 |
}
|
sl@0
|
1200 |
else if(illFormed == EFalse)
|
sl@0
|
1201 |
{
|
sl@0
|
1202 |
pUtf8 += (sequenceLength);
|
sl@0
|
1203 |
}
|
sl@0
|
1204 |
else
|
sl@0
|
1205 |
{
|
sl@0
|
1206 |
// we had a character we didn't recognize (i.e. it was invalid)
|
sl@0
|
1207 |
// so move to the next character in the input
|
sl@0
|
1208 |
pUtf8++;
|
sl@0
|
1209 |
}
|
sl@0
|
1210 |
|
sl@0
|
1211 |
if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
|
sl@0
|
1212 |
{
|
sl@0
|
1213 |
break; // we've either reached the end of the input or the end of output
|
sl@0
|
1214 |
}
|
sl@0
|
1215 |
}
|
sl@0
|
1216 |
|
sl@0
|
1217 |
aUnicode.SetLength(pUnicode - aUnicode.Ptr());
|
sl@0
|
1218 |
return (pLastUtf8 - pUtf8 + 1);
|
sl@0
|
1219 |
}
|
sl@0
|
1220 |
|
sl@0
|
1221 |
/** Given a sample text this function attempts to determine whether or not
|
sl@0
|
1222 |
* the sample text is encoded using the UTF-8 standard encoding scheme.
|
sl@0
|
1223 |
|
sl@0
|
1224 |
@param TInt a confidence level, given at certain value. if the given sample
|
sl@0
|
1225 |
is UTF-8 this value will not be changed (unless > 100) then its
|
sl@0
|
1226 |
set to 100. Otherwise, if the sample isn't UTF-8, it is set to 0.
|
sl@0
|
1227 |
@param TDesC8 sample text.
|
sl@0
|
1228 |
UTF-8. The default is EFalse.
|
sl@0
|
1229 |
@return void
|
sl@0
|
1230 |
*/
|
sl@0
|
1231 |
|
sl@0
|
1232 |
/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
|
sl@0
|
1233 |
* Well formed UTF-8 Byte Sequences, full table.
|
sl@0
|
1234 |
* +----------------------------------------------------------------+
|
sl@0
|
1235 |
* | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
|
sl@0
|
1236 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
1237 |
* | U+0000..U+007F | 00..7F | | | | 1 byte, ascii
|
sl@0
|
1238 |
* | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2
|
sl@0
|
1239 |
* | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
|
sl@0
|
1240 |
* | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal
|
sl@0
|
1241 |
* | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
|
sl@0
|
1242 |
* | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal
|
sl@0
|
1243 |
* | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
|
sl@0
|
1244 |
* | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal
|
sl@0
|
1245 |
* | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
|
sl@0
|
1246 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
1247 |
*
|
sl@0
|
1248 |
* As a consequence of the well-formedness conditions specified in table 3-7,
|
sl@0
|
1249 |
* the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
|
sl@0
|
1250 |
*
|
sl@0
|
1251 |
* Code Rules:
|
sl@0
|
1252 |
* R1: If the string contains any non-UTF-8 characters the returned confidence
|
sl@0
|
1253 |
* is 0. Valid UTF-8 combinations are listed in the above table.
|
sl@0
|
1254 |
* R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in
|
sl@0
|
1255 |
* its first three bytes (EF BB BF), the returned confidence is 95.
|
sl@0
|
1256 |
* R3: Otherwise the confidence returned is based upon the sample string
|
sl@0
|
1257 |
* length.
|
sl@0
|
1258 |
* R4: If the sample string is under 75 characters, the confidence is set to
|
sl@0
|
1259 |
* 75.
|
sl@0
|
1260 |
*/
|
sl@0
|
1261 |
GLREF_C void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
|
sl@0
|
1262 |
{
|
sl@0
|
1263 |
|
sl@0
|
1264 |
TInt sampleLength = aSample.Length();
|
sl@0
|
1265 |
|
sl@0
|
1266 |
if (sampleLength == 0)
|
sl@0
|
1267 |
{
|
sl@0
|
1268 |
aConfidenceLevel = 89;
|
sl@0
|
1269 |
return;
|
sl@0
|
1270 |
}
|
sl@0
|
1271 |
TInt bytesRemaining = 0;
|
sl@0
|
1272 |
TInt sequenceLength = 0;
|
sl@0
|
1273 |
|
sl@0
|
1274 |
aConfidenceLevel = sampleLength;
|
sl@0
|
1275 |
|
sl@0
|
1276 |
const TUint8* buffer = &aSample[0];
|
sl@0
|
1277 |
|
sl@0
|
1278 |
if (sampleLength < 95)
|
sl@0
|
1279 |
{
|
sl@0
|
1280 |
// check for the BOM
|
sl@0
|
1281 |
if ((sampleLength >= 3) &&
|
sl@0
|
1282 |
((buffer[0] == 0xEF) &&
|
sl@0
|
1283 |
(buffer[1] == 0xBB) &&
|
sl@0
|
1284 |
(buffer[2] == 0xBF))
|
sl@0
|
1285 |
)
|
sl@0
|
1286 |
{
|
sl@0
|
1287 |
aConfidenceLevel = 95;
|
sl@0
|
1288 |
}
|
sl@0
|
1289 |
else if (sampleLength < 75)
|
sl@0
|
1290 |
{
|
sl@0
|
1291 |
aConfidenceLevel = 75;
|
sl@0
|
1292 |
}
|
sl@0
|
1293 |
}
|
sl@0
|
1294 |
|
sl@0
|
1295 |
for (TInt index = 0;index != sampleLength;index++)
|
sl@0
|
1296 |
{
|
sl@0
|
1297 |
|
sl@0
|
1298 |
if (bytesRemaining > 0)
|
sl@0
|
1299 |
{
|
sl@0
|
1300 |
// bytesRemaining > 0, means that a byte representing the start of a
|
sl@0
|
1301 |
// multibyte sequence was encountered and the bytesRemaining is the
|
sl@0
|
1302 |
// number of bytes to follow.
|
sl@0
|
1303 |
|
sl@0
|
1304 |
if ((buffer[index] & 0xc0) == 0x80)
|
sl@0
|
1305 |
{
|
sl@0
|
1306 |
// need to check for ill-formed sequences -- all are in the 2nd byte
|
sl@0
|
1307 |
|
sl@0
|
1308 |
if ((sequenceLength == 3) && (bytesRemaining == 2))
|
sl@0
|
1309 |
{
|
sl@0
|
1310 |
if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
|
sl@0
|
1311 |
{
|
sl@0
|
1312 |
aConfidenceLevel = 0;
|
sl@0
|
1313 |
break;
|
sl@0
|
1314 |
}
|
sl@0
|
1315 |
else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
|
sl@0
|
1316 |
{
|
sl@0
|
1317 |
aConfidenceLevel = 0;
|
sl@0
|
1318 |
break;
|
sl@0
|
1319 |
}
|
sl@0
|
1320 |
}
|
sl@0
|
1321 |
else if ((sequenceLength == 4) && (bytesRemaining == 3))
|
sl@0
|
1322 |
{
|
sl@0
|
1323 |
if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
|
sl@0
|
1324 |
{
|
sl@0
|
1325 |
aConfidenceLevel = 0;
|
sl@0
|
1326 |
break;
|
sl@0
|
1327 |
}
|
sl@0
|
1328 |
else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
|
sl@0
|
1329 |
{
|
sl@0
|
1330 |
aConfidenceLevel = 0;
|
sl@0
|
1331 |
break;
|
sl@0
|
1332 |
}
|
sl@0
|
1333 |
}
|
sl@0
|
1334 |
|
sl@0
|
1335 |
--bytesRemaining;
|
sl@0
|
1336 |
continue;
|
sl@0
|
1337 |
}
|
sl@0
|
1338 |
else
|
sl@0
|
1339 |
{
|
sl@0
|
1340 |
aConfidenceLevel = 0;
|
sl@0
|
1341 |
break;
|
sl@0
|
1342 |
}
|
sl@0
|
1343 |
}
|
sl@0
|
1344 |
|
sl@0
|
1345 |
if (bytesRemaining == 0)
|
sl@0
|
1346 |
{
|
sl@0
|
1347 |
if (buffer[index] < 0x80)
|
sl@0
|
1348 |
{
|
sl@0
|
1349 |
// The value of aSample[index] is in the range 0x00-0x7f
|
sl@0
|
1350 |
//UTF8 maintains ASCII transparency. So it's a valid
|
sl@0
|
1351 |
//UTF8. Do nothing, check next value.
|
sl@0
|
1352 |
continue;
|
sl@0
|
1353 |
}
|
sl@0
|
1354 |
else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
|
sl@0
|
1355 |
{
|
sl@0
|
1356 |
// valid start of a 2 byte sequence (see conformance note)
|
sl@0
|
1357 |
sequenceLength = 2;
|
sl@0
|
1358 |
bytesRemaining = 1;
|
sl@0
|
1359 |
}
|
sl@0
|
1360 |
else if ((buffer[index] & 0xf0) == 0xe0)
|
sl@0
|
1361 |
{
|
sl@0
|
1362 |
// valid start of a 3 byte sequence
|
sl@0
|
1363 |
sequenceLength = 3;
|
sl@0
|
1364 |
bytesRemaining = 2;
|
sl@0
|
1365 |
}
|
sl@0
|
1366 |
else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
|
sl@0
|
1367 |
{
|
sl@0
|
1368 |
// valid start of a 4 byte sequence (see conformance note)
|
sl@0
|
1369 |
sequenceLength = 4;
|
sl@0
|
1370 |
bytesRemaining = 3;
|
sl@0
|
1371 |
}
|
sl@0
|
1372 |
else
|
sl@0
|
1373 |
{
|
sl@0
|
1374 |
// wasn't anything expected so must be an illegal/irregular UTF8 coded value
|
sl@0
|
1375 |
aConfidenceLevel = 0;
|
sl@0
|
1376 |
break;
|
sl@0
|
1377 |
}
|
sl@0
|
1378 |
}
|
sl@0
|
1379 |
} // for
|
sl@0
|
1380 |
|
sl@0
|
1381 |
aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
|
sl@0
|
1382 |
}
|
sl@0
|
1383 |
|
sl@0
|
1384 |
GLREF_C void IsCharacterSetUTF7(TInt& aConfidenceLevel, const TDesC8& aSample)
	{
	// Estimates how likely aSample is to be UTF-7 and stores the result
	// (0..100) in aConfidenceLevel.
	//
	// Scoring, starting from 70:
	//  - any byte with the top bit set, or a '~', gives 0 immediately;
	//  - 0x1b followed by one of the bytes in smsExtensionTable costs 10;
	//  - '+' followed by a modified base64 character or '-' adds 5, '+'
	//    followed by anything else costs 15; the following byte is then skipped.
	// A 0x1b or '+' that is the very last byte of the sample has no effect.
	const TInt sampleLength = aSample.Length();
	aConfidenceLevel = 70;

	for (TInt i = 0; i < sampleLength; ++i)
		{
		const TInt current = aSample[i];

		// UTF-7 value ranges only 7 bits
		if ((current & 0x80) != 0x00)
			{
			aConfidenceLevel = 0;
			break;
			}
		// '~' is rejected outright: seeing one means the sample is treated
		// as not UTF-7.
		else if (current == '~')
			{
			aConfidenceLevel = 0;
			break;
			}
		// The SMS7Bit escape char value is 0x1b. Reduce confidence if the
		// byte after it is one of the values listed below.
		else if ((current == 0x1b) && (i < sampleLength - 1))
			{
			static const TInt smsExtensionTable[11] =
				{0x0a, 0x14, 0x1b, 0x28, 0x29, 0x2f, 0x3c, 0x3d, 0x3e, 0x40, 0x65};
			const TInt next = aSample[i + 1];	// in range: i < sampleLength-1
			for (TInt j = 0; j < 11; ++j)
				{
				if (next == smsExtensionTable[j])
					{
					aConfidenceLevel -= 10;
					break;	// table entries are unique, at most one can match
					}
				}
			}
		// The UTF-7 escape char is 0x2b ('+'). The value following it must
		// belong to the modified base64 alphabet (A-Z, a-z, 0-9, '+', '/')
		// or be '-', else it is an ill-formed sequence, so probably not UTF-7.
		else if ((current == 0x2b) && (i < sampleLength - 1))
			{
			const TInt next = aSample[i + 1];
			const TInt isBase64OrDash =
				(next == 0x2b) || (next == 0x2d) || (next == 0x2f) ||
				((next >= 0x30) && (next <= 0x39)) ||	// digits are base64 characters too
				((next >= 0x41) && (next <= 0x5a)) ||
				((next >= 0x61) && (next <= 0x7a));
			if (isBase64OrDash)
				{
				aConfidenceLevel += 5;
				}
			else
				{
				aConfidenceLevel -= 15;
				}
			i++; // the byte after '+' has been examined; do not rescan it
			}
		} //for

	// Clamp the result to the range 0..100.
	aConfidenceLevel = (aConfidenceLevel > 0) ? ((aConfidenceLevel > 100) ? 100 : aConfidenceLevel) : 0;
	}
|