sl@0
|
1 |
/*
|
sl@0
|
2 |
* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
|
sl@0
|
3 |
* All rights reserved.
|
sl@0
|
4 |
* This component and the accompanying materials are made available
|
sl@0
|
5 |
* under the terms of the License "Eclipse Public License v1.0"
|
sl@0
|
6 |
* which accompanies this distribution, and is available
|
sl@0
|
7 |
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
|
sl@0
|
8 |
*
|
sl@0
|
9 |
* Initial Contributors:
|
sl@0
|
10 |
* Nokia Corporation - initial contribution.
|
sl@0
|
11 |
*
|
sl@0
|
12 |
* Contributors:
|
sl@0
|
13 |
*
|
sl@0
|
14 |
* Description:
|
sl@0
|
15 |
*
|
sl@0
|
16 |
*/
|
sl@0
|
17 |
|
sl@0
|
18 |
|
sl@0
|
19 |
#include <e32std.h>
|
sl@0
|
20 |
#include <e32base.h>
|
sl@0
|
21 |
#include <utf.h>
|
sl@0
|
22 |
|
sl@0
|
23 |
#define STATIC_CAST(t,v) static_cast<t>(v)
|
sl@0
|
24 |
#define CONST_CAST(t,v) const_cast<t>(v)
|
sl@0
|
25 |
#define FOREVER for(;;)
|
sl@0
|
26 |
|
sl@0
|
27 |
const TUint KNotInBase64Alphabet=KMaxTUint;
|
sl@0
|
28 |
|
sl@0
|
29 |
/** Panic reasons used by this file; each value is handed to Panic() as the
panic number, so the numeric values must not change.

The first enumerator of every group carries its explicit value so that an
accidental insertion or removal inside a group is easy to spot. */
enum TPanic
	{
	// base64 helper
	EPanicBad6BitNumber=1,

	// UTF-7 buffer pointer invariants
	EPanicBadUtf7Pointers1=2,
	EPanicBadUtf7Pointers2,
	EPanicBadUtf7Pointers3,
	EPanicBadUtf7Pointers4,
	EPanicBadUtf7Pointers5,
	EPanicBadUtf7Pointers6,
	EPanicBadUtf7Pointers7,
	EPanicBadUtf7Pointers8,
	EPanicBadUtf7Pointers9,
	EPanicBadUtf7Pointers10,
	EPanicBadUtf7Pointers11,

	EPanicNotInBase64Block=13,

	// Unicode (UTF-16) buffer pointer invariants
	EPanicBadUnicodePointers1=14,
	EPanicBadUnicodePointers2,
	EPanicBadUnicodePointers3,
	EPanicBadUnicodePointers4,
	EPanicBadUnicodePointers5,
	EPanicBadUnicodePointers6,
	EPanicBadUnicodePointers7,
	EPanicBadUnicodePointers8,
	EPanicBadUnicodePointers9,
	EPanicBadUnicodePointers10,

	// bit-buffer state invariants
	EPanicBadBitBufferState1=24,
	EPanicBadBitBufferState2,
	EPanicBadBitBufferState3,
	EPanicBadBitBufferState4,
	EPanicBadBitBufferState5,
	EPanicBadBitBufferState6,
	EPanicBadBitBufferState7,
	EPanicBadBitBufferState8,
	EPanicBadBitBufferState9,
	EPanicBadBitBufferState10,
	EPanicBadBitBufferState11,
	EPanicBadBitBufferState12,
	EPanicBadBitBufferState13,
	EPanicBadBitBufferState14,
	EPanicBadBitBufferState15,
	EPanicBadBitBufferState16,
	EPanicBadBitBufferState17,

	EPanicUnexpectedNumberOfLoopIterations=41,
	EPanicInitialEscapeCharacterButNoBase64,
	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,

	// UTF-8 buffer pointer invariants
	EPanicBadUtf8Pointers1=44,
	EPanicBadUtf8Pointers2,
	EPanicBadUtf8Pointers3,
	EPanicBadUtf8Pointers4,
	EPanicBadUtf8Pointers5,
	EPanicBadUtf8Pointers6,
	EPanicBadUtf8Pointers7,

	// decoder synchronisation
	EPanicOutOfSyncUtf7Byte1=51,
	EPanicOutOfSyncUtf7Byte2,
	EPanicOutOfSyncBase64Decoding
	};
|
sl@0
|
85 |
|
sl@0
|
86 |
_LIT(KLitPanicText, "CHARCONV-UTF");
|
sl@0
|
87 |
|
sl@0
|
88 |
LOCAL_C void Panic(TPanic aPanic)
|
sl@0
|
89 |
{
|
sl@0
|
90 |
User::Panic(KLitPanicText, aPanic);
|
sl@0
|
91 |
}
|
sl@0
|
92 |
|
sl@0
|
93 |
/** Returns the character that opens a base64 block.

@param aIsImapUtf7 Selects which escape character is returned.
@return '&' when aIsImapUtf7 is true, '+' otherwise. */
inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7)
	{
	if (aIsImapUtf7)
		{
		return '&';
		}
	return '+';
	}
|
sl@0
|
94 |
|
sl@0
|
95 |
/** Tests whether any of the low-order bits of a bit buffer are set.

@param aBitBuffer The buffer to inspect.
@param aNumberOfBitsInBuffer How many low-order bits of aBitBuffer to inspect.
@return Non-zero if at least one of those bits is set. */
inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
	{
	// mask with the lowest aNumberOfBitsInBuffer bits set
	const TUint lowBitsMask = (1<<aNumberOfBitsInBuffer)-1;
	const TUint bitsOfInterest = aBitBuffer & lowBitsMask;
	return bitsOfInterest != 0;
	}
|
sl@0
|
99 |
|
sl@0
|
100 |
|
sl@0
|
101 |
|
sl@0
|
102 |
|
sl@0
|
103 |
|
sl@0
|
104 |
|
sl@0
|
105 |
|
sl@0
|
106 |
|
sl@0
|
107 |
/** Converts Unicode text into UTF-8 encoding.
|
sl@0
|
108 |
|
sl@0
|
109 |
@param aUtf8 On return, contains the UTF-8 encoded output string.
|
sl@0
|
110 |
@param aUnicode The Unicode-encoded input string.
|
sl@0
|
111 |
@return The number of unconverted characters left at the end of the input
|
sl@0
|
112 |
descriptor, or one of the error values defined in TError. */
|
sl@0
|
113 |
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
|
sl@0
|
114 |
{
|
sl@0
|
115 |
return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
|
sl@0
|
116 |
}
|
sl@0
|
117 |
|
sl@0
|
118 |
|
sl@0
|
119 |
|
sl@0
|
120 |
/** Converts Unicode text into UTF-8 encoding.
|
sl@0
|
121 |
|
sl@0
|
122 |
Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.
|
sl@0
|
123 |
|
sl@0
|
124 |
The variant of UTF-8 used internally by Java differs slightly from standard
|
sl@0
|
125 |
UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
|
sl@0
|
126 |
|
sl@0
|
127 |
@param aUtf8 On return, contains the UTF-8 encoded output string.
|
sl@0
|
128 |
@param aUnicode A UCS-2 encoded input string.
|
sl@0
|
129 |
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
|
sl@0
|
130 |
UTF-8. The default is EFalse.
|
sl@0
|
131 |
@return The number of unconverted characters left at the end of the input descriptor,
|
sl@0
|
132 |
or one of the error values defined in TError. */
|
sl@0
|
133 |
TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8,
|
sl@0
|
134 |
const TDesC16& aUnicode,
|
sl@0
|
135 |
TBool aGenerateJavaConformantUtf8)
|
sl@0
|
136 |
{
|
sl@0
|
137 |
if (aUnicode.Length() == 0)
|
sl@0
|
138 |
{
|
sl@0
|
139 |
aUtf8.SetLength(0);
|
sl@0
|
140 |
return 0;
|
sl@0
|
141 |
}
|
sl@0
|
142 |
if (aUtf8.MaxLength() == 0)
|
sl@0
|
143 |
{
|
sl@0
|
144 |
return aUnicode.Length();
|
sl@0
|
145 |
}
|
sl@0
|
146 |
|
sl@0
|
147 |
TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
|
sl@0
|
148 |
const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
|
sl@0
|
149 |
TBool inputIsTruncated = EFalse;
|
sl@0
|
150 |
const TUint16* pUnicode = aUnicode.Ptr();
|
sl@0
|
151 |
const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
|
sl@0
|
152 |
|
sl@0
|
153 |
FOREVER
|
sl@0
|
154 |
{
|
sl@0
|
155 |
__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
|
sl@0
|
156 |
__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
|
sl@0
|
157 |
|
sl@0
|
158 |
if (pUnicode[0] < 0x80)
|
sl@0
|
159 |
{
|
sl@0
|
160 |
// ascii - 1 byte
|
sl@0
|
161 |
|
sl@0
|
162 |
// internally java is different since the \x0000 character is
|
sl@0
|
163 |
// translated into \xC0 \x80.
|
sl@0
|
164 |
|
sl@0
|
165 |
if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
|
sl@0
|
166 |
{
|
sl@0
|
167 |
if (pUtf8 == pointerToLastUtf8Byte)
|
sl@0
|
168 |
{
|
sl@0
|
169 |
pUtf8--;
|
sl@0
|
170 |
pUnicode--;
|
sl@0
|
171 |
break;
|
sl@0
|
172 |
}
|
sl@0
|
173 |
*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
|
sl@0
|
174 |
*pUtf8 = STATIC_CAST(TUint8, 0x80);
|
sl@0
|
175 |
}
|
sl@0
|
176 |
else
|
sl@0
|
177 |
{
|
sl@0
|
178 |
*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
|
sl@0
|
179 |
}
|
sl@0
|
180 |
}
|
sl@0
|
181 |
else if (pUnicode[0] < 0x800)
|
sl@0
|
182 |
{
|
sl@0
|
183 |
// U+0080..U+07FF - 2 bytes
|
sl@0
|
184 |
|
sl@0
|
185 |
if (pUtf8 == pointerToLastUtf8Byte)
|
sl@0
|
186 |
{
|
sl@0
|
187 |
pUtf8--;
|
sl@0
|
188 |
pUnicode--;
|
sl@0
|
189 |
break;
|
sl@0
|
190 |
}
|
sl@0
|
191 |
|
sl@0
|
192 |
*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
|
sl@0
|
193 |
*pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
|
sl@0
|
194 |
|
sl@0
|
195 |
}
|
sl@0
|
196 |
|
sl@0
|
197 |
// check to see if we have a surrogate in the stream, surrogates encode code points outside
|
sl@0
|
198 |
// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.
|
sl@0
|
199 |
|
sl@0
|
200 |
else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
|
sl@0
|
201 |
{
|
sl@0
|
202 |
// surrogate pair - 4 bytes in utf-8
|
sl@0
|
203 |
// U+10000..U+10FFFF
|
sl@0
|
204 |
|
sl@0
|
205 |
__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
|
sl@0
|
206 |
// is there enough space to hold the character
|
sl@0
|
207 |
if ((pointerToLastUtf8Byte - pUtf8) < 3)
|
sl@0
|
208 |
{
|
sl@0
|
209 |
pUtf8--;
|
sl@0
|
210 |
pUnicode--;
|
sl@0
|
211 |
break; // no go to the exit condition
|
sl@0
|
212 |
}
|
sl@0
|
213 |
|
sl@0
|
214 |
__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
|
sl@0
|
215 |
if (pUnicode >= pointerToLastUnicodeCharacter)
|
sl@0
|
216 |
{
|
sl@0
|
217 |
pUtf8--;
|
sl@0
|
218 |
pUnicode--;
|
sl@0
|
219 |
inputIsTruncated = ETrue;
|
sl@0
|
220 |
break; // middle of a surrogate pair. go to end condition
|
sl@0
|
221 |
}
|
sl@0
|
222 |
|
sl@0
|
223 |
if ((pUnicode[1] & 0xfc00) != 0xdc00)
|
sl@0
|
224 |
{
|
sl@0
|
225 |
return EErrorIllFormedInput;
|
sl@0
|
226 |
}
|
sl@0
|
227 |
|
sl@0
|
228 |
// convert utf-16 surrogate to utf-32
|
sl@0
|
229 |
TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
|
sl@0
|
230 |
|
sl@0
|
231 |
// convert utf-32 to utf-8
|
sl@0
|
232 |
*pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));
|
sl@0
|
233 |
*pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
|
sl@0
|
234 |
*pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
|
sl@0
|
235 |
*pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
|
sl@0
|
236 |
|
sl@0
|
237 |
// we consumed 2 utf-16 values, move this pointer
|
sl@0
|
238 |
pUnicode++;
|
sl@0
|
239 |
}
|
sl@0
|
240 |
else
|
sl@0
|
241 |
{
|
sl@0
|
242 |
// 3 byte - utf-8, U+800..U+FFFF rest of BMP.
|
sl@0
|
243 |
|
sl@0
|
244 |
if (pointerToLastUtf8Byte - pUtf8 < 2)
|
sl@0
|
245 |
{
|
sl@0
|
246 |
pUtf8--;
|
sl@0
|
247 |
pUnicode--;
|
sl@0
|
248 |
break;
|
sl@0
|
249 |
}
|
sl@0
|
250 |
*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
|
sl@0
|
251 |
*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
|
sl@0
|
252 |
*pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
|
sl@0
|
253 |
}
|
sl@0
|
254 |
|
sl@0
|
255 |
if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
|
sl@0
|
256 |
{
|
sl@0
|
257 |
break;
|
sl@0
|
258 |
}
|
sl@0
|
259 |
|
sl@0
|
260 |
pUtf8++;
|
sl@0
|
261 |
pUnicode++;
|
sl@0
|
262 |
|
sl@0
|
263 |
}
|
sl@0
|
264 |
|
sl@0
|
265 |
if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
|
sl@0
|
266 |
{
|
sl@0
|
267 |
return EErrorIllFormedInput;
|
sl@0
|
268 |
}
|
sl@0
|
269 |
|
sl@0
|
270 |
aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
|
sl@0
|
271 |
return pointerToLastUnicodeCharacter-pUnicode;
|
sl@0
|
272 |
}
|
sl@0
|
273 |
|
sl@0
|
274 |
|
sl@0
|
275 |
|
sl@0
|
276 |
|
sl@0
|
277 |
|
sl@0
|
278 |
|
sl@0
|
279 |
|
sl@0
|
280 |
|
sl@0
|
281 |
|
sl@0
|
282 |
|
sl@0
|
283 |
|
sl@0
|
284 |
/** Converts text encoded using the Unicode transformation format UTF-8 into the
|
sl@0
|
285 |
Unicode UCS-2 character set.
|
sl@0
|
286 |
|
sl@0
|
287 |
@param aUnicode On return, contains the Unicode encoded output string.
|
sl@0
|
288 |
@param aUtf8 The UTF-8 encoded input string
|
sl@0
|
289 |
@return The number of unconverted bytes left at the end of the input descriptor,
|
sl@0
|
290 |
or one of the error values defined in TError. */
|
sl@0
|
291 |
EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
|
sl@0
|
292 |
{
|
sl@0
|
293 |
return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
|
sl@0
|
294 |
}
|
sl@0
|
295 |
|
sl@0
|
296 |
static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
|
sl@0
|
297 |
TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
|
sl@0
|
298 |
{
|
sl@0
|
299 |
if (aNumberOfUnconvertibleCharacters<=0)
|
sl@0
|
300 |
{
|
sl@0
|
301 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
|
sl@0
|
302 |
}
|
sl@0
|
303 |
++aNumberOfUnconvertibleCharacters;
|
sl@0
|
304 |
}
|
sl@0
|
305 |
|
sl@0
|
306 |
/** Converts text encoded using the Unicode transformation format UTF-8 into the
|
sl@0
|
307 |
Unicode UCS-2 character set.
|
sl@0
|
308 |
|
sl@0
|
309 |
@param aUnicode On return, contains the Unicode encoded output string.
|
sl@0
|
310 |
@param aUtf8 The UTF-8 encoded input string
|
sl@0
|
311 |
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
|
sl@0
|
312 |
@return The number of unconverted bytes left at the end of the input descriptor,
|
sl@0
|
313 |
or one of the error values defined in TError. */
|
sl@0
|
314 |
TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
|
sl@0
|
315 |
{
|
sl@0
|
316 |
TInt dummyUnconverted, dummyUnconvertedIndex;
|
sl@0
|
317 |
return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
|
sl@0
|
318 |
}
|
sl@0
|
319 |
|
sl@0
|
320 |
/** Converts text encoded using the Unicode transformation format UTF-8 into the
|
sl@0
|
321 |
Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
|
sl@0
|
322 |
|
sl@0
|
323 |
The variant of UTF-8 used internally by Java differs slightly from standard
|
sl@0
|
324 |
UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
|
sl@0
|
325 |
|
sl@0
|
326 |
@param aUnicode On return, contains the Unicode encoded output string.
|
sl@0
|
327 |
@param aUtf8 The UTF-8 encoded input string
|
sl@0
|
328 |
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
|
sl@0
|
329 |
UTF-8. The default is EFalse.
|
sl@0
|
330 |
@param aNumberOfUnconvertibleCharacters On return, contains the number of bytes
|
sl@0
|
331 |
which were not converted.
|
sl@0
|
332 |
@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index
|
sl@0
|
333 |
of the first byte of the first unconvertible character. For instance if the
|
sl@0
|
334 |
first character in the input descriptor (aForeign) could not be converted,
|
sl@0
|
335 |
then this parameter is set to the first byte of that character, i.e. zero.
|
sl@0
|
336 |
A negative value is returned if all the characters were converted.
|
sl@0
|
337 |
@return The number of unconverted bytes left at the end of the input descriptor,
|
sl@0
|
338 |
or one of the error values defined in TError. */
|
sl@0
|
339 |
|
sl@0
|
340 |
/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
|
sl@0
|
341 |
* Well formed UTF-8 Byte Sequences, full table.
|
sl@0
|
342 |
* +----------------------------------------------------------------+
|
sl@0
|
343 |
* | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
|
sl@0
|
344 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
345 |
* | U+0000..U+007F | 00..7D | | | | 1 byte, ascii
|
sl@0
|
346 |
* | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2
|
sl@0
|
347 |
* | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
|
sl@0
|
348 |
* | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal
|
sl@0
|
349 |
* | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
|
sl@0
|
350 |
* | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal
|
sl@0
|
351 |
* | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
|
sl@0
|
352 |
* | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal
|
sl@0
|
353 |
* | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
|
sl@0
|
354 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
355 |
*
|
sl@0
|
356 |
* As a consequence of the well-formedness conditions specified in table 3-7,
|
sl@0
|
357 |
* the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
|
sl@0
|
358 |
*/
|
sl@0
|
359 |
TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
|
sl@0
|
360 |
TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
|
sl@0
|
361 |
{
|
sl@0
|
362 |
aUnicode.SetLength(0);
|
sl@0
|
363 |
|
sl@0
|
364 |
if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
|
sl@0
|
365 |
{
|
sl@0
|
366 |
return aUtf8.Length();
|
sl@0
|
367 |
}
|
sl@0
|
368 |
|
sl@0
|
369 |
TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
|
sl@0
|
370 |
const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
|
sl@0
|
371 |
const TUint8* pUtf8 = aUtf8.Ptr();
|
sl@0
|
372 |
const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
|
sl@0
|
373 |
const TUint16 replacementcharacter = 0xFFFD;
|
sl@0
|
374 |
TUint currentUnicodeCharacter;
|
sl@0
|
375 |
TUint sequenceLength;
|
sl@0
|
376 |
|
sl@0
|
377 |
|
sl@0
|
378 |
FOREVER
|
sl@0
|
379 |
{
|
sl@0
|
380 |
TBool illFormed=EFalse;
|
sl@0
|
381 |
|
sl@0
|
382 |
__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
|
sl@0
|
383 |
__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
|
sl@0
|
384 |
|
sl@0
|
385 |
sequenceLength = 1;
|
sl@0
|
386 |
|
sl@0
|
387 |
// ascii - optimisation (i.e. it isn't a sequence)
|
sl@0
|
388 |
if (pUtf8[0] < 0x80)
|
sl@0
|
389 |
{
|
sl@0
|
390 |
currentUnicodeCharacter = pUtf8[0];
|
sl@0
|
391 |
}
|
sl@0
|
392 |
else
|
sl@0
|
393 |
{
|
sl@0
|
394 |
// see if well formed utf-8, use table above for reference
|
sl@0
|
395 |
if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
|
sl@0
|
396 |
{
|
sl@0
|
397 |
// 0xc1-0xc2 are not valid bytes
|
sl@0
|
398 |
sequenceLength = 2;
|
sl@0
|
399 |
}
|
sl@0
|
400 |
else if ((pUtf8[0] & 0xf0) == 0xe0)
|
sl@0
|
401 |
{
|
sl@0
|
402 |
sequenceLength = 3;
|
sl@0
|
403 |
}
|
sl@0
|
404 |
else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
|
sl@0
|
405 |
{
|
sl@0
|
406 |
// 0xf5-0xff, are not valid bytes
|
sl@0
|
407 |
sequenceLength = 4;
|
sl@0
|
408 |
}
|
sl@0
|
409 |
else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
|
sl@0
|
410 |
{
|
sl@0
|
411 |
if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
|
sl@0
|
412 |
{
|
sl@0
|
413 |
// either we've split the 0xc0 0x80 (i.e. 0xc0 is
|
sl@0
|
414 |
// the last character in the string) or we've
|
sl@0
|
415 |
// discovered a valid 0xc0 0x80 sequence.
|
sl@0
|
416 |
sequenceLength = 2;
|
sl@0
|
417 |
}
|
sl@0
|
418 |
}
|
sl@0
|
419 |
|
sl@0
|
420 |
/* checking to see if we got a valid sequence */
|
sl@0
|
421 |
if (sequenceLength == 1)
|
sl@0
|
422 |
{
|
sl@0
|
423 |
// bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example
|
sl@0
|
424 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
425 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
426 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
427 |
}
|
sl@0
|
428 |
else
|
sl@0
|
429 |
{
|
sl@0
|
430 |
// this is a check to see if the sequence goes beyond the input
|
sl@0
|
431 |
// stream. if its not the first and only character in the input
|
sl@0
|
432 |
// stream this isn't an error, otherwise it is.
|
sl@0
|
433 |
if ((pUtf8 + sequenceLength - 1) > pLastUtf8)
|
sl@0
|
434 |
{
|
sl@0
|
435 |
// check to see if this sequence was the first character
|
sl@0
|
436 |
if ((pUnicode - aUnicode.Ptr()) == 0)
|
sl@0
|
437 |
{
|
sl@0
|
438 |
return EErrorIllFormedInput;
|
sl@0
|
439 |
}
|
sl@0
|
440 |
break;
|
sl@0
|
441 |
}
|
sl@0
|
442 |
|
sl@0
|
443 |
currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
|
sl@0
|
444 |
|
sl@0
|
445 |
/* check the trailing bytes, they should begin with 10 */
|
sl@0
|
446 |
TUint i = 1;
|
sl@0
|
447 |
|
sl@0
|
448 |
do
|
sl@0
|
449 |
{
|
sl@0
|
450 |
if ((pUtf8[i] & 0xc0) == 0x80)
|
sl@0
|
451 |
{
|
sl@0
|
452 |
// add the trailing 6 bits to the current unicode char
|
sl@0
|
453 |
currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
|
sl@0
|
454 |
}
|
sl@0
|
455 |
else
|
sl@0
|
456 |
{
|
sl@0
|
457 |
// ill formed character (doesn't have a lead 10)
|
sl@0
|
458 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
459 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
460 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
461 |
illFormed=ETrue;
|
sl@0
|
462 |
break;
|
sl@0
|
463 |
}
|
sl@0
|
464 |
i++;
|
sl@0
|
465 |
}
|
sl@0
|
466 |
while (i < sequenceLength);
|
sl@0
|
467 |
}
|
sl@0
|
468 |
|
sl@0
|
469 |
/* conformance check. bits of above table for reference.
|
sl@0
|
470 |
* +----------------------------------------------------------------+
|
sl@0
|
471 |
* | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
|
sl@0
|
472 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
473 |
* | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0
|
sl@0
|
474 |
* | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F
|
sl@0
|
475 |
* | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90
|
sl@0
|
476 |
* | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F
|
sl@0
|
477 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
478 |
*/
|
sl@0
|
479 |
|
sl@0
|
480 |
if (currentUnicodeCharacter != replacementcharacter)
|
sl@0
|
481 |
{
|
sl@0
|
482 |
if (sequenceLength == 3)
|
sl@0
|
483 |
{
|
sl@0
|
484 |
if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
|
sl@0
|
485 |
{
|
sl@0
|
486 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
487 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
488 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
489 |
illFormed=ETrue;
|
sl@0
|
490 |
}
|
sl@0
|
491 |
else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
|
sl@0
|
492 |
{
|
sl@0
|
493 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
494 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
495 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
496 |
illFormed=ETrue;
|
sl@0
|
497 |
}
|
sl@0
|
498 |
}
|
sl@0
|
499 |
else if (sequenceLength == 4)
|
sl@0
|
500 |
{
|
sl@0
|
501 |
if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
|
sl@0
|
502 |
{
|
sl@0
|
503 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
504 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
505 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
506 |
illFormed=ETrue;
|
sl@0
|
507 |
}
|
sl@0
|
508 |
else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
|
sl@0
|
509 |
{
|
sl@0
|
510 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
511 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
512 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
513 |
illFormed=ETrue;
|
sl@0
|
514 |
}
|
sl@0
|
515 |
}
|
sl@0
|
516 |
|
sl@0
|
517 |
|
sl@0
|
518 |
/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
|
sl@0
|
519 |
* are not Unicode scalar values, any UTF-8 byte sequence that would map to code
|
sl@0
|
520 |
* points D800..DFFF is ill formed */
|
sl@0
|
521 |
|
sl@0
|
522 |
if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
|
sl@0
|
523 |
{
|
sl@0
|
524 |
currentUnicodeCharacter = replacementcharacter;
|
sl@0
|
525 |
UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
|
sl@0
|
526 |
aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr());
|
sl@0
|
527 |
illFormed=ETrue;
|
sl@0
|
528 |
}
|
sl@0
|
529 |
}
|
sl@0
|
530 |
// end conformance check
|
sl@0
|
531 |
}
|
sl@0
|
532 |
|
sl@0
|
533 |
// would this character generate a surrogate pair in UTF-16?
|
sl@0
|
534 |
if (currentUnicodeCharacter > 0xFFFF)
|
sl@0
|
535 |
{
|
sl@0
|
536 |
// is there enough space to hold a surrogate pair in the output?
|
sl@0
|
537 |
if (pUnicode >= pLastUnicode)
|
sl@0
|
538 |
{
|
sl@0
|
539 |
break; // no, end processing.
|
sl@0
|
540 |
}
|
sl@0
|
541 |
|
sl@0
|
542 |
TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
|
sl@0
|
543 |
*pUnicode++ = STATIC_CAST(TUint16, surrogate);
|
sl@0
|
544 |
|
sl@0
|
545 |
surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
|
sl@0
|
546 |
*pUnicode++ = STATIC_CAST(TUint16, surrogate);
|
sl@0
|
547 |
}
|
sl@0
|
548 |
else
|
sl@0
|
549 |
{
|
sl@0
|
550 |
*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
|
sl@0
|
551 |
}
|
sl@0
|
552 |
|
sl@0
|
553 |
// move the input pointer
|
sl@0
|
554 |
if (currentUnicodeCharacter != replacementcharacter)
|
sl@0
|
555 |
{
|
sl@0
|
556 |
pUtf8 += sequenceLength;
|
sl@0
|
557 |
}
|
sl@0
|
558 |
else if(illFormed == EFalse)
|
sl@0
|
559 |
{
|
sl@0
|
560 |
pUtf8 += (sequenceLength);
|
sl@0
|
561 |
}
|
sl@0
|
562 |
else
|
sl@0
|
563 |
{
|
sl@0
|
564 |
// we had a character we didn't recognize (i.e. it was invalid)
|
sl@0
|
565 |
// so move to the next character in the input
|
sl@0
|
566 |
pUtf8++;
|
sl@0
|
567 |
}
|
sl@0
|
568 |
|
sl@0
|
569 |
if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
|
sl@0
|
570 |
{
|
sl@0
|
571 |
break; // we've either reached the end of the input or the end of output
|
sl@0
|
572 |
}
|
sl@0
|
573 |
}
|
sl@0
|
574 |
|
sl@0
|
575 |
aUnicode.SetLength(pUnicode - aUnicode.Ptr());
|
sl@0
|
576 |
return (pLastUtf8 - pUtf8 + 1);
|
sl@0
|
577 |
}
|
sl@0
|
578 |
|
sl@0
|
579 |
/** Given a sample text this function attempts to determine whether or not
|
sl@0
|
580 |
* the same text is encoded using the UTF-8 standard encoding scheme.
|
sl@0
|
581 |
|
sl@0
|
582 |
@param TInt a confidence level, given at certain value. if the given sample
|
sl@0
|
583 |
is UTF-8 this value will not be changed (unless > 100) then its
|
sl@0
|
584 |
set to 100. Otherwise if the same isn't UTF-8, its set to 0.
|
sl@0
|
585 |
@param TDesC8 sample text.
|
sl@0
|
586 |
UTF-8. The default is EFalse.
|
sl@0
|
587 |
@return void
|
sl@0
|
588 |
*/
|
sl@0
|
589 |
|
sl@0
|
590 |
/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
|
sl@0
|
591 |
* Well formed UTF-8 Byte Sequences, full table.
|
sl@0
|
592 |
* +----------------------------------------------------------------+
|
sl@0
|
593 |
* | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte |
|
sl@0
|
594 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
595 |
* | U+0000..U+007F | 00..7D | | | | 1 byte, ascii
|
sl@0
|
596 |
* | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2
|
sl@0
|
597 |
* | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
|
sl@0
|
598 |
* | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal
|
sl@0
|
599 |
* | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
|
sl@0
|
600 |
* | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal
|
sl@0
|
601 |
* | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
|
sl@0
|
602 |
* | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal
|
sl@0
|
603 |
* | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
|
sl@0
|
604 |
* +--------------------+----------+----------+----------+----------+
|
sl@0
|
605 |
*
|
sl@0
|
606 |
* As a consequence of the well-formedness conditions specified in table 3-7,
|
sl@0
|
607 |
* the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
|
sl@0
|
608 |
*
|
sl@0
|
609 |
* Code Rules:
|
sl@0
|
610 |
* R1: If the string contains any non-UTF-8 characters the returned confidence
|
sl@0
|
611 |
* is 0. Valid UTF-8 combinations are listed in the above table.
|
sl@0
|
612 |
* R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in
|
sl@0
|
613 |
* the (see ) the returned confidence is 95.
|
sl@0
|
614 |
* R3: Otherwise the confidence returned is based upon the sample string
|
sl@0
|
615 |
* length.
|
sl@0
|
616 |
* R4: If the sample string is under 75 characters, the confidence is set to
|
sl@0
|
617 |
* 75.
|
sl@0
|
618 |
*/
|
sl@0
|
619 |
void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
|
sl@0
|
620 |
{
|
sl@0
|
621 |
|
sl@0
|
622 |
TInt sampleLength = aSample.Length();
|
sl@0
|
623 |
|
sl@0
|
624 |
if (sampleLength == 0)
|
sl@0
|
625 |
{
|
sl@0
|
626 |
aConfidenceLevel = 89;
|
sl@0
|
627 |
return;
|
sl@0
|
628 |
}
|
sl@0
|
629 |
TInt bytesRemaining = 0;
|
sl@0
|
630 |
TUint sequenceLength = 0;
|
sl@0
|
631 |
|
sl@0
|
632 |
aConfidenceLevel = sampleLength;
|
sl@0
|
633 |
|
sl@0
|
634 |
const TUint8* buffer = &aSample[0];
|
sl@0
|
635 |
|
sl@0
|
636 |
if (sampleLength < 95)
|
sl@0
|
637 |
{
|
sl@0
|
638 |
// check for the BOM
|
sl@0
|
639 |
if ((sampleLength >= 3) &&
|
sl@0
|
640 |
((buffer[0] == 0xEF) &&
|
sl@0
|
641 |
(buffer[1] == 0xBB) &&
|
sl@0
|
642 |
(buffer[2] == 0xBF))
|
sl@0
|
643 |
)
|
sl@0
|
644 |
{
|
sl@0
|
645 |
aConfidenceLevel = 95;
|
sl@0
|
646 |
}
|
sl@0
|
647 |
else if (sampleLength < 75)
|
sl@0
|
648 |
{
|
sl@0
|
649 |
aConfidenceLevel = 75;
|
sl@0
|
650 |
}
|
sl@0
|
651 |
}
|
sl@0
|
652 |
|
sl@0
|
653 |
for (TInt index = 0;index != sampleLength;index++)
|
sl@0
|
654 |
{
|
sl@0
|
655 |
|
sl@0
|
656 |
if (bytesRemaining > 0)
|
sl@0
|
657 |
{
|
sl@0
|
658 |
// bytesRemaining > 0, means that a byte representing the start of a
|
sl@0
|
659 |
// multibyte sequence was encountered and the bytesRemaining is the
|
sl@0
|
660 |
// number of bytes to follow.
|
sl@0
|
661 |
|
sl@0
|
662 |
if ((buffer[index] & 0xc0) == 0x80)
|
sl@0
|
663 |
{
|
sl@0
|
664 |
// need to check for ill-formed sequences -- all are in the 2nd byte
|
sl@0
|
665 |
|
sl@0
|
666 |
if ((sequenceLength == 3) && (bytesRemaining == 2))
|
sl@0
|
667 |
{
|
sl@0
|
668 |
if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
|
sl@0
|
669 |
{
|
sl@0
|
670 |
aConfidenceLevel = 0;
|
sl@0
|
671 |
break;
|
sl@0
|
672 |
}
|
sl@0
|
673 |
else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
|
sl@0
|
674 |
{
|
sl@0
|
675 |
aConfidenceLevel = 0;
|
sl@0
|
676 |
break;
|
sl@0
|
677 |
}
|
sl@0
|
678 |
}
|
sl@0
|
679 |
else if ((sequenceLength == 4) && (bytesRemaining == 3))
|
sl@0
|
680 |
{
|
sl@0
|
681 |
if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
|
sl@0
|
682 |
{
|
sl@0
|
683 |
aConfidenceLevel = 0;
|
sl@0
|
684 |
break;
|
sl@0
|
685 |
}
|
sl@0
|
686 |
else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
|
sl@0
|
687 |
{
|
sl@0
|
688 |
aConfidenceLevel = 0;
|
sl@0
|
689 |
break;
|
sl@0
|
690 |
}
|
sl@0
|
691 |
}
|
sl@0
|
692 |
|
sl@0
|
693 |
--bytesRemaining;
|
sl@0
|
694 |
continue;
|
sl@0
|
695 |
}
|
sl@0
|
696 |
else
|
sl@0
|
697 |
{
|
sl@0
|
698 |
aConfidenceLevel = 0;
|
sl@0
|
699 |
break;
|
sl@0
|
700 |
}
|
sl@0
|
701 |
}
|
sl@0
|
702 |
|
sl@0
|
703 |
if (bytesRemaining == 0)
|
sl@0
|
704 |
{
|
sl@0
|
705 |
if (buffer[index] < 0x80)
|
sl@0
|
706 |
{
|
sl@0
|
707 |
// The value of aSample[index] is in the range 0x00-0x7f
|
sl@0
|
708 |
//UTF8 maintains ASCII transparency. So it's a valid
|
sl@0
|
709 |
//UTF8. Do nothing, check next value.
|
sl@0
|
710 |
continue;
|
sl@0
|
711 |
}
|
sl@0
|
712 |
else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
|
sl@0
|
713 |
{
|
sl@0
|
714 |
// valid start of a 2 byte sequence (see conformance note)
|
sl@0
|
715 |
sequenceLength = 2;
|
sl@0
|
716 |
bytesRemaining = 1;
|
sl@0
|
717 |
}
|
sl@0
|
718 |
else if ((buffer[index] & 0xf0) == 0xe0)
|
sl@0
|
719 |
{
|
sl@0
|
720 |
// valid start of a 3 byte sequence
|
sl@0
|
721 |
sequenceLength = 3;
|
sl@0
|
722 |
bytesRemaining = 2;
|
sl@0
|
723 |
}
|
sl@0
|
724 |
else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
|
sl@0
|
725 |
{
|
sl@0
|
726 |
// valid start of a 4 byte sequence (see conformance note)
|
sl@0
|
727 |
sequenceLength = 4;
|
sl@0
|
728 |
bytesRemaining = 3;
|
sl@0
|
729 |
}
|
sl@0
|
730 |
else
|
sl@0
|
731 |
{
|
sl@0
|
732 |
// wasn't anything expected so must be an illegal/irregular UTF8 coded value
|
sl@0
|
733 |
aConfidenceLevel = 0;
|
sl@0
|
734 |
break;
|
sl@0
|
735 |
}
|
sl@0
|
736 |
}
|
sl@0
|
737 |
} // for
|
sl@0
|
738 |
|
sl@0
|
739 |
aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
|
sl@0
|
740 |
}
|
sl@0
|
741 |
|
sl@0
|
742 |
// End of file
|