sl@0
|
1 |
// Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
|
sl@0
|
2 |
// All rights reserved.
|
sl@0
|
3 |
// This component and the accompanying materials are made available
|
sl@0
|
4 |
// under the terms of "Eclipse Public License v1.0"
|
sl@0
|
5 |
// which accompanies this distribution, and is available
|
sl@0
|
6 |
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
|
sl@0
|
7 |
//
|
sl@0
|
8 |
// Initial Contributors:
|
sl@0
|
9 |
// Nokia Corporation - initial contribution.
|
sl@0
|
10 |
//
|
sl@0
|
11 |
// Contributors:
|
sl@0
|
12 |
//
|
sl@0
|
13 |
// Description:
|
sl@0
|
14 |
// Name : MRT_WCHARCNVT.CPP
|
sl@0
|
15 |
// Part of : MRT LIBC
|
sl@0
|
16 |
// Contains the source for the helper functions used by wchar
|
sl@0
|
17 |
// restartable conversion API's in libc
|
sl@0
|
18 |
// Version : 1.0
|
sl@0
|
19 |
//
|
sl@0
|
20 |
|
sl@0
|
21 |
|
sl@0
|
22 |
|
sl@0
|
23 |
// Copyright (c) 1997-2003 Symbian Ltd. All rights reserved.
|
sl@0
|
24 |
|
sl@0
|
25 |
// system includes
|
sl@0
|
26 |
#include <e32std.h>
|
sl@0
|
27 |
#include <e32base.h>
|
sl@0
|
28 |
#include <utf.h>
|
sl@0
|
29 |
#include <stdlib.h>
|
sl@0
|
30 |
#include <string.h>
|
sl@0
|
31 |
#include <errno.h>
|
sl@0
|
32 |
#include <wchar.h>
|
sl@0
|
33 |
|
sl@0
|
34 |
#include "wcharcnv.h"
|
sl@0
|
35 |
|
sl@0
|
36 |
// Constant added to (lead << 10) + trail to turn a UTF-16 surrogate pair into
// its Unicode code point (see http://www.unicode.org/faq/utf_bom.html#39).
// The expansion is parenthesized so the macro behaves as a single operand in
// any expression, not only as a trailing additive term.
#define KSURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
|
sl@0
|
37 |
|
sl@0
|
38 |
//-----------------------------------------------------------------------------
|
sl@0
|
39 |
//Function Name : TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const
|
sl@0
|
40 |
// TDesC8& aUtf8, mbstate_t *state)
|
sl@0
|
41 |
//Description : Converts the unicode to UTF8
|
sl@0
|
42 |
//Return Value : The number of unconverted bytes left at the end of the input
|
sl@0
|
43 |
//descriptor, or one of the error values defined in TError.
|
sl@0
|
44 |
//-----------------------------------------------------------------------------
|
sl@0
|
45 |
TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, mbstate_t *state)
|
sl@0
|
46 |
{
|
sl@0
|
47 |
aUnicode.SetLength(0);
|
sl@0
|
48 |
if (aUtf8.Length()==0)
|
sl@0
|
49 |
{
|
sl@0
|
50 |
return 0;
|
sl@0
|
51 |
}
|
sl@0
|
52 |
if (aUnicode.MaxLength()==0)
|
sl@0
|
53 |
{
|
sl@0
|
54 |
return aUtf8.Length();
|
sl@0
|
55 |
}
|
sl@0
|
56 |
|
sl@0
|
57 |
HBufC8* utf8 = NULL;
|
sl@0
|
58 |
if ( state->__count > 0)
|
sl@0
|
59 |
{
|
sl@0
|
60 |
// state have some information, use that.
|
sl@0
|
61 |
utf8 = HBufC8::NewLC ( state->__count + aUtf8.Length() );
|
sl@0
|
62 |
TPtr8 tempBuf = utf8->Des();
|
sl@0
|
63 |
TPtr8 temp ((TUint8*)state->__value.__wchb, state->__count);
|
sl@0
|
64 |
tempBuf.Copy(temp);
|
sl@0
|
65 |
tempBuf.Append(aUtf8);
|
sl@0
|
66 |
}
|
sl@0
|
67 |
|
sl@0
|
68 |
TUint16* pointerToCurrentUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr());
|
sl@0
|
69 |
const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.MaxLength()-1);
|
sl@0
|
70 |
const TUint8* pointerToCurrentUtf8Byte= utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
|
sl@0
|
71 |
const TUint8* pointerToPendingUtf8Byte=utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
|
sl@0
|
72 |
TInt length = utf8 ? utf8->Des().Length() : aUtf8.Length();
|
sl@0
|
73 |
const TUint8* pointerToLastUtf8Byte=pointerToCurrentUtf8Byte+(length-1);
|
sl@0
|
74 |
TUint16 replacementcharacter = 0xFFFD;
|
sl@0
|
75 |
TUint8 currentUtf8Byte;
|
sl@0
|
76 |
TUint currentUnicodeCharacter;
|
sl@0
|
77 |
TInt sequenceLength;
|
sl@0
|
78 |
|
sl@0
|
79 |
|
sl@0
|
80 |
FOREVER
|
sl@0
|
81 |
{
|
sl@0
|
82 |
currentUtf8Byte=*pointerToCurrentUtf8Byte;
|
sl@0
|
83 |
pointerToPendingUtf8Byte = pointerToCurrentUtf8Byte;
|
sl@0
|
84 |
sequenceLength=100;
|
sl@0
|
85 |
|
sl@0
|
86 |
for(TInt i=0;i<7;i++)
|
sl@0
|
87 |
{
|
sl@0
|
88 |
if ((currentUtf8Byte&(0xf8<<i))==(static_cast<TUint8>(0xF0<<i)))
|
sl@0
|
89 |
{
|
sl@0
|
90 |
sequenceLength = 4-i;
|
sl@0
|
91 |
break;
|
sl@0
|
92 |
}
|
sl@0
|
93 |
}
|
sl@0
|
94 |
|
sl@0
|
95 |
if ((sequenceLength<2 || sequenceLength>6) && sequenceLength!=0)
|
sl@0
|
96 |
{
|
sl@0
|
97 |
currentUnicodeCharacter=replacementcharacter;
|
sl@0
|
98 |
}
|
sl@0
|
99 |
else
|
sl@0
|
100 |
{
|
sl@0
|
101 |
if ((pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1)<sequenceLength)
|
sl@0
|
102 |
{
|
sl@0
|
103 |
// we dnt have enough UTF-8 bytes to complete the Muti-Byte character.
|
sl@0
|
104 |
// store the character within the state.
|
sl@0
|
105 |
state->__count = 0;
|
sl@0
|
106 |
while (pointerToCurrentUtf8Byte <= pointerToLastUtf8Byte)
|
sl@0
|
107 |
{
|
sl@0
|
108 |
state->__value.__wchb[state->__count++] = *(pointerToCurrentUtf8Byte++);
|
sl@0
|
109 |
}
|
sl@0
|
110 |
// reset the current pointer
|
sl@0
|
111 |
pointerToCurrentUtf8Byte -= state->__count;
|
sl@0
|
112 |
if((pointerToCurrentUnicodeCharacter-aUnicode.Ptr())==0)
|
sl@0
|
113 |
{
|
sl@0
|
114 |
// still nothing is decoded.
|
sl@0
|
115 |
if ( utf8 )
|
sl@0
|
116 |
{
|
sl@0
|
117 |
CleanupStack::PopAndDestroy(); // utf8
|
sl@0
|
118 |
}
|
sl@0
|
119 |
return -2;
|
sl@0
|
120 |
//return -1;
|
sl@0
|
121 |
}
|
sl@0
|
122 |
// something is already decoded, so return the no of bytes that use for
|
sl@0
|
123 |
// decoding.
|
sl@0
|
124 |
break;
|
sl@0
|
125 |
}
|
sl@0
|
126 |
|
sl@0
|
127 |
// reset the state
|
sl@0
|
128 |
state->__count = 0;
|
sl@0
|
129 |
currentUnicodeCharacter = currentUtf8Byte&(0x7F>>sequenceLength);
|
sl@0
|
130 |
|
sl@0
|
131 |
for(TInt i=sequenceLength;i>1; i--)
|
sl@0
|
132 |
{
|
sl@0
|
133 |
currentUtf8Byte = *(++pointerToCurrentUtf8Byte);
|
sl@0
|
134 |
if ((currentUtf8Byte&0xc0)==0x80)
|
sl@0
|
135 |
{
|
sl@0
|
136 |
currentUnicodeCharacter = (currentUnicodeCharacter<<6)|(currentUtf8Byte&0x3F);
|
sl@0
|
137 |
}
|
sl@0
|
138 |
else
|
sl@0
|
139 |
{
|
sl@0
|
140 |
// Encoding error occured.
|
sl@0
|
141 |
// store the contained within the state and return -1.
|
sl@0
|
142 |
// set the error EILSEQ to errno
|
sl@0
|
143 |
if ( utf8 )
|
sl@0
|
144 |
{
|
sl@0
|
145 |
CleanupStack::PopAndDestroy(); // utf8
|
sl@0
|
146 |
}
|
sl@0
|
147 |
errno = EILSEQ;
|
sl@0
|
148 |
return -1;
|
sl@0
|
149 |
//currentUnicodeCharacter=replacementcharacter;
|
sl@0
|
150 |
//--pointerToCurrentUtf8Byte;
|
sl@0
|
151 |
}
|
sl@0
|
152 |
}
|
sl@0
|
153 |
}
|
sl@0
|
154 |
|
sl@0
|
155 |
if (currentUnicodeCharacter > 0xFFFF)
|
sl@0
|
156 |
{
|
sl@0
|
157 |
if(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter)
|
sl@0
|
158 |
{
|
sl@0
|
159 |
// unicode descriptor dnt have 2 wchar bytes to hold the data.
|
sl@0
|
160 |
pointerToCurrentUtf8Byte=pointerToPendingUtf8Byte;
|
sl@0
|
161 |
break;
|
sl@0
|
162 |
}
|
sl@0
|
163 |
|
sl@0
|
164 |
TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
|
sl@0
|
165 |
*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);
|
sl@0
|
166 |
++pointerToCurrentUnicodeCharacter;
|
sl@0
|
167 |
|
sl@0
|
168 |
surrogate = (currentUnicodeCharacter&0x3FF)+0xDC00;
|
sl@0
|
169 |
*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);
|
sl@0
|
170 |
++pointerToCurrentUnicodeCharacter;
|
sl@0
|
171 |
++pointerToCurrentUtf8Byte;
|
sl@0
|
172 |
}
|
sl@0
|
173 |
else
|
sl@0
|
174 |
{
|
sl@0
|
175 |
*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(currentUnicodeCharacter);
|
sl@0
|
176 |
++pointerToCurrentUnicodeCharacter;
|
sl@0
|
177 |
++pointerToCurrentUtf8Byte;
|
sl@0
|
178 |
}
|
sl@0
|
179 |
|
sl@0
|
180 |
if ((pointerToCurrentUtf8Byte>pointerToLastUtf8Byte) || (pointerToCurrentUnicodeCharacter>pointerToLastUnicodeCharacter))
|
sl@0
|
181 |
{
|
sl@0
|
182 |
// checking the boundary condition.
|
sl@0
|
183 |
// Here either the UTF-8 or Unicode descriptor reached to the end.
|
sl@0
|
184 |
break;
|
sl@0
|
185 |
}
|
sl@0
|
186 |
} // forever
|
sl@0
|
187 |
// decoding finished.
|
sl@0
|
188 |
aUnicode.SetLength(pointerToCurrentUnicodeCharacter-aUnicode.Ptr());
|
sl@0
|
189 |
if ( utf8 )
|
sl@0
|
190 |
{
|
sl@0
|
191 |
CleanupStack::PopAndDestroy(); // utf8
|
sl@0
|
192 |
}
|
sl@0
|
193 |
//return pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1;
|
sl@0
|
194 |
// returns the number of bytes used to complete a valid multibyte character.
|
sl@0
|
195 |
return pointerToCurrentUtf8Byte - aUtf8.Ptr();
|
sl@0
|
196 |
} //end of function
|
sl@0
|
197 |
|
sl@0
|
198 |
//-----------------------------------------------------------------------------
|
sl@0
|
199 |
//Function Name : TInt _Utf16ToUtf8(char* aDst, wchar_t aSrc, mbstate_t* ps, int aLen )
|
sl@0
|
200 |
//Description : Converts wide char in UCS2 format to UTF8 equivalent
|
sl@0
|
201 |
//Return Value : The number of bytes converted, 0 if L'\0\' was translated, -1 on
|
sl@0
|
202 |
//generic error and errno set appropriately, -2 if len is not sufficient to store aSrc wide char
|
sl@0
|
203 |
//-----------------------------------------------------------------------------
|
sl@0
|
204 |
TInt _Utf16ToUtf8(char* dst, wchar_t aSrc, mbstate_t* ps, int aLen)
|
sl@0
|
205 |
{
|
sl@0
|
206 |
int retval = 0;
|
sl@0
|
207 |
// check the state
|
sl@0
|
208 |
if(ps->__count !=_EUTF16InitialState && ps->__count != _EUTF16_21BitExtensionState)
|
sl@0
|
209 |
{
|
sl@0
|
210 |
errno = EINVAL;
|
sl@0
|
211 |
return -1;
|
sl@0
|
212 |
}
|
sl@0
|
213 |
|
sl@0
|
214 |
//following characters are illegal
|
sl@0
|
215 |
//see http://www.unicode.org/faq/utf_bom.html#40
|
sl@0
|
216 |
if(aSrc == 0xFFFE || aSrc == 0xFFFF || (aSrc >= 0xFDD0 && aSrc <= 0xFDEF) )
|
sl@0
|
217 |
{
|
sl@0
|
218 |
errno = EILSEQ;
|
sl@0
|
219 |
return -1;
|
sl@0
|
220 |
}
|
sl@0
|
221 |
|
sl@0
|
222 |
|
sl@0
|
223 |
if(ps->__count == _EUTF16InitialState)
|
sl@0
|
224 |
{
|
sl@0
|
225 |
|
sl@0
|
226 |
//following characters in addition are illegal in initial state
|
sl@0
|
227 |
//see http://www.unicode.org/faq/utf_bom.html#40
|
sl@0
|
228 |
if((aSrc >= 0xDC00 && aSrc <= 0xDFFF) )
|
sl@0
|
229 |
{
|
sl@0
|
230 |
errno = EILSEQ;
|
sl@0
|
231 |
return -1;
|
sl@0
|
232 |
}
|
sl@0
|
233 |
|
sl@0
|
234 |
|
sl@0
|
235 |
if ((aSrc & 0xff80)==0x0000)
|
sl@0
|
236 |
{
|
sl@0
|
237 |
if(aLen >= 1)
|
sl@0
|
238 |
{
|
sl@0
|
239 |
*dst++ = static_cast<TUint8>(aSrc);
|
sl@0
|
240 |
retval = 1;
|
sl@0
|
241 |
}
|
sl@0
|
242 |
else
|
sl@0
|
243 |
{
|
sl@0
|
244 |
return -2;
|
sl@0
|
245 |
}
|
sl@0
|
246 |
|
sl@0
|
247 |
}
|
sl@0
|
248 |
else if ((aSrc & 0xf800)==0x0000)
|
sl@0
|
249 |
{
|
sl@0
|
250 |
if (aLen >= 2)
|
sl@0
|
251 |
{
|
sl@0
|
252 |
*dst++ = static_cast<TUint8>(0xc0|(aSrc>>6));
|
sl@0
|
253 |
*dst++ = static_cast<TUint8> (0x80|(aSrc&0x3f));
|
sl@0
|
254 |
retval = 2;
|
sl@0
|
255 |
}
|
sl@0
|
256 |
else
|
sl@0
|
257 |
{
|
sl@0
|
258 |
return -2;
|
sl@0
|
259 |
}
|
sl@0
|
260 |
}
|
sl@0
|
261 |
else if ((aSrc & 0xfc00)==0xd800)
|
sl@0
|
262 |
{
|
sl@0
|
263 |
ps->__value.lead = aSrc;
|
sl@0
|
264 |
ps->__count = _EUTF16_21BitExtensionState;
|
sl@0
|
265 |
retval = 0; //nothing written out just yet
|
sl@0
|
266 |
}
|
sl@0
|
267 |
else
|
sl@0
|
268 |
{
|
sl@0
|
269 |
if ( aLen >= 3)
|
sl@0
|
270 |
{
|
sl@0
|
271 |
*dst++ = static_cast<TUint8>(0xe0|(aSrc>>12));
|
sl@0
|
272 |
*dst++ = static_cast<TUint8>(0x80|((aSrc>>6)&0x3f));
|
sl@0
|
273 |
*dst++ = static_cast<TUint8>(0x80|(aSrc&0x3f));
|
sl@0
|
274 |
retval = 3;
|
sl@0
|
275 |
}
|
sl@0
|
276 |
else
|
sl@0
|
277 |
{
|
sl@0
|
278 |
return -2;
|
sl@0
|
279 |
}
|
sl@0
|
280 |
}
|
sl@0
|
281 |
|
sl@0
|
282 |
|
sl@0
|
283 |
}
|
sl@0
|
284 |
else //ps->__count == _EUCS2_21BitExtensionState)
|
sl@0
|
285 |
{
|
sl@0
|
286 |
//characters outside this range are illegal in this state
|
sl@0
|
287 |
//see http://www.unicode.org/faq/utf_bom.html#40
|
sl@0
|
288 |
if((aSrc < 0xDC00 || aSrc > 0xDFFF) )
|
sl@0
|
289 |
{
|
sl@0
|
290 |
errno = EILSEQ;
|
sl@0
|
291 |
return -1;
|
sl@0
|
292 |
}
|
sl@0
|
293 |
|
sl@0
|
294 |
if ((aSrc & 0xfc00)!=0xdc00)
|
sl@0
|
295 |
{
|
sl@0
|
296 |
errno = EILSEQ;
|
sl@0
|
297 |
return -1;
|
sl@0
|
298 |
}
|
sl@0
|
299 |
if ( aLen >= 4)
|
sl@0
|
300 |
{
|
sl@0
|
301 |
//snippet taken from unicode faq
|
sl@0
|
302 |
//http://www.unicode.org/faq/utf_bom.html#39
|
sl@0
|
303 |
|
sl@0
|
304 |
unsigned long codepoint = (ps->__value.lead << 10) + aSrc + KSURROGATE_OFFSET;
|
sl@0
|
305 |
|
sl@0
|
306 |
*dst++ = static_cast<TUint8>( 0xf0|(codepoint>>18));
|
sl@0
|
307 |
*dst++ = static_cast<TUint8>(0x80|((codepoint>>12)&0x3f));
|
sl@0
|
308 |
*dst++ = static_cast<TUint8>(0x80|((codepoint>>6)&0x3f));
|
sl@0
|
309 |
*dst++ = static_cast<TUint8>(0x80|(codepoint&0x3f));
|
sl@0
|
310 |
retval = 4;
|
sl@0
|
311 |
}
|
sl@0
|
312 |
else
|
sl@0
|
313 |
{
|
sl@0
|
314 |
return -2;
|
sl@0
|
315 |
}
|
sl@0
|
316 |
ps->__count = _EUTF16InitialState;
|
sl@0
|
317 |
}
|
sl@0
|
318 |
return retval;
|
sl@0
|
319 |
|
sl@0
|
320 |
|
sl@0
|
321 |
}//end of function
|
sl@0
|
322 |
|