Symaptic: os/ossrv/genericopenlibs/openenvcore/libc/src/charcnv.cpp@260cb5ec6c19 (annotated)

sl@0	1	// Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0	2	// All rights reserved.
sl@0	3	// This component and the accompanying materials are made available
sl@0	4	// under the terms of "Eclipse Public License v1.0"
sl@0	5	// which accompanies this distribution, and is available
sl@0	6	// at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0	7	//
sl@0	8	// Initial Contributors:
sl@0	9	// Nokia Corporation - initial contribution.
sl@0	10	//
sl@0	11	// Contributors:
sl@0	12	//
sl@0	13	// Description:
sl@0	14	// Name : MRT_WCHARCNVT.CPP
sl@0	15	// Part of : MRT LIBC
sl@0	16	// Contains the source for the helper functions used by wchar
sl@0	17	// restartable conversion API's in libc
sl@0	18	// Version : 1.0
sl@0	19	//
sl@0	20
sl@0	21
sl@0	22
sl@0	23	// Copyright (c) 1997-2003 Symbian Ltd. All rights reserved.
sl@0	24
sl@0	25	// system includes
sl@0	26	#include <e32std.h>
sl@0	27	#include <e32base.h>
sl@0	28	#include <utf.h>
sl@0	29	#include <stdlib.h>
sl@0	30	#include <string.h>
sl@0	31	#include <errno.h>
sl@0	32	#include <wchar.h>
sl@0	33
sl@0	34	#include "wcharcnv.h"
sl@0	35
// Constant term of the surrogate-pair formula used in this file:
//   codepoint = (lead << 10) + trail + KSURROGATE_OFFSET
// The body is parenthesised so the macro expands as a single value whatever
// operators surround it at the point of use.
#define KSURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
sl@0	37
sl@0	38	//-----------------------------------------------------------------------------
sl@0	39	//Function Name : TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const
sl@0	40	// TDesC8& aUtf8, mbstate_t *state)
sl@0	41	//Description : Converts the unicode to UTF8
sl@0	42	//Return Value : The number of unconverted bytes left at the end of the input
sl@0	43	//descriptor, or one of the error values defined in TError.
sl@0	44	//-----------------------------------------------------------------------------
sl@0	45	TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, mbstate_t *state)
sl@0	46	{
sl@0	47	aUnicode.SetLength(0);
sl@0	48	if (aUtf8.Length()==0)
sl@0	49	{
sl@0	50	return 0;
sl@0	51	}
sl@0	52	if (aUnicode.MaxLength()==0)
sl@0	53	{
sl@0	54	return aUtf8.Length();
sl@0	55	}
sl@0	56
sl@0	57	HBufC8* utf8 = NULL;
sl@0	58	if ( state->__count > 0)
sl@0	59	{
sl@0	60	// state have some information, use that.
sl@0	61	utf8 = HBufC8::NewLC ( state->__count + aUtf8.Length() );
sl@0	62	TPtr8 tempBuf = utf8->Des();
sl@0	63	TPtr8 temp ((TUint8*)state->__value.__wchb, state->__count);
sl@0	64	tempBuf.Copy(temp);
sl@0	65	tempBuf.Append(aUtf8);
sl@0	66	}
sl@0	67
sl@0	68	TUint16* pointerToCurrentUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr());
sl@0	69	const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.MaxLength()-1);
sl@0	70	const TUint8* pointerToCurrentUtf8Byte= utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
sl@0	71	const TUint8* pointerToPendingUtf8Byte=utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
sl@0	72	TInt length = utf8 ? utf8->Des().Length() : aUtf8.Length();
sl@0	73	const TUint8* pointerToLastUtf8Byte=pointerToCurrentUtf8Byte+(length-1);
sl@0	74	TUint16 replacementcharacter = 0xFFFD;
sl@0	75	TUint8 currentUtf8Byte;
sl@0	76	TUint currentUnicodeCharacter;
sl@0	77	TInt sequenceLength;
sl@0	78
sl@0	79
sl@0	80	FOREVER
sl@0	81	{
sl@0	82	currentUtf8Byte=*pointerToCurrentUtf8Byte;
sl@0	83	pointerToPendingUtf8Byte = pointerToCurrentUtf8Byte;
sl@0	84	sequenceLength=100;
sl@0	85
sl@0	86	for(TInt i=0;i<7;i++)
sl@0	87	{
sl@0	88	if ((currentUtf8Byte&(0xf8<<i))==(static_cast<TUint8>(0xF0<<i)))
sl@0	89	{
sl@0	90	sequenceLength = 4-i;
sl@0	91	break;
sl@0	92	}
sl@0	93	}
sl@0	94
sl@0	95	if ((sequenceLength<2 \|\| sequenceLength>6) && sequenceLength!=0)
sl@0	96	{
sl@0	97	currentUnicodeCharacter=replacementcharacter;
sl@0	98	}
sl@0	99	else
sl@0	100	{
sl@0	101	if ((pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1)<sequenceLength)
sl@0	102	{
sl@0	103	// we dnt have enough UTF-8 bytes to complete the Muti-Byte character.
sl@0	104	// store the character within the state.
sl@0	105	state->__count = 0;
sl@0	106	while (pointerToCurrentUtf8Byte <= pointerToLastUtf8Byte)
sl@0	107	{
sl@0	108	state->__value.__wchb[state->__count++] = *(pointerToCurrentUtf8Byte++);
sl@0	109	}
sl@0	110	// reset the current pointer
sl@0	111	pointerToCurrentUtf8Byte -= state->__count;
sl@0	112	if((pointerToCurrentUnicodeCharacter-aUnicode.Ptr())==0)
sl@0	113	{
sl@0	114	// still nothing is decoded.
sl@0	115	if ( utf8 )
sl@0	116	{
sl@0	117	CleanupStack::PopAndDestroy(); // utf8
sl@0	118	}
sl@0	119	return -2;
sl@0	120	//return -1;
sl@0	121	}
sl@0	122	// something is already decoded, so return the no of bytes that use for
sl@0	123	// decoding.
sl@0	124	break;
sl@0	125	}
sl@0	126
sl@0	127	// reset the state
sl@0	128	state->__count = 0;
sl@0	129	currentUnicodeCharacter = currentUtf8Byte&(0x7F>>sequenceLength);
sl@0	130
sl@0	131	for(TInt i=sequenceLength;i>1; i--)
sl@0	132	{
sl@0	133	currentUtf8Byte = *(++pointerToCurrentUtf8Byte);
sl@0	134	if ((currentUtf8Byte&0xc0)==0x80)
sl@0	135	{
sl@0	136	currentUnicodeCharacter = (currentUnicodeCharacter<<6)\|(currentUtf8Byte&0x3F);
sl@0	137	}
sl@0	138	else
sl@0	139	{
sl@0	140	// Encoding error occured.
sl@0	141	// store the contained within the state and return -1.
sl@0	142	// set the error EILSEQ to errno
sl@0	143	if ( utf8 )
sl@0	144	{
sl@0	145	CleanupStack::PopAndDestroy(); // utf8
sl@0	146	}
sl@0	147	errno = EILSEQ;
sl@0	148	return -1;
sl@0	149	//currentUnicodeCharacter=replacementcharacter;
sl@0	150	//--pointerToCurrentUtf8Byte;
sl@0	151	}
sl@0	152	}
sl@0	153	}
sl@0	154
sl@0	155	if (currentUnicodeCharacter > 0xFFFF)
sl@0	156	{
sl@0	157	if(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter)
sl@0	158	{
sl@0	159	// unicode descriptor dnt have 2 wchar bytes to hold the data.
sl@0	160	pointerToCurrentUtf8Byte=pointerToPendingUtf8Byte;
sl@0	161	break;
sl@0	162	}
sl@0	163
sl@0	164	TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
sl@0	165	*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);
sl@0	166	++pointerToCurrentUnicodeCharacter;
sl@0	167
sl@0	168	surrogate = (currentUnicodeCharacter&0x3FF)+0xDC00;
sl@0	169	*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);
sl@0	170	++pointerToCurrentUnicodeCharacter;
sl@0	171	++pointerToCurrentUtf8Byte;
sl@0	172	}
sl@0	173	else
sl@0	174	{
sl@0	175	*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(currentUnicodeCharacter);
sl@0	176	++pointerToCurrentUnicodeCharacter;
sl@0	177	++pointerToCurrentUtf8Byte;
sl@0	178	}
sl@0	179
sl@0	180	if ((pointerToCurrentUtf8Byte>pointerToLastUtf8Byte) \|\| (pointerToCurrentUnicodeCharacter>pointerToLastUnicodeCharacter))
sl@0	181	{
sl@0	182	// checking the boundary condition.
sl@0	183	// Here either the UTF-8 or Unicode descriptor reached to the end.
sl@0	184	break;
sl@0	185	}
sl@0	186	} // forever
sl@0	187	// decoding finished.
sl@0	188	aUnicode.SetLength(pointerToCurrentUnicodeCharacter-aUnicode.Ptr());
sl@0	189	if ( utf8 )
sl@0	190	{
sl@0	191	CleanupStack::PopAndDestroy(); // utf8
sl@0	192	}
sl@0	193	//return pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1;
sl@0	194	// returns the number of bytes used to complete a valid multibyte character.
sl@0	195	return pointerToCurrentUtf8Byte - aUtf8.Ptr();
sl@0	196	} //end of function
sl@0	197
sl@0	198	//-----------------------------------------------------------------------------
sl@0	199	//Function Name : TInt _Utf16ToUtf8(char* aDst, wchar_t aSrc, mbstate_t* ps, int aLen )
sl@0	200	//Description : Converts wide char in UCS2 format to UTF8 equivalent
sl@0	201	//Return Value : The number of bytes converted, 0 if L'\0\' was translated, -1 on
sl@0	202	//generic error and errno set appropriately, -2 if len is not sufficient to store aSrc wide char
sl@0	203	//-----------------------------------------------------------------------------
sl@0	204	TInt _Utf16ToUtf8(char* dst, wchar_t aSrc, mbstate_t* ps, int aLen)
sl@0	205	{
sl@0	206	int retval = 0;
sl@0	207	// check the state
sl@0	208	if(ps->__count !=_EUTF16InitialState && ps->__count != _EUTF16_21BitExtensionState)
sl@0	209	{
sl@0	210	errno = EINVAL;
sl@0	211	return -1;
sl@0	212	}
sl@0	213
sl@0	214	//following characters are illegal
sl@0	215	//see http://www.unicode.org/faq/utf_bom.html#40
sl@0	216	if(aSrc == 0xFFFE \|\| aSrc == 0xFFFF \|\| (aSrc >= 0xFDD0 && aSrc <= 0xFDEF) )
sl@0	217	{
sl@0	218	errno = EILSEQ;
sl@0	219	return -1;
sl@0	220	}
sl@0	221
sl@0	222
sl@0	223	if(ps->__count == _EUTF16InitialState)
sl@0	224	{
sl@0	225
sl@0	226	//following characters in addition are illegal in initial state
sl@0	227	//see http://www.unicode.org/faq/utf_bom.html#40
sl@0	228	if((aSrc >= 0xDC00 && aSrc <= 0xDFFF) )
sl@0	229	{
sl@0	230	errno = EILSEQ;
sl@0	231	return -1;
sl@0	232	}
sl@0	233
sl@0	234
sl@0	235	if ((aSrc & 0xff80)==0x0000)
sl@0	236	{
sl@0	237	if(aLen >= 1)
sl@0	238	{
sl@0	239	*dst++ = static_cast<TUint8>(aSrc);
sl@0	240	retval = 1;
sl@0	241	}
sl@0	242	else
sl@0	243	{
sl@0	244	return -2;
sl@0	245	}
sl@0	246
sl@0	247	}
sl@0	248	else if ((aSrc & 0xf800)==0x0000)
sl@0	249	{
sl@0	250	if (aLen >= 2)
sl@0	251	{
sl@0	252	*dst++ = static_cast<TUint8>(0xc0\|(aSrc>>6));
sl@0	253	*dst++ = static_cast<TUint8> (0x80\|(aSrc&0x3f));
sl@0	254	retval = 2;
sl@0	255	}
sl@0	256	else
sl@0	257	{
sl@0	258	return -2;
sl@0	259	}
sl@0	260	}
sl@0	261	else if ((aSrc & 0xfc00)==0xd800)
sl@0	262	{
sl@0	263	ps->__value.lead = aSrc;
sl@0	264	ps->__count = _EUTF16_21BitExtensionState;
sl@0	265	retval = 0; //nothing written out just yet
sl@0	266	}
sl@0	267	else
sl@0	268	{
sl@0	269	if ( aLen >= 3)
sl@0	270	{
sl@0	271	*dst++ = static_cast<TUint8>(0xe0\|(aSrc>>12));
sl@0	272	*dst++ = static_cast<TUint8>(0x80\|((aSrc>>6)&0x3f));
sl@0	273	*dst++ = static_cast<TUint8>(0x80\|(aSrc&0x3f));
sl@0	274	retval = 3;
sl@0	275	}
sl@0	276	else
sl@0	277	{
sl@0	278	return -2;
sl@0	279	}
sl@0	280	}
sl@0	281
sl@0	282
sl@0	283	}
sl@0	284	else //ps->__count == _EUCS2_21BitExtensionState)
sl@0	285	{
sl@0	286	//characters outside this range are illegal in this state
sl@0	287	//see http://www.unicode.org/faq/utf_bom.html#40
sl@0	288	if((aSrc < 0xDC00 \|\| aSrc > 0xDFFF) )
sl@0	289	{
sl@0	290	errno = EILSEQ;
sl@0	291	return -1;
sl@0	292	}
sl@0	293
sl@0	294	if ((aSrc & 0xfc00)!=0xdc00)
sl@0	295	{
sl@0	296	errno = EILSEQ;
sl@0	297	return -1;
sl@0	298	}
sl@0	299	if ( aLen >= 4)
sl@0	300	{
sl@0	301	//snippet taken from unicode faq
sl@0	302	//http://www.unicode.org/faq/utf_bom.html#39
sl@0	303
sl@0	304	unsigned long codepoint = (ps->__value.lead << 10) + aSrc + KSURROGATE_OFFSET;
sl@0	305
sl@0	306	*dst++ = static_cast<TUint8>( 0xf0\|(codepoint>>18));
sl@0	307	*dst++ = static_cast<TUint8>(0x80\|((codepoint>>12)&0x3f));
sl@0	308	*dst++ = static_cast<TUint8>(0x80\|((codepoint>>6)&0x3f));
sl@0	309	*dst++ = static_cast<TUint8>(0x80\|(codepoint&0x3f));
sl@0	310	retval = 4;
sl@0	311	}
sl@0	312	else
sl@0	313	{
sl@0	314	return -2;
sl@0	315	}
sl@0	316	ps->__count = _EUTF16InitialState;
sl@0	317	}
sl@0	318	return retval;
sl@0	319
sl@0	320
sl@0	321	}//end of function
sl@0	322

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--