os/ossrv/genericopenlibs/openenvcore/libc/src/charcnv.cpp
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/ossrv/genericopenlibs/openenvcore/libc/src/charcnv.cpp	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,322 @@
     1.4 +// Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.5 +// All rights reserved.
     1.6 +// This component and the accompanying materials are made available
     1.7 +// under the terms of "Eclipse Public License v1.0"
     1.8 +// which accompanies this distribution, and is available
     1.9 +// at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.10 +//
    1.11 +// Initial Contributors:
    1.12 +// Nokia Corporation - initial contribution.
    1.13 +//
    1.14 +// Contributors:
    1.15 +//
    1.16 +// Description:
    1.17 +// Name        : MRT_WCHARCNVT.CPP
    1.18 +// Part of     : MRT LIBC
    1.19 +// Contains the source for the helper functions used by wchar 
    1.20 +// restartable conversion API's in libc
    1.21 +// Version     : 1.0
    1.22 +//
    1.23 +
    1.24 +
    1.25 +
    1.26 +// Copyright (c) 1997-2003 Symbian Ltd.  All rights reserved.
    1.27 +
    1.28 +// system includes
    1.29 +#include <e32std.h>
    1.30 +#include <e32base.h>
    1.31 +#include <utf.h>
    1.32 +#include <stdlib.h>
    1.33 +#include <string.h>
    1.34 +#include <errno.h>
    1.35 +#include <wchar.h>
    1.36 +
    1.37 +#include "wcharcnv.h"
    1.38 +
    1.39 +#define  KSURROGATE_OFFSET 0x10000 - (0xD800 << 10) - 0xDC00
    1.40 +
    1.41 +//-----------------------------------------------------------------------------
    1.42 +//Function Name : TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const 
    1.43 +//                           TDesC8& aUtf8, mbstate_t *state)
    1.44 +//Description   : Converts the unicode to UTF8 
    1.45 +//Return Value  : The number of unconverted bytes left at the end of the input
    1.46 +//descriptor, or one of the error values defined in TError.
    1.47 +//-----------------------------------------------------------------------------
    1.48 +TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, mbstate_t *state)
    1.49 +{
    1.50 +	aUnicode.SetLength(0);
    1.51 +	if (aUtf8.Length()==0)
    1.52 +	{
    1.53 +		return 0;
    1.54 +	}
    1.55 +	if (aUnicode.MaxLength()==0)
    1.56 +	{
    1.57 +		return aUtf8.Length();
    1.58 +	}
    1.59 +	
    1.60 +	HBufC8* utf8 = NULL;
    1.61 +	if ( state->__count > 0)
    1.62 +        {
    1.63 +	        // state have some information, use that.
    1.64 +	        utf8 = HBufC8::NewLC ( state->__count + aUtf8.Length() );
    1.65 +	        TPtr8 tempBuf = utf8->Des();
    1.66 +	        TPtr8 temp ((TUint8*)state->__value.__wchb, state->__count);
    1.67 +	        tempBuf.Copy(temp);
    1.68 +	        tempBuf.Append(aUtf8);
    1.69 +        }
    1.70 +    
    1.71 +	TUint16* pointerToCurrentUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr());
    1.72 +	const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.MaxLength()-1);
    1.73 +	const TUint8* pointerToCurrentUtf8Byte= utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
    1.74 +	const TUint8* pointerToPendingUtf8Byte=utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
    1.75 +	TInt length = utf8 ? utf8->Des().Length() : aUtf8.Length();
    1.76 +	const TUint8* pointerToLastUtf8Byte=pointerToCurrentUtf8Byte+(length-1);
    1.77 +	TUint16 replacementcharacter = 0xFFFD;
    1.78 +	TUint8 currentUtf8Byte;
    1.79 +	TUint currentUnicodeCharacter;
    1.80 +	TInt sequenceLength;		
    1.81 +	
    1.82 +	
    1.83 +	FOREVER
    1.84 +	{
    1.85 +		currentUtf8Byte=*pointerToCurrentUtf8Byte;
    1.86 +		pointerToPendingUtf8Byte = pointerToCurrentUtf8Byte;
    1.87 +		sequenceLength=100;
    1.88 +        
    1.89 +		for(TInt i=0;i<7;i++)
    1.90 +		{
    1.91 +			if ((currentUtf8Byte&(0xf8<<i))==(static_cast<TUint8>(0xF0<<i)))
    1.92 +			{
    1.93 +				sequenceLength = 4-i;
    1.94 +				break;
    1.95 +			}
    1.96 +		}
    1.97 +
    1.98 +		if ((sequenceLength<2 || sequenceLength>6) && sequenceLength!=0)
    1.99 +		{
   1.100 +			currentUnicodeCharacter=replacementcharacter;
   1.101 +		}
   1.102 +		else
   1.103 +		{		
   1.104 +			if ((pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1)<sequenceLength)
   1.105 +			{
   1.106 +				// we dnt have enough UTF-8 bytes to complete the Muti-Byte character.
   1.107 +				// store the character within the state.
   1.108 +				state->__count = 0;
   1.109 +            			while (pointerToCurrentUtf8Byte <= pointerToLastUtf8Byte)
   1.110 +        		        {
   1.111 +            			        state->__value.__wchb[state->__count++] = *(pointerToCurrentUtf8Byte++);
   1.112 +		                }
   1.113 +            			// reset the current pointer
   1.114 +    	        		pointerToCurrentUtf8Byte -= state->__count;
   1.115 +				if((pointerToCurrentUnicodeCharacter-aUnicode.Ptr())==0)
   1.116 +			        {
   1.117 +				        // still nothing is decoded.
   1.118 +				        if ( utf8 )
   1.119 +			                {
   1.120 +	        			        CleanupStack::PopAndDestroy(); // utf8
   1.121 +			                }
   1.122 +				        return -2;
   1.123 +				        //return -1;
   1.124 +				}
   1.125 +        			// something is already decoded, so return the no of bytes that use for 
   1.126 +	        		// decoding.
   1.127 +		        	break;
   1.128 +		        }			
   1.129 +		    
   1.130 +                        // reset the state				
   1.131 +                    	state->__count = 0;
   1.132 +			currentUnicodeCharacter = currentUtf8Byte&(0x7F>>sequenceLength);
   1.133 +			
   1.134 +        		for(TInt i=sequenceLength;i>1; i--)
   1.135 +	                {
   1.136 +        			currentUtf8Byte = *(++pointerToCurrentUtf8Byte);
   1.137 +        			if ((currentUtf8Byte&0xc0)==0x80)
   1.138 +        			{
   1.139 +	       				currentUnicodeCharacter = (currentUnicodeCharacter<<6)|(currentUtf8Byte&0x3F);
   1.140 +        			}
   1.141 +        			else
   1.142 +        			{
   1.143 +        				// Encoding error occured.
   1.144 +        				// store the contained within the state and return -1.
   1.145 +        				// set the error EILSEQ to errno
   1.146 +                		        if ( utf8 )
   1.147 +               			        {
   1.148 +                			        CleanupStack::PopAndDestroy(); // utf8
   1.149 +               			        }
   1.150 +        				errno = EILSEQ;
   1.151 +               				return -1;
   1.152 +        				//currentUnicodeCharacter=replacementcharacter;
   1.153 +        				//--pointerToCurrentUtf8Byte;
   1.154 +        			}
   1.155 +	       		}
   1.156 +	        }
   1.157 +			
   1.158 +       		if (currentUnicodeCharacter > 0xFFFF)
   1.159 +       		{
   1.160 +	        	if(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter)
   1.161 +	        	{
   1.162 +        			// unicode descriptor dnt have 2 wchar bytes to hold the data.
   1.163 +	       			pointerToCurrentUtf8Byte=pointerToPendingUtf8Byte;
   1.164 +	        		break;
   1.165 +	        	}
   1.166 +		
   1.167 +        		TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
   1.168 +	       		*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);			
   1.169 +	        	++pointerToCurrentUnicodeCharacter;
   1.170 +				
   1.171 +		        surrogate = (currentUnicodeCharacter&0x3FF)+0xDC00;
   1.172 +        		*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);			
   1.173 +	       		++pointerToCurrentUnicodeCharacter;
   1.174 +	        	++pointerToCurrentUtf8Byte;
   1.175 +	        }
   1.176 +        	else
   1.177 +		{
   1.178 +	        	*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(currentUnicodeCharacter);
   1.179 +		        ++pointerToCurrentUnicodeCharacter;
   1.180 +        		++pointerToCurrentUtf8Byte;
   1.181 +		}
   1.182 +	        
   1.183 +	        if ((pointerToCurrentUtf8Byte>pointerToLastUtf8Byte) || (pointerToCurrentUnicodeCharacter>pointerToLastUnicodeCharacter))
   1.184 +	        {
   1.185 +        		// checking the boundary condition.
   1.186 +        		// Here either the UTF-8 or Unicode descriptor reached to the end.
   1.187 +        		break;
   1.188 +        	}
   1.189 +	} // forever
   1.190 +        // decoding finished.
   1.191 +	aUnicode.SetLength(pointerToCurrentUnicodeCharacter-aUnicode.Ptr());
   1.192 +        if ( utf8 )
   1.193 +        {
   1.194 +                CleanupStack::PopAndDestroy(); // utf8
   1.195 +        }
   1.196 +       	//return pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1;
   1.197 +	// returns the number of bytes used to complete a valid multibyte character.
   1.198 +	return pointerToCurrentUtf8Byte - aUtf8.Ptr();
   1.199 +} //end of function
   1.200 +
   1.201 +//-----------------------------------------------------------------------------
   1.202 +//Function Name : TInt _Utf16ToUtf8(char* aDst, wchar_t aSrc, mbstate_t* ps, int aLen )
   1.203 +//Description   : Converts wide char in UCS2 format to UTF8 equivalent
   1.204 +//Return Value  : The number of bytes converted, 0 if L'\0\' was translated, -1 on
   1.205 +//generic error and errno set appropriately, -2 if len is not sufficient to store aSrc wide char
   1.206 +//-----------------------------------------------------------------------------
   1.207 +TInt _Utf16ToUtf8(char* dst, wchar_t aSrc, mbstate_t* ps, int aLen)
   1.208 +{
   1.209 +	int retval = 0;
   1.210 +	// check the state 
   1.211 +	if(ps->__count !=_EUTF16InitialState && ps->__count != _EUTF16_21BitExtensionState)
   1.212 +	{
   1.213 +		errno = EINVAL;
   1.214 +		return -1;
   1.215 +	}
   1.216 +	
   1.217 +	//following characters are illegal
   1.218 +	//see http://www.unicode.org/faq/utf_bom.html#40
   1.219 +	if(aSrc == 0xFFFE || aSrc == 0xFFFF || (aSrc >= 0xFDD0 && aSrc <= 0xFDEF) )
   1.220 +	{
   1.221 +		errno = EILSEQ;
   1.222 +		return -1;
   1.223 +	}
   1.224 +	
   1.225 +			
   1.226 +	if(ps->__count == _EUTF16InitialState)
   1.227 +	{
   1.228 +	
   1.229 +		//following characters in addition are illegal in initial state
   1.230 +		//see http://www.unicode.org/faq/utf_bom.html#40
   1.231 +		if((aSrc >= 0xDC00 && aSrc <= 0xDFFF) )
   1.232 +		{
   1.233 +			errno = EILSEQ;
   1.234 +			return -1;
   1.235 +		}
   1.236 +
   1.237 +	
   1.238 +		if ((aSrc & 0xff80)==0x0000)
   1.239 +		{
   1.240 +			if(aLen >= 1)
   1.241 +			{
   1.242 +				*dst++ = static_cast<TUint8>(aSrc);
   1.243 +				retval = 1;
   1.244 +			}
   1.245 +			else
   1.246 +			{
   1.247 +				return -2;
   1.248 +			}
   1.249 +			
   1.250 +		}
   1.251 +		else if ((aSrc & 0xf800)==0x0000)
   1.252 +		{
   1.253 +			if (aLen >= 2)
   1.254 +			{
   1.255 +				*dst++ = static_cast<TUint8>(0xc0|(aSrc>>6));
   1.256 +				*dst++ = static_cast<TUint8> (0x80|(aSrc&0x3f));
   1.257 +				retval = 2;
   1.258 +			}
   1.259 +			else
   1.260 +			{
   1.261 +				return -2;
   1.262 +			}
   1.263 +		}
   1.264 +		else if ((aSrc & 0xfc00)==0xd800)
   1.265 +		{
   1.266 +			 ps->__value.lead = aSrc;
   1.267 +		 	 ps->__count = _EUTF16_21BitExtensionState;
   1.268 +		 	retval = 0; //nothing written out just yet
   1.269 +		}
   1.270 +		else
   1.271 +		{
   1.272 +			if ( aLen >= 3)
   1.273 +			{
   1.274 +				*dst++ = static_cast<TUint8>(0xe0|(aSrc>>12));
   1.275 +				*dst++ = static_cast<TUint8>(0x80|((aSrc>>6)&0x3f));
   1.276 +				*dst++ = static_cast<TUint8>(0x80|(aSrc&0x3f));
   1.277 +				retval = 3;
   1.278 +			}
   1.279 +			else
   1.280 +			{
   1.281 +				return -2;
   1.282 +			}
   1.283 +		}
   1.284 +		
   1.285 +		
   1.286 +	}
   1.287 +	else //ps->__count == _EUCS2_21BitExtensionState)
   1.288 +	{
   1.289 +		//characters outside this range are illegal in this state
   1.290 +		//see http://www.unicode.org/faq/utf_bom.html#40
   1.291 +		if((aSrc < 0xDC00 || aSrc > 0xDFFF) )
   1.292 +		{
   1.293 +			errno = EILSEQ;
   1.294 +			return -1;
   1.295 +		}
   1.296 +		
   1.297 +		if ((aSrc & 0xfc00)!=0xdc00)
   1.298 +		{
   1.299 +			errno = EILSEQ;
   1.300 +			return -1;
   1.301 +		}
   1.302 +		if ( aLen >= 4)
   1.303 +		{
   1.304 +			//snippet taken from unicode faq
   1.305 +			//http://www.unicode.org/faq/utf_bom.html#39
   1.306 +			
   1.307 +			unsigned long  codepoint = (ps->__value.lead << 10) + aSrc + KSURROGATE_OFFSET;
   1.308 +			
   1.309 +			*dst++ = static_cast<TUint8>( 0xf0|(codepoint>>18));
   1.310 +			*dst++ = static_cast<TUint8>(0x80|((codepoint>>12)&0x3f));
   1.311 +			*dst++ = static_cast<TUint8>(0x80|((codepoint>>6)&0x3f));
   1.312 +			*dst++ = static_cast<TUint8>(0x80|(codepoint&0x3f));
   1.313 +			retval = 4;
   1.314 +		}
   1.315 +		else
   1.316 +		{
   1.317 +			return -2;
   1.318 +		}
   1.319 +		ps->__count = _EUTF16InitialState;
   1.320 +	}
   1.321 +	return retval;
   1.322 +	
   1.323 +	
   1.324 +}//end of function
   1.325 +