os/ossrv/genericopenlibs/openenvcore/libc/src/charcnv.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 // Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
     2 // All rights reserved.
     3 // This component and the accompanying materials are made available
     4 // under the terms of "Eclipse Public License v1.0"
     5 // which accompanies this distribution, and is available
     6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
     7 //
     8 // Initial Contributors:
     9 // Nokia Corporation - initial contribution.
    10 //
    11 // Contributors:
    12 //
    13 // Description:
    14 // Name        : MRT_WCHARCNVT.CPP
    15 // Part of     : MRT LIBC
    16 // Contains the source for the helper functions used by wchar 
    17 // restartable conversion API's in libc
    18 // Version     : 1.0
    19 //
    20 
    21 
    22 
    23 // Copyright (c) 1997-2003 Symbian Ltd.  All rights reserved.
    24 
    25 // system includes
    26 #include <e32std.h>
    27 #include <e32base.h>
    28 #include <utf.h>
    29 #include <stdlib.h>
    30 #include <string.h>
    31 #include <errno.h>
    32 #include <wchar.h>
    33 
    34 #include "wcharcnv.h"
    35 
    36 #define  KSURROGATE_OFFSET 0x10000 - (0xD800 << 10) - 0xDC00
    37 
    38 //-----------------------------------------------------------------------------
    39 //Function Name : TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const 
    40 //                           TDesC8& aUtf8, mbstate_t *state)
    41 //Description   : Converts the unicode to UTF8 
    42 //Return Value  : The number of unconverted bytes left at the end of the input
    43 //descriptor, or one of the error values defined in TError.
    44 //-----------------------------------------------------------------------------
    45 TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, mbstate_t *state)
    46 {
    47 	aUnicode.SetLength(0);
    48 	if (aUtf8.Length()==0)
    49 	{
    50 		return 0;
    51 	}
    52 	if (aUnicode.MaxLength()==0)
    53 	{
    54 		return aUtf8.Length();
    55 	}
    56 	
    57 	HBufC8* utf8 = NULL;
    58 	if ( state->__count > 0)
    59         {
    60 	        // state have some information, use that.
    61 	        utf8 = HBufC8::NewLC ( state->__count + aUtf8.Length() );
    62 	        TPtr8 tempBuf = utf8->Des();
    63 	        TPtr8 temp ((TUint8*)state->__value.__wchb, state->__count);
    64 	        tempBuf.Copy(temp);
    65 	        tempBuf.Append(aUtf8);
    66         }
    67     
    68 	TUint16* pointerToCurrentUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr());
    69 	const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.MaxLength()-1);
    70 	const TUint8* pointerToCurrentUtf8Byte= utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
    71 	const TUint8* pointerToPendingUtf8Byte=utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
    72 	TInt length = utf8 ? utf8->Des().Length() : aUtf8.Length();
    73 	const TUint8* pointerToLastUtf8Byte=pointerToCurrentUtf8Byte+(length-1);
    74 	TUint16 replacementcharacter = 0xFFFD;
    75 	TUint8 currentUtf8Byte;
    76 	TUint currentUnicodeCharacter;
    77 	TInt sequenceLength;		
    78 	
    79 	
    80 	FOREVER
    81 	{
    82 		currentUtf8Byte=*pointerToCurrentUtf8Byte;
    83 		pointerToPendingUtf8Byte = pointerToCurrentUtf8Byte;
    84 		sequenceLength=100;
    85         
    86 		for(TInt i=0;i<7;i++)
    87 		{
    88 			if ((currentUtf8Byte&(0xf8<<i))==(static_cast<TUint8>(0xF0<<i)))
    89 			{
    90 				sequenceLength = 4-i;
    91 				break;
    92 			}
    93 		}
    94 
    95 		if ((sequenceLength<2 || sequenceLength>6) && sequenceLength!=0)
    96 		{
    97 			currentUnicodeCharacter=replacementcharacter;
    98 		}
    99 		else
   100 		{		
   101 			if ((pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1)<sequenceLength)
   102 			{
   103 				// we dnt have enough UTF-8 bytes to complete the Muti-Byte character.
   104 				// store the character within the state.
   105 				state->__count = 0;
   106             			while (pointerToCurrentUtf8Byte <= pointerToLastUtf8Byte)
   107         		        {
   108             			        state->__value.__wchb[state->__count++] = *(pointerToCurrentUtf8Byte++);
   109 		                }
   110             			// reset the current pointer
   111     	        		pointerToCurrentUtf8Byte -= state->__count;
   112 				if((pointerToCurrentUnicodeCharacter-aUnicode.Ptr())==0)
   113 			        {
   114 				        // still nothing is decoded.
   115 				        if ( utf8 )
   116 			                {
   117 	        			        CleanupStack::PopAndDestroy(); // utf8
   118 			                }
   119 				        return -2;
   120 				        //return -1;
   121 				}
   122         			// something is already decoded, so return the no of bytes that use for 
   123 	        		// decoding.
   124 		        	break;
   125 		        }			
   126 		    
   127                         // reset the state				
   128                     	state->__count = 0;
   129 			currentUnicodeCharacter = currentUtf8Byte&(0x7F>>sequenceLength);
   130 			
   131         		for(TInt i=sequenceLength;i>1; i--)
   132 	                {
   133         			currentUtf8Byte = *(++pointerToCurrentUtf8Byte);
   134         			if ((currentUtf8Byte&0xc0)==0x80)
   135         			{
   136 	       				currentUnicodeCharacter = (currentUnicodeCharacter<<6)|(currentUtf8Byte&0x3F);
   137         			}
   138         			else
   139         			{
   140         				// Encoding error occured.
   141         				// store the contained within the state and return -1.
   142         				// set the error EILSEQ to errno
   143                 		        if ( utf8 )
   144                			        {
   145                 			        CleanupStack::PopAndDestroy(); // utf8
   146                			        }
   147         				errno = EILSEQ;
   148                				return -1;
   149         				//currentUnicodeCharacter=replacementcharacter;
   150         				//--pointerToCurrentUtf8Byte;
   151         			}
   152 	       		}
   153 	        }
   154 			
   155        		if (currentUnicodeCharacter > 0xFFFF)
   156        		{
   157 	        	if(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter)
   158 	        	{
   159         			// unicode descriptor dnt have 2 wchar bytes to hold the data.
   160 	       			pointerToCurrentUtf8Byte=pointerToPendingUtf8Byte;
   161 	        		break;
   162 	        	}
   163 		
   164         		TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
   165 	       		*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);			
   166 	        	++pointerToCurrentUnicodeCharacter;
   167 				
   168 		        surrogate = (currentUnicodeCharacter&0x3FF)+0xDC00;
   169         		*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);			
   170 	       		++pointerToCurrentUnicodeCharacter;
   171 	        	++pointerToCurrentUtf8Byte;
   172 	        }
   173         	else
   174 		{
   175 	        	*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(currentUnicodeCharacter);
   176 		        ++pointerToCurrentUnicodeCharacter;
   177         		++pointerToCurrentUtf8Byte;
   178 		}
   179 	        
   180 	        if ((pointerToCurrentUtf8Byte>pointerToLastUtf8Byte) || (pointerToCurrentUnicodeCharacter>pointerToLastUnicodeCharacter))
   181 	        {
   182         		// checking the boundary condition.
   183         		// Here either the UTF-8 or Unicode descriptor reached to the end.
   184         		break;
   185         	}
   186 	} // forever
   187         // decoding finished.
   188 	aUnicode.SetLength(pointerToCurrentUnicodeCharacter-aUnicode.Ptr());
   189         if ( utf8 )
   190         {
   191                 CleanupStack::PopAndDestroy(); // utf8
   192         }
   193        	//return pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1;
   194 	// returns the number of bytes used to complete a valid multibyte character.
   195 	return pointerToCurrentUtf8Byte - aUtf8.Ptr();
   196 } //end of function
   197 
   198 //-----------------------------------------------------------------------------
   199 //Function Name : TInt _Utf16ToUtf8(char* aDst, wchar_t aSrc, mbstate_t* ps, int aLen )
   200 //Description   : Converts wide char in UCS2 format to UTF8 equivalent
   201 //Return Value  : The number of bytes converted, 0 if L'\0\' was translated, -1 on
   202 //generic error and errno set appropriately, -2 if len is not sufficient to store aSrc wide char
   203 //-----------------------------------------------------------------------------
   204 TInt _Utf16ToUtf8(char* dst, wchar_t aSrc, mbstate_t* ps, int aLen)
   205 {
   206 	int retval = 0;
   207 	// check the state 
   208 	if(ps->__count !=_EUTF16InitialState && ps->__count != _EUTF16_21BitExtensionState)
   209 	{
   210 		errno = EINVAL;
   211 		return -1;
   212 	}
   213 	
   214 	//following characters are illegal
   215 	//see http://www.unicode.org/faq/utf_bom.html#40
   216 	if(aSrc == 0xFFFE || aSrc == 0xFFFF || (aSrc >= 0xFDD0 && aSrc <= 0xFDEF) )
   217 	{
   218 		errno = EILSEQ;
   219 		return -1;
   220 	}
   221 	
   222 			
   223 	if(ps->__count == _EUTF16InitialState)
   224 	{
   225 	
   226 		//following characters in addition are illegal in initial state
   227 		//see http://www.unicode.org/faq/utf_bom.html#40
   228 		if((aSrc >= 0xDC00 && aSrc <= 0xDFFF) )
   229 		{
   230 			errno = EILSEQ;
   231 			return -1;
   232 		}
   233 
   234 	
   235 		if ((aSrc & 0xff80)==0x0000)
   236 		{
   237 			if(aLen >= 1)
   238 			{
   239 				*dst++ = static_cast<TUint8>(aSrc);
   240 				retval = 1;
   241 			}
   242 			else
   243 			{
   244 				return -2;
   245 			}
   246 			
   247 		}
   248 		else if ((aSrc & 0xf800)==0x0000)
   249 		{
   250 			if (aLen >= 2)
   251 			{
   252 				*dst++ = static_cast<TUint8>(0xc0|(aSrc>>6));
   253 				*dst++ = static_cast<TUint8> (0x80|(aSrc&0x3f));
   254 				retval = 2;
   255 			}
   256 			else
   257 			{
   258 				return -2;
   259 			}
   260 		}
   261 		else if ((aSrc & 0xfc00)==0xd800)
   262 		{
   263 			 ps->__value.lead = aSrc;
   264 		 	 ps->__count = _EUTF16_21BitExtensionState;
   265 		 	retval = 0; //nothing written out just yet
   266 		}
   267 		else
   268 		{
   269 			if ( aLen >= 3)
   270 			{
   271 				*dst++ = static_cast<TUint8>(0xe0|(aSrc>>12));
   272 				*dst++ = static_cast<TUint8>(0x80|((aSrc>>6)&0x3f));
   273 				*dst++ = static_cast<TUint8>(0x80|(aSrc&0x3f));
   274 				retval = 3;
   275 			}
   276 			else
   277 			{
   278 				return -2;
   279 			}
   280 		}
   281 		
   282 		
   283 	}
   284 	else //ps->__count == _EUCS2_21BitExtensionState)
   285 	{
   286 		//characters outside this range are illegal in this state
   287 		//see http://www.unicode.org/faq/utf_bom.html#40
   288 		if((aSrc < 0xDC00 || aSrc > 0xDFFF) )
   289 		{
   290 			errno = EILSEQ;
   291 			return -1;
   292 		}
   293 		
   294 		if ((aSrc & 0xfc00)!=0xdc00)
   295 		{
   296 			errno = EILSEQ;
   297 			return -1;
   298 		}
   299 		if ( aLen >= 4)
   300 		{
   301 			//snippet taken from unicode faq
   302 			//http://www.unicode.org/faq/utf_bom.html#39
   303 			
   304 			unsigned long  codepoint = (ps->__value.lead << 10) + aSrc + KSURROGATE_OFFSET;
   305 			
   306 			*dst++ = static_cast<TUint8>( 0xf0|(codepoint>>18));
   307 			*dst++ = static_cast<TUint8>(0x80|((codepoint>>12)&0x3f));
   308 			*dst++ = static_cast<TUint8>(0x80|((codepoint>>6)&0x3f));
   309 			*dst++ = static_cast<TUint8>(0x80|(codepoint&0x3f));
   310 			retval = 4;
   311 		}
   312 		else
   313 		{
   314 			return -2;
   315 		}
   316 		ps->__count = _EUTF16InitialState;
   317 	}
   318 	return retval;
   319 	
   320 	
   321 }//end of function
   322