os/ossrv/genericopenlibs/openenvcore/libc/src/charcnv.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
// Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     2
// All rights reserved.
sl@0
     3
// This component and the accompanying materials are made available
sl@0
     4
// under the terms of "Eclipse Public License v1.0"
sl@0
     5
// which accompanies this distribution, and is available
sl@0
     6
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     7
//
sl@0
     8
// Initial Contributors:
sl@0
     9
// Nokia Corporation - initial contribution.
sl@0
    10
//
sl@0
    11
// Contributors:
sl@0
    12
//
sl@0
    13
// Description:
sl@0
    14
// Name        : MRT_WCHARCNVT.CPP
sl@0
    15
// Part of     : MRT LIBC
sl@0
    16
// Contains the source for the helper functions used by wchar 
sl@0
    17
// restartable conversion API's in libc
sl@0
    18
// Version     : 1.0
sl@0
    19
//
sl@0
    20
sl@0
    21
sl@0
    22
sl@0
    23
// Copyright (c) 1997-2003 Symbian Ltd.  All rights reserved.
sl@0
    24
sl@0
    25
// system includes
sl@0
    26
#include <e32std.h>
sl@0
    27
#include <e32base.h>
sl@0
    28
#include <utf.h>
sl@0
    29
#include <stdlib.h>
sl@0
    30
#include <string.h>
sl@0
    31
#include <errno.h>
sl@0
    32
#include <wchar.h>
sl@0
    33
sl@0
    34
#include "wcharcnv.h"
sl@0
    35
sl@0
    36
// Constant term of the UTF-16 surrogate-pair decoding formula
//   codepoint = (lead << 10) + trail + KSURROGATE_OFFSET
// The expansion is parenthesised so that it behaves as a single value in any
// expression (e.g. after '-' or '*'), not only at the end of an addition chain.
#define  KSURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
sl@0
    37
sl@0
    38
//-----------------------------------------------------------------------------
sl@0
    39
//Function Name : TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const 
sl@0
    40
//                           TDesC8& aUtf8, mbstate_t *state)
sl@0
    41
//Description   : Converts UTF-8 encoded bytes to Unicode (UTF-16)
sl@0
    42
//Return Value  : The number of input bytes consumed, -1 on an encoding error
sl@0
    43
//(errno set to EILSEQ), or -2 if the input ends inside a multi-byte sequence.
sl@0
    44
//-----------------------------------------------------------------------------
sl@0
    45
TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, mbstate_t *state)
sl@0
    46
{
sl@0
    47
	aUnicode.SetLength(0);
sl@0
    48
	if (aUtf8.Length()==0)
sl@0
    49
	{
sl@0
    50
		return 0;
sl@0
    51
	}
sl@0
    52
	if (aUnicode.MaxLength()==0)
sl@0
    53
	{
sl@0
    54
		return aUtf8.Length();
sl@0
    55
	}
sl@0
    56
	
sl@0
    57
	HBufC8* utf8 = NULL;
sl@0
    58
	if ( state->__count > 0)
sl@0
    59
        {
sl@0
    60
	        // state have some information, use that.
sl@0
    61
	        utf8 = HBufC8::NewLC ( state->__count + aUtf8.Length() );
sl@0
    62
	        TPtr8 tempBuf = utf8->Des();
sl@0
    63
	        TPtr8 temp ((TUint8*)state->__value.__wchb, state->__count);
sl@0
    64
	        tempBuf.Copy(temp);
sl@0
    65
	        tempBuf.Append(aUtf8);
sl@0
    66
        }
sl@0
    67
    
sl@0
    68
	TUint16* pointerToCurrentUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr());
sl@0
    69
	const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.MaxLength()-1);
sl@0
    70
	const TUint8* pointerToCurrentUtf8Byte= utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
sl@0
    71
	const TUint8* pointerToPendingUtf8Byte=utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
sl@0
    72
	TInt length = utf8 ? utf8->Des().Length() : aUtf8.Length();
sl@0
    73
	const TUint8* pointerToLastUtf8Byte=pointerToCurrentUtf8Byte+(length-1);
sl@0
    74
	TUint16 replacementcharacter = 0xFFFD;
sl@0
    75
	TUint8 currentUtf8Byte;
sl@0
    76
	TUint currentUnicodeCharacter;
sl@0
    77
	TInt sequenceLength;		
sl@0
    78
	
sl@0
    79
	
sl@0
    80
	FOREVER
sl@0
    81
	{
sl@0
    82
		currentUtf8Byte=*pointerToCurrentUtf8Byte;
sl@0
    83
		pointerToPendingUtf8Byte = pointerToCurrentUtf8Byte;
sl@0
    84
		sequenceLength=100;
sl@0
    85
        
sl@0
    86
		for(TInt i=0;i<7;i++)
sl@0
    87
		{
sl@0
    88
			if ((currentUtf8Byte&(0xf8<<i))==(static_cast<TUint8>(0xF0<<i)))
sl@0
    89
			{
sl@0
    90
				sequenceLength = 4-i;
sl@0
    91
				break;
sl@0
    92
			}
sl@0
    93
		}
sl@0
    94
sl@0
    95
		if ((sequenceLength<2 || sequenceLength>6) && sequenceLength!=0)
sl@0
    96
		{
sl@0
    97
			currentUnicodeCharacter=replacementcharacter;
sl@0
    98
		}
sl@0
    99
		else
sl@0
   100
		{		
sl@0
   101
			if ((pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1)<sequenceLength)
sl@0
   102
			{
sl@0
   103
				// we dnt have enough UTF-8 bytes to complete the Muti-Byte character.
sl@0
   104
				// store the character within the state.
sl@0
   105
				state->__count = 0;
sl@0
   106
            			while (pointerToCurrentUtf8Byte <= pointerToLastUtf8Byte)
sl@0
   107
        		        {
sl@0
   108
            			        state->__value.__wchb[state->__count++] = *(pointerToCurrentUtf8Byte++);
sl@0
   109
		                }
sl@0
   110
            			// reset the current pointer
sl@0
   111
    	        		pointerToCurrentUtf8Byte -= state->__count;
sl@0
   112
				if((pointerToCurrentUnicodeCharacter-aUnicode.Ptr())==0)
sl@0
   113
			        {
sl@0
   114
				        // still nothing is decoded.
sl@0
   115
				        if ( utf8 )
sl@0
   116
			                {
sl@0
   117
	        			        CleanupStack::PopAndDestroy(); // utf8
sl@0
   118
			                }
sl@0
   119
				        return -2;
sl@0
   120
				        //return -1;
sl@0
   121
				}
sl@0
   122
        			// something is already decoded, so return the no of bytes that use for 
sl@0
   123
	        		// decoding.
sl@0
   124
		        	break;
sl@0
   125
		        }			
sl@0
   126
		    
sl@0
   127
                        // reset the state				
sl@0
   128
                    	state->__count = 0;
sl@0
   129
			currentUnicodeCharacter = currentUtf8Byte&(0x7F>>sequenceLength);
sl@0
   130
			
sl@0
   131
        		for(TInt i=sequenceLength;i>1; i--)
sl@0
   132
	                {
sl@0
   133
        			currentUtf8Byte = *(++pointerToCurrentUtf8Byte);
sl@0
   134
        			if ((currentUtf8Byte&0xc0)==0x80)
sl@0
   135
        			{
sl@0
   136
	       				currentUnicodeCharacter = (currentUnicodeCharacter<<6)|(currentUtf8Byte&0x3F);
sl@0
   137
        			}
sl@0
   138
        			else
sl@0
   139
        			{
sl@0
   140
        				// Encoding error occured.
sl@0
   141
        				// store the contained within the state and return -1.
sl@0
   142
        				// set the error EILSEQ to errno
sl@0
   143
                		        if ( utf8 )
sl@0
   144
               			        {
sl@0
   145
                			        CleanupStack::PopAndDestroy(); // utf8
sl@0
   146
               			        }
sl@0
   147
        				errno = EILSEQ;
sl@0
   148
               				return -1;
sl@0
   149
        				//currentUnicodeCharacter=replacementcharacter;
sl@0
   150
        				//--pointerToCurrentUtf8Byte;
sl@0
   151
        			}
sl@0
   152
	       		}
sl@0
   153
	        }
sl@0
   154
			
sl@0
   155
       		if (currentUnicodeCharacter > 0xFFFF)
sl@0
   156
       		{
sl@0
   157
	        	if(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter)
sl@0
   158
	        	{
sl@0
   159
        			// unicode descriptor dnt have 2 wchar bytes to hold the data.
sl@0
   160
	       			pointerToCurrentUtf8Byte=pointerToPendingUtf8Byte;
sl@0
   161
	        		break;
sl@0
   162
	        	}
sl@0
   163
		
sl@0
   164
        		TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
sl@0
   165
	       		*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);			
sl@0
   166
	        	++pointerToCurrentUnicodeCharacter;
sl@0
   167
				
sl@0
   168
		        surrogate = (currentUnicodeCharacter&0x3FF)+0xDC00;
sl@0
   169
        		*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);			
sl@0
   170
	       		++pointerToCurrentUnicodeCharacter;
sl@0
   171
	        	++pointerToCurrentUtf8Byte;
sl@0
   172
	        }
sl@0
   173
        	else
sl@0
   174
		{
sl@0
   175
	        	*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(currentUnicodeCharacter);
sl@0
   176
		        ++pointerToCurrentUnicodeCharacter;
sl@0
   177
        		++pointerToCurrentUtf8Byte;
sl@0
   178
		}
sl@0
   179
	        
sl@0
   180
	        if ((pointerToCurrentUtf8Byte>pointerToLastUtf8Byte) || (pointerToCurrentUnicodeCharacter>pointerToLastUnicodeCharacter))
sl@0
   181
	        {
sl@0
   182
        		// checking the boundary condition.
sl@0
   183
        		// Here either the UTF-8 or Unicode descriptor reached to the end.
sl@0
   184
        		break;
sl@0
   185
        	}
sl@0
   186
	} // forever
sl@0
   187
        // decoding finished.
sl@0
   188
	aUnicode.SetLength(pointerToCurrentUnicodeCharacter-aUnicode.Ptr());
sl@0
   189
        if ( utf8 )
sl@0
   190
        {
sl@0
   191
                CleanupStack::PopAndDestroy(); // utf8
sl@0
   192
        }
sl@0
   193
       	//return pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1;
sl@0
   194
	// returns the number of bytes used to complete a valid multibyte character.
sl@0
   195
	return pointerToCurrentUtf8Byte - aUtf8.Ptr();
sl@0
   196
} //end of function
sl@0
   197
sl@0
   198
//-----------------------------------------------------------------------------
sl@0
   199
//Function Name : TInt _Utf16ToUtf8(char* aDst, wchar_t aSrc, mbstate_t* ps, int aLen )
sl@0
   200
//Description   : Converts wide char in UCS2 format to UTF8 equivalent
sl@0
   201
//Return Value  : The number of bytes written to dst, 0 if aSrc was a lead surrogate that was only stored in ps, -1 on
sl@0
   202
//generic error and errno set appropriately, -2 if len is not sufficient to store aSrc wide char
sl@0
   203
//-----------------------------------------------------------------------------
sl@0
   204
TInt _Utf16ToUtf8(char* dst, wchar_t aSrc, mbstate_t* ps, int aLen)
sl@0
   205
{
sl@0
   206
	int retval = 0;
sl@0
   207
	// check the state 
sl@0
   208
	if(ps->__count !=_EUTF16InitialState && ps->__count != _EUTF16_21BitExtensionState)
sl@0
   209
	{
sl@0
   210
		errno = EINVAL;
sl@0
   211
		return -1;
sl@0
   212
	}
sl@0
   213
	
sl@0
   214
	//following characters are illegal
sl@0
   215
	//see http://www.unicode.org/faq/utf_bom.html#40
sl@0
   216
	if(aSrc == 0xFFFE || aSrc == 0xFFFF || (aSrc >= 0xFDD0 && aSrc <= 0xFDEF) )
sl@0
   217
	{
sl@0
   218
		errno = EILSEQ;
sl@0
   219
		return -1;
sl@0
   220
	}
sl@0
   221
	
sl@0
   222
			
sl@0
   223
	if(ps->__count == _EUTF16InitialState)
sl@0
   224
	{
sl@0
   225
	
sl@0
   226
		//following characters in addition are illegal in initial state
sl@0
   227
		//see http://www.unicode.org/faq/utf_bom.html#40
sl@0
   228
		if((aSrc >= 0xDC00 && aSrc <= 0xDFFF) )
sl@0
   229
		{
sl@0
   230
			errno = EILSEQ;
sl@0
   231
			return -1;
sl@0
   232
		}
sl@0
   233
sl@0
   234
	
sl@0
   235
		if ((aSrc & 0xff80)==0x0000)
sl@0
   236
		{
sl@0
   237
			if(aLen >= 1)
sl@0
   238
			{
sl@0
   239
				*dst++ = static_cast<TUint8>(aSrc);
sl@0
   240
				retval = 1;
sl@0
   241
			}
sl@0
   242
			else
sl@0
   243
			{
sl@0
   244
				return -2;
sl@0
   245
			}
sl@0
   246
			
sl@0
   247
		}
sl@0
   248
		else if ((aSrc & 0xf800)==0x0000)
sl@0
   249
		{
sl@0
   250
			if (aLen >= 2)
sl@0
   251
			{
sl@0
   252
				*dst++ = static_cast<TUint8>(0xc0|(aSrc>>6));
sl@0
   253
				*dst++ = static_cast<TUint8> (0x80|(aSrc&0x3f));
sl@0
   254
				retval = 2;
sl@0
   255
			}
sl@0
   256
			else
sl@0
   257
			{
sl@0
   258
				return -2;
sl@0
   259
			}
sl@0
   260
		}
sl@0
   261
		else if ((aSrc & 0xfc00)==0xd800)
sl@0
   262
		{
sl@0
   263
			 ps->__value.lead = aSrc;
sl@0
   264
		 	 ps->__count = _EUTF16_21BitExtensionState;
sl@0
   265
		 	retval = 0; //nothing written out just yet
sl@0
   266
		}
sl@0
   267
		else
sl@0
   268
		{
sl@0
   269
			if ( aLen >= 3)
sl@0
   270
			{
sl@0
   271
				*dst++ = static_cast<TUint8>(0xe0|(aSrc>>12));
sl@0
   272
				*dst++ = static_cast<TUint8>(0x80|((aSrc>>6)&0x3f));
sl@0
   273
				*dst++ = static_cast<TUint8>(0x80|(aSrc&0x3f));
sl@0
   274
				retval = 3;
sl@0
   275
			}
sl@0
   276
			else
sl@0
   277
			{
sl@0
   278
				return -2;
sl@0
   279
			}
sl@0
   280
		}
sl@0
   281
		
sl@0
   282
		
sl@0
   283
	}
sl@0
   284
	else //ps->__count == _EUCS2_21BitExtensionState)
sl@0
   285
	{
sl@0
   286
		//characters outside this range are illegal in this state
sl@0
   287
		//see http://www.unicode.org/faq/utf_bom.html#40
sl@0
   288
		if((aSrc < 0xDC00 || aSrc > 0xDFFF) )
sl@0
   289
		{
sl@0
   290
			errno = EILSEQ;
sl@0
   291
			return -1;
sl@0
   292
		}
sl@0
   293
		
sl@0
   294
		if ((aSrc & 0xfc00)!=0xdc00)
sl@0
   295
		{
sl@0
   296
			errno = EILSEQ;
sl@0
   297
			return -1;
sl@0
   298
		}
sl@0
   299
		if ( aLen >= 4)
sl@0
   300
		{
sl@0
   301
			//snippet taken from unicode faq
sl@0
   302
			//http://www.unicode.org/faq/utf_bom.html#39
sl@0
   303
			
sl@0
   304
			unsigned long  codepoint = (ps->__value.lead << 10) + aSrc + KSURROGATE_OFFSET;
sl@0
   305
			
sl@0
   306
			*dst++ = static_cast<TUint8>( 0xf0|(codepoint>>18));
sl@0
   307
			*dst++ = static_cast<TUint8>(0x80|((codepoint>>12)&0x3f));
sl@0
   308
			*dst++ = static_cast<TUint8>(0x80|((codepoint>>6)&0x3f));
sl@0
   309
			*dst++ = static_cast<TUint8>(0x80|(codepoint&0x3f));
sl@0
   310
			retval = 4;
sl@0
   311
		}
sl@0
   312
		else
sl@0
   313
		{
sl@0
   314
			return -2;
sl@0
   315
		}
sl@0
   316
		ps->__count = _EUTF16InitialState;
sl@0
   317
	}
sl@0
   318
	return retval;
sl@0
   319
	
sl@0
   320
	
sl@0
   321
}//end of function
sl@0
   322