sl@0: // Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0: // All rights reserved.
sl@0: // This component and the accompanying materials are made available
sl@0: // under the terms of "Eclipse Public License v1.0"
sl@0: // which accompanies this distribution, and is available
sl@0: // at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0: //
sl@0: // Initial Contributors:
sl@0: // Nokia Corporation - initial contribution.
sl@0: //
sl@0: // Contributors:
sl@0: //
sl@0: // Description:
sl@0: // Name        : MRT_WCHARCNVT.CPP
sl@0: // Part of     : MRT LIBC
sl@0: // Contains the source for the helper functions used by wchar 
sl@0: // restartable conversion API's in libc
sl@0: // Version     : 1.0
sl@0: //
sl@0: 
sl@0: 
sl@0: 
sl@0: // Copyright (c) 1997-2003 Symbian Ltd.  All rights reserved.
sl@0: 
sl@0: // system includes
sl@0: #include <e32std.h>
sl@0: #include <e32base.h>
sl@0: #include <utf.h>
sl@0: #include <stdlib.h>
sl@0: #include <string.h>
sl@0: #include <errno.h>
sl@0: #include <wchar.h>
sl@0: 
sl@0: #include "wcharcnv.h"
sl@0: 
sl@0: // Constant term used when a UTF-16 surrogate pair is combined into a code point
sl@0: // as (lead << 10) + trail + KSURROGATE_OFFSET.
sl@0: // The body is parenthesised so the macro expands as one value in any expression.
sl@0: #define  KSURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
sl@0: 
sl@0: //-----------------------------------------------------------------------------
sl@0: //Function Name : TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const 
sl@0: //                           TDesC8& aUtf8, mbstate_t *state)
sl@0: //Description   : Converts UTF-8 text to UTF-16. Code points above 0xFFFF are
sl@0: //written as a surrogate pair. The bytes of an incomplete trailing multi-byte
sl@0: //sequence are saved in *state and are prepended to aUtf8 on the next call.
sl@0: //Parameters    : aUnicode - receives the converted text; its length is reset
sl@0: //                           to 0 on entry and set to the units written on
sl@0: //                           normal return.
sl@0: //                aUtf8    - the UTF-8 input bytes.
sl@0: //                state    - conversion state, dereferenced unconditionally.
sl@0: //Return Value  : The number of bytes of aUtf8 that were consumed
sl@0: //(0 if aUtf8 is empty, aUtf8.Length() if aUnicode has no capacity),
sl@0: //-1 with errno set to EILSEQ if a continuation byte is malformed,
sl@0: //-2 if the input ends inside a multi-byte sequence before any character
sl@0: //could be decoded (the partial sequence is then held in *state).
sl@0: //-----------------------------------------------------------------------------
sl@0: TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, mbstate_t *state)
sl@0: {
sl@0: 	aUnicode.SetLength(0);
sl@0: 	if (aUtf8.Length()==0)
sl@0: 	{
sl@0: 		return 0;
sl@0: 	}
sl@0: 	if (aUnicode.MaxLength()==0)
sl@0: 	{
sl@0: 		return aUtf8.Length();
sl@0: 	}
sl@0: 	
sl@0: 	// Number of bytes carried over from an earlier call. Remembered here because
sl@0: 	// state->__count is cleared while decoding, yet is needed for the return value.
sl@0: 	const TInt pendingLength = (state->__count > 0) ? state->__count : 0;
sl@0: 	
sl@0: 	HBufC8* utf8 = NULL;
sl@0: 	if (pendingLength > 0)
sl@0: 	{
sl@0: 		// The state holds the start of a sequence: decode from "saved bytes + aUtf8".
sl@0: 		utf8 = HBufC8::NewLC(pendingLength + aUtf8.Length());
sl@0: 		TPtr8 combined = utf8->Des();
sl@0: 		// TPtrC8(ptr, length) is used on purpose: the two-argument TPtr8
sl@0: 		// constructor takes a *maximum* length and yields a zero-length
sl@0: 		// descriptor, which would silently drop the saved bytes.
sl@0: 		const TPtrC8 pending(reinterpret_cast<const TUint8*>(state->__value.__wchb), pendingLength);
sl@0: 		combined.Copy(pending);
sl@0: 		combined.Append(aUtf8);
sl@0: 	}
sl@0: 	
sl@0: 	// Start and length of the buffer that is actually decoded.
sl@0: 	const TUint8* const inputStart = utf8 ? utf8->Ptr() : aUtf8.Ptr();
sl@0: 	const TInt inputLength = utf8 ? utf8->Length() : aUtf8.Length();
sl@0: 	
sl@0: 	TUint16* pointerToCurrentUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr());
sl@0: 	const TUint16* const pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.MaxLength()-1);
sl@0: 	const TUint8* pointerToCurrentUtf8Byte=inputStart;
sl@0: 	const TUint8* const pointerToLastUtf8Byte=inputStart+(inputLength-1);
sl@0: 	const TUint16 replacementcharacter = 0xFFFD;
sl@0: 	
sl@0: 	FOREVER
sl@0: 	{
sl@0: 		TUint8 currentUtf8Byte=*pointerToCurrentUtf8Byte;
sl@0: 		// First byte of the sequence being decoded, so it can be "un-read".
sl@0: 		const TUint8* const pointerToPendingUtf8Byte=pointerToCurrentUtf8Byte;
sl@0: 		
sl@0: 		// Classify the lead byte by its high bits:
sl@0: 		//   11110xxx -> 4, 1110xxxx -> 3, 110xxxxx -> 2,
sl@0: 		//   10xxxxxx -> 1 (stray continuation byte), 0xxxxxxx -> 0 (ASCII),
sl@0: 		//   11111xxx -> -1 (not a valid lead byte).
sl@0: 		TInt sequenceLength=100;
sl@0: 		for(TInt i=0;i<7;i++)
sl@0: 		{
sl@0: 			if ((currentUtf8Byte&(0xf8<<i))==(static_cast<TUint8>(0xF0<<i)))
sl@0: 			{
sl@0: 				sequenceLength = 4-i;
sl@0: 				break;
sl@0: 			}
sl@0: 		}
sl@0: 		
sl@0: 		TUint currentUnicodeCharacter;
sl@0: 		if ((sequenceLength<2 || sequenceLength>6) && sequenceLength!=0)
sl@0: 		{
sl@0: 			// A byte that cannot start a character decodes to U+FFFD.
sl@0: 			currentUnicodeCharacter=replacementcharacter;
sl@0: 		}
sl@0: 		else
sl@0: 		{
sl@0: 			if ((pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1)<sequenceLength)
sl@0: 			{
sl@0: 				// Not enough bytes left to complete the multi-byte character:
sl@0: 				// keep what there is in the state for the next call.
sl@0: 				state->__count = 0;
sl@0: 				while (pointerToCurrentUtf8Byte <= pointerToLastUtf8Byte)
sl@0: 				{
sl@0: 					state->__value.__wchb[state->__count++] = *(pointerToCurrentUtf8Byte++);
sl@0: 				}
sl@0: 				// Step back to the start of the incomplete sequence.
sl@0: 				pointerToCurrentUtf8Byte -= state->__count;
sl@0: 				if (pointerToCurrentUnicodeCharacter==aUnicode.Ptr())
sl@0: 				{
sl@0: 					// Nothing has been decoded at all.
sl@0: 					if ( utf8 )
sl@0: 					{
sl@0: 						CleanupStack::PopAndDestroy(); // utf8
sl@0: 					}
sl@0: 					return -2;
sl@0: 				}
sl@0: 				// Something was decoded already: report the bytes used for that.
sl@0: 				break;
sl@0: 			}
sl@0: 			
sl@0: 			// A complete sequence is available, so the saved state is used up.
sl@0: 			state->__count = 0;
sl@0: 			currentUnicodeCharacter = currentUtf8Byte&(0x7F>>sequenceLength);
sl@0: 			
sl@0: 			for(TInt i=sequenceLength;i>1; i--)
sl@0: 			{
sl@0: 				currentUtf8Byte = *(++pointerToCurrentUtf8Byte);
sl@0: 				if ((currentUtf8Byte&0xc0)!=0x80)
sl@0: 				{
sl@0: 					// Not a continuation byte: encoding error.
sl@0: 					if ( utf8 )
sl@0: 					{
sl@0: 						CleanupStack::PopAndDestroy(); // utf8
sl@0: 					}
sl@0: 					errno = EILSEQ;
sl@0: 					return -1;
sl@0: 				}
sl@0: 				currentUnicodeCharacter = (currentUnicodeCharacter<<6)|(currentUtf8Byte&0x3F);
sl@0: 			}
sl@0: 		}
sl@0: 		
sl@0: 		if (currentUnicodeCharacter > 0xFFFF)
sl@0: 		{
sl@0: 			if(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter)
sl@0: 			{
sl@0: 				// No room for both halves of the surrogate pair:
sl@0: 				// leave the whole sequence unconsumed.
sl@0: 				pointerToCurrentUtf8Byte=pointerToPendingUtf8Byte;
sl@0: 				break;
sl@0: 			}
sl@0: 			
sl@0: 			// NOTE(review): values above 0x10FFFF are not rejected here and
sl@0: 			// would produce a lead unit outside 0xD800-0xDBFF - confirm whether
sl@0: 			// callers rely on this before tightening it.
sl@0: 			*pointerToCurrentUnicodeCharacter=static_cast<TUint16>((currentUnicodeCharacter>>10) + 0xD7C0);
sl@0: 			++pointerToCurrentUnicodeCharacter;
sl@0: 			*pointerToCurrentUnicodeCharacter=static_cast<TUint16>((currentUnicodeCharacter&0x3FF)+0xDC00);
sl@0: 			++pointerToCurrentUnicodeCharacter;
sl@0: 		}
sl@0: 		else
sl@0: 		{
sl@0: 			*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(currentUnicodeCharacter);
sl@0: 			++pointerToCurrentUnicodeCharacter;
sl@0: 		}
sl@0: 		++pointerToCurrentUtf8Byte;
sl@0: 		
sl@0: 		if ((pointerToCurrentUtf8Byte>pointerToLastUtf8Byte) || (pointerToCurrentUnicodeCharacter>pointerToLastUnicodeCharacter))
sl@0: 		{
sl@0: 			// Either the input or the output descriptor is exhausted.
sl@0: 			break;
sl@0: 		}
sl@0: 	} // forever
sl@0: 	
sl@0: 	// decoding finished.
sl@0: 	aUnicode.SetLength(pointerToCurrentUnicodeCharacter-aUnicode.Ptr());
sl@0: 	
sl@0: 	// Measure against the buffer that was really decoded, and do it before
sl@0: 	// that buffer is destroyed.
sl@0: 	const TInt consumed = pointerToCurrentUtf8Byte - inputStart;
sl@0: 	if ( utf8 )
sl@0: 	{
sl@0: 		CleanupStack::PopAndDestroy(); // utf8
sl@0: 	}
sl@0: 	
sl@0: 	if (consumed < pendingLength)
sl@0: 	{
sl@0: 		// The character that starts with the saved bytes was not consumed
sl@0: 		// (no room for its surrogate pair). The saved bytes were never
sl@0: 		// overwritten on this path, so hand them back to the state.
sl@0: 		state->__count = pendingLength;
sl@0: 		return 0;
sl@0: 	}
sl@0: 	// The saved bytes came from an earlier call, so they do not count as
sl@0: 	// bytes consumed from aUtf8.
sl@0: 	return consumed - pendingLength;
sl@0: } //end of function
sl@0: 
sl@0: //-----------------------------------------------------------------------------
sl@0: //Function Name : TInt _Utf16ToUtf8(char* aDst, wchar_t aSrc, mbstate_t* ps, int aLen )
sl@0: //Description   : Converts wide char in UCS2 format to UTF8 equivalent
sl@0: //Return Value  : The number of bytes converted, 0 if L'\0\' was translated, -1 on
sl@0: //generic error and errno set appropriately, -2 if len is not sufficient to store aSrc wide char
sl@0: //-----------------------------------------------------------------------------
sl@0: TInt _Utf16ToUtf8(char* dst, wchar_t aSrc, mbstate_t* ps, int aLen)
sl@0: {
sl@0: 	int retval = 0;
sl@0: 	// check the state 
sl@0: 	if(ps->__count !=_EUTF16InitialState && ps->__count != _EUTF16_21BitExtensionState)
sl@0: 	{
sl@0: 		errno = EINVAL;
sl@0: 		return -1;
sl@0: 	}
sl@0: 	
sl@0: 	//following characters are illegal
sl@0: 	//see http://www.unicode.org/faq/utf_bom.html#40
sl@0: 	if(aSrc == 0xFFFE || aSrc == 0xFFFF || (aSrc >= 0xFDD0 && aSrc <= 0xFDEF) )
sl@0: 	{
sl@0: 		errno = EILSEQ;
sl@0: 		return -1;
sl@0: 	}
sl@0: 	
sl@0: 			
sl@0: 	if(ps->__count == _EUTF16InitialState)
sl@0: 	{
sl@0: 	
sl@0: 		//following characters in addition are illegal in initial state
sl@0: 		//see http://www.unicode.org/faq/utf_bom.html#40
sl@0: 		if((aSrc >= 0xDC00 && aSrc <= 0xDFFF) )
sl@0: 		{
sl@0: 			errno = EILSEQ;
sl@0: 			return -1;
sl@0: 		}
sl@0: 
sl@0: 	
sl@0: 		if ((aSrc & 0xff80)==0x0000)
sl@0: 		{
sl@0: 			if(aLen >= 1)
sl@0: 			{
sl@0: 				*dst++ = static_cast<TUint8>(aSrc);
sl@0: 				retval = 1;
sl@0: 			}
sl@0: 			else
sl@0: 			{
sl@0: 				return -2;
sl@0: 			}
sl@0: 			
sl@0: 		}
sl@0: 		else if ((aSrc & 0xf800)==0x0000)
sl@0: 		{
sl@0: 			if (aLen >= 2)
sl@0: 			{
sl@0: 				*dst++ = static_cast<TUint8>(0xc0|(aSrc>>6));
sl@0: 				*dst++ = static_cast<TUint8> (0x80|(aSrc&0x3f));
sl@0: 				retval = 2;
sl@0: 			}
sl@0: 			else
sl@0: 			{
sl@0: 				return -2;
sl@0: 			}
sl@0: 		}
sl@0: 		else if ((aSrc & 0xfc00)==0xd800)
sl@0: 		{
sl@0: 			 ps->__value.lead = aSrc;
sl@0: 		 	 ps->__count = _EUTF16_21BitExtensionState;
sl@0: 		 	retval = 0; //nothing written out just yet
sl@0: 		}
sl@0: 		else
sl@0: 		{
sl@0: 			if ( aLen >= 3)
sl@0: 			{
sl@0: 				*dst++ = static_cast<TUint8>(0xe0|(aSrc>>12));
sl@0: 				*dst++ = static_cast<TUint8>(0x80|((aSrc>>6)&0x3f));
sl@0: 				*dst++ = static_cast<TUint8>(0x80|(aSrc&0x3f));
sl@0: 				retval = 3;
sl@0: 			}
sl@0: 			else
sl@0: 			{
sl@0: 				return -2;
sl@0: 			}
sl@0: 		}
sl@0: 		
sl@0: 		
sl@0: 	}
sl@0: 	else //ps->__count == _EUCS2_21BitExtensionState)
sl@0: 	{
sl@0: 		//characters outside this range are illegal in this state
sl@0: 		//see http://www.unicode.org/faq/utf_bom.html#40
sl@0: 		if((aSrc < 0xDC00 || aSrc > 0xDFFF) )
sl@0: 		{
sl@0: 			errno = EILSEQ;
sl@0: 			return -1;
sl@0: 		}
sl@0: 		
sl@0: 		if ((aSrc & 0xfc00)!=0xdc00)
sl@0: 		{
sl@0: 			errno = EILSEQ;
sl@0: 			return -1;
sl@0: 		}
sl@0: 		if ( aLen >= 4)
sl@0: 		{
sl@0: 			//snippet taken from unicode faq
sl@0: 			//http://www.unicode.org/faq/utf_bom.html#39
sl@0: 			
sl@0: 			unsigned long  codepoint = (ps->__value.lead << 10) + aSrc + KSURROGATE_OFFSET;
sl@0: 			
sl@0: 			*dst++ = static_cast<TUint8>( 0xf0|(codepoint>>18));
sl@0: 			*dst++ = static_cast<TUint8>(0x80|((codepoint>>12)&0x3f));
sl@0: 			*dst++ = static_cast<TUint8>(0x80|((codepoint>>6)&0x3f));
sl@0: 			*dst++ = static_cast<TUint8>(0x80|(codepoint&0x3f));
sl@0: 			retval = 4;
sl@0: 		}
sl@0: 		else
sl@0: 		{
sl@0: 			return -2;
sl@0: 		}
sl@0: 		ps->__count = _EUTF16InitialState;
sl@0: 	}
sl@0: 	return retval;
sl@0: 	
sl@0: 	
sl@0: }//end of function
sl@0: