sl@0: // Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies). sl@0: // All rights reserved. sl@0: // This component and the accompanying materials are made available sl@0: // under the terms of "Eclipse Public License v1.0" sl@0: // which accompanies this distribution, and is available sl@0: // at the URL "http://www.eclipse.org/legal/epl-v10.html". sl@0: // sl@0: // Initial Contributors: sl@0: // Nokia Corporation - initial contribution. sl@0: // sl@0: // Contributors: sl@0: // sl@0: // Description: sl@0: // Name : MRT_WCHARCNVT.CPP sl@0: // Part of : MRT LIBC sl@0: // Contains the source for the helper functions used by wchar sl@0: // restartable conversion API's in libc sl@0: // Version : 1.0 sl@0: // sl@0: sl@0: sl@0: sl@0: // Copyright (c) 1997-2003 Symbian Ltd. All rights reserved. sl@0: sl@0: // system includes sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: sl@0: #include "wcharcnv.h" sl@0: sl@0: #define KSURROGATE_OFFSET 0x10000 - (0xD800 << 10) - 0xDC00 sl@0: sl@0: //----------------------------------------------------------------------------- sl@0: //Function Name : TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const sl@0: // TDesC8& aUtf8, mbstate_t *state) sl@0: //Description : Converts the unicode to UTF8 sl@0: //Return Value : The number of unconverted bytes left at the end of the input sl@0: //descriptor, or one of the error values defined in TError. sl@0: //----------------------------------------------------------------------------- sl@0: TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, mbstate_t *state) sl@0: { sl@0: aUnicode.SetLength(0); sl@0: if (aUtf8.Length()==0) sl@0: { sl@0: return 0; sl@0: } sl@0: if (aUnicode.MaxLength()==0) sl@0: { sl@0: return aUtf8.Length(); sl@0: } sl@0: sl@0: HBufC8* utf8 = NULL; sl@0: if ( state->__count > 0) sl@0: { sl@0: // state have some information, use that. sl@0: utf8 = HBufC8::NewLC ( state->__count + aUtf8.Length() ); sl@0: TPtr8 tempBuf = utf8->Des(); sl@0: TPtr8 temp ((TUint8*)state->__value.__wchb, state->__count); sl@0: tempBuf.Copy(temp); sl@0: tempBuf.Append(aUtf8); sl@0: } sl@0: sl@0: TUint16* pointerToCurrentUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()); sl@0: const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.MaxLength()-1); sl@0: const TUint8* pointerToCurrentUtf8Byte= utf8 ? utf8->Des().Ptr() : aUtf8.Ptr(); sl@0: const TUint8* pointerToPendingUtf8Byte=utf8 ? utf8->Des().Ptr() : aUtf8.Ptr(); sl@0: TInt length = utf8 ? utf8->Des().Length() : aUtf8.Length(); sl@0: const TUint8* pointerToLastUtf8Byte=pointerToCurrentUtf8Byte+(length-1); sl@0: TUint16 replacementcharacter = 0xFFFD; sl@0: TUint8 currentUtf8Byte; sl@0: TUint currentUnicodeCharacter; sl@0: TInt sequenceLength; sl@0: sl@0: sl@0: FOREVER sl@0: { sl@0: currentUtf8Byte=*pointerToCurrentUtf8Byte; sl@0: pointerToPendingUtf8Byte = pointerToCurrentUtf8Byte; sl@0: sequenceLength=100; sl@0: sl@0: for(TInt i=0;i<7;i++) sl@0: { sl@0: if ((currentUtf8Byte&(0xf8<(0xF0<6) && sequenceLength!=0) sl@0: { sl@0: currentUnicodeCharacter=replacementcharacter; sl@0: } sl@0: else sl@0: { sl@0: if ((pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1)__count = 0; sl@0: while (pointerToCurrentUtf8Byte <= pointerToLastUtf8Byte) sl@0: { sl@0: state->__value.__wchb[state->__count++] = *(pointerToCurrentUtf8Byte++); sl@0: } sl@0: // reset the current pointer sl@0: pointerToCurrentUtf8Byte -= state->__count; sl@0: if((pointerToCurrentUnicodeCharacter-aUnicode.Ptr())==0) sl@0: { sl@0: // still nothing is decoded. sl@0: if ( utf8 ) sl@0: { sl@0: CleanupStack::PopAndDestroy(); // utf8 sl@0: } sl@0: return -2; sl@0: //return -1; sl@0: } sl@0: // something is already decoded, so return the no of bytes that use for sl@0: // decoding. sl@0: break; sl@0: } sl@0: sl@0: // reset the state sl@0: state->__count = 0; sl@0: currentUnicodeCharacter = currentUtf8Byte&(0x7F>>sequenceLength); sl@0: sl@0: for(TInt i=sequenceLength;i>1; i--) sl@0: { sl@0: currentUtf8Byte = *(++pointerToCurrentUtf8Byte); sl@0: if ((currentUtf8Byte&0xc0)==0x80) sl@0: { sl@0: currentUnicodeCharacter = (currentUnicodeCharacter<<6)|(currentUtf8Byte&0x3F); sl@0: } sl@0: else sl@0: { sl@0: // Encoding error occured. sl@0: // store the contained within the state and return -1. sl@0: // set the error EILSEQ to errno sl@0: if ( utf8 ) sl@0: { sl@0: CleanupStack::PopAndDestroy(); // utf8 sl@0: } sl@0: errno = EILSEQ; sl@0: return -1; sl@0: //currentUnicodeCharacter=replacementcharacter; sl@0: //--pointerToCurrentUtf8Byte; sl@0: } sl@0: } sl@0: } sl@0: sl@0: if (currentUnicodeCharacter > 0xFFFF) sl@0: { sl@0: if(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter) sl@0: { sl@0: // unicode descriptor dnt have 2 wchar bytes to hold the data. sl@0: pointerToCurrentUtf8Byte=pointerToPendingUtf8Byte; sl@0: break; sl@0: } sl@0: sl@0: TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0; sl@0: *pointerToCurrentUnicodeCharacter=static_cast(surrogate); sl@0: ++pointerToCurrentUnicodeCharacter; sl@0: sl@0: surrogate = (currentUnicodeCharacter&0x3FF)+0xDC00; sl@0: *pointerToCurrentUnicodeCharacter=static_cast(surrogate); sl@0: ++pointerToCurrentUnicodeCharacter; sl@0: ++pointerToCurrentUtf8Byte; sl@0: } sl@0: else sl@0: { sl@0: *pointerToCurrentUnicodeCharacter=static_cast(currentUnicodeCharacter); sl@0: ++pointerToCurrentUnicodeCharacter; sl@0: ++pointerToCurrentUtf8Byte; sl@0: } sl@0: sl@0: if ((pointerToCurrentUtf8Byte>pointerToLastUtf8Byte) || (pointerToCurrentUnicodeCharacter>pointerToLastUnicodeCharacter)) sl@0: { sl@0: // checking the boundary condition. sl@0: // Here either the UTF-8 or Unicode descriptor reached to the end. sl@0: break; sl@0: } sl@0: } // forever sl@0: // decoding finished. sl@0: aUnicode.SetLength(pointerToCurrentUnicodeCharacter-aUnicode.Ptr()); sl@0: if ( utf8 ) sl@0: { sl@0: CleanupStack::PopAndDestroy(); // utf8 sl@0: } sl@0: //return pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1; sl@0: // returns the number of bytes used to complete a valid multibyte character. sl@0: return pointerToCurrentUtf8Byte - aUtf8.Ptr(); sl@0: } //end of function sl@0: sl@0: //----------------------------------------------------------------------------- sl@0: //Function Name : TInt _Utf16ToUtf8(char* aDst, wchar_t aSrc, mbstate_t* ps, int aLen ) sl@0: //Description : Converts wide char in UCS2 format to UTF8 equivalent sl@0: //Return Value : The number of bytes converted, 0 if L'\0\' was translated, -1 on sl@0: //generic error and errno set appropriately, -2 if len is not sufficient to store aSrc wide char sl@0: //----------------------------------------------------------------------------- sl@0: TInt _Utf16ToUtf8(char* dst, wchar_t aSrc, mbstate_t* ps, int aLen) sl@0: { sl@0: int retval = 0; sl@0: // check the state sl@0: if(ps->__count !=_EUTF16InitialState && ps->__count != _EUTF16_21BitExtensionState) sl@0: { sl@0: errno = EINVAL; sl@0: return -1; sl@0: } sl@0: sl@0: //following characters are illegal sl@0: //see http://www.unicode.org/faq/utf_bom.html#40 sl@0: if(aSrc == 0xFFFE || aSrc == 0xFFFF || (aSrc >= 0xFDD0 && aSrc <= 0xFDEF) ) sl@0: { sl@0: errno = EILSEQ; sl@0: return -1; sl@0: } sl@0: sl@0: sl@0: if(ps->__count == _EUTF16InitialState) sl@0: { sl@0: sl@0: //following characters in addition are illegal in initial state sl@0: //see http://www.unicode.org/faq/utf_bom.html#40 sl@0: if((aSrc >= 0xDC00 && aSrc <= 0xDFFF) ) sl@0: { sl@0: errno = EILSEQ; sl@0: return -1; sl@0: } sl@0: sl@0: sl@0: if ((aSrc & 0xff80)==0x0000) sl@0: { sl@0: if(aLen >= 1) sl@0: { sl@0: *dst++ = static_cast(aSrc); sl@0: retval = 1; sl@0: } sl@0: else sl@0: { sl@0: return -2; sl@0: } sl@0: sl@0: } sl@0: else if ((aSrc & 0xf800)==0x0000) sl@0: { sl@0: if (aLen >= 2) sl@0: { sl@0: *dst++ = static_cast(0xc0|(aSrc>>6)); sl@0: *dst++ = static_cast (0x80|(aSrc&0x3f)); sl@0: retval = 2; sl@0: } sl@0: else sl@0: { sl@0: return -2; sl@0: } sl@0: } sl@0: else if ((aSrc & 0xfc00)==0xd800) sl@0: { sl@0: ps->__value.lead = aSrc; sl@0: ps->__count = _EUTF16_21BitExtensionState; sl@0: retval = 0; //nothing written out just yet sl@0: } sl@0: else sl@0: { sl@0: if ( aLen >= 3) sl@0: { sl@0: *dst++ = static_cast(0xe0|(aSrc>>12)); sl@0: *dst++ = static_cast(0x80|((aSrc>>6)&0x3f)); sl@0: *dst++ = static_cast(0x80|(aSrc&0x3f)); sl@0: retval = 3; sl@0: } sl@0: else sl@0: { sl@0: return -2; sl@0: } sl@0: } sl@0: sl@0: sl@0: } sl@0: else //ps->__count == _EUCS2_21BitExtensionState) sl@0: { sl@0: //characters outside this range are illegal in this state sl@0: //see http://www.unicode.org/faq/utf_bom.html#40 sl@0: if((aSrc < 0xDC00 || aSrc > 0xDFFF) ) sl@0: { sl@0: errno = EILSEQ; sl@0: return -1; sl@0: } sl@0: sl@0: if ((aSrc & 0xfc00)!=0xdc00) sl@0: { sl@0: errno = EILSEQ; sl@0: return -1; sl@0: } sl@0: if ( aLen >= 4) sl@0: { sl@0: //snippet taken from unicode faq sl@0: //http://www.unicode.org/faq/utf_bom.html#39 sl@0: sl@0: unsigned long codepoint = (ps->__value.lead << 10) + aSrc + KSURROGATE_OFFSET; sl@0: sl@0: *dst++ = static_cast( 0xf0|(codepoint>>18)); sl@0: *dst++ = static_cast(0x80|((codepoint>>12)&0x3f)); sl@0: *dst++ = static_cast(0x80|((codepoint>>6)&0x3f)); sl@0: *dst++ = static_cast(0x80|(codepoint&0x3f)); sl@0: retval = 4; sl@0: } sl@0: else sl@0: { sl@0: return -2; sl@0: } sl@0: ps->__count = _EUTF16InitialState; sl@0: } sl@0: return retval; sl@0: sl@0: sl@0: }//end of function sl@0: