os/textandloc/charconvfw/charconv_fw/tools/convtool/utf.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
/*
* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/
sl@0
    17
sl@0
    18
sl@0
    19
#include <stdlib.h>
sl@0
    20
sl@0
    21
// Error code returned by the converters in this file when their input is
// not a well-formed encoding.
const int KErrorIllFormedInput=-1;

// Converts the '\0'-terminated UTF-8 string aUtf8 into UTF-16 code units.
//
// aUnicode  Destination buffer, or NULL to only measure the output. When
//           non-NULL it must have room for every generated unit plus the
//           terminating '\0'; this function has no way to check that.
// aUtf8     Source bytes, terminated by '\0'.
//
// Returns the number of wchar_t units generated, not counting the
// terminator, or KErrorIllFormedInput if aUtf8 is not well formed: an
// invalid lead byte, a missing or stray continuation byte, an overlong
// form, or a value above U+10FFFF. On error the contents of aUnicode are
// unspecified and no terminator is written.
//
// Code points above U+FFFF are always written as a surrogate pair (two
// units), whatever the width of wchar_t. Three-byte encodings of surrogate
// values (U+D800..U+DFFF) are copied through unchanged.
int Utf8ToUnicode(wchar_t* aUnicode, const char* aUtf8)
	{
	// Read through unsigned char so bytes >= 0x80 are never sign-extended.
	const unsigned char* source=(const unsigned char*)aUtf8;
	// The output position is an index rather than an advancing pointer, so
	// measuring mode never performs arithmetic on a null pointer.
	size_t length=0;
	for (;;)
		{
		unsigned int lead=*source;
		unsigned long codePoint;
		unsigned long smallestAllowed; // lowest value this sequence length may encode
		unsigned int numberOfTrailBytes;
		unsigned int i;
		if (lead=='\0')
			{
			break;
			}
		if ((lead&0x80)==0x00)
			{
			codePoint=lead;
			smallestAllowed=0x0000;
			numberOfTrailBytes=0;
			}
		else if ((lead&0xe0)==0xc0)
			{
			codePoint=lead&0x1f;
			smallestAllowed=0x0080;
			numberOfTrailBytes=1;
			}
		else if ((lead&0xf0)==0xe0)
			{
			codePoint=lead&0x0f;
			smallestAllowed=0x0800;
			numberOfTrailBytes=2;
			}
		else if ((lead&0xf8)==0xf0)
			{
			codePoint=lead&0x07;
			smallestAllowed=0x10000;
			numberOfTrailBytes=3;
			}
		else
			{
			// A continuation byte in lead position, or 0xf8..0xff.
			return KErrorIllFormedInput;
			}
		for (i=0; i<numberOfTrailBytes; ++i)
			{
			unsigned int trail;
			++source;
			trail=*source;
			// This also rejects a premature '\0', so the scan never runs
			// past the end of the input.
			if ((trail&0xc0)!=0x80)
				{
				return KErrorIllFormedInput;
				}
			codePoint=(codePoint<<6)|(trail&0x3f);
			}
		// Reject overlong forms (a value that fits in a shorter sequence)
		// and values beyond the last Unicode code point.
		if ((codePoint<smallestAllowed) || (codePoint>0x10ffff))
			{
			return KErrorIllFormedInput;
			}
		if (codePoint>=0x10000)
			{
			// Split the 20 bits above 0x10000 into a high/low surrogate pair.
			codePoint-=0x10000;
			if (aUnicode!=NULL)
				{
				aUnicode[length]=(wchar_t)(0xd800|(codePoint>>10));
				aUnicode[length+1]=(wchar_t)(0xdc00|(codePoint&0x03ff));
				}
			length+=2;
			}
		else
			{
			if (aUnicode!=NULL)
				{
				aUnicode[length]=(wchar_t)codePoint;
				}
			++length;
			}
		++source;
		}
	if (aUnicode!=NULL)
		{
		aUnicode[length]='\0';
		}
	return (int)length;
	}
sl@0
   135
#include <stdio.h>
sl@0
   136
int UnicodeToUtf8(char* aUtf8, const wchar_t* aUnicode)
sl@0
   137
// must '\0'-terminate the output
sl@0
   138
	{
sl@0
   139
	char* startOfUtf8=aUtf8;
sl@0
   140
	for (;;)
sl@0
   141
		{
sl@0
   142
		unsigned int currentUnicodeCharacter=*aUnicode;
sl@0
   143
		if (currentUnicodeCharacter=='\0')
sl@0
   144
			{
sl@0
   145
			break;
sl@0
   146
			}
sl@0
   147
		if ((currentUnicodeCharacter&0xff80)==0x0000)
sl@0
   148
			{
sl@0
   149
			if (startOfUtf8!=NULL)
sl@0
   150
				{
sl@0
   151
				*aUtf8=(char)currentUnicodeCharacter;
sl@0
   152
				}
sl@0
   153
			}
sl@0
   154
		else if ((currentUnicodeCharacter&0xf800)==0x0000)
sl@0
   155
			{
sl@0
   156
			if (startOfUtf8!=NULL)
sl@0
   157
				{
sl@0
   158
				*aUtf8=(char)(0xc0|(currentUnicodeCharacter>>6));
sl@0
   159
				}
sl@0
   160
			++aUtf8;
sl@0
   161
			if (startOfUtf8!=NULL)
sl@0
   162
				{
sl@0
   163
				*aUtf8=(char)(0x80|(currentUnicodeCharacter&0x3f));
sl@0
   164
				}
sl@0
   165
			}
sl@0
   166
		else if ((currentUnicodeCharacter&0xfc00)==0xd800)
sl@0
   167
			{
sl@0
   168
			currentUnicodeCharacter+=0x0040;
sl@0
   169
			if (startOfUtf8!=NULL)
sl@0
   170
				{
sl@0
   171
				*aUtf8=(char)(0xf0|((currentUnicodeCharacter>>8)&0x07));
sl@0
   172
				}
sl@0
   173
			++aUtf8;
sl@0
   174
			if (startOfUtf8!=NULL)
sl@0
   175
				{
sl@0
   176
				*aUtf8=(char)(0x80|((currentUnicodeCharacter>>2)&0x3f));
sl@0
   177
				}
sl@0
   178
			{
sl@0
   179
			unsigned int currentUtf8Byte=(0x80|((currentUnicodeCharacter&0x03)<<4));
sl@0
   180
			++aUnicode;
sl@0
   181
			currentUnicodeCharacter=*aUnicode;
sl@0
   182
			if ((currentUnicodeCharacter&0xfc00)!=0xdc00)
sl@0
   183
				{
sl@0
   184
				return KErrorIllFormedInput;
sl@0
   185
				}
sl@0
   186
			currentUtf8Byte|=((currentUnicodeCharacter>>6)&0x0f);
sl@0
   187
			++aUtf8;
sl@0
   188
			if (startOfUtf8!=NULL)
sl@0
   189
				{
sl@0
   190
				*aUtf8=(char)currentUtf8Byte;
sl@0
   191
				}
sl@0
   192
			}
sl@0
   193
			++aUtf8;
sl@0
   194
			if (startOfUtf8!=NULL)
sl@0
   195
				{
sl@0
   196
				*aUtf8=(char)(0x80|(currentUnicodeCharacter&0x3f));
sl@0
   197
				}
sl@0
   198
			}
sl@0
   199
		else
sl@0
   200
			{
sl@0
   201
			if (startOfUtf8!=NULL)
sl@0
   202
				{
sl@0
   203
				*aUtf8=(char)(0xe0|(currentUnicodeCharacter>>12));
sl@0
   204
				}
sl@0
   205
			++aUtf8;
sl@0
   206
			if (startOfUtf8!=NULL)
sl@0
   207
				{
sl@0
   208
				*aUtf8=(char)(0x80|((currentUnicodeCharacter>>6)&0x3f));
sl@0
   209
				}
sl@0
   210
			++aUtf8;
sl@0
   211
			if (startOfUtf8!=NULL)
sl@0
   212
				{
sl@0
   213
				*aUtf8=(char)(0x80|(currentUnicodeCharacter&0x3f));
sl@0
   214
				}
sl@0
   215
			}
sl@0
   216
		++aUtf8;
sl@0
   217
		++aUnicode;
sl@0
   218
		}
sl@0
   219
	if (startOfUtf8!=NULL)
sl@0
   220
		{
sl@0
   221
		*aUtf8='\0';
sl@0
   222
		}
sl@0
   223
	return aUtf8-startOfUtf8;
sl@0
   224
	}
sl@0
   225
sl@0
   226