os/textandloc/charconvfw/charconv_fw/tools/convtool/utf.cpp
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
     1 /*
     2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: 
    15 *
    16 */
    17 
    18 
    19 #include <stdlib.h>
    20 
    21 const int KErrorIllFormedInput=-1;
    22 
    23 int Utf8ToUnicode(wchar_t* aUnicode, const char* aUtf8)
    24 // must '\0'-terminate the output
    25 	{
    26 	wchar_t* startOfUnicode=aUnicode;
    27 	for (;;)
    28 		{
    29 		unsigned int currentUtf8Byte=*aUtf8;
    30 		if (currentUtf8Byte=='\0')
    31 			{
    32 			break;
    33 			}
    34 		if ((currentUtf8Byte&0x80)==0x00)
    35 			{
    36 			if (startOfUnicode!=NULL)
    37 				{
    38 				*aUnicode=(wchar_t)currentUtf8Byte;
    39 				}
    40 			}
    41 		else if ((currentUtf8Byte&0xe0)==0xc0)
    42 			{
    43 			unsigned int currentUnicodeCharacter=((currentUtf8Byte&0x1f)<<6);
    44 			++aUtf8;
    45 			currentUtf8Byte=*aUtf8;
    46 			if ((currentUtf8Byte&0xc0)!=0x80)
    47 				{
    48 				return KErrorIllFormedInput;
    49 				}
    50 			currentUnicodeCharacter|=(currentUtf8Byte&0x3f);
    51 			if (startOfUnicode!=NULL)
    52 				{
    53 				*aUnicode=(wchar_t)currentUnicodeCharacter;
    54 				}
    55 			}
    56 		else if ((currentUtf8Byte&0xf0)==0xe0)
    57 			{
    58 			unsigned int currentUnicodeCharacter=((currentUtf8Byte&0x0f)<<12);
    59 			++aUtf8;
    60 			currentUtf8Byte=*aUtf8;
    61 			if ((currentUtf8Byte&0xc0)!=0x80)
    62 				{
    63 				return KErrorIllFormedInput;
    64 				}
    65 			currentUnicodeCharacter|=((currentUtf8Byte&0x3f)<<6);
    66 			++aUtf8;
    67 			currentUtf8Byte=*aUtf8;
    68 			if ((currentUtf8Byte&0xc0)!=0x80)
    69 				{
    70 				return KErrorIllFormedInput;
    71 				}
    72 			currentUnicodeCharacter|=(currentUtf8Byte&0x3f);
    73 			if (startOfUnicode!=NULL)
    74 				{
    75 				*aUnicode=(wchar_t)currentUnicodeCharacter;
    76 				}
    77 			}
    78 		else if ((currentUtf8Byte&0xf8)==0xf0)
    79 			{
    80 			unsigned int currentUnicodeCharacter=((currentUtf8Byte&0x07)<<8);
    81 			++aUtf8;
    82 			currentUtf8Byte=*aUtf8;
    83 			if ((currentUtf8Byte&0xc0)!=0x80)
    84 				{
    85 				return KErrorIllFormedInput;
    86 				}
    87 			currentUnicodeCharacter|=((currentUtf8Byte&0x3f)<<2);
    88 			if (currentUnicodeCharacter<0x0040)
    89 				{
    90 				return KErrorIllFormedInput;
    91 				}
    92 			currentUnicodeCharacter-=0x0040;
    93 			if (currentUnicodeCharacter>=0x0400)
    94 				{
    95 				return KErrorIllFormedInput;
    96 				}
    97 			++aUtf8;
    98 			currentUtf8Byte=*aUtf8;
    99 			if ((currentUtf8Byte&0xc0)!=0x80)
   100 				{
   101 				return KErrorIllFormedInput;
   102 				}
   103 			currentUnicodeCharacter|=((currentUtf8Byte&0x30)>>4);
   104 			if (startOfUnicode!=NULL)
   105 				{
   106 				*aUnicode=(wchar_t)(0xd800|currentUnicodeCharacter);
   107 				}
   108 			currentUnicodeCharacter=((currentUtf8Byte&0x0f)<<6);
   109 			++aUtf8;
   110 			currentUtf8Byte=*aUtf8;
   111 			if ((currentUtf8Byte&0xc0)!=0x80)
   112 				{
   113 				return KErrorIllFormedInput;
   114 				}
   115 			currentUnicodeCharacter|=(currentUtf8Byte&0x3f);
   116 			++aUnicode;
   117 			if (startOfUnicode!=NULL)
   118 				{
   119 				*aUnicode=(wchar_t)(0xdc00|currentUnicodeCharacter);
   120 				}
   121 			}
   122 		else
   123 			{
   124 			return KErrorIllFormedInput;
   125 			}
   126 		++aUnicode;
   127 		++aUtf8;
   128 		}
   129 	if (startOfUnicode!=NULL)
   130 		{
   131 		*aUnicode='\0';
   132 		}
   133 	return aUnicode-startOfUnicode;
   134 	}
   135 #include <stdio.h>
   136 int UnicodeToUtf8(char* aUtf8, const wchar_t* aUnicode)
   137 // must '\0'-terminate the output
   138 	{
   139 	char* startOfUtf8=aUtf8;
   140 	for (;;)
   141 		{
   142 		unsigned int currentUnicodeCharacter=*aUnicode;
   143 		if (currentUnicodeCharacter=='\0')
   144 			{
   145 			break;
   146 			}
   147 		if ((currentUnicodeCharacter&0xff80)==0x0000)
   148 			{
   149 			if (startOfUtf8!=NULL)
   150 				{
   151 				*aUtf8=(char)currentUnicodeCharacter;
   152 				}
   153 			}
   154 		else if ((currentUnicodeCharacter&0xf800)==0x0000)
   155 			{
   156 			if (startOfUtf8!=NULL)
   157 				{
   158 				*aUtf8=(char)(0xc0|(currentUnicodeCharacter>>6));
   159 				}
   160 			++aUtf8;
   161 			if (startOfUtf8!=NULL)
   162 				{
   163 				*aUtf8=(char)(0x80|(currentUnicodeCharacter&0x3f));
   164 				}
   165 			}
   166 		else if ((currentUnicodeCharacter&0xfc00)==0xd800)
   167 			{
   168 			currentUnicodeCharacter+=0x0040;
   169 			if (startOfUtf8!=NULL)
   170 				{
   171 				*aUtf8=(char)(0xf0|((currentUnicodeCharacter>>8)&0x07));
   172 				}
   173 			++aUtf8;
   174 			if (startOfUtf8!=NULL)
   175 				{
   176 				*aUtf8=(char)(0x80|((currentUnicodeCharacter>>2)&0x3f));
   177 				}
   178 			{
   179 			unsigned int currentUtf8Byte=(0x80|((currentUnicodeCharacter&0x03)<<4));
   180 			++aUnicode;
   181 			currentUnicodeCharacter=*aUnicode;
   182 			if ((currentUnicodeCharacter&0xfc00)!=0xdc00)
   183 				{
   184 				return KErrorIllFormedInput;
   185 				}
   186 			currentUtf8Byte|=((currentUnicodeCharacter>>6)&0x0f);
   187 			++aUtf8;
   188 			if (startOfUtf8!=NULL)
   189 				{
   190 				*aUtf8=(char)currentUtf8Byte;
   191 				}
   192 			}
   193 			++aUtf8;
   194 			if (startOfUtf8!=NULL)
   195 				{
   196 				*aUtf8=(char)(0x80|(currentUnicodeCharacter&0x3f));
   197 				}
   198 			}
   199 		else
   200 			{
   201 			if (startOfUtf8!=NULL)
   202 				{
   203 				*aUtf8=(char)(0xe0|(currentUnicodeCharacter>>12));
   204 				}
   205 			++aUtf8;
   206 			if (startOfUtf8!=NULL)
   207 				{
   208 				*aUtf8=(char)(0x80|((currentUnicodeCharacter>>6)&0x3f));
   209 				}
   210 			++aUtf8;
   211 			if (startOfUtf8!=NULL)
   212 				{
   213 				*aUtf8=(char)(0x80|(currentUnicodeCharacter&0x3f));
   214 				}
   215 			}
   216 		++aUtf8;
   217 		++aUnicode;
   218 		}
   219 	if (startOfUtf8!=NULL)
   220 		{
   221 		*aUtf8='\0';
   222 		}
   223 	return aUtf8-startOfUtf8;
   224 	}
   225 
   226