os/textandloc/charconvfw/charconvplugins/tools/UTF.PM
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 #
     2 # Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 # All rights reserved.
     4 # This component and the accompanying materials are made available
     5 # under the terms of "Eclipse Public License v1.0"
     6 # which accompanies this distribution, and is available
     7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 #
     9 # Initial Contributors:
    10 # Nokia Corporation - initial contribution.
    11 #
    12 # Contributors:
    13 #
    14 # Description: 
    15 #
    16 
    17 use strict;
    18 use integer;
    19 
    20 package UTF;
    21 require Exporter;
    22 @UTF::ISA=qw(Exporter);
    23 @UTF::EXPORT=qw(Utf8ToUnicode UnicodeToUtf8);
    24 
    25 my $KErrorIllFormedInput=-1;
    26 
    27 sub Utf8ToUnicode
    28 	{
    29 	my $Unicode = shift;  
    30 	my $Utf8 = shift;
    31 	my $UnicodeTemplate = shift;
    32 	my $Utf8Index = 0;
    33 	my $UnicodeIndex = 0;
    34 	my $numOfBytes = length($Utf8);
    35 	my @Utf8Unpacked = unpack "C*",$Utf8;
    36 	my @UnicodeUnpacked = (); 
    37 
    38 	for (;;)
    39 		{
    40 		if ($Utf8Index > $#Utf8Unpacked)
    41 			{
    42 			last;
    43 			}
    44 
    45 		my $currentUtf8Byte = $Utf8Unpacked[$Utf8Index];
    46 		
    47 		if (($currentUtf8Byte&0x80)==0x00)
    48 			{
    49 			$UnicodeUnpacked[$UnicodeIndex] = $currentUtf8Byte;
    50 			}
    51 		
    52 		elsif (($currentUtf8Byte&0xe0)==0xc0)
    53 			{
    54 			my $currentUnicodeCharacter=(($currentUtf8Byte&0x1f)<<6);
    55 			++$Utf8Index;
    56 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
    57 			if (($currentUtf8Byte&0xc0)!=0x80)
    58 				{
    59 				return $KErrorIllFormedInput;
    60 				}
    61 			$currentUnicodeCharacter|=($currentUtf8Byte&0x3f);
    62 			$UnicodeUnpacked[$UnicodeIndex] = $currentUnicodeCharacter;
    63 			}
    64 
    65 		elsif (($currentUtf8Byte&0xf0)==0xe0)
    66 			{
    67 			my $currentUnicodeCharacter=(($currentUtf8Byte&0x0f)<<12);
    68 			++$Utf8Index;
    69 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
    70 			if (($currentUtf8Byte&0xc0)!=0x80)
    71 				{
    72 				return $KErrorIllFormedInput;
    73 				}
    74 			$currentUnicodeCharacter|=(($currentUtf8Byte&0x3f)<<6);
    75 			++$Utf8Index;
    76 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
    77 			if (($currentUtf8Byte&0xc0)!=0x80)
    78 				{
    79 				return $KErrorIllFormedInput;
    80 				}
    81 			$currentUnicodeCharacter|=($currentUtf8Byte&0x3f);
    82 			$UnicodeUnpacked[$UnicodeIndex] = $currentUnicodeCharacter;
    83 			}
    84 
    85 		elsif (($currentUtf8Byte&0xf8)==0xf0)
    86 			{                                         
    87 			my $currentUnicodeCharacter=(($currentUtf8Byte&0x07)<<8);
    88 			++$Utf8Index;
    89 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
    90 			if (($currentUtf8Byte&0xc0)!=0x80)
    91 				{
    92 				return $KErrorIllFormedInput;
    93 				}
    94 			$currentUnicodeCharacter|=(($currentUtf8Byte&0x3f)<<2);
    95 			if ($currentUnicodeCharacter<0x0040)
    96 				{
    97 				return $KErrorIllFormedInput;
    98 				}
    99 			$currentUnicodeCharacter-=0x0040;
   100 			if ($currentUnicodeCharacter>=0x0400)
   101 				{
   102 				return $KErrorIllFormedInput;
   103 				}
   104 			++$Utf8Index;
   105 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
   106 			if (($currentUtf8Byte&0xc0)!=0x80)
   107 				{
   108 				return $KErrorIllFormedInput;
   109 				}
   110 			$currentUnicodeCharacter|=(($currentUtf8Byte&0x30)>>4);
   111 			$UnicodeUnpacked[$UnicodeIndex] = (0xd800|$currentUnicodeCharacter);
   112 			$currentUnicodeCharacter=(($currentUtf8Byte&0x0f)<<6);
   113 			++$Utf8Index;
   114 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
   115 			if (($currentUtf8Byte&0xc0)!=0x80)
   116 				{
   117 				return $KErrorIllFormedInput;
   118 				}
   119 			$currentUnicodeCharacter|=($currentUtf8Byte&0x3f);
   120 			++$UnicodeIndex;
   121 			$UnicodeUnpacked[$UnicodeIndex] = (0xdc00|$currentUnicodeCharacter);
   122 			}
   123 		else
   124 			{
   125 			return $KErrorIllFormedInput;
   126 			}
   127 		++$UnicodeIndex;
   128 		++$Utf8Index;
   129 		}
   130 	$$Unicode = (); 
   131 	$$Unicode = pack "$UnicodeTemplate*", @UnicodeUnpacked;
   132 	return $UnicodeIndex;  
   133 	}
   134 
   135 sub UnicodeToUtf8
   136 	{
   137 	my $Utf8 = shift; 
   138 	my $Unicode = shift;
   139 	my $UnicodeTemplate = shift;
   140 	my $Utf8Index = 0;
   141 	my $UnicodeIndex = 0;
   142 	my $numOfBytes = length($Unicode);
   143 	my @UnicodeUnpacked = unpack "$UnicodeTemplate*", $Unicode;
   144 	my @Utf8Unpacked = ();
   145 	
   146 	for (;;)
   147 		{
   148 		# exit the loop if no more in the UnicodeUnpacked
   149 		if ($UnicodeIndex > $#UnicodeUnpacked)
   150 			{
   151 			last;
   152 			}
   153 
   154 		my $currentUnicodeCharacter=$UnicodeUnpacked[$UnicodeIndex];
   155 		if (($currentUnicodeCharacter&0xff80)==0x0000)
   156 			{	
   157 			$Utf8Unpacked[$Utf8Index]= $currentUnicodeCharacter;
   158 			}
   159 		elsif (($currentUnicodeCharacter&0xf800)==0x0000)
   160 			{
   161 
   162 			$Utf8Unpacked[$Utf8Index]= (0xc0 | $currentUnicodeCharacter >> 6);
   163 			++$Utf8Index;
   164 			$Utf8Unpacked[$Utf8Index]= (0x80 | $currentUnicodeCharacter&0x3f);
   165 			}
   166 		elsif (($currentUnicodeCharacter&0xfc00)==0xd800)
   167 			{
   168 			$currentUnicodeCharacter+=0x0040;
   169 			$Utf8Unpacked[$Utf8Index]= (0xf0|(($currentUnicodeCharacter>>8)&0x07));
   170 			++$Utf8Index;
   171 			$Utf8Unpacked[$Utf8Index]= (0x80|(($currentUnicodeCharacter>>2)&0x3f));
   172 			my $currentUtf8Byte=(0x80|(($currentUnicodeCharacter&0x03)<<4));
   173 			++$UnicodeIndex;
   174 			$currentUnicodeCharacter=$UnicodeUnpacked[$UnicodeIndex];
   175 			if (($currentUnicodeCharacter&0xfc00)!=0xdc00)
   176 				{
   177 				return $KErrorIllFormedInput;
   178 				}
   179 			$currentUtf8Byte|=(($currentUnicodeCharacter>>6)&0x0f);
   180 			++$Utf8Index;
   181 			$Utf8Unpacked[$Utf8Index]= $currentUtf8Byte;
   182 			++$Utf8Index;
   183 			$Utf8Unpacked[$Utf8Index]= (0x80| ($currentUnicodeCharacter&0x3f));
   184 			}
   185 		else
   186 			{
   187 			$Utf8Unpacked[$Utf8Index]= (0xe0|($currentUnicodeCharacter>>12));
   188 			++$Utf8Index;
   189 			$Utf8Unpacked[$Utf8Index]= (0x80|(($currentUnicodeCharacter>>6)&0x3f));
   190 			++$Utf8Index;
   191 			$Utf8Unpacked[$Utf8Index]= (0x80| ($currentUnicodeCharacter&0x3f));
   192 			}
   193 		++$Utf8Index;
   194 		++$UnicodeIndex;
   195 		}
   196 	$$Utf8 = ();	
   197 	$$Utf8 = pack "C*", @Utf8Unpacked;
   198 	return $Utf8Index; 
   199 
   200 	}