1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/textandloc/charconvfw/charconvplugins/tools/UTF.PM Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,200 @@
1.4 +#
1.5 +# Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
1.6 +# All rights reserved.
1.7 +# This component and the accompanying materials are made available
1.8 +# under the terms of "Eclipse Public License v1.0"
1.9 +# which accompanies this distribution, and is available
1.10 +# at the URL "http://www.eclipse.org/legal/epl-v10.html".
1.11 +#
1.12 +# Initial Contributors:
1.13 +# Nokia Corporation - initial contribution.
1.14 +#
1.15 +# Contributors:
1.16 +#
1.17 +# Description:
1.18 +#
1.19 +
use strict;
use warnings;
use integer;

# UTF: converts between UTF-8 byte strings and packed strings of UTF-16 code
# units.  The layout of each packed code unit is chosen by the caller through
# a pack/unpack template letter.
{
    package UTF;

    require Exporter;
    our @ISA    = qw(Exporter);
    our @EXPORT = qw(Utf8ToUnicode UnicodeToUtf8);

    # Value returned by both converters when the input is not well formed.
    my $KErrorIllFormedInput = -1;

    # Private helper: consume one UTF-8 continuation byte (bit pattern
    # 10xxxxxx) from @$bytes at index $$pos.
    #
    #   $bytes - reference to the array of byte values being decoded
    #   $pos   - reference to the index of the next unread byte
    #
    # Returns the six payload bits and advances $$pos.  Returns undef, leaving
    # $$pos untouched, when the input is exhausted or the byte is not a
    # continuation byte.
    sub _TakeTrailBits {
        my ($bytes, $pos) = @_;

        # Explicit bounds check: never index past the end of the input.
        return if $$pos > $#{$bytes};

        my $byte = $bytes->[$$pos];
        return if ($byte & 0xc0) != 0x80;

        ++$$pos;
        return $byte & 0x3f;
    }

    # Utf8ToUnicode(\$Unicode, $Utf8, $UnicodeTemplate)
    #
    # Decodes the UTF-8 byte string $Utf8 into UTF-16 code units and stores
    # them, packed with "$UnicodeTemplate*", in the scalar referenced by
    # $Unicode.
    #
    #   $Unicode         - reference to the scalar receiving the packed output
    #   $Utf8            - byte string to decode
    #   $UnicodeTemplate - pack template letter used for each code unit (the
    #                      code emits surrogate pairs, so a 16-bit template
    #                      such as 'n' or 'v' is the natural choice)
    #
    # Returns the number of code units produced, or $KErrorIllFormedInput (-1)
    # when the input is ill formed; in that case the output scalar is left
    # untouched.
    #
    # NOTE(review): overlong 2- and 3-byte forms (e.g. C0 80) and surrogates
    # encoded as 3-byte sequences are accepted, exactly as before.  Only the
    # 4-byte form is range checked.
    sub Utf8ToUnicode {
        my ($Unicode, $Utf8, $UnicodeTemplate) = @_;

        my @bytes = unpack 'C*', $Utf8;
        my @units;
        my $pos = 0;    # index of the next unread byte

        while ($pos <= $#bytes) {
            my $lead = $bytes[$pos++];

            if (($lead & 0x80) == 0x00) {
                # 0xxxxxxx: a single byte maps straight to one code unit.
                push @units, $lead;
            }
            elsif (($lead & 0xe0) == 0xc0) {
                # 110xxxxx 10xxxxxx
                my $t1 = _TakeTrailBits(\@bytes, \$pos);
                return $KErrorIllFormedInput unless defined $t1;

                push @units, (($lead & 0x1f) << 6) | $t1;
            }
            elsif (($lead & 0xf0) == 0xe0) {
                # 1110xxxx 10xxxxxx 10xxxxxx
                my $t1 = _TakeTrailBits(\@bytes, \$pos);
                return $KErrorIllFormedInput unless defined $t1;
                my $t2 = _TakeTrailBits(\@bytes, \$pos);
                return $KErrorIllFormedInput unless defined $t2;

                push @units, (($lead & 0x0f) << 12) | ($t1 << 6) | $t2;
            }
            elsif (($lead & 0xf8) == 0xf0) {
                # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> surrogate pair.
                my $t1 = _TakeTrailBits(\@bytes, \$pos);
                return $KErrorIllFormedInput unless defined $t1;

                # Bits 20..12 of the code point, placed two bits up so the
                # value lines up with the low ten bits of the high surrogate.
                my $high = (($lead & 0x07) << 8) | ($t1 << 2);

                # Below 0x0040 the code point is under U+10000 (overlong).
                return $KErrorIllFormedInput if $high < 0x0040;
                $high -= 0x0040;

                # 0x0400 or more means the code point is above U+10FFFF.
                return $KErrorIllFormedInput if $high >= 0x0400;

                my $t2 = _TakeTrailBits(\@bytes, \$pos);
                return $KErrorIllFormedInput unless defined $t2;
                my $t3 = _TakeTrailBits(\@bytes, \$pos);
                return $KErrorIllFormedInput unless defined $t3;

                # The third byte is split: its top two payload bits finish the
                # high surrogate, its low four start the low surrogate.
                push @units,
                    0xd800 | $high | (($t2 & 0x30) >> 4),
                    0xdc00 | (($t2 & 0x0f) << 6) | $t3;
            }
            else {
                # Stray continuation byte, or a lead byte of 0xf8 and above.
                return $KErrorIllFormedInput;
            }
        }

        $$Unicode = pack "$UnicodeTemplate*", @units;
        return scalar @units;
    }

    # UnicodeToUtf8(\$Utf8, $Unicode, $UnicodeTemplate)
    #
    # Encodes the UTF-16 code units packed in $Unicode (unpacked with
    # "$UnicodeTemplate*") as UTF-8 and stores the resulting byte string in
    # the scalar referenced by $Utf8.
    #
    #   $Utf8            - reference to the scalar receiving the UTF-8 bytes
    #   $Unicode         - packed string of code units to encode
    #   $UnicodeTemplate - unpack template letter describing one code unit
    #
    # Returns the number of bytes produced, or $KErrorIllFormedInput (-1) when
    # a high surrogate is not followed by a low surrogate; in that case the
    # output scalar is left untouched.
    #
    # NOTE(review): a low surrogate with no preceding high surrogate is
    # encoded as a 3-byte sequence, and code units above 0xFFFF (possible
    # with a 32-bit template) are not range checked.  Both match the previous
    # behaviour.
    sub UnicodeToUtf8 {
        my ($Utf8, $Unicode, $UnicodeTemplate) = @_;

        my @units = unpack "$UnicodeTemplate*", $Unicode;
        my @bytes;
        my $pos = 0;    # index of the next unread code unit

        while ($pos <= $#units) {
            my $unit = $units[$pos++];

            if (($unit & 0xff80) == 0x0000) {
                # U+0000..U+007F: one byte.
                push @bytes, $unit;
            }
            elsif (($unit & 0xf800) == 0x0000) {
                # U+0080..U+07FF: two bytes.
                push @bytes,
                    0xc0 | ($unit >> 6),
                    0x80 | ($unit & 0x3f);
            }
            elsif (($unit & 0xfc00) == 0xd800) {
                # High surrogate: the next unit must be a low surrogate.
                # Check for it explicitly instead of reading past the end.
                return $KErrorIllFormedInput if $pos > $#units;

                my $low = $units[$pos++];
                return $KErrorIllFormedInput if ($low & 0xfc00) != 0xdc00;

                # Adding 0x0040 restores the 0x10000 offset that UTF-16
                # removes from the code point.
                my $high = $unit + 0x0040;

                push @bytes,
                    0xf0 | (($high >> 8) & 0x07),
                    0x80 | (($high >> 2) & 0x3f),
                    0x80 | (($high & 0x03) << 4) | (($low >> 6) & 0x0f),
                    0x80 | ($low & 0x3f);
            }
            else {
                # Everything else in the BMP: three bytes.
                push @bytes,
                    0xe0 | ($unit >> 12),
                    0x80 | (($unit >> 6) & 0x3f),
                    0x80 | ($unit & 0x3f);
            }
        }

        $$Utf8 = pack 'C*', @bytes;
        return scalar @bytes;
    }
}

# Explicit true value so that require/use of this file succeeds by design.
1;