#
# Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
# Description:
#

use strict;
use integer;

package UTF;
require Exporter;
@UTF::ISA=qw(Exporter);
@UTF::EXPORT=qw(Utf8ToUnicode UnicodeToUtf8);

# Value returned by both converters when the input cannot be converted.
my $KErrorIllFormedInput=-1;

# Utf8ToUnicode(\$Unicode, $Utf8, $UnicodeTemplate)
#
# Decodes the byte string $Utf8 into UTF-16 code units and stores them,
# packed with "$UnicodeTemplate*" (e.g. "n" or "v"), in the scalar that
# $Unicode refers to.  Characters above U+FFFF are written as a surrogate
# pair.
#
# Returns the number of UTF-16 code units produced, or
# $KErrorIllFormedInput (-1) for ill-formed input: a bad lead byte, a
# missing or invalid trail byte, an overlong encoding, or a value above
# U+10FFFF.  The output scalar is left untouched on error.
#
# Three-byte sequences that decode to a surrogate value (U+D800..U+DFFF)
# are passed through as that single code unit.
sub Utf8ToUnicode
{
    my $Unicode = shift;
    my $Utf8 = shift;
    my $UnicodeTemplate = shift;
    my @Utf8Unpacked = unpack "C*", $Utf8;
    my @UnicodeUnpacked = ();
    my $Utf8Index = 0;

    while ($Utf8Index <= $#Utf8Unpacked)
    {
        my $leadByte = $Utf8Unpacked[$Utf8Index];
        ++$Utf8Index;

        # Classify the lead byte: how many trail bytes follow, which payload
        # bits the lead byte carries, and the smallest value that genuinely
        # needs a sequence of this length (anything below it is overlong).
        my ($numOfTrailBytes, $codePoint, $smallestLegal);
        if (($leadByte & 0x80) == 0x00)
        {
            ($numOfTrailBytes, $codePoint, $smallestLegal) = (0, $leadByte, 0x0000);
        }
        elsif (($leadByte & 0xe0) == 0xc0)
        {
            ($numOfTrailBytes, $codePoint, $smallestLegal) = (1, $leadByte & 0x1f, 0x0080);
        }
        elsif (($leadByte & 0xf0) == 0xe0)
        {
            ($numOfTrailBytes, $codePoint, $smallestLegal) = (2, $leadByte & 0x0f, 0x0800);
        }
        elsif (($leadByte & 0xf8) == 0xf0)
        {
            ($numOfTrailBytes, $codePoint, $smallestLegal) = (3, $leadByte & 0x07, 0x10000);
        }
        else
        {
            # Stray trail byte or a lead byte of an unsupported length.
            return $KErrorIllFormedInput;
        }

        for (1 .. $numOfTrailBytes)
        {
            # Input ended in the middle of a multi-byte sequence.
            return $KErrorIllFormedInput if $Utf8Index > $#Utf8Unpacked;

            my $trailByte = $Utf8Unpacked[$Utf8Index];
            return $KErrorIllFormedInput if ($trailByte & 0xc0) != 0x80;

            $codePoint = ($codePoint << 6) | ($trailByte & 0x3f);
            ++$Utf8Index;
        }

        # Reject overlong forms and values beyond the Unicode range.
        if ($codePoint < $smallestLegal or $codePoint > 0x10ffff)
        {
            return $KErrorIllFormedInput;
        }

        if ($codePoint >= 0x10000)
        {
            # Split into a high/low surrogate pair.
            push @UnicodeUnpacked,
                (0xd800 | (($codePoint - 0x10000) >> 10)),
                (0xdc00 | ($codePoint & 0x03ff));
        }
        else
        {
            push @UnicodeUnpacked, $codePoint;
        }
    }

    $$Unicode = pack "$UnicodeTemplate*", @UnicodeUnpacked;
    return scalar @UnicodeUnpacked;
}

# UnicodeToUtf8(\$Utf8, $Unicode, $UnicodeTemplate)
#
# Unpacks $Unicode with "$UnicodeTemplate*" into UTF-16 code units, encodes
# them as UTF-8 and stores the resulting byte string in the scalar that
# $Utf8 refers to.  A high surrogate followed by a low surrogate is combined
# into one four-byte sequence.  When a wide template yields a value in
# 0x10000..0x10FFFF it is treated as a code point and encoded in four bytes.
#
# Returns the number of UTF-8 bytes produced, or $KErrorIllFormedInput (-1)
# when a high surrogate is not followed by a low surrogate or a value lies
# outside 0..0x10FFFF.  The output scalar is left untouched on error.
#
# A low surrogate that is not preceded by a high surrogate is encoded as a
# three-byte sequence.
sub UnicodeToUtf8
{
    my $Utf8 = shift;
    my $Unicode = shift;
    my $UnicodeTemplate = shift;
    my @UnicodeUnpacked = unpack "$UnicodeTemplate*", $Unicode;
    my @Utf8Unpacked = ();
    my $UnicodeIndex = 0;

    while ($UnicodeIndex <= $#UnicodeUnpacked)
    {
        my $codePoint = $UnicodeUnpacked[$UnicodeIndex];
        ++$UnicodeIndex;

        # Under "use integer" an unsigned 32-bit value with the top bit set
        # appears negative on 32-bit perls; treat it as out of range too.
        if ($codePoint < 0 or $codePoint > 0x10ffff)
        {
            return $KErrorIllFormedInput;
        }

        if ($codePoint >= 0xd800 and $codePoint <= 0xdbff)
        {
            # High surrogate: the next unit must be the matching low half.
            return $KErrorIllFormedInput if $UnicodeIndex > $#UnicodeUnpacked;

            my $lowSurrogate = $UnicodeUnpacked[$UnicodeIndex];
            ++$UnicodeIndex;
            if ($lowSurrogate < 0xdc00 or $lowSurrogate > 0xdfff)
            {
                return $KErrorIllFormedInput;
            }
            $codePoint = 0x10000
                + (($codePoint - 0xd800) << 10)
                + ($lowSurrogate - 0xdc00);
        }

        if ($codePoint < 0x80)
        {
            push @Utf8Unpacked, $codePoint;
        }
        elsif ($codePoint < 0x800)
        {
            push @Utf8Unpacked,
                (0xc0 | ($codePoint >> 6)),
                (0x80 | ($codePoint & 0x3f));
        }
        elsif ($codePoint < 0x10000)
        {
            push @Utf8Unpacked,
                (0xe0 | ($codePoint >> 12)),
                (0x80 | (($codePoint >> 6) & 0x3f)),
                (0x80 | ($codePoint & 0x3f));
        }
        else
        {
            push @Utf8Unpacked,
                (0xf0 | ($codePoint >> 18)),
                (0x80 | (($codePoint >> 12) & 0x3f)),
                (0x80 | (($codePoint >> 6) & 0x3f)),
                (0x80 | ($codePoint & 0x3f));
        }
    }

    $$Utf8 = pack "C*", @Utf8Unpacked;
    return scalar @Utf8Unpacked;
}

# A module loaded with require/use must end by returning a true value.
1;