os/textandloc/charconvfw/charconvplugins/tools/UTF.PM
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
#
sl@0
     2
# Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     3
# All rights reserved.
sl@0
     4
# This component and the accompanying materials are made available
sl@0
     5
# under the terms of "Eclipse Public License v1.0"
sl@0
     6
# which accompanies this distribution, and is available
sl@0
     7
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     8
#
sl@0
     9
# Initial Contributors:
sl@0
    10
# Nokia Corporation - initial contribution.
sl@0
    11
#
sl@0
    12
# Contributors:
sl@0
    13
#
sl@0
    14
# Description: 
sl@0
    15
#
sl@0
    16
sl@0
    17
use strict;
sl@0
    18
use integer;
sl@0
    19
sl@0
    20
package UTF;
sl@0
    21
require Exporter;
sl@0
    22
@UTF::ISA=qw(Exporter);
sl@0
    23
@UTF::EXPORT=qw(Utf8ToUnicode UnicodeToUtf8);
sl@0
    24
sl@0
    25
# Error code returned by Utf8ToUnicode and UnicodeToUtf8 when the input
# cannot be converted (ill-formed UTF-8, or an unpaired high surrogate).
# Successful calls return a count >= 0, so a negative value is unambiguous.
# NOTE(review): no trailing "1;" is visible in this view of the file; if there
# is none, this assignment is the last statement executed at load time and its
# true value is what lets "require UTF" succeed - confirm before changing it.
my $KErrorIllFormedInput=-1;
sl@0
    26
sl@0
    27
sub Utf8ToUnicode
	{
	# Converts a UTF-8 byte string into packed UTF-16 code units.
	#
	# Parameters (positional):
	#   $Unicode         - reference to a scalar; on success it receives the
	#                      code units packed with "$UnicodeTemplate*".
	#   $Utf8            - string whose bytes (unpack "C*") are the UTF-8 input.
	#   $UnicodeTemplate - pack template letter for one code unit (it is used
	#                      as "<letter>*", e.g. "n" or "v" for 16-bit units).
	#
	# Returns the number of code units produced (0 for empty input), or
	# $KErrorIllFormedInput if the input has a bad lead byte, a missing or
	# invalid continuation byte, or a 4-byte sequence outside U+10000..U+10FFFF.
	# On error $$Unicode is left untouched.
	#
	# Characters above U+FFFF are emitted as a surrogate pair (two units).
	# Overlong 2- and 3-byte forms and 3-byte encoded surrogates are accepted
	# as-is; only the 4-byte form is range checked.
	my $Unicode = shift;
	my $Utf8 = shift;
	my $UnicodeTemplate = shift;
	my @Utf8Unpacked = unpack "C*", $Utf8;
	my @UnicodeUnpacked = ();
	my $Utf8Index = 0;

	# Advances to the next input byte and returns its low six payload bits,
	# or undef if the input is exhausted or the byte is not of the form
	# 10xxxxxx. Checking the bound explicitly avoids reading past the end of
	# the array (which would yield undef and an "uninitialized" warning).
	my $NextContinuationBits = sub
		{
		++$Utf8Index;
		return if ($Utf8Index > $#Utf8Unpacked);
		my $continuationByte = $Utf8Unpacked[$Utf8Index];
		return if (($continuationByte & 0xc0) != 0x80);
		return ($continuationByte & 0x3f);
		};

	while ($Utf8Index <= $#Utf8Unpacked)
		{
		my $leadByte = $Utf8Unpacked[$Utf8Index];

		if (($leadByte & 0x80) == 0x00)
			{
			# 0xxxxxxx: single byte, code point is the byte itself.
			push @UnicodeUnpacked, $leadByte;
			}
		elsif (($leadByte & 0xe0) == 0xc0)
			{
			# 110xxxxx 10xxxxxx
			my $low = $NextContinuationBits->();
			return $KErrorIllFormedInput unless defined $low;
			push @UnicodeUnpacked, ((($leadByte & 0x1f) << 6) | $low);
			}
		elsif (($leadByte & 0xf0) == 0xe0)
			{
			# 1110xxxx 10xxxxxx 10xxxxxx
			my $middle = $NextContinuationBits->();
			return $KErrorIllFormedInput unless defined $middle;
			my $low = $NextContinuationBits->();
			return $KErrorIllFormedInput unless defined $low;
			push @UnicodeUnpacked, ((($leadByte & 0x0f) << 12) | ($middle << 6) | $low);
			}
		elsif (($leadByte & 0xf8) == 0xf0)
			{
			# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> surrogate pair.
			my $second = $NextContinuationBits->();
			return $KErrorIllFormedInput unless defined $second;

			# Bits 20..12 of the code point, positioned as (code point >> 10)
			# with the two lowest bits still to come from the third byte.
			my $highPart = ((($leadByte & 0x07) << 8) | ($second << 2));
			# 0x0040 is 0x10000 >> 10: anything below it is an overlong
			# encoding of a BMP character.
			if ($highPart < 0x0040)
				{
				return $KErrorIllFormedInput;
				}
			$highPart -= 0x0040;
			# More than ten bits left means the code point is above U+10FFFF.
			if ($highPart >= 0x0400)
				{
				return $KErrorIllFormedInput;
				}

			my $third = $NextContinuationBits->();
			return $KErrorIllFormedInput unless defined $third;
			my $fourth = $NextContinuationBits->();
			return $KErrorIllFormedInput unless defined $fourth;

			# The third byte is split: its top two payload bits complete the
			# high surrogate, its bottom four start the low surrogate.
			push @UnicodeUnpacked, (0xd800 | $highPart | (($third & 0x30) >> 4));
			push @UnicodeUnpacked, (0xdc00 | (($third & 0x0f) << 6) | $fourth);
			}
		else
			{
			# Stray continuation byte (10xxxxxx) or a lead byte 0xf8..0xff.
			return $KErrorIllFormedInput;
			}

		++$Utf8Index;
		}

	$$Unicode = pack "$UnicodeTemplate*", @UnicodeUnpacked;
	return scalar(@UnicodeUnpacked);
	}
sl@0
   134
sl@0
   135
sub UnicodeToUtf8
	{
	# Converts packed UTF-16 code units into a UTF-8 byte string.
	#
	# Parameters (positional):
	#   $Utf8            - reference to a scalar; on success it receives the
	#                      UTF-8 bytes (packed with "C*").
	#   $Unicode         - string holding the packed code units.
	#   $UnicodeTemplate - pack template letter for one code unit (it is used
	#                      as "<letter>*", e.g. "n" or "v" for 16-bit units).
	#
	# Returns the number of UTF-8 bytes produced (0 for empty input), or
	# $KErrorIllFormedInput if a high surrogate (0xd800..0xdbff) is not
	# immediately followed by a low surrogate (0xdc00..0xdfff). On error
	# $$Utf8 is left untouched.
	#
	# A low surrogate that is not preceded by a high surrogate is not
	# rejected; it falls through to the 3-byte form like any other unit.
	my $Utf8 = shift;
	my $Unicode = shift;
	my $UnicodeTemplate = shift;
	my @UnicodeUnpacked = unpack "$UnicodeTemplate*", $Unicode;
	my @Utf8Unpacked = ();
	my $UnicodeIndex = 0;

	while ($UnicodeIndex <= $#UnicodeUnpacked)
		{
		my $codeUnit = $UnicodeUnpacked[$UnicodeIndex];

		if (($codeUnit & 0xff80) == 0x0000)
			{
			# U+0000..U+007F: one byte.
			push @Utf8Unpacked, $codeUnit;
			}
		elsif (($codeUnit & 0xf800) == 0x0000)
			{
			# U+0080..U+07FF: two bytes.
			push @Utf8Unpacked,
				(0xc0 | ($codeUnit >> 6)),
				(0x80 | ($codeUnit & 0x3f));
			}
		elsif (($codeUnit & 0xfc00) == 0xd800)
			{
			# High surrogate: it must be followed by a low surrogate, and the
			# pair becomes one 4-byte sequence. Check the bound explicitly
			# rather than reading past the end of the array.
			++$UnicodeIndex;
			if ($UnicodeIndex > $#UnicodeUnpacked)
				{
				return $KErrorIllFormedInput;
				}
			my $lowSurrogate = $UnicodeUnpacked[$UnicodeIndex];
			if (($lowSurrogate & 0xfc00) != 0xdc00)
				{
				return $KErrorIllFormedInput;
				}

			# Adding 0x0040 (0x10000 >> 10) turns the surrogate's ten payload
			# bits into the top bits of the real code point.
			my $highPart = $codeUnit + 0x0040;
			push @Utf8Unpacked,
				(0xf0 | (($highPart >> 8) & 0x07)),
				(0x80 | (($highPart >> 2) & 0x3f)),
				# Third byte: low two bits of the high part, then the top
				# four payload bits of the low surrogate.
				(0x80 | (($highPart & 0x03) << 4) | (($lowSurrogate >> 6) & 0x0f)),
				(0x80 | ($lowSurrogate & 0x3f));
			}
		else
			{
			# Everything else in the 16-bit range: three bytes.
			push @Utf8Unpacked,
				(0xe0 | ($codeUnit >> 12)),
				(0x80 | (($codeUnit >> 6) & 0x3f)),
				(0x80 | ($codeUnit & 0x3f));
			}

		++$UnicodeIndex;
		}

	$$Utf8 = pack "C*", @Utf8Unpacked;
	return scalar(@Utf8Unpacked);
	}