1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/textandloc/charconvfw/charconvplugins/tools/charconv.pl Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,546 @@
1.4 +#
1.5 +# Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
1.6 +# All rights reserved.
1.7 +# This component and the accompanying materials are made available
1.8 +# under the terms of "Eclipse Public License v1.0"
1.9 +# which accompanies this distribution, and is available
1.10 +# at the URL "http://www.eclipse.org/legal/epl-v10.html".
1.11 +#
1.12 +# Initial Contributors:
1.13 +# Nokia Corporation - initial contribution.
1.14 +#
1.15 +# Contributors:
1.16 +#
1.17 +# Description:
1.18 +#
1.19 +
1.20 +use strict;
1.21 +use integer;
1.22 +
1.23 +sub PerlScriptPath
1.24 + {
1.25 + my $perlScriptPath=$0;
1.26 + my $os = $^O; #get the OS type
1.27 + #check OS type
1.28 + if($os=~/MSWin32/) #Windows OS
1.29 + {
1.30 + $perlScriptPath=~s/\//\\/g; # replace any forward-slashes with back-slashes
1.31 + $perlScriptPath=~s/(\\?)[^\\]+$/$1/; # get rid of this Perl-script's file-name
1.32 + }
1.33 + else #Unix OS
1.34 + {
1.35 + $perlScriptPath=~s/\\/\//g; # replace any back-slashes with forward-slashes
1.36 + $perlScriptPath=~s/(\/?)[^\/]+$/$1/; # get rid of this Perl-script's file-name
1.37 + }
1.38 + return $perlScriptPath;
1.39 + }
1.40 +BEGIN
1.41 + {
1.42 + unshift(@INC, &PerlScriptPath()); # can't do "use lib &PerlScriptPath()" here as "use lib" only seems to work with *hard-coded* directory names
1.43 + }
1.44 +use PARSER;
1.45 +use UTF;
1.46 +
1.47 +# The following numbers are used for byte-orders:
1.48 +# 0 means unspecified
1.49 +# 1 means big-endian
1.50 +# 2 means little-endian
1.51 +
1.52 +FixParametersToWorkWithWindows98(\@ARGV);
1.53 +my $versionNumber = 3;
1.54 +my $outputByteOrderMark = 0;
1.55 +my $unicodeByteOrder = 0;
1.56 +my $inputEncoding = "";
1.57 +my $outputEncoding = "";
1.58 +my %foreignCharacters = (); # Hash with the foreign Character code as the value, unicode as key
1.59 +my %unicodeCharacters = (); # Hash with the Unicode Character code as the value, foreign as key
1.60 +
1.61 +
1.62 +my $inputFile=\*STDIN;
1.63 +my $outputFile=\*STDOUT;
1.64 +ReadParameters(\@ARGV,\$outputByteOrderMark,\$unicodeByteOrder,\$inputEncoding,\$outputEncoding,\$inputFile,\$outputFile);
1.65 +HandleByteOrderMarks($outputByteOrderMark,\$unicodeByteOrder, \$inputEncoding,\$outputEncoding, $inputFile, $outputFile);
1.66 +DoConversion(\$unicodeByteOrder, \$inputEncoding, \$outputEncoding, $inputFile, $outputFile, \%foreignCharacters, \%unicodeCharacters);
1.67 +if ($inputFile!=\*STDIN)
1.68 + {
1.69 + close($inputFile) or die;
1.70 + }
1.71 +if ($outputFile!=\*STDOUT)
1.72 + {
1.73 + close($outputFile) or die;
1.74 + }
1.75 +
1.76 +sub FixParametersToWorkWithWindows98
1.77 + {
1.78 + my $parameters=shift;
1.79 + my $i;
1.80 + for ($i=@$parameters-2; $i>=0; --$i) # iterate backwards as some parameters may be deleted from @$parameters
1.81 + {
1.82 + if (($parameters->[$i]=~/^(-input)$/i) ||
1.83 + ($parameters->[$i]=~/^(-output)$/i))
1.84 + {
1.85 + $parameters->[$i].='='.$parameters->[$i+1];
1.86 + splice(@$parameters, $i+1, 1);
1.87 + }
1.88 + }
1.89 + }
1.90 +
1.91 +sub PrintUsage
1.92 + {
1.93 + print "\nVersion $versionNumber\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n";
1.94 + print "Usage:\n\n\t charconv [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t";
1.95 + print "options := [-big|-little][-byteordermark]\n\t";
1.96 + print "inputspec := -input=<format> [<input_file>]\n\t";
1.97 + print "outputspec := -output=<format> [<output_file>]\n\t";
1.98 + print "format := unicode|utf8|big5|gb2312...\n\n";
1.99 + }
1.100 +
1.101 +sub Assert
1.102 + {
1.103 + my $condition = shift;
1.104 + my $errorMessage = shift;
1.105 + if (!($condition)) # find out where this is used and work this out
1.106 + {
1.107 + die("Error: $errorMessage");
1.108 + }
1.109 + }
1.110 +
1.111 +sub PrintWarning
1.112 + {
1.113 + my $warningMessage = shift;
1.114 + print STDERR "Warning: $warningMessage\n";
1.115 + }
1.116 +
1.117 +
1.118 +sub TryFileParameter
1.119 + {
1.120 + my $args = shift;
1.121 + my $argindex = shift;
1.122 + my $inputoroutput = shift;
1.123 + my $encoding = shift;
1.124 + my $filehandle = shift;
1.125 + my $prefix = "-$inputoroutput=";
1.126 +
1.127 + if ($args->[$$argindex] =~ /^$prefix(.*)/)
1.128 + {
1.129 + Assert($$encoding eq "", "\"$prefix...\" is specified more than once");
1.130 + $$encoding = $1;
1.131 + ++$$argindex;
1.132 + if (($$argindex >= @$args) || ($args->[$$argindex] =~ /^-/))
1.133 + {
1.134 + --$$argindex;
1.135 + }
1.136 + else
1.137 + {
1.138 + if ($inputoroutput =~ /input/i)
1.139 + {
1.140 + open(INPUT_FILE,"<$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
1.141 + $$filehandle=\*INPUT_FILE;
1.142 + }
1.143 + else
1.144 + {
1.145 + open(OUTPUT_FILE,">$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
1.146 + $$filehandle=\*OUTPUT_FILE;
1.147 + }
1.148 + }
1.149 + binmode $$filehandle;
1.150 + return 1;
1.151 + }
1.152 + return 0;
1.153 + }
1.154 +
1.155 +sub ReadParameters
1.156 + {
1.157 + my $args = shift;
1.158 + my $outputbyteordermark = shift;
1.159 + my $unicodebyteorder = shift;
1.160 + my $inputencoding = shift;
1.161 + my $outputencoding = shift;
1.162 + my $inputhandle = shift;
1.163 + my $outputhandle = shift;
1.164 + my $i;
1.165 + my $range;
1.166 + if ((@$args <= 0) || ($args->[0] eq "?") || ($args->[0] eq "/?"))
1.167 + {
1.168 + PrintUsage();
1.169 + exit;
1.170 + }
1.171 +
1.172 + for ($i = 0; $i < @$args ; ++$i)
1.173 + {
1.174 + if ( $args->[$i]=~ /-byteordermark/i)
1.175 + {
1.176 + Assert(!$$outputbyteordermark, "\"-byteordermark\" is specified more than once");
1.177 + $$outputbyteordermark = 1;
1.178 + }
1.179 + elsif ($args->[$i]=~ /-big/i)
1.180 + {
1.181 + Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
1.182 + $$unicodebyteorder = 1;
1.183 + }
1.184 + elsif ($args->[$i]=~ /-little/i)
1.185 + {
1.186 + Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
1.187 + $$unicodebyteorder = 2;
1.188 + }
1.189 + else
1.190 + {
1.191 + Assert(TryFileParameter($args, \$i, "input",$inputencoding,$inputhandle) ||
1.192 + TryFileParameter($args, \$i, "output",$outputencoding, $outputhandle), "bad parameter \"$args->[$i]\"");
1.193 + }
1.194 + }
1.195 + Assert($$inputencoding ne "", "no input encoding is specified");
1.196 + Assert($$outputencoding ne "", "no output encoding is specified");
1.197 + }
1.198 +
1.199 +sub ReadFromFile
1.200 + {
1.201 + my $buffer = shift;
1.202 + my $numOfBytesToRead = shift;
1.203 + my $inputhandle = shift;
1.204 + my $numOfBytesRead = 0;
1.205 + my $numOfBytesToReadThisTime = $numOfBytesToRead;
1.206 +
1.207 + for(;;)
1.208 + {
1.209 + for(;;)
1.210 + {
1.211 + my $remainingNumOfBytesToRead = $numOfBytesToRead - $numOfBytesRead;
1.212 + if ($numOfBytesToReadThisTime > $remainingNumOfBytesToRead)
1.213 + {
1.214 + $numOfBytesToReadThisTime = $remainingNumOfBytesToRead;
1.215 + }
1.216 + my $numOfBytesReadThisTime = read $inputhandle, $$buffer, $numOfBytesToReadThisTime;
1.217 + if (defined $numOfBytesReadThisTime)
1.218 + {
1.219 + $numOfBytesRead += $numOfBytesReadThisTime;
1.220 + Assert($numOfBytesRead <= $numOfBytesReadThisTime, "internal error (read too many bytes)");
1.221 + if (($numOfBytesRead >= $numOfBytesReadThisTime) || $numOfBytesReadThisTime == 0)
1.222 + {
1.223 + return;
1.224 + }
1.225 + last;
1.226 + }
1.227 + $numOfBytesToReadThisTime /= 2;
1.228 + Assert($numOfBytesToReadThisTime >0, "reading from file failed");
1.229 + }
1.230 + }
1.231 + }
1.232 +
1.233 +sub HandleByteOrderMarks
1.234 + {
1.235 + my $outputbyteordermark = shift;
1.236 + my $unicodebyteorder = shift;
1.237 + my $inputencoding = shift;
1.238 + my $outputencoding = shift;
1.239 + my $inputhandle = shift;
1.240 + my $outputhandle = shift;
1.241 +
1.242 + if ($$inputencoding =~ /unicode/i)
1.243 + {
1.244 + my $firstUnicodeCharacter = 0;
1.245 + ReadFromFile(\$firstUnicodeCharacter, 2, $inputhandle);
1.246 + my $byteOrderSpecifiedByByteOrderMark = 0;
1.247 + if (length($firstUnicodeCharacter) == 2)
1.248 + {
1.249 + my @firstUnicodeCharacter = unpack "C*", $firstUnicodeCharacter;
1.250 + if (($firstUnicodeCharacter[0]==0xff) && ($firstUnicodeCharacter[1]==0xfe))
1.251 + {
1.252 + $byteOrderSpecifiedByByteOrderMark = 2;
1.253 + }
1.254 + elsif (($firstUnicodeCharacter[0]==0xfe) && ($firstUnicodeCharacter[1]==0xff))
1.255 + {
1.256 + $byteOrderSpecifiedByByteOrderMark = 1;
1.257 + }
1.258 + else
1.259 + {
1.260 + my $error = seek $inputhandle, 0, 0; # rewind to start of file
1.261 + Assert ($error == 1, "could not rewind to the start of input file");
1.262 + }
1.263 + }
1.264 + if ($byteOrderSpecifiedByByteOrderMark!=0)
1.265 + {
1.266 + if (($$unicodebyteorder!=0) && ($byteOrderSpecifiedByByteOrderMark!=$$unicodebyteorder))
1.267 + {
1.268 + PrintWarning ("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
1.269 + }
1.270 + $$unicodebyteorder = $byteOrderSpecifiedByByteOrderMark;
1.271 + }
1.272 + }
1.273 + if ($outputbyteordermark)
1.274 + {
1.275 + if ($$outputencoding ne "unicode")
1.276 + {
1.277 + PrintWarning("\"-byteordermark\" is only relevant for unicode output");
1.278 + }
1.279 + else
1.280 + {
1.281 + Assert($$unicodebyteorder!=0, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
1.282 + my $firstUnicodeCharacter=($$unicodebyteorder==1)? "\xfe\xff": "\xff\xfe";
1.283 + WriteToFile(\$firstUnicodeCharacter, $outputhandle);
1.284 + }
1.285 + }
1.286 + }
1.287 +
1.288 +sub WriteToFile
1.289 + {
1.290 + my $buffer = shift;
1.291 + my $outputhandle = shift;
1.292 +
1.293 + print $outputhandle $$buffer;
1.294 + }
1.295 +
1.296 +sub DoConversion
1.297 + {
1.298 + my $unicodebyteorder = shift;
1.299 + my $inputencoding = shift;
1.300 + my $outputencoding = shift;
1.301 + my $inputhandle = shift;
1.302 + my $outputhandle = shift;
1.303 + my $foreignCharacters = shift;
1.304 + my $unicodeCharacters = shift;
1.305 +
1.306 + my $currentBuffer = 0;
1.307 + my @arrayOfBuffers = ('', '', '');
1.308 + my $largeNumber=1000000;
1.309 + ReadFromFile(\($arrayOfBuffers[$currentBuffer]), $largeNumber, $inputhandle);
1.310 + ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $inputencoding, \($arrayOfBuffers[$currentBuffer]));
1.311 + if ($$inputencoding ne $$outputencoding)
1.312 + {
1.313 + if ($$inputencoding !~ /^unicode$/i)
1.314 + {
1.315 + my $nextBuffer = $currentBuffer + 1;
1.316 + OtherToUnicode ($inputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
1.317 + $currentBuffer = $nextBuffer;
1.318 + }
1.319 + if ($$outputencoding !~ /^unicode$/i)
1.320 + {
1.321 + my $nextBuffer = $currentBuffer + 1;
1.322 + UnicodeToOther($outputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
1.323 + $currentBuffer = $nextBuffer;
1.324 + }
1.325 + }
1.326 + ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $outputencoding, \($arrayOfBuffers[$currentBuffer]));
1.327 + WriteToFile(\($arrayOfBuffers[$currentBuffer]), $outputhandle);
1.328 + }
1.329 +
1.330 +sub ReverseByteOrderIfUnicodeAndBigEndian
1.331 + {
1.332 + my $unicodebyteorder = shift;
1.333 + my $encoding = shift;
1.334 + my $buffer = shift;
1.335 + my $i;
1.336 +
1.337 + if ($$encoding =~ /^unicode$/i)
1.338 + {
1.339 + Assert(length($$buffer)%2==0, "internal error (bad number of bytes in unicode buffer)");
1.340 + if ($$unicodebyteorder==0)
1.341 + {
1.342 + PrintWarning("the byte order of unicode text is unspecified - defaulting to little-endian");
1.343 + $$unicodebyteorder = 2;
1.344 + }
1.345 + if ($$unicodebyteorder==1)
1.346 + {
1.347 + $$buffer=pack('v*', unpack('n*', $$buffer));
1.348 + }
1.349 + }
1.350 + }
1.351 +
1.352 +sub FillInHashes
1.353 + {
1.354 + my $foreignCharacters = shift;
1.355 + my $unicodeCharacters = shift;
1.356 + my $encoding = shift;
1.357 + my $replacementCharacter = shift;
1.358 + my $ranges = shift;
1.359 + my $bigEndian = shift;
1.360 +
1.361 + my $endianness = 0;
1.362 + my $replacenum = 0;
1.363 + my $rangenum = 0;
1.364 + my $fileread = 0;
1.365 + my $largenumber = 1000000;
1.366 +
1.367 + my $dataFile=&PerlScriptPath()."charconv\\".$$encoding.'.dat';
1.368 +
1.369 + my $line;
1.370 +
1.371 + if (-e $dataFile)
1.372 + {
1.373 + open (HASH_INPUT, "< $dataFile") or die ("Could not open file for reading");
1.374 +
1.375 + binmode HASH_INPUT;
1.376 + # reading the endianness
1.377 + $fileread = read HASH_INPUT, $endianness, 1;
1.378 + $endianness = unpack "C",$endianness;
1.379 + if ($endianness == 0)
1.380 + {
1.381 + # set the template to a default-> n for the eman time
1.382 + $$bigEndian = 0;
1.383 + }
1.384 + elsif ($endianness == 1)
1.385 + {
1.386 + $$bigEndian = 0;
1.387 + }
1.388 + elsif ($endianness == 2)
1.389 + {
1.390 + $$bigEndian = 1;
1.391 + }
1.392 + else
1.393 + {
1.394 + print "Illegal Endianness specified in the control files";
1.395 + }
1.396 + #reading the replacement characters
1.397 + $fileread = read HASH_INPUT, $replacenum,1;
1.398 + $replacenum= unpack "C",$replacenum;
1.399 + $fileread = read HASH_INPUT, $$replacementCharacter,$replacenum;
1.400 + # reading the ranges
1.401 + $fileread = read HASH_INPUT, $rangenum, 1;
1.402 + $rangenum = unpack "C",$rangenum;
1.403 + my $i; # loop variable
1.404 + for ($i=0; $i < $rangenum; ++$i)
1.405 + {
1.406 + my $lowerrange = 0;
1.407 + my $upperrange = 0;
1.408 + my $followchar = 0;
1.409 +
1.410 + $fileread = read HASH_INPUT,$lowerrange,1;
1.411 + $lowerrange = unpack "C",$lowerrange;
1.412 + $fileread = read HASH_INPUT,$upperrange,1;
1.413 + $upperrange = unpack "C",$upperrange;
1.414 + $fileread = read HASH_INPUT,$followchar,1;
1.415 + $followchar = unpack "C",$followchar;
1.416 +
1.417 + push @$ranges,[$lowerrange,$upperrange,$followchar];
1.418 + }
1.419 + my $data = 0;
1.420 + my @unpackeddata = 0;
1.421 + $fileread = read HASH_INPUT, $data, $largenumber;
1.422 + @unpackeddata = unpack "v*",$data;
1.423 + for($i = 0; $i <= $#unpackeddata; $i= $i+2)
1.424 + {
1.425 + $unicodeCharacters->{$unpackeddata[$i]}=$unpackeddata[$i+1];
1.426 + $foreignCharacters->{$unpackeddata[$i+1]}=$unpackeddata[$i];
1.427 + }
1.428 + }
1.429 + else
1.430 + {
1.431 + die ("Encoding Format \"$$encoding\" not recognised");
1.432 + }
1.433 + }
1.434 +
1.435 +sub OtherToUnicode
1.436 + {
1.437 + my $inputencoding = shift;
1.438 + my $unicode = shift;
1.439 + my $other = shift;
1.440 + my $foreignCharacters = shift;
1.441 + my $unicodeCharacters = shift;
1.442 + my $unicodetemplate = shift;
1.443 + my $replacementCharacter = 0;
1.444 + my $unicodeReplacementCharacter = pack($unicodetemplate, 0xfffd);
1.445 + my @ranges=();
1.446 +
1.447 + my $otherIndex= 0;
1.448 + my $numOfBytes = length($other);
1.449 + my $key = 0;
1.450 + my $inRange = 0;
1.451 + my $followByte = -1;
1.452 +
1.453 + if ($$inputencoding=~/^utf8$/i)
1.454 + {
1.455 + return &Utf8ToUnicode($unicode, $other, $unicodetemplate);
1.456 + }
1.457 + my $bigEndian;
1.458 + FillInHashes($foreignCharacters,$unicodeCharacters, $inputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
1.459 + for (;;)
1.460 + {
1.461 + if ($otherIndex > $numOfBytes -1)
1.462 + {
1.463 + last;
1.464 + }
1.465 + my $frontByte = (unpack("x$otherIndex".'C', $other))[0];
1.466 + # @ranges is an array of references. Each reference is a reference to an array
1.467 + for ($key = 0; $key <= $#ranges; ++$key)
1.468 + {
1.469 + my $arrayref = $ranges[$key];
1.470 + if (($frontByte >= $arrayref->[0]) && ($frontByte <= $arrayref->[1]))
1.471 + {
1.472 + $followByte = $arrayref->[2];
1.473 + $inRange = 1;
1.474 + }
1.475 + }
1.476 + Assert ($inRange != 0, "cannot figure out the Byte size of the character");
1.477 + my $tempByte = 0;
1.478 + for ($key = 0; $key<= $followByte; ++$key)
1.479 + {
1.480 + if ($bigEndian)
1.481 + {
1.482 + $tempByte = ($tempByte << 8) | (unpack("x$otherIndex".'C', $other))[0];
1.483 + }
1.484 + else
1.485 + {
1.486 + $tempByte = $tempByte | ((unpack("x$otherIndex".'C', $other))[0] << (8*$key));
1.487 + }
1.488 + $otherIndex++;
1.489 + }
1.490 + if (exists $unicodeCharacters->{$tempByte})
1.491 + {
1.492 + $$unicode .= pack $unicodetemplate , $unicodeCharacters->{$tempByte};
1.493 + }
1.494 + else
1.495 + {
1.496 + $$unicode .= $unicodeReplacementCharacter;
1.497 + }
1.498 + }
1.499 + }
1.500 +
1.501 +sub UnicodeToOther
1.502 + {
1.503 + my $outputencoding = shift;
1.504 + my $other = shift;
1.505 + my $unicode = shift;
1.506 + my $foreignCharacters = shift;
1.507 + my $unicodeCharacters = shift;
1.508 + my $unicodetemplate = shift;
1.509 + my $replacementCharacter = 0;
1.510 + my @ranges=();
1.511 +
1.512 + my $unicodeIndex= 0;
1.513 + my $numOfBytes = length($unicode);
1.514 + my @UnicodeUnpacked = ();
1.515 + my $key = 0;
1.516 +
1.517 + if ($$outputencoding=~/^utf8$/i)
1.518 + {
1.519 + return &UnicodeToUtf8($other, $unicode, $unicodetemplate);
1.520 + }
1.521 + my $bigEndian;
1.522 + FillInHashes($foreignCharacters,$unicodeCharacters, $outputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
1.523 + my $foreignTemplate=$bigEndian? 'n': 'v';
1.524 + @UnicodeUnpacked = unpack "$unicodetemplate*", $unicode;
1.525 + foreach $key (@UnicodeUnpacked)
1.526 + {
1.527 + if (!exists($foreignCharacters->{$key}))
1.528 + {
1.529 + $$other .= $replacementCharacter;
1.530 + }
1.531 + else
1.532 + {
1.533 + # This is the WRONG but it will work for the mean time
1.534 + # This will fail if the foreignCharacter has characters that are more than
1.535 + # two bytes long ..... But this should work for foreign characters of 1 or 2 Bytes
1.536 +
1.537 + my $foreignValue = $foreignCharacters->{$key};
1.538 + if ( $foreignValue <= 255)
1.539 + {
1.540 + $$other .= pack "C" , $foreignValue;
1.541 + }
1.542 + else
1.543 + {
1.544 + $$other .= pack $foreignTemplate, $foreignValue;
1.545 + }
1.546 + }
1.547 + }
1.548 + }
1.549 +