First public contribution.
2 # Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
4 # This component and the accompanying materials are made available
5 # under the terms of "Eclipse Public License v1.0"
6 # which accompanies this distribution, and is available
7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
9 # Initial Contributors:
10 # Nokia Corporation - initial contribution.
# NOTE(review): the "sub" line and the braces of this routine are not visible
# in this extract; it appears to be the body of PerlScriptPath, which is called
# with no arguments elsewhere in this file - confirm.
# Purpose: derive the directory that holds this script from $0 and return it.
# The substitution captures the last path separator and puts it back, so the
# result keeps its trailing separator and a file name can be appended directly.
22 my $perlScriptPath=$0;
23 my $os = $^O; #get the OS type
# On Windows the path is first normalised to back-slashes, then the script's
# own file name is stripped.
25 if($os=~/MSWin32/) #Windows OS
27 $perlScriptPath=~s/\//\\/g; # replace any forward-slashes with back-slashes
28 $perlScriptPath=~s/(\\?)[^\\]+$/$1/; # get rid of this Perl-script's file-name
# Forward-slash variant of the same two steps (presumably the non-Windows
# branch; the "else" line is not visible here).
32 $perlScriptPath=~s/\\/\//g; # replace any back-slashes with forward-slashes
33 $perlScriptPath=~s/(\/?)[^\/]+$/$1/; # get rid of this Perl-script's file-name
# When $0 has no directory part the result is the empty string.
35 return $perlScriptPath;
# Put the directory containing this script at the front of @INC.
39 unshift(@INC, &PerlScriptPath()); # can't do "use lib &PerlScriptPath()" here as "use lib" only seems to work with *hard-coded* directory names
44 # The following numbers are used for byte-orders:
# 0 means unspecified (the initial value; it is later defaulted to little-endian with a warning)
# 1 means big-endian (set by "-big" or by an FE FF byte-order mark in the input)
47 # 2 means little-endian
# Main script: normalise the arguments, parse them, deal with byte-order
# marks, run the conversion, then close any files that were opened.
49 FixParametersToWorkWithWindows98(\@ARGV);
50 my $versionNumber = 3;
51 my $outputByteOrderMark = 0;
52 my $unicodeByteOrder = 0;
53 my $inputEncoding = "";
54 my $outputEncoding = "";
55 my %foreignCharacters = (); # Hash with the foreign Character code as the value, unicode as key
56 my %unicodeCharacters = (); # Hash with the Unicode Character code as the value, foreign as key
# Input and output default to the standard streams; ReadParameters receives
# references to these two variables and may replace them with opened files.
59 my $inputFile=\*STDIN;
60 my $outputFile=\*STDOUT;
61 ReadParameters(\@ARGV,\$outputByteOrderMark,\$unicodeByteOrder,\$inputEncoding,\$outputEncoding,\$inputFile,\$outputFile);
# $outputByteOrderMark is passed by value here; the byte order and the
# encodings are passed by reference.
62 HandleByteOrderMarks($outputByteOrderMark,\$unicodeByteOrder, \$inputEncoding,\$outputEncoding, $inputFile, $outputFile);
63 DoConversion(\$unicodeByteOrder, \$inputEncoding, \$outputEncoding, $inputFile, $outputFile, \%foreignCharacters, \%unicodeCharacters);
# "!=" on references compares them numerically, so these tests are true only
# when a handle other than the default standard stream was installed.
64 if ($inputFile!=\*STDIN)
66 close($inputFile) or die;
68 if ($outputFile!=\*STDOUT)
70 close($outputFile) or die;
# Rewrites the argument list in place so that a bare "-input" or "-output"
# followed by a separate argument becomes one "-input=<next>" or
# "-output=<next>" argument.
# Parameter: a reference to the argument array (the caller passes \@ARGV).
# NOTE(review): the lines that unpack the parameter into $parameters and
# declare $i, and the braces, are not visible in this extract. Why Windows 98
# needs this is not stated here - presumably its shell splits arguments at
# "="; confirm.
73 sub FixParametersToWorkWithWindows98
# Start at the second-to-last element so that $i+1 is always a valid index.
77 for ($i=@$parameters-2; $i>=0; --$i) # iterate backwards as some parameters may be deleted from @$parameters
# Case-insensitive match against the whole argument.
79 if (($parameters->[$i]=~/^(-input)$/i) ||
80 ($parameters->[$i]=~/^(-output)$/i))
# Append the following argument after "=" and remove it from the list.
82 $parameters->[$i].='='.$parameters->[$i+1];
83 splice(@$parameters, $i+1, 1);
# NOTE(review): the "sub" line of this routine is not visible in this extract;
# it looks like the usage/help printer (presumably PrintUsage) - confirm the
# name.
# Prints the version banner, using the file-level $versionNumber, followed by
# the command-line syntax. "print" is given no handle, so the text goes to the
# currently selected output handle.
90 print "\nVersion $versionNumber\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n";
91 print "Usage:\n\n\t charconv [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t";
92 print "options := [-big|-little][-byteordermark]\n\t";
93 print "inputspec := -input=<format> [<input_file>]\n\t";
94 print "outputspec := -output=<format> [<output_file>]\n\t";
95 print "format := unicode|utf8|big5|gb2312...\n\n";
# NOTE(review): the "sub" line is not visible in this extract; this appears to
# be the body of Assert, which the rest of the file calls as
# Assert(condition, message) - confirm.
# Dies with "Error: <message>" when the condition is false; otherwise does
# nothing visible here.
# The die text has no trailing newline, so Perl appends the script name and
# line number to it.
100 my $condition = shift;
101 my $errorMessage = shift;
102 if (!($condition)) # find out where this is used and work this out
104 die("Error: $errorMessage");
# NOTE(review): the "sub" line is not visible in this extract; this appears to
# be the body of PrintWarning, which the rest of the file calls with a single
# message string - confirm.
# Writes "Warning: <message>" and a newline to STDERR; execution continues.
110 my $warningMessage = shift;
111 print STDERR "Warning: $warningMessage\n";
# NOTE(review): the "sub" line, the unpacking of the first parameter ($args),
# the return statements and the braces are not visible in this extract; this
# appears to be the body of TryFileParameter, whose result ReadParameters uses
# as a boolean - confirm.
# Tries to interpret the argument at index $$argindex as "-input=<format>" or
# "-output=<format>" and, when a file name follows, opens that file and stores
# the handle through $filehandle.
# Parameters visible here:
#   $argindex      - reference to the current index into @$args
#   $inputoroutput - the word used to build the option prefix; the caller
#                    passes "input" or "output"
#   $encoding      - reference to the scalar holding the encoding name
#   $filehandle    - reference to the scalar that receives the opened handle
118 my $argindex = shift;
119 my $inputoroutput = shift;
120 my $encoding = shift;
121 my $filehandle = shift;
122 my $prefix = "-$inputoroutput=";
# The capture group holds everything after the "=", i.e. the format name.
124 if ($args->[$$argindex] =~ /^$prefix(.*)/)
# The encoding scalar starts out empty, so a non-empty value means the same
# option has already been seen.
126 Assert($$encoding eq "", "\"$prefix...\" is specified more than once");
# True when there is no argument at this index or it starts with "-", i.e. it
# cannot be a file name. Presumably the index was advanced just before this
# test and the default handle is kept in this case - neither is visible here.
129 if (($$argindex >= @$args) || ($args->[$$argindex] =~ /^-/))
135 if ($inputoroutput =~ /input/i)
# NOTE(review): two-argument open with the file name interpolated into the
# mode string - a name that begins with a mode character or ends with "|" is
# interpreted by open; consider the three-argument form.
137 open(INPUT_FILE,"<$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
138 $$filehandle=\*INPUT_FILE;
142 open(OUTPUT_FILE,">$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
143 $$filehandle=\*OUTPUT_FILE;
# Put the handle in binary mode.
146 binmode $$filehandle;
# NOTE(review): the "sub" line, the unpacking of the first parameter ($args),
# the declaration of $i and the braces are not visible in this extract; this
# appears to be the body of ReadParameters - confirm.
# Parses the command-line arguments and writes the results through the
# references it is given.
# Parameters visible here (all are references):
#   $outputbyteordermark - set to 1 when "-byteordermark" is given
#   $unicodebyteorder    - set to 1 for "-big", 2 for "-little"
#   $inputencoding, $outputencoding - encoding names, filled in by
#                          TryFileParameter
#   $inputhandle, $outputhandle     - handle variables, passed on to
#                          TryFileParameter
155 my $outputbyteordermark = shift;
156 my $unicodebyteorder = shift;
157 my $inputencoding = shift;
158 my $outputencoding = shift;
159 my $inputhandle = shift;
160 my $outputhandle = shift;
# No arguments, or a leading "?" or "/?": presumably the usage text is printed
# and the script exits - the branch body is not visible here.
163 if ((@$args <= 0) || ($args->[0] eq "?") || ($args->[0] eq "/?"))
169 for ($i = 0; $i < @$args ; ++$i)
# NOTE(review): the three option patterns below are unanchored, so they match
# anywhere inside an argument (for example an argument that merely contains
# "-big"); confirm whether /^-big$/i etc. was intended.
171 if ( $args->[$i]=~ /-byteordermark/i)
173 Assert(!$$outputbyteordermark, "\"-byteordermark\" is specified more than once");
174 $$outputbyteordermark = 1;
176 elsif ($args->[$i]=~ /-big/i)
178 Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
179 $$unicodebyteorder = 1;
181 elsif ($args->[$i]=~ /-little/i)
183 Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
184 $$unicodebyteorder = 2;
# Anything else must be an input or output specification. $i is passed by
# reference and TryFileParameter dereferences it as its current argument
# index.
188 Assert(TryFileParameter($args, \$i, "input",$inputencoding,$inputhandle) ||
189 TryFileParameter($args, \$i, "output",$outputencoding, $outputhandle), "bad parameter \"$args->[$i]\"");
# Both encodings are mandatory.
192 Assert($$inputencoding ne "", "no input encoding is specified");
193 Assert($$outputencoding ne "", "no output encoding is specified");
# NOTE(review): the "sub" line, the unpacking of the first parameter ($buffer,
# a reference to the destination scalar), the loop headers, the return/last
# statements and the braces are not visible in this extract; this appears to
# be the body of ReadFromFile, called as ReadFromFile(\$buffer, count, handle)
# - confirm.
# Reads up to $numOfBytesToRead bytes from $inputhandle into $$buffer. When a
# read fails (returns undef) the request size is halved and, presumably, the
# read is retried - the loop itself is not visible.
199 my $numOfBytesToRead = shift;
200 my $inputhandle = shift;
201 my $numOfBytesRead = 0;
202 my $numOfBytesToReadThisTime = $numOfBytesToRead;
# Never ask for more than is still outstanding.
208 my $remainingNumOfBytesToRead = $numOfBytesToRead - $numOfBytesRead;
209 if ($numOfBytesToReadThisTime > $remainingNumOfBytesToRead)
211 $numOfBytesToReadThisTime = $remainingNumOfBytesToRead;
# NOTE(review): read is called without an OFFSET argument, so each call stores
# its data at the start of $$buffer; if this statement runs more than once,
# earlier data is overwritten - confirm against the loop structure.
213 my $numOfBytesReadThisTime = read $inputhandle, $$buffer, $numOfBytesToReadThisTime;
# read returns undef on error, 0 at end of file.
214 if (defined $numOfBytesReadThisTime)
216 $numOfBytesRead += $numOfBytesReadThisTime;
# NOTE(review): the next two tests compare the running total with the size of
# the *last* read rather than with $numOfBytesToRead, so they hold trivially
# after a first successful read; $numOfBytesToRead looks like the intended
# operand - confirm.
217 Assert($numOfBytesRead <= $numOfBytesReadThisTime, "internal error (read too many bytes)");
218 if (($numOfBytesRead >= $numOfBytesReadThisTime) || $numOfBytesReadThisTime == 0)
# Reached when the read failed: halve the request size. The division is not
# integer division, so the value can become fractional and still pass the
# ">0" check below.
224 $numOfBytesToReadThisTime /= 2;
225 Assert($numOfBytesToReadThisTime >0, "reading from file failed");
# Detects a byte-order mark at the start of unicode input and, when requested,
# writes one at the start of unicode output.
# Parameters:
#   $outputbyteordermark - flag (by value): true when "-byteordermark" was given
#   $unicodebyteorder    - reference to the byte-order code (0 unspecified,
#                          1 big-endian, 2 little-endian); may be updated here
#   $inputencoding, $outputencoding - references to the encoding names
#   $inputhandle, $outputhandle     - the handles to read from / write to
# NOTE(review): brace lines and some "else"/condition lines are not visible in
# this extract, so the nesting described below is partly inferred.
230 sub HandleByteOrderMarks
232 my $outputbyteordermark = shift;
233 my $unicodebyteorder = shift;
234 my $inputencoding = shift;
235 my $outputencoding = shift;
236 my $inputhandle = shift;
237 my $outputhandle = shift;
# Unanchored, case-insensitive match on the input encoding name.
239 if ($$inputencoding =~ /unicode/i)
241 my $firstUnicodeCharacter = 0;
# Read the first two bytes of the input.
242 ReadFromFile(\$firstUnicodeCharacter, 2, $inputhandle);
243 my $byteOrderSpecifiedByByteOrderMark = 0;
244 if (length($firstUnicodeCharacter) == 2)
246 my @firstUnicodeCharacter = unpack "C*", $firstUnicodeCharacter;
# FF FE => little-endian (2); FE FF => big-endian (1).
247 if (($firstUnicodeCharacter[0]==0xff) && ($firstUnicodeCharacter[1]==0xfe))
249 $byteOrderSpecifiedByByteOrderMark = 2;
251 elsif (($firstUnicodeCharacter[0]==0xfe) && ($firstUnicodeCharacter[1]==0xff))
253 $byteOrderSpecifiedByByteOrderMark = 1;
# Presumably executed only when no byte-order mark was found, so that the two
# bytes just read are not lost - the guarding condition is not visible here.
257 my $error = seek $inputhandle, 0, 0; # rewind to start of file
258 Assert ($error == 1, "could not rewind to the start of input file");
# A byte-order mark in the data overrides a conflicting command-line option,
# with a warning.
261 if ($byteOrderSpecifiedByByteOrderMark!=0)
263 if (($$unicodebyteorder!=0) && ($byteOrderSpecifiedByByteOrderMark!=$$unicodebyteorder))
265 PrintWarning ("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
267 $$unicodebyteorder = $byteOrderSpecifiedByByteOrderMark;
270 if ($outputbyteordermark)
# NOTE(review): this comparison is case-sensitive, while the other encoding
# tests in this file use /unicode/i; "-output=Unicode" would take the warning
# path here - confirm whether that is intended.
272 if ($$outputencoding ne "unicode")
274 PrintWarning("\"-byteordermark\" is only relevant for unicode output");
# Presumably the "else" branch: the output is unicode, so a byte order must be
# known by now.
278 Assert($$unicodebyteorder!=0, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
# Big-endian mark is FE FF, little-endian mark is FF FE.
279 my $firstUnicodeCharacter=($$unicodebyteorder==1)? "\xfe\xff": "\xff\xfe";
280 WriteToFile(\$firstUnicodeCharacter, $outputhandle);
# NOTE(review): the "sub" line and the unpacking of the first parameter
# ($buffer, a reference to the data) are not visible in this extract; this
# appears to be the body of WriteToFile, called as
# WriteToFile(\$data, $handle) - confirm.
# Prints the referenced string to the given handle. The result of print is not
# checked in the visible code.
288 my $outputhandle = shift;
290 print $outputhandle $$buffer;
# NOTE(review): the "sub" line and the braces are not visible in this extract;
# this appears to be the body of DoConversion - confirm.
# Reads the input, converts it from the input encoding to the output encoding
# using 16-bit unicode as the intermediate form, and writes the result.
# Parameters:
#   $unicodebyteorder - reference to the byte-order code
#   $inputencoding, $outputencoding - references to the encoding names
#   $inputhandle, $outputhandle     - handles to read from / write to
#   $foreignCharacters, $unicodeCharacters - references to the lookup hashes
295 my $unicodebyteorder = shift;
296 my $inputencoding = shift;
297 my $outputencoding = shift;
298 my $inputhandle = shift;
299 my $outputhandle = shift;
300 my $foreignCharacters = shift;
301 my $unicodeCharacters = shift;
# Up to three buffers are used: the raw input, the unicode intermediate and
# the final output. $currentBuffer indexes the one holding the latest stage.
303 my $currentBuffer = 0;
304 my @arrayOfBuffers = ('', '', '');
# NOTE(review): only this single read of at most 1000000 bytes is visible;
# input beyond that size does not appear to be processed - confirm.
305 my $largeNumber=1000000;
306 ReadFromFile(\($arrayOfBuffers[$currentBuffer]), $largeNumber, $inputhandle);
# Swap big-endian unicode input in place so the in-memory form is
# little-endian.
307 ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $inputencoding, \($arrayOfBuffers[$currentBuffer]));
# Identical encoding names (case-sensitive comparison) skip conversion.
308 if ($$inputencoding ne $$outputencoding)
310 if ($$inputencoding !~ /^unicode$/i)
312 my $nextBuffer = $currentBuffer + 1;
# 'v' is the pack template for a little-endian 16-bit value. The destination
# buffer is passed by reference, the source buffer by value.
313 OtherToUnicode ($inputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
314 $currentBuffer = $nextBuffer;
316 if ($$outputencoding !~ /^unicode$/i)
318 my $nextBuffer = $currentBuffer + 1;
319 UnicodeToOther($outputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
320 $currentBuffer = $nextBuffer;
# Swap again for big-endian unicode output, then write the final buffer.
323 ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $outputencoding, \($arrayOfBuffers[$currentBuffer]));
324 WriteToFile(\($arrayOfBuffers[$currentBuffer]), $outputhandle);
# Swaps the two bytes of every 16-bit unit in a buffer, in place, when the
# encoding is unicode and the byte order is big-endian.
# Parameters:
#   $unicodebyteorder - reference to the byte-order code; set to 2
#                       (little-endian), with a warning, if it is still 0
#   $encoding         - reference to the encoding name
#   third parameter   - reference to the buffer (used below as $$buffer)
# NOTE(review): the line that unpacks the buffer parameter and the braces are
# not visible in this extract.
327 sub ReverseByteOrderIfUnicodeAndBigEndian
329 my $unicodebyteorder = shift;
330 my $encoding = shift;
334 if ($$encoding =~ /^unicode$/i)
# 16-bit units need an even number of bytes.
336 Assert(length($$buffer)%2==0, "internal error (bad number of bytes in unicode buffer)");
337 if ($$unicodebyteorder==0)
339 PrintWarning("the byte order of unicode text is unspecified - defaulting to little-endian");
340 $$unicodebyteorder = 2;
342 if ($$unicodebyteorder==1)
# Read each pair of bytes as a big-endian 16-bit value ('n') and write it back
# as little-endian ('v'); this swaps the bytes, so the same call converts in
# either direction.
344 $$buffer=pack('v*', unpack('n*', $$buffer));
# NOTE(review): the "sub" line, some parameter/variable declarations (for
# example $ranges, used below as an array reference), the bodies of the
# endianness branches and the braces are not visible in this extract; this
# appears to be the body of FillInHashes - confirm.
# Loads the conversion data for one encoding from "<encoding>.dat" in the
# "charconv" directory next to this script and fills in the two lookup hashes,
# the replacement character and the lead-byte ranges.
# File layout as read by the visible code:
#   1 byte   endianness code (0, 1 or 2)
#   1 byte   length of the replacement character, then that many bytes
#   1 byte   number of ranges, then 3 bytes per range: lower, upper, followchar
#   rest     16-bit little-endian values taken in pairs
351 my $foreignCharacters = shift;
352 my $unicodeCharacters = shift;
353 my $encoding = shift;
354 my $replacementCharacter = shift;
356 my $bigEndian = shift;
362 my $largenumber = 1000000;
# NOTE(review): the directory separator is a hard-coded back-slash, which is
# only a path separator on Windows - confirm whether other platforms matter.
364 my $dataFile=&PerlScriptPath()."charconv\\".$$encoding.'.dat';
# NOTE(review): two-argument open on a bareword handle, and no binmode is
# visible for this binary file in this extract - confirm.
370 open (HASH_INPUT, "< $dataFile") or die ("Could not open file for reading");
# NOTE(review): $fileread (the byte count returned by read) is not checked
# anywhere in the visible code, so a truncated file is not detected here.
373 # reading the endianness
374 $fileread = read HASH_INPUT, $endianness, 1;
375 $endianness = unpack "C",$endianness;
376 if ($endianness == 0)
378 # set the template to a default-> n for the mean time
381 elsif ($endianness == 1)
385 elsif ($endianness == 2)
# Any other value is reported on the selected output handle; whether
# processing stops afterwards is not visible here.
391 print "Illegal Endianness specified in the control files";
393 #reading the replacement characters
394 $fileread = read HASH_INPUT, $replacenum,1;
395 $replacenum= unpack "C",$replacenum;
# The replacement character is stored as raw bytes through the caller's
# reference.
396 $fileread = read HASH_INPUT, $$replacementCharacter,$replacenum;
# Number of range records that follow.
398 $fileread = read HASH_INPUT, $rangenum, 1;
399 $rangenum = unpack "C",$rangenum;
400 my $i; # loop variable
401 for ($i=0; $i < $rangenum; ++$i)
# Each record is three unsigned bytes: lowest lead byte, highest lead byte,
# and a third value that OtherToUnicode uses as the number of bytes that
# follow the lead byte.
407 $fileread = read HASH_INPUT,$lowerrange,1;
408 $lowerrange = unpack "C",$lowerrange;
409 $fileread = read HASH_INPUT,$upperrange,1;
410 $upperrange = unpack "C",$upperrange;
411 $fileread = read HASH_INPUT,$followchar,1;
412 $followchar = unpack "C",$followchar;
414 push @$ranges,[$lowerrange,$upperrange,$followchar];
# The initial one-element list (0) is replaced by the unpack result two lines
# below.
417 my @unpackeddata = 0;
# Read the rest of the file (at most $largenumber bytes) as little-endian
# 16-bit values.
418 $fileread = read HASH_INPUT, $data, $largenumber;
419 @unpackeddata = unpack "v*",$data;
# Values are taken in pairs: the first of each pair is the key in
# %$unicodeCharacters and the value in %$foreignCharacters, and vice versa.
# NOTE(review): with an odd number of values the last pair reads one element
# past the end of the array (undef).
420 for($i = 0; $i <= $#unpackeddata; $i= $i+2)
422 $unicodeCharacters->{$unpackeddata[$i]}=$unpackeddata[$i+1];
423 $foreignCharacters->{$unpackeddata[$i+1]}=$unpackeddata[$i];
# Presumably reached when no data file exists for the encoding - the guarding
# condition is not visible here.
428 die ("Encoding Format \"$$encoding\" not recognised");
# NOTE(review): the "sub" line, the unpacking of the 2nd and 3rd parameters
# (used below as $unicode, a reference to the output string, and $other, the
# input string), several local declarations, the loop header and the braces
# are not visible in this extract; this appears to be the body of
# OtherToUnicode - confirm.
# Converts a string in a foreign encoding to 16-bit unicode, appending the
# result to $$unicode.
# Parameters visible here:
#   $inputencoding     - reference to the encoding name
#   $foreignCharacters, $unicodeCharacters - references to the lookup hashes
#   $unicodetemplate   - pack template for one unicode value (the caller
#                        passes 'v')
434 my $inputencoding = shift;
437 my $foreignCharacters = shift;
438 my $unicodeCharacters = shift;
439 my $unicodetemplate = shift;
440 my $replacementCharacter = 0;
# U+FFFD is emitted for input that has no mapping.
441 my $unicodeReplacementCharacter = pack($unicodetemplate, 0xfffd);
445 my $numOfBytes = length($other);
# UTF-8 is handled by a dedicated routine, not by the table lookup.
450 if ($$inputencoding=~/^utf8$/i)
452 return &Utf8ToUnicode($unicode, $other, $unicodetemplate);
# Load the tables, replacement character, ranges and endianness flag for this
# encoding.
455 FillInHashes($foreignCharacters,$unicodeCharacters, $inputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
# End-of-input test for the current byte offset; presumably this leaves the
# conversion loop, which is not visible here.
458 if ($otherIndex > $numOfBytes -1)
# "x<n>" skips n bytes, then "C" reads one unsigned byte: the lead byte of the
# next character.
462 my $frontByte = (unpack("x$otherIndex".'C', $other))[0];
463 # @ranges is an array of references. Each reference is a reference to an array
464 for ($key = 0; $key <= $#ranges; ++$key)
466 my $arrayref = $ranges[$key];
# Elements 0 and 1 are the inclusive lead-byte bounds; element 2 is used as
# the number of bytes that follow the lead byte.
467 if (($frontByte >= $arrayref->[0]) && ($frontByte <= $arrayref->[1]))
469 $followByte = $arrayref->[2];
473 Assert ($inRange != 0, "cannot figure out the Byte size of the character");
# Assemble $followByte+1 bytes into one integer code.
475 for ($key = 0; $key<= $followByte; ++$key)
# First form: earlier bytes become the more significant ones. Second form:
# earlier bytes become the less significant ones. Which form runs is
# presumably chosen by $bigEndian - the condition is not visible here.
479 $tempByte = ($tempByte << 8) | (unpack("x$otherIndex".'C', $other))[0];
483 $tempByte = $tempByte | ((unpack("x$otherIndex".'C', $other))[0] << (8*$key));
# Look the assembled code up; unmapped codes become the unicode replacement
# character.
487 if (exists $unicodeCharacters->{$tempByte})
489 $$unicode .= pack $unicodetemplate , $unicodeCharacters->{$tempByte};
493 $$unicode .= $unicodeReplacementCharacter;
# NOTE(review): the "sub" line, the unpacking of the 2nd and 3rd parameters
# (used below as $other, a reference to the output string, and $unicode, the
# input string), some local declarations and the braces are not visible in
# this extract; this appears to be the body of UnicodeToOther - confirm.
# Converts a string of 16-bit unicode values to a foreign encoding, appending
# the result to $$other.
# Parameters visible here:
#   $outputencoding    - reference to the encoding name
#   $foreignCharacters, $unicodeCharacters - references to the lookup hashes
#   $unicodetemplate   - pack template for one unicode value (the caller
#                        passes 'v')
500 my $outputencoding = shift;
503 my $foreignCharacters = shift;
504 my $unicodeCharacters = shift;
505 my $unicodetemplate = shift;
506 my $replacementCharacter = 0;
510 my $numOfBytes = length($unicode);
511 my @UnicodeUnpacked = ();
# UTF-8 is handled by a dedicated routine, not by the table lookup.
514 if ($$outputencoding=~/^utf8$/i)
516 return &UnicodeToUtf8($other, $unicode, $unicodetemplate);
# Load the tables, replacement character, ranges and endianness flag for this
# encoding.
519 FillInHashes($foreignCharacters,$unicodeCharacters, $outputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
# 'n' packs a 16-bit value big-endian, 'v' little-endian.
520 my $foreignTemplate=$bigEndian? 'n': 'v';
521 @UnicodeUnpacked = unpack "$unicodetemplate*", $unicode;
522 foreach $key (@UnicodeUnpacked)
# Unicode values without a mapping are replaced by the encoding's replacement
# character as loaded by FillInHashes.
524 if (!exists($foreignCharacters->{$key}))
526 $$other .= $replacementCharacter;
530 # This is WRONG, but it will work for the meantime
531 # This will fail if the foreignCharacter has characters that are more than
532 # two bytes long ..... But this should work for foreign characters of 1 or 2 Bytes
534 my $foreignValue = $foreignCharacters->{$key};
# Codes up to 255 are written as one byte, larger codes as one 16-bit value.
535 if ( $foreignValue <= 255)
537 $$other .= pack "C" , $foreignValue;
541 $$other .= pack $foreignTemplate, $foreignValue;