#
# Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
# Description:
# Command-line character set conversion tool.  Reads text in one encoding
# (from a file or STDIN), converts it via 16-bit Unicode, and writes it in
# another encoding (to a file or STDOUT).  Encodings other than "unicode"
# and "utf8" are described by "<encoding>.dat" tables in a "charconv"
# directory next to this script.
#

use strict;
use integer;

# Returns the directory part of $0 (including the trailing path separator),
# normalised to the path separator of the current OS.  Returns an empty
# string if $0 contains no directory part.
sub PerlScriptPath
{
    my $perlScriptPath=$0;
    my $os = $^O; # get the OS type
    if ($os=~/MSWin32/) # Windows OS
    {
        $perlScriptPath=~s/\//\\/g; # replace any forward-slashes with back-slashes
        $perlScriptPath=~s/(\\?)[^\\]+$/$1/; # get rid of this Perl-script's file-name
    }
    else # Unix OS
    {
        $perlScriptPath=~s/\\/\//g; # replace any back-slashes with forward-slashes
        $perlScriptPath=~s/(\/?)[^\/]+$/$1/; # get rid of this Perl-script's file-name
    }
    return $perlScriptPath;
}
BEGIN
{
    unshift(@INC, &PerlScriptPath()); # can't do "use lib &PerlScriptPath()" here as "use lib" only seems to work with *hard-coded* directory names
}
use PARSER;
use UTF;

# The following numbers are used for byte-orders:
#     0 means unspecified
#     1 means big-endian
#     2 means little-endian

FixParametersToWorkWithWindows98(\@ARGV);
my $versionNumber = 3;
my $outputByteOrderMark = 0;
my $unicodeByteOrder = 0;
my $inputEncoding = "";
my $outputEncoding = "";
my %foreignCharacters = (); # Hash with the foreign Character code as the value, unicode as key
my %unicodeCharacters = (); # Hash with the Unicode Character code as the value, foreign as key


my $inputFile=\*STDIN;
my $outputFile=\*STDOUT;
ReadParameters(\@ARGV,\$outputByteOrderMark,\$unicodeByteOrder,\$inputEncoding,\$outputEncoding,\$inputFile,\$outputFile);
HandleByteOrderMarks($outputByteOrderMark,\$unicodeByteOrder, \$inputEncoding,\$outputEncoding, $inputFile, $outputFile);
DoConversion(\$unicodeByteOrder, \$inputEncoding, \$outputEncoding, $inputFile, $outputFile, \%foreignCharacters, \%unicodeCharacters);
# Only close handles that were opened by this script; the comparison is on
# the reference addresses of the handles.
if ($inputFile!=\*STDIN)
{
    close($inputFile) or die;
}
if ($outputFile!=\*STDOUT)
{
    close($outputFile) or die;
}

# Joins a bare "-input" or "-output" argument with the argument that follows
# it, producing "-input=<value>" / "-output=<value>".
# NOTE(review): presumably needed because the Windows 98 shell splits
# arguments at '=' - confirm.
#   $parameters - reference to the argument array; modified in place.
sub FixParametersToWorkWithWindows98
{
    my $parameters=shift;
    my $i;
    for ($i=@$parameters-2; $i>=0; --$i) # iterate backwards as some parameters may be deleted from @$parameters
    {
        if (($parameters->[$i]=~/^(-input)$/i) ||
            ($parameters->[$i]=~/^(-output)$/i))
        {
            $parameters->[$i].='='.$parameters->[$i+1];
            splice(@$parameters, $i+1, 1);
        }
    }
}

# Prints the version number and the command-line syntax to STDOUT.
# The <placeholder> tokens reflect the grammar implemented by ReadParameters
# and TryFileParameter: "-input=<format>" / "-output=<format>", each
# optionally followed by a file name.
sub PrintUsage
{
    print "\nVersion $versionNumber\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n";
    print "Usage:\n\n\t charconv [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t";
    print "options := [-big|-little][-byteordermark]\n\t";
    print "inputspec := -input=<format> [<input_file>]\n\t";
    print "outputspec := -output=<format> [<output_file>]\n\t";
    print "format := unicode|utf8|big5|gb2312...\n\n";
}

# Dies with "Error: <message>" unless the condition is true.
#   $condition    - value tested for truth
#   $errorMessage - text appended to "Error: " in the die message
sub Assert
{
    my $condition = shift;
    my $errorMessage = shift;
    if (!($condition))
    {
        die("Error: $errorMessage");
    }
}

# Prints "Warning: <message>" followed by a newline to STDERR.
sub PrintWarning
{
    my $warningMessage = shift;
    print STDERR "Warning: $warningMessage\n";
}


# Tries to interpret the argument at index $$argindex as
# "-<inputoroutput>=<encoding>", optionally followed by a file name (the next
# argument, unless it starts with '-').
#   $args          - reference to the argument array
#   $argindex      - reference to the current index; advanced past the file
#                    name when one is consumed
#   $inputoroutput - "input" or "output"
#   $encoding      - reference to the scalar receiving the encoding name
#   $filehandle    - reference to the scalar holding the file handle; it is
#                    replaced if a file name is given, and the resulting
#                    handle is always switched to binary mode
# Returns 1 if the argument was recognised, otherwise 0.
# Dies if the same specification is given twice or the file cannot be opened.
sub TryFileParameter
{
    my $args = shift;
    my $argindex = shift;
    my $inputoroutput = shift;
    my $encoding = shift;
    my $filehandle = shift;
    my $prefix = "-$inputoroutput=";

    # Case-insensitive, to agree with FixParametersToWorkWithWindows98 and
    # with the option matching in ReadParameters.
    if ($args->[$$argindex] =~ /^\Q$prefix\E(.*)/i)
    {
        my $encodingName = $1;
        Assert($$encoding eq "", "\"$prefix...\" is specified more than once");
        $$encoding = $encodingName;
        ++$$argindex;
        if (($$argindex >= @$args) || ($args->[$$argindex] =~ /^-/))
        {
            # no file name follows - keep the default handle
            --$$argindex;
        }
        else
        {
            # Three-argument open, so that characters in the file name are
            # never interpreted as part of the open mode.
            my $mode = ($inputoroutput =~ /input/i)? '<': '>';
            open(my $handle, $mode, $args->[$$argindex]) or die "opening $inputoroutput-file failed $!";
            $$filehandle = $handle;
        }
        binmode $$filehandle;
        return 1;
    }
    return 0;
}

# Parses the command line.  Prints the usage text and exits if there are no
# arguments or the first one is "?" or "/?".
#   $args                - reference to the argument array
#   $outputbyteordermark - reference; set to 1 if "-byteordermark" is given
#   $unicodebyteorder    - reference; set to 1 for "-big", 2 for "-little"
#   $inputencoding       - reference; receives the input encoding name
#   $outputencoding      - reference; receives the output encoding name
#   $inputhandle         - reference to the input handle scalar
#   $outputhandle        - reference to the output handle scalar
# Dies (via Assert) on duplicated or unrecognised parameters, or if either
# encoding is missing.
sub ReadParameters
{
    my $args = shift;
    my $outputbyteordermark = shift;
    my $unicodebyteorder = shift;
    my $inputencoding = shift;
    my $outputencoding = shift;
    my $inputhandle = shift;
    my $outputhandle = shift;
    my $i;
    if ((@$args <= 0) || ($args->[0] eq "?") || ($args->[0] eq "/?"))
    {
        PrintUsage();
        exit;
    }

    for ($i = 0; $i < @$args ; ++$i)
    {
        if ( $args->[$i]=~ /-byteordermark/i)
        {
            Assert(!$$outputbyteordermark, "\"-byteordermark\" is specified more than once");
            $$outputbyteordermark = 1;
        }
        elsif ($args->[$i]=~ /-big/i)
        {
            Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
            $$unicodebyteorder = 1;
        }
        elsif ($args->[$i]=~ /-little/i)
        {
            Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
            $$unicodebyteorder = 2;
        }
        else
        {
            # TryFileParameter may advance $i past a consumed file name
            Assert(TryFileParameter($args, \$i, "input",$inputencoding,$inputhandle) ||
                   TryFileParameter($args, \$i, "output",$outputencoding, $outputhandle), "bad parameter \"$args->[$i]\"");
        }
    }
    Assert($$inputencoding ne "", "no input encoding is specified");
    Assert($$outputencoding ne "", "no output encoding is specified");
}

# Reads up to $numOfBytesToRead bytes from $inputhandle into $$buffer,
# replacing its previous contents.  Stops early at end of file.  If a read
# fails, the chunk size is halved and the read is retried; dies (via Assert)
# once the chunk size reaches zero.
#   $buffer           - reference to the scalar receiving the data
#   $numOfBytesToRead - maximum number of bytes to read
#   $inputhandle      - handle to read from
sub ReadFromFile
{
    my $buffer = shift;
    my $numOfBytesToRead = shift;
    my $inputhandle = shift;
    my $numOfBytesRead = 0;
    my $numOfBytesToReadThisTime = $numOfBytesToRead;

    $$buffer = '';
    while ($numOfBytesRead < $numOfBytesToRead)
    {
        my $remainingNumOfBytesToRead = $numOfBytesToRead - $numOfBytesRead;
        if ($numOfBytesToReadThisTime > $remainingNumOfBytesToRead)
        {
            $numOfBytesToReadThisTime = $remainingNumOfBytesToRead;
        }
        # The OFFSET argument appends to what has been read so far instead of
        # overwriting it.
        my $numOfBytesReadThisTime = read $inputhandle, $$buffer, $numOfBytesToReadThisTime, $numOfBytesRead;
        if (defined $numOfBytesReadThisTime)
        {
            if ($numOfBytesReadThisTime == 0)
            {
                return; # end of file
            }
            $numOfBytesRead += $numOfBytesReadThisTime;
            Assert($numOfBytesRead <= $numOfBytesToRead, "internal error (read too many bytes)");
            next;
        }
        # the read failed - try again with a smaller chunk ("use integer"
        # makes this an integer division, so it eventually reaches zero)
        $numOfBytesToReadThisTime /= 2;
        Assert($numOfBytesToReadThisTime >0, "reading from file failed");
    }
}

# Deals with byte-order marks on both sides of the conversion.
# Input: if the input encoding is unicode, the first two bytes are examined.
# A byte-order mark (FF FE = little-endian, FE FF = big-endian) is consumed
# and overrides any byte order given on the command line (with a warning if
# they differ); otherwise the input is rewound to its start.
# Output: if a byte-order mark was requested and the output encoding is
# unicode, the mark is written to the output.
#   $outputbyteordermark - true if "-byteordermark" was given
#   $unicodebyteorder    - reference to the byte order (0/1/2, see top of file)
#   $inputencoding       - reference to the input encoding name
#   $outputencoding      - reference to the output encoding name
#   $inputhandle         - input handle
#   $outputhandle        - output handle
sub HandleByteOrderMarks
{
    my $outputbyteordermark = shift;
    my $unicodebyteorder = shift;
    my $inputencoding = shift;
    my $outputencoding = shift;
    my $inputhandle = shift;
    my $outputhandle = shift;

    # "unicode" is matched the same way as in DoConversion and
    # ReverseByteOrderIfUnicodeAndBigEndian
    if ($$inputencoding =~ /^unicode$/i)
    {
        my $firstUnicodeCharacter = '';
        ReadFromFile(\$firstUnicodeCharacter, 2, $inputhandle);
        my $byteOrderSpecifiedByByteOrderMark = 0;
        if (length($firstUnicodeCharacter) == 2)
        {
            my @firstUnicodeCharacter = unpack "C*", $firstUnicodeCharacter;
            if (($firstUnicodeCharacter[0]==0xff) && ($firstUnicodeCharacter[1]==0xfe))
            {
                $byteOrderSpecifiedByByteOrderMark = 2;
            }
            elsif (($firstUnicodeCharacter[0]==0xfe) && ($firstUnicodeCharacter[1]==0xff))
            {
                $byteOrderSpecifiedByByteOrderMark = 1;
            }
            else
            {
                my $error = seek $inputhandle, 0, 0; # rewind to start of file
                Assert ($error == 1, "could not rewind to the start of input file");
            }
        }
        if ($byteOrderSpecifiedByByteOrderMark!=0)
        {
            if (($$unicodebyteorder!=0) && ($byteOrderSpecifiedByByteOrderMark!=$$unicodebyteorder))
            {
                PrintWarning ("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
            }
            $$unicodebyteorder = $byteOrderSpecifiedByByteOrderMark;
        }
    }
    if ($outputbyteordermark)
    {
        if ($$outputencoding !~ /^unicode$/i)
        {
            PrintWarning("\"-byteordermark\" is only relevant for unicode output");
        }
        else
        {
            Assert($$unicodebyteorder!=0, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
            my $firstUnicodeCharacter=($$unicodebyteorder==1)? "\xfe\xff": "\xff\xfe";
            WriteToFile(\$firstUnicodeCharacter, $outputhandle);
        }
    }
}

# Writes $$buffer to $outputhandle; dies (via Assert) if the write fails.
#   $buffer       - reference to the data to write
#   $outputhandle - handle to write to
sub WriteToFile
{
    my $buffer = shift;
    my $outputhandle = shift;

    my $written = print {$outputhandle} $$buffer;
    Assert($written, "writing to file failed");
}

# Reads the whole input, converts it from the input encoding to the output
# encoding (going through little-endian 16-bit unicode as the intermediate
# form) and writes the result.
#   $unicodebyteorder  - reference to the byte order (0/1/2, see top of file)
#   $inputencoding     - reference to the input encoding name
#   $outputencoding    - reference to the output encoding name
#   $inputhandle       - input handle
#   $outputhandle      - output handle
#   $foreignCharacters - reference to the unicode -> foreign hash
#   $unicodeCharacters - reference to the foreign -> unicode hash
sub DoConversion
{
    my $unicodebyteorder = shift;
    my $inputencoding = shift;
    my $outputencoding = shift;
    my $inputhandle = shift;
    my $outputhandle = shift;
    my $foreignCharacters = shift;
    my $unicodeCharacters = shift;

    my $currentBuffer = 0;
    my @arrayOfBuffers = ('', '', '');
    my $largeNumber=1000000;
    # Read chunk by chunk until a short chunk signals end of file, so that
    # input larger than $largeNumber bytes is not silently truncated.
    my $chunk = '';
    do
    {
        ReadFromFile(\$chunk, $largeNumber, $inputhandle);
        $arrayOfBuffers[$currentBuffer] .= $chunk;
    } while (length($chunk) == $largeNumber);
    ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $inputencoding, \($arrayOfBuffers[$currentBuffer]));
    if ($$inputencoding ne $$outputencoding)
    {
        if ($$inputencoding !~ /^unicode$/i)
        {
            my $nextBuffer = $currentBuffer + 1;
            OtherToUnicode ($inputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
            $currentBuffer = $nextBuffer;
        }
        if ($$outputencoding !~ /^unicode$/i)
        {
            my $nextBuffer = $currentBuffer + 1;
            UnicodeToOther($outputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
            $currentBuffer = $nextBuffer;
        }
    }
    ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $outputencoding, \($arrayOfBuffers[$currentBuffer]));
    WriteToFile(\($arrayOfBuffers[$currentBuffer]), $outputhandle);
}

# If $$encoding is unicode and the byte order is big-endian, swaps the two
# bytes of every 16-bit unit in $$buffer.  An unspecified byte order is set
# to little-endian, with a warning.
#   $unicodebyteorder - reference to the byte order (0/1/2, see top of file)
#   $encoding         - reference to the encoding name of the buffer
#   $buffer           - reference to the data; modified in place
# Dies (via Assert) if a unicode buffer has an odd number of bytes.
sub ReverseByteOrderIfUnicodeAndBigEndian
{
    my $unicodebyteorder = shift;
    my $encoding = shift;
    my $buffer = shift;

    if ($$encoding =~ /^unicode$/i)
    {
        Assert(length($$buffer)%2==0, "internal error (bad number of bytes in unicode buffer)");
        if ($$unicodebyteorder==0)
        {
            PrintWarning("the byte order of unicode text is unspecified - defaulting to little-endian");
            $$unicodebyteorder = 2;
        }
        if ($$unicodebyteorder==1)
        {
            $$buffer=pack('v*', unpack('n*', $$buffer));
        }
    }
}

# Reads a single byte from a character-set data file and returns its numeric
# value.  Dies (via Assert) if the byte cannot be read.
sub _ReadDataFileByte
{
    my $handle = shift;
    my $byte = '';
    my $numOfBytesRead = read $handle, $byte, 1;
    Assert(defined($numOfBytesRead) && ($numOfBytesRead == 1), "unexpected end of character-set data file");
    return unpack "C", $byte;
}

# Loads the conversion table for $$encoding from
# "<script directory>charconv/<encoding>.dat".
# Layout of the data file, as read below:
#     1 byte    endianness of multi-byte foreign characters
#               (0 or 1 -> $$bigEndian = 0, 2 -> $$bigEndian = 1)
#     1 byte    length N of the replacement character
#     N bytes   the replacement character
#     1 byte    number of lead-byte ranges R
#     R x 3     lower bound, upper bound, number of bytes following the lead byte
#     rest      pairs of little-endian 16-bit values: foreign code, unicode code
#   $foreignCharacters    - reference to hash; refilled with unicode -> foreign
#   $unicodeCharacters    - reference to hash; refilled with foreign -> unicode
#   $encoding             - reference to the encoding name
#   $replacementCharacter - reference; receives the replacement character
#   $ranges               - reference to array; [lower, upper, follow] entries
#                           are appended
#   $bigEndian            - reference; receives the endianness flag
# Dies if the data file does not exist, cannot be opened or is cut short.
sub FillInHashes
{
    my $foreignCharacters = shift;
    my $unicodeCharacters = shift;
    my $encoding = shift;
    my $replacementCharacter = shift;
    my $ranges = shift;
    my $bigEndian = shift;

    my $largenumber = 1000000;

    # PerlScriptPath() already uses the separator of the current OS, so the
    # sub-directory must be appended with the same separator.
    my $separator = ($^O =~ /MSWin32/)? "\\": "/";
    my $dataFile=PerlScriptPath()."charconv".$separator.$$encoding.'.dat';

    if (!-e $dataFile)
    {
        die ("Encoding Format \"$$encoding\" not recognised");
    }
    open (my $hashInput, '<', $dataFile) or die ("Could not open file for reading");
    binmode $hashInput;

    # The same two hashes are used for the input and for the output encoding,
    # so entries of a previously loaded encoding must not survive.
    %$foreignCharacters = ();
    %$unicodeCharacters = ();

    # reading the endianness
    my $endianness = _ReadDataFileByte($hashInput);
    if (($endianness == 0) || ($endianness == 1))
    {
        $$bigEndian = 0;
    }
    elsif ($endianness == 2)
    {
        $$bigEndian = 1;
    }
    else
    {
        # reported on STDERR, because STDOUT may be the conversion output
        PrintWarning("Illegal Endianness specified in the control files");
        $$bigEndian = 0;
    }

    # reading the replacement character
    my $replacenum = _ReadDataFileByte($hashInput);
    my $fileread = read $hashInput, $$replacementCharacter, $replacenum;
    Assert(defined($fileread) && ($fileread == $replacenum), "unexpected end of character-set data file");

    # reading the ranges
    my $rangenum = _ReadDataFileByte($hashInput);
    for (my $i=0; $i < $rangenum; ++$i)
    {
        my $lowerrange = _ReadDataFileByte($hashInput);
        my $upperrange = _ReadDataFileByte($hashInput);
        my $followchar = _ReadDataFileByte($hashInput);

        push @$ranges,[$lowerrange,$upperrange,$followchar];
    }

    # reading the (foreign, unicode) pairs
    my $data = '';
    $fileread = read $hashInput, $data, $largenumber;
    Assert(defined($fileread), "reading from file failed");
    close($hashInput) or die;

    my @unpackeddata = unpack "v*",$data;
    # "$i+1 <= ..." ignores an incomplete pair at the end of the data
    for (my $i = 0; $i+1 <= $#unpackeddata; $i = $i+2)
    {
        $unicodeCharacters->{$unpackeddata[$i]}=$unpackeddata[$i+1];
        $foreignCharacters->{$unpackeddata[$i+1]}=$unpackeddata[$i];
    }
}

# Converts text in a foreign encoding to 16-bit unicode.
#   $inputencoding     - reference to the encoding name of $other
#   $unicode           - reference to the scalar the unicode text is appended to
#   $other             - the text to convert (a string of bytes)
#   $foreignCharacters - reference to the unicode -> foreign hash
#   $unicodeCharacters - reference to the foreign -> unicode hash
#   $unicodetemplate   - pack template for one unicode character ('v' or 'n')
# "utf8" is delegated to Utf8ToUnicode and its return value is returned.
# For other encodings the table is loaded with FillInHashes; characters that
# are not in the table, and a multi-byte character cut short by the end of
# the input, are converted to U+FFFD.
# Dies (via Assert) if a lead byte is not covered by any range of the table.
sub OtherToUnicode
{
    my $inputencoding = shift;
    my $unicode = shift;
    my $other = shift;
    my $foreignCharacters = shift;
    my $unicodeCharacters = shift;
    my $unicodetemplate = shift;
    my $replacementCharacter = 0;
    my $unicodeReplacementCharacter = pack($unicodetemplate, 0xfffd);
    my @ranges=();

    if ($$inputencoding=~/^utf8$/i)
    {
        return &Utf8ToUnicode($unicode, $other, $unicodetemplate);
    }
    my $bigEndian;
    FillInHashes($foreignCharacters,$unicodeCharacters, $inputencoding, \$replacementCharacter,\@ranges,\$bigEndian);

    my $otherIndex = 0;
    my $numOfBytes = length($other);
    while ($otherIndex < $numOfBytes)
    {
        my $frontByte = ord(substr($other, $otherIndex, 1));
        # These are reset for every character, so that a lead byte outside
        # all ranges is detected instead of re-using the previous length.
        my $inRange = 0;
        my $followByte = -1;
        # @ranges is an array of references. Each reference is a reference to an array
        # (there is deliberately no "last": the last matching range wins)
        foreach my $range (@ranges)
        {
            if (($frontByte >= $range->[0]) && ($frontByte <= $range->[1]))
            {
                $followByte = $range->[2];
                $inRange = 1;
            }
        }
        Assert ($inRange != 0, "cannot figure out the Byte size of the character");
        if ($otherIndex + $followByte >= $numOfBytes)
        {
            # the input ends in the middle of a multi-byte character
            $$unicode .= $unicodeReplacementCharacter;
            last;
        }
        # assemble the lead byte and its $followByte following bytes
        my $tempByte = 0;
        for (my $key = 0; $key <= $followByte; ++$key)
        {
            my $byte = ord(substr($other, $otherIndex, 1));
            if ($bigEndian)
            {
                $tempByte = ($tempByte << 8) | $byte;
            }
            else
            {
                $tempByte = $tempByte | ($byte << (8*$key));
            }
            $otherIndex++;
        }
        if (exists $unicodeCharacters->{$tempByte})
        {
            $$unicode .= pack $unicodetemplate , $unicodeCharacters->{$tempByte};
        }
        else
        {
            $$unicode .= $unicodeReplacementCharacter;
        }
    }
}

# Converts 16-bit unicode text to a foreign encoding.
#   $outputencoding    - reference to the encoding name to convert to
#   $other             - reference to the scalar the converted text is appended to
#   $unicode           - the unicode text to convert
#   $foreignCharacters - reference to the unicode -> foreign hash
#   $unicodeCharacters - reference to the foreign -> unicode hash
#   $unicodetemplate   - unpack template for one unicode character ('v' or 'n')
# "utf8" is delegated to UnicodeToUtf8 and its return value is returned.
# For other encodings the table is loaded with FillInHashes; characters that
# are not in the table are converted to the replacement character read from
# the data file.
sub UnicodeToOther
{
    my $outputencoding = shift;
    my $other = shift;
    my $unicode = shift;
    my $foreignCharacters = shift;
    my $unicodeCharacters = shift;
    my $unicodetemplate = shift;
    my $replacementCharacter = 0;
    my @ranges=();

    if ($$outputencoding=~/^utf8$/i)
    {
        return &UnicodeToUtf8($other, $unicode, $unicodetemplate);
    }
    my $bigEndian;
    FillInHashes($foreignCharacters,$unicodeCharacters, $outputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
    my $foreignTemplate=$bigEndian? 'n': 'v';
    my @UnicodeUnpacked = unpack "$unicodetemplate*", $unicode;
    foreach my $key (@UnicodeUnpacked)
    {
        if (!exists($foreignCharacters->{$key}))
        {
            $$other .= $replacementCharacter;
        }
        else
        {
            # This is WRONG but it will work for the mean time.
            # This will fail if the foreign encoding has characters that are more than
            # two bytes long ..... But this should work for foreign characters of 1 or 2 Bytes

            my $foreignValue = $foreignCharacters->{$key};
            if ( $foreignValue <= 255)
            {
                $$other .= pack "C" , $foreignValue;
            }
            else
            {
                $$other .= pack $foreignTemplate, $foreignValue;
            }
        }
    }
}