os/textandloc/charconvfw/charconvplugins/tools/charconv.pl
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
     1 #
     2 # Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 # All rights reserved.
     4 # This component and the accompanying materials are made available
     5 # under the terms of "Eclipse Public License v1.0"
     6 # which accompanies this distribution, and is available
     7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 #
     9 # Initial Contributors:
    10 # Nokia Corporation - initial contribution.
    11 #
    12 # Contributors:
    13 #
    14 # Description: 
    15 #
    16 
    17 use strict;
    18 use integer;
    19 
    20 sub PerlScriptPath
    21 	{
    22 	my $perlScriptPath=$0;
    23 	my $os = $^O; #get the OS type
    24 	#check OS type
    25   if($os=~/MSWin32/) #Windows OS
    26     {
    27     $perlScriptPath=~s/\//\\/g; # replace any forward-slashes with back-slashes
    28     $perlScriptPath=~s/(\\?)[^\\]+$/$1/; # get rid of this Perl-script's file-name
    29     }
    30   else #Unix OS
    31     {
    32     $perlScriptPath=~s/\\/\//g; # replace any back-slashes with forward-slashes
    33     $perlScriptPath=~s/(\/?)[^\/]+$/$1/; # get rid of this Perl-script's file-name
    34     }
    35 	return $perlScriptPath;
    36 	}
    37 BEGIN
    38 	{
    39 	unshift(@INC, &PerlScriptPath()); # can't do "use lib &PerlScriptPath()" here as "use lib" only seems to work with *hard-coded* directory names
    40 	}
    41 use PARSER;
    42 use UTF;
    43 
    44 # The following numbers are used for byte-orders:
    45 #     0 means unspecified
    46 #     1 means big-endian
    47 #     2 means little-endian
    48 
    49 FixParametersToWorkWithWindows98(\@ARGV);
    50 my $versionNumber = 3;
    51 my $outputByteOrderMark = 0;
    52 my $unicodeByteOrder = 0;
    53 my $inputEncoding = ""; 
    54 my $outputEncoding = ""; 
    55 my %foreignCharacters = (); # Hash with the foreign Character code as the value, unicode as key 
    56 my %unicodeCharacters = (); # Hash with the Unicode Character code as the value, foreign as key 
    57 
    58 
    59 my $inputFile=\*STDIN;
    60 my $outputFile=\*STDOUT;
    61 ReadParameters(\@ARGV,\$outputByteOrderMark,\$unicodeByteOrder,\$inputEncoding,\$outputEncoding,\$inputFile,\$outputFile);
    62 HandleByteOrderMarks($outputByteOrderMark,\$unicodeByteOrder, \$inputEncoding,\$outputEncoding, $inputFile, $outputFile);
    63 DoConversion(\$unicodeByteOrder, \$inputEncoding, \$outputEncoding, $inputFile, $outputFile, \%foreignCharacters, \%unicodeCharacters);
    64 if ($inputFile!=\*STDIN)
    65 	{
    66 	close($inputFile) or die;
    67 	}
    68 if ($outputFile!=\*STDOUT)
    69 	{
    70 	close($outputFile) or die;
    71 	}
    72 
    73 sub FixParametersToWorkWithWindows98
    74 	{
    75 	my $parameters=shift;
    76 	my $i;
    77 	for ($i=@$parameters-2; $i>=0; --$i) # iterate backwards as some parameters may be deleted from @$parameters
    78 		{
    79 		if (($parameters->[$i]=~/^(-input)$/i) ||
    80 			($parameters->[$i]=~/^(-output)$/i))
    81 			{
    82 			$parameters->[$i].='='.$parameters->[$i+1];
    83 			splice(@$parameters, $i+1, 1);
    84 			}
    85 		}
    86 	}
    87 
    88 sub PrintUsage 
    89 	{
    90 	print "\nVersion $versionNumber\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n";
    91 	print "Usage:\n\n\t charconv [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t";
    92 	print "options    :=  [-big|-little][-byteordermark]\n\t";
    93 	print "inputspec  :=  -input=<format> [<input_file>]\n\t";
    94 	print "outputspec :=  -output=<format> [<output_file>]\n\t";
    95 	print "format     :=  unicode|utf8|big5|gb2312...\n\n";
    96 	}
    97 
    98 sub Assert
    99 	{
   100 	my $condition = shift;
   101 	my $errorMessage = shift;
   102 	if (!($condition)) # find out where this is used and work this out 
   103 		{
   104 		die("Error: $errorMessage");
   105 		}
   106 	}
   107 
   108 sub PrintWarning 
   109 	{
   110 	my $warningMessage = shift;
   111 	print STDERR "Warning: $warningMessage\n";
   112 	}
   113 	
   114 
   115 sub TryFileParameter 
   116 	{
   117 	my $args = shift;
   118 	my $argindex = shift;
   119 	my $inputoroutput = shift;
   120 	my $encoding = shift;
   121 	my $filehandle = shift;
   122 	my $prefix = "-$inputoroutput=";
   123 
   124 	if ($args->[$$argindex] =~ /^$prefix(.*)/)
   125 		{
   126 		Assert($$encoding eq "", "\"$prefix...\" is specified more than once");
   127 		$$encoding = $1;
   128 		++$$argindex;
   129 		if (($$argindex >= @$args) || ($args->[$$argindex] =~ /^-/))
   130 			{
   131 			--$$argindex;
   132 			}
   133 		else
   134 			{
   135 			if ($inputoroutput =~ /input/i)
   136 				{
   137 				open(INPUT_FILE,"<$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
   138 				$$filehandle=\*INPUT_FILE;
   139 				}
   140 			else
   141 				{
   142 				open(OUTPUT_FILE,">$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
   143 				$$filehandle=\*OUTPUT_FILE;
   144 				}
   145 			}
   146 		binmode $$filehandle;
   147 		return  1;
   148 		}
   149 	return 0; 
   150 	}
   151 
   152 sub ReadParameters 
   153 	{
   154 	my $args = shift; 
   155 	my $outputbyteordermark = shift;
   156 	my $unicodebyteorder = shift;
   157 	my $inputencoding = shift;
   158 	my $outputencoding = shift;
   159 	my $inputhandle = shift;
   160 	my $outputhandle = shift;
   161 	my $i;
   162 	my $range;
   163 	if ((@$args <= 0) || ($args->[0] eq "?") || ($args->[0] eq "/?"))
   164 		{
   165 		PrintUsage();
   166 		exit;
   167 		}
   168 
   169 	for ($i = 0; $i < @$args ; ++$i)
   170 		{
   171 		if ( $args->[$i]=~ /-byteordermark/i)
   172 			{
   173 			Assert(!$$outputbyteordermark, "\"-byteordermark\" is specified more than once");
   174 			$$outputbyteordermark = 1; 
   175 			}
   176 		elsif ($args->[$i]=~ /-big/i)
   177 			{
   178 			Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
   179 			$$unicodebyteorder = 1;
   180 			}
   181 		elsif ($args->[$i]=~ /-little/i)
   182 			{
   183 			Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
   184 			$$unicodebyteorder = 2;
   185 			}
   186 		else
   187 			{ 
   188 			Assert(TryFileParameter($args, \$i, "input",$inputencoding,$inputhandle) ||
   189 				   TryFileParameter($args, \$i, "output",$outputencoding, $outputhandle), "bad parameter \"$args->[$i]\"");
   190 			}
   191 		}
   192 	Assert($$inputencoding ne "", "no input encoding is specified");
   193 	Assert($$outputencoding ne "", "no output encoding is specified");
   194 	}
   195 
   196 sub ReadFromFile
   197 	{  
   198 	my $buffer = shift;
   199 	my $numOfBytesToRead = shift;
   200 	my $inputhandle = shift;
   201 	my $numOfBytesRead = 0;									
   202 	my $numOfBytesToReadThisTime = $numOfBytesToRead;		
   203 
   204 	for(;;)
   205 		{
   206 		for(;;)
   207 			{
   208 			my $remainingNumOfBytesToRead = $numOfBytesToRead - $numOfBytesRead;  
   209 			if ($numOfBytesToReadThisTime > $remainingNumOfBytesToRead)
   210 				{
   211 				$numOfBytesToReadThisTime = $remainingNumOfBytesToRead;
   212 				}  		
   213 			my $numOfBytesReadThisTime = read $inputhandle, $$buffer, $numOfBytesToReadThisTime;  
   214 			if (defined $numOfBytesReadThisTime)
   215 				{
   216 				$numOfBytesRead += $numOfBytesReadThisTime;
   217 				Assert($numOfBytesRead <= $numOfBytesReadThisTime, "internal error (read too many bytes)");
   218 				if (($numOfBytesRead >= $numOfBytesReadThisTime) || $numOfBytesReadThisTime == 0)
   219 					{
   220 					return;
   221 					}
   222 				last;
   223 				}
   224 			$numOfBytesToReadThisTime /= 2;
   225 			Assert($numOfBytesToReadThisTime >0, "reading from file failed");
   226 			}
   227 		}
   228 	}
   229 
   230 sub HandleByteOrderMarks 
   231 	{
   232 	my $outputbyteordermark = shift;
   233 	my $unicodebyteorder = shift;
   234 	my $inputencoding = shift;
   235 	my $outputencoding = shift;
   236 	my $inputhandle = shift;
   237 	my $outputhandle = shift;
   238 
   239 	if ($$inputencoding =~ /unicode/i) 
   240 		{
   241 		my $firstUnicodeCharacter = 0; 
   242 		ReadFromFile(\$firstUnicodeCharacter, 2, $inputhandle);
   243 		my $byteOrderSpecifiedByByteOrderMark = 0;
   244 		if (length($firstUnicodeCharacter) == 2)
   245 			{
   246 			my @firstUnicodeCharacter = unpack "C*", $firstUnicodeCharacter;
   247 			if (($firstUnicodeCharacter[0]==0xff) && ($firstUnicodeCharacter[1]==0xfe))
   248 				{
   249 				$byteOrderSpecifiedByByteOrderMark = 2; 
   250 				}
   251 			elsif (($firstUnicodeCharacter[0]==0xfe) && ($firstUnicodeCharacter[1]==0xff))
   252 				{
   253 				$byteOrderSpecifiedByByteOrderMark = 1; 
   254 				}
   255 			else
   256 				{
   257 				my $error = seek $inputhandle, 0, 0; # rewind to start of file
   258 				Assert ($error == 1, "could not rewind to the start of input file");
   259 				}
   260 			}
   261 		if ($byteOrderSpecifiedByByteOrderMark!=0)
   262 			{
   263 			if (($$unicodebyteorder!=0) && ($byteOrderSpecifiedByByteOrderMark!=$$unicodebyteorder))
   264 				{
   265 				PrintWarning ("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
   266 				}
   267 			$$unicodebyteorder = $byteOrderSpecifiedByByteOrderMark;
   268 			}
   269 		}
   270 	if ($outputbyteordermark)
   271 		{
   272 		if ($$outputencoding ne "unicode") 
   273 			{
   274 			PrintWarning("\"-byteordermark\" is only relevant for unicode output");
   275 			}
   276 		else
   277 			{
   278 			Assert($$unicodebyteorder!=0, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
   279 			my $firstUnicodeCharacter=($$unicodebyteorder==1)? "\xfe\xff": "\xff\xfe";
   280 			WriteToFile(\$firstUnicodeCharacter, $outputhandle);
   281 			}
   282 		}
   283 	}
   284 
   285 sub WriteToFile 
   286 	{
   287 	my $buffer = shift;
   288 	my $outputhandle = shift;
   289 
   290 	print $outputhandle $$buffer;
   291 	}
   292 
   293 sub DoConversion
   294 	{
   295 	my $unicodebyteorder = shift;
   296 	my $inputencoding = shift;
   297 	my $outputencoding = shift;
   298 	my $inputhandle = shift;
   299 	my $outputhandle = shift; 
   300 	my $foreignCharacters = shift;
   301 	my $unicodeCharacters = shift;
   302 	
   303 	my $currentBuffer = 0;
   304 	my @arrayOfBuffers = ('', '', '');
   305 	my $largeNumber=1000000;
   306 	ReadFromFile(\($arrayOfBuffers[$currentBuffer]), $largeNumber, $inputhandle);
   307 	ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $inputencoding, \($arrayOfBuffers[$currentBuffer]));
   308 	if ($$inputencoding ne $$outputencoding) 
   309 		{
   310 		if ($$inputencoding !~ /^unicode$/i)
   311 			{
   312 			my $nextBuffer = $currentBuffer + 1;
   313 			OtherToUnicode ($inputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
   314 			$currentBuffer = $nextBuffer;
   315 			}
   316 		if ($$outputencoding !~ /^unicode$/i)
   317 			{
   318 			my $nextBuffer = $currentBuffer + 1;
   319 			UnicodeToOther($outputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
   320 			$currentBuffer = $nextBuffer;
   321 			}
   322 		}
   323 	ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $outputencoding, \($arrayOfBuffers[$currentBuffer]));
   324 	WriteToFile(\($arrayOfBuffers[$currentBuffer]), $outputhandle);
   325 	}
   326 
   327 sub ReverseByteOrderIfUnicodeAndBigEndian
   328 	{
   329 	my $unicodebyteorder = shift;
   330 	my $encoding = shift;
   331 	my $buffer = shift;
   332 	my $i;
   333 
   334 	if ($$encoding =~ /^unicode$/i)
   335 		{
   336 		Assert(length($$buffer)%2==0, "internal error (bad number of bytes in unicode buffer)");
   337 		if ($$unicodebyteorder==0)
   338 			{
   339 			PrintWarning("the byte order of unicode text is unspecified - defaulting to little-endian");
   340 			$$unicodebyteorder = 2;
   341 			}
   342 		if ($$unicodebyteorder==1)
   343 			{
   344 			$$buffer=pack('v*', unpack('n*', $$buffer));
   345 			}
   346 		}
   347 	}
   348 
   349 sub FillInHashes
   350 	{
   351 	my $foreignCharacters = shift;
   352 	my $unicodeCharacters = shift;
   353 	my $encoding = shift; 
   354 	my $replacementCharacter = shift;
   355 	my $ranges = shift;
   356 	my $bigEndian = shift;
   357 
   358 	my $endianness = 0;
   359 	my $replacenum = 0;
   360 	my $rangenum = 0;
   361 	my $fileread = 0;
   362 	my $largenumber = 1000000;
   363 
   364 	my $dataFile=&PerlScriptPath()."charconv\\".$$encoding.'.dat';
   365 
   366 	my $line;
   367 
   368 	if (-e $dataFile)
   369 		{
   370 		open (HASH_INPUT, "< $dataFile") or die ("Could not open file for reading");
   371 		
   372 		binmode HASH_INPUT;
   373 		# reading the endianness
   374 		$fileread = read HASH_INPUT, $endianness, 1;
   375 		$endianness = unpack "C",$endianness;
   376 		if ($endianness == 0)
   377 			{
   378 			# set the template to a default-> n for the eman time
   379 			$$bigEndian = 0;
   380 			}
   381 		elsif ($endianness == 1)
   382 			{
   383 			$$bigEndian = 0;
   384 			}
   385 		elsif ($endianness == 2)
   386 			{
   387 			$$bigEndian = 1;
   388 			}
   389 		else
   390 			{
   391 			print "Illegal Endianness specified in the control files";
   392 			}
   393 		#reading the replacement characters
   394 		$fileread = read HASH_INPUT, $replacenum,1;
   395 		$replacenum= unpack "C",$replacenum;
   396 		$fileread = read HASH_INPUT, $$replacementCharacter,$replacenum;
   397 		# reading the ranges
   398 		$fileread = read HASH_INPUT, $rangenum, 1;
   399 		$rangenum = unpack "C",$rangenum;
   400 		my $i; # loop variable 
   401 		for ($i=0; $i < $rangenum; ++$i)
   402 			{
   403 			my $lowerrange = 0;
   404 			my $upperrange = 0;
   405 			my $followchar = 0;
   406 
   407 			$fileread = read HASH_INPUT,$lowerrange,1;
   408 			$lowerrange = unpack "C",$lowerrange;
   409 			$fileread = read HASH_INPUT,$upperrange,1;
   410 			$upperrange = unpack "C",$upperrange;
   411 			$fileread = read HASH_INPUT,$followchar,1;
   412 			$followchar = unpack "C",$followchar;
   413 
   414 			push @$ranges,[$lowerrange,$upperrange,$followchar];
   415 			}
   416 		my $data = 0;
   417 		my @unpackeddata = 0;
   418 		$fileread = read HASH_INPUT, $data, $largenumber;
   419 		@unpackeddata = unpack "v*",$data;
   420 		for($i = 0; $i <= $#unpackeddata; $i= $i+2)
   421 			{
   422 			$unicodeCharacters->{$unpackeddata[$i]}=$unpackeddata[$i+1];
   423 			$foreignCharacters->{$unpackeddata[$i+1]}=$unpackeddata[$i];
   424 			}
   425 		}
   426 	else
   427 		{
   428 		die ("Encoding Format \"$$encoding\" not recognised");
   429 		}
   430 	}
   431 
   432 sub OtherToUnicode
   433 	{
   434 	my $inputencoding = shift;
   435 	my $unicode = shift;  
   436 	my $other = shift; 
   437 	my $foreignCharacters = shift;
   438 	my $unicodeCharacters = shift;
   439 	my $unicodetemplate = shift;
   440 	my $replacementCharacter = 0;
   441 	my $unicodeReplacementCharacter = pack($unicodetemplate, 0xfffd);
   442 	my @ranges=(); 
   443  
   444 	my $otherIndex= 0;
   445 	my $numOfBytes = length($other);
   446 	my $key = 0;
   447 	my $inRange = 0;
   448 	my $followByte = -1;
   449 
   450 	if ($$inputencoding=~/^utf8$/i)
   451 		{
   452 		return &Utf8ToUnicode($unicode, $other, $unicodetemplate);
   453 		}
   454 	my $bigEndian;
   455 	FillInHashes($foreignCharacters,$unicodeCharacters, $inputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
   456 	for (;;)
   457 		{
   458 		if ($otherIndex > $numOfBytes -1) 
   459 			{
   460 			last;
   461 			}
   462 		my $frontByte = (unpack("x$otherIndex".'C', $other))[0];
   463 		# @ranges is an array of references. Each reference is a reference to an array
   464 		for ($key = 0; $key <= $#ranges; ++$key)
   465 			{
   466 			my $arrayref = $ranges[$key];
   467 			if (($frontByte >= $arrayref->[0]) && ($frontByte <= $arrayref->[1]))
   468 				{
   469 				$followByte = $arrayref->[2];
   470 				$inRange = 1;
   471 				}
   472 			}
   473 		Assert ($inRange != 0, "cannot figure out the Byte size of the character");
   474 		my $tempByte = 0;
   475 		for ($key = 0; $key<= $followByte; ++$key)
   476 			{
   477 			if ($bigEndian)
   478 				{
   479 				$tempByte = ($tempByte << 8) | (unpack("x$otherIndex".'C', $other))[0];
   480 				}
   481 			else
   482 				{
   483 				$tempByte = $tempByte | ((unpack("x$otherIndex".'C', $other))[0] << (8*$key));
   484 				}
   485 			$otherIndex++;	
   486 			}
   487 		if (exists $unicodeCharacters->{$tempByte})
   488 			{
   489 			$$unicode .= pack $unicodetemplate , $unicodeCharacters->{$tempByte};
   490 			}
   491 		else
   492 			{
   493 			$$unicode .= $unicodeReplacementCharacter;
   494 			}
   495 		}
   496 	}
   497 
   498 sub UnicodeToOther
   499 	{
   500 	my $outputencoding = shift;
   501 	my $other = shift;
   502 	my $unicode = shift;
   503 	my $foreignCharacters = shift;
   504 	my $unicodeCharacters = shift;
   505 	my $unicodetemplate = shift;
   506 	my $replacementCharacter = 0;
   507 	my @ranges=(); 
   508 
   509 	my $unicodeIndex= 0;
   510 	my $numOfBytes = length($unicode);
   511 	my @UnicodeUnpacked = ();
   512 	my $key = 0;
   513 
   514 	if ($$outputencoding=~/^utf8$/i)
   515 		{
   516 		return &UnicodeToUtf8($other, $unicode, $unicodetemplate);
   517 		}
   518 	my $bigEndian;
   519 	FillInHashes($foreignCharacters,$unicodeCharacters, $outputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
   520 	my $foreignTemplate=$bigEndian? 'n': 'v';
   521 	@UnicodeUnpacked = unpack "$unicodetemplate*", $unicode;
   522 	foreach $key (@UnicodeUnpacked)
   523 		{
   524 		if (!exists($foreignCharacters->{$key}))
   525 			{
   526 			$$other .= $replacementCharacter;
   527 			}
   528 		else
   529 			{
   530 			# This is the WRONG but it will work for the mean time
   531 			# This will fail if the foreignCharacter has characters that are more than
   532 			# two bytes long ..... But this should work for foreign characters of 1 or 2 Bytes
   533 
   534 			my $foreignValue = $foreignCharacters->{$key};
   535 			if ( $foreignValue <= 255)
   536 				{
   537 				$$other .= pack "C" , $foreignValue;
   538 				}
   539 			else
   540 				{
   541 				$$other .= pack $foreignTemplate, $foreignValue;
   542 				}
   543 			}
   544 		}
   545 	}
   546