os/textandloc/charconvfw/charconvplugins/tools/analyse.pl
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/textandloc/charconvfw/charconvplugins/tools/analyse.pl	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,238 @@
     1.4 +#
     1.5 +# Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.6 +# All rights reserved.
     1.7 +# This component and the accompanying materials are made available
     1.8 +# under the terms of "Eclipse Public License v1.0"
     1.9 +# which accompanies this distribution, and is available
    1.10 +# at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.11 +#
    1.12 +# Initial Contributors:
    1.13 +# Nokia Corporation - initial contribution.
    1.14 +#
    1.15 +# Contributors:
    1.16 +#
    1.17 +# Description: 
    1.18 +#
    1.19 +
    1.20 +use strict;
    1.21 +use integer;
    1.22 +
    1.23 +BEGIN
    1.24 +	{
    1.25 +	my $perlScriptPath=$0;
    1.26 +	$perlScriptPath=~s/\//\\/g; # replace any forward-slashes with back-slashes
    1.27 +	$perlScriptPath=~s/\\?[^\\]+$//; # get rid of this Perl-script's file-name
    1.28 +	if ($perlScriptPath eq '')
    1.29 +		{
    1.30 +		$perlScriptPath='..\group';
    1.31 +		}
    1.32 +	else
    1.33 +		{
    1.34 +		$perlScriptPath=~s/(\\?)[^\\]+$/$1group/;
    1.35 +		}
    1.36 +	unshift(@INC, $perlScriptPath); # can't do "use lib $perlScriptPath" here as "use lib" only seems to work with *hard-coded* directory names
    1.37 +	}
    1.38 +use PARSER;
    1.39 +
    1.40 +if ((@ARGV==0) || ($ARGV[0]=~/\?/i) || ($ARGV[0]=~/-h/i) || ($ARGV[0]=~/\/h/i) || ($ARGV[0]=~/help/i))
    1.41 +	{
    1.42 +	die("\nVersion 021\n\nCharacter-set conversion-table generating tool\nCopyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reservered.\n\nUsage:\n\n\tperl analyse.pl <source-file> <output-file> foreign|Unicode [options]\n\nwhere the following options are available (each has a short form and a long form which are shown below separated by a '|'):\n\n\t-c | -columns(<a>: <b>, <c>)\n\t-p | -cutOutAnyPrivateUseUnicodeCharacterSlotsBeingUsed\n\t-u | -sourceFilesToSubtract(<a>, <b>, ...)\n\n");
    1.43 +	}
    1.44 +my @columns=(2, 1, 2);
    1.45 +my $cutOutAnyPrivateUseUnicodeCharacterSlotsBeingUsed=0;
    1.46 +my @sourceFilesToSubtract=();
    1.47 +&extractCommandLineFlags(\@columns, \$cutOutAnyPrivateUseUnicodeCharacterSlotsBeingUsed, \@sourceFilesToSubtract);
    1.48 +my $sourceFileName=shift;
    1.49 +my $outputFileName=shift;
    1.50 +my $columnToSortOn=shift;
    1.51 +my %characterCodesOfOtherColumn=();
    1.52 +my %linesSorted=();
    1.53 +open(SOURCE_FILE, "< $sourceFileName") or die("Error: could not open \"$sourceFileName\" for reading");
    1.54 +&readSourceFile(\*SOURCE_FILE, $sourceFileName, \%characterCodesOfOtherColumn, \%linesSorted, $columnToSortOn, \@columns, $cutOutAnyPrivateUseUnicodeCharacterSlotsBeingUsed, 0);
    1.55 +close(SOURCE_FILE) or die("Error: could not close \"$sourceFileName\"\n");
    1.56 +my $sourceFileToSubtract;
    1.57 +foreach $sourceFileToSubtract (@sourceFilesToSubtract)
    1.58 +	{
    1.59 +	open(SOURCE_FILE_TO_SUBTRACT, "< $sourceFileToSubtract") or die("Error: could not open \"$sourceFileToSubtract\" for reading\n");
    1.60 +	&readSourceFile(\*SOURCE_FILE_TO_SUBTRACT, $sourceFileToSubtract, \%characterCodesOfOtherColumn, \%linesSorted, $columnToSortOn, \@columns, $cutOutAnyPrivateUseUnicodeCharacterSlotsBeingUsed, 1);
    1.61 +	close(SOURCE_FILE_TO_SUBTRACT) or die("Error: could not close \"$sourceFileToSubtract\"\n");
    1.62 +	}
    1.63 +open(OUTPUT_FILE, "> $outputFileName") or die("Error: could not open \"$outputFileName\" for writing");
    1.64 +my $numberOfBreaks=0;
    1.65 +my $numberOfMissingSpaces=0;
    1.66 +my $numberOfLinesSorted=0;
    1.67 +my $previousKey="";
    1.68 +my $offset=0;
    1.69 +my $key;
    1.70 +foreach $key (sort {$a<=>$b} (keys(%linesSorted)))
    1.71 +	{
    1.72 +	if ($previousKey ne "")
    1.73 +		{
    1.74 +		$previousKey<$key or die("Error: there appears to be a mix up with the keys \"$previousKey\" and \"$key\"");
    1.75 +		if ($previousKey!=$key-1)
    1.76 +			{
    1.77 +			++$numberOfBreaks;
    1.78 +			$numberOfMissingSpaces+=$key-$previousKey;
    1.79 +			print(OUTPUT_FILE "# End of contiguous block - relationship between the columns in this block: ".((!defined $offset)? "RANDOM": ($offset==0)? "DIRECT": "OFFSET ($offset)")."\n\n");
    1.80 +			}
    1.81 +		}
    1.82 +	if (($previousKey eq "") || ($previousKey!=$key-1))
    1.83 +		{
    1.84 +		$offset=$characterCodesOfOtherColumn{$key}-$key;
    1.85 +		}
    1.86 +	elsif ((defined $offset) && ($offset!=$characterCodesOfOtherColumn{$key}-$key))
    1.87 +		{
    1.88 +		undef $offset;
    1.89 +		}
    1.90 +	print(OUTPUT_FILE "$linesSorted{$key}");
    1.91 +	++$numberOfLinesSorted;
    1.92 +	$previousKey=$key;
    1.93 +	}
    1.94 +print(OUTPUT_FILE "# End of contiguous block - relationship between the columns in this block: ".((!defined $offset)? "RANDOM": ($offset==0)? "DIRECT": "OFFSET ($offset)")."\n\n");
    1.95 +close(OUTPUT_FILE);
    1.96 +my $maximumNumberOfIterationsWhenBinarySearching=1;
    1.97 +while (($numberOfLinesSorted>>$maximumNumberOfIterationsWhenBinarySearching)>0)
    1.98 +	{
    1.99 +	++$maximumNumberOfIterationsWhenBinarySearching;
   1.100 +	}
   1.101 +print("The number of breaks was $numberOfBreaks\nThe number of missing spaces was $numberOfMissingSpaces\nThe number of lines sorted was $numberOfLinesSorted\nThe maximum number of iterations when binary searching would be $maximumNumberOfIterationsWhenBinarySearching");
   1.102 +
   1.103 +sub extractCommandLineFlags()
   1.104 +	{
   1.105 +	my $columns=shift;
   1.106 +	my $cutOutAnyPrivateUseUnicodeCharacterSlotsBeingUsed=shift;
   1.107 +	my $sourceFilesToSubtract=shift;
   1.108 +	my $i;
   1.109 +	for ($i=0; $i<=$#ARGV;) # (i) not cache-ing $#ARGV into a variable as @ARGV may change length in this loop (ii) iterate forwards as some parameters may occupy more than one element in @ARGV
   1.110 +		{
   1.111 +		if (($ARGV[$i]=~/^-c\b(.*)$/i) || ($ARGV[$i]=~/^-columns\b(.*)$/i))
   1.112 +			{
   1.113 +			my $columnsData=$1;
   1.114 +			splice(@ARGV, $i, 1);
   1.115 +			for (;;)
   1.116 +				{
   1.117 +				if ($columnsData=~/^\s*\(\s*(\d+)\s*:\s*(\d+)\s*,\s*(\d+)\s*\)\s*$/)
   1.118 +					{
   1.119 +					@$columns=($1, $2, $3);
   1.120 +					last;
   1.121 +					}
   1.122 +				($#ARGV>=$i) or die("Error: bad \"-columns\" format\n");
   1.123 +				$columnsData.=(splice(@ARGV, $i, 1))[0];
   1.124 +				}
   1.125 +			}
   1.126 +		elsif (($ARGV[$i]=~/^-p$/i) || ($ARGV[$i]=~/^-cutOutAnyPrivateUseUnicodeCharacterSlotsBeingUsed$/i))
   1.127 +			{
   1.128 +			splice(@ARGV, $i, 1);
   1.129 +			$$cutOutAnyPrivateUseUnicodeCharacterSlotsBeingUsed=1;
   1.130 +			}
   1.131 +		elsif (($ARGV[$i]=~/^-u\b(.*)$/i) || ($ARGV[$i]=~/^-sourceFilesToSubtract\b(.*)$/i))
   1.132 +			{
   1.133 +			my $sourceFilesData=$1;
   1.134 +			splice(@ARGV, $i, 1);
   1.135 +			for (;;)
   1.136 +				{
   1.137 +				if ($sourceFilesData=~/^\s*\(\s*(.+)\)\s*$/)
   1.138 +					{
   1.139 +					my $sourceFilesData=$1;
   1.140 +					@$sourceFilesToSubtract=split(/,/, $sourceFilesData, -1);
   1.141 +					my $j;
   1.142 +					for ($j=$#$sourceFilesToSubtract; $j>=0; --$j)
   1.143 +						{
   1.144 +						$sourceFilesToSubtract->[$j]=~s/^\s+//;
   1.145 +						$sourceFilesToSubtract->[$j]=~s/\s+$//;
   1.146 +						($sourceFilesToSubtract->[$j] ne '') or die("Error: bad \"-sourceFilesToSubtract\" format (1)\n");
   1.147 +						}
   1.148 +					last;
   1.149 +					}
   1.150 +				($#ARGV>=$i) or die("Error: bad \"-sourceFilesToSubtract\" format (2)\n");
   1.151 +				$sourceFilesData.=(splice(@ARGV, $i, 1))[0];
   1.152 +				}
   1.153 +			}
   1.154 +		else
   1.155 +			{
   1.156 +			++$i;
   1.157 +			}
   1.158 +		}
   1.159 +	}
   1.160 +
   1.161 +sub readSourceFile
   1.162 +	{
   1.163 +	my $fileHandle=shift;
   1.164 +	my $fileName=shift;
   1.165 +	my $characterCodesOfOtherColumn=shift;
   1.166 +	my $linesSorted=shift;
   1.167 +	my $columnToSortOn=shift;
   1.168 +	my $columns=shift;
   1.169 +	my $cutOutAnyPrivateUseUnicodeCharacterSlotsBeingUsed=shift;
   1.170 +	my $subtract=shift;
   1.171 +	my $foreignCharacterCodeProcessingCode='';
   1.172 +	if (!(($columns->[0]>0) && ($columns->[1]>0) && ($columns->[2]>0) && ($columns->[1]<=$columns->[0]) && ($columns->[2]<=$columns->[0]) && ($columns->[1]!=$columns->[2])))
   1.173 +		{
   1.174 +		close($fileHandle);
   1.175 +		die("Error: bad \"-columns\" data\n");
   1.176 +		}
   1.177 +	my $patternOfLineContainingCharacterCodes=join('\s+', ('0x([0-9a-f]+)') x $columns->[0]);
   1.178 +	my $line;
   1.179 +	my $strippedDownLine;
   1.180 +	for (;;)
   1.181 +		{
   1.182 +		($line, $strippedDownLine)=&nextNonEmptyStrippedDownLine($fileHandle);
   1.183 +		if ($strippedDownLine eq '')
   1.184 +			{
   1.185 +			last;
   1.186 +			}
   1.187 +		if ($strippedDownLine=~/^SET_FOREIGN_CHARACTER_CODE_PROCESSING_CODE\s+(.*)$/i)
   1.188 +			{
   1.189 +			$foreignCharacterCodeProcessingCode=$1;
   1.190 +			}
   1.191 +		elsif ($strippedDownLine=~/^$patternOfLineContainingCharacterCodes$/i)
   1.192 +			{
   1.193 +			no strict 'refs'; # so that we can use symbolic references for $1, $2, etc
   1.194 +			my $foreignCharacterCode=hex(${$columns->[1]});
   1.195 +			my $unicodeCharacterCode=hex(${$columns->[2]});
   1.196 +			use strict 'refs';
   1.197 +			if ($foreignCharacterCodeProcessingCode ne '')
   1.198 +				{
   1.199 +				$foreignCharacterCode=eval($foreignCharacterCodeProcessingCode);
   1.200 +				}
   1.201 +			my $characterCodeOfColumnToSortOn;
   1.202 +			my $characterCodeOfOtherColumn;
   1.203 +			if ($columnToSortOn=~/^foreign$/i)
   1.204 +				{
   1.205 +				$characterCodeOfColumnToSortOn=$foreignCharacterCode;
   1.206 +				$characterCodeOfOtherColumn=$unicodeCharacterCode;
   1.207 +				}
   1.208 +			elsif ($columnToSortOn=~/^Unicode$/i)
   1.209 +				{
   1.210 +				$characterCodeOfColumnToSortOn=$unicodeCharacterCode;
   1.211 +				$characterCodeOfOtherColumn=$foreignCharacterCode;
   1.212 +				}
   1.213 +			else
   1.214 +				{
   1.215 +				die("Error: bad parameter \"$columnToSortOn\"");
   1.216 +				}
   1.217 +			if ((!$cutOutAnyPrivateUseUnicodeCharacterSlotsBeingUsed) || !((($unicodeCharacterCode>=0xe000) && ($unicodeCharacterCode<=0xf8ff)) || (($unicodeCharacterCode>=0xf0000) && ($unicodeCharacterCode<=0x10ffff))))
   1.218 +				{
   1.219 +				if ($subtract)
   1.220 +					{
   1.221 +					$linesSorted->{$characterCodeOfColumnToSortOn}='### '.$linesSorted->{$characterCodeOfColumnToSortOn};
   1.222 +					if ($characterCodesOfOtherColumn->{$characterCodeOfColumnToSortOn}!=$characterCodeOfOtherColumn)
   1.223 +						{
   1.224 +						printf(STDERR "Warning: 0x%x maps to 0x%x in the main source file, but to 0x%x in a source file to be extracted\n", $characterCodeOfColumnToSortOn, $characterCodesOfOtherColumn->{$characterCodeOfColumnToSortOn}, $characterCodeOfOtherColumn);
   1.225 +						}
   1.226 +					}
   1.227 +				else
   1.228 +					{
   1.229 +					$linesSorted->{$characterCodeOfColumnToSortOn}=$line;
   1.230 +					$characterCodesOfOtherColumn->{$characterCodeOfColumnToSortOn}=$characterCodeOfOtherColumn;
   1.231 +					}
   1.232 +				}
   1.233 +			}
   1.234 +		elsif ($line!~/^\s*0x([0-9a-f]+)\s*#\s*undefined.*$/i)
   1.235 +			{
   1.236 +			close($fileHandle);
   1.237 +			die("Error: unexpected line in \"$fileName\":\n    $line\n");
   1.238 +			}
   1.239 +		}
   1.240 +	}
   1.241 +