os/textandloc/localisation/localesupport/OtherTools/CaseEquivalence.pl
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/textandloc/localisation/localesupport/OtherTools/CaseEquivalence.pl	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,423 @@
     1.4 +#
     1.5 +# Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.6 +# All rights reserved.
     1.7 +# This component and the accompanying materials are made available
     1.8 +# under the terms of "Eclipse Public License v1.0"
     1.9 +# which accompanies this distribution, and is available
    1.10 +# at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.11 +#
    1.12 +# Initial Contributors:
    1.13 +# Nokia Corporation - initial contribution.
    1.14 +#
    1.15 +# Contributors:
    1.16 +#
    1.17 +# Description: 
    1.18 +# Case Equivalence
    1.19 +# Given the unicode data file, work out the case equivalence classes
    1.20 +# i.e. the equivalence classes for the transitive closure of ~ defined as
    1.21 +# follows:
    1.22 +# a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b
    1.23 +# Usage: perl CaseEquivalence <UnicodeData.txt
    1.24 +#
    1.25 +
    1.26 +use strict;
    1.27 +my @Name = ();
    1.28 +my @Upper = ();
    1.29 +my @Lower = ();
    1.30 +my @Title = ();
    1.31 +# $DecompositionValue[$code] is undefined if $code has no decomposition
    1.32 +# sequence, if it has a single value decomposition sequence, then this is it,
    1.33 +# if it has a longer sequence, the value is -1
    1.34 +my @DecompositionValue = ();
    1.35 +# 1 for each code that has a differently-cased version,
    1.36 +# 2 for each code that is a lower-case version of something else.
    1.37 +my %Codes = ();
    1.38 +my %CaseClass = ();
    1.39 +
    1.40 +# Command-line options
    1.41 +my $OptionOutputTrie = 1;
    1.42 +my $OptionOutputForwardMapping = 0;
    1.43 +my $OptionOutputReverseMapping = 0;
    1.44 +my $OptionIgnoreOneToOneReverseMappings = 0;
    1.45 +my $OptionIncludeExtraMappings = 1;
    1.46 +
    1.47 +foreach my $optionString (@ARGV)
    1.48 +	{
    1.49 +	if ($optionString =~ m![/-]o[tfrm]!)
    1.50 +		{
    1.51 +		$OptionOutputTrie = 0;
    1.52 +		my $option = substr($optionString, 2, 1);
    1.53 +		if ($option eq 'f')
    1.54 +			{
    1.55 +			$OptionOutputForwardMapping = 1;
    1.56 +			}
    1.57 +		elsif ($option eq 'r')
    1.58 +			{
    1.59 +			$OptionOutputReverseMapping = 1;
    1.60 +			}
    1.61 +		elsif ($option eq 'm')
    1.62 +			{
    1.63 +			$OptionOutputReverseMapping = 1;
    1.64 +			$OptionIgnoreOneToOneReverseMappings = 1;
    1.65 +			}
    1.66 +		else
    1.67 +			{
    1.68 +			$OptionOutputTrie = 1;
    1.69 +			}
    1.70 +		}
    1.71 +	elsif ($optionString =~ m![/-]s!)
    1.72 +		{
    1.73 +		$OptionIncludeExtraMappings = 0;
    1.74 +		}
    1.75 +	else
    1.76 +		{
    1.77 +		print STDERR "Usage: perl CaseEquivalence [-o<mapping>] [-s]\nusing standard input and output streams.\n";
    1.78 +		print STDERR "<mapping> is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n";
    1.79 +		print STDERR "f: Give a list of all codes that need mapping and what they map to.\n";
    1.80 +		print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n";
    1.81 +		print STDERR "m: Give a list of all codes are mapped to by more than one code.\n";
    1.82 +		print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n";
    1.83 +		exit;
    1.84 +		}
    1.85 +	}
    1.86 +
    1.87 +# set a code as being part of a non-unitary case-equivalence class.
    1.88 +sub add
    1.89 +	{
    1.90 +	my ($addition) = @_;
    1.91 +	if (!$Codes{$addition})
    1.92 +		{
    1.93 +		$Codes{$addition} = 1;
    1.94 +		}
    1.95 +	}
    1.96 +
    1.97 +# make a code point to its final case varient
    1.98 +sub chaseDown
    1.99 +	{
   1.100 +	my ($codeVal) = @_;
   1.101 +	my $class = $codeVal;
   1.102 +	while ($CaseClass{$class})
   1.103 +		{
   1.104 +		$class = $CaseClass{$class};
   1.105 +		}
   1.106 +	$CaseClass{$codeVal} = $class unless $codeVal == $class;
   1.107 +	return $class;
   1.108 +	}
   1.109 +
   1.110 +# link two codes together as being part of the same case-equivalence class
   1.111 +sub makeEquivalent
   1.112 +	{
   1.113 +	my ($left, $right) = @_;
   1.114 +	if (!$left || !$right)
   1.115 +		{
   1.116 +		return;
   1.117 +		}
   1.118 +	$left = chaseDown($left);
   1.119 +	$right = chaseDown($right);
   1.120 +	if ($Codes{$left} < $Codes{$right})
   1.121 +		{
   1.122 +		$CaseClass{$left} = $right;
   1.123 +		return;
   1.124 +		}
   1.125 +	if ($Codes{$right} < $Codes{$left})
   1.126 +		{
   1.127 +		$CaseClass{$right} = $left;
   1.128 +		return;
   1.129 +		}
   1.130 +	if ($left < $right)
   1.131 +		{
   1.132 +		$CaseClass{$right} = $left;
   1.133 +		return;
   1.134 +		}
   1.135 +	if ($right < $left)
   1.136 +		{
   1.137 +		$CaseClass{$left} = $right;
   1.138 +		return;
   1.139 +		}
   1.140 +	# $left == $right.. do nothing
   1.141 +	return;
   1.142 +	}
   1.143 +
   1.144 +# Link possibly unmentioned codes together. The first one is considered lower-case
   1.145 +sub addEquivalenceClass
   1.146 +	{
   1.147 +	my ($lower, @rest) = @_;
   1.148 +	$Codes{$lower} = 2;
   1.149 +	foreach my $one (@rest)
   1.150 +		{
   1.151 +		$Codes{$one} = 1;
   1.152 +		makeEquivalent($lower, $one);
   1.153 +		}
   1.154 +	}
   1.155 +
   1.156 +# Firstly we read in the data
   1.157 +while(<STDIN>)
   1.158 +	{
   1.159 +	my @line = split('#', $_, 1);
   1.160 +	my @fields = split(/;/, $line[0]);
   1.161 +	my @decomposition = split(' ', $fields[5]);
   1.162 +	if (1 < scalar(@fields))
   1.163 +		{
   1.164 +		my $codeVal = hex($fields[0]);
   1.165 +		# if the character has a non-compatibility decomposition sequence, record this fact.
   1.166 +		if (0 < scalar(@decomposition))
   1.167 +			{
   1.168 +			my $decompositionType = "";
   1.169 +			if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/)
   1.170 +				{
   1.171 +				$decompositionType = shift @decomposition;
   1.172 +				}
   1.173 +			if ($decompositionType !~ m/compat/i)
   1.174 +				{
   1.175 +				$DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1;
   1.176 +				}
   1.177 +			}
   1.178 +		$Name[$codeVal] = $fields[1];
   1.179 +		my $upperval = $fields[12];
   1.180 +		my $lowerval = $fields[13];
   1.181 +		my $titleval = $fields[14];
   1.182 +
   1.183 +		# strip whitespace from the end of the string
   1.184 +		$titleval =~ s/\s+$//;
   1.185 +		if ($upperval)
   1.186 +			{
   1.187 +			$upperval = hex($upperval);
   1.188 +			$Upper[$codeVal] = $upperval;
   1.189 +			add $codeVal;
   1.190 +			add $upperval;
   1.191 +			}
   1.192 +		if ($titleval)
   1.193 +			{
   1.194 +			$titleval = hex($titleval);
   1.195 +			$Title[$codeVal] = $titleval;
   1.196 +			add $codeVal;
   1.197 +			add $titleval;
   1.198 +			}
   1.199 +		if ($lowerval)
   1.200 +			{
   1.201 +			$lowerval = hex($lowerval);
   1.202 +			$Lower[$codeVal] = $lowerval;
   1.203 +			add $codeVal;
   1.204 +			$Codes{$lowerval} = 2;
   1.205 +			}
   1.206 +		}
   1.207 +	}
   1.208 +
   1.209 +# Remove all codes that decompose to a sequence
   1.210 +foreach my $codeVal (keys(%Codes))
   1.211 +	{
   1.212 +	my $current = $DecompositionValue[$codeVal];
   1.213 +	while ($current && 0 < $current)
   1.214 +		{
   1.215 +		$current = $DecompositionValue[$current];
   1.216 +		}
   1.217 +	if ($current && $current == -1)
   1.218 +		{
   1.219 +		delete $Codes{$codeVal};
   1.220 +		}
   1.221 +	}
   1.222 +
   1.223 +# Next we form the equivalence classes.
   1.224 +if ($OptionIncludeExtraMappings)
   1.225 +	{
   1.226 +	# space = non-breaking space
   1.227 +	addEquivalenceClass(0x20, 0xA0);
   1.228 +	}
   1.229 +# We try to end up with everything being equivalent to a lower case letter
   1.230 +foreach my $codeVal (keys(%Codes))
   1.231 +	{
   1.232 +	makeEquivalent($codeVal, $Lower[$codeVal]);
   1.233 +	makeEquivalent($codeVal, $Upper[$codeVal]);
   1.234 +	makeEquivalent($codeVal, $Title[$codeVal]);
   1.235 +	}
   1.236 +
   1.237 +# Next we chase each pointer in CaseClass down to its final result
   1.238 +foreach my $codeVal (keys(%CaseClass))
   1.239 +	{
   1.240 +	chaseDown($codeVal);
   1.241 +	}
   1.242 +
   1.243 +# Now output the results in order, and collect the raw data
   1.244 +my @Offset = ();
   1.245 +my $oldCodeCount = 0;
   1.246 +foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass))
   1.247 +	{
   1.248 +	my $class = $CaseClass{$codeVal};
   1.249 +	my $offset = $class - $codeVal;
   1.250 +	if ($OptionOutputForwardMapping)
   1.251 +		{
   1.252 +		printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
   1.253 +		}
   1.254 +	while ($oldCodeCount != $codeVal)
   1.255 +		{
   1.256 +		$Offset[$oldCodeCount] = 0;
   1.257 +		$oldCodeCount++;
   1.258 +		}
   1.259 +	$oldCodeCount++;
   1.260 +	$Offset[$codeVal] = $offset;
   1.261 +	}
   1.262 +
   1.263 +if ($OptionOutputReverseMapping)
   1.264 +	{
   1.265 +	my %ReverseMapping = ();
   1.266 +	foreach my $codeVal (keys(%CaseClass))
   1.267 +		{
   1.268 +		my $mapsTo = $CaseClass{$codeVal};
   1.269 +		if (!$ReverseMapping{$mapsTo})
   1.270 +			{
   1.271 +			$ReverseMapping{$mapsTo} = [$codeVal];
   1.272 +			}
   1.273 +		else
   1.274 +			{
   1.275 +			push (@{ $ReverseMapping{$mapsTo} }, $codeVal);
   1.276 +			}
   1.277 +		}
   1.278 +	foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping))
   1.279 +		{
   1.280 +		next if ($OptionIgnoreOneToOneReverseMappings && scalar(@{$ReverseMapping{$mapVal}}) == 1);
   1.281 +		printf("%x: %s <=", $mapVal, $Name[$mapVal]);
   1.282 +		my $firstTime = 1;
   1.283 +		foreach my $val ( @{ $ReverseMapping{$mapVal} } )
   1.284 +			{
   1.285 +			if (!$firstTime)
   1.286 +				{
   1.287 +				print ',';
   1.288 +				}
   1.289 +			$firstTime = 0;
   1.290 +			printf(" %s:%x", $Name[$val], $val);
   1.291 +			}
   1.292 +		print "\n";
   1.293 +		}
   1.294 +	}
   1.295 +
   1.296 +# does the array 2 match array 1? Match the shorter array against the prefix of
   1.297 +# the other array
   1.298 +sub arraysMatch
   1.299 +	{
   1.300 +	my ($left, $right, $leftpos) = @_;
   1.301 +	my $last = scalar(@$left) - $leftpos;
   1.302 +	if (scalar(@$right) < $last)
   1.303 +		{
   1.304 +		$last = scalar(@$right);
   1.305 +		}
   1.306 +	my $pos = 0;
   1.307 +	while ($pos < $last)
   1.308 +		{
   1.309 +		if ($$left[$pos + $leftpos] != $$right[$pos])
   1.310 +			{
   1.311 +			return 0;
   1.312 +			}
   1.313 +		$pos++;
   1.314 +		}
   1.315 +	return 1;
   1.316 +	}
   1.317 +
   1.318 +# find a match for array 2 in array 1, allowing values past the end of array 1
   1.319 +# to match anything in array 1
   1.320 +sub findMatch
   1.321 +	{
   1.322 +	my ($candidate, $term) = @_;
   1.323 +	my $pos = 0;
   1.324 +	while (!arraysMatch($candidate, $term, $pos))
   1.325 +		{
   1.326 +		$pos++;
   1.327 +		}
   1.328 +	return $pos;
   1.329 +	}
   1.330 +
   1.331 +# add the data in array 2 to array 1, returning the position they went in.
   1.332 +sub addArray
   1.333 +	{
   1.334 +	my ($candidate, $addition) = @_;
   1.335 +	my $pos = findMatch($candidate, $addition);
   1.336 +	# add any required on to the end of the candidate block
   1.337 +	my $last = $pos + scalar(@$addition);
   1.338 +	my $additionPos = scalar(@$candidate) - $pos;
   1.339 +	while ($pos + $additionPos < $last)
   1.340 +		{
   1.341 +		$$candidate[$pos + $additionPos] = $$addition[$additionPos];
   1.342 +		$additionPos++;
   1.343 +		}
   1.344 +	return $pos;
   1.345 +	}
   1.346 +
   1.347 +# create data block 1 and indices 2 from data 3 and block size 4
   1.348 +sub createTrieLevel
   1.349 +	{
   1.350 +	my ($data, $indices, $input, $blockSize) = @_;
   1.351 +	my $block = 0;
   1.352 +	while ($block * $blockSize < scalar(@$input))
   1.353 +		{
   1.354 +		my $start = $block * $blockSize;
   1.355 +		my $end = $start + $blockSize;
   1.356 +		my $currentBlockSize = $blockSize;
   1.357 +		if (scalar(@$input) < $end)
   1.358 +			{
   1.359 +			$end = scalar(@$input);
   1.360 +			$currentBlockSize = $end - $start;
   1.361 +			}
   1.362 +		my @currentBlock = @$input[$start..($end - 1)];
   1.363 +		while ($currentBlockSize != $blockSize)
   1.364 +			{
   1.365 +			$currentBlock[$currentBlockSize] = 0;
   1.366 +			$currentBlockSize++;
   1.367 +			}
   1.368 +		$$indices[$block] = addArray($data, \@currentBlock);
   1.369 +		$block++;
   1.370 +		}
   1.371 +	}
   1.372 +
   1.373 +sub OutputArray
   1.374 +	{
   1.375 +	my $index = 0;
   1.376 +	my $firstTime = 1;
   1.377 +	while ($index != scalar(@_))
   1.378 +		{
   1.379 +		if (!$firstTime)
   1.380 +			{
   1.381 +			if ($index % 8)
   1.382 +				{
   1.383 +				print ', ';
   1.384 +				}
   1.385 +			else
   1.386 +				{
   1.387 +				print ",\n\t";
   1.388 +				}
   1.389 +			}
   1.390 +		else
   1.391 +			{
   1.392 +			print "\t";
   1.393 +			$firstTime = 0;
   1.394 +			}
   1.395 +		print($_[$index]);
   1.396 +		$index++;
   1.397 +		}
   1.398 +	print "\n";
   1.399 +	}
   1.400 +
   1.401 +if ($OptionOutputTrie)
   1.402 +	{
   1.403 +	my @Trie0 = ();
   1.404 +	my @Index0 = ();
   1.405 +	my @Trie1 = ();
   1.406 +	my @Index1 = ();
   1.407 +	my @Trie2 = ();
   1.408 +	my @Index2 = ();
   1.409 +	createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
   1.410 +	createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
   1.411 +	createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);
   1.412 +	print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
   1.413 +	print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
   1.414 +	print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
   1.415 +	print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
   1.416 +	print "// Add the result of this to your character to fold it.\n\n";
   1.417 +	print "static const short CaseFoldTable3[] =\n\t{\n";
   1.418 +	OutputArray(@Trie0);
   1.419 +	print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
   1.420 +	OutputArray(@Trie1);
   1.421 +	print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
   1.422 +	OutputArray(@Trie2);
   1.423 +	print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
   1.424 +	OutputArray(@Index2);
   1.425 +	print "\t};\n";
   1.426 +	}