1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/textandloc/localisation/localesupport/OtherTools/CaseEquivalence.pl Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,423 @@
1.4 +#
1.5 +# Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
1.6 +# All rights reserved.
1.7 +# This component and the accompanying materials are made available
1.8 +# under the terms of "Eclipse Public License v1.0"
1.9 +# which accompanies this distribution, and is available
1.10 +# at the URL "http://www.eclipse.org/legal/epl-v10.html".
1.11 +#
1.12 +# Initial Contributors:
1.13 +# Nokia Corporation - initial contribution.
1.14 +#
1.15 +# Contributors:
1.16 +#
1.17 +# Description:
1.18 +# Case Equivalence
1.19 +# Given the unicode data file, work out the case equivalence classes
1.20 +# i.e. the equivalence classes for the transitive closure of ~ defined as
1.21 +# follows:
1.22 +# a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b
1.23 +# Usage: perl CaseEquivalence <UnicodeData.txt
1.24 +#
1.25 +
1.26 +use strict;
1.27 +my @Name = ();
1.28 +my @Upper = ();
1.29 +my @Lower = ();
1.30 +my @Title = ();
1.31 +# $DecompositionValue[$code] is undefined if $code has no decomposition
1.32 +# sequence, if it has a single value decomposition sequence, then this is it,
1.33 +# if it has a longer sequence, the value is -1
1.34 +my @DecompositionValue = ();
1.35 +# 1 for each code that has a differently-cased version,
1.36 +# 2 for each code that is a lower-case version of something else.
1.37 +my %Codes = ();
1.38 +my %CaseClass = ();
1.39 +
1.40 +# Command-line options
1.41 +my $OptionOutputTrie = 1;
1.42 +my $OptionOutputForwardMapping = 0;
1.43 +my $OptionOutputReverseMapping = 0;
1.44 +my $OptionIgnoreOneToOneReverseMappings = 0;
1.45 +my $OptionIncludeExtraMappings = 1;
1.46 +
1.47 +foreach my $optionString (@ARGV)
1.48 + {
1.49 + if ($optionString =~ m![/-]o[tfrm]!)
1.50 + {
1.51 + $OptionOutputTrie = 0;
1.52 + my $option = substr($optionString, 2, 1);
1.53 + if ($option eq 'f')
1.54 + {
1.55 + $OptionOutputForwardMapping = 1;
1.56 + }
1.57 + elsif ($option eq 'r')
1.58 + {
1.59 + $OptionOutputReverseMapping = 1;
1.60 + }
1.61 + elsif ($option eq 'm')
1.62 + {
1.63 + $OptionOutputReverseMapping = 1;
1.64 + $OptionIgnoreOneToOneReverseMappings = 1;
1.65 + }
1.66 + else
1.67 + {
1.68 + $OptionOutputTrie = 1;
1.69 + }
1.70 + }
1.71 + elsif ($optionString =~ m![/-]s!)
1.72 + {
1.73 + $OptionIncludeExtraMappings = 0;
1.74 + }
1.75 + else
1.76 + {
1.77 + print STDERR "Usage: perl CaseEquivalence [-o<mapping>] [-s]\nusing standard input and output streams.\n";
1.78 + print STDERR "<mapping> is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n";
1.79 + print STDERR "f: Give a list of all codes that need mapping and what they map to.\n";
1.80 + print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n";
1.81 + print STDERR "m: Give a list of all codes are mapped to by more than one code.\n";
1.82 + print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n";
1.83 + exit;
1.84 + }
1.85 + }
1.86 +
1.87 +# set a code as being part of a non-unitary case-equivalence class.
1.88 +sub add
1.89 + {
1.90 + my ($addition) = @_;
1.91 + if (!$Codes{$addition})
1.92 + {
1.93 + $Codes{$addition} = 1;
1.94 + }
1.95 + }
1.96 +
1.97 +# make a code point to its final case varient
1.98 +sub chaseDown
1.99 + {
1.100 + my ($codeVal) = @_;
1.101 + my $class = $codeVal;
1.102 + while ($CaseClass{$class})
1.103 + {
1.104 + $class = $CaseClass{$class};
1.105 + }
1.106 + $CaseClass{$codeVal} = $class unless $codeVal == $class;
1.107 + return $class;
1.108 + }
1.109 +
1.110 +# link two codes together as being part of the same case-equivalence class
1.111 +sub makeEquivalent
1.112 + {
1.113 + my ($left, $right) = @_;
1.114 + if (!$left || !$right)
1.115 + {
1.116 + return;
1.117 + }
1.118 + $left = chaseDown($left);
1.119 + $right = chaseDown($right);
1.120 + if ($Codes{$left} < $Codes{$right})
1.121 + {
1.122 + $CaseClass{$left} = $right;
1.123 + return;
1.124 + }
1.125 + if ($Codes{$right} < $Codes{$left})
1.126 + {
1.127 + $CaseClass{$right} = $left;
1.128 + return;
1.129 + }
1.130 + if ($left < $right)
1.131 + {
1.132 + $CaseClass{$right} = $left;
1.133 + return;
1.134 + }
1.135 + if ($right < $left)
1.136 + {
1.137 + $CaseClass{$left} = $right;
1.138 + return;
1.139 + }
1.140 + # $left == $right.. do nothing
1.141 + return;
1.142 + }
1.143 +
1.144 +# Link possibly unmentioned codes together. The first one is considered lower-case
1.145 +sub addEquivalenceClass
1.146 + {
1.147 + my ($lower, @rest) = @_;
1.148 + $Codes{$lower} = 2;
1.149 + foreach my $one (@rest)
1.150 + {
1.151 + $Codes{$one} = 1;
1.152 + makeEquivalent($lower, $one);
1.153 + }
1.154 + }
1.155 +
1.156 +# Firstly we read in the data
1.157 +while(<STDIN>)
1.158 + {
1.159 + my @line = split('#', $_, 1);
1.160 + my @fields = split(/;/, $line[0]);
1.161 + my @decomposition = split(' ', $fields[5]);
1.162 + if (1 < scalar(@fields))
1.163 + {
1.164 + my $codeVal = hex($fields[0]);
1.165 + # if the character has a non-compatibility decomposition sequence, record this fact.
1.166 + if (0 < scalar(@decomposition))
1.167 + {
1.168 + my $decompositionType = "";
1.169 + if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/)
1.170 + {
1.171 + $decompositionType = shift @decomposition;
1.172 + }
1.173 + if ($decompositionType !~ m/compat/i)
1.174 + {
1.175 + $DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1;
1.176 + }
1.177 + }
1.178 + $Name[$codeVal] = $fields[1];
1.179 + my $upperval = $fields[12];
1.180 + my $lowerval = $fields[13];
1.181 + my $titleval = $fields[14];
1.182 +
1.183 + # strip whitespace from the end of the string
1.184 + $titleval =~ s/\s+$//;
1.185 + if ($upperval)
1.186 + {
1.187 + $upperval = hex($upperval);
1.188 + $Upper[$codeVal] = $upperval;
1.189 + add $codeVal;
1.190 + add $upperval;
1.191 + }
1.192 + if ($titleval)
1.193 + {
1.194 + $titleval = hex($titleval);
1.195 + $Title[$codeVal] = $titleval;
1.196 + add $codeVal;
1.197 + add $titleval;
1.198 + }
1.199 + if ($lowerval)
1.200 + {
1.201 + $lowerval = hex($lowerval);
1.202 + $Lower[$codeVal] = $lowerval;
1.203 + add $codeVal;
1.204 + $Codes{$lowerval} = 2;
1.205 + }
1.206 + }
1.207 + }
1.208 +
1.209 +# Remove all codes that decompose to a sequence
1.210 +foreach my $codeVal (keys(%Codes))
1.211 + {
1.212 + my $current = $DecompositionValue[$codeVal];
1.213 + while ($current && 0 < $current)
1.214 + {
1.215 + $current = $DecompositionValue[$current];
1.216 + }
1.217 + if ($current && $current == -1)
1.218 + {
1.219 + delete $Codes{$codeVal};
1.220 + }
1.221 + }
1.222 +
1.223 +# Next we form the equivalence classes.
1.224 +if ($OptionIncludeExtraMappings)
1.225 + {
1.226 + # space = non-breaking space
1.227 + addEquivalenceClass(0x20, 0xA0);
1.228 + }
1.229 +# We try to end up with everything being equivalent to a lower case letter
1.230 +foreach my $codeVal (keys(%Codes))
1.231 + {
1.232 + makeEquivalent($codeVal, $Lower[$codeVal]);
1.233 + makeEquivalent($codeVal, $Upper[$codeVal]);
1.234 + makeEquivalent($codeVal, $Title[$codeVal]);
1.235 + }
1.236 +
1.237 +# Next we chase each pointer in CaseClass down to its final result
1.238 +foreach my $codeVal (keys(%CaseClass))
1.239 + {
1.240 + chaseDown($codeVal);
1.241 + }
1.242 +
1.243 +# Now output the results in order, and collect the raw data
1.244 +my @Offset = ();
1.245 +my $oldCodeCount = 0;
1.246 +foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass))
1.247 + {
1.248 + my $class = $CaseClass{$codeVal};
1.249 + my $offset = $class - $codeVal;
1.250 + if ($OptionOutputForwardMapping)
1.251 + {
1.252 + printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
1.253 + }
1.254 + while ($oldCodeCount != $codeVal)
1.255 + {
1.256 + $Offset[$oldCodeCount] = 0;
1.257 + $oldCodeCount++;
1.258 + }
1.259 + $oldCodeCount++;
1.260 + $Offset[$codeVal] = $offset;
1.261 + }
1.262 +
1.263 +if ($OptionOutputReverseMapping)
1.264 + {
1.265 + my %ReverseMapping = ();
1.266 + foreach my $codeVal (keys(%CaseClass))
1.267 + {
1.268 + my $mapsTo = $CaseClass{$codeVal};
1.269 + if (!$ReverseMapping{$mapsTo})
1.270 + {
1.271 + $ReverseMapping{$mapsTo} = [$codeVal];
1.272 + }
1.273 + else
1.274 + {
1.275 + push (@{ $ReverseMapping{$mapsTo} }, $codeVal);
1.276 + }
1.277 + }
1.278 + foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping))
1.279 + {
1.280 + next if ($OptionIgnoreOneToOneReverseMappings && scalar(@{$ReverseMapping{$mapVal}}) == 1);
1.281 + printf("%x: %s <=", $mapVal, $Name[$mapVal]);
1.282 + my $firstTime = 1;
1.283 + foreach my $val ( @{ $ReverseMapping{$mapVal} } )
1.284 + {
1.285 + if (!$firstTime)
1.286 + {
1.287 + print ',';
1.288 + }
1.289 + $firstTime = 0;
1.290 + printf(" %s:%x", $Name[$val], $val);
1.291 + }
1.292 + print "\n";
1.293 + }
1.294 + }
1.295 +
1.296 +# does the array 2 match array 1? Match the shorter array against the prefix of
1.297 +# the other array
1.298 +sub arraysMatch
1.299 + {
1.300 + my ($left, $right, $leftpos) = @_;
1.301 + my $last = scalar(@$left) - $leftpos;
1.302 + if (scalar(@$right) < $last)
1.303 + {
1.304 + $last = scalar(@$right);
1.305 + }
1.306 + my $pos = 0;
1.307 + while ($pos < $last)
1.308 + {
1.309 + if ($$left[$pos + $leftpos] != $$right[$pos])
1.310 + {
1.311 + return 0;
1.312 + }
1.313 + $pos++;
1.314 + }
1.315 + return 1;
1.316 + }
1.317 +
1.318 +# find a match for array 2 in array 1, allowing values past the end of array 1
1.319 +# to match anything in array 1
1.320 +sub findMatch
1.321 + {
1.322 + my ($candidate, $term) = @_;
1.323 + my $pos = 0;
1.324 + while (!arraysMatch($candidate, $term, $pos))
1.325 + {
1.326 + $pos++;
1.327 + }
1.328 + return $pos;
1.329 + }
1.330 +
1.331 +# add the data in array 2 to array 1, returning the position they went in.
1.332 +sub addArray
1.333 + {
1.334 + my ($candidate, $addition) = @_;
1.335 + my $pos = findMatch($candidate, $addition);
1.336 + # add any required on to the end of the candidate block
1.337 + my $last = $pos + scalar(@$addition);
1.338 + my $additionPos = scalar(@$candidate) - $pos;
1.339 + while ($pos + $additionPos < $last)
1.340 + {
1.341 + $$candidate[$pos + $additionPos] = $$addition[$additionPos];
1.342 + $additionPos++;
1.343 + }
1.344 + return $pos;
1.345 + }
1.346 +
1.347 +# create data block 1 and indices 2 from data 3 and block size 4
1.348 +sub createTrieLevel
1.349 + {
1.350 + my ($data, $indices, $input, $blockSize) = @_;
1.351 + my $block = 0;
1.352 + while ($block * $blockSize < scalar(@$input))
1.353 + {
1.354 + my $start = $block * $blockSize;
1.355 + my $end = $start + $blockSize;
1.356 + my $currentBlockSize = $blockSize;
1.357 + if (scalar(@$input) < $end)
1.358 + {
1.359 + $end = scalar(@$input);
1.360 + $currentBlockSize = $end - $start;
1.361 + }
1.362 + my @currentBlock = @$input[$start..($end - 1)];
1.363 + while ($currentBlockSize != $blockSize)
1.364 + {
1.365 + $currentBlock[$currentBlockSize] = 0;
1.366 + $currentBlockSize++;
1.367 + }
1.368 + $$indices[$block] = addArray($data, \@currentBlock);
1.369 + $block++;
1.370 + }
1.371 + }
1.372 +
1.373 +sub OutputArray
1.374 + {
1.375 + my $index = 0;
1.376 + my $firstTime = 1;
1.377 + while ($index != scalar(@_))
1.378 + {
1.379 + if (!$firstTime)
1.380 + {
1.381 + if ($index % 8)
1.382 + {
1.383 + print ', ';
1.384 + }
1.385 + else
1.386 + {
1.387 + print ",\n\t";
1.388 + }
1.389 + }
1.390 + else
1.391 + {
1.392 + print "\t";
1.393 + $firstTime = 0;
1.394 + }
1.395 + print($_[$index]);
1.396 + $index++;
1.397 + }
1.398 + print "\n";
1.399 + }
1.400 +
1.401 +if ($OptionOutputTrie)
1.402 + {
1.403 + my @Trie0 = ();
1.404 + my @Index0 = ();
1.405 + my @Trie1 = ();
1.406 + my @Index1 = ();
1.407 + my @Trie2 = ();
1.408 + my @Index2 = ();
1.409 + createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
1.410 + createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
1.411 + createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);
1.412 + print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
1.413 + print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
1.414 + print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
1.415 + print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
1.416 + print "// Add the result of this to your character to fold it.\n\n";
1.417 + print "static const short CaseFoldTable3[] =\n\t{\n";
1.418 + OutputArray(@Trie0);
1.419 + print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
1.420 + OutputArray(@Trie1);
1.421 + print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
1.422 + OutputArray(@Trie2);
1.423 + print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
1.424 + OutputArray(@Index2);
1.425 + print "\t};\n";
1.426 + }