os/textandloc/localisation/localesupport/OtherTools/CaseEquivalence.pl
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
     1 #
     2 # Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 # All rights reserved.
     4 # This component and the accompanying materials are made available
     5 # under the terms of "Eclipse Public License v1.0"
     6 # which accompanies this distribution, and is available
     7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 #
     9 # Initial Contributors:
    10 # Nokia Corporation - initial contribution.
    11 #
    12 # Contributors:
    13 #
    14 # Description: 
    15 # Case Equivalence
    16 # Given the unicode data file, work out the case equivalence classes
    17 # i.e. the equivalence classes for the transitive closure of ~ defined as
    18 # follows:
    19 # a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b
    20 # Usage: perl CaseEquivalence <UnicodeData.txt
    21 #
    22 
    23 use strict;
    24 my @Name = ();
    25 my @Upper = ();
    26 my @Lower = ();
    27 my @Title = ();
    28 # $DecompositionValue[$code] is undefined if $code has no decomposition
    29 # sequence, if it has a single value decomposition sequence, then this is it,
    30 # if it has a longer sequence, the value is -1
    31 my @DecompositionValue = ();
    32 # 1 for each code that has a differently-cased version,
    33 # 2 for each code that is a lower-case version of something else.
    34 my %Codes = ();
    35 my %CaseClass = ();
    36 
    37 # Command-line options
    38 my $OptionOutputTrie = 1;
    39 my $OptionOutputForwardMapping = 0;
    40 my $OptionOutputReverseMapping = 0;
    41 my $OptionIgnoreOneToOneReverseMappings = 0;
    42 my $OptionIncludeExtraMappings = 1;
    43 
    44 foreach my $optionString (@ARGV)
    45 	{
    46 	if ($optionString =~ m![/-]o[tfrm]!)
    47 		{
    48 		$OptionOutputTrie = 0;
    49 		my $option = substr($optionString, 2, 1);
    50 		if ($option eq 'f')
    51 			{
    52 			$OptionOutputForwardMapping = 1;
    53 			}
    54 		elsif ($option eq 'r')
    55 			{
    56 			$OptionOutputReverseMapping = 1;
    57 			}
    58 		elsif ($option eq 'm')
    59 			{
    60 			$OptionOutputReverseMapping = 1;
    61 			$OptionIgnoreOneToOneReverseMappings = 1;
    62 			}
    63 		else
    64 			{
    65 			$OptionOutputTrie = 1;
    66 			}
    67 		}
    68 	elsif ($optionString =~ m![/-]s!)
    69 		{
    70 		$OptionIncludeExtraMappings = 0;
    71 		}
    72 	else
    73 		{
    74 		print STDERR "Usage: perl CaseEquivalence [-o<mapping>] [-s]\nusing standard input and output streams.\n";
    75 		print STDERR "<mapping> is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n";
    76 		print STDERR "f: Give a list of all codes that need mapping and what they map to.\n";
    77 		print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n";
    78 		print STDERR "m: Give a list of all codes are mapped to by more than one code.\n";
    79 		print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n";
    80 		exit;
    81 		}
    82 	}
    83 
    84 # set a code as being part of a non-unitary case-equivalence class.
    85 sub add
    86 	{
    87 	my ($addition) = @_;
    88 	if (!$Codes{$addition})
    89 		{
    90 		$Codes{$addition} = 1;
    91 		}
    92 	}
    93 
    94 # make a code point to its final case varient
    95 sub chaseDown
    96 	{
    97 	my ($codeVal) = @_;
    98 	my $class = $codeVal;
    99 	while ($CaseClass{$class})
   100 		{
   101 		$class = $CaseClass{$class};
   102 		}
   103 	$CaseClass{$codeVal} = $class unless $codeVal == $class;
   104 	return $class;
   105 	}
   106 
   107 # link two codes together as being part of the same case-equivalence class
   108 sub makeEquivalent
   109 	{
   110 	my ($left, $right) = @_;
   111 	if (!$left || !$right)
   112 		{
   113 		return;
   114 		}
   115 	$left = chaseDown($left);
   116 	$right = chaseDown($right);
   117 	if ($Codes{$left} < $Codes{$right})
   118 		{
   119 		$CaseClass{$left} = $right;
   120 		return;
   121 		}
   122 	if ($Codes{$right} < $Codes{$left})
   123 		{
   124 		$CaseClass{$right} = $left;
   125 		return;
   126 		}
   127 	if ($left < $right)
   128 		{
   129 		$CaseClass{$right} = $left;
   130 		return;
   131 		}
   132 	if ($right < $left)
   133 		{
   134 		$CaseClass{$left} = $right;
   135 		return;
   136 		}
   137 	# $left == $right.. do nothing
   138 	return;
   139 	}
   140 
   141 # Link possibly unmentioned codes together. The first one is considered lower-case
   142 sub addEquivalenceClass
   143 	{
   144 	my ($lower, @rest) = @_;
   145 	$Codes{$lower} = 2;
   146 	foreach my $one (@rest)
   147 		{
   148 		$Codes{$one} = 1;
   149 		makeEquivalent($lower, $one);
   150 		}
   151 	}
   152 
   153 # Firstly we read in the data
   154 while(<STDIN>)
   155 	{
   156 	my @line = split('#', $_, 1);
   157 	my @fields = split(/;/, $line[0]);
   158 	my @decomposition = split(' ', $fields[5]);
   159 	if (1 < scalar(@fields))
   160 		{
   161 		my $codeVal = hex($fields[0]);
   162 		# if the character has a non-compatibility decomposition sequence, record this fact.
   163 		if (0 < scalar(@decomposition))
   164 			{
   165 			my $decompositionType = "";
   166 			if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/)
   167 				{
   168 				$decompositionType = shift @decomposition;
   169 				}
   170 			if ($decompositionType !~ m/compat/i)
   171 				{
   172 				$DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1;
   173 				}
   174 			}
   175 		$Name[$codeVal] = $fields[1];
   176 		my $upperval = $fields[12];
   177 		my $lowerval = $fields[13];
   178 		my $titleval = $fields[14];
   179 
   180 		# strip whitespace from the end of the string
   181 		$titleval =~ s/\s+$//;
   182 		if ($upperval)
   183 			{
   184 			$upperval = hex($upperval);
   185 			$Upper[$codeVal] = $upperval;
   186 			add $codeVal;
   187 			add $upperval;
   188 			}
   189 		if ($titleval)
   190 			{
   191 			$titleval = hex($titleval);
   192 			$Title[$codeVal] = $titleval;
   193 			add $codeVal;
   194 			add $titleval;
   195 			}
   196 		if ($lowerval)
   197 			{
   198 			$lowerval = hex($lowerval);
   199 			$Lower[$codeVal] = $lowerval;
   200 			add $codeVal;
   201 			$Codes{$lowerval} = 2;
   202 			}
   203 		}
   204 	}
   205 
   206 # Remove all codes that decompose to a sequence
   207 foreach my $codeVal (keys(%Codes))
   208 	{
   209 	my $current = $DecompositionValue[$codeVal];
   210 	while ($current && 0 < $current)
   211 		{
   212 		$current = $DecompositionValue[$current];
   213 		}
   214 	if ($current && $current == -1)
   215 		{
   216 		delete $Codes{$codeVal};
   217 		}
   218 	}
   219 
   220 # Next we form the equivalence classes.
   221 if ($OptionIncludeExtraMappings)
   222 	{
   223 	# space = non-breaking space
   224 	addEquivalenceClass(0x20, 0xA0);
   225 	}
   226 # We try to end up with everything being equivalent to a lower case letter
   227 foreach my $codeVal (keys(%Codes))
   228 	{
   229 	makeEquivalent($codeVal, $Lower[$codeVal]);
   230 	makeEquivalent($codeVal, $Upper[$codeVal]);
   231 	makeEquivalent($codeVal, $Title[$codeVal]);
   232 	}
   233 
   234 # Next we chase each pointer in CaseClass down to its final result
   235 foreach my $codeVal (keys(%CaseClass))
   236 	{
   237 	chaseDown($codeVal);
   238 	}
   239 
   240 # Now output the results in order, and collect the raw data
   241 my @Offset = ();
   242 my $oldCodeCount = 0;
   243 foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass))
   244 	{
   245 	my $class = $CaseClass{$codeVal};
   246 	my $offset = $class - $codeVal;
   247 	if ($OptionOutputForwardMapping)
   248 		{
   249 		printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
   250 		}
   251 	while ($oldCodeCount != $codeVal)
   252 		{
   253 		$Offset[$oldCodeCount] = 0;
   254 		$oldCodeCount++;
   255 		}
   256 	$oldCodeCount++;
   257 	$Offset[$codeVal] = $offset;
   258 	}
   259 
   260 if ($OptionOutputReverseMapping)
   261 	{
   262 	my %ReverseMapping = ();
   263 	foreach my $codeVal (keys(%CaseClass))
   264 		{
   265 		my $mapsTo = $CaseClass{$codeVal};
   266 		if (!$ReverseMapping{$mapsTo})
   267 			{
   268 			$ReverseMapping{$mapsTo} = [$codeVal];
   269 			}
   270 		else
   271 			{
   272 			push (@{ $ReverseMapping{$mapsTo} }, $codeVal);
   273 			}
   274 		}
   275 	foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping))
   276 		{
   277 		next if ($OptionIgnoreOneToOneReverseMappings && scalar(@{$ReverseMapping{$mapVal}}) == 1);
   278 		printf("%x: %s <=", $mapVal, $Name[$mapVal]);
   279 		my $firstTime = 1;
   280 		foreach my $val ( @{ $ReverseMapping{$mapVal} } )
   281 			{
   282 			if (!$firstTime)
   283 				{
   284 				print ',';
   285 				}
   286 			$firstTime = 0;
   287 			printf(" %s:%x", $Name[$val], $val);
   288 			}
   289 		print "\n";
   290 		}
   291 	}
   292 
   293 # does the array 2 match array 1? Match the shorter array against the prefix of
   294 # the other array
   295 sub arraysMatch
   296 	{
   297 	my ($left, $right, $leftpos) = @_;
   298 	my $last = scalar(@$left) - $leftpos;
   299 	if (scalar(@$right) < $last)
   300 		{
   301 		$last = scalar(@$right);
   302 		}
   303 	my $pos = 0;
   304 	while ($pos < $last)
   305 		{
   306 		if ($$left[$pos + $leftpos] != $$right[$pos])
   307 			{
   308 			return 0;
   309 			}
   310 		$pos++;
   311 		}
   312 	return 1;
   313 	}
   314 
   315 # find a match for array 2 in array 1, allowing values past the end of array 1
   316 # to match anything in array 1
   317 sub findMatch
   318 	{
   319 	my ($candidate, $term) = @_;
   320 	my $pos = 0;
   321 	while (!arraysMatch($candidate, $term, $pos))
   322 		{
   323 		$pos++;
   324 		}
   325 	return $pos;
   326 	}
   327 
   328 # add the data in array 2 to array 1, returning the position they went in.
   329 sub addArray
   330 	{
   331 	my ($candidate, $addition) = @_;
   332 	my $pos = findMatch($candidate, $addition);
   333 	# add any required on to the end of the candidate block
   334 	my $last = $pos + scalar(@$addition);
   335 	my $additionPos = scalar(@$candidate) - $pos;
   336 	while ($pos + $additionPos < $last)
   337 		{
   338 		$$candidate[$pos + $additionPos] = $$addition[$additionPos];
   339 		$additionPos++;
   340 		}
   341 	return $pos;
   342 	}
   343 
   344 # create data block 1 and indices 2 from data 3 and block size 4
   345 sub createTrieLevel
   346 	{
   347 	my ($data, $indices, $input, $blockSize) = @_;
   348 	my $block = 0;
   349 	while ($block * $blockSize < scalar(@$input))
   350 		{
   351 		my $start = $block * $blockSize;
   352 		my $end = $start + $blockSize;
   353 		my $currentBlockSize = $blockSize;
   354 		if (scalar(@$input) < $end)
   355 			{
   356 			$end = scalar(@$input);
   357 			$currentBlockSize = $end - $start;
   358 			}
   359 		my @currentBlock = @$input[$start..($end - 1)];
   360 		while ($currentBlockSize != $blockSize)
   361 			{
   362 			$currentBlock[$currentBlockSize] = 0;
   363 			$currentBlockSize++;
   364 			}
   365 		$$indices[$block] = addArray($data, \@currentBlock);
   366 		$block++;
   367 		}
   368 	}
   369 
   370 sub OutputArray
   371 	{
   372 	my $index = 0;
   373 	my $firstTime = 1;
   374 	while ($index != scalar(@_))
   375 		{
   376 		if (!$firstTime)
   377 			{
   378 			if ($index % 8)
   379 				{
   380 				print ', ';
   381 				}
   382 			else
   383 				{
   384 				print ",\n\t";
   385 				}
   386 			}
   387 		else
   388 			{
   389 			print "\t";
   390 			$firstTime = 0;
   391 			}
   392 		print($_[$index]);
   393 		$index++;
   394 		}
   395 	print "\n";
   396 	}
   397 
   398 if ($OptionOutputTrie)
   399 	{
   400 	my @Trie0 = ();
   401 	my @Index0 = ();
   402 	my @Trie1 = ();
   403 	my @Index1 = ();
   404 	my @Trie2 = ();
   405 	my @Index2 = ();
   406 	createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
   407 	createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
   408 	createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);
   409 	print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
   410 	print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
   411 	print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
   412 	print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
   413 	print "// Add the result of this to your character to fold it.\n\n";
   414 	print "static const short CaseFoldTable3[] =\n\t{\n";
   415 	OutputArray(@Trie0);
   416 	print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
   417 	OutputArray(@Trie1);
   418 	print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
   419 	OutputArray(@Trie2);
   420 	print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
   421 	OutputArray(@Index2);
   422 	print "\t};\n";
   423 	}