os/textandloc/localisation/localesupport/OtherTools/CaseEquivalence.pl
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
#
sl@0
     2
# Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     3
# All rights reserved.
sl@0
     4
# This component and the accompanying materials are made available
sl@0
     5
# under the terms of "Eclipse Public License v1.0"
sl@0
     6
# which accompanies this distribution, and is available
sl@0
     7
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     8
#
sl@0
     9
# Initial Contributors:
sl@0
    10
# Nokia Corporation - initial contribution.
sl@0
    11
#
sl@0
    12
# Contributors:
sl@0
    13
#
sl@0
    14
# Description: 
sl@0
    15
# Case Equivalence
sl@0
    16
# Given the unicode data file, work out the case equivalence classes
sl@0
    17
# i.e. the equivalence classes for the transitive closure of ~ defined as
sl@0
    18
# follows:
sl@0
    19
# a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b
sl@0
    20
# Usage: perl CaseEquivalence <UnicodeData.txt
sl@0
    21
#
sl@0
    22
sl@0
    23
use strict;
sl@0
    24
my @Name = ();
sl@0
    25
my @Upper = ();
sl@0
    26
my @Lower = ();
sl@0
    27
my @Title = ();
sl@0
    28
# $DecompositionValue[$code] is undefined if $code has no decomposition
sl@0
    29
# sequence, if it has a single value decomposition sequence, then this is it,
sl@0
    30
# if it has a longer sequence, the value is -1
sl@0
    31
my @DecompositionValue = ();
sl@0
    32
# 1 for each code that has a differently-cased version,
sl@0
    33
# 2 for each code that is a lower-case version of something else.
sl@0
    34
my %Codes = ();
sl@0
    35
my %CaseClass = ();
sl@0
    36
sl@0
    37
# Command-line options
sl@0
    38
my $OptionOutputTrie = 1;
sl@0
    39
my $OptionOutputForwardMapping = 0;
sl@0
    40
my $OptionOutputReverseMapping = 0;
sl@0
    41
my $OptionIgnoreOneToOneReverseMappings = 0;
sl@0
    42
my $OptionIncludeExtraMappings = 1;
sl@0
    43
sl@0
    44
foreach my $optionString (@ARGV)
sl@0
    45
	{
sl@0
    46
	if ($optionString =~ m![/-]o[tfrm]!)
sl@0
    47
		{
sl@0
    48
		$OptionOutputTrie = 0;
sl@0
    49
		my $option = substr($optionString, 2, 1);
sl@0
    50
		if ($option eq 'f')
sl@0
    51
			{
sl@0
    52
			$OptionOutputForwardMapping = 1;
sl@0
    53
			}
sl@0
    54
		elsif ($option eq 'r')
sl@0
    55
			{
sl@0
    56
			$OptionOutputReverseMapping = 1;
sl@0
    57
			}
sl@0
    58
		elsif ($option eq 'm')
sl@0
    59
			{
sl@0
    60
			$OptionOutputReverseMapping = 1;
sl@0
    61
			$OptionIgnoreOneToOneReverseMappings = 1;
sl@0
    62
			}
sl@0
    63
		else
sl@0
    64
			{
sl@0
    65
			$OptionOutputTrie = 1;
sl@0
    66
			}
sl@0
    67
		}
sl@0
    68
	elsif ($optionString =~ m![/-]s!)
sl@0
    69
		{
sl@0
    70
		$OptionIncludeExtraMappings = 0;
sl@0
    71
		}
sl@0
    72
	else
sl@0
    73
		{
sl@0
    74
		print STDERR "Usage: perl CaseEquivalence [-o<mapping>] [-s]\nusing standard input and output streams.\n";
sl@0
    75
		print STDERR "<mapping> is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n";
sl@0
    76
		print STDERR "f: Give a list of all codes that need mapping and what they map to.\n";
sl@0
    77
		print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n";
sl@0
    78
		print STDERR "m: Give a list of all codes are mapped to by more than one code.\n";
sl@0
    79
		print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n";
sl@0
    80
		exit;
sl@0
    81
		}
sl@0
    82
	}
sl@0
    83
sl@0
    84
# set a code as being part of a non-unitary case-equivalence class.
sl@0
    85
sub add
sl@0
    86
	{
sl@0
    87
	my ($addition) = @_;
sl@0
    88
	if (!$Codes{$addition})
sl@0
    89
		{
sl@0
    90
		$Codes{$addition} = 1;
sl@0
    91
		}
sl@0
    92
	}
sl@0
    93
sl@0
    94
# make a code point to its final case varient
sl@0
    95
sub chaseDown
sl@0
    96
	{
sl@0
    97
	my ($codeVal) = @_;
sl@0
    98
	my $class = $codeVal;
sl@0
    99
	while ($CaseClass{$class})
sl@0
   100
		{
sl@0
   101
		$class = $CaseClass{$class};
sl@0
   102
		}
sl@0
   103
	$CaseClass{$codeVal} = $class unless $codeVal == $class;
sl@0
   104
	return $class;
sl@0
   105
	}
sl@0
   106
sl@0
   107
# link two codes together as being part of the same case-equivalence class
sl@0
   108
sub makeEquivalent
sl@0
   109
	{
sl@0
   110
	my ($left, $right) = @_;
sl@0
   111
	if (!$left || !$right)
sl@0
   112
		{
sl@0
   113
		return;
sl@0
   114
		}
sl@0
   115
	$left = chaseDown($left);
sl@0
   116
	$right = chaseDown($right);
sl@0
   117
	if ($Codes{$left} < $Codes{$right})
sl@0
   118
		{
sl@0
   119
		$CaseClass{$left} = $right;
sl@0
   120
		return;
sl@0
   121
		}
sl@0
   122
	if ($Codes{$right} < $Codes{$left})
sl@0
   123
		{
sl@0
   124
		$CaseClass{$right} = $left;
sl@0
   125
		return;
sl@0
   126
		}
sl@0
   127
	if ($left < $right)
sl@0
   128
		{
sl@0
   129
		$CaseClass{$right} = $left;
sl@0
   130
		return;
sl@0
   131
		}
sl@0
   132
	if ($right < $left)
sl@0
   133
		{
sl@0
   134
		$CaseClass{$left} = $right;
sl@0
   135
		return;
sl@0
   136
		}
sl@0
   137
	# $left == $right.. do nothing
sl@0
   138
	return;
sl@0
   139
	}
sl@0
   140
sl@0
   141
# Link possibly unmentioned codes together. The first one is considered lower-case
sl@0
   142
sub addEquivalenceClass
sl@0
   143
	{
sl@0
   144
	my ($lower, @rest) = @_;
sl@0
   145
	$Codes{$lower} = 2;
sl@0
   146
	foreach my $one (@rest)
sl@0
   147
		{
sl@0
   148
		$Codes{$one} = 1;
sl@0
   149
		makeEquivalent($lower, $one);
sl@0
   150
		}
sl@0
   151
	}
sl@0
   152
sl@0
   153
# Firstly we read in the data
sl@0
   154
while(<STDIN>)
sl@0
   155
	{
sl@0
   156
	my @line = split('#', $_, 1);
sl@0
   157
	my @fields = split(/;/, $line[0]);
sl@0
   158
	my @decomposition = split(' ', $fields[5]);
sl@0
   159
	if (1 < scalar(@fields))
sl@0
   160
		{
sl@0
   161
		my $codeVal = hex($fields[0]);
sl@0
   162
		# if the character has a non-compatibility decomposition sequence, record this fact.
sl@0
   163
		if (0 < scalar(@decomposition))
sl@0
   164
			{
sl@0
   165
			my $decompositionType = "";
sl@0
   166
			if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/)
sl@0
   167
				{
sl@0
   168
				$decompositionType = shift @decomposition;
sl@0
   169
				}
sl@0
   170
			if ($decompositionType !~ m/compat/i)
sl@0
   171
				{
sl@0
   172
				$DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1;
sl@0
   173
				}
sl@0
   174
			}
sl@0
   175
		$Name[$codeVal] = $fields[1];
sl@0
   176
		my $upperval = $fields[12];
sl@0
   177
		my $lowerval = $fields[13];
sl@0
   178
		my $titleval = $fields[14];
sl@0
   179
sl@0
   180
		# strip whitespace from the end of the string
sl@0
   181
		$titleval =~ s/\s+$//;
sl@0
   182
		if ($upperval)
sl@0
   183
			{
sl@0
   184
			$upperval = hex($upperval);
sl@0
   185
			$Upper[$codeVal] = $upperval;
sl@0
   186
			add $codeVal;
sl@0
   187
			add $upperval;
sl@0
   188
			}
sl@0
   189
		if ($titleval)
sl@0
   190
			{
sl@0
   191
			$titleval = hex($titleval);
sl@0
   192
			$Title[$codeVal] = $titleval;
sl@0
   193
			add $codeVal;
sl@0
   194
			add $titleval;
sl@0
   195
			}
sl@0
   196
		if ($lowerval)
sl@0
   197
			{
sl@0
   198
			$lowerval = hex($lowerval);
sl@0
   199
			$Lower[$codeVal] = $lowerval;
sl@0
   200
			add $codeVal;
sl@0
   201
			$Codes{$lowerval} = 2;
sl@0
   202
			}
sl@0
   203
		}
sl@0
   204
	}
sl@0
   205
sl@0
   206
# Remove all codes that decompose to a sequence
sl@0
   207
foreach my $codeVal (keys(%Codes))
sl@0
   208
	{
sl@0
   209
	my $current = $DecompositionValue[$codeVal];
sl@0
   210
	while ($current && 0 < $current)
sl@0
   211
		{
sl@0
   212
		$current = $DecompositionValue[$current];
sl@0
   213
		}
sl@0
   214
	if ($current && $current == -1)
sl@0
   215
		{
sl@0
   216
		delete $Codes{$codeVal};
sl@0
   217
		}
sl@0
   218
	}
sl@0
   219
sl@0
   220
# Next we form the equivalence classes.
sl@0
   221
if ($OptionIncludeExtraMappings)
sl@0
   222
	{
sl@0
   223
	# space = non-breaking space
sl@0
   224
	addEquivalenceClass(0x20, 0xA0);
sl@0
   225
	}
sl@0
   226
# We try to end up with everything being equivalent to a lower case letter
sl@0
   227
foreach my $codeVal (keys(%Codes))
sl@0
   228
	{
sl@0
   229
	makeEquivalent($codeVal, $Lower[$codeVal]);
sl@0
   230
	makeEquivalent($codeVal, $Upper[$codeVal]);
sl@0
   231
	makeEquivalent($codeVal, $Title[$codeVal]);
sl@0
   232
	}
sl@0
   233
sl@0
   234
# Next we chase each pointer in CaseClass down to its final result
sl@0
   235
foreach my $codeVal (keys(%CaseClass))
sl@0
   236
	{
sl@0
   237
	chaseDown($codeVal);
sl@0
   238
	}
sl@0
   239
sl@0
   240
# Now output the results in order, and collect the raw data
sl@0
   241
my @Offset = ();
sl@0
   242
my $oldCodeCount = 0;
sl@0
   243
foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass))
sl@0
   244
	{
sl@0
   245
	my $class = $CaseClass{$codeVal};
sl@0
   246
	my $offset = $class - $codeVal;
sl@0
   247
	if ($OptionOutputForwardMapping)
sl@0
   248
		{
sl@0
   249
		printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
sl@0
   250
		}
sl@0
   251
	while ($oldCodeCount != $codeVal)
sl@0
   252
		{
sl@0
   253
		$Offset[$oldCodeCount] = 0;
sl@0
   254
		$oldCodeCount++;
sl@0
   255
		}
sl@0
   256
	$oldCodeCount++;
sl@0
   257
	$Offset[$codeVal] = $offset;
sl@0
   258
	}
sl@0
   259
sl@0
   260
if ($OptionOutputReverseMapping)
sl@0
   261
	{
sl@0
   262
	my %ReverseMapping = ();
sl@0
   263
	foreach my $codeVal (keys(%CaseClass))
sl@0
   264
		{
sl@0
   265
		my $mapsTo = $CaseClass{$codeVal};
sl@0
   266
		if (!$ReverseMapping{$mapsTo})
sl@0
   267
			{
sl@0
   268
			$ReverseMapping{$mapsTo} = [$codeVal];
sl@0
   269
			}
sl@0
   270
		else
sl@0
   271
			{
sl@0
   272
			push (@{ $ReverseMapping{$mapsTo} }, $codeVal);
sl@0
   273
			}
sl@0
   274
		}
sl@0
   275
	foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping))
sl@0
   276
		{
sl@0
   277
		next if ($OptionIgnoreOneToOneReverseMappings && scalar(@{$ReverseMapping{$mapVal}}) == 1);
sl@0
   278
		printf("%x: %s <=", $mapVal, $Name[$mapVal]);
sl@0
   279
		my $firstTime = 1;
sl@0
   280
		foreach my $val ( @{ $ReverseMapping{$mapVal} } )
sl@0
   281
			{
sl@0
   282
			if (!$firstTime)
sl@0
   283
				{
sl@0
   284
				print ',';
sl@0
   285
				}
sl@0
   286
			$firstTime = 0;
sl@0
   287
			printf(" %s:%x", $Name[$val], $val);
sl@0
   288
			}
sl@0
   289
		print "\n";
sl@0
   290
		}
sl@0
   291
	}
sl@0
   292
sl@0
   293
# does the array 2 match array 1? Match the shorter array against the prefix of
sl@0
   294
# the other array
sl@0
   295
sub arraysMatch
sl@0
   296
	{
sl@0
   297
	my ($left, $right, $leftpos) = @_;
sl@0
   298
	my $last = scalar(@$left) - $leftpos;
sl@0
   299
	if (scalar(@$right) < $last)
sl@0
   300
		{
sl@0
   301
		$last = scalar(@$right);
sl@0
   302
		}
sl@0
   303
	my $pos = 0;
sl@0
   304
	while ($pos < $last)
sl@0
   305
		{
sl@0
   306
		if ($$left[$pos + $leftpos] != $$right[$pos])
sl@0
   307
			{
sl@0
   308
			return 0;
sl@0
   309
			}
sl@0
   310
		$pos++;
sl@0
   311
		}
sl@0
   312
	return 1;
sl@0
   313
	}
sl@0
   314
sl@0
   315
# find a match for array 2 in array 1, allowing values past the end of array 1
sl@0
   316
# to match anything in array 1
sl@0
   317
sub findMatch
sl@0
   318
	{
sl@0
   319
	my ($candidate, $term) = @_;
sl@0
   320
	my $pos = 0;
sl@0
   321
	while (!arraysMatch($candidate, $term, $pos))
sl@0
   322
		{
sl@0
   323
		$pos++;
sl@0
   324
		}
sl@0
   325
	return $pos;
sl@0
   326
	}
sl@0
   327
sl@0
   328
# add the data in array 2 to array 1, returning the position they went in.
sl@0
   329
sub addArray
sl@0
   330
	{
sl@0
   331
	my ($candidate, $addition) = @_;
sl@0
   332
	my $pos = findMatch($candidate, $addition);
sl@0
   333
	# add any required on to the end of the candidate block
sl@0
   334
	my $last = $pos + scalar(@$addition);
sl@0
   335
	my $additionPos = scalar(@$candidate) - $pos;
sl@0
   336
	while ($pos + $additionPos < $last)
sl@0
   337
		{
sl@0
   338
		$$candidate[$pos + $additionPos] = $$addition[$additionPos];
sl@0
   339
		$additionPos++;
sl@0
   340
		}
sl@0
   341
	return $pos;
sl@0
   342
	}
sl@0
   343
sl@0
   344
# create data block 1 and indices 2 from data 3 and block size 4
sl@0
   345
sub createTrieLevel
sl@0
   346
	{
sl@0
   347
	my ($data, $indices, $input, $blockSize) = @_;
sl@0
   348
	my $block = 0;
sl@0
   349
	while ($block * $blockSize < scalar(@$input))
sl@0
   350
		{
sl@0
   351
		my $start = $block * $blockSize;
sl@0
   352
		my $end = $start + $blockSize;
sl@0
   353
		my $currentBlockSize = $blockSize;
sl@0
   354
		if (scalar(@$input) < $end)
sl@0
   355
			{
sl@0
   356
			$end = scalar(@$input);
sl@0
   357
			$currentBlockSize = $end - $start;
sl@0
   358
			}
sl@0
   359
		my @currentBlock = @$input[$start..($end - 1)];
sl@0
   360
		while ($currentBlockSize != $blockSize)
sl@0
   361
			{
sl@0
   362
			$currentBlock[$currentBlockSize] = 0;
sl@0
   363
			$currentBlockSize++;
sl@0
   364
			}
sl@0
   365
		$$indices[$block] = addArray($data, \@currentBlock);
sl@0
   366
		$block++;
sl@0
   367
		}
sl@0
   368
	}
sl@0
   369
sl@0
   370
sub OutputArray
sl@0
   371
	{
sl@0
   372
	my $index = 0;
sl@0
   373
	my $firstTime = 1;
sl@0
   374
	while ($index != scalar(@_))
sl@0
   375
		{
sl@0
   376
		if (!$firstTime)
sl@0
   377
			{
sl@0
   378
			if ($index % 8)
sl@0
   379
				{
sl@0
   380
				print ', ';
sl@0
   381
				}
sl@0
   382
			else
sl@0
   383
				{
sl@0
   384
				print ",\n\t";
sl@0
   385
				}
sl@0
   386
			}
sl@0
   387
		else
sl@0
   388
			{
sl@0
   389
			print "\t";
sl@0
   390
			$firstTime = 0;
sl@0
   391
			}
sl@0
   392
		print($_[$index]);
sl@0
   393
		$index++;
sl@0
   394
		}
sl@0
   395
	print "\n";
sl@0
   396
	}
sl@0
   397
sl@0
   398
if ($OptionOutputTrie)
sl@0
   399
	{
sl@0
   400
	my @Trie0 = ();
sl@0
   401
	my @Index0 = ();
sl@0
   402
	my @Trie1 = ();
sl@0
   403
	my @Index1 = ();
sl@0
   404
	my @Trie2 = ();
sl@0
   405
	my @Index2 = ();
sl@0
   406
	createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
sl@0
   407
	createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
sl@0
   408
	createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);
sl@0
   409
	print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
sl@0
   410
	print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
sl@0
   411
	print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
sl@0
   412
	print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
sl@0
   413
	print "// Add the result of this to your character to fold it.\n\n";
sl@0
   414
	print "static const short CaseFoldTable3[] =\n\t{\n";
sl@0
   415
	OutputArray(@Trie0);
sl@0
   416
	print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
sl@0
   417
	OutputArray(@Trie1);
sl@0
   418
	print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
sl@0
   419
	OutputArray(@Trie2);
sl@0
   420
	print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
sl@0
   421
	OutputArray(@Index2);
sl@0
   422
	print "\t};\n";
sl@0
   423
	}