First public contribution.
2 # Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
4 # This component and the accompanying materials are made available
5 # under the terms of "Eclipse Public License v1.0"
6 # which accompanies this distribution, and is available
7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
9 # Initial Contributors:
10 # Nokia Corporation - initial contribution.
16 # Given the unicode data file, work out the case equivalence classes
17 # i.e. the equivalence classes for the transitive closure of ~ defined as
19 # a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b
20 # Usage: perl CaseEquivalence <UnicodeData.txt
28 # $DecompositionValue[$code] is undefined if $code has no decomposition
29 # sequence. If it has a single-value decomposition sequence, it holds that
30 # value; if it has a longer sequence, the value is -1.
31 my @DecompositionValue = ();
32 # 1 for each code that has a differently-cased version,
33 # 2 for each code that is a lower-case version of something else.
37 # Command-line options
38 my $OptionOutputTrie = 1;
39 my $OptionOutputForwardMapping = 0;
40 my $OptionOutputReverseMapping = 0;
41 my $OptionIgnoreOneToOneReverseMappings = 0;
42 my $OptionIncludeExtraMappings = 1;
44 foreach my $optionString (@ARGV)
46 if ($optionString =~ m![/-]o[tfrm]!)
48 $OptionOutputTrie = 0;
49 my $option = substr($optionString, 2, 1);
52 $OptionOutputForwardMapping = 1;
54 elsif ($option eq 'r')
56 $OptionOutputReverseMapping = 1;
58 elsif ($option eq 'm')
60 $OptionOutputReverseMapping = 1;
61 $OptionIgnoreOneToOneReverseMappings = 1;
65 $OptionOutputTrie = 1;
68 elsif ($optionString =~ m![/-]s!)
70 $OptionIncludeExtraMappings = 0;
74 print STDERR "Usage: perl CaseEquivalence [-o<mapping>] [-s]\nusing standard input and output streams.\n";
75 print STDERR "<mapping> is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n";
76 print STDERR "f: Give a list of all codes that need mapping and what they map to.\n";
77 print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n";
78 print STDERR "m: Give a list of all codes are mapped to by more than one code.\n";
79 print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n";
84 # set a code as being part of a non-unitary case-equivalence class.
88 if (!$Codes{$addition})
90 $Codes{$addition} = 1;
94 # make a code point to its final case variant
99 while ($CaseClass{$class})
101 $class = $CaseClass{$class};
103 $CaseClass{$codeVal} = $class unless $codeVal == $class;
107 # link two codes together as being part of the same case-equivalence class
110 my ($left, $right) = @_;
111 if (!$left || !$right)
115 $left = chaseDown($left);
116 $right = chaseDown($right);
117 if ($Codes{$left} < $Codes{$right})
119 $CaseClass{$left} = $right;
122 if ($Codes{$right} < $Codes{$left})
124 $CaseClass{$right} = $left;
129 $CaseClass{$right} = $left;
134 $CaseClass{$left} = $right;
137 # $left == $right.. do nothing
141 # Link possibly unmentioned codes together. The first one is considered lower-case
142 sub addEquivalenceClass
144 my ($lower, @rest) = @_;
146 foreach my $one (@rest)
149 makeEquivalent($lower, $one);
153 # Firstly we read in the data
156 my @line = split('#', $_, 1);
157 my @fields = split(/;/, $line[0]);
158 my @decomposition = split(' ', $fields[5]);
159 if (1 < scalar(@fields))
161 my $codeVal = hex($fields[0]);
162 # if the character has a non-compatibility decomposition sequence, record this fact.
163 if (0 < scalar(@decomposition))
165 my $decompositionType = "";
166 if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/)
168 $decompositionType = shift @decomposition;
170 if ($decompositionType !~ m/compat/i)
172 $DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1;
175 $Name[$codeVal] = $fields[1];
176 my $upperval = $fields[12];
177 my $lowerval = $fields[13];
178 my $titleval = $fields[14];
180 # strip whitespace from the end of the string
181 $titleval =~ s/\s+$//;
184 $upperval = hex($upperval);
185 $Upper[$codeVal] = $upperval;
191 $titleval = hex($titleval);
192 $Title[$codeVal] = $titleval;
198 $lowerval = hex($lowerval);
199 $Lower[$codeVal] = $lowerval;
201 $Codes{$lowerval} = 2;
206 # Remove all codes that decompose to a sequence
207 foreach my $codeVal (keys(%Codes))
209 my $current = $DecompositionValue[$codeVal];
210 while ($current && 0 < $current)
212 $current = $DecompositionValue[$current];
214 if ($current && $current == -1)
216 delete $Codes{$codeVal};
220 # Next we form the equivalence classes.
221 if ($OptionIncludeExtraMappings)
223 # space = non-breaking space
224 addEquivalenceClass(0x20, 0xA0);
226 # We try to end up with everything being equivalent to a lower case letter
227 foreach my $codeVal (keys(%Codes))
229 makeEquivalent($codeVal, $Lower[$codeVal]);
230 makeEquivalent($codeVal, $Upper[$codeVal]);
231 makeEquivalent($codeVal, $Title[$codeVal]);
234 # Next we chase each pointer in CaseClass down to its final result
235 foreach my $codeVal (keys(%CaseClass))
240 # Now output the results in order, and collect the raw data
242 my $oldCodeCount = 0;
243 foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass))
245 my $class = $CaseClass{$codeVal};
246 my $offset = $class - $codeVal;
247 if ($OptionOutputForwardMapping)
249 printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
251 while ($oldCodeCount != $codeVal)
253 $Offset[$oldCodeCount] = 0;
257 $Offset[$codeVal] = $offset;
260 if ($OptionOutputReverseMapping)
262 my %ReverseMapping = ();
263 foreach my $codeVal (keys(%CaseClass))
265 my $mapsTo = $CaseClass{$codeVal};
266 if (!$ReverseMapping{$mapsTo})
268 $ReverseMapping{$mapsTo} = [$codeVal];
272 push (@{ $ReverseMapping{$mapsTo} }, $codeVal);
275 foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping))
277 next if ($OptionIgnoreOneToOneReverseMappings && scalar(@{$ReverseMapping{$mapVal}}) == 1);
278 printf("%x: %s <=", $mapVal, $Name[$mapVal]);
280 foreach my $val ( @{ $ReverseMapping{$mapVal} } )
287 printf(" %s:%x", $Name[$val], $val);
293 # does the array 2 match array 1? Match the shorter array against the prefix of
297 my ($left, $right, $leftpos) = @_;
298 my $last = scalar(@$left) - $leftpos;
299 if (scalar(@$right) < $last)
301 $last = scalar(@$right);
306 if ($$left[$pos + $leftpos] != $$right[$pos])
315 # find a match for array 2 in array 1, allowing positions past the end of
316 # array 1 to match anything in array 2
319 my ($candidate, $term) = @_;
321 while (!arraysMatch($candidate, $term, $pos))
328 # add the data in array 2 to array 1, returning the position they went in.
331 my ($candidate, $addition) = @_;
332 my $pos = findMatch($candidate, $addition);
333 # add any required on to the end of the candidate block
334 my $last = $pos + scalar(@$addition);
335 my $additionPos = scalar(@$candidate) - $pos;
336 while ($pos + $additionPos < $last)
338 $$candidate[$pos + $additionPos] = $$addition[$additionPos];
344 # create the data block (argument 1) and indices (argument 2) from the input data (argument 3) using the block size (argument 4)
347 my ($data, $indices, $input, $blockSize) = @_;
349 while ($block * $blockSize < scalar(@$input))
351 my $start = $block * $blockSize;
352 my $end = $start + $blockSize;
353 my $currentBlockSize = $blockSize;
354 if (scalar(@$input) < $end)
356 $end = scalar(@$input);
357 $currentBlockSize = $end - $start;
359 my @currentBlock = @$input[$start..($end - 1)];
360 while ($currentBlockSize != $blockSize)
362 $currentBlock[$currentBlockSize] = 0;
365 $$indices[$block] = addArray($data, \@currentBlock);
374 while ($index != scalar(@_))
398 if ($OptionOutputTrie)
406 createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
407 createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
408 createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);
409 print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
410 print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
411 print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
412 print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
413 print "// Add the result of this to your character to fold it.\n\n";
414 print "static const short CaseFoldTable3[] =\n\t{\n";
416 print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
418 print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
420 print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
421 OutputArray(@Index2);