sl@0: # sl@0: # Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). sl@0: # All rights reserved. sl@0: # This component and the accompanying materials are made available sl@0: # under the terms of "Eclipse Public License v1.0" sl@0: # which accompanies this distribution, and is available sl@0: # at the URL "http://www.eclipse.org/legal/epl-v10.html". sl@0: # sl@0: # Initial Contributors: sl@0: # Nokia Corporation - initial contribution. sl@0: # sl@0: # Contributors: sl@0: # sl@0: # Description: sl@0: # Case Equivalence sl@0: # Given the unicode data file, work out the case equivalence classes sl@0: # i.e. the equivalence classes for the transitive closure of ~ defined as sl@0: # follows: sl@0: # a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b sl@0: # Usage: perl CaseEquivalence ] [-s]\nusing standard input and output streams.\n"; sl@0: print STDERR " is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n"; sl@0: print STDERR "f: Give a list of all codes that need mapping and what they map to.\n"; sl@0: print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n"; sl@0: print STDERR "m: Give a list of all codes are mapped to by more than one code.\n"; sl@0: print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n"; sl@0: exit; sl@0: } sl@0: } sl@0: sl@0: # set a code as being part of a non-unitary case-equivalence class. 
# Register a code as belonging to some non-unitary case-equivalence class.
# %Codes maps a code to a rank: this routine assigns rank 1 unless the code
# already has a (possibly higher) rank. Rank 2 is assigned elsewhere to codes
# that are the lower-case form of some other code.
sub add
{
    my ($addition) = @_;
    if (!$Codes{$addition})
    {
        $Codes{$addition} = 1;
    }
}

# Follow the %CaseClass links from a code to the representative of its class.
# If the code is not its own representative, its link is shortened to point
# straight at the representative. Returns the representative.
sub chaseDown
{
    my ($codeVal) = @_;
    my $class = $codeVal;
    while ($CaseClass{$class})
    {
        $class = $CaseClass{$class};
    }
    $CaseClass{$codeVal} = $class unless $codeVal == $class;
    return $class;
}

# Link two codes together as being part of the same case-equivalence class.
# Nothing happens if either argument is false (undefined or 0) or if both
# codes already share a representative. Otherwise the representative with the
# higher %Codes rank becomes the representative of the merged class; when the
# ranks tie, the numerically smaller code wins.
sub makeEquivalent
{
    my ($left, $right) = @_;
    if (!$left || !$right)
    {
        return;
    }
    $left = chaseDown($left);
    $right = chaseDown($right);
    if ($left == $right)
    {
        return;
    }
    # A code that was never ranked counts as rank 0.
    my $leftRank = $Codes{$left} || 0;
    my $rightRank = $Codes{$right} || 0;
    if ($leftRank < $rightRank
        || ($leftRank == $rightRank && $right < $left))
    {
        $CaseClass{$left} = $right;
    }
    else
    {
        $CaseClass{$right} = $left;
    }
    return;
}

# Link possibly unmentioned codes together. The first one is considered
# lower-case (rank 2); every other argument gets rank 1 and is made
# equivalent to the first.
sub addEquivalenceClass
{
    my ($lower, @rest) = @_;
    $Codes{$lower} = 2;
    foreach my $one (@rest)
    {
        $Codes{$one} = 1;
        makeEquivalent($lower, $one);
    }
}

# Read semicolon-separated Unicode data records from the given file handle and
# fill in @Name, @Upper, @Lower, @Title, @DecompositionValue and %Codes.
# Field 0 is the code (hex), field 1 the name, field 5 the decomposition and
# fields 12, 13 and 14 the upper, lower and title case mappings (hex).
# Anything after a '#' is ignored, as are records with fewer than two fields.
sub readUnicodeData
{
    my ($input) = @_;
    while (my $record = <$input>)
    {
        # LIMIT 2 splits off the comment; a limit of 1 would not split at all.
        my ($content) = split(/#/, $record, 2);
        next unless defined $content;
        my @fields = split(/;/, $content);
        next unless 1 < scalar(@fields);

        my $codeVal = hex($fields[0]);
        my @decomposition = split(' ', defined $fields[5] ? $fields[5] : '');
        # If the character has a non-compatibility decomposition sequence,
        # record this fact: the single code it decomposes to, or -1 when it
        # decomposes to a sequence of several codes.
        if (0 < scalar(@decomposition))
        {
            my $decompositionType = "";
            if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/)
            {
                $decompositionType = shift @decomposition;
            }
            if ($decompositionType !~ m/compat/i)
            {
                $DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1;
            }
        }
        $Name[$codeVal] = $fields[1];

        # split drops trailing empty fields, so any of these may be missing.
        my ($upperval, $lowerval, $titleval) =
            map { defined $_ ? $_ : '' } @fields[12 .. 14];
        # Strip whitespace (including the line terminator, which ends up in
        # whichever field is last) so that an empty field tests as false.
        foreach my $caseField ($upperval, $lowerval, $titleval)
        {
            $caseField =~ s/\s+$//;
        }

        if ($upperval)
        {
            $upperval = hex($upperval);
            $Upper[$codeVal] = $upperval;
            add($codeVal);
            add($upperval);
        }
        if ($titleval)
        {
            $titleval = hex($titleval);
            $Title[$codeVal] = $titleval;
            add($codeVal);
            add($titleval);
        }
        if ($lowerval)
        {
            $lowerval = hex($lowerval);
            $Lower[$codeVal] = $lowerval;
            add($codeVal);
            # Lower-case targets outrank everything else, so classes end up
            # represented by a lower-case code where one exists.
            $Codes{$lowerval} = 2;
        }
    }
    return;
}

# Firstly we read in the data
readUnicodeData(\*STDIN);

# Remove all codes that decompose to a sequence. The decomposition chain is
# followed until it ends (undefined) or reaches the -1 "sequence" marker.
foreach my $codeVal (keys(%Codes))
{
    my $current = $DecompositionValue[$codeVal];
    while ($current && 0 < $current)
    {
        $current = $DecompositionValue[$current];
    }
    if ($current && $current == -1)
    {
        delete $Codes{$codeVal};
    }
}

# Next we form the equivalence classes.
# Optional extra equivalence: space = non-breaking space.
addEquivalenceClass(0x20, 0xA0) if $OptionIncludeExtraMappings;

# Tie every registered code to its lower, upper and title case forms. The
# ranking applied by makeEquivalent steers each class towards a lower-case
# representative.
foreach my $code (keys(%Codes))
{
    foreach my $mapping (\@Lower, \@Upper, \@Title)
    {
        makeEquivalent($code, $$mapping[$code]);
    }
}

# Make every entry in CaseClass point directly at its final representative.
chaseDown($_) foreach keys(%CaseClass);

# Walk the mapped codes in ascending order, optionally listing each forward
# mapping, and build @Offset: for every code up to the highest mapped one, the
# amount to add to reach its representative (0 for unmapped codes).
my @Offset = ();
my $oldCodeCount = 0;
foreach my $code (sort { $a <=> $b } keys(%CaseClass))
{
    my $target = $CaseClass{$code};
    my $delta = $target - $code;
    printf("%x %d\t\t%s => %s\n", $code, $delta, $Name[$code], $Name[$target])
        if $OptionOutputForwardMapping;
    # Zero-fill the gap since the previously mapped code.
    $Offset[$_] = 0 foreach $oldCodeCount .. $code - 1;
    $Offset[$code] = $delta;
    $oldCodeCount = $code + 1;
}

if ($OptionOutputReverseMapping)
{
    # For each representative, the list of codes that map to it.
    my %ReverseMapping = ();
    foreach my $code (keys(%CaseClass))
    {
        push(@{ $ReverseMapping{$CaseClass{$code}} }, $code);
    }
    foreach my $target (sort { $a <=> $b } keys(%ReverseMapping))
    {
        my @sources = @{ $ReverseMapping{$target} };
        next if $OptionIgnoreOneToOneReverseMappings && scalar(@sources) == 1;
        printf("%x: %s <=", $target, $Name[$target]);
        print join(',', map { sprintf(" %s:%x", $Name[$_], $_) } @sources);
        print "\n";
    }
}

# Does array 2 agree with array 1 starting at the given position in array 1?
# Only the overlapping part is compared, so whatever would lie past the end
# of array 1 counts as matching. Returns 1 on agreement and 0 otherwise.
sub arraysMatch
{
    my ($left, $right, $leftpos) = @_;
    my $available = scalar(@$left) - $leftpos;
    my $overlap = scalar(@$right) < $available ? scalar(@$right) : $available;
    for (my $i = 0; $i < $overlap; $i++)
    {
        return 0 if $left->[$leftpos + $i] != $right->[$i];
    }
    return 1;
}

# Find the first position in array 1 at which array 2 matches, where array 2
# is allowed to run off the end of array 1.
sub findMatch
{
    my ($candidate, $term) = @_;
    my $offset = 0;
    $offset++ until arraysMatch($candidate, $term, $offset);
    return $offset;
}

# Merge the data in array 2 into array 1, reusing any existing match, and
# return the position at which array 2 can now be found in array 1.
sub addArray
{
    my ($candidate, $addition) = @_;
    my $pos = findMatch($candidate, $addition);
    # Elements of the addition already present at the tail of the candidate.
    my $overlap = scalar(@$candidate) - $pos;
    # Append only the part that sticks out past the end of the candidate.
    push(@$candidate, @{$addition}[$overlap .. $#$addition]);
    return $pos;
}

# Build one trie level: cut the input (argument 3) into blocks of the given
# size (argument 4), padding the last block with zeros, store the blocks in
# the data array (argument 1) and record in the index array (argument 2) the
# position of each block within the data array.
sub createTrieLevel
{
    my ($data, $indices, $input, $blockSize) = @_;
    for (my $block = 0; $block * $blockSize < scalar(@$input); $block++)
    {
        my $first = $block * $blockSize;
        my $limit = $first + $blockSize;
        $limit = scalar(@$input) if scalar(@$input) < $limit;
        my @chunk = @$input[$first .. ($limit - 1)];
        # Pad a short final block out to the full block size.
        push(@chunk, (0) x ($blockSize - scalar(@chunk)));
        $$indices[$block] = addArray($data, \@chunk);
    }
}

# Print the arguments as the body of a C array initialiser: tab-indented,
# comma-separated, eight values per line.
sub OutputArray
{
    foreach my $index (0 .. $#_)
    {
        my $separator = ",\n\t";
        if ($index == 0)
        {
            $separator = "\t";
        }
        elsif ($index % 8)
        {
            $separator = ', ';
        }
        print($separator);
        print($_[$index]);
    }
    print "\n";
}

if ($OptionOutputTrie)
{
    # Each level compresses the index array of the level below it.
    my (@Trie0, @Index0, @Trie1, @Index1, @Trie2, @Index2);
    createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
    createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
    createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);

    print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
    print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
    print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
    print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
    print "// Add the result of this to your character to fold it.\n\n";

    print "static const short CaseFoldTable3[] =\n\t{\n";
    OutputArray(@Trie0);
    print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
    OutputArray(@Trie1);
    print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
    OutputArray(@Trie2);
    print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
    OutputArray(@Index2);
    print "\t};\n";
}