1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/kernelhwsrv/kernel/eka/euser/unicode/perl/FoldAndDecompTables.pl Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,625 @@
1.4 +# Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).
1.5 +# All rights reserved.
1.6 +# This component and the accompanying materials are made available
1.7 +# under the terms of the License "Eclipse Public License v1.0"
1.8 +# which accompanies this distribution, and is available
1.9 +# at the URL "http://www.eclipse.org/legal/epl-v10.html".
1.10 +#
1.11 +# Initial Contributors:
1.12 +# Nokia Corporation - initial contribution.
1.13 +#
1.14 +# Contributors:
1.15 +#
1.16 +# Description:
1.17 +# Creates C++ code describing how to decompose, compose and fold each character.
1.18 +# Usage:
1.19 +# perl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>
1.20 +# Tables we want to create:
1.21 +# A: Ordered list of non-excluded decompositions
1.22 +# B: List of folded decompositions matching A
1.23 +# C: List of decompositions not listed in A of length > 1
1.24 +# D: List of folded decompositions matching C
1.25 +# E: List of decompositions of length = 1 whose matching folded decompositions
1.26 +# are of length > 1
1.27 +# F: List of folded decompositions matching E
1.28 +# G: List of decompositions of length = 1 with matching folded decompositions
1.29 +# H: List of folded decompositions matching G
1.30 +# I: List of folded decompositions that do not have matching decompositions
1.31 +# J: List of decompositions (folding and otherwise) of length > 2
1.32 +# K: Hash table mapping Unicode value to its folded decomposition value in the
1.33 +# concatenated list B-D-F-H-I
1.34 +# L: List of hash slots in K matching A (providing a mapping from non-excluded
1.35 +# decompositions to Unicode value)
1.36 +# [all lengths are of UTF16 strings]
1.37 +#
1.38 +#
1.39 +
1.40 +use strict;
1.41 +
1.42 +#
1.43 +# Hash table:
1.44 +#
1.45 +
# Size of the hash table = 2 to the power $LgHashTableSize
my $LgHashTableSize = 12;

# Both values below are derived from $LgHashTableSize; do not edit them.
my $HashTableSize = 1 << $LgHashTableSize;
my $HashTableBitmaskCpp = sprintf('0x%x', $HashTableSize - 1);

# Primary hash: the slot at which probing for a value begins
# (the low $LgHashTableSize bits of the value).
sub HashStart
	{
	my ($code) = @_;
	my $mask = $HashTableSize - 1;
	return $code & $mask;
	}

# Secondary hash: the distance between successive probes. The result is
# always odd, so repeated stepping modulo the (power of two) table size
# eventually reaches every slot.
sub HashStep
	{
	my ($code) = @_;
	my $scaled = $code * ($code >> $LgHashTableSize);
	return (2 * $scaled + 1) & ($HashTableSize - 1);
	}
1.65 +
# Canonicalise a whitespace-separated list of hex code points.
#
# Argument: a string of hex numbers (any case, any number of digits)
#           separated by spaces and/or tabs; may be empty.
# Returns:  the same sequence as upper-case 4-digit hex numbers separated
#           by single spaces. Values of 0x10000 and above are written as a
#           UTF-16 surrogate pair. An empty input yields an empty string.
# Dies:     if an element is not entirely hex digits, or is larger than
#           0x10FFFF and so cannot be encoded in UTF-16.
sub Normalize
	{
	my ($string) = @_;
	# Fast path: the string is already in canonical form (or is empty).
	return $string
		if $string =~ /\A(?:[0-9A-F]{4}(?: [0-9A-F]{4})*)?\z/;

	my @utf16 = ();
	# split(' ', ...) discards leading whitespace and never yields empty
	# elements, so every element here is a candidate number -- including a
	# bare "0", which must not be dropped.
	foreach my $elt (split(' ', $string))
		{
		# Anchored so that strings merely containing a hex digit are rejected.
		die "'$elt' is not a hex number"
			unless $elt =~ /\A[0-9a-fA-F]+\z/;
		my $value = hex $elt;
		die "'$elt' is too large to be Unicode"
			if 0x10FFFF < $value;
		if ($value < 0x10000)
			{
			push @utf16, sprintf('%04X', $value);
			}
		else
			{
			# Surrogate pair: 0xD7C0 is 0xD800 - (0x10000 >> 10).
			push @utf16,
				sprintf('%04X', ($value >> 10) + 0xD7C0),
				sprintf('%04X', ($value & 0x3FF) + 0xDC00);
			}
		}
	return join(' ', @utf16);
	}
1.100 +
# First stage: data gathered while reading the input.

# Unicode value -> normalised decomposition string
my %Decomp;
# Unicode value -> normalised folded decomposition string
my %Folded;
# Decomposition string -> Unicode value, for non-excluded decompositions only
my %Composition;

# The lists below partition the characters read; AddCode puts each
# character into at most one of them.
# Characters with non-excluded decompositions
my @IncludedDecomps;
# Characters with long (>1 UTF16) excluded decompositions
my @LongExcludedDecomps;
# Characters with singleton decompositions but long folds
my @ShortDecompsLongFolds;
# Characters with singleton folds and singleton decompositions
my @ShortDecompsShortFolds;
# Characters with singleton folds but no decompositions
my @ShortFoldsOnly;

# Decomposition (or fold) of length greater than two -> a code that produced it
my %VeryLongDecompositions;

# Flat list of UTF16 values containing every decomposition of length > 2
# as a slice
my @VeryLongDecompData;
# Decomposition string -> index of its first value in @VeryLongDecompData
my %VeryLongDecompMap;

# The generated code contains a hash table mapping Unicode values to indices
# into the other tables. These three hashes model that table in Perl.
# Unicode value -> index into the other tables
my %Index;
# Hash table slot -> Unicode value stored in that slot
my %HashTableEntryContents;
# Unicode value -> hash table slot that holds it
my %HashTableEntry;
1.135 +
# Bind a Unicode value to an index into the tables and give it a slot in
# the hash table.
#
# Arguments: the Unicode value, and the table index to associate with it.
# Effects:   records the index in %Index, then probes from
#            HashStart($unicode) in steps of HashStep($unicode) (modulo the
#            table size) until a free slot is found; the slot is recorded in
#            %HashTableEntryContents and %HashTableEntry.
# Dies:      if every slot is already occupied, since the probe loop could
#            otherwise never terminate.
sub AddHashValue
	{
	my ($unicode, $index) = @_;
	die sprintf("Hash table is full (%d slots): cannot add %X; increase \$LgHashTableSize",
			$HashTableSize, $unicode)
		if $HashTableSize <= scalar(keys %HashTableEntryContents);
	$Index{$unicode} = $index;
	my $pos = HashStart($unicode);
	my $step = HashStep($unicode);
	while (exists $HashTableEntryContents{$pos})
		{
		$pos = ($pos + $step) % $HashTableSize;
		}
	$HashTableEntryContents{$pos} = $unicode;
	$HashTableEntry{$unicode} = $pos;
	}
1.154 +
# Give consecutive table indices to a list of Unicode values.
#
# Arguments: the index for the first value, followed by the values.
# Returns:   the first index not used, i.e. the starting index plus the
#            number of values.
sub AddListToHash
	{
	my ($index, @unicodes) = @_;
	foreach my $unicode (@unicodes)
		{
		AddHashValue($unicode, $index);
		$index++;
		}
	return $index;
	}
1.167 +
# Record one character read from the input in the data structures.
#
# Arguments: the Unicode value, the exclusion flag (true if the character's
#            decomposition is excluded), and the normalised decomposition
#            and folded strings (either may be empty).
# A character with neither a decomposition nor a fold is ignored. Otherwise
# it is stored in %Decomp and %Folded and appended to exactly one of the
# five character lists; normalised strings longer than 9 characters
# (three or more UTF16 values) are also noted in %VeryLongDecompositions.
sub AddCode
	{
	my ($code, $excluded, $decomposition, $folded) = @_;
	my $hasDecomposition = ($decomposition ne '');
	return unless $hasDecomposition || $folded ne '';

	$Decomp{$code} = $decomposition;
	$Folded{$code} = $folded;

	# A normalised string longer than 4 characters holds more than one
	# UTF16 value.
	my $bucket;
	if ($hasDecomposition && !$excluded)
		{
		$Composition{$decomposition} = $code;
		$bucket = \@IncludedDecomps;
		}
	elsif (length($decomposition) > 4)
		{
		$bucket = \@LongExcludedDecomps;
		}
	elsif (length($folded) > 4)
		{
		$bucket = \@ShortDecompsLongFolds;
		}
	elsif ($hasDecomposition)
		{
		$bucket = \@ShortDecompsShortFolds;
		}
	elsif ($folded ne '')
		{
		$bucket = \@ShortFoldsOnly;
		}
	push @$bucket, $code
		if defined $bucket;

	foreach my $sequence ($decomposition, $folded)
		{
		$VeryLongDecompositions{$sequence} = $code
			if length($sequence) > 9;
		}
	}
1.203 +
# The script takes no arguments: all input arrives on STDIN.
if (scalar(@ARGV) != 0)
	{
	print (STDERR "Usage:\nperl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>\n");
	exit 1;
	}

# Read the input. Each non-blank line must have the form
#   <code>;<description>;...symbian:<E or nothing>;<field>;<decomposition>;<fold>
# A pair of lines whose descriptions are "<..., First>" and "<..., Last>"
# stands for the whole range of codes between them; every code in the range
# gets the values given on the "Last" line.
my $lineNo = 0;
# 0 when not inside a First/Last range, otherwise the lowest code of the
# range that has not been added yet.
my $inBlock = 0;
while(<STDIN>)
	{
	$lineNo++;
	if (/^(1?[0-9a-fA-F]{4,5});([^;]*);.*symbian:(E?);[^;]*;([0-9a-fA-F \t]*);([0-9a-fA-F \t]*)[ \t]*$/i)
		{
		# Copy the captures before any other regular expression runs.
		my ($codeText, $description, $excluded, $rawDecomposition, $rawFolded)
			= ($1, $2, $3, $4, $5);
		my $code = hex $codeText;
		my $decomposition = Normalize($rawDecomposition);
		my $folded = Normalize($rawFolded);

		die ("Value $codeText too large to be Unicode at line $lineNo.")
			if (0x110000 <= $code);

		# A normalised string is empty or is 4-digit numbers joined by
		# single spaces, so its length is 5n - 1.
		die("Normalisation failed with '$decomposition' at line $lineNo.")
			unless (length $decomposition) == 0 || (length $decomposition) % 5 == 4;
		die("Normalisation failed with '$folded' at line $lineNo.")
			unless (length $folded) == 0 || (length $folded) % 5 == 4;

		AddCode($code, $excluded, $decomposition, $folded);

		if ($description =~ /^<.*Last>$/i)
			{
			die("End of block without start at line $lineNo!")
				if !$inBlock;
			# Fill in the interior of the range. The first and last codes
			# have already been added from their own lines, so stop short
			# of $code: adding it again would duplicate its table entries.
			while ($inBlock < $code)
				{
				AddCode($inBlock, $excluded, $decomposition, $folded);
				$inBlock++;
				}
			$inBlock = 0;
			}
		elsif ($description =~ /^<.*First>$/i)
			{
			die("Block within block at line $lineNo!")
				if $inBlock;
			$inBlock = $code + 1;
			}
		}
	elsif (!/^[ \t]*$/)
		{
		die("Did not understand line $lineNo.");
		}
	}
die("Block still open at end of input (line $lineNo)!")
	if $inBlock;
1.256 +
# Construct the data for the table of decompositions of length > 2.
# Sequences are placed longest first so that a shorter sequence which is a
# prefix of one already placed can share its storage. Ties in length are
# broken by string comparison so that the layout (and hence the generated
# file) does not depend on Perl's hash ordering.
foreach my $decomp (sort {length($b) <=> length($a) || $a cmp $b} keys %VeryLongDecompositions)
	{
	# Already stored as a prefix of an earlier (longer) sequence.
	next if exists $VeryLongDecompMap{$decomp};

	my @utf16 = split(' ', $decomp);
	my $newPos = scalar @VeryLongDecompData;
	# Table entries hold (length - 3) in 4 bits and the start index in
	# 12 bits; refuse to generate data that would not fit.
	die("Decomposition '$decomp' is too long to be encoded")
		if 18 < scalar(@utf16);
	die("Too much long decomposition data: index $newPos does not fit in 12 bits")
		if 0xFFF < $newPos;

	$VeryLongDecompMap{$decomp} = $newPos;
	push @VeryLongDecompData, @utf16;

	# Every prefix of three or more values starts at the same place.
	my $prefix = $decomp;
	while ($prefix =~ /^([0-9A-F]{4}( [0-9A-F]{4}){2,}) [0-9A-F]{4}$/)
		{
		$prefix = $1;
		$VeryLongDecompMap{$prefix} = $newPos;
		}
	}
1.276 +
# The codes for included decompositions must be in lexicographic order of
# their decompositions. Because every value is written as exactly four
# upper-case hex digits, comparing the decomposition strings gives that order.
@IncludedDecomps = sort {$Decomp{$a} cmp $Decomp{$b}} @IncludedDecomps;

# Report the size of each list of characters, and the overall total.
{
my @listSizes = map {scalar @$_}
	(\@IncludedDecomps, \@LongExcludedDecomps, \@ShortDecompsLongFolds,
	 \@ShortDecompsShortFolds, \@ShortFoldsOnly);
my $grandTotal = 0;
foreach my $size (@listSizes)
	{
	$grandTotal += $size;
	}
printf STDERR ("Included: %d\nLong: %d\nLongFolds: %d\nShort: %d\nShortFoldsOnly: %d\nTOTAL: %d\n",
	@listSizes, $grandTotal);
}
1.288 +
# Populate the hash table. The lists are added in a fixed order so that a
# character's index says which output table describes it; the index reached
# after each list is kept because the generated comments quote these values.
my $index = 0;

$index = AddListToHash($index, @IncludedDecomps);
my $hashIndexAfterIncludedDecomps = $index;
printf (STDERR "after IncludedDecomps index= %d\n", $hashIndexAfterIncludedDecomps);

$index = AddListToHash($index, @LongExcludedDecomps);
my $hashIndexAfterLongExcludeDecomps = $index;
printf (STDERR "after LongExcludedDecomps index= %d\n", $hashIndexAfterLongExcludeDecomps);

$index = AddListToHash($index, @ShortDecompsLongFolds);
my $hashIndexAfterShortDecompsLongFolds = $index;
printf (STDERR "after ShortDecompsLongFolds index= %d\n", $hashIndexAfterShortDecompsLongFolds);

$index = AddListToHash($index, @ShortDecompsShortFolds);
my $hashIndexAfterShortDecompsShortFolds = $index;
printf (STDERR "after ShortDecompsShortFolds index= %d\n", $hashIndexAfterShortDecompsShortFolds);

$index = AddListToHash($index, @ShortFoldsOnly);
my $hashIndexAfterShortFoldsOnly = $index;
printf (STDERR "after ShortFoldsOnly index= %d\n", $hashIndexAfterShortFoldsOnly);

# Analyse the hash table to find out the maximum and average number of
# probes taken to look up each printable ASCII character. This has to run
# after the table has been populated, otherwise every lookup appears to
# finish on its first probe.
my $maxAsciiTime = 0;
my $totalAsciiTime = 0;
my $mostDifficultCode = undef;
foreach my $code (32..126)
	{
	my $pos = HashStart($code);
	my $step = HashStep($code);
	my $stepCount = 1;
	# A lookup stops at the slot holding the code, or at the first empty
	# slot if the code is not in the table. The bound on $stepCount stops
	# the search for an absent code in a completely full table.
	while ($stepCount < $HashTableSize
		&& exists $HashTableEntryContents{$pos}
		&& $HashTableEntryContents{$pos} != $code)
		{
		$pos = ($pos + $step) % $HashTableSize;
		$stepCount++;
		}
	$totalAsciiTime += $stepCount;
	if ($maxAsciiTime < $stepCount)
		{
		$maxAsciiTime = $stepCount;
		$mostDifficultCode = $code;
		}
	}
# 95 is the number of codes in 32..126.
printf (STDERR "Average ASCII search: %f\n", $totalAsciiTime / 95);
printf (STDERR "Maximum ASCII search %d for %x: '%c'.\n", $maxAsciiTime, $mostDifficultCode, $mostDifficultCode);
1.346 +
#
# Output C++ File
#
# Running total of the size in bytes of all the tables written.
my $totalBytes = 0;

# Licence header and explanatory comment for the generated file, followed by
# the opening of the KUnicodeToIndexHash table.
print
	"// Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).\n",
	"// All rights reserved.\n",
	"// This component and the accompanying materials are made available\n",
	"// under the terms of the License \"Eclipse Public License v1.0\"\n",
	"// which accompanies this distribution, and is available\n",
	"// at the URL \"http://www.eclipse.org/legal/epl-v10.html\".\n",
	"//\n",
	"// Initial Contributors:\n",
	"// Nokia Corporation - initial contribution.\n",
	"//\n",
	"// Contributors:\n",
	"//\n",
	"// Description:\n",
	"//\n",
	"// Fold and decomposition tables.\n",
	"//\n",
	"// These tables are linked in the following way:\n",
	"// KUnicodeToIndexHash is a hash table using double hashing for\n",
	"// conflict resolution. The functions DecompositionHashStart and\n",
	"// DecompositionHashStep give the start and step values for accessing\n",
	"// the table. The first probe is at DecompositionHashStart and each\n",
	"// subsequent probe is offset by DecompositionHashStep. Probes\n",
	"// continue until either 0 is found (indicating that the Unicode value\n",
	"// sought has no decompostion (i.e. decomposes to itself)) or a value\n",
	"// is found that has the sought Unicode value in its lower 20 bits.\n",
	"//\n",
	"// In this latter case, the upper 12 bits contain an index into\n",
	"// one of the following tables, according to the following rules:\n",
	"//\n",
	"// In the case of folding:\n",
	"// If the Index is less than the length of KNonSingletonFolds / 2,\n",
	"// it is an index into KNonSingletonFolds. If the Index is\n",
	"// greater than the length of KNonSingletonFolds / 2, then it is an\n",
	"// index into KSingletonFolds.\n",
	"//\n",
	"// In the case of decomposition:\n",
	"// If the Index is less than the length of KNonSingletonDecompositions / 2,\n",
	"// it is an index into KNonSingletonDecompositions. If the Index is\n",
	"// greater than the length of KNonSingletonDecompositions / 2, then it is an\n",
	"// index into KSingletonDecompositions.\n",
	"//\n",
	"// In summary:\n",
	"// Let Knsf be the length of KNonSingletonFolds / 2,\n",
	"// let Knsd be the length of KNonSingletonDecompositions / 2,\n",
	"// let Ksd be the length of KSingletonDecompositions and\n",
	"// let Ksf be the length of KSingletonFolds.\n",
	"// Now if you want to fold a character and you have found\n",
	"// its index 'i' from the KUnicodeToIndexHash, then;\n",
	"// if (i < Knsf) then look up\n",
	"//\t\tKNonSingletonFolds[i * 2] and KNonSingletonFolds[i * 2 + 1]\n",
	"// else if (Knsf <= i < Knsf + Ksf) look up KSingletonFolds[i - Knsf]\n",
	"// else there is no fold for this character.\n",
	"//\n",
	"// Or if you want to decompose the same character, then;\n",
	"// if (i < Knsd) then look up KNonSingletonDecompositions[i * 2]\n",
	"//\t\tand KNonSingletonDecompositions[i * 2 + 1]\n",
	"// else if (Knsd <= i < Knsd + Ksd) look up KSingletonDecompositions[i - Knsd]\n",
	"// else there is no decomposition for this character.\n",
	"//\n",
	"// Your index into KSingletonDecompositions or KSingletonFolds\n",
	"// yields a single value which is the decomposition or fold.\n",
	"//\n",
	"// The KNonSingletonFolds and KNonSingletonDecomposition\n",
	"// tables are made up of pairs of values. Each pair is either a pair\n",
	"// of Unicode values that constitute the fold or decomposition, or\n",
	"// the first value is KLongD and the second has its top 4 bits as the\n",
	"// length of the decomposition (or folded decomposition) minus 3,\n",
	"// and its bottom 12 bits as the index into KLongDecompositions\n",
	"// of where you can find this decomposition.\n",
	"//\n",
	"// KLongDecompositions simply contains UTF-16 (Unicode) for\n",
	"// all the decomposed and folded sequences longer than 4 bytes long.\n",
	"\n",
	"// Hash table mapping unicode values to indices into the other tables\n",
	"// in use = ".$hashIndexAfterShortFoldsOnly." entries\n",
	"const unsigned long KUnicodeToIndexHash[$HashTableSize] =\n\t{\n\t";
# Body of KUnicodeToIndexHash. Each slot is one 32-bit value: 0 if the slot
# is empty, otherwise the Unicode value in the low 20 bits and its table
# index in the 12 bits above.
my @HashTableOutput;
foreach my $slot (0..($HashTableSize - 1))
	{
	my $v = 0;
	if (exists $HashTableEntryContents{$slot})
		{
		my $unicode = $HashTableEntryContents{$slot};
		die ('Did not expect a Unicode value > 0xFFFFF')
			if 0xFFFFF < $unicode;
		my $tableIndex = $Index{$unicode};
		# The index shares a 32-bit word with the 20-bit Unicode value, so
		# anything above 12 bits would be lost in the generated table.
		die (sprintf('Index %d for %X does not fit in 12 bits', $tableIndex, $unicode))
			if 0xFFF < $tableIndex;
		$v = $unicode | ($tableIndex << 20);
		}
	push @HashTableOutput, sprintf('0x%08x', $v);
	$totalBytes += 4;
	}
# Eight values per line; the first value follows the opening brace directly.
print (shift @HashTableOutput);
my $valueCount = 0;
foreach my $v (@HashTableOutput)
	{
	print (((++$valueCount & 7) == 0)? ",\n\t" : ', ');
	print $v;
	}
print "\n\t};\n\n";

# C++ equivalents of HashStart and HashStep.
print "// Hash table access functions\n";
print "const int KDecompositionHashBitmask = $HashTableBitmaskCpp;\n\n";
print "inline int DecompositionHashStart(long a)\n";
print "\t{\n\treturn a & $HashTableBitmaskCpp;\n\t}\n\n";
print "inline int DecompositionHashStep(long a)\n";
print "\t{\n\ta *= a >> $LgHashTableSize;\n";
print "\treturn ((a<<1) + 1) & $HashTableBitmaskCpp;\n\t}\n\n";
1.457 +
# KCompositionMapping: for each included decomposition, in sorted order, the
# hash table slot of the character that decomposes to it. Eight per line.
print "// Table mapping KNonSingletonDecompositions to the hash table entry that\n";
print "// indexes it\n";
print "const unsigned short KCompositionMapping[] =\n\t{\n\t";
{my $entryNo = 0;
foreach my $code (@IncludedDecomps)
	{
	if ($entryNo != 0)
		{
		print ((($entryNo & 7) == 0) ? ",\n\t" : ', ');
		}
	printf ('0x%04x', $HashTableEntry{$code});
	$totalBytes += 2;
	$entryNo++;
	}
}
print "\n\t};\n\n";
1.469 +
# KLongDecompositions: the contents of @VeryLongDecompData, eight per line.
# The "0x" prefix of each value is written together with the separator that
# precedes it (or with the opening brace, for the first value).
print "// Table containing all the decomposition and folding strings longer\n";
print "// than 2 UTF16 characters\n";
print "const unsigned short KLongDecompositions[] =\n\t{\n\t0x";
{my $entryNo = 0;
foreach my $utf16 (@VeryLongDecompData)
	{
	if ($entryNo != 0)
		{
		print ((($entryNo & 7) == 0) ? ",\n\t0x" : ', 0x');
		}
	print $utf16;
	$totalBytes += 2;
	$entryNo++;
	}
}
print "\n\t};\n\n";
1.481 +
# Comment, KLongD marker constant and opening of KNonSingletonDecompositions.
print
	"// Table containing decompositions longer than one UTF16 character.\n",
	"// The top of the table contains all compositions, sorted lexicographically.\n",
	"// Any decompositions of length 2 are in the table as a pair of values,\n",
	"// decompositions longer than that are represented by a KLongD followed by\n",
	"// a value whose top four bits indicate the length of the decomposition minus\n",
	"// three and whose bottom 12 bits indicate an index into the KLongDecompositions\n",
	"// array where the decomposition starts.\n",
	"const long KLongD = 0;\n",
	"// sizeof/2 = ".$hashIndexAfterLongExcludeDecomps."\n",
	"const unsigned short KNonSingletonDecompositions[] =\n\t{\n\t";
1.492 +
# Print one entry (a pair of 16-bit values) of a non-singleton table.
#
# Argument: a normalised decomposition or fold string.
#  - two values:       printed as they are
#  - one value:        printed followed by 0xFFFF
#  - three or more:    printed as KLongD followed by a value holding
#                      (number of values - 3) in its top hex digit and the
#                      sequence's %VeryLongDecompMap index in the other three
# Dies if a string shorter than 10 characters contains no 4-digit hex number.
sub PrintNonsingletonDecompTableEntry
	{
	my ($decomp) = @_;
	if (10 <= length $decomp)
		{
		# Normalised length is 5n - 1, so (length - 14) / 5 is n - 3.
		my $lengthField = ((length $decomp) - 14)/5;
		printf ('KLongD, 0x%1X%03X', $lengthField, $VeryLongDecompMap{$decomp});
		}
	elsif (my ($first, $second) = $decomp =~ /([0-9A-F]{4}) ([0-9A-F]{4})/)
		{
		print "0x$first, 0x$second";
		}
	else
		{
		die("$decomp expected to be normalized and of length 1 or 2")
			if $decomp !~ /[0-9A-F]{4}/;
		print "0x$decomp, 0xFFFF";
		}
	}
1.514 +
# Body of KNonSingletonDecompositions: the included decompositions (already
# sorted) followed by the long excluded ones, four entries per line.
# Both lists go through one loop so that a separator is only ever written
# between entries, even if the first list is empty.
{my $entryNo = 0;
foreach my $code (@IncludedDecomps, @LongExcludedDecomps)
	{
	if ($entryNo != 0)
		{print (($entryNo & 3) == 0?",\n\t" : ', ')}
	PrintNonsingletonDecompTableEntry($Decomp{$code});
	$entryNo++;
	# Each entry is a pair of unsigned shorts.
	$totalBytes += 4;
	}
}
print "\n\t};\n\n";
1.533 +
# KNonSingletonFolds: the folds of the characters in @IncludedDecomps,
# @LongExcludedDecomps and @ShortDecompsLongFolds, in that order (the order
# in which those characters were given indices), four entries per line.
# All three lists go through one loop so that a separator is only ever
# written between entries, even if the leading lists are empty.
print "// Table of folded decompositions which either have more than one UTF16, or\n";
print "// their normal decompositions have more than one UTF16\n";
print "// sizeof/2 = ".$hashIndexAfterShortDecompsLongFolds."\n";
print "const unsigned short KNonSingletonFolds[] =\n\t{\n\t";
{my $entryNo = 0;
foreach my $code (@IncludedDecomps, @LongExcludedDecomps, @ShortDecompsLongFolds)
	{
	if ($entryNo != 0)
		{print (($entryNo & 3) == 0?",\n\t" : ', ')}
	PrintNonsingletonDecompTableEntry($Folded{$code});
	$entryNo++;
	# Each entry is a pair of unsigned shorts.
	$totalBytes += 4;
	}
}
print "\n\t};\n\n";
1.563 +
# KSingletonDecompositions: one value for each character in
# @ShortDecompsLongFolds and then @ShortDecompsShortFolds, eight per line.
# A character with no decomposition of its own is written as FFFF.
print "// Table of singleton decompositions and characters with singleton folds\n";
print "// Note for Unicode 5.0:\n";
print "// Unicode 5.0 contains some non-BMP characters have non-BMP \"singleton\" folds.\n";
print "// As per the algorithm of this file, the non-BMP character should be stored in \n";
print "// this table. \"Unsigned short\" is not big enough to hold them. However, this \n";
print "// \"character\" information is not useful. So we just store 0xFFFF instead. \n";
print "// Please do check 0xFFFF when access this table. If meet 0xFFFF, that means \n";
print "// your character has no decomposition.\n";
print "// See the variable \"ShortDecompsLongFolds\" in FoldAndDecompTables.pl if you \n";
print "// want to know more.\n";
print "// sizeof = ".($hashIndexAfterShortDecompsShortFolds-$hashIndexAfterLongExcludeDecomps)."\n";
print "const unsigned short KSingletonDecompositions[] =\n\t{\n\t0x";
{my $entryNo = 0;
foreach my $code (@ShortDecompsLongFolds, @ShortDecompsShortFolds)
	{
	if ($entryNo != 0)
		{print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
	if (exists $Decomp{$code} && $Decomp{$code} ne '')
		{
		print $Decomp{$code};
		}
	else
		{
		# Placeholder only: readers of the table must not treat this
		# 0xFFFF as a character.
		print 'FFFF';
		}
	$entryNo++;
	# Each entry is a single unsigned short.
	$totalBytes += 2;
	}
}
print "\n\t};\n\n";
1.604 +
# KSingletonFolds: the fold of each character in @ShortDecompsShortFolds and
# then @ShortFoldsOnly, eight per line. Both lists go through one loop so
# that a separator is only ever written between entries, even if the first
# list is empty.
# NOTE(review): a character in @ShortDecompsShortFolds whose fold is empty
# would be written as a bare "0x"; presumably the input always supplies a
# fold for such characters -- confirm.
print "// Table of singleton folds\n";
print "// sizeof = ".($hashIndexAfterShortFoldsOnly-$hashIndexAfterShortDecompsLongFolds)."\n";
print "const unsigned short KSingletonFolds[] =\n\t{\n\t0x";
{my $entryNo = 0;
foreach my $code (@ShortDecompsShortFolds, @ShortFoldsOnly)
	{
	if ($entryNo != 0)
		{print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
	print $Folded{$code};
	$entryNo++;
	# Each entry is a single unsigned short.
	$totalBytes += 2;
	}
}
print "\n\t};\n";

# Report the combined size of the tables, in the generated file and on STDERR.
print "\n// Total size: $totalBytes bytes\n";
print STDERR $totalBytes, " bytes\n";