Update contrib.
1 # Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).
3 # This component and the accompanying materials are made available
4 # under the terms of the License "Eclipse Public License v1.0"
5 # which accompanies this distribution, and is available
6 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
8 # Initial Contributors:
9 # Nokia Corporation - initial contribution.
14 # Creates C++ code describing how to decompose, compose and fold each character.
16 # perl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>
17 # Tables we want to create:
18 # A: Ordered list of non-excluded decompositions
19 # B: List of folded decompositions matching A
20 # C: List of decompositions not listed in A of length > 1
21 # D: List of folded decompositions matching C
22 # E: List of decompositions of length = 1 whose matching folded decompositions
24 # F: List of folded decompositions matching E
25 # G: List of decompositions of length = 1 with matching folded decompositions
26 # H: List of folded decompostions matching G
27 # I: List of folded decompositions that do not have matching decompositions
28 # J: List of decompositions (folding and otherwise) of length > 2
29 # K: Hash table mapping Unicode value to its folded decomposition value in the
30 # concatenated list B-D-F-H-I
31 # L: List of hash slots in K matching A (providing a mapping from non-excluded
32 # decompositions to Unicode value)
33 # [all lengths are of UTF16 strings]
43 # Size of hashing table = 1 to the power $LgHashTableSize
44 my $LgHashTableSize = 12;
46 # Do not change these next two values!
47 my $HashTableSize = 1 << $LgHashTableSize;
48 my $HashTableBitmaskCpp = sprintf('0x%x', $HashTableSize - 1);
50 # Hashing function in Perl: Getting the initial search position
53 return $_[0] & ($HashTableSize - 1);
55 # How far to step through each time
59 $code *= $code >> $LgHashTableSize;
60 return ($code * 2 + 1) & ($HashTableSize - 1);
63 # Make sure input string is all hex numbers separated by single spaces with
64 # each hex number having 4 digits and decomposed into UTF16
68 if ($string =~ /^([0-9A-F]{4}( [0-9A-F]{4})*)?$/)
73 foreach my $elt (split(' ', $string))
77 die "'$elt' is not a hex number"
78 unless $elt =~ /[0-9a-fA-F]+/;
84 $norm = $norm.(sprintf('%04X', $elt));
88 # Add a surrogate pair
89 $norm = $norm.(sprintf('%04X %04X',
90 ($elt / 0x400) + 0xD7C0, ($elt % 0x400) + 0xDC00));
94 #print STDERR "'$string' normalized to '$norm'\n";
99 # Hash of Unicode values to normalised decomposition and folded strings
102 # Mapping from decomposition->char, if not excluded
103 my %Composition = ();
104 # characters with non-excluded decompositions
105 my @IncludedDecomps = ();
106 # characters with long (>1 UTF16) excluded decompositions
107 my @LongExcludedDecomps = ();
108 # characters with singleton decompositions but long folds
109 my @ShortDecompsLongFolds = ();
110 # characters with singleton folds and singleton
111 my @ShortDecompsShortFolds = ();
112 # characters with singleton folds but no decomps
113 my @ShortFoldsOnly = ();
115 # A mapping from decompositions of length greater than two
116 # to the code that produced them.
117 my %VeryLongDecompositions = ();
119 # A list of characters containing all decompositions of length >2 as slices
120 my @VeryLongDecompData = ();
121 # Mapping from decomposition->index into VeryLongDecompData
122 my %VeryLongDecompMap = ();
124 # There will be a hash table mapping Unicode values to indices into the other
125 # tables. %Index maps the same thing in Perl.
127 # %HashTableEntryContents maps the table entries to the Unicode values they
129 my %HashTableEntryContents = ();
130 # %HashTableEntry maps Unicode value to the entry in the hash table
131 my %HashTableEntry = ();
133 # Bind a unicode value to an index into the tables
136 my ($unicode, $index) = @_;
137 $Index{$unicode} = $index;
138 my $pos = HashStart($unicode);
139 my $step = HashStep($unicode);
140 while (exists $HashTableEntryContents{$pos})
143 if ($HashTableSize <= $pos)
145 $pos %= $HashTableSize;
148 $HashTableEntryContents{$pos} = $unicode;
149 $HashTableEntry{$unicode} = $pos;
152 # Bind a whole array to the indices starting from that given as the first
153 # argument. Returns the index of the next slot to be filled.
156 my ($index, @unicodes) = @_;
159 AddHashValue(shift @unicodes, $index);
165 # put the results of a read line into the data structures
168 my ($code, $excluded, $decomposition, $folded) = @_;
169 return if ($decomposition eq '' && $folded eq '');
170 $Decomp{$code} = $decomposition;
171 $Folded{$code} = $folded;
173 if (!$excluded && $decomposition ne '')
175 push @IncludedDecomps, $code;
176 $Composition{$decomposition} = $code;
178 elsif (4 < length $decomposition)
180 push @LongExcludedDecomps, $code;
182 elsif (4 < length $folded)
184 push @ShortDecompsLongFolds, $code;
186 elsif ($decomposition ne '')
188 push @ShortDecompsShortFolds, $code;
190 elsif ($folded ne '')
192 push @ShortFoldsOnly, $code;
195 $VeryLongDecompositions{$decomposition} = $code
196 if (9 < length $decomposition);
197 $VeryLongDecompositions{$folded} = $code
198 if (9 < length $folded);
201 if (scalar(@ARGV) != 0)
203 print (STDERR "Usage:\nperl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>\n");
212 if (/^(1?[0-9a-fA-F]{4,5});([^;]*);.*symbian:(E?);[^;]*;([0-9a-fA-F \t]*);([0-9a-fA-F \t]*)[ \t]*$/i)
215 my $description = $2;
217 my $decomposition = Normalize($4);
218 my $folded = Normalize($5);
220 die ("Value $1 too large to be Unicode at line $lineNo.")
221 if (0x110000 <= $code);
223 die("Normalisation failed with '$decomposition' at line $lineNo.")
224 unless (length $decomposition) == 0 || (length $decomposition) % 5 == 4;
225 die("Normalisation failed with '$folded' at line $lineNo.")
226 unless (length $folded) == 0 || (length $folded) % 5 == 4;
228 AddCode($code, $excluded, $decomposition, $folded);
230 if ($description =~ /^<.*Last>$/i)
232 die("End of block without start at line $lineNo!")
234 while ($inBlock <= $code)
236 AddCode($inBlock, $excluded, $decomposition, $folded);
241 elsif ($description =~ /^<.*First>$/i)
243 die("Block within block at line $lineNo!")
245 $inBlock = $code + 1;
250 die("Did not understand line $lineNo.");
254 # We need to construct the data for the table of decompositions of length > 2.
255 foreach my $decomp (sort {length $::b <=> length $::a} keys %VeryLongDecompositions)
257 if (!exists $VeryLongDecompMap{$decomp})
259 # Does not already exist
260 my $newPos = scalar @VeryLongDecompData;
261 $VeryLongDecompMap{$decomp} = $newPos;
262 foreach my $code (split(' ', $decomp))
264 push @VeryLongDecompData, $code;
266 while ($decomp =~ /^([0-9A-F]{4}( [0-9A-F]{4}){2,}) [0-9A-F]{4}$/)
269 $VeryLongDecompMap{$decomp} = $newPos;
274 # We need to sort the codes for included decompositions into lexicographic
275 # order of their decompositions.
276 # This, luckily, is the same as sorting the strings that represent their
277 # decompositions in hex lexicographically.
278 @IncludedDecomps = sort {$Decomp{$::a} cmp $Decomp{$::b}} @IncludedDecomps;
280 print (STDERR 'Included: ', scalar(@IncludedDecomps), "\nLong: ", scalar(@LongExcludedDecomps));
281 print(STDERR "\nLongFolds: ", scalar(@ShortDecompsLongFolds), "\nShort: ", scalar(@ShortDecompsShortFolds));
282 print(STDERR "\nShortFoldsOnly: ", scalar(@ShortFoldsOnly), "\nTOTAL: ");
283 print STDERR (scalar(@IncludedDecomps) + scalar(@LongExcludedDecomps) + scalar(@ShortDecompsLongFolds) + scalar(@ShortDecompsShortFolds) + scalar(@ShortFoldsOnly));
286 # Analyse the hash table to find out the maximum and average time
287 # taken to find each ASCII character
288 my $maxAsciiTime = 0;
289 my $totalAsciiTime = 0;
290 my $mostDifficultCode = undef;
291 my $asciiFoundWithoutStepCount = 0;
295 my $pos = HashStart($code);
296 my $step = HashStep($code);
298 if ($HashTableEntry{$code})
300 my $posRequired = $HashTableEntry{$code};
301 while ($pos != $posRequired)
303 $pos = ($pos + $step) % $HashTableSize;
307 $totalAsciiTime += $stepCount;
308 if ($maxAsciiTime < $stepCount)
310 $maxAsciiTime = $stepCount;
311 $mostDifficultCode = $code;
315 $asciiFoundWithoutStepCount++;
318 printf (STDERR "Average ASCII search: %f\n", $totalAsciiTime / 95);
319 printf (STDERR "Maximum ASCII search %d for %x: '%c'.\n", $maxAsciiTime, $mostDifficultCode, $mostDifficultCode);
321 # Now we populate the hash table
324 $index = AddListToHash($index, @IncludedDecomps);
325 my $hashIndexAfterIncludedDecomps = $index;
326 printf (STDERR "after IncludedDecomps index= %d\n", $hashIndexAfterIncludedDecomps);
328 $index = AddListToHash($index, @LongExcludedDecomps);
329 my $hashIndexAfterLongExcludeDecomps = $index;
330 printf (STDERR "after LongExcludedDecomps index= %d\n", $hashIndexAfterLongExcludeDecomps);
332 $index = AddListToHash($index, @ShortDecompsLongFolds);
333 my $hashIndexAfterShortDecompsLongFolds = $index;
334 printf (STDERR "after ShortDecompsLongFolds index= %d\n", $hashIndexAfterShortDecompsLongFolds);
336 $index = AddListToHash($index, @ShortDecompsShortFolds);
337 my $hashIndexAfterShortDecompsShortFolds = $index;
338 printf (STDERR "after ShortDecompsShortFolds index= %d\n", $hashIndexAfterShortDecompsShortFolds);
340 $index = AddListToHash($index, @ShortFoldsOnly);
341 my $hashIndexAfterShortFoldsOnly = $index;
342 printf (STDERR "after ShortFoldsOnly index= %d\n", $hashIndexAfterShortFoldsOnly);
349 print "// Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).\n";
350 print "// All rights reserved.\n";
351 print "// This component and the accompanying materials are made available\n";
352 print "// under the terms of the License \"Eclipse Public License v1.0\"\n";
353 print "// which accompanies this distribution, and is available\n";
354 print "// at the URL \"http://www.eclipse.org/legal/epl-v10.html\".\n";
356 print "// Initial Contributors:\n";
357 print "// Nokia Corporation - initial contribution.\n";
359 print "// Contributors:\n";
361 print "// Description:\n";
363 print "// Fold and decomposition tables.\n";
365 print "// These tables are linked in the following way:\n";
366 print "// KUnicodeToIndexHash is a hash table using double hashing for\n";
367 print "// conflict resolution. The functions DecompositionHashStart and\n";
368 print "// DecompositionHashStep give the start and step values for accessing\n";
369 print "// the table. The first probe is at DecompositionHashStart and each\n";
370 print "// subsequent probe is offset by DecompositionHashStep. Probes\n";
371 print "// continue until either 0 is found (indicating that the Unicode value\n";
372 print "// sought has no decompostion (i.e. decomposes to itself)) or a value\n";
373 print "// is found that has the sought Unicode value in its lower 20 bits.\n";
375 print "// In this latter case, the upper 12 bits contain an index into\n";
376 print "// one of the following tables, according to the following rules:\n";
378 print "// In the case of folding:\n";
379 print "// If the Index is less than the length of KNonSingletonFolds / 2,\n";
380 print "// it is an index into KNonSingletonFolds. If the Index is\n";
381 print "// greater than the length of KNonSingletonFolds / 2, then it is an\n";
382 print "// index into KSingletonFolds.\n";
384 print "// In the case of decomposition:\n";
385 print "// If the Index is less than the length of KNonSingletonDecompositions / 2,\n";
386 print "// it is an index into KNonSingletonDecompositions. If the Index is\n";
387 print "// greater than the length of KNonSingletonDecompositions / 2, then it is an\n";
388 print "// index into KSingletonDecompositions.\n";
390 print "// In summary:\n";
391 print "// Let Knsf be the length of KNonSingletonFolds / 2,\n";
392 print "// let Knsd be the length of KNonSingletonDecompositions / 2,\n";
393 print "// let Ksd be the length of KSingletonDecompositions and\n";
394 print "// let Ksf be the length of KSingletonFolds.\n";
395 print "// Now if you want to fold a character and you have found\n";
396 print "// its index 'i' from the KUnicodeToIndexHash, then;\n";
397 print "// if (i < Knsf) then look up\n";
398 print "//\t\tKNonSingletonFolds[i * 2] and KNonSingletonFolds[i * 2 + 1]\n";
399 print "// else if (Knsf <= i < Knsf + Ksf) look up KSingletonFolds[i - Knsf]\n";
400 print "// else there is no fold for this character.\n";
402 print "// Or if you want to decompose the same character, then;\n";
403 print "// if (i < Knsd) then look up KNonSingletonDecompositions[i * 2]\n";
404 print "//\t\tand KNonSingletonDecompositions[i * 2 + 1]\n";
405 print "// else if (Knsd <= i < Knsd + Ksd) look up KSingletonDecompositions[i - Knsd]\n";
406 print "// else there is no decomposition for this character.\n";
408 print "// Your index into KSingletonDecompositions or KSingletonFolds\n";
409 print "// yields a single value which is the decomposition or fold.\n";
411 print "// The KNonSingletonFolds and KNonSingletonDecomposition\n";
412 print "// tables are made up of pairs of values. Each pair is either a pair\n";
413 print "// of Unicode values that constitute the fold or decomposition, or\n";
414 print "// the first value is KLongD and the second has its top 4 bits as the\n";
415 print "// length of the decomposition (or folded decomposition) minus 3,\n";
416 print "// and its bottom 12 bits as the index into KLongDecompositions\n";
417 print "// of where you can find this decomposition.\n";
419 print "// KLongDecompositions simply contains UTF-16 (Unicode) for\n";
420 print "// all the decomposed and folded sequences longer than 4 bytes long.\n";
422 print "// Hash table mapping unicode values to indices into the other tables\n";
423 print "// in use = ".$hashIndexAfterShortFoldsOnly." entries\n";
424 print "const unsigned long KUnicodeToIndexHash[$HashTableSize] =\n\t{\n\t";
426 for (0..($HashTableSize - 1))
429 if (exists $HashTableEntryContents{$_})
431 $v = $HashTableEntryContents{$_};
432 die ('Did not expect a Unicode value > 0xFFFFF')
434 $v |= ($Index{$v}) << 20;
436 push @HashTableOutput, sprintf('0x%08x', $v);
439 print (shift @HashTableOutput);
441 foreach my $v (@HashTableOutput)
443 print (((++$valueCount & 7) == 0)? ",\n\t" : ', ');
447 print "// Hash table access functions\n";
448 print "const int KDecompositionHashBitmask = $HashTableBitmaskCpp;\n\n";
449 print "inline int DecompositionHashStart(long a)\n";
450 print "\t{\n\treturn a & $HashTableBitmaskCpp;\n\t}\n\n";
451 print "inline int DecompositionHashStep(long a)\n";
452 print "\t{\n\ta *= a >> $LgHashTableSize;\n";
453 print "\treturn ((a<<1) + 1) & $HashTableBitmaskCpp;\n\t}\n\n";
455 print "// Table mapping KNonSingletonDecompositions to the hash table entry that\n";
456 print "// indexes it\n";
457 print "const unsigned short KCompositionMapping[] =\n\t{\n\t";
458 for (0..(scalar(@IncludedDecomps - 1)))
461 {print (($_ & 7) == 0? ",\n\t" : ', ')}
462 printf( '0x%04x', $HashTableEntry{$IncludedDecomps[$_]} );
467 print "// Table containing all the decomposition and folding strings longer\n";
468 print "// than 2 UTF16 characters\n";
469 print "const unsigned short KLongDecompositions[] =\n\t{\n\t0x";
470 for(0..(scalar(@VeryLongDecompData) - 1))
473 {print (($_ & 7) == 0?",\n\t0x" : ', 0x')}
474 print $VeryLongDecompData[$_];
479 print "// Table containing decompositions longer than one UTF16 character.\n";
480 print "// The top of the table contains all compositions, sorted lexicographically.\n";
481 print "// Any decompositions of length 2 are in the table as a pair of values,\n";
482 print "// decompositions longer than that are represented by a KLongD followed by\n";
483 print "// a value whose top four bits indicate the length of the decomposition minus\n";
484 print "// three and whose bottom 12 bits indicate an index into the KLongDecompositions\n";
485 print "// array where the decomposition starts.\n";
486 print "const long KLongD = 0;\n";
487 print "// sizeof/2 = ".$hashIndexAfterLongExcludeDecomps."\n";
488 print "const unsigned short KNonSingletonDecompositions[] =\n\t{\n\t";
490 sub PrintNonsingletonDecompTableEntry
493 if (length $decomp < 10)
495 if ($decomp =~ /([0-9A-F]{4}) ([0-9A-F]{4})/)
497 print '0x'.$1.', 0x'.$2;
501 die("$decomp expected to be normalized and of length 1 or 2")
502 if $decomp !~ /[0-9A-F]{4}/;
503 print '0x'.$decomp.', 0xFFFF';
508 printf ('KLongD, 0x%1X%03X', ((length $decomp) - 14)/5, $VeryLongDecompMap{$decomp});
513 foreach my $code (@IncludedDecomps)
516 {print (($entryNo & 3) == 0?",\n\t" : ', ')}
517 PrintNonsingletonDecompTableEntry($Decomp{$code});
521 foreach my $code (@LongExcludedDecomps)
523 print (($entryNo & 3) == 0?",\n\t" : ', ');
524 PrintNonsingletonDecompTableEntry($Decomp{$code});
531 print "// Table of folded decompositions which either have more than one UTF16, or\n";
532 print "// their normal decompositions have more than one UTF16\n";
533 print "// sizeof/2 = ".$hashIndexAfterShortDecompsLongFolds."\n";
534 print "const unsigned short KNonSingletonFolds[] =\n\t{\n\t";
536 foreach my $code (@IncludedDecomps)
539 {print (($entryNo & 3) == 0?",\n\t" : ', ')}
540 PrintNonsingletonDecompTableEntry($Folded{$code});
544 foreach my $code (@LongExcludedDecomps)
546 print (($entryNo & 3) == 0?",\n\t" : ', ');
547 PrintNonsingletonDecompTableEntry($Folded{$code});
551 foreach my $code (@ShortDecompsLongFolds)
553 print (($entryNo & 3) == 0?",\n\t" : ', ');
554 PrintNonsingletonDecompTableEntry($Folded{$code});
561 print "// Table of singleton decompositions and characters with singleton folds\n";
562 print "// Note for Unicode 5.0:\n";
563 print "// Unicode 5.0 contains some non-BMP characters have non-BMP \"singleton\" folds.\n";
564 print "// As per the algorithm of this file, the non-BMP character should be stored in \n";
565 print "// this table. \"Unsigned short\" is not big enough to hold them. However, this \n";
566 print "// \"character\" information is not useful. So we just store 0xFFFF instead. \n";
567 print "// Please do check 0xFFFF when access this table. If meet 0xFFFF, that means \n";
568 print "// your character has no decomposition.\n";
569 print "// See the variable \"ShortDecompsLongFolds\" in FoldAndDecompTables.pl if you \n";
570 print "// want to know more.\n";
571 print "// sizeof = ".($hashIndexAfterShortDecompsShortFolds-$hashIndexAfterLongExcludeDecomps)."\n";
572 print "const unsigned short KSingletonDecompositions[] =\n\t{\n\t0x";
574 foreach my $code (@ShortDecompsLongFolds)
577 {print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
578 if (exists $Decomp{$code} && $Decomp{$code} ne '')
580 print $Decomp{$code};
584 # Don't take these 0xFFFF as character.
585 #printf ('%04X', $code);
591 foreach my $code (@ShortDecompsShortFolds)
594 {print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
595 print $Decomp{$code};
602 print "// Table of singleton folds\n";
603 print "// sizeof = ".($hashIndexAfterShortFoldsOnly-$hashIndexAfterShortDecompsLongFolds)."\n";
604 print "const unsigned short KSingletonFolds[] =\n\t{\n\t0x";
606 foreach my $code (@ShortDecompsShortFolds)
609 {print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
610 print $Folded{$code};
614 foreach my $code (@ShortFoldsOnly)
616 print (($entryNo & 7) == 0?",\n\t0x" : ', 0x');
617 print $Folded{$code};
624 print "\n// Total size: $totalBytes bytes\n";
625 print STDERR $totalBytes, " bytes\n";