sl@0
|
1 |
# Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).
|
sl@0
|
2 |
# All rights reserved.
|
sl@0
|
3 |
# This component and the accompanying materials are made available
|
sl@0
|
4 |
# under the terms of the License "Eclipse Public License v1.0"
|
sl@0
|
5 |
# which accompanies this distribution, and is available
|
sl@0
|
6 |
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
|
sl@0
|
7 |
#
|
sl@0
|
8 |
# Initial Contributors:
|
sl@0
|
9 |
# Nokia Corporation - initial contribution.
|
sl@0
|
10 |
#
|
sl@0
|
11 |
# Contributors:
|
sl@0
|
12 |
#
|
sl@0
|
13 |
# Description:
|
sl@0
|
14 |
# Creates C++ code describing how to decompose, compose and fold each character.
|
sl@0
|
15 |
# Usage:
|
sl@0
|
16 |
# perl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>
|
sl@0
|
17 |
# Tables we want to create:
|
sl@0
|
18 |
# A: Ordered list of non-excluded decompositions
|
sl@0
|
19 |
# B: List of folded decompositions matching A
|
sl@0
|
20 |
# C: List of decompositions not listed in A of length > 1
|
sl@0
|
21 |
# D: List of folded decompositions matching C
|
sl@0
|
22 |
# E: List of decompositions of length = 1 whose matching folded decompositions
|
sl@0
|
23 |
# are of length > 1
|
sl@0
|
24 |
# F: List of folded decompositions matching E
|
sl@0
|
25 |
# G: List of decompositions of length = 1 with matching folded decompositions
|
sl@0
|
26 |
# H: List of folded decompositions matching G
|
sl@0
|
27 |
# I: List of folded decompositions that do not have matching decompositions
|
sl@0
|
28 |
# J: List of decompositions (folding and otherwise) of length > 2
|
sl@0
|
29 |
# K: Hash table mapping Unicode value to its folded decomposition value in the
|
sl@0
|
30 |
# concatenated list B-D-F-H-I
|
sl@0
|
31 |
# L: List of hash slots in K matching A (providing a mapping from non-excluded
|
sl@0
|
32 |
# decompositions to Unicode value)
|
sl@0
|
33 |
# [all lengths are of UTF16 strings]
|
sl@0
|
34 |
#
|
sl@0
|
35 |
#
|
sl@0
|
36 |
|
sl@0
|
37 |
use strict;
|
sl@0
|
38 |
|
sl@0
|
39 |
#
|
sl@0
|
40 |
# Hash table:
|
sl@0
|
41 |
#
|
sl@0
|
42 |
|
sl@0
|
43 |
# Size of hashing table = 2 to the power $LgHashTableSize
|
sl@0
|
44 |
my $LgHashTableSize = 12;
|
sl@0
|
45 |
|
sl@0
|
46 |
# Do not change these next two values!
|
sl@0
|
47 |
my $HashTableSize = 1 << $LgHashTableSize;
|
sl@0
|
48 |
my $HashTableBitmaskCpp = sprintf('0x%x', $HashTableSize - 1);
|
sl@0
|
49 |
|
sl@0
|
50 |
# Hashing function in Perl: Getting the initial search position
|
sl@0
|
51 |
# Returns the first slot to probe for a Unicode value: the value masked down
# to the range 0 .. $HashTableSize - 1.
sub HashStart
{
    my ($code) = @_;
    my $slotMask = $HashTableSize - 1;
    return $code & $slotMask;
}
|
sl@0
|
55 |
# How far to step through each time
|
sl@0
|
56 |
# Returns the distance between successive probes for a Unicode value.
# The result is always odd (2 * x + 1), masked to the table size.
sub HashStep
{
    my ($code) = @_;
    my $slotMask = $HashTableSize - 1;
    my $mixed = $code * ($code >> $LgHashTableSize);
    return (2 * $mixed + 1) & $slotMask;
}
|
sl@0
|
62 |
|
sl@0
|
63 |
# Make sure input string is all hex numbers separated by single spaces with
|
sl@0
|
64 |
# each hex number having 4 digits and decomposed into UTF16
|
sl@0
|
65 |
# Converts a whitespace-separated list of hexadecimal code points into the
# canonical form used throughout this script: upper-case, four-digit UTF-16
# code units separated by single spaces.
#
# Parameters:
#   $string - zero or more hex numbers separated by spaces and/or tabs.
# Returns:
#   The normalized string; '' if the input holds no numbers. Values of
#   0x10000 and above are written as a surrogate pair.
# Dies:
#   If any element is not made up entirely of hex digits.
sub Normalize
{
    my ($string) = @_;

    # Fast path: already canonical (or empty), so hand it back untouched.
    return $string
        if $string =~ /^([0-9A-F]{4}( [0-9A-F]{4})*)?$/;

    my @units = ();
    # split(' ', ...) skips leading whitespace and never yields empty
    # elements, so every element here is a candidate number (including "0").
    foreach my $elt (split(' ', $string))
    {
        # Anchored on both sides: a stray non-hex character must be fatal
        # rather than silently truncated by hex().
        die "'$elt' is not a hex number"
            unless $elt =~ /^[0-9a-fA-F]+$/;
        my $value = hex $elt;
        if ($value < 0x10000)
        {
            push @units, sprintf('%04X', $value);
        }
        else
        {
            # Add a surrogate pair. 0xD7C0 is 0xD800 - (0x10000 >> 10), so
            # the 0x10000 offset is folded into the high-surrogate base.
            push @units, sprintf('%04X %04X',
                0xD7C0 + int($value / 0x400), 0xDC00 + ($value % 0x400));
        }
    }
    return join(' ', @units);
}
|
sl@0
|
97 |
|
sl@0
|
98 |
# First stage:
|
sl@0
|
99 |
# Hash of Unicode values to normalised decomposition and folded strings
|
sl@0
|
100 |
my %Decomp = ();
|
sl@0
|
101 |
my %Folded = ();
|
sl@0
|
102 |
# Mapping from decomposition->char, if not excluded
|
sl@0
|
103 |
my %Composition = ();
|
sl@0
|
104 |
# characters with non-excluded decompositions
|
sl@0
|
105 |
my @IncludedDecomps = ();
|
sl@0
|
106 |
# characters with long (>1 UTF16) excluded decompositions
|
sl@0
|
107 |
my @LongExcludedDecomps = ();
|
sl@0
|
108 |
# characters with singleton decompositions but long folds
|
sl@0
|
109 |
my @ShortDecompsLongFolds = ();
|
sl@0
|
110 |
# characters with singleton decompositions and singleton folds
|
sl@0
|
111 |
my @ShortDecompsShortFolds = ();
|
sl@0
|
112 |
# characters with singleton folds but no decomps
|
sl@0
|
113 |
my @ShortFoldsOnly = ();
|
sl@0
|
114 |
|
sl@0
|
115 |
# A mapping from decompositions of length greater than two
|
sl@0
|
116 |
# to the code that produced them.
|
sl@0
|
117 |
my %VeryLongDecompositions = ();
|
sl@0
|
118 |
|
sl@0
|
119 |
# A list of characters containing all decompositions of length >2 as slices
|
sl@0
|
120 |
my @VeryLongDecompData = ();
|
sl@0
|
121 |
# Mapping from decomposition->index into VeryLongDecompData
|
sl@0
|
122 |
my %VeryLongDecompMap = ();
|
sl@0
|
123 |
|
sl@0
|
124 |
# There will be a hash table mapping Unicode values to indices into the other
|
sl@0
|
125 |
# tables. %Index maps the same thing in Perl.
|
sl@0
|
126 |
my %Index = ();
|
sl@0
|
127 |
# %HashTableEntryContents maps the table entries to the Unicode values they
|
sl@0
|
128 |
# contain.
|
sl@0
|
129 |
my %HashTableEntryContents = ();
|
sl@0
|
130 |
# %HashTableEntry maps Unicode value to the entry in the hash table
|
sl@0
|
131 |
my %HashTableEntry = ();
|
sl@0
|
132 |
|
sl@0
|
133 |
# Bind a unicode value to an index into the tables
|
sl@0
|
134 |
# Binds a Unicode value to an index into the output tables and claims a slot
# for it in the hash table.
#
# Parameters:
#   $unicode - the code point being registered.
#   $index   - its position in the concatenated output tables.
# Side effects:
#   Records the binding in %Index, the chosen slot in %HashTableEntry and the
#   slot's occupant in %HashTableEntryContents.
# Dies:
#   If inserting would leave the table without a single empty slot.
sub AddHashValue
{
    my ($unicode, $index) = @_;

    # The probe loop below only stops at an empty slot, so it would spin
    # forever on a full table. The generated header also describes lookups
    # as probing until an empty (0) entry is found, so one slot must always
    # stay free after this insertion.
    die(sprintf("Hash table of %d slots is too small to add value 0x%X",
            $HashTableSize, $unicode))
        if $HashTableSize - 1 <= scalar(keys %HashTableEntryContents);

    $Index{$unicode} = $index;

    my $pos = HashStart($unicode);
    my $step = HashStep($unicode);
    while (exists $HashTableEntryContents{$pos})
    {
        $pos = ($pos + $step) % $HashTableSize;
    }

    $HashTableEntryContents{$pos} = $unicode;
    $HashTableEntry{$unicode} = $pos;
}
|
sl@0
|
151 |
|
sl@0
|
152 |
# Bind a whole array to the indices starting from that given as the first
|
sl@0
|
153 |
# argument. Returns the index of the next slot to be filled.
|
sl@0
|
154 |
# Registers every code in @unicodes with AddHashValue, giving them
# consecutive indices starting at $index.
# Returns the first index that was not handed out.
sub AddListToHash
{
    my ($index, @unicodes) = @_;
    foreach my $unicode (@unicodes)
    {
        AddHashValue($unicode, $index++);
    }
    return $index;
}
|
sl@0
|
164 |
|
sl@0
|
165 |
# put the results of a read line into the data structures
|
sl@0
|
166 |
# Files one parsed input record into the script's data structures.
#
# Parameters:
#   $code          - numeric code point.
#   $excluded      - true if the decomposition is excluded from composition.
#   $decomposition - normalized decomposition string ('' if none).
#   $folded        - normalized folded decomposition string ('' if none).
#
# A record with neither string is ignored. Otherwise the code lands in
# exactly one of the five bucket lists. Lengths are string lengths: 4 is a
# single UTF-16 unit, and anything above 9 is three units or more.
sub AddCode
{
    my ($code, $excluded, $decomposition, $folded) = @_;
    my $hasDecomp = $decomposition ne '';
    my $hasFold = $folded ne '';
    return unless $hasDecomp || $hasFold;

    $Decomp{$code} = $decomposition;
    $Folded{$code} = $folded;

    my $bucket;
    if ($hasDecomp && !$excluded)
    {
        $Composition{$decomposition} = $code;
        $bucket = \@IncludedDecomps;
    }
    elsif (length($decomposition) > 4)
    {
        $bucket = \@LongExcludedDecomps;
    }
    elsif (length($folded) > 4)
    {
        $bucket = \@ShortDecompsLongFolds;
    }
    elsif ($hasDecomp)
    {
        $bucket = \@ShortDecompsShortFolds;
    }
    else
    {
        # No decomposition, and the early return above rules out an empty
        # fold as well, so this is a fold-only character.
        $bucket = \@ShortFoldsOnly;
    }
    push @$bucket, $code;

    # Sequences of three or more units go into the long-sequence table.
    foreach my $sequence ($decomposition, $folded)
    {
        $VeryLongDecompositions{$sequence} = $code
            if length($sequence) > 9;
    }
}
|
sl@0
|
200 |
|
sl@0
|
201 |
# The script takes no arguments; all data arrives on STDIN.
if (scalar(@ARGV) != 0)
{
    print (STDERR "Usage:\nperl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>\n");
    exit 1;
}

# Read the input one record per line and file each record with AddCode.
# Fields are separated by ';': the code point, a description, further fields,
# a "symbian:" marker optionally followed by E (excluded), one more field,
# then the decomposition and the folded decomposition.
my $lineNo = 0;
# 0 outside a <..., First> / <..., Last> range. Inside one, the next code
# point of the range that still has to be filled in.
my $inBlock = 0;
while(<STDIN>)
{
    $lineNo++;
    if (/^(1?[0-9a-fA-F]{4,5});([^;]*);.*symbian:(E?);[^;]*;([0-9a-fA-F \t]*);([0-9a-fA-F \t]*)[ \t]*$/i)
    {
        my $codeText = $1;
        my $description = $2;
        my $excluded = $3;
        my $decomposition = Normalize($4);
        my $folded = Normalize($5);
        my $code = hex $codeText;

        die ("Value $codeText too large to be Unicode at line $lineNo.")
            if (0x110000 <= $code);

        # A normalized string is n groups of 4 digits joined by n - 1
        # spaces, so its length is 0 or 4 modulo 5.
        die("Normalisation failed with '$decomposition' at line $lineNo.")
            unless (length $decomposition) == 0 || (length $decomposition) % 5 == 4;
        die("Normalisation failed with '$folded' at line $lineNo.")
            unless (length $folded) == 0 || (length $folded) % 5 == 4;

        AddCode($code, $excluded, $decomposition, $folded);

        if ($description =~ /^<.*Last>$/i)
        {
            die("End of block without start at line $lineNo!")
                if !$inBlock;
            # Fill in the codes strictly between First and Last. Last itself
            # was added just above, so including it here would file it twice.
            while ($inBlock < $code)
            {
                AddCode($inBlock, $excluded, $decomposition, $folded);
                $inBlock++;
            }
            $inBlock = 0;
        }
        elsif ($description =~ /^<.*First>$/i)
        {
            die("Block within block at line $lineNo!")
                if $inBlock;
            $inBlock = $code + 1;
        }
    }
    elsif (!/^[ \t]*$/)
    {
        die("Did not understand line $lineNo.");
    }
}
# A First record with no matching Last would otherwise drop the whole range
# without any warning.
die("Start of block without end before end of input (line $lineNo)!")
    if $inBlock;
|
sl@0
|
253 |
|
sl@0
|
254 |
# Build the data for the table of sequences longer than two UTF-16 units.
# Sequences are taken longest first. Each new one is appended to
# @VeryLongDecompData, and every prefix of it that is still at least three
# units long is mapped to the same start position, so a shorter sequence
# that is a prefix of a longer one reuses that data.
foreach my $sequence (sort {length $::b <=> length $::a} keys %VeryLongDecompositions)
{
    # Already covered as a prefix of something longer.
    next if exists $VeryLongDecompMap{$sequence};

    my $start = scalar @VeryLongDecompData;
    $VeryLongDecompMap{$sequence} = $start;
    push @VeryLongDecompData, split(' ', $sequence);

    # Strip one trailing unit at a time while three or more units remain.
    my $prefix = $sequence;
    while ($prefix =~ /^([0-9A-F]{4}( [0-9A-F]{4}){2,}) [0-9A-F]{4}$/)
    {
        $prefix = $1;
        $VeryLongDecompMap{$prefix} = $start;
    }
}
|
sl@0
|
273 |
|
sl@0
|
274 |
# Order the included codes by their decomposition strings. The strings are
# fixed-width upper-case hex, so a plain string comparison gives the same
# order as comparing the decompositions unit by unit.
@IncludedDecomps = sort {$Decomp{$::a} cmp $Decomp{$::b}} @IncludedDecomps;

# Report how many codes ended up in each bucket, and the overall total.
my @bucketCounts = (
    ['Included', scalar(@IncludedDecomps)],
    ['Long', scalar(@LongExcludedDecomps)],
    ['LongFolds', scalar(@ShortDecompsLongFolds)],
    ['Short', scalar(@ShortDecompsShortFolds)],
    ['ShortFoldsOnly', scalar(@ShortFoldsOnly)],
);
my $bucketTotal = 0;
foreach my $bucket (@bucketCounts)
{
    my ($label, $count) = @$bucket;
    print STDERR "$label: $count\n";
    $bucketTotal += $count;
}
print STDERR "TOTAL: $bucketTotal\n";
|
sl@0
|
285 |
|
sl@0
|
286 |
# Populate the hash table. The buckets are added in the order the output
# tables are laid out, so each code's index is its position in the
# concatenation of the five lists. The running index after each bucket is
# kept because the table-size comments printed later are derived from it.
my $index = 0;

$index = AddListToHash($index, @IncludedDecomps);
my $hashIndexAfterIncludedDecomps = $index;
printf (STDERR "after IncludedDecomps index= %d\n", $hashIndexAfterIncludedDecomps);

$index = AddListToHash($index, @LongExcludedDecomps);
my $hashIndexAfterLongExcludeDecomps = $index;
printf (STDERR "after LongExcludedDecomps index= %d\n", $hashIndexAfterLongExcludeDecomps);

$index = AddListToHash($index, @ShortDecompsLongFolds);
my $hashIndexAfterShortDecompsLongFolds = $index;
printf (STDERR "after ShortDecompsLongFolds index= %d\n", $hashIndexAfterShortDecompsLongFolds);

$index = AddListToHash($index, @ShortDecompsShortFolds);
my $hashIndexAfterShortDecompsShortFolds = $index;
printf (STDERR "after ShortDecompsShortFolds index= %d\n", $hashIndexAfterShortDecompsShortFolds);

$index = AddListToHash($index, @ShortFoldsOnly);
my $hashIndexAfterShortFoldsOnly = $index;
printf (STDERR "after ShortFoldsOnly index= %d\n", $hashIndexAfterShortFoldsOnly);

# Analyse the hash table to find out the maximum and average number of
# probes needed to find each printable ASCII character. This has to run
# after the table is populated: against an empty table every character
# would be reported as found on the first probe.
my $maxAsciiTime = 0;
my $totalAsciiTime = 0;
my $mostDifficultCode = undef;
foreach my $code (32..126)
{
    my $stepCount = 1;
    # exists, not truth: slot 0 is a perfectly valid hash table entry.
    if (exists $HashTableEntry{$code})
    {
        my $posRequired = $HashTableEntry{$code};
        my $pos = HashStart($code);
        my $step = HashStep($code);
        # Replay the probe sequence used at insertion until it reaches the
        # slot the code was actually stored in.
        while ($pos != $posRequired)
        {
            $pos = ($pos + $step) % $HashTableSize;
            $stepCount++;
        }
    }
    $totalAsciiTime += $stepCount;
    if ($maxAsciiTime < $stepCount)
    {
        $maxAsciiTime = $stepCount;
        $mostDifficultCode = $code;
    }
}
printf (STDERR "Average ASCII search: %f\n", $totalAsciiTime / 95);
printf (STDERR "Maximum ASCII search %d for %x: '%c'.\n", $maxAsciiTime, $mostDifficultCode, $mostDifficultCode);
|
sl@0
|
343 |
|
sl@0
|
344 |
#
|
sl@0
|
345 |
# Output C++ File
|
sl@0
|
346 |
#
|
sl@0
|
347 |
my $totalBytes = 0;
|
sl@0
|
348 |
|
sl@0
|
349 |
# Emit the licence and the explanatory comment that head the generated C++
# file, followed by one blank line. The heredoc interpolates, so each \t in
# the text below comes out as a tab character.
print <<"END_OF_FILE_HEADER";
// Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).
// All rights reserved.
// This component and the accompanying materials are made available
// under the terms of the License "Eclipse Public License v1.0"
// which accompanies this distribution, and is available
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
//
// Initial Contributors:
// Nokia Corporation - initial contribution.
//
// Contributors:
//
// Description:
//
// Fold and decomposition tables.
//
// These tables are linked in the following way:
// KUnicodeToIndexHash is a hash table using double hashing for
// conflict resolution. The functions DecompositionHashStart and
// DecompositionHashStep give the start and step values for accessing
// the table. The first probe is at DecompositionHashStart and each
// subsequent probe is offset by DecompositionHashStep. Probes
// continue until either 0 is found (indicating that the Unicode value
// sought has no decompostion (i.e. decomposes to itself)) or a value
// is found that has the sought Unicode value in its lower 20 bits.
//
// In this latter case, the upper 12 bits contain an index into
// one of the following tables, according to the following rules:
//
// In the case of folding:
// If the Index is less than the length of KNonSingletonFolds / 2,
// it is an index into KNonSingletonFolds. If the Index is
// greater than the length of KNonSingletonFolds / 2, then it is an
// index into KSingletonFolds.
//
// In the case of decomposition:
// If the Index is less than the length of KNonSingletonDecompositions / 2,
// it is an index into KNonSingletonDecompositions. If the Index is
// greater than the length of KNonSingletonDecompositions / 2, then it is an
// index into KSingletonDecompositions.
//
// In summary:
// Let Knsf be the length of KNonSingletonFolds / 2,
// let Knsd be the length of KNonSingletonDecompositions / 2,
// let Ksd be the length of KSingletonDecompositions and
// let Ksf be the length of KSingletonFolds.
// Now if you want to fold a character and you have found
// its index 'i' from the KUnicodeToIndexHash, then;
// if (i < Knsf) then look up
//\t\tKNonSingletonFolds[i * 2] and KNonSingletonFolds[i * 2 + 1]
// else if (Knsf <= i < Knsf + Ksf) look up KSingletonFolds[i - Knsf]
// else there is no fold for this character.
//
// Or if you want to decompose the same character, then;
// if (i < Knsd) then look up KNonSingletonDecompositions[i * 2]
//\t\tand KNonSingletonDecompositions[i * 2 + 1]
// else if (Knsd <= i < Knsd + Ksd) look up KSingletonDecompositions[i - Knsd]
// else there is no decomposition for this character.
//
// Your index into KSingletonDecompositions or KSingletonFolds
// yields a single value which is the decomposition or fold.
//
// The KNonSingletonFolds and KNonSingletonDecomposition
// tables are made up of pairs of values. Each pair is either a pair
// of Unicode values that constitute the fold or decomposition, or
// the first value is KLongD and the second has its top 4 bits as the
// length of the decomposition (or folded decomposition) minus 3,
// and its bottom 12 bits as the index into KLongDecompositions
// of where you can find this decomposition.
//
// KLongDecompositions simply contains UTF-16 (Unicode) for
// all the decomposed and folded sequences longer than 4 bytes long.

END_OF_FILE_HEADER
|
sl@0
|
422 |
# Emit KUnicodeToIndexHash and the C++ functions used to probe it.
# Each occupied slot holds the Unicode value in its lower 20 bits and the
# table index in the bits above; an empty slot is written as 0.
print "// Hash table mapping unicode values to indices into the other tables\n";
print "// in use = ".$hashIndexAfterShortFoldsOnly." entries\n";
print "const unsigned long KUnicodeToIndexHash[$HashTableSize] =\n\t{\n\t";
my @HashTableOutput;
foreach my $slot (0..($HashTableSize - 1))
{
    my $v = 0;
    if (exists $HashTableEntryContents{$slot})
    {
        $v = $HashTableEntryContents{$slot};
        die ('Did not expect a Unicode value > 0xFFFFF')
            if 0xFFFFF < $v;
        # Only 12 bits sit above the 20-bit code point in a 32-bit entry.
        # A larger index would spill past the 8 hex digits written below.
        my $tableIndex = $Index{$v};
        die (sprintf('Index %d of value 0x%X does not fit in the upper 12 bits of a hash entry',
                $tableIndex, $v))
            if 0xFFF < $tableIndex;
        $v |= $tableIndex << 20;
    }
    push @HashTableOutput, sprintf('0x%08x', $v);
    $totalBytes += 4;
}
# Eight entries per output line.
foreach my $position (0..$#HashTableOutput)
{
    if ($position != 0)
    {
        print((($position & 7) == 0) ? ",\n\t" : ', ');
    }
    print $HashTableOutput[$position];
}
print "\n\t};\n\n";
print "// Hash table access functions\n";
print "const int KDecompositionHashBitmask = $HashTableBitmaskCpp;\n\n";
# These C++ functions must compute the same values as HashStart and HashStep
# in this script, because the table above was laid out using those.
print "inline int DecompositionHashStart(long a)\n";
print "\t{\n\treturn a & $HashTableBitmaskCpp;\n\t}\n\n";
print "inline int DecompositionHashStep(long a)\n";
print "\t{\n\ta *= a >> $LgHashTableSize;\n";
print "\treturn ((a<<1) + 1) & $HashTableBitmaskCpp;\n\t}\n\n";
|
sl@0
|
454 |
|
sl@0
|
455 |
# Emit KCompositionMapping: for each included decomposition, in table order,
# the hash table slot that holds its character. Eight values per line.
print "// Table mapping KNonSingletonDecompositions to the hash table entry that\n";
print "// indexes it\n";
print "const unsigned short KCompositionMapping[] =\n\t{\n\t";
{
    my $position = 0;
    foreach my $code (@IncludedDecomps)
    {
        if ($position != 0)
        {
            print((($position & 7) == 0) ? ",\n\t" : ', ');
        }
        printf('0x%04x', $HashTableEntry{$code});
        $totalBytes += 2;
        $position++;
    }
}
print "\n\t};\n\n";
|
sl@0
|
466 |
|
sl@0
|
467 |
# Emit KLongDecompositions: the UTF-16 units of every long sequence, in the
# order they were laid out in @VeryLongDecompData. Eight values per line.
# The "0x" of the first value is part of the opening text.
print "// Table containing all the decomposition and folding strings longer\n";
print "// than 2 UTF16 characters\n";
print "const unsigned short KLongDecompositions[] =\n\t{\n\t0x";
{
    my $position = 0;
    foreach my $unit (@VeryLongDecompData)
    {
        if ($position != 0)
        {
            print((($position & 7) == 0) ? ",\n\t0x" : ', 0x');
        }
        print $unit;
        $totalBytes += 2;
        $position++;
    }
}
print "\n\t};\n\n";
|
sl@0
|
478 |
|
sl@0
|
479 |
# Emit the explanatory comment, the KLongD marker constant and the opening
# of the KNonSingletonDecompositions table. The opening is printed on its
# own because it must not end with a newline.
print <<"END_OF_TABLE_COMMENT";
// Table containing decompositions longer than one UTF16 character.
// The top of the table contains all compositions, sorted lexicographically.
// Any decompositions of length 2 are in the table as a pair of values,
// decompositions longer than that are represented by a KLongD followed by
// a value whose top four bits indicate the length of the decomposition minus
// three and whose bottom 12 bits indicate an index into the KLongDecompositions
// array where the decomposition starts.
const long KLongD = 0;
// sizeof/2 = $hashIndexAfterLongExcludeDecomps
END_OF_TABLE_COMMENT
print "const unsigned short KNonSingletonDecompositions[] =\n\t{\n\t";
|
sl@0
|
489 |
|
sl@0
|
490 |
# Prints one two-value entry of a non-singleton table for the normalized
# sequence $decomp:
#   - three units or more: "KLongD" followed by a value whose top hex digit
#     is the number of units minus three and whose lower three hex digits
#     are the sequence's start position from %VeryLongDecompMap;
#   - two units: both units;
#   - one unit: the unit followed by 0xFFFF.
# Dies if a short sequence contains no four-digit hex unit at all.
sub PrintNonsingletonDecompTableEntry
{
    my ($decomp) = @_;
    my $textLength = length $decomp;

    if (10 <= $textLength)
    {
        # n units occupy 5n - 1 characters, so (length - 14) / 5 is n - 3.
        my $unitsBeyondThree = ($textLength - 14) / 5;
        printf ('KLongD, 0x%1X%03X', $unitsBeyondThree, $VeryLongDecompMap{$decomp});
    }
    elsif ($decomp =~ /([0-9A-F]{4}) ([0-9A-F]{4})/)
    {
        print "0x$1, 0x$2";
    }
    else
    {
        die("$decomp expected to be normalized and of length 1 or 2")
            if $decomp !~ /[0-9A-F]{4}/;
        print "0x$decomp, 0xFFFF";
    }
}
|
sl@0
|
511 |
|
sl@0
|
512 |
# Emit the entries of KNonSingletonDecompositions and close the table.
# The codes are written in the order they were added to the hash table:
# included decompositions first, then the long excluded ones. Four entries
# (eight values) per line, four bytes per entry.
{
    # Walking one combined list puts a separator before every entry except
    # the very first, even when @IncludedDecomps is empty.
    my @tableCodes = (@IncludedDecomps, @LongExcludedDecomps);
    foreach my $entryNo (0..$#tableCodes)
    {
        if ($entryNo != 0)
        {
            print((($entryNo & 3) == 0) ? ",\n\t" : ', ');
        }
        PrintNonsingletonDecompTableEntry($Decomp{$tableCodes[$entryNo]});
        $totalBytes += 4;
    }
}
print "\n\t};\n\n";
|
sl@0
|
530 |
|
sl@0
|
531 |
# Emit KNonSingletonFolds: the folded decomposition of every code in the
# first three buckets, in the order they were added to the hash table.
# Four entries (eight values) per line, four bytes per entry.
print "// Table of folded decompositions which either have more than one UTF16, or\n";
print "// their normal decompositions have more than one UTF16\n";
print "// sizeof/2 = ".$hashIndexAfterShortDecompsLongFolds."\n";
print "const unsigned short KNonSingletonFolds[] =\n\t{\n\t";
{
    # Walking one combined list puts a separator before every entry except
    # the very first, whichever of the three buckets happen to be empty.
    my @tableCodes = (@IncludedDecomps, @LongExcludedDecomps, @ShortDecompsLongFolds);
    foreach my $entryNo (0..$#tableCodes)
    {
        if ($entryNo != 0)
        {
            print((($entryNo & 3) == 0) ? ",\n\t" : ', ');
        }
        PrintNonsingletonDecompTableEntry($Folded{$tableCodes[$entryNo]});
        $totalBytes += 4;
    }
}
print "\n\t};\n\n";
|
sl@0
|
560 |
|
sl@0
|
561 |
# Emit KSingletonDecompositions: one value per code in @ShortDecompsLongFolds
# followed by @ShortDecompsShortFolds, eight values per line. A code with no
# decomposition of its own is written as FFFF. The "0x" of the first value
# is part of the opening text.
print "// Table of singleton decompositions and characters with singleton folds\n";
print "// Note for Unicode 5.0:\n";
print "// Unicode 5.0 contains some non-BMP characters have non-BMP \"singleton\" folds.\n";
print "// As per the algorithm of this file, the non-BMP character should be stored in \n";
print "// this table. \"Unsigned short\" is not big enough to hold them. However, this \n";
print "// \"character\" information is not useful. So we just store 0xFFFF instead. \n";
print "// Please do check 0xFFFF when access this table. If meet 0xFFFF, that means \n";
print "// your character has no decomposition.\n";
print "// See the variable \"ShortDecompsLongFolds\" in FoldAndDecompTables.pl if you \n";
print "// want to know more.\n";
print "// sizeof = ".($hashIndexAfterShortDecompsShortFolds-$hashIndexAfterLongExcludeDecomps)."\n";
print "const unsigned short KSingletonDecompositions[] =\n\t{\n\t0x";
{
    my @tableCodes = (@ShortDecompsLongFolds, @ShortDecompsShortFolds);
    foreach my $entryNo (0..$#tableCodes)
    {
        my $code = $tableCodes[$entryNo];
        if ($entryNo != 0)
        {
            print((($entryNo & 7) == 0) ? ",\n\t0x" : ', 0x');
        }
        if (exists $Decomp{$code} && $Decomp{$code} ne '')
        {
            print $Decomp{$code};
        }
        else
        {
            # Placeholder, not a character: this code has no decomposition.
            print 'FFFF';
        }
        # Each entry is a single unsigned short.
        $totalBytes += 2;
    }
}
print "\n\t};\n\n";
|
sl@0
|
601 |
|
sl@0
|
602 |
# Emit KSingletonFolds: the fold of every code in @ShortDecompsShortFolds
# followed by @ShortFoldsOnly, eight values per line. The "0x" of the first
# value is part of the opening text.
print "// Table of singleton folds\n";
print "// sizeof = ".($hashIndexAfterShortFoldsOnly-$hashIndexAfterShortDecompsLongFolds)."\n";
print "const unsigned short KSingletonFolds[] =\n\t{\n\t0x";
{
    # Walking one combined list puts a separator before every entry except
    # the very first, even when @ShortDecompsShortFolds is empty.
    my @tableCodes = (@ShortDecompsShortFolds, @ShortFoldsOnly);
    foreach my $entryNo (0..$#tableCodes)
    {
        if ($entryNo != 0)
        {
            print((($entryNo & 7) == 0) ? ",\n\t0x" : ', 0x');
        }
        print $Folded{$tableCodes[$entryNo]};
        # Each entry is a single unsigned short.
        $totalBytes += 2;
    }
}
print "\n\t};\n";
|
sl@0
|
623 |
|
sl@0
|
624 |
# Report the combined size of all generated tables: as a trailing comment in
# the generated file and as a diagnostic on STDERR.
print "\n// Total size: ".$totalBytes." bytes\n";
print STDERR "$totalBytes bytes\n";
|