Symaptic: os/kernelhwsrv/kernel/eka/euser/unicode/perl/FoldAndDecompTables.pl@bde4ae8d615e (annotated)

sl@0	1	# Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0	2	# All rights reserved.
sl@0	3	# This component and the accompanying materials are made available
sl@0	4	# under the terms of the License "Eclipse Public License v1.0"
sl@0	5	# which accompanies this distribution, and is available
sl@0	6	# at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0	7	#
sl@0	8	# Initial Contributors:
sl@0	9	# Nokia Corporation - initial contribution.
sl@0	10	#
sl@0	11	# Contributors:
sl@0	12	#
sl@0	13	# Description:
sl@0	14	# Creates C++ code describing how to decompose, compose and fold each character.
sl@0	15	# Usage:
sl@0	16	# perl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>
sl@0	17	# Tables we want to create:
sl@0	18	# A: Ordered list of non-excluded decompositions
sl@0	19	# B: List of folded decompositions matching A
sl@0	20	# C: List of decompositions not listed in A of length > 1
sl@0	21	# D: List of folded decompositions matching C
sl@0	22	# E: List of decompositions of length = 1 whose matching folded decompositions
sl@0	23	# are of length > 1
sl@0	24	# F: List of folded decompositions matching E
sl@0	25	# G: List of decompositions of length = 1 with matching folded decompositions
sl@0	26	# H: List of folded decompositions matching G
sl@0	27	# I: List of folded decompositions that do not have matching decompositions
sl@0	28	# J: List of decompositions (folding and otherwise) of length > 2
sl@0	29	# K: Hash table mapping Unicode value to its folded decomposition value in the
sl@0	30	# concatenated list B-D-F-H-I
sl@0	31	# L: List of hash slots in K matching A (providing a mapping from non-excluded
sl@0	32	# decompositions to Unicode value)
sl@0	33	# [all lengths are of UTF16 strings]
sl@0	34	#
sl@0	35	#
sl@0	36
use strict;
use warnings;

#
# Hash table:
#

# Size of hashing table = 2 to the power $LgHashTableSize
my $LgHashTableSize = 12;

# Do not change these next two values: both are derived from $LgHashTableSize.
# Number of slots in the hash table.
my $HashTableSize = 1 << $LgHashTableSize;
# Mask that reduces a value modulo the table size, as a C++ hex literal for
# use in the generated code.
my $HashTableBitmaskCpp = sprintf('0x%x', $HashTableSize - 1);
sl@0	48	my $HashTableBitmaskCpp = sprintf('0x%x', $HashTableSize - 1);
sl@0	49
# Hashing function in Perl: the slot at which the search for a value begins.
# This is simply the value reduced modulo the (power of two) table size.
sub HashStart {
    my ($code) = @_;
    my $mask = $HashTableSize - 1;
    return $code & $mask;
}
# Secondary hash: how far to move on from one probe to the next.
# The result always has its lowest bit set, i.e. it is odd.
sub HashStep {
    my ($code) = @_;
    my $mixed = $code * ($code >> $LgHashTableSize);
    my $mask = $HashTableSize - 1;
    return (2 * $mixed + 1) & $mask;
}
sl@0	62
# Bring a string of hex numbers into the canonical form used throughout this
# script: upper-case, exactly four hex digits per UTF-16 code unit, units
# separated by single spaces. Values of 0x10000 and above are written as a
# surrogate pair.
#
# Argument: a string of whitespace-separated hex numbers (may be empty).
# Returns:  the normalised string ('' if the input holds no numbers).
# Dies:     if any element contains a character that is not a hex digit.
sub Normalize {
    my ($string) = @_;

    # Fast path: nothing to do if the string is already canonical.
    return $string
        if $string =~ /^([0-9A-F]{4}( [0-9A-F]{4})*)?$/;

    my @units;
    foreach my $elt (split(' ', $string)) {
        # The whole element must be hex; a partial match (e.g. "12G4") would
        # otherwise be silently truncated by hex().
        die "'$elt' is not a hex number"
            unless $elt =~ /\A[0-9a-fA-F]+\z/;
        my $value = hex $elt;
        if ($value < 0x10000) {
            push @units, sprintf('%04X', $value);
        }
        else {
            # Add a surrogate pair. 0xD7C0 is 0xD800 - (0x10000 >> 10), so the
            # subtraction of 0x10000 is folded into the high surrogate base.
            push @units,
                sprintf('%04X', ($value >> 10) + 0xD7C0),
                sprintf('%04X', ($value & 0x3FF) + 0xDC00);
        }
    }
    return join(' ', @units);
}
sl@0	97
# First stage:
# Unicode value -> normalised decomposition string
my %Decomp;
# Unicode value -> normalised folded string
my %Folded;
# Decomposition string -> Unicode value, for non-excluded decompositions only
my %Composition;

# The characters are partitioned into the following lists.
# Characters with non-excluded decompositions
my @IncludedDecomps;
# Characters with long (>1 UTF16) excluded decompositions
my @LongExcludedDecomps;
# Characters with singleton decompositions but long folds
my @ShortDecompsLongFolds;
# Characters with singleton decompositions and singleton folds
my @ShortDecompsShortFolds;
# Characters with singleton folds but no decompositions
my @ShortFoldsOnly;

# Every decomposition (or fold) longer than two UTF16 units, mapped to the
# code that produced it.
my %VeryLongDecompositions;

# A flat list of UTF16 units in which every decomposition longer than two
# units appears as a slice
my @VeryLongDecompData;
# Decomposition string -> index of its first unit in @VeryLongDecompData
my %VeryLongDecompMap;

# The generated code contains a hash table mapping Unicode values to indices
# into the other tables. %Index holds the same mapping in Perl.
my %Index;
# Hash table slot -> Unicode value stored in that slot
my %HashTableEntryContents;
# Unicode value -> hash table slot in which it is stored
my %HashTableEntry;
sl@0	132
# Bind a unicode value to an index into the tables, and claim a slot for it
# in the hash table.
#
# Arguments: the Unicode value, and the table index it is to be bound to.
# Effects:   records the index in %Index and the chosen slot in
#            %HashTableEntryContents / %HashTableEntry.
# Dies:      if every slot of the hash table is already occupied (the probe
#            loop below could otherwise never terminate).
sub AddHashValue {
    my ($unicode, $index) = @_;

    die sprintf('Hash table is full (%d slots), cannot add %X: increase $LgHashTableSize',
            $HashTableSize, $unicode)
        if $HashTableSize <= scalar(keys %HashTableEntryContents);

    $Index{$unicode} = $index;

    # Step from the start position, wrapping round the table, until a free
    # slot turns up.
    my $pos = HashStart($unicode);
    my $step = HashStep($unicode);
    while (exists $HashTableEntryContents{$pos}) {
        $pos = ($pos + $step) % $HashTableSize;
    }

    $HashTableEntryContents{$pos} = $unicode;
    $HashTableEntry{$unicode} = $pos;
}
sl@0	151
# Bind each value of a list to consecutive indices, the first value getting
# the index passed as the first argument.
# Returns the first index that was not used, i.e. the next slot to be filled.
sub AddListToHash {
    my ($index, @unicodes) = @_;
    foreach my $unicode (@unicodes) {
        AddHashValue($unicode, $index);
        $index++;
    }
    return $index;
}
sl@0	164
# Record the data of one input line in the global data structures.
#
# Arguments: the Unicode value, the exclusion flag (true if the decomposition
# is excluded from composition), and the normalised decomposition and folded
# strings. A character with neither a decomposition nor a fold is ignored.
sub AddCode {
    my ($code, $excluded, $decomposition, $folded) = @_;

    my $hasDecomp = $decomposition ne '';
    my $hasFold   = $folded ne '';
    return unless $hasDecomp || $hasFold;

    $Decomp{$code} = $decomposition;
    $Folded{$code} = $folded;

    # Each character goes into exactly one list; the tests are ordered.
    # A normalised string longer than 4 characters holds more than one
    # UTF16 unit.
    if ($hasDecomp && !$excluded) {
        push @IncludedDecomps, $code;
        $Composition{$decomposition} = $code;
    }
    elsif (length($decomposition) > 4) {
        push @LongExcludedDecomps, $code;
    }
    elsif (length($folded) > 4) {
        push @ShortDecompsLongFolds, $code;
    }
    elsif ($hasDecomp) {
        push @ShortDecompsShortFolds, $code;
    }
    else {
        # No decomposition, so there must be a (singleton) fold.
        push @ShortFoldsOnly, $code;
    }

    # A normalised string longer than 9 characters holds more than two
    # UTF16 units.
    foreach my $sequence ($decomposition, $folded) {
        $VeryLongDecompositions{$sequence} = $code
            if length($sequence) > 9;
    }
}
sl@0	200
# All input arrives on STDIN; any command line argument is a usage error.
if (scalar(@ARGV) != 0) {
    print STDERR "Usage:\nperl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>\n";
    exit 1;
}

# Read the input. Each non-blank line consists of semicolon-separated fields:
# the code, its description, further fields ending in "symbian:" followed by
# an optional E (excluded) flag, one ignored field, the decomposition and
# finally the folded decomposition.
my $lineNo = 0;
# Inside a <..., First> ... <..., Last> range this is the next code of the
# range that still has to be added; outside a range it is 0.
my $inBlock = 0;
while (my $line = <STDIN>) {
    $lineNo++;
    if ($line =~ /^(1?[0-9a-fA-F]{4,5});([^;]*);.*symbian:(E?);[^;]*;([0-9a-fA-F \t]*);([0-9a-fA-F \t]*)[ \t]*$/i) {
        # Copy the captures before any other regex gets a chance to run.
        my ($codeText, $description, $excluded, $rawDecomp, $rawFolded) = ($1, $2, $3, $4, $5);
        my $code = hex $codeText;
        my $decomposition = Normalize($rawDecomp);
        my $folded = Normalize($rawFolded);

        die("Value $codeText too large to be Unicode at line $lineNo.")
            if (0x110000 <= $code);

        # A normalised string is empty or consists of 4-digit units joined
        # by single spaces, hence has a length of 4, 9, 14, ...
        die("Normalisation failed with '$decomposition' at line $lineNo.")
            unless (length $decomposition) == 0 || (length $decomposition) % 5 == 4;
        die("Normalisation failed with '$folded' at line $lineNo.")
            unless (length $folded) == 0 || (length $folded) % 5 == 4;

        AddCode($code, $excluded, $decomposition, $folded);

        if ($description =~ /^<.*Last>$/i) {
            die("End of block without start at line $lineNo!")
                if !$inBlock;
            # Fill in the interior of the range. The first and last codes
            # were added when their own lines were read, so the loop must
            # stop short of $code or the last code would be added twice.
            while ($inBlock < $code) {
                AddCode($inBlock, $excluded, $decomposition, $folded);
                $inBlock++;
            }
            $inBlock = 0;
        }
        elsif ($description =~ /^<.*First>$/i) {
            die("Block within block at line $lineNo!")
                if $inBlock;
            $inBlock = $code + 1;
        }
    }
    elsif ($line !~ /^[ \t]*$/) {
        die("Did not understand line $lineNo.");
    }
}
sl@0	253
# We need to construct the data for the table of decompositions of length > 2.
# The longest sequences are placed first so that any sequence which is a
# prefix of an earlier one can share its storage. Sequences of equal length
# are ordered by their text so that the generated table does not depend on
# Perl's hash ordering and is the same on every run.
foreach my $decomp (sort { length($b) <=> length($a) or $a cmp $b } keys %VeryLongDecompositions) {
    # Already present, as a prefix of a longer sequence
    next if exists $VeryLongDecompMap{$decomp};

    my $newPos = scalar @VeryLongDecompData;
    # The generated tables hold this offset in 12 bits.
    die("Long decomposition data too large: offset $newPos does not fit in 12 bits")
        if 0xFFF < $newPos;
    $VeryLongDecompMap{$decomp} = $newPos;
    push @VeryLongDecompData, split(' ', $decomp);

    # Every prefix that is itself longer than two units starts at the same
    # place, so register those as well.
    my $prefix = $decomp;
    while ($prefix =~ /^([0-9A-F]{4}( [0-9A-F]{4}){2,}) [0-9A-F]{4}$/) {
        $prefix = $1;
        $VeryLongDecompMap{$prefix} = $newPos;
    }
}
sl@0	273
# The codes with included decompositions must be put into lexicographic
# order of their decompositions. Because every unit is written as exactly
# four upper-case hex digits, comparing the hex strings gives that order.
@IncludedDecomps = sort { $Decomp{$a} cmp $Decomp{$b} } @IncludedDecomps;

# Report how many characters ended up in each list.
my @listSizes = (
    scalar(@IncludedDecomps),
    scalar(@LongExcludedDecomps),
    scalar(@ShortDecompsLongFolds),
    scalar(@ShortDecompsShortFolds),
    scalar(@ShortFoldsOnly),
);
my $listTotal = 0;
$listTotal += $_ foreach @listSizes;
printf STDERR "Included: %d\nLong: %d\nLongFolds: %d\nShort: %d\nShortFoldsOnly: %d\nTOTAL: %d\n",
    @listSizes, $listTotal;
sl@0	285
# Now we populate the hash table. The lists are added one after another so
# that each occupies a contiguous range of indices; the index reached after
# each list is remembered because it marks the boundary between tables.
my $index = 0;

$index = AddListToHash($index, @IncludedDecomps);
my $hashIndexAfterIncludedDecomps = $index;
printf STDERR "after IncludedDecomps index= %d\n", $hashIndexAfterIncludedDecomps;

$index = AddListToHash($index, @LongExcludedDecomps);
my $hashIndexAfterLongExcludeDecomps = $index;
printf STDERR "after LongExcludedDecomps index= %d\n", $hashIndexAfterLongExcludeDecomps;

$index = AddListToHash($index, @ShortDecompsLongFolds);
my $hashIndexAfterShortDecompsLongFolds = $index;
printf STDERR "after ShortDecompsLongFolds index= %d\n", $hashIndexAfterShortDecompsLongFolds;

$index = AddListToHash($index, @ShortDecompsShortFolds);
my $hashIndexAfterShortDecompsShortFolds = $index;
printf STDERR "after ShortDecompsShortFolds index= %d\n", $hashIndexAfterShortDecompsShortFolds;

$index = AddListToHash($index, @ShortFoldsOnly);
my $hashIndexAfterShortFoldsOnly = $index;
printf STDERR "after ShortFoldsOnly index= %d\n", $hashIndexAfterShortFoldsOnly;

# Analyse the hash table to find out the maximum and average time
# taken to find each ASCII character. This has to happen after the table has
# been populated: beforehand %HashTableEntry is empty and every character
# would be reported as found on the first probe.
my $maxAsciiTime = 0;
my $totalAsciiTime = 0;
my $mostDifficultCode = undef;
my $asciiFoundWithoutStepCount = 0;
for my $code (32..126) {
    my $pos = HashStart($code);
    my $step = HashStep($code);
    my $stepCount = 1;
    # "exists" rather than truth: slot 0 is a perfectly valid position.
    if (exists $HashTableEntry{$code}) {
        # Replay the probe sequence used on insertion until the slot that
        # holds the character is reached.
        my $posRequired = $HashTableEntry{$code};
        while ($pos != $posRequired) {
            $pos = ($pos + $step) % $HashTableSize;
            $stepCount++;
        }
    }
    $totalAsciiTime += $stepCount;
    if ($maxAsciiTime < $stepCount) {
        $maxAsciiTime = $stepCount;
        $mostDifficultCode = $code;
    }
    if ($stepCount == 1) {
        $asciiFoundWithoutStepCount++;
    }
}
printf STDERR "Average ASCII search: %f\n", $totalAsciiTime / 95;
printf STDERR "Maximum ASCII search %d for %x: '%c'.\n", $maxAsciiTime, $mostDifficultCode, $mostDifficultCode;
sl@0	343
#
# Output C++ File
#

# Running total of the size of all generated tables, in bytes
my $totalBytes = 0;

# The licence header and the explanation of how the tables fit together,
# one element per output line.
my @headerLines = (
    "// Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).",
    "// All rights reserved.",
    "// This component and the accompanying materials are made available",
    "// under the terms of the License \"Eclipse Public License v1.0\"",
    "// which accompanies this distribution, and is available",
    "// at the URL \"http://www.eclipse.org/legal/epl-v10.html\".",
    "//",
    "// Initial Contributors:",
    "// Nokia Corporation - initial contribution.",
    "//",
    "// Contributors:",
    "//",
    "// Description:",
    "//",
    "// Fold and decomposition tables.",
    "//",
    "// These tables are linked in the following way:",
    "// KUnicodeToIndexHash is a hash table using double hashing for",
    "// conflict resolution. The functions DecompositionHashStart and",
    "// DecompositionHashStep give the start and step values for accessing",
    "// the table. The first probe is at DecompositionHashStart and each",
    "// subsequent probe is offset by DecompositionHashStep. Probes",
    "// continue until either 0 is found (indicating that the Unicode value",
    "// sought has no decompostion (i.e. decomposes to itself)) or a value",
    "// is found that has the sought Unicode value in its lower 20 bits.",
    "//",
    "// In this latter case, the upper 12 bits contain an index into",
    "// one of the following tables, according to the following rules:",
    "//",
    "// In the case of folding:",
    "// If the Index is less than the length of KNonSingletonFolds / 2,",
    "// it is an index into KNonSingletonFolds. If the Index is",
    "// greater than the length of KNonSingletonFolds / 2, then it is an",
    "// index into KSingletonFolds.",
    "//",
    "// In the case of decomposition:",
    "// If the Index is less than the length of KNonSingletonDecompositions / 2,",
    "// it is an index into KNonSingletonDecompositions. If the Index is",
    "// greater than the length of KNonSingletonDecompositions / 2, then it is an",
    "// index into KSingletonDecompositions.",
    "//",
    "// In summary:",
    "// Let Knsf be the length of KNonSingletonFolds / 2,",
    "// let Knsd be the length of KNonSingletonDecompositions / 2,",
    "// let Ksd be the length of KSingletonDecompositions and",
    "// let Ksf be the length of KSingletonFolds.",
    "// Now if you want to fold a character and you have found",
    "// its index 'i' from the KUnicodeToIndexHash, then;",
    "// if (i < Knsf) then look up",
    "//\t\tKNonSingletonFolds[i * 2] and KNonSingletonFolds[i * 2 + 1]",
    "// else if (Knsf <= i < Knsf + Ksf) look up KSingletonFolds[i - Knsf]",
    "// else there is no fold for this character.",
    "//",
    "// Or if you want to decompose the same character, then;",
    "// if (i < Knsd) then look up KNonSingletonDecompositions[i * 2]",
    "//\t\tand KNonSingletonDecompositions[i * 2 + 1]",
    "// else if (Knsd <= i < Knsd + Ksd) look up KSingletonDecompositions[i - Knsd]",
    "// else there is no decomposition for this character.",
    "//",
    "// Your index into KSingletonDecompositions or KSingletonFolds",
    "// yields a single value which is the decomposition or fold.",
    "//",
    "// The KNonSingletonFolds and KNonSingletonDecomposition",
    "// tables are made up of pairs of values. Each pair is either a pair",
    "// of Unicode values that constitute the fold or decomposition, or",
    "// the first value is KLongD and the second has its top 4 bits as the",
    "// length of the decomposition (or folded decomposition) minus 3,",
    "// and its bottom 12 bits as the index into KLongDecompositions",
    "// of where you can find this decomposition.",
    "//",
    "// KLongDecompositions simply contains UTF-16 (Unicode) for",
    "// all the decomposed and folded sequences longer than 4 bytes long.",
    "",
);
foreach my $headerLine (@headerLines) {
    print $headerLine."\n";
}
# Emit the hash table itself. An empty slot is written as 0; an occupied
# slot holds the Unicode value in its lower 20 bits and the index bound to
# that value in its upper 12 bits.
print "// Hash table mapping unicode values to indices into the other tables\n";
print "// in use = ".$hashIndexAfterShortFoldsOnly." entries\n";
print "const unsigned long KUnicodeToIndexHash[$HashTableSize] =\n\t{\n\t";
my @HashTableOutput;
for my $slot (0..($HashTableSize - 1)) {
    my $v = 0;
    if (exists $HashTableEntryContents{$slot}) {
        my $unicode = $HashTableEntryContents{$slot};
        die ('Did not expect a Unicode value > 0xFFFFF')
            if 0xFFFFF < $unicode;
        # Only 12 bits are available above the Unicode value; a larger index
        # would overflow the 32-bit entry.
        die ("Index $Index{$unicode} does not fit in 12 bits")
            if 0xFFF < $Index{$unicode};
        $v = $unicode | ($Index{$unicode} << 20);
    }
    push @HashTableOutput, sprintf('0x%08x', $v);
    $totalBytes += 4;
}
# Eight values per line
for my $n (0..$#HashTableOutput) {
    if ($n != 0) {
        print ((($n & 7) == 0) ? ",\n\t" : ', ');
    }
    print $HashTableOutput[$n];
}
print "\n\t};\n\n";
# Emit the C++ counterparts of HashStart and HashStep, together with the
# bitmask constant they rely on.
print join('',
    "// Hash table access functions\n",
    "const int KDecompositionHashBitmask = $HashTableBitmaskCpp;\n\n",
    "inline int DecompositionHashStart(long a)\n",
    "\t{\n\treturn a & $HashTableBitmaskCpp;\n\t}\n\n",
    "inline int DecompositionHashStep(long a)\n",
    "\t{\n\ta *= a >> $LgHashTableSize;\n",
    "\treturn ((a<<1) + 1) & $HashTableBitmaskCpp;\n\t}\n\n",
);
sl@0	454
# Emit, for each included decomposition in table order, the hash table slot
# of the character it composes to. Eight values per line, two bytes each.
print "// Table mapping KNonSingletonDecompositions to the hash table entry that\n";
print "// indexes it\n";
print "const unsigned short KCompositionMapping[] =\n\t{\n\t";
foreach my $n (0..$#IncludedDecomps) {
    if ($n != 0) {
        my $separator = (($n & 7) == 0) ? ",\n\t" : ', ';
        print $separator;
    }
    my $slot = $HashTableEntry{$IncludedDecomps[$n]};
    printf('0x%04x', $slot);
    $totalBytes += 2;
}
print "\n\t};\n\n";
sl@0	466
# Emit the flat list of UTF16 units making up the long sequences.
# Eight values per line, two bytes each; every value gets a 0x prefix.
print "// Table containing all the decomposition and folding strings longer\n";
print "// than 2 UTF16 characters\n";
print "const unsigned short KLongDecompositions[] =\n\t{\n\t0x";
foreach my $n (0..$#VeryLongDecompData) {
    if ($n != 0) {
        my $separator = (($n & 7) == 0) ? ",\n\t0x" : ', 0x';
        print $separator;
    }
    print $VeryLongDecompData[$n];
    $totalBytes += 2;
}
print "\n\t};\n\n";
sl@0	478
# Emit the explanatory comment, the KLongD marker constant and the opening
# of the KNonSingletonDecompositions array.
print join('',
    "// Table containing decompositions longer than one UTF16 character.\n",
    "// The top of the table contains all compositions, sorted lexicographically.\n",
    "// Any decompositions of length 2 are in the table as a pair of values,\n",
    "// decompositions longer than that are represented by a KLongD followed by\n",
    "// a value whose top four bits indicate the length of the decomposition minus\n",
    "// three and whose bottom 12 bits indicate an index into the KLongDecompositions\n",
    "// array where the decomposition starts.\n",
    "const long KLongD = 0;\n",
    "// sizeof/2 = ".$hashIndexAfterLongExcludeDecomps."\n",
    "const unsigned short KNonSingletonDecompositions[] =\n\t{\n\t",
);
sl@0	489
# Print one entry (a pair of 16-bit values) of a non-singleton table.
#
# Argument: a normalised decomposition or fold string.
#  - Two units are printed as they are.
#  - A single unit is printed followed by the filler 0xFFFF.
#  - Anything longer is printed as KLongD followed by a value holding the
#    length minus three in its top four bits and the offset of the sequence
#    in KLongDecompositions in its bottom twelve bits.
# Dies if the string is not in normalised form, or if a long sequence has
# no recorded offset.
sub PrintNonsingletonDecompTableEntry {
    my ($decomp) = @_;
    if (length $decomp < 10) {
        if ($decomp =~ /\A([0-9A-F]{4}) ([0-9A-F]{4})\z/) {
            print '0x'.$1.', 0x'.$2;
        }
        else {
            die("$decomp expected to be normalized and of length 1 or 2")
                if $decomp !~ /\A[0-9A-F]{4}\z/;
            print '0x'.$decomp.', 0xFFFF';
        }
    }
    else {
        # Without this check a missing entry would silently be printed as
        # offset 0 and point at the wrong sequence.
        die("No long decomposition data recorded for '$decomp'")
            unless exists $VeryLongDecompMap{$decomp};
        # A normalised string of n units is 5n - 1 characters long, so
        # (length - 14) / 5 is n - 3.
        printf ('KLongD, 0x%1X%03X', ((length $decomp) - 14)/5, $VeryLongDecompMap{$decomp});
    }
}
sl@0	511
# Emit the decompositions of the included characters followed by those of
# the characters with long excluded decompositions. Four entries per line,
# four bytes per entry. A separator is written before every entry except
# the very first, whichever list that entry comes from, so that an empty
# first list cannot produce a leading comma.
{
    my $entryNo = 0;
    foreach my $code (@IncludedDecomps, @LongExcludedDecomps) {
        if ($entryNo != 0) {
            print ((($entryNo & 3) == 0) ? ",\n\t" : ', ');
        }
        PrintNonsingletonDecompTableEntry($Decomp{$code});
        $entryNo++;
        $totalBytes += 4;
    }
}
print "\n\t};\n\n";
sl@0	530
# Emit the folds of the included characters, of the characters with long
# excluded decompositions and of the characters with singleton
# decompositions but long folds, in that order. Four entries per line, four
# bytes per entry. A separator is written before every entry except the very
# first, whichever list that entry comes from, so that empty leading lists
# cannot produce a leading comma.
print "// Table of folded decompositions which either have more than one UTF16, or\n";
print "// their normal decompositions have more than one UTF16\n";
print "// sizeof/2 = ".$hashIndexAfterShortDecompsLongFolds."\n";
print "const unsigned short KNonSingletonFolds[] =\n\t{\n\t";
{
    my $entryNo = 0;
    foreach my $code (@IncludedDecomps, @LongExcludedDecomps, @ShortDecompsLongFolds) {
        if ($entryNo != 0) {
            print ((($entryNo & 3) == 0) ? ",\n\t" : ', ');
        }
        PrintNonsingletonDecompTableEntry($Folded{$code});
        $entryNo++;
        $totalBytes += 4;
    }
}
print "\n\t};\n\n";
sl@0	560
# Emit the singleton decompositions: first for the characters with long
# folds, then for those with singleton folds. Eight values per line.
print "// Table of singleton decompositions and characters with singleton folds\n";
print "// Note for Unicode 5.0:\n";
print "// Unicode 5.0 contains some non-BMP characters have non-BMP \"singleton\" folds.\n";
print "// As per the algorithm of this file, the non-BMP character should be stored in \n";
print "// this table. \"Unsigned short\" is not big enough to hold them. However, this \n";
print "// \"character\" information is not useful. So we just store 0xFFFF instead. \n";
print "// Please do check 0xFFFF when access this table. If meet 0xFFFF, that means \n";
print "// your character has no decomposition.\n";
print "// See the variable \"ShortDecompsLongFolds\" in FoldAndDecompTables.pl if you \n";
print "// want to know more.\n";
print "// sizeof = ".($hashIndexAfterShortDecompsShortFolds-$hashIndexAfterLongExcludeDecomps)."\n";
print "const unsigned short KSingletonDecompositions[] =\n\t{\n\t0x";
{
    my $entryNo = 0;
    foreach my $code (@ShortDecompsLongFolds, @ShortDecompsShortFolds) {
        if ($entryNo != 0) {
            print ((($entryNo & 7) == 0) ? ",\n\t0x" : ', 0x');
        }
        if (exists $Decomp{$code} && $Decomp{$code} ne '') {
            print $Decomp{$code};
        }
        else {
            # No decomposition (only possible for the characters with long
            # folds). Don't take these 0xFFFF as character.
            print 'FFFF';
        }
        $entryNo++;
        # Each entry is a single unsigned short, i.e. two bytes.
        $totalBytes += 2;
    }
}
print "\n\t};\n\n";
sl@0	601
# Emit the singleton folds: first for the characters that also have a
# singleton decomposition, then for those with a fold only. Eight values per
# line. A separator is written before every entry except the very first,
# whichever list that entry comes from.
# NOTE(review): a character with a singleton decomposition but an empty fold
# would be printed as a bare "0x" here - confirm the input never contains one.
print "// Table of singleton folds\n";
print "// sizeof = ".($hashIndexAfterShortFoldsOnly-$hashIndexAfterShortDecompsLongFolds)."\n";
print "const unsigned short KSingletonFolds[] =\n\t{\n\t0x";
{
    my $entryNo = 0;
    foreach my $code (@ShortDecompsShortFolds, @ShortFoldsOnly) {
        if ($entryNo != 0) {
            print ((($entryNo & 7) == 0) ? ",\n\t0x" : ', 0x');
        }
        print $Folded{$code};
        $entryNo++;
        # Each entry is a single unsigned short, i.e. two bytes.
        $totalBytes += 2;
    }
}
print "\n\t};\n";

# Report the combined size of the generated tables, both in the generated
# file and on STDERR.
print "\n// Total size: $totalBytes bytes\n";
print STDERR $totalBytes, " bytes\n";

author	sl@SLION-WIN7.fritz.box
	Fri, 15 Jun 2012 03:10:57 +0200
changeset 0	bde4ae8d615e
permissions	-rw-r--r--