os/kernelhwsrv/kernel/eka/euser/unicode/perl/FoldAndDecompTables.pl
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/kernelhwsrv/kernel/eka/euser/unicode/perl/FoldAndDecompTables.pl	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,625 @@
     1.4 +# Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.5 +# All rights reserved.
     1.6 +# This component and the accompanying materials are made available
     1.7 +# under the terms of the License "Eclipse Public License v1.0"
     1.8 +# which accompanies this distribution, and is available
     1.9 +# at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.10 +#
    1.11 +# Initial Contributors:
    1.12 +# Nokia Corporation - initial contribution.
    1.13 +#
    1.14 +# Contributors:
    1.15 +#
    1.16 +# Description:
    1.17 +# Creates C++ code describing how to decompose, compose and fold each character.
    1.18 +# Usage:
    1.19 +# perl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>
    1.20 +# Tables we want to create:
    1.21 +# A: Ordered list of non-excluded decompositions
    1.22 +# B: List of folded decompositions matching A
    1.23 +# C: List of decompositions not listed in A of length > 1
    1.24 +# D: List of folded decompositions matching C
    1.25 +# E: List of decompositions of length = 1 whose matching folded decompositions
    1.26 +# are of length > 1
    1.27 +# F: List of folded decompositions matching E
    1.28 +# G: List of decompositions of length = 1 with matching folded decompositions
     1.29 +# H: List of folded decompositions matching G
    1.30 +# I: List of folded decompositions that do not have matching decompositions
    1.31 +# J: List of decompositions (folding and otherwise) of length > 2
    1.32 +# K: Hash table mapping Unicode value to its folded decomposition value in the
    1.33 +# concatenated list B-D-F-H-I
    1.34 +# L: List of hash slots in K matching A (providing a mapping from non-excluded
    1.35 +# decompositions to Unicode value)
    1.36 +# [all lengths are of UTF16 strings]
    1.37 +# 
    1.38 +#
    1.39 +
    1.40 +use strict;
    1.41 +
    1.42 +#
    1.43 +# Hash table:
    1.44 +#
    1.45 +
    1.46 +# Size of hashing table = 1 to the power $LgHashTableSize
    1.47 +my $LgHashTableSize = 12;
    1.48 +
    1.49 +# Do not change these next two values!
    1.50 +my $HashTableSize = 1 << $LgHashTableSize;
    1.51 +my $HashTableBitmaskCpp = sprintf('0x%x', $HashTableSize - 1);
    1.52 +
    1.53 +# Hashing function in Perl: Getting the initial search position
    1.54 +sub HashStart
    1.55 +	{
    1.56 +	return $_[0] & ($HashTableSize - 1);
    1.57 +	}
    1.58 +# How far to step through each time
    1.59 +sub HashStep
    1.60 +	{
    1.61 +	my ($code) = @_;
    1.62 +	$code *= $code >> $LgHashTableSize;
    1.63 +	return ($code * 2 + 1) & ($HashTableSize - 1);
    1.64 +	}
    1.65 +
    1.66 +# Make sure input string is all hex numbers separated by single spaces with
    1.67 +# each hex number having 4 digits and decomposed into UTF16
    1.68 +sub Normalize
    1.69 +	{
    1.70 +	my ($string) = @_;
    1.71 +	if ($string =~ /^([0-9A-F]{4}( [0-9A-F]{4})*)?$/)
    1.72 +		{
    1.73 +		return $string;
    1.74 +		}
    1.75 +	my $norm = '';
    1.76 +	foreach my $elt (split(' ', $string))
    1.77 +		{
    1.78 +		if ($elt)
    1.79 +			{
    1.80 +			die "'$elt' is not a hex number"
    1.81 +				unless $elt =~ /[0-9a-fA-F]+/;
    1.82 +			$norm = $norm.' '
    1.83 +				unless $norm eq '';
    1.84 +			$elt = hex $elt;
    1.85 +			if ($elt < 0x10000)
    1.86 +				{
    1.87 +				$norm = $norm.(sprintf('%04X', $elt));
    1.88 +				}
    1.89 +			else
    1.90 +				{
    1.91 +				# Add a surrogate pair
    1.92 +				$norm = $norm.(sprintf('%04X %04X',
    1.93 +					($elt / 0x400) + 0xD7C0, ($elt % 0x400) + 0xDC00));
    1.94 +				}
    1.95 +			}
    1.96 +		}
    1.97 +	#print STDERR "'$string' normalized to '$norm'\n";
    1.98 +	return $norm;
    1.99 +	}
   1.100 +
   1.101 +# First stage:
   1.102 +# Hash of Unicode values to normalised decomposition and folded strings
   1.103 +my %Decomp = ();
   1.104 +my %Folded = ();
   1.105 +# Mapping from decomposition->char, if not excluded
   1.106 +my %Composition = ();
   1.107 +# characters with non-excluded decompositions
   1.108 +my @IncludedDecomps = ();
   1.109 +# characters with long (>1 UTF16) excluded decompositions
   1.110 +my @LongExcludedDecomps = ();
   1.111 +# characters with singleton decompositions but long folds
   1.112 +my @ShortDecompsLongFolds = ();
   1.113 +# characters with singleton folds and singleton
   1.114 +my @ShortDecompsShortFolds = ();
   1.115 +# characters with singleton folds but no decomps
   1.116 +my @ShortFoldsOnly = ();
   1.117 +
   1.118 +# A mapping from decompositions of length greater than two
   1.119 +# to the code that produced them.
   1.120 +my %VeryLongDecompositions = ();
   1.121 +
   1.122 +# A list of characters containing all decompositions of length >2 as slices
   1.123 +my @VeryLongDecompData = ();
   1.124 +# Mapping from decomposition->index into VeryLongDecompData
   1.125 +my %VeryLongDecompMap = ();
   1.126 +
   1.127 +# There will be a hash table mapping Unicode values to indices into the other
   1.128 +# tables. %Index maps the same thing in Perl.
   1.129 +my %Index = ();
   1.130 +# %HashTableEntryContents maps the table entries to the Unicode values they
   1.131 +# contain.
   1.132 +my %HashTableEntryContents = ();
   1.133 +# %HashTableEntry maps Unicode value to the entry in the hash table
   1.134 +my %HashTableEntry = ();
   1.135 +
   1.136 +# Bind a unicode value to an index into the tables
   1.137 +sub AddHashValue
   1.138 +	{
   1.139 +	my ($unicode, $index) = @_;
   1.140 +	$Index{$unicode} = $index;
   1.141 +	my $pos = HashStart($unicode);
   1.142 +	my $step = HashStep($unicode);
   1.143 +	while (exists $HashTableEntryContents{$pos})
   1.144 +		{
   1.145 +		$pos += $step;
   1.146 +		if ($HashTableSize <= $pos)
   1.147 +			{
   1.148 +			$pos %= $HashTableSize;
   1.149 +			}
   1.150 +		}
   1.151 +	$HashTableEntryContents{$pos} = $unicode;
   1.152 +	$HashTableEntry{$unicode} = $pos;
   1.153 +	}
   1.154 +
   1.155 +# Bind a whole array to the indices starting from that given as the first
   1.156 +# argument. Returns the index of the next slot to be filled.
   1.157 +sub AddListToHash
   1.158 +	{
   1.159 +	my ($index, @unicodes) = @_;
   1.160 +	while (@unicodes)
   1.161 +		{
   1.162 +		AddHashValue(shift @unicodes, $index);
   1.163 +		$index++;
   1.164 +		}
   1.165 +	return $index;
   1.166 +	}
   1.167 +
   1.168 +# put the results of a read line into the data structures
   1.169 +sub AddCode
   1.170 +	{
   1.171 +	my ($code, $excluded, $decomposition, $folded) = @_;
   1.172 +	return if ($decomposition eq '' && $folded eq '');
   1.173 +	$Decomp{$code} = $decomposition;
   1.174 +	$Folded{$code} = $folded;
   1.175 +
   1.176 +	if (!$excluded && $decomposition ne '')
   1.177 +		{
   1.178 +		push @IncludedDecomps, $code;
   1.179 +		$Composition{$decomposition} = $code;
   1.180 +		}
   1.181 +	elsif (4 < length $decomposition)
   1.182 +		{
   1.183 +		push @LongExcludedDecomps, $code;
   1.184 +		}
   1.185 +	elsif (4 < length $folded)
   1.186 +		{
   1.187 +		push @ShortDecompsLongFolds, $code;
   1.188 +		}
   1.189 +	elsif ($decomposition ne '')
   1.190 +		{
   1.191 +		push @ShortDecompsShortFolds, $code;
   1.192 +		}
   1.193 +	elsif ($folded ne '')
   1.194 +		{
   1.195 +		push @ShortFoldsOnly, $code;
   1.196 +		}
   1.197 +
   1.198 +	$VeryLongDecompositions{$decomposition} = $code
   1.199 +		if (9 < length $decomposition);
   1.200 +	$VeryLongDecompositions{$folded} = $code
   1.201 +		if (9 < length $folded);
   1.202 +	}
   1.203 +
   1.204 +if (scalar(@ARGV) != 0)
   1.205 +	{
   1.206 +	print (STDERR "Usage:\nperl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>\n");
   1.207 +	exit 1;
   1.208 +	}
   1.209 +
   1.210 +my $lineNo = 0;
   1.211 +my $inBlock = 0;
   1.212 +while(<STDIN>)
   1.213 +	{
   1.214 +	$lineNo++;
   1.215 +	if (/^(1?[0-9a-fA-F]{4,5});([^;]*);.*symbian:(E?);[^;]*;([0-9a-fA-F \t]*);([0-9a-fA-F \t]*)[ \t]*$/i)
   1.216 +		{
   1.217 +		my $code = hex $1;
   1.218 +		my $description = $2;
   1.219 +		my $excluded = $3;
   1.220 +		my $decomposition = Normalize($4);
   1.221 +		my $folded = Normalize($5);
   1.222 +
   1.223 +		die ("Value $1 too large to be Unicode at line $lineNo.")
   1.224 +			if (0x110000 <= $code);
   1.225 +
   1.226 +		die("Normalisation failed with '$decomposition' at line $lineNo.")
   1.227 +			unless (length $decomposition) == 0 || (length $decomposition) % 5 == 4;
   1.228 +		die("Normalisation failed with '$folded' at line $lineNo.")
   1.229 +			unless (length $folded) == 0 || (length $folded) % 5 == 4;
   1.230 +
   1.231 +		AddCode($code, $excluded, $decomposition, $folded);
   1.232 +
   1.233 +		if ($description =~ /^<.*Last>$/i)
   1.234 +			{
   1.235 +			die("End of block without start at line $lineNo!")
   1.236 +				if !$inBlock;
   1.237 +			while ($inBlock <= $code)
   1.238 +				{
   1.239 +				AddCode($inBlock, $excluded, $decomposition, $folded);
   1.240 +				$inBlock++;
   1.241 +				}
   1.242 +			$inBlock = 0;
   1.243 +			}
   1.244 +		elsif ($description =~ /^<.*First>$/i)
   1.245 +			{
   1.246 +			die("Block within block at line $lineNo!")
   1.247 +				if $inBlock;
   1.248 +			$inBlock = $code + 1;
   1.249 +			}
   1.250 +		}
   1.251 +	elsif (!/^[ \t]*$/)
   1.252 +		{
   1.253 +		die("Did not understand line $lineNo.");
   1.254 +		}
   1.255 +	}
   1.256 +
   1.257 +# We need to construct the data for the table of decompositions of length > 2.
   1.258 +foreach my $decomp (sort {length $::b <=> length $::a} keys %VeryLongDecompositions)
   1.259 +	{
   1.260 +	if (!exists $VeryLongDecompMap{$decomp})
   1.261 +		{
   1.262 +		# Does not already exist
   1.263 +		my $newPos = scalar @VeryLongDecompData;
   1.264 +		$VeryLongDecompMap{$decomp} = $newPos;
   1.265 +		foreach my $code (split(' ', $decomp))
   1.266 +			{
   1.267 +			push @VeryLongDecompData, $code;
   1.268 +			}
   1.269 +		while ($decomp =~ /^([0-9A-F]{4}( [0-9A-F]{4}){2,}) [0-9A-F]{4}$/)
   1.270 +			{
   1.271 +			$decomp = $1;
   1.272 +			$VeryLongDecompMap{$decomp} = $newPos;
   1.273 +			}
   1.274 +		}
   1.275 +	}
   1.276 +
   1.277 +# We need to sort the codes for included decompositions into lexicographic
   1.278 +# order of their decompositions.
   1.279 +# This, luckily, is the same as sorting the strings that represent their
   1.280 +# decompositions in hex lexicographically.
   1.281 +@IncludedDecomps = sort {$Decomp{$::a} cmp $Decomp{$::b}} @IncludedDecomps;
   1.282 +
   1.283 +print (STDERR 'Included: ', scalar(@IncludedDecomps), "\nLong: ", scalar(@LongExcludedDecomps));
   1.284 +print(STDERR "\nLongFolds: ", scalar(@ShortDecompsLongFolds), "\nShort: ", scalar(@ShortDecompsShortFolds));
   1.285 +print(STDERR "\nShortFoldsOnly: ", scalar(@ShortFoldsOnly), "\nTOTAL: ");
   1.286 +print STDERR (scalar(@IncludedDecomps) + scalar(@LongExcludedDecomps) + scalar(@ShortDecompsLongFolds) + scalar(@ShortDecompsShortFolds) + scalar(@ShortFoldsOnly));
   1.287 +print STDERR "\n";
   1.288 +
   1.289 +# Analyse the hash table to find out the maximum and average time
   1.290 +# taken to find each ASCII character
   1.291 +my $maxAsciiTime = 0;
   1.292 +my $totalAsciiTime = 0;
   1.293 +my $mostDifficultCode = undef;
   1.294 +my $asciiFoundWithoutStepCount = 0;
   1.295 +for (32..126)
   1.296 +	{
   1.297 +	my $code = $_;
   1.298 +	my $pos = HashStart($code);
   1.299 +	my $step = HashStep($code);
   1.300 +	my $stepCount = 1;
   1.301 +	if ($HashTableEntry{$code})
   1.302 +		{
   1.303 +		my $posRequired = $HashTableEntry{$code};
   1.304 +		while ($pos != $posRequired)
   1.305 +			{
   1.306 +			$pos = ($pos + $step) % $HashTableSize;
   1.307 +			$stepCount++;
   1.308 +			}
   1.309 +		}
   1.310 +	$totalAsciiTime += $stepCount;
   1.311 +	if ($maxAsciiTime < $stepCount)
   1.312 +		{
   1.313 +		$maxAsciiTime = $stepCount;
   1.314 +		$mostDifficultCode = $code;
   1.315 +		}
   1.316 +	if ($stepCount == 1)
   1.317 +		{
   1.318 +		$asciiFoundWithoutStepCount++;
   1.319 +		}
   1.320 +	}
   1.321 +printf (STDERR "Average ASCII search: %f\n", $totalAsciiTime / 95);
   1.322 +printf (STDERR "Maximum ASCII search %d for %x: '%c'.\n", $maxAsciiTime, $mostDifficultCode, $mostDifficultCode);
   1.323 +
   1.324 +# Now we populate the hash table
   1.325 +my $index = 0;
   1.326 +
   1.327 +$index = AddListToHash($index, @IncludedDecomps);
   1.328 +my $hashIndexAfterIncludedDecomps = $index;
   1.329 +printf (STDERR "after IncludedDecomps index= %d\n", $hashIndexAfterIncludedDecomps);
   1.330 +
   1.331 +$index = AddListToHash($index, @LongExcludedDecomps);
   1.332 +my $hashIndexAfterLongExcludeDecomps = $index;
   1.333 +printf (STDERR "after LongExcludedDecomps index= %d\n", $hashIndexAfterLongExcludeDecomps);
   1.334 +
   1.335 +$index = AddListToHash($index, @ShortDecompsLongFolds);
   1.336 +my $hashIndexAfterShortDecompsLongFolds = $index;
   1.337 +printf (STDERR "after ShortDecompsLongFolds index= %d\n", $hashIndexAfterShortDecompsLongFolds);
   1.338 +
   1.339 +$index = AddListToHash($index, @ShortDecompsShortFolds);
   1.340 +my $hashIndexAfterShortDecompsShortFolds = $index;
   1.341 +printf (STDERR "after ShortDecompsShortFolds index= %d\n", $hashIndexAfterShortDecompsShortFolds);
   1.342 +
   1.343 +$index = AddListToHash($index, @ShortFoldsOnly);
   1.344 +my $hashIndexAfterShortFoldsOnly = $index;
   1.345 +printf (STDERR "after ShortFoldsOnly index= %d\n", $hashIndexAfterShortFoldsOnly);
   1.346 +
   1.347 +#
   1.348 +# Output C++ File
   1.349 +#
   1.350 +my $totalBytes = 0;
   1.351 +
   1.352 +print "// Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).\n";
   1.353 +print "// All rights reserved.\n";
   1.354 +print "// This component and the accompanying materials are made available\n";
   1.355 +print "// under the terms of the License \"Eclipse Public License v1.0\"\n";
   1.356 +print "// which accompanies this distribution, and is available\n";
   1.357 +print "// at the URL \"http://www.eclipse.org/legal/epl-v10.html\".\n";
   1.358 +print "//\n";
   1.359 +print "// Initial Contributors:\n";
   1.360 +print "// Nokia Corporation - initial contribution.\n";
   1.361 +print "//\n";
   1.362 +print "// Contributors:\n";
   1.363 +print "//\n";
   1.364 +print "// Description:\n";
   1.365 +print "//\n";
   1.366 +print "// Fold and decomposition tables.\n";
   1.367 +print "//\n";
   1.368 +print "// These tables are linked in the following way:\n";
   1.369 +print "// KUnicodeToIndexHash is a hash table using double hashing for\n";
   1.370 +print "// conflict resolution. The functions DecompositionHashStart and\n";
   1.371 +print "// DecompositionHashStep give the start and step values for accessing\n";
   1.372 +print "// the table. The first probe is at DecompositionHashStart and each\n";
   1.373 +print "// subsequent probe is offset by DecompositionHashStep. Probes\n";
   1.374 +print "// continue until either 0 is found (indicating that the Unicode value\n";
   1.375 +print "// sought has no decompostion (i.e. decomposes to itself)) or a value\n";
   1.376 +print "// is found that has the sought Unicode value in its lower 20 bits.\n";
   1.377 +print "//\n";
   1.378 +print "// In this latter case, the upper 12 bits contain an index into\n";
   1.379 +print "// one of the following tables, according to the following rules:\n";
   1.380 +print "//\n";
   1.381 +print "// In the case of folding:\n";
   1.382 +print "// If the Index is less than the length of KNonSingletonFolds / 2,\n";
   1.383 +print "// it is an index into KNonSingletonFolds. If the Index is\n";
   1.384 +print "// greater than the length of KNonSingletonFolds / 2, then it is an\n";
   1.385 +print "// index into KSingletonFolds.\n";
   1.386 +print "//\n";
   1.387 +print "// In the case of decomposition:\n";
   1.388 +print "// If the Index is less than the length of KNonSingletonDecompositions / 2,\n";
   1.389 +print "// it is an index into KNonSingletonDecompositions. If the Index is\n";
   1.390 +print "// greater than the length of KNonSingletonDecompositions / 2, then it is an\n";
   1.391 +print "// index into KSingletonDecompositions.\n";
   1.392 +print "//\n";
   1.393 +print "// In summary:\n";
   1.394 +print "// Let Knsf be the length of KNonSingletonFolds / 2,\n";
   1.395 +print "// let Knsd be the length of KNonSingletonDecompositions / 2,\n";
   1.396 +print "// let Ksd be the length of KSingletonDecompositions and\n";
   1.397 +print "// let Ksf be the length of KSingletonFolds.\n";
   1.398 +print "// Now if you want to fold a character and you have found\n";
   1.399 +print "// its index 'i' from the KUnicodeToIndexHash, then;\n";
   1.400 +print "// if (i < Knsf) then look up\n";
   1.401 +print "//\t\tKNonSingletonFolds[i * 2] and KNonSingletonFolds[i * 2 + 1]\n";
   1.402 +print "// else if (Knsf <= i < Knsf + Ksf) look up KSingletonFolds[i - Knsf]\n";
   1.403 +print "// else there is no fold for this character.\n";
   1.404 +print "//\n";
   1.405 +print "// Or if you want to decompose the same character, then;\n";
   1.406 +print "// if (i < Knsd) then look up KNonSingletonDecompositions[i * 2]\n";
   1.407 +print "//\t\tand KNonSingletonDecompositions[i * 2 + 1]\n";
   1.408 +print "// else if (Knsd <= i < Knsd + Ksd) look up KSingletonDecompositions[i - Knsd]\n";
   1.409 +print "// else there is no decomposition for this character.\n";
   1.410 +print "//\n";
   1.411 +print "// Your index into KSingletonDecompositions or KSingletonFolds\n";
   1.412 +print "// yields a single value which is the decomposition or fold.\n";
   1.413 +print "//\n";
   1.414 +print "// The KNonSingletonFolds and KNonSingletonDecomposition\n";
   1.415 +print "// tables are made up of pairs of values. Each pair is either a pair\n";
   1.416 +print "// of Unicode values that constitute the fold or decomposition, or\n";
   1.417 +print "// the first value is KLongD and the second has its top 4 bits as the\n";
   1.418 +print "// length of the decomposition (or folded decomposition) minus 3,\n";
   1.419 +print "// and its bottom 12 bits as the index into KLongDecompositions\n";
   1.420 +print "// of where you can find this decomposition.\n";
   1.421 +print "//\n";
   1.422 +print "// KLongDecompositions simply contains UTF-16 (Unicode) for\n";
   1.423 +print "// all the decomposed and folded sequences longer than 4 bytes long.\n";
   1.424 +print "\n";
   1.425 +print "// Hash table mapping unicode values to indices into the other tables\n";
   1.426 +print "// in use = ".$hashIndexAfterShortFoldsOnly." entries\n";
   1.427 +print "const unsigned long KUnicodeToIndexHash[$HashTableSize] =\n\t{\n\t";
   1.428 +my @HashTableOutput;
   1.429 +for (0..($HashTableSize - 1))
   1.430 +	{
   1.431 +	my $v = 0;
   1.432 +	if (exists $HashTableEntryContents{$_})
   1.433 +		{
   1.434 +		$v = $HashTableEntryContents{$_};
   1.435 +		die ('Did not expect a Unicode value > 0xFFFFF')
   1.436 +			if 0xFFFFF < $v;
   1.437 +		$v |= ($Index{$v}) << 20;
   1.438 +		}
   1.439 +	push @HashTableOutput, sprintf('0x%08x', $v);
   1.440 +	$totalBytes += 4;
   1.441 +	}
   1.442 +print (shift @HashTableOutput);
   1.443 +my $valueCount = 0;
   1.444 +foreach my $v (@HashTableOutput)
   1.445 +	{
   1.446 +	print (((++$valueCount & 7) == 0)? ",\n\t" : ', ');
   1.447 +	print $v;
   1.448 +	}
   1.449 +print "\n\t};\n\n";
   1.450 +print "// Hash table access functions\n";
   1.451 +print "const int KDecompositionHashBitmask = $HashTableBitmaskCpp;\n\n";
   1.452 +print "inline int DecompositionHashStart(long a)\n";
   1.453 +print "\t{\n\treturn a & $HashTableBitmaskCpp;\n\t}\n\n";
   1.454 +print "inline int DecompositionHashStep(long a)\n";
   1.455 +print "\t{\n\ta *= a >> $LgHashTableSize;\n";
   1.456 +print "\treturn ((a<<1) + 1) & $HashTableBitmaskCpp;\n\t}\n\n";
   1.457 +
   1.458 +print "// Table mapping KNonSingletonDecompositions to the hash table entry that\n";
   1.459 +print "// indexes it\n";
   1.460 +print "const unsigned short KCompositionMapping[] =\n\t{\n\t";
   1.461 +for (0..(scalar(@IncludedDecomps - 1)))
   1.462 +	{
   1.463 +	if ($_ != 0)
   1.464 +		{print (($_ & 7) == 0? ",\n\t" : ', ')}
   1.465 +	printf( '0x%04x', $HashTableEntry{$IncludedDecomps[$_]} );
   1.466 +	$totalBytes += 2;
   1.467 +	}
   1.468 +print "\n\t};\n\n";
   1.469 +
   1.470 +print "// Table containing all the decomposition and folding strings longer\n";
   1.471 +print "// than 2 UTF16 characters\n";
   1.472 +print "const unsigned short KLongDecompositions[] =\n\t{\n\t0x";
   1.473 +for(0..(scalar(@VeryLongDecompData) - 1))
   1.474 +	{
   1.475 +	if ($_ != 0)
   1.476 +		{print (($_ & 7) == 0?",\n\t0x" : ', 0x')}
   1.477 +	print $VeryLongDecompData[$_];
   1.478 +	$totalBytes += 2;
   1.479 +	}
   1.480 +print "\n\t};\n\n";
   1.481 +
   1.482 +print "// Table containing decompositions longer than one UTF16 character.\n";
   1.483 +print "// The top of the table contains all compositions, sorted lexicographically.\n";
   1.484 +print "// Any decompositions of length 2 are in the table as a pair of values,\n";
   1.485 +print "// decompositions longer than that are represented by a KLongD followed by\n";
   1.486 +print "// a value whose top four bits indicate the length of the decomposition minus\n";
   1.487 +print "// three and whose bottom 12 bits indicate an index into the KLongDecompositions\n";
   1.488 +print "// array where the decomposition starts.\n";
   1.489 +print "const long KLongD = 0;\n";
   1.490 +print "// sizeof/2 = ".$hashIndexAfterLongExcludeDecomps."\n";
   1.491 +print "const unsigned short KNonSingletonDecompositions[] =\n\t{\n\t";
   1.492 +
   1.493 +sub PrintNonsingletonDecompTableEntry
   1.494 +	{
   1.495 +	my ($decomp) = @_;
   1.496 +	if (length $decomp < 10)
   1.497 +		{
   1.498 +		if ($decomp =~ /([0-9A-F]{4}) ([0-9A-F]{4})/)
   1.499 +			{
   1.500 +			print '0x'.$1.', 0x'.$2;
   1.501 +			}
   1.502 +		else
   1.503 +			{
   1.504 +			die("$decomp expected to be normalized and of length 1 or 2")
   1.505 +				if $decomp !~ /[0-9A-F]{4}/;
   1.506 +			print '0x'.$decomp.', 0xFFFF';
   1.507 +			}
   1.508 +		}
   1.509 +	else
   1.510 +		{
   1.511 +		printf ('KLongD, 0x%1X%03X', ((length $decomp) - 14)/5, $VeryLongDecompMap{$decomp});
   1.512 +		}
   1.513 +	}
   1.514 +
   1.515 +{my $entryNo = 0;
   1.516 +foreach my $code (@IncludedDecomps)
   1.517 +	{
   1.518 +	if ($entryNo != 0)
   1.519 +		{print (($entryNo & 3) == 0?",\n\t" : ', ')}
   1.520 +	PrintNonsingletonDecompTableEntry($Decomp{$code});
   1.521 +	$entryNo++;
   1.522 +	$totalBytes += 4;
   1.523 +	}
   1.524 +foreach my $code (@LongExcludedDecomps)
   1.525 +	{
   1.526 +	print (($entryNo & 3) == 0?",\n\t" : ', ');
   1.527 +	PrintNonsingletonDecompTableEntry($Decomp{$code});
   1.528 +	$entryNo++;
   1.529 +	$totalBytes += 4;
   1.530 +	}
   1.531 +}
   1.532 +print "\n\t};\n\n";
   1.533 +
   1.534 +print "// Table of folded decompositions which either have more than one UTF16, or\n";
   1.535 +print "// their normal decompositions have more than one UTF16\n";
   1.536 +print "// sizeof/2 = ".$hashIndexAfterShortDecompsLongFolds."\n";
   1.537 +print "const unsigned short KNonSingletonFolds[] =\n\t{\n\t";
   1.538 +{my $entryNo = 0;
   1.539 +foreach my $code (@IncludedDecomps)
   1.540 +	{
   1.541 +	if ($entryNo != 0)
   1.542 +		{print (($entryNo & 3) == 0?",\n\t" : ', ')}
   1.543 +	PrintNonsingletonDecompTableEntry($Folded{$code});
   1.544 +	$entryNo++;
   1.545 +	$totalBytes += 4;
   1.546 +	}
   1.547 +foreach my $code (@LongExcludedDecomps)
   1.548 +	{
   1.549 +	print (($entryNo & 3) == 0?",\n\t" : ', ');
   1.550 +	PrintNonsingletonDecompTableEntry($Folded{$code});
   1.551 +	$entryNo++;
   1.552 +	$totalBytes += 4;
   1.553 +	}
   1.554 +foreach my $code (@ShortDecompsLongFolds)
   1.555 +	{
   1.556 +	print (($entryNo & 3) == 0?",\n\t" : ', ');
   1.557 +	PrintNonsingletonDecompTableEntry($Folded{$code});
   1.558 +	$entryNo++;
   1.559 +	$totalBytes += 4;
   1.560 +	}
   1.561 +}
   1.562 +print "\n\t};\n\n";
   1.563 +
   1.564 +print "// Table of singleton decompositions and characters with singleton folds\n";
   1.565 +print "// Note for Unicode 5.0:\n";
   1.566 +print "// Unicode 5.0 contains some non-BMP characters have non-BMP \"singleton\" folds.\n";
   1.567 +print "// As per the algorithm of this file, the non-BMP character should be stored in \n";
   1.568 +print "// this table. \"Unsigned short\" is not big enough to hold them. However, this \n";
   1.569 +print "// \"character\" information is not useful. So we just store 0xFFFF instead. \n";
   1.570 +print "// Please do check 0xFFFF when access this table. If meet 0xFFFF, that means \n";
   1.571 +print "// your character has no decomposition.\n";
   1.572 +print "// See the variable \"ShortDecompsLongFolds\" in FoldAndDecompTables.pl if you \n";
   1.573 +print "// want to know more.\n";
   1.574 +print "// sizeof = ".($hashIndexAfterShortDecompsShortFolds-$hashIndexAfterLongExcludeDecomps)."\n";
   1.575 +print "const unsigned short KSingletonDecompositions[] =\n\t{\n\t0x";
   1.576 +{my $entryNo = 0;
   1.577 +foreach my $code (@ShortDecompsLongFolds)
   1.578 +	{
   1.579 +	if ($entryNo != 0)
   1.580 +		{print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
   1.581 +	if (exists $Decomp{$code} && $Decomp{$code} ne '')
   1.582 +		{
   1.583 +		print $Decomp{$code};
   1.584 +		}
   1.585 +	else
   1.586 +		{
   1.587 +		# Don't take these 0xFFFF as character.
   1.588 +		#printf ('%04X', $code);
   1.589 +		printf ("FFFF");
   1.590 +		}
   1.591 +	$entryNo++;
   1.592 +	$totalBytes += 4;
   1.593 +	}
   1.594 +foreach my $code (@ShortDecompsShortFolds)
   1.595 +	{
   1.596 +	if ($entryNo != 0)
   1.597 +		{print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
   1.598 +	print $Decomp{$code};
   1.599 +	$entryNo++;
   1.600 +	$totalBytes += 4;
   1.601 +	}
   1.602 +}
   1.603 +print "\n\t};\n\n";
   1.604 +
   1.605 +print "// Table of singleton folds\n";
   1.606 +print "// sizeof = ".($hashIndexAfterShortFoldsOnly-$hashIndexAfterShortDecompsLongFolds)."\n";
   1.607 +print "const unsigned short KSingletonFolds[] =\n\t{\n\t0x";
   1.608 +{my $entryNo = 0;
   1.609 +foreach my $code (@ShortDecompsShortFolds)
   1.610 +	{
   1.611 +	if ($entryNo != 0)
   1.612 +		{print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
   1.613 +	print $Folded{$code};
   1.614 +	$entryNo++;
   1.615 +	$totalBytes += 4;
   1.616 +	}
   1.617 +foreach my $code (@ShortFoldsOnly)
   1.618 +	{
   1.619 +	print (($entryNo & 7) == 0?",\n\t0x" : ', 0x');
   1.620 +	print $Folded{$code};
   1.621 +	$entryNo++;
   1.622 +	$totalBytes += 4;
   1.623 +	}
   1.624 +}
   1.625 +print "\n\t};\n";
   1.626 +
   1.627 +print "\n// Total size: $totalBytes bytes\n";
   1.628 +print STDERR $totalBytes, " bytes\n";