1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/kernelhwsrv/kernel/eka/euser/unicode/perl/UnicodeAddFolded.pl Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,108 @@
1.4 +# Copyright (c) 2002-2009 Nokia Corporation and/or its subsidiary(-ies).
1.5 +# All rights reserved.
1.6 +# This component and the accompanying materials are made available
1.7 +# under the terms of the License "Eclipse Public License v1.0"
1.8 +# which accompanies this distribution, and is available
1.9 +# at the URL "http://www.eclipse.org/legal/epl-v10.html".
1.10 +#
1.11 +# Initial Contributors:
1.12 +# Nokia Corporation - initial contribution.
1.13 +#
1.14 +# Contributors:
1.15 +#
1.16 +# Description:
1.17 +# Adds folding information to Unicode data
1.18 +# Added as the third field after the 'Symbian:' marker in the following format:
1.19 +# Symbian:<grapheme-role>;<excluded-from-composition>;<folded-form>
1.20 +# where <folded-form> is null or a sequence of hex unicode values
1.21 +# separated by spaces representing the folded form of the character.
1.22 +# Usage:
1.23 +# perl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>
1.24 +#
1.25 +#
1.26 +
1.27 +use strict;
1.28 +
# Exactly one command-line argument is expected: the path of CaseFolding.txt.
# The data to be annotated arrives on standard input.
if (scalar(@ARGV) != 1)
    {
    print STDERR "Usage:\nperl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>\n";
    exit 1;
    }

# Three-argument open: the name is always treated as a plain file to read,
# never as a mode or pipe specification. $! reports why the open failed.
open(FOLDING, '<', $ARGV[0]) or die("Could not open file $ARGV[0]: $!\n");
1.36 +
# Simple case foldings read from the folding file:
# code point (as a number) -> folded form (hex string exactly as in the file).
my %Fold = ();
# Line of the folding file on which each code point's mapping was seen, so
# that a duplicate can be reported against both lines.
my %MappingLine = ();
my $lineNo = 0;
while (my $entry = <FOLDING>)
    {
    $lineNo++;
    # Everything from the first '#' onwards is commentary; keep only the data.
    my ($line) = split(/#/, $entry, 2);
    if ($line =~ /^[ \t]*(1?[0-9a-fA-F]{4,5});[ \t]*([LEICSFT]);[ \t]*([0-9a-fA-F][0-9a-fA-F \t]*);[ \t]*$/)
        {
        my $code = hex($1);
        my $type = $2;
        my $folded = $3;
        # Only single-character, locale-independent folds are recorded:
        #   F    = full mapping (fold is longer than one character) - skipped
        #   T, I = Turkic mappings - skipped; handled by a separate hack
        # A fold containing whitespace is a multi-character sequence and is
        # skipped as well.
        if ($type !~ /[FTI]/ && $folded !~ /[ \t]/)
            {
            # Report the code point in hex (U+XXXX), matching the notation
            # used in the data file, rather than the decimal value of $code.
            die (sprintf("U+%04X has two mappings: lines %d and %d.",
                    $code, $MappingLine{$code}, $lineNo))
                if (exists $Fold{$code});
            $Fold{$code} = $folded;
            $MappingLine{$code} = $lineNo;
            }
        }
    elsif ($line !~ /^[ \t]*$/)
        {
        # Neither a mapping nor blank/comment-only: refuse to guess.
        die ("Did not understand line $lineNo of $ARGV[0]");
        }
    }

close FOLDING;
1.67 +
# Turkic hack:
# Fold capital I (U+0049), dotted capital I (U+0130) and dotless small i
# (U+0131) all to lower case i (U+0069).
# Every kind of 'i' therefore folds to the same value. That is not ideal
# for Turkic languages, but the behaviour is the same in every locale:
# dotted I matches i, I matches dotless i, and I matches i.
@Fold{0x49, 0x130, 0x131} = ('0069') x 3;
1.78 +
# Copy standard input to standard output, appending the folded form as an
# extra ';'-separated field to every 16-field data line. Blank lines are
# passed through unchanged; anything else is a fatal error.
$lineNo = 0;
while (defined(my $record = <STDIN>))
    {
    chomp $record;
    ++$lineNo;

    # A negative limit makes split keep trailing empty fields.
    my @field = split(/;/, $record, -1);
    my $fieldCount = scalar(@field);

    if ($fieldCount != 15 && $fieldCount != 16)
        {
        # Not a data line: only blank lines are acceptable here.
        die 'Do not understand line '.$lineNo
            if ($record !~ /^[ \t]*$/);
        print $record, "\n";
        next;
        }

    # 15 fields means no 'Symbian:' field was appended; with 16 fields the
    # last one must carry the 'Symbian:' marker.
    die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
        if ($fieldCount == 15 || $field[15] !~ /^[ \t]*symbian:/i);

    my $codeText = $field[0];
    die("First attribute '$codeText' not a valid Unicode codepoint at line $lineNo")
        unless $codeText =~ /^1?[0-9a-fA-F]{4,5}$/;

    # %Fold is keyed by numeric code point; no entry means an empty field.
    my $codePoint = hex($codeText);
    push @field, (exists $Fold{$codePoint} ? $Fold{$codePoint} : '');
    print join(';', @field), "\n";
    }