First public contribution.
1 # Copyright (c) 2002-2009 Nokia Corporation and/or its subsidiary(-ies).
3 # This component and the accompanying materials are made available
4 # under the terms of the License "Eclipse Public License v1.0"
5 # which accompanies this distribution, and is available
6 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
8 # Initial Contributors:
9 # Nokia Corporation - initial contribution.
14 # Adds folding information to Unicode data
15 # Added as the third field after the 'Symbian:' marker in the following format:
16 # Symbian:<grapheme-role>;<excluded-from-composition>;<folded-form>
17 # where <folded-form> is null or a sequence of hex unicode values
18 # separated by spaces representing the folded form of the character.
20 # perl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>
# Require exactly one command-line argument (the case-folding data file,
# opened below as $ARGV[0]); otherwise print the usage text to STDERR.
26 if (scalar(@ARGV) != 1)
28 print (STDERR "Usage:\nperl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>\n");
# Build the %Fold table (code point -> folded form) from the case-folding
# file named by the first argument. Dies if the file cannot be opened.
32 open(FOLDING, $ARGV[0]) or die("Could not open file $ARGV[0]\n");
# Separate the data part of the line from anything after the first '#'.
# NOTE(review): $_ is presumably the current line read from FOLDING; the
# loop header is not visible in this excerpt - confirm.
40 my ($line, $comment) = split(/#/, $_, 2);
# A data line has three ';'-terminated fields:
#   1. the code point: an optional leading '1' followed by 4-5 hex digits
#   2. a one-letter mapping type drawn from [LEICSFT]
#   3. the folded form: hex digits, possibly separated by spaces/tabs
41 if ($line =~ /^[ \t]*(1?[0-9a-fA-F]{4,5});[ \t]*([LEICSFT]);[ \t]*([0-9a-fA-F][0-9a-fA-F \t]*);[ \t]*$/)
# NOTE(review): $code, $type and $folded are presumably taken from the three
# captures above; the assignments are not visible in this excerpt - confirm.
46 # We'll deal with Turkic mappings with our own hack.
47 # F = Full mappings (fold is longer than one character)
48 # T = I = Turkic mapping
# Keep only mappings whose type is none of F, T or I and whose folded form
# contains no whitespace, i.e. folds to a single value.
49 if ($type !~ /[FTI]/ && $folded !~ /[ \t]/)
# A code point may be given at most one retained mapping; %MappingLine
# remembers where the first one was seen so the error can cite both lines.
51 die ("$code has two mappings: lines $MappingLine{$code} and $lineNo.")
52 if (exists $Fold{$code});
53 $Fold{$code} = $folded;
54 $MappingLine{$code} = $lineNo;
# Anything that is neither a recognised data line nor blank (after the
# comment has been split off) is treated as a fatal format error.
57 elsif ($line !~ /^[ \t]*$/)
59 die ("Did not understand line $lineNo of $ARGV[0]");
66 # Map dotted capital I and dotless small i to lower case i.
67 # This makes all the 'i's fold the same, which isn't very nice for Turkic
68 # languages, but it at least gives us behaviour that is consistent across
69 # locales: dotted I and i map to the same value, I and dotless i map to
70 # the same value, and I and i map to the same value.
# Force U+0130 and U+0131 to fold to U+0069, overriding/adding to whatever
# was loaded from the case-folding file.
# NOTE(review): 0x130 and 0x131 are numeric literals, so the hash keys are
# their decimal string forms ("304" and "305"). These entries are only found
# by later lookups if code points are converted to numbers (e.g. with hex())
# before being used as %Fold keys; that conversion is not visible in this
# excerpt - confirm.
73 $Fold{0x130} = '0069';
74 $Fold{0x131} = '0069';
# Read the Unicode data records from STDIN, append the folded form as an
# extra ';'-separated field, and print each record.
77 while (my $line = <STDIN>)
81 # Split into fields: make sure trailing null strings are not
82 # deleted by adding a dummy final field
83 my @attribute = split(/;/, $line.';dummy');
84 # Delete the dummy field
# NOTE(review): the statement that removes the dummy field is not visible in
# this excerpt; the field counts tested below presumably apply after it.
# Exactly 15 fields means the 'Symbian:' field is absent.
86 die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
87 if (scalar(@attribute) == 15);
# Exactly 16 fields is the expected record shape.
88 if (scalar(@attribute) == 16)
# The 16th field (index 15) must begin with "symbian:", matched
# case-insensitively and allowing leading spaces/tabs.
90 die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
91 if ($attribute[15] !~ /^[ \t]*symbian:/i);
# The first field is the code point: an optional leading '1' followed by
# 4-5 hex digits, with nothing else in the field.
92 my $code = $attribute[0];
93 die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
94 unless $code =~ /^1?[0-9a-fA-F]{4,5}$/;
# Append the folded form as a 17th field (index 16), or an empty string when
# %Fold has no entry for this code point.
# NOTE(review): whether $code is converted (e.g. with hex()) before this
# lookup is not visible in this excerpt - confirm it matches the key form
# used when %Fold was filled.
96 $attribute[16] = exists $Fold{$code}? $Fold{$code} : '';
97 print join(';', @attribute);
# Any non-blank line with a different number of fields is a fatal error.
99 elsif ($line !~ /^[ \t]*$/)
101 die 'Do not understand line '.$lineNo;