First public contribution.
2 # Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
4 # This component and the accompanying materials are made available
5 # under the terms of the License "Eclipse Public License v1.0"
6 # which accompanies this distribution, and is available
7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
9 # Initial Contributors:
10 # Nokia Corporation - initial contribution.
16 # UnicodeCompositionEx
17 # adds composition exclusion information to unicode data
19 # Added as a new field:
20 # Symbian:<excluded-from-composition>
21 # where <excluded-from-composition> is E or null.
24 # perl -w UnicodeAddComposeEx.pl CompositionExclusions.txt < <Unicode-data-file>
# Command-line check: exactly one argument is expected, naming the
# composition-exclusions file. The Unicode data file is read from STDIN.
28 if (scalar(@ARGV) != 1)
# Wrong number of arguments: show the expected invocation on STDERR.
30 print (STDERR "Usage:\nperl -w UnicodeAddComposeEx.pl CompositionExclusions.txt < <Unicode-data-file>\n");
# Open the exclusions file on the EXCLUSIONS handle, dying with the file name
# if that fails.
# NOTE(review): this is a two-argument open on a bareword handle, so mode
# characters in the argument would be interpreted by open; a three-argument
# open with an explicit read mode would avoid that.
34 open(EXCLUSIONS, $ARGV[0]) or die("Could not open file $ARGV[0]\n");
# Exclusions-file parsing. A line made up only of blanks and/or a comment is
# skipped; any other line must hold a single hexadecimal code point,
# optionally followed by a comment.
41 # try to parse the line if there is some non-whitespace before the comment
42 if (!/^[ \t]*([#].*)?$/)
# The code point must be 4 to 6 hex digits; a line that does not fit is fatal
# and is reported with its line number and the file name.
44 /^[ \t]*([0-9A-Fa-f]{4,6})[ \t]*([#].*)?$/ or die("Did not understand line $lineNo of $ARGV[0]");
# Reject values of 0x110000 and above.
# NOTE(review): $code and $lineNo are assigned on lines not visible in this
# listing; presumably $code is the numeric value of the captured hex digits.
# Confirm against the full file.
46 die ("Value $code outside Unicode range at line $lineNo of $ARGV[0]")
47 unless ($code < 0x110000);
# Debug trace, disabled.
49 #printf("Excluding %X because it is in the exclusion list\n", $code);
54 # This is a two-pass operation, so we must store the lines ready for output later.
# Input lines, stored verbatim and replayed by the output loop.
55 my @DataFileLines = ();
# Code taken from each data line, keyed by that line's index; the output loop
# looks it up with the same index it uses for @DataFileLines.
56 my %DataFileLineCodes = ();
57 # The first pass will collect all the relevant data:
58 # The first character of the decomposition if there is more than one
59 my %FirstOfDecompositionString = ();
60 # The singleton decomposition if it is a singleton
61 my %SingletonDecomposition = ();
62 # The decomposition tag, if any
# NOTE(review): the comment above presumably describes %DecompTag, which the
# first pass fills in but whose declaration is not visible in this listing.
# The table declared next holds the combining class recorded for each code.
65 my %CombiningClass = ();
66 # We will also be marking all singleton decompositions for exclusion
# First pass: read the Unicode data from STDIN one line at a time, keep every
# line for later output, and record the code, combining class and
# decomposition details of each data line.
68 while (my $line = <STDIN>)
# Keep the raw line so the output loop can print it back unchanged.
71 $DataFileLines[$lineNo] = $line;
73 # Split into fields: make sure trailing null strings are not
74 # deleted by adding a dummy final field
75 my @attribute = split(/;/, $line.';dummy');
76 # Delete the dummy field
# NOTE(review): the statement that removes the dummy field is not visible in
# this listing.
# Only a line with exactly 15 fields is treated as a data line.
79 if (scalar(@attribute) == 15)
# Field 1: the code point, an optional leading 1 followed by 4 or 5 hex digits.
81 my $code = $attribute[0];
82 die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
83 unless $code =~ /^1?[0-9a-fA-F]{4,5}$/;
# Field 4: the combining class, which must lie in the range 0 to 255.
85 my $combiningClass = $attribute[3];
86 die("Fourth attribute '$combiningClass' is not a valid Unicode combining class at line $lineNo")
87 unless (0 <= $combiningClass && $combiningClass < 256);
# Field 6: the decomposition, an optional <tag> followed by hex code points
# separated by blanks.
88 my $decompositionString = $attribute[5];
89 die ("Sixth attribute '$decompositionString' is not a valid decomposition string at line $lineNo")
90 unless ($decompositionString =~ /^(<.*>)?[0-9a-fA-F \t]*$/);
91 my @decomposition = split(/[ \t]+/, $decompositionString);
# A leading <tag> is moved out of the list into %DecompTag, leaving only the
# code points of the mapping.
92 if (@decomposition && $decomposition[0] =~ /^<.*>$/)
94 $DecompTag{$code} = shift @decomposition;
# Exactly one code point left: a singleton decomposition.
96 if (scalar(@decomposition) == 1)
98 # We want to exclude codes such as these, with a singleton
99 # decomposition mapping, but at the moment we don't know if the
100 # character mapped to has a decomposition mapping, so we will
101 # defer this to another stage.
102 die("Decomposition $decomposition[0] not understood at line $lineNo")
103 unless ($decomposition[0] =~ /^[0-9A-Fa-f]+$/);
# The target is stored as a number (hex()), not as the original hex string.
# NOTE(review): later stages use these numbers as hash keys, so the keys of
# these tables must be numeric too; presumably $code is converted on a line
# not visible here. Confirm.
104 $SingletonDecomposition{$code} = hex($decomposition[0]);
# More than one code point: remember only the first of them.
106 elsif (1 < scalar(@decomposition))
108 die("Decomposition $decomposition[0] not understood at line $lineNo")
109 unless ($decomposition[0] =~ /^[0-9A-Fa-f]+$/);
110 $FirstOfDecompositionString{$code} = hex($decomposition[0]);
# Recorded for every data line, with or without a decomposition.
112 $CombiningClass{$code} = $combiningClass;
# Tie the stored line to its code so the output loop can find it.
# NOTE(review): the index used is $lineNo-1 while the line was stored at
# $lineNo above; presumably $lineNo is incremented in between on a line not
# visible here. Confirm.
113 $DataFileLineCodes{$lineNo-1} = $code;
# A line that is neither a 15-field data line nor blank is a fatal error.
115 elsif ($line !~ /^[ \t]*$/)
117 die 'Do not understand line '.$lineNo;
# Works only from the tables built in the first pass.
121 # Each code that has a decomposition string longer than one character
122 # where the first character has non-zero combining class is excluded
123 foreach my $code (keys %FirstOfDecompositionString)
# $decomp is the first character of the decomposition of $code.
125 my $decomp = $FirstOfDecompositionString{$code};
# If no combining class was recorded for that first character, $code is left
# untouched by this step.
126 if (exists($CombiningClass{$decomp}))
# A non-zero combining class marks the first character as a non-starter.
128 if ($CombiningClass{$decomp} != 0)
130 $Excluded{$code} = 1;
# Debug trace, disabled.
131 #printf("Excluding %X because its decomposition starts with a non-starter(%X)\n", $code, $decomp);
136 # Each code that has a singleton decomposition string may be excluded if
137 # that code has only a singleton mapping itself.
138 foreach my $code (sort (keys %SingletonDecomposition))
# Follow the chain of singleton mappings from $mapsTo until reaching a
# character that has no singleton mapping. The chain is not followed at all
# when $code itself carries a decomposition tag, because that part of the
# condition does not change inside the loop.
# NOTE(review): the initial value of $mapsTo is set on a line not visible in
# this listing. As written, a cycle of untagged singleton mappings would keep
# this loop running forever.
141 while (exists $SingletonDecomposition{$mapsTo} && !exists $DecompTag{$code})
143 $mapsTo = $SingletonDecomposition{$mapsTo};
# Exclude $code unless the character finally reached has a decomposition of
# more than one character.
145 if (!exists $FirstOfDecompositionString{$mapsTo})
# Debug trace, disabled.
147 #printf("Excluding %X because its decomposition is a singleton(%X)\n", $code, $mapsTo);
148 $Excluded{$code} = 1;
152 # Now we output the file with the extra field appended to each line
153 for(my $i = 0; $i != scalar(@DataFileLines); $i++)
155 print $DataFileLines[$i];
156 if (exists($DataFileLineCodes{$i}))
159 if (exists($Excluded{ $DataFileLineCodes{$i} }))