sl@0
|
1 |
# Copyright (c) 2002-2009 Nokia Corporation and/or its subsidiary(-ies).
|
sl@0
|
2 |
# All rights reserved.
|
sl@0
|
3 |
# This component and the accompanying materials are made available
|
sl@0
|
4 |
# under the terms of the License "Eclipse Public License v1.0"
|
sl@0
|
5 |
# which accompanies this distribution, and is available
|
sl@0
|
6 |
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
|
sl@0
|
7 |
#
|
sl@0
|
8 |
# Initial Contributors:
|
sl@0
|
9 |
# Nokia Corporation - initial contribution.
|
sl@0
|
10 |
#
|
sl@0
|
11 |
# Contributors:
|
sl@0
|
12 |
#
|
sl@0
|
13 |
# Description:
|
sl@0
|
14 |
# Adds folding information to Unicode data
|
sl@0
|
15 |
# Added as the third field after the 'Symbian:' marker in the following format:
|
sl@0
|
16 |
# Symbian:<grapheme-role>;<excluded-from-composition>;<folded-form>
|
sl@0
|
17 |
# where <folded-form> is null or a sequence of hex unicode values
|
sl@0
|
18 |
# separated by spaces representing the folded form of the character.
|
sl@0
|
19 |
# Usage:
|
sl@0
|
20 |
# perl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>
|
sl@0
|
21 |
#
|
sl@0
|
22 |
#
|
sl@0
|
23 |
|
sl@0
|
24 |
use strict;
|
sl@0
|
25 |
|
sl@0
|
26 |
# Exactly one command-line argument (the CaseFolding.txt path) is required;
# otherwise print the usage text to STDERR and exit with status 1.
unless (@ARGV == 1) {
    print STDERR "Usage:\nperl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>\n";
    exit 1;
}
|
sl@0
|
31 |
|
sl@0
|
32 |
# Load the simple (single code point) case foldings from the CaseFolding.txt
# file named by the first command-line argument.
#   %Fold        - numeric code point => folded form, as the hex text read
#                  from the file
#   %MappingLine - numeric code point => line number that supplied the
#                  mapping; used only to report duplicate mappings
#   $lineNo      - number of the line currently being read
# Dies if the file cannot be opened, if a non-blank line is not understood,
# or if a code point is given two simple mappings.
#
# The three-argument form of open with a lexical handle is used so that the
# file name is always treated as a plain path to read, never as a mode or a
# command.
open(my $foldingFile, '<', $ARGV[0])
    or die("Could not open file $ARGV[0]: $!\n");

my %Fold = ();
my %MappingLine = ();
my $lineNo = 0;
while (my $entry = <$foldingFile>) {
    $lineNo++;
    # Everything from the first '#' onwards is a comment and is ignored.
    my ($line) = split(/#/, $entry, 2);
    if ($line =~ /^[ \t]*(1?[0-9a-fA-F]{4,5});[ \t]*([LEICSFT]);[ \t]*([0-9a-fA-F][0-9a-fA-F \t]*);[ \t]*$/) {
        my $code = hex($1);
        my $type = $2;
        my $folded = $3;
        # The pattern lets whitespace precede the closing ';'.  Strip it so
        # that a single-code-point fold is not mistaken for a sequence.
        $folded =~ s/[ \t]+$//;
        # We'll deal with Turkic mappings with our own hack.
        # F = Full mappings (fold is longer than one character)
        # T = I = Turkic mapping
        if ($type !~ /[FTI]/ && $folded !~ /[ \t]/) {
            # Report the code point in hex, the notation the file itself uses.
            die(sprintf("U+%04X has two mappings: lines %d and %d.",
                        $code, $MappingLine{$code}, $lineNo))
                if (exists $Fold{$code});
            $Fold{$code} = $folded;
            $MappingLine{$code} = $lineNo;
        }
    }
    elsif ($line !~ /^[ \t]*$/) {
        die("Did not understand line $lineNo of $ARGV[0]");
    }
}

close($foldingFile);
|
sl@0
|
64 |
|
sl@0
|
65 |
# Turkic hack:
# Force capital I (U+0049), dotted capital I (U+0130) and dotless small i
# (U+0131) to fold to lower case i (U+0069).
# Folding every kind of 'i' to the same value is not very nice for Turkic
# languages, but it gives behaviour that is consistent across locales:
# dotted I and i fold together, I and dotless i fold together, and I and i
# fold together.
for my $iVariant (0x49, 0x130, 0x131) {
    $Fold{$iVariant} = '0069';
}
|
sl@0
|
75 |
|
sl@0
|
76 |
# Copy standard input to standard output one line at a time.  A line with 16
# ';'-separated fields gets one more field appended: the folded form from
# %Fold for the code point in the first field, or an empty string when %Fold
# has no entry.  Blank lines are passed through unchanged.  Any other line
# is a fatal error.
$lineNo = 0;
while (defined(my $record = <STDIN>)) {
    chomp $record;
    ++$lineNo;

    # A negative limit makes split keep trailing empty fields.
    my @fields = split(/;/, $record, -1);
    my $fieldCount = scalar(@fields);

    # A 15-field line, or a 16-field line whose last field does not start
    # with the 'symbian:' marker, is reported as missing the Symbian entries.
    if ($fieldCount == 15
        || ($fieldCount == 16 && $fields[15] !~ /^[ \t]*symbian:/i)) {
        die("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?");
    }

    my $output;
    if ($fieldCount == 16) {
        my $codeText = $fields[0];
        unless ($codeText =~ /^1?[0-9a-fA-F]{4,5}$/) {
            die("First attribute '$codeText' not a valid Unicode codepoint at line $lineNo");
        }
        my $codePoint = hex($codeText);
        # Exactly 16 fields are present, so this becomes field index 16.
        push(@fields, exists $Fold{$codePoint} ? $Fold{$codePoint} : '');
        $output = join(';', @fields);
    }
    elsif ($record =~ /^[ \t]*$/) {
        $output = $record;
    }
    else {
        die 'Do not understand line '.$lineNo;
    }
    print $output, "\n";
}
|