# Copyright (c) 2002-2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of the License "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
# Description:
# Adds folding information to Unicode data
# Added as the third field after the 'Symbian:' marker in the following format:
# Symbian:<field-1>;<field-2>;<folded-form>
# where <folded-form> is null or a sequence of hex unicode values
# separated by spaces representing the folded form of the character.
# NOTE(review): the placeholder names above were reconstructed; the code below
# appends the folded form as one extra ';'-separated field after a line that
# has exactly 16 fields, the last of which starts with 'Symbian:'. Confirm the
# field layout against the output of UnicodeCompositionEx.
# Usage:
# perl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>
#
# The Unicode data is read from standard input and the augmented data is
# written to standard output.
#

use strict;
use warnings;

if (scalar(@ARGV) != 1) {
    print STDERR "Usage:\nperl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>\n";
    exit 1;
}

# Three-argument open with a lexical handle: the file name can never be
# interpreted as a mode or a pipe, and $! tells the user why it failed.
my $folding_file = $ARGV[0];
open(my $folding_fh, '<', $folding_file)
    or die("Could not open file $folding_file: $!\n");

# Code point (numeric) -> folded form (hex string exactly as it appears in
# the folding file).
my %fold_of = ();
# Code point (numeric) -> line of the folding file that supplied the mapping;
# only used to produce a helpful duplicate-mapping diagnostic.
my %mapping_line_of = ();
my $line_no = 0;
while (my $record = <$folding_fh>) {
    $line_no++;
    # Everything from the first '#' onwards is a comment.
    my ($line) = split(/#/, $record, 2);
    if ($line =~ /^[ \t]*(1?[0-9a-fA-F]{4,5});[ \t]*([LEICSFT]);[ \t]*([0-9a-fA-F][0-9a-fA-F \t]*);[ \t]*$/) {
        my ($code, $type, $folded) = (hex($1), $2, $3);
        # We'll deal with Turkic mappings with our own hack.
        # F = Full mappings (fold is longer than one character)
        # T = I = Turkic mapping
        # Folds containing whitespace (i.e. more than one code point) are
        # skipped as well, whatever their type.
        if ($type !~ /[FTI]/ && $folded !~ /[ \t]/) {
            # Report the code point in hex so it can be found in the data file.
            die(sprintf("U+%04X has two mappings: lines %s and %d.",
                    $code, $mapping_line_of{$code}, $line_no))
                if exists $fold_of{$code};
            $fold_of{$code} = $folded;
            $mapping_line_of{$code} = $line_no;
        }
    }
    elsif ($line !~ /^[ \t]*$/) {
        die("Did not understand line $line_no of $folding_file");
    }
}

close $folding_fh;

# Turkic hack:
# Map dotted capital I and dotless small I to lower case i.
# This makes all the 'i's fold the same, which isn't very nice for Turkic
# languages, but it at least gives us behaviour consistent across locales
# which does at least map dotted I, and i to the same value, as well
# as mapping I and dotless i to the same value, and mapping I and i
# to the same value.
$fold_of{0x49}  = '0069';
$fold_of{0x130} = '0069';
$fold_of{0x131} = '0069';

$line_no = 0;
while (my $line = <STDIN>) {
    chomp $line;
    $line_no++;
    # Split into fields. The negative limit stops split from discarding
    # trailing empty fields, so the field count is reliable.
    my @attribute = split(/;/, $line, -1);
    die("Line $line_no is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
        if (scalar(@attribute) == 15);
    if (scalar(@attribute) == 16) {
        die("Line $line_no is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
            if ($attribute[15] !~ /^[ \t]*symbian:/i);
        my $code = $attribute[0];
        die("First attribute '$code' not a valid Unicode codepoint at line $line_no")
            unless $code =~ /^1?[0-9a-fA-F]{4,5}$/;
        $code = hex($code);
        # Append the folded form, or an empty field when there is none.
        $attribute[16] = exists $fold_of{$code} ? $fold_of{$code} : '';
        print join(';', @attribute);
    }
    elsif ($line !~ /^[ \t]*$/) {
        die 'Do not understand line ' . $line_no;
    }
    else {
        # Blank lines are passed through unchanged.
        print $line;
    }
    print "\n";
}