#
# Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of the License "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
# Description:
#
# UnicodeCompositionEx
# adds composition exclusion information to unicode data
#
# Added as a new field:
# Symbian:<flag>
# where <flag> is E (excluded from composition) or null.
#
# Usage:
# perl -w UnicodeAddComposeEx.pl CompositionExclusions.txt < <unicode-data-file>
#
# The exclusion list is read from the file named on the command line.
# The Unicode data is read from standard input as lines of exactly 15
# semicolon-separated fields and is echoed to standard output with the
# extra field appended. Blank input lines are passed through unchanged;
# anything else is a fatal error.

use strict;
use warnings;

if (scalar(@ARGV) != 1) {
    print(STDERR "Usage:\nperl -w UnicodeAddComposeEx.pl CompositionExclusions.txt < <unicode-data-file>\n");
    exit 1;
}

my $exclusionsFile = $ARGV[0];

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode, and $! reports why the open failed.
open(my $exclusions, '<', $exclusionsFile)
    or die("Could not open file $exclusionsFile: $!\n");

# Codes (as numbers) that must be flagged as excluded from composition.
my %Excluded = ();

my $lineNo = 0;
while (my $entry = <$exclusions>) {
    $lineNo++;

    # Only try to parse the line if there is some non-whitespace before
    # the comment.
    next if $entry =~ /^[ \t]*([#].*)?$/;

    $entry =~ /^[ \t]*([0-9A-Fa-f]{4,6})[ \t]*([#].*)?$/
        or die("Did not understand line $lineNo of $exclusionsFile");
    my $code = hex($1);
    die("Value $code outside Unicode range at line $lineNo of $exclusionsFile")
        unless ($code < 0x110000);
    $Excluded{$code} = 1;
    #printf("Excluding %X because it is in the exclusion list\n", $code);
}

close($exclusions);

# This is a two-pass operation, so we must store the lines ready for
# output later.
my @DataFileLines = ();
# Maps a 0-based index into @DataFileLines to the code on that line.
my %DataFileLineCodes = ();

# The first pass will collect all the relevant data:
# The first character of the decomposition if there is more than one
my %FirstOfDecompositionString = ();
# The singleton decomposition if it is a singleton
my %SingletonDecomposition = ();
# The decomposition's tag, if any
my %DecompTag = ();
# The combining class
my %CombiningClass = ();

$lineNo = 0;
# The data must come from STDIN explicitly: @ARGV still holds the
# exclusions file name, so the magic <> operator would re-read that file.
while (my $line = <STDIN>) {
    chomp $line;
    $DataFileLines[$lineNo] = $line;
    $lineNo++;

    # Split into fields; the negative limit keeps trailing null fields.
    my @attribute = split(/;/, $line, -1);

    if (scalar(@attribute) == 15) {
        my $codeField = $attribute[0];
        die("First attribute '$codeField' not a valid Unicode codepoint at line $lineNo")
            unless $codeField =~ /^1?[0-9a-fA-F]{4,5}$/;
        my $code = hex($codeField);
        # The pattern above still admits values such as 1FFFFF, so apply
        # the same range limit that is used for the exclusion list.
        die("First attribute '$codeField' outside Unicode range at line $lineNo")
            unless ($code < 0x110000);

        my $combiningClass = $attribute[3];
        # Require digits before comparing numerically; otherwise a
        # non-numeric field would silently be treated as class 0.
        die("Fourth attribute '$combiningClass' is not a valid Unicode combining class at line $lineNo")
            unless ($combiningClass =~ /^[0-9]+$/ && $combiningClass < 256);

        my $decompositionString = $attribute[5];
        die("Sixth attribute '$decompositionString' is not a valid decomposition string at line $lineNo")
            unless ($decompositionString =~ /^(<.*>)?[0-9a-fA-F \t]*$/);
        my @decomposition = split(/[ \t]+/, $decompositionString);
        if (@decomposition && $decomposition[0] =~ /^<.*>$/) {
            $DecompTag{$code} = shift @decomposition;
        }

        if (scalar(@decomposition) == 1) {
            # We want to exclude codes such as these, with a singleton
            # decomposition mapping, but at the moment we don't know if
            # the character mapped to has a decomposition mapping, so we
            # will defer this to another stage.
            die("Decomposition $decomposition[0] not understood at line $lineNo")
                unless ($decomposition[0] =~ /^[0-9A-Fa-f]+$/);
            $SingletonDecomposition{$code} = hex($decomposition[0]);
        }
        elsif (1 < scalar(@decomposition)) {
            die("Decomposition $decomposition[0] not understood at line $lineNo")
                unless ($decomposition[0] =~ /^[0-9A-Fa-f]+$/);
            $FirstOfDecompositionString{$code} = hex($decomposition[0]);
        }

        $CombiningClass{$code} = $combiningClass;
        $DataFileLineCodes{$lineNo - 1} = $code;
    }
    elsif ($line !~ /^[ \t]*$/) {
        die 'Do not understand line '.$lineNo;
    }
}

# Each code that has a decomposition string longer than one character
# where the first character has non-zero combining class is excluded.
# First characters that have no entry in the data are left alone.
foreach my $code (keys %FirstOfDecompositionString) {
    my $decomp = $FirstOfDecompositionString{$code};
    if (exists($CombiningClass{$decomp}) && $CombiningClass{$decomp} != 0) {
        $Excluded{$code} = 1;
        #printf("Excluding %X because its decomposition starts with a non-starter(%X)\n", $code, $decomp);
    }
}

# Each code with a singleton decomposition is excluded unless following
# its chain of singleton mappings ends at a code that has a
# multi-character decomposition. The chain is only followed when the
# starting code's own decomposition carries no tag; a tagged singleton is
# therefore always excluded.
foreach my $code (keys %SingletonDecomposition) {
    my $mapsTo = $code;
    if (!exists $DecompTag{$code}) {
        # Guard against malformed data in which singleton mappings form a
        # cycle, which would otherwise loop forever.
        my %visited = ();
        while (exists $SingletonDecomposition{$mapsTo}) {
            die(sprintf("Singleton decomposition chain starting at %X is cyclic", $code))
                if $visited{$mapsTo}++;
            $mapsTo = $SingletonDecomposition{$mapsTo};
        }
    }
    if (!exists $FirstOfDecompositionString{$mapsTo}) {
        #printf("Excluding %X because its decomposition is a singleton(%X)\n", $code, $mapsTo);
        $Excluded{$code} = 1;
    }
}

# Now we output the file with the extra field appended to each line that
# held a data record; other (blank) lines are echoed as they were.
for my $i (0 .. $#DataFileLines) {
    my $output = $DataFileLines[$i];
    if (exists($DataFileLineCodes{$i})) {
        $output .= ';Symbian:';
        $output .= 'E' if exists($Excluded{ $DataFileLineCodes{$i} });
    }
    print $output, "\n";
}