os/kernelhwsrv/kernel/eka/euser/unicode/perl/UnicodeAddFolded.pl
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/kernelhwsrv/kernel/eka/euser/unicode/perl/UnicodeAddFolded.pl	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,108 @@
     1.4 +# Copyright (c) 2002-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.5 +# All rights reserved.
     1.6 +# This component and the accompanying materials are made available
     1.7 +# under the terms of the License "Eclipse Public License v1.0"
     1.8 +# which accompanies this distribution, and is available
     1.9 +# at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.10 +#
    1.11 +# Initial Contributors:
    1.12 +# Nokia Corporation - initial contribution.
    1.13 +#
    1.14 +# Contributors:
    1.15 +#
    1.16 +# Description:
    1.17 +# Adds folding information to Unicode data
    1.18 +# Added as the third field after the 'Symbian:' marker in the following format:
    1.19 +# Symbian:<grapheme-role>;<excluded-from-composition>;<folded-form>
    1.20 +# where <folded-form> is null or a sequence of hex unicode values
    1.21 +# separated by spaces representing the folded form of the character.
    1.22 +# Usage:
    1.23 +# perl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>
    1.24 +# 
    1.25 +#
    1.26 +
    1.27 +use strict;
    1.28 +
    # Exactly one command-line argument is expected: the path to the case
    # folding file. The Unicode data itself is read from standard input.
    unless (@ARGV == 1) {
        print STDERR "Usage:\nperl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>\n";
        exit 1;
    }
    1.34 +
    # Load the simple (one-to-one) case foldings from the file named on the
    # command line.
    #
    # %Fold maps a numeric code point to its folded form, kept as the hex
    # string read from the file. %MappingLine remembers the line each mapping
    # came from so that a duplicate can be reported with both line numbers.
    #
    # Dies if the file cannot be opened, if a non-blank line does not have the
    # form "<code>; <type>; <folded>; # comment", or if a code point has two
    # simple mappings.
    #
    # Three-argument open with a lexical handle: the file name comes straight
    # from the command line, so it must never be interpreted as an open mode.
    open(my $folding_fh, '<', $ARGV[0])
        or die("Could not open file $ARGV[0]: $!\n");

    my %Fold = ();
    my %MappingLine = ();
    my $lineNo = 0;
    while (my $raw = <$folding_fh>) {
        $lineNo++;
        # Drop the line terminator (LF or CRLF) first, then any '#' comment,
        # so that blank and comment-free lines from a CRLF file still parse.
        (my $line = $raw) =~ s/\r?\n\z//;
        $line =~ s/#.*\z//s;
        if ($line =~ /^[ \t]*(1?[0-9a-fA-F]{4,5});[ \t]*([LEICSFT]);[ \t]*([0-9a-fA-F][0-9a-fA-F \t]*);[ \t]*$/) {
            my $code = hex($1);
            my $type = $2;
            my $folded = $3;
            # The capture may include white space before the closing ';'.
            # Remove it so a single code point is not mistaken for a sequence.
            $folded =~ s/[ \t]+\z//;
            # We'll deal with Turkic mappings with our own hack.
            # F = Full mappings (fold is longer than one character)
            # T = I = Turkic mapping
            if ($type !~ /[FTI]/ && $folded !~ /[ \t]/) {
                # Report the code point in hex, as it appears in the file.
                die(sprintf("%04X has two mappings: lines %d and %d.",
                        $code, $MappingLine{$code}, $lineNo))
                    if exists $Fold{$code};
                $Fold{$code} = $folded;
                $MappingLine{$code} = $lineNo;
            }
        }
        elsif ($line !~ /^[ \t]*$/) {
            die("Did not understand line $lineNo of $ARGV[0]");
        }
    }

    close($folding_fh);
    1.67 +
    # Turkic hack:
    # Capital I (U+0049), capital I with dot above (U+0130) and dotless small
    # i (U+0131) are all folded to small i (U+0069).
    # Folding every 'i' to the same value is not ideal for Turkic languages,
    # but it gives behaviour that is the same in every locale: dotted I and i
    # fold together, I and dotless i fold together, and I and i fold together.
    $Fold{$_} = '0069' for (0x49, 0x130, 0x131);
    1.78 +
    # Second pass: copy the Unicode data from standard input to standard
    # output, appending the folded form as an extra field on every data line.
    $lineNo = 0;
    while (my $record = <STDIN>) {
        chomp $record;
        $lineNo++;

        # A negative limit makes split keep trailing empty fields.
        my @field = split(/;/, $record, -1);
        my $field_count = scalar(@field);

        # 15 fields, or 16 fields whose last one lacks the marker, means the
        # 'Symbian:' field has not been added to this line.
        if ($field_count == 15
            || ($field_count == 16 && $field[15] !~ /^[ \t]*symbian:/i)) {
            die("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?");
        }

        if ($field_count == 16) {
            my $hex_code = $field[0];
            unless ($hex_code =~ /^1?[0-9a-fA-F]{4,5}$/) {
                die("First attribute '$hex_code' not a valid Unicode codepoint at line $lineNo");
            }
            my $code = hex($hex_code);
            # The folded form becomes field 16, directly after the 'Symbian:'
            # field; code points with no recorded folding get an empty field.
            push @field, (exists $Fold{$code} ? $Fold{$code} : '');
            print join(';', @field), "\n";
        }
        elsif ($record =~ /^[ \t]*$/) {
            # Blank lines are passed through unchanged.
            print $record, "\n";
        }
        else {
            die("Do not understand line $lineNo");
        }
    }