os/kernelhwsrv/kernel/eka/euser/unicode/perl/UnicodeMaxDecompose.pl
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
sl@0
     1
#
sl@0
     2
# Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     3
# All rights reserved.
sl@0
     4
# This component and the accompanying materials are made available
sl@0
     5
# under the terms of the License "Eclipse Public License v1.0"
sl@0
     6
# which accompanies this distribution, and is available
sl@0
     7
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     8
#
sl@0
     9
# Initial Contributors:
sl@0
    10
# Nokia Corporation - initial contribution.
sl@0
    11
#
sl@0
    12
# Contributors:
sl@0
    13
#
sl@0
    14
# Description:
sl@0
    15
#
sl@0
    16
# UnicodeMaxDecompose.pl
sl@0
    17
#
sl@0
    18
# Adds maximal decompositions of the character and maximal decompositions of
sl@0
    19
# its folded variant to the Unicode data.
sl@0
    20
#
sl@0
    21
# Added as the fourth field after the 'Symbian:' marker in the following format:
sl@0
    22
#
sl@0
    23
# Symbian:<grapheme-role>;<excluded>;<folded>;<max-decomposition>;<folded-decomposition>
sl@0
    24
# where each of <max-decomposition> and <folded-decomposition> are strings
sl@0
    25
# of hex numbers separated by spaces, representing the complete decomposition
sl@0
    26
# of the character and its folded equivalent respectively.
sl@0
    27
#
sl@0
    28
# Usage:
sl@0
    29
# perl -w UnicodeMaxDecompose.pl < <output-of-UnicodeAddFolded>
sl@0
    30
sl@0
    31
use strict;
sl@0
    32
sl@0
    33
# All input arrives on STDIN, so any command-line argument is treated as a
# usage error: print the usage text to STDERR and exit with status 1.
if (@ARGV) {
    print STDERR "Usage:\nperl -w UnicodeMaxDecompose.pl < <output-of-UnicodeAddFolded>\n";
    exit 1;
}
sl@0
    38
sl@0
    39
# Decompositions exactly as stated in the input, keyed by codepoint (a hex
# string). Decompose removes an entry as soon as it starts working on it.
my %StatedDecomposition = ();
# Fully expanded decompositions, keyed by codepoint (a hex string). Each
# value is a string of hex codepoints separated by single spaces.
my %CompleteDecomposition = ();

# Decompose(code)
#
# Expands the stated decomposition of 'code' recursively and stores the
# result in %CompleteDecomposition{code}. Does nothing if 'code' has no
# entry in %StatedDecomposition (either it never had one, or it has
# already been processed).
sub Decompose {
    my ($code) = @_;
    exists $StatedDecomposition{$code} or return;

    # Taking the entry out before recursing means each codepoint is
    # expanded at most once, even if it is reached again further down.
    my $stated = delete $StatedDecomposition{$code};

    my @expanded;
    for my $element (split(' ', $stated)) {
        next unless $element;
        # Make sure the element itself is fully expanded first.
        Decompose($element);
        # Use the element's expansion when it has one; otherwise the
        # element stands for itself.
        push @expanded,
            (exists $CompleteDecomposition{$element})
                ? $CompleteDecomposition{$element}
                : $element;
    }
    $CompleteDecomposition{$code} = join(' ', @expanded);
}
sl@0
    66
sl@0
    67
# State collected while reading the input:
#   %Folded     - codepoint => folded form as given in the input (stored
#                 only when the field contains at least one hex digit)
#   %LineToCode - zero-based input line index => codepoint on that line
#   @RawLine    - every input line with its newline removed, in input order
my %Folded = ();
my %LineToCode = ();
my @RawLine = ();

# Read all of STDIN. Lines with 17 ';'-separated fields are validated and
# their untagged decomposition and folded form are recorded; blank lines are
# kept as they are; anything else aborts the run with a message naming the
# offending line.
my $lineNo = 0;
while (my $line = <STDIN>) {
    chomp $line;
    $lineNo++;
    # Split into fields. The negative limit stops split from discarding
    # trailing empty fields, so the field count below is reliable.
    my @attribute = split(/;/, $line, -1);
    die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeAddFolded been run?")
        if (scalar(@attribute) == 16);
    if (scalar(@attribute) == 17) {
        die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeAddFolded been run?")
            if ($attribute[15] !~ /^[ \t]*symbian:/i);
        my $code = $attribute[0];
        die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
            unless ($code =~ /^1?[0-9a-fA-F]{4,5}$/ && hex($code) < 0x110000);
        my $decomposition = $attribute[5];
        die("Decomposition '$decomposition' at line $lineNo is not a valid Unicode decomposition.")
            unless $decomposition =~ /^[ \t]*(<.*>[ \t]*[0-9a-fA-F])?[0-9a-fA-F \t]*$/;
        my $folded = $attribute[16];
        # The pattern is anchored at both ends so the whole field is
        # checked. Without anchors it matches the empty string inside any
        # input, and the check can never fail.
        die ("'$folded' not a valid string of hex values at line $lineNo.")
            unless $folded =~ /^[0-9a-fA-F \t]*$/;
        # Store all decompositions that have no tag and at least one value
        if ($decomposition =~ /^[ \t]*[0-9a-fA-F]/) {
            $StatedDecomposition{$code} = $decomposition;
        }
        if ($folded =~ /[0-9a-fA-F]/) {
            $Folded{$code} = $folded;
        }
        $LineToCode{$lineNo-1} = $code;
    }
    elsif ($line !~ /^[ \t]*$/) {
        die 'Do not understand line '.$lineNo;
    }
    # Every line is kept, data or not, so the output can reproduce the
    # input line for line.
    $RawLine[$lineNo-1] = $line;
}
sl@0
   113
sl@0
   114
# Fully expand every stated decomposition. keys() returns its list before
# the loop starts, so Decompose can safely delete entries from
# %StatedDecomposition while the loop runs.
Decompose($_) for keys %StatedDecomposition;
sl@0
   119
sl@0
   120
# Replace each folded form by its decomposition: every codepoint in the
# folded string that has a complete decomposition is swapped for that
# decomposition, and all other codepoints are kept unchanged.
for my $code (keys %Folded) {
    my @expanded = map {
        (exists $CompleteDecomposition{$_})
            ? split(' ', $CompleteDecomposition{$_})
            : $_
    } split(' ', $Folded{$code});
    $Folded{$code} = join(' ', @expanded);
}
sl@0
   137
sl@0
   138
# Write every input line back out in its original order. Lines that defined
# a codepoint get two extra fields appended:
#   ;<max-decomposition>;<folded-decomposition>
for my $index (0 .. $#RawLine) {
    my $output = $RawLine[$index];
    if (exists $LineToCode{$index}) {
        my $code = $LineToCode{$index};
        my $decomp = (exists $CompleteDecomposition{$code})
            ? $CompleteDecomposition{$code}
            : '';
        # If there is no folded value, but there is a decomposition
        # sequence, the character must fold to the decomposition
        # sequence too.
        my $foldedDecomp = (exists $Folded{$code})
            ? $Folded{$code}
            : $decomp;
        $output .= ';' . $decomp . ';' . $foldedDecomp;
    }
    print $output . "\n";
}