os/kernelhwsrv/kernel/eka/euser/unicode/perl/UnicodeCompositionEx.pl
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
#
sl@0
     2
# Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     3
# All rights reserved.
sl@0
     4
# This component and the accompanying materials are made available
sl@0
     5
# under the terms of the License "Eclipse Public License v1.0"
sl@0
     6
# which accompanies this distribution, and is available
sl@0
     7
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     8
#
sl@0
     9
# Initial Contributors:
sl@0
    10
# Nokia Corporation - initial contribution.
sl@0
    11
#
sl@0
    12
# Contributors:
sl@0
    13
#
sl@0
    14
# Description:
sl@0
    15
#
sl@0
    16
# UnicodeCompositionEx
sl@0
    17
# adds composition exclusion information to unicode data
sl@0
    18
#
sl@0
    19
# Added as a new field:
sl@0
    20
# Symbian:<excluded-from-composition>
sl@0
    21
# where <excluded-from-composition> is E or null.
sl@0
    22
#
sl@0
    23
# Usage:
sl@0
    24
# perl -w UnicodeCompositionEx.pl CompositionExclusions.txt < <Unicode-data-file>
sl@0
    25
sl@0
    26
use strict;
sl@0
    27
sl@0
    28
# Exactly one command-line argument is required: the name of the composition
# exclusions file. The Unicode data file itself is read from standard input.
# On a wrong argument count, print the usage text to STDERR and exit with
# status 1.
if (scalar(@ARGV) != 1)
	{
	# The usage text names this script (UnicodeCompositionEx.pl).
	print STDERR "Usage:\nperl -w UnicodeCompositionEx.pl CompositionExclusions.txt < <Unicode-data-file>\n";
	exit 1;
	}
sl@0
    33
sl@0
    34
# Read the composition exclusions file named by the first argument.
#
# Every line is either blank/comment-only (ignored) or holds one hexadecimal
# code point of 4 to 6 digits, optionally followed by a '#' comment. Each code
# point found is recorded as a key of %Excluded. Any other line, or a value of
# 0x110000 or above, is fatal.
#
# The file is opened with three-argument open so that the name given on the
# command line is always treated as a plain file to read, never as a mode or
# a pipe specification.
open(my $exclusionFile, '<', $ARGV[0]) or die("Could not open file $ARGV[0]: $!\n");

# $lineNo counts input lines for error messages; it is reused by the later
# pass over the Unicode data file.
my $lineNo = 0;
# Set of code points (numeric keys) that are excluded from composition.
my %Excluded = ();
while (my $exclusionLine = <$exclusionFile>)
	{
	$lineNo++;
	# Nothing to parse if only whitespace precedes the comment (or line end)
	next if $exclusionLine =~ /^[ \t]*([#].*)?$/;

	$exclusionLine =~ /^[ \t]*([0-9A-Fa-f]{4,6})[ \t]*([#].*)?$/
		or die("Did not understand line $lineNo of $ARGV[0]");
	my $code = hex($1);
	die ("Value $code outside Unicode range at line $lineNo of $ARGV[0]")
		unless ($code < 0x110000);
	$Excluded{$code} = 1;
	#printf("Excluding %X because it is in the exclusion list\n", $code);
	}

close($exclusionFile);
sl@0
    54
# Output happens only after all exclusions are known, so every input line
# is buffered here during the first pass and written out afterwards.
my @DataFileLines;
# Index into @DataFileLines => code point parsed from that line
# (only set for lines that hold a character record).
my %DataFileLineCodes;

# Tables filled in by the first pass, each keyed by code point:
#   - first character of the decomposition, when it is longer than one
#     character
my %FirstOfDecompositionString;
#   - the character mapped to, when the decomposition is a singleton
#     (these are candidates for exclusion, decided after the first pass)
my %SingletonDecomposition;
#   - the decomposition tag, when one is present
my %DecompTag;
#   - the combining class
my %CombiningClass;

# Restart line counting for the data file read from standard input
$lineNo = 0;
sl@0
    68
# First pass: read the Unicode data file from standard input.
#
# Every line is buffered in @DataFileLines for the output stage. A line must
# either be blank or consist of exactly 15 ';'-separated fields; anything else
# is fatal. For each 15-field line the following are recorded by code point:
#   %CombiningClass             - field 4, a decimal number below 256
#   %DecompTag                  - the leading <tag> of field 6, if present
#   %SingletonDecomposition     - the target, if field 6 maps to one character
#   %FirstOfDecompositionString - the first character, if it maps to several
# and %DataFileLineCodes maps the line's index in @DataFileLines to its code.
while (my $line = <STDIN>)
	{
	chomp $line;
	$DataFileLines[$lineNo] = $line;
	$lineNo++;
	# A negative limit makes split keep trailing empty fields
	my @attribute = split(/;/, $line, -1);

	if (scalar(@attribute) != 15)
		{
		# Blank lines are passed through untouched; anything else is an error
		die 'Do not understand line '.$lineNo unless $line =~ /^[ \t]*$/;
		next;
		}

	my $code = $attribute[0];
	# The pattern alone would let 110000..1FFFFF through, so the value is
	# range-checked as well (the same limit is applied to the exclusion list)
	die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
		unless ($code =~ /^1?[0-9a-fA-F]{4,5}$/ && hex($code) < 0x110000);
	$code = hex($code);

	my $combiningClass = $attribute[3];
	# Insist on decimal digits: a non-numeric string would otherwise compare
	# as 0 and slip through the range test
	die("Fourth attribute '$combiningClass' is not a valid Unicode combining class at line $lineNo")
		unless ($combiningClass =~ /^[0-9]+$/ && $combiningClass < 256);

	my $decompositionString = $attribute[5];
	die ("Sixth attribute '$decompositionString' is not a valid decomposition string at line $lineNo")
		unless ($decompositionString =~ /^(<.*>)?[0-9a-fA-F \t]*$/);
	my @decomposition = split(/[ \t]+/, $decompositionString);
	if (@decomposition && $decomposition[0] =~ /^<.*>$/)
		{
		$DecompTag{$code} = shift @decomposition;
		}

	if (@decomposition)
		{
		die("Decomposition $decomposition[0] not understood at line $lineNo")
			unless ($decomposition[0] =~ /^[0-9A-Fa-f]+$/);
		my $firstCharacter = hex($decomposition[0]);
		if (scalar(@decomposition) == 1)
			{
			# Codes with a singleton decomposition mapping are candidates
			# for exclusion, but whether the character mapped to has a
			# decomposition of its own is not known yet, so the decision
			# is deferred until the whole file has been read.
			$SingletonDecomposition{$code} = $firstCharacter;
			}
		else
			{
			$FirstOfDecompositionString{$code} = $firstCharacter;
			}
		}

	$CombiningClass{$code} = $combiningClass;
	# $lineNo was already advanced, so this line's buffer index is one less
	$DataFileLineCodes{$lineNo-1} = $code;
	}
sl@0
   120
sl@0
   121
# Exclude every code whose decomposition is longer than one character and
# begins with a character of non-zero combining class. First characters
# for which no combining class was recorded are left alone.
foreach my $composite (keys %FirstOfDecompositionString)
	{
	my $leading = $FirstOfDecompositionString{$composite};
	next unless exists($CombiningClass{$leading});
	next if $CombiningClass{$leading} == 0;
	#printf("Excluding %X because its decomposition starts with a non-starter(%X)\n", $composite, $leading);
	$Excluded{$composite} = 1;
	}
sl@0
   135
sl@0
   136
# Decide which codes with a singleton decomposition are excluded.
#
# For a code without a decomposition tag, the chain of singleton
# decompositions is followed until a character is reached that has no
# singleton decomposition of its own. For a code with a tag the chain is not
# followed at all. The code is excluded unless the character arrived at has a
# decomposition longer than one character.
#
# A chain that loops back on itself (only possible with malformed data) is
# reported as a fatal error instead of being followed forever.
foreach my $code (sort { $a <=> $b } keys %SingletonDecomposition)
	{
	my $mapsTo = $code;
	# The tag test depends only on $code, so it is made once, up front
	if (!exists $DecompTag{$code})
		{
		my %visited = ();
		while (exists $SingletonDecomposition{$mapsTo})
			{
			die(sprintf("Singleton decompositions starting at %X form a cycle\n", $code))
				if $visited{$mapsTo}++;
			$mapsTo = $SingletonDecomposition{$mapsTo};
			}
		}
	if (!exists $FirstOfDecompositionString{$mapsTo})
		{
		#printf("Excluding %X because its decomposition is a singleton(%X)\n", $code, $mapsTo);
		$Excluded{$code} = 1;
		}
	}
sl@0
   151
sl@0
   152
# Write the buffered lines back out in their original order. Lines that
# held a character record get the extra field ';Symbian:' appended, followed
# by 'E' when the character is excluded from composition; all other lines
# are echoed unchanged.
foreach my $index (0 .. $#DataFileLines)
	{
	my $extraField = '';
	if (exists($DataFileLineCodes{$index}))
		{
		$extraField = ';Symbian:';
		$extraField .= 'E' if exists($Excluded{ $DataFileLineCodes{$index} });
		}
	print $DataFileLines[$index].$extraField."\n";
	}