# Copyright (c) 2002-2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of the License "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
# Description:
# Adds folding information to Unicode data
# Added as the third field after the 'Symbian:' marker in the following format:
# Symbian:<grapheme-role>;<excluded-from-composition>;<folded-form>
# where <folded-form> is null or a sequence of hex unicode values
# separated by spaces representing the folded form of the character.
# Usage:
# perl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>
#
#

use strict;

# Exactly one command-line argument is required: the path of the case-folding
# table. With any other argument count, print the usage text to STDERR and
# leave with a failing exit status.
unless (@ARGV == 1)
	{
	print STDERR "Usage:\nperl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>\n";
	exit 1;
	}

# Load the single-character case foldings from the file named by the first
# command-line argument.
#   %Fold        maps a numeric code point to its folded form (hex string).
#   %MappingLine records which line of the file supplied each %Fold entry, so
#                that a duplicate can be reported against both lines.
# Dies if the file cannot be opened, if a line is neither blank, comment-only
# nor a recognised mapping, or if one code point receives two accepted
# mappings.
# The three-argument open with an explicit '<' mode keeps any mode characters
# in the file name from being interpreted.
open(my $foldingFile, '<', $ARGV[0]) or die("Could not open file $ARGV[0]: $!\n");

my %Fold = ();
my %MappingLine = ();
my $lineNo = 0;
while (my $record = <$foldingFile>)
	{
	$lineNo++;
	# Everything from the first '#' onwards is a comment; keep only the data.
	my ($line) = split(/#/, $record, 2);
	if ($line =~ /^[ \t]*(1?[0-9a-fA-F]{4,5});[ \t]*([LEICSFT]);[ \t]*([0-9a-fA-F][0-9a-fA-F \t]*);[ \t]*$/)
		{
		my ($code, $type, $folded) = (hex($1), $2, $3);
		# We'll deal with Turkic mappings with our own hack.
		# F = Full mappings (fold is longer than one character)
		# T = I = Turkic mapping
		# Folded forms containing white space (more than one value) are
		# skipped as well.
		if ($type !~ /[FTI]/ && $folded !~ /[ \t]/)
			{
			# Report the code point in hex, the way it is written in the file.
			die (sprintf('%04X', $code)." has two mappings: lines $MappingLine{$code} and $lineNo.")
				if (exists $Fold{$code});
			$Fold{$code} = $folded;
			$MappingLine{$code} = $lineNo;
			}
		}
	elsif ($line !~ /^[ \t]*$/)
		{
		# Not a mapping and not blank once the comment is removed.
		die ("Did not understand line $lineNo of $ARGV[0]");
		}
	}

close $foldingFile;

# Turkic hack:
# Capital I (U+0049), dotted capital I (U+0130) and dotless small i (U+0131)
# are all forced to fold to lower case i (U+0069), replacing whatever the
# folding file supplied for them.
# Every kind of 'i' therefore folds to the same value. That is not very nice
# for Turkic languages, but it keeps the behaviour the same in every locale:
# dotted I and i fold together, I and dotless i fold together, and I and i
# fold together.
$Fold{$_} = '0069' for (0x49, 0x130, 0x131);

# Copy the Unicode data from STDIN to STDOUT one line at a time.
# A line with sixteen ';'-separated fields whose last field starts with
# 'Symbian:' gets one extra field: the folded form from %Fold for the code
# point in its first field, or an empty field when there is none.
# Blank lines are passed through unchanged.
# Dies on a line without the 'Symbian:' field, on a first field that is not
# a hex code point, and on any other non-blank line.
$lineNo = 0;
while (my $line = <STDIN>)
	{
	chomp $line;
	$lineNo++;
	# A negative limit makes split keep trailing empty fields, so the count
	# reflects every ';' on the line.
	my @attribute = split(/;/, $line, -1);
	my $fieldCount = scalar(@attribute);
	# Fifteen fields, or sixteen where the last one is not the 'Symbian:'
	# field, both mean that the marker is absent.
	if ($fieldCount == 15
		|| ($fieldCount == 16 && $attribute[15] !~ /^[ \t]*symbian:/i))
		{
		die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?");
		}
	if ($fieldCount == 16)
		{
		my $code = $attribute[0];
		die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
			unless $code =~ /^1?[0-9a-fA-F]{4,5}$/;
		# %Fold is keyed by numeric code point.
		$code = hex($code);
		# Append the folded form as the seventeenth field.
		push @attribute, (exists $Fold{$code} ? $Fold{$code} : '');
		print join(';', @attribute);
		}
	elsif ($line !~ /^[ \t]*$/)
		{
		die 'Do not understand line '.$lineNo;
		}
	else
		{
		print $line;
		}
	# chomp removed the line ending; put one back for every line.
	print "\n";
	}