sl@0
|
1 |
#
|
sl@0
|
2 |
# Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
|
sl@0
|
3 |
# All rights reserved.
|
sl@0
|
4 |
# This component and the accompanying materials are made available
|
sl@0
|
5 |
# under the terms of "Eclipse Public License v1.0"
|
sl@0
|
6 |
# which accompanies this distribution, and is available
|
sl@0
|
7 |
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
|
sl@0
|
8 |
#
|
sl@0
|
9 |
# Initial Contributors:
|
sl@0
|
10 |
# Nokia Corporation - initial contribution.
|
sl@0
|
11 |
#
|
sl@0
|
12 |
# Contributors:
|
sl@0
|
13 |
#
|
sl@0
|
14 |
# Description:
|
sl@0
|
15 |
#
|
sl@0
|
16 |
|
sl@0
|
17 |
use strict;
|
sl@0
|
18 |
use integer;
|
sl@0
|
19 |
|
sl@0
|
20 |
sub PerlScriptPath
|
sl@0
|
21 |
{
|
sl@0
|
22 |
my $perlScriptPath=$0;
|
sl@0
|
23 |
my $os = $^O; #get the OS type
|
sl@0
|
24 |
#check OS type
|
sl@0
|
25 |
if($os=~/MSWin32/) #Windows OS
|
sl@0
|
26 |
{
|
sl@0
|
27 |
$perlScriptPath=~s/\//\\/g; # replace any forward-slashes with back-slashes
|
sl@0
|
28 |
$perlScriptPath=~s/(\\?)[^\\]+$/$1/; # get rid of this Perl-script's file-name
|
sl@0
|
29 |
}
|
sl@0
|
30 |
else #Unix OS
|
sl@0
|
31 |
{
|
sl@0
|
32 |
$perlScriptPath=~s/\\/\//g; # replace any back-slashes with forward-slashes
|
sl@0
|
33 |
$perlScriptPath=~s/(\/?)[^\/]+$/$1/; # get rid of this Perl-script's file-name
|
sl@0
|
34 |
}
|
sl@0
|
35 |
return $perlScriptPath;
|
sl@0
|
36 |
}
|
sl@0
|
37 |
BEGIN
|
sl@0
|
38 |
{
|
sl@0
|
39 |
unshift(@INC, &PerlScriptPath()); # can't do "use lib &PerlScriptPath()" here as "use lib" only seems to work with *hard-coded* directory names
|
sl@0
|
40 |
}
|
sl@0
|
41 |
use PARSER;
|
sl@0
|
42 |
use UTF;
|
sl@0
|
43 |
|
sl@0
|
44 |
# The following numbers are used for byte-orders:
|
sl@0
|
45 |
# 0 means unspecified
|
sl@0
|
46 |
# 1 means big-endian
|
sl@0
|
47 |
# 2 means little-endian
|
sl@0
|
48 |
|
sl@0
|
49 |
FixParametersToWorkWithWindows98(\@ARGV);
|
sl@0
|
50 |
my $versionNumber = 3;
|
sl@0
|
51 |
my $outputByteOrderMark = 0;
|
sl@0
|
52 |
my $unicodeByteOrder = 0;
|
sl@0
|
53 |
my $inputEncoding = "";
|
sl@0
|
54 |
my $outputEncoding = "";
|
sl@0
|
55 |
my %foreignCharacters = (); # Hash with the foreign Character code as the value, unicode as key
|
sl@0
|
56 |
my %unicodeCharacters = (); # Hash with the Unicode Character code as the value, foreign as key
|
sl@0
|
57 |
|
sl@0
|
58 |
|
sl@0
|
59 |
my $inputFile=\*STDIN;
|
sl@0
|
60 |
my $outputFile=\*STDOUT;
|
sl@0
|
61 |
ReadParameters(\@ARGV,\$outputByteOrderMark,\$unicodeByteOrder,\$inputEncoding,\$outputEncoding,\$inputFile,\$outputFile);
|
sl@0
|
62 |
HandleByteOrderMarks($outputByteOrderMark,\$unicodeByteOrder, \$inputEncoding,\$outputEncoding, $inputFile, $outputFile);
|
sl@0
|
63 |
DoConversion(\$unicodeByteOrder, \$inputEncoding, \$outputEncoding, $inputFile, $outputFile, \%foreignCharacters, \%unicodeCharacters);
|
sl@0
|
64 |
if ($inputFile!=\*STDIN)
|
sl@0
|
65 |
{
|
sl@0
|
66 |
close($inputFile) or die;
|
sl@0
|
67 |
}
|
sl@0
|
68 |
if ($outputFile!=\*STDOUT)
|
sl@0
|
69 |
{
|
sl@0
|
70 |
close($outputFile) or die;
|
sl@0
|
71 |
}
|
sl@0
|
72 |
|
sl@0
|
73 |
sub FixParametersToWorkWithWindows98
|
sl@0
|
74 |
{
|
sl@0
|
75 |
my $parameters=shift;
|
sl@0
|
76 |
my $i;
|
sl@0
|
77 |
for ($i=@$parameters-2; $i>=0; --$i) # iterate backwards as some parameters may be deleted from @$parameters
|
sl@0
|
78 |
{
|
sl@0
|
79 |
if (($parameters->[$i]=~/^(-input)$/i) ||
|
sl@0
|
80 |
($parameters->[$i]=~/^(-output)$/i))
|
sl@0
|
81 |
{
|
sl@0
|
82 |
$parameters->[$i].='='.$parameters->[$i+1];
|
sl@0
|
83 |
splice(@$parameters, $i+1, 1);
|
sl@0
|
84 |
}
|
sl@0
|
85 |
}
|
sl@0
|
86 |
}
|
sl@0
|
87 |
|
sl@0
|
88 |
sub PrintUsage
|
sl@0
|
89 |
{
|
sl@0
|
90 |
print "\nVersion $versionNumber\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n";
|
sl@0
|
91 |
print "Usage:\n\n\t charconv [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t";
|
sl@0
|
92 |
print "options := [-big|-little][-byteordermark]\n\t";
|
sl@0
|
93 |
print "inputspec := -input=<format> [<input_file>]\n\t";
|
sl@0
|
94 |
print "outputspec := -output=<format> [<output_file>]\n\t";
|
sl@0
|
95 |
print "format := unicode|utf8|big5|gb2312...\n\n";
|
sl@0
|
96 |
}
|
sl@0
|
97 |
|
sl@0
|
98 |
sub Assert
|
sl@0
|
99 |
{
|
sl@0
|
100 |
my $condition = shift;
|
sl@0
|
101 |
my $errorMessage = shift;
|
sl@0
|
102 |
if (!($condition)) # find out where this is used and work this out
|
sl@0
|
103 |
{
|
sl@0
|
104 |
die("Error: $errorMessage");
|
sl@0
|
105 |
}
|
sl@0
|
106 |
}
|
sl@0
|
107 |
|
sl@0
|
108 |
sub PrintWarning
|
sl@0
|
109 |
{
|
sl@0
|
110 |
my $warningMessage = shift;
|
sl@0
|
111 |
print STDERR "Warning: $warningMessage\n";
|
sl@0
|
112 |
}
|
sl@0
|
113 |
|
sl@0
|
114 |
|
sl@0
|
115 |
sub TryFileParameter
|
sl@0
|
116 |
{
|
sl@0
|
117 |
my $args = shift;
|
sl@0
|
118 |
my $argindex = shift;
|
sl@0
|
119 |
my $inputoroutput = shift;
|
sl@0
|
120 |
my $encoding = shift;
|
sl@0
|
121 |
my $filehandle = shift;
|
sl@0
|
122 |
my $prefix = "-$inputoroutput=";
|
sl@0
|
123 |
|
sl@0
|
124 |
if ($args->[$$argindex] =~ /^$prefix(.*)/)
|
sl@0
|
125 |
{
|
sl@0
|
126 |
Assert($$encoding eq "", "\"$prefix...\" is specified more than once");
|
sl@0
|
127 |
$$encoding = $1;
|
sl@0
|
128 |
++$$argindex;
|
sl@0
|
129 |
if (($$argindex >= @$args) || ($args->[$$argindex] =~ /^-/))
|
sl@0
|
130 |
{
|
sl@0
|
131 |
--$$argindex;
|
sl@0
|
132 |
}
|
sl@0
|
133 |
else
|
sl@0
|
134 |
{
|
sl@0
|
135 |
if ($inputoroutput =~ /input/i)
|
sl@0
|
136 |
{
|
sl@0
|
137 |
open(INPUT_FILE,"<$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
|
sl@0
|
138 |
$$filehandle=\*INPUT_FILE;
|
sl@0
|
139 |
}
|
sl@0
|
140 |
else
|
sl@0
|
141 |
{
|
sl@0
|
142 |
open(OUTPUT_FILE,">$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
|
sl@0
|
143 |
$$filehandle=\*OUTPUT_FILE;
|
sl@0
|
144 |
}
|
sl@0
|
145 |
}
|
sl@0
|
146 |
binmode $$filehandle;
|
sl@0
|
147 |
return 1;
|
sl@0
|
148 |
}
|
sl@0
|
149 |
return 0;
|
sl@0
|
150 |
}
|
sl@0
|
151 |
|
sl@0
|
152 |
sub ReadParameters
|
sl@0
|
153 |
{
|
sl@0
|
154 |
my $args = shift;
|
sl@0
|
155 |
my $outputbyteordermark = shift;
|
sl@0
|
156 |
my $unicodebyteorder = shift;
|
sl@0
|
157 |
my $inputencoding = shift;
|
sl@0
|
158 |
my $outputencoding = shift;
|
sl@0
|
159 |
my $inputhandle = shift;
|
sl@0
|
160 |
my $outputhandle = shift;
|
sl@0
|
161 |
my $i;
|
sl@0
|
162 |
my $range;
|
sl@0
|
163 |
if ((@$args <= 0) || ($args->[0] eq "?") || ($args->[0] eq "/?"))
|
sl@0
|
164 |
{
|
sl@0
|
165 |
PrintUsage();
|
sl@0
|
166 |
exit;
|
sl@0
|
167 |
}
|
sl@0
|
168 |
|
sl@0
|
169 |
for ($i = 0; $i < @$args ; ++$i)
|
sl@0
|
170 |
{
|
sl@0
|
171 |
if ( $args->[$i]=~ /-byteordermark/i)
|
sl@0
|
172 |
{
|
sl@0
|
173 |
Assert(!$$outputbyteordermark, "\"-byteordermark\" is specified more than once");
|
sl@0
|
174 |
$$outputbyteordermark = 1;
|
sl@0
|
175 |
}
|
sl@0
|
176 |
elsif ($args->[$i]=~ /-big/i)
|
sl@0
|
177 |
{
|
sl@0
|
178 |
Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
|
sl@0
|
179 |
$$unicodebyteorder = 1;
|
sl@0
|
180 |
}
|
sl@0
|
181 |
elsif ($args->[$i]=~ /-little/i)
|
sl@0
|
182 |
{
|
sl@0
|
183 |
Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
|
sl@0
|
184 |
$$unicodebyteorder = 2;
|
sl@0
|
185 |
}
|
sl@0
|
186 |
else
|
sl@0
|
187 |
{
|
sl@0
|
188 |
Assert(TryFileParameter($args, \$i, "input",$inputencoding,$inputhandle) ||
|
sl@0
|
189 |
TryFileParameter($args, \$i, "output",$outputencoding, $outputhandle), "bad parameter \"$args->[$i]\"");
|
sl@0
|
190 |
}
|
sl@0
|
191 |
}
|
sl@0
|
192 |
Assert($$inputencoding ne "", "no input encoding is specified");
|
sl@0
|
193 |
Assert($$outputencoding ne "", "no output encoding is specified");
|
sl@0
|
194 |
}
|
sl@0
|
195 |
|
sl@0
|
196 |
sub ReadFromFile
|
sl@0
|
197 |
{
|
sl@0
|
198 |
my $buffer = shift;
|
sl@0
|
199 |
my $numOfBytesToRead = shift;
|
sl@0
|
200 |
my $inputhandle = shift;
|
sl@0
|
201 |
my $numOfBytesRead = 0;
|
sl@0
|
202 |
my $numOfBytesToReadThisTime = $numOfBytesToRead;
|
sl@0
|
203 |
|
sl@0
|
204 |
for(;;)
|
sl@0
|
205 |
{
|
sl@0
|
206 |
for(;;)
|
sl@0
|
207 |
{
|
sl@0
|
208 |
my $remainingNumOfBytesToRead = $numOfBytesToRead - $numOfBytesRead;
|
sl@0
|
209 |
if ($numOfBytesToReadThisTime > $remainingNumOfBytesToRead)
|
sl@0
|
210 |
{
|
sl@0
|
211 |
$numOfBytesToReadThisTime = $remainingNumOfBytesToRead;
|
sl@0
|
212 |
}
|
sl@0
|
213 |
my $numOfBytesReadThisTime = read $inputhandle, $$buffer, $numOfBytesToReadThisTime;
|
sl@0
|
214 |
if (defined $numOfBytesReadThisTime)
|
sl@0
|
215 |
{
|
sl@0
|
216 |
$numOfBytesRead += $numOfBytesReadThisTime;
|
sl@0
|
217 |
Assert($numOfBytesRead <= $numOfBytesReadThisTime, "internal error (read too many bytes)");
|
sl@0
|
218 |
if (($numOfBytesRead >= $numOfBytesReadThisTime) || $numOfBytesReadThisTime == 0)
|
sl@0
|
219 |
{
|
sl@0
|
220 |
return;
|
sl@0
|
221 |
}
|
sl@0
|
222 |
last;
|
sl@0
|
223 |
}
|
sl@0
|
224 |
$numOfBytesToReadThisTime /= 2;
|
sl@0
|
225 |
Assert($numOfBytesToReadThisTime >0, "reading from file failed");
|
sl@0
|
226 |
}
|
sl@0
|
227 |
}
|
sl@0
|
228 |
}
|
sl@0
|
229 |
|
sl@0
|
230 |
sub HandleByteOrderMarks
|
sl@0
|
231 |
{
|
sl@0
|
232 |
my $outputbyteordermark = shift;
|
sl@0
|
233 |
my $unicodebyteorder = shift;
|
sl@0
|
234 |
my $inputencoding = shift;
|
sl@0
|
235 |
my $outputencoding = shift;
|
sl@0
|
236 |
my $inputhandle = shift;
|
sl@0
|
237 |
my $outputhandle = shift;
|
sl@0
|
238 |
|
sl@0
|
239 |
if ($$inputencoding =~ /unicode/i)
|
sl@0
|
240 |
{
|
sl@0
|
241 |
my $firstUnicodeCharacter = 0;
|
sl@0
|
242 |
ReadFromFile(\$firstUnicodeCharacter, 2, $inputhandle);
|
sl@0
|
243 |
my $byteOrderSpecifiedByByteOrderMark = 0;
|
sl@0
|
244 |
if (length($firstUnicodeCharacter) == 2)
|
sl@0
|
245 |
{
|
sl@0
|
246 |
my @firstUnicodeCharacter = unpack "C*", $firstUnicodeCharacter;
|
sl@0
|
247 |
if (($firstUnicodeCharacter[0]==0xff) && ($firstUnicodeCharacter[1]==0xfe))
|
sl@0
|
248 |
{
|
sl@0
|
249 |
$byteOrderSpecifiedByByteOrderMark = 2;
|
sl@0
|
250 |
}
|
sl@0
|
251 |
elsif (($firstUnicodeCharacter[0]==0xfe) && ($firstUnicodeCharacter[1]==0xff))
|
sl@0
|
252 |
{
|
sl@0
|
253 |
$byteOrderSpecifiedByByteOrderMark = 1;
|
sl@0
|
254 |
}
|
sl@0
|
255 |
else
|
sl@0
|
256 |
{
|
sl@0
|
257 |
my $error = seek $inputhandle, 0, 0; # rewind to start of file
|
sl@0
|
258 |
Assert ($error == 1, "could not rewind to the start of input file");
|
sl@0
|
259 |
}
|
sl@0
|
260 |
}
|
sl@0
|
261 |
if ($byteOrderSpecifiedByByteOrderMark!=0)
|
sl@0
|
262 |
{
|
sl@0
|
263 |
if (($$unicodebyteorder!=0) && ($byteOrderSpecifiedByByteOrderMark!=$$unicodebyteorder))
|
sl@0
|
264 |
{
|
sl@0
|
265 |
PrintWarning ("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
|
sl@0
|
266 |
}
|
sl@0
|
267 |
$$unicodebyteorder = $byteOrderSpecifiedByByteOrderMark;
|
sl@0
|
268 |
}
|
sl@0
|
269 |
}
|
sl@0
|
270 |
if ($outputbyteordermark)
|
sl@0
|
271 |
{
|
sl@0
|
272 |
if ($$outputencoding ne "unicode")
|
sl@0
|
273 |
{
|
sl@0
|
274 |
PrintWarning("\"-byteordermark\" is only relevant for unicode output");
|
sl@0
|
275 |
}
|
sl@0
|
276 |
else
|
sl@0
|
277 |
{
|
sl@0
|
278 |
Assert($$unicodebyteorder!=0, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
|
sl@0
|
279 |
my $firstUnicodeCharacter=($$unicodebyteorder==1)? "\xfe\xff": "\xff\xfe";
|
sl@0
|
280 |
WriteToFile(\$firstUnicodeCharacter, $outputhandle);
|
sl@0
|
281 |
}
|
sl@0
|
282 |
}
|
sl@0
|
283 |
}
|
sl@0
|
284 |
|
sl@0
|
285 |
sub WriteToFile
|
sl@0
|
286 |
{
|
sl@0
|
287 |
my $buffer = shift;
|
sl@0
|
288 |
my $outputhandle = shift;
|
sl@0
|
289 |
|
sl@0
|
290 |
print $outputhandle $$buffer;
|
sl@0
|
291 |
}
|
sl@0
|
292 |
|
sl@0
|
293 |
sub DoConversion
|
sl@0
|
294 |
{
|
sl@0
|
295 |
my $unicodebyteorder = shift;
|
sl@0
|
296 |
my $inputencoding = shift;
|
sl@0
|
297 |
my $outputencoding = shift;
|
sl@0
|
298 |
my $inputhandle = shift;
|
sl@0
|
299 |
my $outputhandle = shift;
|
sl@0
|
300 |
my $foreignCharacters = shift;
|
sl@0
|
301 |
my $unicodeCharacters = shift;
|
sl@0
|
302 |
|
sl@0
|
303 |
my $currentBuffer = 0;
|
sl@0
|
304 |
my @arrayOfBuffers = ('', '', '');
|
sl@0
|
305 |
my $largeNumber=1000000;
|
sl@0
|
306 |
ReadFromFile(\($arrayOfBuffers[$currentBuffer]), $largeNumber, $inputhandle);
|
sl@0
|
307 |
ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $inputencoding, \($arrayOfBuffers[$currentBuffer]));
|
sl@0
|
308 |
if ($$inputencoding ne $$outputencoding)
|
sl@0
|
309 |
{
|
sl@0
|
310 |
if ($$inputencoding !~ /^unicode$/i)
|
sl@0
|
311 |
{
|
sl@0
|
312 |
my $nextBuffer = $currentBuffer + 1;
|
sl@0
|
313 |
OtherToUnicode ($inputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
|
sl@0
|
314 |
$currentBuffer = $nextBuffer;
|
sl@0
|
315 |
}
|
sl@0
|
316 |
if ($$outputencoding !~ /^unicode$/i)
|
sl@0
|
317 |
{
|
sl@0
|
318 |
my $nextBuffer = $currentBuffer + 1;
|
sl@0
|
319 |
UnicodeToOther($outputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
|
sl@0
|
320 |
$currentBuffer = $nextBuffer;
|
sl@0
|
321 |
}
|
sl@0
|
322 |
}
|
sl@0
|
323 |
ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $outputencoding, \($arrayOfBuffers[$currentBuffer]));
|
sl@0
|
324 |
WriteToFile(\($arrayOfBuffers[$currentBuffer]), $outputhandle);
|
sl@0
|
325 |
}
|
sl@0
|
326 |
|
sl@0
|
327 |
sub ReverseByteOrderIfUnicodeAndBigEndian
|
sl@0
|
328 |
{
|
sl@0
|
329 |
my $unicodebyteorder = shift;
|
sl@0
|
330 |
my $encoding = shift;
|
sl@0
|
331 |
my $buffer = shift;
|
sl@0
|
332 |
my $i;
|
sl@0
|
333 |
|
sl@0
|
334 |
if ($$encoding =~ /^unicode$/i)
|
sl@0
|
335 |
{
|
sl@0
|
336 |
Assert(length($$buffer)%2==0, "internal error (bad number of bytes in unicode buffer)");
|
sl@0
|
337 |
if ($$unicodebyteorder==0)
|
sl@0
|
338 |
{
|
sl@0
|
339 |
PrintWarning("the byte order of unicode text is unspecified - defaulting to little-endian");
|
sl@0
|
340 |
$$unicodebyteorder = 2;
|
sl@0
|
341 |
}
|
sl@0
|
342 |
if ($$unicodebyteorder==1)
|
sl@0
|
343 |
{
|
sl@0
|
344 |
$$buffer=pack('v*', unpack('n*', $$buffer));
|
sl@0
|
345 |
}
|
sl@0
|
346 |
}
|
sl@0
|
347 |
}
|
sl@0
|
348 |
|
sl@0
|
349 |
sub FillInHashes
|
sl@0
|
350 |
{
|
sl@0
|
351 |
my $foreignCharacters = shift;
|
sl@0
|
352 |
my $unicodeCharacters = shift;
|
sl@0
|
353 |
my $encoding = shift;
|
sl@0
|
354 |
my $replacementCharacter = shift;
|
sl@0
|
355 |
my $ranges = shift;
|
sl@0
|
356 |
my $bigEndian = shift;
|
sl@0
|
357 |
|
sl@0
|
358 |
my $endianness = 0;
|
sl@0
|
359 |
my $replacenum = 0;
|
sl@0
|
360 |
my $rangenum = 0;
|
sl@0
|
361 |
my $fileread = 0;
|
sl@0
|
362 |
my $largenumber = 1000000;
|
sl@0
|
363 |
|
sl@0
|
364 |
my $dataFile=&PerlScriptPath()."charconv\\".$$encoding.'.dat';
|
sl@0
|
365 |
|
sl@0
|
366 |
my $line;
|
sl@0
|
367 |
|
sl@0
|
368 |
if (-e $dataFile)
|
sl@0
|
369 |
{
|
sl@0
|
370 |
open (HASH_INPUT, "< $dataFile") or die ("Could not open file for reading");
|
sl@0
|
371 |
|
sl@0
|
372 |
binmode HASH_INPUT;
|
sl@0
|
373 |
# reading the endianness
|
sl@0
|
374 |
$fileread = read HASH_INPUT, $endianness, 1;
|
sl@0
|
375 |
$endianness = unpack "C",$endianness;
|
sl@0
|
376 |
if ($endianness == 0)
|
sl@0
|
377 |
{
|
sl@0
|
378 |
# set the template to a default-> n for the eman time
|
sl@0
|
379 |
$$bigEndian = 0;
|
sl@0
|
380 |
}
|
sl@0
|
381 |
elsif ($endianness == 1)
|
sl@0
|
382 |
{
|
sl@0
|
383 |
$$bigEndian = 0;
|
sl@0
|
384 |
}
|
sl@0
|
385 |
elsif ($endianness == 2)
|
sl@0
|
386 |
{
|
sl@0
|
387 |
$$bigEndian = 1;
|
sl@0
|
388 |
}
|
sl@0
|
389 |
else
|
sl@0
|
390 |
{
|
sl@0
|
391 |
print "Illegal Endianness specified in the control files";
|
sl@0
|
392 |
}
|
sl@0
|
393 |
#reading the replacement characters
|
sl@0
|
394 |
$fileread = read HASH_INPUT, $replacenum,1;
|
sl@0
|
395 |
$replacenum= unpack "C",$replacenum;
|
sl@0
|
396 |
$fileread = read HASH_INPUT, $$replacementCharacter,$replacenum;
|
sl@0
|
397 |
# reading the ranges
|
sl@0
|
398 |
$fileread = read HASH_INPUT, $rangenum, 1;
|
sl@0
|
399 |
$rangenum = unpack "C",$rangenum;
|
sl@0
|
400 |
my $i; # loop variable
|
sl@0
|
401 |
for ($i=0; $i < $rangenum; ++$i)
|
sl@0
|
402 |
{
|
sl@0
|
403 |
my $lowerrange = 0;
|
sl@0
|
404 |
my $upperrange = 0;
|
sl@0
|
405 |
my $followchar = 0;
|
sl@0
|
406 |
|
sl@0
|
407 |
$fileread = read HASH_INPUT,$lowerrange,1;
|
sl@0
|
408 |
$lowerrange = unpack "C",$lowerrange;
|
sl@0
|
409 |
$fileread = read HASH_INPUT,$upperrange,1;
|
sl@0
|
410 |
$upperrange = unpack "C",$upperrange;
|
sl@0
|
411 |
$fileread = read HASH_INPUT,$followchar,1;
|
sl@0
|
412 |
$followchar = unpack "C",$followchar;
|
sl@0
|
413 |
|
sl@0
|
414 |
push @$ranges,[$lowerrange,$upperrange,$followchar];
|
sl@0
|
415 |
}
|
sl@0
|
416 |
my $data = 0;
|
sl@0
|
417 |
my @unpackeddata = 0;
|
sl@0
|
418 |
$fileread = read HASH_INPUT, $data, $largenumber;
|
sl@0
|
419 |
@unpackeddata = unpack "v*",$data;
|
sl@0
|
420 |
for($i = 0; $i <= $#unpackeddata; $i= $i+2)
|
sl@0
|
421 |
{
|
sl@0
|
422 |
$unicodeCharacters->{$unpackeddata[$i]}=$unpackeddata[$i+1];
|
sl@0
|
423 |
$foreignCharacters->{$unpackeddata[$i+1]}=$unpackeddata[$i];
|
sl@0
|
424 |
}
|
sl@0
|
425 |
}
|
sl@0
|
426 |
else
|
sl@0
|
427 |
{
|
sl@0
|
428 |
die ("Encoding Format \"$$encoding\" not recognised");
|
sl@0
|
429 |
}
|
sl@0
|
430 |
}
|
sl@0
|
431 |
|
sl@0
|
432 |
sub OtherToUnicode
|
sl@0
|
433 |
{
|
sl@0
|
434 |
my $inputencoding = shift;
|
sl@0
|
435 |
my $unicode = shift;
|
sl@0
|
436 |
my $other = shift;
|
sl@0
|
437 |
my $foreignCharacters = shift;
|
sl@0
|
438 |
my $unicodeCharacters = shift;
|
sl@0
|
439 |
my $unicodetemplate = shift;
|
sl@0
|
440 |
my $replacementCharacter = 0;
|
sl@0
|
441 |
my $unicodeReplacementCharacter = pack($unicodetemplate, 0xfffd);
|
sl@0
|
442 |
my @ranges=();
|
sl@0
|
443 |
|
sl@0
|
444 |
my $otherIndex= 0;
|
sl@0
|
445 |
my $numOfBytes = length($other);
|
sl@0
|
446 |
my $key = 0;
|
sl@0
|
447 |
my $inRange = 0;
|
sl@0
|
448 |
my $followByte = -1;
|
sl@0
|
449 |
|
sl@0
|
450 |
if ($$inputencoding=~/^utf8$/i)
|
sl@0
|
451 |
{
|
sl@0
|
452 |
return &Utf8ToUnicode($unicode, $other, $unicodetemplate);
|
sl@0
|
453 |
}
|
sl@0
|
454 |
my $bigEndian;
|
sl@0
|
455 |
FillInHashes($foreignCharacters,$unicodeCharacters, $inputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
|
sl@0
|
456 |
for (;;)
|
sl@0
|
457 |
{
|
sl@0
|
458 |
if ($otherIndex > $numOfBytes -1)
|
sl@0
|
459 |
{
|
sl@0
|
460 |
last;
|
sl@0
|
461 |
}
|
sl@0
|
462 |
my $frontByte = (unpack("x$otherIndex".'C', $other))[0];
|
sl@0
|
463 |
# @ranges is an array of references. Each reference is a reference to an array
|
sl@0
|
464 |
for ($key = 0; $key <= $#ranges; ++$key)
|
sl@0
|
465 |
{
|
sl@0
|
466 |
my $arrayref = $ranges[$key];
|
sl@0
|
467 |
if (($frontByte >= $arrayref->[0]) && ($frontByte <= $arrayref->[1]))
|
sl@0
|
468 |
{
|
sl@0
|
469 |
$followByte = $arrayref->[2];
|
sl@0
|
470 |
$inRange = 1;
|
sl@0
|
471 |
}
|
sl@0
|
472 |
}
|
sl@0
|
473 |
Assert ($inRange != 0, "cannot figure out the Byte size of the character");
|
sl@0
|
474 |
my $tempByte = 0;
|
sl@0
|
475 |
for ($key = 0; $key<= $followByte; ++$key)
|
sl@0
|
476 |
{
|
sl@0
|
477 |
if ($bigEndian)
|
sl@0
|
478 |
{
|
sl@0
|
479 |
$tempByte = ($tempByte << 8) | (unpack("x$otherIndex".'C', $other))[0];
|
sl@0
|
480 |
}
|
sl@0
|
481 |
else
|
sl@0
|
482 |
{
|
sl@0
|
483 |
$tempByte = $tempByte | ((unpack("x$otherIndex".'C', $other))[0] << (8*$key));
|
sl@0
|
484 |
}
|
sl@0
|
485 |
$otherIndex++;
|
sl@0
|
486 |
}
|
sl@0
|
487 |
if (exists $unicodeCharacters->{$tempByte})
|
sl@0
|
488 |
{
|
sl@0
|
489 |
$$unicode .= pack $unicodetemplate , $unicodeCharacters->{$tempByte};
|
sl@0
|
490 |
}
|
sl@0
|
491 |
else
|
sl@0
|
492 |
{
|
sl@0
|
493 |
$$unicode .= $unicodeReplacementCharacter;
|
sl@0
|
494 |
}
|
sl@0
|
495 |
}
|
sl@0
|
496 |
}
|
sl@0
|
497 |
|
sl@0
|
498 |
sub UnicodeToOther
|
sl@0
|
499 |
{
|
sl@0
|
500 |
my $outputencoding = shift;
|
sl@0
|
501 |
my $other = shift;
|
sl@0
|
502 |
my $unicode = shift;
|
sl@0
|
503 |
my $foreignCharacters = shift;
|
sl@0
|
504 |
my $unicodeCharacters = shift;
|
sl@0
|
505 |
my $unicodetemplate = shift;
|
sl@0
|
506 |
my $replacementCharacter = 0;
|
sl@0
|
507 |
my @ranges=();
|
sl@0
|
508 |
|
sl@0
|
509 |
my $unicodeIndex= 0;
|
sl@0
|
510 |
my $numOfBytes = length($unicode);
|
sl@0
|
511 |
my @UnicodeUnpacked = ();
|
sl@0
|
512 |
my $key = 0;
|
sl@0
|
513 |
|
sl@0
|
514 |
if ($$outputencoding=~/^utf8$/i)
|
sl@0
|
515 |
{
|
sl@0
|
516 |
return &UnicodeToUtf8($other, $unicode, $unicodetemplate);
|
sl@0
|
517 |
}
|
sl@0
|
518 |
my $bigEndian;
|
sl@0
|
519 |
FillInHashes($foreignCharacters,$unicodeCharacters, $outputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
|
sl@0
|
520 |
my $foreignTemplate=$bigEndian? 'n': 'v';
|
sl@0
|
521 |
@UnicodeUnpacked = unpack "$unicodetemplate*", $unicode;
|
sl@0
|
522 |
foreach $key (@UnicodeUnpacked)
|
sl@0
|
523 |
{
|
sl@0
|
524 |
if (!exists($foreignCharacters->{$key}))
|
sl@0
|
525 |
{
|
sl@0
|
526 |
$$other .= $replacementCharacter;
|
sl@0
|
527 |
}
|
sl@0
|
528 |
else
|
sl@0
|
529 |
{
|
sl@0
|
530 |
# This is the WRONG but it will work for the mean time
|
sl@0
|
531 |
# This will fail if the foreignCharacter has characters that are more than
|
sl@0
|
532 |
# two bytes long ..... But this should work for foreign characters of 1 or 2 Bytes
|
sl@0
|
533 |
|
sl@0
|
534 |
my $foreignValue = $foreignCharacters->{$key};
|
sl@0
|
535 |
if ( $foreignValue <= 255)
|
sl@0
|
536 |
{
|
sl@0
|
537 |
$$other .= pack "C" , $foreignValue;
|
sl@0
|
538 |
}
|
sl@0
|
539 |
else
|
sl@0
|
540 |
{
|
sl@0
|
541 |
$$other .= pack $foreignTemplate, $foreignValue;
|
sl@0
|
542 |
}
|
sl@0
|
543 |
}
|
sl@0
|
544 |
}
|
sl@0
|
545 |
}
|
sl@0
|
546 |
|