os/textandloc/localisation/localesupport/coltab/COLTAB.CPP
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
// Copyright (c) 1999-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     2
// All rights reserved.
sl@0
     3
// This component and the accompanying materials are made available
sl@0
     4
// under the terms of "Eclipse Public License v1.0"
sl@0
     5
// which accompanies this distribution, and is available
sl@0
     6
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     7
//
sl@0
     8
// Initial Contributors:
sl@0
     9
// Nokia Corporation - initial contribution.
sl@0
    10
//
sl@0
    11
// Contributors:
sl@0
    12
//
sl@0
    13
// Description:
sl@0
    14
// Reads and parses the Unicode collation value table and writes out a C++ source file
sl@0
    15
// containing the data in a form that can be used by the EPOC collation system.
sl@0
    16
//
sl@0
    17
// The program reads either three separate files or one combined file:
sl@0
    18
//
sl@0
    19
// Three files (by default):
sl@0
    20
// 1. Base keys (maps single Unicode values to single collation key values): must be in the same format as
sl@0
    21
// basekeys.txt, supplied with the Standard Unicode Collation system
sl@0
    22
//
sl@0
    23
// 2. Composite keys (maps single Unicode values to strings of collation keys): must be in the same format as
sl@0
    24
// compkeys.txt, supplied with the Standard Unicode Collation system
sl@0
    25
//
sl@0
    26
// 3. Strings (maps strings of Unicode values to single collation keys OR strings of collation keys): must be in the
sl@0
    27
// same format as compkeys.txt, except that there can be any number of Unicode characters at the start of the line,
sl@0
    28
// space-separated and each exactly 4 hex digits.
sl@0
    29
//
sl@0
    30
// One combined file (with option /a):
sl@0
    31
// 1. All Keys (combines the above three files into one file): must be in the same format as allkeys.txt, supplied with the Standard Unicode Collation system (after Unicode 3.0).
sl@0
    32
//
sl@0
    33
//
sl@0
    34
sl@0
    35
sl@0
    36
#include <assert.h>
sl@0
    37
#include <ctype.h>
sl@0
    38
sl@0
    39
#ifdef __MSVCDOTNET__
sl@0
    40
#include <fstream>
sl@0
    41
#include <iostream>
sl@0
    42
using namespace std;
sl@0
    43
#else //!__MSVCDOTNET__
sl@0
    44
#include <fstream.h>
sl@0
    45
#include <iostream.h>
sl@0
    46
#endif //__MSVCDOTNET__
sl@0
    47
sl@0
    48
#include <stdlib.h>
sl@0
    49
#include <string.h>
sl@0
    50
#include <stdio.h>
sl@0
    51
sl@0
    52
/*
Limits on the level-1 (secondary) and level-2 (tertiary) key values, chosen so that
each key fits in KLevel1Bits or KLevel2Bits bits respectively when packed.
A non-zero key is reduced by one less than the minimum value, so the largest key
that still fits is minimum + 2^bits - 2.
*/
const unsigned int KLevel1Bits = 8;
const unsigned int KLevel1Min = 0x20;
const unsigned int KLevel1Max = KLevel1Min + ((1u << KLevel1Bits) - 2u);	// 0x11E

const unsigned int KLevel2Bits = 6;
const unsigned int KLevel2Min = 1;
const unsigned int KLevel2Max = KLevel2Min + ((1u << KLevel2Bits) - 2u);	// 0x3F
sl@0
    62
sl@0
    63
/*
sl@0
    64
Table of characters in the WGL4 set, plus characters in canonical decompositions of
sl@0
    65
those characters, plus commonly used control characters and space characters,
sl@0
    66
given as ranges of Unicode characters. In each pair, the first code is the first in the range,
sl@0
    67
and the second is the first code NOT in the range.
sl@0
    68
sl@0
    69
The extra characters are added mainly to ensure that control characters and spaces are
sl@0
    70
normally ignored. The extra characters are:
sl@0
    71
sl@0
    72
0x0000-0x001F: ASCII control characters
sl@0
    73
0x2000-0x2012: spaces, hyphen variants, figure dash
sl@0
    74
0x2028-0x202E: line and paragraph separator, bidirectional control characters
sl@0
    75
0xFEFF		 : byte-order mark
sl@0
    76
0xFFFC-0xFFFD: object replacement character, replacement character
sl@0
    77
*/
sl@0
    78
// Sorted, non-overlapping ranges. Each row is: first code in the range, first code NOT in the range.
const unsigned int Wgl4Range[] =
	{
	0x0000, 0x007f,		// ASCII 0x00..0x7e (0x7f itself is outside the range)
	0x00a0, 0x0180,		// non-breaking space, Latin-1, Latin Extended-A
	0x0192, 0x0193,		// Latin f with hook
	0x01fa, 0x0200,		// A-ring, a-ring, AE, ae, O slash, o slash, all with acute accent
	0x02c6, 0x02c8,		// non-combining circumflex and caron
	0x02c9, 0x02ca,		// non-combining macron
	0x02d8, 0x02dc,		// non-combining breve, dot above, ring above, ogonek
	0x02dd, 0x02de,		// non-combining double acute
	0x0300, 0x0305,		// combining grave, acute, circumflex, tilde, macron
	0x0306, 0x0309,		// combining breve, dot above, double dot above
	0x030a, 0x030e,		// combining ring above, double acute, caron, vertical line above
	0x0327, 0x0329,		// combining cedilla, ogonek
	0x0384, 0x038b,		// Greek
	0x038c, 0x038d,		// Greek
	0x038e, 0x03a2,		// Greek
	0x03a3, 0x03cf,		// Greek
	0x0401, 0x040d,		// Cyrillic
	0x040e, 0x0450,		// Cyrillic
	0x0451, 0x045d,		// Cyrillic
	0x045e, 0x0460,		// Cyrillic
	0x0490, 0x0492,		// Cyrillic
	0x1e80, 0x1e86,		// both W and w with each of grave, acute and diaeresis
	0x1ef2, 0x1ef4,		// Y with grave, y with grave
	0x2000, 0x2016,		// various spaces and horizontal lines
	0x2017, 0x201f,		// double low line, various quotation marks
	0x2020, 0x2023,		// dagger, double dagger, bullet
	0x2026, 0x2027,		// ellipsis
	0x2028, 0x202f,		// line and paragraph separators, directional formatting
	0x2030, 0x2031,		// per mille
	0x2032, 0x2034,		// prime, double prime
	0x2039, 0x203b,		// single angle quotation marks
	0x203c, 0x203d,		// double exclamation mark
	0x203e, 0x203f,		// non-combining overscore
	0x2044, 0x2045,		// fraction slash
	0x207f, 0x2080,		// superscript n
	0x20a3, 0x20a5,		// French Franc, Italian/Turkish Lira
	0x20a7, 0x20a8,		// Spanish Peseta
	0x20ac, 0x20ad,		// Euro symbol
	0x2105, 0x2106,		// care of
	0x2113, 0x2114,		// script l
	0x2116, 0x2117,		// numero
	0x2122, 0x2123,		// trade mark
	0x2126, 0x2127,		// ohm
	0x212e, 0x212f,		// estimated (net weight)
	0x215b, 0x215f,		// 1/8, 3/8, 5/8, 7/8
	0x2190, 0x2196,		// horizontal and vertical arrows
	0x21a8, 0x21a9,		// up down arrow with base
	0x2202, 0x2203,		// partial differential
	0x2206, 0x2207,		// increment (delta)
	0x220f, 0x2210,		// n-ary product (pi)
	0x2211, 0x2213,		// n-ary sum (sigma), minus
	0x2215, 0x2216,		// division (slash)
	0x2219, 0x221b,		// bullet operator, square root
	0x221e, 0x2220,		// infinity, right angle
	0x2229, 0x222a,		// intersection
	0x222b, 0x222c,		// integral
	0x2248, 0x2249,		// almost equal to
	0x2260, 0x2262,		// not equal to, identical to
	0x2264, 0x2266,		// less-than-or-equal-to, greater-than-or-equal-to
	0x2302, 0x2303,		// house
	0x2310, 0x2311,		// reversed not sign
	0x2320, 0x2322,		// top and bottom of integral
	0x2500, 0x2501,		// box drawing
	0x2502, 0x2503,		// box drawing
	0x250c, 0x250d,		// box drawing
	0x2510, 0x2511,		// box drawing
	0x2514, 0x2515,		// box drawing
	0x2518, 0x2519,		// box drawing
	0x251c, 0x251d,		// box drawing
	0x2524, 0x2525,		// box drawing
	0x252c, 0x252d,		// box drawing
	0x2534, 0x2535,		// box drawing
	0x253c, 0x253d,		// box drawing
	0x2550, 0x256d,		// box drawing
	0x2580, 0x2581,		// block element
	0x2584, 0x2585,		// block element
	0x2588, 0x2589,		// block element
	0x258c, 0x258d,		// block element
	0x2590, 0x2594,		// block element
	0x25a0, 0x25a2,		// geometric shapes
	0x25aa, 0x25ad,		// geometric shapes
	0x25b2, 0x25b3,		// geometric shapes
	0x25ba, 0x25bb,		// geometric shapes
	0x25bc, 0x25bd,		// geometric shapes
	0x25c4, 0x25c5,		// geometric shapes
	0x25ca, 0x25cc,		// geometric shapes
	0x25cf, 0x25d0,		// geometric shapes
	0x25d8, 0x25da,		// geometric shapes
	0x25e6, 0x25e7,		// geometric shapes
	0x263a, 0x263d,		// smilies, sun
	0x2640, 0x2641,		// female
	0x2642, 0x2643,		// male
	0x2660, 0x2661,		// spade
	0x2663, 0x2664,		// club
	0x2665, 0x2667,		// heart, diamond
	0x266a, 0x266c,		// quaver, beamed quavers
	0xfb01, 0xfb03,		// fi, fl ligatures
	0xfeff, 0xff00,		// zero-width non-breaking space (byte-order mark)
	0xfffc, 0xfffe		// object replacement character and replacement character
	};

// Number of (first, limit) pairs in Wgl4Range.
const int Wgl4Ranges = sizeof(Wgl4Range) / sizeof(Wgl4Range[0]) / 2;
sl@0
   181
sl@0
   182
/*
Comparison function for bsearch() over a table of (first, limit) pairs of unsigned ints.

One argument is the search key, recognised by its two elements being equal (both hold
the character sought). The other is a table entry describing the half-open range
[first, limit).

Returns -1, 0 or 1 according to whether argument 1 sorts before, matches or sorts after
argument 2. The result is consistent whichever of the two arguments is the key.
*/
int CompareWgl4Ranges(const void* aRange1,const void* aRange2)
	{
	const unsigned int* first = static_cast<const unsigned int*>(aRange1);
	const unsigned int* second = static_cast<const unsigned int*>(aRange2);

	// bsearch() passes the key first, but cope with the opposite order as well.
	const bool keyIsSecond = second[0] == second[1];
	const unsigned int* key = keyIsSecond ? second : first;
	const unsigned int* range = keyIsSecond ? first : second;

	// Position of the key relative to the range.
	int keyVersusRange = 0;
	if (key[0] < range[0])
		keyVersusRange = -1;
	else if (key[0] >= range[1])
		keyVersusRange = 1;

	// The result must describe argument 1 relative to argument 2,
	// so it is inverted when the key was passed second.
	return keyIsSecond ? -keyVersusRange : keyVersusRange;
	}
sl@0
   199
sl@0
   200
// Determine if a character is in the WGL4 character repertoire.
sl@0
   201
static bool InWgl4(unsigned int aChar)
sl@0
   202
	{
sl@0
   203
	unsigned int key[2];
sl@0
   204
	key[0] = key[1] = aChar;
sl@0
   205
	return bsearch(key,Wgl4Range,Wgl4Ranges,sizeof(Wgl4Range[0]) * 2,CompareWgl4Ranges) != NULL;
sl@0
   206
	}
sl@0
   207
sl@0
   208
// A collation key: one value per comparison level plus two flags.
class CollationKey
	{
public:
	enum
		{
		ELevels = 3		// number of comparison levels held in iLevel
		};

	// Two keys are equal when every level and both flags match.
	bool operator==(const CollationKey& k) const
		{
		for (int level = 0; level < ELevels; ++level)
			{
			if (iLevel[level] != k.iLevel[level])
				return false;
			}
		return iIgnorable == k.iIgnorable && iStop == k.iStop;
		}

	int iLevel[ELevels];// the keys at the various levels
	bool iIgnorable;	// TRUE if this key can normally be ignored
	bool iStop;			// TRUE if this is the last key in a string of keys
	};
sl@0
   224
sl@0
   225
// Associates one Unicode value with the position of its key in the key table.
class CollationIndex
	{
public:
	int iCode;			// Unicode value
	int iIndex;			// index into the key table

	// qsort/bsearch-style comparison of two CollationIndex objects (defined elsewhere in the file).
	static int Compare(const void* aIndex1,const void* aIndex2);
	};
sl@0
   234
sl@0
   235
class Reader
sl@0
   236
	{
sl@0
   237
public:
sl@0
   238
	Reader(bool aWgl4,bool aStandard,const char* aLocaleName, const char* aUidString);
sl@0
   239
	~Reader();
sl@0
   240
	void ReadBaseKeys(const char* aFileName);
sl@0
   241
	void ReadCompKeys(const char* aFileName);
sl@0
   242
	void ReadStrings(const char* aFileName);
sl@0
   243
	void ReadAllKeys(const char* aFileName);
sl@0
   244
	void WriteOutput(const char* aFileName, bool aCopyrightMessage);
sl@0
   245
	int CompareStringIndices(int aIndex1,int aIndex2) const;
sl@0
   246
sl@0
   247
private:
sl@0
   248
	Reader(const Reader&);
sl@0
   249
	int Hex(const char *aString, int &aCharConsumed, bool aTolerate = false);
sl@0
   250
	void GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey=NULL);
sl@0
   251
	void GetMultipleCollationKeys(const char* aString);
sl@0
   252
	unsigned int PackKey(const CollationKey& aValue);
sl@0
   253
	int PackIndex(const CollationIndex& aValue, unsigned int result[2]);
sl@0
   254
	bool ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount);
sl@0
   255
	void AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart);
sl@0
   256
	void AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart);
sl@0
   257
	void AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart);
sl@0
   258
sl@0
   259
	enum
sl@0
   260
		{
sl@0
   261
		EMaxCollationKeys = 0x110000 * 2, /*more elements considering composite keys */
sl@0
   262
		EMaxCollationIndices = 0x110000,
sl@0
   263
		EMaxStringElements = 65536,
sl@0
   264
		EMaxStringIndices = 65536
sl@0
   265
		};
sl@0
   266
	CollationKey iCollationKey[EMaxCollationKeys];
sl@0
   267
	int iKeys;
sl@0
   268
	CollationIndex iCollationIndex[EMaxCollationIndices];
sl@0
   269
	int iIndices;
sl@0
   270
	int iStringElement[EMaxStringElements];
sl@0
   271
	int iStringElements;
sl@0
   272
	unsigned int iStringIndex[EMaxStringIndices];
sl@0
   273
	int iStringIndices;
sl@0
   274
	const char* iInputFileName;
sl@0
   275
	int iLineNumber;
sl@0
   276
	bool iSuppressCanonseqWarning;		// have we issued the canonseq warning yet?
sl@0
   277
	bool iWgl4;				// true if writing keys for wgl4 characters only
sl@0
   278
	bool iStandard;			// true if reading standard files, not tailoring files
sl@0
   279
	const char* iLocaleName;
sl@0
   280
	const char* iUidString;
sl@0
   281
	char* iCPlusPlusIdentifier;		// iLocaleName in title case with difficult characters removed
sl@0
   282
	};
sl@0
   283
sl@0
   284
// Return true if c is one of the characters 0-9, a-f or A-F.
bool isValidHexDigit(char c)
	{
	const bool decimal = c >= '0' && c <= '9';
	const bool lowerCase = c >= 'a' && c <= 'f';
	const bool upperCase = c >= 'A' && c <= 'F';
	return decimal || lowerCase || upperCase;
	}
sl@0
   294
sl@0
   295
// Write the command-line usage summary to cout. The last line is not newline-terminated.
void PrintUsage()
	{
	static const char* const KUsageText[] =
		{
		"Usage: coltab [/u<uid>] [/c] [/a] [/h<topic>] <locale>\n",
		"By Default (without /a option), for the locales 'standard' and 'wgl4' coltab reads basekeys.txt & compkeys.txt\n",
		"For any other locale name <name> coltab reads <name>_basekeys.txt,\n",
		"<name>_compkeys.txt and <name>_strings.txt.\n",
		"Use the /a option, for the locales 'standard' and 'wgl4' coltab reads allkeys.txt\n",
		"For any other locale name <name> coltab reads <name>_allkeys.txt.\n",
		"The output file is always ls_<name>.cpp.\n",
		"Use the /u option to specify the UID that the collation table should have.\n",
		"A hex number must follow /u immediately, for example /u800ACBDE\n",
		"this hex number must not exceed eight digits. If this is not specified,\n",
		"the output file will have to be edited to make it compilable.\n",
		"Specify /c to prefix the output with a Nokia copyright message.\n",
		"Specify /h for in-depth help."
		};
	const int lineCount = sizeof(KUsageText) / sizeof(KUsageText[0]);
	for (int i = 0; i < lineCount; ++i)
		cout << KUsageText[i];
	}
sl@0
   311
sl@0
   312
void UsageError()
sl@0
   313
	{
sl@0
   314
	PrintUsage();
sl@0
   315
	exit(1);
sl@0
   316
	}
sl@0
   317
sl@0
   318
void PrintHelp(char* aTopic)
sl@0
   319
	{
sl@0
   320
	int topic = 0;
sl@0
   321
	while ('0' <= *aTopic && *aTopic <= '9')
sl@0
   322
		{
sl@0
   323
		topic = topic * 10 + (*aTopic - '0');
sl@0
   324
		++aTopic;
sl@0
   325
		}
sl@0
   326
	switch(topic)
sl@0
   327
		{
sl@0
   328
	case 1:
sl@0
   329
		cout << "How Coltab interprets CANONSEQ:\n\n"\
sl@0
   330
			"If the CANONSEQ specifier is used in a line, Coltab will ignore the mapping.\n"\
sl@0
   331
			"This because, on the Symbian platform, any canonically composed character is\n"\
sl@0
   332
			"decomposed before the key mapping is applied, so characters with canonical\n"\
sl@0
   333
			"decompositions do not need keys. In files supplied by the Unicode Consortium,\n"\
sl@0
   334
			"all mappings for composed characters are flagged by CANONSEQ, so it is useful\n"\
sl@0
   335
			"if Coltab can just ignore these so that Unicode Consortium files can be used\n"\
sl@0
   336
			"unedited.\n\n"\
sl@0
   337
			"This can cause problems if a localizer copies a line from a Unicode file into,\n"\
sl@0
   338
			"say, the <lang>_strings.txt file, in order to give a mapping for an accented\n"\
sl@0
   339
			"character. The localizer replaces the composed character code with the\n"\
sl@0
   340
			"decomposition and changes the keys but forgets to remove the CANONSEQ\n"\
sl@0
   341
			"specifier. In this case the key would be ignored. Coltab provides a warning so\n"\
sl@0
   342
			"that this can be put right.\n\n"\
sl@0
   343
			"Coltab will only warn about the first CANONSEQ in each file, and does not warn\n"\
sl@0
   344
			"if the 'standard' or 'wgl4' options are used.";
sl@0
   345
		exit(1);
sl@0
   346
		break;
sl@0
   347
	case 2:
sl@0
   348
		cout << "How to ensure coltab's output files are compilable.\n\n"\
sl@0
   349
			"By default, Coltab's files for locales need to be edited before they are\n"\
sl@0
   350
			"compilable. The UID for the collation method needs to be filled in. This UID\n"\
sl@0
   351
			"is added so that the collation table can be searched for later. At present,\n"\
sl@0
   352
			"this UID is not necessary for the correct functioning of the Symbian platform\n"\
sl@0
   353
			"and so a value of 0 can be safely used.\n\n"\
sl@0
   354
			"To insert this value into the file directly, use the /u option, for example\n"\
sl@0
   355
			"coltab /u0 french\n"\
sl@0
   356
			"If the /u option is used, the file should be compilable as is. If it is not,\n"\
sl@0
   357
			"please raise it as a defect with Symbian's internationalization team,\n"\
sl@0
   358
			"supplying the files that caused the problem if this is possible.\n"\
sl@0
   359
			"If the 'standard' or 'wgl4' options are used, no UID is output, so the /u\n"\
sl@0
   360
			"option is not required.";
sl@0
   361
		exit(1);
sl@0
   362
		break;
sl@0
   363
	case 3:
sl@0
   364
		cout << "How to ensure collation key values are inside the supported range. \n\n"\
sl@0
   365
			"According to Unicode Standard, the range suppored by tool COLTAB:\n"\
sl@0
   366
			" Level 0 (primary):   0000 - FFFF, \n"\
sl@0
   367
			" Level 1 (Secondary): 0020 - 011E, \n"\
sl@0
   368
			" Level 2 (Tertiary):  0001 - 003F. \n"\
sl@0
   369
			"Please edit your collation files and make sure key values are inside the above range";
sl@0
   370
		exit(1);
sl@0
   371
		break;
sl@0
   372
	default:
sl@0
   373
		PrintUsage();
sl@0
   374
		cout << "\n\nSpecify /h1 for help on the use of CANONSEQ\n";
sl@0
   375
		cout << "Specify /h2 for help on making compilable files that do not need editing\n";
sl@0
   376
		exit(1);
sl@0
   377
		break;
sl@0
   378
		}
sl@0
   379
	}
sl@0
   380
sl@0
   381
/*
Compute the UTF-16 high (leading) surrogate for aCode.
For aCode in 0x10000..0x10FFFF the low 16 bits of the result are in 0xD800..0xDBFF.
The result is narrowed to short, so it reads as negative on platforms with 16-bit short.
*/
short HighSurrogate(int aCode)
	{
	// 0xD7C0 is 0xD800 - (0x10000 >> 10): the 0x10000 offset is folded into the base.
	const int leading = (aCode >> 10) + 0xD7C0;
	return static_cast<short>(leading);
	}
sl@0
   385
	
sl@0
   386
/*
Compute the UTF-16 low (trailing) surrogate for aCode: 0xDC00 combined with the
bottom ten bits of aCode, so the low 16 bits of the result are in 0xDC00..0xDFFF.
The result is narrowed to short, so it reads as negative on platforms with 16-bit short.
*/
short LowSurrogate(int aCode)
	{
	const int bottomTenBits = aCode & 0x3FF;
	// 0xDC00 has its bottom ten bits clear, so adding is the same as OR-ing here.
	return static_cast<short>(0xDC00 + bottomTenBits);
	}
sl@0
   390
sl@0
   391
int main(int argc,char** argv)
sl@0
   392
	{
sl@0
   393
	bool copyright = false;
sl@0
   394
	bool wgl4 = false;
sl@0
   395
	bool allKeys = false;
sl@0
   396
	const char* prefix = "";
sl@0
   397
	const char* infix = "";
sl@0
   398
	const char* locale = "";
sl@0
   399
	char* localeArg = 0;
sl@0
   400
	char* uidArg = 0;
sl@0
   401
	for (int i = 1; i < argc; ++i)
sl@0
   402
		{
sl@0
   403
		if (argv[i][0] == '/' || argv[i][0] == '-')
sl@0
   404
			{
sl@0
   405
			switch (argv[i][1])
sl@0
   406
				{
sl@0
   407
			case 'u':
sl@0
   408
			case 'U':
sl@0
   409
				{
sl@0
   410
				uidArg = argv[i] + 2;
sl@0
   411
				const char* uidCheck = uidArg;
sl@0
   412
				while (*uidCheck)
sl@0
   413
					{
sl@0
   414
					if (!isValidHexDigit(*uidCheck))
sl@0
   415
						UsageError();
sl@0
   416
					++uidCheck;
sl@0
   417
					}
sl@0
   418
				if (uidCheck == uidArg || 8 < uidCheck - uidArg)
sl@0
   419
					UsageError();
sl@0
   420
				break;
sl@0
   421
				}
sl@0
   422
			case 'c':
sl@0
   423
			case 'C':
sl@0
   424
				copyright = true;
sl@0
   425
				break;
sl@0
   426
			case 'a':
sl@0
   427
				allKeys = true;
sl@0
   428
				break;
sl@0
   429
			case 'h':
sl@0
   430
			case 'H':
sl@0
   431
				PrintHelp(argv[i] + 2);
sl@0
   432
				break;
sl@0
   433
			default:
sl@0
   434
				UsageError();
sl@0
   435
				break;
sl@0
   436
				}
sl@0
   437
			}
sl@0
   438
		else if (!localeArg)
sl@0
   439
			localeArg = argv[i];
sl@0
   440
		else
sl@0
   441
			UsageError();
sl@0
   442
		}
sl@0
   443
	if (!localeArg)
sl@0
   444
		UsageError();
sl@0
   445
	bool standard = false;
sl@0
   446
	if (!_stricmp(localeArg, "standard"))
sl@0
   447
		{
sl@0
   448
		locale = "Standard";
sl@0
   449
		standard = true;
sl@0
   450
		}
sl@0
   451
	else if (!_stricmp(localeArg, "wgl4"))
sl@0
   452
		{
sl@0
   453
		locale = "Wgl4";
sl@0
   454
		wgl4 = true;
sl@0
   455
		standard = true;
sl@0
   456
		}
sl@0
   457
	else
sl@0
   458
		{
sl@0
   459
		locale = prefix = localeArg;
sl@0
   460
		infix = "_";
sl@0
   461
		}
sl@0
   462
sl@0
   463
	Reader* reader = new Reader(wgl4, standard, locale, uidArg);
sl@0
   464
	if (!reader)
sl@0
   465
		{
sl@0
   466
		cout << "out of memory\n";
sl@0
   467
		exit(1);
sl@0
   468
		}
sl@0
   469
	char* filename = new char[strlen(prefix) + strlen(infix) + 64];
sl@0
   470
	if (allKeys == false)
sl@0
   471
		{
sl@0
   472
		sprintf(filename,"%s%scompkeys.txt",prefix,infix);
sl@0
   473
		reader->ReadCompKeys(filename);
sl@0
   474
		if (!standard)
sl@0
   475
			{
sl@0
   476
			sprintf(filename,"%s%sstrings.txt",prefix,infix);
sl@0
   477
			reader->ReadStrings(filename);
sl@0
   478
			}
sl@0
   479
		sprintf(filename,"%s%sbasekeys.txt",prefix,infix);
sl@0
   480
		reader->ReadBaseKeys(filename);
sl@0
   481
		}
sl@0
   482
	else
sl@0
   483
		{
sl@0
   484
		sprintf(filename,"%s%sAllKeys.txt",prefix,infix);
sl@0
   485
		reader->ReadAllKeys(filename);
sl@0
   486
		}
sl@0
   487
	sprintf(filename,"ls_%s.cpp", localeArg);
sl@0
   488
	reader->WriteOutput(filename, copyright);
sl@0
   489
sl@0
   490
	delete reader;
sl@0
   491
	delete [] filename;
sl@0
   492
	return 0;
sl@0
   493
	}
sl@0
   494
sl@0
   495
/*
Construct a Reader with empty tables and derive iCPlusPlusIdentifier from the locale name.

For the standard files (aStandard true, which main() also sets for wgl4) the identifier
is "Standard". Otherwise it is built from aLocaleName:
 - the first character is upper-cased if it is a letter A-Z; if it is not, 'C' is
   written first and that character is treated like any later one;
 - every later letter is lower-cased;
 - each run of characters outside a-z becomes a single underscore.

aLocaleName and aUidString are stored as pointers, not copied.
*/
Reader::Reader(bool aWgl4, bool aStandard,
	const char* aLocaleName, const char* aUidString):
	iKeys(0),
	iIndices(0),
	iStringElements(0),
	iStringIndices(0),
	iInputFileName(NULL),
	iLineNumber(0),
	iSuppressCanonseqWarning(false),
	iWgl4(aWgl4),
	iStandard(aStandard),
	iLocaleName(aLocaleName),
	iUidString(aUidString),
	iCPlusPlusIdentifier(NULL)
	{
	if (iStandard)
		{
		iCPlusPlusIdentifier = new char[9];
		strcpy(iCPlusPlusIdentifier, "Standard");
		return;
		}

	// Worst case: an inserted 'C', one output character per input character, and the terminator.
	char* p = iCPlusPlusIdentifier = new char[strlen(aLocaleName) + 2];

	// toupper/tolower must be given a value representable as unsigned char (or EOF),
	// so convert before calling in case plain char is signed.
	int current = toupper(static_cast<unsigned char>(aLocaleName[0]));
	if (current < 'A' || 'Z' < current)
		*p++ = 'C';
	else
		{
		*p++ = static_cast<char>(current);
		++aLocaleName;
		}
	bool inUnderScore = false;
	while (*aLocaleName)
		{
		current = tolower(static_cast<unsigned char>(*aLocaleName++));
		if (current < 'a' || 'z' < current)
			{
			// Collapse a run of non-letters into one underscore.
			if (!inUnderScore)
				{
				inUnderScore = true;
				*p++ = '_';
				}
			}
		else
			{
			inUnderScore = false;
			*p++ = static_cast<char>(current);
			}
		}
	*p = 0;
	}
sl@0
   544
sl@0
   545
// Release the identifier string allocated by the constructor.
Reader::~Reader()
	{
	delete [] iCPlusPlusIdentifier;
	}
sl@0
   549
sl@0
   550
// Get a hex number of exactly four digits from aString. Return -1 if none is found and aTolerate is true.
sl@0
   551
int Reader::Hex(const char *aString, int &aCharConsumed, bool aTolerate)
sl@0
   552
	{
sl@0
   553
	char *end;
sl@0
   554
	unsigned long x = strtoul(aString,&end,16);
sl@0
   555
	aCharConsumed = end - aString;
sl@0
   556
	if ((aCharConsumed != 4) && (aCharConsumed != 5) && (aCharConsumed != 6))
sl@0
   557
		{
sl@0
   558
		if (!aTolerate)
sl@0
   559
			{
sl@0
   560
			cout << "bad hex number on line " << iLineNumber << " of file " << iInputFileName << '\n';
sl@0
   561
			exit(1);
sl@0
   562
			}
sl@0
   563
		return -1;
sl@0
   564
		}
sl@0
   565
	return x;
sl@0
   566
	}
sl@0
   567
sl@0
   568
// Get a collation value from a string of the form [.xxxx.xxxx.xxxx.xxxx]
sl@0
   569
void Reader::GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey)
sl@0
   570
	{
sl@0
   571
	aCharConsumed = 0;
sl@0
   572
	const char *end = strchr(aString, ']');
sl@0
   573
	if (end != NULL){
sl@0
   574
		aCharConsumed = end - aString;
sl@0
   575
	}
sl@0
   576
	
sl@0
   577
	if (aString[0] != '[' || (aCharConsumed != 21 && aCharConsumed != 22 && aCharConsumed != 23))
sl@0
   578
		{
sl@0
   579
		cout << "syntax error on line " << iLineNumber << " of file " << iInputFileName << '\n';
sl@0
   580
		exit(1);
sl@0
   581
		}
sl@0
   582
	if (aKey == NULL)
sl@0
   583
		{
sl@0
   584
		if (iKeys >= EMaxCollationKeys)
sl@0
   585
			{
sl@0
   586
			cout << "too many keys";
sl@0
   587
			exit(1);
sl@0
   588
			}
sl@0
   589
		aKey = &iCollationKey[iKeys++];
sl@0
   590
		}
sl@0
   591
	aKey->iIgnorable = aString[1] == '*'; // asterisk means that this character is normally ignored
sl@0
   592
	int charConsumed = 0;
sl@0
   593
	for (int i = 0; i < CollationKey::ELevels; i++)
sl@0
   594
		aKey->iLevel[i] = Hex(aString + 2 + i * 5, charConsumed);
sl@0
   595
sl@0
   596
	if (aKey->iLevel[1] > 0 && (aKey->iLevel[1] < KLevel1Min || aKey->iLevel[1] > KLevel1Max))
sl@0
   597
		{
sl@0
   598
		aKey->iLevel[1] = KLevel1Max;
sl@0
   599
		cout << "illegal level-1 key value on line " << iLineNumber << "; outside the range " << KLevel1Min << ".." << KLevel1Max << "\n";
sl@0
   600
		cout << "Error: illegal key value in file, please see coltab /h3 for details.\n";
sl@0
   601
		exit(1);
sl@0
   602
		}
sl@0
   603
	
sl@0
   604
	if (aKey->iLevel[2] > 0 && (aKey->iLevel[2] < KLevel2Min || aKey->iLevel[2] > KLevel2Max))
sl@0
   605
		{
sl@0
   606
		cout << "illegal level-2 key value on line " << iLineNumber << "; outside the range " << KLevel2Min << ".." << KLevel2Max << "\n";
sl@0
   607
		cout << "Error: illegal key value in file, please see coltab /h3 for details.\n";
sl@0
   608
		exit(1);
sl@0
   609
		}
sl@0
   610
sl@0
   611
	aKey->iStop = true;
sl@0
   612
	}
sl@0
   613
sl@0
   614
void Reader::GetMultipleCollationKeys(const char* aString)
sl@0
   615
	{
sl@0
   616
	int keyCount = 0;
sl@0
   617
	int charConsumed =0;
sl@0
   618
	while (aString[0] == '[')
sl@0
   619
		{
sl@0
   620
		GetCollationKey(aString, charConsumed);
sl@0
   621
sl@0
   622
		keyCount++;
sl@0
   623
		iCollationKey[iKeys - 1].iStop = false;
sl@0
   624
		int length = strlen(aString);
sl@0
   625
		if (length <= charConsumed + 1)
sl@0
   626
			break;
sl@0
   627
		aString += charConsumed + 1;
sl@0
   628
		
sl@0
   629
		if (aString[0] == ' ') //a space is put between collation keys in keys files provided by previous Unicode Standard (i.e 3.1)
sl@0
   630
			aString++;
sl@0
   631
		
sl@0
   632
		}
sl@0
   633
	iCollationKey[iKeys - 1].iStop = true;
sl@0
   634
	}
sl@0
   635
sl@0
   636
/*
sl@0
   637
Partially parse a line, returning its key code and the start of its first block of key data.
sl@0
   638
Return false if it is not a data line, or not relevant.
sl@0
   639
*/
sl@0
   640
bool Reader::ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount)
sl@0
   641
	{
sl@0
   642
	int lineLength = strlen(aLine);
sl@0
   643
	int charConsumed = 0;
sl@0
   644
	aCodeCount = 0;
sl@0
   645
	aCode[0] = Hex(aLine,charConsumed,true);
sl@0
   646
sl@0
   647
	/*
sl@0
   648
	A data line must start with a hex number and be at least 27 characters long.
sl@0
   649
	Canonically decomposable Unicode characters are skipped.
sl@0
   650
	Skip non-WGL4 characters if doing WGL4 only.
sl@0
   651
	*/
sl@0
   652
	if (aCode[0] != -1)
sl@0
   653
		{
sl@0
   654
		aCodeCount = 1;
sl@0
   655
		if (!strcmp(aLine + lineLength - 8,"CANONSEQ"))
sl@0
   656
			{
sl@0
   657
			if (!iSuppressCanonseqWarning)
sl@0
   658
				{
sl@0
   659
				cout << "Warning: CANONSEQ used in file " << iInputFileName
sl@0
   660
					<< " on line " << iLineNumber << ".\nWarning: All mappings specifying CANONSEQ are ignored.\n"
sl@0
   661
					<< "Warning: Use coltab /h1 for more details.";
sl@0
   662
				iSuppressCanonseqWarning = true;
sl@0
   663
				}
sl@0
   664
			aCodeCount = 0;
sl@0
   665
			}
sl@0
   666
		else if (lineLength < 27 ||
sl@0
   667
			(iWgl4 && !InWgl4((unsigned int)aCode))) 
sl@0
   668
			aCodeCount = 0;
sl@0
   669
		}
sl@0
   670
sl@0
   671
	if (aCode[0] != -1)
sl@0
   672
		{
sl@0
   673
		// find '['
sl@0
   674
		aKeyStart = charConsumed;
sl@0
   675
		while (aKeyStart < lineLength && aLine[aKeyStart] != '[')
sl@0
   676
			aKeyStart++;
sl@0
   677
sl@0
   678
		// read all hex before '['
sl@0
   679
		int index = charConsumed + 1;
sl@0
   680
		while (index < aKeyStart)
sl@0
   681
			{
sl@0
   682
			aCode[aCodeCount] = Hex(aLine+index, charConsumed, true);
sl@0
   683
			if (aCode[aCodeCount] == -1)
sl@0
   684
				break;
sl@0
   685
sl@0
   686
			index += charConsumed + 1;
sl@0
   687
			aCodeCount++;
sl@0
   688
			}
sl@0
   689
sl@0
   690
		// find number of collation keys
sl@0
   691
		aKeyCount = 0;
sl@0
   692
		index = aKeyStart;
sl@0
   693
		while (index < lineLength && aLine[index] != '%' && aLine[index] != '#')
sl@0
   694
			{
sl@0
   695
			if (aLine[index] == '[')
sl@0
   696
				aKeyCount++;
sl@0
   697
			index++;
sl@0
   698
			}
sl@0
   699
		}
sl@0
   700
sl@0
   701
	return aCodeCount > 0;
sl@0
   702
	}
sl@0
   703
sl@0
   704
void Reader::AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart)
sl@0
   705
	{
sl@0
   706
	if (iIndices >= EMaxCollationIndices)
sl@0
   707
		{
sl@0
   708
		cout << "too many Unicode values";
sl@0
   709
		exit(1);
sl@0
   710
		}
sl@0
   711
	CollationIndex& index = iCollationIndex[iIndices++];
sl@0
   712
	index.iCode = aCode;
sl@0
   713
	index.iIndex = -1;
sl@0
   714
sl@0
   715
	/*
sl@0
   716
	First try to find the key in the array of keys found so far.
sl@0
   717
	Search backwards to use the fact that runs of the same key occur together.
sl@0
   718
	*/
sl@0
   719
	CollationKey key;
sl@0
   720
	int charConsumed = 0;
sl@0
   721
	GetCollationKey(aLine + aKeyStart, charConsumed, &key);
sl@0
   722
	for (int i = iKeys - 1; i >= 0 && index.iIndex == -1; i--)
sl@0
   723
		if (iCollationKey[i] == key)
sl@0
   724
			index.iIndex = i;
sl@0
   725
sl@0
   726
	// If that fails, add a new key.
sl@0
   727
	if (index.iIndex == -1)
sl@0
   728
		{
sl@0
   729
		index.iIndex = iKeys++;
sl@0
   730
		if (iKeys > EMaxCollationKeys)
sl@0
   731
			{
sl@0
   732
			cout << "too many keys";
sl@0
   733
			exit(1);
sl@0
   734
			} 
sl@0
   735
		iCollationKey[index.iIndex] = key;
sl@0
   736
		}
sl@0
   737
	}
sl@0
   738
/*
sl@0
   739
Read 1-to-1 mapping. Sample:
sl@0
   740
02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME
sl@0
   741
sl@0
   742
aCombinedFile = true: aFileName is combined file, which contains base keys, comp keys, and string keys.
sl@0
   743
*/
sl@0
   744
void Reader::ReadBaseKeys(const char* aFileName)
sl@0
   745
	{
sl@0
   746
	iSuppressCanonseqWarning = iStandard || iWgl4;
sl@0
   747
	iLineNumber = 0;
sl@0
   748
	iInputFileName = aFileName;
sl@0
   749
	ifstream input_file;
sl@0
   750
sl@0
   751
#ifdef __MSVCDOTNET__
sl@0
   752
	input_file.open(iInputFileName, ios::in);
sl@0
   753
#else //!__MSVCDOTNET__
sl@0
   754
	input_file.open(iInputFileName, ios::in | ios::nocreate);
sl@0
   755
#endif //__MSVCDOTNET__
sl@0
   756
sl@0
   757
	if (input_file.fail())
sl@0
   758
		{
sl@0
   759
		cout << "cannot open input file '" << iInputFileName << "'\n";
sl@0
   760
		exit(1);
sl@0
   761
		}
sl@0
   762
	cout << "reading base keys from '" << iInputFileName << "'\n";
sl@0
   763
sl@0
   764
	char line[1024];
sl@0
   765
	for (;;)
sl@0
   766
		{
sl@0
   767
		input_file.getline(line,sizeof(line));
sl@0
   768
		if (input_file.eof())
sl@0
   769
			break;
sl@0
   770
		iLineNumber++;
sl@0
   771
		// line number counting
sl@0
   772
		if (iLineNumber % 100 == 0)
sl@0
   773
			{
sl@0
   774
			cout << "line " << iLineNumber << '\n';
sl@0
   775
			cout.flush();
sl@0
   776
			}
sl@0
   777
		int code[16];
sl@0
   778
		int codeCount = 0;
sl@0
   779
		int key_start = 0;
sl@0
   780
		int keyCount = 0;
sl@0
   781
		if (ParseLine(line, code, codeCount, key_start, keyCount)) 
sl@0
   782
			{
sl@0
   783
			if (codeCount != 1 || keyCount != 1)
sl@0
   784
				continue;	// goto next line
sl@0
   785
			AddKeyOneToOne(line, code[0], key_start);
sl@0
   786
			}
sl@0
   787
		}
sl@0
   788
sl@0
   789
	input_file.close();
sl@0
   790
	}
sl@0
   791
sl@0
   792
void Reader::AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart)
sl@0
   793
	{
sl@0
   794
	if (iIndices >= EMaxCollationIndices)
sl@0
   795
		{
sl@0
   796
		cout << "too many Unicode values";
sl@0
   797
		exit(1);
sl@0
   798
		}
sl@0
   799
	CollationIndex& index = iCollationIndex[iIndices++];
sl@0
   800
	index.iCode = aCode;
sl@0
   801
	index.iIndex = iKeys;
sl@0
   802
	GetMultipleCollationKeys(aLine + aKeyStart);
sl@0
   803
	}
sl@0
   804
/*
sl@0
   805
Read 1-to-much mapping.
sl@0
   806
3303  ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN
sl@0
   807
*/
sl@0
   808
void Reader::ReadCompKeys(const char* aFileName)
sl@0
   809
	{
sl@0
   810
	iSuppressCanonseqWarning = iStandard || iWgl4;
sl@0
   811
	iLineNumber = 0;
sl@0
   812
	iInputFileName = aFileName;
sl@0
   813
	ifstream input_file;
sl@0
   814
sl@0
   815
#ifdef __MSVCDOTNET__
sl@0
   816
	input_file.open(iInputFileName, ios::in);
sl@0
   817
#else //!__MSVCDOTNET__
sl@0
   818
	input_file.open(iInputFileName, ios::in | ios::nocreate);
sl@0
   819
#endif //__MSVCDOTNET__
sl@0
   820
sl@0
   821
	if (input_file.fail())
sl@0
   822
		{
sl@0
   823
		cout << "there are no composite keys; '" << iInputFileName << "' not found\n";
sl@0
   824
		return;
sl@0
   825
		}
sl@0
   826
	cout << "reading composite keys from '" << iInputFileName << "'\n";
sl@0
   827
sl@0
   828
	char line[1024];
sl@0
   829
	for (;;)
sl@0
   830
		{
sl@0
   831
		input_file.getline(line,sizeof(line));
sl@0
   832
		if (input_file.eof())
sl@0
   833
			break;
sl@0
   834
		iLineNumber++;
sl@0
   835
		// line number counting
sl@0
   836
		if (iLineNumber % 100 == 0)
sl@0
   837
			{
sl@0
   838
			cout << "line " << iLineNumber << '\n';
sl@0
   839
			cout.flush();
sl@0
   840
			}
sl@0
   841
		int code[16];
sl@0
   842
		int codeCount = 0;
sl@0
   843
		int key_start = 0;
sl@0
   844
		int keyCount = 0;
sl@0
   845
		if (ParseLine(line, code, codeCount, key_start, keyCount)) 
sl@0
   846
			{
sl@0
   847
			if (codeCount != 1 || keyCount < 2)
sl@0
   848
				continue;	// goto next line
sl@0
   849
			AddKeyOneToMuch(line, code[0], key_start);
sl@0
   850
			}
sl@0
   851
		}
sl@0
   852
sl@0
   853
	input_file.close();
sl@0
   854
	}
sl@0
   855
sl@0
   856
sl@0
   857
void Reader::AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart)
sl@0
   858
	{
sl@0
   859
sl@0
   860
	// Store the index to the Unicode string and the key sequence.
sl@0
   861
	if (iStringIndices > EMaxStringIndices)
sl@0
   862
		{
sl@0
   863
		cout << "too many string indices";
sl@0
   864
		exit(1);
sl@0
   865
		}
sl@0
   866
	iStringIndex[iStringIndices++] = (iStringElements << 16) | iKeys;
sl@0
   867
sl@0
   868
	// Reserve space for the length.
sl@0
   869
	if (iStringElements >= EMaxStringElements)
sl@0
   870
		{
sl@0
   871
		cout << "too many string elements";
sl@0
   872
		exit(1);
sl@0
   873
		}
sl@0
   874
	iStringElements++;
sl@0
   875
sl@0
   876
	// Read the Unicode string.
sl@0
   877
	int length = 0;		// in unit of int16
sl@0
   878
	int charCount = 0;	// in unit of char. for debug.
sl@0
   879
sl@0
   880
	for (int i=0; i<aCodeCount; i++)
sl@0
   881
		{	
sl@0
   882
		if (iStringElements >= EMaxStringElements)
sl@0
   883
			{
sl@0
   884
			cout << "too many string elements";
sl@0
   885
			exit(1);
sl@0
   886
			}
sl@0
   887
	
sl@0
   888
		if (aCode[i] > 0xFFFF)
sl@0
   889
			{
sl@0
   890
			// UCS4 --> UTF-16
sl@0
   891
			iStringElement[iStringElements++] = 0xD7C0 + (aCode[i] >> 10);
sl@0
   892
			iStringElement[iStringElements++] = 0xDC00 | (aCode[i] & 0x3FF);
sl@0
   893
			length += 2;
sl@0
   894
			}
sl@0
   895
		else
sl@0
   896
			{
sl@0
   897
			iStringElement[iStringElements++] = aCode[i];
sl@0
   898
			length++;
sl@0
   899
			}
sl@0
   900
		charCount++;
sl@0
   901
		}
sl@0
   902
sl@0
   903
	iStringElement[iStringElements - length - 1] = (unsigned int)length;
sl@0
   904
sl@0
   905
	// Read the key sequence.
sl@0
   906
	GetMultipleCollationKeys(aLine + aKeyStart);
sl@0
   907
	}
sl@0
   908
/*
sl@0
   909
Read much-to-much mapping. Sample:
sl@0
   910
004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke
sl@0
   911
0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # <THAI CHARACTER SARA E, THAI CHARACTER CHO CHAN>
sl@0
   912
*/
sl@0
   913
void Reader::ReadStrings(const char* aFileName)
sl@0
   914
	{
sl@0
   915
	iSuppressCanonseqWarning = iStandard || iWgl4;
sl@0
   916
	iLineNumber = 0;
sl@0
   917
	iInputFileName = aFileName;
sl@0
   918
	ifstream input_file;
sl@0
   919
sl@0
   920
#ifdef __MSVCDOTNET__
sl@0
   921
	input_file.open(iInputFileName, ios::in);
sl@0
   922
#else //!__MSVCDOTNET__
sl@0
   923
	input_file.open(iInputFileName, ios::in | ios::nocreate);
sl@0
   924
#endif //__MSVCDOTNET__
sl@0
   925
sl@0
   926
	if (input_file.fail())
sl@0
   927
		{
sl@0
   928
		cout << "there are no strings; '" << iInputFileName << "' not found\n";
sl@0
   929
		return;
sl@0
   930
		}
sl@0
   931
	cout << "reading strings from '" << iInputFileName << "'\n";
sl@0
   932
sl@0
   933
	char line[1024];
sl@0
   934
	for (;;)
sl@0
   935
		{
sl@0
   936
		input_file.getline(line,sizeof(line));
sl@0
   937
		if (input_file.eof())
sl@0
   938
			break;
sl@0
   939
		iLineNumber++;
sl@0
   940
		// line number counting
sl@0
   941
		if (iLineNumber % 100 == 0)
sl@0
   942
			{
sl@0
   943
			cout << "line " << iLineNumber << '\n';
sl@0
   944
			cout.flush();
sl@0
   945
			}
sl@0
   946
		int code[16];
sl@0
   947
		int codeCount = 0;
sl@0
   948
		int key_start = 0;
sl@0
   949
		int keyCount = 0;
sl@0
   950
		if (ParseLine(line, code, codeCount, key_start, keyCount)) 
sl@0
   951
			{
sl@0
   952
			if (codeCount < 2 || keyCount < 1)
sl@0
   953
				continue;	// goto next line
sl@0
   954
			AddKeyMuchToMuch(line, code, codeCount, key_start);
sl@0
   955
			}
sl@0
   956
		}
sl@0
   957
sl@0
   958
	input_file.close();
sl@0
   959
	}
sl@0
   960
sl@0
   961
/*
sl@0
   962
Read combined key table. Sample:
sl@0
   963
1-to-1 mapping:
sl@0
   964
02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME
sl@0
   965
sl@0
   966
1-to-much mapping:
sl@0
   967
3303  ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN
sl@0
   968
sl@0
   969
much-to-much mapping:
sl@0
   970
004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke
sl@0
   971
0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # <THAI CHARACTER SARA E, THAI CHARACTER CHO CHAN>
sl@0
   972
*/
sl@0
   973
void Reader::ReadAllKeys(const char* aFileName)
sl@0
   974
	{
sl@0
   975
	iSuppressCanonseqWarning = iStandard || iWgl4;
sl@0
   976
	iLineNumber = 0;
sl@0
   977
	iInputFileName = aFileName;
sl@0
   978
	ifstream input_file;
sl@0
   979
sl@0
   980
#ifdef __MSVCDOTNET__
sl@0
   981
	input_file.open(iInputFileName, ios::in);
sl@0
   982
#else //!__MSVCDOTNET__
sl@0
   983
	input_file.open(iInputFileName, ios::in | ios::nocreate);
sl@0
   984
#endif //__MSVCDOTNET__
sl@0
   985
sl@0
   986
	if (input_file.fail())
sl@0
   987
		{
sl@0
   988
		cout << "there are no keys; '" << iInputFileName << "' not found\n";
sl@0
   989
		return;
sl@0
   990
		}
sl@0
   991
	cout << "reading all keys from '" << iInputFileName << "'\n";
sl@0
   992
sl@0
   993
	char line[1024];
sl@0
   994
	for (;;)
sl@0
   995
		{
sl@0
   996
		if (input_file.eof())
sl@0
   997
			break;
sl@0
   998
		input_file.getline(line,sizeof(line));
sl@0
   999
		iLineNumber++;
sl@0
  1000
sl@0
  1001
		int code[16];
sl@0
  1002
		int codeCount = 0;
sl@0
  1003
		int key_start = 0;
sl@0
  1004
		int keyCount = 0;
sl@0
  1005
		if (ParseLine(line, code, codeCount, key_start, keyCount)) 
sl@0
  1006
			{
sl@0
  1007
			if (codeCount == 1 && keyCount == 1)
sl@0
  1008
				AddKeyOneToOne(line, code[0], key_start);
sl@0
  1009
			else if (codeCount == 1 && keyCount > 1)
sl@0
  1010
				AddKeyOneToMuch(line, code[0], key_start);
sl@0
  1011
			else if (codeCount > 1 && keyCount > 0)
sl@0
  1012
			AddKeyMuchToMuch(line, code, codeCount, key_start);
sl@0
  1013
			else
sl@0
  1014
				cout << "ignore line: " << line << "\n";
sl@0
  1015
			}
sl@0
  1016
		}
sl@0
  1017
sl@0
  1018
	input_file.close();
sl@0
  1019
	}
sl@0
  1020
sl@0
  1021
sl@0
  1022
// Pack the 3 collation key levels into a single 32-bit integer.
sl@0
  1023
unsigned int Reader::PackKey(const CollationKey& aValue)
sl@0
  1024
	{
sl@0
  1025
	unsigned int level0 = aValue.iLevel[0];
sl@0
  1026
	unsigned int level1 = aValue.iLevel[1];
sl@0
  1027
	if (level1 > 0)
sl@0
  1028
		level1 -= (KLevel1Min - 1);
sl@0
  1029
	unsigned int level2 = aValue.iLevel[2];
sl@0
  1030
	if (level2 > 0)
sl@0
  1031
		level2 -= (KLevel2Min - 1);
sl@0
  1032
	unsigned int key = level0 << 16 | level1 << 8 | level2 << 2;
sl@0
  1033
	if (aValue.iIgnorable)
sl@0
  1034
		key |= 2;
sl@0
  1035
	if (aValue.iStop)
sl@0
  1036
		key |= 1;
sl@0
  1037
	return key;
sl@0
  1038
	}
sl@0
  1039
sl@0
  1040
// Pack a collation index value into a single 32-bit integer.
sl@0
  1041
int Reader::PackIndex(const CollationIndex& aValue, unsigned int result[2])
sl@0
  1042
	{
sl@0
  1043
	unsigned int code = aValue.iCode;
sl@0
  1044
	unsigned int index = aValue.iIndex;
sl@0
  1045
	if (code <= 0xFFFF)
sl@0
  1046
		{
sl@0
  1047
		result[0] = (code << 16 | index);
sl@0
  1048
		return 1;
sl@0
  1049
		}
sl@0
  1050
	else
sl@0
  1051
		{
sl@0
  1052
		result[0] = (::HighSurrogate(code) << 16 | index);
sl@0
  1053
		result[1] = (::LowSurrogate(code) << 16 | index);
sl@0
  1054
		return 2;
sl@0
  1055
		}
sl@0
  1056
	}
sl@0
  1057
sl@0
  1058
const Reader* TheReader;
sl@0
  1059
static int CompareStringIndices(const void* aIndex1,const void* aIndex2)
sl@0
  1060
	{
sl@0
  1061
	return TheReader->CompareStringIndices(*(unsigned int*)aIndex1 >> 16,*(unsigned int*)aIndex2 >> 16);
sl@0
  1062
	}
sl@0
  1063
sl@0
  1064
/*
Compare two strings of code values element by element.

Positions past the end of the shorter string count as -1, so a string that is
a proper prefix of the other sorts first. Returns the difference between the
first pair of unequal elements (first minus second), or 0 if the strings are
equal.
*/
int CompareUnicodeStrings(const int *aString1,int aLength1,const int *aString2,int aLength2)
	{
	const int longest = aLength1 > aLength2 ? aLength1 : aLength2;
	for (int pos = 0; pos < longest; ++pos)
		{
		const int left = pos < aLength1 ? aString1[pos] : -1;
		const int right = pos < aLength2 ? aString2[pos] : -1;
		if (left != right)
			return left - right;
		}
	return 0;
	}
sl@0
  1075
sl@0
  1076
int Reader::CompareStringIndices(int aIndex1,int aIndex2) const
sl@0
  1077
	{
sl@0
  1078
	return CompareUnicodeStrings(iStringElement + aIndex1 + 1,iStringElement[aIndex1],
sl@0
  1079
								 iStringElement + aIndex2 + 1,iStringElement[aIndex2]);
sl@0
  1080
	}
sl@0
  1081
sl@0
  1082
void Reader::WriteOutput(const char* aFileName, bool aCopyright)
sl@0
  1083
	{
sl@0
  1084
	int i;
sl@0
  1085
	ofstream output_file;
sl@0
  1086
	output_file.open(aFileName);
sl@0
  1087
	if (output_file.fail())
sl@0
  1088
		{
sl@0
  1089
		cout << "cannot open output file '" << aFileName << "'\n";
sl@0
  1090
		exit(1);
sl@0
  1091
		}
sl@0
  1092
	cout << "writing output to '" << aFileName << "'\n";
sl@0
  1093
sl@0
  1094
	char *locale = NULL;
sl@0
  1095
	if (iStandard)
sl@0
  1096
		locale = _strdup("Standard");
sl@0
  1097
	else
sl@0
  1098
		locale = _strdup(iLocaleName);
sl@0
  1099
sl@0
  1100
	if (!iStandard)
sl@0
  1101
		{
sl@0
  1102
		_strlwr(locale);
sl@0
  1103
		locale[0] = (char)toupper(locale[0]);
sl@0
  1104
		if (aCopyright)
sl@0
  1105
			{
sl@0
  1106
			char* capsFileName = new char[strlen(aFileName) + 1];
sl@0
  1107
			strcpy(capsFileName, aFileName);
sl@0
  1108
			_strupr(capsFileName);
sl@0
  1109
			output_file << "/*\n" << capsFileName << "\n\nCopyright (C) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.\n*/\n";
sl@0
  1110
			delete [] capsFileName;
sl@0
  1111
			output_file << "\n/*\nThe LCharSet object used by the " << locale << " locale.\n";
sl@0
  1112
			output_file << "Generated by COLTAB.\n*/\n";
sl@0
  1113
			}
sl@0
  1114
sl@0
  1115
		output_file << "\n#include \"ls_std.h\"\n#include <collate.h>\n";
sl@0
  1116
		output_file << "\nconst TUint KUid" << iCPlusPlusIdentifier << "CollationMethod = ";
sl@0
  1117
		if (iUidString)
sl@0
  1118
			output_file << "0x" << iUidString << ";\n";
sl@0
  1119
		else
sl@0
  1120
			{
sl@0
  1121
			output_file << "/* FILL THIS IN */;\n";
sl@0
  1122
			cout << "Warning: File will need editing\nWarning: see coltab /h2 for details.\n";
sl@0
  1123
			}
sl@0
  1124
		}
sl@0
  1125
sl@0
  1126
	/*
sl@0
  1127
	Write the unique collation keys.
sl@0
  1128
	Each one has the format, going from highest to lowest bit:
sl@0
  1129
sl@0
  1130
	16 bits:	level-0 key
sl@0
  1131
	8 bits:		level-1 key
sl@0
  1132
	6 bits:		level-2 key
sl@0
  1133
	1 bit:		set if this key is optionally ignorable
sl@0
  1134
	1 bit:		set if this is the last key in the string of keys for a single Unicode value
sl@0
  1135
sl@0
  1136
	*/
sl@0
  1137
	if (iKeys != 0)
sl@0
  1138
		{
sl@0
  1139
		output_file << "\nstatic const TUint32 The" << iCPlusPlusIdentifier << "Key[] = \n\t{";
sl@0
  1140
		CollationKey* ck = iCollationKey;
sl@0
  1141
		output_file << "\t // " << iKeys << " keys";
sl@0
  1142
		output_file << hex;
sl@0
  1143
		for (i = 0; i < iKeys; i++, ck++)
sl@0
  1144
			{
sl@0
  1145
			unsigned int key = PackKey(*ck);
sl@0
  1146
			if (i % 8 == 0)
sl@0
  1147
				output_file << "\n\t";
sl@0
  1148
			output_file << "0x";
sl@0
  1149
			output_file << key << ",";
sl@0
  1150
			}
sl@0
  1151
		output_file << dec;
sl@0
  1152
		output_file << "\n\t};\n\n";
sl@0
  1153
		}
sl@0
  1154
sl@0
  1155
	if (iIndices != 0)
sl@0
  1156
		{
sl@0
  1157
		// Sort then write the collation index values - these relate Unicode values to collation keys.
sl@0
  1158
		qsort(iCollationIndex,iIndices,sizeof(CollationIndex),CollationIndex::Compare);
sl@0
  1159
		output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "Index[] = \n\t{";
sl@0
  1160
		CollationIndex* ci = iCollationIndex;
sl@0
  1161
		int entry=0;
sl@0
  1162
		output_file << "\t // " << iIndices << " indices";
sl@0
  1163
		output_file << hex;
sl@0
  1164
		for (i = 0; i < iIndices; i++, ci++, entry++)
sl@0
  1165
			{
sl@0
  1166
			unsigned int key[2];
sl@0
  1167
			int bytecount = PackIndex(*ci, key);
sl@0
  1168
sl@0
  1169
			if (entry % 8 == 0)
sl@0
  1170
				output_file << "\n\t";
sl@0
  1171
			output_file << "0x";
sl@0
  1172
			output_file << key[0] << ",";
sl@0
  1173
sl@0
  1174
			if (bytecount == 2)
sl@0
  1175
				{
sl@0
  1176
				entry++;
sl@0
  1177
				if (entry % 8 == 0)
sl@0
  1178
					output_file << "\n\t";
sl@0
  1179
				output_file << "0x";
sl@0
  1180
				output_file << key[1] << ",";
sl@0
  1181
				}
sl@0
  1182
			}
sl@0
  1183
		output_file << dec;
sl@0
  1184
		output_file << "\n\t};";
sl@0
  1185
		output_file << "\t // " << entry << " entries";
sl@0
  1186
		output_file << "\n\n";
sl@0
  1187
		iIndices = entry; //One surrogate pair occupies 2 entries 
sl@0
  1188
		}
sl@0
  1189
sl@0
  1190
	if (iStringElements)
sl@0
  1191
		{
sl@0
  1192
		// Write the Unicode strings; these are preceded by their lengths.
sl@0
  1193
		output_file << "static const TUint16 The" << iCPlusPlusIdentifier << "StringElement[] = \n\t{";
sl@0
  1194
		output_file << hex;
sl@0
  1195
		for (i = 0; i < iStringElements; i++)
sl@0
  1196
			{
sl@0
  1197
			if (i % 8 == 0)
sl@0
  1198
				output_file << "\n\t";
sl@0
  1199
			output_file << "0x" << iStringElement[i] << ",";
sl@0
  1200
			}
sl@0
  1201
		output_file << dec;
sl@0
  1202
		if (iStringElements==0)
sl@0
  1203
			output_file << "0";
sl@0
  1204
		output_file << "\n\t};\n\n";
sl@0
  1205
sl@0
  1206
		/*
sl@0
  1207
		Sort then write the string index values - these relate Unicode strings to collation keys.
sl@0
  1208
		Each one has the string index in the upper word and the key index in the lower word.
sl@0
  1209
		*/
sl@0
  1210
		TheReader = this;
sl@0
  1211
		qsort(iStringIndex,iStringIndices,sizeof(iStringIndex[0]),::CompareStringIndices);
sl@0
  1212
		output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "StringIndex[] = \n\t{";
sl@0
  1213
		output_file << hex;
sl@0
  1214
		for (i = 0; i < iStringIndices; i++)
sl@0
  1215
			{
sl@0
  1216
			if (i % 8 == 0)
sl@0
  1217
				output_file << "\n\t";
sl@0
  1218
			output_file << "0x" << iStringIndex[i] << ",";
sl@0
  1219
			}
sl@0
  1220
		output_file << dec;
sl@0
  1221
		if (iStringIndices ==0)
sl@0
  1222
			output_file << "0";
sl@0
  1223
		output_file << "\n\t};\n\n";
sl@0
  1224
		}
sl@0
  1225
sl@0
  1226
	// Write the collation table structure.
sl@0
  1227
	output_file << "static const TCollationKeyTable The" << iCPlusPlusIdentifier << "Table = \n\t{ ";
sl@0
  1228
	if (iKeys)
sl@0
  1229
		output_file << "The" << iCPlusPlusIdentifier << "Key";
sl@0
  1230
	else
sl@0
  1231
		output_file << "0";
sl@0
  1232
	if (iIndices)
sl@0
  1233
		output_file << ", The" << iCPlusPlusIdentifier << "Index, " << iIndices;
sl@0
  1234
	else
sl@0
  1235
		output_file << ", 0, 0";
sl@0
  1236
	if (iStringElements)
sl@0
  1237
		output_file << ", The" << iCPlusPlusIdentifier << "StringElement, The" << iCPlusPlusIdentifier << "StringIndex, " << iStringIndices << " };\n";
sl@0
  1238
	else
sl@0
  1239
		output_file << ", 0, 0, 0 };\n";
sl@0
  1240
sl@0
  1241
	if (!iStandard)
sl@0
  1242
		output_file << "\nstatic const TCollationMethod TheCollationMethod[] = \n"\
sl@0
  1243
			"	{\n"\
sl@0
  1244
			"		{\n"\
sl@0
  1245
			"		KUid" << iCPlusPlusIdentifier << "CollationMethod, // the method for the locale\n"\
sl@0
  1246
			"		NULL, // use the standard table as the main table\n"\
sl@0
  1247
			"		&The" << iCPlusPlusIdentifier << "Table, // the locale values override the standard values\n"\
sl@0
  1248
			"		0 // the flags are standard\n"\
sl@0
  1249
			"		},\n"\
sl@0
  1250
			"		{\n"\
sl@0
  1251
			"		KUidBasicCollationMethod, // the standard unlocalised method\n"\
sl@0
  1252
			"		NULL, // null means use the standard table\n"\
sl@0
  1253
			"		NULL, // there's no override table\n"\
sl@0
  1254
			"		0 // the flags are standard\n"\
sl@0
  1255
			"		}\n"\
sl@0
  1256
			"	};\n"\
sl@0
  1257
			"\n"\
sl@0
  1258
			"static const TCollationDataSet TheCollationDataSet =\n"\
sl@0
  1259
			"	{\n"\
sl@0
  1260
			"	TheCollationMethod,\n"\
sl@0
  1261
			"	2\n"\
sl@0
  1262
			"	};"\
sl@0
  1263
			"\n\n"\
sl@0
  1264
			"// The one and only locale character set object.\n"\
sl@0
  1265
			"const LCharSet TheCharSet =\n"\
sl@0
  1266
			"	{\n"\
sl@0
  1267
			"	NULL,\n"\
sl@0
  1268
			"	&TheCollationDataSet\n"\
sl@0
  1269
			"	};\n";
sl@0
  1270
sl@0
  1271
	output_file.close();
sl@0
  1272
	delete [] locale;
sl@0
  1273
	}
sl@0
  1274
sl@0
  1275
int CollationIndex::Compare(const void* aIndex1,const void* aIndex2)
sl@0
  1276
	{
sl@0
  1277
	return ((CollationIndex*)aIndex1)->iCode - ((CollationIndex*)aIndex2)->iCode;
sl@0
  1278
	}