os/textandloc/localisation/localesupport/coltab/COLTAB.CPP
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/textandloc/localisation/localesupport/coltab/COLTAB.CPP	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,1278 @@
     1.4 +// Copyright (c) 1999-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.5 +// All rights reserved.
     1.6 +// This component and the accompanying materials are made available
     1.7 +// under the terms of "Eclipse Public License v1.0"
     1.8 +// which accompanies this distribution, and is available
     1.9 +// at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.10 +//
    1.11 +// Initial Contributors:
    1.12 +// Nokia Corporation - initial contribution.
    1.13 +//
    1.14 +// Contributors:
    1.15 +//
    1.16 +// Description:
    1.17 +// Reads and parses the Unicode collation value table and writes out a C++ source file
    1.18 +// containing the data in a form that can be used by the EPOC collation system.
    1.19 +//
    1.20 +// The program reads either three separate files or one combined file:
    1.21 +//
    1.22 +// Three files (by default):
    1.23 +// 1. Base keys (maps single Unicode values to single collation key values): must be in the same format as
    1.24 +// basekeys.txt, supplied with the Standard Unicode Collation system
    1.25 +//
    1.26 +// 2. Composite keys (maps single Unicode values to strings of collation keys): must be in the same format as
    1.27 +// compkeys.txt, supplied with the Standard Unicode Collation system
    1.28 +//
    1.29 +// 3. Strings (maps strings of Unicode values to single collation keys OR strings of collation keys): must be in the
    1.30 +// same format as compkeys.txt, except that there can be any number of Unicode characters at the start of the line,
    1.31 +// space-separated and each exactly 4 hex digits.
    1.32 +//
    1.33 +// One combined file (with option /a):
    1.34 +// 1. All Keys (combine above three files into one file): must be in the same format as allkeys.txt, supplied with the Standard Unicode Collation system (after Unicode 3.0).
    1.35 +//
    1.36 +//
    1.37 +
    1.38 +
    1.39 +#include <assert.h>
    1.40 +#include <ctype.h>
    1.41 +
    1.42 +#ifdef __MSVCDOTNET__
    1.43 +#include <fstream>
    1.44 +#include <iostream>
    1.45 +using namespace std;
    1.46 +#else //!__MSVCDOTNET__
    1.47 +#include <fstream.h>
    1.48 +#include <iostream.h>
    1.49 +#endif //__MSVCDOTNET__
    1.50 +
    1.51 +#include <stdlib.h>
    1.52 +#include <string.h>
    1.53 +#include <stdio.h>
    1.54 +
    1.55 +/*
    1.56 +Constants constraining the range of level-1 and level-2 keys so that they can be packed.
    1.57 +Non-zero values are reduced by one less than the minimum value.
    1.58 +*/
    1.59 +const unsigned int KLevel1Bits = 8;
    1.60 +const unsigned int KLevel1Min = 0x20;
    1.61 +const unsigned int KLevel1Max = KLevel1Min + (1 << KLevel1Bits) - 2;
    1.62 +const unsigned int KLevel2Bits = 6;
    1.63 +const unsigned int KLevel2Min = 1;
    1.64 +const unsigned int KLevel2Max = KLevel2Min + (1 << KLevel2Bits) - 2;
    1.65 +
    1.66 +/*
    1.67 +Table of characters in the WGL4 set, plus characters in canonical decompositions of
    1.68 +those characters, plus commonly used control characters and space characters,
    1.69 +given as ranges of Unicode characters. In each pair, the first code is the first in the range,
    1.70 +and the second is the first code NOT in the range.
    1.71 +
    1.72 +The extra characters are added mainly to ensure that control characters and spaces are
    1.73 +normally ignored. The extra characters are:
    1.74 +
    1.75 +0x0000-0x001F: ASCII control characters
    1.76 +0x2000-0x2012: spaces, hyphen variants, figure dash
    1.77 +0x2028-0x202E: line and paragraph separator, bidirectional control characters
    1.78 +0xFEFF		 : byte-order mark
    1.79 +0xFFFC-0xFFFD: object replacement character, replacement character
    1.80 +*/
    1.81 +const unsigned int Wgl4Range[] =
    1.82 +	{
    1.83 +	0x00, 0x7f,		// All ASCII
    1.84 +	0xa0, 0x180,		// Non-breaking space, Latin-1, Latin Extended-A
    1.85 +	0x192,0x193,		// Latin f with hook
    1.86 +	0x1fa,0x200,		// A-ring, a-ring, AE, ae, O slash, o slash all with acute accent
    1.87 +	0x2c6,0x2c8,		// non-combining circumflex and caron
    1.88 +	0x2c9,0x2ca,		// non-combining macron
    1.89 +	0x2d8,0x2dc,		// non-combining breve, dot above, ring above, ogonek
    1.90 +	0x2dd,0x2de,		// non-combining double acute
    1.91 +	0x300,0x305,		// combining grave, acute, circumflex, tilde, macron
    1.92 +	0x306,0x309,		// combining breve, dot above, double dot above
    1.93 +	0x30a,0x30e,		// combining ring above, double acute, caron, vertical line above
    1.94 +	0x327,0x329,		// combining cedilla, ogonek
    1.95 +	0x384,0x38b,		// Greek
    1.96 +	0x38c,0x38d,		// Greek
    1.97 +	0x38e,0x3a2,		// Greek
    1.98 +	0x3a3,0x3cf,		// Greek
    1.99 +	0x401,0x40d,		// Cyrillic
   1.100 +	0x40e,0x450,		// Cyrillic
   1.101 +	0x451,0x45d,		// Cyrillic
   1.102 +	0x45e,0x460,		// Cyrillic
   1.103 +	0x490,0x492,		// Cyrillic
   1.104 +	0x1e80,0x1e86,		// Both W and w with each of grave, acute and diaeresis
   1.105 +	0x1ef2,0x1ef4,		// Y with grave, y with grave
   1.106 +	0x2000,0x2016,		// various space and horizontal lines
   1.107 +	0x2017,0x201f,		//double vertical line, double low line, various quotation marks
   1.108 +	0x2020,0x2023,		// dagger, double dagger, bullet
   1.109 +	0x2026,0x2027,		//ellipsis
   1.110 +	0x2028,0x202F,		// line & paragraph separators and directional formatting
   1.111 +	0x2030,0x2031,		// per mille
   1.112 +	0x2032,0x2034,		// prime
   1.113 +	0x2039,0x203b,		// single angle quotation marks
   1.114 +	0x203c,0x203d,		// double exclamation mark
   1.115 +	0x203e,0x203f,		// non-combining overscore
   1.116 +	0x2044,0x2045,		// fraction slash
   1.117 +	0x207f,0x2080,		// superscript n
   1.118 +	0x20a3,0x20a5,		// French Franc, Italian/Turkish Lira
   1.119 +	0x20a7,0x20a8,		// Spanish Peseta
   1.120 +	0x20ac,0x20ad,		// Euro symbol
   1.121 +	0x2105,0x2106,		// care of
   1.122 +	0x2113,0x2114,		// script l
   1.123 +	0x2116,0x2117,		// numero
   1.124 +	0x2122,0x2123,		// trade mark
   1.125 +	0x2126,0x2127,		// ohm
   1.126 +	0x212e,0x212f,		// estimated (net weight)
   1.127 +	0x215b,0x215f,		// 1/8, 3/8, 5/8, 7/8
   1.128 +	0x2190,0x2196,		// horizontal and vertical arrows
   1.129 +	0x21a8,0x21a9,		// up down arrow with base
   1.130 +	0x2202,0x2203,		// partial differential
   1.131 +	0x2206,0x2207,		// increment (delta)
   1.132 +	0x220f,0x2210,		// n-ary product (pi)
   1.133 +	0x2211,0x2213,		// n-ary sum (sigma), minus
   1.134 +	0x2215,0x2216,		// division (slash)
   1.135 +	0x2219,0x221b,		// bullet operator, square root
   1.136 +	0x221e,0x2220,		// infinity, right angle
   1.137 +	0x2229,0x222a,		// intersection
   1.138 +	0x222b,0x222c,		// union
   1.139 +	0x2248,0x2249,		// almost equal to
   1.140 +	0x2260,0x2262,		// not equal to, identical to
   1.141 +	0x2264,0x2266,		// less-than-or-equal-to, greater-than-or-equal-to
   1.142 +	0x2302,0x2303,		// house
   1.143 +	0x2310,0x2311,		// rversed not sign
   1.144 +	0x2320,0x2322,		// top and bottom of integral
   1.145 +	0x2500,0x2501,		// box drawing
   1.146 +	0x2502,0x2503,		// box drawing
   1.147 +	0x250c,0x250d,		// box drawing
   1.148 +	0x2510,0x2511,		// box drawing
   1.149 +	0x2514,0x2515,		// box drawing
   1.150 +	0x2518,0x2519,		// box drawing
   1.151 +	0x251c,0x251d,		// box drawing
   1.152 +	0x2524,0x2525,		// box drawing
   1.153 +	0x252c,0x252d,		// box drawing
   1.154 +	0x2534,0x2535,		// box drawing
   1.155 +	0x253c,0x253d,		// box drawing
   1.156 +	0x2550,0x256d,		// box drawing
   1.157 +	0x2580,0x2581,		// block element
   1.158 +	0x2584,0x2585,		// block element
   1.159 +	0x2588,0x2589,		// block element
   1.160 +	0x258c,0x258d,		// block element
   1.161 +	0x2590,0x2594,		// block element
   1.162 +	0x25a0,0x25a2,		// geometric shapes
   1.163 +	0x25aa,0x25ad,		// geometric shapes
   1.164 +	0x25b2,0x25b3,		// geometric shapes
   1.165 +	0x25ba,0x25bb,		// geometric shapes
   1.166 +	0x25bc,0x25bd,		// geometric shapes
   1.167 +	0x25c4,0x25c5,		// geometric shapes
   1.168 +	0x25ca,0x25cc,		// geometric shapes
   1.169 +	0x25cf,0x25d0,		// geometric shapes
   1.170 +	0x25d8,0x25da,		// geometric shapes
   1.171 +	0x25e6,0x25e7,		// geometric shapes
   1.172 +	0x263a,0x263d,		// smilies, sun
   1.173 +	0x2640,0x2641,		// female
   1.174 +	0x2642,0x2643,		// male
   1.175 +	0x2660,0x2661,		// spade
   1.176 +	0x2663,0x2664,		// club
   1.177 +	0x2665,0x2667,		// heart
   1.178 +	0x266a,0x266c,		// quaver, beamed quavers
   1.179 +	0xfb01,0xfb03,		// fi, fl ligatures
   1.180 +	0xfeff,0xff00,		// zero-width non-breaking space
   1.181 +	0xfffc, 0xfffe		// object replacement character and replacement character
   1.182 +	};
   1.183 +const int Wgl4Ranges = sizeof(Wgl4Range) / sizeof(Wgl4Range[0]) / 2;
   1.184 +
   1.185 +int CompareWgl4Ranges(const void* aRange1,const void* aRange2)
   1.186 +	{
   1.187 +	unsigned int* p = (unsigned int*)aRange1;
   1.188 +	unsigned int* q = (unsigned int*)aRange2;
   1.189 +	if (q[0] == q[1])
   1.190 +		{
   1.191 +		unsigned int* temp = p;
   1.192 +		p = q;
   1.193 +		q = temp;
   1.194 +		}
   1.195 +	if (*p < *q)
   1.196 +		return -1;
   1.197 +	else if (*p >= q[1])
   1.198 +		return 1;
   1.199 +	else
   1.200 +		return 0;
   1.201 +	}
   1.202 +
   1.203 +// Determine if a character is in the WGL4 character repertoire.
   1.204 +static bool InWgl4(unsigned int aChar)
   1.205 +	{
   1.206 +	unsigned int key[2];
   1.207 +	key[0] = key[1] = aChar;
   1.208 +	return bsearch(key,Wgl4Range,Wgl4Ranges,sizeof(Wgl4Range[0]) * 2,CompareWgl4Ranges) != NULL;
   1.209 +	}
   1.210 +
   1.211 +// A collation key.
   1.212 +class CollationKey
   1.213 +	{
   1.214 +public:
   1.215 +	bool operator==(const CollationKey& k) const
   1.216 +		{ return iLevel[0] == k.iLevel[0] && iLevel[1] == k.iLevel[1] && iLevel[2] == k.iLevel[2] &&
   1.217 +		  iIgnorable == k.iIgnorable && iStop == k.iStop; }
   1.218 +
   1.219 +	enum
   1.220 +		{
   1.221 +		ELevels = 3
   1.222 +		};
   1.223 +	int iLevel[ELevels];// the keys at the various levels
   1.224 +	bool iIgnorable;	// TRUE if this key can normally be ignored
   1.225 +	bool iStop;			// TRUE if this is the last key in a string of keys
   1.226 +	};
   1.227 +
   1.228 +// The collation index for a single Unicode value.
   1.229 +class CollationIndex
   1.230 +	{
   1.231 +public:
   1.232 +	static int Compare(const void* aIndex1,const void* aIndex2);
   1.233 +
   1.234 +	int iCode;			// Unicode value
   1.235 +	int iIndex;			// index into the key table
   1.236 +	};
   1.237 +
   1.238 +class Reader
   1.239 +	{
   1.240 +public:
   1.241 +	Reader(bool aWgl4,bool aStandard,const char* aLocaleName, const char* aUidString);
   1.242 +	~Reader();
   1.243 +	void ReadBaseKeys(const char* aFileName);
   1.244 +	void ReadCompKeys(const char* aFileName);
   1.245 +	void ReadStrings(const char* aFileName);
   1.246 +	void ReadAllKeys(const char* aFileName);
   1.247 +	void WriteOutput(const char* aFileName, bool aCopyrightMessage);
   1.248 +	int CompareStringIndices(int aIndex1,int aIndex2) const;
   1.249 +
   1.250 +private:
   1.251 +	Reader(const Reader&);
   1.252 +	int Hex(const char *aString, int &aCharConsumed, bool aTolerate = false);
   1.253 +	void GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey=NULL);
   1.254 +	void GetMultipleCollationKeys(const char* aString);
   1.255 +	unsigned int PackKey(const CollationKey& aValue);
   1.256 +	int PackIndex(const CollationIndex& aValue, unsigned int result[2]);
   1.257 +	bool ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount);
   1.258 +	void AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart);
   1.259 +	void AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart);
   1.260 +	void AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart);
   1.261 +
   1.262 +	enum
   1.263 +		{
   1.264 +		EMaxCollationKeys = 0x110000 * 2, /*more elements considering composite keys */
   1.265 +		EMaxCollationIndices = 0x110000,
   1.266 +		EMaxStringElements = 65536,
   1.267 +		EMaxStringIndices = 65536
   1.268 +		};
   1.269 +	CollationKey iCollationKey[EMaxCollationKeys];
   1.270 +	int iKeys;
   1.271 +	CollationIndex iCollationIndex[EMaxCollationIndices];
   1.272 +	int iIndices;
   1.273 +	int iStringElement[EMaxStringElements];
   1.274 +	int iStringElements;
   1.275 +	unsigned int iStringIndex[EMaxStringIndices];
   1.276 +	int iStringIndices;
   1.277 +	const char* iInputFileName;
   1.278 +	int iLineNumber;
   1.279 +	bool iSuppressCanonseqWarning;		// have we issued the canonseq warning yet?
   1.280 +	bool iWgl4;				// true if writing keys for wgl4 characters only
   1.281 +	bool iStandard;			// true if reading standard files, not tailoring files
   1.282 +	const char* iLocaleName;
   1.283 +	const char* iUidString;
   1.284 +	char* iCPlusPlusIdentifier;		// iLocaleName in title case with difficult characters removed
   1.285 +	};
   1.286 +
   1.287 +bool isValidHexDigit(char c)
   1.288 +	{
   1.289 +	if ('0' <= c && c <= '9')
   1.290 +		return true;
   1.291 +	if ('a' <= c && c <= 'f')
   1.292 +		return true;
   1.293 +	if ('A' <= c && c <= 'F')
   1.294 +		return true;
   1.295 +	return false;
   1.296 +	}
   1.297 +
   1.298 +void PrintUsage()
   1.299 +	{
   1.300 +	cout << "Usage: coltab [/u<uid>] [/c] [/a] [/h<topic>] <locale>\n";
   1.301 +	cout << "By Default (without /a option), for the locales 'standard' and 'wgl4' coltab reads basekeys.txt & compkeys.txt\n";
   1.302 +	cout << "For any other locale name <name> coltab reads <name>_basekeys.txt,\n";
   1.303 +	cout << "<name>_compkeys.txt and <name>_strings.txt.\n";
   1.304 +	cout << "Use the /a option, for the locales 'standard' and 'wgl4' coltab reads allkeys.txt\n";
   1.305 +	cout << "For any other locale name <name> coltab reads <name>_allkeys.txt.\n"; 
   1.306 +	cout << "The output file is always ls_<name>.cpp.\n";
   1.307 +	cout << "Use the /u option to specify the UID that the collation table should have.\n";
   1.308 +	cout << "A hex number must follow /u immediately, for example /u800ACBDE\n";
   1.309 +	cout << "this hex number must not exceed eight digits. If this is not specified,\n";
   1.310 +	cout << "the output file will have to be edited to make it compilable.\n";
   1.311 +	cout << "Specify /c to prefix the output with a Nokia copyright message.\n";
   1.312 +	cout << "Specify /h for in-depth help.";
   1.313 +	}
   1.314 +
   1.315 +void UsageError()
   1.316 +	{
   1.317 +	PrintUsage();
   1.318 +	exit(1);
   1.319 +	}
   1.320 +
   1.321 +void PrintHelp(char* aTopic)
   1.322 +	{
   1.323 +	int topic = 0;
   1.324 +	while ('0' <= *aTopic && *aTopic <= '9')
   1.325 +		{
   1.326 +		topic = topic * 10 + (*aTopic - '0');
   1.327 +		++aTopic;
   1.328 +		}
   1.329 +	switch(topic)
   1.330 +		{
   1.331 +	case 1:
   1.332 +		cout << "How Coltab interprets CANONSEQ:\n\n"\
   1.333 +			"If the CANONSEQ specifier is used in a line, Coltab will ignore the mapping.\n"\
   1.334 +			"This because, on the Symbian platform, any canonically composed character is\n"\
   1.335 +			"decomposed before the key mapping is applied, so characters with canonical\n"\
   1.336 +			"decompositions do not need keys. In files supplied by the Unicode Consortium,\n"\
   1.337 +			"all mappings for composed characters are flagged by CANONSEQ, so it is useful\n"\
   1.338 +			"if Coltab can just ignore these so that Unicode Consortium files can be used\n"\
   1.339 +			"unedited.\n\n"\
   1.340 +			"This can cause problems if a localizer copies a line from a Unicode file into,\n"\
   1.341 +			"say, the <lang>_strings.txt file, in order to give a mapping for an accented\n"\
   1.342 +			"character. The localizer replaces the composed character code with the\n"\
   1.343 +			"decomposition and changes the keys but forgets to remove the CANONSEQ\n"\
   1.344 +			"specifier. In this case the key would be ignored. Coltab provides a warning so\n"\
   1.345 +			"that this can be put right.\n\n"\
   1.346 +			"Coltab will only warn about the first CANONSEQ in each file, and does not warn\n"\
   1.347 +			"if the 'standard' or 'wgl4' options are used.";
   1.348 +		exit(1);
   1.349 +		break;
   1.350 +	case 2:
   1.351 +		cout << "How to ensure coltab's output files are compilable.\n\n"\
   1.352 +			"By default, Coltab's files for locales need to be edited before they are\n"\
   1.353 +			"compilable. The UID for the collation method needs to be filled in. This UID\n"\
   1.354 +			"is added so that the collation table can be searched for later. At present,\n"\
   1.355 +			"this UID is not necessary for the correct functioning of the Symbian platform\n"\
   1.356 +			"and so a value of 0 can be safely used.\n\n"\
   1.357 +			"To insert this value into the file directly, use the /u option, for example\n"\
   1.358 +			"coltab /u0 french\n"\
   1.359 +			"If the /u option is used, the file should be compilable as is. If it is not,\n"\
   1.360 +			"please raise it as a defect with Symbian's internationalization team,\n"\
   1.361 +			"supplying the files that caused the problem if this is possible.\n"\
   1.362 +			"If the 'standard' or 'wgl4' options are used, no UID is output, so the /u\n"\
   1.363 +			"option is not required.";
   1.364 +		exit(1);
   1.365 +		break;
   1.366 +	case 3:
   1.367 +		cout << "How to ensure collation key values are inside the supported range. \n\n"\
   1.368 +			"According to Unicode Standard, the range suppored by tool COLTAB:\n"\
   1.369 +			" Level 0 (primary):   0000 - FFFF, \n"\
   1.370 +			" Level 1 (Secondary): 0020 - 011E, \n"\
   1.371 +			" Level 2 (Tertiary):  0001 - 003F. \n"\
   1.372 +			"Please edit your collation files and make sure key values are inside the above range";
   1.373 +		exit(1);
   1.374 +		break;
   1.375 +	default:
   1.376 +		PrintUsage();
   1.377 +		cout << "\n\nSpecify /h1 for help on the use of CANONSEQ\n";
   1.378 +		cout << "Specify /h2 for help on making compilable files that do not need editing\n";
   1.379 +		exit(1);
   1.380 +		break;
   1.381 +		}
   1.382 +	}
   1.383 +
   1.384 +short HighSurrogate(int aCode)
   1.385 +	{
   1.386 +	return static_cast<short>(0xD7C0 + (aCode >> 10));
   1.387 +	}
   1.388 +	
   1.389 +short LowSurrogate(int aCode)
   1.390 +	{
   1.391 +	return static_cast<short>(0xDC00 | (aCode & 0x3FF));
   1.392 +	}
   1.393 +
   1.394 +int main(int argc,char** argv)
   1.395 +	{
   1.396 +	bool copyright = false;
   1.397 +	bool wgl4 = false;
   1.398 +	bool allKeys = false;
   1.399 +	const char* prefix = "";
   1.400 +	const char* infix = "";
   1.401 +	const char* locale = "";
   1.402 +	char* localeArg = 0;
   1.403 +	char* uidArg = 0;
   1.404 +	for (int i = 1; i < argc; ++i)
   1.405 +		{
   1.406 +		if (argv[i][0] == '/' || argv[i][0] == '-')
   1.407 +			{
   1.408 +			switch (argv[i][1])
   1.409 +				{
   1.410 +			case 'u':
   1.411 +			case 'U':
   1.412 +				{
   1.413 +				uidArg = argv[i] + 2;
   1.414 +				const char* uidCheck = uidArg;
   1.415 +				while (*uidCheck)
   1.416 +					{
   1.417 +					if (!isValidHexDigit(*uidCheck))
   1.418 +						UsageError();
   1.419 +					++uidCheck;
   1.420 +					}
   1.421 +				if (uidCheck == uidArg || 8 < uidCheck - uidArg)
   1.422 +					UsageError();
   1.423 +				break;
   1.424 +				}
   1.425 +			case 'c':
   1.426 +			case 'C':
   1.427 +				copyright = true;
   1.428 +				break;
   1.429 +			case 'a':
   1.430 +				allKeys = true;
   1.431 +				break;
   1.432 +			case 'h':
   1.433 +			case 'H':
   1.434 +				PrintHelp(argv[i] + 2);
   1.435 +				break;
   1.436 +			default:
   1.437 +				UsageError();
   1.438 +				break;
   1.439 +				}
   1.440 +			}
   1.441 +		else if (!localeArg)
   1.442 +			localeArg = argv[i];
   1.443 +		else
   1.444 +			UsageError();
   1.445 +		}
   1.446 +	if (!localeArg)
   1.447 +		UsageError();
   1.448 +	bool standard = false;
   1.449 +	if (!_stricmp(localeArg, "standard"))
   1.450 +		{
   1.451 +		locale = "Standard";
   1.452 +		standard = true;
   1.453 +		}
   1.454 +	else if (!_stricmp(localeArg, "wgl4"))
   1.455 +		{
   1.456 +		locale = "Wgl4";
   1.457 +		wgl4 = true;
   1.458 +		standard = true;
   1.459 +		}
   1.460 +	else
   1.461 +		{
   1.462 +		locale = prefix = localeArg;
   1.463 +		infix = "_";
   1.464 +		}
   1.465 +
   1.466 +	Reader* reader = new Reader(wgl4, standard, locale, uidArg);
   1.467 +	if (!reader)
   1.468 +		{
   1.469 +		cout << "out of memory\n";
   1.470 +		exit(1);
   1.471 +		}
   1.472 +	char* filename = new char[strlen(prefix) + strlen(infix) + 64];
   1.473 +	if (allKeys == false)
   1.474 +		{
   1.475 +		sprintf(filename,"%s%scompkeys.txt",prefix,infix);
   1.476 +		reader->ReadCompKeys(filename);
   1.477 +		if (!standard)
   1.478 +			{
   1.479 +			sprintf(filename,"%s%sstrings.txt",prefix,infix);
   1.480 +			reader->ReadStrings(filename);
   1.481 +			}
   1.482 +		sprintf(filename,"%s%sbasekeys.txt",prefix,infix);
   1.483 +		reader->ReadBaseKeys(filename);
   1.484 +		}
   1.485 +	else
   1.486 +		{
   1.487 +		sprintf(filename,"%s%sAllKeys.txt",prefix,infix);
   1.488 +		reader->ReadAllKeys(filename);
   1.489 +		}
   1.490 +	sprintf(filename,"ls_%s.cpp", localeArg);
   1.491 +	reader->WriteOutput(filename, copyright);
   1.492 +
   1.493 +	delete reader;
   1.494 +	delete [] filename;
   1.495 +	return 0;
   1.496 +	}
   1.497 +
   1.498 +Reader::Reader(bool aWgl4, bool aStandard,
   1.499 +	const char* aLocaleName, const char* aUidString):
   1.500 +	iKeys(0),
   1.501 +	iIndices(0),
   1.502 +	iStringElements(0),
   1.503 +	iStringIndices(0),
   1.504 +	iInputFileName(NULL),
   1.505 +	iLineNumber(0),
   1.506 +	iSuppressCanonseqWarning(false),
   1.507 +	iWgl4(aWgl4),
   1.508 +	iStandard(aStandard),
   1.509 +	iLocaleName(aLocaleName),
   1.510 +	iUidString(aUidString)
   1.511 +	{
   1.512 +	if (iStandard)
   1.513 +		{
   1.514 +		iCPlusPlusIdentifier = new char[9];
   1.515 +		strcpy(iCPlusPlusIdentifier, "Standard");
   1.516 +		return;
   1.517 +		}
   1.518 +	char* p = iCPlusPlusIdentifier = new char[strlen(aLocaleName) + 2];
   1.519 +	int current = toupper(aLocaleName[0]);
   1.520 +	if (current < 'A' || 'Z' < current)
   1.521 +		*p++ = 'C';
   1.522 +	else
   1.523 +		{
   1.524 +		*p++ = static_cast<char>(current);
   1.525 +		++aLocaleName;
   1.526 +		}
   1.527 +	bool inUnderScore = false;
   1.528 +	while (*aLocaleName)
   1.529 +		{
   1.530 +		current = tolower(*aLocaleName++);
   1.531 +		if (current < 'a' || 'z' < current)
   1.532 +			{
   1.533 +			if (!inUnderScore)
   1.534 +				{
   1.535 +				inUnderScore = true;
   1.536 +				*p++ = '_';
   1.537 +				}
   1.538 +			}
   1.539 +		else
   1.540 +			{
   1.541 +			inUnderScore = false;
   1.542 +			*p++ = static_cast<char>(current);
   1.543 +			}
   1.544 +		}
   1.545 +	*p = 0;
   1.546 +	}
   1.547 +
   1.548 +Reader::~Reader()
   1.549 +	{
   1.550 +	delete [] iCPlusPlusIdentifier;
   1.551 +	}
   1.552 +
   1.553 +// Get a hex number of exactly four digits from aString. Return -1 if none is found and aTolerate is true.
   1.554 +int Reader::Hex(const char *aString, int &aCharConsumed, bool aTolerate)
   1.555 +	{
   1.556 +	char *end;
   1.557 +	unsigned long x = strtoul(aString,&end,16);
   1.558 +	aCharConsumed = end - aString;
   1.559 +	if ((aCharConsumed != 4) && (aCharConsumed != 5) && (aCharConsumed != 6))
   1.560 +		{
   1.561 +		if (!aTolerate)
   1.562 +			{
   1.563 +			cout << "bad hex number on line " << iLineNumber << " of file " << iInputFileName << '\n';
   1.564 +			exit(1);
   1.565 +			}
   1.566 +		return -1;
   1.567 +		}
   1.568 +	return x;
   1.569 +	}
   1.570 +
   1.571 +// Get a collation value from a string of the form [.xxxx.xxxx.xxxx.xxxx]
   1.572 +void Reader::GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey)
   1.573 +	{
   1.574 +	aCharConsumed = 0;
   1.575 +	const char *end = strchr(aString, ']');
   1.576 +	if (end != NULL){
   1.577 +		aCharConsumed = end - aString;
   1.578 +	}
   1.579 +	
   1.580 +	if (aString[0] != '[' || (aCharConsumed != 21 && aCharConsumed != 22 && aCharConsumed != 23))
   1.581 +		{
   1.582 +		cout << "syntax error on line " << iLineNumber << " of file " << iInputFileName << '\n';
   1.583 +		exit(1);
   1.584 +		}
   1.585 +	if (aKey == NULL)
   1.586 +		{
   1.587 +		if (iKeys >= EMaxCollationKeys)
   1.588 +			{
   1.589 +			cout << "too many keys";
   1.590 +			exit(1);
   1.591 +			}
   1.592 +		aKey = &iCollationKey[iKeys++];
   1.593 +		}
   1.594 +	aKey->iIgnorable = aString[1] == '*'; // asterisk means that this character is normally ignored
   1.595 +	int charConsumed = 0;
   1.596 +	for (int i = 0; i < CollationKey::ELevels; i++)
   1.597 +		aKey->iLevel[i] = Hex(aString + 2 + i * 5, charConsumed);
   1.598 +
   1.599 +	if (aKey->iLevel[1] > 0 && (aKey->iLevel[1] < KLevel1Min || aKey->iLevel[1] > KLevel1Max))
   1.600 +		{
   1.601 +		aKey->iLevel[1] = KLevel1Max;
   1.602 +		cout << "illegal level-1 key value on line " << iLineNumber << "; outside the range " << KLevel1Min << ".." << KLevel1Max << "\n";
   1.603 +		cout << "Error: illegal key value in file, please see coltab /h3 for details.\n";
   1.604 +		exit(1);
   1.605 +		}
   1.606 +	
   1.607 +	if (aKey->iLevel[2] > 0 && (aKey->iLevel[2] < KLevel2Min || aKey->iLevel[2] > KLevel2Max))
   1.608 +		{
   1.609 +		cout << "illegal level-2 key value on line " << iLineNumber << "; outside the range " << KLevel2Min << ".." << KLevel2Max << "\n";
   1.610 +		cout << "Error: illegal key value in file, please see coltab /h3 for details.\n";
   1.611 +		exit(1);
   1.612 +		}
   1.613 +
   1.614 +	aKey->iStop = true;
   1.615 +	}
   1.616 +
   1.617 +void Reader::GetMultipleCollationKeys(const char* aString)
   1.618 +	{
   1.619 +	int keyCount = 0;
   1.620 +	int charConsumed =0;
   1.621 +	while (aString[0] == '[')
   1.622 +		{
   1.623 +		GetCollationKey(aString, charConsumed);
   1.624 +
   1.625 +		keyCount++;
   1.626 +		iCollationKey[iKeys - 1].iStop = false;
   1.627 +		int length = strlen(aString);
   1.628 +		if (length <= charConsumed + 1)
   1.629 +			break;
   1.630 +		aString += charConsumed + 1;
   1.631 +		
   1.632 +		if (aString[0] == ' ') //a space is put between collation keys in keys files provided by previous Unicode Standard (i.e 3.1)
   1.633 +			aString++;
   1.634 +		
   1.635 +		}
   1.636 +	iCollationKey[iKeys - 1].iStop = true;
   1.637 +	}
   1.638 +
   1.639 +/*
   1.640 +Partially parse a line, returning its key code and the start of its first block of key data.
   1.641 +Return false if it is not a data line, or not relevant.
   1.642 +*/
   1.643 +bool Reader::ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount)
   1.644 +	{
   1.645 +	int lineLength = strlen(aLine);
   1.646 +	int charConsumed = 0;
   1.647 +	aCodeCount = 0;
   1.648 +	aCode[0] = Hex(aLine,charConsumed,true);
   1.649 +
   1.650 +	/*
   1.651 +	A data line must start with a hex number and be at least 27 characters long.
   1.652 +	Canonically decomposable Unicode characters are skipped.
   1.653 +	Skip non-WGL4 characters if doing WGL4 only.
   1.654 +	*/
   1.655 +	if (aCode[0] != -1)
   1.656 +		{
   1.657 +		aCodeCount = 1;
   1.658 +		if (!strcmp(aLine + lineLength - 8,"CANONSEQ"))
   1.659 +			{
   1.660 +			if (!iSuppressCanonseqWarning)
   1.661 +				{
   1.662 +				cout << "Warning: CANONSEQ used in file " << iInputFileName
   1.663 +					<< " on line " << iLineNumber << ".\nWarning: All mappings specifying CANONSEQ are ignored.\n"
   1.664 +					<< "Warning: Use coltab /h1 for more details.";
   1.665 +				iSuppressCanonseqWarning = true;
   1.666 +				}
   1.667 +			aCodeCount = 0;
   1.668 +			}
   1.669 +		else if (lineLength < 27 ||
   1.670 +			(iWgl4 && !InWgl4((unsigned int)aCode))) 
   1.671 +			aCodeCount = 0;
   1.672 +		}
   1.673 +
   1.674 +	if (aCode[0] != -1)
   1.675 +		{
   1.676 +		// find '['
   1.677 +		aKeyStart = charConsumed;
   1.678 +		while (aKeyStart < lineLength && aLine[aKeyStart] != '[')
   1.679 +			aKeyStart++;
   1.680 +
   1.681 +		// read all hex before '['
   1.682 +		int index = charConsumed + 1;
   1.683 +		while (index < aKeyStart)
   1.684 +			{
   1.685 +			aCode[aCodeCount] = Hex(aLine+index, charConsumed, true);
   1.686 +			if (aCode[aCodeCount] == -1)
   1.687 +				break;
   1.688 +
   1.689 +			index += charConsumed + 1;
   1.690 +			aCodeCount++;
   1.691 +			}
   1.692 +
   1.693 +		// find number of collation keys
   1.694 +		aKeyCount = 0;
   1.695 +		index = aKeyStart;
   1.696 +		while (index < lineLength && aLine[index] != '%' && aLine[index] != '#')
   1.697 +			{
   1.698 +			if (aLine[index] == '[')
   1.699 +				aKeyCount++;
   1.700 +			index++;
   1.701 +			}
   1.702 +		}
   1.703 +
   1.704 +	return aCodeCount > 0;
   1.705 +	}
   1.706 +
   1.707 +void Reader::AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart)
   1.708 +	{
   1.709 +	if (iIndices >= EMaxCollationIndices)
   1.710 +		{
   1.711 +		cout << "too many Unicode values";
   1.712 +		exit(1);
   1.713 +		}
   1.714 +	CollationIndex& index = iCollationIndex[iIndices++];
   1.715 +	index.iCode = aCode;
   1.716 +	index.iIndex = -1;
   1.717 +
   1.718 +	/*
   1.719 +	First try to find the key in the array of keys found so far.
   1.720 +	Search backwards to use the fact that runs of the same key occur together.
   1.721 +	*/
   1.722 +	CollationKey key;
   1.723 +	int charConsumed = 0;
   1.724 +	GetCollationKey(aLine + aKeyStart, charConsumed, &key);
   1.725 +	for (int i = iKeys - 1; i >= 0 && index.iIndex == -1; i--)
   1.726 +		if (iCollationKey[i] == key)
   1.727 +			index.iIndex = i;
   1.728 +
   1.729 +	// If that fails, add a new key.
   1.730 +	if (index.iIndex == -1)
   1.731 +		{
   1.732 +		index.iIndex = iKeys++;
   1.733 +		if (iKeys > EMaxCollationKeys)
   1.734 +			{
   1.735 +			cout << "too many keys";
   1.736 +			exit(1);
   1.737 +			} 
   1.738 +		iCollationKey[index.iIndex] = key;
   1.739 +		}
   1.740 +	}
   1.741 +/*
   1.742 +Read the 1-to-1 mappings (one Unicode value -> one collation key) from aFileName. Sample:
   1.743 +02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME
   1.744 +Lines for which ParseLine returns false, or that do not hold exactly one code and one key, are skipped.
   1.745 +Exits if the file cannot be opened, or if reading stops before the end of the file is reached.
   1.746 +*/
   1.747 +void Reader::ReadBaseKeys(const char* aFileName)
   1.748 +	{
   1.749 +	iSuppressCanonseqWarning = iStandard || iWgl4;
   1.750 +	iLineNumber = 0;
   1.751 +	iInputFileName = aFileName;
   1.752 +	ifstream input_file;
   1.753 +
   1.754 +#ifdef __MSVCDOTNET__
   1.755 +	input_file.open(iInputFileName, ios::in);
   1.756 +#else //!__MSVCDOTNET__
   1.757 +	input_file.open(iInputFileName, ios::in | ios::nocreate);
   1.758 +#endif //__MSVCDOTNET__
   1.759 +
   1.760 +	if (input_file.fail())
   1.761 +		{
   1.762 +		cout << "cannot open input file '" << iInputFileName << "'\n";
   1.763 +		exit(1);
   1.764 +		}
   1.765 +	cout << "reading base keys from '" << iInputFileName << "'\n";
   1.766 +	// Stop on fail(), not eof(): eof() alone drops an unterminated last line and is never set after an over-long line.
   1.767 +	char line[1024];
   1.768 +	for (;;)
   1.769 +		{
   1.770 +		input_file.getline(line,sizeof(line));
   1.771 +		if (input_file.fail())
   1.772 +			break;
   1.773 +		iLineNumber++;
   1.774 +		// Report progress every 100 lines.
   1.775 +		if (iLineNumber % 100 == 0)
   1.776 +			{
   1.777 +			cout << "line " << iLineNumber << '\n';
   1.778 +			cout.flush();
   1.779 +			}
   1.780 +		int code[16];
   1.781 +		int codeCount = 0;
   1.782 +		int key_start = 0;
   1.783 +		int keyCount = 0;
   1.784 +		if (ParseLine(line, code, codeCount, key_start, keyCount) && codeCount == 1 && keyCount == 1)
   1.785 +			AddKeyOneToOne(line, code[0], key_start);
   1.786 +		}
   1.787 +	if (!input_file.eof())	// the loop was ended by a read failure, not by the end of the file
   1.788 +		{
   1.789 +		cout << "cannot read '" << iInputFileName << "' beyond line " << iLineNumber << " (line too long?)\n";
   1.790 +		exit(1);
   1.791 +		}
   1.792 +	input_file.close();
   1.793 +	}
   1.794 +
   1.795 +void Reader::AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart)	// records one code that maps to a run of keys
   1.796 +	{
   1.797 +	if (iIndices >= EMaxCollationIndices)	// refuse to take another index slot once the limit is reached
   1.798 +		{
   1.799 +		cout << "too many Unicode values";
   1.800 +		exit(1);
   1.801 +		}
   1.802 +	CollationIndex& index = iCollationIndex[iIndices++];	// claim the next index entry
   1.803 +	index.iCode = aCode;
   1.804 +	index.iIndex = iKeys;	// the key count as it stands before the call below
   1.805 +	GetMultipleCollationKeys(aLine + aKeyStart);	// presumably appends the keys found at aKeyStart to the key table - TODO confirm
   1.806 +	}
   1.807 +/*
   1.808 +Read the 1-to-many mappings (one Unicode value -> two or more collation keys) from aFileName; other lines are skipped. Sample:
   1.809 +3303  ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN
   1.810 +*/
   1.811 +void Reader::ReadCompKeys(const char* aFileName)
   1.812 +	{
   1.813 +	iSuppressCanonseqWarning = iStandard || iWgl4;
   1.814 +	iLineNumber = 0;
   1.815 +	iInputFileName = aFileName;
   1.816 +	ifstream input_file;
   1.817 +
   1.818 +#ifdef __MSVCDOTNET__
   1.819 +	input_file.open(iInputFileName, ios::in);
   1.820 +#else //!__MSVCDOTNET__
   1.821 +	input_file.open(iInputFileName, ios::in | ios::nocreate);
   1.822 +#endif //__MSVCDOTNET__
   1.823 +	// A file that cannot be opened is not an error here: it just means there are no composite keys.
   1.824 +	if (input_file.fail())
   1.825 +		{
   1.826 +		cout << "there are no composite keys; '" << iInputFileName << "' not found\n";
   1.827 +		return;
   1.828 +		}
   1.829 +	cout << "reading composite keys from '" << iInputFileName << "'\n";
   1.830 +	// Stop on fail(), not eof(): eof() alone drops an unterminated last line and is never set after an over-long line.
   1.831 +	char line[1024];
   1.832 +	for (;;)
   1.833 +		{
   1.834 +		input_file.getline(line,sizeof(line));
   1.835 +		if (input_file.fail())
   1.836 +			break;
   1.837 +		iLineNumber++;
   1.838 +		// Report progress every 100 lines.
   1.839 +		if (iLineNumber % 100 == 0)
   1.840 +			{
   1.841 +			cout << "line " << iLineNumber << '\n';
   1.842 +			cout.flush();
   1.843 +			}
   1.844 +		int code[16];
   1.845 +		int codeCount = 0;
   1.846 +		int key_start = 0;
   1.847 +		int keyCount = 0;
   1.848 +		if (ParseLine(line, code, codeCount, key_start, keyCount) && codeCount == 1 && keyCount >= 2)
   1.849 +			AddKeyOneToMuch(line, code[0], key_start);
   1.850 +		}
   1.851 +	if (!input_file.eof())	// the loop was ended by a read failure, not by the end of the file
   1.852 +		{
   1.853 +		cout << "cannot read '" << iInputFileName << "' beyond line " << iLineNumber << " (line too long?)\n";
   1.854 +		exit(1);
   1.855 +		}
   1.856 +	input_file.close();
   1.857 +	}
   1.858 +
   1.859 +
   1.860 +void Reader::AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart)
   1.861 +	{
   1.862 +	// Records a mapping from the aCodeCount Unicode values in aCode to the key sequence found at aLine + aKeyStart.
   1.863 +	// Store the index to the Unicode string and the key sequence.
   1.864 +	if (iStringIndices >= EMaxStringIndices)	// '>=' like the other capacity checks: this slot is written on the next line
   1.865 +		{
   1.866 +		cout << "too many string indices";
   1.867 +		exit(1);
   1.868 +		}
   1.869 +	iStringIndex[iStringIndices++] = (iStringElements << 16) | iKeys;	// string offset in the upper word, key index in the lower word
   1.870 +
   1.871 +	// Reserve space for the length.
   1.872 +	if (iStringElements >= EMaxStringElements)
   1.873 +		{
   1.874 +		cout << "too many string elements";
   1.875 +		exit(1);
   1.876 +		}
   1.877 +	iStringElements++;
   1.878 +
   1.879 +	// Read the Unicode string.
   1.880 +	int length = 0;		// in unit of int16
   1.881 +	for (int i=0; i<aCodeCount; i++)
   1.882 +		{
   1.883 +		// A value above 0xFFFF is stored as a surrogate pair, so it needs two free elements rather than one.
   1.884 +		const int needed = (aCode[i] > 0xFFFF) ? 2 : 1;
   1.885 +		if (iStringElements + needed > EMaxStringElements)
   1.886 +			{
   1.887 +			cout << "too many string elements";
   1.888 +			exit(1);
   1.889 +			}
   1.890 +	
   1.891 +		if (aCode[i] > 0xFFFF)
   1.892 +			{
   1.893 +			// UCS4 --> UTF-16 (0xD7C0 is 0xD800 - (0x10000 >> 10))
   1.894 +			iStringElement[iStringElements++] = 0xD7C0 + (aCode[i] >> 10);
   1.895 +			iStringElement[iStringElements++] = 0xDC00 | (aCode[i] & 0x3FF);
   1.896 +			length += 2;
   1.897 +			}
   1.898 +		else
   1.899 +			{
   1.900 +			iStringElement[iStringElements++] = aCode[i];
   1.901 +			length++;
   1.902 +			}
   1.903 +		}
   1.904 +
   1.905 +	// Fill in the length element reserved above; it sits immediately before the string just written.
   1.906 +	iStringElement[iStringElements - length - 1] = (unsigned int)length;
   1.907 +
   1.908 +	// Read the key sequence.
   1.909 +	GetMultipleCollationKeys(aLine + aKeyStart);
   1.910 +	}
   1.911 +/*
   1.912 +Read the many-to-many mappings (two or more Unicode values -> one or more collation keys) from aFileName; other lines are skipped. Sample:
   1.913 +004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke
   1.914 +0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # <THAI CHARACTER SARA E, THAI CHARACTER CHO CHAN>
   1.915 +*/
   1.916 +void Reader::ReadStrings(const char* aFileName)
   1.917 +	{
   1.918 +	iSuppressCanonseqWarning = iStandard || iWgl4;
   1.919 +	iLineNumber = 0;
   1.920 +	iInputFileName = aFileName;
   1.921 +	ifstream input_file;
   1.922 +
   1.923 +#ifdef __MSVCDOTNET__
   1.924 +	input_file.open(iInputFileName, ios::in);
   1.925 +#else //!__MSVCDOTNET__
   1.926 +	input_file.open(iInputFileName, ios::in | ios::nocreate);
   1.927 +#endif //__MSVCDOTNET__
   1.928 +	// A file that cannot be opened is not an error here: it just means there are no strings.
   1.929 +	if (input_file.fail())
   1.930 +		{
   1.931 +		cout << "there are no strings; '" << iInputFileName << "' not found\n";
   1.932 +		return;
   1.933 +		}
   1.934 +	cout << "reading strings from '" << iInputFileName << "'\n";
   1.935 +	// Stop on fail(), not eof(): eof() alone drops an unterminated last line and is never set after an over-long line.
   1.936 +	char line[1024];
   1.937 +	for (;;)
   1.938 +		{
   1.939 +		input_file.getline(line,sizeof(line));
   1.940 +		if (input_file.fail())
   1.941 +			break;
   1.942 +		iLineNumber++;
   1.943 +		// Report progress every 100 lines.
   1.944 +		if (iLineNumber % 100 == 0)
   1.945 +			{
   1.946 +			cout << "line " << iLineNumber << '\n';
   1.947 +			cout.flush();
   1.948 +			}
   1.949 +		int code[16];
   1.950 +		int codeCount = 0;
   1.951 +		int key_start = 0;
   1.952 +		int keyCount = 0;
   1.953 +		if (ParseLine(line, code, codeCount, key_start, keyCount) && codeCount >= 2 && keyCount >= 1)
   1.954 +			AddKeyMuchToMuch(line, code, codeCount, key_start);
   1.955 +		}
   1.956 +	if (!input_file.eof())	// the loop was ended by a read failure, not by the end of the file
   1.957 +		{
   1.958 +		cout << "cannot read '" << iInputFileName << "' beyond line " << iLineNumber << " (line too long?)\n";
   1.959 +		exit(1);
   1.960 +		}
   1.961 +	input_file.close();
   1.962 +	}
   1.963 +
   1.964 +/*
   1.965 +Read a combined key table from aFileName and add every usable line as a 1-to-1, 1-to-many or many-to-many mapping.
   1.966 +Returns without reading anything if the file cannot be opened; exits if reading stops before the end of the file. Samples:
   1.967 +1-to-1 mapping:
   1.968 +02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME
   1.969 +1-to-much mapping:
   1.970 +3303  ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN
   1.971 +much-to-much mapping:
   1.972 +004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke
   1.973 +0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # <THAI CHARACTER SARA E, THAI CHARACTER CHO CHAN>
   1.974 +A line that ParseLine accepts but that fits none of these three shapes is reported with "ignore line".
   1.975 +*/
   1.976 +void Reader::ReadAllKeys(const char* aFileName)
   1.977 +	{
   1.978 +	iSuppressCanonseqWarning = iStandard || iWgl4;
   1.979 +	iLineNumber = 0;
   1.980 +	iInputFileName = aFileName;
   1.981 +	ifstream input_file;
   1.982 +#ifdef __MSVCDOTNET__
   1.983 +	input_file.open(iInputFileName, ios::in);
   1.984 +#else //!__MSVCDOTNET__
   1.985 +	input_file.open(iInputFileName, ios::in | ios::nocreate);
   1.986 +#endif //__MSVCDOTNET__
   1.987 +	if (input_file.fail())
   1.988 +		{
   1.989 +		cout << "there are no keys; '" << iInputFileName << "' not found\n";
   1.990 +		return;
   1.991 +		}
   1.992 +	cout << "reading all keys from '" << iInputFileName << "'\n";
   1.993 +	// Check the read itself: testing eof() before getline parsed one bogus line at the end and never stopped after an over-long line.
   1.994 +	char line[1024];
   1.995 +	for (;;)
   1.996 +		{
   1.997 +		input_file.getline(line,sizeof(line));
   1.998 +		if (input_file.fail())
   1.999 +			break;
  1.1000 +		iLineNumber++;
  1.1001 +		int code[16];
  1.1002 +		int codeCount = 0;
  1.1003 +		int key_start = 0;
  1.1004 +		int keyCount = 0;
  1.1005 +		if (!ParseLine(line, code, codeCount, key_start, keyCount))
  1.1006 +			continue;	// goto next line
  1.1007 +		if (codeCount == 1 && keyCount == 1)
  1.1008 +			AddKeyOneToOne(line, code[0], key_start);
  1.1009 +		else if (codeCount == 1 && keyCount > 1)
  1.1010 +			AddKeyOneToMuch(line, code[0], key_start);
  1.1011 +		else if (codeCount > 1 && keyCount > 0)
  1.1012 +			AddKeyMuchToMuch(line, code, codeCount, key_start);
  1.1013 +		else
  1.1014 +			cout << "ignore line: " << line << "\n";
  1.1015 +		}
  1.1016 +	if (!input_file.eof())	// the loop was ended by a read failure, not by the end of the file
  1.1017 +		{
  1.1018 +		cout << "cannot read '" << iInputFileName << "' beyond line " << iLineNumber << " (line too long?)\n";
  1.1019 +		exit(1);
  1.1020 +		}
  1.1021 +	input_file.close();
  1.1022 +	}
  1.1023 +
  1.1024 +
  1.1025 +// Pack the 3 collation key levels into a single 32-bit integer: level 0 from bit 16 up, level 1 from bit 8, level 2 from bit 2, then the ignorable flag (bit 1) and the stop flag (bit 0).
  1.1026 +unsigned int Reader::PackKey(const CollationKey& aValue)
  1.1027 +	{
  1.1028 +	unsigned int level0 = aValue.iLevel[0];
  1.1029 +	unsigned int level1 = aValue.iLevel[1];
  1.1030 +	if (level1 > 0)	// a zero level is left as zero
  1.1031 +		level1 -= (KLevel1Min - 1);	// rebase so that KLevel1Min packs as 1
  1.1032 +	unsigned int level2 = aValue.iLevel[2];
  1.1033 +	if (level2 > 0)
  1.1034 +		level2 -= (KLevel2Min - 1);	// same rebasing, against KLevel2Min
  1.1035 +	unsigned int key = level0 << 16 | level1 << 8 | level2 << 2;	// NOTE(review): no masking here, so this assumes the levels already fit their fields - confirm against the parser
  1.1036 +	if (aValue.iIgnorable)
  1.1037 +		key |= 2;
  1.1038 +	if (aValue.iStop)
  1.1039 +		key |= 1;
  1.1040 +	return key;
  1.1041 +	}
  1.1042 +
  1.1043 +// Pack a collation index value into 32-bit words: one word for a code up to 0xFFFF, two (one per surrogate) for a larger code. Returns the number of words written to result.
  1.1044 +int Reader::PackIndex(const CollationIndex& aValue, unsigned int result[2])
  1.1045 +	{
  1.1046 +	unsigned int code = aValue.iCode;
  1.1047 +	unsigned int index = aValue.iIndex;
  1.1048 +	if (code <= 0xFFFF)
  1.1049 +		{
  1.1050 +		result[0] = (code << 16 | index);	// code in the upper half-word, key index below it
  1.1051 +		return 1;
  1.1052 +		}
  1.1053 +	else
  1.1054 +		{
  1.1055 +		result[0] = (::HighSurrogate(code) << 16 | index);	// both surrogate words carry the same key index
  1.1056 +		result[1] = (::LowSurrogate(code) << 16 | index);
  1.1057 +		return 2;
  1.1058 +		}
  1.1059 +	}
  1.1060 +
  1.1061 +const Reader* TheReader;	// the Reader consulted by the comparison callback below; WriteOutput sets it before sorting
  1.1062 +static int CompareStringIndices(const void* aIndex1,const void* aIndex2)	// qsort callback over packed string-index words
  1.1063 +	{
  1.1064 +	return TheReader->CompareStringIndices(*(unsigned int*)aIndex1 >> 16,*(unsigned int*)aIndex2 >> 16);	// the upper 16 bits of each word are the string's offset
  1.1065 +	}
  1.1066 +
  1.1067 +int CompareUnicodeStrings(const int *aString1,int aLength1,const int *aString2,int aLength2)	// element-wise comparison; returns <0, 0 or >0
  1.1068 +	{
  1.1069 +	for (int i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)	// runs to the end of the longer string
  1.1070 +		{
  1.1071 +		int x = i < aLength1 ? *aString1 : -1;	// past its end a string yields -1, so a proper prefix compares lower
  1.1072 +		int y = i < aLength2 ? *aString2 : -1;
  1.1073 +		if (x != y)
  1.1074 +			return x - y;	// the first differing element decides the result
  1.1075 +		}
  1.1076 +	return 0;
  1.1077 +	}
  1.1078 +
  1.1079 +int Reader::CompareStringIndices(int aIndex1,int aIndex2) const	// compares the two strings stored at these offsets in iStringElement
  1.1080 +	{
  1.1081 +	return CompareUnicodeStrings(iStringElement + aIndex1 + 1,iStringElement[aIndex1],	// the element at the offset is the length, the string follows it
  1.1082 +								 iStringElement + aIndex2 + 1,iStringElement[aIndex2]);
  1.1083 +	}
  1.1084 +
  1.1085 +void Reader::WriteOutput(const char* aFileName, bool aCopyright)
  1.1086 +	{
  1.1087 +	// Writes the collected keys, indices and strings to aFileName as C++ source; exits if the file cannot be opened.
  1.1088 +	// aCopyright is only looked at for a non-standard table, where it adds the copyright and description comments.
  1.1089 +	int i;
  1.1090 +	ofstream output_file;
  1.1091 +	output_file.open(aFileName);
  1.1092 +	if (output_file.fail())
  1.1093 +		{
  1.1094 +		cout << "cannot open output file '" << aFileName << "'\n";
  1.1095 +		exit(1);
  1.1096 +		}
  1.1097 +	cout << "writing output to '" << aFileName << "'\n";
  1.1098 +
  1.1099 +	char *locale = NULL;
  1.1100 +	if (iStandard)
  1.1101 +		locale = _strdup("Standard");
  1.1102 +	else
  1.1103 +		locale = _strdup(iLocaleName);
  1.1104 +
  1.1105 +	if (!iStandard)
  1.1106 +		{
  1.1107 +		_strlwr(locale);
  1.1108 +		locale[0] = (char)toupper((unsigned char)locale[0]);	// toupper must not be handed a negative char value
  1.1109 +		if (aCopyright)
  1.1110 +			{
  1.1111 +			char* capsFileName = new char[strlen(aFileName) + 1];
  1.1112 +			strcpy(capsFileName, aFileName);
  1.1113 +			_strupr(capsFileName);
  1.1114 +			output_file << "/*\n" << capsFileName << "\n\nCopyright (C) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.\n*/\n";
  1.1115 +			delete [] capsFileName;
  1.1116 +			output_file << "\n/*\nThe LCharSet object used by the " << locale << " locale.\n";
  1.1117 +			output_file << "Generated by COLTAB.\n*/\n";
  1.1118 +			}
  1.1119 +
  1.1120 +		output_file << "\n#include \"ls_std.h\"\n#include <collate.h>\n";
  1.1121 +		output_file << "\nconst TUint KUid" << iCPlusPlusIdentifier << "CollationMethod = ";
  1.1122 +		if (iUidString)
  1.1123 +			output_file << "0x" << iUidString << ";\n";
  1.1124 +		else
  1.1125 +			{
  1.1126 +			output_file << "/* FILL THIS IN */;\n";
  1.1127 +			cout << "Warning: File will need editing\nWarning: see coltab /h2 for details.\n";
  1.1128 +			}
  1.1129 +		}
  1.1130 +
  1.1131 +	/*
  1.1132 +	Write the unique collation keys.
  1.1133 +	Each one has the format, going from highest to lowest bit:
  1.1134 +
  1.1135 +	16 bits:	level-0 key
  1.1136 +	8 bits:		level-1 key
  1.1137 +	6 bits:		level-2 key
  1.1138 +	1 bit:		set if this key is optionally ignorable
  1.1139 +	1 bit:		set if this is the last key in the string of keys for a single Unicode value
  1.1140 +
  1.1141 +	*/
  1.1142 +	if (iKeys != 0)
  1.1143 +		{
  1.1144 +		output_file << "\nstatic const TUint32 The" << iCPlusPlusIdentifier << "Key[] = \n\t{";
  1.1145 +		CollationKey* ck = iCollationKey;
  1.1146 +		output_file << "\t // " << iKeys << " keys";
  1.1147 +		output_file << hex;
  1.1148 +		for (i = 0; i < iKeys; i++, ck++)
  1.1149 +			{
  1.1150 +			unsigned int key = PackKey(*ck);
  1.1151 +			if (i % 8 == 0)
  1.1152 +				output_file << "\n\t";
  1.1153 +			output_file << "0x";
  1.1154 +			output_file << key << ",";
  1.1155 +			}
  1.1156 +		output_file << dec;
  1.1157 +		output_file << "\n\t};\n\n";
  1.1158 +		}
  1.1159 +
  1.1160 +	if (iIndices != 0)
  1.1161 +		{
  1.1162 +		// Sort then write the collation index values - these relate Unicode values to collation keys.
  1.1163 +		qsort(iCollationIndex,iIndices,sizeof(CollationIndex),CollationIndex::Compare);
  1.1164 +		output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "Index[] = \n\t{";
  1.1165 +		CollationIndex* ci = iCollationIndex;
  1.1166 +		int entry=0;
  1.1167 +		output_file << "\t // " << iIndices << " indices";
  1.1168 +		output_file << hex;
  1.1169 +		for (i = 0; i < iIndices; i++, ci++, entry++)
  1.1170 +			{
  1.1171 +			unsigned int key[2];
  1.1172 +			int wordCount = PackIndex(*ci, key);
  1.1173 +			// entry counts the words written, eight to an output line; it runs ahead of i after a surrogate pair.
  1.1174 +			if (entry % 8 == 0)
  1.1175 +				output_file << "\n\t";
  1.1176 +			output_file << "0x";
  1.1177 +			output_file << key[0] << ",";
  1.1178 +
  1.1179 +			if (wordCount == 2)
  1.1180 +				{
  1.1181 +				entry++;
  1.1182 +				if (entry % 8 == 0)
  1.1183 +					output_file << "\n\t";
  1.1184 +				output_file << "0x";
  1.1185 +				output_file << key[1] << ",";
  1.1186 +				}
  1.1187 +			}
  1.1188 +		output_file << dec;
  1.1189 +		output_file << "\n\t};";
  1.1190 +		output_file << "\t // " << entry << " entries";
  1.1191 +		output_file << "\n\n";
  1.1192 +		iIndices = entry; //One surrogate pair occupies 2 entries; note that this overwrites the member, which the table structure below then reports
  1.1193 +		}
  1.1194 +
  1.1195 +	if (iStringElements)
  1.1196 +		{
  1.1197 +		// Write the Unicode strings; these are preceded by their lengths.
  1.1198 +		output_file << "static const TUint16 The" << iCPlusPlusIdentifier << "StringElement[] = \n\t{";
  1.1199 +		output_file << hex;
  1.1200 +		for (i = 0; i < iStringElements; i++)
  1.1201 +			{
  1.1202 +			if (i % 8 == 0)
  1.1203 +				output_file << "\n\t";
  1.1204 +			output_file << "0x" << iStringElement[i] << ",";
  1.1205 +			}
  1.1206 +		output_file << dec;
  1.1207 +		output_file << "\n\t};\n\n";
  1.1208 +
  1.1209 +		/*
  1.1210 +		Sort then write the string index values - these relate Unicode strings to collation keys.
  1.1211 +		Each one has the string index in the upper word and the key index in the lower word.
  1.1212 +		*/
  1.1213 +		TheReader = this;
  1.1214 +		qsort(iStringIndex,iStringIndices,sizeof(iStringIndex[0]),::CompareStringIndices);
  1.1215 +		output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "StringIndex[] = \n\t{";
  1.1216 +		output_file << hex;
  1.1217 +		for (i = 0; i < iStringIndices; i++)
  1.1218 +			{
  1.1219 +			if (i % 8 == 0)
  1.1220 +				output_file << "\n\t";
  1.1221 +			output_file << "0x" << iStringIndex[i] << ",";
  1.1222 +			}
  1.1223 +		output_file << dec;
  1.1224 +		if (iStringIndices ==0)
  1.1225 +			output_file << "0";
  1.1226 +		output_file << "\n\t};\n\n";
  1.1227 +		}
  1.1228 +
  1.1229 +	// Write the collation table structure.
  1.1230 +	output_file << "static const TCollationKeyTable The" << iCPlusPlusIdentifier << "Table = \n\t{ ";
  1.1231 +	if (iKeys)
  1.1232 +		output_file << "The" << iCPlusPlusIdentifier << "Key";
  1.1233 +	else
  1.1234 +		output_file << "0";
  1.1235 +	if (iIndices)
  1.1236 +		output_file << ", The" << iCPlusPlusIdentifier << "Index, " << iIndices;
  1.1237 +	else
  1.1238 +		output_file << ", 0, 0";
  1.1239 +	if (iStringElements)
  1.1240 +		output_file << ", The" << iCPlusPlusIdentifier << "StringElement, The" << iCPlusPlusIdentifier << "StringIndex, " << iStringIndices << " };\n";
  1.1241 +	else
  1.1242 +		output_file << ", 0, 0, 0 };\n";
  1.1243 +
  1.1244 +	if (!iStandard)
  1.1245 +		output_file << "\nstatic const TCollationMethod TheCollationMethod[] = \n"\
  1.1246 +			"	{\n"\
  1.1247 +			"		{\n"\
  1.1248 +			"		KUid" << iCPlusPlusIdentifier << "CollationMethod, // the method for the locale\n"\
  1.1249 +			"		NULL, // use the standard table as the main table\n"\
  1.1250 +			"		&The" << iCPlusPlusIdentifier << "Table, // the locale values override the standard values\n"\
  1.1251 +			"		0 // the flags are standard\n"\
  1.1252 +			"		},\n"\
  1.1253 +			"		{\n"\
  1.1254 +			"		KUidBasicCollationMethod, // the standard unlocalised method\n"\
  1.1255 +			"		NULL, // null means use the standard table\n"\
  1.1256 +			"		NULL, // there's no override table\n"\
  1.1257 +			"		0 // the flags are standard\n"\
  1.1258 +			"		}\n"\
  1.1259 +			"	};\n"\
  1.1260 +			"\n"\
  1.1261 +			"static const TCollationDataSet TheCollationDataSet =\n"\
  1.1262 +			"	{\n"\
  1.1263 +			"	TheCollationMethod,\n"\
  1.1264 +			"	2\n"\
  1.1265 +			"	};"\
  1.1266 +			"\n\n"\
  1.1267 +			"// The one and only locale character set object.\n"\
  1.1268 +			"const LCharSet TheCharSet =\n"\
  1.1269 +			"	{\n"\
  1.1270 +			"	NULL,\n"\
  1.1271 +			"	&TheCollationDataSet\n"\
  1.1272 +			"	};\n";
  1.1273 +
  1.1274 +	output_file.close();
  1.1275 +	free(locale);	// _strdup allocates with malloc, so the buffer must go back through free(), not delete []
  1.1276 +	}
  1.1277 +
  1.1278 +int CollationIndex::Compare(const void* aIndex1,const void* aIndex2)	// qsort callback: orders CollationIndex entries by ascending iCode
  1.1279 +	{
  1.1280 +	return ((CollationIndex*)aIndex1)->iCode - ((CollationIndex*)aIndex2)->iCode;	// negative, zero or positive, as qsort expects
  1.1281 +	}