diff -r 000000000000 -r bde4ae8d615e os/textandloc/localisation/localesupport/coltab/COLTAB.CPP --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/os/textandloc/localisation/localesupport/coltab/COLTAB.CPP Fri Jun 15 03:10:57 2012 +0200 @@ -0,0 +1,1278 @@ +// Copyright (c) 1999-2009 Nokia Corporation and/or its subsidiary(-ies). +// All rights reserved. +// This component and the accompanying materials are made available +// under the terms of "Eclipse Public License v1.0" +// which accompanies this distribution, and is available +// at the URL "http://www.eclipse.org/legal/epl-v10.html". +// +// Initial Contributors: +// Nokia Corporation - initial contribution. +// +// Contributors: +// +// Description: +// Reads and parses the Unicode collation value table and writes out a C++ source file +// containing the data in a form that can be used by the EPOC collation system. +// +// The program reads either three separate files or one combined file: +// +// Three files (by default): +// 1. Base keys (maps single Unicode values to single collation key values): must be in the same format as +// basekeys.txt, supplied with the Standard Unicode Collation system +// +// 2. Composite keys (maps single Unicode values to strings of collation keys): must be in the same format as +// compkeys.txt, supplied with the Standard Unicode Collation system +// +// 3. Strings (maps strings of Unicode values to single collation keys OR strings of collation keys): must be in the +// same format as compkeys.txt, except that there can be any number of Unicode characters at the start of the line, +// space-separated and each exactly 4 hex digits. +// +// One combined file (with option /a): +// 1. All Keys (combines the above three files into one file): must be in the same format as allkeys.txt, supplied with the Standard Unicode Collation system (after Unicode 3.0). 
//
//

// NOTE(review): the header names on the #include lines below were missing from this copy of
// the file. They are restored from the library facilities the code uses (toupper/tolower,
// sprintf, bsearch/qsort/strtoul/exit, strlen/strcmp/strchr, cout/ifstream/ofstream);
// confirm against the build before relying on the exact list.
#include <assert.h>
#include <ctype.h>

#ifdef __MSVCDOTNET__
#include <fstream>
#include <iostream>
using namespace std;
#else //!__MSVCDOTNET__
#include <fstream.h>
#include <iostream.h>
#endif //__MSVCDOTNET__

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
Constants constraining the range of level-1 and level-2 keys so that they can be packed.
Non-zero values are reduced by one less than the minimum value.
*/
const unsigned int KLevel1Bits = 8;
const unsigned int KLevel1Min = 0x20;
const unsigned int KLevel1Max = KLevel1Min + (1 << KLevel1Bits) - 2;
const unsigned int KLevel2Bits = 6;
const unsigned int KLevel2Min = 1;
const unsigned int KLevel2Max = KLevel2Min + (1 << KLevel2Bits) - 2;

/*
Table of characters in the WGL4 set, plus characters in canonical decompositions of
those characters, plus commonly used control characters and space characters,
given as ranges of Unicode characters. In each pair, the first code is the first in the range,
and the second is the first code NOT in the range.

The pairs must stay sorted in ascending order because InWgl4 searches them with bsearch.

The extra characters are added mainly to ensure that control characters and spaces are
normally ignored. The extra characters are:

0x0000-0x001F: ASCII control characters
0x2000-0x2012: spaces, hyphen variants, figure dash
0x2028-0x202E: line and paragraph separator, bidirectional control characters
0xFEFF       : byte-order mark
0xFFFC-0xFFFD: object replacement character, replacement character
*/
const unsigned int Wgl4Range[] =
	{
	0x00, 0x7f,		// All ASCII (0x7f itself is outside the range)
	0xa0, 0x180,	// Non-breaking space, Latin-1, Latin Extended-A
	0x192,0x193,	// Latin f with hook
	0x1fa,0x200,	// A-ring, a-ring, AE, ae, O slash, o slash all with acute accent
	0x2c6,0x2c8,	// non-combining circumflex and caron
	0x2c9,0x2ca,	// non-combining macron
	0x2d8,0x2dc,	// non-combining breve, dot above, ring above, ogonek
	0x2dd,0x2de,	// non-combining double acute
	0x300,0x305,	// combining grave, acute, circumflex, tilde, macron
	0x306,0x309,	// combining breve, dot above, double dot above
	0x30a,0x30e,	// combining ring above, double acute, caron, vertical line above
	0x327,0x329,	// combining cedilla, ogonek
	0x384,0x38b,	// Greek
	0x38c,0x38d,	// Greek
	0x38e,0x3a2,	// Greek
	0x3a3,0x3cf,	// Greek
	0x401,0x40d,	// Cyrillic
	0x40e,0x450,	// Cyrillic
	0x451,0x45d,	// Cyrillic
	0x45e,0x460,	// Cyrillic
	0x490,0x492,	// Cyrillic
	0x1e80,0x1e86,	// Both W and w with each of grave, acute and diaeresis
	0x1ef2,0x1ef4,	// Y with grave, y with grave
	0x2000,0x2016,	// various space and horizontal lines
	0x2017,0x201f,	// double vertical line, double low line, various quotation marks
	0x2020,0x2023,	// dagger, double dagger, bullet
	0x2026,0x2027,	// ellipsis
	0x2028,0x202F,	// line & paragraph separators and directional formatting
	0x2030,0x2031,	// per mille
	0x2032,0x2034,	// prime
	0x2039,0x203b,	// single angle quotation marks
	0x203c,0x203d,	// double exclamation mark
	0x203e,0x203f,	// non-combining overscore
	0x2044,0x2045,	// fraction slash
	0x207f,0x2080,	// superscript n
	0x20a3,0x20a5,	// French Franc, Italian/Turkish Lira
	0x20a7,0x20a8,	// Spanish Peseta
	0x20ac,0x20ad,	// Euro symbol
	0x2105,0x2106,	// care of
	0x2113,0x2114,	// script l
	0x2116,0x2117,	// numero
	0x2122,0x2123,	// trade mark
	0x2126,0x2127,	// ohm
	0x212e,0x212f,	// estimated (net weight)
	0x215b,0x215f,	// 1/8, 3/8, 5/8, 7/8
	0x2190,0x2196,	// horizontal and vertical arrows
	0x21a8,0x21a9,	// up down arrow with base
	0x2202,0x2203,	// partial differential
	0x2206,0x2207,	// increment (delta)
	0x220f,0x2210,	// n-ary product (pi)
	0x2211,0x2213,	// n-ary sum (sigma), minus
	0x2215,0x2216,	// division (slash)
	0x2219,0x221b,	// bullet operator, square root
	0x221e,0x2220,	// infinity, right angle
	0x2229,0x222a,	// intersection
	0x222b,0x222c,	// integral
	0x2248,0x2249,	// almost equal to
	0x2260,0x2262,	// not equal to, identical to
	0x2264,0x2266,	// less-than-or-equal-to, greater-than-or-equal-to
	0x2302,0x2303,	// house
	0x2310,0x2311,	// reversed not sign
	0x2320,0x2322,	// top and bottom of integral
	0x2500,0x2501,	// box drawing
	0x2502,0x2503,	// box drawing
	0x250c,0x250d,	// box drawing
	0x2510,0x2511,	// box drawing
	0x2514,0x2515,	// box drawing
	0x2518,0x2519,	// box drawing
	0x251c,0x251d,	// box drawing
	0x2524,0x2525,	// box drawing
	0x252c,0x252d,	// box drawing
	0x2534,0x2535,	// box drawing
	0x253c,0x253d,	// box drawing
	0x2550,0x256d,	// box drawing
	0x2580,0x2581,	// block element
	0x2584,0x2585,	// block element
	0x2588,0x2589,	// block element
	0x258c,0x258d,	// block element
	0x2590,0x2594,	// block element
	0x25a0,0x25a2,	// geometric shapes
	0x25aa,0x25ad,	// geometric shapes
	0x25b2,0x25b3,	// geometric shapes
	0x25ba,0x25bb,	// geometric shapes
	0x25bc,0x25bd,	// geometric shapes
	0x25c4,0x25c5,	// geometric shapes
	0x25ca,0x25cc,	// geometric shapes
	0x25cf,0x25d0,	// geometric shapes
	0x25d8,0x25da,	// geometric shapes
	0x25e6,0x25e7,	// geometric shapes
	0x263a,0x263d,	// smilies, sun
	0x2640,0x2641,	// female
	0x2642,0x2643,	// male
	0x2660,0x2661,	// spade
	0x2663,0x2664,	// club
	0x2665,0x2667,	// heart
	0x266a,0x266c,	// quaver, beamed quavers
	0xfb01,0xfb03,	// fi, fl ligatures
	0xfeff,0xff00,	// zero-width non-breaking space
	0xfffc, 0xfffe	// object replacement character and replacement character
	};
// Number of (first, one-past-last) pairs in Wgl4Range.
const int Wgl4Ranges = sizeof(Wgl4Range) / sizeof(Wgl4Range[0]) / 2;

/*
bsearch comparison function for Wgl4Range.

One argument is a search key, encoded as a degenerate pair whose two entries are the same
character; the other is a (first, one-past-last) range from the table. Either order is
accepted. The result describes the FIRST argument relative to the SECOND: negative if it
sorts before, positive if it sorts after, zero if the key lies inside the range.
*/
int CompareWgl4Ranges(const void* aRange1,const void* aRange2)
	{
	const unsigned int* key = static_cast<const unsigned int*>(aRange1);
	const unsigned int* range = static_cast<const unsigned int*>(aRange2);
	int sign = 1;
	if (range[0] == range[1] && key[0] != key[1])
		{
		// The key was passed second: swap the roles and remember to flip the result,
		// so that the answer is still expressed in terms of the first argument.
		const unsigned int* temp = key;
		key = range;
		range = temp;
		sign = -1;
		}
	if (key[0] < range[0])
		return -sign;
	if (key[0] >= range[1])
		return sign;
	return 0;
	}

// Determine if a character is in the WGL4 character repertoire (as extended by Wgl4Range).
static bool InWgl4(unsigned int aChar)
	{
	// The key is a degenerate range so that CompareWgl4Ranges can tell it from a table entry.
	unsigned int key[2];
	key[0] = key[1] = aChar;
	return bsearch(key,Wgl4Range,Wgl4Ranges,sizeof(Wgl4Range[0]) * 2,CompareWgl4Ranges) != NULL;
	}

// A collation key.
class CollationKey
	{
public:
	// Two keys are equal only if all three levels and both flags match.
	bool operator==(const CollationKey& k) const
		{ return iLevel[0] == k.iLevel[0] && iLevel[1] == k.iLevel[1] && iLevel[2] == k.iLevel[2] &&
				 iIgnorable == k.iIgnorable && iStop == k.iStop; }

	enum
		{
		ELevels = 3
		};
	int iLevel[ELevels];// the keys at the various levels
	bool iIgnorable;	// TRUE if this key can normally be ignored
	bool iStop;			// TRUE if this is the last key in a string of keys
	};

// The collation index for a single Unicode value.
+class CollationIndex + { +public: + static int Compare(const void* aIndex1,const void* aIndex2); + + int iCode; // Unicode value + int iIndex; // index into the key table + }; + +class Reader + { +public: + Reader(bool aWgl4,bool aStandard,const char* aLocaleName, const char* aUidString); + ~Reader(); + void ReadBaseKeys(const char* aFileName); + void ReadCompKeys(const char* aFileName); + void ReadStrings(const char* aFileName); + void ReadAllKeys(const char* aFileName); + void WriteOutput(const char* aFileName, bool aCopyrightMessage); + int CompareStringIndices(int aIndex1,int aIndex2) const; + +private: + Reader(const Reader&); + int Hex(const char *aString, int &aCharConsumed, bool aTolerate = false); + void GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey=NULL); + void GetMultipleCollationKeys(const char* aString); + unsigned int PackKey(const CollationKey& aValue); + int PackIndex(const CollationIndex& aValue, unsigned int result[2]); + bool ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount); + void AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart); + void AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart); + void AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart); + + enum + { + EMaxCollationKeys = 0x110000 * 2, /*more elements considering composite keys */ + EMaxCollationIndices = 0x110000, + EMaxStringElements = 65536, + EMaxStringIndices = 65536 + }; + CollationKey iCollationKey[EMaxCollationKeys]; + int iKeys; + CollationIndex iCollationIndex[EMaxCollationIndices]; + int iIndices; + int iStringElement[EMaxStringElements]; + int iStringElements; + unsigned int iStringIndex[EMaxStringIndices]; + int iStringIndices; + const char* iInputFileName; + int iLineNumber; + bool iSuppressCanonseqWarning; // have we issued the canonseq warning yet? 
+ bool iWgl4; // true if writing keys for wgl4 characters only + bool iStandard; // true if reading standard files, not tailoring files + const char* iLocaleName; + const char* iUidString; + char* iCPlusPlusIdentifier; // iLocaleName in title case with difficult characters removed + }; + +bool isValidHexDigit(char c) + { + if ('0' <= c && c <= '9') + return true; + if ('a' <= c && c <= 'f') + return true; + if ('A' <= c && c <= 'F') + return true; + return false; + } + +void PrintUsage() + { + cout << "Usage: coltab [/u] [/c] [/a] [/h] \n"; + cout << "By Default (without /a option), for the locales 'standard' and 'wgl4' coltab reads basekeys.txt & compkeys.txt\n"; + cout << "For any other locale name coltab reads _basekeys.txt,\n"; + cout << "_compkeys.txt and _strings.txt.\n"; + cout << "Use the /a option, for the locales 'standard' and 'wgl4' coltab reads allkeys.txt\n"; + cout << "For any other locale name coltab reads _allkeys.txt.\n"; + cout << "The output file is always ls_.cpp.\n"; + cout << "Use the /u option to specify the UID that the collation table should have.\n"; + cout << "A hex number must follow /u immediately, for example /u800ACBDE\n"; + cout << "this hex number must not exceed eight digits. 
If this is not specified,\n"; + cout << "the output file will have to be edited to make it compilable.\n"; + cout << "Specify /c to prefix the output with a Nokia copyright message.\n"; + cout << "Specify /h for in-depth help."; + } + +void UsageError() + { + PrintUsage(); + exit(1); + } + +void PrintHelp(char* aTopic) + { + int topic = 0; + while ('0' <= *aTopic && *aTopic <= '9') + { + topic = topic * 10 + (*aTopic - '0'); + ++aTopic; + } + switch(topic) + { + case 1: + cout << "How Coltab interprets CANONSEQ:\n\n"\ + "If the CANONSEQ specifier is used in a line, Coltab will ignore the mapping.\n"\ + "This because, on the Symbian platform, any canonically composed character is\n"\ + "decomposed before the key mapping is applied, so characters with canonical\n"\ + "decompositions do not need keys. In files supplied by the Unicode Consortium,\n"\ + "all mappings for composed characters are flagged by CANONSEQ, so it is useful\n"\ + "if Coltab can just ignore these so that Unicode Consortium files can be used\n"\ + "unedited.\n\n"\ + "This can cause problems if a localizer copies a line from a Unicode file into,\n"\ + "say, the _strings.txt file, in order to give a mapping for an accented\n"\ + "character. The localizer replaces the composed character code with the\n"\ + "decomposition and changes the keys but forgets to remove the CANONSEQ\n"\ + "specifier. In this case the key would be ignored. Coltab provides a warning so\n"\ + "that this can be put right.\n\n"\ + "Coltab will only warn about the first CANONSEQ in each file, and does not warn\n"\ + "if the 'standard' or 'wgl4' options are used."; + exit(1); + break; + case 2: + cout << "How to ensure coltab's output files are compilable.\n\n"\ + "By default, Coltab's files for locales need to be edited before they are\n"\ + "compilable. The UID for the collation method needs to be filled in. This UID\n"\ + "is added so that the collation table can be searched for later. 
At present,\n"\ + "this UID is not necessary for the correct functioning of the Symbian platform\n"\ + "and so a value of 0 can be safely used.\n\n"\ + "To insert this value into the file directly, use the /u option, for example\n"\ + "coltab /u0 french\n"\ + "If the /u option is used, the file should be compilable as is. If it is not,\n"\ + "please raise it as a defect with Symbian's internationalization team,\n"\ + "supplying the files that caused the problem if this is possible.\n"\ + "If the 'standard' or 'wgl4' options are used, no UID is output, so the /u\n"\ + "option is not required."; + exit(1); + break; + case 3: + cout << "How to ensure collation key values are inside the supported range. \n\n"\ + "According to Unicode Standard, the range suppored by tool COLTAB:\n"\ + " Level 0 (primary): 0000 - FFFF, \n"\ + " Level 1 (Secondary): 0020 - 011E, \n"\ + " Level 2 (Tertiary): 0001 - 003F. \n"\ + "Please edit your collation files and make sure key values are inside the above range"; + exit(1); + break; + default: + PrintUsage(); + cout << "\n\nSpecify /h1 for help on the use of CANONSEQ\n"; + cout << "Specify /h2 for help on making compilable files that do not need editing\n"; + exit(1); + break; + } + } + +short HighSurrogate(int aCode) + { + return static_cast(0xD7C0 + (aCode >> 10)); + } + +short LowSurrogate(int aCode) + { + return static_cast(0xDC00 | (aCode & 0x3FF)); + } + +int main(int argc,char** argv) + { + bool copyright = false; + bool wgl4 = false; + bool allKeys = false; + const char* prefix = ""; + const char* infix = ""; + const char* locale = ""; + char* localeArg = 0; + char* uidArg = 0; + for (int i = 1; i < argc; ++i) + { + if (argv[i][0] == '/' || argv[i][0] == '-') + { + switch (argv[i][1]) + { + case 'u': + case 'U': + { + uidArg = argv[i] + 2; + const char* uidCheck = uidArg; + while (*uidCheck) + { + if (!isValidHexDigit(*uidCheck)) + UsageError(); + ++uidCheck; + } + if (uidCheck == uidArg || 8 < uidCheck - uidArg) + UsageError(); + 
break; + } + case 'c': + case 'C': + copyright = true; + break; + case 'a': + allKeys = true; + break; + case 'h': + case 'H': + PrintHelp(argv[i] + 2); + break; + default: + UsageError(); + break; + } + } + else if (!localeArg) + localeArg = argv[i]; + else + UsageError(); + } + if (!localeArg) + UsageError(); + bool standard = false; + if (!_stricmp(localeArg, "standard")) + { + locale = "Standard"; + standard = true; + } + else if (!_stricmp(localeArg, "wgl4")) + { + locale = "Wgl4"; + wgl4 = true; + standard = true; + } + else + { + locale = prefix = localeArg; + infix = "_"; + } + + Reader* reader = new Reader(wgl4, standard, locale, uidArg); + if (!reader) + { + cout << "out of memory\n"; + exit(1); + } + char* filename = new char[strlen(prefix) + strlen(infix) + 64]; + if (allKeys == false) + { + sprintf(filename,"%s%scompkeys.txt",prefix,infix); + reader->ReadCompKeys(filename); + if (!standard) + { + sprintf(filename,"%s%sstrings.txt",prefix,infix); + reader->ReadStrings(filename); + } + sprintf(filename,"%s%sbasekeys.txt",prefix,infix); + reader->ReadBaseKeys(filename); + } + else + { + sprintf(filename,"%s%sAllKeys.txt",prefix,infix); + reader->ReadAllKeys(filename); + } + sprintf(filename,"ls_%s.cpp", localeArg); + reader->WriteOutput(filename, copyright); + + delete reader; + delete [] filename; + return 0; + } + +Reader::Reader(bool aWgl4, bool aStandard, + const char* aLocaleName, const char* aUidString): + iKeys(0), + iIndices(0), + iStringElements(0), + iStringIndices(0), + iInputFileName(NULL), + iLineNumber(0), + iSuppressCanonseqWarning(false), + iWgl4(aWgl4), + iStandard(aStandard), + iLocaleName(aLocaleName), + iUidString(aUidString) + { + if (iStandard) + { + iCPlusPlusIdentifier = new char[9]; + strcpy(iCPlusPlusIdentifier, "Standard"); + return; + } + char* p = iCPlusPlusIdentifier = new char[strlen(aLocaleName) + 2]; + int current = toupper(aLocaleName[0]); + if (current < 'A' || 'Z' < current) + *p++ = 'C'; + else + { + *p++ = 
static_cast(current); + ++aLocaleName; + } + bool inUnderScore = false; + while (*aLocaleName) + { + current = tolower(*aLocaleName++); + if (current < 'a' || 'z' < current) + { + if (!inUnderScore) + { + inUnderScore = true; + *p++ = '_'; + } + } + else + { + inUnderScore = false; + *p++ = static_cast(current); + } + } + *p = 0; + } + +Reader::~Reader() + { + delete [] iCPlusPlusIdentifier; + } + +// Get a hex number of exactly four digits from aString. Return -1 if none is found and aTolerate is true. +int Reader::Hex(const char *aString, int &aCharConsumed, bool aTolerate) + { + char *end; + unsigned long x = strtoul(aString,&end,16); + aCharConsumed = end - aString; + if ((aCharConsumed != 4) && (aCharConsumed != 5) && (aCharConsumed != 6)) + { + if (!aTolerate) + { + cout << "bad hex number on line " << iLineNumber << " of file " << iInputFileName << '\n'; + exit(1); + } + return -1; + } + return x; + } + +// Get a collation value from a string of the form [.xxxx.xxxx.xxxx.xxxx] +void Reader::GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey) + { + aCharConsumed = 0; + const char *end = strchr(aString, ']'); + if (end != NULL){ + aCharConsumed = end - aString; + } + + if (aString[0] != '[' || (aCharConsumed != 21 && aCharConsumed != 22 && aCharConsumed != 23)) + { + cout << "syntax error on line " << iLineNumber << " of file " << iInputFileName << '\n'; + exit(1); + } + if (aKey == NULL) + { + if (iKeys >= EMaxCollationKeys) + { + cout << "too many keys"; + exit(1); + } + aKey = &iCollationKey[iKeys++]; + } + aKey->iIgnorable = aString[1] == '*'; // asterisk means that this character is normally ignored + int charConsumed = 0; + for (int i = 0; i < CollationKey::ELevels; i++) + aKey->iLevel[i] = Hex(aString + 2 + i * 5, charConsumed); + + if (aKey->iLevel[1] > 0 && (aKey->iLevel[1] < KLevel1Min || aKey->iLevel[1] > KLevel1Max)) + { + aKey->iLevel[1] = KLevel1Max; + cout << "illegal level-1 key value on line " << iLineNumber << "; 
outside the range " << KLevel1Min << ".." << KLevel1Max << "\n"; + cout << "Error: illegal key value in file, please see coltab /h3 for details.\n"; + exit(1); + } + + if (aKey->iLevel[2] > 0 && (aKey->iLevel[2] < KLevel2Min || aKey->iLevel[2] > KLevel2Max)) + { + cout << "illegal level-2 key value on line " << iLineNumber << "; outside the range " << KLevel2Min << ".." << KLevel2Max << "\n"; + cout << "Error: illegal key value in file, please see coltab /h3 for details.\n"; + exit(1); + } + + aKey->iStop = true; + } + +void Reader::GetMultipleCollationKeys(const char* aString) + { + int keyCount = 0; + int charConsumed =0; + while (aString[0] == '[') + { + GetCollationKey(aString, charConsumed); + + keyCount++; + iCollationKey[iKeys - 1].iStop = false; + int length = strlen(aString); + if (length <= charConsumed + 1) + break; + aString += charConsumed + 1; + + if (aString[0] == ' ') //a space is put between collation keys in keys files provided by previous Unicode Standard (i.e 3.1) + aString++; + + } + iCollationKey[iKeys - 1].iStop = true; + } + +/* +Partially parse a line, returning its key code and the start of its first block of key data. +Return false if it is not a data line, or not relevant. +*/ +bool Reader::ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount) + { + int lineLength = strlen(aLine); + int charConsumed = 0; + aCodeCount = 0; + aCode[0] = Hex(aLine,charConsumed,true); + + /* + A data line must start with a hex number and be at least 27 characters long. + Canonically decomposable Unicode characters are skipped. + Skip non-WGL4 characters if doing WGL4 only. 
+ */ + if (aCode[0] != -1) + { + aCodeCount = 1; + if (!strcmp(aLine + lineLength - 8,"CANONSEQ")) + { + if (!iSuppressCanonseqWarning) + { + cout << "Warning: CANONSEQ used in file " << iInputFileName + << " on line " << iLineNumber << ".\nWarning: All mappings specifying CANONSEQ are ignored.\n" + << "Warning: Use coltab /h1 for more details."; + iSuppressCanonseqWarning = true; + } + aCodeCount = 0; + } + else if (lineLength < 27 || + (iWgl4 && !InWgl4((unsigned int)aCode))) + aCodeCount = 0; + } + + if (aCode[0] != -1) + { + // find '[' + aKeyStart = charConsumed; + while (aKeyStart < lineLength && aLine[aKeyStart] != '[') + aKeyStart++; + + // read all hex before '[' + int index = charConsumed + 1; + while (index < aKeyStart) + { + aCode[aCodeCount] = Hex(aLine+index, charConsumed, true); + if (aCode[aCodeCount] == -1) + break; + + index += charConsumed + 1; + aCodeCount++; + } + + // find number of collation keys + aKeyCount = 0; + index = aKeyStart; + while (index < lineLength && aLine[index] != '%' && aLine[index] != '#') + { + if (aLine[index] == '[') + aKeyCount++; + index++; + } + } + + return aCodeCount > 0; + } + +void Reader::AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart) + { + if (iIndices >= EMaxCollationIndices) + { + cout << "too many Unicode values"; + exit(1); + } + CollationIndex& index = iCollationIndex[iIndices++]; + index.iCode = aCode; + index.iIndex = -1; + + /* + First try to find the key in the array of keys found so far. + Search backwards to use the fact that runs of the same key occur together. + */ + CollationKey key; + int charConsumed = 0; + GetCollationKey(aLine + aKeyStart, charConsumed, &key); + for (int i = iKeys - 1; i >= 0 && index.iIndex == -1; i--) + if (iCollationKey[i] == key) + index.iIndex = i; + + // If that fails, add a new key. 
+ if (index.iIndex == -1) + { + index.iIndex = iKeys++; + if (iKeys > EMaxCollationKeys) + { + cout << "too many keys"; + exit(1); + } + iCollationKey[index.iIndex] = key; + } + } +/* +Read 1-to-1 mapping. Sample: +02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME + +aCombinedFile = true: aFileName is combined file, which contains base keys, comp keys, and string keys. +*/ +void Reader::ReadBaseKeys(const char* aFileName) + { + iSuppressCanonseqWarning = iStandard || iWgl4; + iLineNumber = 0; + iInputFileName = aFileName; + ifstream input_file; + +#ifdef __MSVCDOTNET__ + input_file.open(iInputFileName, ios::in); +#else //!__MSVCDOTNET__ + input_file.open(iInputFileName, ios::in | ios::nocreate); +#endif //__MSVCDOTNET__ + + if (input_file.fail()) + { + cout << "cannot open input file '" << iInputFileName << "'\n"; + exit(1); + } + cout << "reading base keys from '" << iInputFileName << "'\n"; + + char line[1024]; + for (;;) + { + input_file.getline(line,sizeof(line)); + if (input_file.eof()) + break; + iLineNumber++; + // line number counting + if (iLineNumber % 100 == 0) + { + cout << "line " << iLineNumber << '\n'; + cout.flush(); + } + int code[16]; + int codeCount = 0; + int key_start = 0; + int keyCount = 0; + if (ParseLine(line, code, codeCount, key_start, keyCount)) + { + if (codeCount != 1 || keyCount != 1) + continue; // goto next line + AddKeyOneToOne(line, code[0], key_start); + } + } + + input_file.close(); + } + +void Reader::AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart) + { + if (iIndices >= EMaxCollationIndices) + { + cout << "too many Unicode values"; + exit(1); + } + CollationIndex& index = iCollationIndex[iIndices++]; + index.iCode = aCode; + index.iIndex = iKeys; + GetMultipleCollationKeys(aLine + aKeyStart); + } +/* +Read 1-to-much mapping. 
+3303 ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN +*/ +void Reader::ReadCompKeys(const char* aFileName) + { + iSuppressCanonseqWarning = iStandard || iWgl4; + iLineNumber = 0; + iInputFileName = aFileName; + ifstream input_file; + +#ifdef __MSVCDOTNET__ + input_file.open(iInputFileName, ios::in); +#else //!__MSVCDOTNET__ + input_file.open(iInputFileName, ios::in | ios::nocreate); +#endif //__MSVCDOTNET__ + + if (input_file.fail()) + { + cout << "there are no composite keys; '" << iInputFileName << "' not found\n"; + return; + } + cout << "reading composite keys from '" << iInputFileName << "'\n"; + + char line[1024]; + for (;;) + { + input_file.getline(line,sizeof(line)); + if (input_file.eof()) + break; + iLineNumber++; + // line number counting + if (iLineNumber % 100 == 0) + { + cout << "line " << iLineNumber << '\n'; + cout.flush(); + } + int code[16]; + int codeCount = 0; + int key_start = 0; + int keyCount = 0; + if (ParseLine(line, code, codeCount, key_start, keyCount)) + { + if (codeCount != 1 || keyCount < 2) + continue; // goto next line + AddKeyOneToMuch(line, code[0], key_start); + } + } + + input_file.close(); + } + + +void Reader::AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart) + { + + // Store the index to the Unicode string and the key sequence. + if (iStringIndices > EMaxStringIndices) + { + cout << "too many string indices"; + exit(1); + } + iStringIndex[iStringIndices++] = (iStringElements << 16) | iKeys; + + // Reserve space for the length. + if (iStringElements >= EMaxStringElements) + { + cout << "too many string elements"; + exit(1); + } + iStringElements++; + + // Read the Unicode string. + int length = 0; // in unit of int16 + int charCount = 0; // in unit of char. for debug. 
+ + for (int i=0; i= EMaxStringElements) + { + cout << "too many string elements"; + exit(1); + } + + if (aCode[i] > 0xFFFF) + { + // UCS4 --> UTF-16 + iStringElement[iStringElements++] = 0xD7C0 + (aCode[i] >> 10); + iStringElement[iStringElements++] = 0xDC00 | (aCode[i] & 0x3FF); + length += 2; + } + else + { + iStringElement[iStringElements++] = aCode[i]; + length++; + } + charCount++; + } + + iStringElement[iStringElements - length - 1] = (unsigned int)length; + + // Read the key sequence. + GetMultipleCollationKeys(aLine + aKeyStart); + } +/* +Read much-to-much mapping. Sample: +004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke +0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # +*/ +void Reader::ReadStrings(const char* aFileName) + { + iSuppressCanonseqWarning = iStandard || iWgl4; + iLineNumber = 0; + iInputFileName = aFileName; + ifstream input_file; + +#ifdef __MSVCDOTNET__ + input_file.open(iInputFileName, ios::in); +#else //!__MSVCDOTNET__ + input_file.open(iInputFileName, ios::in | ios::nocreate); +#endif //__MSVCDOTNET__ + + if (input_file.fail()) + { + cout << "there are no strings; '" << iInputFileName << "' not found\n"; + return; + } + cout << "reading strings from '" << iInputFileName << "'\n"; + + char line[1024]; + for (;;) + { + input_file.getline(line,sizeof(line)); + if (input_file.eof()) + break; + iLineNumber++; + // line number counting + if (iLineNumber % 100 == 0) + { + cout << "line " << iLineNumber << '\n'; + cout.flush(); + } + int code[16]; + int codeCount = 0; + int key_start = 0; + int keyCount = 0; + if (ParseLine(line, code, codeCount, key_start, keyCount)) + { + if (codeCount < 2 || keyCount < 1) + continue; // goto next line + AddKeyMuchToMuch(line, code, codeCount, key_start); + } + } + + input_file.close(); + } + +/* +Read combined key table. 
Sample: +1-to-1 mapping: +02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME + +1-to-much mapping: +3303 ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN + +much-to-much mapping: +004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke +0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # +*/ +void Reader::ReadAllKeys(const char* aFileName) + { + iSuppressCanonseqWarning = iStandard || iWgl4; + iLineNumber = 0; + iInputFileName = aFileName; + ifstream input_file; + +#ifdef __MSVCDOTNET__ + input_file.open(iInputFileName, ios::in); +#else //!__MSVCDOTNET__ + input_file.open(iInputFileName, ios::in | ios::nocreate); +#endif //__MSVCDOTNET__ + + if (input_file.fail()) + { + cout << "there are no keys; '" << iInputFileName << "' not found\n"; + return; + } + cout << "reading all keys from '" << iInputFileName << "'\n"; + + char line[1024]; + for (;;) + { + if (input_file.eof()) + break; + input_file.getline(line,sizeof(line)); + iLineNumber++; + + int code[16]; + int codeCount = 0; + int key_start = 0; + int keyCount = 0; + if (ParseLine(line, code, codeCount, key_start, keyCount)) + { + if (codeCount == 1 && keyCount == 1) + AddKeyOneToOne(line, code[0], key_start); + else if (codeCount == 1 && keyCount > 1) + AddKeyOneToMuch(line, code[0], key_start); + else if (codeCount > 1 && keyCount > 0) + AddKeyMuchToMuch(line, code, codeCount, key_start); + else + cout << "ignore line: " << line << "\n"; + } + } + + input_file.close(); + } + + +// Pack the 3 collation key levels into a single 32-bit integer. 
+unsigned int Reader::PackKey(const CollationKey& aValue) + { + unsigned int level0 = aValue.iLevel[0]; + unsigned int level1 = aValue.iLevel[1]; + if (level1 > 0) + level1 -= (KLevel1Min - 1); + unsigned int level2 = aValue.iLevel[2]; + if (level2 > 0) + level2 -= (KLevel2Min - 1); + unsigned int key = level0 << 16 | level1 << 8 | level2 << 2; + if (aValue.iIgnorable) + key |= 2; + if (aValue.iStop) + key |= 1; + return key; + } + +// Pack a collation index value into a single 32-bit integer. +int Reader::PackIndex(const CollationIndex& aValue, unsigned int result[2]) + { + unsigned int code = aValue.iCode; + unsigned int index = aValue.iIndex; + if (code <= 0xFFFF) + { + result[0] = (code << 16 | index); + return 1; + } + else + { + result[0] = (::HighSurrogate(code) << 16 | index); + result[1] = (::LowSurrogate(code) << 16 | index); + return 2; + } + } + +const Reader* TheReader; +static int CompareStringIndices(const void* aIndex1,const void* aIndex2) + { + return TheReader->CompareStringIndices(*(unsigned int*)aIndex1 >> 16,*(unsigned int*)aIndex2 >> 16); + } + +int CompareUnicodeStrings(const int *aString1,int aLength1,const int *aString2,int aLength2) + { + for (int i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++) + { + int x = i < aLength1 ? *aString1 : -1; + int y = i < aLength2 ? 
*aString2 : -1; + if (x != y) + return x - y; + } + return 0; + } + +int Reader::CompareStringIndices(int aIndex1,int aIndex2) const + { + return CompareUnicodeStrings(iStringElement + aIndex1 + 1,iStringElement[aIndex1], + iStringElement + aIndex2 + 1,iStringElement[aIndex2]); + } + +void Reader::WriteOutput(const char* aFileName, bool aCopyright) + { + int i; + ofstream output_file; + output_file.open(aFileName); + if (output_file.fail()) + { + cout << "cannot open output file '" << aFileName << "'\n"; + exit(1); + } + cout << "writing output to '" << aFileName << "'\n"; + + char *locale = NULL; + if (iStandard) + locale = _strdup("Standard"); + else + locale = _strdup(iLocaleName); + + if (!iStandard) + { + _strlwr(locale); + locale[0] = (char)toupper(locale[0]); + if (aCopyright) + { + char* capsFileName = new char[strlen(aFileName) + 1]; + strcpy(capsFileName, aFileName); + _strupr(capsFileName); + output_file << "/*\n" << capsFileName << "\n\nCopyright (C) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.\n*/\n"; + delete [] capsFileName; + output_file << "\n/*\nThe LCharSet object used by the " << locale << " locale.\n"; + output_file << "Generated by COLTAB.\n*/\n"; + } + + output_file << "\n#include \"ls_std.h\"\n#include \n"; + output_file << "\nconst TUint KUid" << iCPlusPlusIdentifier << "CollationMethod = "; + if (iUidString) + output_file << "0x" << iUidString << ";\n"; + else + { + output_file << "/* FILL THIS IN */;\n"; + cout << "Warning: File will need editing\nWarning: see coltab /h2 for details.\n"; + } + } + + /* + Write the unique collation keys. 
+ Each one has the format, going from highest to lowest bit: + + 16 bits: level-0 key + 8 bits: level-1 key + 6 bits: level-2 key + 1 bit: set if this key is optionally ignorable + 1 bit: set if this is the last key in the string of keys for a single Unicode value + + */ + if (iKeys != 0) + { + output_file << "\nstatic const TUint32 The" << iCPlusPlusIdentifier << "Key[] = \n\t{"; + CollationKey* ck = iCollationKey; + output_file << "\t // " << iKeys << " keys"; + output_file << hex; + for (i = 0; i < iKeys; i++, ck++) + { + unsigned int key = PackKey(*ck); + if (i % 8 == 0) + output_file << "\n\t"; + output_file << "0x"; + output_file << key << ","; + } + output_file << dec; + output_file << "\n\t};\n\n"; + } + + if (iIndices != 0) + { + // Sort then write the collation index values - these relate Unicode values to collation keys. + qsort(iCollationIndex,iIndices,sizeof(CollationIndex),CollationIndex::Compare); + output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "Index[] = \n\t{"; + CollationIndex* ci = iCollationIndex; + int entry=0; + output_file << "\t // " << iIndices << " indices"; + output_file << hex; + for (i = 0; i < iIndices; i++, ci++, entry++) + { + unsigned int key[2]; + int bytecount = PackIndex(*ci, key); + + if (entry % 8 == 0) + output_file << "\n\t"; + output_file << "0x"; + output_file << key[0] << ","; + + if (bytecount == 2) + { + entry++; + if (entry % 8 == 0) + output_file << "\n\t"; + output_file << "0x"; + output_file << key[1] << ","; + } + } + output_file << dec; + output_file << "\n\t};"; + output_file << "\t // " << entry << " entries"; + output_file << "\n\n"; + iIndices = entry; //One surrogate pair occupies 2 entries + } + + if (iStringElements) + { + // Write the Unicode strings; these are preceded by their lengths. 
+ output_file << "static const TUint16 The" << iCPlusPlusIdentifier << "StringElement[] = \n\t{"; + output_file << hex; + for (i = 0; i < iStringElements; i++) + { + if (i % 8 == 0) + output_file << "\n\t"; + output_file << "0x" << iStringElement[i] << ","; + } + output_file << dec; + if (iStringElements==0) + output_file << "0"; + output_file << "\n\t};\n\n"; + + /* + Sort then write the string index values - these relate Unicode strings to collation keys. + Each one has the string index in the upper word and the key index in the lower word. + */ + TheReader = this; + qsort(iStringIndex,iStringIndices,sizeof(iStringIndex[0]),::CompareStringIndices); + output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "StringIndex[] = \n\t{"; + output_file << hex; + for (i = 0; i < iStringIndices; i++) + { + if (i % 8 == 0) + output_file << "\n\t"; + output_file << "0x" << iStringIndex[i] << ","; + } + output_file << dec; + if (iStringIndices ==0) + output_file << "0"; + output_file << "\n\t};\n\n"; + } + + // Write the collation table structure. 
+ output_file << "static const TCollationKeyTable The" << iCPlusPlusIdentifier << "Table = \n\t{ "; + if (iKeys) + output_file << "The" << iCPlusPlusIdentifier << "Key"; + else + output_file << "0"; + if (iIndices) + output_file << ", The" << iCPlusPlusIdentifier << "Index, " << iIndices; + else + output_file << ", 0, 0"; + if (iStringElements) + output_file << ", The" << iCPlusPlusIdentifier << "StringElement, The" << iCPlusPlusIdentifier << "StringIndex, " << iStringIndices << " };\n"; + else + output_file << ", 0, 0, 0 };\n"; + + if (!iStandard) + output_file << "\nstatic const TCollationMethod TheCollationMethod[] = \n"\ + " {\n"\ + " {\n"\ + " KUid" << iCPlusPlusIdentifier << "CollationMethod, // the method for the locale\n"\ + " NULL, // use the standard table as the main table\n"\ + " &The" << iCPlusPlusIdentifier << "Table, // the locale values override the standard values\n"\ + " 0 // the flags are standard\n"\ + " },\n"\ + " {\n"\ + " KUidBasicCollationMethod, // the standard unlocalised method\n"\ + " NULL, // null means use the standard table\n"\ + " NULL, // there's no override table\n"\ + " 0 // the flags are standard\n"\ + " }\n"\ + " };\n"\ + "\n"\ + "static const TCollationDataSet TheCollationDataSet =\n"\ + " {\n"\ + " TheCollationMethod,\n"\ + " 2\n"\ + " };"\ + "\n\n"\ + "// The one and only locale character set object.\n"\ + "const LCharSet TheCharSet =\n"\ + " {\n"\ + " NULL,\n"\ + " &TheCollationDataSet\n"\ + " };\n"; + + output_file.close(); + delete [] locale; + } + +int CollationIndex::Compare(const void* aIndex1,const void* aIndex2) + { + return ((CollationIndex*)aIndex1)->iCode - ((CollationIndex*)aIndex2)->iCode; + }