Update contrib.
1 // Copyright (c) 1999-2009 Nokia Corporation and/or its subsidiary(-ies).
2 // All rights reserved.
3 // This component and the accompanying materials are made available
4 // under the terms of "Eclipse Public License v1.0"
5 // which accompanies this distribution, and is available
6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
8 // Initial Contributors:
9 // Nokia Corporation - initial contribution.
14 // Reads and parses the Unicode collation value table and writes out a C++ source file
15 // containing the data in a form that can be used by the EPOC collation system.
17 // The program reads either three separate files or one combined file:
19 // Three files (by default):
20 // 1. Base keys (maps single Unicode values to single collation key values): must be in the same format as
21 // basekeys.txt, supplied with the Standard Unicode Collation system
23 // 2. Composite keys (maps single Unicode values to strings of collation keys): must be in the same format as
24 // compkeys.txt, supplied with the Standard Unicode Collation system
26 // 3. Strings (maps strings of Unicode values to single collation keys OR strings of collation keys): must be in the
27 // same format as compkeys.txt, except that there can be any number of Unicode characters at the start of the line,
28 // space-separated and each exactly 4 hex digits.
30 // One combined file (with option /a):
31 // 1. All Keys (combines the above three files into one file): must be in the same format as allkeys.txt, supplied with the Standard Unicode Collation system (after Unicode 3.0).
43 #else //!__MSVCDOTNET__
46 #endif //__MSVCDOTNET__
53 Constants constraining the range of level-1 and level-2 keys so that they can be packed.
54 Non-zero values are reduced by one less than the minimum value.
56 const unsigned int KLevel1Bits = 8; // number of bits a level-1 key occupies once packed
57 const unsigned int KLevel1Min = 0x20; // smallest non-zero level-1 key accepted
58 const unsigned int KLevel1Max = KLevel1Min + (1 << KLevel1Bits) - 2; // 0x11E; becomes 0xFF after (KLevel1Min - 1) is subtracted
59 const unsigned int KLevel2Bits = 6; // number of bits a level-2 key occupies once packed
60 const unsigned int KLevel2Min = 1; // smallest non-zero level-2 key accepted
61 const unsigned int KLevel2Max = KLevel2Min + (1 << KLevel2Bits) - 2; // 0x3F; unchanged after (KLevel2Min - 1) == 0 is subtracted
64 Table of characters in the WGL4 set, plus characters in canonical decompositions of
65 those characters, plus commonly used control characters and space characters,
66 given as ranges of Unicode characters. In each pair, the first code is the first in the range,
67 and the second is the first code NOT in the range.
69 The extra characters are added mainly to ensure that control characters and spaces are
70 normally ignored. The extra characters are:
72 0x0000-0x001F: ASCII control characters
73 0x2000-0x2012: spaces, hyphen variants, figure dash
74 0x2028-0x202E: line and paragraph separator, bidirectional control characters
75 0xFEFF : byte-order mark
76 0xFFFC-0xFFFD: object replacement character, replacement character
78 const unsigned int Wgl4Range[] =
80 0x00, 0x7f, // All ASCII
81 0xa0, 0x180, // Non-breaking space, Latin-1, Latin Extended-A
82 0x192,0x193, // Latin f with hook
83 0x1fa,0x200, // A-ring, a-ring, AE, ae, O slash, o slash all with acute accent
84 0x2c6,0x2c8, // non-combining circumflex and caron
85 0x2c9,0x2ca, // non-combining macron
86 0x2d8,0x2dc, // non-combining breve, dot above, ring above, ogonek
87 0x2dd,0x2de, // non-combining double acute
88 0x300,0x305, // combining grave, acute, circumflex, tilde, macron
89 0x306,0x309, // combining breve, dot above, double dot above
90 0x30a,0x30e, // combining ring above, double acute, caron, vertical line above
91 0x327,0x329, // combining cedilla, ogonek
96 0x401,0x40d, // Cyrillic
97 0x40e,0x450, // Cyrillic
98 0x451,0x45d, // Cyrillic
99 0x45e,0x460, // Cyrillic
100 0x490,0x492, // Cyrillic
101 0x1e80,0x1e86, // Both W and w with each of grave, acute and diaeresis
102 0x1ef2,0x1ef4, // Y with grave, y with grave
103 0x2000,0x2016, // various space and horizontal lines
104 0x2017,0x201f, //double vertical line, double low line, various quotation marks
105 0x2020,0x2023, // dagger, double dagger, bullet
106 0x2026,0x2027, //ellipsis
107 0x2028,0x202F, // line & paragraph separators and directional formatting
108 0x2030,0x2031, // per mille
109 0x2032,0x2034, // prime
110 0x2039,0x203b, // single angle quotation marks
111 0x203c,0x203d, // double exclamation mark
112 0x203e,0x203f, // non-combining overscore
113 0x2044,0x2045, // fraction slash
114 0x207f,0x2080, // superscript n
115 0x20a3,0x20a5, // French Franc, Italian/Turkish Lira
116 0x20a7,0x20a8, // Spanish Peseta
117 0x20ac,0x20ad, // Euro symbol
118 0x2105,0x2106, // care of
119 0x2113,0x2114, // script l
120 0x2116,0x2117, // numero
121 0x2122,0x2123, // trade mark
122 0x2126,0x2127, // ohm
123 0x212e,0x212f, // estimated (net weight)
124 0x215b,0x215f, // 1/8, 3/8, 5/8, 7/8
125 0x2190,0x2196, // horizontal and vertical arrows
126 0x21a8,0x21a9, // up down arrow with base
127 0x2202,0x2203, // partial differential
128 0x2206,0x2207, // increment (delta)
129 0x220f,0x2210, // n-ary product (pi)
130 0x2211,0x2213, // n-ary sum (sigma), minus
131 0x2215,0x2216, // division (slash)
132 0x2219,0x221b, // bullet operator, square root
133 0x221e,0x2220, // infinity, right angle
134 0x2229,0x222a, // intersection
135 0x222b,0x222c, // union
136 0x2248,0x2249, // almost equal to
137 0x2260,0x2262, // not equal to, identical to
138 0x2264,0x2266, // less-than-or-equal-to, greater-than-or-equal-to
139 0x2302,0x2303, // house
140 0x2310,0x2311, // rversed not sign
141 0x2320,0x2322, // top and bottom of integral
142 0x2500,0x2501, // box drawing
143 0x2502,0x2503, // box drawing
144 0x250c,0x250d, // box drawing
145 0x2510,0x2511, // box drawing
146 0x2514,0x2515, // box drawing
147 0x2518,0x2519, // box drawing
148 0x251c,0x251d, // box drawing
149 0x2524,0x2525, // box drawing
150 0x252c,0x252d, // box drawing
151 0x2534,0x2535, // box drawing
152 0x253c,0x253d, // box drawing
153 0x2550,0x256d, // box drawing
154 0x2580,0x2581, // block element
155 0x2584,0x2585, // block element
156 0x2588,0x2589, // block element
157 0x258c,0x258d, // block element
158 0x2590,0x2594, // block element
159 0x25a0,0x25a2, // geometric shapes
160 0x25aa,0x25ad, // geometric shapes
161 0x25b2,0x25b3, // geometric shapes
162 0x25ba,0x25bb, // geometric shapes
163 0x25bc,0x25bd, // geometric shapes
164 0x25c4,0x25c5, // geometric shapes
165 0x25ca,0x25cc, // geometric shapes
166 0x25cf,0x25d0, // geometric shapes
167 0x25d8,0x25da, // geometric shapes
168 0x25e6,0x25e7, // geometric shapes
169 0x263a,0x263d, // smilies, sun
170 0x2640,0x2641, // female
171 0x2642,0x2643, // male
172 0x2660,0x2661, // spade
173 0x2663,0x2664, // club
174 0x2665,0x2667, // heart
175 0x266a,0x266c, // quaver, beamed quavers
176 0xfb01,0xfb03, // fi, fl ligatures
177 0xfeff,0xff00, // zero-width non-breaking space
178 0xfffc, 0xfffe // object replacement character and replacement character
180 const int Wgl4Ranges = sizeof(Wgl4Range) / sizeof(Wgl4Range[0]) / 2; // number of (first, first-not-in-range) pairs in Wgl4Range
182 int CompareWgl4Ranges(const void* aRange1,const void* aRange2)
184 unsigned int* p = (unsigned int*)aRange1;
185 unsigned int* q = (unsigned int*)aRange2;
188 unsigned int* temp = p;
200 // Determine if a character is in the WGL4 character repertoire.
201 static bool InWgl4(unsigned int aChar)
204 key[0] = key[1] = aChar;
205 return bsearch(key,Wgl4Range,Wgl4Ranges,sizeof(Wgl4Range[0]) * 2,CompareWgl4Ranges) != NULL;
212 bool operator==(const CollationKey& k) const
213 { return iLevel[0] == k.iLevel[0] && iLevel[1] == k.iLevel[1] && iLevel[2] == k.iLevel[2] &&
214 iIgnorable == k.iIgnorable && iStop == k.iStop; }
220 int iLevel[ELevels];// the keys at the various levels
221 bool iIgnorable; // TRUE if this key can normally be ignored
222 bool iStop; // TRUE if this is the last key in a string of keys
225 // The collation index for a single Unicode value.
229 static int Compare(const void* aIndex1,const void* aIndex2);
231 int iCode; // Unicode value
232 int iIndex; // index into the key table
238 Reader(bool aWgl4,bool aStandard,const char* aLocaleName, const char* aUidString);
240 void ReadBaseKeys(const char* aFileName);
241 void ReadCompKeys(const char* aFileName);
242 void ReadStrings(const char* aFileName);
243 void ReadAllKeys(const char* aFileName);
244 void WriteOutput(const char* aFileName, bool aCopyrightMessage);
245 int CompareStringIndices(int aIndex1,int aIndex2) const;
248 Reader(const Reader&);
249 int Hex(const char *aString, int &aCharConsumed, bool aTolerate = false);
250 void GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey=NULL);
251 void GetMultipleCollationKeys(const char* aString);
252 unsigned int PackKey(const CollationKey& aValue);
253 int PackIndex(const CollationIndex& aValue, unsigned int result[2]);
254 bool ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount);
255 void AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart);
256 void AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart);
257 void AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart);
261 EMaxCollationKeys = 0x110000 * 2, /*more elements considering composite keys */
262 EMaxCollationIndices = 0x110000,
263 EMaxStringElements = 65536,
264 EMaxStringIndices = 65536
266 CollationKey iCollationKey[EMaxCollationKeys];
268 CollationIndex iCollationIndex[EMaxCollationIndices];
270 int iStringElement[EMaxStringElements];
272 unsigned int iStringIndex[EMaxStringIndices];
274 const char* iInputFileName;
276 bool iSuppressCanonseqWarning; // have we issued the canonseq warning yet?
277 bool iWgl4; // true if writing keys for wgl4 characters only
278 bool iStandard; // true if reading standard files, not tailoring files
279 const char* iLocaleName;
280 const char* iUidString;
281 char* iCPlusPlusIdentifier; // iLocaleName in title case with difficult characters removed
284 bool isValidHexDigit(char c)
286 if ('0' <= c && c <= '9')
288 if ('a' <= c && c <= 'f')
290 if ('A' <= c && c <= 'F')
297 cout << "Usage: coltab [/u<uid>] [/c] [/a] [/h<topic>] <locale>\n";
298 cout << "By Default (without /a option), for the locales 'standard' and 'wgl4' coltab reads basekeys.txt & compkeys.txt\n";
299 cout << "For any other locale name <name> coltab reads <name>_basekeys.txt,\n";
300 cout << "<name>_compkeys.txt and <name>_strings.txt.\n";
301 cout << "Use the /a option, for the locales 'standard' and 'wgl4' coltab reads allkeys.txt\n";
302 cout << "For any other locale name <name> coltab reads <name>_allkeys.txt.\n";
303 cout << "The output file is always ls_<name>.cpp.\n";
304 cout << "Use the /u option to specify the UID that the collation table should have.\n";
305 cout << "A hex number must follow /u immediately, for example /u800ACBDE\n";
306 cout << "this hex number must not exceed eight digits. If this is not specified,\n";
307 cout << "the output file will have to be edited to make it compilable.\n";
308 cout << "Specify /c to prefix the output with a Nokia copyright message.\n";
309 cout << "Specify /h for in-depth help.";
318 void PrintHelp(char* aTopic)
321 while ('0' <= *aTopic && *aTopic <= '9')
323 topic = topic * 10 + (*aTopic - '0');
329 cout << "How Coltab interprets CANONSEQ:\n\n"\
330 "If the CANONSEQ specifier is used in a line, Coltab will ignore the mapping.\n"\
331 "This because, on the Symbian platform, any canonically composed character is\n"\
332 "decomposed before the key mapping is applied, so characters with canonical\n"\
333 "decompositions do not need keys. In files supplied by the Unicode Consortium,\n"\
334 "all mappings for composed characters are flagged by CANONSEQ, so it is useful\n"\
335 "if Coltab can just ignore these so that Unicode Consortium files can be used\n"\
337 "This can cause problems if a localizer copies a line from a Unicode file into,\n"\
338 "say, the <lang>_strings.txt file, in order to give a mapping for an accented\n"\
339 "character. The localizer replaces the composed character code with the\n"\
340 "decomposition and changes the keys but forgets to remove the CANONSEQ\n"\
341 "specifier. In this case the key would be ignored. Coltab provides a warning so\n"\
342 "that this can be put right.\n\n"\
343 "Coltab will only warn about the first CANONSEQ in each file, and does not warn\n"\
344 "if the 'standard' or 'wgl4' options are used.";
348 cout << "How to ensure coltab's output files are compilable.\n\n"\
349 "By default, Coltab's files for locales need to be edited before they are\n"\
350 "compilable. The UID for the collation method needs to be filled in. This UID\n"\
351 "is added so that the collation table can be searched for later. At present,\n"\
352 "this UID is not necessary for the correct functioning of the Symbian platform\n"\
353 "and so a value of 0 can be safely used.\n\n"\
354 "To insert this value into the file directly, use the /u option, for example\n"\
355 "coltab /u0 french\n"\
356 "If the /u option is used, the file should be compilable as is. If it is not,\n"\
357 "please raise it as a defect with Symbian's internationalization team,\n"\
358 "supplying the files that caused the problem if this is possible.\n"\
359 "If the 'standard' or 'wgl4' options are used, no UID is output, so the /u\n"\
360 "option is not required.";
364 cout << "How to ensure collation key values are inside the supported range. \n\n"\
365 "According to Unicode Standard, the range suppored by tool COLTAB:\n"\
366 " Level 0 (primary): 0000 - FFFF, \n"\
367 " Level 1 (Secondary): 0020 - 011E, \n"\
368 " Level 2 (Tertiary): 0001 - 003F. \n"\
369 "Please edit your collation files and make sure key values are inside the above range";
374 cout << "\n\nSpecify /h1 for help on the use of CANONSEQ\n";
375 cout << "Specify /h2 for help on making compilable files that do not need editing\n";
381 short HighSurrogate(int aCode)
383 return static_cast<short>(0xD7C0 + (aCode >> 10));
386 short LowSurrogate(int aCode)
388 return static_cast<short>(0xDC00 | (aCode & 0x3FF));
391 int main(int argc,char** argv)
393 bool copyright = false;
395 bool allKeys = false;
396 const char* prefix = "";
397 const char* infix = "";
398 const char* locale = "";
401 for (int i = 1; i < argc; ++i)
403 if (argv[i][0] == '/' || argv[i][0] == '-')
410 uidArg = argv[i] + 2;
411 const char* uidCheck = uidArg;
414 if (!isValidHexDigit(*uidCheck))
418 if (uidCheck == uidArg || 8 < uidCheck - uidArg)
431 PrintHelp(argv[i] + 2);
445 bool standard = false;
446 if (!_stricmp(localeArg, "standard"))
451 else if (!_stricmp(localeArg, "wgl4"))
459 locale = prefix = localeArg;
463 Reader* reader = new Reader(wgl4, standard, locale, uidArg);
466 cout << "out of memory\n";
469 char* filename = new char[strlen(prefix) + strlen(infix) + 64];
470 if (allKeys == false)
472 sprintf(filename,"%s%scompkeys.txt",prefix,infix);
473 reader->ReadCompKeys(filename);
476 sprintf(filename,"%s%sstrings.txt",prefix,infix);
477 reader->ReadStrings(filename);
479 sprintf(filename,"%s%sbasekeys.txt",prefix,infix);
480 reader->ReadBaseKeys(filename);
484 sprintf(filename,"%s%sAllKeys.txt",prefix,infix);
485 reader->ReadAllKeys(filename);
487 sprintf(filename,"ls_%s.cpp", localeArg);
488 reader->WriteOutput(filename, copyright);
495 Reader::Reader(bool aWgl4, bool aStandard,
496 const char* aLocaleName, const char* aUidString):
501 iInputFileName(NULL),
503 iSuppressCanonseqWarning(false),
505 iStandard(aStandard),
506 iLocaleName(aLocaleName),
507 iUidString(aUidString)
511 iCPlusPlusIdentifier = new char[9];
512 strcpy(iCPlusPlusIdentifier, "Standard");
515 char* p = iCPlusPlusIdentifier = new char[strlen(aLocaleName) + 2];
516 int current = toupper(aLocaleName[0]);
517 if (current < 'A' || 'Z' < current)
521 *p++ = static_cast<char>(current);
524 bool inUnderScore = false;
527 current = tolower(*aLocaleName++);
528 if (current < 'a' || 'z' < current)
538 inUnderScore = false;
539 *p++ = static_cast<char>(current);
547 delete [] iCPlusPlusIdentifier;
550 // Get a hex number of four to six digits from aString. Return -1 if none is found and aTolerate is true.
551 int Reader::Hex(const char *aString, int &aCharConsumed, bool aTolerate)
554 unsigned long x = strtoul(aString,&end,16);
555 aCharConsumed = end - aString;
556 if ((aCharConsumed != 4) && (aCharConsumed != 5) && (aCharConsumed != 6))
560 cout << "bad hex number on line " << iLineNumber << " of file " << iInputFileName << '\n';
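568 // Get a collation value from a string of the form [.xxxx.xxxx.xxxx.xxxx]; a '*' in place of the first '.' marks the key as normally ignored, and up to two extra characters are accepted before the closing ']'.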
569 void Reader::GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey)
572 const char *end = strchr(aString, ']');
574 aCharConsumed = end - aString;
577 if (aString[0] != '[' || (aCharConsumed != 21 && aCharConsumed != 22 && aCharConsumed != 23))
579 cout << "syntax error on line " << iLineNumber << " of file " << iInputFileName << '\n';
584 if (iKeys >= EMaxCollationKeys)
586 cout << "too many keys";
589 aKey = &iCollationKey[iKeys++];
591 aKey->iIgnorable = aString[1] == '*'; // asterisk means that this character is normally ignored
592 int charConsumed = 0;
593 for (int i = 0; i < CollationKey::ELevels; i++)
594 aKey->iLevel[i] = Hex(aString + 2 + i * 5, charConsumed);
596 if (aKey->iLevel[1] > 0 && (aKey->iLevel[1] < KLevel1Min || aKey->iLevel[1] > KLevel1Max))
598 aKey->iLevel[1] = KLevel1Max;
599 cout << "illegal level-1 key value on line " << iLineNumber << "; outside the range " << KLevel1Min << ".." << KLevel1Max << "\n";
600 cout << "Error: illegal key value in file, please see coltab /h3 for details.\n";
604 if (aKey->iLevel[2] > 0 && (aKey->iLevel[2] < KLevel2Min || aKey->iLevel[2] > KLevel2Max))
606 cout << "illegal level-2 key value on line " << iLineNumber << "; outside the range " << KLevel2Min << ".." << KLevel2Max << "\n";
607 cout << "Error: illegal key value in file, please see coltab /h3 for details.\n";
614 void Reader::GetMultipleCollationKeys(const char* aString)
618 while (aString[0] == '[')
620 GetCollationKey(aString, charConsumed);
623 iCollationKey[iKeys - 1].iStop = false;
624 int length = strlen(aString);
625 if (length <= charConsumed + 1)
627 aString += charConsumed + 1;
629 if (aString[0] == ' ') //a space is put between collation keys in keys files provided by previous Unicode Standard (i.e 3.1)
633 iCollationKey[iKeys - 1].iStop = true;
637 Partially parse a line, returning its key code and the start of its first block of key data.
638 Return false if it is not a data line, or not relevant.
640 bool Reader::ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount)
642 int lineLength = strlen(aLine);
643 int charConsumed = 0;
645 aCode[0] = Hex(aLine,charConsumed,true);
648 A data line must start with a hex number and be at least 27 characters long.
649 Canonically decomposable Unicode characters are skipped.
650 Skip non-WGL4 characters if doing WGL4 only.
655 if (!strcmp(aLine + lineLength - 8,"CANONSEQ"))
657 if (!iSuppressCanonseqWarning)
659 cout << "Warning: CANONSEQ used in file " << iInputFileName
660 << " on line " << iLineNumber << ".\nWarning: All mappings specifying CANONSEQ are ignored.\n"
661 << "Warning: Use coltab /h1 for more details.";
662 iSuppressCanonseqWarning = true;
666 else if (lineLength < 27 ||
667 (iWgl4 && !InWgl4((unsigned int)aCode)))
674 aKeyStart = charConsumed;
675 while (aKeyStart < lineLength && aLine[aKeyStart] != '[')
678 // read all hex before '['
679 int index = charConsumed + 1;
680 while (index < aKeyStart)
682 aCode[aCodeCount] = Hex(aLine+index, charConsumed, true);
683 if (aCode[aCodeCount] == -1)
686 index += charConsumed + 1;
690 // find number of collation keys
693 while (index < lineLength && aLine[index] != '%' && aLine[index] != '#')
695 if (aLine[index] == '[')
701 return aCodeCount > 0;
704 void Reader::AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart)
706 if (iIndices >= EMaxCollationIndices)
708 cout << "too many Unicode values";
711 CollationIndex& index = iCollationIndex[iIndices++];
716 First try to find the key in the array of keys found so far.
717 Search backwards to use the fact that runs of the same key occur together.
720 int charConsumed = 0;
721 GetCollationKey(aLine + aKeyStart, charConsumed, &key);
722 for (int i = iKeys - 1; i >= 0 && index.iIndex == -1; i--)
723 if (iCollationKey[i] == key)
726 // If that fails, add a new key.
727 if (index.iIndex == -1)
729 index.iIndex = iKeys++;
730 if (iKeys > EMaxCollationKeys)
732 cout << "too many keys";
735 iCollationKey[index.iIndex] = key;
739 Read 1-to-1 mapping. Sample:
740 02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME
742 aCombinedFile = true: aFileName is combined file, which contains base keys, comp keys, and string keys.
744 void Reader::ReadBaseKeys(const char* aFileName)
746 iSuppressCanonseqWarning = iStandard || iWgl4;
748 iInputFileName = aFileName;
751 #ifdef __MSVCDOTNET__
752 input_file.open(iInputFileName, ios::in);
753 #else //!__MSVCDOTNET__
754 input_file.open(iInputFileName, ios::in | ios::nocreate);
755 #endif //__MSVCDOTNET__
757 if (input_file.fail())
759 cout << "cannot open input file '" << iInputFileName << "'\n";
762 cout << "reading base keys from '" << iInputFileName << "'\n";
767 input_file.getline(line,sizeof(line));
768 if (input_file.eof())
771 // line number counting
772 if (iLineNumber % 100 == 0)
774 cout << "line " << iLineNumber << '\n';
781 if (ParseLine(line, code, codeCount, key_start, keyCount))
783 if (codeCount != 1 || keyCount != 1)
784 continue; // goto next line
785 AddKeyOneToOne(line, code[0], key_start);
792 void Reader::AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart)
794 if (iIndices >= EMaxCollationIndices)
796 cout << "too many Unicode values";
799 CollationIndex& index = iCollationIndex[iIndices++];
801 index.iIndex = iKeys;
802 GetMultipleCollationKeys(aLine + aKeyStart);
805 Read 1-to-much mapping.
806 3303 ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN
808 void Reader::ReadCompKeys(const char* aFileName)
810 iSuppressCanonseqWarning = iStandard || iWgl4;
812 iInputFileName = aFileName;
815 #ifdef __MSVCDOTNET__
816 input_file.open(iInputFileName, ios::in);
817 #else //!__MSVCDOTNET__
818 input_file.open(iInputFileName, ios::in | ios::nocreate);
819 #endif //__MSVCDOTNET__
821 if (input_file.fail())
823 cout << "there are no composite keys; '" << iInputFileName << "' not found\n";
826 cout << "reading composite keys from '" << iInputFileName << "'\n";
831 input_file.getline(line,sizeof(line));
832 if (input_file.eof())
835 // line number counting
836 if (iLineNumber % 100 == 0)
838 cout << "line " << iLineNumber << '\n';
845 if (ParseLine(line, code, codeCount, key_start, keyCount))
847 if (codeCount != 1 || keyCount < 2)
848 continue; // goto next line
849 AddKeyOneToMuch(line, code[0], key_start);
857 void Reader::AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart)
860 // Store the index to the Unicode string and the key sequence.
861 if (iStringIndices > EMaxStringIndices)
863 cout << "too many string indices";
866 iStringIndex[iStringIndices++] = (iStringElements << 16) | iKeys;
868 // Reserve space for the length.
869 if (iStringElements >= EMaxStringElements)
871 cout << "too many string elements";
876 // Read the Unicode string.
877 int length = 0; // in unit of int16
878 int charCount = 0; // in unit of char. for debug.
880 for (int i=0; i<aCodeCount; i++)
882 if (iStringElements >= EMaxStringElements)
884 cout << "too many string elements";
888 if (aCode[i] > 0xFFFF)
891 iStringElement[iStringElements++] = 0xD7C0 + (aCode[i] >> 10);
892 iStringElement[iStringElements++] = 0xDC00 | (aCode[i] & 0x3FF);
897 iStringElement[iStringElements++] = aCode[i];
903 iStringElement[iStringElements - length - 1] = (unsigned int)length;
905 // Read the key sequence.
906 GetMultipleCollationKeys(aLine + aKeyStart);
909 Read much-to-much mapping. Sample:
910 004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke
911 0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # <THAI CHARACTER SARA E, THAI CHARACTER CHO CHAN>
913 void Reader::ReadStrings(const char* aFileName)
915 iSuppressCanonseqWarning = iStandard || iWgl4;
917 iInputFileName = aFileName;
920 #ifdef __MSVCDOTNET__
921 input_file.open(iInputFileName, ios::in);
922 #else //!__MSVCDOTNET__
923 input_file.open(iInputFileName, ios::in | ios::nocreate);
924 #endif //__MSVCDOTNET__
926 if (input_file.fail())
928 cout << "there are no strings; '" << iInputFileName << "' not found\n";
931 cout << "reading strings from '" << iInputFileName << "'\n";
936 input_file.getline(line,sizeof(line));
937 if (input_file.eof())
940 // line number counting
941 if (iLineNumber % 100 == 0)
943 cout << "line " << iLineNumber << '\n';
950 if (ParseLine(line, code, codeCount, key_start, keyCount))
952 if (codeCount < 2 || keyCount < 1)
953 continue; // goto next line
954 AddKeyMuchToMuch(line, code, codeCount, key_start);
962 Read combined key table. Sample:
964 02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME
967 3303 ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN
969 much-to-much mapping:
970 004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke
971 0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # <THAI CHARACTER SARA E, THAI CHARACTER CHO CHAN>
973 void Reader::ReadAllKeys(const char* aFileName)
975 iSuppressCanonseqWarning = iStandard || iWgl4;
977 iInputFileName = aFileName;
980 #ifdef __MSVCDOTNET__
981 input_file.open(iInputFileName, ios::in);
982 #else //!__MSVCDOTNET__
983 input_file.open(iInputFileName, ios::in | ios::nocreate);
984 #endif //__MSVCDOTNET__
986 if (input_file.fail())
988 cout << "there are no keys; '" << iInputFileName << "' not found\n";
991 cout << "reading all keys from '" << iInputFileName << "'\n";
996 if (input_file.eof())
998 input_file.getline(line,sizeof(line));
1005 if (ParseLine(line, code, codeCount, key_start, keyCount))
1007 if (codeCount == 1 && keyCount == 1)
1008 AddKeyOneToOne(line, code[0], key_start);
1009 else if (codeCount == 1 && keyCount > 1)
1010 AddKeyOneToMuch(line, code[0], key_start);
1011 else if (codeCount > 1 && keyCount > 0)
1012 AddKeyMuchToMuch(line, code, codeCount, key_start);
1014 cout << "ignore line: " << line << "\n";
1022 // Pack the 3 collation key levels into a single 32-bit integer.
1023 unsigned int Reader::PackKey(const CollationKey& aValue)
1025 unsigned int level0 = aValue.iLevel[0];
1026 unsigned int level1 = aValue.iLevel[1];
1028 level1 -= (KLevel1Min - 1);
1029 unsigned int level2 = aValue.iLevel[2];
1031 level2 -= (KLevel2Min - 1);
1032 unsigned int key = level0 << 16 | level1 << 8 | level2 << 2;
1033 if (aValue.iIgnorable)
1040 // Pack a collation index value into a single 32-bit integer.
1041 int Reader::PackIndex(const CollationIndex& aValue, unsigned int result[2])
1043 unsigned int code = aValue.iCode;
1044 unsigned int index = aValue.iIndex;
1047 result[0] = (code << 16 | index);
1052 result[0] = (::HighSurrogate(code) << 16 | index);
1053 result[1] = (::LowSurrogate(code) << 16 | index);
1058 const Reader* TheReader;
1059 static int CompareStringIndices(const void* aIndex1,const void* aIndex2)
1061 return TheReader->CompareStringIndices(*(unsigned int*)aIndex1 >> 16,*(unsigned int*)aIndex2 >> 16);
1064 int CompareUnicodeStrings(const int *aString1,int aLength1,const int *aString2,int aLength2)
1066 for (int i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
1068 int x = i < aLength1 ? *aString1 : -1;
1069 int y = i < aLength2 ? *aString2 : -1;
1076 int Reader::CompareStringIndices(int aIndex1,int aIndex2) const
1078 return CompareUnicodeStrings(iStringElement + aIndex1 + 1,iStringElement[aIndex1],
1079 iStringElement + aIndex2 + 1,iStringElement[aIndex2]);
1082 void Reader::WriteOutput(const char* aFileName, bool aCopyright)
1085 ofstream output_file;
1086 output_file.open(aFileName);
1087 if (output_file.fail())
1089 cout << "cannot open output file '" << aFileName << "'\n";
1092 cout << "writing output to '" << aFileName << "'\n";
1094 char *locale = NULL;
1096 locale = _strdup("Standard");
1098 locale = _strdup(iLocaleName);
1103 locale[0] = (char)toupper(locale[0]);
1106 char* capsFileName = new char[strlen(aFileName) + 1];
1107 strcpy(capsFileName, aFileName);
1108 _strupr(capsFileName);
1109 output_file << "/*\n" << capsFileName << "\n\nCopyright (C) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.\n*/\n";
1110 delete [] capsFileName;
1111 output_file << "\n/*\nThe LCharSet object used by the " << locale << " locale.\n";
1112 output_file << "Generated by COLTAB.\n*/\n";
1115 output_file << "\n#include \"ls_std.h\"\n#include <collate.h>\n";
1116 output_file << "\nconst TUint KUid" << iCPlusPlusIdentifier << "CollationMethod = ";
1118 output_file << "0x" << iUidString << ";\n";
1121 output_file << "/* FILL THIS IN */;\n";
1122 cout << "Warning: File will need editing\nWarning: see coltab /h2 for details.\n";
1127 Write the unique collation keys.
1128 Each one has the format, going from highest to lowest bit:
1130 16 bits: level-0 key
1133 1 bit: set if this key is optionally ignorable
1134 1 bit: set if this is the last key in the string of keys for a single Unicode value
1139 output_file << "\nstatic const TUint32 The" << iCPlusPlusIdentifier << "Key[] = \n\t{";
1140 CollationKey* ck = iCollationKey;
1141 output_file << "\t // " << iKeys << " keys";
1143 for (i = 0; i < iKeys; i++, ck++)
1145 unsigned int key = PackKey(*ck);
1147 output_file << "\n\t";
1148 output_file << "0x";
1149 output_file << key << ",";
1152 output_file << "\n\t};\n\n";
1157 // Sort then write the collation index values - these relate Unicode values to collation keys.
1158 qsort(iCollationIndex,iIndices,sizeof(CollationIndex),CollationIndex::Compare);
1159 output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "Index[] = \n\t{";
1160 CollationIndex* ci = iCollationIndex;
1162 output_file << "\t // " << iIndices << " indices";
1164 for (i = 0; i < iIndices; i++, ci++, entry++)
1166 unsigned int key[2];
1167 int bytecount = PackIndex(*ci, key);
1170 output_file << "\n\t";
1171 output_file << "0x";
1172 output_file << key[0] << ",";
1178 output_file << "\n\t";
1179 output_file << "0x";
1180 output_file << key[1] << ",";
1184 output_file << "\n\t};";
1185 output_file << "\t // " << entry << " entries";
1186 output_file << "\n\n";
1187 iIndices = entry; //One surrogate pair occupies 2 entries
1190 if (iStringElements)
1192 // Write the Unicode strings; these are preceded by their lengths.
1193 output_file << "static const TUint16 The" << iCPlusPlusIdentifier << "StringElement[] = \n\t{";
1195 for (i = 0; i < iStringElements; i++)
1198 output_file << "\n\t";
1199 output_file << "0x" << iStringElement[i] << ",";
1202 if (iStringElements==0)
1204 output_file << "\n\t};\n\n";
1207 Sort then write the string index values - these relate Unicode strings to collation keys.
1208 Each one has the string index in the upper word and the key index in the lower word.
1211 qsort(iStringIndex,iStringIndices,sizeof(iStringIndex[0]),::CompareStringIndices);
1212 output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "StringIndex[] = \n\t{";
1214 for (i = 0; i < iStringIndices; i++)
1217 output_file << "\n\t";
1218 output_file << "0x" << iStringIndex[i] << ",";
1221 if (iStringIndices ==0)
1223 output_file << "\n\t};\n\n";
1226 // Write the collation table structure.
1227 output_file << "static const TCollationKeyTable The" << iCPlusPlusIdentifier << "Table = \n\t{ ";
1229 output_file << "The" << iCPlusPlusIdentifier << "Key";
1233 output_file << ", The" << iCPlusPlusIdentifier << "Index, " << iIndices;
1235 output_file << ", 0, 0";
1236 if (iStringElements)
1237 output_file << ", The" << iCPlusPlusIdentifier << "StringElement, The" << iCPlusPlusIdentifier << "StringIndex, " << iStringIndices << " };\n";
1239 output_file << ", 0, 0, 0 };\n";
1242 output_file << "\nstatic const TCollationMethod TheCollationMethod[] = \n"\
1245 " KUid" << iCPlusPlusIdentifier << "CollationMethod, // the method for the locale\n"\
1246 " NULL, // use the standard table as the main table\n"\
1247 " &The" << iCPlusPlusIdentifier << "Table, // the locale values override the standard values\n"\
1248 " 0 // the flags are standard\n"\
1251 " KUidBasicCollationMethod, // the standard unlocalised method\n"\
1252 " NULL, // null means use the standard table\n"\
1253 " NULL, // there's no override table\n"\
1254 " 0 // the flags are standard\n"\
1258 "static const TCollationDataSet TheCollationDataSet =\n"\
1260 " TheCollationMethod,\n"\
1264 "// The one and only locale character set object.\n"\
1265 "const LCharSet TheCharSet =\n"\
1268 " &TheCollationDataSet\n"\
1271 output_file.close();
1275 int CollationIndex::Compare(const void* aIndex1,const void* aIndex2)
1277 return ((CollationIndex*)aIndex1)->iCode - ((CollationIndex*)aIndex2)->iCode;