/*
 *
 * (C) Copyright IBM Corp. 1998-2004 - All Rights Reserved
 *
 * This file is a modification of the ICU file IndicReordering.h
 * by Jens Herden and Javier Sola for Khmer language
 *
 */

#ifndef __KHMERREORDERING_H
#define __KHMERREORDERING_H

/**
 * \file
 * \internal
 */

// #include "LETypes.h"
// #include "OpenTypeTables.h"

U_NAMESPACE_BEGIN

class LEGlyphStorage;

// Vocabulary
//     Base ->         A consonant or an independent vowel in its full (not subscript) form. It is the
//                     center of the syllable; it can be surrounded by coeng (subscript) consonants, vowels,
//                     split vowels, signs... but there is only one base in a syllable, and it has to be
//                     coded as the first character of the syllable.
//     split vowel --> A vowel that has two parts placed separately (e.g. before and after the consonant).
//                     The Khmer language has five of them. Khmer split vowels either have one part before the
//                     base and one after the base, or they have a part before the base and a part above the base.
//                     The first part of all Khmer split vowels is the same character, identical to
//                     the glyph of Khmer dependent vowel SRA EI.
//     coeng -->       Modifier used in Khmer to construct coeng (subscript) consonants.
//                     Unlike in Indian languages, the coeng modifies the consonant that follows it,
//                     not the one preceding it. Each consonant has two forms, the base form and the subscript form:
//                     the base form is the normal one (using the consonant's code point), the subscript form is
//                     displayed when the combination coeng + consonant is encountered.
//     Consonant of type 1 -> A consonant whose subscript form only occupies space under a base consonant.
//     Consonant of type 2 -> Its subscript form occupies space under and before the base (only one, RO).
//     Consonant of type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA).
//     Consonant shifter ->   Khmer has two series of consonants. The same dependent vowel has different sounds
//                     depending on whether it is attached to a consonant of the first series or of the second series.
//                     Most consonants have an equivalent in the other series, but some of them exist only in
//                     one series (for example SA). If we want to use the consonant SA with a vowel sound that
//                     normally only accompanies a consonant of the other series, then we need to use
//                     a consonant shifter: TRIISAP or MUSIKATOAN (x17C9 and x17CA).
//                     TRIISAP changes a first series consonant to a second series sound, and
//                     MUSIKATOAN changes a second series consonant to have a first series vowel sound.
//                     Consonant shifters are both normally superscript marks, but, when they are followed by a
//                     superscript, they change shape and take the form of the subscript dependent vowel SRA U.
//                     If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
//                     should be typed before the coeng. Unicode 4.0 breaks the standard and says that they should
//                     be placed after the coeng consonant.
//     Dependent vowel ->     In Khmer, dependent vowels can be placed above, below, before or after the base.
//                     Each vowel has its own position. Only one vowel per syllable is allowed.
//     Signs ->        Khmer has above signs and post signs. Only one above sign and/or one post sign is
//                     allowed in a syllable.
sl@0: // sl@0: // sl@0: sl@0: struct KhmerClassTable // This list must include all types of components that can be used inside a syllable sl@0: { sl@0: enum CharClassValues // order is important here! This order must be the same that is found in each horizontal sl@0: // line in the statetable for Khmer (file KhmerReordering.cpp). sl@0: { sl@0: CC_RESERVED = 0, sl@0: CC_CONSONANT = 1, // consonant of type 1 or independent vowel sl@0: CC_CONSONANT2 = 2, // Consonant of type 2 sl@0: CC_CONSONANT3 = 3, // Consonant of type 3 sl@0: CC_ZERO_WIDTH_NJ_MARK = 4, // Zero Width non joiner character (0x200C) sl@0: CC_CONSONANT_SHIFTER = 5, sl@0: CC_ROBAT = 6, // Khmer special diacritic accent -treated differently in state table sl@0: CC_COENG = 7, // Subscript consonant combining character sl@0: CC_DEPENDENT_VOWEL = 8, sl@0: CC_SIGN_ABOVE = 9, sl@0: CC_SIGN_AFTER = 10, sl@0: CC_ZERO_WIDTH_J_MARK = 11, // Zero width joiner character sl@0: CC_COUNT = 12 // This is the number of character classes sl@0: }; sl@0: sl@0: enum CharClassFlags sl@0: { sl@0: CF_CLASS_MASK = 0x0000FFFF, sl@0: sl@0: CF_CONSONANT = 0x01000000, // flag to speed up comparing sl@0: CF_SPLIT_VOWEL = 0x02000000, // flag for a split vowel -> the first part is added in front of the syllable sl@0: CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if a character with this flag is the first in a syllable sl@0: CF_COENG = 0x08000000, // flag to speed up comparing sl@0: CF_SHIFTER = 0x10000000, // flag to speed up comparing sl@0: CF_ABOVE_VOWEL = 0x20000000, // flag to speed up comparing sl@0: sl@0: // position flags sl@0: CF_POS_BEFORE = 0x00080000, sl@0: CF_POS_BELOW = 0x00040000, sl@0: CF_POS_ABOVE = 0x00020000, sl@0: CF_POS_AFTER = 0x00010000, sl@0: CF_POS_MASK = 0x000f0000 sl@0: }; sl@0: sl@0: typedef le_uint32 CharClass; sl@0: sl@0: typedef le_int32 ScriptFlags; sl@0: sl@0: LEUnicode firstChar; // for Khmer this will become x1780 sl@0: LEUnicode lastChar; // and this x17DF sl@0: const CharClass *classTable; 
sl@0: sl@0: CharClass getCharClass(LEUnicode ch) const; sl@0: sl@0: static const KhmerClassTable *getKhmerClassTable(); sl@0: }; sl@0: sl@0: sl@0: class KhmerReordering /* not : public UObject because all methods are static */ { sl@0: public: sl@0: static le_int32 reorder(const LEUnicode *theChars, le_int32 charCount, le_int32 scriptCode, sl@0: LEUnicode *outChars, LEGlyphStorage &glyphStorage); sl@0: sl@0: static const LETag *getFeatureOrder(); sl@0: sl@0: private: sl@0: // do not instantiate sl@0: KhmerReordering(); sl@0: sl@0: static le_int32 findSyllable(const KhmerClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount); sl@0: sl@0: }; sl@0: sl@0: sl@0: U_NAMESPACE_END sl@0: #endif