1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/textandloc/fontservices/textshaperplugin/IcuSource/layout/KhmerReordering.cpp Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,523 @@
1.4 +/*
1.5 + *
1.6 + * (C) Copyright IBM Corp. 1998-2005 - All Rights Reserved
1.7 + *
1.8 + * This file is a modification of the ICU file IndicReordering.cpp
1.9 + * by Jens Herden and Javier Sola for Khmer language
1.10 + *
1.11 + */
1.12 +
1.13 +#include "LETypes.h"
1.14 +#include "KhmerReordering.h"
1.15 +#include "LEGlyphStorage.h"
1.16 +
1.17 +
1.18 +U_NAMESPACE_BEGIN
1.19 +
// Code points that the reordering code refers to by name.
enum
{
    // Khmer block
    C_RO            = 0x179A,   // KHMER LETTER RO
    C_VOWEL_AA      = 0x17B6,   // KHMER VOWEL SIGN AA
    C_VOWEL_E       = 0x17C1,   // KHMER VOWEL SIGN E
    C_SIGN_NIKAHIT  = 0x17C6,   // KHMER SIGN NIKAHIT
    C_COENG         = 0x17D2,   // KHMER SIGN COENG

    // Characters outside the Khmer block
    C_SIGN_ZWNJ     = 0x200C,   // ZERO WIDTH NON-JOINER
    C_SIGN_ZWJ      = 0x200D,   // ZERO WIDTH JOINER
    C_DOTTED_CIRCLE = 0x25CC    // DOTTED CIRCLE
};
1.32 +
1.33 +
1.34 +enum
1.35 +{
1.36 + // simple classes, they are used in the statetable (in this file) to control the length of a syllable
1.37 + // they are also used to know where a character should be placed (location in reference to the base character)
1.38 + // and also to know if a character, when independtly displayed, should be displayed with a dotted-circle to
1.39 + // indicate error in syllable construction
1.40 + _xx = KhmerClassTable::CC_RESERVED,
1.41 + _sa = KhmerClassTable::CC_SIGN_ABOVE | KhmerClassTable::CF_DOTTED_CIRCLE | KhmerClassTable::CF_POS_ABOVE,
1.42 + _sp = KhmerClassTable::CC_SIGN_AFTER | KhmerClassTable::CF_DOTTED_CIRCLE| KhmerClassTable::CF_POS_AFTER,
1.43 + _c1 = KhmerClassTable::CC_CONSONANT | KhmerClassTable::CF_CONSONANT,
1.44 + _c2 = KhmerClassTable::CC_CONSONANT2 | KhmerClassTable::CF_CONSONANT,
1.45 + _c3 = KhmerClassTable::CC_CONSONANT3 | KhmerClassTable::CF_CONSONANT,
1.46 + _rb = KhmerClassTable::CC_ROBAT | KhmerClassTable::CF_POS_ABOVE | KhmerClassTable::CF_DOTTED_CIRCLE,
1.47 + _cs = KhmerClassTable::CC_CONSONANT_SHIFTER | KhmerClassTable::CF_DOTTED_CIRCLE | KhmerClassTable::CF_SHIFTER,
1.48 + _dl = KhmerClassTable::CC_DEPENDENT_VOWEL | KhmerClassTable::CF_POS_BEFORE | KhmerClassTable::CF_DOTTED_CIRCLE,
1.49 + _db = KhmerClassTable::CC_DEPENDENT_VOWEL | KhmerClassTable::CF_POS_BELOW | KhmerClassTable::CF_DOTTED_CIRCLE,
1.50 + _da = KhmerClassTable::CC_DEPENDENT_VOWEL | KhmerClassTable::CF_POS_ABOVE | KhmerClassTable::CF_DOTTED_CIRCLE | KhmerClassTable::CF_ABOVE_VOWEL,
1.51 + _dr = KhmerClassTable::CC_DEPENDENT_VOWEL | KhmerClassTable::CF_POS_AFTER | KhmerClassTable::CF_DOTTED_CIRCLE,
1.52 + _co = KhmerClassTable::CC_COENG | KhmerClassTable::CF_COENG | KhmerClassTable::CF_DOTTED_CIRCLE,
1.53 +
1.54 + // split vowel
1.55 + _va = _da | KhmerClassTable::CF_SPLIT_VOWEL,
1.56 + _vr = _dr | KhmerClassTable::CF_SPLIT_VOWEL
1.57 +};
1.58 +
1.59 +
1.60 +// Character class tables
1.61 +// _xx character does not combine into syllable, such as numbers, puntuation marks, non-Khmer signs...
1.62 +// _sa Sign placed above the base
1.63 +// _sp Sign placed after the base
1.64 +// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
1.65 +// _c2 Consonant of type 2 (only RO)
1.66 +// _c3 Consonant of type 3
1.67 +// _rb Khmer sign robat u17CC. combining mark for subscript consonants
1.68 +// _cd Consonant-shifter
1.69 +// _dl Dependent vowel placed before the base (left of the base)
1.70 +// _db Dependent vowel placed below the base
1.71 +// _da Dependent vowel placed above the base
1.72 +// _dr Dependent vowel placed behind the base (right of the base)
1.73 +// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
1.74 +// it to create a subscript consonant or independent vowel
1.75 +// _va Khmer split vowel in wich the first part is before the base and the second one above the base
1.76 +// _vr Khmer split vowel in wich the first part is before the base and the second one behind (right of) the base
1.77 +
1.78 +static const KhmerClassTable::CharClass khmerCharClasses[] =
1.79 +{
1.80 + _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, // 1780 - 178F
1.81 + _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, // 1790 - 179F
1.82 + _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, // 17A0 - 17AF
1.83 + _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, // 17B0 - 17BF
1.84 + _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, // 17C0 - 17CF
1.85 + _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx, // 17D0 - 17DF
1.86 +};
1.87 +
1.88 +
1.89 +//
1.90 +// Khmer Class Tables
1.91 +//
1.92 +
1.93 +//
1.94 +// The range of characters defined in the above table is defined here. FOr Khmer 1780 to 17DF
1.95 +// Even if the Khmer range is bigger, all other characters are not combinable, and therefore treated
1.96 +// as _xx
1.97 +static const KhmerClassTable khmerClassTable = {0x1780, 0x17df, khmerCharClasses};
1.98 +
1.99 +
1.100 +// Below we define how a character in the input string is either in the khmerCharClasses table
1.101 +// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
1.102 +// within the syllable, but are not in the table) we also get their type back, or an unknown object
1.103 +// in which case we get _xx (CC_RESERVED) back
1.104 +KhmerClassTable::CharClass KhmerClassTable::getCharClass(LEUnicode ch) const
1.105 +{
1.106 +
1.107 + if (ch == C_SIGN_ZWJ) {
1.108 + return CC_ZERO_WIDTH_J_MARK;
1.109 + }
1.110 +
1.111 + if (ch == C_SIGN_ZWNJ) {
1.112 + return CC_ZERO_WIDTH_NJ_MARK;
1.113 + }
1.114 +
1.115 + if (ch < firstChar || ch > lastChar) {
1.116 + return CC_RESERVED;
1.117 + }
1.118 +
1.119 + return classTable[ch - firstChar];
1.120 +}
1.121 +
1.122 +const KhmerClassTable *KhmerClassTable::getKhmerClassTable()
1.123 +{
1.124 + return &khmerClassTable;
1.125 +}
1.126 +
1.127 +
1.128 +
1.129 +class ReorderingOutput : public UMemory {
1.130 +private:
1.131 + le_int32 fOutIndex;
1.132 + LEUnicode *fOutChars;
1.133 +
1.134 + LEGlyphStorage &fGlyphStorage;
1.135 +
1.136 +
1.137 +public:
1.138 + ReorderingOutput(LEUnicode *outChars, LEGlyphStorage &glyphStorage)
1.139 + : fOutIndex(0), fOutChars(outChars), fGlyphStorage(glyphStorage)
1.140 + {
1.141 + // nothing else to do...
1.142 + }
1.143 +
1.144 + ~ReorderingOutput()
1.145 + {
1.146 + // nothing to do here...
1.147 + }
1.148 +
1.149 + void writeChar(LEUnicode ch, le_uint32 charIndex, const LETag *charTags)
1.150 + {
1.151 + LEErrorCode success = LE_NO_ERROR;
1.152 +
1.153 + fOutChars[fOutIndex] = ch;
1.154 +
1.155 + fGlyphStorage.setCharIndex(fOutIndex, charIndex, success);
1.156 + fGlyphStorage.setAuxData(fOutIndex, (void *) charTags, success);
1.157 +
1.158 + fOutIndex += 1;
1.159 + }
1.160 +
1.161 + le_int32 getOutputIndex()
1.162 + {
1.163 + return fOutIndex;
1.164 + }
1.165 +};
1.166 +
1.167 +
1.168 +static const LETag emptyTag = 0x00000000; // ''
1.169 +//TODO remove unused flags
1.170 +//static const LETag nuktFeatureTag = LE_NUKT_FEATURE_TAG;
1.171 +//static const LETag akhnFeatureTag = LE_AKHN_FEATURE_TAG;
1.172 +//static const LETag rphfFeatureTag = LE_RPHF_FEATURE_TAG;
1.173 +static const LETag blwfFeatureTag = LE_BLWF_FEATURE_TAG;
1.174 +//static const LETag halfFeatureTag = LE_HALF_FEATURE_TAG;
1.175 +static const LETag pstfFeatureTag = LE_PSTF_FEATURE_TAG;
1.176 +//static const LETag vatuFeatureTag = LE_VATU_FEATURE_TAG;
1.177 +static const LETag presFeatureTag = LE_PRES_FEATURE_TAG;
1.178 +static const LETag blwsFeatureTag = LE_BLWS_FEATURE_TAG;
1.179 +static const LETag abvsFeatureTag = LE_ABVS_FEATURE_TAG;
1.180 +static const LETag pstsFeatureTag = LE_PSTS_FEATURE_TAG;
1.181 +//static const LETag halnFeatureTag = LE_HALN_FEATURE_TAG;
1.182 +
1.183 +static const LETag blwmFeatureTag = LE_BLWM_FEATURE_TAG;
1.184 +static const LETag abvmFeatureTag = LE_ABVM_FEATURE_TAG;
1.185 +static const LETag distFeatureTag = LE_DIST_FEATURE_TAG;
1.186 +
1.187 +static const LETag prefFeatureTag = LE_PREF_FEATURE_TAG;
1.188 +static const LETag abvfFeatureTag = LE_ABVF_FEATURE_TAG;
1.189 +static const LETag cligFeatureTag = LE_CLIG_FEATURE_TAG;
1.190 +static const LETag mkmkFeatureTag = LE_MKMK_FEATURE_TAG;
1.191 +
1.192 +// These are in the order in which the features need to be applied
1.193 +// for correct processing
1.194 +static const LETag featureOrder[] =
1.195 +{
1.196 + // Shaping features
1.197 + prefFeatureTag, blwfFeatureTag, abvfFeatureTag, pstfFeatureTag,
1.198 + presFeatureTag, blwsFeatureTag, abvsFeatureTag, pstsFeatureTag,
1.199 + cligFeatureTag,
1.200 +
1.201 + // Positioning features
1.202 + distFeatureTag, blwmFeatureTag, abvmFeatureTag, mkmkFeatureTag,
1.203 + emptyTag
1.204 +};
1.205 +
1.206 +static const LETag tagPref[] =
1.207 +{
1.208 + prefFeatureTag, presFeatureTag,
1.209 + cligFeatureTag,
1.210 +
1.211 + // Positioning features
1.212 + distFeatureTag,
1.213 + emptyTag
1.214 +};
1.215 +
1.216 +static const LETag tagAbvf[] =
1.217 +{
1.218 + abvfFeatureTag, abvsFeatureTag,
1.219 + cligFeatureTag,
1.220 +
1.221 + // Positioning features
1.222 + distFeatureTag, abvmFeatureTag, mkmkFeatureTag,
1.223 + emptyTag
1.224 +};
1.225 +
1.226 +static const LETag tagPstf[] =
1.227 +{
1.228 + blwfFeatureTag, blwsFeatureTag,
1.229 + prefFeatureTag, presFeatureTag,
1.230 +
1.231 + pstfFeatureTag, pstsFeatureTag,
1.232 + cligFeatureTag,
1.233 +
1.234 + // Positioning features
1.235 + distFeatureTag, blwmFeatureTag,
1.236 + emptyTag
1.237 +};
1.238 +
1.239 +static const LETag tagBlwf[] =
1.240 +{
1.241 + blwfFeatureTag, blwsFeatureTag,
1.242 + cligFeatureTag,
1.243 +
1.244 + // Positioning features
1.245 + distFeatureTag, blwmFeatureTag, mkmkFeatureTag,
1.246 + emptyTag
1.247 +};
1.248 +
1.249 +
1.250 +// TODO do we need all of them?
1.251 +static const LETag tagDefault[] =
1.252 +{
1.253 + // Shaping feature
1.254 + prefFeatureTag, blwfFeatureTag, /*abvfFeatureTag,*/ /*pstfFeatureTag, */
1.255 + presFeatureTag, blwsFeatureTag, /*abvsFeatureTag,*/ /*pstsFeatureTag,*/
1.256 + cligFeatureTag,
1.257 +
1.258 + // Positioning features
1.259 + distFeatureTag, abvmFeatureTag, blwmFeatureTag, mkmkFeatureTag,
1.260 + emptyTag
1.261 +};
1.262 +
1.263 +
1.264 +
1.265 +// The stateTable is used to calculate the end (the length) of a well
1.266 +// formed Khmer Syllable.
1.267 +//
1.268 +// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
1.269 +// CharClassValues in KhmerReordering.h This coincidence of values allows the
1.270 +// follow up of the table.
1.271 +//
1.272 +// Each line corresponds to a state, which does not necessarily need to be a type
1.273 +// of component... for example, state 2 is a base, with is always a first character
1.274 +// in the syllable, but the state could be produced a consonant of any type when
1.275 +// it is the first character that is analysed (in ground state).
1.276 +//
1.277 +// Differentiating 3 types of consonants is necessary in order to
1.278 +// forbid the use of certain combinations, such as having a second
1.279 +// coeng after a coeng RO,
1.280 +// The inexistent possibility of having a type 3 after another type 3 is permitted,
1.281 +// eliminating it would very much complicate the table, and it does not create typing
1.282 +// problems, as the case above.
1.283 +//
1.284 +// The table is quite complex, in order to limit the number of coeng consonants
1.285 +// to 2 (by means of the table).
1.286 +//
1.287 +// There a peculiarity, as far as Unicode is concerned:
1.288 +// - The consonant-shifter is considered in two possible different
1.289 +// locations, the one considered in Unicode 3.0 and the one considered in
1.290 +// Unicode 4.0. (there is a backwards compatibility problem in this standard).
1.291 +
1.292 +
1.293 +// xx independent character, such as a number, punctuation sign or non-khmer char
1.294 +//
1.295 +// c1 Khmer consonant of type 1 or an independent vowel
1.296 +// that is, a letter in which the subscript for is only under the
1.297 +// base, not taking any space to the right or to the left
1.298 +//
1.299 +// c2 Khmer consonant of type 2, the coeng form takes space under
1.300 +// and to the left of the base (only RO is of this type)
1.301 +//
1.302 +// c3 Khmer consonant of type 3. Its subscript form takes space under
1.303 +// and to the right of the base.
1.304 +//
1.305 +// cs Khmer consonant shifter
1.306 +//
1.307 +// rb Khmer robat
1.308 +//
1.309 +// co coeng character (u17D2)
1.310 +//
1.311 +// dv dependent vowel (including split vowels, they are treated in the same way).
1.312 +// even if dv is not defined above, the component that is really tested for is
1.313 +// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
1.314 +//
1.315 +// zwj Zero Width joiner
1.316 +//
1.317 +// zwnj Zero width non joiner
1.318 +//
1.319 +// sa above sign
1.320 +//
1.321 +// sp post sign
1.322 +//
1.323 +// there are lines with equal content but for an easier understanding
1.324 +// (and maybe change in the future) we did not join them
1.325 +//
1.326 +static const le_int8 khmerStateTable[][KhmerClassTable::CC_COUNT] =
1.327 +{
1.328 +
1.329 +// xx c1 c2 c3 zwnj cs rb co dv sa sp zwj
1.330 + { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, // 0 - ground state
1.331 + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 1 - exit state (or sign to the right of the syllable)
1.332 + {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, // 2 - Base consonant
1.333 + {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, // 3 - First ZWNJ before a register shifter
1.334 + // It can only be followed by a shifter or a vowel
1.335 + {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, // 4 - First register shifter
1.336 + {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, // 5 - Robat
1.337 + {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, // 6 - First Coeng
1.338 + {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, // 7 - First consonant of type 1 after coeng
1.339 + {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, // 8 - First consonant of type 2 after coeng
1.340 + {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, // 9 - First consonant or type 3 after ceong
1.341 + {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, // 10 - Second Coeng (no register shifter before)
1.342 + {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, // 11 - Second coeng consonant (or ind. vowel) no register shifter before
1.343 + {-1, -1, 1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, // 12 - Second ZWNJ before a register shifter
1.344 + {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, // 13 - Second register shifter
1.345 + {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, // 14 - ZWJ before vowel
1.346 + {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, // 15 - ZWNJ before vowel
1.347 + {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, // 16 - dependent vowel
1.348 + {-1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, // 17 - sign above
1.349 + {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, // 18 - ZWJ after vowel
1.350 + {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, // 19 - Third coeng
1.351 + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, // 20 - dependent vowel after a Robat
1.352 +
1.353 +};
1.354 +
1.355 +
1.356 +const LETag *KhmerReordering::getFeatureOrder()
1.357 +{
1.358 + return featureOrder;
1.359 +}
1.360 +
1.361 +
1.362 +// Given an input string of characters and a location in which to start looking
1.363 +// calculate, using the state table, which one is the last character of the syllable
1.364 +// that starts in the starting position.
1.365 +le_int32 KhmerReordering::findSyllable(const KhmerClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount)
1.366 +{
1.367 + le_int32 cursor = prev;
1.368 + le_int8 state = 0;
1.369 +
1.370 + while (cursor < charCount) {
1.371 + KhmerClassTable::CharClass charClass = (classTable->getCharClass(chars[cursor]) & KhmerClassTable::CF_CLASS_MASK);
1.372 +
1.373 + state = khmerStateTable[state][charClass];
1.374 +
1.375 + if (state < 0) {
1.376 + break;
1.377 + }
1.378 +
1.379 + cursor += 1;
1.380 + }
1.381 +
1.382 + return cursor;
1.383 +}
1.384 +
1.385 +
1.386 +// This is the real reordering function as applied to the Khmer language
1.387 +
1.388 +le_int32 KhmerReordering::reorder(const LEUnicode *chars, le_int32 charCount, le_int32 /*scriptCode*/,
1.389 + LEUnicode *outChars, LEGlyphStorage &glyphStorage)
1.390 +{
1.391 + const KhmerClassTable *classTable = KhmerClassTable::getKhmerClassTable();
1.392 +
1.393 + ReorderingOutput output(outChars, glyphStorage);
1.394 + KhmerClassTable::CharClass charClass;
1.395 + le_int32 i, prev = 0, coengRo;
1.396 +
1.397 +
1.398 + // This loop only exits when we reach the end of a run, which may contain
1.399 + // several syllables.
1.400 + while (prev < charCount) {
1.401 + le_int32 syllable = findSyllable(classTable, chars, prev, charCount);
1.402 +
1.403 + // write a pre vowel or the pre part of a split vowel first
1.404 + // and look out for coeng + ro. RO is the only vowel of type 2, and
1.405 + // therefore the only one that requires saving space before the base.
1.406 + coengRo = -1; // There is no Coeng Ro, if found this value will change
1.407 + for (i = prev; i < syllable; i += 1) {
1.408 + charClass = classTable->getCharClass(chars[i]);
1.409 +
1.410 + // if a split vowel, write the pre part. In Khmer the pre part
1.411 + // is the same for all split vowels, same glyph as pre vowel C_VOWEL_E
1.412 + if (charClass & KhmerClassTable::CF_SPLIT_VOWEL) {
1.413 + output.writeChar(C_VOWEL_E, i, &tagPref[0]);
1.414 + break; // there can be only one vowel
1.415 + }
1.416 +
1.417 + // if a vowel with pos before write it out
1.418 + if (charClass & KhmerClassTable::CF_POS_BEFORE) {
1.419 + output.writeChar(chars[i], i, &tagPref[0]);
1.420 + break; // there can be only one vowel
1.421 + }
1.422 +
1.423 + // look for coeng + ro and remember position
1.424 + // works because coeng + ro is always in front of a vowel (if there is a vowel)
1.425 + // and because CC_CONSONANT2 is enough to identify it, as it is the only consonant
1.426 + // with this flag
1.427 + if ( (charClass & KhmerClassTable::CF_COENG) && (i + 1 < syllable) &&
1.428 + ( (classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_CLASS_MASK) == KhmerClassTable::CC_CONSONANT2) )
1.429 + {
1.430 + coengRo = i;
1.431 + }
1.432 + }
1.433 +
1.434 + // write coeng + ro if found
1.435 + if (coengRo > -1) {
1.436 + output.writeChar(C_COENG, coengRo, &tagPref[0]);
1.437 + output.writeChar(C_RO, coengRo + 1, &tagPref[0]);
1.438 + }
1.439 +
1.440 + // shall we add a dotted circle?
1.441 + // If in the position in which the base should be (first char in the string) there is
1.442 + // a character that has the Dotted circle flag (a character that cannot be a base)
1.443 + // then write a dotted circle
1.444 + if (classTable->getCharClass(chars[prev]) & KhmerClassTable::CF_DOTTED_CIRCLE) {
1.445 + output.writeChar(C_DOTTED_CIRCLE, prev, &tagDefault[0]);
1.446 + }
1.447 +
1.448 + // copy what is left to the output, skipping before vowels and coeng Ro if they are present
1.449 + for (i = prev; i < syllable; i += 1) {
1.450 + charClass = classTable->getCharClass(chars[i]);
1.451 +
1.452 + // skip a before vowel, it was already processed
1.453 + if (charClass & KhmerClassTable::CF_POS_BEFORE) {
1.454 + continue;
1.455 + }
1.456 +
1.457 + // skip coeng + ro, it was already processed
1.458 + if (i == coengRo) {
1.459 + i += 1;
1.460 + continue;
1.461 + }
1.462 +
1.463 + switch (charClass & KhmerClassTable::CF_POS_MASK) {
1.464 + case KhmerClassTable::CF_POS_ABOVE :
1.465 + output.writeChar(chars[i], i, &tagAbvf[0]);
1.466 + break;
1.467 +
1.468 + case KhmerClassTable::CF_POS_AFTER :
1.469 + output.writeChar(chars[i], i, &tagPstf[0]);
1.470 + break;
1.471 +
1.472 + case KhmerClassTable::CF_POS_BELOW :
1.473 + output.writeChar(chars[i], i, &tagBlwf[0]);
1.474 + break;
1.475 +
1.476 + default:
1.477 + // assign the correct flags to a coeng consonant
1.478 + // Consonants of type 3 are taged as Post forms and those type 1 as below forms
1.479 + if ( (charClass & KhmerClassTable::CF_COENG) && i + 1 < syllable ) {
1.480 + if ( (classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_CLASS_MASK)
1.481 + == KhmerClassTable::CC_CONSONANT3) {
1.482 + output.writeChar(chars[i], i, &tagPstf[0]);
1.483 + i += 1;
1.484 + output.writeChar(chars[i], i, &tagPstf[0]);
1.485 + }
1.486 + else {
1.487 + output.writeChar(chars[i], i, &tagBlwf[0]);
1.488 + i += 1;
1.489 + output.writeChar(chars[i], i, &tagBlwf[0]);
1.490 + }
1.491 + break;
1.492 + }
1.493 + // if a shifter is followed by an above vowel change the shifter to below form,
1.494 + // an above vowel can have two possible positions i + 1 or i + 3
1.495 + // (position i+1 corresponds to unicode 3, position i+3 to Unicode 4)
1.496 + // and there is an extra rule for C_VOWEL_AA + C_SIGN_NIKAHIT also for two
1.497 + // different positions, right after the shifter or after a vowel (Unicode 4)
1.498 + if ( (charClass & KhmerClassTable::CF_SHIFTER) && (i + 1 < syllable) ) {
1.499 + if ((classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_ABOVE_VOWEL)
1.500 + || (i + 2 < syllable
1.501 + && ( (classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_CLASS_MASK) == C_VOWEL_AA)
1.502 + && ( (classTable->getCharClass(chars[i + 2]) & KhmerClassTable::CF_CLASS_MASK) == C_SIGN_NIKAHIT))
1.503 + || (i + 3 < syllable && (classTable->getCharClass(chars[i + 3]) & KhmerClassTable::CF_ABOVE_VOWEL))
1.504 + || (i + 4 < syllable
1.505 + && ( (classTable->getCharClass(chars[i + 3]) & KhmerClassTable::CF_CLASS_MASK) == C_VOWEL_AA)
1.506 + && ( (classTable->getCharClass(chars[i + 4]) & KhmerClassTable::CF_CLASS_MASK) == C_SIGN_NIKAHIT) ) )
1.507 + {
1.508 + output.writeChar(chars[i], i, &tagBlwf[0]);
1.509 + break;
1.510 + }
1.511 +
1.512 + }
1.513 + // default - any other characters
1.514 + output.writeChar(chars[i], i, &tagDefault[0]);
1.515 + break;
1.516 + } // switch
1.517 + } // for
1.518 +
1.519 + prev = syllable; // move the pointer to the start of next syllable
1.520 + }
1.521 +
1.522 + return output.getOutputIndex();
1.523 +}
1.524 +
1.525 +
1.526 +U_NAMESPACE_END