Symaptic: os/textandloc/fontservices/textshaperplugin/IcuSource/layout/KhmerReordering.cpp@260cb5ec6c19 (annotated)

sl@0	1	/*
sl@0	2	*
sl@0	3	* (C) Copyright IBM Corp. 1998-2005 - All Rights Reserved
sl@0	4	*
sl@0	5	* This file is a modification of the ICU file IndicReordering.cpp
sl@0	6	* by Jens Herden and Javier Sola for Khmer language
sl@0	7	*
sl@0	8	*/
sl@0	9
sl@0	10	#include "LETypes.h"
sl@0	11	#include "KhmerReordering.h"
sl@0	12	#include "LEGlyphStorage.h"
sl@0	13
sl@0	14
sl@0	15	U_NAMESPACE_BEGIN
sl@0	16
sl@0	17	// Characters that get referred to by name...
sl@0	18	enum
sl@0	19	{
sl@0	20	C_SIGN_ZWNJ = 0x200C, // U+200C ZERO WIDTH NON-JOINER; classified specially in getCharClass()
sl@0	21	C_SIGN_ZWJ = 0x200D, // U+200D ZERO WIDTH JOINER; classified specially in getCharClass()
sl@0	22	C_DOTTED_CIRCLE = 0x25CC, // U+25CC DOTTED CIRCLE; inserted by reorder() when a syllable has no valid base
sl@0	23	C_RO = 0x179A, // U+179A KHMER LETTER RO; written after C_COENG when coeng + ro is moved forward
sl@0	24	C_VOWEL_AA = 0x17B6, // U+17B6 KHMER VOWEL SIGN AA
sl@0	25	C_SIGN_NIKAHIT = 0x17C6, // U+17C6 KHMER SIGN NIKAHIT
sl@0	26	C_VOWEL_E = 0x17C1, // U+17C1 KHMER VOWEL SIGN E; written as the pre-base part of every split vowel
sl@0	27	C_COENG = 0x17D2 // U+17D2 KHMER SIGN COENG
sl@0	28	};
sl@0	29
sl@0	30
sl@0	31	enum
sl@0	32	{
sl@0	33	// simple classes, they are used in the statetable (in this file) to control the length of a syllable
sl@0	34	// they are also used to know where a character should be placed (location in reference to the base character)
sl@0	35	// and also to know if a character, when independtly displayed, should be displayed with a dotted-circle to
sl@0	36	// indicate error in syllable construction
sl@0	37	_xx = KhmerClassTable::CC_RESERVED,
sl@0	38	_sa = KhmerClassTable::CC_SIGN_ABOVE \| KhmerClassTable::CF_DOTTED_CIRCLE \| KhmerClassTable::CF_POS_ABOVE,
sl@0	39	_sp = KhmerClassTable::CC_SIGN_AFTER \| KhmerClassTable::CF_DOTTED_CIRCLE\| KhmerClassTable::CF_POS_AFTER,
sl@0	40	_c1 = KhmerClassTable::CC_CONSONANT \| KhmerClassTable::CF_CONSONANT,
sl@0	41	_c2 = KhmerClassTable::CC_CONSONANT2 \| KhmerClassTable::CF_CONSONANT,
sl@0	42	_c3 = KhmerClassTable::CC_CONSONANT3 \| KhmerClassTable::CF_CONSONANT,
sl@0	43	_rb = KhmerClassTable::CC_ROBAT \| KhmerClassTable::CF_POS_ABOVE \| KhmerClassTable::CF_DOTTED_CIRCLE,
sl@0	44	_cs = KhmerClassTable::CC_CONSONANT_SHIFTER \| KhmerClassTable::CF_DOTTED_CIRCLE \| KhmerClassTable::CF_SHIFTER,
sl@0	45	_dl = KhmerClassTable::CC_DEPENDENT_VOWEL \| KhmerClassTable::CF_POS_BEFORE \| KhmerClassTable::CF_DOTTED_CIRCLE,
sl@0	46	_db = KhmerClassTable::CC_DEPENDENT_VOWEL \| KhmerClassTable::CF_POS_BELOW \| KhmerClassTable::CF_DOTTED_CIRCLE,
sl@0	47	_da = KhmerClassTable::CC_DEPENDENT_VOWEL \| KhmerClassTable::CF_POS_ABOVE \| KhmerClassTable::CF_DOTTED_CIRCLE \| KhmerClassTable::CF_ABOVE_VOWEL,
sl@0	48	_dr = KhmerClassTable::CC_DEPENDENT_VOWEL \| KhmerClassTable::CF_POS_AFTER \| KhmerClassTable::CF_DOTTED_CIRCLE,
sl@0	49	_co = KhmerClassTable::CC_COENG \| KhmerClassTable::CF_COENG \| KhmerClassTable::CF_DOTTED_CIRCLE,
sl@0	50
sl@0	51	// split vowel
sl@0	52	_va = _da \| KhmerClassTable::CF_SPLIT_VOWEL,
sl@0	53	_vr = _dr \| KhmerClassTable::CF_SPLIT_VOWEL
sl@0	54	};
sl@0	55
sl@0	56
sl@0	57	// Character class tables
sl@0	58	// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
sl@0	59	// _sa Sign placed above the base
sl@0	60	// _sp Sign placed after the base
sl@0	61	// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
sl@0	62	// _c2 Consonant of type 2 (only RO)
sl@0	63	// _c3 Consonant of type 3
sl@0	64	// _rb Khmer sign robat u17CC. combining mark for subscript consonants
sl@0	65	// _cs Consonant-shifter
sl@0	66	// _dl Dependent vowel placed before the base (left of the base)
sl@0	67	// _db Dependent vowel placed below the base
sl@0	68	// _da Dependent vowel placed above the base
sl@0	69	// _dr Dependent vowel placed behind the base (right of the base)
sl@0	70	// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
sl@0	71	// it to create a subscript consonant or independent vowel
sl@0	72	// _va Khmer split vowel in which the first part is before the base and the second one above the base
sl@0	73	// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
sl@0	74
sl@0	75	static const KhmerClassTable::CharClass khmerCharClasses[] = // one class per code point; each row covers the 16 code points noted at its right
sl@0	76	{
sl@0	77	_c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, // 1780 - 178F
sl@0	78	_c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, // 1790 - 179F
sl@0	79	_c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, // 17A0 - 17AF
sl@0	80	_c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, // 17B0 - 17BF
sl@0	81	_vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, // 17C0 - 17CF
sl@0	82	_sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx, // 17D0 - 17DF
sl@0	83	};
sl@0	84
sl@0	85
sl@0	86	//
sl@0	87	// Khmer Class Tables
sl@0	88	//
sl@0	89
sl@0	90	//
sl@0	91	// The range of characters defined in the above table is defined here. For Khmer, 1780 to 17DF.
sl@0	92	// Even if the Khmer range is bigger, all other characters are not combinable, and therefore treated
sl@0	93	// as _xx
sl@0	94	static const KhmerClassTable khmerClassTable = {0x1780, 0x17df, khmerCharClasses}; // presumably {firstChar, lastChar, classTable} - confirm member order in KhmerReordering.h
sl@0	95
sl@0	96
sl@0	97	// Classify one input character. ZWJ and ZWNJ (two characters that may appear within a
sl@0	98	// syllable, but are not in the khmerCharClasses table) are mapped to their own classes first.
sl@0	99	// Any other character outside [firstChar, lastChar] is an unknown object and yields
sl@0	100	// _xx (CC_RESERVED); everything else gets the type stored for it in the class table.
sl@0	101	KhmerClassTable::CharClass KhmerClassTable::getCharClass(LEUnicode ch) const
sl@0	102	{
sl@0	103
sl@0	104	if (ch == C_SIGN_ZWJ) {
sl@0	105	return CC_ZERO_WIDTH_J_MARK;
sl@0	106	}
sl@0	107
sl@0	108	if (ch == C_SIGN_ZWNJ) {
sl@0	109	return CC_ZERO_WIDTH_NJ_MARK;
sl@0	110	}
sl@0	111
sl@0	112	if (ch < firstChar || ch > lastChar) { // range check guards the table index below
sl@0	113	return CC_RESERVED;
sl@0	114	}
sl@0	115
sl@0	116	return classTable[ch - firstChar];
sl@0	117	}
sl@0	118
sl@0	119	const KhmerClassTable *KhmerClassTable::getKhmerClassTable() // returns the address of the file-static khmerClassTable
sl@0	120	{
sl@0	121	return &khmerClassTable;
sl@0	122	}
sl@0	123
sl@0	124
sl@0	125
sl@0	126	class ReorderingOutput : public UMemory { // sequential writer for reordered characters and their per-glyph data
sl@0	127	private:
sl@0	128	le_int32 fOutIndex; // next free slot in fOutChars / fGlyphStorage
sl@0	129	LEUnicode *fOutChars; // caller-supplied output buffer; not owned (never freed here)
sl@0	130
sl@0	131	LEGlyphStorage &fGlyphStorage; // receives the char index and aux data for each written character
sl@0	132
sl@0	133
sl@0	134	public:
sl@0	135	ReorderingOutput(LEUnicode *outChars, LEGlyphStorage &glyphStorage)
sl@0	136	: fOutIndex(0), fOutChars(outChars), fGlyphStorage(glyphStorage)
sl@0	137	{
sl@0	138	// nothing else to do...
sl@0	139	}
sl@0	140
sl@0	141	~ReorderingOutput()
sl@0	142	{
sl@0	143	// nothing to do here...
sl@0	144	}
sl@0	145
sl@0	146	void writeChar(LEUnicode ch, le_uint32 charIndex, const LETag *charTags) // append ch; charIndex is its position in the input, charTags the feature list to attach
sl@0	147	{
sl@0	148	LEErrorCode success = LE_NO_ERROR; // NOTE(review): never inspected; failures of the two calls below are silently dropped
sl@0	149
sl@0	150	fOutChars[fOutIndex] = ch; // no bounds check here; the caller must size outChars for any inserted characters
sl@0	151
sl@0	152	fGlyphStorage.setCharIndex(fOutIndex, charIndex, success);
sl@0	153	fGlyphStorage.setAuxData(fOutIndex, (void *) charTags, success); // cast drops const; the tag arrays are static const
sl@0	154
sl@0	155	fOutIndex += 1;
sl@0	156	}
sl@0	157
sl@0	158	le_int32 getOutputIndex() // number of characters written so far
sl@0	159	{
sl@0	160	return fOutIndex;
sl@0	161	}
sl@0	162	};
sl@0	163
sl@0	164
sl@0	165	static const LETag emptyTag = 0x00000000; // '' - used as the last element of every tag array below
sl@0	166	//TODO remove unused flags
sl@0	167	//static const LETag nuktFeatureTag = LE_NUKT_FEATURE_TAG;
sl@0	168	//static const LETag akhnFeatureTag = LE_AKHN_FEATURE_TAG;
sl@0	169	//static const LETag rphfFeatureTag = LE_RPHF_FEATURE_TAG;
sl@0	170	static const LETag blwfFeatureTag = LE_BLWF_FEATURE_TAG;
sl@0	171	//static const LETag halfFeatureTag = LE_HALF_FEATURE_TAG;
sl@0	172	static const LETag pstfFeatureTag = LE_PSTF_FEATURE_TAG;
sl@0	173	//static const LETag vatuFeatureTag = LE_VATU_FEATURE_TAG;
sl@0	174	static const LETag presFeatureTag = LE_PRES_FEATURE_TAG;
sl@0	175	static const LETag blwsFeatureTag = LE_BLWS_FEATURE_TAG;
sl@0	176	static const LETag abvsFeatureTag = LE_ABVS_FEATURE_TAG;
sl@0	177	static const LETag pstsFeatureTag = LE_PSTS_FEATURE_TAG;
sl@0	178	//static const LETag halnFeatureTag = LE_HALN_FEATURE_TAG;
sl@0	179
sl@0	180	static const LETag blwmFeatureTag = LE_BLWM_FEATURE_TAG;
sl@0	181	static const LETag abvmFeatureTag = LE_ABVM_FEATURE_TAG;
sl@0	182	static const LETag distFeatureTag = LE_DIST_FEATURE_TAG;
sl@0	183
sl@0	184	static const LETag prefFeatureTag = LE_PREF_FEATURE_TAG;
sl@0	185	static const LETag abvfFeatureTag = LE_ABVF_FEATURE_TAG;
sl@0	186	static const LETag cligFeatureTag = LE_CLIG_FEATURE_TAG;
sl@0	187	static const LETag mkmkFeatureTag = LE_MKMK_FEATURE_TAG;
sl@0	188
sl@0	189	// These are in the order in which the features need to be applied
sl@0	190	// for correct processing. Returned to callers by KhmerReordering::getFeatureOrder().
sl@0	191	static const LETag featureOrder[] =
sl@0	192	{
sl@0	193	// Shaping features
sl@0	194	prefFeatureTag, blwfFeatureTag, abvfFeatureTag, pstfFeatureTag,
sl@0	195	presFeatureTag, blwsFeatureTag, abvsFeatureTag, pstsFeatureTag,
sl@0	196	cligFeatureTag,
sl@0	197
sl@0	198	// Positioning features
sl@0	199	distFeatureTag, blwmFeatureTag, abvmFeatureTag, mkmkFeatureTag,
sl@0	200	emptyTag
sl@0	201	};
sl@0	202
sl@0	203	static const LETag tagPref[] = // attached by reorder() to pre-base vowels, the pre part of split vowels, and coeng + ro
sl@0	204	{
sl@0	205	prefFeatureTag, presFeatureTag,
sl@0	206	cligFeatureTag,
sl@0	207
sl@0	208	// Positioning features
sl@0	209	distFeatureTag,
sl@0	210	emptyTag
sl@0	211	};
sl@0	212
sl@0	213	static const LETag tagAbvf[] = // attached by reorder() to characters whose position flag is CF_POS_ABOVE
sl@0	214	{
sl@0	215	abvfFeatureTag, abvsFeatureTag,
sl@0	216	cligFeatureTag,
sl@0	217
sl@0	218	// Positioning features
sl@0	219	distFeatureTag, abvmFeatureTag, mkmkFeatureTag,
sl@0	220	emptyTag
sl@0	221	};
sl@0	222
sl@0	223	static const LETag tagPstf[] = // attached by reorder() to CF_POS_AFTER characters and to coeng + type 3 consonant pairs
sl@0	224	{
sl@0	225	blwfFeatureTag, blwsFeatureTag,
sl@0	226	prefFeatureTag, presFeatureTag,
sl@0	227
sl@0	228	pstfFeatureTag, pstsFeatureTag,
sl@0	229	cligFeatureTag,
sl@0	230
sl@0	231	// Positioning features
sl@0	232	distFeatureTag, blwmFeatureTag,
sl@0	233	emptyTag
sl@0	234	};
sl@0	235
sl@0	236	static const LETag tagBlwf[] = // attached by reorder() to CF_POS_BELOW characters, other coeng + consonant pairs, and shifters moved below
sl@0	237	{
sl@0	238	blwfFeatureTag, blwsFeatureTag,
sl@0	239	cligFeatureTag,
sl@0	240
sl@0	241	// Positioning features
sl@0	242	distFeatureTag, blwmFeatureTag, mkmkFeatureTag,
sl@0	243	emptyTag
sl@0	244	};
sl@0	245
sl@0	246
sl@0	247	// Feature list for characters that get no position-specific list in reorder(). TODO do we need all of them?
sl@0	248	static const LETag tagDefault[] =
sl@0	249	{
sl@0	250	// Shaping features (the above-form and post-form tags are deliberately commented out)
sl@0	251	prefFeatureTag, blwfFeatureTag, /*abvfFeatureTag,*/ /*pstfFeatureTag, */
sl@0	252	presFeatureTag, blwsFeatureTag, /*abvsFeatureTag,*/ /*pstsFeatureTag,*/
sl@0	253	cligFeatureTag,
sl@0	254
sl@0	255	// Positioning features
sl@0	256	distFeatureTag, abvmFeatureTag, blwmFeatureTag, mkmkFeatureTag,
sl@0	257	emptyTag
sl@0	258	};
sl@0	259
sl@0	260
sl@0	261
sl@0	262	// The stateTable is used to calculate the end (the length) of a well
sl@0	263	// formed Khmer Syllable.
sl@0	264	//
sl@0	265	// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
sl@0	266	// CharClassValues in KhmerReordering.h This coincidence of values allows the
sl@0	267	// follow up of the table.
sl@0	268	//
sl@0	269	// Each line corresponds to a state, which does not necessarily need to be a type
sl@0	270	// of component... for example, state 2 is a base, which is always a first character
sl@0	271	// in the syllable, but the state could be produced by a consonant of any type when
sl@0	272	// it is the first character that is analysed (in ground state).
sl@0	273	//
sl@0	274	// Differentiating 3 types of consonants is necessary in order to
sl@0	275	// forbid the use of certain combinations, such as having a second
sl@0	276	// coeng after a coeng RO,
sl@0	277	// The inexistent possibility of having a type 3 after another type 3 is permitted,
sl@0	278	// eliminating it would very much complicate the table, and it does not create typing
sl@0	279	// problems, as the case above.
sl@0	280	//
sl@0	281	// The table is quite complex, in order to limit the number of coeng consonants
sl@0	282	// to 2 (by means of the table).
sl@0	283	//
sl@0	284	// There is a peculiarity, as far as Unicode is concerned:
sl@0	285	// - The consonant-shifter is considered in two possible different
sl@0	286	// locations, the one considered in Unicode 3.0 and the one considered in
sl@0	287	// Unicode 4.0. (there is a backwards compatibility problem in this standard).
sl@0	288
sl@0	289
sl@0	290	// xx independent character, such as a number, punctuation sign or non-khmer char
sl@0	291	//
sl@0	292	// c1 Khmer consonant of type 1 or an independent vowel
sl@0	293	// that is, a letter in which the subscript form is only under the
sl@0	294	// base, not taking any space to the right or to the left
sl@0	295	//
sl@0	296	// c2 Khmer consonant of type 2, the coeng form takes space under
sl@0	297	// and to the left of the base (only RO is of this type)
sl@0	298	//
sl@0	299	// c3 Khmer consonant of type 3. Its subscript form takes space under
sl@0	300	// and to the right of the base.
sl@0	301	//
sl@0	302	// cs Khmer consonant shifter
sl@0	303	//
sl@0	304	// rb Khmer robat
sl@0	305	//
sl@0	306	// co coeng character (u17D2)
sl@0	307	//
sl@0	308	// dv dependent vowel (including split vowels, they are treated in the same way).
sl@0	309	// even if dv is not defined above, the component that is really tested for is
sl@0	310	// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
sl@0	311	//
sl@0	312	// zwj Zero Width joiner
sl@0	313	//
sl@0	314	// zwnj Zero width non joiner
sl@0	315	//
sl@0	316	// sa above sign
sl@0	317	//
sl@0	318	// sp post sign
sl@0	319	//
sl@0	320	// there are lines with equal content but for an easier understanding
sl@0	321	// (and maybe change in the future) we did not join them
sl@0	322	//
sl@0	323	static const le_int8 khmerStateTable[][KhmerClassTable::CC_COUNT] = // [current state][character class] -> next state; a negative entry ends the syllable
sl@0	324	{
sl@0	325
sl@0	326	// xx c1 c2 c3 zwnj cs rb co dv sa sp zwj
sl@0	327	{ 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, // 0 - ground state
sl@0	328	{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 1 - exit state (or sign to the right of the syllable)
sl@0	329	{-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, // 2 - Base consonant
sl@0	330	{-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, // 3 - First ZWNJ before a register shifter
sl@0	331	// It can only be followed by a shifter or a vowel
sl@0	332	{-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, // 4 - First register shifter
sl@0	333	{-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, // 5 - Robat
sl@0	334	{-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, // 6 - First Coeng
sl@0	335	{-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, // 7 - First consonant of type 1 after coeng
sl@0	336	{-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, // 8 - First consonant of type 2 after coeng
sl@0	337	{-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, // 9 - First consonant of type 3 after coeng
sl@0	338	{-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, // 10 - Second Coeng (no register shifter before)
sl@0	339	{-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, // 11 - Second coeng consonant (or ind. vowel) no register shifter before
sl@0	340	{-1, -1, 1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, // 12 - Second ZWNJ before a register shifter
sl@0	341	{-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, // 13 - Second register shifter
sl@0	342	{-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, // 14 - ZWJ before vowel
sl@0	343	{-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, // 15 - ZWNJ before vowel
sl@0	344	{-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, // 16 - dependent vowel
sl@0	345	{-1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, // 17 - sign above
sl@0	346	{-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, // 18 - ZWJ after vowel
sl@0	347	{-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, // 19 - Third coeng
sl@0	348	{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, // 20 - dependent vowel after a Robat
sl@0	349
sl@0	350	};
sl@0	351
sl@0	352
sl@0	353	const LETag *KhmerReordering::getFeatureOrder() // returns the file-static featureOrder array, which ends with emptyTag
sl@0	354	{
sl@0	355	return featureOrder;
sl@0	356	}
sl@0	357
sl@0	358
sl@0	359	// Given an input string of characters and a position (prev) at which to start looking,
sl@0	360	// run the state table from the ground state and return the index one past the last
sl@0	361	// character of the syllable that starts at prev (charCount if the input ends first).
sl@0	362	le_int32 KhmerReordering::findSyllable(const KhmerClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount)
sl@0	363	{
sl@0	364	le_int32 cursor = prev;
sl@0	365	le_int8 state = 0; // row 0 of khmerStateTable is the ground state
sl@0	366
sl@0	367	while (cursor < charCount) {
sl@0	368	KhmerClassTable::CharClass charClass = (classTable->getCharClass(chars[cursor]) & KhmerClassTable::CF_CLASS_MASK); // drop the CF_* flags; only the class selects a column
sl@0	369
sl@0	370	state = khmerStateTable[state][charClass];
sl@0	371
sl@0	372	if (state < 0) { // this character cannot continue the syllable, so it is left for the next one
sl@0	373	break;
sl@0	374	}
sl@0	375
sl@0	376	cursor += 1;
sl@0	377	}
sl@0	378
sl@0	379	return cursor;
sl@0	380	}
sl@0	381
sl@0	382
sl@0	383	// This is the real reordering function as applied to the Khmer language. It splits chars into
sl@0	384	// syllables, writes each one reordered (with feature tags) to outChars/glyphStorage, and returns the output count.
sl@0	385	le_int32 KhmerReordering::reorder(const LEUnicode *chars, le_int32 charCount, le_int32 /*scriptCode*/,
sl@0	386	LEUnicode *outChars, LEGlyphStorage &glyphStorage)
sl@0	387	{
sl@0	388	const KhmerClassTable *classTable = KhmerClassTable::getKhmerClassTable();
sl@0	389
sl@0	390	ReorderingOutput output(outChars, glyphStorage);
sl@0	391	KhmerClassTable::CharClass charClass;
sl@0	392	le_int32 i, prev = 0, coengRo;
sl@0	393
sl@0	394
sl@0	395	// This loop only exits when we reach the end of a run, which may contain
sl@0	396	// several syllables.
sl@0	397	while (prev < charCount) {
sl@0	398	le_int32 syllable = findSyllable(classTable, chars, prev, charCount); // one past the end of this syllable
sl@0	399
sl@0	400	// write a pre vowel or the pre part of a split vowel first
sl@0	401	// and look out for coeng + ro. RO is the only consonant of type 2, and
sl@0	402	// therefore the only one that requires saving space before the base.
sl@0	403	coengRo = -1; // There is no Coeng Ro, if found this value will change
sl@0	404	for (i = prev; i < syllable; i += 1) {
sl@0	405	charClass = classTable->getCharClass(chars[i]);
sl@0	406
sl@0	407	// if a split vowel, write the pre part. In Khmer the pre part
sl@0	408	// is the same for all split vowels, same glyph as pre vowel C_VOWEL_E
sl@0	409	if (charClass & KhmerClassTable::CF_SPLIT_VOWEL) {
sl@0	410	output.writeChar(C_VOWEL_E, i, &tagPref[0]);
sl@0	411	break; // there can be only one vowel
sl@0	412	}
sl@0	413
sl@0	414	// if a vowel with pos before write it out
sl@0	415	if (charClass & KhmerClassTable::CF_POS_BEFORE) {
sl@0	416	output.writeChar(chars[i], i, &tagPref[0]);
sl@0	417	break; // there can be only one vowel
sl@0	418	}
sl@0	419
sl@0	420	// look for coeng + ro and remember position
sl@0	421	// works because coeng + ro is always in front of a vowel (if there is a vowel)
sl@0	422	// and because CC_CONSONANT2 is enough to identify it, as it is the only consonant
sl@0	423	// with this flag
sl@0	424	if ( (charClass & KhmerClassTable::CF_COENG) && (i + 1 < syllable) &&
sl@0	425	( (classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_CLASS_MASK) == KhmerClassTable::CC_CONSONANT2) )
sl@0	426	{
sl@0	427	coengRo = i;
sl@0	428	}
sl@0	429	}
sl@0	430
sl@0	431	// write coeng + ro if found
sl@0	432	if (coengRo > -1) {
sl@0	433	output.writeChar(C_COENG, coengRo, &tagPref[0]);
sl@0	434	output.writeChar(C_RO, coengRo + 1, &tagPref[0]);
sl@0	435	}
sl@0	436
sl@0	437	// shall we add a dotted circle?
sl@0	438	// If in the position in which the base should be (first char in the string) there is
sl@0	439	// a character that has the Dotted circle flag (a character that cannot be a base)
sl@0	440	// then write a dotted circle
sl@0	441	if (classTable->getCharClass(chars[prev]) & KhmerClassTable::CF_DOTTED_CIRCLE) {
sl@0	442	output.writeChar(C_DOTTED_CIRCLE, prev, &tagDefault[0]);
sl@0	443	}
sl@0	444
sl@0	445	// copy what is left to the output, skipping before vowels and coeng Ro if they are present
sl@0	446	for (i = prev; i < syllable; i += 1) {
sl@0	447	charClass = classTable->getCharClass(chars[i]);
sl@0	448
sl@0	449	// skip a before vowel, it was already processed
sl@0	450	if (charClass & KhmerClassTable::CF_POS_BEFORE) {
sl@0	451	continue;
sl@0	452	}
sl@0	453
sl@0	454	// skip coeng + ro, it was already processed
sl@0	455	if (i == coengRo) {
sl@0	456	i += 1; // also step over the RO that follows the coeng
sl@0	457	continue;
sl@0	458	}
sl@0	459
sl@0	460	switch (charClass & KhmerClassTable::CF_POS_MASK) {
sl@0	461	case KhmerClassTable::CF_POS_ABOVE :
sl@0	462	output.writeChar(chars[i], i, &tagAbvf[0]);
sl@0	463	break;
sl@0	464
sl@0	465	case KhmerClassTable::CF_POS_AFTER :
sl@0	466	output.writeChar(chars[i], i, &tagPstf[0]);
sl@0	467	break;
sl@0	468
sl@0	469	case KhmerClassTable::CF_POS_BELOW :
sl@0	470	output.writeChar(chars[i], i, &tagBlwf[0]);
sl@0	471	break;
sl@0	472
sl@0	473	default:
sl@0	474	// assign the correct flags to a coeng consonant
sl@0	475	// Consonants of type 3 are tagged as Post forms and those type 1 as below forms
sl@0	476	if ( (charClass & KhmerClassTable::CF_COENG) && i + 1 < syllable ) {
sl@0	477	if ( (classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_CLASS_MASK)
sl@0	478	== KhmerClassTable::CC_CONSONANT3) {
sl@0	479	output.writeChar(chars[i], i, &tagPstf[0]);
sl@0	480	i += 1;
sl@0	481	output.writeChar(chars[i], i, &tagPstf[0]);
sl@0	482	}
sl@0	483	else {
sl@0	484	output.writeChar(chars[i], i, &tagBlwf[0]);
sl@0	485	i += 1;
sl@0	486	output.writeChar(chars[i], i, &tagBlwf[0]);
sl@0	487	}
sl@0	488	break;
sl@0	489	}
sl@0	490	// if a shifter is followed by an above vowel change the shifter to below form,
sl@0	491	// an above vowel can have two possible positions i + 1 or i + 3
sl@0	492	// (position i+1 corresponds to unicode 3, position i+3 to Unicode 4)
sl@0	493	// and there is an extra rule for C_VOWEL_AA + C_SIGN_NIKAHIT also for two
sl@0	494	// different positions, right after the shifter or after a vowel (Unicode 4)
sl@0	495	if ( (charClass & KhmerClassTable::CF_SHIFTER) && (i + 1 < syllable) ) {
sl@0	496	if ((classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_ABOVE_VOWEL)
sl@0	497	|| (i + 2 < syllable
sl@0	498	&& (chars[i + 1] == C_VOWEL_AA) // compare code points: a masked class is a state-table column and can never equal a character
sl@0	499	&& (chars[i + 2] == C_SIGN_NIKAHIT))
sl@0	500	|| (i + 3 < syllable && (classTable->getCharClass(chars[i + 3]) & KhmerClassTable::CF_ABOVE_VOWEL))
sl@0	501	|| (i + 4 < syllable
sl@0	502	&& (chars[i + 3] == C_VOWEL_AA)
sl@0	503	&& (chars[i + 4] == C_SIGN_NIKAHIT) ) )
sl@0	504	{
sl@0	505	output.writeChar(chars[i], i, &tagBlwf[0]);
sl@0	506	break;
sl@0	507	}
sl@0	508
sl@0	509	}
sl@0	510	// default - any other characters
sl@0	511	output.writeChar(chars[i], i, &tagDefault[0]);
sl@0	512	break;
sl@0	513	} // switch
sl@0	514	} // for
sl@0	515
sl@0	516	prev = syllable; // move the pointer to the start of next syllable
sl@0	517	}
sl@0	518
sl@0	519	return output.getOutputIndex();
sl@0	520	}
sl@0	521
sl@0	522
sl@0	523	U_NAMESPACE_END

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--