First public contribution.
2 * Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
4 * This component and the accompanying materials are made available
5 * under the terms of "Eclipse Public License v1.0"
6 * which accompanies this distribution, and is available
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
9 * Initial Contributors:
10 * Nokia Corporation - initial contribution.
22 #include <convutils.h>
// Control bytes that CnvJisBase::ConvertToUnicode tests the current input byte against.
28 const TUint KControlCharacterEscape=0x1b; // ESC - first byte of every escape sequence literal below
29 const TUint KControlCharacterShiftOut=0x0e; // SO - selects ChangeToNonStandardStateJis7 in ConvertToUnicode
30 const TUint KControlCharacterShiftIn=0x0f; // SI - selects ChangeToStandardState while in the JIS7 state
// Mask for the bits of the conversion state that hold a TNonStandardState value.
31 const TUint KBitsForNonStandardStates=0x03;
// Escape sequences recognised in the foreign byte stream; CnvJisBase::ConvertToUnicode pairs each one with conversion data.
33 _LIT8(KLit8EscapeSequenceForJisRoman, "\x1b\x28\x4a"); // ESC ( J
34 _LIT8(KLit8EscapeSequenceForJisRomanIncorrect, "\x1b\x28\x48"); // ESC ( H - also mapped to the JIS-Roman conversion data
35 _LIT8(KLit8EscapeSequenceForAscii, "\x1b\x28\x42"); // ESC ( B
36 _LIT8(KLit8EscapeSequenceForHalfWidthKatakana, "\x1b\x28\x49"); // ESC ( I
37 _LIT8(KLit8EscapeSequenceForJisC6226_1978, "\x1b\x24\x40"); // ESC $ @
38 _LIT8(KLit8EscapeSequenceForJisX0208_1983, "\x1b\x24\x42"); // ESC $ B
39 _LIT8(KLit8EscapeSequenceForJisX0208_199x, "\x1b\x26\x40\x1b\x24\x42"); // ESC & @ followed by ESC $ B
40 _LIT8(KLit8EscapeSequenceForJisX0212_1990, "\x1b\x24\x28\x44"); // ESC $ ( D
// Pointer to a function that maps the current conversion state to the next one (the CnvJisBase::ChangeTo... functions).
42 typedef TInt (*FChangeState)(TInt aState);
// Pointer to a function that converts one run of foreign bytes and appends the result to aUnicode (the CnvJisBase::AppendConvertToUnicodeFrom... functions).
43 typedef TInt (*FAppendConvertToUnicode)(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags);
// States entered on shift-out / 8-bit bytes rather than via an escape sequence.
// The value is OR-ed into the KBitsForNonStandardStates bits of the conversion state by the ChangeToNonStandardState... functions.
45 enum TNonStandardState // each of these values must fit into KBitsForNonStandardStates and each must also be non-zero
47 ENonStandardStateJis7=1,
// Conversion tables for 7-bit half-width katakana, assembled into halfWidthKatakana7ConversionData at the end.
// Byte-length description of the foreign characters.
52 LOCAL_D const SCnvConversionData::SVariableByteData::SRange halfWidthKatakana7VariableByteDataRange=
// Foreign -> Unicode range, using the EOffset algorithm.
60 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange halfWidthKatakana7ToUnicodeDataRange=
64 SCnvConversionData::SOneDirectionData::SRange::EOffset,
// 65344 == 0xff40; presumably the offset applied to the foreign code - confirm against the SRange layout
68 STATIC_CAST(TUint, 65344),
// Unicode -> foreign range, again EOffset, with the negated offset.
73 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToHalfWidthKatakana7DataRange=
77 SCnvConversionData::SOneDirectionData::SRange::EOffset,
81 STATIC_CAST(TUint, -65344),
// The conversion data object used for the JIS7 state and for the half-width katakana escape sequence; it refers to the three ranges above.
86 LOCAL_D const SCnvConversionData halfWidthKatakana7ConversionData=
88 SCnvConversionData::EUnspecified,
91 &halfWidthKatakana7VariableByteDataRange
95 &halfWidthKatakana7ToUnicodeDataRange
99 &unicodeToHalfWidthKatakana7DataRange
// Panic category text handed to User::Panic by Panic() below.
105 _LIT(KLitPanicText, "JISBASE_SHARED");
// Panic reason codes used by this file's debug assertions.
109 EPanicNotAppending1=1,
112 EPanicBadNonStandardState,
117 EPanicBadFunctionPointer
120 LOCAL_C void Panic(TPanic aPanic)
122 User::Panic(KLitPanicText, aPanic);
127 TInt CnvJisBase::ChangeToNonStandardStateJis7(TInt aState)
129 return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis7;
132 TInt CnvJisBase::ChangeToNonStandardStateJis8(TInt aState)
134 return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis8;
137 TInt CnvJisBase::ChangeToStandardState(TInt)
139 return CCnvCharacterSetConverter::KStateDefault; // I actually thought that the correct behaviour for this would be to return "aState&~KBitsForNonStandardStates", but I asked Ken Lunde about it in an email and he said that after a run of JIS7 or JIS8, the bytes should always be interpreted as JIS-Roman
142 TInt CnvJisBase::AppendConvertToUnicodeFromModalForeign(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aModalForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
144 __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending1));
145 return CnvUtilities::ConvertToUnicodeFromModalForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aModalForeign, aState, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aArrayOfStates, aOutputConversionFlags, aInputConversionFlags);
148 TInt CnvJisBase::AppendConvertToUnicodeFromJis7(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis7, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
150 __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending2));
151 return CCnvCharacterSetConverter::DoConvertToUnicode(halfWidthKatakana7ConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, aJis7, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
154 TInt CnvJisBase::AppendConvertToUnicodeFromJis8(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis8, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
156 __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending3));
157 return CCnvCharacterSetConverter::DoConvertToUnicode(CnvHalfWidthKatakana8::ConversionData(), aDefaultEndiannessOfForeignCharacters, aUnicode, aJis8, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
// Converts JIS-encoded bytes in aForeign to Unicode in aUnicode.
//
// The input is walked one byte at a time and split into runs. Each run is converted by the
// function that matches the non-standard state held in the KBitsForNonStandardStates bits of aState:
// AppendConvertToUnicodeFromModalForeign, ...FromJis7 or ...FromJis8. A run ends at the last
// byte of the input or at a byte that requires a state change.
//
// aUnicode is emptied first and then appended to by each run.
// aState is read to pick the current mode and updated on every state change.
// aNumberOfUnconvertibleCharacters has each run's count added to it.
// aIndexOfFirstByteOfFirstUnconvertibleCharacter is written (as an index into aForeign) only while the running count is still zero.
//
// Returns the value computed on the last line from the final pointer positions (the bytes from
// pointerToCurrentByte-1 onwards up to the last byte), or the value returned by a run's
// converter on the path commented "this is an error-code".
160 EXPORT_C TInt CnvJisBase::ConvertToUnicode(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
// Table pairing each recognised escape sequence with the conversion data to use after it.
162 TFixedArray<CnvUtilities::SState, 8> states;
163 states[0].iEscapeSequence=&KLit8EscapeSequenceForJisRoman; // Jis-Roman is the default state, so it must come first in the array
164 states[0].iConversionData=&CnvJisRoman::ConversionData();
// the "incorrect" JIS-Roman escape sequence is accepted and treated exactly like the correct one
165 states[1].iEscapeSequence=&KLit8EscapeSequenceForJisRomanIncorrect;
166 states[1].iConversionData=&CnvJisRoman::ConversionData();
167 states[2].iEscapeSequence=&KLit8EscapeSequenceForAscii;
168 states[2].iConversionData=&CCnvCharacterSetConverter::AsciiConversionData();
169 states[3].iEscapeSequence=&KLit8EscapeSequenceForHalfWidthKatakana;
170 states[3].iConversionData=&halfWidthKatakana7ConversionData;
// JIS C 6226-1978, JIS X 0208-1983 and JIS X 0208-199x all share the CnvJisX0208 conversion data
171 states[4].iEscapeSequence=&KLit8EscapeSequenceForJisC6226_1978;
172 states[4].iConversionData=&CnvJisX0208::ConversionData();
173 states[5].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_1983;
174 states[5].iConversionData=&CnvJisX0208::ConversionData();
175 states[6].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_199x;
176 states[6].iConversionData=&CnvJisX0208::ConversionData();
177 states[7].iEscapeSequence=&KLit8EscapeSequenceForJisX0212_1990;
178 states[7].iConversionData=&CnvJisX0212::ConversionData();
179 const TArray<CnvUtilities::SState> arrayOfStates(states.Array());
// the output starts empty; every run below is appended to it
180 aUnicode.SetLength(0);
181 const TUint8* const pointerToFirstByte=aForeign.Ptr();
182 const TUint8* pointerToCurrentByte=pointerToFirstByte;
183 const TUint8* pointerToStartOfNextRunToConvert=pointerToFirstByte;
// NOTE(review): for an empty aForeign this points one byte before the buffer and the byte read below would be out of range - confirm callers never pass an empty descriptor
184 const TUint8* const pointerToLastByte=pointerToFirstByte+(aForeign.Length()-1);
185 TUint outputConversionFlags=0;
// each run is converted in append mode (the AppendConvertToUnicodeFrom... functions assert this flag in debug builds)
186 TUint inputConversionFlags=CCnvCharacterSetConverter::EInputConversionFlagAppend;
// Per-byte decisions: changeState stays NULL while the current byte belongs to the current run;
// appendConvertToUnicode is the converter for the run that is currently being collected.
189 FChangeState changeState=NULL;
190 FAppendConvertToUnicode appendConvertToUnicode=NULL;
191 TBool skipThisByte=EFalse;
192 const TUint currentByte=*pointerToCurrentByte;
// dispatch on the non-standard state held in the KBitsForNonStandardStates bits of aState
193 switch (aState&KBitsForNonStandardStates)
// first branch: shift-out starts a JIS7 run, a byte with the top bit set starts a JIS8 run
196 if (currentByte==KControlCharacterShiftOut)
198 changeState=ChangeToNonStandardStateJis7;
201 else if (currentByte&0x80)
203 changeState=ChangeToNonStandardStateJis8;
205 appendConvertToUnicode=AppendConvertToUnicodeFromModalForeign;
// inside a JIS7 run: escape or shift-in ends it, a byte with the top bit set switches to JIS8
207 case ENonStandardStateJis7:
208 if (currentByte==KControlCharacterEscape)
210 changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
212 else if (currentByte==KControlCharacterShiftIn)
214 changeState=ChangeToStandardState;
217 else if (currentByte&0x80)
219 changeState=ChangeToNonStandardStateJis8;
221 appendConvertToUnicode=AppendConvertToUnicodeFromJis7;
// inside a JIS8 run: escape or any byte with the top bit clear ends it, shift-out switches to JIS7
223 case ENonStandardStateJis8:
224 if (currentByte==KControlCharacterEscape)
226 changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
228 else if (currentByte==KControlCharacterShiftOut)
230 changeState=ChangeToNonStandardStateJis7;
233 else if ((currentByte&0x80)==0)
235 changeState=ChangeToStandardState;
237 appendConvertToUnicode=AppendConvertToUnicodeFromJis8;
241 Panic(EPanicBadNonStandardState);
245 __ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers1));
// flush the collected run when the input is exhausted or a state change is pending
246 if ((pointerToCurrentByte>=pointerToLastByte) || (changeState!=NULL))
248 TBool lastIteration=EFalse;
249 __ASSERT_DEBUG(pointerToCurrentByte>=pointerToStartOfNextRunToConvert, Panic(EPanicBadPointers2));
// no state change pending: the current byte is part of the run, so step past it before measuring the run
250 if (changeState==NULL)
252 ++pointerToCurrentByte; // this may make pointerToCurrentByte greater than pointerToLastByte
// only call the converter for a non-empty run
255 if (pointerToCurrentByte>pointerToStartOfNextRunToConvert)
257 TPtrC8 runToConvert(pointerToStartOfNextRunToConvert, pointerToCurrentByte-pointerToStartOfNextRunToConvert);
258 TInt numberOfUnconvertibleCharacters;
259 TInt indexOfFirstByteOfFirstUnconvertibleCharacter;
260 __ASSERT_DEBUG(appendConvertToUnicode!=NULL, Panic(EPanicBadFunctionPointer));
261 const TInt returnValue=(*appendConvertToUnicode)(aDefaultEndiannessOfForeignCharacters, aUnicode, runToConvert, aState, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, arrayOfStates, outputConversionFlags, inputConversionFlags);
264 return returnValue; // this is an error-code
// fold this run's unconvertible-character report into the caller's totals
// NOTE(review): aNumberOfUnconvertibleCharacters is tested and added to here; no reset is visible in this function - confirm it is zeroed before this point
266 if (numberOfUnconvertibleCharacters>0)
268 if (aNumberOfUnconvertibleCharacters==0)
// translate the run-relative index into an index into aForeign
270 aIndexOfFirstByteOfFirstUnconvertibleCharacter=(pointerToStartOfNextRunToConvert-pointerToFirstByte)+indexOfFirstByteOfFirstUnconvertibleCharacter;
272 aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters;
// step back by the converter's return value (presumably the count of bytes at the end of the run it left unconverted - confirm)
276 pointerToCurrentByte-=returnValue; // pointerToStartOfNextRunToConvert (which also needs adjusting in the same way) gets set below
281 __ASSERT_DEBUG(pointerToCurrentByte>=pointerToFirstByte, Panic(EPanicBadPointers3));
// once the position has moved past the first byte, later runs are converted with the extra truncated-input flag
282 if (pointerToCurrentByte>pointerToFirstByte)
284 inputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
// apply the pending state change now that the previous run has been converted
287 if (changeState!=NULL)
289 aState=(*changeState)(aState);
293 if (pointerToCurrentByte==pointerToLastByte) // pointerToCurrentByte may already be greater than pointerToLastByte, in which case lastIteration will already be ETrue
297 ++pointerToCurrentByte;
// the next run starts at the current position
299 pointerToStartOfNextRunToConvert=pointerToCurrentByte;
300 if (lastIteration) // check this first as pointerToCurrentByte may be greater than pointerToLastByte (but it will only be if lastIteration is EFalse)
304 __ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers4));
305 if (pointerToCurrentByte>=pointerToLastByte)
310 ++pointerToCurrentByte;
312 // no checking with outputConversionFlags need to be done here
// equals the number of bytes from pointerToCurrentByte-1 through pointerToLastByte, minus one
313 return pointerToLastByte-(pointerToCurrentByte-1);
316 EXPORT_C const SCnvConversionData& CnvJisBase::HalfWidthKatakana7ConversionData()
318 return halfWidthKatakana7ConversionData;
// Estimates how likely it is that aSample is JIS-encoded and writes the estimate to aConfidenceLevel.
//
// The sample is scanned byte by byte, searching the remainder of the sample for each known
// JIS escape sequence. The final value is the starting confidence plus
// (EscSequences*100)/sampleLength, capped at 100. An empty sample yields 90.
321 EXPORT_C void CnvJisBase::IsCharacterJISBased(TInt& aConfidenceLevel, const TDesC8& aSample)
323 // JIS is modal, so look for the JIS escape sequences defined by the
324 // KLit8EscapeSequenceFor... literals above. Note that the confidence starts at 55, not 0.
325 // For each escape sequence, increase the confidenceLevel .....
326 aConfidenceLevel = 55;
// one "last search result" per escape sequence; each holds the value last returned by Find()
327 TInt jisRomanResult = 0;
328 TInt asciiResult = 0;
329 TInt jisX0208Result = 0;
330 TInt jisC6226Result = 0;
331 TInt jixX0212Result = 0;
332 TInt hwKanaResult = 0;
// score that is turned into a percentage of the sample length at the end
334 TInt EscSequences = 0;
336 TInt sampleLength = aSample.Length();
337 for (TInt i = 0; i < sampleLength; ++i)
340 // JIS is 7 bit encoding
// a byte with the top bit set is tested for here
341 if((aSample[i]&0x80)!=0x00)
346 // JIS supports the following character sets
// NOTE(review): Find() is applied to aSample.Right(sampleLength-i), so its result is an offset within that
// tail (or KErrNotFound), yet it is compared with the absolute index i - confirm this is intended
347 if (i > jisC6226Result)
349 jisC6226Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisC6226_1978);
350 if (jisC6226Result!=KErrNotFound)
354 if (i > jisRomanResult)
356 jisRomanResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisRoman);
357 if (jisRomanResult!=KErrNotFound)
363 asciiResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForAscii);
364 if (asciiResult!=KErrNotFound)
368 if (i > jisX0208Result)
370 jisX0208Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0208_1983);
371 if (jisX0208Result!=KErrNotFound)
375 if (i > jixX0212Result)
377 jixX0212Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0212_1990);
378 if (jixX0212Result!=KErrNotFound)
382 if (i > hwKanaResult)
384 hwKanaResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForHalfWidthKatakana);
385 if (hwKanaResult!=KErrNotFound)
// scale the score by the sample length; the sampleLength test also guards the division against an empty sample
390 aConfidenceLevel = 0 < sampleLength?
391 aConfidenceLevel + ((EscSequences*100)/sampleLength) : 90;
// clamp to the maximum confidence of 100
392 aConfidenceLevel=(aConfidenceLevel >100)?100:aConfidenceLevel;