sl@0: /* sl@0: * Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). sl@0: * All rights reserved. sl@0: * This component and the accompanying materials are made available sl@0: * under the terms of "Eclipse Public License v1.0" sl@0: * which accompanies this distribution, and is available sl@0: * at the URL "http://www.eclipse.org/legal/epl-v10.html". sl@0: * sl@0: * Initial Contributors: sl@0: * Nokia Corporation - initial contribution. sl@0: * sl@0: * Contributors: sl@0: * sl@0: * Description: sl@0: * sl@0: */ sl@0: sl@0: sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: #include "jisx0201.h" sl@0: #include "jisx0208.h" sl@0: #include "jisx0212.h" sl@0: #include "jisbase.h" sl@0: sl@0: const TUint KControlCharacterEscape=0x1b; sl@0: const TUint KControlCharacterShiftOut=0x0e; sl@0: const TUint KControlCharacterShiftIn=0x0f; sl@0: const TUint KBitsForNonStandardStates=0x03; sl@0: sl@0: _LIT8(KLit8EscapeSequenceForJisRoman, "\x1b\x28\x4a"); sl@0: _LIT8(KLit8EscapeSequenceForJisRomanIncorrect, "\x1b\x28\x48"); sl@0: _LIT8(KLit8EscapeSequenceForAscii, "\x1b\x28\x42"); sl@0: _LIT8(KLit8EscapeSequenceForHalfWidthKatakana, "\x1b\x28\x49"); sl@0: _LIT8(KLit8EscapeSequenceForJisC6226_1978, "\x1b\x24\x40"); sl@0: _LIT8(KLit8EscapeSequenceForJisX0208_1983, "\x1b\x24\x42"); sl@0: _LIT8(KLit8EscapeSequenceForJisX0208_199x, "\x1b\x26\x40\x1b\x24\x42"); sl@0: _LIT8(KLit8EscapeSequenceForJisX0212_1990, "\x1b\x24\x28\x44"); sl@0: sl@0: typedef TInt (*FChangeState)(TInt aState); sl@0: typedef TInt (*FAppendConvertToUnicode)(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags); sl@0: sl@0: enum TNonStandardState // each of these values must fit into KBitsForNonStandardStates and each must also be non-zero sl@0: { sl@0: ENonStandardStateJis7=1, sl@0: ENonStandardStateJis8 sl@0: }; sl@0: sl@0: sl@0: LOCAL_D const SCnvConversionData::SVariableByteData::SRange halfWidthKatakana7VariableByteDataRange= sl@0: { sl@0: 0x00, sl@0: 0xff, sl@0: 0, sl@0: 0 sl@0: }; sl@0: sl@0: LOCAL_D const SCnvConversionData::SOneDirectionData::SRange halfWidthKatakana7ToUnicodeDataRange= sl@0: { sl@0: 0x21, sl@0: 0x5f, sl@0: SCnvConversionData::SOneDirectionData::SRange::EOffset, sl@0: 0, sl@0: 0, sl@0: { sl@0: STATIC_CAST(TUint, 65344), sl@0: 0 sl@0: } sl@0: }; sl@0: sl@0: LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToHalfWidthKatakana7DataRange= sl@0: { sl@0: 0xff61, sl@0: 0xff9f, sl@0: SCnvConversionData::SOneDirectionData::SRange::EOffset, sl@0: 1, sl@0: 0, sl@0: { sl@0: STATIC_CAST(TUint, -65344), sl@0: 0 sl@0: } sl@0: }; sl@0: sl@0: LOCAL_D const SCnvConversionData halfWidthKatakana7ConversionData= sl@0: { sl@0: SCnvConversionData::EUnspecified, sl@0: { sl@0: 1, sl@0: &halfWidthKatakana7VariableByteDataRange sl@0: }, sl@0: { sl@0: 1, sl@0: &halfWidthKatakana7ToUnicodeDataRange sl@0: }, sl@0: { sl@0: 1, sl@0: &unicodeToHalfWidthKatakana7DataRange sl@0: } sl@0: }; sl@0: sl@0: #if defined(_DEBUG) sl@0: sl@0: _LIT(KLitPanicText, "JISBASE_SHARED"); sl@0: sl@0: enum TPanic sl@0: { sl@0: EPanicNotAppending1=1, sl@0: EPanicNotAppending2, sl@0: EPanicNotAppending3, sl@0: EPanicBadNonStandardState, sl@0: EPanicBadPointers1, sl@0: EPanicBadPointers2, sl@0: EPanicBadPointers3, sl@0: EPanicBadPointers4, sl@0: EPanicBadFunctionPointer sl@0: }; sl@0: sl@0: LOCAL_C void Panic(TPanic aPanic) sl@0: { sl@0: User::Panic(KLitPanicText, aPanic); sl@0: } sl@0: sl@0: #endif sl@0: sl@0: TInt CnvJisBase::ChangeToNonStandardStateJis7(TInt aState) sl@0: { sl@0: return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis7; sl@0: } sl@0: sl@0: TInt CnvJisBase::ChangeToNonStandardStateJis8(TInt aState) sl@0: { sl@0: return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis8; sl@0: } sl@0: sl@0: TInt CnvJisBase::ChangeToStandardState(TInt) sl@0: { sl@0: return CCnvCharacterSetConverter::KStateDefault; // I actually thought that the correct behaviour for this would be to return "aState&~KBitsForNonStandardStates", but I asked Ken Lunde about it in an email and he said that after a run of JIS7 or JIS8, the bytes should always be interpreted as JIS-Roman sl@0: } sl@0: sl@0: TInt CnvJisBase::AppendConvertToUnicodeFromModalForeign(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aModalForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags) sl@0: { sl@0: __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending1)); sl@0: return CnvUtilities::ConvertToUnicodeFromModalForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aModalForeign, aState, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aArrayOfStates, aOutputConversionFlags, aInputConversionFlags); sl@0: } sl@0: sl@0: TInt CnvJisBase::AppendConvertToUnicodeFromJis7(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis7, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray&, TUint& aOutputConversionFlags, TUint aInputConversionFlags) sl@0: { sl@0: __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending2)); sl@0: return CCnvCharacterSetConverter::DoConvertToUnicode(halfWidthKatakana7ConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, aJis7, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags); sl@0: } sl@0: sl@0: TInt CnvJisBase::AppendConvertToUnicodeFromJis8(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis8, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray&, TUint& aOutputConversionFlags, TUint aInputConversionFlags) sl@0: { sl@0: __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending3)); sl@0: return CCnvCharacterSetConverter::DoConvertToUnicode(CnvHalfWidthKatakana8::ConversionData(), aDefaultEndiannessOfForeignCharacters, aUnicode, aJis8, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags); sl@0: } sl@0: sl@0: EXPORT_C TInt CnvJisBase::ConvertToUnicode(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) sl@0: { sl@0: TFixedArray states; sl@0: states[0].iEscapeSequence=&KLit8EscapeSequenceForJisRoman; // Jis-Roman is the default state, so it must come first in the array sl@0: states[0].iConversionData=&CnvJisRoman::ConversionData(); sl@0: states[1].iEscapeSequence=&KLit8EscapeSequenceForJisRomanIncorrect; sl@0: states[1].iConversionData=&CnvJisRoman::ConversionData(); sl@0: states[2].iEscapeSequence=&KLit8EscapeSequenceForAscii; sl@0: states[2].iConversionData=&CCnvCharacterSetConverter::AsciiConversionData(); sl@0: states[3].iEscapeSequence=&KLit8EscapeSequenceForHalfWidthKatakana; sl@0: states[3].iConversionData=&halfWidthKatakana7ConversionData; sl@0: states[4].iEscapeSequence=&KLit8EscapeSequenceForJisC6226_1978; sl@0: states[4].iConversionData=&CnvJisX0208::ConversionData(); sl@0: states[5].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_1983; sl@0: states[5].iConversionData=&CnvJisX0208::ConversionData(); sl@0: states[6].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_199x; sl@0: states[6].iConversionData=&CnvJisX0208::ConversionData(); sl@0: states[7].iEscapeSequence=&KLit8EscapeSequenceForJisX0212_1990; sl@0: states[7].iConversionData=&CnvJisX0212::ConversionData(); sl@0: const TArray arrayOfStates(states.Array()); sl@0: aUnicode.SetLength(0); sl@0: const TUint8* const pointerToFirstByte=aForeign.Ptr(); sl@0: const TUint8* pointerToCurrentByte=pointerToFirstByte; sl@0: const TUint8* pointerToStartOfNextRunToConvert=pointerToFirstByte; sl@0: const TUint8* const pointerToLastByte=pointerToFirstByte+(aForeign.Length()-1); sl@0: TUint outputConversionFlags=0; sl@0: TUint inputConversionFlags=CCnvCharacterSetConverter::EInputConversionFlagAppend; sl@0: FOREVER sl@0: { sl@0: FChangeState changeState=NULL; sl@0: FAppendConvertToUnicode appendConvertToUnicode=NULL; sl@0: TBool skipThisByte=EFalse; sl@0: const TUint currentByte=*pointerToCurrentByte; sl@0: switch (aState&KBitsForNonStandardStates) sl@0: { sl@0: case 0: sl@0: if (currentByte==KControlCharacterShiftOut) sl@0: { sl@0: changeState=ChangeToNonStandardStateJis7; sl@0: skipThisByte=ETrue; sl@0: } sl@0: else if (currentByte&0x80) sl@0: { sl@0: changeState=ChangeToNonStandardStateJis8; sl@0: } sl@0: appendConvertToUnicode=AppendConvertToUnicodeFromModalForeign; sl@0: break; sl@0: case ENonStandardStateJis7: sl@0: if (currentByte==KControlCharacterEscape) sl@0: { sl@0: changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes sl@0: } sl@0: else if (currentByte==KControlCharacterShiftIn) sl@0: { sl@0: changeState=ChangeToStandardState; sl@0: skipThisByte=ETrue; sl@0: } sl@0: else if (currentByte&0x80) sl@0: { sl@0: changeState=ChangeToNonStandardStateJis8; sl@0: } sl@0: appendConvertToUnicode=AppendConvertToUnicodeFromJis7; sl@0: break; sl@0: case ENonStandardStateJis8: sl@0: if (currentByte==KControlCharacterEscape) sl@0: { sl@0: changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes sl@0: } sl@0: else if (currentByte==KControlCharacterShiftOut) sl@0: { sl@0: changeState=ChangeToNonStandardStateJis7; sl@0: skipThisByte=ETrue; sl@0: } sl@0: else if ((currentByte&0x80)==0) sl@0: { sl@0: changeState=ChangeToStandardState; sl@0: } sl@0: appendConvertToUnicode=AppendConvertToUnicodeFromJis8; sl@0: break; sl@0: #if defined(_DEBUG) sl@0: default: sl@0: Panic(EPanicBadNonStandardState); sl@0: break; sl@0: #endif sl@0: } sl@0: __ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers1)); sl@0: if ((pointerToCurrentByte>=pointerToLastByte) || (changeState!=NULL)) sl@0: { sl@0: TBool lastIteration=EFalse; sl@0: __ASSERT_DEBUG(pointerToCurrentByte>=pointerToStartOfNextRunToConvert, Panic(EPanicBadPointers2)); sl@0: if (changeState==NULL) sl@0: { sl@0: ++pointerToCurrentByte; // this may make pointerToCurrentByte greater than pointerToLastByte sl@0: lastIteration=ETrue; sl@0: } sl@0: if (pointerToCurrentByte>pointerToStartOfNextRunToConvert) sl@0: { sl@0: TPtrC8 runToConvert(pointerToStartOfNextRunToConvert, pointerToCurrentByte-pointerToStartOfNextRunToConvert); sl@0: TInt numberOfUnconvertibleCharacters; sl@0: TInt indexOfFirstByteOfFirstUnconvertibleCharacter; sl@0: __ASSERT_DEBUG(appendConvertToUnicode!=NULL, Panic(EPanicBadFunctionPointer)); sl@0: const TInt returnValue=(*appendConvertToUnicode)(aDefaultEndiannessOfForeignCharacters, aUnicode, runToConvert, aState, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, arrayOfStates, outputConversionFlags, inputConversionFlags); sl@0: if (returnValue<0) sl@0: { sl@0: return returnValue; // this is an error-code sl@0: } sl@0: if (numberOfUnconvertibleCharacters>0) sl@0: { sl@0: if (aNumberOfUnconvertibleCharacters==0) sl@0: { sl@0: aIndexOfFirstByteOfFirstUnconvertibleCharacter=(pointerToStartOfNextRunToConvert-pointerToFirstByte)+indexOfFirstByteOfFirstUnconvertibleCharacter; sl@0: } sl@0: aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters; sl@0: } sl@0: if (returnValue>0) sl@0: { sl@0: pointerToCurrentByte-=returnValue; // pointerToStartOfNextRunToConvert (which also needs adjusting in the same way) gets set below sl@0: lastIteration=ETrue; sl@0: changeState=NULL; sl@0: skipThisByte=EFalse; sl@0: } sl@0: __ASSERT_DEBUG(pointerToCurrentByte>=pointerToFirstByte, Panic(EPanicBadPointers3)); sl@0: if (pointerToCurrentByte>pointerToFirstByte) sl@0: { sl@0: inputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable; sl@0: } sl@0: } sl@0: if (changeState!=NULL) sl@0: { sl@0: aState=(*changeState)(aState); sl@0: } sl@0: if (skipThisByte) sl@0: { sl@0: if (pointerToCurrentByte==pointerToLastByte) // pointerToCurrentByte may already be greater than pointerToLastByte, in which case lastIteration will already be ETrue sl@0: { sl@0: lastIteration=ETrue; sl@0: } sl@0: ++pointerToCurrentByte; sl@0: } sl@0: pointerToStartOfNextRunToConvert=pointerToCurrentByte; sl@0: if (lastIteration) // check this first as pointerToCurrentByte may be greater than pointerToLastByte (but it will only be if lastIteration is EFalse) sl@0: { sl@0: break; sl@0: } sl@0: __ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers4)); sl@0: if (pointerToCurrentByte>=pointerToLastByte) sl@0: { sl@0: break; sl@0: } sl@0: } sl@0: ++pointerToCurrentByte; sl@0: } sl@0: // no checking with outputConversionFlags need to be done here sl@0: return pointerToLastByte-(pointerToCurrentByte-1); sl@0: } sl@0: sl@0: EXPORT_C const SCnvConversionData& CnvJisBase::HalfWidthKatakana7ConversionData() sl@0: { sl@0: return halfWidthKatakana7ConversionData; sl@0: } sl@0: sl@0: EXPORT_C void CnvJisBase::IsCharacterJISBased(TInt& aConfidenceLevel, const TDesC8& aSample) sl@0: { sl@0: // JIS is modal... so start off with a confidence of 0 and to begin with look sl@0: // for JIS escape sequences....Escape sequences defined above in the KLITs sl@0: // For each escape sequence, increase the confidenceLevel ..... sl@0: aConfidenceLevel = 55; sl@0: TInt jisRomanResult = 0; sl@0: TInt asciiResult = 0; sl@0: TInt jisX0208Result = 0; sl@0: TInt jisC6226Result = 0; sl@0: TInt jixX0212Result = 0; sl@0: TInt hwKanaResult = 0; sl@0: sl@0: TInt EscSequences = 0; sl@0: sl@0: TInt sampleLength = aSample.Length(); sl@0: for (TInt i = 0; i < sampleLength; ++i) sl@0: { sl@0: sl@0: // JIS is 7 bit encoding sl@0: if((aSample[i]&0x80)!=0x00) sl@0: { sl@0: aConfidenceLevel=0; sl@0: break; sl@0: } sl@0: // JIS supports the following character sets sl@0: if (i > jisC6226Result) sl@0: { sl@0: jisC6226Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisC6226_1978); sl@0: if (jisC6226Result!=KErrNotFound) sl@0: EscSequences += 15; sl@0: } sl@0: sl@0: if (i > jisRomanResult) sl@0: { sl@0: jisRomanResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisRoman); sl@0: if (jisRomanResult!=KErrNotFound) sl@0: EscSequences += 15; sl@0: } sl@0: sl@0: if (i > asciiResult) sl@0: { sl@0: asciiResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForAscii); sl@0: if (asciiResult!=KErrNotFound) sl@0: EscSequences += 15; sl@0: } sl@0: sl@0: if (i > jisX0208Result) sl@0: { sl@0: jisX0208Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0208_1983); sl@0: if (jisX0208Result!=KErrNotFound) sl@0: EscSequences += 15; sl@0: } sl@0: sl@0: if (i > jixX0212Result) sl@0: { sl@0: jixX0212Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0212_1990); sl@0: if (jixX0212Result!=KErrNotFound) sl@0: EscSequences += 15; sl@0: } sl@0: sl@0: if (i > hwKanaResult) sl@0: { sl@0: hwKanaResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForHalfWidthKatakana); sl@0: if (hwKanaResult!=KErrNotFound) sl@0: EscSequences += 15; sl@0: } sl@0: } sl@0: sl@0: aConfidenceLevel = 0 < sampleLength? sl@0: aConfidenceLevel + ((EscSequences*100)/sampleLength) : 90; sl@0: aConfidenceLevel=(aConfidenceLevel >100)?100:aConfidenceLevel; sl@0: }