os/textandloc/charconvfw/charconvplugins/src/shared/jisbase_shared.cpp
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/textandloc/charconvfw/charconvplugins/src/shared/jisbase_shared.cpp	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,393 @@
     1.4 +/*
     1.5 +* Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.6 +* All rights reserved.
     1.7 +* This component and the accompanying materials are made available
     1.8 +* under the terms of "Eclipse Public License v1.0"
     1.9 +* which accompanies this distribution, and is available
    1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.11 +*
    1.12 +* Initial Contributors:
    1.13 +* Nokia Corporation - initial contribution.
    1.14 +*
    1.15 +* Contributors:
    1.16 +*
    1.17 +* Description: 
    1.18 +*
    1.19 +*/
    1.20 +
    1.21 +
    1.22 +#include <e32std.h>
    1.23 +#include <charconv.h>
    1.24 +#include <convdata.h>
    1.25 +#include <convutils.h>
    1.26 +#include "jisx0201.h"
    1.27 +#include "jisx0208.h"
    1.28 +#include "jisx0212.h"
    1.29 +#include "jisbase.h"
    1.30 +
    1.31 +const TUint KControlCharacterEscape=0x1b;
    1.32 +const TUint KControlCharacterShiftOut=0x0e;
    1.33 +const TUint KControlCharacterShiftIn=0x0f;
    1.34 +const TUint KBitsForNonStandardStates=0x03;
    1.35 +
    1.36 +_LIT8(KLit8EscapeSequenceForJisRoman, "\x1b\x28\x4a");
    1.37 +_LIT8(KLit8EscapeSequenceForJisRomanIncorrect, "\x1b\x28\x48");
    1.38 +_LIT8(KLit8EscapeSequenceForAscii, "\x1b\x28\x42");
    1.39 +_LIT8(KLit8EscapeSequenceForHalfWidthKatakana, "\x1b\x28\x49");
    1.40 +_LIT8(KLit8EscapeSequenceForJisC6226_1978, "\x1b\x24\x40");
    1.41 +_LIT8(KLit8EscapeSequenceForJisX0208_1983, "\x1b\x24\x42");
    1.42 +_LIT8(KLit8EscapeSequenceForJisX0208_199x, "\x1b\x26\x40\x1b\x24\x42");
    1.43 +_LIT8(KLit8EscapeSequenceForJisX0212_1990, "\x1b\x24\x28\x44");
    1.44 +
    1.45 +typedef TInt (*FChangeState)(TInt aState);
    1.46 +typedef TInt (*FAppendConvertToUnicode)(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags);
    1.47 +
    1.48 +enum TNonStandardState // each of these values must fit into KBitsForNonStandardStates and each must also be non-zero
    1.49 +	{
    1.50 +	ENonStandardStateJis7=1,
    1.51 +	ENonStandardStateJis8
    1.52 +	};
    1.53 +
    1.54 +
    1.55 +LOCAL_D const SCnvConversionData::SVariableByteData::SRange halfWidthKatakana7VariableByteDataRange=
    1.56 +	{
    1.57 +	0x00,
    1.58 +	0xff,
    1.59 +	0,
    1.60 +	0
    1.61 +	};
    1.62 +
    1.63 +LOCAL_D const SCnvConversionData::SOneDirectionData::SRange halfWidthKatakana7ToUnicodeDataRange=
    1.64 +	{
    1.65 +	0x21,
    1.66 +	0x5f,
    1.67 +	SCnvConversionData::SOneDirectionData::SRange::EOffset,
    1.68 +	0,
    1.69 +	0,
    1.70 +		{
    1.71 +		STATIC_CAST(TUint, 65344),
    1.72 +		0
    1.73 +		}
    1.74 +	};
    1.75 +
    1.76 +LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToHalfWidthKatakana7DataRange=
    1.77 +	{
    1.78 +	0xff61,
    1.79 +	0xff9f,
    1.80 +	SCnvConversionData::SOneDirectionData::SRange::EOffset,
    1.81 +	1,
    1.82 +	0,
    1.83 +		{
    1.84 +		STATIC_CAST(TUint, -65344),
    1.85 +		0
    1.86 +		}
    1.87 +	};
    1.88 +
    1.89 +LOCAL_D const SCnvConversionData halfWidthKatakana7ConversionData=
    1.90 +	{
    1.91 +	SCnvConversionData::EUnspecified,
    1.92 +		{
    1.93 +		1,
    1.94 +		&halfWidthKatakana7VariableByteDataRange
    1.95 +		},
    1.96 +		{
    1.97 +		1,
    1.98 +		&halfWidthKatakana7ToUnicodeDataRange
    1.99 +		},
   1.100 +		{
   1.101 +		1,
   1.102 +		&unicodeToHalfWidthKatakana7DataRange
   1.103 +		}
   1.104 +	};
   1.105 +
   1.106 +#if defined(_DEBUG)
   1.107 +
   1.108 +_LIT(KLitPanicText, "JISBASE_SHARED");
   1.109 +
   1.110 +enum TPanic
   1.111 +	{
   1.112 +	EPanicNotAppending1=1,
   1.113 +	EPanicNotAppending2,
   1.114 +	EPanicNotAppending3,
   1.115 +	EPanicBadNonStandardState,
   1.116 +	EPanicBadPointers1,
   1.117 +	EPanicBadPointers2,
   1.118 +	EPanicBadPointers3,
   1.119 +	EPanicBadPointers4,
   1.120 +	EPanicBadFunctionPointer
   1.121 +	};
   1.122 +
   1.123 +LOCAL_C void Panic(TPanic aPanic)
   1.124 +	{
   1.125 +	User::Panic(KLitPanicText, aPanic);
   1.126 +	}
   1.127 +
   1.128 +#endif
   1.129 +
   1.130 +TInt CnvJisBase::ChangeToNonStandardStateJis7(TInt aState)
   1.131 +	{
   1.132 +	return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis7;
   1.133 +	}
   1.134 +
   1.135 +TInt CnvJisBase::ChangeToNonStandardStateJis8(TInt aState)
   1.136 +	{
   1.137 +	return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis8;
   1.138 +	}
   1.139 +
   1.140 +TInt CnvJisBase::ChangeToStandardState(TInt)
   1.141 +	{
   1.142 +	return CCnvCharacterSetConverter::KStateDefault; // I actually thought that the correct behaviour for this would be to return "aState&~KBitsForNonStandardStates", but I asked Ken Lunde about it in an email and he said that after a run of JIS7 or JIS8, the bytes should always be interpreted as JIS-Roman
   1.143 +	}
   1.144 +
   1.145 +TInt CnvJisBase::AppendConvertToUnicodeFromModalForeign(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aModalForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
   1.146 +	{
   1.147 +	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending1));
   1.148 +	return CnvUtilities::ConvertToUnicodeFromModalForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aModalForeign, aState, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aArrayOfStates, aOutputConversionFlags, aInputConversionFlags);
   1.149 +	}
   1.150 +
   1.151 +TInt CnvJisBase::AppendConvertToUnicodeFromJis7(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis7, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
   1.152 +	{
   1.153 +	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending2));
   1.154 +	return CCnvCharacterSetConverter::DoConvertToUnicode(halfWidthKatakana7ConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, aJis7, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
   1.155 +	}
   1.156 +
   1.157 +TInt CnvJisBase::AppendConvertToUnicodeFromJis8(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis8, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
   1.158 +	{
   1.159 +	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending3));
   1.160 +	return CCnvCharacterSetConverter::DoConvertToUnicode(CnvHalfWidthKatakana8::ConversionData(), aDefaultEndiannessOfForeignCharacters, aUnicode, aJis8, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
   1.161 +	}
   1.162 +
   1.163 +EXPORT_C TInt CnvJisBase::ConvertToUnicode(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
   1.164 +	{
   1.165 +	TFixedArray<CnvUtilities::SState, 8> states;
   1.166 +	states[0].iEscapeSequence=&KLit8EscapeSequenceForJisRoman; // Jis-Roman is the default state, so it must come first in the array
   1.167 +	states[0].iConversionData=&CnvJisRoman::ConversionData();
   1.168 +	states[1].iEscapeSequence=&KLit8EscapeSequenceForJisRomanIncorrect;
   1.169 +	states[1].iConversionData=&CnvJisRoman::ConversionData();	
   1.170 +	states[2].iEscapeSequence=&KLit8EscapeSequenceForAscii;
   1.171 +	states[2].iConversionData=&CCnvCharacterSetConverter::AsciiConversionData();
   1.172 +	states[3].iEscapeSequence=&KLit8EscapeSequenceForHalfWidthKatakana;
   1.173 +	states[3].iConversionData=&halfWidthKatakana7ConversionData;
   1.174 +	states[4].iEscapeSequence=&KLit8EscapeSequenceForJisC6226_1978;
   1.175 +	states[4].iConversionData=&CnvJisX0208::ConversionData();
   1.176 +	states[5].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_1983;
   1.177 +	states[5].iConversionData=&CnvJisX0208::ConversionData();
   1.178 +	states[6].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_199x;
   1.179 +	states[6].iConversionData=&CnvJisX0208::ConversionData();
   1.180 +	states[7].iEscapeSequence=&KLit8EscapeSequenceForJisX0212_1990;
   1.181 +	states[7].iConversionData=&CnvJisX0212::ConversionData();
   1.182 +	const TArray<CnvUtilities::SState> arrayOfStates(states.Array());
   1.183 +	aUnicode.SetLength(0);
   1.184 +	const TUint8* const pointerToFirstByte=aForeign.Ptr();
   1.185 +	const TUint8* pointerToCurrentByte=pointerToFirstByte;
   1.186 +	const TUint8* pointerToStartOfNextRunToConvert=pointerToFirstByte;
   1.187 +	const TUint8* const pointerToLastByte=pointerToFirstByte+(aForeign.Length()-1);
   1.188 +	TUint outputConversionFlags=0;
   1.189 +	TUint inputConversionFlags=CCnvCharacterSetConverter::EInputConversionFlagAppend;
   1.190 +	FOREVER
   1.191 +		{
   1.192 +		FChangeState changeState=NULL;
   1.193 +		FAppendConvertToUnicode appendConvertToUnicode=NULL;
   1.194 +		TBool skipThisByte=EFalse;
   1.195 +		const TUint currentByte=*pointerToCurrentByte;
   1.196 +		switch (aState&KBitsForNonStandardStates)
   1.197 +			{
   1.198 +		case 0:
   1.199 +			if (currentByte==KControlCharacterShiftOut)
   1.200 +				{
   1.201 +				changeState=ChangeToNonStandardStateJis7;
   1.202 +				skipThisByte=ETrue;
   1.203 +				}
   1.204 +			else if (currentByte&0x80)
   1.205 +				{
   1.206 +				changeState=ChangeToNonStandardStateJis8;
   1.207 +				}
   1.208 +			appendConvertToUnicode=AppendConvertToUnicodeFromModalForeign;
   1.209 +			break;
   1.210 +		case ENonStandardStateJis7:
   1.211 +			if (currentByte==KControlCharacterEscape)
   1.212 +				{
   1.213 +				changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
   1.214 +				}
   1.215 +			else if (currentByte==KControlCharacterShiftIn)
   1.216 +				{
   1.217 +				changeState=ChangeToStandardState;
   1.218 +				skipThisByte=ETrue;
   1.219 +				}
   1.220 +			else if (currentByte&0x80)
   1.221 +				{
   1.222 +				changeState=ChangeToNonStandardStateJis8;
   1.223 +				}
   1.224 +			appendConvertToUnicode=AppendConvertToUnicodeFromJis7;
   1.225 +			break;
   1.226 +		case ENonStandardStateJis8:
   1.227 +			if (currentByte==KControlCharacterEscape)
   1.228 +				{
   1.229 +				changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
   1.230 +				}
   1.231 +			else if (currentByte==KControlCharacterShiftOut)
   1.232 +				{
   1.233 +				changeState=ChangeToNonStandardStateJis7;
   1.234 +				skipThisByte=ETrue;
   1.235 +				}
   1.236 +			else if ((currentByte&0x80)==0)
   1.237 +				{
   1.238 +				changeState=ChangeToStandardState;
   1.239 +				}
   1.240 +			appendConvertToUnicode=AppendConvertToUnicodeFromJis8;
   1.241 +			break;
   1.242 +#if defined(_DEBUG)
   1.243 +		default:
   1.244 +			Panic(EPanicBadNonStandardState);
   1.245 +			break;
   1.246 +#endif
   1.247 +			}
   1.248 +		__ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers1));
   1.249 +		if ((pointerToCurrentByte>=pointerToLastByte) || (changeState!=NULL))
   1.250 +			{
   1.251 +			TBool lastIteration=EFalse;
   1.252 +			__ASSERT_DEBUG(pointerToCurrentByte>=pointerToStartOfNextRunToConvert, Panic(EPanicBadPointers2));
   1.253 +			if (changeState==NULL)
   1.254 +				{
   1.255 +				++pointerToCurrentByte; // this may make pointerToCurrentByte greater than pointerToLastByte
   1.256 +				lastIteration=ETrue;
   1.257 +				}
   1.258 +			if (pointerToCurrentByte>pointerToStartOfNextRunToConvert)
   1.259 +				{
   1.260 +				TPtrC8 runToConvert(pointerToStartOfNextRunToConvert, pointerToCurrentByte-pointerToStartOfNextRunToConvert);
   1.261 +				TInt numberOfUnconvertibleCharacters;
   1.262 +				TInt indexOfFirstByteOfFirstUnconvertibleCharacter;
   1.263 +				__ASSERT_DEBUG(appendConvertToUnicode!=NULL, Panic(EPanicBadFunctionPointer));
   1.264 +				const TInt returnValue=(*appendConvertToUnicode)(aDefaultEndiannessOfForeignCharacters, aUnicode, runToConvert, aState, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, arrayOfStates, outputConversionFlags, inputConversionFlags);
   1.265 +				if (returnValue<0)
   1.266 +					{
   1.267 +					return returnValue; // this is an error-code
   1.268 +					}
   1.269 +				if (numberOfUnconvertibleCharacters>0)
   1.270 +					{
   1.271 +					if (aNumberOfUnconvertibleCharacters==0)
   1.272 +						{
   1.273 +						aIndexOfFirstByteOfFirstUnconvertibleCharacter=(pointerToStartOfNextRunToConvert-pointerToFirstByte)+indexOfFirstByteOfFirstUnconvertibleCharacter;
   1.274 +						}
   1.275 +					aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters;
   1.276 +					}
   1.277 +				if (returnValue>0)
   1.278 +					{
   1.279 +					pointerToCurrentByte-=returnValue; // pointerToStartOfNextRunToConvert (which also needs adjusting in the same way) gets set below
   1.280 +					lastIteration=ETrue;
   1.281 +					changeState=NULL;
   1.282 +					skipThisByte=EFalse;
   1.283 +					}
   1.284 +				__ASSERT_DEBUG(pointerToCurrentByte>=pointerToFirstByte, Panic(EPanicBadPointers3));
   1.285 +				if (pointerToCurrentByte>pointerToFirstByte)
   1.286 +					{
   1.287 +					inputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
   1.288 +					}
   1.289 +				}
   1.290 +			if (changeState!=NULL)
   1.291 +				{
   1.292 +				aState=(*changeState)(aState);
   1.293 +				}
   1.294 +			if (skipThisByte)
   1.295 +				{
   1.296 +				if (pointerToCurrentByte==pointerToLastByte) // pointerToCurrentByte may already be greater than pointerToLastByte, in which case lastIteration will already be ETrue
   1.297 +					{
   1.298 +					lastIteration=ETrue;
   1.299 +					}
   1.300 +				++pointerToCurrentByte;
   1.301 +				}
   1.302 +			pointerToStartOfNextRunToConvert=pointerToCurrentByte;
   1.303 +			if (lastIteration) // check this first as pointerToCurrentByte may be greater than pointerToLastByte (but it will only be if lastIteration is EFalse)
   1.304 +				{
   1.305 +				break;
   1.306 +				}
   1.307 +			__ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers4));
   1.308 +			if (pointerToCurrentByte>=pointerToLastByte)
   1.309 +				{
   1.310 +				break;
   1.311 +				}
   1.312 +			}
   1.313 +		++pointerToCurrentByte;
   1.314 +		}
   1.315 +	// no checking with outputConversionFlags need to be done here
   1.316 +	return pointerToLastByte-(pointerToCurrentByte-1);
   1.317 +	}
   1.318 +
   1.319 +EXPORT_C const SCnvConversionData& CnvJisBase::HalfWidthKatakana7ConversionData()
   1.320 +	{
   1.321 +	return halfWidthKatakana7ConversionData;
   1.322 +	}
   1.323 +
   1.324 +EXPORT_C void CnvJisBase::IsCharacterJISBased(TInt& aConfidenceLevel, const TDesC8& aSample) 
   1.325 +	{
   1.326 +	// JIS is modal... so start off with a confidence of 0 and to begin with look 
   1.327 +	// for JIS escape sequences....Escape sequences defined above in the KLITs
   1.328 +	// For each escape sequence, increase the confidenceLevel ..... 
   1.329 +	aConfidenceLevel = 55;
   1.330 +	TInt jisRomanResult = 0;
   1.331 +	TInt asciiResult = 0;
   1.332 +	TInt jisX0208Result = 0;
   1.333 +	TInt jisC6226Result = 0;
   1.334 +	TInt jixX0212Result = 0;
   1.335 +	TInt hwKanaResult = 0;
   1.336 +
   1.337 +	TInt EscSequences = 0;
   1.338 +	
   1.339 +	TInt sampleLength = aSample.Length();
   1.340 +	for (TInt i = 0; i < sampleLength; ++i)
   1.341 +		{
   1.342 +	
   1.343 +		// JIS is 7 bit encoding
   1.344 +		if((aSample[i]&0x80)!=0x00)
   1.345 +			{
   1.346 +			aConfidenceLevel=0;
   1.347 +			break;
   1.348 +			}
   1.349 +		// JIS supports the following character sets 
   1.350 +		if (i > jisC6226Result)
   1.351 +			{
   1.352 +			jisC6226Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisC6226_1978);
   1.353 +			if (jisC6226Result!=KErrNotFound)
   1.354 +				EscSequences += 15; 
   1.355 +			}
   1.356 +
   1.357 +		if (i > jisRomanResult)
   1.358 +			{
   1.359 +			jisRomanResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisRoman);
   1.360 +			if (jisRomanResult!=KErrNotFound)
   1.361 +				EscSequences += 15; 
   1.362 +			}
   1.363 +
   1.364 +		if (i > asciiResult)
   1.365 +			{
   1.366 +			asciiResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForAscii);
   1.367 +			if (asciiResult!=KErrNotFound)
   1.368 +				EscSequences += 15; 
   1.369 +			}
   1.370 +
   1.371 +		if (i > jisX0208Result)
   1.372 +			{
   1.373 +			jisX0208Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0208_1983);
   1.374 +			if (jisX0208Result!=KErrNotFound)
   1.375 +				EscSequences += 15; 
   1.376 +			}
   1.377 +
   1.378 +		if (i > jixX0212Result)
   1.379 +			{
   1.380 +			jixX0212Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0212_1990);
   1.381 +			if (jixX0212Result!=KErrNotFound)
   1.382 +				EscSequences += 15; 
   1.383 +			}
   1.384 +
   1.385 +		if (i > hwKanaResult)
   1.386 +			{
   1.387 +			hwKanaResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForHalfWidthKatakana);
   1.388 +			if (hwKanaResult!=KErrNotFound)
   1.389 +				EscSequences += 15; 
   1.390 +			}
   1.391 +		}
   1.392 +
   1.393 +	aConfidenceLevel = 0 < sampleLength?
   1.394 +		aConfidenceLevel + ((EscSequences*100)/sampleLength) : 90;
   1.395 +	aConfidenceLevel=(aConfidenceLevel >100)?100:aConfidenceLevel;
   1.396 +	}