1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/textandloc/charconvfw/charconvplugins/src/shared/jisbase_shared.cpp Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,393 @@
1.4 +/*
1.5 +* Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
1.6 +* All rights reserved.
1.7 +* This component and the accompanying materials are made available
1.8 +* under the terms of "Eclipse Public License v1.0"
1.9 +* which accompanies this distribution, and is available
1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
1.11 +*
1.12 +* Initial Contributors:
1.13 +* Nokia Corporation - initial contribution.
1.14 +*
1.15 +* Contributors:
1.16 +*
1.17 +* Description:
1.18 +*
1.19 +*/
1.20 +
1.21 +
1.22 +#include <e32std.h>
1.23 +#include <charconv.h>
1.24 +#include <convdata.h>
1.25 +#include <convutils.h>
1.26 +#include "jisx0201.h"
1.27 +#include "jisx0208.h"
1.28 +#include "jisx0212.h"
1.29 +#include "jisbase.h"
1.30 +
1.31 +const TUint KControlCharacterEscape=0x1b;
1.32 +const TUint KControlCharacterShiftOut=0x0e;
1.33 +const TUint KControlCharacterShiftIn=0x0f;
1.34 +const TUint KBitsForNonStandardStates=0x03;
1.35 +
1.36 +_LIT8(KLit8EscapeSequenceForJisRoman, "\x1b\x28\x4a");
1.37 +_LIT8(KLit8EscapeSequenceForJisRomanIncorrect, "\x1b\x28\x48");
1.38 +_LIT8(KLit8EscapeSequenceForAscii, "\x1b\x28\x42");
1.39 +_LIT8(KLit8EscapeSequenceForHalfWidthKatakana, "\x1b\x28\x49");
1.40 +_LIT8(KLit8EscapeSequenceForJisC6226_1978, "\x1b\x24\x40");
1.41 +_LIT8(KLit8EscapeSequenceForJisX0208_1983, "\x1b\x24\x42");
1.42 +_LIT8(KLit8EscapeSequenceForJisX0208_199x, "\x1b\x26\x40\x1b\x24\x42");
1.43 +_LIT8(KLit8EscapeSequenceForJisX0212_1990, "\x1b\x24\x28\x44");
1.44 +
1.45 +typedef TInt (*FChangeState)(TInt aState);
1.46 +typedef TInt (*FAppendConvertToUnicode)(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags);
1.47 +
// Values stored in the low bits of the conversion state while a JIS7 or JIS8 run is
// being read. Every value must be non-zero (zero means "no non-standard state") and
// must fit inside the KBitsForNonStandardStates mask.
enum TNonStandardState
	{
	ENonStandardStateJis7=1,
	ENonStandardStateJis8=2
	};
1.53 +
1.54 +
1.55 +LOCAL_D const SCnvConversionData::SVariableByteData::SRange halfWidthKatakana7VariableByteDataRange=
1.56 + {
1.57 + 0x00,
1.58 + 0xff,
1.59 + 0,
1.60 + 0
1.61 + };
1.62 +
1.63 +LOCAL_D const SCnvConversionData::SOneDirectionData::SRange halfWidthKatakana7ToUnicodeDataRange=
1.64 + {
1.65 + 0x21,
1.66 + 0x5f,
1.67 + SCnvConversionData::SOneDirectionData::SRange::EOffset,
1.68 + 0,
1.69 + 0,
1.70 + {
1.71 + STATIC_CAST(TUint, 65344),
1.72 + 0
1.73 + }
1.74 + };
1.75 +
1.76 +LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToHalfWidthKatakana7DataRange=
1.77 + {
1.78 + 0xff61,
1.79 + 0xff9f,
1.80 + SCnvConversionData::SOneDirectionData::SRange::EOffset,
1.81 + 1,
1.82 + 0,
1.83 + {
1.84 + STATIC_CAST(TUint, -65344),
1.85 + 0
1.86 + }
1.87 + };
1.88 +
1.89 +LOCAL_D const SCnvConversionData halfWidthKatakana7ConversionData=
1.90 + {
1.91 + SCnvConversionData::EUnspecified,
1.92 + {
1.93 + 1,
1.94 + &halfWidthKatakana7VariableByteDataRange
1.95 + },
1.96 + {
1.97 + 1,
1.98 + &halfWidthKatakana7ToUnicodeDataRange
1.99 + },
1.100 + {
1.101 + 1,
1.102 + &unicodeToHalfWidthKatakana7DataRange
1.103 + }
1.104 + };
1.105 +
#if defined(_DEBUG)

// Panic category used by the debug-only assertions in this file.
_LIT(KLitPanicText, "JISBASE_SHARED");

// Reason codes raised under the KLitPanicText category.
enum TPanic
	{
	EPanicNotAppending1=1,
	EPanicNotAppending2=2,
	EPanicNotAppending3=3,
	EPanicBadNonStandardState=4,
	EPanicBadPointers1=5,
	EPanicBadPointers2=6,
	EPanicBadPointers3=7,
	EPanicBadPointers4=8,
	EPanicBadFunctionPointer=9
	};

// Panics the current thread with the given reason code.
LOCAL_C void Panic(TPanic aPanic)
	{
	const TInt reason=aPanic;
	User::Panic(KLitPanicText, reason);
	}

#endif
1.129 +
1.130 +TInt CnvJisBase::ChangeToNonStandardStateJis7(TInt aState)
1.131 + {
1.132 + return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis7;
1.133 + }
1.134 +
1.135 +TInt CnvJisBase::ChangeToNonStandardStateJis8(TInt aState)
1.136 + {
1.137 + return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis8;
1.138 + }
1.139 +
1.140 +TInt CnvJisBase::ChangeToStandardState(TInt)
1.141 + {
1.142 + return CCnvCharacterSetConverter::KStateDefault; // I actually thought that the correct behaviour for this would be to return "aState&~KBitsForNonStandardStates", but I asked Ken Lunde about it in an email and he said that after a run of JIS7 or JIS8, the bytes should always be interpreted as JIS-Roman
1.143 + }
1.144 +
1.145 +TInt CnvJisBase::AppendConvertToUnicodeFromModalForeign(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aModalForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
1.146 + {
1.147 + __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending1));
1.148 + return CnvUtilities::ConvertToUnicodeFromModalForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aModalForeign, aState, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aArrayOfStates, aOutputConversionFlags, aInputConversionFlags);
1.149 + }
1.150 +
1.151 +TInt CnvJisBase::AppendConvertToUnicodeFromJis7(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis7, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
1.152 + {
1.153 + __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending2));
1.154 + return CCnvCharacterSetConverter::DoConvertToUnicode(halfWidthKatakana7ConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, aJis7, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
1.155 + }
1.156 +
1.157 +TInt CnvJisBase::AppendConvertToUnicodeFromJis8(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis8, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
1.158 + {
1.159 + __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending3));
1.160 + return CCnvCharacterSetConverter::DoConvertToUnicode(CnvHalfWidthKatakana8::ConversionData(), aDefaultEndiannessOfForeignCharacters, aUnicode, aJis8, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
1.161 + }
1.162 +
1.163 +EXPORT_C TInt CnvJisBase::ConvertToUnicode(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
1.164 + {
1.165 + TFixedArray<CnvUtilities::SState, 8> states;
1.166 + states[0].iEscapeSequence=&KLit8EscapeSequenceForJisRoman; // Jis-Roman is the default state, so it must come first in the array
1.167 + states[0].iConversionData=&CnvJisRoman::ConversionData();
1.168 + states[1].iEscapeSequence=&KLit8EscapeSequenceForJisRomanIncorrect;
1.169 + states[1].iConversionData=&CnvJisRoman::ConversionData();
1.170 + states[2].iEscapeSequence=&KLit8EscapeSequenceForAscii;
1.171 + states[2].iConversionData=&CCnvCharacterSetConverter::AsciiConversionData();
1.172 + states[3].iEscapeSequence=&KLit8EscapeSequenceForHalfWidthKatakana;
1.173 + states[3].iConversionData=&halfWidthKatakana7ConversionData;
1.174 + states[4].iEscapeSequence=&KLit8EscapeSequenceForJisC6226_1978;
1.175 + states[4].iConversionData=&CnvJisX0208::ConversionData();
1.176 + states[5].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_1983;
1.177 + states[5].iConversionData=&CnvJisX0208::ConversionData();
1.178 + states[6].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_199x;
1.179 + states[6].iConversionData=&CnvJisX0208::ConversionData();
1.180 + states[7].iEscapeSequence=&KLit8EscapeSequenceForJisX0212_1990;
1.181 + states[7].iConversionData=&CnvJisX0212::ConversionData();
1.182 + const TArray<CnvUtilities::SState> arrayOfStates(states.Array());
1.183 + aUnicode.SetLength(0);
1.184 + const TUint8* const pointerToFirstByte=aForeign.Ptr();
1.185 + const TUint8* pointerToCurrentByte=pointerToFirstByte;
1.186 + const TUint8* pointerToStartOfNextRunToConvert=pointerToFirstByte;
1.187 + const TUint8* const pointerToLastByte=pointerToFirstByte+(aForeign.Length()-1);
1.188 + TUint outputConversionFlags=0;
1.189 + TUint inputConversionFlags=CCnvCharacterSetConverter::EInputConversionFlagAppend;
1.190 + FOREVER
1.191 + {
1.192 + FChangeState changeState=NULL;
1.193 + FAppendConvertToUnicode appendConvertToUnicode=NULL;
1.194 + TBool skipThisByte=EFalse;
1.195 + const TUint currentByte=*pointerToCurrentByte;
1.196 + switch (aState&KBitsForNonStandardStates)
1.197 + {
1.198 + case 0:
1.199 + if (currentByte==KControlCharacterShiftOut)
1.200 + {
1.201 + changeState=ChangeToNonStandardStateJis7;
1.202 + skipThisByte=ETrue;
1.203 + }
1.204 + else if (currentByte&0x80)
1.205 + {
1.206 + changeState=ChangeToNonStandardStateJis8;
1.207 + }
1.208 + appendConvertToUnicode=AppendConvertToUnicodeFromModalForeign;
1.209 + break;
1.210 + case ENonStandardStateJis7:
1.211 + if (currentByte==KControlCharacterEscape)
1.212 + {
1.213 + changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
1.214 + }
1.215 + else if (currentByte==KControlCharacterShiftIn)
1.216 + {
1.217 + changeState=ChangeToStandardState;
1.218 + skipThisByte=ETrue;
1.219 + }
1.220 + else if (currentByte&0x80)
1.221 + {
1.222 + changeState=ChangeToNonStandardStateJis8;
1.223 + }
1.224 + appendConvertToUnicode=AppendConvertToUnicodeFromJis7;
1.225 + break;
1.226 + case ENonStandardStateJis8:
1.227 + if (currentByte==KControlCharacterEscape)
1.228 + {
1.229 + changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
1.230 + }
1.231 + else if (currentByte==KControlCharacterShiftOut)
1.232 + {
1.233 + changeState=ChangeToNonStandardStateJis7;
1.234 + skipThisByte=ETrue;
1.235 + }
1.236 + else if ((currentByte&0x80)==0)
1.237 + {
1.238 + changeState=ChangeToStandardState;
1.239 + }
1.240 + appendConvertToUnicode=AppendConvertToUnicodeFromJis8;
1.241 + break;
1.242 +#if defined(_DEBUG)
1.243 + default:
1.244 + Panic(EPanicBadNonStandardState);
1.245 + break;
1.246 +#endif
1.247 + }
1.248 + __ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers1));
1.249 + if ((pointerToCurrentByte>=pointerToLastByte) || (changeState!=NULL))
1.250 + {
1.251 + TBool lastIteration=EFalse;
1.252 + __ASSERT_DEBUG(pointerToCurrentByte>=pointerToStartOfNextRunToConvert, Panic(EPanicBadPointers2));
1.253 + if (changeState==NULL)
1.254 + {
1.255 + ++pointerToCurrentByte; // this may make pointerToCurrentByte greater than pointerToLastByte
1.256 + lastIteration=ETrue;
1.257 + }
1.258 + if (pointerToCurrentByte>pointerToStartOfNextRunToConvert)
1.259 + {
1.260 + TPtrC8 runToConvert(pointerToStartOfNextRunToConvert, pointerToCurrentByte-pointerToStartOfNextRunToConvert);
1.261 + TInt numberOfUnconvertibleCharacters;
1.262 + TInt indexOfFirstByteOfFirstUnconvertibleCharacter;
1.263 + __ASSERT_DEBUG(appendConvertToUnicode!=NULL, Panic(EPanicBadFunctionPointer));
1.264 + const TInt returnValue=(*appendConvertToUnicode)(aDefaultEndiannessOfForeignCharacters, aUnicode, runToConvert, aState, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, arrayOfStates, outputConversionFlags, inputConversionFlags);
1.265 + if (returnValue<0)
1.266 + {
1.267 + return returnValue; // this is an error-code
1.268 + }
1.269 + if (numberOfUnconvertibleCharacters>0)
1.270 + {
1.271 + if (aNumberOfUnconvertibleCharacters==0)
1.272 + {
1.273 + aIndexOfFirstByteOfFirstUnconvertibleCharacter=(pointerToStartOfNextRunToConvert-pointerToFirstByte)+indexOfFirstByteOfFirstUnconvertibleCharacter;
1.274 + }
1.275 + aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters;
1.276 + }
1.277 + if (returnValue>0)
1.278 + {
1.279 + pointerToCurrentByte-=returnValue; // pointerToStartOfNextRunToConvert (which also needs adjusting in the same way) gets set below
1.280 + lastIteration=ETrue;
1.281 + changeState=NULL;
1.282 + skipThisByte=EFalse;
1.283 + }
1.284 + __ASSERT_DEBUG(pointerToCurrentByte>=pointerToFirstByte, Panic(EPanicBadPointers3));
1.285 + if (pointerToCurrentByte>pointerToFirstByte)
1.286 + {
1.287 + inputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
1.288 + }
1.289 + }
1.290 + if (changeState!=NULL)
1.291 + {
1.292 + aState=(*changeState)(aState);
1.293 + }
1.294 + if (skipThisByte)
1.295 + {
1.296 + if (pointerToCurrentByte==pointerToLastByte) // pointerToCurrentByte may already be greater than pointerToLastByte, in which case lastIteration will already be ETrue
1.297 + {
1.298 + lastIteration=ETrue;
1.299 + }
1.300 + ++pointerToCurrentByte;
1.301 + }
1.302 + pointerToStartOfNextRunToConvert=pointerToCurrentByte;
1.303 + if (lastIteration) // check this first as pointerToCurrentByte may be greater than pointerToLastByte (but it will only be if lastIteration is EFalse)
1.304 + {
1.305 + break;
1.306 + }
1.307 + __ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers4));
1.308 + if (pointerToCurrentByte>=pointerToLastByte)
1.309 + {
1.310 + break;
1.311 + }
1.312 + }
1.313 + ++pointerToCurrentByte;
1.314 + }
1.315 + // no checking with outputConversionFlags need to be done here
1.316 + return pointerToLastByte-(pointerToCurrentByte-1);
1.317 + }
1.318 +
1.319 +EXPORT_C const SCnvConversionData& CnvJisBase::HalfWidthKatakana7ConversionData()
1.320 + {
1.321 + return halfWidthKatakana7ConversionData;
1.322 + }
1.323 +
1.324 +EXPORT_C void CnvJisBase::IsCharacterJISBased(TInt& aConfidenceLevel, const TDesC8& aSample)
1.325 + {
1.326 + // JIS is modal... so start off with a confidence of 0 and to begin with look
1.327 + // for JIS escape sequences....Escape sequences defined above in the KLITs
1.328 + // For each escape sequence, increase the confidenceLevel .....
1.329 + aConfidenceLevel = 55;
1.330 + TInt jisRomanResult = 0;
1.331 + TInt asciiResult = 0;
1.332 + TInt jisX0208Result = 0;
1.333 + TInt jisC6226Result = 0;
1.334 + TInt jixX0212Result = 0;
1.335 + TInt hwKanaResult = 0;
1.336 +
1.337 + TInt EscSequences = 0;
1.338 +
1.339 + TInt sampleLength = aSample.Length();
1.340 + for (TInt i = 0; i < sampleLength; ++i)
1.341 + {
1.342 +
1.343 + // JIS is 7 bit encoding
1.344 + if((aSample[i]&0x80)!=0x00)
1.345 + {
1.346 + aConfidenceLevel=0;
1.347 + break;
1.348 + }
1.349 + // JIS supports the following character sets
1.350 + if (i > jisC6226Result)
1.351 + {
1.352 + jisC6226Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisC6226_1978);
1.353 + if (jisC6226Result!=KErrNotFound)
1.354 + EscSequences += 15;
1.355 + }
1.356 +
1.357 + if (i > jisRomanResult)
1.358 + {
1.359 + jisRomanResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisRoman);
1.360 + if (jisRomanResult!=KErrNotFound)
1.361 + EscSequences += 15;
1.362 + }
1.363 +
1.364 + if (i > asciiResult)
1.365 + {
1.366 + asciiResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForAscii);
1.367 + if (asciiResult!=KErrNotFound)
1.368 + EscSequences += 15;
1.369 + }
1.370 +
1.371 + if (i > jisX0208Result)
1.372 + {
1.373 + jisX0208Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0208_1983);
1.374 + if (jisX0208Result!=KErrNotFound)
1.375 + EscSequences += 15;
1.376 + }
1.377 +
1.378 + if (i > jixX0212Result)
1.379 + {
1.380 + jixX0212Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0212_1990);
1.381 + if (jixX0212Result!=KErrNotFound)
1.382 + EscSequences += 15;
1.383 + }
1.384 +
1.385 + if (i > hwKanaResult)
1.386 + {
1.387 + hwKanaResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForHalfWidthKatakana);
1.388 + if (hwKanaResult!=KErrNotFound)
1.389 + EscSequences += 15;
1.390 + }
1.391 + }
1.392 +
1.393 + aConfidenceLevel = 0 < sampleLength?
1.394 + aConfidenceLevel + ((EscSequences*100)/sampleLength) : 90;
1.395 + aConfidenceLevel=(aConfidenceLevel >100)?100:aConfidenceLevel;
1.396 + }