os/textandloc/charconvfw/charconvplugins/src/shared/JISBASE_SHARED_2.CPP
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 * Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description:       
    15 *
    16 */
    17 
    18 
    19 #include "PictographObserver.h"
    20 #include <e32std.h>
    21 #include <charconv.h>
    22 #include <convdata.h>
    23 #include <convutils.h>
    24 #include "jisx0201.h"
    25 #include "jisx0208.h"
    26 #include "jisx0212.h"
    27 #include "jisbase.h"
    28 #include "featmgr/featmgr.h"
    29 
    30 const TUint KControlCharacterEscape=0x1b;
    31 const TUint KControlCharacterShiftOut=0x0e;
    32 const TUint KControlCharacterShiftIn=0x0f;
    33 const TUint KBitsForNonStandardStates=0x03;
    34 
    35 _LIT8(KLit8EscapeSequenceForJisRoman, "\x1b\x28\x4a");
    36 _LIT8(KLit8EscapeSequenceForJisRomanIncorrect, "\x1b\x28\x48");
    37 _LIT8(KLit8EscapeSequenceForAscii, "\x1b\x28\x42");
    38 _LIT8(KLit8EscapeSequenceForHalfWidthKatakana, "\x1b\x28\x49");
    39 _LIT8(KLit8EscapeSequenceForJisC6226_1978, "\x1b\x24\x40");
    40 _LIT8(KLit8EscapeSequenceForJisX0208_1983, "\x1b\x24\x42");
    41 _LIT8(KLit8EscapeSequenceForJisX0208_199x, "\x1b\x26\x40\x1b\x24\x42");
    42 _LIT8(KLit8EscapeSequenceForJisX0212_1990, "\x1b\x24\x28\x44");
    43 
    44 typedef TInt (*FChangeState)(TInt aState);
    45 typedef TInt (*FAppendConvertToUnicode)(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags);
    46 
    47 enum TNonStandardState // each of these values must fit into KBitsForNonStandardStates and each must also be non-zero
    48 	{
    49 	ENonStandardStateJis7=1,
    50 	ENonStandardStateJis8
    51 	};
    52 
    53 
    54 LOCAL_D const SCnvConversionData::SVariableByteData::SRange halfWidthKatakana7VariableByteDataRange=
    55 	{
    56 	0x00,
    57 	0xff,
    58 	0,
    59 	0
    60 	};
    61 
    62 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange halfWidthKatakana7ToUnicodeDataRange=
    63 	{
    64 	0x21,
    65 	0x5f,
    66 	SCnvConversionData::SOneDirectionData::SRange::EOffset,
    67 	0,
    68 	0,
    69 		{
    70 		STATIC_CAST(TUint, 65344),
    71 		0
    72 		}
    73 	};
    74 
    75 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToHalfWidthKatakana7DataRange=
    76 	{
    77 	0xff61,
    78 	0xff9f,
    79 	SCnvConversionData::SOneDirectionData::SRange::EOffset,
    80 	1,
    81 	0,
    82 		{
    83 		STATIC_CAST(TUint, -65344),
    84 		0
    85 		}
    86 	};
    87 
    88 LOCAL_D const SCnvConversionData halfWidthKatakana7ConversionData=
    89 	{
    90 	SCnvConversionData::EUnspecified,
    91 		{
    92 		1,
    93 		&halfWidthKatakana7VariableByteDataRange
    94 		},
    95 		{
    96 		1,
    97 		&halfWidthKatakana7ToUnicodeDataRange
    98 		},
    99 		{
   100 		1,
   101 		&unicodeToHalfWidthKatakana7DataRange
   102 		}
   103 	};
   104 
   105 #if defined(_DEBUG)
   106 
   107 _LIT(KLitPanicText, "JISBASE_SHARED");
   108 
   109 enum TPanic
   110 	{
   111 	EPanicNotAppending1=1,
   112 	EPanicNotAppending2,
   113 	EPanicNotAppending3,
   114 	EPanicBadNonStandardState,
   115 	EPanicBadPointers1,
   116 	EPanicBadPointers2,
   117 	EPanicBadPointers3,
   118 	EPanicBadPointers4,
   119 	EPanicBadFunctionPointer
   120 	};
   121 
   122 LOCAL_C void Panic(TPanic aPanic)
   123 	{
   124 	User::Panic(KLitPanicText, aPanic);
   125 	}
   126 
   127 #endif
   128 
   129 TInt CnvJisBase::ChangeToNonStandardStateJis7(TInt aState)
   130 	{
   131 	return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis7;
   132 	}
   133 
   134 TInt CnvJisBase::ChangeToNonStandardStateJis8(TInt aState)
   135 	{
   136 	return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis8;
   137 	}
   138 
   139 TInt CnvJisBase::ChangeToStandardState(TInt)
   140 	{
   141 	return CCnvCharacterSetConverter::KStateDefault; // I actually thought that the correct behaviour for this would be to return "aState&~KBitsForNonStandardStates", but I asked Ken Lunde about it in an email and he said that after a run of JIS7 or JIS8, the bytes should always be interpreted as JIS-Roman
   142 	}
   143 
   144 TInt CnvJisBase::AppendConvertToUnicodeFromModalForeign(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aModalForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
   145 	{
   146 	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending1));
   147 	return CnvUtilities::ConvertToUnicodeFromModalForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aModalForeign, aState, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aArrayOfStates, aOutputConversionFlags, aInputConversionFlags);
   148 	}
   149 
   150 TInt CnvJisBase::AppendConvertToUnicodeFromJis7(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis7, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
   151 	{
   152 	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending2));
   153 	return CCnvCharacterSetConverter::DoConvertToUnicode(halfWidthKatakana7ConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, aJis7, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
   154 	}
   155 
   156 TInt CnvJisBase::AppendConvertToUnicodeFromJis8(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis8, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
   157 	{
   158 	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending3));
   159 	return CCnvCharacterSetConverter::DoConvertToUnicode(CnvHalfWidthKatakana8::ConversionData(), aDefaultEndiannessOfForeignCharacters, aUnicode, aJis8, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
   160 	}
   161 
   162 EXPORT_C TInt CnvJisBase::ConvertToUnicode(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
   163 	{
   164     TInt ret =KErrNone;
   165     TBool pictographsSupported = FeatureManager::FeatureSupported(KFeatureIdJapanesePicto);
   166     RArray<CnvUtilities::SState> states;
   167     if ( pictographsSupported )
   168         {        
   169 
   170         CnvUtilities::SState state;
   171         state.iEscapeSequence=&KLit8EscapeSequenceForJisRoman; // Jis-Roman is the default state, so it must come first in the array
   172         state.iConversionData=&CnvJisRoman::ConversionData();
   173         ret |= states.Append(state);
   174         state.iEscapeSequence=&KLit8EscapeSequenceForJisRomanIncorrect;
   175         state.iConversionData=&CnvJisRoman::ConversionData();
   176         ret |= states.Append(state);
   177         state.iEscapeSequence=&KLit8EscapeSequenceForAscii;
   178         state.iConversionData=&CCnvCharacterSetConverter::AsciiConversionData();
   179         ret |= states.Append(state);
   180     
   181         SetStatesForPictograph(states);
   182 
   183         state.iEscapeSequence=&KLit8EscapeSequenceForHalfWidthKatakana;
   184         state.iConversionData=&halfWidthKatakana7ConversionData;
   185         ret |= states.Append(state);
   186         state.iEscapeSequence=&KLit8EscapeSequenceForJisC6226_1978;
   187         state.iConversionData=&CnvJisX0208::ConversionData();
   188         ret |= states.Append(state);
   189         state.iEscapeSequence=&KLit8EscapeSequenceForJisX0208_1983;
   190         state.iConversionData=&CnvJisX0208::ConversionData();
   191         ret |= states.Append(state);
   192         state.iEscapeSequence=&KLit8EscapeSequenceForJisX0208_199x;
   193         state.iConversionData=&CnvJisX0208::ConversionData();
   194         ret |= states.Append(state);
   195         state.iEscapeSequence=&KLit8EscapeSequenceForJisX0212_1990;
   196         state.iConversionData=&CnvJisX0212::ConversionData();
   197         ret |= states.Append(state);
   198         }
   199     else
   200         {            
   201         CnvUtilities::SState state;
   202     	state.iEscapeSequence=&KLit8EscapeSequenceForJisRoman; // Jis-Roman is the default state, so it must come first in the array
   203     	state.iConversionData=&CnvJisRoman::ConversionData();
   204     	ret |= states.Append(state);
   205         state.iEscapeSequence=&KLit8EscapeSequenceForJisRomanIncorrect;
   206     	state.iConversionData=&CnvJisRoman::ConversionData();	
   207     	ret |= states.Append(state);
   208     	state.iEscapeSequence=&KLit8EscapeSequenceForAscii;
   209     	state.iConversionData=&CCnvCharacterSetConverter::AsciiConversionData();
   210     	ret |= states.Append(state);
   211     	state.iEscapeSequence=&KLit8EscapeSequenceForHalfWidthKatakana;
   212     	state.iConversionData=&halfWidthKatakana7ConversionData;
   213     	ret |= states.Append(state);
   214     	state.iEscapeSequence=&KLit8EscapeSequenceForJisC6226_1978;
   215     	state.iConversionData=&CnvJisX0208::ConversionData();
   216     	ret |= states.Append(state);
   217     	state.iEscapeSequence=&KLit8EscapeSequenceForJisX0208_1983;
   218     	state.iConversionData=&CnvJisX0208::ConversionData();
   219     	ret |= states.Append(state);
   220     	state.iEscapeSequence=&KLit8EscapeSequenceForJisX0208_199x;
   221     	state.iConversionData=&CnvJisX0208::ConversionData();
   222     	ret |= states.Append(state);
   223     	state.iEscapeSequence=&KLit8EscapeSequenceForJisX0212_1990;
   224     	state.iConversionData=&CnvJisX0212::ConversionData();
   225     	ret |= states.Append(state);
   226         }
   227     __ASSERT_DEBUG(!ret, User::Panic(_L("RArray append failure"), ret));
   228     
   229 	const TArray<CnvUtilities::SState> arrayOfStates(states.Array());
   230 	aUnicode.SetLength(0);
   231 	const TUint8* const pointerToFirstByte=aForeign.Ptr();
   232 	const TUint8* pointerToCurrentByte=pointerToFirstByte;
   233 	const TUint8* pointerToStartOfNextRunToConvert=pointerToFirstByte;
   234 	const TUint8* const pointerToLastByte=pointerToFirstByte+(aForeign.Length()-1);
   235 	TUint outputConversionFlags=0;
   236 	TUint inputConversionFlags=CCnvCharacterSetConverter::EInputConversionFlagAppend;
   237 	FOREVER
   238 		{
   239 		FChangeState changeState=NULL;
   240 		FAppendConvertToUnicode appendConvertToUnicode=NULL;
   241 		TBool skipThisByte=EFalse;
   242 		const TUint currentByte=*pointerToCurrentByte;
   243 		switch (aState&KBitsForNonStandardStates)
   244 			{
   245 		case 0:
   246 			if (currentByte==KControlCharacterShiftOut)
   247 				{
   248 				changeState=ChangeToNonStandardStateJis7;
   249 				skipThisByte=ETrue;
   250 				}
   251             else if (pictographsSupported && (currentByte==KControlCharacterShiftIn))
   252                 {
   253                 changeState=ChangeToStandardState;
   254                 skipThisByte=ETrue;
   255                 }
   256 			else if (currentByte&0x80)
   257 				{
   258 				changeState=ChangeToNonStandardStateJis8;
   259 				}
   260 			appendConvertToUnicode=AppendConvertToUnicodeFromModalForeign;
   261 			break;
   262 		case ENonStandardStateJis7:
   263 			if (currentByte==KControlCharacterEscape)
   264 				{
   265 				changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
   266 				}
   267 			else if (currentByte==KControlCharacterShiftIn)
   268 				{
   269 				changeState=ChangeToStandardState;
   270 				skipThisByte=ETrue;
   271 				}
   272 			else if (currentByte&0x80)
   273 				{
   274 				changeState=ChangeToNonStandardStateJis8;
   275 				}
   276 			appendConvertToUnicode=AppendConvertToUnicodeFromJis7;
   277 			break;
   278 		case ENonStandardStateJis8:
   279 			if (currentByte==KControlCharacterEscape)
   280 				{
   281 				changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
   282 				}
   283 			else if (currentByte==KControlCharacterShiftOut)
   284 				{
   285 				changeState=ChangeToNonStandardStateJis7;
   286 				skipThisByte=ETrue;
   287 				}
   288 			else if ((currentByte&0x80)==0)
   289 				{
   290 				changeState=ChangeToStandardState;
   291 				}
   292 			appendConvertToUnicode=AppendConvertToUnicodeFromJis8;
   293 			break;
   294 #if defined(_DEBUG)
   295 		default:
   296 			Panic(EPanicBadNonStandardState);
   297 			break;
   298 #endif
   299 			}
   300 		__ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers1));
   301 		if ((pointerToCurrentByte>=pointerToLastByte) || (changeState!=NULL))
   302 			{
   303 			TBool lastIteration=EFalse;
   304 			__ASSERT_DEBUG(pointerToCurrentByte>=pointerToStartOfNextRunToConvert, Panic(EPanicBadPointers2));
   305 			if (changeState==NULL)
   306 				{
   307 				++pointerToCurrentByte; // this may make pointerToCurrentByte greater than pointerToLastByte
   308 				lastIteration=ETrue;
   309 				}
   310 			if (pointerToCurrentByte>pointerToStartOfNextRunToConvert)
   311 				{
   312 				TPtrC8 runToConvert(pointerToStartOfNextRunToConvert, pointerToCurrentByte-pointerToStartOfNextRunToConvert);
   313 				TInt numberOfUnconvertibleCharacters;
   314 				TInt indexOfFirstByteOfFirstUnconvertibleCharacter;
   315 				__ASSERT_DEBUG(appendConvertToUnicode!=NULL, Panic(EPanicBadFunctionPointer));
   316 				const TInt returnValue=(*appendConvertToUnicode)(aDefaultEndiannessOfForeignCharacters, aUnicode, runToConvert, aState, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, arrayOfStates, outputConversionFlags, inputConversionFlags);
   317 				if (returnValue<0)
   318 					{
   319                     states.Close();
   320 					return returnValue; // this is an error-code
   321 					}
   322 				if (numberOfUnconvertibleCharacters>0)
   323 					{
   324 					if (aNumberOfUnconvertibleCharacters==0)
   325 						{
   326 						aIndexOfFirstByteOfFirstUnconvertibleCharacter=(pointerToStartOfNextRunToConvert-pointerToFirstByte)+indexOfFirstByteOfFirstUnconvertibleCharacter;
   327 						}
   328 					aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters;
   329 					}
   330 				if (returnValue>0)
   331 					{
   332 					pointerToCurrentByte-=returnValue; // pointerToStartOfNextRunToConvert (which also needs adjusting in the same way) gets set below
   333 					lastIteration=ETrue;
   334 					changeState=NULL;
   335 					skipThisByte=EFalse;
   336 					}
   337 				__ASSERT_DEBUG(pointerToCurrentByte>=pointerToFirstByte, Panic(EPanicBadPointers3));
   338 				if (pointerToCurrentByte>pointerToFirstByte)
   339 					{
   340 					inputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
   341 					}
   342 				}
   343 			if (changeState!=NULL)
   344 				{
   345 				aState=(*changeState)(aState);
   346 				}
   347 			if (skipThisByte)
   348 				{
   349 				if (pointerToCurrentByte==pointerToLastByte) // pointerToCurrentByte may already be greater than pointerToLastByte, in which case lastIteration will already be ETrue
   350 					{
   351 					lastIteration=ETrue;
   352 					}
   353 				++pointerToCurrentByte;
   354 				}
   355 			pointerToStartOfNextRunToConvert=pointerToCurrentByte;
   356 			if (lastIteration) // check this first as pointerToCurrentByte may be greater than pointerToLastByte (but it will only be if lastIteration is EFalse)
   357 				{
   358 				break;
   359 				}
   360 			__ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers4));
   361 			if (pointerToCurrentByte>=pointerToLastByte)
   362 				{
   363 				break;
   364 				}
   365 			}
   366 		++pointerToCurrentByte;
   367 		}
   368 
   369     states.Close();
   370 	// no checking with outputConversionFlags need to be done here
   371 	return pointerToLastByte-(pointerToCurrentByte-1);
   372 	}
   373 
   374 EXPORT_C const SCnvConversionData& CnvJisBase::HalfWidthKatakana7ConversionData()
   375 	{
   376 	return halfWidthKatakana7ConversionData;
   377 	}
   378 
   379 EXPORT_C void CnvJisBase::IsCharacterJISBased(TInt& aConfidenceLevel, const TDesC8& aSample) 
   380 	{
   381 	// JIS is modal... so start off with a confidence of 0 and to begin with look 
   382 	// for JIS escape sequences....Escape sequences defined above in the KLITs
   383 	// For each escape sequence, increase the confidenceLevel ..... 
   384 	aConfidenceLevel = 55;
   385 	TInt jisRomanResult = 0;
   386 	TInt asciiResult = 0;
   387 	TInt jisX0208Result = 0;
   388 	TInt jisC6226Result = 0;
   389 	TInt jixX0212Result = 0;
   390 	TInt hwKanaResult = 0;
   391 
   392 	TInt EscSequences = 0;
   393 	
   394 	TInt sampleLength = aSample.Length();
   395 	for (TInt i = 0; i < sampleLength; ++i)
   396 		{
   397 	
   398 		// JIS is 7 bit encoding
   399 		if((aSample[i]&0x80)!=0x00)
   400 			{
   401 			aConfidenceLevel=0;
   402 			break;
   403 			}
   404 		// JIS supports the following character sets 
   405 		if (i > jisC6226Result)
   406 			{
   407 			jisC6226Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisC6226_1978);
   408 			if (jisC6226Result!=KErrNotFound)
   409 				EscSequences += 15; 
   410 			}
   411 
   412 		if (i > jisRomanResult)
   413 			{
   414 			jisRomanResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisRoman);
   415 			if (jisRomanResult!=KErrNotFound)
   416 				EscSequences += 15; 
   417 			}
   418 
   419 		if (i > asciiResult)
   420 			{
   421 			asciiResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForAscii);
   422 			if (asciiResult!=KErrNotFound)
   423 				EscSequences += 15; 
   424 			}
   425 
   426 		if (i > jisX0208Result)
   427 			{
   428 			jisX0208Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0208_1983);
   429 			if (jisX0208Result!=KErrNotFound)
   430 				EscSequences += 15; 
   431 			}
   432 
   433 		if (i > jixX0212Result)
   434 			{
   435 			jixX0212Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0212_1990);
   436 			if (jixX0212Result!=KErrNotFound)
   437 				EscSequences += 15; 
   438 			}
   439 
   440 		if (i > hwKanaResult)
   441 			{
   442 			hwKanaResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForHalfWidthKatakana);
   443 			if (hwKanaResult!=KErrNotFound)
   444 				EscSequences += 15; 
   445 			}
   446 		}
   447 
   448 	aConfidenceLevel = 0 < sampleLength?
   449 		aConfidenceLevel + ((EscSequences*100)/sampleLength) : 90;
   450 	aConfidenceLevel=(aConfidenceLevel >100)?100:aConfidenceLevel;
   451 	}