os/textandloc/charconvfw/charconvplugins/src/shared/jisbase_shared.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 * Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: 
    15 *
    16 */
    17 
    18 
    19 #include <e32std.h>
    20 #include <charconv.h>
    21 #include <convdata.h>
    22 #include <convutils.h>
    23 #include "jisx0201.h"
    24 #include "jisx0208.h"
    25 #include "jisx0212.h"
    26 #include "jisbase.h"
    27 
// Control bytes and the state mask used by CnvJisBase::ConvertToUnicode to switch between the
// standard (escape-sequence driven) state and the non-standard JIS7/JIS8 states.
const TUint KControlCharacterEscape=0x1b; // ESC - ends a JIS7/JIS8 run; also the first byte of every escape sequence below
const TUint KControlCharacterShiftOut=0x0e; // SO - switches to the JIS7 state
const TUint KControlCharacterShiftIn=0x0f; // SI - switches from the JIS7 state back to the standard state
const TUint KBitsForNonStandardStates=0x03; // mask for the bits of aState holding a TNonStandardState value (0 in those bits means "standard state")
    32 
// Escape sequences that select the character set of the bytes following them.
// The printable form of each sequence is shown alongside it.
_LIT8(KLit8EscapeSequenceForJisRoman, "\x1b\x28\x4a"); // ESC ( J
_LIT8(KLit8EscapeSequenceForJisRomanIncorrect, "\x1b\x28\x48"); // ESC ( H - treated exactly like JIS-Roman by ConvertToUnicode
_LIT8(KLit8EscapeSequenceForAscii, "\x1b\x28\x42"); // ESC ( B
_LIT8(KLit8EscapeSequenceForHalfWidthKatakana, "\x1b\x28\x49"); // ESC ( I
_LIT8(KLit8EscapeSequenceForJisC6226_1978, "\x1b\x24\x40"); // ESC $ @
_LIT8(KLit8EscapeSequenceForJisX0208_1983, "\x1b\x24\x42"); // ESC $ B
_LIT8(KLit8EscapeSequenceForJisX0208_199x, "\x1b\x26\x40\x1b\x24\x42"); // ESC & @ ESC $ B (ends with the JIS X 0208-1983 sequence)
_LIT8(KLit8EscapeSequenceForJisX0212_1990, "\x1b\x24\x28\x44"); // ESC $ ( D
    41 
// Signature of the CnvJisBase::ChangeTo... functions: maps the current conversion state to the new one.
typedef TInt (*FChangeState)(TInt aState);
// Signature shared by the CnvJisBase::AppendConvertToUnicodeFrom... functions, which convert one run of
// foreign bytes and append the result to aUnicode. The common signature lets ConvertToUnicode pick the
// converter for the current state through a function pointer.
typedef TInt (*FAppendConvertToUnicode)(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags);
    44 
// States that are not selected by an escape sequence. The value is stored in the
// KBitsForNonStandardStates bits of the conversion state; 0 in those bits means "standard state".
enum TNonStandardState // each of these values must fit into KBitsForNonStandardStates and each must also be non-zero
	{
	ENonStandardStateJis7=1, // entered on a Shift Out byte; run is converted with halfWidthKatakana7ConversionData
	ENonStandardStateJis8 // entered on a byte with the top bit set; run is converted with CnvHalfWidthKatakana8::ConversionData()
	};
    50 
    51 
// The single variable-byte range of halfWidthKatakana7ConversionData; it covers every initial byte
// value (0x00-0xff).
// NOTE(review): the two trailing zeros are presumably "number of subsequent bytes" (i.e. all characters
// are one byte long) and a spare field - confirm against SCnvConversionData in convdata.h.
LOCAL_D const SCnvConversionData::SVariableByteData::SRange halfWidthKatakana7VariableByteDataRange=
	{
	0x00,
	0xff,
	0,
	0
	};
    59 
// Foreign-to-Unicode range of halfWidthKatakana7ConversionData: input codes 0x21-0x5f are handled by
// the EOffset algorithm with an offset of 65344 (0xff40), i.e. 0x21+0xff40=0xff61 and 0x5f+0xff40=0xff9f,
// which is exactly the input range of unicodeToHalfWidthKatakana7DataRange.
// NOTE(review): the meaning of the two zeros after the algorithm is not visible in this file - confirm
// against SCnvConversionData::SOneDirectionData::SRange in convdata.h.
LOCAL_D const SCnvConversionData::SOneDirectionData::SRange halfWidthKatakana7ToUnicodeDataRange=
	{
	0x21,
	0x5f,
	SCnvConversionData::SOneDirectionData::SRange::EOffset,
	0,
	0,
		{
		STATIC_CAST(TUint, 65344),
		0
		}
	};
    72 
// Unicode-to-foreign range of halfWidthKatakana7ConversionData: input codes 0xff61-0xff9f are handled
// by the EOffset algorithm with an offset of -65344 (-0xff40), the inverse of
// halfWidthKatakana7ToUnicodeDataRange (0xff61-0xff40=0x21, 0xff9f-0xff40=0x5f).
// NOTE(review): the "1" after the algorithm is presumably the size in bytes of each output foreign
// character - confirm against SCnvConversionData::SOneDirectionData::SRange in convdata.h.
LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToHalfWidthKatakana7DataRange=
	{
	0xff61,
	0xff9f,
	SCnvConversionData::SOneDirectionData::SRange::EOffset,
	1,
	0,
		{
		STATIC_CAST(TUint, -65344),
		0
		}
	};
    85 
// Conversion data for 7-bit half-width katakana. It is used for the "ESC ( I" state and for JIS7
// (Shift Out) runs in CnvJisBase::ConvertToUnicode, and is exposed to other components through
// CnvJisBase::HalfWidthKatakana7ConversionData().
// Each nested initializer is a (number of ranges, pointer to first range) pair referring to one of the
// single-range tables defined above.
LOCAL_D const SCnvConversionData halfWidthKatakana7ConversionData=
	{
	SCnvConversionData::EUnspecified, // NOTE(review): presumably the endianness of the foreign characters - confirm in convdata.h
		{
		1,
		&halfWidthKatakana7VariableByteDataRange
		},
		{
		1,
		&halfWidthKatakana7ToUnicodeDataRange
		},
		{
		1,
		&unicodeToHalfWidthKatakana7DataRange
		}
	};
   102 
#if defined(_DEBUG)

// Everything in this section exists only in debug builds; it is reached solely through
// __ASSERT_DEBUG and the _DEBUG-only "default" case in CnvJisBase::ConvertToUnicode.

// Panic category passed to User::Panic.
_LIT(KLitPanicText, "JISBASE_SHARED");

// Panic reasons. The numeric values are what User::Panic receives, so do not renumber them.
enum TPanic
	{
	EPanicNotAppending1=1, // AppendConvertToUnicodeFromModalForeign called without EInputConversionFlagAppend
	EPanicNotAppending2, // AppendConvertToUnicodeFromJis7 called without EInputConversionFlagAppend
	EPanicNotAppending3, // AppendConvertToUnicodeFromJis8 called without EInputConversionFlagAppend
	EPanicBadNonStandardState, // the non-standard-state bits of aState hold an unknown value
	EPanicBadPointers1, // current byte pointer is beyond the last byte at the top of the loop
	EPanicBadPointers2, // current byte pointer is before the start of the run to convert
	EPanicBadPointers3, // current byte pointer is before the first byte of the input
	EPanicBadPointers4, // current byte pointer is beyond the last byte although the loop is continuing
	EPanicBadFunctionPointer // no conversion function was selected for the current state
	};

// Panics the current thread with category KLitPanicText and reason aPanic.
LOCAL_C void Panic(TPanic aPanic)
	{
	User::Panic(KLitPanicText, aPanic);
	}

#endif
   126 
   127 TInt CnvJisBase::ChangeToNonStandardStateJis7(TInt aState)
   128 	{
   129 	return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis7;
   130 	}
   131 
   132 TInt CnvJisBase::ChangeToNonStandardStateJis8(TInt aState)
   133 	{
   134 	return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis8;
   135 	}
   136 
   137 TInt CnvJisBase::ChangeToStandardState(TInt)
   138 	{
   139 	return CCnvCharacterSetConverter::KStateDefault; // I actually thought that the correct behaviour for this would be to return "aState&~KBitsForNonStandardStates", but I asked Ken Lunde about it in an email and he said that after a run of JIS7 or JIS8, the bytes should always be interpreted as JIS-Roman
   140 	}
   141 
   142 TInt CnvJisBase::AppendConvertToUnicodeFromModalForeign(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aModalForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
   143 	{
   144 	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending1));
   145 	return CnvUtilities::ConvertToUnicodeFromModalForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aModalForeign, aState, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aArrayOfStates, aOutputConversionFlags, aInputConversionFlags);
   146 	}
   147 
   148 TInt CnvJisBase::AppendConvertToUnicodeFromJis7(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis7, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
   149 	{
   150 	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending2));
   151 	return CCnvCharacterSetConverter::DoConvertToUnicode(halfWidthKatakana7ConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, aJis7, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
   152 	}
   153 
   154 TInt CnvJisBase::AppendConvertToUnicodeFromJis8(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis8, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
   155 	{
   156 	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending3));
   157 	return CCnvCharacterSetConverter::DoConvertToUnicode(CnvHalfWidthKatakana8::ConversionData(), aDefaultEndiannessOfForeignCharacters, aUnicode, aJis8, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
   158 	}
   159 
   160 EXPORT_C TInt CnvJisBase::ConvertToUnicode(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
   161 	{
   162 	TFixedArray<CnvUtilities::SState, 8> states;
   163 	states[0].iEscapeSequence=&KLit8EscapeSequenceForJisRoman; // Jis-Roman is the default state, so it must come first in the array
   164 	states[0].iConversionData=&CnvJisRoman::ConversionData();
   165 	states[1].iEscapeSequence=&KLit8EscapeSequenceForJisRomanIncorrect;
   166 	states[1].iConversionData=&CnvJisRoman::ConversionData();	
   167 	states[2].iEscapeSequence=&KLit8EscapeSequenceForAscii;
   168 	states[2].iConversionData=&CCnvCharacterSetConverter::AsciiConversionData();
   169 	states[3].iEscapeSequence=&KLit8EscapeSequenceForHalfWidthKatakana;
   170 	states[3].iConversionData=&halfWidthKatakana7ConversionData;
   171 	states[4].iEscapeSequence=&KLit8EscapeSequenceForJisC6226_1978;
   172 	states[4].iConversionData=&CnvJisX0208::ConversionData();
   173 	states[5].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_1983;
   174 	states[5].iConversionData=&CnvJisX0208::ConversionData();
   175 	states[6].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_199x;
   176 	states[6].iConversionData=&CnvJisX0208::ConversionData();
   177 	states[7].iEscapeSequence=&KLit8EscapeSequenceForJisX0212_1990;
   178 	states[7].iConversionData=&CnvJisX0212::ConversionData();
   179 	const TArray<CnvUtilities::SState> arrayOfStates(states.Array());
   180 	aUnicode.SetLength(0);
   181 	const TUint8* const pointerToFirstByte=aForeign.Ptr();
   182 	const TUint8* pointerToCurrentByte=pointerToFirstByte;
   183 	const TUint8* pointerToStartOfNextRunToConvert=pointerToFirstByte;
   184 	const TUint8* const pointerToLastByte=pointerToFirstByte+(aForeign.Length()-1);
   185 	TUint outputConversionFlags=0;
   186 	TUint inputConversionFlags=CCnvCharacterSetConverter::EInputConversionFlagAppend;
   187 	FOREVER
   188 		{
   189 		FChangeState changeState=NULL;
   190 		FAppendConvertToUnicode appendConvertToUnicode=NULL;
   191 		TBool skipThisByte=EFalse;
   192 		const TUint currentByte=*pointerToCurrentByte;
   193 		switch (aState&KBitsForNonStandardStates)
   194 			{
   195 		case 0:
   196 			if (currentByte==KControlCharacterShiftOut)
   197 				{
   198 				changeState=ChangeToNonStandardStateJis7;
   199 				skipThisByte=ETrue;
   200 				}
   201 			else if (currentByte&0x80)
   202 				{
   203 				changeState=ChangeToNonStandardStateJis8;
   204 				}
   205 			appendConvertToUnicode=AppendConvertToUnicodeFromModalForeign;
   206 			break;
   207 		case ENonStandardStateJis7:
   208 			if (currentByte==KControlCharacterEscape)
   209 				{
   210 				changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
   211 				}
   212 			else if (currentByte==KControlCharacterShiftIn)
   213 				{
   214 				changeState=ChangeToStandardState;
   215 				skipThisByte=ETrue;
   216 				}
   217 			else if (currentByte&0x80)
   218 				{
   219 				changeState=ChangeToNonStandardStateJis8;
   220 				}
   221 			appendConvertToUnicode=AppendConvertToUnicodeFromJis7;
   222 			break;
   223 		case ENonStandardStateJis8:
   224 			if (currentByte==KControlCharacterEscape)
   225 				{
   226 				changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
   227 				}
   228 			else if (currentByte==KControlCharacterShiftOut)
   229 				{
   230 				changeState=ChangeToNonStandardStateJis7;
   231 				skipThisByte=ETrue;
   232 				}
   233 			else if ((currentByte&0x80)==0)
   234 				{
   235 				changeState=ChangeToStandardState;
   236 				}
   237 			appendConvertToUnicode=AppendConvertToUnicodeFromJis8;
   238 			break;
   239 #if defined(_DEBUG)
   240 		default:
   241 			Panic(EPanicBadNonStandardState);
   242 			break;
   243 #endif
   244 			}
   245 		__ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers1));
   246 		if ((pointerToCurrentByte>=pointerToLastByte) || (changeState!=NULL))
   247 			{
   248 			TBool lastIteration=EFalse;
   249 			__ASSERT_DEBUG(pointerToCurrentByte>=pointerToStartOfNextRunToConvert, Panic(EPanicBadPointers2));
   250 			if (changeState==NULL)
   251 				{
   252 				++pointerToCurrentByte; // this may make pointerToCurrentByte greater than pointerToLastByte
   253 				lastIteration=ETrue;
   254 				}
   255 			if (pointerToCurrentByte>pointerToStartOfNextRunToConvert)
   256 				{
   257 				TPtrC8 runToConvert(pointerToStartOfNextRunToConvert, pointerToCurrentByte-pointerToStartOfNextRunToConvert);
   258 				TInt numberOfUnconvertibleCharacters;
   259 				TInt indexOfFirstByteOfFirstUnconvertibleCharacter;
   260 				__ASSERT_DEBUG(appendConvertToUnicode!=NULL, Panic(EPanicBadFunctionPointer));
   261 				const TInt returnValue=(*appendConvertToUnicode)(aDefaultEndiannessOfForeignCharacters, aUnicode, runToConvert, aState, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, arrayOfStates, outputConversionFlags, inputConversionFlags);
   262 				if (returnValue<0)
   263 					{
   264 					return returnValue; // this is an error-code
   265 					}
   266 				if (numberOfUnconvertibleCharacters>0)
   267 					{
   268 					if (aNumberOfUnconvertibleCharacters==0)
   269 						{
   270 						aIndexOfFirstByteOfFirstUnconvertibleCharacter=(pointerToStartOfNextRunToConvert-pointerToFirstByte)+indexOfFirstByteOfFirstUnconvertibleCharacter;
   271 						}
   272 					aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters;
   273 					}
   274 				if (returnValue>0)
   275 					{
   276 					pointerToCurrentByte-=returnValue; // pointerToStartOfNextRunToConvert (which also needs adjusting in the same way) gets set below
   277 					lastIteration=ETrue;
   278 					changeState=NULL;
   279 					skipThisByte=EFalse;
   280 					}
   281 				__ASSERT_DEBUG(pointerToCurrentByte>=pointerToFirstByte, Panic(EPanicBadPointers3));
   282 				if (pointerToCurrentByte>pointerToFirstByte)
   283 					{
   284 					inputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
   285 					}
   286 				}
   287 			if (changeState!=NULL)
   288 				{
   289 				aState=(*changeState)(aState);
   290 				}
   291 			if (skipThisByte)
   292 				{
   293 				if (pointerToCurrentByte==pointerToLastByte) // pointerToCurrentByte may already be greater than pointerToLastByte, in which case lastIteration will already be ETrue
   294 					{
   295 					lastIteration=ETrue;
   296 					}
   297 				++pointerToCurrentByte;
   298 				}
   299 			pointerToStartOfNextRunToConvert=pointerToCurrentByte;
   300 			if (lastIteration) // check this first as pointerToCurrentByte may be greater than pointerToLastByte (but it will only be if lastIteration is EFalse)
   301 				{
   302 				break;
   303 				}
   304 			__ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers4));
   305 			if (pointerToCurrentByte>=pointerToLastByte)
   306 				{
   307 				break;
   308 				}
   309 			}
   310 		++pointerToCurrentByte;
   311 		}
   312 	// no checking with outputConversionFlags need to be done here
   313 	return pointerToLastByte-(pointerToCurrentByte-1);
   314 	}
   315 
   316 EXPORT_C const SCnvConversionData& CnvJisBase::HalfWidthKatakana7ConversionData()
   317 	{
   318 	return halfWidthKatakana7ConversionData;
   319 	}
   320 
   321 EXPORT_C void CnvJisBase::IsCharacterJISBased(TInt& aConfidenceLevel, const TDesC8& aSample) 
   322 	{
   323 	// JIS is modal... so start off with a confidence of 0 and to begin with look 
   324 	// for JIS escape sequences....Escape sequences defined above in the KLITs
   325 	// For each escape sequence, increase the confidenceLevel ..... 
   326 	aConfidenceLevel = 55;
   327 	TInt jisRomanResult = 0;
   328 	TInt asciiResult = 0;
   329 	TInt jisX0208Result = 0;
   330 	TInt jisC6226Result = 0;
   331 	TInt jixX0212Result = 0;
   332 	TInt hwKanaResult = 0;
   333 
   334 	TInt EscSequences = 0;
   335 	
   336 	TInt sampleLength = aSample.Length();
   337 	for (TInt i = 0; i < sampleLength; ++i)
   338 		{
   339 	
   340 		// JIS is 7 bit encoding
   341 		if((aSample[i]&0x80)!=0x00)
   342 			{
   343 			aConfidenceLevel=0;
   344 			break;
   345 			}
   346 		// JIS supports the following character sets 
   347 		if (i > jisC6226Result)
   348 			{
   349 			jisC6226Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisC6226_1978);
   350 			if (jisC6226Result!=KErrNotFound)
   351 				EscSequences += 15; 
   352 			}
   353 
   354 		if (i > jisRomanResult)
   355 			{
   356 			jisRomanResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisRoman);
   357 			if (jisRomanResult!=KErrNotFound)
   358 				EscSequences += 15; 
   359 			}
   360 
   361 		if (i > asciiResult)
   362 			{
   363 			asciiResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForAscii);
   364 			if (asciiResult!=KErrNotFound)
   365 				EscSequences += 15; 
   366 			}
   367 
   368 		if (i > jisX0208Result)
   369 			{
   370 			jisX0208Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0208_1983);
   371 			if (jisX0208Result!=KErrNotFound)
   372 				EscSequences += 15; 
   373 			}
   374 
   375 		if (i > jixX0212Result)
   376 			{
   377 			jixX0212Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0212_1990);
   378 			if (jixX0212Result!=KErrNotFound)
   379 				EscSequences += 15; 
   380 			}
   381 
   382 		if (i > hwKanaResult)
   383 			{
   384 			hwKanaResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForHalfWidthKatakana);
   385 			if (hwKanaResult!=KErrNotFound)
   386 				EscSequences += 15; 
   387 			}
   388 		}
   389 
   390 	aConfidenceLevel = 0 < sampleLength?
   391 		aConfidenceLevel + ((EscSequences*100)/sampleLength) : 90;
   392 	aConfidenceLevel=(aConfidenceLevel >100)?100:aConfidenceLevel;
   393 	}