os/textandloc/charconvfw/charconvplugins/src/plugins/j5.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 * Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: 
    15 * J5 charconv character converter
    16 *
    17 */
    18 
    19 
    20 #include <e32std.h>
    21 #include <charconv.h>
    22 #include <ecom/implementationproxy.h>
    23 #include <utf.h>
    24 #include <charactersetconverter.h>
    25 #include <convutils.h>
    26 #include "shiftjis.h"
    27 #include "jisbase.h"
    28 #include "j5.h"
    29 
    30 #include "jisx0201.h"
    31 #include "jisx0208.h"
    32 #include "jisx0212.h"
    33 
    34 #include "featmgr/featmgr.h"
    35 
/**
 J5 will use up to KMaxSizeAutoDetectSample bytes to try to determine the format of data.
 */
const TInt KMaxSizeAutoDetectSample = 1000;

// ESC (0x1b): first byte of the escape sequences that DetectIso2022() scans for.
const TUint8 KEscape = 0x1b;
// U+FEFF byte order mark: ConvertUcs2ToUnicode() leaves it out of the converted text.
const TInt KByteOrderMark = 0xfeff;
    43 
    44 const TDesC8& CJ5Converter::ReplacementForUnconvertibleUnicodeCharacters()
    45 	{
    46 	return CnvShiftJis::ReplacementForUnconvertibleUnicodeCharacters();
    47 	}
    48 
    49 /**
    50  This API should not be used as it is ambiguous as to what encoding is required.  
    51  The user should instead call the specific plug-in for the appropriate conversion.
    52  J5 ConvertFromUnicode() will convert to UTF8 as default.
    53 @internalTechnology 
    54  */
    55 TInt CJ5Converter::ConvertFromUnicode(
    56 		CCnvCharacterSetConverter::TEndianness /* aDefaultEndiannessOfForeignCharacters */, 
    57 		const TDesC8& /* aReplacementForUnconvertibleUnicodeCharacters */, 
    58 		TDes8& aForeign, 
    59 		const TDesC16& aUnicode, 
    60 		CCnvCharacterSetConverter::TArrayOfAscendingIndices& /* aIndicesOfUnconvertibleCharacters */)
    61 	{
    62 	return CnvUtfConverter::ConvertFromUnicodeToUtf8(aForeign, aUnicode);
    63 	}
    64 
    65 /**
    66  This will automatically determine one of the five supported encodings 
    67  to use and convert accordingly.  This plugin method is available to the 
    68  user though the CCnvCharacterSetConverter::ConvertToUnicode() method.  
    69  There is no way for the caller to determine which encoding has been used.
    70  
    71  NOTE: For debugging the selected character set is returned in the state.
    72  
    73   @released  9.1
    74   @param     aDefaultEndiannessOfForeignCharacters The default endian-ness to use when reading characters
    75              in the foreign character set.
    76   @param     aUnicode On return, contains the text converted into Unicode.
    77   @param     aForeign The non-Unicode source text to be converted.
    78   @param     aState Used to save state information across multiple calls
    79              to <code>ConvertToUnicode()</code>.
    80   @param     aNumberOfUnconvertibleCharacters On return, contains the number of bytes which were not
    81              converted.
    82   @param     aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, contains the index of the first bytein the
    83              input text that could not be converted. A negative
    84              value indicates that all the characters were
    85              converted.
    86   @return 	 The number of unconverted bytes left at the end of the input descriptor 
    87  		     (e.g. because the output descriptor is not long enough to hold all the text), 
    88  		     or one of the error values defined in TError. 
    89   @internalTechnology 
    90 */
    91 TInt CJ5Converter::ConvertToUnicode(
    92 		CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, 
    93 		TDes16& aUnicode, 
    94 		const TDesC8& aForeign, 
    95 		TInt& aState, 
    96 		TInt& aNumberOfUnconvertibleCharacters, 
    97 		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
    98 	{
    99 	// As the aState parameter is used to pass back the detected value
   100 	// use a "hidden" internal state variable.
   101 	TInt internalState = CCnvCharacterSetConverter::KStateDefault;
   102 	
   103 	// determine the encoding type and then decode appropriatly
   104 	switch ( DetectEncoding(aDefaultEndiannessOfForeignCharacters, aForeign))
   105 		{
   106 		case EShiftjis:
   107 			aState = EShiftjis;
   108 			return CnvShiftJis::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, 
   109 					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
   110 
   111 		case EIso2022jp1: 
   112 			aState = EIso2022jp1;
   113 			return CnvJisBase::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState,
   114 					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
   115 
   116 		case EEucjp: 
   117 			aState = EEucjp;
   118 			return ConvertEEucjpToUnicode(
   119 					aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState,
   120 					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);	
   121 
   122 		case EUcs2:
   123 			aState = EUcs2;
   124 			return ConvertUcs2ToUnicode( aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, 
   125 					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
   126 
   127 		case EUtf8: 
   128 			aState = EUtf8;
   129 			return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
   130 			
   131 		default:
   132 			// fall though to the default, which is decode as UTF8
   133 			aState = EUnknown;
   134 			break;
   135 		}
   136 
   137 	// decode as UTF8
   138 	return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
   139 	}
   140 
   141 /**
   142  This API is used by CCnvCharacterSetConverter::AutoDetectCharacterSetL(). 
   143  This method returns a value between 0 and 100, indicating how likely it 
   144  is that this is the correct converter, for the text supplied.  As J5 is 
   145  NOT intended to be used with the existing auto-detect mechanism, it will 
   146  always return 0
   147  @internalTechnology 
   148  */
   149 TBool CJ5Converter::IsInThisCharacterSetL(
   150 		TBool& aSetToTrue, 
   151 		TInt& aConfidenceLevel, 
   152 		const TDesC8& /* aSample */)
   153 	{
   154   	/*
   155   	aSetToTrue - This value should be set to ETrue. It is used to indicate to 
   156   	CCnvCharacterSetConverter::AutoDetectCharacterSetL() that the plug-in DLL 
   157   	is implementing a function of this signature and is therefore not the empty 
   158   	*/
   159   	aSetToTrue=ETrue;
   160   	
   161  	/* no need to look at the sample as this always returns 0 
   162  	   as the autodetect feature is not supported by the J5 plug-in
   163  	*/
   164  	aConfidenceLevel=0;
   165 	return ETrue;
   166 	}
   167 
   168 CJ5Converter* CJ5Converter::NewL()
   169 	{
   170 	CJ5Converter* self = new(ELeave) CJ5Converter();
   171     CleanupStack::PushL(self);
   172     self->ConstructL();
   173     CleanupStack::Pop(self);	
   174 	return self;
   175 	}
   176 
   177 CJ5Converter::~CJ5Converter()
   178 	{
   179     FeatureManager::UnInitializeLib();	
   180 	}
   181 
   182 CJ5Converter::CJ5Converter()
   183 	{
   184 	}
   185 
   186 void CJ5Converter::ConstructL()
   187     {
   188     FeatureManager::InitializeLibL();
   189     }
   190 
   191 const TImplementationProxy ImplementationTable[] = 
   192 	{
   193 #ifdef KDDIAU_TEST
   194 		// for the test build use a special test UID
   195 		IMPLEMENTATION_PROXY_ENTRY(0x01000002,	CJ5Converter::NewL)
   196 #else
   197 		IMPLEMENTATION_PROXY_ENTRY(KCharacterSetIdentifierJ5,	CJ5Converter::NewL)
   198 #endif
   199 	};
   200 
   201 EXPORT_C const TImplementationProxy* ImplementationGroupProxy(TInt& aTableCount)
   202 	{
   203 	aTableCount = sizeof(ImplementationTable) / sizeof(TImplementationProxy);
   204 
   205 	return ImplementationTable;
   206 	}
   207 	
   208 /**
   209  DetectEncoding determine the characterset encoding.
   210  The logic for this detection is based on the information in CJKV by Ken Lunde.
   211  A detailed diagram of this logic is in the J5 how to document section 2.4
   212  @return The detected character set as a enum CJ5Converter.
   213  @internalTechnology 
   214  */
   215 enum CJ5Converter::TJ5Encoding CJ5Converter::DetectEncoding(
   216 		CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters , 
   217 		const TDesC8& aForeign)
   218 	{
   219 	
   220 	// first check for UCS2
   221 	CCnvCharacterSetConverter::TEndianness ucs2Endianness = CCnvCharacterSetConverter::ELittleEndian;
   222 	if ( DetectUcs2(aForeign, ucs2Endianness ))
   223 		{
   224 		// if ucs2 is detected pass back the detected endianess
   225 		aDefaultEndiannessOfForeignCharacters = ucs2Endianness;
   226 		return EUcs2;
   227 		}
   228 
   229 	// next try EUC_JP
   230 	TInt eucJpValidBytes = 0;
   231 	CJ5Converter::TDectectCharacterSet result = DetectEucJp( aForeign, eucJpValidBytes );
   232 	if ( result == EIsCharacterSet )
   233 		{
   234 		return EEucjp;
   235 		}
   236 		
   237 	// next try Iso 2020JP
   238 	if ( DetectIso2022( aForeign ) == EIsCharacterSet )
   239 		{
   240 		return EIso2022jp1;
   241 		}
   242 		
   243 	// next try Utf8
   244 	if ( DetectUtf8( aForeign ) == EIsCharacterSet )
   245 		{
   246 		return EUtf8;
   247 		}
   248 		
   249 	// shiftjis
   250 	TInt shiftjisValidBytes = 0;
   251 	result = DetectShiftJis( aForeign, shiftjisValidBytes );
   252 	if ( result == EIsCharacterSet )
   253 		{
   254 		return EShiftjis;
   255 		}
   256 		
   257 	// no clear winner so go for the best 
   258 	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);
   259 		
   260 	// if more than half is shiftjis and more shiftjis than EUC_JP, 	
   261 	if ((shiftjisValidBytes >  eucJpValidBytes ) && (shiftjisValidBytes * 2> sampleLength))
   262 		return EShiftjis;
   263 
   264 	// if more than half is EUC_JP and more EUC_JP than shiftjis, 	
   265 	if ((eucJpValidBytes >  shiftjisValidBytes ) && (eucJpValidBytes * 2> sampleLength))
   266 		return EEucjp;
   267 			
   268 	// return the default
   269 	return EUcs2;
   270 	}
   271 	
   272 	
   273 /**
   274  Check if UCS2.
   275  If the first two bytes are the Unicode Endian Specifiers (0xfffe or 0xfeff)
   276  then this must be UCS2. Otherwise try lookiing for  0x**00 or 0x00**
   277  @param A sample of data to be checked
   278  @param The Endianness if USC2 is detected
   279  @return ETrue if UCS2 else EFalse
   280  @internalTechnology 
   281  */
   282 TBool CJ5Converter::DetectUcs2( const TDesC8& aForeign, 
   283 	CCnvCharacterSetConverter::TEndianness& aTEndianness )
   284 	{
   285 	// if the sample is not big enough
   286 	if (aForeign.Length() < 2)
   287 		{
   288 		return EFalse;
   289 		}
   290 	else if (aForeign[0]==0xff && aForeign[1]==0xfe )
   291 		{ 
   292 		// we have found a Little Endian Byte order mark
   293 		aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
   294 		return ETrue;
   295 		}
   296 	else if (aForeign[0]==0xfe && aForeign[1]==0xff )
   297 		{ 
   298 		// we have found a Big Endian Byte order mark 
   299 		aTEndianness = CCnvCharacterSetConverter::EBigEndian;
   300 		return ETrue;
   301 		}
   302 
   303 	// Next check for sequences of 0x**00 or 0x00** as UCS-2 is the only charset that 
   304 	// specifies 0x**00 or 0x00** (according to endianness) for the ASCII range of characters. 
   305 	// NB: This will fail if there are no ASCII characters in the text.
   306 	TInt sampleLength = aForeign.Length();
   307 	sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
   308 
   309 	// check the sample for sequences of 0x**00	or 0x00**
   310 	TInt bigEndianConfidence = 0;
   311 	TInt littleEndianConfidence = 0;
   312 	TInt i=0;
   313 	for(;i< (sampleLength-1); i+=2)
   314 		{
   315 		if( aForeign[i] == 0x00)
   316 			{
   317 			bigEndianConfidence +=2;
   318 			}
   319 		else if ( aForeign[i+1] == 0x00)
   320 			{
   321 	 		littleEndianConfidence +=2;
   322 			}
   323 		}
   324 
   325 	// which occurs most BE or LE	
   326 	TInt confidenceLevel = 0;
   327 	if (bigEndianConfidence > littleEndianConfidence)
   328 		{
   329 		aTEndianness = CCnvCharacterSetConverter::EBigEndian;
   330 		confidenceLevel = bigEndianConfidence;
   331 		}
   332 	else
   333 		{
   334 		aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
   335 		confidenceLevel = littleEndianConfidence;
   336 		}
   337 		
   338 	// if more than 97% count as UCS2
   339 	if ( confidenceLevel * 100/sampleLength > 97) 
   340 		return ETrue;
   341 
   342 	return EFalse;
   343 	}	
   344 
   345 /**
   346  Check if ShiftJis (reference CJKV by Ken Lunde page 175)
   347  @param A sample of data to be checked
   348  @param The number of input bytes that can be converted
   349  @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
   350  @internalTechnology 
   351  */
   352 enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectShiftJis( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
   353 	{
   354 	// Get the sample length
   355 	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
   356 
   357 	TInt i=0;
   358 	aNumberOfBytesConverted = 0;
   359 	
   360 	TText8 character;
   361 	TText8 characterPlus1;
   362 	TText8 characterPlus2;
   363 	
   364 	// scan the sample text looking for valid shiftjis data
   365 	while ( i < sampleLength )
   366 		{
   367 		// get the next few characters, use 0 if there is no more sample
   368 		// as this will not match any test.
   369 		character = aForeign[i];
   370 		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
   371 		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
   372 
   373 		// SHIFTJIS	- 0x8e to 0x9f followed by 0x40 to 0xfc  
   374 		if ((character >= 0x81) && (character <= 0x9f) &&
   375 				(characterPlus1 >= 0x40) && (characterPlus1 <= 0xfc) ) 
   376 			{
   377 			// this is SHIFTJIS unless it is EUC JP code set 2 or 3
   378 			if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF))
   379 				{
   380 				// this could be EUC JP code set 2 (or shiftjis)
   381 				aNumberOfBytesConverted+=2;
   382 				i++;
   383 				}
   384 			else if ((character == 0x8F) && 
   385 				(characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) &&
   386 					(characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
   387 				{
   388 				// this could be EUC JP code set 3 (or shiftjis)
   389 				aNumberOfBytesConverted+=3;
   390 				i+=2;
   391 				}
   392 			else
   393 				{
   394 				// this can only be shift jis 
   395 				return EIsCharacterSet;
   396 				}
   397 			}
   398 			
   399 		// SHIFTJIS	- 0xE0 to 0xEF followed by .....
   400 		else if ((character >= 0xE0) && (character <= 0xEF))
   401 			{
   402 			// 0x40 to 0xFC which overlaps UTF8 between 0x80 and 0xBF  
   403 			// including Mopera extension to shiftjis from 0xEF80 to 0xEFFC
   404 			
   405 			if ( (characterPlus1 >= 0x40) && (characterPlus1 <= 0x7E) ) 
   406 				{
   407 				// this can only be shift jis 
   408 				return EIsCharacterSet;
   409 				}
   410 			else if ( (characterPlus1 >= 0xC0) && (characterPlus1 <= 0xFC) ) 
   411 				{
   412 				// this could be EUC JP code set 1
   413 				aNumberOfBytesConverted+=2;
   414 				i++;
   415 				}
   416 				
   417 			// problem here is the overlap between the UTF8 and shiftjis
   418 			else if ( (characterPlus1 >= 0x80) && (characterPlus1 <= 0xBF) )
   419 				{
   420 				// this could be shiftjis or utf8
   421 				aNumberOfBytesConverted+=2;
   422 				i++;
   423 				}		
   424 			}
   425 		// half width katakana A1-DF	
   426 		else if ((character >= 0xA1) && (character <= 0xDF))
   427 			{
   428 			aNumberOfBytesConverted+=1;
   429 			}
   430 		// ASCII or JIS-Roman 20-7e	
   431 		else if ( ((character >= 0x20) && (character <= 0x7E)) || (character == 0x0A) || (character == 0x0D))
   432 			{
   433 			aNumberOfBytesConverted+=1;
   434 			}
   435 		else
   436 			{
   437 			// This is not decoding as shiftjis, so reject
   438 			aNumberOfBytesConverted =0;
   439 			return EIsNotCharacterSet;
   440 			}
   441 		i++;
   442 		}
   443 
   444 	// if all the characters could be converted
   445 	if (aNumberOfBytesConverted == sampleLength)
   446 		{
   447 		return EIsCharacterSet;
   448 		}
   449 	else if (aNumberOfBytesConverted == 0)
   450 		{
   451 		return EIsNotCharacterSet;
   452 		}
   453 	else
   454 		{
   455 		return EMaybeCharacterSet;
   456 		}
   457 	}
   458 	
   459 /**
   460  Check if UTF8 (reference CJKV by Ken Lunde page 189)
   461  @param A sample of data to be checked
   462  @param The number of input bytes that can be converted
   463  @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
   464  @internalTechnology 
   465  */
   466 enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectUtf8( const TDesC8& aForeign )
   467 	{
   468 	// Get the sample length
   469 	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
   470 
   471 	TInt i=0;	
   472 	TText8 character;
   473 	TText8 characterPlus1;
   474 	TText8 characterPlus2;
   475 	TText8 characterPlus3;
   476 	
   477 	// scan the sample text looking for valid UTF8
   478 	while ( i < sampleLength )
   479 		{
   480 		// get the next few characters, use 0 if there is no more sample
   481 		// as this will not match any test.
   482 		character = aForeign[i];
   483 		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
   484 		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
   485 		characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0);
   486 
   487 		// UTF8 range 110xxxxx followed by one valid UTF8 bytes
   488 		if(((character & 0xe0)==0xc0) && (( characterPlus1 & 0xc0)==0x80) )
   489 			{
   490 			// two bytes of valid UTF8 found
   491 			i+=2;
   492 			}
   493 		// UTF8 range 1110xxxx followed by two valid UTF8 bytes
   494 		else if(((character & 0xf0)==0xe0) && (( characterPlus1 & 0xc0)==0x80) && (( characterPlus2 & 0xc0)==0x80))
   495 			{
   496 			// three bytes of valid UTF8 found
   497 			i+=3;
   498 			}
   499 		// UTF8 range 11110xxx followed by three valid UTF8 bytes
   500 		else if(((character & 0xf8)==0xf0) && (( characterPlus1 & 0xc0)==0x80) 
   501 				&& (( characterPlus2 & 0xc0)==0x80) && (( characterPlus3 & 0xc0)==0x80) )
   502 			{
   503 			// four bytes of valid UTF8 found
   504 			i+=4;
   505 			}
   506 		
   507 		// ascii range 0 to 0x7F	
   508 		else if((character & 0x80)==0x00)
   509 			{
   510 			// The value of character is in the range 0x00-0x7f
   511 			// UTF8 maintains ASCII transparency. So it's a valid UTF8.
   512 			i++;
   513 			}
   514 		// if the sample data is longer than KMaxSizeAutoDetectSample then except anything
   515 		// for the last two bytes as they may not appear valid without more data	
   516 		else if( i >= (KMaxSizeAutoDetectSample -2) )
   517 			{
   518 			i++;
   519 			}
   520 		else
   521 			{
   522 			// This is not decoding as UTF8 so reject
   523 			return EIsNotCharacterSet;
   524 			}
   525 		}	
   526 	
   527 	// All the characters could be converted
   528 	return EIsCharacterSet;
   529 	
   530 	}
   531 
   532 
   533 /**
   534  Check if ISO2022JP by lookiing for the escape sequences.
   535  @param A sample of data to be checked
   536  @param The number of input bytes that can be converted
   537  @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
   538  @internalTechnology 
   539  */
   540 enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectIso2022( const TDesC8& aForeign )
   541 	{
   542 	// Get the sample length
   543 	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
   544 
   545 	TInt i=0;
   546 	TText8 character;
   547 	TText8 characterPlus1;
   548 	TText8 characterPlus2;
   549 	TText8 characterPlus3;
   550 	TText8 characterPlus4;
   551 	TText8 characterPlus5;
   552 	
   553 	// scan the sample text looking for valid UTF8
   554 	while ( i < sampleLength )
   555 		{
   556 		// get the next few characters, use 0 if there is no more sample
   557 		// as this will not match any test.
   558 		character = aForeign[i];
   559 		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
   560 		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
   561 		characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0);
   562 
   563 
   564 		// check for the JIS escape sequences of ISO 2022Jp
   565 		// These values have been taken from JISBASE_SHARED
   566 		if (character == KEscape)
   567 			{
   568 			// Escape Sequence For Jis C6226_1978 \x1b\x24\x40
   569 			if ((characterPlus1 == 0x24) && (characterPlus2 == 0x40))
   570 				{
   571 				return EIsCharacterSet;
   572 				}
   573 				
   574 			// Escape Sequence For Jis X0208_1983 \x1b\x24\x42
   575 			else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x42))
   576 				{
   577 				return EIsCharacterSet;
   578 				}
   579 			
   580 			// Escape Sequence For Jis Roman \x1b\x28\x4a
   581 			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x4A))
   582 				{
   583 				return EIsCharacterSet;
   584 				}
   585 				
   586 			// Escape Sequence For Jis RomanIncorrect \x1b\x28\x48
   587 			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x48))
   588 				{
   589 				return EIsCharacterSet;
   590 				}
   591 
   592 			// Escape Sequence For Ascii \x1b\x28\x42
   593 			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x42))
   594 				{
   595 				return EIsCharacterSet;
   596 				}
   597 				
   598 			// Escape Sequence For EscapeSequenceForHalfWidthKatakana \x1b\x28\x49
   599 			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x49))
   600 				{
   601 				return EIsCharacterSet;
   602 				}
   603 				
   604 			// Escape Sequence For Jis X0208_199x \x1b\x26\x40\x1b\x24\x42
   605 			else if ((characterPlus1 == 0x26) && (characterPlus2 == 0x40))
   606 				{
   607 				characterPlus4 = ( i < (sampleLength-4) ? aForeign[i+4]:0);
   608 				characterPlus5 = ( i < (sampleLength-5) ? aForeign[i+5]:0);
   609 
   610 				if ((characterPlus3 == 0x1b) && (characterPlus4 == 0x24) && (characterPlus5 == 0x42))
   611 					{
   612 					return EIsCharacterSet;
   613 					}
   614 				}
   615 			// Escape Sequence For Jis X0212_1990 \x1b\x24\x28\x44
   616 			else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x28)) 
   617 				{
   618 				if (characterPlus3 == 0x44)
   619 					{
   620 					return EIsCharacterSet;
   621 					}
   622 				}
   623 				
   624 			// check for the JIS escape sequences of ISO 2022Jp "B@" x42 x40
   625 			else if ((characterPlus1 == 'B') || (characterPlus1 == '@'))
   626 				{
   627 				return EIsCharacterSet;
   628 				}
   629 				
   630 			} // end of if ( character == KEscape )
   631 
   632 		i++;
   633 		}	
   634 
   635 	// if escape sequences have been found then this is not ISO2022
   636 	return EIsNotCharacterSet;
   637 	
   638 	}
   639 
   640 
   641 /**
   642  Check if EUC JP (reference CJKV by Ken Lunde page 164)
   643  @param A sample of data to be checked
   644  @param The number of input bytes that can be converted
   645  @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
   646  @internalTechnology 
   647  */
   648 CJ5Converter::TDectectCharacterSet CJ5Converter::DetectEucJp( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
   649 	{
   650 	// Get the sample length
   651 	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
   652 
   653 	TInt i=0;
   654 	aNumberOfBytesConverted = 0;
   655 	
   656 	TText8 character;
   657 	TText8 characterPlus1;
   658 	TText8 characterPlus2;
   659 	
   660 	// scan the sample text looking for valid shiftjis data
   661 	while ( i < sampleLength )
   662 		{
   663 		// get the next few characters, use 0 if there is no more sample
   664 		// as this will not match any test.
   665 		character = aForeign[i];
   666 		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
   667 		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
   668 
   669 		// EUCJP code set 0 0x21-0x7e
   670 		if ( (character >= 0x21) && (character <= 0x7e))
   671 			{
   672 			aNumberOfBytesConverted++;
   673 			}
   674 		else if ( (character == 0x0a) || (character == 0x0d))
   675 			{
   676 			aNumberOfBytesConverted++;
   677 			}
   678 		// EUCJP code set 1
   679 		else if ( (character >= 0xa1) && (character <= 0xff)
   680 				&& (characterPlus1 >= 0xa1) && (characterPlus1 <= 0xff) ) 
   681 			{
   682 			aNumberOfBytesConverted+=2;
   683 			i++;
   684 			}
   685 		 		
   686 		// EUC JP code set 2, starts with the EUC JP SS2 character (0x8E)
   687 		// and is followed by character in range 0xA1- 0xDF
   688 		else if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) ) 
   689 			{
   690 			// this could be 2 bytes of EUC JP code set 2
   691 			aNumberOfBytesConverted += 2;
   692 			i++;
   693 			}
   694 		// EUC JP code set 3, starts with the EUC JP SS3 character (0x8F)
   695 		// and is followed by two characters in range A1- DF A1 -FE
   696 		else if ((character == 0x8F) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) 
   697 				&& (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
   698 			{
   699 			// this could be 3 bytes of EUC JP code set 3
   700 			aNumberOfBytesConverted += 3;
   701 			i+=2;
   702 			}		
   703 		else
   704 			{
   705 			// This is not a valid decoding as EUC JP so reject
   706 			return EIsNotCharacterSet;
   707 			}
   708 		i++;
   709 		}	
   710 	
   711 	
   712 	// if all the characters could be converted
   713 	if (aNumberOfBytesConverted == sampleLength)
   714 		{
   715 		return EIsCharacterSet;
   716 		}
   717 	else if (aNumberOfBytesConverted == 0)
   718 		{
   719 		return EIsNotCharacterSet;
   720 		}
   721 	else
   722 		{
   723 		return EMaybeCharacterSet;
   724 		}
   725 	}
   726 
   727 			
   728 /**
   729  Convert from UCS2 (Universal Character Set containing two bytes) to unicode
   730  Remove any byte order marks in the UCSs.
   731  @param aUnicode Contains the converted text in the Unicode character set.
   732  @param	aForeign The non-Unicode source text to be converted
   733  @param aNumberOfUnconvertibleCharacters Contains the number of bytes which were not converted. 
   734  @param aIndexOfFirstByteOfFirstUnconvertibleCharacter The index of the first byte of the first unconvertible character.
   735  @return the number of bytes converted
   736  @internalTechnology 
   737  */
   738  TInt CJ5Converter::ConvertUcs2ToUnicode(CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters, 
   739 						   TDes16& aUnicode,	 
   740 						   const TDesC8& aForeign, 
   741 						   TInt& aNumberOfUnconvertibleCharacters,  
   742 						   TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) 
   743 
   744 	{
   745 	TInt numberOfBytesConverted = 0;
   746 	TInt numberOfUnicodeCharacters =0;
   747 	TChar nextChar;
   748 
   749 	// start at begining of the output buffer provided
   750 	aUnicode.Zero();
   751 
   752 	// while there is at least 2 bytes of data to convert and space in the output buffer
   753 	while ( (numberOfBytesConverted+1 < aForeign.Size()) && (numberOfUnicodeCharacters < aUnicode.MaxLength()) )
   754 		{
   755 		if (aDefaultEndiannessOfForeignCharacters == CCnvCharacterSetConverter::ELittleEndian )
   756 			{
   757 			// ELittleEndian 0x??00
   758 			nextChar = aForeign[numberOfBytesConverted] + ( aForeign[numberOfBytesConverted+1] << 8);
   759 			}
   760 		else
   761 			{
   762 			// EBigEndian 0x00??
   763 			nextChar = ( aForeign[numberOfBytesConverted] <<8 ) + aForeign[numberOfBytesConverted+1];
   764 			}
   765 			
   766 		// save the unicode character extracted	unless it's a BOM
   767 		if ( nextChar != KByteOrderMark )
   768 			{
   769 			aUnicode.Append( nextChar );
   770 			numberOfUnicodeCharacters++;	
   771 			}
   772 			
   773 		numberOfBytesConverted+=2;
   774 		}
   775 	
   776 	// there are no uncovertable characters with UCS2, but there could be
   777 	aNumberOfUnconvertibleCharacters = 0;
   778 	// a negative value indicates that all characters converted
   779 	aIndexOfFirstByteOfFirstUnconvertibleCharacter = -1;
   780 				
   781 	// returns the number of unconverted bytes left at the end of the input descriptor 
   782 	// Note there could be 1 byte left over if an odd number of bytes provided for conversion
   783 	return aForeign.Size() - numberOfBytesConverted;
   784 	}
   785 		
   786 /**
   787  Convert from EUC_JP (Extended Unix Code encoding for Japanese)
   788  Using the standard Charconv method of an array of methods
   789  @return the number of bytes converted
   790  @internalTechnology 
   791  */
   792  TInt CJ5Converter::ConvertEEucjpToUnicode(
   793 		CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, 
   794 		TDes16& aUnicode, 
   795 		const TDesC8& aForeign, 
   796 		TInt& /*aState*/, 
   797 		TInt& aNumberOfUnconvertibleCharacters, 
   798 		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
   799 	{
   800 	TFixedArray<CnvUtilities::SMethod, 4> methods;
   801 	methods[0].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisRoman;
   802 	methods[0].iConvertToIntermediateBufferInPlace=DummyConvertToIntermediateBufferInPlace;
   803 	methods[0].iConversionData=&CnvJisRoman::ConversionData();
   804 	methods[0].iNumberOfBytesPerCharacter=1;
   805 	methods[0].iNumberOfCoreBytesPerCharacter=1;
   806 	methods[1].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0208;
   807 	methods[1].iConvertToIntermediateBufferInPlace=ConvertToJisX0208FromEucJpPackedInPlace;
   808 	methods[1].iConversionData=&CnvJisX0208::ConversionData();
   809 	methods[1].iNumberOfBytesPerCharacter=2;
   810 	methods[1].iNumberOfCoreBytesPerCharacter=2;
   811 	methods[2].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToHalfWidthKatakana8;
   812 	methods[2].iConvertToIntermediateBufferInPlace=ConvertToHalfWidthKatakana8FromEucJpPackedInPlace;
   813 	methods[2].iConversionData=&CnvHalfWidthKatakana8::ConversionData();
   814 	methods[2].iNumberOfBytesPerCharacter=2;
   815 	methods[2].iNumberOfCoreBytesPerCharacter=1;
   816 	methods[3].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0212;
   817 	methods[3].iConvertToIntermediateBufferInPlace=ConvertToJisX0212FromEucJpPackedInPlace;
   818 	methods[3].iConversionData=&CnvJisX0212::ConversionData();
   819 	methods[3].iNumberOfBytesPerCharacter=3;
   820 	methods[3].iNumberOfCoreBytesPerCharacter=2;
   821 	return CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, methods.Array());
   822 	}		
   823