1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/textandloc/charconvfw/charconvplugins/src/plugins/j5.cpp	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,823 @@
     1.4 +/*
     1.5 +* Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.6 +* All rights reserved.
     1.7 +* This component and the accompanying materials are made available
     1.8 +* under the terms of "Eclipse Public License v1.0"
     1.9 +* which accompanies this distribution, and is available
    1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.11 +*
    1.12 +* Initial Contributors:
    1.13 +* Nokia Corporation - initial contribution.
    1.14 +*
    1.15 +* Contributors:
    1.16 +*
    1.17 +* Description: 
    1.18 +* J5 charconv character converter
    1.19 +*
    1.20 +*/
    1.21 +
    1.22 +
    1.23 +#include <e32std.h>
    1.24 +#include <charconv.h>
    1.25 +#include <ecom/implementationproxy.h>
    1.26 +#include <utf.h>
    1.27 +#include <charactersetconverter.h>
    1.28 +#include <convutils.h>
    1.29 +#include "shiftjis.h"
    1.30 +#include "jisbase.h"
    1.31 +#include "j5.h"
    1.32 +
    1.33 +#include "jisx0201.h"
    1.34 +#include "jisx0208.h"
    1.35 +#include "jisx0212.h"
    1.36 +
    1.37 +#include "featmgr/featmgr.h"
    1.38 +
    1.39 +/**
    1.40 + J5 will use up to KMaxSizeAutoDetectSample to try to deterine the format of data.
    1.41 + */
    1.42 +const TInt KMaxSizeAutoDetectSample = 1000;
    1.43 +
    1.44 +const TUint8 KEscape = 0x1b;
    1.45 +const TInt KByteOrderMark = 0xfeff;
    1.46 +
    1.47 +const TDesC8& CJ5Converter::ReplacementForUnconvertibleUnicodeCharacters()
    1.48 +	{
    1.49 +	return CnvShiftJis::ReplacementForUnconvertibleUnicodeCharacters();
    1.50 +	}
    1.51 +
    1.52 +/**
    1.53 + This API should not be used as it is ambiguous as to what encoding is required.  
    1.54 + The user should instead call the specific plug-in for the appropriate conversion.
    1.55 + J5 ConvertFromUnicode() will convert to UTF8 as default.
    1.56 +@internalTechnology 
    1.57 + */
    1.58 +TInt CJ5Converter::ConvertFromUnicode(
    1.59 +		CCnvCharacterSetConverter::TEndianness /* aDefaultEndiannessOfForeignCharacters */, 
    1.60 +		const TDesC8& /* aReplacementForUnconvertibleUnicodeCharacters */, 
    1.61 +		TDes8& aForeign, 
    1.62 +		const TDesC16& aUnicode, 
    1.63 +		CCnvCharacterSetConverter::TArrayOfAscendingIndices& /* aIndicesOfUnconvertibleCharacters */)
    1.64 +	{
    1.65 +	return CnvUtfConverter::ConvertFromUnicodeToUtf8(aForeign, aUnicode);
    1.66 +	}
    1.67 +
    1.68 +/**
    1.69 + This will automatically determine one of the five supported encodings 
    1.70 + to use and convert accordingly.  This plugin method is available to the 
    1.71 + user though the CCnvCharacterSetConverter::ConvertToUnicode() method.  
    1.72 + There is no way for the caller to determine which encoding has been used.
    1.73 + 
    1.74 + NOTE: For debugging the selected character set is returned in the state.
    1.75 + 
    1.76 +  @released  9.1
    1.77 +  @param     aDefaultEndiannessOfForeignCharacters The default endian-ness to use when reading characters
    1.78 +             in the foreign character set.
    1.79 +  @param     aUnicode On return, contains the text converted into Unicode.
    1.80 +  @param     aForeign The non-Unicode source text to be converted.
    1.81 +  @param     aState Used to save state information across multiple calls
    1.82 +             to <code>ConvertToUnicode()</code>.
    1.83 +  @param     aNumberOfUnconvertibleCharacters On return, contains the number of bytes which were not
    1.84 +             converted.
    1.85 +  @param     aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, contains the index of the first bytein the
    1.86 +             input text that could not be converted. A negative
    1.87 +             value indicates that all the characters were
    1.88 +             converted.
    1.89 +  @return 	 The number of unconverted bytes left at the end of the input descriptor 
    1.90 + 		     (e.g. because the output descriptor is not long enough to hold all the text), 
    1.91 + 		     or one of the error values defined in TError. 
    1.92 +  @internalTechnology 
    1.93 +*/
    1.94 +TInt CJ5Converter::ConvertToUnicode(
    1.95 +		CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, 
    1.96 +		TDes16& aUnicode, 
    1.97 +		const TDesC8& aForeign, 
    1.98 +		TInt& aState, 
    1.99 +		TInt& aNumberOfUnconvertibleCharacters, 
   1.100 +		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
   1.101 +	{
   1.102 +	// As the aState parameter is used to pass back the detected value
   1.103 +	// use a "hidden" internal state variable.
   1.104 +	TInt internalState = CCnvCharacterSetConverter::KStateDefault;
   1.105 +	
   1.106 +	// determine the encoding type and then decode appropriatly
   1.107 +	switch ( DetectEncoding(aDefaultEndiannessOfForeignCharacters, aForeign))
   1.108 +		{
   1.109 +		case EShiftjis:
   1.110 +			aState = EShiftjis;
   1.111 +			return CnvShiftJis::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, 
   1.112 +					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
   1.113 +
   1.114 +		case EIso2022jp1: 
   1.115 +			aState = EIso2022jp1;
   1.116 +			return CnvJisBase::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState,
   1.117 +					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
   1.118 +
   1.119 +		case EEucjp: 
   1.120 +			aState = EEucjp;
   1.121 +			return ConvertEEucjpToUnicode(
   1.122 +					aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState,
   1.123 +					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);	
   1.124 +
   1.125 +		case EUcs2:
   1.126 +			aState = EUcs2;
   1.127 +			return ConvertUcs2ToUnicode( aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, 
   1.128 +					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
   1.129 +
   1.130 +		case EUtf8: 
   1.131 +			aState = EUtf8;
   1.132 +			return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
   1.133 +			
   1.134 +		default:
   1.135 +			// fall though to the default, which is decode as UTF8
   1.136 +			aState = EUnknown;
   1.137 +			break;
   1.138 +		}
   1.139 +
   1.140 +	// decode as UTF8
   1.141 +	return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
   1.142 +	}
   1.143 +
   1.144 +/**
   1.145 + This API is used by CCnvCharacterSetConverter::AutoDetectCharacterSetL(). 
   1.146 + This method returns a value between 0 and 100, indicating how likely it 
   1.147 + is that this is the correct converter, for the text supplied.  As J5 is 
   1.148 + NOT intended to be used with the existing auto-detect mechanism, it will 
   1.149 + always return 0
   1.150 + @internalTechnology 
   1.151 + */
   1.152 +TBool CJ5Converter::IsInThisCharacterSetL(
   1.153 +		TBool& aSetToTrue, 
   1.154 +		TInt& aConfidenceLevel, 
   1.155 +		const TDesC8& /* aSample */)
   1.156 +	{
   1.157 +  	/*
   1.158 +  	aSetToTrue - This value should be set to ETrue. It is used to indicate to 
   1.159 +  	CCnvCharacterSetConverter::AutoDetectCharacterSetL() that the plug-in DLL 
   1.160 +  	is implementing a function of this signature and is therefore not the empty 
   1.161 +  	*/
   1.162 +  	aSetToTrue=ETrue;
   1.163 +  	
   1.164 + 	/* no need to look at the sample as this always returns 0 
   1.165 + 	   as the autodetect feature is not supported by the J5 plug-in
   1.166 + 	*/
   1.167 + 	aConfidenceLevel=0;
   1.168 +	return ETrue;
   1.169 +	}
   1.170 +
   1.171 +CJ5Converter* CJ5Converter::NewL()
   1.172 +	{
   1.173 +	CJ5Converter* self = new(ELeave) CJ5Converter();
   1.174 +    CleanupStack::PushL(self);
   1.175 +    self->ConstructL();
   1.176 +    CleanupStack::Pop(self);	
   1.177 +	return self;
   1.178 +	}
   1.179 +
   1.180 +CJ5Converter::~CJ5Converter()
   1.181 +	{
   1.182 +    FeatureManager::UnInitializeLib();	
   1.183 +	}
   1.184 +
   1.185 +CJ5Converter::CJ5Converter()
   1.186 +	{
   1.187 +	}
   1.188 +
   1.189 +void CJ5Converter::ConstructL()
   1.190 +    {
   1.191 +    FeatureManager::InitializeLibL();
   1.192 +    }
   1.193 +
   1.194 +const TImplementationProxy ImplementationTable[] = 
   1.195 +	{
   1.196 +#ifdef KDDIAU_TEST
   1.197 +		// for the test build use a special test UID
   1.198 +		IMPLEMENTATION_PROXY_ENTRY(0x01000002,	CJ5Converter::NewL)
   1.199 +#else
   1.200 +		IMPLEMENTATION_PROXY_ENTRY(KCharacterSetIdentifierJ5,	CJ5Converter::NewL)
   1.201 +#endif
   1.202 +	};
   1.203 +
   1.204 +EXPORT_C const TImplementationProxy* ImplementationGroupProxy(TInt& aTableCount)
   1.205 +	{
   1.206 +	aTableCount = sizeof(ImplementationTable) / sizeof(TImplementationProxy);
   1.207 +
   1.208 +	return ImplementationTable;
   1.209 +	}
   1.210 +	
   1.211 +/**
   1.212 + DetectEncoding determine the characterset encoding.
   1.213 + The logic for this detection is based on the information in CJKV by Ken Lunde.
   1.214 + A detailed diagram of this logic is in the J5 how to document section 2.4
   1.215 + @return The detected character set as a enum CJ5Converter.
   1.216 + @internalTechnology 
   1.217 + */
   1.218 +enum CJ5Converter::TJ5Encoding CJ5Converter::DetectEncoding(
   1.219 +		CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters , 
   1.220 +		const TDesC8& aForeign)
   1.221 +	{
   1.222 +	
   1.223 +	// first check for UCS2
   1.224 +	CCnvCharacterSetConverter::TEndianness ucs2Endianness = CCnvCharacterSetConverter::ELittleEndian;
   1.225 +	if ( DetectUcs2(aForeign, ucs2Endianness ))
   1.226 +		{
   1.227 +		// if ucs2 is detected pass back the detected endianess
   1.228 +		aDefaultEndiannessOfForeignCharacters = ucs2Endianness;
   1.229 +		return EUcs2;
   1.230 +		}
   1.231 +
   1.232 +	// next try EUC_JP
   1.233 +	TInt eucJpValidBytes = 0;
   1.234 +	CJ5Converter::TDectectCharacterSet result = DetectEucJp( aForeign, eucJpValidBytes );
   1.235 +	if ( result == EIsCharacterSet )
   1.236 +		{
   1.237 +		return EEucjp;
   1.238 +		}
   1.239 +		
   1.240 +	// next try Iso 2020JP
   1.241 +	if ( DetectIso2022( aForeign ) == EIsCharacterSet )
   1.242 +		{
   1.243 +		return EIso2022jp1;
   1.244 +		}
   1.245 +		
   1.246 +	// next try Utf8
   1.247 +	if ( DetectUtf8( aForeign ) == EIsCharacterSet )
   1.248 +		{
   1.249 +		return EUtf8;
   1.250 +		}
   1.251 +		
   1.252 +	// shiftjis
   1.253 +	TInt shiftjisValidBytes = 0;
   1.254 +	result = DetectShiftJis( aForeign, shiftjisValidBytes );
   1.255 +	if ( result == EIsCharacterSet )
   1.256 +		{
   1.257 +		return EShiftjis;
   1.258 +		}
   1.259 +		
   1.260 +	// no clear winner so go for the best 
   1.261 +	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);
   1.262 +		
   1.263 +	// if more than half is shiftjis and more shiftjis than EUC_JP, 	
   1.264 +	if ((shiftjisValidBytes >  eucJpValidBytes ) && (shiftjisValidBytes * 2> sampleLength))
   1.265 +		return EShiftjis;
   1.266 +
   1.267 +	// if more than half is EUC_JP and more EUC_JP than shiftjis, 	
   1.268 +	if ((eucJpValidBytes >  shiftjisValidBytes ) && (eucJpValidBytes * 2> sampleLength))
   1.269 +		return EEucjp;
   1.270 +			
   1.271 +	// return the default
   1.272 +	return EUcs2;
   1.273 +	}
   1.274 +	
   1.275 +	
   1.276 +/**
   1.277 + Check if UCS2.
   1.278 + If the first two bytes are the Unicode Endian Specifiers (0xfffe or 0xfeff)
   1.279 + then this must be UCS2. Otherwise try lookiing for  0x**00 or 0x00**
   1.280 + @param A sample of data to be checked
   1.281 + @param The Endianness if USC2 is detected
   1.282 + @return ETrue if UCS2 else EFalse
   1.283 + @internalTechnology 
   1.284 + */
   1.285 +TBool CJ5Converter::DetectUcs2( const TDesC8& aForeign, 
   1.286 +	CCnvCharacterSetConverter::TEndianness& aTEndianness )
   1.287 +	{
   1.288 +	// if the sample is not big enough
   1.289 +	if (aForeign.Length() < 2)
   1.290 +		{
   1.291 +		return EFalse;
   1.292 +		}
   1.293 +	else if (aForeign[0]==0xff && aForeign[1]==0xfe )
   1.294 +		{ 
   1.295 +		// we have found a Little Endian Byte order mark
   1.296 +		aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
   1.297 +		return ETrue;
   1.298 +		}
   1.299 +	else if (aForeign[0]==0xfe && aForeign[1]==0xff )
   1.300 +		{ 
   1.301 +		// we have found a Big Endian Byte order mark 
   1.302 +		aTEndianness = CCnvCharacterSetConverter::EBigEndian;
   1.303 +		return ETrue;
   1.304 +		}
   1.305 +
   1.306 +	// Next check for sequences of 0x**00 or 0x00** as UCS-2 is the only charset that 
   1.307 +	// specifies 0x**00 or 0x00** (according to endianness) for the ASCII range of characters. 
   1.308 +	// NB: This will fail if there are no ASCII characters in the text.
   1.309 +	TInt sampleLength = aForeign.Length();
   1.310 +	sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
   1.311 +
   1.312 +	// check the sample for sequences of 0x**00	or 0x00**
   1.313 +	TInt bigEndianConfidence = 0;
   1.314 +	TInt littleEndianConfidence = 0;
   1.315 +	TInt i=0;
   1.316 +	for(;i< (sampleLength-1); i+=2)
   1.317 +		{
   1.318 +		if( aForeign[i] == 0x00)
   1.319 +			{
   1.320 +			bigEndianConfidence +=2;
   1.321 +			}
   1.322 +		else if ( aForeign[i+1] == 0x00)
   1.323 +			{
   1.324 +	 		littleEndianConfidence +=2;
   1.325 +			}
   1.326 +		}
   1.327 +
   1.328 +	// which occurs most BE or LE	
   1.329 +	TInt confidenceLevel = 0;
   1.330 +	if (bigEndianConfidence > littleEndianConfidence)
   1.331 +		{
   1.332 +		aTEndianness = CCnvCharacterSetConverter::EBigEndian;
   1.333 +		confidenceLevel = bigEndianConfidence;
   1.334 +		}
   1.335 +	else
   1.336 +		{
   1.337 +		aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
   1.338 +		confidenceLevel = littleEndianConfidence;
   1.339 +		}
   1.340 +		
   1.341 +	// if more than 97% count as UCS2
   1.342 +	if ( confidenceLevel * 100/sampleLength > 97) 
   1.343 +		return ETrue;
   1.344 +
   1.345 +	return EFalse;
   1.346 +	}	
   1.347 +
   1.348 +/**
   1.349 + Check if ShiftJis (reference CJKV by Ken Lunde page 175)
   1.350 + @param A sample of data to be checked
   1.351 + @param The number of input bytes that can be converted
   1.352 + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
   1.353 + @internalTechnology 
   1.354 + */
   1.355 +enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectShiftJis( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
   1.356 +	{
   1.357 +	// Get the sample length
   1.358 +	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
   1.359 +
   1.360 +	TInt i=0;
   1.361 +	aNumberOfBytesConverted = 0;
   1.362 +	
   1.363 +	TText8 character;
   1.364 +	TText8 characterPlus1;
   1.365 +	TText8 characterPlus2;
   1.366 +	
   1.367 +	// scan the sample text looking for valid shiftjis data
   1.368 +	while ( i < sampleLength )
   1.369 +		{
   1.370 +		// get the next few characters, use 0 if there is no more sample
   1.371 +		// as this will not match any test.
   1.372 +		character = aForeign[i];
   1.373 +		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
   1.374 +		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
   1.375 +
   1.376 +		// SHIFTJIS	- 0x8e to 0x9f followed by 0x40 to 0xfc  
   1.377 +		if ((character >= 0x81) && (character <= 0x9f) &&
   1.378 +				(characterPlus1 >= 0x40) && (characterPlus1 <= 0xfc) ) 
   1.379 +			{
   1.380 +			// this is SHIFTJIS unless it is EUC JP code set 2 or 3
   1.381 +			if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF))
   1.382 +				{
   1.383 +				// this could be EUC JP code set 2 (or shiftjis)
   1.384 +				aNumberOfBytesConverted+=2;
   1.385 +				i++;
   1.386 +				}
   1.387 +			else if ((character == 0x8F) && 
   1.388 +				(characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) &&
   1.389 +					(characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
   1.390 +				{
   1.391 +				// this could be EUC JP code set 3 (or shiftjis)
   1.392 +				aNumberOfBytesConverted+=3;
   1.393 +				i+=2;
   1.394 +				}
   1.395 +			else
   1.396 +				{
   1.397 +				// this can only be shift jis 
   1.398 +				return EIsCharacterSet;
   1.399 +				}
   1.400 +			}
   1.401 +			
   1.402 +		// SHIFTJIS	- 0xE0 to 0xEF followed by .....
   1.403 +		else if ((character >= 0xE0) && (character <= 0xEF))
   1.404 +			{
   1.405 +			// 0x40 to 0xFC which overlaps UTF8 between 0x80 and 0xBF  
   1.406 +			// including Mopera extension to shiftjis from 0xEF80 to 0xEFFC
   1.407 +			
   1.408 +			if ( (characterPlus1 >= 0x40) && (characterPlus1 <= 0x7E) ) 
   1.409 +				{
   1.410 +				// this can only be shift jis 
   1.411 +				return EIsCharacterSet;
   1.412 +				}
   1.413 +			else if ( (characterPlus1 >= 0xC0) && (characterPlus1 <= 0xFC) ) 
   1.414 +				{
   1.415 +				// this could be EUC JP code set 1
   1.416 +				aNumberOfBytesConverted+=2;
   1.417 +				i++;
   1.418 +				}
   1.419 +				
   1.420 +			// problem here is the overlap between the UTF8 and shiftjis
   1.421 +			else if ( (characterPlus1 >= 0x80) && (characterPlus1 <= 0xBF) )
   1.422 +				{
   1.423 +				// this could be shiftjis or utf8
   1.424 +				aNumberOfBytesConverted+=2;
   1.425 +				i++;
   1.426 +				}		
   1.427 +			}
   1.428 +		// half width katakana A1-DF	
   1.429 +		else if ((character >= 0xA1) && (character <= 0xDF))
   1.430 +			{
   1.431 +			aNumberOfBytesConverted+=1;
   1.432 +			}
   1.433 +		// ASCII or JIS-Roman 20-7e	
   1.434 +		else if ( ((character >= 0x20) && (character <= 0x7E)) || (character == 0x0A) || (character == 0x0D))
   1.435 +			{
   1.436 +			aNumberOfBytesConverted+=1;
   1.437 +			}
   1.438 +		else
   1.439 +			{
   1.440 +			// This is not decoding as shiftjis, so reject
   1.441 +			aNumberOfBytesConverted =0;
   1.442 +			return EIsNotCharacterSet;
   1.443 +			}
   1.444 +		i++;
   1.445 +		}
   1.446 +
   1.447 +	// if all the characters could be converted
   1.448 +	if (aNumberOfBytesConverted == sampleLength)
   1.449 +		{
   1.450 +		return EIsCharacterSet;
   1.451 +		}
   1.452 +	else if (aNumberOfBytesConverted == 0)
   1.453 +		{
   1.454 +		return EIsNotCharacterSet;
   1.455 +		}
   1.456 +	else
   1.457 +		{
   1.458 +		return EMaybeCharacterSet;
   1.459 +		}
   1.460 +	}
   1.461 +	
   1.462 +/**
   1.463 + Check if UTF8 (reference CJKV by Ken Lunde page 189)
   1.464 + @param A sample of data to be checked
   1.465 + @param The number of input bytes that can be converted
   1.466 + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
   1.467 + @internalTechnology 
   1.468 + */
   1.469 +enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectUtf8( const TDesC8& aForeign )
   1.470 +	{
   1.471 +	// Get the sample length
   1.472 +	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
   1.473 +
   1.474 +	TInt i=0;	
   1.475 +	TText8 character;
   1.476 +	TText8 characterPlus1;
   1.477 +	TText8 characterPlus2;
   1.478 +	TText8 characterPlus3;
   1.479 +	
   1.480 +	// scan the sample text looking for valid UTF8
   1.481 +	while ( i < sampleLength )
   1.482 +		{
   1.483 +		// get the next few characters, use 0 if there is no more sample
   1.484 +		// as this will not match any test.
   1.485 +		character = aForeign[i];
   1.486 +		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
   1.487 +		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
   1.488 +		characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0);
   1.489 +
   1.490 +		// UTF8 range 110xxxxx followed by one valid UTF8 bytes
   1.491 +		if(((character & 0xe0)==0xc0) && (( characterPlus1 & 0xc0)==0x80) )
   1.492 +			{
   1.493 +			// two bytes of valid UTF8 found
   1.494 +			i+=2;
   1.495 +			}
   1.496 +		// UTF8 range 1110xxxx followed by two valid UTF8 bytes
   1.497 +		else if(((character & 0xf0)==0xe0) && (( characterPlus1 & 0xc0)==0x80) && (( characterPlus2 & 0xc0)==0x80))
   1.498 +			{
   1.499 +			// three bytes of valid UTF8 found
   1.500 +			i+=3;
   1.501 +			}
   1.502 +		// UTF8 range 11110xxx followed by three valid UTF8 bytes
   1.503 +		else if(((character & 0xf8)==0xf0) && (( characterPlus1 & 0xc0)==0x80) 
   1.504 +				&& (( characterPlus2 & 0xc0)==0x80) && (( characterPlus3 & 0xc0)==0x80) )
   1.505 +			{
   1.506 +			// four bytes of valid UTF8 found
   1.507 +			i+=4;
   1.508 +			}
   1.509 +		
   1.510 +		// ascii range 0 to 0x7F	
   1.511 +		else if((character & 0x80)==0x00)
   1.512 +			{
   1.513 +			// The value of character is in the range 0x00-0x7f
   1.514 +			// UTF8 maintains ASCII transparency. So it's a valid UTF8.
   1.515 +			i++;
   1.516 +			}
   1.517 +		// if the sample data is longer than KMaxSizeAutoDetectSample then except anything
   1.518 +		// for the last two bytes as they may not appear valid without more data	
   1.519 +		else if( i >= (KMaxSizeAutoDetectSample -2) )
   1.520 +			{
   1.521 +			i++;
   1.522 +			}
   1.523 +		else
   1.524 +			{
   1.525 +			// This is not decoding as UTF8 so reject
   1.526 +			return EIsNotCharacterSet;
   1.527 +			}
   1.528 +		}	
   1.529 +	
   1.530 +	// All the characters could be converted
   1.531 +	return EIsCharacterSet;
   1.532 +	
   1.533 +	}
   1.534 +
   1.535 +
   1.536 +/**
   1.537 + Check if ISO2022JP by lookiing for the escape sequences.
   1.538 + @param A sample of data to be checked
   1.539 + @param The number of input bytes that can be converted
   1.540 + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
   1.541 + @internalTechnology 
   1.542 + */
   1.543 +enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectIso2022( const TDesC8& aForeign )
   1.544 +	{
   1.545 +	// Get the sample length
   1.546 +	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
   1.547 +
   1.548 +	TInt i=0;
   1.549 +	TText8 character;
   1.550 +	TText8 characterPlus1;
   1.551 +	TText8 characterPlus2;
   1.552 +	TText8 characterPlus3;
   1.553 +	TText8 characterPlus4;
   1.554 +	TText8 characterPlus5;
   1.555 +	
   1.556 +	// scan the sample text looking for valid UTF8
   1.557 +	while ( i < sampleLength )
   1.558 +		{
   1.559 +		// get the next few characters, use 0 if there is no more sample
   1.560 +		// as this will not match any test.
   1.561 +		character = aForeign[i];
   1.562 +		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
   1.563 +		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
   1.564 +		characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0);
   1.565 +
   1.566 +
   1.567 +		// check for the JIS escape sequences of ISO 2022Jp
   1.568 +		// These values have been taken from JISBASE_SHARED
   1.569 +		if (character == KEscape)
   1.570 +			{
   1.571 +			// Escape Sequence For Jis C6226_1978 \x1b\x24\x40
   1.572 +			if ((characterPlus1 == 0x24) && (characterPlus2 == 0x40))
   1.573 +				{
   1.574 +				return EIsCharacterSet;
   1.575 +				}
   1.576 +				
   1.577 +			// Escape Sequence For Jis X0208_1983 \x1b\x24\x42
   1.578 +			else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x42))
   1.579 +				{
   1.580 +				return EIsCharacterSet;
   1.581 +				}
   1.582 +			
   1.583 +			// Escape Sequence For Jis Roman \x1b\x28\x4a
   1.584 +			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x4A))
   1.585 +				{
   1.586 +				return EIsCharacterSet;
   1.587 +				}
   1.588 +				
   1.589 +			// Escape Sequence For Jis RomanIncorrect \x1b\x28\x48
   1.590 +			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x48))
   1.591 +				{
   1.592 +				return EIsCharacterSet;
   1.593 +				}
   1.594 +
   1.595 +			// Escape Sequence For Ascii \x1b\x28\x42
   1.596 +			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x42))
   1.597 +				{
   1.598 +				return EIsCharacterSet;
   1.599 +				}
   1.600 +				
   1.601 +			// Escape Sequence For EscapeSequenceForHalfWidthKatakana \x1b\x28\x49
   1.602 +			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x49))
   1.603 +				{
   1.604 +				return EIsCharacterSet;
   1.605 +				}
   1.606 +				
   1.607 +			// Escape Sequence For Jis X0208_199x \x1b\x26\x40\x1b\x24\x42
   1.608 +			else if ((characterPlus1 == 0x26) && (characterPlus2 == 0x40))
   1.609 +				{
   1.610 +				characterPlus4 = ( i < (sampleLength-4) ? aForeign[i+4]:0);
   1.611 +				characterPlus5 = ( i < (sampleLength-5) ? aForeign[i+5]:0);
   1.612 +
   1.613 +				if ((characterPlus3 == 0x1b) && (characterPlus4 == 0x24) && (characterPlus5 == 0x42))
   1.614 +					{
   1.615 +					return EIsCharacterSet;
   1.616 +					}
   1.617 +				}
   1.618 +			// Escape Sequence For Jis X0212_1990 \x1b\x24\x28\x44
   1.619 +			else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x28)) 
   1.620 +				{
   1.621 +				if (characterPlus3 == 0x44)
   1.622 +					{
   1.623 +					return EIsCharacterSet;
   1.624 +					}
   1.625 +				}
   1.626 +				
   1.627 +			// check for the JIS escape sequences of ISO 2022Jp "B@" x42 x40
   1.628 +			else if ((characterPlus1 == 'B') || (characterPlus1 == '@'))
   1.629 +				{
   1.630 +				return EIsCharacterSet;
   1.631 +				}
   1.632 +				
   1.633 +			} // end of if ( character == KEscape )
   1.634 +
   1.635 +		i++;
   1.636 +		}	
   1.637 +
   1.638 +	// if escape sequences have been found then this is not ISO2022
   1.639 +	return EIsNotCharacterSet;
   1.640 +	
   1.641 +	}
   1.642 +
   1.643 +
   1.644 +/**
   1.645 + Check if EUC JP (reference CJKV by Ken Lunde page 164)
   1.646 + @param A sample of data to be checked
   1.647 + @param The number of input bytes that can be converted
   1.648 + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
   1.649 + @internalTechnology 
   1.650 + */
   1.651 +CJ5Converter::TDectectCharacterSet CJ5Converter::DetectEucJp( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
   1.652 +	{
   1.653 +	// Get the sample length
   1.654 +	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
   1.655 +
   1.656 +	TInt i=0;
   1.657 +	aNumberOfBytesConverted = 0;
   1.658 +	
   1.659 +	TText8 character;
   1.660 +	TText8 characterPlus1;
   1.661 +	TText8 characterPlus2;
   1.662 +	
   1.663 +	// scan the sample text looking for valid shiftjis data
   1.664 +	while ( i < sampleLength )
   1.665 +		{
   1.666 +		// get the next few characters, use 0 if there is no more sample
   1.667 +		// as this will not match any test.
   1.668 +		character = aForeign[i];
   1.669 +		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
   1.670 +		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
   1.671 +
   1.672 +		// EUCJP code set 0 0x21-0x7e
   1.673 +		if ( (character >= 0x21) && (character <= 0x7e))
   1.674 +			{
   1.675 +			aNumberOfBytesConverted++;
   1.676 +			}
   1.677 +		else if ( (character == 0x0a) || (character == 0x0d))
   1.678 +			{
   1.679 +			aNumberOfBytesConverted++;
   1.680 +			}
   1.681 +		// EUCJP code set 1
   1.682 +		else if ( (character >= 0xa1) && (character <= 0xff)
   1.683 +				&& (characterPlus1 >= 0xa1) && (characterPlus1 <= 0xff) ) 
   1.684 +			{
   1.685 +			aNumberOfBytesConverted+=2;
   1.686 +			i++;
   1.687 +			}
   1.688 +		 		
   1.689 +		// EUC JP code set 2, starts with the EUC JP SS2 character (0x8E)
   1.690 +		// and is followed by character in range 0xA1- 0xDF
   1.691 +		else if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) ) 
   1.692 +			{
   1.693 +			// this could be 2 bytes of EUC JP code set 2
   1.694 +			aNumberOfBytesConverted += 2;
   1.695 +			i++;
   1.696 +			}
   1.697 +		// EUC JP code set 3, starts with the EUC JP SS3 character (0x8F)
   1.698 +		// and is followed by two characters in range A1- DF A1 -FE
   1.699 +		else if ((character == 0x8F) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) 
   1.700 +				&& (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
   1.701 +			{
   1.702 +			// this could be 3 bytes of EUC JP code set 3
   1.703 +			aNumberOfBytesConverted += 3;
   1.704 +			i+=2;
   1.705 +			}		
   1.706 +		else
   1.707 +			{
   1.708 +			// This is not a valid decoding as EUC JP so reject
   1.709 +			return EIsNotCharacterSet;
   1.710 +			}
   1.711 +		i++;
   1.712 +		}	
   1.713 +	
   1.714 +	
   1.715 +	// if all the characters could be converted
   1.716 +	if (aNumberOfBytesConverted == sampleLength)
   1.717 +		{
   1.718 +		return EIsCharacterSet;
   1.719 +		}
   1.720 +	else if (aNumberOfBytesConverted == 0)
   1.721 +		{
   1.722 +		return EIsNotCharacterSet;
   1.723 +		}
   1.724 +	else
   1.725 +		{
   1.726 +		return EMaybeCharacterSet;
   1.727 +		}
   1.728 +	}
   1.729 +
   1.730 +			
   1.731 +/**
   1.732 + Convert from UCS2 (Universal Character Set containing two bytes) to unicode
   1.733 + Remove any byte order marks in the UCSs.
   1.734 + @param aUnicode Contains the converted text in the Unicode character set.
   1.735 + @param	aForeign The non-Unicode source text to be converted
   1.736 + @param aNumberOfUnconvertibleCharacters Contains the number of bytes which were not converted. 
   1.737 + @param aIndexOfFirstByteOfFirstUnconvertibleCharacter The index of the first byte of the first unconvertible character.
   1.738 + @return the number of bytes converted
   1.739 + @internalTechnology 
   1.740 + */
   1.741 + TInt CJ5Converter::ConvertUcs2ToUnicode(CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters, 
   1.742 +						   TDes16& aUnicode,	 
   1.743 +						   const TDesC8& aForeign, 
   1.744 +						   TInt& aNumberOfUnconvertibleCharacters,  
   1.745 +						   TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) 
   1.746 +
   1.747 +	{
   1.748 +	TInt numberOfBytesConverted = 0;
   1.749 +	TInt numberOfUnicodeCharacters =0;
   1.750 +	TChar nextChar;
   1.751 +
   1.752 +	// start at begining of the output buffer provided
   1.753 +	aUnicode.Zero();
   1.754 +
   1.755 +	// while there is at least 2 bytes of data to convert and space in the output buffer
   1.756 +	while ( (numberOfBytesConverted+1 < aForeign.Size()) && (numberOfUnicodeCharacters < aUnicode.MaxLength()) )
   1.757 +		{
   1.758 +		if (aDefaultEndiannessOfForeignCharacters == CCnvCharacterSetConverter::ELittleEndian )
   1.759 +			{
   1.760 +			// ELittleEndian 0x??00
   1.761 +			nextChar = aForeign[numberOfBytesConverted] + ( aForeign[numberOfBytesConverted+1] << 8);
   1.762 +			}
   1.763 +		else
   1.764 +			{
   1.765 +			// EBigEndian 0x00??
   1.766 +			nextChar = ( aForeign[numberOfBytesConverted] <<8 ) + aForeign[numberOfBytesConverted+1];
   1.767 +			}
   1.768 +			
   1.769 +		// save the unicode character extracted	unless it's a BOM
   1.770 +		if ( nextChar != KByteOrderMark )
   1.771 +			{
   1.772 +			aUnicode.Append( nextChar );
   1.773 +			numberOfUnicodeCharacters++;	
   1.774 +			}
   1.775 +			
   1.776 +		numberOfBytesConverted+=2;
   1.777 +		}
   1.778 +	
   1.779 +	// there are no uncovertable characters with UCS2, but there could be
   1.780 +	aNumberOfUnconvertibleCharacters = 0;
   1.781 +	// a negative value indicates that all characters converted
   1.782 +	aIndexOfFirstByteOfFirstUnconvertibleCharacter = -1;
   1.783 +				
   1.784 +	// returns the number of unconverted bytes left at the end of the input descriptor 
   1.785 +	// Note there could be 1 byte left over if an odd number of bytes provided for conversion
   1.786 +	return aForeign.Size() - numberOfBytesConverted;
   1.787 +	}
   1.788 +		
   1.789 +/**
   1.790 + Convert from EUC_JP (Extended Unix Code encoding for Japanese)
   1.791 + Using the standard Charconv method of an array of methods
   1.792 + @return the number of bytes converted
   1.793 + @internalTechnology 
   1.794 + */
   1.795 + TInt CJ5Converter::ConvertEEucjpToUnicode(
   1.796 +		CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, 
   1.797 +		TDes16& aUnicode, 
   1.798 +		const TDesC8& aForeign, 
   1.799 +		TInt& /*aState*/, 
   1.800 +		TInt& aNumberOfUnconvertibleCharacters, 
   1.801 +		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
   1.802 +	{
   1.803 +	TFixedArray<CnvUtilities::SMethod, 4> methods;
   1.804 +	methods[0].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisRoman;
   1.805 +	methods[0].iConvertToIntermediateBufferInPlace=DummyConvertToIntermediateBufferInPlace;
   1.806 +	methods[0].iConversionData=&CnvJisRoman::ConversionData();
   1.807 +	methods[0].iNumberOfBytesPerCharacter=1;
   1.808 +	methods[0].iNumberOfCoreBytesPerCharacter=1;
   1.809 +	methods[1].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0208;
   1.810 +	methods[1].iConvertToIntermediateBufferInPlace=ConvertToJisX0208FromEucJpPackedInPlace;
   1.811 +	methods[1].iConversionData=&CnvJisX0208::ConversionData();
   1.812 +	methods[1].iNumberOfBytesPerCharacter=2;
   1.813 +	methods[1].iNumberOfCoreBytesPerCharacter=2;
   1.814 +	methods[2].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToHalfWidthKatakana8;
   1.815 +	methods[2].iConvertToIntermediateBufferInPlace=ConvertToHalfWidthKatakana8FromEucJpPackedInPlace;
   1.816 +	methods[2].iConversionData=&CnvHalfWidthKatakana8::ConversionData();
   1.817 +	methods[2].iNumberOfBytesPerCharacter=2;
   1.818 +	methods[2].iNumberOfCoreBytesPerCharacter=1;
   1.819 +	methods[3].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0212;
   1.820 +	methods[3].iConvertToIntermediateBufferInPlace=ConvertToJisX0212FromEucJpPackedInPlace;
   1.821 +	methods[3].iConversionData=&CnvJisX0212::ConversionData();
   1.822 +	methods[3].iNumberOfBytesPerCharacter=3;
   1.823 +	methods[3].iNumberOfCoreBytesPerCharacter=2;
   1.824 +	return CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, methods.Array());
   1.825 +	}		
   1.826 +