os/textandloc/charconvfw/charconvplugins/src/plugins/j5.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
* Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     3
* All rights reserved.
sl@0
     4
* This component and the accompanying materials are made available
sl@0
     5
* under the terms of "Eclipse Public License v1.0"
sl@0
     6
* which accompanies this distribution, and is available
sl@0
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     8
*
sl@0
     9
* Initial Contributors:
sl@0
    10
* Nokia Corporation - initial contribution.
sl@0
    11
*
sl@0
    12
* Contributors:
sl@0
    13
*
sl@0
    14
* Description: 
sl@0
    15
* J5 charconv character converter
sl@0
    16
*
sl@0
    17
*/
sl@0
    18
sl@0
    19
sl@0
    20
#include <e32std.h>
sl@0
    21
#include <charconv.h>
sl@0
    22
#include <ecom/implementationproxy.h>
sl@0
    23
#include <utf.h>
sl@0
    24
#include <charactersetconverter.h>
sl@0
    25
#include <convutils.h>
sl@0
    26
#include "shiftjis.h"
sl@0
    27
#include "jisbase.h"
sl@0
    28
#include "j5.h"
sl@0
    29
sl@0
    30
#include "jisx0201.h"
sl@0
    31
#include "jisx0208.h"
sl@0
    32
#include "jisx0212.h"
sl@0
    33
sl@0
    34
#include "featmgr/featmgr.h"
sl@0
    35
sl@0
    36
/**
sl@0
    37
 J5 will examine at most KMaxSizeAutoDetectSample bytes when trying to determine the format of the data.
sl@0
    38
 */
sl@0
    39
const TInt KMaxSizeAutoDetectSample = 1000;
sl@0
    40
sl@0
    41
const TUint8 KEscape = 0x1b;
sl@0
    42
const TInt KByteOrderMark = 0xfeff;
sl@0
    43
sl@0
    44
const TDesC8& CJ5Converter::ReplacementForUnconvertibleUnicodeCharacters()
	{
	// J5 defines no replacement sequence of its own; it hands back the
	// one provided by the Shift-JIS converter.
	const TDesC8& shiftJisReplacement = CnvShiftJis::ReplacementForUnconvertibleUnicodeCharacters();
	return shiftJisReplacement;
	}
sl@0
    48
sl@0
    49
/**
sl@0
    50
 This API should not be used as it is ambiguous as to what encoding is required.  
sl@0
    51
 The user should instead call the specific plug-in for the appropriate conversion.
sl@0
    52
 J5 ConvertFromUnicode() will convert to UTF8 as default.
sl@0
    53
@internalTechnology 
sl@0
    54
 */
sl@0
    55
TInt CJ5Converter::ConvertFromUnicode(
sl@0
    56
		CCnvCharacterSetConverter::TEndianness /* aDefaultEndiannessOfForeignCharacters */, 
sl@0
    57
		const TDesC8& /* aReplacementForUnconvertibleUnicodeCharacters */, 
sl@0
    58
		TDes8& aForeign, 
sl@0
    59
		const TDesC16& aUnicode, 
sl@0
    60
		CCnvCharacterSetConverter::TArrayOfAscendingIndices& /* aIndicesOfUnconvertibleCharacters */)
sl@0
    61
	{
sl@0
    62
	return CnvUtfConverter::ConvertFromUnicodeToUtf8(aForeign, aUnicode);
sl@0
    63
	}
sl@0
    64
sl@0
    65
/**
sl@0
    66
 This will automatically determine one of the five supported encodings 
sl@0
    67
 to use and convert accordingly.  This plugin method is available to the 
sl@0
    68
 user though the CCnvCharacterSetConverter::ConvertToUnicode() method.  
sl@0
    69
 There is no way for the caller to determine which encoding has been used.
sl@0
    70
 
sl@0
    71
 NOTE: For debugging the selected character set is returned in the state.
sl@0
    72
 
sl@0
    73
  @released  9.1
sl@0
    74
  @param     aDefaultEndiannessOfForeignCharacters The default endian-ness to use when reading characters
sl@0
    75
             in the foreign character set.
sl@0
    76
  @param     aUnicode On return, contains the text converted into Unicode.
sl@0
    77
  @param     aForeign The non-Unicode source text to be converted.
sl@0
    78
  @param     aState Used to save state information across multiple calls
sl@0
    79
             to <code>ConvertToUnicode()</code>.
sl@0
    80
  @param     aNumberOfUnconvertibleCharacters On return, contains the number of bytes which were not
sl@0
    81
             converted.
sl@0
    82
  @param     aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, contains the index of the first bytein the
sl@0
    83
             input text that could not be converted. A negative
sl@0
    84
             value indicates that all the characters were
sl@0
    85
             converted.
sl@0
    86
  @return 	 The number of unconverted bytes left at the end of the input descriptor 
sl@0
    87
 		     (e.g. because the output descriptor is not long enough to hold all the text), 
sl@0
    88
 		     or one of the error values defined in TError. 
sl@0
    89
  @internalTechnology 
sl@0
    90
*/
sl@0
    91
TInt CJ5Converter::ConvertToUnicode(
sl@0
    92
		CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, 
sl@0
    93
		TDes16& aUnicode, 
sl@0
    94
		const TDesC8& aForeign, 
sl@0
    95
		TInt& aState, 
sl@0
    96
		TInt& aNumberOfUnconvertibleCharacters, 
sl@0
    97
		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
sl@0
    98
	{
sl@0
    99
	// As the aState parameter is used to pass back the detected value
sl@0
   100
	// use a "hidden" internal state variable.
sl@0
   101
	TInt internalState = CCnvCharacterSetConverter::KStateDefault;
sl@0
   102
	
sl@0
   103
	// determine the encoding type and then decode appropriatly
sl@0
   104
	switch ( DetectEncoding(aDefaultEndiannessOfForeignCharacters, aForeign))
sl@0
   105
		{
sl@0
   106
		case EShiftjis:
sl@0
   107
			aState = EShiftjis;
sl@0
   108
			return CnvShiftJis::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, 
sl@0
   109
					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
sl@0
   110
sl@0
   111
		case EIso2022jp1: 
sl@0
   112
			aState = EIso2022jp1;
sl@0
   113
			return CnvJisBase::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState,
sl@0
   114
					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
sl@0
   115
sl@0
   116
		case EEucjp: 
sl@0
   117
			aState = EEucjp;
sl@0
   118
			return ConvertEEucjpToUnicode(
sl@0
   119
					aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState,
sl@0
   120
					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);	
sl@0
   121
sl@0
   122
		case EUcs2:
sl@0
   123
			aState = EUcs2;
sl@0
   124
			return ConvertUcs2ToUnicode( aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, 
sl@0
   125
					aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
sl@0
   126
sl@0
   127
		case EUtf8: 
sl@0
   128
			aState = EUtf8;
sl@0
   129
			return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
sl@0
   130
			
sl@0
   131
		default:
sl@0
   132
			// fall though to the default, which is decode as UTF8
sl@0
   133
			aState = EUnknown;
sl@0
   134
			break;
sl@0
   135
		}
sl@0
   136
sl@0
   137
	// decode as UTF8
sl@0
   138
	return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
sl@0
   139
	}
sl@0
   140
sl@0
   141
/**
sl@0
   142
 This API is used by CCnvCharacterSetConverter::AutoDetectCharacterSetL(). 
sl@0
   143
 This method returns a value between 0 and 100, indicating how likely it 
sl@0
   144
 is that this is the correct converter, for the text supplied.  As J5 is 
sl@0
   145
 NOT intended to be used with the existing auto-detect mechanism, it will 
sl@0
   146
 always return 0
sl@0
   147
 @internalTechnology 
sl@0
   148
 */
sl@0
   149
TBool CJ5Converter::IsInThisCharacterSetL(
sl@0
   150
		TBool& aSetToTrue, 
sl@0
   151
		TInt& aConfidenceLevel, 
sl@0
   152
		const TDesC8& /* aSample */)
sl@0
   153
	{
sl@0
   154
  	/*
sl@0
   155
  	aSetToTrue - This value should be set to ETrue. It is used to indicate to 
sl@0
   156
  	CCnvCharacterSetConverter::AutoDetectCharacterSetL() that the plug-in DLL 
sl@0
   157
  	is implementing a function of this signature and is therefore not the empty 
sl@0
   158
  	*/
sl@0
   159
  	aSetToTrue=ETrue;
sl@0
   160
  	
sl@0
   161
 	/* no need to look at the sample as this always returns 0 
sl@0
   162
 	   as the autodetect feature is not supported by the J5 plug-in
sl@0
   163
 	*/
sl@0
   164
 	aConfidenceLevel=0;
sl@0
   165
	return ETrue;
sl@0
   166
	}
sl@0
   167
sl@0
   168
CJ5Converter* CJ5Converter::NewL()
	{
	// Two-phase construction: allocate, then run ConstructL() with the
	// object on the cleanup stack in case it leaves.
	CJ5Converter* converter = new(ELeave) CJ5Converter();
	CleanupStack::PushL(converter);
	converter->ConstructL();
	CleanupStack::Pop(converter);
	return converter;
	}
sl@0
   176
sl@0
   177
CJ5Converter::~CJ5Converter()
	{
	// counterpart of the FeatureManager::InitializeLibL() call in ConstructL()
	FeatureManager::UnInitializeLib();
	}
sl@0
   181
sl@0
   182
CJ5Converter::CJ5Converter()
	{
	// nothing to do in the first-phase constructor; see ConstructL()
	}
sl@0
   185
sl@0
   186
void CJ5Converter::ConstructL()
sl@0
   187
    {
sl@0
   188
    FeatureManager::InitializeLibL();
sl@0
   189
    }
sl@0
   190
sl@0
   191
// Table returned by ImplementationGroupProxy(): pairs the J5 implementation
// UID with the factory function CJ5Converter::NewL.
const TImplementationProxy ImplementationTable[] = 
	{
#ifndef KDDIAU_TEST
		IMPLEMENTATION_PROXY_ENTRY(KCharacterSetIdentifierJ5,	CJ5Converter::NewL)
#else
		// the test build uses a special test UID
		IMPLEMENTATION_PROXY_ENTRY(0x01000002,	CJ5Converter::NewL)
#endif
	};
sl@0
   200
sl@0
   201
EXPORT_C const TImplementationProxy* ImplementationGroupProxy(TInt& aTableCount)
	{
	// report the number of entries in the table, then hand the table back
	aTableCount = sizeof(ImplementationTable) / sizeof(ImplementationTable[0]);
	return ImplementationTable;
	}
sl@0
   207
	
sl@0
   208
/**
sl@0
   209
 DetectEncoding determine the characterset encoding.
sl@0
   210
 The logic for this detection is based on the information in CJKV by Ken Lunde.
sl@0
   211
 A detailed diagram of this logic is in the J5 how to document section 2.4
sl@0
   212
 @return The detected character set as a enum CJ5Converter.
sl@0
   213
 @internalTechnology 
sl@0
   214
 */
sl@0
   215
enum CJ5Converter::TJ5Encoding CJ5Converter::DetectEncoding(
sl@0
   216
		CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters , 
sl@0
   217
		const TDesC8& aForeign)
sl@0
   218
	{
sl@0
   219
	
sl@0
   220
	// first check for UCS2
sl@0
   221
	CCnvCharacterSetConverter::TEndianness ucs2Endianness = CCnvCharacterSetConverter::ELittleEndian;
sl@0
   222
	if ( DetectUcs2(aForeign, ucs2Endianness ))
sl@0
   223
		{
sl@0
   224
		// if ucs2 is detected pass back the detected endianess
sl@0
   225
		aDefaultEndiannessOfForeignCharacters = ucs2Endianness;
sl@0
   226
		return EUcs2;
sl@0
   227
		}
sl@0
   228
sl@0
   229
	// next try EUC_JP
sl@0
   230
	TInt eucJpValidBytes = 0;
sl@0
   231
	CJ5Converter::TDectectCharacterSet result = DetectEucJp( aForeign, eucJpValidBytes );
sl@0
   232
	if ( result == EIsCharacterSet )
sl@0
   233
		{
sl@0
   234
		return EEucjp;
sl@0
   235
		}
sl@0
   236
		
sl@0
   237
	// next try Iso 2020JP
sl@0
   238
	if ( DetectIso2022( aForeign ) == EIsCharacterSet )
sl@0
   239
		{
sl@0
   240
		return EIso2022jp1;
sl@0
   241
		}
sl@0
   242
		
sl@0
   243
	// next try Utf8
sl@0
   244
	if ( DetectUtf8( aForeign ) == EIsCharacterSet )
sl@0
   245
		{
sl@0
   246
		return EUtf8;
sl@0
   247
		}
sl@0
   248
		
sl@0
   249
	// shiftjis
sl@0
   250
	TInt shiftjisValidBytes = 0;
sl@0
   251
	result = DetectShiftJis( aForeign, shiftjisValidBytes );
sl@0
   252
	if ( result == EIsCharacterSet )
sl@0
   253
		{
sl@0
   254
		return EShiftjis;
sl@0
   255
		}
sl@0
   256
		
sl@0
   257
	// no clear winner so go for the best 
sl@0
   258
	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);
sl@0
   259
		
sl@0
   260
	// if more than half is shiftjis and more shiftjis than EUC_JP, 	
sl@0
   261
	if ((shiftjisValidBytes >  eucJpValidBytes ) && (shiftjisValidBytes * 2> sampleLength))
sl@0
   262
		return EShiftjis;
sl@0
   263
sl@0
   264
	// if more than half is EUC_JP and more EUC_JP than shiftjis, 	
sl@0
   265
	if ((eucJpValidBytes >  shiftjisValidBytes ) && (eucJpValidBytes * 2> sampleLength))
sl@0
   266
		return EEucjp;
sl@0
   267
			
sl@0
   268
	// return the default
sl@0
   269
	return EUcs2;
sl@0
   270
	}
sl@0
   271
	
sl@0
   272
	
sl@0
   273
/**
sl@0
   274
 Check if UCS2.
sl@0
   275
 If the first two bytes are the Unicode Endian Specifiers (0xfffe or 0xfeff)
sl@0
   276
 then this must be UCS2. Otherwise try lookiing for  0x**00 or 0x00**
sl@0
   277
 @param A sample of data to be checked
sl@0
   278
 @param The Endianness if USC2 is detected
sl@0
   279
 @return ETrue if UCS2 else EFalse
sl@0
   280
 @internalTechnology 
sl@0
   281
 */
sl@0
   282
TBool CJ5Converter::DetectUcs2( const TDesC8& aForeign, 
sl@0
   283
	CCnvCharacterSetConverter::TEndianness& aTEndianness )
sl@0
   284
	{
sl@0
   285
	// if the sample is not big enough
sl@0
   286
	if (aForeign.Length() < 2)
sl@0
   287
		{
sl@0
   288
		return EFalse;
sl@0
   289
		}
sl@0
   290
	else if (aForeign[0]==0xff && aForeign[1]==0xfe )
sl@0
   291
		{ 
sl@0
   292
		// we have found a Little Endian Byte order mark
sl@0
   293
		aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
sl@0
   294
		return ETrue;
sl@0
   295
		}
sl@0
   296
	else if (aForeign[0]==0xfe && aForeign[1]==0xff )
sl@0
   297
		{ 
sl@0
   298
		// we have found a Big Endian Byte order mark 
sl@0
   299
		aTEndianness = CCnvCharacterSetConverter::EBigEndian;
sl@0
   300
		return ETrue;
sl@0
   301
		}
sl@0
   302
sl@0
   303
	// Next check for sequences of 0x**00 or 0x00** as UCS-2 is the only charset that 
sl@0
   304
	// specifies 0x**00 or 0x00** (according to endianness) for the ASCII range of characters. 
sl@0
   305
	// NB: This will fail if there are no ASCII characters in the text.
sl@0
   306
	TInt sampleLength = aForeign.Length();
sl@0
   307
	sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
sl@0
   308
sl@0
   309
	// check the sample for sequences of 0x**00	or 0x00**
sl@0
   310
	TInt bigEndianConfidence = 0;
sl@0
   311
	TInt littleEndianConfidence = 0;
sl@0
   312
	TInt i=0;
sl@0
   313
	for(;i< (sampleLength-1); i+=2)
sl@0
   314
		{
sl@0
   315
		if( aForeign[i] == 0x00)
sl@0
   316
			{
sl@0
   317
			bigEndianConfidence +=2;
sl@0
   318
			}
sl@0
   319
		else if ( aForeign[i+1] == 0x00)
sl@0
   320
			{
sl@0
   321
	 		littleEndianConfidence +=2;
sl@0
   322
			}
sl@0
   323
		}
sl@0
   324
sl@0
   325
	// which occurs most BE or LE	
sl@0
   326
	TInt confidenceLevel = 0;
sl@0
   327
	if (bigEndianConfidence > littleEndianConfidence)
sl@0
   328
		{
sl@0
   329
		aTEndianness = CCnvCharacterSetConverter::EBigEndian;
sl@0
   330
		confidenceLevel = bigEndianConfidence;
sl@0
   331
		}
sl@0
   332
	else
sl@0
   333
		{
sl@0
   334
		aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
sl@0
   335
		confidenceLevel = littleEndianConfidence;
sl@0
   336
		}
sl@0
   337
		
sl@0
   338
	// if more than 97% count as UCS2
sl@0
   339
	if ( confidenceLevel * 100/sampleLength > 97) 
sl@0
   340
		return ETrue;
sl@0
   341
sl@0
   342
	return EFalse;
sl@0
   343
	}	
sl@0
   344
sl@0
   345
/**
sl@0
   346
 Check if ShiftJis (reference CJKV by Ken Lunde page 175)
sl@0
   347
 @param A sample of data to be checked
sl@0
   348
 @param The number of input bytes that can be converted
sl@0
   349
 @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
sl@0
   350
 @internalTechnology 
sl@0
   351
 */
sl@0
   352
enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectShiftJis( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
sl@0
   353
	{
sl@0
   354
	// Get the sample length
sl@0
   355
	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
sl@0
   356
sl@0
   357
	TInt i=0;
sl@0
   358
	aNumberOfBytesConverted = 0;
sl@0
   359
	
sl@0
   360
	TText8 character;
sl@0
   361
	TText8 characterPlus1;
sl@0
   362
	TText8 characterPlus2;
sl@0
   363
	
sl@0
   364
	// scan the sample text looking for valid shiftjis data
sl@0
   365
	while ( i < sampleLength )
sl@0
   366
		{
sl@0
   367
		// get the next few characters, use 0 if there is no more sample
sl@0
   368
		// as this will not match any test.
sl@0
   369
		character = aForeign[i];
sl@0
   370
		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
sl@0
   371
		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
sl@0
   372
sl@0
   373
		// SHIFTJIS	- 0x8e to 0x9f followed by 0x40 to 0xfc  
sl@0
   374
		if ((character >= 0x81) && (character <= 0x9f) &&
sl@0
   375
				(characterPlus1 >= 0x40) && (characterPlus1 <= 0xfc) ) 
sl@0
   376
			{
sl@0
   377
			// this is SHIFTJIS unless it is EUC JP code set 2 or 3
sl@0
   378
			if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF))
sl@0
   379
				{
sl@0
   380
				// this could be EUC JP code set 2 (or shiftjis)
sl@0
   381
				aNumberOfBytesConverted+=2;
sl@0
   382
				i++;
sl@0
   383
				}
sl@0
   384
			else if ((character == 0x8F) && 
sl@0
   385
				(characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) &&
sl@0
   386
					(characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
sl@0
   387
				{
sl@0
   388
				// this could be EUC JP code set 3 (or shiftjis)
sl@0
   389
				aNumberOfBytesConverted+=3;
sl@0
   390
				i+=2;
sl@0
   391
				}
sl@0
   392
			else
sl@0
   393
				{
sl@0
   394
				// this can only be shift jis 
sl@0
   395
				return EIsCharacterSet;
sl@0
   396
				}
sl@0
   397
			}
sl@0
   398
			
sl@0
   399
		// SHIFTJIS	- 0xE0 to 0xEF followed by .....
sl@0
   400
		else if ((character >= 0xE0) && (character <= 0xEF))
sl@0
   401
			{
sl@0
   402
			// 0x40 to 0xFC which overlaps UTF8 between 0x80 and 0xBF  
sl@0
   403
			// including Mopera extension to shiftjis from 0xEF80 to 0xEFFC
sl@0
   404
			
sl@0
   405
			if ( (characterPlus1 >= 0x40) && (characterPlus1 <= 0x7E) ) 
sl@0
   406
				{
sl@0
   407
				// this can only be shift jis 
sl@0
   408
				return EIsCharacterSet;
sl@0
   409
				}
sl@0
   410
			else if ( (characterPlus1 >= 0xC0) && (characterPlus1 <= 0xFC) ) 
sl@0
   411
				{
sl@0
   412
				// this could be EUC JP code set 1
sl@0
   413
				aNumberOfBytesConverted+=2;
sl@0
   414
				i++;
sl@0
   415
				}
sl@0
   416
				
sl@0
   417
			// problem here is the overlap between the UTF8 and shiftjis
sl@0
   418
			else if ( (characterPlus1 >= 0x80) && (characterPlus1 <= 0xBF) )
sl@0
   419
				{
sl@0
   420
				// this could be shiftjis or utf8
sl@0
   421
				aNumberOfBytesConverted+=2;
sl@0
   422
				i++;
sl@0
   423
				}		
sl@0
   424
			}
sl@0
   425
		// half width katakana A1-DF	
sl@0
   426
		else if ((character >= 0xA1) && (character <= 0xDF))
sl@0
   427
			{
sl@0
   428
			aNumberOfBytesConverted+=1;
sl@0
   429
			}
sl@0
   430
		// ASCII or JIS-Roman 20-7e	
sl@0
   431
		else if ( ((character >= 0x20) && (character <= 0x7E)) || (character == 0x0A) || (character == 0x0D))
sl@0
   432
			{
sl@0
   433
			aNumberOfBytesConverted+=1;
sl@0
   434
			}
sl@0
   435
		else
sl@0
   436
			{
sl@0
   437
			// This is not decoding as shiftjis, so reject
sl@0
   438
			aNumberOfBytesConverted =0;
sl@0
   439
			return EIsNotCharacterSet;
sl@0
   440
			}
sl@0
   441
		i++;
sl@0
   442
		}
sl@0
   443
sl@0
   444
	// if all the characters could be converted
sl@0
   445
	if (aNumberOfBytesConverted == sampleLength)
sl@0
   446
		{
sl@0
   447
		return EIsCharacterSet;
sl@0
   448
		}
sl@0
   449
	else if (aNumberOfBytesConverted == 0)
sl@0
   450
		{
sl@0
   451
		return EIsNotCharacterSet;
sl@0
   452
		}
sl@0
   453
	else
sl@0
   454
		{
sl@0
   455
		return EMaybeCharacterSet;
sl@0
   456
		}
sl@0
   457
	}
sl@0
   458
	
sl@0
   459
/**
sl@0
   460
 Check if UTF8 (reference CJKV by Ken Lunde page 189)
sl@0
   461
 @param A sample of data to be checked
sl@0
   462
 @param The number of input bytes that can be converted
sl@0
   463
 @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
sl@0
   464
 @internalTechnology 
sl@0
   465
 */
sl@0
   466
enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectUtf8( const TDesC8& aForeign )
sl@0
   467
	{
sl@0
   468
	// Get the sample length
sl@0
   469
	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
sl@0
   470
sl@0
   471
	TInt i=0;	
sl@0
   472
	TText8 character;
sl@0
   473
	TText8 characterPlus1;
sl@0
   474
	TText8 characterPlus2;
sl@0
   475
	TText8 characterPlus3;
sl@0
   476
	
sl@0
   477
	// scan the sample text looking for valid UTF8
sl@0
   478
	while ( i < sampleLength )
sl@0
   479
		{
sl@0
   480
		// get the next few characters, use 0 if there is no more sample
sl@0
   481
		// as this will not match any test.
sl@0
   482
		character = aForeign[i];
sl@0
   483
		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
sl@0
   484
		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
sl@0
   485
		characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0);
sl@0
   486
sl@0
   487
		// UTF8 range 110xxxxx followed by one valid UTF8 bytes
sl@0
   488
		if(((character & 0xe0)==0xc0) && (( characterPlus1 & 0xc0)==0x80) )
sl@0
   489
			{
sl@0
   490
			// two bytes of valid UTF8 found
sl@0
   491
			i+=2;
sl@0
   492
			}
sl@0
   493
		// UTF8 range 1110xxxx followed by two valid UTF8 bytes
sl@0
   494
		else if(((character & 0xf0)==0xe0) && (( characterPlus1 & 0xc0)==0x80) && (( characterPlus2 & 0xc0)==0x80))
sl@0
   495
			{
sl@0
   496
			// three bytes of valid UTF8 found
sl@0
   497
			i+=3;
sl@0
   498
			}
sl@0
   499
		// UTF8 range 11110xxx followed by three valid UTF8 bytes
sl@0
   500
		else if(((character & 0xf8)==0xf0) && (( characterPlus1 & 0xc0)==0x80) 
sl@0
   501
				&& (( characterPlus2 & 0xc0)==0x80) && (( characterPlus3 & 0xc0)==0x80) )
sl@0
   502
			{
sl@0
   503
			// four bytes of valid UTF8 found
sl@0
   504
			i+=4;
sl@0
   505
			}
sl@0
   506
		
sl@0
   507
		// ascii range 0 to 0x7F	
sl@0
   508
		else if((character & 0x80)==0x00)
sl@0
   509
			{
sl@0
   510
			// The value of character is in the range 0x00-0x7f
sl@0
   511
			// UTF8 maintains ASCII transparency. So it's a valid UTF8.
sl@0
   512
			i++;
sl@0
   513
			}
sl@0
   514
		// if the sample data is longer than KMaxSizeAutoDetectSample then except anything
sl@0
   515
		// for the last two bytes as they may not appear valid without more data	
sl@0
   516
		else if( i >= (KMaxSizeAutoDetectSample -2) )
sl@0
   517
			{
sl@0
   518
			i++;
sl@0
   519
			}
sl@0
   520
		else
sl@0
   521
			{
sl@0
   522
			// This is not decoding as UTF8 so reject
sl@0
   523
			return EIsNotCharacterSet;
sl@0
   524
			}
sl@0
   525
		}	
sl@0
   526
	
sl@0
   527
	// All the characters could be converted
sl@0
   528
	return EIsCharacterSet;
sl@0
   529
	
sl@0
   530
	}
sl@0
   531
sl@0
   532
sl@0
   533
/**
sl@0
   534
 Check if ISO2022JP by lookiing for the escape sequences.
sl@0
   535
 @param A sample of data to be checked
sl@0
   536
 @param The number of input bytes that can be converted
sl@0
   537
 @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
sl@0
   538
 @internalTechnology 
sl@0
   539
 */
sl@0
   540
enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectIso2022( const TDesC8& aForeign )
sl@0
   541
	{
sl@0
   542
	// Get the sample length
sl@0
   543
	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
sl@0
   544
sl@0
   545
	TInt i=0;
sl@0
   546
	TText8 character;
sl@0
   547
	TText8 characterPlus1;
sl@0
   548
	TText8 characterPlus2;
sl@0
   549
	TText8 characterPlus3;
sl@0
   550
	TText8 characterPlus4;
sl@0
   551
	TText8 characterPlus5;
sl@0
   552
	
sl@0
   553
	// scan the sample text looking for valid UTF8
sl@0
   554
	while ( i < sampleLength )
sl@0
   555
		{
sl@0
   556
		// get the next few characters, use 0 if there is no more sample
sl@0
   557
		// as this will not match any test.
sl@0
   558
		character = aForeign[i];
sl@0
   559
		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
sl@0
   560
		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
sl@0
   561
		characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0);
sl@0
   562
sl@0
   563
sl@0
   564
		// check for the JIS escape sequences of ISO 2022Jp
sl@0
   565
		// These values have been taken from JISBASE_SHARED
sl@0
   566
		if (character == KEscape)
sl@0
   567
			{
sl@0
   568
			// Escape Sequence For Jis C6226_1978 \x1b\x24\x40
sl@0
   569
			if ((characterPlus1 == 0x24) && (characterPlus2 == 0x40))
sl@0
   570
				{
sl@0
   571
				return EIsCharacterSet;
sl@0
   572
				}
sl@0
   573
				
sl@0
   574
			// Escape Sequence For Jis X0208_1983 \x1b\x24\x42
sl@0
   575
			else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x42))
sl@0
   576
				{
sl@0
   577
				return EIsCharacterSet;
sl@0
   578
				}
sl@0
   579
			
sl@0
   580
			// Escape Sequence For Jis Roman \x1b\x28\x4a
sl@0
   581
			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x4A))
sl@0
   582
				{
sl@0
   583
				return EIsCharacterSet;
sl@0
   584
				}
sl@0
   585
				
sl@0
   586
			// Escape Sequence For Jis RomanIncorrect \x1b\x28\x48
sl@0
   587
			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x48))
sl@0
   588
				{
sl@0
   589
				return EIsCharacterSet;
sl@0
   590
				}
sl@0
   591
sl@0
   592
			// Escape Sequence For Ascii \x1b\x28\x42
sl@0
   593
			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x42))
sl@0
   594
				{
sl@0
   595
				return EIsCharacterSet;
sl@0
   596
				}
sl@0
   597
				
sl@0
   598
			// Escape Sequence For EscapeSequenceForHalfWidthKatakana \x1b\x28\x49
sl@0
   599
			else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x49))
sl@0
   600
				{
sl@0
   601
				return EIsCharacterSet;
sl@0
   602
				}
sl@0
   603
				
sl@0
   604
			// Escape Sequence For Jis X0208_199x \x1b\x26\x40\x1b\x24\x42
sl@0
   605
			else if ((characterPlus1 == 0x26) && (characterPlus2 == 0x40))
sl@0
   606
				{
sl@0
   607
				characterPlus4 = ( i < (sampleLength-4) ? aForeign[i+4]:0);
sl@0
   608
				characterPlus5 = ( i < (sampleLength-5) ? aForeign[i+5]:0);
sl@0
   609
sl@0
   610
				if ((characterPlus3 == 0x1b) && (characterPlus4 == 0x24) && (characterPlus5 == 0x42))
sl@0
   611
					{
sl@0
   612
					return EIsCharacterSet;
sl@0
   613
					}
sl@0
   614
				}
sl@0
   615
			// Escape Sequence For Jis X0212_1990 \x1b\x24\x28\x44
sl@0
   616
			else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x28)) 
sl@0
   617
				{
sl@0
   618
				if (characterPlus3 == 0x44)
sl@0
   619
					{
sl@0
   620
					return EIsCharacterSet;
sl@0
   621
					}
sl@0
   622
				}
sl@0
   623
				
sl@0
   624
			// check for the JIS escape sequences of ISO 2022Jp "B@" x42 x40
sl@0
   625
			else if ((characterPlus1 == 'B') || (characterPlus1 == '@'))
sl@0
   626
				{
sl@0
   627
				return EIsCharacterSet;
sl@0
   628
				}
sl@0
   629
				
sl@0
   630
			} // end of if ( character == KEscape )
sl@0
   631
sl@0
   632
		i++;
sl@0
   633
		}	
sl@0
   634
sl@0
   635
	// if escape sequences have been found then this is not ISO2022
sl@0
   636
	return EIsNotCharacterSet;
sl@0
   637
	
sl@0
   638
	}
sl@0
   639
sl@0
   640
sl@0
   641
/**
sl@0
   642
 Check if EUC JP (reference CJKV by Ken Lunde page 164)
sl@0
   643
 @param A sample of data to be checked
sl@0
   644
 @param The number of input bytes that can be converted
sl@0
   645
 @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
sl@0
   646
 @internalTechnology 
sl@0
   647
 */
sl@0
   648
CJ5Converter::TDectectCharacterSet CJ5Converter::DetectEucJp( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
sl@0
   649
	{
sl@0
   650
	// Get the sample length
sl@0
   651
	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
sl@0
   652
sl@0
   653
	TInt i=0;
sl@0
   654
	aNumberOfBytesConverted = 0;
sl@0
   655
	
sl@0
   656
	TText8 character;
sl@0
   657
	TText8 characterPlus1;
sl@0
   658
	TText8 characterPlus2;
sl@0
   659
	
sl@0
   660
	// scan the sample text looking for valid shiftjis data
sl@0
   661
	while ( i < sampleLength )
sl@0
   662
		{
sl@0
   663
		// get the next few characters, use 0 if there is no more sample
sl@0
   664
		// as this will not match any test.
sl@0
   665
		character = aForeign[i];
sl@0
   666
		characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
sl@0
   667
		characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
sl@0
   668
sl@0
   669
		// EUCJP code set 0 0x21-0x7e
sl@0
   670
		if ( (character >= 0x21) && (character <= 0x7e))
sl@0
   671
			{
sl@0
   672
			aNumberOfBytesConverted++;
sl@0
   673
			}
sl@0
   674
		else if ( (character == 0x0a) || (character == 0x0d))
sl@0
   675
			{
sl@0
   676
			aNumberOfBytesConverted++;
sl@0
   677
			}
sl@0
   678
		// EUCJP code set 1
sl@0
   679
		else if ( (character >= 0xa1) && (character <= 0xff)
sl@0
   680
				&& (characterPlus1 >= 0xa1) && (characterPlus1 <= 0xff) ) 
sl@0
   681
			{
sl@0
   682
			aNumberOfBytesConverted+=2;
sl@0
   683
			i++;
sl@0
   684
			}
sl@0
   685
		 		
sl@0
   686
		// EUC JP code set 2, starts with the EUC JP SS2 character (0x8E)
sl@0
   687
		// and is followed by character in range 0xA1- 0xDF
sl@0
   688
		else if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) ) 
sl@0
   689
			{
sl@0
   690
			// this could be 2 bytes of EUC JP code set 2
sl@0
   691
			aNumberOfBytesConverted += 2;
sl@0
   692
			i++;
sl@0
   693
			}
sl@0
   694
		// EUC JP code set 3, starts with the EUC JP SS3 character (0x8F)
sl@0
   695
		// and is followed by two characters in range A1- DF A1 -FE
sl@0
   696
		else if ((character == 0x8F) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) 
sl@0
   697
				&& (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
sl@0
   698
			{
sl@0
   699
			// this could be 3 bytes of EUC JP code set 3
sl@0
   700
			aNumberOfBytesConverted += 3;
sl@0
   701
			i+=2;
sl@0
   702
			}		
sl@0
   703
		else
sl@0
   704
			{
sl@0
   705
			// This is not a valid decoding as EUC JP so reject
sl@0
   706
			return EIsNotCharacterSet;
sl@0
   707
			}
sl@0
   708
		i++;
sl@0
   709
		}	
sl@0
   710
	
sl@0
   711
	
sl@0
   712
	// if all the characters could be converted
sl@0
   713
	if (aNumberOfBytesConverted == sampleLength)
sl@0
   714
		{
sl@0
   715
		return EIsCharacterSet;
sl@0
   716
		}
sl@0
   717
	else if (aNumberOfBytesConverted == 0)
sl@0
   718
		{
sl@0
   719
		return EIsNotCharacterSet;
sl@0
   720
		}
sl@0
   721
	else
sl@0
   722
		{
sl@0
   723
		return EMaybeCharacterSet;
sl@0
   724
		}
sl@0
   725
	}
sl@0
   726
sl@0
   727
			
sl@0
   728
/**
sl@0
   729
 Convert from UCS2 (Universal Character Set containing two bytes) to unicode
sl@0
   730
 Remove any byte order marks in the UCSs.
sl@0
   731
 @param aUnicode Contains the converted text in the Unicode character set.
sl@0
   732
 @param	aForeign The non-Unicode source text to be converted
sl@0
   733
 @param aNumberOfUnconvertibleCharacters Contains the number of bytes which were not converted. 
sl@0
   734
 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter The index of the first byte of the first unconvertible character.
sl@0
   735
 @return the number of bytes converted
sl@0
   736
 @internalTechnology 
sl@0
   737
 */
sl@0
   738
 TInt CJ5Converter::ConvertUcs2ToUnicode(CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters, 
sl@0
   739
						   TDes16& aUnicode,	 
sl@0
   740
						   const TDesC8& aForeign, 
sl@0
   741
						   TInt& aNumberOfUnconvertibleCharacters,  
sl@0
   742
						   TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) 
sl@0
   743
sl@0
   744
	{
sl@0
   745
	TInt numberOfBytesConverted = 0;
sl@0
   746
	TInt numberOfUnicodeCharacters =0;
sl@0
   747
	TChar nextChar;
sl@0
   748
sl@0
   749
	// start at begining of the output buffer provided
sl@0
   750
	aUnicode.Zero();
sl@0
   751
sl@0
   752
	// while there is at least 2 bytes of data to convert and space in the output buffer
sl@0
   753
	while ( (numberOfBytesConverted+1 < aForeign.Size()) && (numberOfUnicodeCharacters < aUnicode.MaxLength()) )
sl@0
   754
		{
sl@0
   755
		if (aDefaultEndiannessOfForeignCharacters == CCnvCharacterSetConverter::ELittleEndian )
sl@0
   756
			{
sl@0
   757
			// ELittleEndian 0x??00
sl@0
   758
			nextChar = aForeign[numberOfBytesConverted] + ( aForeign[numberOfBytesConverted+1] << 8);
sl@0
   759
			}
sl@0
   760
		else
sl@0
   761
			{
sl@0
   762
			// EBigEndian 0x00??
sl@0
   763
			nextChar = ( aForeign[numberOfBytesConverted] <<8 ) + aForeign[numberOfBytesConverted+1];
sl@0
   764
			}
sl@0
   765
			
sl@0
   766
		// save the unicode character extracted	unless it's a BOM
sl@0
   767
		if ( nextChar != KByteOrderMark )
sl@0
   768
			{
sl@0
   769
			aUnicode.Append( nextChar );
sl@0
   770
			numberOfUnicodeCharacters++;	
sl@0
   771
			}
sl@0
   772
			
sl@0
   773
		numberOfBytesConverted+=2;
sl@0
   774
		}
sl@0
   775
	
sl@0
   776
	// there are no uncovertable characters with UCS2, but there could be
sl@0
   777
	aNumberOfUnconvertibleCharacters = 0;
sl@0
   778
	// a negative value indicates that all characters converted
sl@0
   779
	aIndexOfFirstByteOfFirstUnconvertibleCharacter = -1;
sl@0
   780
				
sl@0
   781
	// returns the number of unconverted bytes left at the end of the input descriptor 
sl@0
   782
	// Note there could be 1 byte left over if an odd number of bytes provided for conversion
sl@0
   783
	return aForeign.Size() - numberOfBytesConverted;
sl@0
   784
	}
sl@0
   785
		
sl@0
   786
/**
sl@0
   787
 Convert from EUC_JP (Extended Unix Code encoding for Japanese)
sl@0
   788
 Using the standard Charconv method of an array of methods
sl@0
   789
 @return the number of bytes converted
sl@0
   790
 @internalTechnology 
sl@0
   791
 */
sl@0
   792
 TInt CJ5Converter::ConvertEEucjpToUnicode(
sl@0
   793
		CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, 
sl@0
   794
		TDes16& aUnicode, 
sl@0
   795
		const TDesC8& aForeign, 
sl@0
   796
		TInt& /*aState*/, 
sl@0
   797
		TInt& aNumberOfUnconvertibleCharacters, 
sl@0
   798
		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
sl@0
   799
	{
sl@0
   800
	TFixedArray<CnvUtilities::SMethod, 4> methods;
sl@0
   801
	methods[0].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisRoman;
sl@0
   802
	methods[0].iConvertToIntermediateBufferInPlace=DummyConvertToIntermediateBufferInPlace;
sl@0
   803
	methods[0].iConversionData=&CnvJisRoman::ConversionData();
sl@0
   804
	methods[0].iNumberOfBytesPerCharacter=1;
sl@0
   805
	methods[0].iNumberOfCoreBytesPerCharacter=1;
sl@0
   806
	methods[1].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0208;
sl@0
   807
	methods[1].iConvertToIntermediateBufferInPlace=ConvertToJisX0208FromEucJpPackedInPlace;
sl@0
   808
	methods[1].iConversionData=&CnvJisX0208::ConversionData();
sl@0
   809
	methods[1].iNumberOfBytesPerCharacter=2;
sl@0
   810
	methods[1].iNumberOfCoreBytesPerCharacter=2;
sl@0
   811
	methods[2].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToHalfWidthKatakana8;
sl@0
   812
	methods[2].iConvertToIntermediateBufferInPlace=ConvertToHalfWidthKatakana8FromEucJpPackedInPlace;
sl@0
   813
	methods[2].iConversionData=&CnvHalfWidthKatakana8::ConversionData();
sl@0
   814
	methods[2].iNumberOfBytesPerCharacter=2;
sl@0
   815
	methods[2].iNumberOfCoreBytesPerCharacter=1;
sl@0
   816
	methods[3].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0212;
sl@0
   817
	methods[3].iConvertToIntermediateBufferInPlace=ConvertToJisX0212FromEucJpPackedInPlace;
sl@0
   818
	methods[3].iConversionData=&CnvJisX0212::ConversionData();
sl@0
   819
	methods[3].iNumberOfBytesPerCharacter=3;
sl@0
   820
	methods[3].iNumberOfCoreBytesPerCharacter=2;
sl@0
   821
	return CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, methods.Array());
sl@0
   822
	}		
sl@0
   823