Symaptic: os/textandloc/charconvfw/charconvplugins/src/plugins/j5.cpp@260cb5ec6c19 (annotated)

sl@0	1	/*
sl@0	2	* Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0	3	* All rights reserved.
sl@0	4	* This component and the accompanying materials are made available
sl@0	5	* under the terms of "Eclipse Public License v1.0"
sl@0	6	* which accompanies this distribution, and is available
sl@0	7	* at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0	8	*
sl@0	9	* Initial Contributors:
sl@0	10	* Nokia Corporation - initial contribution.
sl@0	11	*
sl@0	12	* Contributors:
sl@0	13	*
sl@0	14	* Description:
sl@0	15	* J5 charconv character converter
sl@0	16	*
sl@0	17	*/
sl@0	18
sl@0	19
sl@0	20	#include <e32std.h>
sl@0	21	#include <charconv.h>
sl@0	22	#include <ecom/implementationproxy.h>
sl@0	23	#include <utf.h>
sl@0	24	#include <charactersetconverter.h>
sl@0	25	#include <convutils.h>
sl@0	26	#include "shiftjis.h"
sl@0	27	#include "jisbase.h"
sl@0	28	#include "j5.h"
sl@0	29
sl@0	30	#include "jisx0201.h"
sl@0	31	#include "jisx0208.h"
sl@0	32	#include "jisx0212.h"
sl@0	33
sl@0	34	#include "featmgr/featmgr.h"
sl@0	35
sl@0	36	/**
sl@0	37	J5 will use up to KMaxSizeAutoDetectSample bytes to try to determine the format of the data.
sl@0	38	*/
sl@0	39	const TInt KMaxSizeAutoDetectSample = 1000;
sl@0	40
sl@0	41	const TUint8 KEscape = 0x1b; // ESC byte; DetectIso2022() looks for escape sequences that start with it
sl@0	42	const TInt KByteOrderMark = 0xfeff; // ConvertUcs2ToUnicode() leaves this code unit out of its output
sl@0	43
sl@0	44	const TDesC8& CJ5Converter::ReplacementForUnconvertibleUnicodeCharacters()
sl@0	45	{
sl@0	46	return CnvShiftJis::ReplacementForUnconvertibleUnicodeCharacters();
sl@0	47	}
sl@0	48
sl@0	49	/**
sl@0	50	This API should not be used as it is ambiguous as to what encoding is required.
sl@0	51	The user should instead call the specific plug-in for the appropriate conversion.
sl@0	52	J5 ConvertFromUnicode() will convert to UTF8 as default.
sl@0	53	@internalTechnology
sl@0	54	*/
sl@0	55	TInt CJ5Converter::ConvertFromUnicode(
sl@0	56	CCnvCharacterSetConverter::TEndianness /* aDefaultEndiannessOfForeignCharacters */,
sl@0	57	const TDesC8& /* aReplacementForUnconvertibleUnicodeCharacters */,
sl@0	58	TDes8& aForeign,
sl@0	59	const TDesC16& aUnicode,
sl@0	60	CCnvCharacterSetConverter::TArrayOfAscendingIndices& /* aIndicesOfUnconvertibleCharacters */)
sl@0	61	{
sl@0	62	return CnvUtfConverter::ConvertFromUnicodeToUtf8(aForeign, aUnicode);
sl@0	63	}
sl@0	64
sl@0	65	/**
sl@0	66	This will automatically determine one of the five supported encodings
sl@0	67	to use and convert accordingly. This plugin method is available to the
sl@0	68	user though the CCnvCharacterSetConverter::ConvertToUnicode() method.
sl@0	69	There is no way for the caller to determine which encoding has been used.
sl@0	70
sl@0	71	NOTE: For debugging the selected character set is returned in the state.
sl@0	72
sl@0	73	@released 9.1
sl@0	74	@param aDefaultEndiannessOfForeignCharacters The default endian-ness to use when reading characters
sl@0	75	in the foreign character set.
sl@0	76	@param aUnicode On return, contains the text converted into Unicode.
sl@0	77	@param aForeign The non-Unicode source text to be converted.
sl@0	78	@param aState Used to save state information across multiple calls
sl@0	79	to <code>ConvertToUnicode()</code>.
sl@0	80	@param aNumberOfUnconvertibleCharacters On return, contains the number of bytes which were not
sl@0	81	converted.
sl@0	82	@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, contains the index of the first bytein the
sl@0	83	input text that could not be converted. A negative
sl@0	84	value indicates that all the characters were
sl@0	85	converted.
sl@0	86	@return The number of unconverted bytes left at the end of the input descriptor
sl@0	87	(e.g. because the output descriptor is not long enough to hold all the text),
sl@0	88	or one of the error values defined in TError.
sl@0	89	@internalTechnology
sl@0	90	*/
sl@0	91	TInt CJ5Converter::ConvertToUnicode(
sl@0	92	CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
sl@0	93	TDes16& aUnicode,
sl@0	94	const TDesC8& aForeign,
sl@0	95	TInt& aState,
sl@0	96	TInt& aNumberOfUnconvertibleCharacters,
sl@0	97	TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
sl@0	98	{
sl@0	99	// As the aState parameter is used to pass back the detected value
sl@0	100	// use a "hidden" internal state variable.
sl@0	101	TInt internalState = CCnvCharacterSetConverter::KStateDefault;
sl@0	102
sl@0	103	// determine the encoding type and then decode appropriatly
sl@0	104	switch ( DetectEncoding(aDefaultEndiannessOfForeignCharacters, aForeign))
sl@0	105	{
sl@0	106	case EShiftjis:
sl@0	107	aState = EShiftjis;
sl@0	108	return CnvShiftJis::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign,
sl@0	109	aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
sl@0	110
sl@0	111	case EIso2022jp1:
sl@0	112	aState = EIso2022jp1;
sl@0	113	return CnvJisBase::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState,
sl@0	114	aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
sl@0	115
sl@0	116	case EEucjp:
sl@0	117	aState = EEucjp;
sl@0	118	return ConvertEEucjpToUnicode(
sl@0	119	aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState,
sl@0	120	aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
sl@0	121
sl@0	122	case EUcs2:
sl@0	123	aState = EUcs2;
sl@0	124	return ConvertUcs2ToUnicode( aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign,
sl@0	125	aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
sl@0	126
sl@0	127	case EUtf8:
sl@0	128	aState = EUtf8;
sl@0	129	return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
sl@0	130
sl@0	131	default:
sl@0	132	// fall though to the default, which is decode as UTF8
sl@0	133	aState = EUnknown;
sl@0	134	break;
sl@0	135	}
sl@0	136
sl@0	137	// decode as UTF8
sl@0	138	return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
sl@0	139	}
sl@0	140
sl@0	141	/**
sl@0	142	This API is used by CCnvCharacterSetConverter::AutoDetectCharacterSetL().
sl@0	143	This method returns a value between 0 and 100, indicating how likely it
sl@0	144	is that this is the correct converter, for the text supplied. As J5 is
sl@0	145	NOT intended to be used with the existing auto-detect mechanism, it will
sl@0	146	always return 0
sl@0	147	@internalTechnology
sl@0	148	*/
sl@0	149	TBool CJ5Converter::IsInThisCharacterSetL(
sl@0	150	TBool& aSetToTrue,
sl@0	151	TInt& aConfidenceLevel,
sl@0	152	const TDesC8& /* aSample */)
sl@0	153	{
sl@0	154	/*
sl@0	155	aSetToTrue - This value should be set to ETrue. It is used to indicate to
sl@0	156	CCnvCharacterSetConverter::AutoDetectCharacterSetL() that the plug-in DLL
sl@0	157	is implementing a function of this signature and is therefore not the empty
sl@0	158	*/
sl@0	159	aSetToTrue=ETrue;
sl@0	160
sl@0	161	/* no need to look at the sample as this always returns 0
sl@0	162	as the autodetect feature is not supported by the J5 plug-in
sl@0	163	*/
sl@0	164	aConfidenceLevel=0;
sl@0	165	return ETrue;
sl@0	166	}
sl@0	167
sl@0	168	CJ5Converter* CJ5Converter::NewL()
sl@0	169	{
sl@0	170	CJ5Converter* self = new(ELeave) CJ5Converter();
sl@0	171	CleanupStack::PushL(self);
sl@0	172	self->ConstructL();
sl@0	173	CleanupStack::Pop(self);
sl@0	174	return self;
sl@0	175	}
sl@0	176
sl@0	177	CJ5Converter::~CJ5Converter()
sl@0	178	{
sl@0	179	FeatureManager::UnInitializeLib();
sl@0	180	}
sl@0	181
sl@0	182	CJ5Converter::CJ5Converter()
sl@0	183	{
sl@0	184	}
sl@0	185
sl@0	186	void CJ5Converter::ConstructL()
sl@0	187	{
sl@0	188	FeatureManager::InitializeLibL();
sl@0	189	}
sl@0	190
sl@0	191	const TImplementationProxy ImplementationTable[] =
sl@0	192	{
sl@0	193	#ifdef KDDIAU_TEST
sl@0	194	// for the test build use a special test UID
sl@0	195	IMPLEMENTATION_PROXY_ENTRY(0x01000002, CJ5Converter::NewL)
sl@0	196	#else
sl@0	197	IMPLEMENTATION_PROXY_ENTRY(KCharacterSetIdentifierJ5, CJ5Converter::NewL)
sl@0	198	#endif
sl@0	199	};
sl@0	200
sl@0	201	EXPORT_C const TImplementationProxy* ImplementationGroupProxy(TInt& aTableCount)
sl@0	202	{
sl@0	203	aTableCount = sizeof(ImplementationTable) / sizeof(TImplementationProxy);
sl@0	204
sl@0	205	return ImplementationTable;
sl@0	206	}
sl@0	207
sl@0	208	/**
sl@0	209	DetectEncoding determine the characterset encoding.
sl@0	210	The logic for this detection is based on the information in CJKV by Ken Lunde.
sl@0	211	A detailed diagram of this logic is in the J5 how to document section 2.4
sl@0	212	@return The detected character set as a enum CJ5Converter.
sl@0	213	@internalTechnology
sl@0	214	*/
sl@0	215	enum CJ5Converter::TJ5Encoding CJ5Converter::DetectEncoding(
sl@0	216	CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters ,
sl@0	217	const TDesC8& aForeign)
sl@0	218	{
sl@0	219
sl@0	220	// first check for UCS2
sl@0	221	CCnvCharacterSetConverter::TEndianness ucs2Endianness = CCnvCharacterSetConverter::ELittleEndian;
sl@0	222	if ( DetectUcs2(aForeign, ucs2Endianness ))
sl@0	223	{
sl@0	224	// if ucs2 is detected pass back the detected endianess
sl@0	225	aDefaultEndiannessOfForeignCharacters = ucs2Endianness;
sl@0	226	return EUcs2;
sl@0	227	}
sl@0	228
sl@0	229	// next try EUC_JP
sl@0	230	TInt eucJpValidBytes = 0;
sl@0	231	CJ5Converter::TDectectCharacterSet result = DetectEucJp( aForeign, eucJpValidBytes );
sl@0	232	if ( result == EIsCharacterSet )
sl@0	233	{
sl@0	234	return EEucjp;
sl@0	235	}
sl@0	236
sl@0	237	// next try Iso 2020JP
sl@0	238	if ( DetectIso2022( aForeign ) == EIsCharacterSet )
sl@0	239	{
sl@0	240	return EIso2022jp1;
sl@0	241	}
sl@0	242
sl@0	243	// next try Utf8
sl@0	244	if ( DetectUtf8( aForeign ) == EIsCharacterSet )
sl@0	245	{
sl@0	246	return EUtf8;
sl@0	247	}
sl@0	248
sl@0	249	// shiftjis
sl@0	250	TInt shiftjisValidBytes = 0;
sl@0	251	result = DetectShiftJis( aForeign, shiftjisValidBytes );
sl@0	252	if ( result == EIsCharacterSet )
sl@0	253	{
sl@0	254	return EShiftjis;
sl@0	255	}
sl@0	256
sl@0	257	// no clear winner so go for the best
sl@0	258	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);
sl@0	259
sl@0	260	// if more than half is shiftjis and more shiftjis than EUC_JP,
sl@0	261	if ((shiftjisValidBytes > eucJpValidBytes ) && (shiftjisValidBytes * 2> sampleLength))
sl@0	262	return EShiftjis;
sl@0	263
sl@0	264	// if more than half is EUC_JP and more EUC_JP than shiftjis,
sl@0	265	if ((eucJpValidBytes > shiftjisValidBytes ) && (eucJpValidBytes * 2> sampleLength))
sl@0	266	return EEucjp;
sl@0	267
sl@0	268	// return the default
sl@0	269	return EUcs2;
sl@0	270	}
sl@0	271
sl@0	272
sl@0	273	/**
sl@0	274	Check if UCS2.
sl@0	275	If the first two bytes are the Unicode Endian Specifiers (0xfffe or 0xfeff)
sl@0	276	then this must be UCS2. Otherwise try lookiing for 0x00 or 0x00
sl@0	277	@param A sample of data to be checked
sl@0	278	@param The Endianness if USC2 is detected
sl@0	279	@return ETrue if UCS2 else EFalse
sl@0	280	@internalTechnology
sl@0	281	*/
sl@0	282	TBool CJ5Converter::DetectUcs2( const TDesC8& aForeign,
sl@0	283	CCnvCharacterSetConverter::TEndianness& aTEndianness )
sl@0	284	{
sl@0	285	// if the sample is not big enough
sl@0	286	if (aForeign.Length() < 2)
sl@0	287	{
sl@0	288	return EFalse;
sl@0	289	}
sl@0	290	else if (aForeign[0]==0xff && aForeign[1]==0xfe )
sl@0	291	{
sl@0	292	// we have found a Little Endian Byte order mark
sl@0	293	aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
sl@0	294	return ETrue;
sl@0	295	}
sl@0	296	else if (aForeign[0]==0xfe && aForeign[1]==0xff )
sl@0	297	{
sl@0	298	// we have found a Big Endian Byte order mark
sl@0	299	aTEndianness = CCnvCharacterSetConverter::EBigEndian;
sl@0	300	return ETrue;
sl@0	301	}
sl@0	302
sl@0	303	// Next check for sequences of 0x00 or 0x00 as UCS-2 is the only charset that
sl@0	304	// specifies 0x00 or 0x00 (according to endianness) for the ASCII range of characters.
sl@0	305	// NB: This will fail if there are no ASCII characters in the text.
sl@0	306	TInt sampleLength = aForeign.Length();
sl@0	307	sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
sl@0	308
sl@0	309	// check the sample for sequences of 0x00 or 0x00
sl@0	310	TInt bigEndianConfidence = 0;
sl@0	311	TInt littleEndianConfidence = 0;
sl@0	312	TInt i=0;
sl@0	313	for(;i< (sampleLength-1); i+=2)
sl@0	314	{
sl@0	315	if( aForeign[i] == 0x00)
sl@0	316	{
sl@0	317	bigEndianConfidence +=2;
sl@0	318	}
sl@0	319	else if ( aForeign[i+1] == 0x00)
sl@0	320	{
sl@0	321	littleEndianConfidence +=2;
sl@0	322	}
sl@0	323	}
sl@0	324
sl@0	325	// which occurs most BE or LE
sl@0	326	TInt confidenceLevel = 0;
sl@0	327	if (bigEndianConfidence > littleEndianConfidence)
sl@0	328	{
sl@0	329	aTEndianness = CCnvCharacterSetConverter::EBigEndian;
sl@0	330	confidenceLevel = bigEndianConfidence;
sl@0	331	}
sl@0	332	else
sl@0	333	{
sl@0	334	aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
sl@0	335	confidenceLevel = littleEndianConfidence;
sl@0	336	}
sl@0	337
sl@0	338	// if more than 97% count as UCS2
sl@0	339	if ( confidenceLevel * 100/sampleLength > 97)
sl@0	340	return ETrue;
sl@0	341
sl@0	342	return EFalse;
sl@0	343	}
sl@0	344
sl@0	345	/**
sl@0	346	Check if ShiftJis (reference CJKV by Ken Lunde page 175)
sl@0	347	@param A sample of data to be checked
sl@0	348	@param The number of input bytes that can be converted
sl@0	349	@return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
sl@0	350	@internalTechnology
sl@0	351	*/
sl@0	352	enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectShiftJis( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
sl@0	353	{
sl@0	354	// Get the sample length
sl@0	355	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
sl@0	356
sl@0	357	TInt i=0;
sl@0	358	aNumberOfBytesConverted = 0;
sl@0	359
sl@0	360	TText8 character;
sl@0	361	TText8 characterPlus1;
sl@0	362	TText8 characterPlus2;
sl@0	363
sl@0	364	// scan the sample text looking for valid shiftjis data
sl@0	365	while ( i < sampleLength )
sl@0	366	{
sl@0	367	// get the next few characters, use 0 if there is no more sample
sl@0	368	// as this will not match any test.
sl@0	369	character = aForeign[i];
sl@0	370	characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
sl@0	371	characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
sl@0	372
sl@0	373	// SHIFTJIS - 0x8e to 0x9f followed by 0x40 to 0xfc
sl@0	374	if ((character >= 0x81) && (character <= 0x9f) &&
sl@0	375	(characterPlus1 >= 0x40) && (characterPlus1 <= 0xfc) )
sl@0	376	{
sl@0	377	// this is SHIFTJIS unless it is EUC JP code set 2 or 3
sl@0	378	if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF))
sl@0	379	{
sl@0	380	// this could be EUC JP code set 2 (or shiftjis)
sl@0	381	aNumberOfBytesConverted+=2;
sl@0	382	i++;
sl@0	383	}
sl@0	384	else if ((character == 0x8F) &&
sl@0	385	(characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) &&
sl@0	386	(characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
sl@0	387	{
sl@0	388	// this could be EUC JP code set 3 (or shiftjis)
sl@0	389	aNumberOfBytesConverted+=3;
sl@0	390	i+=2;
sl@0	391	}
sl@0	392	else
sl@0	393	{
sl@0	394	// this can only be shift jis
sl@0	395	return EIsCharacterSet;
sl@0	396	}
sl@0	397	}
sl@0	398
sl@0	399	// SHIFTJIS - 0xE0 to 0xEF followed by .....
sl@0	400	else if ((character >= 0xE0) && (character <= 0xEF))
sl@0	401	{
sl@0	402	// 0x40 to 0xFC which overlaps UTF8 between 0x80 and 0xBF
sl@0	403	// including Mopera extension to shiftjis from 0xEF80 to 0xEFFC
sl@0	404
sl@0	405	if ( (characterPlus1 >= 0x40) && (characterPlus1 <= 0x7E) )
sl@0	406	{
sl@0	407	// this can only be shift jis
sl@0	408	return EIsCharacterSet;
sl@0	409	}
sl@0	410	else if ( (characterPlus1 >= 0xC0) && (characterPlus1 <= 0xFC) )
sl@0	411	{
sl@0	412	// this could be EUC JP code set 1
sl@0	413	aNumberOfBytesConverted+=2;
sl@0	414	i++;
sl@0	415	}
sl@0	416
sl@0	417	// problem here is the overlap between the UTF8 and shiftjis
sl@0	418	else if ( (characterPlus1 >= 0x80) && (characterPlus1 <= 0xBF) )
sl@0	419	{
sl@0	420	// this could be shiftjis or utf8
sl@0	421	aNumberOfBytesConverted+=2;
sl@0	422	i++;
sl@0	423	}
sl@0	424	}
sl@0	425	// half width katakana A1-DF
sl@0	426	else if ((character >= 0xA1) && (character <= 0xDF))
sl@0	427	{
sl@0	428	aNumberOfBytesConverted+=1;
sl@0	429	}
sl@0	430	// ASCII or JIS-Roman 20-7e
sl@0	431	else if ( ((character >= 0x20) && (character <= 0x7E)) \|\| (character == 0x0A) \|\| (character == 0x0D))
sl@0	432	{
sl@0	433	aNumberOfBytesConverted+=1;
sl@0	434	}
sl@0	435	else
sl@0	436	{
sl@0	437	// This is not decoding as shiftjis, so reject
sl@0	438	aNumberOfBytesConverted =0;
sl@0	439	return EIsNotCharacterSet;
sl@0	440	}
sl@0	441	i++;
sl@0	442	}
sl@0	443
sl@0	444	// if all the characters could be converted
sl@0	445	if (aNumberOfBytesConverted == sampleLength)
sl@0	446	{
sl@0	447	return EIsCharacterSet;
sl@0	448	}
sl@0	449	else if (aNumberOfBytesConverted == 0)
sl@0	450	{
sl@0	451	return EIsNotCharacterSet;
sl@0	452	}
sl@0	453	else
sl@0	454	{
sl@0	455	return EMaybeCharacterSet;
sl@0	456	}
sl@0	457	}
sl@0	458
sl@0	459	/**
sl@0	460	Check if UTF8 (reference CJKV by Ken Lunde page 189)
sl@0	461	@param A sample of data to be checked
sl@0	462	@param The number of input bytes that can be converted
sl@0	463	@return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
sl@0	464	@internalTechnology
sl@0	465	*/
sl@0	466	enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectUtf8( const TDesC8& aForeign )
sl@0	467	{
sl@0	468	// Get the sample length
sl@0	469	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
sl@0	470
sl@0	471	TInt i=0;
sl@0	472	TText8 character;
sl@0	473	TText8 characterPlus1;
sl@0	474	TText8 characterPlus2;
sl@0	475	TText8 characterPlus3;
sl@0	476
sl@0	477	// scan the sample text looking for valid UTF8
sl@0	478	while ( i < sampleLength )
sl@0	479	{
sl@0	480	// get the next few characters, use 0 if there is no more sample
sl@0	481	// as this will not match any test.
sl@0	482	character = aForeign[i];
sl@0	483	characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
sl@0	484	characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
sl@0	485	characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0);
sl@0	486
sl@0	487	// UTF8 range 110xxxxx followed by one valid UTF8 bytes
sl@0	488	if(((character & 0xe0)==0xc0) && (( characterPlus1 & 0xc0)==0x80) )
sl@0	489	{
sl@0	490	// two bytes of valid UTF8 found
sl@0	491	i+=2;
sl@0	492	}
sl@0	493	// UTF8 range 1110xxxx followed by two valid UTF8 bytes
sl@0	494	else if(((character & 0xf0)==0xe0) && (( characterPlus1 & 0xc0)==0x80) && (( characterPlus2 & 0xc0)==0x80))
sl@0	495	{
sl@0	496	// three bytes of valid UTF8 found
sl@0	497	i+=3;
sl@0	498	}
sl@0	499	// UTF8 range 11110xxx followed by three valid UTF8 bytes
sl@0	500	else if(((character & 0xf8)==0xf0) && (( characterPlus1 & 0xc0)==0x80)
sl@0	501	&& (( characterPlus2 & 0xc0)==0x80) && (( characterPlus3 & 0xc0)==0x80) )
sl@0	502	{
sl@0	503	// four bytes of valid UTF8 found
sl@0	504	i+=4;
sl@0	505	}
sl@0	506
sl@0	507	// ascii range 0 to 0x7F
sl@0	508	else if((character & 0x80)==0x00)
sl@0	509	{
sl@0	510	// The value of character is in the range 0x00-0x7f
sl@0	511	// UTF8 maintains ASCII transparency. So it's a valid UTF8.
sl@0	512	i++;
sl@0	513	}
sl@0	514	// if the sample data is longer than KMaxSizeAutoDetectSample then except anything
sl@0	515	// for the last two bytes as they may not appear valid without more data
sl@0	516	else if( i >= (KMaxSizeAutoDetectSample -2) )
sl@0	517	{
sl@0	518	i++;
sl@0	519	}
sl@0	520	else
sl@0	521	{
sl@0	522	// This is not decoding as UTF8 so reject
sl@0	523	return EIsNotCharacterSet;
sl@0	524	}
sl@0	525	}
sl@0	526
sl@0	527	// All the characters could be converted
sl@0	528	return EIsCharacterSet;
sl@0	529
sl@0	530	}
sl@0	531
sl@0	532
sl@0	533	/**
sl@0	534	Check if ISO2022JP by lookiing for the escape sequences.
sl@0	535	@param A sample of data to be checked
sl@0	536	@param The number of input bytes that can be converted
sl@0	537	@return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
sl@0	538	@internalTechnology
sl@0	539	*/
sl@0	540	enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectIso2022( const TDesC8& aForeign )
sl@0	541	{
sl@0	542	// Get the sample length
sl@0	543	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
sl@0	544
sl@0	545	TInt i=0;
sl@0	546	TText8 character;
sl@0	547	TText8 characterPlus1;
sl@0	548	TText8 characterPlus2;
sl@0	549	TText8 characterPlus3;
sl@0	550	TText8 characterPlus4;
sl@0	551	TText8 characterPlus5;
sl@0	552
sl@0	553	// scan the sample text looking for valid UTF8
sl@0	554	while ( i < sampleLength )
sl@0	555	{
sl@0	556	// get the next few characters, use 0 if there is no more sample
sl@0	557	// as this will not match any test.
sl@0	558	character = aForeign[i];
sl@0	559	characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
sl@0	560	characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
sl@0	561	characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0);
sl@0	562
sl@0	563
sl@0	564	// check for the JIS escape sequences of ISO 2022Jp
sl@0	565	// These values have been taken from JISBASE_SHARED
sl@0	566	if (character == KEscape)
sl@0	567	{
sl@0	568	// Escape Sequence For Jis C6226_1978 \x1b\x24\x40
sl@0	569	if ((characterPlus1 == 0x24) && (characterPlus2 == 0x40))
sl@0	570	{
sl@0	571	return EIsCharacterSet;
sl@0	572	}
sl@0	573
sl@0	574	// Escape Sequence For Jis X0208_1983 \x1b\x24\x42
sl@0	575	else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x42))
sl@0	576	{
sl@0	577	return EIsCharacterSet;
sl@0	578	}
sl@0	579
sl@0	580	// Escape Sequence For Jis Roman \x1b\x28\x4a
sl@0	581	else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x4A))
sl@0	582	{
sl@0	583	return EIsCharacterSet;
sl@0	584	}
sl@0	585
sl@0	586	// Escape Sequence For Jis RomanIncorrect \x1b\x28\x48
sl@0	587	else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x48))
sl@0	588	{
sl@0	589	return EIsCharacterSet;
sl@0	590	}
sl@0	591
sl@0	592	// Escape Sequence For Ascii \x1b\x28\x42
sl@0	593	else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x42))
sl@0	594	{
sl@0	595	return EIsCharacterSet;
sl@0	596	}
sl@0	597
sl@0	598	// Escape Sequence For EscapeSequenceForHalfWidthKatakana \x1b\x28\x49
sl@0	599	else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x49))
sl@0	600	{
sl@0	601	return EIsCharacterSet;
sl@0	602	}
sl@0	603
sl@0	604	// Escape Sequence For Jis X0208_199x \x1b\x26\x40\x1b\x24\x42
sl@0	605	else if ((characterPlus1 == 0x26) && (characterPlus2 == 0x40))
sl@0	606	{
sl@0	607	characterPlus4 = ( i < (sampleLength-4) ? aForeign[i+4]:0);
sl@0	608	characterPlus5 = ( i < (sampleLength-5) ? aForeign[i+5]:0);
sl@0	609
sl@0	610	if ((characterPlus3 == 0x1b) && (characterPlus4 == 0x24) && (characterPlus5 == 0x42))
sl@0	611	{
sl@0	612	return EIsCharacterSet;
sl@0	613	}
sl@0	614	}
sl@0	615	// Escape Sequence For Jis X0212_1990 \x1b\x24\x28\x44
sl@0	616	else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x28))
sl@0	617	{
sl@0	618	if (characterPlus3 == 0x44)
sl@0	619	{
sl@0	620	return EIsCharacterSet;
sl@0	621	}
sl@0	622	}
sl@0	623
sl@0	624	// check for the JIS escape sequences of ISO 2022Jp "B@" x42 x40
sl@0	625	else if ((characterPlus1 == 'B') \|\| (characterPlus1 == '@'))
sl@0	626	{
sl@0	627	return EIsCharacterSet;
sl@0	628	}
sl@0	629
sl@0	630	} // end of if ( character == KEscape )
sl@0	631
sl@0	632	i++;
sl@0	633	}
sl@0	634
sl@0	635	// if escape sequences have been found then this is not ISO2022
sl@0	636	return EIsNotCharacterSet;
sl@0	637
sl@0	638	}
sl@0	639
sl@0	640
sl@0	641	/**
sl@0	642	Check if EUC JP (reference CJKV by Ken Lunde page 164)
sl@0	643	@param A sample of data to be checked
sl@0	644	@param The number of input bytes that can be converted
sl@0	645	@return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
sl@0	646	@internalTechnology
sl@0	647	*/
sl@0	648	CJ5Converter::TDectectCharacterSet CJ5Converter::DetectEucJp( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
sl@0	649	{
sl@0	650	// Get the sample length
sl@0	651	TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
sl@0	652
sl@0	653	TInt i=0;
sl@0	654	aNumberOfBytesConverted = 0;
sl@0	655
sl@0	656	TText8 character;
sl@0	657	TText8 characterPlus1;
sl@0	658	TText8 characterPlus2;
sl@0	659
sl@0	660	// scan the sample text looking for valid shiftjis data
sl@0	661	while ( i < sampleLength )
sl@0	662	{
sl@0	663	// get the next few characters, use 0 if there is no more sample
sl@0	664	// as this will not match any test.
sl@0	665	character = aForeign[i];
sl@0	666	characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
sl@0	667	characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
sl@0	668
sl@0	669	// EUCJP code set 0 0x21-0x7e
sl@0	670	if ( (character >= 0x21) && (character <= 0x7e))
sl@0	671	{
sl@0	672	aNumberOfBytesConverted++;
sl@0	673	}
sl@0	674	else if ( (character == 0x0a) \|\| (character == 0x0d))
sl@0	675	{
sl@0	676	aNumberOfBytesConverted++;
sl@0	677	}
sl@0	678	// EUCJP code set 1
sl@0	679	else if ( (character >= 0xa1) && (character <= 0xff)
sl@0	680	&& (characterPlus1 >= 0xa1) && (characterPlus1 <= 0xff) )
sl@0	681	{
sl@0	682	aNumberOfBytesConverted+=2;
sl@0	683	i++;
sl@0	684	}
sl@0	685
sl@0	686	// EUC JP code set 2, starts with the EUC JP SS2 character (0x8E)
sl@0	687	// and is followed by character in range 0xA1- 0xDF
sl@0	688	else if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) )
sl@0	689	{
sl@0	690	// this could be 2 bytes of EUC JP code set 2
sl@0	691	aNumberOfBytesConverted += 2;
sl@0	692	i++;
sl@0	693	}
sl@0	694	// EUC JP code set 3, starts with the EUC JP SS3 character (0x8F)
sl@0	695	// and is followed by two characters in range A1- DF A1 -FE
sl@0	696	else if ((character == 0x8F) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF)
sl@0	697	&& (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
sl@0	698	{
sl@0	699	// this could be 3 bytes of EUC JP code set 3
sl@0	700	aNumberOfBytesConverted += 3;
sl@0	701	i+=2;
sl@0	702	}
sl@0	703	else
sl@0	704	{
sl@0	705	// This is not a valid decoding as EUC JP so reject
sl@0	706	return EIsNotCharacterSet;
sl@0	707	}
sl@0	708	i++;
sl@0	709	}
sl@0	710
sl@0	711
sl@0	712	// if all the characters could be converted
sl@0	713	if (aNumberOfBytesConverted == sampleLength)
sl@0	714	{
sl@0	715	return EIsCharacterSet;
sl@0	716	}
sl@0	717	else if (aNumberOfBytesConverted == 0)
sl@0	718	{
sl@0	719	return EIsNotCharacterSet;
sl@0	720	}
sl@0	721	else
sl@0	722	{
sl@0	723	return EMaybeCharacterSet;
sl@0	724	}
sl@0	725	}
sl@0	726
sl@0	727
sl@0	728	/**
sl@0	729	Convert from UCS2 (Universal Character Set containing two bytes) to unicode
sl@0	730	Remove any byte order marks in the UCSs.
sl@0	731	@param aUnicode Contains the converted text in the Unicode character set.
sl@0	732	@param aForeign The non-Unicode source text to be converted
sl@0	733	@param aNumberOfUnconvertibleCharacters Contains the number of bytes which were not converted.
sl@0	734	@param aIndexOfFirstByteOfFirstUnconvertibleCharacter The index of the first byte of the first unconvertible character.
sl@0	735	@return the number of bytes converted
sl@0	736	@internalTechnology
sl@0	737	*/
sl@0	738	TInt CJ5Converter::ConvertUcs2ToUnicode(CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters,
sl@0	739	TDes16& aUnicode,
sl@0	740	const TDesC8& aForeign,
sl@0	741	TInt& aNumberOfUnconvertibleCharacters,
sl@0	742	TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
sl@0	743
sl@0	744	{
sl@0	745	TInt numberOfBytesConverted = 0;
sl@0	746	TInt numberOfUnicodeCharacters =0;
sl@0	747	TChar nextChar;
sl@0	748
sl@0	749	// start at begining of the output buffer provided
sl@0	750	aUnicode.Zero();
sl@0	751
sl@0	752	// while there is at least 2 bytes of data to convert and space in the output buffer
sl@0	753	while ( (numberOfBytesConverted+1 < aForeign.Size()) && (numberOfUnicodeCharacters < aUnicode.MaxLength()) )
sl@0	754	{
sl@0	755	if (aDefaultEndiannessOfForeignCharacters == CCnvCharacterSetConverter::ELittleEndian )
sl@0	756	{
sl@0	757	// ELittleEndian 0x??00
sl@0	758	nextChar = aForeign[numberOfBytesConverted] + ( aForeign[numberOfBytesConverted+1] << 8);
sl@0	759	}
sl@0	760	else
sl@0	761	{
sl@0	762	// EBigEndian 0x00??
sl@0	763	nextChar = ( aForeign[numberOfBytesConverted] <<8 ) + aForeign[numberOfBytesConverted+1];
sl@0	764	}
sl@0	765
sl@0	766	// save the unicode character extracted unless it's a BOM
sl@0	767	if ( nextChar != KByteOrderMark )
sl@0	768	{
sl@0	769	aUnicode.Append( nextChar );
sl@0	770	numberOfUnicodeCharacters++;
sl@0	771	}
sl@0	772
sl@0	773	numberOfBytesConverted+=2;
sl@0	774	}
sl@0	775
sl@0	776	// there are no uncovertable characters with UCS2, but there could be
sl@0	777	aNumberOfUnconvertibleCharacters = 0;
sl@0	778	// a negative value indicates that all characters converted
sl@0	779	aIndexOfFirstByteOfFirstUnconvertibleCharacter = -1;
sl@0	780
sl@0	781	// returns the number of unconverted bytes left at the end of the input descriptor
sl@0	782	// Note there could be 1 byte left over if an odd number of bytes provided for conversion
sl@0	783	return aForeign.Size() - numberOfBytesConverted;
sl@0	784	}
sl@0	785
sl@0	786	/**
sl@0	787	Convert from EUC_JP (Extended Unix Code encoding for Japanese)
sl@0	788	Using the standard Charconv method of an array of methods
sl@0	789	@return the number of bytes converted
sl@0	790	@internalTechnology
sl@0	791	*/
sl@0	792	TInt CJ5Converter::ConvertEEucjpToUnicode(
sl@0	793	CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
sl@0	794	TDes16& aUnicode,
sl@0	795	const TDesC8& aForeign,
sl@0	796	TInt& /aState/,
sl@0	797	TInt& aNumberOfUnconvertibleCharacters,
sl@0	798	TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
sl@0	799	{
sl@0	800	TFixedArray<CnvUtilities::SMethod, 4> methods;
sl@0	801	methods[0].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisRoman;
sl@0	802	methods[0].iConvertToIntermediateBufferInPlace=DummyConvertToIntermediateBufferInPlace;
sl@0	803	methods[0].iConversionData=&CnvJisRoman::ConversionData();
sl@0	804	methods[0].iNumberOfBytesPerCharacter=1;
sl@0	805	methods[0].iNumberOfCoreBytesPerCharacter=1;
sl@0	806	methods[1].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0208;
sl@0	807	methods[1].iConvertToIntermediateBufferInPlace=ConvertToJisX0208FromEucJpPackedInPlace;
sl@0	808	methods[1].iConversionData=&CnvJisX0208::ConversionData();
sl@0	809	methods[1].iNumberOfBytesPerCharacter=2;
sl@0	810	methods[1].iNumberOfCoreBytesPerCharacter=2;
sl@0	811	methods[2].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToHalfWidthKatakana8;
sl@0	812	methods[2].iConvertToIntermediateBufferInPlace=ConvertToHalfWidthKatakana8FromEucJpPackedInPlace;
sl@0	813	methods[2].iConversionData=&CnvHalfWidthKatakana8::ConversionData();
sl@0	814	methods[2].iNumberOfBytesPerCharacter=2;
sl@0	815	methods[2].iNumberOfCoreBytesPerCharacter=1;
sl@0	816	methods[3].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0212;
sl@0	817	methods[3].iConvertToIntermediateBufferInPlace=ConvertToJisX0212FromEucJpPackedInPlace;
sl@0	818	methods[3].iConversionData=&CnvJisX0212::ConversionData();
sl@0	819	methods[3].iNumberOfBytesPerCharacter=3;
sl@0	820	methods[3].iNumberOfCoreBytesPerCharacter=2;
sl@0	821	return CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, methods.Array());
sl@0	822	}
sl@0	823

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--