os/textandloc/charconvfw/charconvplugins/src/plugins/big5.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: 
    15 *
    16 */
    17 
    18 
    19 #include <e32std.h>
    20 #include <charconv.h>
    21 #include "big5.h"
    22 #include <ecom/implementationproxy.h>
    23 #include <charactersetconverter.h>
    24 
    25 class CBIG5ConverterImpl : public CCharacterSetConverterPluginInterface
    26 	{
    27 
    28 public:
    29 	virtual const TDesC8& ReplacementForUnconvertibleUnicodeCharacters();
    30 
    31 	virtual TInt ConvertFromUnicode(
    32 		CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, 
    33 		const TDesC8& aReplacementForUnconvertibleUnicodeCharacters, 
    34 		TDes8& aForeign, 
    35 		const TDesC16& aUnicode, 
    36 		CCnvCharacterSetConverter::TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters);
    37 
    38 	virtual TInt ConvertToUnicode(
    39 		CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, 
    40 		TDes16& aUnicode, 
    41 		const TDesC8& aForeign, 
    42 		TInt& aState, 
    43 		TInt& aNumberOfUnconvertibleCharacters, 
    44 		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter);
    45 
    46 	virtual TBool IsInThisCharacterSetL(
    47 		TBool& aSetToTrue, 
    48 		TInt& aConfidenceLevel, 
    49 		const TDesC8& aSample);
    50 
    51 	static CBIG5ConverterImpl* NewL();
    52 	virtual ~CBIG5ConverterImpl();
    53 
    54 private:
    55 	CBIG5ConverterImpl();
    56 
    57 	};
    58 
    59 
    60 const TDesC8& CBIG5ConverterImpl::ReplacementForUnconvertibleUnicodeCharacters()
    61 	{
    62 	return CnvBig5::ReplacementForUnconvertibleUnicodeCharacters();
    63 	}
    64 
    65 TInt CBIG5ConverterImpl::ConvertFromUnicode(
    66 		CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, 
    67 		const TDesC8& aReplacementForUnconvertibleUnicodeCharacters, 
    68 		TDes8& aForeign, 
    69 		const TDesC16& aUnicode, 
    70 		CCnvCharacterSetConverter::TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters)
    71 	{
    72 	return CCnvCharacterSetConverter::DoConvertFromUnicode(CnvBig5::ConversionData(), aDefaultEndiannessOfForeignCharacters, aReplacementForUnconvertibleUnicodeCharacters, aForeign, aUnicode, aIndicesOfUnconvertibleCharacters);
    73 	}
    74 
    75 TInt CBIG5ConverterImpl::ConvertToUnicode(
    76 		CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, 
    77 		TDes16& aUnicode, 
    78 		const TDesC8& aForeign, 
    79 		TInt& /*aState*/, 
    80 		TInt& aNumberOfUnconvertibleCharacters, 
    81 		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
    82 	{
    83 	return CCnvCharacterSetConverter::DoConvertToUnicode(CnvBig5::ConversionData(), aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
    84 	}
    85 
    86 TBool CBIG5ConverterImpl::IsInThisCharacterSetL(
    87 		TBool& aSetToTrue, 
    88 		TInt& aConfidenceLevel, 
    89 		const TDesC8& aSample)
    90 	{
    91 	aSetToTrue=ETrue;
    92 	TInt sampleLength = aSample.Length();
    93 	aConfidenceLevel = 0;
    94 	//WBB the following is for distiguish between big5 and GBK
    95 	TInt totalWeight=0;		//sum of the weights of 20 most frequent chars
    96 	TInt sumOfGoodChar=0;		//the number of chars whose first byte and second are both in the range
    97 	TInt sumOfWeight=0;		//sum of the weights of the chars which are included in the sample
    98 	TInt sumOutChar=0;		//the number of chars which are not common
    99 	TInt sumOfBadSecondByte=0;//the number of chars whose first byte is in the range but not the second
   100 	TInt sumOfBadSingleByte=0;	//the number of bad single byte, which is not in valid range
   101 	struct referenceChar
   102 		{
   103 		TUint charBig5;
   104 		TInt weight;
   105 		};
   106 
   107 	referenceChar refBig5[20];
   108 	static const TInt iniWeight[20]=
   109 		{
   110 		//occurence per 1000 chars
   111 		30,20,20,10,10,10,10,10,5,5,
   112 		5,5,5,5,5,5,5,5,5,5
   113 		};
   114 
   115 	static const TUint iniChar[20]=
   116 		{
   117 		0xa141,0xaaba,0xa446,0xadd3,0xa4a3,0xa7e2,0xa440,0xac4f,0xad6e,0xa45d,
   118 		0xa4d1,0xa457,0xa457,0xa94d,0xa4a4,0xa569,0xa662,0xa470,0xa448,0xa455
   119 		};
   120 
   121 	for (TInt k=0; k<20; k++)
   122 		{
   123 		refBig5[k].charBig5=iniChar[k];
   124 		refBig5[k].weight=iniWeight[k];
   125 		totalWeight=totalWeight+iniWeight[k];
   126 		}
   127 	//WBB
   128 	for (TInt i = 0; i < sampleLength; ++i)
   129 		{
   130 		// Big 5 encoding first byte range 0xA1-0xFE 
   131 		//                second byte range 0x40-0x7E  0xA1-0xFE
   132 		if((aSample[i] >= 0xa1) && (aSample[i] <= 0xfe))
   133 			{
   134 			TInt increment1 = i+1;
   135 			if (increment1 >= sampleLength)
   136 				break;
   137 			if(((aSample[increment1] >= 0x40) && (aSample[increment1] <= 0x7e)) ||
   138 				((aSample[increment1] >= 0xa1) && (aSample[increment1] <= 0xfe)))
   139 				{
   140 				TUint charBig5=(aSample[i]<<8)|(aSample[increment1]);
   141 				if (charBig5>=0xc6a1)//Kanas start and rare chars follow after 
   142 					sumOutChar++;
   143 				TInt j;
   144 				for (j=0; j<20; j++)
   145 					{
   146 					if (charBig5==refBig5[j].charBig5)
   147 						{
   148 						sumOfWeight=sumOfWeight+refBig5[j].weight;
   149 						break;
   150 						}
   151 					}
   152 				sumOfGoodChar++;
   153 				i++;
   154 				}
   155 			else
   156 				{
   157 				sumOfBadSecondByte++;
   158 				}
   159 			}
   160 		// if seldom used characters
   161 		else if (aSample[i] < 0x20 || aSample[i] > 0x7F ) 
   162 			{
   163 			if (aSample[i]!=0x09 && aSample[i]!=0x0A && aSample[i]!=0x0D)
   164 				sumOfBadSingleByte++;
   165 			}
   166 		} // for 
   167 
   168 	if (sumOfGoodChar)
   169 		{
   170 		aConfidenceLevel=sumOfGoodChar*100/(sumOfBadSecondByte+sumOfGoodChar+sumOfBadSingleByte);
   171 		aConfidenceLevel=aConfidenceLevel-Max(0,((totalWeight-sumOfWeight)*sumOfGoodChar/1000));//against frequent chars 
   172 		aConfidenceLevel=aConfidenceLevel-sumOutChar*100/sumOfGoodChar;//against gap
   173 		aConfidenceLevel=(aConfidenceLevel < 0)?0:aConfidenceLevel;
   174 		}
   175 	else
   176 		aConfidenceLevel=0;
   177 	return ETrue;
   178 	}
   179 
   180 CBIG5ConverterImpl* CBIG5ConverterImpl::NewL()
   181 	{
   182 	CBIG5ConverterImpl* self = new(ELeave) CBIG5ConverterImpl();
   183 	return self;
   184 	}
   185 
   186 CBIG5ConverterImpl::~CBIG5ConverterImpl()
   187 	{
   188 	}
   189 
   190 CBIG5ConverterImpl::CBIG5ConverterImpl()
   191 	{
   192 	}
   193 
   194 const TImplementationProxy ImplementationTable[] = 
   195 	{
   196 		IMPLEMENTATION_PROXY_ENTRY(0x10000FBF,CBIG5ConverterImpl::NewL)
   197 	};
   198 
   199 EXPORT_C const TImplementationProxy* ImplementationGroupProxy(TInt& aTableCount)
   200 	{
   201 	aTableCount = sizeof(ImplementationTable) / sizeof(TImplementationProxy);
   202 
   203 	return ImplementationTable;
   204 	}
   205