os/textandloc/charconvfw/charconvplugins/src/shared/gb2312_shared.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 * Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: 
    15 *
    16 */
    17 
    18 
    19 #include <e32std.h>
    20 #include <convgeneratedcpp.h>
    21 #include "gb2312.h"
    22 
    23 struct SCnvConversionData;
    24 
    25 EXPORT_C const TDesC8& CnvGb2312::ReplacementForUnconvertibleUnicodeCharacters()
    26 	{
    27 	return ReplacementForUnconvertibleUnicodeCharacters_internal();
    28 	}
    29 
    30 EXPORT_C const SCnvConversionData& CnvGb2312::ConversionData()
    31 	{
    32 	return conversionData;
    33 	}
    34 
    35 EXPORT_C TBool CnvGb2312::IsCharGBBased(TInt& aConfidenceLevel, const TDesC8& aSample)
    36 	{
    37 	TInt sampleLength = aSample.Length();
    38 	aConfidenceLevel = 0;
    39 	//WBB the following is for distiguish between big5 and GBK
    40 	TInt totalWeight=0;		//sum of the weights of 20 most frequent chars
    41 	TInt sumOfGoodChar=0;		//the number of chars whose first byte and second are both in the range
    42 	TInt sumOfWeight=0;		//sum of the weights of the chars which are included in the sample
    43 	TInt sumOutChar=0;		//the number of chars which are not common
    44 	TInt sumOfBadSecondByte=0;//the number of chars whose first byte is in the range but not the second
    45 	TInt sumOfBadSingleByte=0;	//the number of bad single byte, which is not in valid range
    46 	struct referenceChar
    47 		{
    48 		TUint charGBK;
    49 		TInt weight;
    50 		};
    51 
    52 	referenceChar refGbk[20];
    53 	static const TInt iniWeight[20]=
    54 		{
    55 		//occurence per 1000 chars
    56 		30,20,20,10,10,10,10,10,5,5,
    57 		5,5,5,5,5,5,5,5,5,5
    58 		};
    59 
    60 	static const TUint iniChar[20]=
    61 		{
    62 		0xa3ac,0xb5c4,0xc1cb,0xb8f6,0xb2bb,0xb0d1,0xd2bb,0xcac7,0xd2aa,0xbecd,
    63 		0xd2b2,0xccec,0xc9cf,0xbacd,0xd6d0,0xd4da,0xd0a1,0xc8cb,0xcfc2,0xd6d0,
    64 		};
    65 
    66 	for (TInt k=0; k<20; k++)
    67 		{
    68 		refGbk[k].charGBK=iniChar[k];
    69 		refGbk[k].weight=iniWeight[k];
    70 		totalWeight=totalWeight+iniWeight[k];
    71 		}
    72 
    73 	
    74 	//WBB
    75 	for (TInt i = 0; i < sampleLength; ++i)
    76 		{
    77 		//GBK encoding first byte range 0x81-0xfe
    78 		//              second byte range 0x40-0x7e, 0x80-0xfe
    79 		if((aSample[i] >= 0x81) && (aSample[i] <= 0xfe))
    80 			{
    81 			TInt increment1 = i+1;
    82 			if (increment1 >= sampleLength)
    83 				break;
    84 			if (((aSample[increment1] >=0x40) && (aSample[increment1] <= 0x7e)) ||
    85 				((aSample[increment1] >=0x80) && (aSample[increment1] <= 0xfe)))
    86 				{
    87 				//WBB
    88 				TUint charGbk=(aSample[i]<<8)|(aSample[increment1]);
    89 				TInt j;
    90 				for (j=0; j<20; j++)
    91 					{
    92 					if (charGbk==refGbk[j].charGBK)
    93 						{
    94 						sumOfWeight=sumOfWeight+refGbk[j].weight;
    95 						break;
    96 						}
    97 					}
    98 				if ((aSample[i]>=0xa4)&&(aSample[i]<=0xaf))
    99 					sumOutChar++;
   100 				sumOfGoodChar++;
   101 				i++;
   102 				//WBB
   103 				}
   104 			else
   105 				{
   106 				sumOfBadSecondByte++;				
   107 				}
   108 			}
   109 		// if seldom used characters
   110 		else if (aSample[i] < 0x20 || aSample[i] > 0x7F ) 
   111 			{
   112 			if (aSample[i]!=0x09 && aSample[i]!=0x0A && aSample[i]!=0x0D)
   113 				sumOfBadSingleByte++;
   114 			}
   115 		} // for 
   116 
   117 	TInt limit;
   118 	limit = (10*sampleLength)/100;
   119 	if (sumOfGoodChar > limit)
   120 		{
   121 		aConfidenceLevel=sumOfGoodChar*100/(sumOfBadSecondByte+sumOfGoodChar+sumOfBadSingleByte);
   122 		aConfidenceLevel=aConfidenceLevel-Max(0,((totalWeight-sumOfWeight)*sumOfGoodChar/1000));//against frequent chars 
   123 		aConfidenceLevel=aConfidenceLevel-(sumOutChar*100/sumOfGoodChar);//against gap
   124 		aConfidenceLevel=(aConfidenceLevel < 0)?0:aConfidenceLevel;
   125 		}
   126 	else
   127 		aConfidenceLevel=0;
   128 	return ETrue;
   129 	}