sl@0: /* sl@0: * Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). sl@0: * All rights reserved. sl@0: * This component and the accompanying materials are made available sl@0: * under the terms of "Eclipse Public License v1.0" sl@0: * which accompanies this distribution, and is available sl@0: * at the URL "http://www.eclipse.org/legal/epl-v10.html". sl@0: * sl@0: * Initial Contributors: sl@0: * Nokia Corporation - initial contribution. sl@0: * sl@0: * Contributors: sl@0: * sl@0: * Description: sl@0: * sl@0: */ sl@0: sl@0: sl@0: #include sl@0: #include sl@0: #include "gb2312.h" sl@0: sl@0: struct SCnvConversionData; sl@0: sl@0: EXPORT_C const TDesC8& CnvGb2312::ReplacementForUnconvertibleUnicodeCharacters() sl@0: { sl@0: return ReplacementForUnconvertibleUnicodeCharacters_internal(); sl@0: } sl@0: sl@0: EXPORT_C const SCnvConversionData& CnvGb2312::ConversionData() sl@0: { sl@0: return conversionData; sl@0: } sl@0: sl@0: EXPORT_C TBool CnvGb2312::IsCharGBBased(TInt& aConfidenceLevel, const TDesC8& aSample) sl@0: { sl@0: TInt sampleLength = aSample.Length(); sl@0: aConfidenceLevel = 0; sl@0: //WBB the following is for distiguish between big5 and GBK sl@0: TInt totalWeight=0; //sum of the weights of 20 most frequent chars sl@0: TInt sumOfGoodChar=0; //the number of chars whose first byte and second are both in the range sl@0: TInt sumOfWeight=0; //sum of the weights of the chars which are included in the sample sl@0: TInt sumOutChar=0; //the number of chars which are not common sl@0: TInt sumOfBadSecondByte=0;//the number of chars whose first byte is in the range but not the second sl@0: TInt sumOfBadSingleByte=0; //the number of bad single byte, which is not in valid range sl@0: struct referenceChar sl@0: { sl@0: TUint charGBK; sl@0: TInt weight; sl@0: }; sl@0: sl@0: referenceChar refGbk[20]; sl@0: static const TInt iniWeight[20]= sl@0: { sl@0: //occurence per 1000 chars sl@0: 30,20,20,10,10,10,10,10,5,5, sl@0: 5,5,5,5,5,5,5,5,5,5 sl@0: }; sl@0: sl@0: static const TUint iniChar[20]= sl@0: { sl@0: 0xa3ac,0xb5c4,0xc1cb,0xb8f6,0xb2bb,0xb0d1,0xd2bb,0xcac7,0xd2aa,0xbecd, sl@0: 0xd2b2,0xccec,0xc9cf,0xbacd,0xd6d0,0xd4da,0xd0a1,0xc8cb,0xcfc2,0xd6d0, sl@0: }; sl@0: sl@0: for (TInt k=0; k<20; k++) sl@0: { sl@0: refGbk[k].charGBK=iniChar[k]; sl@0: refGbk[k].weight=iniWeight[k]; sl@0: totalWeight=totalWeight+iniWeight[k]; sl@0: } sl@0: sl@0: sl@0: //WBB sl@0: for (TInt i = 0; i < sampleLength; ++i) sl@0: { sl@0: //GBK encoding first byte range 0x81-0xfe sl@0: // second byte range 0x40-0x7e, 0x80-0xfe sl@0: if((aSample[i] >= 0x81) && (aSample[i] <= 0xfe)) sl@0: { sl@0: TInt increment1 = i+1; sl@0: if (increment1 >= sampleLength) sl@0: break; sl@0: if (((aSample[increment1] >=0x40) && (aSample[increment1] <= 0x7e)) || sl@0: ((aSample[increment1] >=0x80) && (aSample[increment1] <= 0xfe))) sl@0: { sl@0: //WBB sl@0: TUint charGbk=(aSample[i]<<8)|(aSample[increment1]); sl@0: TInt j; sl@0: for (j=0; j<20; j++) sl@0: { sl@0: if (charGbk==refGbk[j].charGBK) sl@0: { sl@0: sumOfWeight=sumOfWeight+refGbk[j].weight; sl@0: break; sl@0: } sl@0: } sl@0: if ((aSample[i]>=0xa4)&&(aSample[i]<=0xaf)) sl@0: sumOutChar++; sl@0: sumOfGoodChar++; sl@0: i++; sl@0: //WBB sl@0: } sl@0: else sl@0: { sl@0: sumOfBadSecondByte++; sl@0: } sl@0: } sl@0: // if seldom used characters sl@0: else if (aSample[i] < 0x20 || aSample[i] > 0x7F ) sl@0: { sl@0: if (aSample[i]!=0x09 && aSample[i]!=0x0A && aSample[i]!=0x0D) sl@0: sumOfBadSingleByte++; sl@0: } sl@0: } // for sl@0: sl@0: TInt limit; sl@0: limit = (10*sampleLength)/100; sl@0: if (sumOfGoodChar > limit) sl@0: { sl@0: aConfidenceLevel=sumOfGoodChar*100/(sumOfBadSecondByte+sumOfGoodChar+sumOfBadSingleByte); sl@0: aConfidenceLevel=aConfidenceLevel-Max(0,((totalWeight-sumOfWeight)*sumOfGoodChar/1000));//against frequent chars sl@0: aConfidenceLevel=aConfidenceLevel-(sumOutChar*100/sumOfGoodChar);//against gap sl@0: aConfidenceLevel=(aConfidenceLevel < 0)?0:aConfidenceLevel; sl@0: } sl@0: else sl@0: aConfidenceLevel=0; sl@0: return ETrue; sl@0: }