sl@0
|
1 |
/*
|
sl@0
|
2 |
* Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
|
sl@0
|
3 |
* All rights reserved.
|
sl@0
|
4 |
* This component and the accompanying materials are made available
|
sl@0
|
5 |
* under the terms of "Eclipse Public License v1.0"
|
sl@0
|
6 |
* which accompanies this distribution, and is available
|
sl@0
|
7 |
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
|
sl@0
|
8 |
*
|
sl@0
|
9 |
* Initial Contributors:
|
sl@0
|
10 |
* Nokia Corporation - initial contribution.
|
sl@0
|
11 |
*
|
sl@0
|
12 |
* Contributors:
|
sl@0
|
13 |
*
|
sl@0
|
14 |
* Description:
|
sl@0
|
15 |
*
|
sl@0
|
16 |
*/
|
sl@0
|
17 |
|
sl@0
|
18 |
|
sl@0
|
19 |
#include <e32std.h>
|
sl@0
|
20 |
#include <convgeneratedcpp.h>
|
sl@0
|
21 |
#include "gb2312.h"
|
sl@0
|
22 |
|
sl@0
|
23 |
// Forward declaration: within this file the type is only used as the
// reference return type of CnvGb2312::ConversionData().
// NOTE(review): the complete definition presumably comes from the generated
// conversion tables pulled in via the includes above -- confirm.
struct SCnvConversionData;
|
sl@0
|
24 |
|
sl@0
|
25 |
/**
Exported accessor that forwards to
ReplacementForUnconvertibleUnicodeCharacters_internal() and hands back the
8-bit descriptor that helper returns.

NOTE(review): judging by the name, this is the byte sequence emitted in place
of Unicode characters the GB2312 converter cannot represent -- confirm against
the generated conversion tables.

@return Reference to the descriptor supplied by the internal helper.
*/
EXPORT_C const TDesC8& CnvGb2312::ReplacementForUnconvertibleUnicodeCharacters()
    {
    const TDesC8& replacement = ReplacementForUnconvertibleUnicodeCharacters_internal();
    return replacement;
    }
|
sl@0
|
29 |
|
sl@0
|
30 |
/**
Exported accessor for the file-scope object `conversionData`.

NOTE(review): `conversionData` is not defined in this file; it is presumably
the generated GB2312 <-> Unicode table set -- confirm.

@return Reference to `conversionData`.
*/
EXPORT_C const SCnvConversionData& CnvGb2312::ConversionData()
    {
    const SCnvConversionData& data = conversionData;
    return data;
    }
|
sl@0
|
34 |
|
sl@0
|
35 |
/**
Estimates how likely a byte sample is to be GB2312/GBK encoded text.

The sample is scanned once. Each byte in 0x81-0xfe is treated as a lead byte;
if the following byte is in 0x40-0x7e or 0x80-0xfe the pair counts as a "good"
double-byte character, otherwise the lead byte counts as having a bad second
byte (and the following byte is examined again on its own). Single bytes
outside 0x20-0x7f, other than TAB/LF/CR, count as bad single bytes. A lead
byte with nothing after it ends the scan.

If more than 10% of the sample length is made up of good characters, the
confidence starts as the percentage of good characters among all counted
characters and is then reduced by:
  - a penalty for the absence of the frequent characters listed in the
    reference table (used to distinguish GBK from Big5), and
  - a penalty proportional to the share of characters whose lead byte is in
    0xa4-0xaf.
The result is clamped so that it is never negative. Otherwise the confidence
is 0.

@param aConfidenceLevel On return, the confidence score (0..100).
@param aSample          The bytes to examine.
@return Always ETrue.
*/
EXPORT_C TBool CnvGb2312::IsCharGBBased(TInt& aConfidenceLevel, const TDesC8& aSample)
    {
    struct SReferenceChar
        {
        TUint iCharGbk; // double-byte code, lead byte in the high 8 bits
        TInt iWeight;   // occurrence per 1000 chars
        };

    // Frequent characters and their weights, used to distinguish between
    // Big5 and GBK.
    // NOTE(review): 0xd6d0 is listed twice. The lookup below stops at the
    // first hit, so the second entry can never match, yet its weight still
    // contributes to totalWeight. It is kept as-is to preserve the existing
    // scoring -- confirm which character was intended.
    static const SReferenceChar KReferenceChars[] =
        {
        {0xa3ac, 30}, {0xb5c4, 20}, {0xc1cb, 20}, {0xb8f6, 10}, {0xb2bb, 10},
        {0xb0d1, 10}, {0xd2bb, 10}, {0xcac7, 10}, {0xd2aa,  5}, {0xbecd,  5},
        {0xd2b2,  5}, {0xccec,  5}, {0xc9cf,  5}, {0xbacd,  5}, {0xd6d0,  5},
        {0xd4da,  5}, {0xd0a1,  5}, {0xc8cb,  5}, {0xcfc2,  5}, {0xd6d0,  5}
        };
    const TInt KReferenceCharCount = sizeof(KReferenceChars) / sizeof(KReferenceChars[0]);

    const TInt sampleLength = aSample.Length();
    aConfidenceLevel = 0;

    TInt totalWeight = 0;        // sum of the weights of all reference chars
    TInt sumOfGoodChar = 0;      // chars whose first and second bytes are both in range
    TInt sumOfWeight = 0;        // sum of the weights of reference chars found in the sample
    TInt sumOutChar = 0;         // good chars whose lead byte is in 0xa4-0xaf ("not common")
    TInt sumOfBadSecondByte = 0; // lead byte in range but second byte is not
    TInt sumOfBadSingleByte = 0; // single bytes outside the valid range

    for (TInt k = 0; k < KReferenceCharCount; ++k)
        {
        totalWeight += KReferenceChars[k].iWeight;
        }

    for (TInt i = 0; i < sampleLength; ++i)
        {
        const TUint lead = aSample[i];

        // GBK encoding: first byte range 0x81-0xfe,
        //               second byte range 0x40-0x7e, 0x80-0xfe
        if ((lead >= 0x81) && (lead <= 0xfe))
            {
            const TInt trailIndex = i + 1;
            if (trailIndex >= sampleLength)
                {
                // Lead byte at the very end of the sample: nothing to pair it with.
                break;
                }

            const TUint trail = aSample[trailIndex];
            const TBool trailInRange =
                ((trail >= 0x40) && (trail <= 0x7e)) ||
                ((trail >= 0x80) && (trail <= 0xfe));

            if (trailInRange)
                {
                const TUint charGbk = (lead << 8) | trail;
                for (TInt j = 0; j < KReferenceCharCount; ++j)
                    {
                    if (charGbk == KReferenceChars[j].iCharGbk)
                        {
                        sumOfWeight += KReferenceChars[j].iWeight;
                        break;
                        }
                    }

                // NOTE(review): presumably the rows of symbols / unassigned
                // code points that rarely occur in running text -- confirm.
                if ((lead >= 0xa4) && (lead <= 0xaf))
                    {
                    ++sumOutChar;
                    }

                ++sumOfGoodChar;
                ++i; // the trail byte has been consumed as part of this character
                }
            else
                {
                // The trail byte is not skipped; the next iteration examines
                // it on its own.
                ++sumOfBadSecondByte;
                }
            }
        // seldom used single-byte characters
        else if ((lead < 0x20) || (lead > 0x7F))
            {
            if ((lead != 0x09) && (lead != 0x0A) && (lead != 0x0D))
                {
                ++sumOfBadSingleByte;
                }
            }
        }

    const TInt limit = (10 * sampleLength) / 100;
    if (sumOfGoodChar > limit)
        {
        // sumOfGoodChar > limit >= 0 here, so neither divisor below can be zero.
        TInt confidence =
            sumOfGoodChar * 100 / (sumOfBadSecondByte + sumOfGoodChar + sumOfBadSingleByte);

        // Penalty against missing frequent chars. The sign is tested before
        // multiplying: when the sample contains many frequent chars the
        // difference is a large negative number and the product with
        // sumOfGoodChar could overflow TInt, although the penalty is simply
        // zero in that case.
        const TInt missingWeight = totalWeight - sumOfWeight;
        if (missingWeight > 0)
            {
            confidence -= missingWeight * sumOfGoodChar / 1000;
            }

        // Penalty against chars from the 0xa4-0xaf lead-byte range ("gap").
        confidence -= sumOutChar * 100 / sumOfGoodChar;

        aConfidenceLevel = (confidence < 0) ? 0 : confidence;
        }

    return ETrue;
    }
|