os/textandloc/charconvfw/charconvplugins/src/shared/gb2312_shared.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
* Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     3
* All rights reserved.
sl@0
     4
* This component and the accompanying materials are made available
sl@0
     5
* under the terms of "Eclipse Public License v1.0"
sl@0
     6
* which accompanies this distribution, and is available
sl@0
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     8
*
sl@0
     9
* Initial Contributors:
sl@0
    10
* Nokia Corporation - initial contribution.
sl@0
    11
*
sl@0
    12
* Contributors:
sl@0
    13
*
sl@0
    14
* Description: 
sl@0
    15
*
sl@0
    16
*/
sl@0
    17
sl@0
    18
sl@0
    19
#include <e32std.h>
sl@0
    20
#include <convgeneratedcpp.h>
sl@0
    21
#include "gb2312.h"
sl@0
    22
sl@0
    23
struct SCnvConversionData;
sl@0
    24
sl@0
    25
/**
Exported accessor that forwards to
ReplacementForUnconvertibleUnicodeCharacters_internal() and returns the
descriptor that helper yields.

@return Reference to the 8-bit descriptor supplied by the internal helper
        (presumably the GB2312 byte sequence substituted for Unicode
        characters that have no GB2312 mapping - confirm against the
        generated conversion data).
*/
EXPORT_C const TDesC8& CnvGb2312::ReplacementForUnconvertibleUnicodeCharacters()
	{
	const TDesC8& replacement = ReplacementForUnconvertibleUnicodeCharacters_internal();
	return replacement;
	}
sl@0
    29
sl@0
    30
/**
Exported accessor for the conversion-data object named conversionData.

@return Reference to conversionData (presumably the generated GB2312
        conversion tables - confirm against the generated source).
*/
EXPORT_C const SCnvConversionData& CnvGb2312::ConversionData()
	{
	const SCnvConversionData& data = conversionData;
	return data;
	}
sl@0
    34
sl@0
    35
/**
Estimates how likely it is that a byte sample is GB-encoded (GBK byte ranges)
text. The heuristic was written to help tell GBK apart from Big5.

The sample is scanned once. Every byte in 0x81..0xfe is treated as the lead
byte of a two-byte character; it counts as "good" when the following byte is in
0x40..0x7e or 0x80..0xfe. The confidence starts as the percentage of good
characters among all classified units and is then reduced by
 - a frequency penalty, when the sample contains few of the reference
   "most frequent" characters, and
 - a gap penalty, proportional to the share of good characters whose lead
   byte is in 0xa4..0xaf (counted by the original author as uncommon).

If the number of good characters does not exceed one tenth of the sample
length, the confidence is 0.

@param aConfidenceLevel On return, the confidence in the range 0..100.
@param aSample          The bytes to examine. May be empty.
@return Always ETrue.
*/
EXPORT_C TBool CnvGb2312::IsCharGBBased(TInt& aConfidenceLevel, const TDesC8& aSample)
	{
	struct SFrequentChar
		{
		TUint iCode;	// two-byte code, lead byte in the high 8 bits
		TInt iWeight;	// occurrences per 1000 characters
		};

	// Reference table of frequent characters and their weights.
	// NOTE(review): 0xd6d0 is listed twice. The second entry can never be
	// matched (the search stops at the first hit) and only adds 5 to the total
	// weight. It is kept so that results stay identical to the previous
	// implementation - confirm which character was intended.
	static const SFrequentChar KFrequentChars[] =
		{
		{0xa3ac, 30}, {0xb5c4, 20}, {0xc1cb, 20}, {0xb8f6, 10}, {0xb2bb, 10},
		{0xb0d1, 10}, {0xd2bb, 10}, {0xcac7, 10}, {0xd2aa, 5}, {0xbecd, 5},
		{0xd2b2, 5}, {0xccec, 5}, {0xc9cf, 5}, {0xbacd, 5}, {0xd6d0, 5},
		{0xd4da, 5}, {0xd0a1, 5}, {0xc8cb, 5}, {0xcfc2, 5}, {0xd6d0, 5}
		};
	const TInt KFrequentCharCount = sizeof(KFrequentChars) / sizeof(KFrequentChars[0]);

	TInt totalWeight = 0;		// sum of all reference weights
	for (TInt k = 0; k < KFrequentCharCount; ++k)
		{
		totalWeight += KFrequentChars[k].iWeight;
		}

	const TInt sampleLength = aSample.Length();
	TInt goodCharCount = 0;		// lead and trail byte both in range
	TInt matchedWeight = 0;		// weights of the reference characters seen (added per occurrence)
	TInt uncommonCharCount = 0;	// good characters whose lead byte is 0xa4..0xaf
	TInt badTrailCount = 0;		// lead byte in range, trail byte not
	TInt badSingleCount = 0;	// single bytes outside the accepted range

	for (TInt pos = 0; pos < sampleLength; ++pos)
		{
		const TUint lead = aSample[pos];

		// GBK encoding: first byte 0x81-0xfe,
		//               second byte 0x40-0x7e or 0x80-0xfe
		if (lead >= 0x81 && lead <= 0xfe)
			{
			const TInt trailPos = pos + 1;
			if (trailPos >= sampleLength)
				{
				// Sample ends in the middle of a character: stop scanning.
				break;
				}

			const TUint trail = aSample[trailPos];
			const TBool trailInRange =
				(trail >= 0x40 && trail <= 0x7e) || (trail >= 0x80 && trail <= 0xfe);
			if (!trailInRange)
				{
				// Only the lead byte is consumed; the offending byte is
				// classified on its own in the next iteration.
				++badTrailCount;
				continue;
				}

			const TUint code = (lead << 8) | trail;
			for (TInt j = 0; j < KFrequentCharCount; ++j)
				{
				if (KFrequentChars[j].iCode == code)
					{
					matchedWeight += KFrequentChars[j].iWeight;
					break;
					}
				}

			if (lead >= 0xa4 && lead <= 0xaf)
				{
				++uncommonCharCount;
				}
			++goodCharCount;
			++pos;	// skip the trail byte that has just been consumed
			}
		else if (lead < 0x20 || lead > 0x7F)
			{
			// Seldom used single bytes; tab, line feed and carriage return
			// are accepted.
			if (lead != 0x09 && lead != 0x0A && lead != 0x0D)
				{
				++badSingleCount;
				}
			}
		}

	aConfidenceLevel = 0;

	// Same value as (10*sampleLength)/100 for a non-negative length, but it
	// cannot overflow.
	const TInt minGoodChars = sampleLength / 10;
	if (goodCharCount > minGoodChars)
		{
		// goodCharCount is at least 1 here, so neither divisor can be zero.
		TInt confidence = goodCharCount * 100 / (badTrailCount + goodCharCount + badSingleCount);

		// Penalty for a lack of frequent characters. matchedWeight grows with
		// every occurrence, so the difference is often negative; in that case
		// the penalty is zero. Checking the sign BEFORE multiplying avoids the
		// signed overflow that (difference * goodCharCount) causes on large
		// samples.
		const TInt missingWeight = totalWeight - matchedWeight;
		if (missingWeight > 0)
			{
			confidence -= missingWeight * goodCharCount / 1000;
			}

		// Penalty for characters from the uncommon lead-byte range.
		confidence -= uncommonCharCount * 100 / goodCharCount;

		aConfidenceLevel = (confidence < 0) ? 0 : confidence;
		}

	return ETrue;
	}