os/textandloc/charconvfw/charconv_fw/src/charconv/cp1252.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     3
* All rights reserved.
sl@0
     4
* This component and the accompanying materials are made available
sl@0
     5
* under the terms of "Eclipse Public License v1.0"
sl@0
     6
* which accompanies this distribution, and is available
sl@0
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     8
*
sl@0
     9
* Initial Contributors:
sl@0
    10
* Nokia Corporation - initial contribution.
sl@0
    11
*
sl@0
    12
* Contributors:
sl@0
    13
*
sl@0
    14
* Description: 
sl@0
    15
*
sl@0
    16
*/
sl@0
    17
sl@0
    18
sl@0
    19
#include <e32std.h>
sl@0
    20
#include <convdata.h>
sl@0
    21
sl@0
    22
// Number of elements in a statically sized array: the array's total size divided by
// the size of its first element.
#define ARRAY_LENGTH(aArray) (sizeof(aArray)/sizeof((aArray)[0]))
sl@0
    23
sl@0
    24
// Unicode values for the contiguous CP1252 byte run 0x82..0x8c, stored in byte order
// (entry 0 belongs to byte 0x82). The run boundaries come from the keyed entry in
// keyedTables16OfIndexedTables16_keyedEntries_codePage1252ToUnicode_1 that points here.
LOCAL_D const TUint16 keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_1[]=
	{
	0x201a, // byte 0x82
	0x0192, // byte 0x83
	0x201e, // byte 0x84
	0x2026, // byte 0x85
	0x2020, // byte 0x86
	0x2021, // byte 0x87
	0x02c6, // byte 0x88
	0x2030, // byte 0x89
	0x0160, // byte 0x8a
	0x2039, // byte 0x8b
	0x0152  // byte 0x8c
	};
sl@0
    38
sl@0
    39
// Unicode values for the contiguous CP1252 byte run 0x91..0x9c, stored in byte order
// (entry 0 belongs to byte 0x91). The run boundaries come from the keyed entry in
// keyedTables16OfIndexedTables16_keyedEntries_codePage1252ToUnicode_1 that points here.
LOCAL_D const TUint16 keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_2[]=
	{
	0x2018, // byte 0x91
	0x2019, // byte 0x92
	0x201c, // byte 0x93
	0x201d, // byte 0x94
	0x2022, // byte 0x95
	0x2013, // byte 0x96
	0x2014, // byte 0x97
	0x02dc, // byte 0x98
	0x2122, // byte 0x99
	0x0161, // byte 0x9a
	0x203a, // byte 0x9b
	0x0153  // byte 0x9c
	};
sl@0
    54
sl@0
    55
// Unicode values for the contiguous CP1252 byte run 0x9e..0x9f, stored in byte order
// (entry 0 belongs to byte 0x9e). The run boundaries come from the keyed entry in
// keyedTables16OfIndexedTables16_keyedEntries_codePage1252ToUnicode_1 that points here.
LOCAL_D const TUint16 keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_3[]=
	{
	0x017e, // byte 0x9e
	0x0178  // byte 0x9f
	};
sl@0
    60
sl@0
    61
// One entry per contiguous run of CP1252 bytes that has a Unicode lookup table:
// { first byte of the run, last byte of the run, table indexed from the first byte }.
// The run lengths (11, 12 and 2) match the sizes of the three tables referenced.
// Bytes 0x8d..0x90 and 0x9d fall between the runs and have no entry here.
LOCAL_D const SCnvConversionData::SOneDirectionData::SRange::UData::SKeyedTable16OfIndexedTables16::SKeyedEntry keyedTables16OfIndexedTables16_keyedEntries_codePage1252ToUnicode_1[]=
	{
		{ 0x82, 0x8c, keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_1 },
		{ 0x91, 0x9c, keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_2 },
		{ 0x9e, 0x9f, keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_3 }
	};
sl@0
    79
sl@0
    80
// Unicode-to-CP1252 pairs, written as { Unicode value, CP1252 byte }, for the
// characters that CP1252 places in 0x80..0x9f.
// The entries are in ascending order of the Unicode value.
// NOTE(review): the lookup presumably depends on that ordering - keep it when editing.
LOCAL_D const SCnvConversionData::SOneDirectionData::SRange::UData::SKeyedTable1616::SEntry keyedTable1616_unicodeToCodePage1252_1[]=
	{
		{ 0x0152, 0x8c },
		{ 0x0153, 0x9c },
		{ 0x0160, 0x8a },
		{ 0x0161, 0x9a },
		{ 0x0178, 0x9f },
		{ 0x017d, 0x8e },
		{ 0x017e, 0x9e },
		{ 0x0192, 0x83 },
		{ 0x02c6, 0x88 },
		{ 0x02dc, 0x98 },
		{ 0x2013, 0x96 },
		{ 0x2014, 0x97 },
		{ 0x2018, 0x91 },
		{ 0x2019, 0x92 },
		{ 0x201a, 0x82 },
		{ 0x201c, 0x93 },
		{ 0x201d, 0x94 },
		{ 0x201e, 0x84 },
		{ 0x2020, 0x86 },
		{ 0x2021, 0x87 },
		{ 0x2022, 0x95 },
		{ 0x2026, 0x85 },
		{ 0x2030, 0x89 },
		{ 0x2039, 0x8b },
		{ 0x203a, 0x9b },
		{ 0x20ac, 0x80 },
		{ 0x2122, 0x99 }
	};
sl@0
   191
sl@0
   192
// A single range spanning every possible byte value, 0x00..0xff.
// NOTE(review): the two trailing zeros presumably say that no further bytes follow
// the first one (CP1252 being single-byte) plus a spare field - confirm in convdata.h.
LOCAL_D const SCnvConversionData::SVariableByteData::SRange codePage1252VariableByteDataRanges[]=
	{
		{ 0x00, 0xff, 0, 0 }
	};
sl@0
   201
sl@0
   202
// Ranges describing the CP1252 -> Unicode direction. Every entry gives the first and
// last input byte, the algorithm selector, two values that are 0 throughout this table
// (NOTE(review): their meaning is defined in convdata.h - confirm) and the data the
// algorithm needs.
// Bytes 0x81, 0x8d, 0x8f, 0x90 and 0x9d are covered neither by an EDirect/EOffset range
// nor by a keyed entry of the table used by the last range.
LOCAL_D const SCnvConversionData::SOneDirectionData::SRange codePage1252ToUnicodeDataRanges[]=
	{
		// 0x00..0x7f, EDirect
		{
		0x00, 0x7f,
		SCnvConversionData::SOneDirectionData::SRange::EDirect,
		0, 0,
			{ 0, 0 }
		},
		// 0xa0..0xff, EDirect
		{
		0xa0, 0xff,
		SCnvConversionData::SOneDirectionData::SRange::EDirect,
		0, 0,
			{ 0, 0 }
		},
		// 0x80 only: 0x80 + 8236 == 0x20ac, the pair listed for 0x80 in the reverse table
		{
		0x80, 0x80,
		SCnvConversionData::SOneDirectionData::SRange::EOffset,
		0, 0,
			{ STATIC_CAST(TUint, 8236), 0 }
		},
		// 0x8e only: 0x8e + 239 == 0x017d, the pair listed for 0x8e in the reverse table
		{
		0x8e, 0x8e,
		SCnvConversionData::SOneDirectionData::SRange::EOffset,
		0, 0,
			{ STATIC_CAST(TUint, 239), 0 }
		},
		// 0x82..0x9f via the keyed lookup tables. This range also spans 0x8e, which has
		// its own entry above. NOTE(review): presumably the earlier entry takes
		// precedence - confirm before reordering.
		{
		0x82, 0x9f,
		SCnvConversionData::SOneDirectionData::SRange::EKeyedTable16OfIndexedTables16,
		0, 0,
			{ UData_SKeyedTable16OfIndexedTables16(keyedTables16OfIndexedTables16_keyedEntries_codePage1252ToUnicode_1) }
		}
	};
sl@0
   259
sl@0
   260
// Ranges describing the Unicode -> CP1252 direction. Every entry gives the first and
// last input value, the algorithm selector, the values 1 and 0
// (NOTE(review): presumably the output size in bytes and a spare field - confirm in
// convdata.h) and the data the algorithm needs.
LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToCodePage1252DataRanges[]=
	{
		// 0x0000..0x007f, EDirect
		{
		0x0000, 0x007f,
		SCnvConversionData::SOneDirectionData::SRange::EDirect,
		1, 0,
			{ 0, 0 }
		},
		// 0x00a0..0x00ff, EDirect
		{
		0x00a0, 0x00ff,
		SCnvConversionData::SOneDirectionData::SRange::EDirect,
		1, 0,
			{ 0, 0 }
		},
		// 0x0152..0x2122: the span between the smallest and largest Unicode value
		// listed in keyedTable1616_unicodeToCodePage1252_1
		{
		0x0152, 0x2122,
		SCnvConversionData::SOneDirectionData::SRange::EKeyedTable1616,
		1, 0,
			{ UData_SKeyedTable1616(keyedTable1616_unicodeToCodePage1252_1) }
		}
	};
sl@0
   295
sl@0
   296
// Top-level CP1252 conversion description. It bundles, each with its element count,
// the variable-byte ranges, the CP1252 -> Unicode ranges and the Unicode -> CP1252
// ranges defined earlier in this file.
// NOTE(review): the leading EUnspecified is presumably the endianness setting and the
// two trailing NULLs unused/spare members - confirm in convdata.h.
GLREF_D const SCnvConversionData codePage1252ConversionData=
	{
	SCnvConversionData::EUnspecified,
		{ ARRAY_LENGTH(codePage1252VariableByteDataRanges), codePage1252VariableByteDataRanges },
		{ ARRAY_LENGTH(codePage1252ToUnicodeDataRanges), codePage1252ToUnicodeDataRanges },
		{ ARRAY_LENGTH(unicodeToCodePage1252DataRanges), unicodeToCodePage1252DataRanges },
	NULL,
	NULL
	};
sl@0
   314
sl@0
   315
/**
Estimates how likely it is that a byte sample is encoded in code page 1252.

The estimate starts at 60 and is adjusted while the sample is scanned:
 - a byte that CP1252 does not support (0x81, 0x8d, 0x8f, 0x90, 0x9d) sets the
   estimate to 0 and ends the scan;
 - any other byte in 0x80..0x9f adds 1;
 - 0xf7 or 0xd7 with a decimal digit immediately before and after it adds 5;
 - a byte in 0xa2..0xa5 immediately followed by a decimal digit adds 5.
The result is finally clamped to the range 0..100.

@param aConfidenceLevel On return, the estimate (0..100). The value passed in is ignored.
@param aSample The bytes to examine. An empty sample yields 60.
*/
GLREF_C void IsCharacterSetCP1252(TInt& aConfidenceLevel, const TDesC8& aSample)
	{
	aConfidenceLevel = 60;
	const TInt sampleLength = aSample.Length();

	for (TInt i=0; i<sampleLength; ++i)
		{
		// Read the byte once; every test below looks at the same value.
		const TInt current = aSample[i];
		const TInt previous = i-1;
		const TInt next = i+1;

		// CP1252 includes ASCII as well.
		// 0x80 - 0x9f are control codes in ISO88591, so a byte in that range makes
		// CP1252 a bit more likely - unless it is one of the unsupported values.
		if ((current >= 0x80) && (current <= 0x9f))
			{
			if ((current==0x81)||(current==0x8d)||(current==0x8f)||
				(current==0x90)||(current==0x9d))
				{
				// These code values are not supported by the Codepage CP1252
				aConfidenceLevel = 0;
				break;
				}
			// problem: UTF8 uses the values 0x80-0x9f in more than 50% of its multibyte
			// representation, so if the text was UTF8 the confidence here would hit the
			// roof. Could check to make sure that this is not UTF8.
			aConfidenceLevel+=1;
			}

		// 0xf7 is the division sign and 0xd7 the multiplication sign in CP1252.
		// If the bytes on both sides of the sign are digits, the confidence that the
		// sample is CP1252 increases. A sign at the very start or end of the sample
		// has no neighbour on one side and is ignored.
		if (((current==0xf7) || (current==0xd7)) && previous>=0 && next<sampleLength)
			{
			if ( (aSample[previous] >= 0x30) && (aSample[previous] <= 0x39) &&  // char before is a number
				 (aSample[next] >= 0x30) && (aSample[next] <= 0x39) )           // char after is a number
				{
				aConfidenceLevel+=5;
				}
			}

		// A currency symbol (0xa2 - 0xa5) directly followed by a digit also
		// increases the confidence.
		if ((current>=0xa2) && (current<=0xa5) && next<sampleLength)
			{
			if ((aSample[next] >= 0x30) && (aSample[next] <= 0x39))
				{
				aConfidenceLevel+=5;
				}
			}
		} // for loop

	// Clamp the result to 0..100.
	if (aConfidenceLevel <= 0)
		{
		aConfidenceLevel = 0;
		}
	else if (aConfidenceLevel > 100)
		{
		aConfidenceLevel = 100;
		}
	}