os/textandloc/charconvfw/charconv_fw/src/charconv/cp1252.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: 
    15 *
    16 */
    17 
    18 
    19 #include <e32std.h>
    20 #include <convdata.h>
    21 
    22 #define ARRAY_LENGTH(aArray) (sizeof(aArray)/sizeof((aArray)[0]))
    23 
    24 LOCAL_D const TUint16 keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_1[]=
    25 	{
    26 	0x201a,
    27 	0x0192,
    28 	0x201e,
    29 	0x2026,
    30 	0x2020,
    31 	0x2021,
    32 	0x02c6,
    33 	0x2030,
    34 	0x0160,
    35 	0x2039,
    36 	0x0152
    37 	};
    38 
    39 LOCAL_D const TUint16 keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_2[]=
    40 	{
    41 	0x2018,
    42 	0x2019,
    43 	0x201c,
    44 	0x201d,
    45 	0x2022,
    46 	0x2013,
    47 	0x2014,
    48 	0x02dc,
    49 	0x2122,
    50 	0x0161,
    51 	0x203a,
    52 	0x0153
    53 	};
    54 
    55 LOCAL_D const TUint16 keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_3[]=
    56 	{
    57 	0x017e,
    58 	0x0178
    59 	};
    60 
    61 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange::UData::SKeyedTable16OfIndexedTables16::SKeyedEntry keyedTables16OfIndexedTables16_keyedEntries_codePage1252ToUnicode_1[]=
    62 	{
    63 		{
    64 		0x82,
    65 		0x8c,
    66 		keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_1
    67 		},
    68 		{
    69 		0x91,
    70 		0x9c,
    71 		keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_2
    72 		},
    73 		{
    74 		0x9e,
    75 		0x9f,
    76 		keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_3
    77 		}
    78 	};
    79 
    80 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange::UData::SKeyedTable1616::SEntry keyedTable1616_unicodeToCodePage1252_1[]=
    81 	{
    82 		{
    83 		0x0152,
    84 		0x8c
    85 		},
    86 		{
    87 		0x0153,
    88 		0x9c
    89 		},
    90 		{
    91 		0x0160,
    92 		0x8a
    93 		},
    94 		{
    95 		0x0161,
    96 		0x9a
    97 		},
    98 		{
    99 		0x0178,
   100 		0x9f
   101 		},
   102 		{
   103 		0x017d,
   104 		0x8e
   105 		},
   106 		{
   107 		0x017e,
   108 		0x9e
   109 		},
   110 		{
   111 		0x0192,
   112 		0x83
   113 		},
   114 		{
   115 		0x02c6,
   116 		0x88
   117 		},
   118 		{
   119 		0x02dc,
   120 		0x98
   121 		},
   122 		{
   123 		0x2013,
   124 		0x96
   125 		},
   126 		{
   127 		0x2014,
   128 		0x97
   129 		},
   130 		{
   131 		0x2018,
   132 		0x91
   133 		},
   134 		{
   135 		0x2019,
   136 		0x92
   137 		},
   138 		{
   139 		0x201a,
   140 		0x82
   141 		},
   142 		{
   143 		0x201c,
   144 		0x93
   145 		},
   146 		{
   147 		0x201d,
   148 		0x94
   149 		},
   150 		{
   151 		0x201e,
   152 		0x84
   153 		},
   154 		{
   155 		0x2020,
   156 		0x86
   157 		},
   158 		{
   159 		0x2021,
   160 		0x87
   161 		},
   162 		{
   163 		0x2022,
   164 		0x95
   165 		},
   166 		{
   167 		0x2026,
   168 		0x85
   169 		},
   170 		{
   171 		0x2030,
   172 		0x89
   173 		},
   174 		{
   175 		0x2039,
   176 		0x8b
   177 		},
   178 		{
   179 		0x203a,
   180 		0x9b
   181 		},
   182 		{
   183 		0x20ac,
   184 		0x80
   185 		},
   186 		{
   187 		0x2122,
   188 		0x99
   189 		}
   190 	};
   191 
   192 LOCAL_D const SCnvConversionData::SVariableByteData::SRange codePage1252VariableByteDataRanges[]=
   193 	{
   194 		{
   195 		0x00,
   196 		0xff,
   197 		0,
   198 		0
   199 		}
   200 	};
   201 
   202 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange codePage1252ToUnicodeDataRanges[]=
   203 	{
   204 		{
   205 		0x00,
   206 		0x7f,
   207 		SCnvConversionData::SOneDirectionData::SRange::EDirect,
   208 		0,
   209 		0,
   210 			{
   211 			0,
   212 			0
   213 			}
   214 		},
   215 		{
   216 		0xa0,
   217 		0xff,
   218 		SCnvConversionData::SOneDirectionData::SRange::EDirect,
   219 		0,
   220 		0,
   221 			{
   222 			0,
   223 			0
   224 			}
   225 		},
   226 		{
   227 		0x80,
   228 		0x80,
   229 		SCnvConversionData::SOneDirectionData::SRange::EOffset,
   230 		0,
   231 		0,
   232 			{
   233 			STATIC_CAST(TUint, 8236),
   234 			0
   235 			}
   236 		},
   237 		{
   238 		0x8e,
   239 		0x8e,
   240 		SCnvConversionData::SOneDirectionData::SRange::EOffset,
   241 		0,
   242 		0,
   243 			{
   244 			STATIC_CAST(TUint, 239),
   245 			0
   246 			}
   247 		},
   248 		{
   249 		0x82,
   250 		0x9f,
   251 		SCnvConversionData::SOneDirectionData::SRange::EKeyedTable16OfIndexedTables16,
   252 		0,
   253 		0,
   254 			{
   255 			UData_SKeyedTable16OfIndexedTables16(keyedTables16OfIndexedTables16_keyedEntries_codePage1252ToUnicode_1)
   256 			}
   257 		}
   258 	};
   259 
   260 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToCodePage1252DataRanges[]=
   261 	{
   262 		{
   263 		0x0000,
   264 		0x007f,
   265 		SCnvConversionData::SOneDirectionData::SRange::EDirect,
   266 		1,
   267 		0,
   268 			{
   269 			0,
   270 			0
   271 			}
   272 		},
   273 		{
   274 		0x00a0,
   275 		0x00ff,
   276 		SCnvConversionData::SOneDirectionData::SRange::EDirect,
   277 		1,
   278 		0,
   279 			{
   280 			0,
   281 			0
   282 			}
   283 		},
   284 		{
   285 		0x0152,
   286 		0x2122,
   287 		SCnvConversionData::SOneDirectionData::SRange::EKeyedTable1616,
   288 		1,
   289 		0,
   290 			{
   291 			UData_SKeyedTable1616(keyedTable1616_unicodeToCodePage1252_1)
   292 			}
   293 		}
   294 	};
   295 
   296 GLREF_D const SCnvConversionData codePage1252ConversionData=
   297 	{
   298 	SCnvConversionData::EUnspecified,
   299 		{
   300 		ARRAY_LENGTH(codePage1252VariableByteDataRanges),
   301 		codePage1252VariableByteDataRanges
   302 		},
   303 		{
   304 		ARRAY_LENGTH(codePage1252ToUnicodeDataRanges),
   305 		codePage1252ToUnicodeDataRanges
   306 		},
   307 		{
   308 		ARRAY_LENGTH(unicodeToCodePage1252DataRanges),
   309 		unicodeToCodePage1252DataRanges
   310 		},
   311 	NULL,
   312 	NULL
   313 	};
   314 
   315 GLREF_C void IsCharacterSetCP1252(TInt& aConfidenceLevel, const TDesC8& aSample)
   316 	{
   317 	aConfidenceLevel = 60;
   318 	TInt sampleLength = aSample.Length();
   319 
   320 	for (TInt i=0; i<sampleLength; ++i)
   321 		{
   322 		// CP1252 includes ASCII as well
   323 		// first check if the char is in the range 0x80 - 0x9f (controls codes in ISO88591)
   324 		// If it is in that range then the likelihood that it's CP1252 is a bit higher
   325 		if ((aSample[i] >= 0x80) && (aSample[i] <= 0x9f))
   326 			{
   327 			if((aSample[i]==0x81)||(aSample[i]==0x8D)||(aSample[i]==0x8f)||
   328 				(aSample[i]==0x90)||(aSample[i]==0x9d))
   329 				{
   330 				// These code values are not supported by the Codepage CP1252
   331 				aConfidenceLevel = 0;
   332 				break;
   333 				}
   334 			else
   335 				{
   336 				// problem: UTF8 uses the values 0x80-0x9f in more than 50% of it's multibyte representation
   337 				// so if the text was UTF8 .... the confidence here would hit the roof. Could check to make 
   338 				// sure that this is not UTF8
   339 				aConfidenceLevel+=1;
   340 				}
   341 			}
   342 		TInt increment1 = i+1;
   343 		TInt decrement1 = i-1;
   344 		// 0xf7 is the division symbol in CP1252.
   345 		// 0xd7 is the division symbol in CP1252.If char on either side of the division
   346 		// symbol is a number then the confidence that it's ISO88591 increases
   347 		if( decrement1>= 0 && ((aSample[i]==0xf7) || (aSample[i]==0xd7)) && increment1<sampleLength)
   348 			{
   349 			
   350 			if (increment1 >= sampleLength)
   351 				break;
   352 			if ( (aSample[decrement1] >= 0x30) && (aSample[decrement1] <= 0x39) &&  // char before is a number
   353 				 (aSample[increment1] >= 0x30) && (aSample[increment1] <= 0x39) )   // char after is a number
   354 				{
   355 				aConfidenceLevel+=5;
   356 				}
   357 			}
   358 		// Can also use the currency symbol to increase confidence if the char after a 
   359 		// currency symbol is numeric
   360 		if((aSample[i]>=0xa2) && (aSample[i] <= 0xa5) && increment1<sampleLength)
   361 			{
   362 			if ((aSample[increment1] >= 0x30) && (aSample[increment1] <= 0x39))
   363 				{
   364 				aConfidenceLevel+=5; 
   365 				}
   366 			}
   367 		} // for loop
   368 	aConfidenceLevel =(aConfidenceLevel >0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
   369 	}