1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/textandloc/charconvfw/charconvplugins/src/plugins/j5.cpp Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,823 @@
1.4 +/*
1.5 +* Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
1.6 +* All rights reserved.
1.7 +* This component and the accompanying materials are made available
1.8 +* under the terms of "Eclipse Public License v1.0"
1.9 +* which accompanies this distribution, and is available
1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
1.11 +*
1.12 +* Initial Contributors:
1.13 +* Nokia Corporation - initial contribution.
1.14 +*
1.15 +* Contributors:
1.16 +*
1.17 +* Description:
1.18 +* J5 charconv character converter
1.19 +*
1.20 +*/
1.21 +
1.22 +
1.23 +#include <e32std.h>
1.24 +#include <charconv.h>
1.25 +#include <ecom/implementationproxy.h>
1.26 +#include <utf.h>
1.27 +#include <charactersetconverter.h>
1.28 +#include <convutils.h>
1.29 +#include "shiftjis.h"
1.30 +#include "jisbase.h"
1.31 +#include "j5.h"
1.32 +
1.33 +#include "jisx0201.h"
1.34 +#include "jisx0208.h"
1.35 +#include "jisx0212.h"
1.36 +
1.37 +#include "featmgr/featmgr.h"
1.38 +
1.39 +/**
1.40 + J5 will use up to KMaxSizeAutoDetectSample to try to deterine the format of data.
1.41 + */
1.42 +const TInt KMaxSizeAutoDetectSample = 1000;
1.43 +
1.44 +const TUint8 KEscape = 0x1b;
1.45 +const TInt KByteOrderMark = 0xfeff;
1.46 +
1.47 +const TDesC8& CJ5Converter::ReplacementForUnconvertibleUnicodeCharacters()
1.48 + {
1.49 + return CnvShiftJis::ReplacementForUnconvertibleUnicodeCharacters();
1.50 + }
1.51 +
1.52 +/**
1.53 + This API should not be used as it is ambiguous as to what encoding is required.
1.54 + The user should instead call the specific plug-in for the appropriate conversion.
1.55 + J5 ConvertFromUnicode() will convert to UTF8 as default.
1.56 +@internalTechnology
1.57 + */
1.58 +TInt CJ5Converter::ConvertFromUnicode(
1.59 + CCnvCharacterSetConverter::TEndianness /* aDefaultEndiannessOfForeignCharacters */,
1.60 + const TDesC8& /* aReplacementForUnconvertibleUnicodeCharacters */,
1.61 + TDes8& aForeign,
1.62 + const TDesC16& aUnicode,
1.63 + CCnvCharacterSetConverter::TArrayOfAscendingIndices& /* aIndicesOfUnconvertibleCharacters */)
1.64 + {
1.65 + return CnvUtfConverter::ConvertFromUnicodeToUtf8(aForeign, aUnicode);
1.66 + }
1.67 +
1.68 +/**
1.69 + This will automatically determine one of the five supported encodings
1.70 + to use and convert accordingly. This plugin method is available to the
1.71 + user though the CCnvCharacterSetConverter::ConvertToUnicode() method.
1.72 + There is no way for the caller to determine which encoding has been used.
1.73 +
1.74 + NOTE: For debugging the selected character set is returned in the state.
1.75 +
1.76 + @released 9.1
1.77 + @param aDefaultEndiannessOfForeignCharacters The default endian-ness to use when reading characters
1.78 + in the foreign character set.
1.79 + @param aUnicode On return, contains the text converted into Unicode.
1.80 + @param aForeign The non-Unicode source text to be converted.
1.81 + @param aState Used to save state information across multiple calls
1.82 + to <code>ConvertToUnicode()</code>.
1.83 + @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes which were not
1.84 + converted.
1.85 + @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, contains the index of the first bytein the
1.86 + input text that could not be converted. A negative
1.87 + value indicates that all the characters were
1.88 + converted.
1.89 + @return The number of unconverted bytes left at the end of the input descriptor
1.90 + (e.g. because the output descriptor is not long enough to hold all the text),
1.91 + or one of the error values defined in TError.
1.92 + @internalTechnology
1.93 +*/
1.94 +TInt CJ5Converter::ConvertToUnicode(
1.95 + CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
1.96 + TDes16& aUnicode,
1.97 + const TDesC8& aForeign,
1.98 + TInt& aState,
1.99 + TInt& aNumberOfUnconvertibleCharacters,
1.100 + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
1.101 + {
1.102 + // As the aState parameter is used to pass back the detected value
1.103 + // use a "hidden" internal state variable.
1.104 + TInt internalState = CCnvCharacterSetConverter::KStateDefault;
1.105 +
1.106 + // determine the encoding type and then decode appropriatly
1.107 + switch ( DetectEncoding(aDefaultEndiannessOfForeignCharacters, aForeign))
1.108 + {
1.109 + case EShiftjis:
1.110 + aState = EShiftjis;
1.111 + return CnvShiftJis::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign,
1.112 + aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
1.113 +
1.114 + case EIso2022jp1:
1.115 + aState = EIso2022jp1;
1.116 + return CnvJisBase::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState,
1.117 + aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
1.118 +
1.119 + case EEucjp:
1.120 + aState = EEucjp;
1.121 + return ConvertEEucjpToUnicode(
1.122 + aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState,
1.123 + aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
1.124 +
1.125 + case EUcs2:
1.126 + aState = EUcs2;
1.127 + return ConvertUcs2ToUnicode( aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign,
1.128 + aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
1.129 +
1.130 + case EUtf8:
1.131 + aState = EUtf8;
1.132 + return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
1.133 +
1.134 + default:
1.135 + // fall though to the default, which is decode as UTF8
1.136 + aState = EUnknown;
1.137 + break;
1.138 + }
1.139 +
1.140 + // decode as UTF8
1.141 + return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
1.142 + }
1.143 +
1.144 +/**
1.145 + This API is used by CCnvCharacterSetConverter::AutoDetectCharacterSetL().
1.146 + This method returns a value between 0 and 100, indicating how likely it
1.147 + is that this is the correct converter, for the text supplied. As J5 is
1.148 + NOT intended to be used with the existing auto-detect mechanism, it will
1.149 + always return 0
1.150 + @internalTechnology
1.151 + */
1.152 +TBool CJ5Converter::IsInThisCharacterSetL(
1.153 + TBool& aSetToTrue,
1.154 + TInt& aConfidenceLevel,
1.155 + const TDesC8& /* aSample */)
1.156 + {
1.157 + /*
1.158 + aSetToTrue - This value should be set to ETrue. It is used to indicate to
1.159 + CCnvCharacterSetConverter::AutoDetectCharacterSetL() that the plug-in DLL
1.160 + is implementing a function of this signature and is therefore not the empty
1.161 + */
1.162 + aSetToTrue=ETrue;
1.163 +
1.164 + /* no need to look at the sample as this always returns 0
1.165 + as the autodetect feature is not supported by the J5 plug-in
1.166 + */
1.167 + aConfidenceLevel=0;
1.168 + return ETrue;
1.169 + }
1.170 +
1.171 +CJ5Converter* CJ5Converter::NewL()
1.172 + {
1.173 + CJ5Converter* self = new(ELeave) CJ5Converter();
1.174 + CleanupStack::PushL(self);
1.175 + self->ConstructL();
1.176 + CleanupStack::Pop(self);
1.177 + return self;
1.178 + }
1.179 +
1.180 +CJ5Converter::~CJ5Converter()
1.181 + {
1.182 + FeatureManager::UnInitializeLib();
1.183 + }
1.184 +
1.185 +CJ5Converter::CJ5Converter()
1.186 + {
1.187 + }
1.188 +
1.189 +void CJ5Converter::ConstructL()
1.190 + {
1.191 + FeatureManager::InitializeLibL();
1.192 + }
1.193 +
1.194 +const TImplementationProxy ImplementationTable[] =
1.195 + {
1.196 +#ifdef KDDIAU_TEST
1.197 + // for the test build use a special test UID
1.198 + IMPLEMENTATION_PROXY_ENTRY(0x01000002, CJ5Converter::NewL)
1.199 +#else
1.200 + IMPLEMENTATION_PROXY_ENTRY(KCharacterSetIdentifierJ5, CJ5Converter::NewL)
1.201 +#endif
1.202 + };
1.203 +
1.204 +EXPORT_C const TImplementationProxy* ImplementationGroupProxy(TInt& aTableCount)
1.205 + {
1.206 + aTableCount = sizeof(ImplementationTable) / sizeof(TImplementationProxy);
1.207 +
1.208 + return ImplementationTable;
1.209 + }
1.210 +
1.211 +/**
1.212 + DetectEncoding determine the characterset encoding.
1.213 + The logic for this detection is based on the information in CJKV by Ken Lunde.
1.214 + A detailed diagram of this logic is in the J5 how to document section 2.4
1.215 + @return The detected character set as a enum CJ5Converter.
1.216 + @internalTechnology
1.217 + */
1.218 +enum CJ5Converter::TJ5Encoding CJ5Converter::DetectEncoding(
1.219 + CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters ,
1.220 + const TDesC8& aForeign)
1.221 + {
1.222 +
1.223 + // first check for UCS2
1.224 + CCnvCharacterSetConverter::TEndianness ucs2Endianness = CCnvCharacterSetConverter::ELittleEndian;
1.225 + if ( DetectUcs2(aForeign, ucs2Endianness ))
1.226 + {
1.227 + // if ucs2 is detected pass back the detected endianess
1.228 + aDefaultEndiannessOfForeignCharacters = ucs2Endianness;
1.229 + return EUcs2;
1.230 + }
1.231 +
1.232 + // next try EUC_JP
1.233 + TInt eucJpValidBytes = 0;
1.234 + CJ5Converter::TDectectCharacterSet result = DetectEucJp( aForeign, eucJpValidBytes );
1.235 + if ( result == EIsCharacterSet )
1.236 + {
1.237 + return EEucjp;
1.238 + }
1.239 +
1.240 + // next try Iso 2020JP
1.241 + if ( DetectIso2022( aForeign ) == EIsCharacterSet )
1.242 + {
1.243 + return EIso2022jp1;
1.244 + }
1.245 +
1.246 + // next try Utf8
1.247 + if ( DetectUtf8( aForeign ) == EIsCharacterSet )
1.248 + {
1.249 + return EUtf8;
1.250 + }
1.251 +
1.252 + // shiftjis
1.253 + TInt shiftjisValidBytes = 0;
1.254 + result = DetectShiftJis( aForeign, shiftjisValidBytes );
1.255 + if ( result == EIsCharacterSet )
1.256 + {
1.257 + return EShiftjis;
1.258 + }
1.259 +
1.260 + // no clear winner so go for the best
1.261 + TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);
1.262 +
1.263 + // if more than half is shiftjis and more shiftjis than EUC_JP,
1.264 + if ((shiftjisValidBytes > eucJpValidBytes ) && (shiftjisValidBytes * 2> sampleLength))
1.265 + return EShiftjis;
1.266 +
1.267 + // if more than half is EUC_JP and more EUC_JP than shiftjis,
1.268 + if ((eucJpValidBytes > shiftjisValidBytes ) && (eucJpValidBytes * 2> sampleLength))
1.269 + return EEucjp;
1.270 +
1.271 + // return the default
1.272 + return EUcs2;
1.273 + }
1.274 +
1.275 +
1.276 +/**
1.277 + Check if UCS2.
1.278 + If the first two bytes are the Unicode Endian Specifiers (0xfffe or 0xfeff)
1.279 + then this must be UCS2. Otherwise try lookiing for 0x**00 or 0x00**
1.280 + @param A sample of data to be checked
1.281 + @param The Endianness if USC2 is detected
1.282 + @return ETrue if UCS2 else EFalse
1.283 + @internalTechnology
1.284 + */
1.285 +TBool CJ5Converter::DetectUcs2( const TDesC8& aForeign,
1.286 + CCnvCharacterSetConverter::TEndianness& aTEndianness )
1.287 + {
1.288 + // if the sample is not big enough
1.289 + if (aForeign.Length() < 2)
1.290 + {
1.291 + return EFalse;
1.292 + }
1.293 + else if (aForeign[0]==0xff && aForeign[1]==0xfe )
1.294 + {
1.295 + // we have found a Little Endian Byte order mark
1.296 + aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
1.297 + return ETrue;
1.298 + }
1.299 + else if (aForeign[0]==0xfe && aForeign[1]==0xff )
1.300 + {
1.301 + // we have found a Big Endian Byte order mark
1.302 + aTEndianness = CCnvCharacterSetConverter::EBigEndian;
1.303 + return ETrue;
1.304 + }
1.305 +
1.306 + // Next check for sequences of 0x**00 or 0x00** as UCS-2 is the only charset that
1.307 + // specifies 0x**00 or 0x00** (according to endianness) for the ASCII range of characters.
1.308 + // NB: This will fail if there are no ASCII characters in the text.
1.309 + TInt sampleLength = aForeign.Length();
1.310 + sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
1.311 +
1.312 + // check the sample for sequences of 0x**00 or 0x00**
1.313 + TInt bigEndianConfidence = 0;
1.314 + TInt littleEndianConfidence = 0;
1.315 + TInt i=0;
1.316 + for(;i< (sampleLength-1); i+=2)
1.317 + {
1.318 + if( aForeign[i] == 0x00)
1.319 + {
1.320 + bigEndianConfidence +=2;
1.321 + }
1.322 + else if ( aForeign[i+1] == 0x00)
1.323 + {
1.324 + littleEndianConfidence +=2;
1.325 + }
1.326 + }
1.327 +
1.328 + // which occurs most BE or LE
1.329 + TInt confidenceLevel = 0;
1.330 + if (bigEndianConfidence > littleEndianConfidence)
1.331 + {
1.332 + aTEndianness = CCnvCharacterSetConverter::EBigEndian;
1.333 + confidenceLevel = bigEndianConfidence;
1.334 + }
1.335 + else
1.336 + {
1.337 + aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
1.338 + confidenceLevel = littleEndianConfidence;
1.339 + }
1.340 +
1.341 + // if more than 97% count as UCS2
1.342 + if ( confidenceLevel * 100/sampleLength > 97)
1.343 + return ETrue;
1.344 +
1.345 + return EFalse;
1.346 + }
1.347 +
1.348 +/**
1.349 + Check if ShiftJis (reference CJKV by Ken Lunde page 175)
1.350 + @param A sample of data to be checked
1.351 + @param The number of input bytes that can be converted
1.352 + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
1.353 + @internalTechnology
1.354 + */
1.355 +enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectShiftJis( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
1.356 + {
1.357 + // Get the sample length
1.358 + TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
1.359 +
1.360 + TInt i=0;
1.361 + aNumberOfBytesConverted = 0;
1.362 +
1.363 + TText8 character;
1.364 + TText8 characterPlus1;
1.365 + TText8 characterPlus2;
1.366 +
1.367 + // scan the sample text looking for valid shiftjis data
1.368 + while ( i < sampleLength )
1.369 + {
1.370 + // get the next few characters, use 0 if there is no more sample
1.371 + // as this will not match any test.
1.372 + character = aForeign[i];
1.373 + characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
1.374 + characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
1.375 +
1.376 + // SHIFTJIS - 0x8e to 0x9f followed by 0x40 to 0xfc
1.377 + if ((character >= 0x81) && (character <= 0x9f) &&
1.378 + (characterPlus1 >= 0x40) && (characterPlus1 <= 0xfc) )
1.379 + {
1.380 + // this is SHIFTJIS unless it is EUC JP code set 2 or 3
1.381 + if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF))
1.382 + {
1.383 + // this could be EUC JP code set 2 (or shiftjis)
1.384 + aNumberOfBytesConverted+=2;
1.385 + i++;
1.386 + }
1.387 + else if ((character == 0x8F) &&
1.388 + (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) &&
1.389 + (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
1.390 + {
1.391 + // this could be EUC JP code set 3 (or shiftjis)
1.392 + aNumberOfBytesConverted+=3;
1.393 + i+=2;
1.394 + }
1.395 + else
1.396 + {
1.397 + // this can only be shift jis
1.398 + return EIsCharacterSet;
1.399 + }
1.400 + }
1.401 +
1.402 + // SHIFTJIS - 0xE0 to 0xEF followed by .....
1.403 + else if ((character >= 0xE0) && (character <= 0xEF))
1.404 + {
1.405 + // 0x40 to 0xFC which overlaps UTF8 between 0x80 and 0xBF
1.406 + // including Mopera extension to shiftjis from 0xEF80 to 0xEFFC
1.407 +
1.408 + if ( (characterPlus1 >= 0x40) && (characterPlus1 <= 0x7E) )
1.409 + {
1.410 + // this can only be shift jis
1.411 + return EIsCharacterSet;
1.412 + }
1.413 + else if ( (characterPlus1 >= 0xC0) && (characterPlus1 <= 0xFC) )
1.414 + {
1.415 + // this could be EUC JP code set 1
1.416 + aNumberOfBytesConverted+=2;
1.417 + i++;
1.418 + }
1.419 +
1.420 + // problem here is the overlap between the UTF8 and shiftjis
1.421 + else if ( (characterPlus1 >= 0x80) && (characterPlus1 <= 0xBF) )
1.422 + {
1.423 + // this could be shiftjis or utf8
1.424 + aNumberOfBytesConverted+=2;
1.425 + i++;
1.426 + }
1.427 + }
1.428 + // half width katakana A1-DF
1.429 + else if ((character >= 0xA1) && (character <= 0xDF))
1.430 + {
1.431 + aNumberOfBytesConverted+=1;
1.432 + }
1.433 + // ASCII or JIS-Roman 20-7e
1.434 + else if ( ((character >= 0x20) && (character <= 0x7E)) || (character == 0x0A) || (character == 0x0D))
1.435 + {
1.436 + aNumberOfBytesConverted+=1;
1.437 + }
1.438 + else
1.439 + {
1.440 + // This is not decoding as shiftjis, so reject
1.441 + aNumberOfBytesConverted =0;
1.442 + return EIsNotCharacterSet;
1.443 + }
1.444 + i++;
1.445 + }
1.446 +
1.447 + // if all the characters could be converted
1.448 + if (aNumberOfBytesConverted == sampleLength)
1.449 + {
1.450 + return EIsCharacterSet;
1.451 + }
1.452 + else if (aNumberOfBytesConverted == 0)
1.453 + {
1.454 + return EIsNotCharacterSet;
1.455 + }
1.456 + else
1.457 + {
1.458 + return EMaybeCharacterSet;
1.459 + }
1.460 + }
1.461 +
1.462 +/**
1.463 + Check if UTF8 (reference CJKV by Ken Lunde page 189)
1.464 + @param A sample of data to be checked
1.465 + @param The number of input bytes that can be converted
1.466 + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
1.467 + @internalTechnology
1.468 + */
1.469 +enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectUtf8( const TDesC8& aForeign )
1.470 + {
1.471 + // Get the sample length
1.472 + TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
1.473 +
1.474 + TInt i=0;
1.475 + TText8 character;
1.476 + TText8 characterPlus1;
1.477 + TText8 characterPlus2;
1.478 + TText8 characterPlus3;
1.479 +
1.480 + // scan the sample text looking for valid UTF8
1.481 + while ( i < sampleLength )
1.482 + {
1.483 + // get the next few characters, use 0 if there is no more sample
1.484 + // as this will not match any test.
1.485 + character = aForeign[i];
1.486 + characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
1.487 + characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
1.488 + characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0);
1.489 +
1.490 + // UTF8 range 110xxxxx followed by one valid UTF8 bytes
1.491 + if(((character & 0xe0)==0xc0) && (( characterPlus1 & 0xc0)==0x80) )
1.492 + {
1.493 + // two bytes of valid UTF8 found
1.494 + i+=2;
1.495 + }
1.496 + // UTF8 range 1110xxxx followed by two valid UTF8 bytes
1.497 + else if(((character & 0xf0)==0xe0) && (( characterPlus1 & 0xc0)==0x80) && (( characterPlus2 & 0xc0)==0x80))
1.498 + {
1.499 + // three bytes of valid UTF8 found
1.500 + i+=3;
1.501 + }
1.502 + // UTF8 range 11110xxx followed by three valid UTF8 bytes
1.503 + else if(((character & 0xf8)==0xf0) && (( characterPlus1 & 0xc0)==0x80)
1.504 + && (( characterPlus2 & 0xc0)==0x80) && (( characterPlus3 & 0xc0)==0x80) )
1.505 + {
1.506 + // four bytes of valid UTF8 found
1.507 + i+=4;
1.508 + }
1.509 +
1.510 + // ascii range 0 to 0x7F
1.511 + else if((character & 0x80)==0x00)
1.512 + {
1.513 + // The value of character is in the range 0x00-0x7f
1.514 + // UTF8 maintains ASCII transparency. So it's a valid UTF8.
1.515 + i++;
1.516 + }
1.517 + // if the sample data is longer than KMaxSizeAutoDetectSample then except anything
1.518 + // for the last two bytes as they may not appear valid without more data
1.519 + else if( i >= (KMaxSizeAutoDetectSample -2) )
1.520 + {
1.521 + i++;
1.522 + }
1.523 + else
1.524 + {
1.525 + // This is not decoding as UTF8 so reject
1.526 + return EIsNotCharacterSet;
1.527 + }
1.528 + }
1.529 +
1.530 + // All the characters could be converted
1.531 + return EIsCharacterSet;
1.532 +
1.533 + }
1.534 +
1.535 +
1.536 +/**
1.537 + Check if ISO2022JP by lookiing for the escape sequences.
1.538 + @param A sample of data to be checked
1.539 + @param The number of input bytes that can be converted
1.540 + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
1.541 + @internalTechnology
1.542 + */
1.543 +enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectIso2022( const TDesC8& aForeign )
1.544 + {
1.545 + // Get the sample length
1.546 + TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
1.547 +
1.548 + TInt i=0;
1.549 + TText8 character;
1.550 + TText8 characterPlus1;
1.551 + TText8 characterPlus2;
1.552 + TText8 characterPlus3;
1.553 + TText8 characterPlus4;
1.554 + TText8 characterPlus5;
1.555 +
1.556 + // scan the sample text looking for valid UTF8
1.557 + while ( i < sampleLength )
1.558 + {
1.559 + // get the next few characters, use 0 if there is no more sample
1.560 + // as this will not match any test.
1.561 + character = aForeign[i];
1.562 + characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
1.563 + characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
1.564 + characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0);
1.565 +
1.566 +
1.567 + // check for the JIS escape sequences of ISO 2022Jp
1.568 + // These values have been taken from JISBASE_SHARED
1.569 + if (character == KEscape)
1.570 + {
1.571 + // Escape Sequence For Jis C6226_1978 \x1b\x24\x40
1.572 + if ((characterPlus1 == 0x24) && (characterPlus2 == 0x40))
1.573 + {
1.574 + return EIsCharacterSet;
1.575 + }
1.576 +
1.577 + // Escape Sequence For Jis X0208_1983 \x1b\x24\x42
1.578 + else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x42))
1.579 + {
1.580 + return EIsCharacterSet;
1.581 + }
1.582 +
1.583 + // Escape Sequence For Jis Roman \x1b\x28\x4a
1.584 + else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x4A))
1.585 + {
1.586 + return EIsCharacterSet;
1.587 + }
1.588 +
1.589 + // Escape Sequence For Jis RomanIncorrect \x1b\x28\x48
1.590 + else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x48))
1.591 + {
1.592 + return EIsCharacterSet;
1.593 + }
1.594 +
1.595 + // Escape Sequence For Ascii \x1b\x28\x42
1.596 + else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x42))
1.597 + {
1.598 + return EIsCharacterSet;
1.599 + }
1.600 +
1.601 + // Escape Sequence For EscapeSequenceForHalfWidthKatakana \x1b\x28\x49
1.602 + else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x49))
1.603 + {
1.604 + return EIsCharacterSet;
1.605 + }
1.606 +
1.607 + // Escape Sequence For Jis X0208_199x \x1b\x26\x40\x1b\x24\x42
1.608 + else if ((characterPlus1 == 0x26) && (characterPlus2 == 0x40))
1.609 + {
1.610 + characterPlus4 = ( i < (sampleLength-4) ? aForeign[i+4]:0);
1.611 + characterPlus5 = ( i < (sampleLength-5) ? aForeign[i+5]:0);
1.612 +
1.613 + if ((characterPlus3 == 0x1b) && (characterPlus4 == 0x24) && (characterPlus5 == 0x42))
1.614 + {
1.615 + return EIsCharacterSet;
1.616 + }
1.617 + }
1.618 + // Escape Sequence For Jis X0212_1990 \x1b\x24\x28\x44
1.619 + else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x28))
1.620 + {
1.621 + if (characterPlus3 == 0x44)
1.622 + {
1.623 + return EIsCharacterSet;
1.624 + }
1.625 + }
1.626 +
1.627 + // check for the JIS escape sequences of ISO 2022Jp "B@" x42 x40
1.628 + else if ((characterPlus1 == 'B') || (characterPlus1 == '@'))
1.629 + {
1.630 + return EIsCharacterSet;
1.631 + }
1.632 +
1.633 + } // end of if ( character == KEscape )
1.634 +
1.635 + i++;
1.636 + }
1.637 +
1.638 + // if escape sequences have been found then this is not ISO2022
1.639 + return EIsNotCharacterSet;
1.640 +
1.641 + }
1.642 +
1.643 +
1.644 +/**
1.645 + Check if EUC JP (reference CJKV by Ken Lunde page 164)
1.646 + @param A sample of data to be checked
1.647 + @param The number of input bytes that can be converted
1.648 + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
1.649 + @internalTechnology
1.650 + */
1.651 +CJ5Converter::TDectectCharacterSet CJ5Converter::DetectEucJp( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
1.652 + {
1.653 + // Get the sample length
1.654 + TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
1.655 +
1.656 + TInt i=0;
1.657 + aNumberOfBytesConverted = 0;
1.658 +
1.659 + TText8 character;
1.660 + TText8 characterPlus1;
1.661 + TText8 characterPlus2;
1.662 +
1.663 + // scan the sample text looking for valid shiftjis data
1.664 + while ( i < sampleLength )
1.665 + {
1.666 + // get the next few characters, use 0 if there is no more sample
1.667 + // as this will not match any test.
1.668 + character = aForeign[i];
1.669 + characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
1.670 + characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
1.671 +
1.672 + // EUCJP code set 0 0x21-0x7e
1.673 + if ( (character >= 0x21) && (character <= 0x7e))
1.674 + {
1.675 + aNumberOfBytesConverted++;
1.676 + }
1.677 + else if ( (character == 0x0a) || (character == 0x0d))
1.678 + {
1.679 + aNumberOfBytesConverted++;
1.680 + }
1.681 + // EUCJP code set 1
1.682 + else if ( (character >= 0xa1) && (character <= 0xff)
1.683 + && (characterPlus1 >= 0xa1) && (characterPlus1 <= 0xff) )
1.684 + {
1.685 + aNumberOfBytesConverted+=2;
1.686 + i++;
1.687 + }
1.688 +
1.689 + // EUC JP code set 2, starts with the EUC JP SS2 character (0x8E)
1.690 + // and is followed by character in range 0xA1- 0xDF
1.691 + else if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) )
1.692 + {
1.693 + // this could be 2 bytes of EUC JP code set 2
1.694 + aNumberOfBytesConverted += 2;
1.695 + i++;
1.696 + }
1.697 + // EUC JP code set 3, starts with the EUC JP SS3 character (0x8F)
1.698 + // and is followed by two characters in range A1- DF A1 -FE
1.699 + else if ((character == 0x8F) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF)
1.700 + && (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
1.701 + {
1.702 + // this could be 3 bytes of EUC JP code set 3
1.703 + aNumberOfBytesConverted += 3;
1.704 + i+=2;
1.705 + }
1.706 + else
1.707 + {
1.708 + // This is not a valid decoding as EUC JP so reject
1.709 + return EIsNotCharacterSet;
1.710 + }
1.711 + i++;
1.712 + }
1.713 +
1.714 +
1.715 + // if all the characters could be converted
1.716 + if (aNumberOfBytesConverted == sampleLength)
1.717 + {
1.718 + return EIsCharacterSet;
1.719 + }
1.720 + else if (aNumberOfBytesConverted == 0)
1.721 + {
1.722 + return EIsNotCharacterSet;
1.723 + }
1.724 + else
1.725 + {
1.726 + return EMaybeCharacterSet;
1.727 + }
1.728 + }
1.729 +
1.730 +
1.731 +/**
1.732 + Convert from UCS2 (Universal Character Set containing two bytes) to unicode
1.733 + Remove any byte order marks in the UCSs.
1.734 + @param aUnicode Contains the converted text in the Unicode character set.
1.735 + @param aForeign The non-Unicode source text to be converted
1.736 + @param aNumberOfUnconvertibleCharacters Contains the number of bytes which were not converted.
1.737 + @param aIndexOfFirstByteOfFirstUnconvertibleCharacter The index of the first byte of the first unconvertible character.
1.738 + @return the number of bytes converted
1.739 + @internalTechnology
1.740 + */
1.741 + TInt CJ5Converter::ConvertUcs2ToUnicode(CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters,
1.742 + TDes16& aUnicode,
1.743 + const TDesC8& aForeign,
1.744 + TInt& aNumberOfUnconvertibleCharacters,
1.745 + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
1.746 +
1.747 + {
1.748 + TInt numberOfBytesConverted = 0;
1.749 + TInt numberOfUnicodeCharacters =0;
1.750 + TChar nextChar;
1.751 +
1.752 + // start at begining of the output buffer provided
1.753 + aUnicode.Zero();
1.754 +
1.755 + // while there is at least 2 bytes of data to convert and space in the output buffer
1.756 + while ( (numberOfBytesConverted+1 < aForeign.Size()) && (numberOfUnicodeCharacters < aUnicode.MaxLength()) )
1.757 + {
1.758 + if (aDefaultEndiannessOfForeignCharacters == CCnvCharacterSetConverter::ELittleEndian )
1.759 + {
1.760 + // ELittleEndian 0x??00
1.761 + nextChar = aForeign[numberOfBytesConverted] + ( aForeign[numberOfBytesConverted+1] << 8);
1.762 + }
1.763 + else
1.764 + {
1.765 + // EBigEndian 0x00??
1.766 + nextChar = ( aForeign[numberOfBytesConverted] <<8 ) + aForeign[numberOfBytesConverted+1];
1.767 + }
1.768 +
1.769 + // save the unicode character extracted unless it's a BOM
1.770 + if ( nextChar != KByteOrderMark )
1.771 + {
1.772 + aUnicode.Append( nextChar );
1.773 + numberOfUnicodeCharacters++;
1.774 + }
1.775 +
1.776 + numberOfBytesConverted+=2;
1.777 + }
1.778 +
1.779 + // there are no uncovertable characters with UCS2, but there could be
1.780 + aNumberOfUnconvertibleCharacters = 0;
1.781 + // a negative value indicates that all characters converted
1.782 + aIndexOfFirstByteOfFirstUnconvertibleCharacter = -1;
1.783 +
1.784 + // returns the number of unconverted bytes left at the end of the input descriptor
1.785 + // Note there could be 1 byte left over if an odd number of bytes provided for conversion
1.786 + return aForeign.Size() - numberOfBytesConverted;
1.787 + }
1.788 +
1.789 +/**
1.790 + Convert from EUC_JP (Extended Unix Code encoding for Japanese)
1.791 + Using the standard Charconv method of an array of methods
1.792 + @return the number of bytes converted
1.793 + @internalTechnology
1.794 + */
1.795 + TInt CJ5Converter::ConvertEEucjpToUnicode(
1.796 + CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
1.797 + TDes16& aUnicode,
1.798 + const TDesC8& aForeign,
1.799 + TInt& /*aState*/,
1.800 + TInt& aNumberOfUnconvertibleCharacters,
1.801 + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
1.802 + {
1.803 + TFixedArray<CnvUtilities::SMethod, 4> methods;
1.804 + methods[0].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisRoman;
1.805 + methods[0].iConvertToIntermediateBufferInPlace=DummyConvertToIntermediateBufferInPlace;
1.806 + methods[0].iConversionData=&CnvJisRoman::ConversionData();
1.807 + methods[0].iNumberOfBytesPerCharacter=1;
1.808 + methods[0].iNumberOfCoreBytesPerCharacter=1;
1.809 + methods[1].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0208;
1.810 + methods[1].iConvertToIntermediateBufferInPlace=ConvertToJisX0208FromEucJpPackedInPlace;
1.811 + methods[1].iConversionData=&CnvJisX0208::ConversionData();
1.812 + methods[1].iNumberOfBytesPerCharacter=2;
1.813 + methods[1].iNumberOfCoreBytesPerCharacter=2;
1.814 + methods[2].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToHalfWidthKatakana8;
1.815 + methods[2].iConvertToIntermediateBufferInPlace=ConvertToHalfWidthKatakana8FromEucJpPackedInPlace;
1.816 + methods[2].iConversionData=&CnvHalfWidthKatakana8::ConversionData();
1.817 + methods[2].iNumberOfBytesPerCharacter=2;
1.818 + methods[2].iNumberOfCoreBytesPerCharacter=1;
1.819 + methods[3].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0212;
1.820 + methods[3].iConvertToIntermediateBufferInPlace=ConvertToJisX0212FromEucJpPackedInPlace;
1.821 + methods[3].iConversionData=&CnvJisX0212::ConversionData();
1.822 + methods[3].iNumberOfBytesPerCharacter=3;
1.823 + methods[3].iNumberOfCoreBytesPerCharacter=2;
1.824 + return CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, methods.Array());
1.825 + }
1.826 +