sl@0: /* sl@0: * Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies). sl@0: * All rights reserved. sl@0: * This component and the accompanying materials are made available sl@0: * under the terms of "Eclipse Public License v1.0" sl@0: * which accompanies this distribution, and is available sl@0: * at the URL "http://www.eclipse.org/legal/epl-v10.html". sl@0: * sl@0: * Initial Contributors: sl@0: * Nokia Corporation - initial contribution. sl@0: * sl@0: * Contributors: sl@0: * sl@0: * Description: sl@0: * J5 charconv character converter sl@0: * sl@0: */ sl@0: sl@0: sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: #include "shiftjis.h" sl@0: #include "jisbase.h" sl@0: #include "j5.h" sl@0: sl@0: #include "jisx0201.h" sl@0: #include "jisx0208.h" sl@0: #include "jisx0212.h" sl@0: sl@0: #include "featmgr/featmgr.h" sl@0: sl@0: /** sl@0: J5 will use up to KMaxSizeAutoDetectSample to try to deterine the format of data. sl@0: */ sl@0: const TInt KMaxSizeAutoDetectSample = 1000; sl@0: sl@0: const TUint8 KEscape = 0x1b; sl@0: const TInt KByteOrderMark = 0xfeff; sl@0: sl@0: const TDesC8& CJ5Converter::ReplacementForUnconvertibleUnicodeCharacters() sl@0: { sl@0: return CnvShiftJis::ReplacementForUnconvertibleUnicodeCharacters(); sl@0: } sl@0: sl@0: /** sl@0: This API should not be used as it is ambiguous as to what encoding is required. sl@0: The user should instead call the specific plug-in for the appropriate conversion. sl@0: J5 ConvertFromUnicode() will convert to UTF8 as default. sl@0: @internalTechnology sl@0: */ sl@0: TInt CJ5Converter::ConvertFromUnicode( sl@0: CCnvCharacterSetConverter::TEndianness /* aDefaultEndiannessOfForeignCharacters */, sl@0: const TDesC8& /* aReplacementForUnconvertibleUnicodeCharacters */, sl@0: TDes8& aForeign, sl@0: const TDesC16& aUnicode, sl@0: CCnvCharacterSetConverter::TArrayOfAscendingIndices& /* aIndicesOfUnconvertibleCharacters */) sl@0: { sl@0: return CnvUtfConverter::ConvertFromUnicodeToUtf8(aForeign, aUnicode); sl@0: } sl@0: sl@0: /** sl@0: This will automatically determine one of the five supported encodings sl@0: to use and convert accordingly. This plugin method is available to the sl@0: user though the CCnvCharacterSetConverter::ConvertToUnicode() method. sl@0: There is no way for the caller to determine which encoding has been used. sl@0: sl@0: NOTE: For debugging the selected character set is returned in the state. sl@0: sl@0: @released 9.1 sl@0: @param aDefaultEndiannessOfForeignCharacters The default endian-ness to use when reading characters sl@0: in the foreign character set. sl@0: @param aUnicode On return, contains the text converted into Unicode. sl@0: @param aForeign The non-Unicode source text to be converted. sl@0: @param aState Used to save state information across multiple calls sl@0: to ConvertToUnicode(). sl@0: @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes which were not sl@0: converted. sl@0: @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, contains the index of the first bytein the sl@0: input text that could not be converted. A negative sl@0: value indicates that all the characters were sl@0: converted. sl@0: @return The number of unconverted bytes left at the end of the input descriptor sl@0: (e.g. because the output descriptor is not long enough to hold all the text), sl@0: or one of the error values defined in TError. sl@0: @internalTechnology sl@0: */ sl@0: TInt CJ5Converter::ConvertToUnicode( sl@0: CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, sl@0: TDes16& aUnicode, sl@0: const TDesC8& aForeign, sl@0: TInt& aState, sl@0: TInt& aNumberOfUnconvertibleCharacters, sl@0: TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) sl@0: { sl@0: // As the aState parameter is used to pass back the detected value sl@0: // use a "hidden" internal state variable. sl@0: TInt internalState = CCnvCharacterSetConverter::KStateDefault; sl@0: sl@0: // determine the encoding type and then decode appropriatly sl@0: switch ( DetectEncoding(aDefaultEndiannessOfForeignCharacters, aForeign)) sl@0: { sl@0: case EShiftjis: sl@0: aState = EShiftjis; sl@0: return CnvShiftJis::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, sl@0: aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter); sl@0: sl@0: case EIso2022jp1: sl@0: aState = EIso2022jp1; sl@0: return CnvJisBase::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState, sl@0: aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter); sl@0: sl@0: case EEucjp: sl@0: aState = EEucjp; sl@0: return ConvertEEucjpToUnicode( sl@0: aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState, sl@0: aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter); sl@0: sl@0: case EUcs2: sl@0: aState = EUcs2; sl@0: return ConvertUcs2ToUnicode( aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, sl@0: aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter); sl@0: sl@0: case EUtf8: sl@0: aState = EUtf8; sl@0: return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign); sl@0: sl@0: default: sl@0: // fall though to the default, which is decode as UTF8 sl@0: aState = EUnknown; sl@0: break; sl@0: } sl@0: sl@0: // decode as UTF8 sl@0: return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign); sl@0: } sl@0: sl@0: /** sl@0: This API is used by CCnvCharacterSetConverter::AutoDetectCharacterSetL(). sl@0: This method returns a value between 0 and 100, indicating how likely it sl@0: is that this is the correct converter, for the text supplied. As J5 is sl@0: NOT intended to be used with the existing auto-detect mechanism, it will sl@0: always return 0 sl@0: @internalTechnology sl@0: */ sl@0: TBool CJ5Converter::IsInThisCharacterSetL( sl@0: TBool& aSetToTrue, sl@0: TInt& aConfidenceLevel, sl@0: const TDesC8& /* aSample */) sl@0: { sl@0: /* sl@0: aSetToTrue - This value should be set to ETrue. It is used to indicate to sl@0: CCnvCharacterSetConverter::AutoDetectCharacterSetL() that the plug-in DLL sl@0: is implementing a function of this signature and is therefore not the empty sl@0: */ sl@0: aSetToTrue=ETrue; sl@0: sl@0: /* no need to look at the sample as this always returns 0 sl@0: as the autodetect feature is not supported by the J5 plug-in sl@0: */ sl@0: aConfidenceLevel=0; sl@0: return ETrue; sl@0: } sl@0: sl@0: CJ5Converter* CJ5Converter::NewL() sl@0: { sl@0: CJ5Converter* self = new(ELeave) CJ5Converter(); sl@0: CleanupStack::PushL(self); sl@0: self->ConstructL(); sl@0: CleanupStack::Pop(self); sl@0: return self; sl@0: } sl@0: sl@0: CJ5Converter::~CJ5Converter() sl@0: { sl@0: FeatureManager::UnInitializeLib(); sl@0: } sl@0: sl@0: CJ5Converter::CJ5Converter() sl@0: { sl@0: } sl@0: sl@0: void CJ5Converter::ConstructL() sl@0: { sl@0: FeatureManager::InitializeLibL(); sl@0: } sl@0: sl@0: const TImplementationProxy ImplementationTable[] = sl@0: { sl@0: #ifdef KDDIAU_TEST sl@0: // for the test build use a special test UID sl@0: IMPLEMENTATION_PROXY_ENTRY(0x01000002, CJ5Converter::NewL) sl@0: #else sl@0: IMPLEMENTATION_PROXY_ENTRY(KCharacterSetIdentifierJ5, CJ5Converter::NewL) sl@0: #endif sl@0: }; sl@0: sl@0: EXPORT_C const TImplementationProxy* ImplementationGroupProxy(TInt& aTableCount) sl@0: { sl@0: aTableCount = sizeof(ImplementationTable) / sizeof(TImplementationProxy); sl@0: sl@0: return ImplementationTable; sl@0: } sl@0: sl@0: /** sl@0: DetectEncoding determine the characterset encoding. sl@0: The logic for this detection is based on the information in CJKV by Ken Lunde. sl@0: A detailed diagram of this logic is in the J5 how to document section 2.4 sl@0: @return The detected character set as a enum CJ5Converter. sl@0: @internalTechnology sl@0: */ sl@0: enum CJ5Converter::TJ5Encoding CJ5Converter::DetectEncoding( sl@0: CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters , sl@0: const TDesC8& aForeign) sl@0: { sl@0: sl@0: // first check for UCS2 sl@0: CCnvCharacterSetConverter::TEndianness ucs2Endianness = CCnvCharacterSetConverter::ELittleEndian; sl@0: if ( DetectUcs2(aForeign, ucs2Endianness )) sl@0: { sl@0: // if ucs2 is detected pass back the detected endianess sl@0: aDefaultEndiannessOfForeignCharacters = ucs2Endianness; sl@0: return EUcs2; sl@0: } sl@0: sl@0: // next try EUC_JP sl@0: TInt eucJpValidBytes = 0; sl@0: CJ5Converter::TDectectCharacterSet result = DetectEucJp( aForeign, eucJpValidBytes ); sl@0: if ( result == EIsCharacterSet ) sl@0: { sl@0: return EEucjp; sl@0: } sl@0: sl@0: // next try Iso 2020JP sl@0: if ( DetectIso2022( aForeign ) == EIsCharacterSet ) sl@0: { sl@0: return EIso2022jp1; sl@0: } sl@0: sl@0: // next try Utf8 sl@0: if ( DetectUtf8( aForeign ) == EIsCharacterSet ) sl@0: { sl@0: return EUtf8; sl@0: } sl@0: sl@0: // shiftjis sl@0: TInt shiftjisValidBytes = 0; sl@0: result = DetectShiftJis( aForeign, shiftjisValidBytes ); sl@0: if ( result == EIsCharacterSet ) sl@0: { sl@0: return EShiftjis; sl@0: } sl@0: sl@0: // no clear winner so go for the best sl@0: TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample); sl@0: sl@0: // if more than half is shiftjis and more shiftjis than EUC_JP, sl@0: if ((shiftjisValidBytes > eucJpValidBytes ) && (shiftjisValidBytes * 2> sampleLength)) sl@0: return EShiftjis; sl@0: sl@0: // if more than half is EUC_JP and more EUC_JP than shiftjis, sl@0: if ((eucJpValidBytes > shiftjisValidBytes ) && (eucJpValidBytes * 2> sampleLength)) sl@0: return EEucjp; sl@0: sl@0: // return the default sl@0: return EUcs2; sl@0: } sl@0: sl@0: sl@0: /** sl@0: Check if UCS2. sl@0: If the first two bytes are the Unicode Endian Specifiers (0xfffe or 0xfeff) sl@0: then this must be UCS2. Otherwise try lookiing for 0x**00 or 0x00** sl@0: @param A sample of data to be checked sl@0: @param The Endianness if USC2 is detected sl@0: @return ETrue if UCS2 else EFalse sl@0: @internalTechnology sl@0: */ sl@0: TBool CJ5Converter::DetectUcs2( const TDesC8& aForeign, sl@0: CCnvCharacterSetConverter::TEndianness& aTEndianness ) sl@0: { sl@0: // if the sample is not big enough sl@0: if (aForeign.Length() < 2) sl@0: { sl@0: return EFalse; sl@0: } sl@0: else if (aForeign[0]==0xff && aForeign[1]==0xfe ) sl@0: { sl@0: // we have found a Little Endian Byte order mark sl@0: aTEndianness = CCnvCharacterSetConverter::ELittleEndian; sl@0: return ETrue; sl@0: } sl@0: else if (aForeign[0]==0xfe && aForeign[1]==0xff ) sl@0: { sl@0: // we have found a Big Endian Byte order mark sl@0: aTEndianness = CCnvCharacterSetConverter::EBigEndian; sl@0: return ETrue; sl@0: } sl@0: sl@0: // Next check for sequences of 0x**00 or 0x00** as UCS-2 is the only charset that sl@0: // specifies 0x**00 or 0x00** (according to endianness) for the ASCII range of characters. sl@0: // NB: This will fail if there are no ASCII characters in the text. sl@0: TInt sampleLength = aForeign.Length(); sl@0: sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);; sl@0: sl@0: // check the sample for sequences of 0x**00 or 0x00** sl@0: TInt bigEndianConfidence = 0; sl@0: TInt littleEndianConfidence = 0; sl@0: TInt i=0; sl@0: for(;i< (sampleLength-1); i+=2) sl@0: { sl@0: if( aForeign[i] == 0x00) sl@0: { sl@0: bigEndianConfidence +=2; sl@0: } sl@0: else if ( aForeign[i+1] == 0x00) sl@0: { sl@0: littleEndianConfidence +=2; sl@0: } sl@0: } sl@0: sl@0: // which occurs most BE or LE sl@0: TInt confidenceLevel = 0; sl@0: if (bigEndianConfidence > littleEndianConfidence) sl@0: { sl@0: aTEndianness = CCnvCharacterSetConverter::EBigEndian; sl@0: confidenceLevel = bigEndianConfidence; sl@0: } sl@0: else sl@0: { sl@0: aTEndianness = CCnvCharacterSetConverter::ELittleEndian; sl@0: confidenceLevel = littleEndianConfidence; sl@0: } sl@0: sl@0: // if more than 97% count as UCS2 sl@0: if ( confidenceLevel * 100/sampleLength > 97) sl@0: return ETrue; sl@0: sl@0: return EFalse; sl@0: } sl@0: sl@0: /** sl@0: Check if ShiftJis (reference CJKV by Ken Lunde page 175) sl@0: @param A sample of data to be checked sl@0: @param The number of input bytes that can be converted sl@0: @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet sl@0: @internalTechnology sl@0: */ sl@0: enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectShiftJis( const TDesC8& aForeign,TInt &aNumberOfBytesConverted ) sl@0: { sl@0: // Get the sample length sl@0: TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);; sl@0: sl@0: TInt i=0; sl@0: aNumberOfBytesConverted = 0; sl@0: sl@0: TText8 character; sl@0: TText8 characterPlus1; sl@0: TText8 characterPlus2; sl@0: sl@0: // scan the sample text looking for valid shiftjis data sl@0: while ( i < sampleLength ) sl@0: { sl@0: // get the next few characters, use 0 if there is no more sample sl@0: // as this will not match any test. sl@0: character = aForeign[i]; sl@0: characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0); sl@0: characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0); sl@0: sl@0: // SHIFTJIS - 0x8e to 0x9f followed by 0x40 to 0xfc sl@0: if ((character >= 0x81) && (character <= 0x9f) && sl@0: (characterPlus1 >= 0x40) && (characterPlus1 <= 0xfc) ) sl@0: { sl@0: // this is SHIFTJIS unless it is EUC JP code set 2 or 3 sl@0: if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF)) sl@0: { sl@0: // this could be EUC JP code set 2 (or shiftjis) sl@0: aNumberOfBytesConverted+=2; sl@0: i++; sl@0: } sl@0: else if ((character == 0x8F) && sl@0: (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) && sl@0: (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF)) sl@0: { sl@0: // this could be EUC JP code set 3 (or shiftjis) sl@0: aNumberOfBytesConverted+=3; sl@0: i+=2; sl@0: } sl@0: else sl@0: { sl@0: // this can only be shift jis sl@0: return EIsCharacterSet; sl@0: } sl@0: } sl@0: sl@0: // SHIFTJIS - 0xE0 to 0xEF followed by ..... sl@0: else if ((character >= 0xE0) && (character <= 0xEF)) sl@0: { sl@0: // 0x40 to 0xFC which overlaps UTF8 between 0x80 and 0xBF sl@0: // including Mopera extension to shiftjis from 0xEF80 to 0xEFFC sl@0: sl@0: if ( (characterPlus1 >= 0x40) && (characterPlus1 <= 0x7E) ) sl@0: { sl@0: // this can only be shift jis sl@0: return EIsCharacterSet; sl@0: } sl@0: else if ( (characterPlus1 >= 0xC0) && (characterPlus1 <= 0xFC) ) sl@0: { sl@0: // this could be EUC JP code set 1 sl@0: aNumberOfBytesConverted+=2; sl@0: i++; sl@0: } sl@0: sl@0: // problem here is the overlap between the UTF8 and shiftjis sl@0: else if ( (characterPlus1 >= 0x80) && (characterPlus1 <= 0xBF) ) sl@0: { sl@0: // this could be shiftjis or utf8 sl@0: aNumberOfBytesConverted+=2; sl@0: i++; sl@0: } sl@0: } sl@0: // half width katakana A1-DF sl@0: else if ((character >= 0xA1) && (character <= 0xDF)) sl@0: { sl@0: aNumberOfBytesConverted+=1; sl@0: } sl@0: // ASCII or JIS-Roman 20-7e sl@0: else if ( ((character >= 0x20) && (character <= 0x7E)) || (character == 0x0A) || (character == 0x0D)) sl@0: { sl@0: aNumberOfBytesConverted+=1; sl@0: } sl@0: else sl@0: { sl@0: // This is not decoding as shiftjis, so reject sl@0: aNumberOfBytesConverted =0; sl@0: return EIsNotCharacterSet; sl@0: } sl@0: i++; sl@0: } sl@0: sl@0: // if all the characters could be converted sl@0: if (aNumberOfBytesConverted == sampleLength) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: else if (aNumberOfBytesConverted == 0) sl@0: { sl@0: return EIsNotCharacterSet; sl@0: } sl@0: else sl@0: { sl@0: return EMaybeCharacterSet; sl@0: } sl@0: } sl@0: sl@0: /** sl@0: Check if UTF8 (reference CJKV by Ken Lunde page 189) sl@0: @param A sample of data to be checked sl@0: @param The number of input bytes that can be converted sl@0: @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet sl@0: @internalTechnology sl@0: */ sl@0: enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectUtf8( const TDesC8& aForeign ) sl@0: { sl@0: // Get the sample length sl@0: TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);; sl@0: sl@0: TInt i=0; sl@0: TText8 character; sl@0: TText8 characterPlus1; sl@0: TText8 characterPlus2; sl@0: TText8 characterPlus3; sl@0: sl@0: // scan the sample text looking for valid UTF8 sl@0: while ( i < sampleLength ) sl@0: { sl@0: // get the next few characters, use 0 if there is no more sample sl@0: // as this will not match any test. sl@0: character = aForeign[i]; sl@0: characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0); sl@0: characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0); sl@0: characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0); sl@0: sl@0: // UTF8 range 110xxxxx followed by one valid UTF8 bytes sl@0: if(((character & 0xe0)==0xc0) && (( characterPlus1 & 0xc0)==0x80) ) sl@0: { sl@0: // two bytes of valid UTF8 found sl@0: i+=2; sl@0: } sl@0: // UTF8 range 1110xxxx followed by two valid UTF8 bytes sl@0: else if(((character & 0xf0)==0xe0) && (( characterPlus1 & 0xc0)==0x80) && (( characterPlus2 & 0xc0)==0x80)) sl@0: { sl@0: // three bytes of valid UTF8 found sl@0: i+=3; sl@0: } sl@0: // UTF8 range 11110xxx followed by three valid UTF8 bytes sl@0: else if(((character & 0xf8)==0xf0) && (( characterPlus1 & 0xc0)==0x80) sl@0: && (( characterPlus2 & 0xc0)==0x80) && (( characterPlus3 & 0xc0)==0x80) ) sl@0: { sl@0: // four bytes of valid UTF8 found sl@0: i+=4; sl@0: } sl@0: sl@0: // ascii range 0 to 0x7F sl@0: else if((character & 0x80)==0x00) sl@0: { sl@0: // The value of character is in the range 0x00-0x7f sl@0: // UTF8 maintains ASCII transparency. So it's a valid UTF8. sl@0: i++; sl@0: } sl@0: // if the sample data is longer than KMaxSizeAutoDetectSample then except anything sl@0: // for the last two bytes as they may not appear valid without more data sl@0: else if( i >= (KMaxSizeAutoDetectSample -2) ) sl@0: { sl@0: i++; sl@0: } sl@0: else sl@0: { sl@0: // This is not decoding as UTF8 so reject sl@0: return EIsNotCharacterSet; sl@0: } sl@0: } sl@0: sl@0: // All the characters could be converted sl@0: return EIsCharacterSet; sl@0: sl@0: } sl@0: sl@0: sl@0: /** sl@0: Check if ISO2022JP by lookiing for the escape sequences. sl@0: @param A sample of data to be checked sl@0: @param The number of input bytes that can be converted sl@0: @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet sl@0: @internalTechnology sl@0: */ sl@0: enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectIso2022( const TDesC8& aForeign ) sl@0: { sl@0: // Get the sample length sl@0: TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);; sl@0: sl@0: TInt i=0; sl@0: TText8 character; sl@0: TText8 characterPlus1; sl@0: TText8 characterPlus2; sl@0: TText8 characterPlus3; sl@0: TText8 characterPlus4; sl@0: TText8 characterPlus5; sl@0: sl@0: // scan the sample text looking for valid UTF8 sl@0: while ( i < sampleLength ) sl@0: { sl@0: // get the next few characters, use 0 if there is no more sample sl@0: // as this will not match any test. sl@0: character = aForeign[i]; sl@0: characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0); sl@0: characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0); sl@0: characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0); sl@0: sl@0: sl@0: // check for the JIS escape sequences of ISO 2022Jp sl@0: // These values have been taken from JISBASE_SHARED sl@0: if (character == KEscape) sl@0: { sl@0: // Escape Sequence For Jis C6226_1978 \x1b\x24\x40 sl@0: if ((characterPlus1 == 0x24) && (characterPlus2 == 0x40)) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: sl@0: // Escape Sequence For Jis X0208_1983 \x1b\x24\x42 sl@0: else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x42)) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: sl@0: // Escape Sequence For Jis Roman \x1b\x28\x4a sl@0: else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x4A)) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: sl@0: // Escape Sequence For Jis RomanIncorrect \x1b\x28\x48 sl@0: else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x48)) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: sl@0: // Escape Sequence For Ascii \x1b\x28\x42 sl@0: else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x42)) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: sl@0: // Escape Sequence For EscapeSequenceForHalfWidthKatakana \x1b\x28\x49 sl@0: else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x49)) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: sl@0: // Escape Sequence For Jis X0208_199x \x1b\x26\x40\x1b\x24\x42 sl@0: else if ((characterPlus1 == 0x26) && (characterPlus2 == 0x40)) sl@0: { sl@0: characterPlus4 = ( i < (sampleLength-4) ? aForeign[i+4]:0); sl@0: characterPlus5 = ( i < (sampleLength-5) ? aForeign[i+5]:0); sl@0: sl@0: if ((characterPlus3 == 0x1b) && (characterPlus4 == 0x24) && (characterPlus5 == 0x42)) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: } sl@0: // Escape Sequence For Jis X0212_1990 \x1b\x24\x28\x44 sl@0: else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x28)) sl@0: { sl@0: if (characterPlus3 == 0x44) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: } sl@0: sl@0: // check for the JIS escape sequences of ISO 2022Jp "B@" x42 x40 sl@0: else if ((characterPlus1 == 'B') || (characterPlus1 == '@')) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: sl@0: } // end of if ( character == KEscape ) sl@0: sl@0: i++; sl@0: } sl@0: sl@0: // if escape sequences have been found then this is not ISO2022 sl@0: return EIsNotCharacterSet; sl@0: sl@0: } sl@0: sl@0: sl@0: /** sl@0: Check if EUC JP (reference CJKV by Ken Lunde page 164) sl@0: @param A sample of data to be checked sl@0: @param The number of input bytes that can be converted sl@0: @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet sl@0: @internalTechnology sl@0: */ sl@0: CJ5Converter::TDectectCharacterSet CJ5Converter::DetectEucJp( const TDesC8& aForeign,TInt &aNumberOfBytesConverted ) sl@0: { sl@0: // Get the sample length sl@0: TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);; sl@0: sl@0: TInt i=0; sl@0: aNumberOfBytesConverted = 0; sl@0: sl@0: TText8 character; sl@0: TText8 characterPlus1; sl@0: TText8 characterPlus2; sl@0: sl@0: // scan the sample text looking for valid shiftjis data sl@0: while ( i < sampleLength ) sl@0: { sl@0: // get the next few characters, use 0 if there is no more sample sl@0: // as this will not match any test. sl@0: character = aForeign[i]; sl@0: characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0); sl@0: characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0); sl@0: sl@0: // EUCJP code set 0 0x21-0x7e sl@0: if ( (character >= 0x21) && (character <= 0x7e)) sl@0: { sl@0: aNumberOfBytesConverted++; sl@0: } sl@0: else if ( (character == 0x0a) || (character == 0x0d)) sl@0: { sl@0: aNumberOfBytesConverted++; sl@0: } sl@0: // EUCJP code set 1 sl@0: else if ( (character >= 0xa1) && (character <= 0xff) sl@0: && (characterPlus1 >= 0xa1) && (characterPlus1 <= 0xff) ) sl@0: { sl@0: aNumberOfBytesConverted+=2; sl@0: i++; sl@0: } sl@0: sl@0: // EUC JP code set 2, starts with the EUC JP SS2 character (0x8E) sl@0: // and is followed by character in range 0xA1- 0xDF sl@0: else if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) ) sl@0: { sl@0: // this could be 2 bytes of EUC JP code set 2 sl@0: aNumberOfBytesConverted += 2; sl@0: i++; sl@0: } sl@0: // EUC JP code set 3, starts with the EUC JP SS3 character (0x8F) sl@0: // and is followed by two characters in range A1- DF A1 -FE sl@0: else if ((character == 0x8F) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) sl@0: && (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF)) sl@0: { sl@0: // this could be 3 bytes of EUC JP code set 3 sl@0: aNumberOfBytesConverted += 3; sl@0: i+=2; sl@0: } sl@0: else sl@0: { sl@0: // This is not a valid decoding as EUC JP so reject sl@0: return EIsNotCharacterSet; sl@0: } sl@0: i++; sl@0: } sl@0: sl@0: sl@0: // if all the characters could be converted sl@0: if (aNumberOfBytesConverted == sampleLength) sl@0: { sl@0: return EIsCharacterSet; sl@0: } sl@0: else if (aNumberOfBytesConverted == 0) sl@0: { sl@0: return EIsNotCharacterSet; sl@0: } sl@0: else sl@0: { sl@0: return EMaybeCharacterSet; sl@0: } sl@0: } sl@0: sl@0: sl@0: /** sl@0: Convert from UCS2 (Universal Character Set containing two bytes) to unicode sl@0: Remove any byte order marks in the UCSs. sl@0: @param aUnicode Contains the converted text in the Unicode character set. sl@0: @param aForeign The non-Unicode source text to be converted sl@0: @param aNumberOfUnconvertibleCharacters Contains the number of bytes which were not converted. sl@0: @param aIndexOfFirstByteOfFirstUnconvertibleCharacter The index of the first byte of the first unconvertible character. sl@0: @return the number of bytes converted sl@0: @internalTechnology sl@0: */ sl@0: TInt CJ5Converter::ConvertUcs2ToUnicode(CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters, sl@0: TDes16& aUnicode, sl@0: const TDesC8& aForeign, sl@0: TInt& aNumberOfUnconvertibleCharacters, sl@0: TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) sl@0: sl@0: { sl@0: TInt numberOfBytesConverted = 0; sl@0: TInt numberOfUnicodeCharacters =0; sl@0: TChar nextChar; sl@0: sl@0: // start at begining of the output buffer provided sl@0: aUnicode.Zero(); sl@0: sl@0: // while there is at least 2 bytes of data to convert and space in the output buffer sl@0: while ( (numberOfBytesConverted+1 < aForeign.Size()) && (numberOfUnicodeCharacters < aUnicode.MaxLength()) ) sl@0: { sl@0: if (aDefaultEndiannessOfForeignCharacters == CCnvCharacterSetConverter::ELittleEndian ) sl@0: { sl@0: // ELittleEndian 0x??00 sl@0: nextChar = aForeign[numberOfBytesConverted] + ( aForeign[numberOfBytesConverted+1] << 8); sl@0: } sl@0: else sl@0: { sl@0: // EBigEndian 0x00?? sl@0: nextChar = ( aForeign[numberOfBytesConverted] <<8 ) + aForeign[numberOfBytesConverted+1]; sl@0: } sl@0: sl@0: // save the unicode character extracted unless it's a BOM sl@0: if ( nextChar != KByteOrderMark ) sl@0: { sl@0: aUnicode.Append( nextChar ); sl@0: numberOfUnicodeCharacters++; sl@0: } sl@0: sl@0: numberOfBytesConverted+=2; sl@0: } sl@0: sl@0: // there are no uncovertable characters with UCS2, but there could be sl@0: aNumberOfUnconvertibleCharacters = 0; sl@0: // a negative value indicates that all characters converted sl@0: aIndexOfFirstByteOfFirstUnconvertibleCharacter = -1; sl@0: sl@0: // returns the number of unconverted bytes left at the end of the input descriptor sl@0: // Note there could be 1 byte left over if an odd number of bytes provided for conversion sl@0: return aForeign.Size() - numberOfBytesConverted; sl@0: } sl@0: sl@0: /** sl@0: Convert from EUC_JP (Extended Unix Code encoding for Japanese) sl@0: Using the standard Charconv method of an array of methods sl@0: @return the number of bytes converted sl@0: @internalTechnology sl@0: */ sl@0: TInt CJ5Converter::ConvertEEucjpToUnicode( sl@0: CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, sl@0: TDes16& aUnicode, sl@0: const TDesC8& aForeign, sl@0: TInt& /*aState*/, sl@0: TInt& aNumberOfUnconvertibleCharacters, sl@0: TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) sl@0: { sl@0: TFixedArray methods; sl@0: methods[0].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisRoman; sl@0: methods[0].iConvertToIntermediateBufferInPlace=DummyConvertToIntermediateBufferInPlace; sl@0: methods[0].iConversionData=&CnvJisRoman::ConversionData(); sl@0: methods[0].iNumberOfBytesPerCharacter=1; sl@0: methods[0].iNumberOfCoreBytesPerCharacter=1; sl@0: methods[1].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0208; sl@0: methods[1].iConvertToIntermediateBufferInPlace=ConvertToJisX0208FromEucJpPackedInPlace; sl@0: methods[1].iConversionData=&CnvJisX0208::ConversionData(); sl@0: methods[1].iNumberOfBytesPerCharacter=2; sl@0: methods[1].iNumberOfCoreBytesPerCharacter=2; sl@0: methods[2].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToHalfWidthKatakana8; sl@0: methods[2].iConvertToIntermediateBufferInPlace=ConvertToHalfWidthKatakana8FromEucJpPackedInPlace; sl@0: methods[2].iConversionData=&CnvHalfWidthKatakana8::ConversionData(); sl@0: methods[2].iNumberOfBytesPerCharacter=2; sl@0: methods[2].iNumberOfCoreBytesPerCharacter=1; sl@0: methods[3].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0212; sl@0: methods[3].iConvertToIntermediateBufferInPlace=ConvertToJisX0212FromEucJpPackedInPlace; sl@0: methods[3].iConversionData=&CnvJisX0212::ConversionData(); sl@0: methods[3].iNumberOfBytesPerCharacter=3; sl@0: methods[3].iNumberOfCoreBytesPerCharacter=2; sl@0: return CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, methods.Array()); sl@0: } sl@0: