diff -r 000000000000 -r bde4ae8d615e os/textandloc/charconvfw/charconv_fw/src/convutils/convutils.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/os/textandloc/charconvfw/charconv_fw/src/convutils/convutils.cpp Fri Jun 15 03:10:57 2012 +0200 @@ -0,0 +1,900 @@ +/* +* Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* +*/ + + +#include +#include +#include +#include + +const TInt KNoPreviousCharacterSet=-1; +const TInt KDefaultCharacterSet = 0; +const TUint KControlCharacterEscape=0x1b; + +#if defined(_DEBUG) +//It will cause performance problem with small KMaximumLengthOfIntermediateBuffer. +//Please use release version to test performance cases. +const TInt KMaximumLengthOfIntermediateBuffer=5; +#else +const TInt KMaximumLengthOfIntermediateBuffer=150; +#endif + +struct SCnvConversionData; + +_LIT(KLitPanicText, "CONVUTILS"); + +enum TPanic + { + EPanicBadInputConversionFlags1=1, + EPanicBadInputConversionFlags2, + EPanicBadInputConversionFlags3, + EPanicBadNumberOfUnicodeElementsConsumed, + EPanicAppendFlagViolated, + EPanicBadNumberOfUnicodeCharactersConverted, + EPanicBadNumberOfCharactersThatDroppedOut, + EPanicLoopCounterOverRun1, + EPanicLoopCounterOverRun2, + EPanicDescriptorNotWholeNumberOfCharacters1, + EPanicDescriptorNotWholeNumberOfCharacters2, + EPanicDescriptorNotWholeNumberOfCharacters3, + EPanicDescriptorNotWholeNumberOfCharacters4, + EPanicBadStartOfNextEscapeSequence, + EPanicInconsistentNumberOfForeignBytesRemaining, + EPanicBadLengthOfRunToConvert1, + EPanicBadLengthOfRunToConvert2, + EPanicBadMethodPointer, + EPanicBadMethodData1, + EPanicBadMethodData2, + EPanicBadMethodData3, + EPanicBadMethodData4, + EPanicBadNumberOfCharacterSets, + EPanicBadConversionDataPointer1, + EPanicBadConversionDataPointer2, + EPanicBadConversionDataPointer3, + EPanicBadFunctionPointer1, + EPanicBadFunctionPointer2, + EPanicBadFunctionPointer3, + EPanicBadEscapeSequencePointer1, + EPanicBadEscapeSequencePointer2, + EPanicBadNumberOfStates, + EPanicBadEscapeSequenceStart, + EPanicBadNumberOfMethods, + EPanicBadSurrogatePair1, + EPanicBadSurrogatePair2, + EPanicBadRemainderOfForeign, + EPanicOutputDescriptorTooShortEvenToHoldEscapeSequenceToDefaultCharacterSet + }; + +LOCAL_C void Panic(TPanic aPanic) + { + User::Panic(KLitPanicText, aPanic); + } + +/** Converts Unicode text into a complex foreign character set encoding. This +is an encoding which cannot be converted simply by calling +CCnvCharacterSetConverter::DoConvertFromUnicode(). It may be modal (e.g. JIS) +or non-modal (e.g. Shift-JIS). + +The Unicode text specified in aUnicode is converted using the array of +conversion data objects (aArrayOfCharacterSets) provided by the plug-in for +the complex character set encoding, and the converted text is returned in +aForeign. Any existing contents in aForeign are overwritten. + +Unlike CCnvCharacterSetConverter::DoConvertFromUnicode(), multiple character +sets can be specified. aUnicode is converted using the first character conversion +data object in the array. When a character is found which cannot be converted +using that data, each character set in the array is tried in turn. If it cannot +be converted using any object in the array, the index of the character is +appended to aIndicesOfUnconvertibleCharacters and the character is replaced +by aReplacementForUnconvertibleUnicodeCharacters. + +If it can be converted using another object in the array, that object is used +to convert all subsequent characters until another unconvertible character +is found. + +@param aDefaultEndiannessOfForeignCharacters The default endian-ness to use +when writing the characters in the foreign character set. If an endian-ness +for foreign characters is specified in the current conversion data object, +then that is used instead and the value of +aDefaultEndiannessOfForeignCharacters is ignored. +@param aReplacementForUnconvertibleUnicodeCharacters The single character (one +or more byte values) which is used to replace unconvertible characters. +@param aForeign On return, contains the converted text in the non-Unicode +character set. +@param aUnicode The source Unicode text to be converted. +@param aIndicesOfUnconvertibleCharacters On return, holds an ascending array +of the indices of each Unicode character in the source text which could not +be converted (because none of the target character sets have an equivalent +character). +@param aArrayOfCharacterSets Array of character conversion data objects, +representing the character sets which comprise a complex character set +encoding. These are used in sequence to convert the Unicode text. There must +be at least one character set in this array and no character set may have any +NULL member data, or a panic occurs. +@return The number of unconverted characters left at the end of the input +descriptor (e.g. because aForeign was not long enough to hold all the text), +or a negative error value, as defined in CCnvCharacterSetConverter::TError. */ +EXPORT_C TInt CnvUtilities::ConvertFromUnicode( + CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, + const TDesC8& aReplacementForUnconvertibleUnicodeCharacters, + TDes8& aForeign, + const TDesC16& aUnicode, + CCnvCharacterSetConverter::TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters, + const TArray& aArrayOfCharacterSets) + { + TUint notUsed; + return ConvertFromUnicode(aDefaultEndiannessOfForeignCharacters, + aReplacementForUnconvertibleUnicodeCharacters, + aForeign, + aUnicode, + aIndicesOfUnconvertibleCharacters, + aArrayOfCharacterSets, + notUsed, + 0); + } + +/** Converts Unicode text into a complex foreign character set encoding. This is +an encoding which cannot be converted simply by a call to +CCnvCharacterSetConverter::DoConvertFromUnicode(). It may be modal (e.g. JIS) +or non-modal (e.g. Shift-JIS). + +The Unicode text specified in aUnicode is converted using the array of conversion +data objects (aArrayOfCharacterSets) provided by the plug-in for the complex +character set encoding and the converted text is returned in aForeign. The +function can either append to aForeign or overwrite its contents (if any). + +Unlike CCnvCharacterSetConverter::DoConvertFromUnicode(), multiple character +sets can be specified. aUnicode is converted using the first character conversion +data object in the array. When a character is found which cannot be converted +using that data, each character set in the array is tried in turn. If it cannot +be converted using any object in the array, the index of the character is +appended to aIndicesOfUnconvertibleCharacters and the character is replaced +by aReplacementForUnconvertibleUnicodeCharacters. + +If it can be converted using another object in the array, that object is used +to convert all subsequent characters until another unconvertible character +is found. + +@param aDefaultEndiannessOfForeignCharacters The default endian-ness to use +when writing the characters in the foreign character set. If an endian-ness +for foreign characters is specified in the current conversion data object, +then that is used instead and the value of +aDefaultEndiannessOfForeignCharacters is ignored. +@param aReplacementForUnconvertibleUnicodeCharacters The single character (one +or more byte values) which is used to replace unconvertible characters. +@param aForeign On return, contains the converted text in the non-Unicode +character set. This may already contain some text. If it does, and if +aInputConversionFlags specifies EInputConversionFlagAppend, then the converted +text is appended to this descriptor. +@param aUnicode The source Unicode text to be converted. +@param aIndicesOfUnconvertibleCharacters On return, holds an ascending array +of the indices of each Unicode character in the source text which could not +be converted (because none of the target character sets have an equivalent +character). +@param aArrayOfCharacterSets Array of character set data objects. These are +used in sequence to convert the Unicode text. There must be at least one +character set in this array and no character set may have any NULL member +data, or a panic occurs. +@param aOutputConversionFlags If the input descriptor ended in a truncated +sequence, e.g. the first half only of a Unicode surrogate pair, this returns +with the EOutputConversionFlagInputIsTruncated flag set. +@param aInputConversionFlags Specify +CCnvCharacterSetConverter::EInputConversionFlagAppend to append the text to +aForeign. Specify CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable +to prevent the function from returning the error-code EErrorIllFormedInput +when the input descriptor consists of nothing but a truncated sequence. The +CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter +flag must not be set, otherwise a panic occurs. +@return The number of unconverted characters left at the end of the input descriptor +(e.g. because aForeign was not long enough to hold all the text), or a negative +error value, as defined in CCnvCharacterSetConverter::TError. */ +EXPORT_C TInt CnvUtilities::ConvertFromUnicode( + CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, + const TDesC8& aReplacementForUnconvertibleUnicodeCharacters, + TDes8& aForeign, + const TDesC16& aUnicode, + CCnvCharacterSetConverter::TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters, + const TArray& aArrayOfCharacterSets, + TUint& aOutputConversionFlags, + TUint aInputConversionFlags) + { + __ASSERT_ALWAYS(~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter, Panic(EPanicBadInputConversionFlags1)); + CheckArrayOfCharacterSets(aArrayOfCharacterSets); + aOutputConversionFlags=0; + TUint internalInputConversionFlags=aInputConversionFlags; + if (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend) + { + aForeign.SetLength(0); + internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAppend; + } + if (aUnicode.Length()==0) + { + return 0; + } + if (aForeign.MaxLength()==aForeign.Length()) // relies on the fact that aForeign's length has been set to zero if aInputConversionFlags does not have CCnvCharacterSetConverter::EInputConversionFlagAppend set + { + return aUnicode.Length(); + } + TDes8* foreign=&aForeign; + TPtr8 dummyForeign(NULL, 0, 0); + if (aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagMustEndInDefaultCharacterSet) + { + TInt dummyMaximumLength = + aForeign.MaxLength() - aArrayOfCharacterSets[KDefaultCharacterSet].iEscapeSequence->Length(); + __ASSERT_ALWAYS(dummyMaximumLength >= 0, + Panic(EPanicOutputDescriptorTooShortEvenToHoldEscapeSequenceToDefaultCharacterSet)); + dummyForeign.Set(const_cast (aForeign.Ptr()), + aForeign.Length(), + dummyMaximumLength); + foreign=&dummyForeign; + } + const TInt numberOfCharacterSets=aArrayOfCharacterSets.Count(); + TInt numberOfUnicodeElementsConsumed=0; + internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter; // this is not just an optimization - it ensures that "foreign" doesn't get filled up too much each time CCnvCharacterSetConverter::DoConvertFromUnicode is called + TInt previousCharacterSet = aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAssumeStartInDefaultCharacterSet? + KDefaultCharacterSet : KNoPreviousCharacterSet; + FOREVER + { + for (TInt presentCharacterSet=KDefaultCharacterSet;;) + { + __ASSERT_DEBUG(numberOfUnicodeElementsConsumed<=aUnicode.Length(), Panic(EPanicBadNumberOfUnicodeElementsConsumed)); + if (numberOfUnicodeElementsConsumed>=aUnicode.Length()) + { + goto end; + } + const SCharacterSet& characterSet=aArrayOfCharacterSets[presentCharacterSet]; + const TInt oldNumberOfBytesInForeign=foreign->Length(); + if (numberOfUnicodeElementsConsumed>0) + { + internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable; + } + CCnvCharacterSetConverter::TArrayOfAscendingIndices indicesOfUnconvertibleCharacters; + const TInt returnValue=CCnvCharacterSetConverter::DoConvertFromUnicode(*characterSet.iConversionData, aDefaultEndiannessOfForeignCharacters, KNullDesC8, *foreign, aUnicode.Mid(numberOfUnicodeElementsConsumed), indicesOfUnconvertibleCharacters, aOutputConversionFlags, internalInputConversionFlags); + if (returnValue<0) + { + return returnValue; // this is an error-code + } + __ASSERT_DEBUG(foreign->Length()>=oldNumberOfBytesInForeign, Panic(EPanicAppendFlagViolated)); + TInt indexOfFirstUnconvertibleCharacter; + if (indicesOfUnconvertibleCharacters.NumberOfIndices()==0) + { + indexOfFirstUnconvertibleCharacter=-1; + numberOfUnicodeElementsConsumed=aUnicode.Length()-returnValue; + } + else + { + indexOfFirstUnconvertibleCharacter=indicesOfUnconvertibleCharacters[0]; + numberOfUnicodeElementsConsumed+=indexOfFirstUnconvertibleCharacter; + __ASSERT_DEBUG(numberOfUnicodeElementsConsumed+LengthOfUnicodeCharacter(aUnicode, numberOfUnicodeElementsConsumed)==aUnicode.Length()-returnValue, Panic(EPanicBadNumberOfUnicodeCharactersConverted)); + } + if (indexOfFirstUnconvertibleCharacter!=0) // if at least one Unicode character at the start of CCnvCharacterSetConverter::DoConvertFromUnicode's input descriptor was convertible... + { + TBool gotoEnd = EFalse; + if (foreign->Length()>oldNumberOfBytesInForeign) + { + TInt numberOfCharactersThatDroppedOut=0; + // Insert an escape sequence if this character set is different from the last one. + if (presentCharacterSet != previousCharacterSet) + { + // Insert escape sequence (if requred) in front of the last encoded run of text. + // Note that this may cause some characters to drop out at the end. + (*characterSet.iConvertFromIntermediateBufferInPlace)(oldNumberOfBytesInForeign, *foreign, numberOfCharactersThatDroppedOut); + if (oldNumberOfBytesInForeign < foreign->Length()) + previousCharacterSet = presentCharacterSet; + } + numberOfUnicodeElementsConsumed-=numberOfCharactersThatDroppedOut; + if (numberOfCharactersThatDroppedOut>0 )// if "foreign" has been filled to as much as it will hold... + { + gotoEnd = ETrue; + } + } + if (indexOfFirstUnconvertibleCharacter<0) // if we've successfully converted up to the end of aUnicode (using *characterSet.iConversionData)... + { + gotoEnd = ETrue; + } + if (gotoEnd) + { + if ( aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagMustEndInDefaultCharacterSet + && previousCharacterSet != KDefaultCharacterSet + && previousCharacterSet != KNoPreviousCharacterSet) + { + aForeign.SetLength(foreign->Length()); + aForeign.Append(*aArrayOfCharacterSets[KDefaultCharacterSet].iEscapeSequence); + foreign=NULL; + } + goto end; + } + break; + } + __ASSERT_DEBUG(presentCharacterSet=numberOfCharacterSets) + { + if ((foreign->MaxLength()-foreign->Length()Append(aReplacementForUnconvertibleUnicodeCharacters); + break; + } + } + } +end: + if (foreign!=NULL) + { + aForeign.SetLength(foreign->Length()); + foreign=NULL; + } + if ((numberOfUnicodeElementsConsumed==0) && (aOutputConversionFlags&CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated) && (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable)) + { + return CCnvCharacterSetConverter::EErrorIllFormedInput; + } + return aUnicode.Length()-numberOfUnicodeElementsConsumed; + } + + +/** Inserts an escape sequence into the descriptor. + +This function is provided to help in the implementation of +ConvertFromUnicode() for modal character set encodings. +Each SCharacterSet object in the array passed to +ConvertFromUnicode() must have its +iConvertFromIntermediateBufferInPlace member assigned. To +do this for a modal character set encoding, implement a function whose +signature matches that of FConvertFromIntermediateBufferInPlace +and which calls this function, passing all arguments unchanged, and +specifying the character set's escape sequence and the number of bytes per +character. + +@param aStartPositionInDescriptor The byte position in aDescriptor at which +the escape sequence is inserted. If the character set uses more than one byte +per character, this position must be the start of a character, otherwise a +panic occurs. +@param aDescriptor The descriptor into which the escape sequence is inserted. +@param aNumberOfCharactersThatDroppedOut The escape sequence is inserted into +the start of aDescriptor and any characters that need to drop out to make +room for the escape sequence (because the descriptor's maximum length was +not long enough) drop out from the end of the buffer. This parameter indicates +the number of characters that needed to drop out. +@param aEscapeSequence The escape sequence for the character set. +@param aNumberOfBytesPerCharacter The number of bytes per character. */ +EXPORT_C void CnvUtilities::ConvertFromIntermediateBufferInPlace( + TInt aStartPositionInDescriptor, + TDes8& aDescriptor, + TInt& aNumberOfCharactersThatDroppedOut, + const TDesC8& aEscapeSequence, + TInt aNumberOfBytesPerCharacter) + { + const TInt lengthOfDescriptor=aDescriptor.Length(); + __ASSERT_ALWAYS((lengthOfDescriptor-aStartPositionInDescriptor)%aNumberOfBytesPerCharacter==0, Panic(EPanicDescriptorNotWholeNumberOfCharacters1)); + aNumberOfCharactersThatDroppedOut=(Max(0, aEscapeSequence.Length()-(aDescriptor.MaxLength()-lengthOfDescriptor))+(aNumberOfBytesPerCharacter-1))/aNumberOfBytesPerCharacter; + const TInt lengthOfRunInCharacters=(lengthOfDescriptor-aStartPositionInDescriptor)/aNumberOfBytesPerCharacter; + if (aNumberOfCharactersThatDroppedOut>=lengthOfRunInCharacters) // ">=" is correct (rather than ">") as if there's only room for the escape sequence we don't want to have it in the descriptor + { + aNumberOfCharactersThatDroppedOut=lengthOfRunInCharacters; + aDescriptor.SetLength(aStartPositionInDescriptor); + } + else + { + aDescriptor.SetLength(lengthOfDescriptor-(aNumberOfCharactersThatDroppedOut*aNumberOfBytesPerCharacter)); + aDescriptor.Insert(aStartPositionInDescriptor, aEscapeSequence); + } + } + + +/** Converts text from a modal foreign character set encoding into Unicode. + +The non-Unicode text specified in aForeign is converted using +the array of character set conversion objects (aArrayOfStates) +provided by the plug-in, and the converted text is returned in +aUnicode. The function can either append to aUnicode +or overwrite its contents (if any), depending on the input conversion flags +specified. The first element in aArrayOfStates is taken to be +the default mode (i.e. the mode to assume by default if there is no preceding +escape sequence). + +@param aDefaultEndiannessOfForeignCharacters The default endian-ness of the +foreign characters. If an endian-ness for foreign characters is specified +in the conversion data, then that is used instead and the value of +aDefaultEndiannessOfForeignCharacters is ignored. +@param aUnicode On return, contains the text converted into Unicode. +@param aForeign The non-Unicode source text to be converted. +@param aState Used to store a modal character set encoding's current mode across +multiple calls to ConvertToUnicode() on the same input descriptor. This argument +should be passed the same object as passed to the plug-in's ConvertToUnicode() +exported function. +@param aNumberOfUnconvertibleCharacters On return, contains the number of +characters in aForeign which were not converted. Characters which cannot be +converted are output as Unicode replacement characters (0xfffd). +@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index +of the first byte of the first unconvertible character. For instance if the +first character in the input descriptor (aForeign) could not be converted, +then this parameter is set to the first byte of that character, i.e. zero. +A negative value is returned if all the characters were converted. +@param aArrayOfStates Array of character set conversion data objects, and their +escape sequences ("modes"). There must be one or more modes in this array, +none of the modes can have any NULL member data, and each mode's escape sequence +must begin with KControlCharacterEscape (0x1b) or a panic occurs. +@return The number of unconverted bytes left at the end of the input descriptor, +or a negative error value, as defined in TError. */ +EXPORT_C TInt CnvUtilities::ConvertToUnicodeFromModalForeign( + CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, + TDes16& aUnicode, + const TDesC8& aForeign, + TInt& aState, + TInt& aNumberOfUnconvertibleCharacters, + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, + const TArray& aArrayOfStates) + { + TUint notUsed; + return ConvertToUnicodeFromModalForeign(aDefaultEndiannessOfForeignCharacters, + aUnicode, + aForeign, + aState, + aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, + aArrayOfStates, + notUsed, + 0); + } + +/** @param aDefaultEndiannessOfForeignCharacters The default endian-ness for +the foreign characters. If an endian-ness for foreign characters is specified +in the conversion data, then that is used instead and the value of +aDefaultEndiannessOfForeignCharacters is ignored. +@param aUnicode On return, contains the text converted into Unicode. +@param aForeign The non-Unicode source text to be converted. +@param aState Used to store a modal character set encoding's current mode +across multiple calls to ConvertToUnicode() on the same input descriptor. This +argument should be passed the same object as passed to the plug-in's +ConvertToUnicode() exported function. +@param aNumberOfUnconvertibleCharacters On return, contains the number of +characters in aForeign which were not converted. Characters which cannot be +converted are output as Unicode replacement characters (0xfffd). +@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index +of the first byte of the first unconvertible character. For instance if the +first character in the input descriptor (aForeign) could not be converted, +then this parameter is set to the first byte of that character, i.e. zero. +A negative value is returned if all the characters were converted. +@param aArrayOfStates Array of character set conversion data objects, and their +escape sequences. There must be one or more modes in this array, none of the +modes can have any NULL member data, and each mode's escape sequence must +begin with KControlCharacterEscape (0x1b) or a panic occurs. +@param aOutputConversionFlags If the input descriptor ended in a truncated +sequence, e.g. a part of a multi-byte character, aOutputConversionFlags +returns with the EOutputConversionFlagInputIsTruncated flag set. +@param aInputConversionFlags Specify +CCnvCharacterSetConverter::EInputConversionFlagAppend to append the text to +aUnicode. Specify EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable +to prevent the function from returning the error-code EErrorIllFormedInput +when the input descriptor consists of nothing but a truncated sequence. The +CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter +flag must not be set, otherwise a panic occurs. +@return The number of unconverted bytes left at the end of the input descriptor, +or a negative error value, as defined in TError. */ +EXPORT_C TInt CnvUtilities::ConvertToUnicodeFromModalForeign( + CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, + TDes16& aUnicode, + const TDesC8& aForeign, + TInt& aState, + TInt& aNumberOfUnconvertibleCharacters, + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, + const TArray& aArrayOfStates, + TUint& aOutputConversionFlags, + TUint aInputConversionFlags) + { + __ASSERT_ALWAYS(~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter, Panic(EPanicBadInputConversionFlags2)); + CheckArrayOfStates(aArrayOfStates); + aNumberOfUnconvertibleCharacters=0; + aIndexOfFirstByteOfFirstUnconvertibleCharacter=-1; + aOutputConversionFlags=0; + TUint internalInputConversionFlags=aInputConversionFlags; + if (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend) + { + aUnicode.SetLength(0); + internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAppend; + } + if (aForeign.Length()==0) + { + return 0; + } + if (aUnicode.MaxLength()==aUnicode.Length()) // relies on the fact that aUnicode's length has been set to zero if aInputConversionFlags does not have CCnvCharacterSetConverter::EInputConversionFlagAppend set + { + return aForeign.Length(); + } + TPtrC8 remainderOfForeign(aForeign); + TPtrC8 homogeneousRun; + TInt numberOfForeignBytesConsumed=0; + const SCnvConversionData* conversionData = NULL; + const TInt startOfNextEscapeSequence=aForeign.Locate(KControlCharacterEscape); + if (startOfNextEscapeSequence!=0) // if aForeign doesn't start with an escape sequence... + { + conversionData=(aState!=CCnvCharacterSetConverter::KStateDefault)? REINTERPRET_CAST(const SCnvConversionData*, aState): aArrayOfStates[0].iConversionData; + if (startOfNextEscapeSequence==KErrNotFound) + { + homogeneousRun.Set(remainderOfForeign); + remainderOfForeign.Set(NULL, 0); + } + else + { + __ASSERT_DEBUG(startOfNextEscapeSequence>0, Panic(EPanicBadStartOfNextEscapeSequence)); + homogeneousRun.Set(remainderOfForeign.Left(startOfNextEscapeSequence)); + remainderOfForeign.Set(remainderOfForeign.Mid(startOfNextEscapeSequence)); + } + goto handleHomogeneousRun; + } + FOREVER + { + if (!NextHomogeneousForeignRun(conversionData, numberOfForeignBytesConsumed, homogeneousRun, remainderOfForeign, aArrayOfStates, aOutputConversionFlags)) + { + goto end; + } +handleHomogeneousRun: + if (conversionData==NULL) + { + return CCnvCharacterSetConverter::EErrorIllFormedInput; + } + TInt numberOfUnconvertibleCharacters; + TInt indexOfFirstByteOfFirstUnconvertibleCharacter; + const TInt returnValue=CCnvCharacterSetConverter::DoConvertToUnicode(*conversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, homogeneousRun, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, internalInputConversionFlags); + if (returnValue<0) + { + return returnValue; // this is an error-code + } + if (numberOfUnconvertibleCharacters>0) + { + if (aNumberOfUnconvertibleCharacters==0) + { + aIndexOfFirstByteOfFirstUnconvertibleCharacter=numberOfForeignBytesConsumed+indexOfFirstByteOfFirstUnconvertibleCharacter; + } + aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters; + } + numberOfForeignBytesConsumed+=homogeneousRun.Length(); + if (returnValue>0) + { + numberOfForeignBytesConsumed-=returnValue; + goto end; + } + if (numberOfForeignBytesConsumed>0) + { + internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable; + } + __ASSERT_DEBUG(remainderOfForeign==aForeign.Mid(numberOfForeignBytesConsumed), Panic(EPanicInconsistentNumberOfForeignBytesRemaining)); + } +end: + if ((numberOfForeignBytesConsumed==0) && (aOutputConversionFlags&CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated) && (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable)) + { + return CCnvCharacterSetConverter::EErrorIllFormedInput; + } + aState=REINTERPRET_CAST(TInt, conversionData); + return aForeign.Length()-numberOfForeignBytesConsumed; + } + + +/** Converts text from a non-modal complex character set encoding (e.g. +Shift-JIS or EUC-JP) into Unicode.The non-Unicode text specified in +aForeign is converted using the array of character set +conversion methods (aArrayOfMethods) provided by the +plug-in, and the converted text is returned in aUnicode. +Overwrites the contents, if any, of aUnicode. + +@param aDefaultEndiannessOfForeignCharacters The default endian-ness of the +foreign characters. If an endian-ness for foreign characters is specified +in the conversion data, then that is used instead and the value of +aDefaultEndiannessOfForeignCharacters is ignored. +@param aUnicode On return, contains the text converted into Unicode. +@param aForeign The non-Unicode source text to be converted. +@param aNumberOfUnconvertibleCharacters On return, contains the number of +characters in aForeign which were not converted. Characters which cannot be +converted are output as Unicode replacement characters (0xfffd). +@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index +of the first byte of the first unconvertible character. For instance if the +first character in the input descriptor (aForeign) could not be converted, +then this parameter is set to the first byte of that character, i.e. zero. +A negative value is returned if all the characters were converted. +@param aArrayOfMethods Array of conversion methods. There must be one or more +methods in this array and none of the methods in the array can have any NULL +member data or a panic occurs. +@return The number of unconverted bytes left at the end of the input descriptor, +or a negative error value, as defined in TError. */ +EXPORT_C TInt CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign( + CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, + TDes16& aUnicode, + const TDesC8& aForeign, + TInt& aNumberOfUnconvertibleCharacters, + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, + const TArray& aArrayOfMethods) + { + TUint notUsed; + return ConvertToUnicodeFromHeterogeneousForeign( + aDefaultEndiannessOfForeignCharacters, + aUnicode, + aForeign, + aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, + aArrayOfMethods, + notUsed, + 0); + } + +/** @param aDefaultEndiannessOfForeignCharacters The default endian-ness for the +foreign characters. If an endian-ness for foreign characters is specified +in the conversion data, then that is used instead and the value of +aDefaultEndiannessOfForeignCharacters is ignored. +@param aUnicode On return, contains the text converted into Unicode. +@param aForeign The non-Unicode source text to be converted. +@param aNumberOfUnconvertibleCharacters On return, contains the number of +characters in aForeign which were not converted. Characters which cannot be +converted are output as Unicode replacement characters (0xfffd). +@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index +of the first byte of the first unconvertible character. For instance if the +first character in the input descriptor (aForeign) could not be converted, +then this parameter is set to the first byte of that character, i.e. zero. +A negative value is returned if all the characters were converted. +@param aArrayOfMethods Array of conversion methods. There must be one or more +methods in this array and none of the methods in the array can have any NULL +member data or a panic occurs. +@param aOutputConversionFlags If the input descriptor ended in a truncated +sequence, e.g. a part of a multi-byte character, aOutputConversionFlags +returns with the EOutputConversionFlagInputIsTruncated flag set. +@param aInputConversionFlags Specify +CCnvCharacterSetConverter::EInputConversionFlagAppend to append the text to +aUnicode. Specify EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable +to prevent the function from returning the error-code EErrorIllFormedInput +when the input descriptor consists of nothing but a truncated sequence. The +CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter +flag must not be set, otherwise a panic occurs. +@return The number of unconverted bytes left at the end of the input descriptor, +or a negative error value, as defined in TError. */ +EXPORT_C TInt CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign( + CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, + TDes16& aUnicode, + const TDesC8& aForeign, + TInt& aNumberOfUnconvertibleCharacters, + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, + const TArray& aArrayOfMethods, + TUint& aOutputConversionFlags, + TUint aInputConversionFlags) + { + __ASSERT_ALWAYS(~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter, Panic(EPanicBadInputConversionFlags3)); + CheckArrayOfMethods(aArrayOfMethods); + aNumberOfUnconvertibleCharacters=0; + aIndexOfFirstByteOfFirstUnconvertibleCharacter=-1; + aOutputConversionFlags=0; + TUint internalInputConversionFlags=aInputConversionFlags; + if (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend) + { + aUnicode.SetLength(0); + internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAppend; + } + if (aForeign.Length()==0) + { + return 0; + } + if (aUnicode.MaxLength()==aUnicode.Length()) // relies on the fact that aUnicode's length has been set to zero if aInputConversionFlags does not have CCnvCharacterSetConverter::EInputConversionFlagAppend set + { + return aForeign.Length(); + } + const TInt numberOfMethods=aArrayOfMethods.Count(); + TPtrC8 remainderOfForeign(aForeign); + TInt numberOfForeignBytesConsumed=0; + FOREVER + { + TInt lengthOfRunToConvert=0; + const SMethod* method=NULL; + for (TInt i=0;;) + { + method=&aArrayOfMethods[i]; + __ASSERT_DEBUG(method!=NULL, Panic(EPanicBadMethodPointer)); + lengthOfRunToConvert=(*method->iNumberOfBytesAbleToConvert)(remainderOfForeign); + if (lengthOfRunToConvert<0) + { + return lengthOfRunToConvert; // this is an error-code + } + if (lengthOfRunToConvert>0) + { + break; + } + __ASSERT_DEBUG(i=numberOfMethods) + { + aOutputConversionFlags|=CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated; + goto end; + } + } + TBuf8 intermediateBuffer; + const TInt maximumUsableLengthOfIntermediateBuffer=ReduceToNearestMultipleOf(KMaximumLengthOfIntermediateBuffer, method->iNumberOfBytesPerCharacter); + FOREVER + { + const TInt numberOfForeignBytesConsumedThisTime=Min(lengthOfRunToConvert, maximumUsableLengthOfIntermediateBuffer); + intermediateBuffer=remainderOfForeign.Left(numberOfForeignBytesConsumedThisTime); + __ASSERT_DEBUG((numberOfForeignBytesConsumedThisTime%method->iNumberOfBytesPerCharacter)==0, Panic(EPanicDescriptorNotWholeNumberOfCharacters2)); + (*method->iConvertToIntermediateBufferInPlace)(intermediateBuffer); + __ASSERT_DEBUG((intermediateBuffer.Length()%method->iNumberOfCoreBytesPerCharacter)==0, Panic(EPanicDescriptorNotWholeNumberOfCharacters3)); + __ASSERT_DEBUG((intermediateBuffer.Length()/method->iNumberOfCoreBytesPerCharacter)*method->iNumberOfBytesPerCharacter==numberOfForeignBytesConsumedThisTime, Panic(EPanicBadMethodData1)); + TInt numberOfUnconvertibleCharacters; + TInt indexOfFirstByteOfFirstUnconvertibleCharacter; + const TInt returnValue=CCnvCharacterSetConverter::DoConvertToUnicode(*method->iConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, intermediateBuffer, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, internalInputConversionFlags); + if (returnValue<0) + { + return returnValue; // this is an error-code + } + if (numberOfUnconvertibleCharacters>0) + { + if (aNumberOfUnconvertibleCharacters==0) + { + aIndexOfFirstByteOfFirstUnconvertibleCharacter=numberOfForeignBytesConsumed+indexOfFirstByteOfFirstUnconvertibleCharacter; + } + aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters; + } + numberOfForeignBytesConsumed+=numberOfForeignBytesConsumedThisTime; + if (returnValue>0) + { + __ASSERT_DEBUG((returnValue%method->iNumberOfCoreBytesPerCharacter)==0, Panic(EPanicDescriptorNotWholeNumberOfCharacters4)); + numberOfForeignBytesConsumed-=(returnValue/method->iNumberOfCoreBytesPerCharacter)*method->iNumberOfBytesPerCharacter; + goto end; + } + if (numberOfForeignBytesConsumed>0) + { + internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable; + } + remainderOfForeign.Set(aForeign.Mid(numberOfForeignBytesConsumed)); + lengthOfRunToConvert-=numberOfForeignBytesConsumedThisTime; + __ASSERT_DEBUG(lengthOfRunToConvert>=0, Panic(EPanicBadLengthOfRunToConvert2)); + if (lengthOfRunToConvert<=0) + { + break; + } + } + } +end: + if ((numberOfForeignBytesConsumed==0) && (aOutputConversionFlags&CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated) && (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable)) + { + return CCnvCharacterSetConverter::EErrorIllFormedInput; + } + return aForeign.Length()-numberOfForeignBytesConsumed; + } + +void CnvUtilities::CheckArrayOfCharacterSets(const TArray& aArrayOfCharacterSets) + { + const TInt numberOfCharacterSets=aArrayOfCharacterSets.Count(); + __ASSERT_ALWAYS(numberOfCharacterSets>0, Panic(EPanicBadNumberOfCharacterSets)); + for (TInt i=0; i& aArrayOfStates) + { + const TInt numberOfStates=aArrayOfStates.Count(); + __ASSERT_ALWAYS(numberOfStates>0, Panic(EPanicBadNumberOfStates)); + for (TInt i=0; i& aArrayOfMethods) + { + const TInt numberOfMethods=aArrayOfMethods.Count(); + __ASSERT_ALWAYS(numberOfMethods>0, Panic(EPanicBadNumberOfMethods)); + for (TInt i=0; i0, Panic(EPanicBadMethodData2)); + __ASSERT_ALWAYS(method.iNumberOfCoreBytesPerCharacter>0, Panic(EPanicBadMethodData3)); + __ASSERT_ALWAYS(method.iNumberOfCoreBytesPerCharacter<=method.iNumberOfBytesPerCharacter, Panic(EPanicBadMethodData4)); + } + } + +TInt CnvUtilities::LengthOfUnicodeCharacter(const TDesC16& aUnicode, TInt aIndex) + { + const TUint unicodeCharacter=aUnicode[aIndex]; + if ((unicodeCharacter>=0xd800) && (unicodeCharacter<=0xdbff)) // if the unicode character is the first half of a surrogate-pair... + { + __ASSERT_DEBUG(aIndex+1=0xdc00) && (secondHalfOfSurrogatePair<=0xdfff), Panic(EPanicBadSurrogatePair2)); // this can be asserted as CCnvCharacterSetConverter::DoConvertFromUnicode should have returned an error value if this was a bad surrogate pair + return 2; + } + return 1; + } + +TBool CnvUtilities::NextHomogeneousForeignRun(const SCnvConversionData*& aConversionData, TInt& aNumberOfForeignBytesConsumed, TPtrC8& aHomogeneousRun, TPtrC8& aRemainderOfForeign, const TArray& aArrayOfStates, TUint& aOutputConversionFlags) + { + __ASSERT_DEBUG((aRemainderOfForeign.Length()==0) || (aRemainderOfForeign[0]==KControlCharacterEscape), Panic(EPanicBadRemainderOfForeign)); + FOREVER + { + if (aRemainderOfForeign.Length()==0) + { + return EFalse; + } + const TInt numberOfStates=aArrayOfStates.Count(); + TInt i; + for (i=0; i0) + { + return ETrue; + } + } + } + +TBool CnvUtilities::MatchesEscapeSequence(TInt& aNumberOfForeignBytesConsumed, TPtrC8& aHomogeneousRun, TPtrC8& aRemainderOfForeign, const TDesC8& aEscapeSequence) + { + const TInt lengthOfEscapeSequence=aEscapeSequence.Length(); + if (IsStartOf(aEscapeSequence, aRemainderOfForeign)) + { + aRemainderOfForeign.Set(aRemainderOfForeign.Mid(lengthOfEscapeSequence)); + const TInt startOfNextEscapeSequence=aRemainderOfForeign.Locate(KControlCharacterEscape); + if (startOfNextEscapeSequence==KErrNotFound) + { + aHomogeneousRun.Set(aRemainderOfForeign); + aRemainderOfForeign.Set(NULL, 0); + } + else + { + aHomogeneousRun.Set(aRemainderOfForeign.Left(startOfNextEscapeSequence)); + aRemainderOfForeign.Set(aRemainderOfForeign.Mid(startOfNextEscapeSequence)); + } + aNumberOfForeignBytesConsumed+=lengthOfEscapeSequence; + return ETrue; + } + return EFalse; + } + +TBool CnvUtilities::IsStartOf(const TDesC8& aStart, const TDesC8& aPotentiallyLongerDescriptor) + { + const TInt lengthOfStart=aStart.Length(); + return (aPotentiallyLongerDescriptor.Length()>=lengthOfStart) && (aPotentiallyLongerDescriptor.Left(lengthOfStart)==aStart); + } +