Update contrib.
2 * Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies).
4 * This component and the accompanying materials are made available
5 * under the terms of "Eclipse Public License v1.0"
6 * which accompanies this distribution, and is available
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
9 * Initial Contributors:
10 * Nokia Corporation - initial contribution.
22 #include <convutils.h>
// Sentinel meaning that no character set has been emitted yet; ConvertFromUnicode
// starts from this value unless EInputConversionFlagAssumeStartInDefaultCharacterSet is given.
24 const TInt KNoPreviousCharacterSet=-1;
// Index of the default character set within the array passed to ConvertFromUnicode.
25 const TInt KDefaultCharacterSet = 0;
// Escape control character (0x1b); ConvertToUnicodeFromModalForeign locates this byte
// to find where the next escape sequence starts.
26 const TUint KControlCharacterEscape=0x1b;
29 // A small KMaximumLengthOfIntermediateBuffer causes performance problems.
30 // Use the release version when testing performance cases.
// NOTE(review): two alternative values are defined below (5 and 150); presumably a
// debug/release conditional selects between them - confirm.
31 const TInt KMaximumLengthOfIntermediateBuffer=5;
33 const TInt KMaximumLengthOfIntermediateBuffer=150;
// Forward declaration of the conversion-data type referenced by the functions below.
36 struct SCnvConversionData;
// Panic category passed to User::Panic() by Panic().
38 _LIT(KLitPanicText, "CONVUTILS");
// Reason codes handed to Panic(), which raises them under the KLitPanicText category.
// The three EPanicBadInputConversionFlags codes are raised when a caller passes
// EInputConversionFlagStopAtFirstUnconvertibleCharacter to, respectively,
// ConvertFromUnicode, ConvertToUnicodeFromModalForeign and
// ConvertToUnicodeFromHeterogeneousForeign.
42 EPanicBadInputConversionFlags1=1,
43 EPanicBadInputConversionFlags2,
44 EPanicBadInputConversionFlags3,
45 EPanicBadNumberOfUnicodeElementsConsumed,
// Debug check in ConvertFromUnicode: the output descriptor must never get shorter.
46 EPanicAppendFlagViolated,
47 EPanicBadNumberOfUnicodeCharactersConverted,
48 EPanicBadNumberOfCharactersThatDroppedOut,
49 EPanicLoopCounterOverRun1,
50 EPanicLoopCounterOverRun2,
// Raised by ConvertFromIntermediateBufferInPlace when the run after the start position
// is not a whole number of characters.
51 EPanicDescriptorNotWholeNumberOfCharacters1,
52 EPanicDescriptorNotWholeNumberOfCharacters2,
53 EPanicDescriptorNotWholeNumberOfCharacters3,
54 EPanicDescriptorNotWholeNumberOfCharacters4,
55 EPanicBadStartOfNextEscapeSequence,
56 EPanicInconsistentNumberOfForeignBytesRemaining,
57 EPanicBadLengthOfRunToConvert1,
58 EPanicBadLengthOfRunToConvert2,
59 EPanicBadMethodPointer,
64 EPanicBadNumberOfCharacterSets,
65 EPanicBadConversionDataPointer1,
66 EPanicBadConversionDataPointer2,
67 EPanicBadConversionDataPointer3,
68 EPanicBadFunctionPointer1,
69 EPanicBadFunctionPointer2,
70 EPanicBadFunctionPointer3,
71 EPanicBadEscapeSequencePointer1,
72 EPanicBadEscapeSequencePointer2,
73 EPanicBadNumberOfStates,
74 EPanicBadEscapeSequenceStart,
75 EPanicBadNumberOfMethods,
76 EPanicBadSurrogatePair1,
77 EPanicBadSurrogatePair2,
78 EPanicBadRemainderOfForeign,
// Raised by ConvertFromUnicode when aForeign's maximum length is smaller than the
// default character set's escape sequence.
79 EPanicOutputDescriptorTooShortEvenToHoldEscapeSequenceToDefaultCharacterSet
// Raises a panic with reason aPanic under the KLitPanicText ("CONVUTILS") category.
82 LOCAL_C void Panic(TPanic aPanic)
84 User::Panic(KLitPanicText, aPanic);
87 /** Converts Unicode text into a complex foreign character set encoding. This
88 is an encoding which cannot be converted simply by calling
89 CCnvCharacterSetConverter::DoConvertFromUnicode(). It may be modal (e.g. JIS)
90 or non-modal (e.g. Shift-JIS).
92 The Unicode text specified in aUnicode is converted using the array of
93 conversion data objects (aArrayOfCharacterSets) provided by the plug-in for
94 the complex character set encoding, and the converted text is returned in
95 aForeign. Any existing contents in aForeign are overwritten.
97 Unlike CCnvCharacterSetConverter::DoConvertFromUnicode(), multiple character
98 sets can be specified. aUnicode is converted using the first character conversion
99 data object in the array. When a character is found which cannot be converted
100 using that data, each character set in the array is tried in turn. If it cannot
101 be converted using any object in the array, the index of the character is
102 appended to aIndicesOfUnconvertibleCharacters and the character is replaced
103 by aReplacementForUnconvertibleUnicodeCharacters.
105 If it can be converted using another object in the array, that object is used
106 to convert all subsequent characters until another unconvertible character
109 @param aDefaultEndiannessOfForeignCharacters The default endian-ness to use
110 when writing the characters in the foreign character set. If an endian-ness
111 for foreign characters is specified in the current conversion data object,
112 then that is used instead and the value of
113 aDefaultEndiannessOfForeignCharacters is ignored.
114 @param aReplacementForUnconvertibleUnicodeCharacters The single character (one
115 or more byte values) which is used to replace unconvertible characters.
116 @param aForeign On return, contains the converted text in the non-Unicode
118 @param aUnicode The source Unicode text to be converted.
119 @param aIndicesOfUnconvertibleCharacters On return, holds an ascending array
120 of the indices of each Unicode character in the source text which could not
121 be converted (because none of the target character sets have an equivalent
123 @param aArrayOfCharacterSets Array of character conversion data objects,
124 representing the character sets which comprise a complex character set
125 encoding. These are used in sequence to convert the Unicode text. There must
126 be at least one character set in this array and no character set may have any
127 NULL member data, or a panic occurs.
128 @return The number of unconverted characters left at the end of the input
129 descriptor (e.g. because aForeign was not long enough to hold all the text),
130 or a negative error value, as defined in CCnvCharacterSetConverter::TError. */
131 EXPORT_C TInt CnvUtilities::ConvertFromUnicode(
132 CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
133 const TDesC8& aReplacementForUnconvertibleUnicodeCharacters,
135 const TDesC16& aUnicode,
136 CCnvCharacterSetConverter::TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters,
137 const TArray<SCharacterSet>& aArrayOfCharacterSets)
// Thin wrapper: forwards its arguments to a ConvertFromUnicode overload that takes
// further trailing arguments after aArrayOfCharacterSets and returns that result
// unchanged. NOTE(review): presumably the overload with output/input conversion
// flags - confirm which flag values are passed.
140 return ConvertFromUnicode(aDefaultEndiannessOfForeignCharacters,
141 aReplacementForUnconvertibleUnicodeCharacters,
144 aIndicesOfUnconvertibleCharacters,
145 aArrayOfCharacterSets,
150 /** Converts Unicode text into a complex foreign character set encoding. This is
151 an encoding which cannot be converted simply by a call to
152 CCnvCharacterSetConverter::DoConvertFromUnicode(). It may be modal (e.g. JIS)
153 or non-modal (e.g. Shift-JIS).
155 The Unicode text specified in aUnicode is converted using the array of conversion
156 data objects (aArrayOfCharacterSets) provided by the plug-in for the complex
157 character set encoding and the converted text is returned in aForeign. The
158 function can either append to aForeign or overwrite its contents (if any).
160 Unlike CCnvCharacterSetConverter::DoConvertFromUnicode(), multiple character
161 sets can be specified. aUnicode is converted using the first character conversion
162 data object in the array. When a character is found which cannot be converted
163 using that data, each character set in the array is tried in turn. If it cannot
164 be converted using any object in the array, the index of the character is
165 appended to aIndicesOfUnconvertibleCharacters and the character is replaced
166 by aReplacementForUnconvertibleUnicodeCharacters.
168 If it can be converted using another object in the array, that object is used
169 to convert all subsequent characters until another unconvertible character
172 @param aDefaultEndiannessOfForeignCharacters The default endian-ness to use
173 when writing the characters in the foreign character set. If an endian-ness
174 for foreign characters is specified in the current conversion data object,
175 then that is used instead and the value of
176 aDefaultEndiannessOfForeignCharacters is ignored.
177 @param aReplacementForUnconvertibleUnicodeCharacters The single character (one
178 or more byte values) which is used to replace unconvertible characters.
179 @param aForeign On return, contains the converted text in the non-Unicode
180 character set. This may already contain some text. If it does, and if
181 aInputConversionFlags specifies EInputConversionFlagAppend, then the converted
182 text is appended to this descriptor.
183 @param aUnicode The source Unicode text to be converted.
184 @param aIndicesOfUnconvertibleCharacters On return, holds an ascending array
185 of the indices of each Unicode character in the source text which could not
186 be converted (because none of the target character sets have an equivalent
188 @param aArrayOfCharacterSets Array of character set data objects. These are
189 used in sequence to convert the Unicode text. There must be at least one
190 character set in this array and no character set may have any NULL member
191 data, or a panic occurs.
192 @param aOutputConversionFlags If the input descriptor ended in a truncated
193 sequence, e.g. the first half only of a Unicode surrogate pair, this returns
194 with the EOutputConversionFlagInputIsTruncated flag set.
195 @param aInputConversionFlags Specify
196 CCnvCharacterSetConverter::EInputConversionFlagAppend to append the text to
197 aForeign. Specify CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable
198 to prevent the function from returning the error-code EErrorIllFormedInput
199 when the input descriptor consists of nothing but a truncated sequence. The
200 CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter
201 flag must not be set, otherwise a panic occurs.
202 @return The number of unconverted characters left at the end of the input descriptor
203 (e.g. because aForeign was not long enough to hold all the text), or a negative
204 error value, as defined in CCnvCharacterSetConverter::TError. */
205 EXPORT_C TInt CnvUtilities::ConvertFromUnicode(
206 CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
207 const TDesC8& aReplacementForUnconvertibleUnicodeCharacters,
209 const TDesC16& aUnicode,
210 CCnvCharacterSetConverter::TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters,
211 const TArray<SCharacterSet>& aArrayOfCharacterSets,
212 TUint& aOutputConversionFlags,
213 TUint aInputConversionFlags)
// Callers may not request stop-at-first-unconvertible themselves: this function sets
// that flag on its own internal copy of the flags further down.
215 __ASSERT_ALWAYS(~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter, Panic(EPanicBadInputConversionFlags1));
216 CheckArrayOfCharacterSets(aArrayOfCharacterSets);
217 aOutputConversionFlags=0;
// internalInputConversionFlags is the flag set handed to DoConvertFromUnicode;
// aInputConversionFlags keeps the caller's original request.
218 TUint internalInputConversionFlags=aInputConversionFlags;
// Without the append flag: empty aForeign and set the append flag on the internal flags.
219 if (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend)
221 aForeign.SetLength(0);
222 internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAppend;
224 if (aUnicode.Length()==0)
// aForeign is already full: report the whole of aUnicode as unconverted.
228 if (aForeign.MaxLength()==aForeign.Length()) // relies on the fact that aForeign's length has been set to zero if aInputConversionFlags does not have CCnvCharacterSetConverter::EInputConversionFlagAppend set
230 return aUnicode.Length();
// "foreign" is the descriptor actually written to. By default it is aForeign itself.
232 TDes8* foreign=&aForeign;
233 TPtr8 dummyForeign(NULL, 0, 0);
// When the output must end in the default character set, room is reserved for that
// set's escape sequence: dummyMaximumLength is aForeign's maximum length minus the
// escape sequence length, and output then goes through dummyForeign, which is set
// over aForeign's buffer.
234 if (aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagMustEndInDefaultCharacterSet)
236 TInt dummyMaximumLength =
237 aForeign.MaxLength() - aArrayOfCharacterSets[KDefaultCharacterSet].iEscapeSequence->Length();
238 __ASSERT_ALWAYS(dummyMaximumLength >= 0,
239 Panic(EPanicOutputDescriptorTooShortEvenToHoldEscapeSequenceToDefaultCharacterSet));
240 dummyForeign.Set(const_cast <TUint8*> (aForeign.Ptr()),
243 foreign=&dummyForeign;
245 const TInt numberOfCharacterSets=aArrayOfCharacterSets.Count();
246 TInt numberOfUnicodeElementsConsumed=0;
247 internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter; // this is not just an optimization - it ensures that "foreign" doesn't get filled up too much each time CCnvCharacterSetConverter::DoConvertFromUnicode is called
// Character set of the most recently emitted run; starts as the default set only if
// the caller asked to assume so, otherwise as "none".
248 TInt previousCharacterSet = aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAssumeStartInDefaultCharacterSet?
249 KDefaultCharacterSet : KNoPreviousCharacterSet;
// Main loop: presentCharacterSet is the index of the character set being tried.
252 for (TInt presentCharacterSet=KDefaultCharacterSet;;)
254 __ASSERT_DEBUG(numberOfUnicodeElementsConsumed<=aUnicode.Length(), Panic(EPanicBadNumberOfUnicodeElementsConsumed));
// Checks whether all of aUnicode has been consumed.
255 if (numberOfUnicodeElementsConsumed>=aUnicode.Length())
259 const SCharacterSet& characterSet=aArrayOfCharacterSets[presentCharacterSet];
260 const TInt oldNumberOfBytesInForeign=foreign->Length();
// Once some input has been consumed, a truncated tail is no longer treated as
// not-even-partly-consumable input by the inner converter.
261 if (numberOfUnicodeElementsConsumed>0)
263 internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
// Convert the not-yet-consumed part of aUnicode with the present character set. An
// empty replacement (KNullDesC8) is passed; replacements are appended by this
// function itself further down.
265 CCnvCharacterSetConverter::TArrayOfAscendingIndices indicesOfUnconvertibleCharacters;
266 const TInt returnValue=CCnvCharacterSetConverter::DoConvertFromUnicode(*characterSet.iConversionData, aDefaultEndiannessOfForeignCharacters, KNullDesC8, *foreign, aUnicode.Mid(numberOfUnicodeElementsConsumed), indicesOfUnconvertibleCharacters, aOutputConversionFlags, internalInputConversionFlags);
269 return returnValue; // this is an error-code
271 __ASSERT_DEBUG(foreign->Length()>=oldNumberOfBytesInForeign, Panic(EPanicAppendFlagViolated));
272 TInt indexOfFirstUnconvertibleCharacter;
// No unconvertible character reported: everything except the returnValue trailing
// elements has been consumed, and -1 marks "none found".
273 if (indicesOfUnconvertibleCharacters.NumberOfIndices()==0)
275 indexOfFirstUnconvertibleCharacter=-1;
276 numberOfUnicodeElementsConsumed=aUnicode.Length()-returnValue;
// Otherwise advance only up to the first unconvertible character; its index is
// relative to the Mid() descriptor passed above, hence the "+=".
280 indexOfFirstUnconvertibleCharacter=indicesOfUnconvertibleCharacters[0];
281 numberOfUnicodeElementsConsumed+=indexOfFirstUnconvertibleCharacter;
282 __ASSERT_DEBUG(numberOfUnicodeElementsConsumed+LengthOfUnicodeCharacter(aUnicode, numberOfUnicodeElementsConsumed)==aUnicode.Length()-returnValue, Panic(EPanicBadNumberOfUnicodeCharactersConverted));
284 if (indexOfFirstUnconvertibleCharacter!=0) // if at least one Unicode character at the start of CCnvCharacterSetConverter::DoConvertFromUnicode's input descriptor was convertible...
286 TBool gotoEnd = EFalse;
// Only post-process if the conversion actually appended bytes to "foreign".
287 if (foreign->Length()>oldNumberOfBytesInForeign)
289 TInt numberOfCharactersThatDroppedOut=0;
290 // Insert an escape sequence if this character set is different from the last one.
291 if (presentCharacterSet != previousCharacterSet)
293 // Insert escape sequence (if required) in front of the last encoded run of text.
294 // Note that this may cause some characters to drop out at the end.
295 (*characterSet.iConvertFromIntermediateBufferInPlace)(oldNumberOfBytesInForeign, *foreign, numberOfCharactersThatDroppedOut);
// The present set becomes the "previous" one only if the run survived the in-place
// conversion, i.e. "foreign" is still longer than before this run.
296 if (oldNumberOfBytesInForeign < foreign->Length())
297 previousCharacterSet = presentCharacterSet;
// Characters that dropped out were not really written, so give them back to the input.
299 numberOfUnicodeElementsConsumed-=numberOfCharactersThatDroppedOut;
300 if (numberOfCharactersThatDroppedOut>0 )// if "foreign" has been filled to as much as it will hold...
305 if (indexOfFirstUnconvertibleCharacter<0) // if we've successfully converted up to the end of aUnicode (using *characterSet.iConversionData)...
// If required, finish by switching back to the default character set. The escape
// sequence is appended to aForeign itself, using the room reserved at the top.
311 if ( aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagMustEndInDefaultCharacterSet
312 && previousCharacterSet != KDefaultCharacterSet
313 && previousCharacterSet != KNoPreviousCharacterSet)
315 aForeign.SetLength(foreign->Length());
316 aForeign.Append(*aArrayOfCharacterSets[KDefaultCharacterSet].iEscapeSequence);
// Move on to the next character set in the array.
323 __ASSERT_DEBUG(presentCharacterSet<numberOfCharacterSets, Panic(EPanicLoopCounterOverRun1));
324 ++presentCharacterSet;
// Every character set has been tried: the character is unconvertible. Its index is
// recorded and the replacement is appended, provided there is room for both.
325 if (presentCharacterSet>=numberOfCharacterSets)
327 if ((foreign->MaxLength()-foreign->Length()<aReplacementForUnconvertibleUnicodeCharacters.Length()) ||
328 (aIndicesOfUnconvertibleCharacters.AppendIndex(numberOfUnicodeElementsConsumed)!=CCnvCharacterSetConverter::TArrayOfAscendingIndices::EAppendSuccessful)) // the tests must be done in this order as AppendIndex must only be called if there is room for aReplacementForUnconvertibleUnicodeCharacters
332 numberOfUnicodeElementsConsumed+=LengthOfUnicodeCharacter(aUnicode, numberOfUnicodeElementsConsumed);
333 foreign->Append(aReplacementForUnconvertibleUnicodeCharacters);
// "foreign" may be dummyForeign, which shares aForeign's buffer; copy its length
// across so that aForeign reflects what was written.
341 aForeign.SetLength(foreign->Length());
// Nothing consumed, input truncated, and the caller did not allow that: ill-formed.
344 if ((numberOfUnicodeElementsConsumed==0) && (aOutputConversionFlags&CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated) && (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable))
346 return CCnvCharacterSetConverter::EErrorIllFormedInput;
// Number of Unicode elements left unconverted at the end of the input.
348 return aUnicode.Length()-numberOfUnicodeElementsConsumed;
352 /** Inserts an escape sequence into the descriptor.
354 This function is provided to help in the implementation of
355 ConvertFromUnicode() for modal character set encodings.
356 Each SCharacterSet object in the array passed to
357 ConvertFromUnicode() must have its
358 iConvertFromIntermediateBufferInPlace member assigned. To
359 do this for a modal character set encoding, implement a function whose
360 signature matches that of FConvertFromIntermediateBufferInPlace
361 and which calls this function, passing all arguments unchanged, and
362 specifying the character set's escape sequence and the number of bytes per
365 @param aStartPositionInDescriptor The byte position in aDescriptor at which
366 the escape sequence is inserted. If the character set uses more than one byte
367 per character, this position must be the start of a character, otherwise a
369 @param aDescriptor The descriptor into which the escape sequence is inserted.
370 @param aNumberOfCharactersThatDroppedOut The escape sequence is inserted into
371 the start of aDescriptor and any characters that need to drop out to make
372 room for the escape sequence (because the descriptor's maximum length was
373 not long enough) drop out from the end of the buffer. This parameter indicates
374 the number of characters that needed to drop out.
375 @param aEscapeSequence The escape sequence for the character set.
376 @param aNumberOfBytesPerCharacter The number of bytes per character. */
377 EXPORT_C void CnvUtilities::ConvertFromIntermediateBufferInPlace(
378 TInt aStartPositionInDescriptor,
380 TInt& aNumberOfCharactersThatDroppedOut,
381 const TDesC8& aEscapeSequence,
382 TInt aNumberOfBytesPerCharacter)
// NOTE(review): aNumberOfBytesPerCharacter is used as a divisor below with no
// visible check for zero - confirm that callers always pass a positive value.
384 const TInt lengthOfDescriptor=aDescriptor.Length();
// The run after the start position must consist of whole characters.
385 __ASSERT_ALWAYS((lengthOfDescriptor-aStartPositionInDescriptor)%aNumberOfBytesPerCharacter==0, Panic(EPanicDescriptorNotWholeNumberOfCharacters1));
// Bytes of the escape sequence that do not fit into the descriptor's free space
// (never negative), rounded up to a whole number of characters.
386 aNumberOfCharactersThatDroppedOut=(Max(0, aEscapeSequence.Length()-(aDescriptor.MaxLength()-lengthOfDescriptor))+(aNumberOfBytesPerCharacter-1))/aNumberOfBytesPerCharacter;
387 const TInt lengthOfRunInCharacters=(lengthOfDescriptor-aStartPositionInDescriptor)/aNumberOfBytesPerCharacter;
388 if (aNumberOfCharactersThatDroppedOut>=lengthOfRunInCharacters) // ">=" is correct (rather than ">") as if there's only room for the escape sequence we don't want to have it in the descriptor
// The whole run drops out: cut the descriptor back to the start position.
390 aNumberOfCharactersThatDroppedOut=lengthOfRunInCharacters;
391 aDescriptor.SetLength(aStartPositionInDescriptor);
// Presumably the alternative branch: drop the surplus characters from the end, then
// insert the escape sequence at the start position.
395 aDescriptor.SetLength(lengthOfDescriptor-(aNumberOfCharactersThatDroppedOut*aNumberOfBytesPerCharacter));
396 aDescriptor.Insert(aStartPositionInDescriptor, aEscapeSequence);
401 /** Converts text from a modal foreign character set encoding into Unicode.
403 The non-Unicode text specified in aForeign is converted using
404 the array of character set conversion objects (aArrayOfStates)
405 provided by the plug-in, and the converted text is returned in
406 aUnicode. The function can either append to aUnicode
407 or overwrite its contents (if any), depending on the input conversion flags
408 specified. The first element in aArrayOfStates is taken to be
409 the default mode (i.e. the mode to assume by default if there is no preceding
412 @param aDefaultEndiannessOfForeignCharacters The default endian-ness of the
413 foreign characters. If an endian-ness for foreign characters is specified
414 in the conversion data, then that is used instead and the value of
415 aDefaultEndiannessOfForeignCharacters is ignored.
416 @param aUnicode On return, contains the text converted into Unicode.
417 @param aForeign The non-Unicode source text to be converted.
418 @param aState Used to store a modal character set encoding's current mode across
419 multiple calls to ConvertToUnicode() on the same input descriptor. This argument
420 should be passed the same object as passed to the plug-in's ConvertToUnicode()
422 @param aNumberOfUnconvertibleCharacters On return, contains the number of
423 characters in aForeign which were not converted. Characters which cannot be
424 converted are output as Unicode replacement characters (0xfffd).
425 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index
426 of the first byte of the first unconvertible character. For instance if the
427 first character in the input descriptor (aForeign) could not be converted,
428 then this parameter is set to the first byte of that character, i.e. zero.
429 A negative value is returned if all the characters were converted.
430 @param aArrayOfStates Array of character set conversion data objects, and their
431 escape sequences ("modes"). There must be one or more modes in this array,
432 none of the modes can have any NULL member data, and each mode's escape sequence
433 must begin with KControlCharacterEscape (0x1b) or a panic occurs.
434 @return The number of unconverted bytes left at the end of the input descriptor,
435 or a negative error value, as defined in TError. */
436 EXPORT_C TInt CnvUtilities::ConvertToUnicodeFromModalForeign(
437 CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
439 const TDesC8& aForeign,
441 TInt& aNumberOfUnconvertibleCharacters,
442 TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter,
443 const TArray<SState>& aArrayOfStates)
// Thin wrapper: forwards its arguments to a ConvertToUnicodeFromModalForeign
// overload and returns that result unchanged. NOTE(review): presumably the overload
// with output/input conversion flags - confirm which flag values are passed.
446 return ConvertToUnicodeFromModalForeign(aDefaultEndiannessOfForeignCharacters,
450 aNumberOfUnconvertibleCharacters,
451 aIndexOfFirstByteOfFirstUnconvertibleCharacter,
457 /** @param aDefaultEndiannessOfForeignCharacters The default endian-ness for
458 the foreign characters. If an endian-ness for foreign characters is specified
459 in the conversion data, then that is used instead and the value of
460 aDefaultEndiannessOfForeignCharacters is ignored.
461 @param aUnicode On return, contains the text converted into Unicode.
462 @param aForeign The non-Unicode source text to be converted.
463 @param aState Used to store a modal character set encoding's current mode
464 across multiple calls to ConvertToUnicode() on the same input descriptor. This
465 argument should be passed the same object as passed to the plug-in's
466 ConvertToUnicode() exported function.
467 @param aNumberOfUnconvertibleCharacters On return, contains the number of
468 characters in aForeign which were not converted. Characters which cannot be
469 converted are output as Unicode replacement characters (0xfffd).
470 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index
471 of the first byte of the first unconvertible character. For instance if the
472 first character in the input descriptor (aForeign) could not be converted,
473 then this parameter is set to the first byte of that character, i.e. zero.
474 A negative value is returned if all the characters were converted.
475 @param aArrayOfStates Array of character set conversion data objects, and their
476 escape sequences. There must be one or more modes in this array, none of the
477 modes can have any NULL member data, and each mode's escape sequence must
478 begin with KControlCharacterEscape (0x1b) or a panic occurs.
479 @param aOutputConversionFlags If the input descriptor ended in a truncated
480 sequence, e.g. a part of a multi-byte character, aOutputConversionFlags
481 returns with the EOutputConversionFlagInputIsTruncated flag set.
482 @param aInputConversionFlags Specify
483 CCnvCharacterSetConverter::EInputConversionFlagAppend to append the text to
484 aUnicode. Specify EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable
485 to prevent the function from returning the error-code EErrorIllFormedInput
486 when the input descriptor consists of nothing but a truncated sequence. The
487 CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter
488 flag must not be set, otherwise a panic occurs.
489 @return The number of unconverted bytes left at the end of the input descriptor,
490 or a negative error value, as defined in TError. */
491 EXPORT_C TInt CnvUtilities::ConvertToUnicodeFromModalForeign(
492 CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
494 const TDesC8& aForeign,
496 TInt& aNumberOfUnconvertibleCharacters,
497 TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter,
498 const TArray<SState>& aArrayOfStates,
499 TUint& aOutputConversionFlags,
500 TUint aInputConversionFlags)
// Callers may not pass EInputConversionFlagStopAtFirstUnconvertibleCharacter.
502 __ASSERT_ALWAYS(~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter, Panic(EPanicBadInputConversionFlags2));
503 CheckArrayOfStates(aArrayOfStates);
// Reset the outputs: no unconvertible characters yet, and -1 for "none found".
504 aNumberOfUnconvertibleCharacters=0;
505 aIndexOfFirstByteOfFirstUnconvertibleCharacter=-1;
506 aOutputConversionFlags=0;
// internalInputConversionFlags is the flag set handed to DoConvertToUnicode;
// aInputConversionFlags keeps the caller's original request.
507 TUint internalInputConversionFlags=aInputConversionFlags;
// Without the append flag: empty aUnicode and set the append flag on the internal flags.
508 if (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend)
510 aUnicode.SetLength(0);
511 internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAppend;
513 if (aForeign.Length()==0)
// aUnicode is already full: report the whole of aForeign as unconverted.
517 if (aUnicode.MaxLength()==aUnicode.Length()) // relies on the fact that aUnicode's length has been set to zero if aInputConversionFlags does not have CCnvCharacterSetConverter::EInputConversionFlagAppend set
519 return aForeign.Length();
// remainderOfForeign is the input not yet split into runs; homogeneousRun is the
// piece about to be converted with a single conversionData.
521 TPtrC8 remainderOfForeign(aForeign);
522 TPtrC8 homogeneousRun;
523 TInt numberOfForeignBytesConsumed=0;
524 const SCnvConversionData* conversionData = NULL;
// Position of the first escape character, or KErrNotFound if there is none.
525 const TInt startOfNextEscapeSequence=aForeign.Locate(KControlCharacterEscape);
526 if (startOfNextEscapeSequence!=0) // if aForeign doesn't start with an escape sequence...
// The leading bytes belong to the mode carried over in aState, which holds a
// conversion-data pointer stored as a TInt (written at the end of this function);
// KStateDefault selects the first state's conversion data instead.
528 conversionData=(aState!=CCnvCharacterSetConverter::KStateDefault)? REINTERPRET_CAST(const SCnvConversionData*, aState): aArrayOfStates[0].iConversionData;
// No escape character anywhere: the whole input is a single run.
529 if (startOfNextEscapeSequence==KErrNotFound)
531 homogeneousRun.Set(remainderOfForeign);
532 remainderOfForeign.Set(NULL, 0);
// Otherwise the run is everything before the escape character, and the remainder
// starts at the escape character.
536 __ASSERT_DEBUG(startOfNextEscapeSequence>0, Panic(EPanicBadStartOfNextEscapeSequence));
537 homogeneousRun.Set(remainderOfForeign.Left(startOfNextEscapeSequence));
538 remainderOfForeign.Set(remainderOfForeign.Mid(startOfNextEscapeSequence));
// Jump straight to converting this first run.
540 goto handleHomogeneousRun;
// Obtain the next run and its conversion data from the remainder of the input.
544 if (!NextHomogeneousForeignRun(conversionData, numberOfForeignBytesConsumed, homogeneousRun, remainderOfForeign, aArrayOfStates, aOutputConversionFlags))
548 handleHomogeneousRun:
// No conversion data is available for this run: the input is ill-formed.
549 if (conversionData==NULL)
551 return CCnvCharacterSetConverter::EErrorIllFormedInput;
553 TInt numberOfUnconvertibleCharacters;
554 TInt indexOfFirstByteOfFirstUnconvertibleCharacter;
// Convert the run into aUnicode using the current mode's conversion data.
555 const TInt returnValue=CCnvCharacterSetConverter::DoConvertToUnicode(*conversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, homogeneousRun, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, internalInputConversionFlags);
558 return returnValue; // this is an error-code
560 if (numberOfUnconvertibleCharacters>0)
// Record only the first unconvertible character's position, converted from
// run-relative to aForeign-relative by adding the bytes consumed before this run.
562 if (aNumberOfUnconvertibleCharacters==0)
564 aIndexOfFirstByteOfFirstUnconvertibleCharacter=numberOfForeignBytesConsumed+indexOfFirstByteOfFirstUnconvertibleCharacter;
566 aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters;
// Count the run as consumed, then take returnValue off again (presumably the number
// of bytes of the run that were left unconverted - confirm).
568 numberOfForeignBytesConsumed+=homogeneousRun.Length();
571 numberOfForeignBytesConsumed-=returnValue;
// Once some input has been consumed, a truncated tail is no longer treated as
// not-even-partly-consumable input by the inner converter.
574 if (numberOfForeignBytesConsumed>0)
576 internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
578 __ASSERT_DEBUG(remainderOfForeign==aForeign.Mid(numberOfForeignBytesConsumed), Panic(EPanicInconsistentNumberOfForeignBytesRemaining));
// Nothing consumed, input truncated, and the caller did not allow that: ill-formed.
581 if ((numberOfForeignBytesConsumed==0) && (aOutputConversionFlags&CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated) && (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable))
583 return CCnvCharacterSetConverter::EErrorIllFormedInput;
// Store the current mode in aState (the conversion-data pointer cast to a TInt) so
// that a later call can read it back.
585 aState=REINTERPRET_CAST(TInt, conversionData);
// Number of bytes left unconverted at the end of the input.
586 return aForeign.Length()-numberOfForeignBytesConsumed;
590 /** Converts text from a non-modal complex character set encoding (e.g.
591 Shift-JIS or EUC-JP) into Unicode. The non-Unicode text specified in
592 aForeign is converted using the array of character set
593 conversion methods (aArrayOfMethods) provided by the
594 plug-in, and the converted text is returned in aUnicode.
595 Overwrites the contents, if any, of aUnicode.
597 @param aDefaultEndiannessOfForeignCharacters The default endian-ness of the
598 foreign characters. If an endian-ness for foreign characters is specified
599 in the conversion data, then that is used instead and the value of
600 aDefaultEndiannessOfForeignCharacters is ignored.
601 @param aUnicode On return, contains the text converted into Unicode.
602 @param aForeign The non-Unicode source text to be converted.
603 @param aNumberOfUnconvertibleCharacters On return, contains the number of
604 characters in aForeign which were not converted. Characters which cannot be
605 converted are output as Unicode replacement characters (0xfffd).
606 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index
607 of the first byte of the first unconvertible character. For instance if the
608 first character in the input descriptor (aForeign) could not be converted,
609 then this parameter is set to the first byte of that character, i.e. zero.
610 A negative value is returned if all the characters were converted.
611 @param aArrayOfMethods Array of conversion methods. There must be one or more
612 methods in this array and none of the methods in the array can have any NULL
613 member data or a panic occurs.
614 @return The number of unconverted bytes left at the end of the input descriptor,
615 or a negative error value, as defined in TError. */
616 EXPORT_C TInt CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign(
617 CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
619 const TDesC8& aForeign,
620 TInt& aNumberOfUnconvertibleCharacters,
621 TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter,
622 const TArray<SMethod>& aArrayOfMethods)
// Thin wrapper: forwards its arguments to a ConvertToUnicodeFromHeterogeneousForeign
// overload and returns that result unchanged. NOTE(review): presumably the overload
// with output/input conversion flags - confirm which flag values are passed.
625 return ConvertToUnicodeFromHeterogeneousForeign(
626 aDefaultEndiannessOfForeignCharacters,
629 aNumberOfUnconvertibleCharacters,
630 aIndexOfFirstByteOfFirstUnconvertibleCharacter,
// Overview (derived from the body below): aForeign is converted in runs. For
// each run a method from aArrayOfMethods is selected by calling its
// iNumberOfBytesAbleToConvert on the unconsumed input; the run is then copied,
// piece by piece, into a fixed-size intermediate buffer, rewritten in place by
// the method's iConvertToIntermediateBufferInPlace, and handed to
// CCnvCharacterSetConverter::DoConvertToUnicode with the method's conversion data.
// NOTE(review): braces, loop headers and some short statements are not visible
// in this listing; comments below describe only the statements that are shown.
636 /** @param aDefaultEndiannessOfForeignCharacters The default endian-ness for the
637 foreign characters. If an endian-ness for foreign characters is specified
638 in the conversion data, then that is used instead and the value of
639 aDefaultEndiannessOfForeignCharacters is ignored.
640 @param aUnicode On return, contains the text converted into Unicode.
641 @param aForeign The non-Unicode source text to be converted.
642 @param aNumberOfUnconvertibleCharacters On return, contains the number of
643 characters in aForeign which were not converted. Characters which cannot be
644 converted are output as Unicode replacement characters (0xfffd).
645 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index
646 of the first byte of the first unconvertible character. For instance if the
647 first character in the input descriptor (aForeign) could not be converted,
648 then this parameter is set to the first byte of that character, i.e. zero.
649 A negative value is returned if all the characters were converted.
650 @param aArrayOfMethods Array of conversion methods. There must be one or more
651 methods in this array and none of the methods in the array can have any NULL
652 member data or a panic occurs.
653 @param aOutputConversionFlags If the input descriptor ended in a truncated
654 sequence, e.g. a part of a multi-byte character, aOutputConversionFlags
655 returns with the EOutputConversionFlagInputIsTruncated flag set.
656 @param aInputConversionFlags Specify
657 CCnvCharacterSetConverter::EInputConversionFlagAppend to append the text to
658 aUnicode. Specify EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable
659 to prevent the function from returning the error-code EErrorIllFormedInput
660 when the input descriptor consists of nothing but a truncated sequence. The
661 CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter
662 flag must not be set, otherwise a panic occurs.
663 @return The number of unconverted bytes left at the end of the input descriptor,
664 or a negative error value, as defined in TError. */
665 EXPORT_C TInt CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign(
666 CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
668 const TDesC8& aForeign,
669 TInt& aNumberOfUnconvertibleCharacters,
670 TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter,
671 const TArray<SMethod>& aArrayOfMethods,
672 TUint& aOutputConversionFlags,
673 TUint aInputConversionFlags)
// The stop-at-first-unconvertible-character flag must be clear; otherwise panic.
675 __ASSERT_ALWAYS(~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagStopAtFirstUnconvertibleCharacter, Panic(EPanicBadInputConversionFlags3));
// Validate every entry of aArrayOfMethods before any of them is used.
676 CheckArrayOfMethods(aArrayOfMethods);
// Reset the out-parameters: no unconvertible characters seen, index -1, no output flags.
677 aNumberOfUnconvertibleCharacters=0;
678 aIndexOfFirstByteOfFirstUnconvertibleCharacter=-1;
679 aOutputConversionFlags=0;
// Work on a private copy of the flags. If the caller did not request append,
// aUnicode is emptied once here and the append flag is then set on the copy,
// which is what every DoConvertToUnicode call below receives.
680 TUint internalInputConversionFlags=aInputConversionFlags;
681 if (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend)
683 aUnicode.SetLength(0);
684 internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAppend;
// Empty input. NOTE(review): the guarded statement is not shown; presumably it returns 0 - confirm.
686 if (aForeign.Length()==0)
// Output descriptor has no free space: report the whole of aForeign as unconverted.
690 if (aUnicode.MaxLength()==aUnicode.Length()) // relies on the fact that aUnicode's length has been set to zero if aInputConversionFlags does not have CCnvCharacterSetConverter::EInputConversionFlagAppend set
692 return aForeign.Length();
// remainderOfForeign tracks the part of aForeign not yet consumed;
// numberOfForeignBytesConsumed is the matching offset into aForeign (see the Set() call further down).
694 const TInt numberOfMethods=aArrayOfMethods.Count();
695 TPtrC8 remainderOfForeign(aForeign);
696 TInt numberOfForeignBytesConsumed=0;
// Per-run state: the length of the run and the method that will convert it.
699 TInt lengthOfRunToConvert=0;
700 const SMethod* method=NULL;
// Methods are tried by index i (the loop header is not shown): ask method i how
// many leading bytes of the remainder it is able to convert.
703 method=&aArrayOfMethods[i];
704 __ASSERT_DEBUG(method!=NULL, Panic(EPanicBadMethodPointer));
705 lengthOfRunToConvert=(*method->iNumberOfBytesAbleToConvert)(remainderOfForeign);
// A negative result is an error code and is passed straight back to the caller.
706 if (lengthOfRunToConvert<0)
708 return lengthOfRunToConvert; // this is an error-code
// A positive result means this method takes the run.
// NOTE(review): the guarded statement is not shown; presumably it leaves the method search - confirm.
710 if (lengthOfRunToConvert>0)
714 __ASSERT_DEBUG(i<numberOfMethods, Panic(EPanicLoopCounterOverRun2));
// Index has run past the last method: flag the input as truncated.
// NOTE(review): what follows the flag update (how the loops are left) is not shown.
716 if (i>=numberOfMethods)
718 aOutputConversionFlags|=CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated;
// The run is converted through a buffer of at most KMaximumLengthOfIntermediateBuffer bytes.
// ReduceToNearestMultipleOf is defined elsewhere; its result is expected to be a
// multiple of the method's iNumberOfBytesPerCharacter (see the debug assertion below).
722 TBuf8<KMaximumLengthOfIntermediateBuffer> intermediateBuffer;
723 const TInt maximumUsableLengthOfIntermediateBuffer=ReduceToNearestMultipleOf(KMaximumLengthOfIntermediateBuffer, method->iNumberOfBytesPerCharacter);
// Take as much of the run as fits and copy it into the intermediate buffer.
726 const TInt numberOfForeignBytesConsumedThisTime=Min(lengthOfRunToConvert, maximumUsableLengthOfIntermediateBuffer);
727 intermediateBuffer=remainderOfForeign.Left(numberOfForeignBytesConsumedThisTime);
// Debug checks around the in-place rewrite: whole characters going in, whole
// "core" characters coming out, and the same number of characters on both sides.
728 __ASSERT_DEBUG((numberOfForeignBytesConsumedThisTime%method->iNumberOfBytesPerCharacter)==0, Panic(EPanicDescriptorNotWholeNumberOfCharacters2));
729 (*method->iConvertToIntermediateBufferInPlace)(intermediateBuffer);
730 __ASSERT_DEBUG((intermediateBuffer.Length()%method->iNumberOfCoreBytesPerCharacter)==0, Panic(EPanicDescriptorNotWholeNumberOfCharacters3));
731 __ASSERT_DEBUG((intermediateBuffer.Length()/method->iNumberOfCoreBytesPerCharacter)*method->iNumberOfBytesPerCharacter==numberOfForeignBytesConsumedThisTime, Panic(EPanicBadMethodData1));
// Convert this piece; the unconvertible-character results for the piece are kept
// in locals and merged into the caller's out-parameters below.
732 TInt numberOfUnconvertibleCharacters;
733 TInt indexOfFirstByteOfFirstUnconvertibleCharacter;
734 const TInt returnValue=CCnvCharacterSetConverter::DoConvertToUnicode(*method->iConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, intermediateBuffer, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, internalInputConversionFlags);
// NOTE(review): the condition guarding this return is not shown; per the trailing
// comment, returnValue is an error code on this path.
737 return returnValue; // this is an error-code
// Merge the per-piece results: the index is recorded only the first time any
// unconvertible character is seen, offset by the bytes of aForeign consumed so far.
// NOTE(review): indexOfFirstByteOfFirstUnconvertibleCharacter is an offset into
// intermediateBuffer and is added unscaled to a count of aForeign bytes - confirm this
// is intended when iNumberOfCoreBytesPerCharacter differs from iNumberOfBytesPerCharacter.
739 if (numberOfUnconvertibleCharacters>0)
741 if (aNumberOfUnconvertibleCharacters==0)
743 aIndexOfFirstByteOfFirstUnconvertibleCharacter=numberOfForeignBytesConsumed+indexOfFirstByteOfFirstUnconvertibleCharacter;
745 aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters;
// Count the whole piece as consumed...
747 numberOfForeignBytesConsumed+=numberOfForeignBytesConsumedThisTime;
// ...then give back returnValue intermediate-buffer bytes, rescaled from
// core bytes per character to foreign bytes per character.
// (presumably returnValue is the number of bytes DoConvertToUnicode left unconverted - confirm)
750 __ASSERT_DEBUG((returnValue%method->iNumberOfCoreBytesPerCharacter)==0, Panic(EPanicDescriptorNotWholeNumberOfCharacters4));
751 numberOfForeignBytesConsumed-=(returnValue/method->iNumberOfCoreBytesPerCharacter)*method->iNumberOfBytesPerCharacter;
// Once some input has been consumed, later DoConvertToUnicode calls are allowed
// to receive a truncated sequence that cannot even be partly consumed.
754 if (numberOfForeignBytesConsumed>0)
756 internalInputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
// Re-point the remainder at the first unconsumed byte and shrink the current run.
758 remainderOfForeign.Set(aForeign.Mid(numberOfForeignBytesConsumed));
759 lengthOfRunToConvert-=numberOfForeignBytesConsumedThisTime;
760 __ASSERT_DEBUG(lengthOfRunToConvert>=0, Panic(EPanicBadLengthOfRunToConvert2));
// Run exhausted. NOTE(review): the guarded statement is not shown.
761 if (lengthOfRunToConvert<=0)
// Nothing at all was consumed, the input was flagged as truncated, and the caller
// did not set the allow-truncated flag: report ill-formed input.
768 if ((numberOfForeignBytesConsumed==0) && (aOutputConversionFlags&CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated) && (~aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable))
770 return CCnvCharacterSetConverter::EErrorIllFormedInput;
// Normal exit: number of bytes of aForeign left unconsumed.
772 return aForeign.Length()-numberOfForeignBytesConsumed;
// Validates aArrayOfCharacterSets using __ASSERT_ALWAYS: panics if the array is
// empty, or if any entry has a NULL iConversionData,
// iConvertFromIntermediateBufferInPlace or iEscapeSequence. Returns nothing.
775 void CnvUtilities::CheckArrayOfCharacterSets(const TArray<SCharacterSet>& aArrayOfCharacterSets)
// At least one character set is required.
777 const TInt numberOfCharacterSets=aArrayOfCharacterSets.Count();
778 __ASSERT_ALWAYS(numberOfCharacterSets>0, Panic(EPanicBadNumberOfCharacterSets));
// Every entry must have all three pointers set.
779 for (TInt i=0; i<numberOfCharacterSets; ++i)
781 const SCharacterSet& characterSet=aArrayOfCharacterSets[i];
782 __ASSERT_ALWAYS(characterSet.iConversionData!=NULL, Panic(EPanicBadConversionDataPointer1));
783 __ASSERT_ALWAYS(characterSet.iConvertFromIntermediateBufferInPlace!=NULL, Panic(EPanicBadFunctionPointer1));
784 __ASSERT_ALWAYS(characterSet.iEscapeSequence!=NULL, Panic(EPanicBadEscapeSequencePointer1));
// Validates aArrayOfStates using __ASSERT_ALWAYS: panics if the array is empty,
// or if any entry has a NULL iEscapeSequence, an escape sequence whose first
// byte is not KControlCharacterEscape, or a NULL iConversionData.
788 void CnvUtilities::CheckArrayOfStates(const TArray<SState>& aArrayOfStates)
// At least one state is required.
790 const TInt numberOfStates=aArrayOfStates.Count();
791 __ASSERT_ALWAYS(numberOfStates>0, Panic(EPanicBadNumberOfStates));
792 for (TInt i=0; i<numberOfStates; ++i)
794 const SState& state=aArrayOfStates[i];
// The pointer is checked before it is dereferenced on the next line.
795 __ASSERT_ALWAYS(state.iEscapeSequence!=NULL, Panic(EPanicBadEscapeSequencePointer2));
// Each state's escape sequence has to begin with the escape control character.
796 __ASSERT_ALWAYS((*state.iEscapeSequence)[0]==KControlCharacterEscape, Panic(EPanicBadEscapeSequenceStart));
797 __ASSERT_ALWAYS(state.iConversionData!=NULL, Panic(EPanicBadConversionDataPointer2));
// Validates aArrayOfMethods using __ASSERT_ALWAYS: panics if the array is empty,
// or if any entry has a NULL function pointer or conversion-data pointer, a
// non-positive byte count, or more core bytes per character than bytes per character.
801 void CnvUtilities::CheckArrayOfMethods(const TArray<SMethod>& aArrayOfMethods)
// At least one method is required.
803 const TInt numberOfMethods=aArrayOfMethods.Count();
804 __ASSERT_ALWAYS(numberOfMethods>0, Panic(EPanicBadNumberOfMethods));
805 for (TInt i=0; i<numberOfMethods; ++i)
807 const SMethod& method=aArrayOfMethods[i];
// Both callbacks and the conversion data must be present.
808 __ASSERT_ALWAYS(method.iNumberOfBytesAbleToConvert!=NULL, Panic(EPanicBadFunctionPointer2));
809 __ASSERT_ALWAYS(method.iConvertToIntermediateBufferInPlace!=NULL, Panic(EPanicBadFunctionPointer3));
810 __ASSERT_ALWAYS(method.iConversionData!=NULL, Panic(EPanicBadConversionDataPointer3));
// Character widths: both strictly positive, and core width no larger than full width.
811 __ASSERT_ALWAYS(method.iNumberOfBytesPerCharacter>0, Panic(EPanicBadMethodData2));
812 __ASSERT_ALWAYS(method.iNumberOfCoreBytesPerCharacter>0, Panic(EPanicBadMethodData3));
813 __ASSERT_ALWAYS(method.iNumberOfCoreBytesPerCharacter<=method.iNumberOfBytesPerCharacter, Panic(EPanicBadMethodData4));
// Inspects the UTF-16 code unit of aUnicode at aIndex and tests whether it is the
// first half of a surrogate pair (0xd800..0xdbff).
// NOTE(review): the return statements are not visible in this listing; presumably
// the result is 2 for a surrogate pair and 1 otherwise - confirm.
817 TInt CnvUtilities::LengthOfUnicodeCharacter(const TDesC16& aUnicode, TInt aIndex)
819 const TUint unicodeCharacter=aUnicode[aIndex];
820 if ((unicodeCharacter>=0xd800) && (unicodeCharacter<=0xdbff)) // if the unicode character is the first half of a surrogate-pair...
// Debug-only check that a second code unit exists after the first half.
822 __ASSERT_DEBUG(aIndex+1<aUnicode.Length(), Panic(EPanicBadSurrogatePair1));
824 const TUint secondHalfOfSurrogatePair=aUnicode[aIndex+1];
// Debug-only check that the second code unit is in the range 0xdc00..0xdfff.
826 __ASSERT_DEBUG((secondHalfOfSurrogatePair>=0xdc00) && (secondHalfOfSurrogatePair<=0xdfff), Panic(EPanicBadSurrogatePair2)); // this can be asserted as CCnvCharacterSetConverter::DoConvertFromUnicode should have returned an error value if this was a bad surrogate pair
// Looks at the escape sequence at the front of aRemainderOfForeign and tries to
// match it against the states in aArrayOfStates. On a match, aConversionData is
// set to that state's conversion data (MatchesEscapeSequence updates the run,
// the remainder and the consumed-byte count). If the remainder is only a prefix
// of some state's escape sequence, the input-is-truncated output flag is set.
// NOTE(review): loop exits and return statements are not visible in this listing,
// so the exact TBool results and the outer looping are not described here.
832 TBool CnvUtilities::NextHomogeneousForeignRun(const SCnvConversionData*& aConversionData, TInt& aNumberOfForeignBytesConsumed, TPtrC8& aHomogeneousRun, TPtrC8& aRemainderOfForeign, const TArray<SState>& aArrayOfStates, TUint& aOutputConversionFlags)
// Debug precondition: the remainder is empty or starts with the escape control character.
834 __ASSERT_DEBUG((aRemainderOfForeign.Length()==0) || (aRemainderOfForeign[0]==KControlCharacterEscape), Panic(EPanicBadRemainderOfForeign));
// No input left. NOTE(review): the guarded statement is not shown.
837 if (aRemainderOfForeign.Length()==0)
841 const TInt numberOfStates=aArrayOfStates.Count();
// First pass: find a state whose complete escape sequence starts the remainder.
843 for (i=0; i<numberOfStates; ++i)
845 const SState& state=aArrayOfStates[i];
846 if (MatchesEscapeSequence(aNumberOfForeignBytesConsumed, aHomogeneousRun, aRemainderOfForeign, *state.iEscapeSequence))
848 aConversionData=state.iConversionData;
// Second pass: check whether the remainder is itself the start of some state's
// escape sequence, i.e. the escape sequence has been cut short.
852 for (i=0; i<numberOfStates; ++i)
854 if (IsStartOf(aRemainderOfForeign, *aArrayOfStates[i].iEscapeSequence))
856 // aRemainderOfForeign ends with a truncated escape sequence, so ConvertToUnicode cannot convert any more
857 aOutputConversionFlags|=CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated;
861 // force ConvertToUnicode to return CCnvCharacterSetConverter::EErrorIllFormedInput
862 aConversionData=NULL;
// Tests whether the matched run is non-empty. NOTE(review): the guarded statement is not shown.
865 if (aHomogeneousRun.Length()>0)
// Tests whether aRemainderOfForeign begins with aEscapeSequence. When it does,
// the escape sequence is removed from the front of the remainder, the text up
// to the next escape control character (or to the end) becomes aHomogeneousRun,
// the remainder is advanced past that run, and aNumberOfForeignBytesConsumed
// grows by the length of the escape sequence (the run itself is not counted here).
// NOTE(review): the branch keyword between the two Set() pairs and the return
// statements are not visible in this listing.
872 TBool CnvUtilities::MatchesEscapeSequence(TInt& aNumberOfForeignBytesConsumed, TPtrC8& aHomogeneousRun, TPtrC8& aRemainderOfForeign, const TDesC8& aEscapeSequence)
874 const TInt lengthOfEscapeSequence=aEscapeSequence.Length();
875 if (IsStartOf(aEscapeSequence, aRemainderOfForeign))
// Drop the matched escape sequence, then look for the start of the next one.
877 aRemainderOfForeign.Set(aRemainderOfForeign.Mid(lengthOfEscapeSequence));
878 const TInt startOfNextEscapeSequence=aRemainderOfForeign.Locate(KControlCharacterEscape);
879 if (startOfNextEscapeSequence==KErrNotFound)
// No further escape character: the run is everything that is left, and the remainder becomes empty.
881 aHomogeneousRun.Set(aRemainderOfForeign);
882 aRemainderOfForeign.Set(NULL, 0);
// Split at the next escape character: the run ends just before it, and the remainder starts at it.
886 aHomogeneousRun.Set(aRemainderOfForeign.Left(startOfNextEscapeSequence));
887 aRemainderOfForeign.Set(aRemainderOfForeign.Mid(startOfNextEscapeSequence));
889 aNumberOfForeignBytesConsumed+=lengthOfEscapeSequence;
// Returns whether aPotentiallyLongerDescriptor begins with aStart: it must be at
// least as long as aStart, and its leading aStart.Length() bytes must compare
// equal to aStart.
895 TBool CnvUtilities::IsStartOf(const TDesC8& aStart, const TDesC8& aPotentiallyLongerDescriptor)
897 const TInt lengthOfStart=aStart.Length();
// The length test comes first, so Left() is only asked for a length the descriptor actually has.
898 return (aPotentiallyLongerDescriptor.Length()>=lengthOfStart) && (aPotentiallyLongerDescriptor.Left(lengthOfStart)==aStart);