Update contrib.
2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
4 * This component and the accompanying materials are made available
5 * under the terms of "Eclipse Public License v1.0"
6 * which accompanies this distribution, and is available
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
9 * Initial Contributors:
10 * Nokia Corporation - initial contribution.
19 #if !defined(__CHARCONV_H__)
20 #define __CHARCONV_H__
22 #if !defined(__E32STD_H__)
26 #if !defined(__E32BASE_H__)
31 The maximum length in bytes of the replacement text for unconvertible Unicode
32 characters (=50) (see CCnvCharacterSetConverter::SetReplacementForUnconvertibleUnicodeCharactersL()).
// This value is also the capacity of the TBuf8 data member
// iReplacementForUnconvertibleUnicodeCharacters of CCnvCharacterSetConverter,
// so it bounds how many bytes of replacement text that member can hold.
36 const TInt KMaximumLengthOfReplacementForUnconvertibleUnicodeCharacters=50;
// Character set identifiers (UIDs) for the UTF-7 / UTF-8 variants named by the
// constants. Values of this kind are presumably what the
// TUint aCharacterSetIdentifier parameters of CCnvCharacterSetConverter
// (e.g. PrepareToConvertToOrFromL()) expect - confirm against the implementation.
43 const TUint KCharacterSetIdentifierUtf7=0x1000582c;
49 const TUint KCharacterSetIdentifierUtf8=0x1000582d;
55 const TUint KCharacterSetIdentifierImapUtf7=0x1000582e;
61 const TUint KCharacterSetIdentifierJavaConformantUtf8=0x1000582f;
// Character set identifiers for Code Page 1252, the ISO 8859 parts named by the
// constants (1-10 and 13-15), ASCII and SMS 7-bit.
// The hexadecimal values are UIDs and are not numerically ordered by part
// number; do not derive one identifier from another arithmetically.
67 const TUint KCharacterSetIdentifierCodePage1252=0x100012b6;
73 const TUint KCharacterSetIdentifierIso88591=0x10003b10;
79 const TUint KCharacterSetIdentifierIso88592=0x1000507e;
85 const TUint KCharacterSetIdentifierIso88593=0x10008a28;
91 const TUint KCharacterSetIdentifierIso88594=0x1000507f;
97 const TUint KCharacterSetIdentifierIso88595=0x10005080;
103 const TUint KCharacterSetIdentifierIso88596=0x10008a29;
109 const TUint KCharacterSetIdentifierIso88597=0x10005081;
115 const TUint KCharacterSetIdentifierIso88598=0x10008a2a;
121 const TUint KCharacterSetIdentifierIso88599=0x10005082;
127 const TUint KCharacterSetIdentifierIso885910=0x10008a2b;
133 const TUint KCharacterSetIdentifierIso885913=0x10008a2c;
139 const TUint KCharacterSetIdentifierIso885914=0x10008a2d;
145 const TUint KCharacterSetIdentifierIso885915=0x10008a2e;
151 const TUint KCharacterSetIdentifierAscii=0x10004cc6;
157 const TUint KCharacterSetIdentifierSms7Bit=0x100053ab;
// Character set identifiers for the GB2312 / HZ / GB12345 / GBK / GB18030 / Big5
// group, the Shift-JIS / ISO-2022-JP / JIS / EUC-JP group, and the J5 and CP850
// character sets, as named by the constants.
// NOTE(review): what "J5" denotes is not described anywhere in this view.
163 const TUint KCharacterSetIdentifierGb2312=0x10000fbe;
169 const TUint KCharacterSetIdentifierHz=0x10006065;
175 const TUint KCharacterSetIdentifierGb12345=0x1000401a;
181 const TUint KCharacterSetIdentifierGbk=0x10003ecb;
187 const TUint KCharacterSetIdentifierGb18030=0x10287038;
193 const TUint KCharacterSetIdentifierBig5=0x10000fbf;
199 const TUint KCharacterSetIdentifierShiftJis=0x10000fbd;
205 const TUint KCharacterSetIdentifierIso2022Jp=0x100066a0;
211 const TUint KCharacterSetIdentifierIso2022Jp1=0x100066a3;
217 const TUint KCharacterSetIdentifierJis=0x10006066;
223 const TUint KCharacterSetIdentifierEucJpPacked=0x10006067;
230 const TUint KCharacterSetIdentifierJ5=0x1020D408;
236 const TUint KCharacterSetIdentifierCP850=0x102825AD;
// Identifiers for Unicode-encoded character sets. The byte order of the first
// two is stated in their trailing comments; no byte order is stated here for
// the UCS-2 identifier.
238 const TUint KCharacterSetIdentifierUnicodeLittle=0x101f3fae; //Little Endian Unicode
239 const TUint KCharacterSetIdentifierUnicodeBig=0x101f4052; // Big Endian Unicode
240 const TUint KCharacterSetIdentifierUcs2=0x101ff492;
// Identifiers for the extended SMS 7-bit character set and its Turkish,
// Portuguese and Spanish variants ("Single", "Locking" and
// "LockingAndSingle" as named by the constants), followed by CP949.
// The UID values within a language group are consecutive, but the groups
// themselves are not contiguous (0x10286400 is followed by 0x10286407).
248 const TUint KCharacterSetIdentifierExtendedSms7Bit=0x102863FD;
255 const TUint KCharacterSetIdentifierTurkishSingleSms7Bit=0x102863FE;
256 const TUint KCharacterSetIdentifierTurkishLockingSms7Bit=0x102863FF;
257 const TUint KCharacterSetIdentifierTurkishLockingAndSingleSms7Bit=0x10286400;
264 const TUint KCharacterSetIdentifierPortugueseSingleSms7Bit=0x10286407;
265 const TUint KCharacterSetIdentifierPortugueseLockingSms7Bit=0x10286408;
266 const TUint KCharacterSetIdentifierPortugueseLockingAndSingleSms7Bit=0x10286409;
273 const TUint KCharacterSetIdentifierSpanishSingleSms7Bit=0x1028640A;
281 const TUint KCharacterSetIdentifierCP949=0x200100FF;
284 Shift-JIS with Pictograph
// Further character set identifiers. Where a one-line description precedes a
// constant it names the encoding; EUCKR, Iscii, Iso2022kr and KOI8R carry no
// description in this view beyond their identifier names.
288 const TUint KCharacterSetIdentifierShiftJisDirectmap=0x101F8691;
291 EUC-JP with direct mapped pictograph
295 const TUint KCharacterSetIdentifierEucJpDirectmap=0x101F86A6;
302 const TUint KCharacterSetIdentifierEUCKR=0x2000E526;
309 const TUint KCharacterSetIdentifierIscii=0x1027508E;
316 const TUint KCharacterSetIdentifierIso2022kr=0x20010101;
323 const TUint KCharacterSetIdentifierKOI8R=0x101F8778;
326 KOI8-U Belarusian/Ukrainian Cyrillic
// KOI8-U, KSC 5601, TIS-620 and the Windows code page identifiers.
// Each one-line description applies to the constant that follows it;
// Ksc5601, TIS_620, Win874, Win1253 and Win1258 have no description in this
// view beyond their identifier names.
330 const TUint KCharacterSetIdentifierKOI8U=0x101F8761;
337 const TUint KCharacterSetIdentifierKsc5601=0x200113CD;
344 const TUint KCharacterSetIdentifierTIS_620=0x101F8549;
351 const TUint KCharacterSetIdentifierWin874=0x101F854A;
354 Code page 1250 Eastern European
358 const TUint KCharacterSetIdentifierWin1250=0x100059D6;
361 Code page 1251 Cyrillic
365 const TUint KCharacterSetIdentifierWin1251=0x100059D7;
372 const TUint KCharacterSetIdentifierWin1253=0x100059D8;
375 Code page 1254 Turkish
379 const TUint KCharacterSetIdentifierWin1254=0x100059D9;
382 Code page 1255 Hebrew
386 const TUint KCharacterSetIdentifierWin1255=0x101F8547;
389 Code page 1256 Arabic
393 const TUint KCharacterSetIdentifierWin1256=0x101F8548;
396 Code page 1257 Baltic
400 const TUint KCharacterSetIdentifierWin1257=0x100059DA;
407 const TUint KCharacterSetIdentifierWin1258=0x102073B8;
409 // note that other character sets than those listed above may be available at run-time, and also that none of the above are necessarily available at run-time
// Forward declarations only; none of these types is defined in this view.
// CCnvCharacterSetConverter uses them by pointer or reference
// (iConversionData, iStandardNamesAndMibEnums, iCharsetCnvCache) and names
// CDeepDestructingArrayOfCharactersSets as a friend of SCharacterSet.
411 struct SCnvConversionData;
412 class CDeepDestructingArrayOfCharactersSets;
414 class CStandardNamesAndMibEnums;
416 class CCharsetCnvCache;
418 Converts text between Unicode and other character sets.
420 The first stage of the conversion is to specify the non-Unicode character
421 set being converted to or from. This is done by calling one of the overloads
422 of PrepareToConvertToOrFromL().
424 The second stage is to convert the text, using one of the overloads of
425 ConvertFromUnicode() or ConvertToUnicode().
427 Where possible the first documented overload of PrepareToConvertToOrFromL()
428 should be used because the second overload panics if the specified character
429 set is not available: the first overload simply returns whether the character
430 set is available or not available. However if the conversions are to be
431 performed often, or if the user must select the character set for the
432 conversion from a list, the second overload may be more appropriate.
434 The first overload is less efficient than the second, because it searches
435 through the file system for the selected character set every time it is invoked.
436 The second overload searches through an array of all available character sets.
437 In this method, the file system need only be searched once - when
438 CreateArrayOfCharacterSetsAvailableLC() or
439 CreateArrayOfCharacterSetsAvailableL() is used to create the array.
441 The conversion functions allow users of this class to perform partial
442 conversions on an input descriptor, handling the situation where the input
443 descriptor is truncated mid way through a multi-byte character. This means
444 that you do not have to guess how big to make the output descriptor for a
445 given input descriptor, you can simply do the conversion in a loop using a
446 small output descriptor. The ability to handle truncated descriptors also
447 allows users of the class to convert information received in chunks from an external source.
450 The class also provides a number of utility functions.
454 class CCnvCharacterSetConverter : public CBase
457 /** Indicates whether a character set is available or unavailable
458 for conversion. Used by the second overload of
459 PrepareToConvertToOrFromL(). */
462 /** The requested character set can be converted. */
464 /** The requested character set cannot be converted. */
468 /** Conversion error flags. At this stage there is only one error
469 flag- others may be added in the future. */
472 /** The input descriptor contains a single corrupt character. This
473 might occur when the input descriptor only contains some of the bytes
474 of a single multi-byte character. */
475 EErrorIllFormedInput=KErrCorrupt
478 /** Specifies the default endian-ness of the current character set.
479 Used by SetDefaultEndiannessOfForeignCharacters(). */
482 /** The character set is big-endian. */
484 /** The character set is little-endian. */
488 /** Downgrade for line and paragraph separators */
489 enum TDowngradeForExoticLineTerminatingCharacters
491 /** Paragraph/line separators should be downgraded (if necessary)
492 into carriage return and line feed pairs. */
493 EDowngradeExoticLineTerminatingCharactersToCarriageReturnLineFeed,
494 /** Paragraph/line separators should be downgraded (if necessary)
495 into a line feed only. */
496 EDowngradeExoticLineTerminatingCharactersToJustLineFeed
499 /** Output flag used to indicate whether or not a character in the source
500 descriptor is the first half of a surrogate pair, but is the last
501 character in the descriptor to convert.
503 Note: This enumeration can be used in the DoConvertToUnicode() and
504 DoConvertFromUnicode() functions. These are part of the
505 Character Conversion Plug-in Provider API and are for use by plug-in
506 conversion libraries only. */
510 /** Appends the converted text to the output descriptor.*/
511 EInputConversionFlagAppend =0x00010000,
512 /** By default, when the input descriptor passed to DoConvertFromUnicode()
513 or DoConvertToUnicode() consists of nothing but a truncated sequence,
514 the error-code EErrorIllFormedInput is returned.
515 If this behaviour is undesirable, the input flag
516 EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable should be set. */
518 EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable =0x00020000,
519 /** Stops converting when the first unconvertible character is reached. */
520 EInputConversionFlagStopAtFirstUnconvertibleCharacter =0x00040000,
521 /** Appends the default character set Escape sequence at end of converted text */
522 EInputConversionFlagMustEndInDefaultCharacterSet =0x00080000,
523 /*defect fix: INC053609; According to RFC1468 we can assume the line starts
524 in ASCII so there is no need to always insert an escape sequence*/
525 EInputConversionFlagAssumeStartInDefaultCharacterSet =0x00100000
529 /** Indicates whether or not the source descriptor ends in a truncated
530 sequence, e.g. the first half only of a surrogate pair. */
531 EOutputConversionFlagInputIsTruncated =0x01000000
534 /** Initial value for the state argument in a set of related calls to
535 ConvertToUnicode(). */
// Anonymous enum used as a named integral constant (value 0); it is the value
// to load into the TInt& aState argument before the first call.
536 enum {KStateDefault=0};
539 /** The lowest confidence value for a character set accepted by character set auto-detection. */
541 ELowestThreshold = 25
544 /** Stores information about a non-Unicode character set. The information
545 is used to locate the conversion information required by
546 ConvertFromUnicode() and ConvertToUnicode().
548 An array of these structs that contain all available character sets
549 can be generated by CreateArrayOfCharacterSetsAvailableLC() and
550 CreateArrayOfCharacterSetsAvailableL(), and is used by one of the
551 overloads of PrepareToConvertToOrFromL(). */
554 /** Gets the character set's UID.
556 @return The UID of the character set. */
// Plain accessor: returns the iIdentifier member unchanged.
557 inline TUint Identifier() const {return iIdentifier;}
559 /** Tests whether a filename given by the function SCharacterSet::Name()
560 is a real file name (i.e. conversion is provided by a plug in DLL), or
561 just the character set name (i.e. conversion is built into Symbian OS).
563 Note: If the function returns ETrue then the path and filename can be
564 parsed using TParse or TParsePtrC functions to obtain just the filename.
566 @return ETrue if the name is a real filename. EFalse if it is just the
567 character set name. */
// Returns the masked flag bits themselves (iFlags & EFlagNameIsFileName), not
// a normalised ETrue/EFalse. Because the mask is 0x00000001 the result is
// 0 or 1; callers should still test for zero/non-zero.
568 inline TBool NameIsFileName() const {return iFlags&EFlagNameIsFileName;}
570 /** Gets the full path and filename of the DLL which implements
571 conversion for the character set.
573 If the character set is one for which conversion is built into Symbian
574 OS rather than implemented by a plug in DLL, the function just returns
575 the name of the character set. The NameIsFileName() function can be
576 used to determine whether or not it is legal to create a TParsePtrC
577 object over the descriptor returned by Name().
581 The name returned cannot be treated as an Internet-standard name, it
582 is locale-independent and should be mapped to the locale-dependent name
583 by software at a higher level before being shown to the user. Conversion
584 from Internet-standard names of character sets to the UID identifiers
585 is provided by the member function
586 ConvertStandardNameOfCharacterSetToIdentifierL().
588 Typically, to find the user-displayable name (as opposed to the
589 internet-standard name) of a character set, you would do something like this:
593 const CCnvCharacterSetConverter::SCharacterSet& characterSet=...;
594 const TPtrC userDisplayable(characterSet.NameIsFileName()? TParsePtrC(characterSet.Name()).Name():
595 characterSet.Name());
598 @return Full path and filename of the character set converter plug in
599 DLL, or just the name of the character set. */
// Builds the returned TPtrC from *iName, i.e. the result refers to whatever
// descriptor iName points at rather than to a copy.
// NOTE(review): the declaration of iName is not visible in this view, so its
// type and lifetime should be confirmed before holding on to the result.
600 inline TPtrC Name() const {return *iName;}
604 EFlagNameIsFileName =0x00000001,
605 EFlagFileIsConversionPlugInLibrary =0x00000002
// Reports whether the EFlagFileIsConversionPlugInLibrary bit is set in iFlags.
// The mask is 0x00000002 and the masked value is returned as-is, so a set
// flag yields 2 rather than 1: test the result for non-zero and never
// compare it with ETrue.
608 inline TBool FileIsConversionPlugInLibrary() const {return iFlags&EFlagFileIsConversionPlugInLibrary;}
// Friend access for the converter and for the array class that holds these
// entries; presumably these are the only classes meant to fill in the
// members that the accessors read (iIdentifier, iFlags, iName) - confirm.
614 friend class CCnvCharacterSetConverter;
615 friend class CDeepDestructingArrayOfCharactersSets;
620 Holds an ascending array of the indices of the characters in the
621 source Unicode text which could not be converted by
622 CCnvCharacterSetConverter::ConvertFromUnicode() into the foreign character set.
627 class TArrayOfAscendingIndices
630 /** The return value of TArrayOfAscendingIndices::AppendIndex(). */
633 /** The append failed. */
635 /** The append succeeded. */
639 /** C++ constructor. The array is initialised to be of length zero. */
// iArrayOfIndices is a fixed-capacity TBuf16; constructing it with 0 sets its
// initial length to zero, so the array starts with no indices.
640 inline TArrayOfAscendingIndices() :iArrayOfIndices(0) {}
// Exported (not inline): the implementation is not visible in this view.
// @param aIndex The index value to add to the array.
// @return A TAppendResult saying whether the append succeeded or failed.
// NOTE(review): presumably fails once KMaximumNumberOfIndices entries are
// stored, and expects indices in ascending order - confirm in the source file.
642 IMPORT_C TAppendResult AppendIndex(TInt aIndex);
644 /** Deletes a single index from the array.
646 @param aIndexOfIndex The index of the index to delete. Must not be
647 negative and must not be greater than the length of the array, or a panic occurs. */
// Forwards to the descriptor's Delete() with a count of 1, so exactly one
// element is removed and later elements move down by one position.
// Range checking of aIndexOfIndex is left to Delete().
649 inline void Remove(TInt aIndexOfIndex) {iArrayOfIndices.Delete(aIndexOfIndex, 1);}
651 /** Deletes all indices from the array. */
// Empties the array by resetting the descriptor length to zero; the
// fixed-size buffer itself is a data member and is not released.
652 inline void RemoveAll() {iArrayOfIndices.SetLength(0);}
654 /** Returns the number of indices in the array.
656 @return The number of indices in the array. */
// The count is the current length of the backing descriptor.
657 inline TInt NumberOfIndices() const {return iArrayOfIndices.Length();}
659 /** Gets the value of the specified index.
661 @param aIndexOfIndex Index into the array.
662 @return The value of the index. */
// Reads one element of the backing TBuf16 and returns it as a TInt, so each
// stored index occupies a single 16-bit element.
// No range check is made here; an out-of-range aIndexOfIndex is handled by
// whatever the descriptor's own operator[] does.
663 inline TInt operator[](TInt aIndexOfIndex) const {return iArrayOfIndices[aIndexOfIndex];}
// Fixed capacity of the index array: at most 25 indices can be held.
665 enum {KMaximumNumberOfIndices=25};
// Backing store for the indices: a 16-bit descriptor used as an array, one
// element per index, with its length serving as the element count.
667 TBuf16<KMaximumNumberOfIndices> iArrayOfIndices;
// Static factory functions and the destructor. The default constructor is
// declared separately without IMPORT_C, so these factories are the exported
// way to create a converter.
// NOTE(review): by Symbian naming convention the L suffix means the function
// can leave and LC means the new object is left on the cleanup stack -
// confirm against the implementation.
670 IMPORT_C static CCnvCharacterSetConverter* NewL();
671 IMPORT_C static CCnvCharacterSetConverter* NewLC();
672 IMPORT_C virtual ~CCnvCharacterSetConverter();
// Build the array of SCharacterSet entries describing the available character
// sets. The result is the array expected by the array-taking overload of
// PrepareToConvertToOrFromL() and by the auto-detection functions.
// @param aFileServerSession File server session used for the search.
// @return Pointer to the new array. NOTE(review): the caller presumably takes
// ownership, with the LC variant leaving it on the cleanup stack - confirm.
673 IMPORT_C static CArrayFix<SCharacterSet>* CreateArrayOfCharacterSetsAvailableL(RFs& aFileServerSession);
674 IMPORT_C static CArrayFix<SCharacterSet>* CreateArrayOfCharacterSetsAvailableLC(RFs& aFileServerSession);
// Mappings between a character set's UID and its standard name (8-bit
// descriptor) or MIB enum (TInt), in both directions.
// These are non-static members and each takes a file server session.
// NOTE(review): the value returned when no mapping exists, and ownership of
// the HBufC8* result, are not visible in this view - confirm before relying
// on them.
675 IMPORT_C TUint ConvertStandardNameOfCharacterSetToIdentifierL(const TDesC8& aStandardNameOfCharacterSet, RFs& aFileServerSession);
676 IMPORT_C HBufC8* ConvertCharacterSetIdentifierToStandardNameL(TUint aCharacterSetIdentifier, RFs& aFileServerSession);
677 IMPORT_C TUint ConvertMibEnumOfCharacterSetToIdentifierL(TInt aMibEnumOfCharacterSet, RFs& aFileServerSession);
678 IMPORT_C TInt ConvertCharacterSetIdentifierToMibEnumL(TUint aCharacterSetIdentifier, RFs& aFileServerSession);
// Select the non-Unicode character set for subsequent conversions.
// Array-taking overload: looks the character set up in a previously created
// array of available character sets and returns nothing; per the class
// description this is the overload that panics if the set is not available.
679 IMPORT_C void PrepareToConvertToOrFromL(TUint aCharacterSetIdentifier, const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, RFs& aFileServerSession);
// Identifier-only overload: reports availability through its TAvailability
// return value instead of panicking; per the class description it searches
// the file system on every call.
680 IMPORT_C TAvailability PrepareToConvertToOrFromL(TUint aCharacterSetIdentifier, RFs& aFileServerSession);
681 // the following attribute-setting functions should be called (if at all) after calling PrepareToConvertToOrFromL and before calling ConvertFromUnicode and/or ConvertToUnicode
// Optional attribute setters. Each takes the new value for one conversion
// attribute; the matching data members are iDefaultEndiannessOfForeignCharacters,
// iDowngradeForExoticLineTerminatingCharacters and
// iReplacementForUnconvertibleUnicodeCharacters (a TBuf8 of capacity
// KMaximumLengthOfReplacementForUnconvertibleUnicodeCharacters).
682 IMPORT_C void SetDefaultEndiannessOfForeignCharacters(TEndianness aEndianness);
683 IMPORT_C void SetDowngradeForExoticLineTerminatingCharacters(TDowngradeForExoticLineTerminatingCharacters aDowngradeForExoticLineTerminatingCharacters); // by default this attribute is set to EDowngradeExoticLineTerminatingCharactersToCarriageReturnLineFeed
684 IMPORT_C void SetReplacementForUnconvertibleUnicodeCharactersL(const TDesC8& aReplacementForUnconvertibleUnicodeCharacters); // must be a single character preceded by its escape sequence (if any), and must be little-endian if the endianness of the character-set is unspecified, otherwise in the same endianness as the character-set
686 // the conversion functions return either one of the TError values above, or the number of unconverted elements left at the end of the input descriptor
// Unicode -> foreign conversions. aUnicode is the read-only input and aForeign
// the modifiable output descriptor. The overloads differ only in how much
// information about unconvertible characters they report back: none, a count,
// a count plus the index of the first one, or an array of their indices.
687 IMPORT_C TInt ConvertFromUnicode(TDes8& aForeign, const TDesC16& aUnicode) const;
688 IMPORT_C TInt ConvertFromUnicode(TDes8& aForeign, const TDesC16& aUnicode, TInt& aNumberOfUnconvertibleCharacters) const;
689 IMPORT_C TInt ConvertFromUnicode(TDes8& aForeign, const TDesC16& aUnicode, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstUnconvertibleCharacter) const;
690 IMPORT_C TInt ConvertFromUnicode(TDes8& aForeign, const TDesC16& aUnicode, TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters) const;
// Foreign -> Unicode conversions. aForeign is the read-only input and aUnicode
// the modifiable output descriptor. aState is passed by non-const reference
// and should hold KStateDefault before the first call of a related series.
691 IMPORT_C TInt ConvertToUnicode(TDes16& aUnicode, const TDesC8& aForeign, TInt& aState) const;
692 IMPORT_C TInt ConvertToUnicode(TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters) const;
693 IMPORT_C TInt ConvertToUnicode(TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) const;
// Character set detection on a sample of foreign text. Each operation exists
// as a static function and as a non-static member with a shortened name and
// the same parameter list; the behavioural difference between the two is not
// visible from the declarations.
// AutoDetect*: both aConfidenceLevel and aCharacterSetIdentifier are outputs
// (non-const references).
694 IMPORT_C static void AutoDetectCharacterSetL(TInt& aConfidenceLevel, TUint& aCharacterSetIdentifier, const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, const TDesC8& aSample);
695 IMPORT_C void AutoDetectCharSetL(TInt& aConfidenceLevel, TUint& aCharacterSetIdentifier, const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, const TDesC8& aSample);
// ConvertibleTo*: the character set identifier is an input here (passed by
// value); only aConfidenceLevel is an output.
696 IMPORT_C static void ConvertibleToCharacterSetL(TInt& aConfidenceLevel, const TUint aCharacterSetIdentifier,const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, const TDesC8& aSample);
697 IMPORT_C void ConvertibleToCharSetL(TInt& aConfidenceLevel, const TUint aCharacterSetIdentifier,const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, const TDesC8& aSample);
// NOTE(review): presumably sets the capacity of the converter cache held in
// iCharsetCnvCache; the unit and valid range of aSize are not visible here.
698 IMPORT_C void SetMaxCacheSize(TInt aSize);
699 // the following functions are only to be called by conversion plug-in libraries
// Static conversion entry points that take the conversion data, default
// endianness and (for Unicode -> foreign) replacement text explicitly instead
// of reading them from a converter object.
// The longer overload of each pair adds aInputConversionFlags (passed by
// value) and aOutputConversionFlags (written through a reference); these
// carry the EInputConversionFlag... and EOutputConversionFlag... values
// declared in this class.
700 IMPORT_C static TInt DoConvertFromUnicode(const SCnvConversionData& aConversionData, TEndianness aDefaultEndiannessOfForeignCharacters, const TDesC8& aReplacementForUnconvertibleUnicodeCharacters, TDes8& aForeign, const TDesC16& aUnicode, TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters);
701 IMPORT_C static TInt DoConvertFromUnicode(const SCnvConversionData& aConversionData, TEndianness aDefaultEndiannessOfForeignCharacters, const TDesC8& aReplacementForUnconvertibleUnicodeCharacters, TDes8& aForeign, const TDesC16& aUnicode, TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters, TUint& aOutputConversionFlags, TUint aInputConversionFlags);
702 IMPORT_C static TInt DoConvertToUnicode(const SCnvConversionData& aConversionData, TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter);
703 IMPORT_C static TInt DoConvertToUnicode(const SCnvConversionData& aConversionData, TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint& aOutputConversionFlags, TUint aInputConversionFlags);
// Returns a reference to conversion data for ASCII; the object it refers to
// is not defined in this view.
704 IMPORT_C static const SCnvConversionData& AsciiConversionData();
705 inline TDowngradeForExoticLineTerminatingCharacters GetDowngradeForExoticLineTerminatingCharacters ()
707 return iDowngradeForExoticLineTerminatingCharacters ;
713 EStoredFlagOwnsConversionData =0x00000001,
714 EStoredFlagConversionPlugInLibraryIsLoaded =0x00000002
716 enum TCharacterSetSearch
718 EStopCharacterSetSearch,
719 EContinueCharacterSetSearch
721 enum TConversionPlugInFunctionOrdinals
723 EReplacementForUnconvertibleUnicodeCharacters=1,
724 EConvertFromUnicode=2,
726 EIsInThisCharacterSet=4
// Non-exported declarations (no IMPORT_C); implementations are not visible in
// this view, so the notes below are limited to what the signatures show.
// Default constructor: not exported, unlike the NewL()/NewLC() factories.
730 CCnvCharacterSetConverter();
// Builds the array of available character sets; the extra TUint names a
// single character set of interest. NOTE(review): the value meaning
// "all character sets" is not visible here.
732 static CArrayFix<SCharacterSet>* DoCreateArrayOfCharacterSetsAvailableLC(RFs& aFileServerSession, TUint aIdentifierOfOnlyCharacterSetOfInterest);
// Conditionally appends one hard-coded (built-in) character set to the array
// and returns whether the search should stop or continue.
733 static TCharacterSetSearch AppendHardCodedCharacterSetIfRequiredL(CArrayFix<SCharacterSet>& aArrayOfCharacterSets, TUint aIdentifierOfOnlyCharacterSetOfInterest, TUint aIdentifierOfHardCodedCharacterSet, const TDesC& aNameOfHardCodedCharacterSet);
// NOTE(review): presumably populate iStandardNamesAndMibEnums; the second
// variant is restricted to ROM according to its name - confirm.
734 void ScanForStandardNamesAndMibEnumsL(RFs& aFileServerSession);
735 void ScanForStandardNamesAndMibEnumsROMOnlyL(RFs& aFileServerSession);
// Takes the array by pointer where the public overloads take it by reference
// or not at all. NOTE(review): presumably the shared implementation behind
// both public PrepareToConvertToOrFromL() overloads, with a null array
// meaning "search the file system" - confirm.
736 TAvailability DoPrepareToConvertToOrFromL(TUint aCharacterSetIdentifier, const CArrayFix<SCharacterSet>* aArrayOfCharacterSetsAvailable, RFs& aFileServerSession);
// Two overloads: a typed one and a TAny* one. NOTE(review): the TAny* form
// has the shape of a cleanup-stack callback - confirm.
737 static void DeleteConversionData(const SCnvConversionData* aConversionData);
738 static void DeleteConversionData(TAny* aConversionData);
// Resolves the endianness to use from the conversion data and the supplied
// default; the precedence between the two is not visible here.
739 static TEndianness EndiannessOfForeignCharacters(const SCnvConversionData& aConversionData, TEndianness aDefaultEndiannessOfForeignCharacters);
// Data members of the converter.
743 TUint iCharacterSetIdentifierOfLoadedConversionData; // 0 or a UID of the loaded plugin
// Conversion data currently in use. NOTE(review): ownership is presumably
// tracked by the EStoredFlagOwnsConversionData flag - confirm.
744 const SCnvConversionData* iConversionData;
// Current values of the attributes changed by the Set...() functions.
745 TEndianness iDefaultEndiannessOfForeignCharacters;
746 TDowngradeForExoticLineTerminatingCharacters iDowngradeForExoticLineTerminatingCharacters;
// Fixed-capacity 8-bit buffer; at most
// KMaximumLengthOfReplacementForUnconvertibleUnicodeCharacters bytes fit.
747 TBuf8<KMaximumLengthOfReplacementForUnconvertibleUnicodeCharacters> iReplacementForUnconvertibleUnicodeCharacters;
// Held by pointer; the types are only forward-declared in this header.
748 CStandardNamesAndMibEnums* iStandardNamesAndMibEnums;
749 TBool iTlsDataConstructed;
750 CCharsetCnvCache* iCharsetCnvCache;
751 TBool iIsSystemStandardNamesAndMibEnumsScanned;