1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
2 // All rights reserved.
3 // This component and the accompanying materials are made available
4 // under the terms of the License "Symbian Foundation License v1.0" to Symbian Foundation members and "Symbian Foundation End User License Agreement v1.0" to non-members
5 // which accompanies this distribution, and is available
6 // at the URL "http://www.symbianfoundation.org/legal/licencesv10.html".
8 // Initial Contributors:
9 // Nokia Corporation - initial contribution.
16 #if !defined(__CHARCONV_H__)
17 #define __CHARCONV_H__
19 #if !defined(__E32STD_H__)
23 #if !defined(__E32BASE_H__)
28 The maximum length in bytes of the replacement text for unconvertible Unicode
29 characters (=50) (see CCnvCharacterSetConverter::SetReplacementForUnconvertibleUnicodeCharactersL()).
33 const TInt KMaximumLengthOfReplacementForUnconvertibleUnicodeCharacters=50;
// UIDs identifying individual character sets. Values of this kind are what the
// aCharacterSetIdentifier parameter of CCnvCharacterSetConverter::PrepareToConvertToOrFromL()
// accepts and what SCharacterSet::Identifier() reports.
40 const TUint KCharacterSetIdentifierUtf7=0x1000582c;
46 const TUint KCharacterSetIdentifierUtf8=0x1000582d;
52 const TUint KCharacterSetIdentifierImapUtf7=0x1000582e;
58 const TUint KCharacterSetIdentifierJavaConformantUtf8=0x1000582f;
64 const TUint KCharacterSetIdentifierCodePage1252=0x100012b6;
70 const TUint KCharacterSetIdentifierIso88591=0x10003b10;
76 const TUint KCharacterSetIdentifierIso88592=0x1000507e;
82 const TUint KCharacterSetIdentifierIso88593=0x10008a28;
88 const TUint KCharacterSetIdentifierIso88594=0x1000507f;
94 const TUint KCharacterSetIdentifierIso88595=0x10005080;
100 const TUint KCharacterSetIdentifierIso88596=0x10008a29;
106 const TUint KCharacterSetIdentifierIso88597=0x10005081;
112 const TUint KCharacterSetIdentifierIso88598=0x10008a2a;
118 const TUint KCharacterSetIdentifierIso88599=0x10005082;
124 const TUint KCharacterSetIdentifierIso885910=0x10008a2b;
130 const TUint KCharacterSetIdentifierIso885913=0x10008a2c;
136 const TUint KCharacterSetIdentifierIso885914=0x10008a2d;
142 const TUint KCharacterSetIdentifierIso885915=0x10008a2e;
148 const TUint KCharacterSetIdentifierAscii=0x10004cc6;
154 const TUint KCharacterSetIdentifierSms7Bit=0x100053ab;
160 const TUint KCharacterSetIdentifierGb2312=0x10000fbe;
166 const TUint KCharacterSetIdentifierHz=0x10006065;
172 const TUint KCharacterSetIdentifierGb12345=0x1000401a;
178 const TUint KCharacterSetIdentifierGbk=0x10003ecb;
184 const TUint KCharacterSetIdentifierBig5=0x10000fbf;
190 const TUint KCharacterSetIdentifierShiftJis=0x10000fbd;
196 const TUint KCharacterSetIdentifierIso2022Jp=0x100066a0;
202 const TUint KCharacterSetIdentifierIso2022Jp1=0x100066a3;
208 const TUint KCharacterSetIdentifierJis=0x10006066;
214 const TUint KCharacterSetIdentifierEucJpPacked=0x10006067;
221 const TUint KCharacterSetIdentifierJ5=0x1020D408;
227 const TUint KCharacterSetIdentifierCP850=0x102825AD;
229 const TUint KCharacterSetIdentifierUnicodeLittle=0x101f3fae; //Little Endian Unicode
230 const TUint KCharacterSetIdentifierUnicodeBig=0x101f4052; // Big Endian Unicode
231 const TUint KCharacterSetIdentifierUcs2=0x101ff492;
234 Extended SMS 7-bit (not supported before v9.5)
238 const TUint KCharacterSetIdentifierExtendedSms7Bit=0x102863FD;
// Identifiers for the national-language SMS 7-bit character sets (Turkish, Portuguese, Spanish).
// NOTE(review): the Single/Locking name suffixes presumably refer to the single-shift and
// locking-shift tables of those languages — confirm.
245 const TUint KCharacterSetIdentifierTurkishSingleSms7Bit=0x102863FE;
246 const TUint KCharacterSetIdentifierTurkishLockingSms7Bit=0x102863FF;
247 const TUint KCharacterSetIdentifierTurkishLockingAndSingleSms7Bit=0x10286400;
254 const TUint KCharacterSetIdentifierPortugueseSingleSms7Bit=0x10286407;
255 const TUint KCharacterSetIdentifierPortugueseLockingSms7Bit=0x10286408;
256 const TUint KCharacterSetIdentifierPortugueseLockingAndSingleSms7Bit=0x10286409;
263 const TUint KCharacterSetIdentifierSpanishSingleSms7Bit=0x1028640A;
265 // Note: character sets other than those listed above may be available at run-time, and none of those listed above is guaranteed to be available at run-time.
// Forward declarations: the declarations below use these types through pointers,
// references or friend declarations, so their full definitions are not needed here.
267 struct SCnvConversionData;
268 class CDeepDestructingArrayOfCharactersSets;
270 class CStandardNamesAndMibEnums;
272 class CCharsetCnvCache;
274 Converts text between Unicode and other character sets.
276 The first stage of the conversion is to specify the non-Unicode character
277 set being converted to or from. This is done by calling one of the overloads
278 of PrepareToConvertToOrFromL().
280 The second stage is to convert the text, using one of the overloads of
281 ConvertFromUnicode() or ConvertToUnicode().
283 Where possible the first documented overload of PrepareToConvertToOrFromL()
284 should be used because the second overload panics if the specified character
285 set is not available: the first overload simply returns whether the character
286 set is available or not available. However if the conversions are to be
287 performed often, or if the user must select the character set for the
288 conversion from a list, the second overload may be more appropriate.
290 The first overload is less efficient than the second, because it searches
291 through the file system for the selected character set every time it is invoked.
292 The second overload searches through an array of all available character sets.
293 In this method, the file system need only be searched once - when
294 CreateArrayOfCharacterSetsAvailableLC() or
295 CreateArrayOfCharacterSetsAvailableL() is used to create the array.
297 The conversion functions allow users of this class to perform partial
298 conversions on an input descriptor, handling the situation where the input
299 descriptor is truncated mid way through a multi-byte character. This means
300 that you do not have to guess how big to make the output descriptor for a
301 given input descriptor, you can simply do the conversion in a loop using a
302 small output descriptor. The ability to handle truncated descriptors also
303 allows users of the class to convert information received in chunks from an
306 The class also provides a number of utility functions.
310 class CCnvCharacterSetConverter : public CBase
313 /** Indicates whether a character set is available or unavailable
314 for conversion. Used by the second overload of
315 PrepareToConvertToOrFromL(). */
318 /** The requested character set can be converted. */
320 /** The requested character set cannot be converted. */
324 /** Conversion error flags. At this stage there is only one error
325 flag; others may be added in the future. */
328 /** The input descriptor contains a single corrupt character. This
329 might occur when the input descriptor only contains some of the bytes
330 of a single multi-byte character. */
331 EErrorIllFormedInput=KErrCorrupt
334 /** Specifies the default endian-ness of the current character set.
335 Used by SetDefaultEndiannessOfForeignCharacters(). */
338 /** The character set is big-endian. */
340 /** The character set is little-endian. */
344 /** Downgrade for line and paragraph separators */
345 enum TDowngradeForExoticLineTerminatingCharacters
347 /** Paragraph/line separators should be downgraded (if necessary)
348 into carriage return and line feed pairs. */
349 EDowngradeExoticLineTerminatingCharactersToCarriageReturnLineFeed,
350 /** Paragraph/line separators should be downgraded (if necessary)
351 into a line feed only. */
352 EDowngradeExoticLineTerminatingCharactersToJustLineFeed
355 /** Output flag used to indicate whether or not a character in the source
356 descriptor is the first half of a surrogate pair, but is the last
357 character in the descriptor to convert.
359 Note: This enumeration can be used in the DoConvertToUnicode() and
360 DoConvertFromUnicode() functions. These are part of the
361 Character Conversion Plug-in Provider API and are for use by plug-in
362 conversion libraries only.
366 /** Appends the converted text to the output descriptor.*/
367 EInputConversionFlagAppend =0x00010000,
368 /** By default, when the input descriptor passed to DoConvertFromUnicode()
369 or DoConvertToUnicode() consists of nothing but a truncated sequence,
370 the error-code EErrorIllFormedInput is returned.
371 If this behaviour is undesirable, the input flag
372 EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable
374 EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable =0x00020000,
375 /** Stops converting when the first unconvertible character is reached. */
376 EInputConversionFlagStopAtFirstUnconvertibleCharacter =0x00040000,
377 /** Appends the default character set Escape sequence at end of converted text */
378 EInputConversionFlagMustEndInDefaultCharacterSet =0x00080000,
379 /*defect fix: INC053609; According to RFC1468 we can assume the line starts
380 in ASCII so there is no need to always insert an escape sequence*/
381 EInputConversionFlagAssumeStartInDefaultCharacterSet =0x00100000
385 /** Indicates whether or not the source descriptor ends in a truncated
386 sequence, e.g. the first half only of a surrogate pair. */
387 EOutputConversionFlagInputIsTruncated =0x01000000
390 /** Initial value for the state argument in a set of related calls to
391 ConvertToUnicode(). */
392 enum {KStateDefault=0};
395 /** The lowest confidence value for a character set accepted by
397 ELowestThreshold = 25
400 /** Stores information about a non-Unicode character set. The information
401 is used to locate the conversion information required by
402 ConvertFromUnicode() and ConvertToUnicode().
404 An array of these structs that contain all available character sets
405 can be generated by CreateArrayOfCharacterSetsAvailableLC() and
406 CreateArrayOfCharacterSetsAvailableL(), and is used by one of the
407 overloads of PrepareToConvertToOrFromL(). */
410 /** Gets the character set's UID.
412 @return The UID of the character set. */
413 inline TUint Identifier() const {return iIdentifier;}
415 /** Tests whether a filename given by the function SCharacterSet::Name()
416 is a real file name (i.e. conversion is provided by a plug in DLL), or
417 just the character set name (i.e. conversion is built into Symbian OS).
419 Note: If the function returns ETrue then the path and filename can be
420 parsed using TParse or TParsePtrC functions to obtain just the filename.
422 @return ETrue if the name is a real filename. EFalse if it is just the
423 character set name. */
424 inline TBool NameIsFileName() const {return iFlags&EFlagNameIsFileName;} // tests the EFlagNameIsFileName bit of iFlags; the result is the masked value itself (0 or 0x00000001)
426 /** Gets the full path and filename of the DLL which implements
427 conversion for the character set.
429 If the character set is one for which conversion is built into Symbian
430 OS rather than implemented by a plug in DLL, the function just returns
431 the name of the character set. The NameIsFileName() function can be
432 used to determine whether or not it is legal to create a TParsePtrC
433 object over the descriptor returned by Name().
437 The name returned cannot be treated as an Internet-standard name, it
438 is locale-independent and should be mapped to the locale-dependent name
439 by software at a higher level before being shown to the user. Conversion
440 from Internet-standard names of character sets to the UID identifiers
441 is provided by the member function
442 ConvertStandardNameOfCharacterSetToIdentifierL().
444 Typically, to find the user-displayable name (as opposed to the
445 internet-standard name) of a character set, you would do something
449 const CCnvCharacterSetConverter::SCharacterSet& characterSet=...;
450 const TPtrC userDisplayable(characterSet.NameIsFileName()? TParsePtrC(characterSet.Name()).Name():
451 characterSet.Name());
454 @return Full path and filename of the character set converter plug in
455 DLL, or just the name of the character set. */
456 inline TPtrC Name() const {return *iName;}
460 EFlagNameIsFileName =0x00000001,
461 EFlagFileIsConversionPlugInLibrary =0x00000002
// Tests the EFlagFileIsConversionPlugInLibrary bit of iFlags.
// The masked value (0x00000002 when the bit is set) is compared against zero so that
// the function yields a canonical boolean rather than the raw bit, which would not
// compare equal to ETrue.
464 inline TBool FileIsConversionPlugInLibrary() const {return (iFlags&EFlagFileIsConversionPlugInLibrary)!=0;}
470 friend class CCnvCharacterSetConverter;
471 friend class CDeepDestructingArrayOfCharactersSets;
476 Holds an ascending array of the indices of the characters in the
477 source Unicode text which could not be converted by
478 CCnvCharacterSetConverter::ConvertFromUnicode() into the foreign
483 class TArrayOfAscendingIndices
486 /** The return value of CCnvCharacterSetConverter::TArrayOfAscendingIndices::AppendIndex(). */
489 /** The append failed. */
491 /** The append succeeded. */
495 /** C++ constructor. The array is initialised to be of length zero. */
496 inline TArrayOfAscendingIndices() :iArrayOfIndices(0) {}
498 IMPORT_C TAppendResult AppendIndex(TInt aIndex);
500 /** Deletes a single index from the array.
502 @param aIndexOfIndex The index of the index to delete. Must not be
503 negative and must not be greater than the length of the array, or a
505 inline void Remove(TInt aIndexOfIndex) {iArrayOfIndices.Delete(aIndexOfIndex, 1);}
507 /** Deletes all indices from the array. */
508 inline void RemoveAll() {iArrayOfIndices.SetLength(0);}
510 /** Returns the number of indices in the array.
512 @return The number of indices in the array. */
513 inline TInt NumberOfIndices() const {return iArrayOfIndices.Length();}
515 /** Gets the value of the specified index.
517 @param aIndexOfIndex Index into the array.
518 @return The value of the index. */
519 inline TInt operator[](TInt aIndexOfIndex) const {return iArrayOfIndices[aIndexOfIndex];}
521 enum {KMaximumNumberOfIndices=25}; // capacity of iArrayOfIndices: at most 25 indices can be held
523 TBuf16<KMaximumNumberOfIndices> iArrayOfIndices; // fixed-capacity storage for the indices; NOTE(review): TBuf16 elements are presumably 16 bits wide, which would limit each stored index to 0..65535 — confirm
526 IMPORT_C static CCnvCharacterSetConverter* NewL();
527 IMPORT_C static CCnvCharacterSetConverter* NewLC();
528 IMPORT_C virtual ~CCnvCharacterSetConverter();
529 IMPORT_C static CArrayFix<SCharacterSet>* CreateArrayOfCharacterSetsAvailableL(RFs& aFileServerSession);
530 IMPORT_C static CArrayFix<SCharacterSet>* CreateArrayOfCharacterSetsAvailableLC(RFs& aFileServerSession);
531 IMPORT_C TUint ConvertStandardNameOfCharacterSetToIdentifierL(const TDesC8& aStandardNameOfCharacterSet, RFs& aFileServerSession);
532 IMPORT_C HBufC8* ConvertCharacterSetIdentifierToStandardNameL(TUint aCharacterSetIdentifier, RFs& aFileServerSession);
533 IMPORT_C TUint ConvertMibEnumOfCharacterSetToIdentifierL(TInt aMibEnumOfCharacterSet, RFs& aFileServerSession);
534 IMPORT_C TInt ConvertCharacterSetIdentifierToMibEnumL(TUint aCharacterSetIdentifier, RFs& aFileServerSession);
// Overload taking the array of available character sets (as created by
// CreateArrayOfCharacterSetsAvailableL()/LC()). NOTE(review): going by the class description,
// this is the overload that searches the array instead of the file system and panics if the
// requested character set is not available — confirm against the implementation.
535 IMPORT_C void PrepareToConvertToOrFromL(TUint aCharacterSetIdentifier, const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, RFs& aFileServerSession);
// Overload without the array; it returns a TAvailability value stating whether the character
// set is available. NOTE(review): per the class description this is the overload that searches
// the file system each time it is invoked — confirm.
536 IMPORT_C TAvailability PrepareToConvertToOrFromL(TUint aCharacterSetIdentifier, RFs& aFileServerSession);
537 // the following attribute-setting functions should be called (if at all) after calling PrepareToConvertToOrFromL and before calling ConvertFromUnicode and/or ConvertToUnicode
538 IMPORT_C void SetDefaultEndiannessOfForeignCharacters(TEndianness aEndianness);
539 IMPORT_C void SetDowngradeForExoticLineTerminatingCharacters(TDowngradeForExoticLineTerminatingCharacters aDowngradeForExoticLineTerminatingCharacters); // by default this attribute is set to EDowngradeExoticLineTerminatingCharactersToCarriageReturnLineFeed
540 IMPORT_C void SetReplacementForUnconvertibleUnicodeCharactersL(const TDesC8& aReplacementForUnconvertibleUnicodeCharacters); // must be a single character preceded by its escape sequence (if any), and must be little-endian if the endianness of the character-set is unspecified, otherwise in the same endianness as the character-set
542 // the conversion functions return either one of the TError values above, or the number of unconverted elements left at the end of the input descriptor
543 IMPORT_C TInt ConvertFromUnicode(TDes8& aForeign, const TDesC16& aUnicode) const;
544 IMPORT_C TInt ConvertFromUnicode(TDes8& aForeign, const TDesC16& aUnicode, TInt& aNumberOfUnconvertibleCharacters) const;
545 IMPORT_C TInt ConvertFromUnicode(TDes8& aForeign, const TDesC16& aUnicode, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstUnconvertibleCharacter) const;
546 IMPORT_C TInt ConvertFromUnicode(TDes8& aForeign, const TDesC16& aUnicode, TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters) const;
547 IMPORT_C TInt ConvertToUnicode(TDes16& aUnicode, const TDesC8& aForeign, TInt& aState) const;
548 IMPORT_C TInt ConvertToUnicode(TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters) const;
549 IMPORT_C TInt ConvertToUnicode(TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) const;
550 IMPORT_C static void AutoDetectCharacterSetL(TInt& aConfidenceLevel, TUint& aCharacterSetIdentifier, const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, const TDesC8& aSample);
551 IMPORT_C void AutoDetectCharSetL(TInt& aConfidenceLevel, TUint& aCharacterSetIdentifier, const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, const TDesC8& aSample);
552 IMPORT_C static void ConvertibleToCharacterSetL(TInt& aConfidenceLevel, const TUint aCharacterSetIdentifier,const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, const TDesC8& aSample);
553 IMPORT_C void ConvertibleToCharSetL(TInt& aConfidenceLevel, const TUint aCharacterSetIdentifier,const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, const TDesC8& aSample);
554 IMPORT_C void SetMaxCacheSize(TInt aSize);
555 // the following functions are only to be called by conversion plug-in libraries
556 IMPORT_C static TInt DoConvertFromUnicode(const SCnvConversionData& aConversionData, TEndianness aDefaultEndiannessOfForeignCharacters, const TDesC8& aReplacementForUnconvertibleUnicodeCharacters, TDes8& aForeign, const TDesC16& aUnicode, TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters);
557 IMPORT_C static TInt DoConvertFromUnicode(const SCnvConversionData& aConversionData, TEndianness aDefaultEndiannessOfForeignCharacters, const TDesC8& aReplacementForUnconvertibleUnicodeCharacters, TDes8& aForeign, const TDesC16& aUnicode, TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters, TUint& aOutputConversionFlags, TUint aInputConversionFlags);
558 IMPORT_C static TInt DoConvertToUnicode(const SCnvConversionData& aConversionData, TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter);
559 IMPORT_C static TInt DoConvertToUnicode(const SCnvConversionData& aConversionData, TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint& aOutputConversionFlags, TUint aInputConversionFlags);
560 IMPORT_C static const SCnvConversionData& AsciiConversionData();
// Returns the current value of iDowngradeForExoticLineTerminatingCharacters.
// NOTE(review): this accessor only reads a member but is not const-qualified, so it cannot
// be called on a const converter — consider adding const.
561 inline TDowngradeForExoticLineTerminatingCharacters GetDowngradeForExoticLineTerminatingCharacters ()
563 return iDowngradeForExoticLineTerminatingCharacters ;
569 EStoredFlagOwnsConversionData =0x00000001,
570 EStoredFlagConversionPlugInLibraryIsLoaded =0x00000002
572 enum TCharacterSetSearch
574 EStopCharacterSetSearch,
575 EContinueCharacterSetSearch
577 enum TConversionPlugInFunctionOrdinals
579 EReplacementForUnconvertibleUnicodeCharacters=1,
580 EConvertFromUnicode=2,
582 EIsInThisCharacterSet=4
586 CCnvCharacterSetConverter();
588 static CArrayFix<SCharacterSet>* DoCreateArrayOfCharacterSetsAvailableLC(RFs& aFileServerSession, TUint aIdentifierOfOnlyCharacterSetOfInterest);
589 static TCharacterSetSearch AppendHardCodedCharacterSetIfRequiredL(CArrayFix<SCharacterSet>& aArrayOfCharacterSets, TUint aIdentifierOfOnlyCharacterSetOfInterest, TUint aIdentifierOfHardCodedCharacterSet, const TDesC& aNameOfHardCodedCharacterSet);
590 void ScanForStandardNamesAndMibEnumsL(RFs& aFileServerSession);
591 void ScanForStandardNamesAndMibEnumsROMOnlyL(RFs& aFileServerSession);
592 TAvailability DoPrepareToConvertToOrFromL(TUint aCharacterSetIdentifier, const CArrayFix<SCharacterSet>* aArrayOfCharacterSetsAvailable, RFs& aFileServerSession);
593 static void DeleteConversionData(const SCnvConversionData* aConversionData);
594 static void DeleteConversionData(TAny* aConversionData);
595 static TEndianness EndiannessOfForeignCharacters(const SCnvConversionData& aConversionData, TEndianness aDefaultEndiannessOfForeignCharacters);
599 TUint iCharacterSetIdentifierOfLoadedConversionData; // 0 or a UID of the loaded plugin
600 const SCnvConversionData* iConversionData; // conversion data currently held; NOTE(review): ownership is presumably tracked via EStoredFlagOwnsConversionData — confirm
601 TEndianness iDefaultEndiannessOfForeignCharacters; // NOTE(review): presumably the value given to SetDefaultEndiannessOfForeignCharacters() — confirm
602 TDowngradeForExoticLineTerminatingCharacters iDowngradeForExoticLineTerminatingCharacters; // value returned by GetDowngradeForExoticLineTerminatingCharacters()
603 TBuf8<KMaximumLengthOfReplacementForUnconvertibleUnicodeCharacters> iReplacementForUnconvertibleUnicodeCharacters; // replacement text buffer; capacity is KMaximumLengthOfReplacementForUnconvertibleUnicodeCharacters (=50) bytes
604 CStandardNamesAndMibEnums* iStandardNamesAndMibEnums;
605 TBool iFullyConstructed;
606 CCharsetCnvCache* iCharsetCnvCache;
607 TBool iIsSystemStandardNamesAndMibEnumsScanned;