os/textandloc/charconvfw/charconv_fw/src/charconv/utf.cpp
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
sl@0
     1
/*
sl@0
     2
* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     3
* All rights reserved.
sl@0
     4
* This component and the accompanying materials are made available
sl@0
     5
* under the terms of "Eclipse Public License v1.0"
sl@0
     6
* which accompanies this distribution, and is available
sl@0
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     8
*
sl@0
     9
* Initial Contributors:
sl@0
    10
* Nokia Corporation - initial contribution.
sl@0
    11
*
sl@0
    12
* Contributors:
sl@0
    13
*
sl@0
    14
* Description: 
sl@0
    15
*
sl@0
    16
*/
sl@0
    17
sl@0
    18
sl@0
    19
#include <e32std.h>
sl@0
    20
#include <e32base.h>
sl@0
    21
#include <utf.h>
sl@0
    22
sl@0
    23
// Value returned by Base64Decoding() for a byte that is not a member of the
// (modified) base-64 alphabet.
const TUint KNotInBase64Alphabet=KMaxTUint;

// Reason codes passed to Panic(). The numeric value of each code is fixed by
// its position in this list, so inserting or reordering entries changes the
// codes that are reported.
enum TPanic
	{
	// Base64Encoding() was handed a value that does not fit in 6 bits
	EPanicBad6BitNumber=1,

	// UTF-7 buffer pointer consistency checks
	EPanicBadUtf7Pointers1=2,
	EPanicBadUtf7Pointers2,
	EPanicBadUtf7Pointers3,
	EPanicBadUtf7Pointers4,
	EPanicBadUtf7Pointers5,
	EPanicBadUtf7Pointers6,
	EPanicBadUtf7Pointers7,
	EPanicBadUtf7Pointers8,
	EPanicBadUtf7Pointers9,
	EPanicBadUtf7Pointers10,
	EPanicBadUtf7Pointers11,

	// no escape character was found when searching backwards through a base-64 block
	EPanicNotInBase64Block=13,

	// Unicode buffer pointer consistency checks
	EPanicBadUnicodePointers1=14,
	EPanicBadUnicodePointers2,
	EPanicBadUnicodePointers3,
	EPanicBadUnicodePointers4,
	EPanicBadUnicodePointers5,
	EPanicBadUnicodePointers6,
	EPanicBadUnicodePointers7,
	EPanicBadUnicodePointers8,
	EPanicBadUnicodePointers9,
	EPanicBadUnicodePointers10,

	// bit-buffer consistency checks
	EPanicBadBitBufferState1=24,
	EPanicBadBitBufferState2,
	EPanicBadBitBufferState3,
	EPanicBadBitBufferState4,
	EPanicBadBitBufferState5,
	EPanicBadBitBufferState6,
	EPanicBadBitBufferState7,
	EPanicBadBitBufferState8,
	EPanicBadBitBufferState9,
	EPanicBadBitBufferState10,
	EPanicBadBitBufferState11,
	EPanicBadBitBufferState12,
	EPanicBadBitBufferState13,
	EPanicBadBitBufferState14,
	EPanicBadBitBufferState15,
	EPanicBadBitBufferState16,
	EPanicBadBitBufferState17,

	// checks made while backing out of an unfinished base-64 block
	EPanicUnexpectedNumberOfLoopIterations=41,
	EPanicInitialEscapeCharacterButNoBase64,
	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,

	// UTF-8 buffer pointer consistency checks
	EPanicBadUtf8Pointers1=44,
	EPanicBadUtf8Pointers2,
	EPanicBadUtf8Pointers3,
	EPanicBadUtf8Pointers4,
	EPanicBadUtf8Pointers5,
	EPanicBadUtf8Pointers6,
	EPanicBadUtf8Pointers7,

	// synchronisation checks
	EPanicOutOfSyncUtf7Byte1=51,
	EPanicOutOfSyncUtf7Byte2,
	EPanicOutOfSyncBase64Decoding
	};
sl@0
    81
sl@0
    82
_LIT(KLitPanicText, "CHARCONV-UTF");
sl@0
    83
sl@0
    84
LOCAL_C void Panic(TPanic aPanic)
sl@0
    85
	{
sl@0
    86
	User::Panic(KLitPanicText, aPanic);
sl@0
    87
	}
sl@0
    88
sl@0
    89
// Returns the character that opens a base-64 block: '&' for the IMAP variant
// of UTF-7, '+' otherwise.
inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7)
	{
	if (aIsImapUtf7)
		{
		return '&';
		}
	return '+';
	}
sl@0
    90
sl@0
    91
// Maps a member of the base-64 alphabet to its 6-bit value (0..63).
// 'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, and 63 is
// ',' when aIsImapUtf7 is true or '/' otherwise.
// Returns KNotInBase64Alphabet for any other input.
LOCAL_C TUint Base64Decoding(TUint aMemberOfBase64Alphabet, TBool aIsImapUtf7)
	{
	const TUint ch=aMemberOfBase64Alphabet;
	if ((ch>='A') && (ch<='Z'))
		{
		return ch-'A';
		}
	if ((ch>='a') && (ch<='z'))
		{
		return 26+(ch-'a');
		}
	if ((ch>='0') && (ch<='9'))
		{
		return (26*2)+(ch-'0');
		}
	if (ch=='+')
		{
		return 62;
		}
	// the final member of the alphabet depends on the UTF-7 flavour
	const TUint sixtyThirdMember=STATIC_CAST(TUint, aIsImapUtf7? ',': '/');
	if (ch==sixtyThirdMember)
		{
		return 63;
		}
	return KNotInBase64Alphabet;
	}
sl@0
   115
sl@0
   116
// Maps a 6-bit value (0..63) to the corresponding member of the base-64
// alphabet. The value 63 yields ',' when aIsImapUtf7 is true and '/' otherwise.
// Debug builds panic with EPanicBad6BitNumber if a6BitNumber is 64 or more.
LOCAL_C TUint Base64Encoding(TUint a6BitNumber, TBool aIsImapUtf7)
	{
	__ASSERT_DEBUG(a6BitNumber<64, Panic(EPanicBad6BitNumber));
	// 64 alphabet members followed by the literal's terminating zero (never indexed)
	static const TUint8 standardAlphabet[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	if (aIsImapUtf7 && (a6BitNumber==63))
		{
		return ',';
		}
	return standardAlphabet[a6BitNumber];
	}
sl@0
   126
sl@0
   127
// Scans backwards from aPointerToUtf7Byte, no further than
// aPointerToFirstUtf7Byte, looking for the escape character that opened the
// base-64 block the start position lies in.
//
// The scan continues while each byte is either the escape character or a
// member of the base-64 alphabet; it does not stop at the first escape
// character it meets, so the pointer returned is the lowest-addressed escape
// character seen before the scan ended. Returns NULL if none was seen (debug
// builds panic with EPanicNotInBase64Block in that case).
LOCAL_C TUint8* PointerToEscapeCharacterStartingBase64Block(TUint8* aPointerToUtf7Byte, const TUint8* aPointerToFirstUtf7Byte, TBool aIsImapUtf7)
	{
	__ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers1));
	const TUint escapeCharacter=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
	TUint8* earliestEscapeSeen=NULL;
	for (TUint8* scan=aPointerToUtf7Byte; ; --scan)
		{
		const TUint current=*scan;
		if (current==escapeCharacter)
			{
			earliestEscapeSeen=scan;
			}
		else if (Base64Decoding(current, aIsImapUtf7)==KNotInBase64Alphabet)
			{
			break; // left the base-64 block
			}
		__ASSERT_DEBUG(scan>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers2));
		if (scan<=aPointerToFirstUtf7Byte)
			{
			break; // reached the start of the buffer
			}
		}
	__ASSERT_DEBUG(earliestEscapeSeen!=NULL, Panic(EPanicNotInBase64Block));
	return earliestEscapeSeen;
	}
sl@0
   152
sl@0
   153
// Decides whether aUnicodeCharacter is written to the UTF-7 output as itself
// (non-zero result) or has to go into a base-64 block (zero result).
//
// IMAP variant: every character in 0x0020..0x007e is written directly.
// Otherwise:
//  - outside 0x0021..0x007d only space, tab, CR and LF are written directly;
//  - inside that range, when aEncodeOptionalDirectCharactersInBase64 is set,
//    only A-Z, a-z, 0x0027..0x0029, 0x002b..0x003a and 0x003f are direct;
//  - inside that range, when it is not set, everything except 0x005c is direct.
LOCAL_C TBool EncodeInUtf7Directly(TUint aUnicodeCharacter, TBool aIsImapUtf7, TBool aEncodeOptionalDirectCharactersInBase64)
	{
	const TUint ch=aUnicodeCharacter;
	if (aIsImapUtf7)
		{
		return (ch>=0x0020) && (ch<=0x007e);
		}
	if ((ch<0x0021) || (ch>0x007d))
		{
		switch (ch)
			{
		case 0x0020:
		case 0x0009:
		case 0x000d:
		case 0x000a:
			return ETrue;
		default:
			return EFalse;
			}
		}
	if (!aEncodeOptionalDirectCharactersInBase64)
		{
		return ch!=0x005c;
		}
	const TBool isUpperCaseLetter=(ch>=0x0041) && (ch<=0x005a);
	const TBool isLowerCaseLetter=(ch>=0x0061) && (ch<=0x007a);
	const TBool isQuoteOrParenthesis=(ch>=0x0027) && (ch<=0x0029);
	const TBool isPlusToColon=(ch>=0x002b) && (ch<=0x003a);
	const TBool isQuestionMark=(ch==0x003f);
	return isUpperCaseLetter || isLowerCaseLetter || isQuoteOrParenthesis || isPlusToColon || isQuestionMark;
	}
sl@0
   173
sl@0
   174
// Returns non-zero if any of the lowest aNumberOfBitsInBuffer bits of
// aBitBuffer is set.
inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
	{
	const TUint lowBitsMask=(1<<aNumberOfBitsInBuffer)-1;
	return (aBitBuffer&lowBitsMask)!=0;
	}
sl@0
   178
sl@0
   179
sl@0
   180
sl@0
   181
/**  Converts Unicode text into UTF-7 encoding. The fucntion leaves with 
sl@0
   182
KErrCorrupt if the input string is corrupt.
sl@0
   183
sl@0
   184
@param aUnicode A UCS-2 encoded input string.
sl@0
   185
@param aEncodeOptionalDirectCharactersInBase64  If ETrue then 
sl@0
   186
characters from UTF-7 set O (optional direct characters) are encoded in 
sl@0
   187
Modified Base64. If EFalse the characters are encoded directly, 
sl@0
   188
as their ASCII equivalents.
sl@0
   189
@return A descriptor containing the UTF-7 encoded output string. */
sl@0
   190
EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf7L(
sl@0
   191
										const TDesC16& aUnicode, 
sl@0
   192
										TBool aEncodeOptionalDirectCharactersInBase64)
sl@0
   193
	{
sl@0
   194
	// If aUnicode is  Null string, return an empty HBufC
sl@0
   195
	if (aUnicode.Length() == 0)
sl@0
   196
		{
sl@0
   197
		HBufC8* hBuf8 = HBufC8::NewL(1);
sl@0
   198
		return hBuf8;
sl@0
   199
		}
sl@0
   200
sl@0
   201
	// Otherwise, convert and store result in a buffer, reallocating that buffer if needed.
sl@0
   202
	TInt length = aUnicode.Length();
sl@0
   203
	const TInt bufsize = 100;
sl@0
   204
	
sl@0
   205
	TPtrC16 unicode (aUnicode);
sl@0
   206
	TBuf8<bufsize> buf;
sl@0
   207
	HBufC8* hBuf8 = HBufC8::NewLC(length);
sl@0
   208
	TPtr8 utf7 = hBuf8->Des();
sl@0
   209
sl@0
   210
	FOREVER
sl@0
   211
		{
sl@0
   212
		TInt unconverted = ConvertFromUnicodeToUtf7(buf, unicode, aEncodeOptionalDirectCharactersInBase64);
sl@0
   213
		if( unconverted == EErrorIllFormedInput || unconverted < 0)
sl@0
   214
			User::Leave(KErrCorrupt);
sl@0
   215
sl@0
   216
		if (utf7.Length() + buf.Length() > utf7.MaxLength())
sl@0
   217
			{
sl@0
   218
			// Reallocate the hBuf8
sl@0
   219
			hBuf8 = hBuf8->ReAllocL(utf7.Length() + buf.Length());
sl@0
   220
			CleanupStack::Pop();
sl@0
   221
			CleanupStack::PushL(hBuf8);
sl@0
   222
			utf7.Set(hBuf8->Des());
sl@0
   223
			}
sl@0
   224
		utf7.Append(buf);
sl@0
   225
		if (unconverted ==0) 
sl@0
   226
			break;
sl@0
   227
		unicode.Set(unicode.Right(unconverted));
sl@0
   228
		}
sl@0
   229
	CleanupStack::Pop();
sl@0
   230
	return hBuf8;
sl@0
   231
sl@0
   232
	}
sl@0
   233
sl@0
   234
/** Converts Unicode text into UTF-7 encoding.
sl@0
   235
sl@0
   236
@param aUtf7 On return, contains the UTF-7 encoded output string.
sl@0
   237
@param aUnicode A UCS-2 encoded input string.
sl@0
   238
@param aEncodeOptionalDirectCharactersInBase64 If ETrue then characters from 
sl@0
   239
UTF-7 set O (optional direct characters) are encoded in Modified Base64. If 
sl@0
   240
EFalse the characters are encoded directly, as their ASCII equivalents.
sl@0
   241
@return The number of unconverted characters left at the end of the input 
sl@0
   242
descriptor, or one of the error values defined in TError. */
sl@0
   243
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(
sl@0
   244
										TDes8& aUtf7, 
sl@0
   245
										const TDesC16& aUnicode, 
sl@0
   246
										TBool aEncodeOptionalDirectCharactersInBase64)
sl@0
   247
	{
sl@0
   248
	return ConvertFromUnicodeToUtf7(aUtf7, aUnicode, EFalse, aEncodeOptionalDirectCharactersInBase64);
sl@0
   249
	}
sl@0
   250
sl@0
   251
TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(TDes8& aUtf7, 
sl@0
   252
											   const TDesC16& aUnicode, 
sl@0
   253
											   TBool aIsImapUtf7, 
sl@0
   254
											   TBool aEncodeOptionalDirectCharactersInBase64)
sl@0
   255
	{
sl@0
   256
	if (aUnicode.Length()==0)
sl@0
   257
		{
sl@0
   258
		aUtf7.SetLength(0);
sl@0
   259
		return 0;
sl@0
   260
		}
sl@0
   261
	if (aUtf7.MaxLength()==0)
sl@0
   262
		{
sl@0
   263
		return aUnicode.Length();
sl@0
   264
		}
sl@0
   265
	const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
sl@0
   266
	TUint8* pointerToPreviousUtf7Byte=CONST_CAST(TUint8*, aUtf7.Ptr()-1);
sl@0
   267
	const TUint8* const pointerToLastUtf7Byte=pointerToPreviousUtf7Byte+aUtf7.MaxLength();
sl@0
   268
	const TUint16* pointerToPreviousUnicodeCharacter=aUnicode.Ptr()-1;
sl@0
   269
	const TUint16* const pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.Length();
sl@0
   270
	const TUint KIsInBase64Block=0x80000000u;
sl@0
   271
	TUint bitBuffer=0;
sl@0
   272
	TInt numberOfBitsInBuffer=0;
sl@0
   273
	FOREVER
sl@0
   274
		{
sl@0
   275
		__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers3));
sl@0
   276
		__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers1));
sl@0
   277
		TUint currentUnicodeCharacter=(pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)? 0: *(pointerToPreviousUnicodeCharacter+1);
sl@0
   278
		if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || EncodeInUtf7Directly(currentUnicodeCharacter, aIsImapUtf7, aEncodeOptionalDirectCharactersInBase64))
sl@0
   279
			{
sl@0
   280
			__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState1));
sl@0
   281
			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState2));
sl@0
   282
			if (bitBuffer&KIsInBase64Block)
sl@0
   283
				{
sl@0
   284
				if (numberOfBitsInBuffer!=0)
sl@0
   285
					{
sl@0
   286
					if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<2) // make sure there is enough space for the trailing '-' as well as the remains of the bitBuffer as the KIsInBase64Block flag is about to turned off, thus the trailing '-' may never get written
sl@0
   287
						{
sl@0
   288
						break;
sl@0
   289
						}
sl@0
   290
					++pointerToPreviousUtf7Byte;
sl@0
   291
					*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
sl@0
   292
					}
sl@0
   293
				else
sl@0
   294
					{
sl@0
   295
					if (pointerToPreviousUtf7Byte==pointerToLastUtf7Byte)
sl@0
   296
						{
sl@0
   297
						break;
sl@0
   298
						}
sl@0
   299
					}
sl@0
   300
				++pointerToPreviousUtf7Byte;
sl@0
   301
				*pointerToPreviousUtf7Byte='-';
sl@0
   302
				bitBuffer=0;
sl@0
   303
				numberOfBitsInBuffer=0;
sl@0
   304
				}
sl@0
   305
			__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers2));
sl@0
   306
			if (pointerToPreviousUnicodeCharacter>=pointerToLastUnicodeCharacter)
sl@0
   307
				{
sl@0
   308
				break;
sl@0
   309
				}
sl@0
   310
			__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers4));
sl@0
   311
			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<((currentUnicodeCharacter==escapeCharacterForStartingBase64Block)? 2: 1))
sl@0
   312
				{
sl@0
   313
				break;
sl@0
   314
				}
sl@0
   315
			++pointerToPreviousUtf7Byte;
sl@0
   316
			*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, currentUnicodeCharacter);
sl@0
   317
			++pointerToPreviousUnicodeCharacter;
sl@0
   318
			if (currentUnicodeCharacter==escapeCharacterForStartingBase64Block)
sl@0
   319
				{
sl@0
   320
				++pointerToPreviousUtf7Byte;
sl@0
   321
				*pointerToPreviousUtf7Byte='-';
sl@0
   322
				}
sl@0
   323
			}
sl@0
   324
		else
sl@0
   325
			{
sl@0
   326
			{
sl@0
   327
			TInt numberOfUtf7BytesRequired=(numberOfBitsInBuffer+16)/6; // "(numberOfBitsInBuffer+16)/6" is the number of iterations that will happen in the while loop below
sl@0
   328
			if (~bitBuffer&KIsInBase64Block)
sl@0
   329
				{
sl@0
   330
				++numberOfUtf7BytesRequired; // for the initial escapeCharacterForStartingBase64Block
sl@0
   331
				}
sl@0
   332
			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<numberOfUtf7BytesRequired)
sl@0
   333
				{
sl@0
   334
				break;
sl@0
   335
				}
sl@0
   336
			}
sl@0
   337
			if (~bitBuffer&KIsInBase64Block)
sl@0
   338
				{
sl@0
   339
				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers5));
sl@0
   340
				++pointerToPreviousUtf7Byte;
sl@0
   341
				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, escapeCharacterForStartingBase64Block);
sl@0
   342
				}
sl@0
   343
			bitBuffer<<=16;
sl@0
   344
			bitBuffer|=currentUnicodeCharacter;
sl@0
   345
			numberOfBitsInBuffer+=16;
sl@0
   346
			++pointerToPreviousUnicodeCharacter;
sl@0
   347
			__ASSERT_DEBUG(numberOfBitsInBuffer<=20, Panic(EPanicBadBitBufferState3));
sl@0
   348
			while (numberOfBitsInBuffer>=6)
sl@0
   349
				{
sl@0
   350
				numberOfBitsInBuffer-=6;
sl@0
   351
				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers6));
sl@0
   352
				++pointerToPreviousUtf7Byte;
sl@0
   353
				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer>>numberOfBitsInBuffer)&0x3f, aIsImapUtf7));
sl@0
   354
				}
sl@0
   355
			bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - not strictly necessary but it leaves the buffer in a cleaner state
sl@0
   356
			bitBuffer|=KIsInBase64Block;
sl@0
   357
			}
sl@0
   358
		}
sl@0
   359
	__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState4));
sl@0
   360
	__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState5));
sl@0
   361
	if (bitBuffer&KIsInBase64Block)
sl@0
   362
		{
sl@0
   363
#if defined(_DEBUG)
sl@0
   364
		TInt numberOfLoopIterations=1;
sl@0
   365
#endif
sl@0
   366
		FOREVER // there should never be more than 2 iterations of this loop - the first "if" should always succeed the second time if it doesn't succeed the first time
sl@0
   367
			{
sl@0
   368
			__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers7));
sl@0
   369
			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState6));
sl@0
   370
			__ASSERT_DEBUG(numberOfLoopIterations<=2, Panic(EPanicUnexpectedNumberOfLoopIterations));
sl@0
   371
#if defined(_DEBUG)
sl@0
   372
			++numberOfLoopIterations;
sl@0
   373
#endif
sl@0
   374
			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte>=((numberOfBitsInBuffer==0)? 1: 2)) // if there's room to finish off the base-64 sequence by (i) flushing the bit-buffer and (ii) appending the trailing '-'
sl@0
   375
				{
sl@0
   376
				if (numberOfBitsInBuffer!=0)
sl@0
   377
					{
sl@0
   378
					__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers8));
sl@0
   379
					++pointerToPreviousUtf7Byte;
sl@0
   380
					*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
sl@0
   381
					}
sl@0
   382
				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers9));
sl@0
   383
				++pointerToPreviousUtf7Byte;
sl@0
   384
				*pointerToPreviousUtf7Byte='-';
sl@0
   385
				break;
sl@0
   386
				}
sl@0
   387
			// it is now necessary to move back pointerToPreviousUtf7Byte so that the base-64 sequence can be terminated - note it must be terminated on a Unicode character boundary hence the reason why pointerToPreviousUnicodeCharacter may be moved back too
sl@0
   388
			TUint8* pointerToEscapeCharacterStartingBase64Block=PointerToEscapeCharacterStartingBase64Block(pointerToPreviousUtf7Byte, aUtf7.Ptr(), aIsImapUtf7);
sl@0
   389
			const TInt oldNumberOfBase64Characters=pointerToPreviousUtf7Byte-pointerToEscapeCharacterStartingBase64Block;
sl@0
   390
			__ASSERT_DEBUG(oldNumberOfBase64Characters>0, Panic(EPanicInitialEscapeCharacterButNoBase64));
sl@0
   391
			__ASSERT_DEBUG(((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)%16==0, Panic(EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary));
sl@0
   392
			pointerToPreviousUnicodeCharacter-=((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)/16; // move back pointerToPreviousUnicodeCharacter to before the equivalent of the base-64 sequence
sl@0
   393
			pointerToPreviousUtf7Byte=pointerToEscapeCharacterStartingBase64Block;
sl@0
   394
			__ASSERT_DEBUG(*pointerToPreviousUtf7Byte==escapeCharacterForStartingBase64Block, Panic(EPanicBadUtf7Pointers10));
sl@0
   395
			if (oldNumberOfBase64Characters<4) // if the new base-64 sequence will be so short that it won't even be able to contain the UTF-7 encoding of a single Unicode character
sl@0
   396
				{
sl@0
   397
				--pointerToPreviousUtf7Byte; // move back pointerToPreviousUtf7Byte to before the escapeCharacterForStartingBase64Block
sl@0
   398
				break;
sl@0
   399
				}
sl@0
   400
			const TInt newNumberOfUnicodeCharacters=((oldNumberOfBase64Characters-1)*3)/8;
sl@0
   401
			pointerToPreviousUnicodeCharacter+=newNumberOfUnicodeCharacters;
sl@0
   402
			pointerToPreviousUtf7Byte+=((newNumberOfUnicodeCharacters*8)+2)/3;
sl@0
   403
			const TInt numberOfBitsToBeZeroedInLastBase64Character=(newNumberOfUnicodeCharacters%3)*2;
sl@0
   404
			if (numberOfBitsToBeZeroedInLastBase64Character!=0)
sl@0
   405
				{
sl@0
   406
				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding(Base64Decoding(*pointerToPreviousUtf7Byte, aIsImapUtf7)&0x3f&~((1<<numberOfBitsToBeZeroedInLastBase64Character)-1), aIsImapUtf7));
sl@0
   407
				}
sl@0
   408
			bitBuffer=KIsInBase64Block;
sl@0
   409
			numberOfBitsInBuffer=0;
sl@0
   410
			}
sl@0
   411
		}
sl@0
   412
	aUtf7.SetLength((pointerToPreviousUtf7Byte-aUtf7.Ptr())+1);
sl@0
   413
	return pointerToLastUnicodeCharacter-pointerToPreviousUnicodeCharacter;
sl@0
   414
	}
sl@0
   415
sl@0
   416
 
sl@0
   417
sl@0
   418
/** Converts Unicode text into UTF-8 encoding.
sl@0
   419
sl@0
   420
@param aUtf8 On return, contains the UTF-8 encoded output string.
sl@0
   421
@param aUnicode The Unicode-encoded input string.
sl@0
   422
@return The number of unconverted characters left at the end of the input 
sl@0
   423
descriptor, or one of the error values defined in TError. */
sl@0
   424
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
sl@0
   425
	{
sl@0
   426
	return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
sl@0
   427
	}
sl@0
   428
sl@0
   429
sl@0
   430
/**  Converts Unicode text into UTF-8 encoding.
sl@0
   431
sl@0
   432
The variant of UTF-8 used internally by Java differs slightly from
sl@0
   433
standard UTF-8. The TBool argument controls the UTF-8
sl@0
   434
variant generated by this function. This function leaves with a 
sl@0
   435
KErrCorrupt if the input string is corrupt. 
sl@0
   436
sl@0
   437
@param aUnicode A UCS-2 encoded input string.
sl@0
   438
@return A pointer to an HBufC8 containing the converted UTF8. */	
sl@0
   439
EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf8L(const TDesC16& aUnicode)
sl@0
   440
 	{
sl@0
   441
	// If aUnicode is  Null string, return an empty HBufC
sl@0
   442
	if (aUnicode.Length() == 0)
sl@0
   443
		{
sl@0
   444
		HBufC8* hBuf8 = HBufC8::NewL(1);
sl@0
   445
		return hBuf8;
sl@0
   446
		}
sl@0
   447
sl@0
   448
	// Otherwise, convert and store result in a buffer, reallocating that buffer if needed.
sl@0
   449
	const TInt length = aUnicode.Length();
sl@0
   450
	const TInt bufsize = 100;
sl@0
   451
	
sl@0
   452
	TPtrC16 unicode (aUnicode);
sl@0
   453
	TBuf8<bufsize> buf;
sl@0
   454
	HBufC8* hBuf8 = HBufC8::NewLC(length);
sl@0
   455
	TPtr8 utf8 = hBuf8->Des();
sl@0
   456
sl@0
   457
	FOREVER
sl@0
   458
		{
sl@0
   459
		TInt unconverted = ConvertFromUnicodeToUtf8(buf, unicode);
sl@0
   460
		if( unconverted == EErrorIllFormedInput || unconverted < 0)
sl@0
   461
			User::Leave(KErrCorrupt);
sl@0
   462
sl@0
   463
		if (utf8.Length() + buf.Length() > utf8.MaxLength())
sl@0
   464
			{
sl@0
   465
			// Reallocate the hBuf8
sl@0
   466
			hBuf8 = hBuf8->ReAllocL(utf8.Length() + buf.Length());
sl@0
   467
			CleanupStack::Pop();
sl@0
   468
			CleanupStack::PushL(hBuf8);
sl@0
   469
			utf8.Set(hBuf8->Des());
sl@0
   470
			}
sl@0
   471
		utf8.Append(buf);
sl@0
   472
		if (unconverted ==0) 
sl@0
   473
			break;
sl@0
   474
		unicode.Set(unicode.Right(unconverted));
sl@0
   475
		}
sl@0
   476
	CleanupStack::Pop();
sl@0
   477
	return hBuf8;
sl@0
   478
	}
sl@0
   479
sl@0
   480
/** Converts Unicode text into UTF-8 encoding. 
sl@0
   481
sl@0
   482
Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.
sl@0
   483
sl@0
   484
The variant of UTF-8 used internally by Java differs slightly from standard 
sl@0
   485
UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
sl@0
   486
sl@0
   487
@param aUtf8 On return, contains the UTF-8 encoded output string.
sl@0
   488
@param aUnicode A UCS-2 encoded input string.
sl@0
   489
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
sl@0
   490
UTF-8. The default is EFalse.
sl@0
   491
@return The number of unconverted characters left at the end of the input descriptor, 
sl@0
   492
or one of the error values defined in TError. */
sl@0
   493
TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, 
sl@0
   494
											   const TDesC16& aUnicode, 
sl@0
   495
											   TBool aGenerateJavaConformantUtf8)
sl@0
   496
	{
sl@0
   497
	if (aUnicode.Length() == 0)
sl@0
   498
		{
sl@0
   499
		aUtf8.SetLength(0);
sl@0
   500
		return 0;
sl@0
   501
		}
sl@0
   502
	if (aUtf8.MaxLength() == 0)
sl@0
   503
		{
sl@0
   504
		return aUnicode.Length();
sl@0
   505
		}
sl@0
   506
	
sl@0
   507
	TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
sl@0
   508
	const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
sl@0
   509
	TBool inputIsTruncated = EFalse;
sl@0
   510
	const TUint16* pUnicode = aUnicode.Ptr();
sl@0
   511
	const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
sl@0
   512
	
sl@0
   513
	FOREVER
sl@0
   514
		{
sl@0
   515
		__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
sl@0
   516
		__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
sl@0
   517
	
sl@0
   518
		if (pUnicode[0] < 0x80)
sl@0
   519
			{
sl@0
   520
			// ascii - 1 byte
sl@0
   521
			
sl@0
   522
			// internally java is different since the \x0000 character is 
sl@0
   523
			// translated into \xC0 \x80.
sl@0
   524
			
sl@0
   525
			if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
sl@0
   526
				{
sl@0
   527
				if (pUtf8 == pointerToLastUtf8Byte)
sl@0
   528
					{
sl@0
   529
					pUtf8--;
sl@0
   530
					pUnicode--;
sl@0
   531
					break;			
sl@0
   532
					}
sl@0
   533
				*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
sl@0
   534
				*pUtf8   = STATIC_CAST(TUint8, 0x80);	
sl@0
   535
				}
sl@0
   536
			else
sl@0
   537
				{
sl@0
   538
				*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
sl@0
   539
				}
sl@0
   540
			}
sl@0
   541
		else if (pUnicode[0] < 0x800)
sl@0
   542
			{
sl@0
   543
			// U+0080..U+07FF - 2 bytes
sl@0
   544
			
sl@0
   545
			if (pUtf8 == pointerToLastUtf8Byte)
sl@0
   546
				{
sl@0
   547
				pUtf8--;
sl@0
   548
				pUnicode--;
sl@0
   549
				break;
sl@0
   550
				}
sl@0
   551
			
sl@0
   552
			*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
sl@0
   553
			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
sl@0
   554
			
sl@0
   555
			}
sl@0
   556
sl@0
   557
		// check to see if we have a surrogate in the stream, surrogates encode code points outside
sl@0
   558
		// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.
sl@0
   559
sl@0
   560
		else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
sl@0
   561
			{
sl@0
   562
			// surrogate pair - 4 bytes in utf-8
sl@0
   563
			// U+10000..U+10FFFF
sl@0
   564
			
sl@0
   565
			__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
sl@0
   566
			// is there enough space to hold the character
sl@0
   567
			if ((pointerToLastUtf8Byte - pUtf8) < 3)
sl@0
   568
				{
sl@0
   569
				pUtf8--;
sl@0
   570
				pUnicode--;
sl@0
   571
				break;  // no go to the exit condition
sl@0
   572
				}
sl@0
   573
			
sl@0
   574
			__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
sl@0
   575
			if (pUnicode >= pointerToLastUnicodeCharacter)
sl@0
   576
				{
sl@0
   577
				pUtf8--;
sl@0
   578
				pUnicode--;
sl@0
   579
				inputIsTruncated = ETrue;
sl@0
   580
				break; // middle of a surrogate pair. go to end condition
sl@0
   581
				}
sl@0
   582
			
sl@0
   583
			if ((pUnicode[1] & 0xfc00) != 0xdc00)
sl@0
   584
				{
sl@0
   585
				return EErrorIllFormedInput;
sl@0
   586
				}
sl@0
   587
			
sl@0
   588
			// convert utf-16 surrogate to utf-32
sl@0
   589
			TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
sl@0
   590
			
sl@0
   591
			// convert utf-32 to utf-8
sl@0
   592
            *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));   
sl@0
   593
            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
sl@0
   594
            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
sl@0
   595
            *pUtf8   = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
sl@0
   596
			
sl@0
   597
            // we consumed 2 utf-16 values, move this pointer
sl@0
   598
			pUnicode++;
sl@0
   599
			}		
sl@0
   600
		else
sl@0
   601
			{
sl@0
   602
			// 3 byte - utf-8, U+800..U+FFFF rest of BMP.
sl@0
   603
			
sl@0
   604
			if (pointerToLastUtf8Byte - pUtf8 < 2)
sl@0
   605
				{
sl@0
   606
				pUtf8--;
sl@0
   607
				pUnicode--;
sl@0
   608
				break;
sl@0
   609
				}
sl@0
   610
			*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
sl@0
   611
			*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
sl@0
   612
			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
sl@0
   613
			}
sl@0
   614
		
sl@0
   615
		if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
sl@0
   616
			{
sl@0
   617
			break;
sl@0
   618
			}
sl@0
   619
		
sl@0
   620
		pUtf8++;
sl@0
   621
		pUnicode++;
sl@0
   622
		
sl@0
   623
		}
sl@0
   624
	
sl@0
   625
	if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
sl@0
   626
		{
sl@0
   627
		return EErrorIllFormedInput;
sl@0
   628
		}
sl@0
   629
	
sl@0
   630
	aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
sl@0
   631
	return pointerToLastUnicodeCharacter-pUnicode;
sl@0
   632
	}
sl@0
   633
sl@0
   634
sl@0
   635
sl@0
   636
/**  Converts text encoded using the Unicode transformation format UTF-7
sl@0
   637
into the Unicode UCS-2 character set.
sl@0
   638
sl@0
   639
@param aUtf7 The UTF-7 encoded input string.
sl@0
   640
@return A pointer to an HBufC16 containing the converted Unicode string */	
sl@0
   641
EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf7L(const TDesC8& aUtf7)
sl@0
   642
	{
sl@0
   643
		// If aUtf8 is an empty string return 
sl@0
   644
	if (aUtf7.Length()==0)
sl@0
   645
		{
sl@0
   646
		HBufC16* hBuf = HBufC16::NewL(1);
sl@0
   647
		return hBuf;
sl@0
   648
		}
sl@0
   649
sl@0
   650
	// else convert aUtf8 to Unicode storing the result in a buffer, reallocating
sl@0
   651
	// it when needed.
sl@0
   652
	TInt length = aUtf7.Length();
sl@0
   653
	const TInt bufsize = 100;
sl@0
   654
	TInt state = KStateDefault;
sl@0
   655
sl@0
   656
	TPtrC8 utf7 (aUtf7);
sl@0
   657
	TBuf<bufsize> buf;
sl@0
   658
	HBufC16* hBuf = HBufC16::NewLC(length);
sl@0
   659
	TPtr unicode = hBuf->Des();
sl@0
   660
sl@0
   661
	FOREVER
sl@0
   662
		{
sl@0
   663
		TInt unconverted = ConvertToUnicodeFromUtf7(buf, utf7, state);
sl@0
   664
		if( unconverted == EErrorIllFormedInput || unconverted < 0)
sl@0
   665
			User::Leave(KErrCorrupt);
sl@0
   666
sl@0
   667
		if (unicode.Length() + buf.Length() > unicode.MaxLength())
sl@0
   668
			{
sl@0
   669
			// Reallocate hBuf
sl@0
   670
			hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length());
sl@0
   671
			CleanupStack::Pop();
sl@0
   672
			CleanupStack::PushL(hBuf);
sl@0
   673
			unicode.Set(hBuf->Des());
sl@0
   674
			}
sl@0
   675
		unicode.Append(buf);
sl@0
   676
		if (unconverted ==0) 
sl@0
   677
			break;
sl@0
   678
		utf7.Set(utf7.Right(unconverted));
sl@0
   679
		}
sl@0
   680
	CleanupStack::Pop();
sl@0
   681
	return hBuf;
sl@0
   682
	}
sl@0
   683
sl@0
   684
 
sl@0
   685
sl@0
   686
/** Converts text encoded using the Unicode transformation format UTF-7 into the 
sl@0
   687
Unicode UCS-2 character set.
sl@0
   688
sl@0
   689
If the conversion is achieved using a series of calls to this function, where 
sl@0
   690
each call starts off where the previous call reached in the input descriptor, 
sl@0
   691
the state of the conversion is stored. The initial value of the state variable 
sl@0
   692
should be set as KStateDefault when the conversion is started, and afterwards 
sl@0
   693
simply passed unchanged into each function call.
sl@0
   694
sl@0
   695
@param aUnicode On return, contains the Unicode encoded output string.
sl@0
   696
@param aUtf7 The UTF-7 encoded input string.
sl@0
   697
@param aState For the first call of the function set to KStateDefault. For 
sl@0
   698
subsequent calls, pass in the variable unchanged.
sl@0
   699
@return The number of unconverted bytes left at the end of the input descriptor, 
sl@0
   700
or one of the error values defined in TError. */
sl@0
   701
EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, 
sl@0
   702
														const TDesC8& aUtf7, 
sl@0
   703
														TInt& aState)
sl@0
   704
	{
sl@0
   705
	return ConvertToUnicodeFromUtf7(aUnicode, aUtf7, EFalse, aState);
sl@0
   706
	}
sl@0
   707
sl@0
   708
TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, 
sl@0
   709
											   const TDesC8& aUtf7, 
sl@0
   710
											   TBool aIsImapUtf7, 
sl@0
   711
											   TInt& aState)
sl@0
   712
	{
sl@0
   713
	if (aUtf7.Length()==0)
sl@0
   714
		{
sl@0
   715
		aUnicode.SetLength(0);
sl@0
   716
		return 0;
sl@0
   717
		}
sl@0
   718
	if (aUnicode.MaxLength()==0)
sl@0
   719
		{
sl@0
   720
		return aUtf7.Length();
sl@0
   721
		}
sl@0
   722
	const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
sl@0
   723
	TUint16* pointerToPreviousUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()-1);
sl@0
   724
	const TUint16* pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.MaxLength();
sl@0
   725
	const TUint8* pointerToCurrentUtf7Byte=aUtf7.Ptr();
sl@0
   726
	const TUint8* pointerToLastUtf7Byte=pointerToCurrentUtf7Byte+(aUtf7.Length()-1);
sl@0
   727
	TUint currentUtf7Byte=*pointerToCurrentUtf7Byte;
sl@0
   728
	const TUint KIsInBase64Block=0x80000000u;
sl@0
   729
	TUint bitBuffer=STATIC_CAST(TUint, aState);
sl@0
   730
	TInt numberOfBitsInBuffer=((bitBuffer&0xf0)>>4);
sl@0
   731
	bitBuffer&=~0xf0; // turn off the bits that stored numberOfBitsInBuffer
sl@0
   732
	if (bitBuffer&KIsInBase64Block)
sl@0
   733
		{
sl@0
   734
		__ASSERT_ALWAYS((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4) || ((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)), Panic(EPanicBadBitBufferState7));
sl@0
   735
		__ASSERT_ALWAYS((bitBuffer&~(KIsInBase64Block|0x0000000f))==0, Panic(EPanicBadBitBufferState8));
sl@0
   736
		}
sl@0
   737
	else
sl@0
   738
		{
sl@0
   739
		__ASSERT_ALWAYS(bitBuffer==0, Panic(EPanicBadBitBufferState9));
sl@0
   740
		__ASSERT_ALWAYS(numberOfBitsInBuffer==0, Panic(EPanicBadBitBufferState10));
sl@0
   741
		}
sl@0
   742
	aState=KStateDefault;
sl@0
   743
	if (bitBuffer&KIsInBase64Block)
sl@0
   744
		{
sl@0
   745
		currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7);
sl@0
   746
		}
sl@0
   747
	TBool inputIsTruncated=EFalse;
sl@0
   748
	FOREVER
sl@0
   749
		{
sl@0
   750
		__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers5));
sl@0
   751
		__ASSERT_DEBUG(pointerToCurrentUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers11));
sl@0
   752
		__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (currentUtf7Byte==*pointerToCurrentUtf7Byte), Panic(EPanicOutOfSyncUtf7Byte1));
sl@0
   753
		__ASSERT_DEBUG((~bitBuffer&KIsInBase64Block) || (currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7)), Panic(EPanicOutOfSyncUtf7Byte2));
sl@0
   754
		__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || ((bitBuffer==0) && (numberOfBitsInBuffer==0)), Panic(EPanicBadBitBufferState11));
sl@0
   755
		if ((~bitBuffer&KIsInBase64Block) && (currentUtf7Byte==escapeCharacterForStartingBase64Block))
sl@0
   756
			{
sl@0
   757
			if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
sl@0
   758
				{
sl@0
   759
				--pointerToCurrentUtf7Byte;
sl@0
   760
				inputIsTruncated=ETrue;
sl@0
   761
				goto end;
sl@0
   762
				}
sl@0
   763
			++pointerToCurrentUtf7Byte;
sl@0
   764
			currentUtf7Byte=*pointerToCurrentUtf7Byte;
sl@0
   765
			if (currentUtf7Byte=='-')
sl@0
   766
				{
sl@0
   767
				currentUtf7Byte=escapeCharacterForStartingBase64Block;
sl@0
   768
				}
sl@0
   769
			else
sl@0
   770
				{
sl@0
   771
				currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7);
sl@0
   772
				if (currentUtf7Byte==KNotInBase64Alphabet)
sl@0
   773
					{
sl@0
   774
					return EErrorIllFormedInput;
sl@0
   775
					}
sl@0
   776
				bitBuffer=KIsInBase64Block;
sl@0
   777
				}
sl@0
   778
			}
sl@0
   779
		if (bitBuffer&KIsInBase64Block)
sl@0
   780
			{
sl@0
   781
			FOREVER
sl@0
   782
				{
sl@0
   783
				__ASSERT_DEBUG(currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7), Panic(EPanicOutOfSyncBase64Decoding));
sl@0
   784
				__ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState12));
sl@0
   785
				if (currentUtf7Byte==KNotInBase64Alphabet)
sl@0
   786
					{
sl@0
   787
					if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer))
sl@0
   788
						{
sl@0
   789
						return EErrorIllFormedInput;
sl@0
   790
						}
sl@0
   791
					bitBuffer=0;
sl@0
   792
					numberOfBitsInBuffer=0;
sl@0
   793
					currentUtf7Byte=*pointerToCurrentUtf7Byte;
sl@0
   794
					if (currentUtf7Byte=='-')
sl@0
   795
						{
sl@0
   796
						if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
sl@0
   797
							{
sl@0
   798
							goto end;
sl@0
   799
							}
sl@0
   800
						++pointerToCurrentUtf7Byte;
sl@0
   801
						currentUtf7Byte=*pointerToCurrentUtf7Byte;
sl@0
   802
						}
sl@0
   803
					break;
sl@0
   804
					}
sl@0
   805
				bitBuffer<<=6;
sl@0
   806
				bitBuffer|=currentUtf7Byte;
sl@0
   807
				bitBuffer|=KIsInBase64Block;
sl@0
   808
				numberOfBitsInBuffer+=6;
sl@0
   809
				// only flush the buffer if it contains a whole Unicode character and the remainder is either all zero-bits (hence would be a legal point to end the base-64 sequence) or at least 6 bits long (therefore would leave at least one UTF-7 byte unconverted at the end of the input descriptor)
sl@0
   810
				if ((numberOfBitsInBuffer>=16+6) || ((numberOfBitsInBuffer>=16) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16)))
sl@0
   811
					{
sl@0
   812
					numberOfBitsInBuffer-=16;
sl@0
   813
					__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers6));
sl@0
   814
					++pointerToPreviousUnicodeCharacter;
sl@0
   815
					*pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, bitBuffer>>numberOfBitsInBuffer);
sl@0
   816
					bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - must be done as bitBuffer is stored along with numberOfBitsInBuffer in aState if the output descriptor runs out of space or if the input descriptor was truncated
sl@0
   817
					bitBuffer|=KIsInBase64Block; // turn it back on as the line above turned it off
sl@0
   818
					if (pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)
sl@0
   819
						{
sl@0
   820
						goto end;
sl@0
   821
						}
sl@0
   822
					}
sl@0
   823
				if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
sl@0
   824
					{
sl@0
   825
					inputIsTruncated=ETrue;
sl@0
   826
					goto end;
sl@0
   827
					}
sl@0
   828
				++pointerToCurrentUtf7Byte;
sl@0
   829
				currentUtf7Byte=Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7);
sl@0
   830
				}
sl@0
   831
			}
sl@0
   832
		else
sl@0
   833
			{
sl@0
   834
			__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers7));
sl@0
   835
			++pointerToPreviousUnicodeCharacter;
sl@0
   836
			*pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, currentUtf7Byte);
sl@0
   837
			if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte))
sl@0
   838
				{
sl@0
   839
				goto end;
sl@0
   840
				}
sl@0
   841
			++pointerToCurrentUtf7Byte;
sl@0
   842
			currentUtf7Byte=*pointerToCurrentUtf7Byte;
sl@0
   843
			}
sl@0
   844
		}
sl@0
   845
end:
sl@0
   846
	if (bitBuffer&KIsInBase64Block)
sl@0
   847
		{
sl@0
   848
		__ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState13));
sl@0
   849
		if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer))
sl@0
   850
			{
sl@0
   851
			// rewind how far we've got in the UTF-7 descriptor to indicate to the user (by returning a value greater than zero) that not all of the input could be converted as it ended with a truncated base-64 sequence
sl@0
   852
			__ASSERT_DEBUG(numberOfBitsInBuffer>=6, Panic(EPanicBadBitBufferState14));
sl@0
   853
			pointerToCurrentUtf7Byte-=numberOfBitsInBuffer/6;
sl@0
   854
			const TInt newNumberOfBitsInBuffer=numberOfBitsInBuffer%6;
sl@0
   855
			bitBuffer&=~KIsInBase64Block; // temporarily turn off the KIsInBase64Block for the right-shift
sl@0
   856
			bitBuffer>>=(numberOfBitsInBuffer-newNumberOfBitsInBuffer);
sl@0
   857
			bitBuffer|=KIsInBase64Block; // must be turned back on again as the bit-buffer is packed into aState
sl@0
   858
			numberOfBitsInBuffer=newNumberOfBitsInBuffer;
sl@0
   859
			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState15));
sl@0
   860
			}
sl@0
   861
		__ASSERT_DEBUG((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0), Panic(EPanicBadBitBufferState16));
sl@0
   862
		aState=STATIC_CAST(TInt, bitBuffer);
sl@0
   863
		aState|=(numberOfBitsInBuffer<<4);
sl@0
   864
		__ASSERT_DEBUG(aState&KIsInBase64Block, Panic(EPanicBadBitBufferState17));
sl@0
   865
		bitBuffer=0;
sl@0
   866
		numberOfBitsInBuffer=0;
sl@0
   867
		}
sl@0
   868
	if ((pointerToCurrentUtf7Byte<aUtf7.Ptr()) && inputIsTruncated)
sl@0
   869
		{
sl@0
   870
		return EErrorIllFormedInput;
sl@0
   871
		}
sl@0
   872
	aUnicode.SetLength((pointerToPreviousUnicodeCharacter+1)-aUnicode.Ptr());
sl@0
   873
	return pointerToLastUtf7Byte-pointerToCurrentUtf7Byte;
sl@0
   874
	}
sl@0
   875
sl@0
   876
sl@0
   877
sl@0
   878
/** Converts text encoded using the Unicode transformation format UTF-8
sl@0
   879
into the Unicode UCS-2 character set. This function leaves with an 
sl@0
   880
error code of the input string is corrupted. 
sl@0
   881
sl@0
   882
@param aUtf8 The UTF-8 encoded input string
sl@0
   883
@return A pointer to an HBufC16 with the converted Unicode string. */	
sl@0
   884
EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf8L(const TDesC8& aUtf8)
sl@0
   885
 	{
sl@0
   886
	// If aUtf8 is an empty string return 
sl@0
   887
	if (aUtf8.Length()==0)
sl@0
   888
		{
sl@0
   889
		HBufC16* hBuf = HBufC16::NewL(1);
sl@0
   890
		return hBuf;
sl@0
   891
		}
sl@0
   892
sl@0
   893
	// else convert aUtf8 to Unicode storing the result in a buffer, reallocating
sl@0
   894
	// it when needed.
sl@0
   895
	TInt length = aUtf8.Length();
sl@0
   896
	const TInt bufsize = 100;
sl@0
   897
sl@0
   898
	TPtrC8 utf8 (aUtf8);
sl@0
   899
	TBuf<bufsize> buf;
sl@0
   900
	HBufC16* hBuf = HBufC16::NewLC(length);
sl@0
   901
	TPtr unicode = hBuf->Des();
sl@0
   902
sl@0
   903
	FOREVER
sl@0
   904
		{
sl@0
   905
		TInt unconverted = ConvertToUnicodeFromUtf8(buf, utf8);
sl@0
   906
		if( unconverted == EErrorIllFormedInput || unconverted < 0)
sl@0
   907
			User::Leave(KErrCorrupt);
sl@0
   908
sl@0
   909
		if (unicode.Length() + buf.Length() > unicode.MaxLength())
sl@0
   910
			{
sl@0
   911
			// Reallocate hBuf
sl@0
   912
			hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length());
sl@0
   913
			CleanupStack::Pop();
sl@0
   914
			CleanupStack::PushL(hBuf);
sl@0
   915
			unicode.Set(hBuf->Des());
sl@0
   916
			}
sl@0
   917
		unicode.Append(buf);
sl@0
   918
		if (unconverted ==0) 
sl@0
   919
			break;
sl@0
   920
		utf8.Set(utf8.Right(unconverted));
sl@0
   921
		}
sl@0
   922
	CleanupStack::Pop();
sl@0
   923
	return hBuf;
sl@0
   924
	}
sl@0
   925
sl@0
   926
/** Converts text encoded using the Unicode transformation format UTF-8 into the 
sl@0
   927
Unicode UCS-2 character set.
sl@0
   928
sl@0
   929
@param aUnicode On return, contains the Unicode encoded output string.
sl@0
   930
@param aUtf8 The UTF-8 encoded input string
sl@0
   931
@return The number of unconverted bytes left at the end of the input descriptor, 
sl@0
   932
or one of the error values defined in TError. */
sl@0
   933
EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
sl@0
   934
	{
sl@0
   935
	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
sl@0
   936
	}
sl@0
   937
sl@0
   938
static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
sl@0
   939
		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
sl@0
   940
	{
sl@0
   941
	if (aNumberOfUnconvertibleCharacters<=0)
sl@0
   942
		{
sl@0
   943
		aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
sl@0
   944
		}
sl@0
   945
	++aNumberOfUnconvertibleCharacters;
sl@0
   946
	}
sl@0
   947
sl@0
   948
/** Converts text encoded using the Unicode transformation format UTF-8 into the 
sl@0
   949
Unicode UCS-2 character set.
sl@0
   950
sl@0
   951
@param aUnicode On return, contains the Unicode encoded output string.
sl@0
   952
@param aUtf8 The UTF-8 encoded input string
sl@0
   953
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
sl@0
   954
@return The number of unconverted bytes left at the end of the input descriptor, 
sl@0
   955
or one of the error values defined in TError. */
sl@0
   956
TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
sl@0
   957
	{
sl@0
   958
	TInt dummyUnconverted, dummyUnconvertedIndex;
sl@0
   959
	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
sl@0
   960
	}
sl@0
   961
sl@0
   962
/** Converts text encoded using the Unicode transformation format UTF-8 into the 
sl@0
   963
Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
sl@0
   964
sl@0
   965
The variant of UTF-8 used internally by Java differs slightly from standard 
sl@0
   966
UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
sl@0
   967
sl@0
   968
@param aUnicode On return, contains the Unicode encoded output string.
sl@0
   969
@param aUtf8 The UTF-8 encoded input string
sl@0
   970
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
sl@0
   971
UTF-8. The default is EFalse.
sl@0
   972
@param aNumberOfUnconvertibleCharacters On return, contains the number of bytes 
sl@0
   973
which were not converted.
sl@0
   974
@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index 
sl@0
   975
of the first byte of the first unconvertible character. For instance if the 
sl@0
   976
first character in the input descriptor (aForeign) could not be converted, 
sl@0
   977
then this parameter is set to the first byte of that character, i.e. zero. 
sl@0
   978
A negative value is returned if all the characters were converted.
sl@0
   979
@return The number of unconverted bytes left at the end of the input descriptor, 
sl@0
   980
or one of the error values defined in TError. */
sl@0
   981
sl@0
   982
/* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
sl@0
   983
 * Well formed UTF-8 Byte Sequences, full table.
sl@0
   984
 * +----------------------------------------------------------------+
sl@0
   985
 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
sl@0
   986
 * +--------------------+----------+----------+----------+----------+
sl@0
   987
 * | U+0000..U+007F     | 00..7D   |          |          |          |  1 byte, ascii
sl@0
   988
 * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
sl@0
   989
 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
sl@0
   990
 * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
sl@0
   991
 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
sl@0
   992
 * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
sl@0
   993
 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
sl@0
   994
 * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
sl@0
   995
 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
sl@0
   996
 * +--------------------+----------+----------+----------+----------+
sl@0
   997
 * 
sl@0
   998
 * As a consequence of the well-formedness conditions specified in table 3-7,
sl@0
   999
 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
sl@0
  1000
 */
sl@0
  1001
TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
sl@0
  1002
		TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
sl@0
  1003
	{	
sl@0
  1004
	aUnicode.SetLength(0);
sl@0
  1005
	
sl@0
  1006
	if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
sl@0
  1007
		{
sl@0
  1008
		return aUtf8.Length();
sl@0
  1009
		}
sl@0
  1010
sl@0
  1011
	TUint16*           pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
sl@0
  1012
	const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
sl@0
  1013
	const TUint8*         pUtf8 = aUtf8.Ptr();   
sl@0
  1014
	const TUint8*     pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
sl@0
  1015
	const TUint16 replacementcharacter = 0xFFFD;
sl@0
  1016
	TUint currentUnicodeCharacter;
sl@0
  1017
	TInt sequenceLength;
sl@0
  1018
sl@0
  1019
	
sl@0
  1020
	FOREVER
sl@0
  1021
		{
sl@0
  1022
		TBool illFormed=EFalse;
sl@0
  1023
		
sl@0
  1024
		__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
sl@0
  1025
		__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
sl@0
  1026
		
sl@0
  1027
		sequenceLength = 1;
sl@0
  1028
		
sl@0
  1029
		// ascii - optimisation (i.e. it isn't a sequence)
sl@0
  1030
		if (pUtf8[0] < 0x80)
sl@0
  1031
			{
sl@0
  1032
			currentUnicodeCharacter = pUtf8[0];
sl@0
  1033
			}
sl@0
  1034
		else
sl@0
  1035
			{
sl@0
  1036
			// see if well formed utf-8, use table above for reference	
sl@0
  1037
			if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
sl@0
  1038
				{
sl@0
  1039
				// 0xc1-0xc2 are not valid bytes
sl@0
  1040
				sequenceLength = 2;
sl@0
  1041
				}
sl@0
  1042
			else if ((pUtf8[0] & 0xf0) == 0xe0)
sl@0
  1043
				{
sl@0
  1044
				sequenceLength = 3;
sl@0
  1045
				}
sl@0
  1046
			else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
sl@0
  1047
				{
sl@0
  1048
				// 0xf5-0xff, are not valid bytes
sl@0
  1049
				sequenceLength = 4;
sl@0
  1050
				}
sl@0
  1051
			else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
sl@0
  1052
				{
sl@0
  1053
				if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
sl@0
  1054
					{
sl@0
  1055
					// either we've split the 0xc0 0x80 (i.e. 0xc0 is
sl@0
  1056
					// the last character in the string) or we've
sl@0
  1057
					// discovered a valid 0xc0 0x80 sequence.  
sl@0
  1058
					sequenceLength = 2;
sl@0
  1059
					}
sl@0
  1060
				}
sl@0
  1061
			
sl@0
  1062
			/* checking to see if we got a valid sequence */
sl@0
  1063
			if (sequenceLength == 1)
sl@0
  1064
				{
sl@0
  1065
				// bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example
sl@0
  1066
				currentUnicodeCharacter = replacementcharacter;
sl@0
  1067
				UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
  1068
						aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
  1069
				}
sl@0
  1070
			else
sl@0
  1071
				{
sl@0
  1072
				// this is a check to see if the sequence goes beyond the input 
sl@0
  1073
				// stream.  if its not the first and only character in the input
sl@0
  1074
				// stream this isn't an error, otherwise it is.
sl@0
  1075
				if ((pUtf8 + sequenceLength - 1) >  pLastUtf8)
sl@0
  1076
					{
sl@0
  1077
					// check to see if this sequence was the first character
sl@0
  1078
					if ((pUnicode - aUnicode.Ptr()) == 0)
sl@0
  1079
						{
sl@0
  1080
						return EErrorIllFormedInput;
sl@0
  1081
						}
sl@0
  1082
					break;
sl@0
  1083
					}			
sl@0
  1084
				
sl@0
  1085
				currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
sl@0
  1086
			
sl@0
  1087
				/* check the trailing bytes, they should begin with 10 */
sl@0
  1088
				TUint i = 1;
sl@0
  1089
sl@0
  1090
				do
sl@0
  1091
					{
sl@0
  1092
					if ((pUtf8[i] & 0xc0) == 0x80)
sl@0
  1093
						{
sl@0
  1094
						// add the trailing 6 bits to the current unicode char
sl@0
  1095
						currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
sl@0
  1096
						}
sl@0
  1097
					else
sl@0
  1098
						{
sl@0
  1099
						// ill formed character (doesn't have a lead 10)
sl@0
  1100
						currentUnicodeCharacter = replacementcharacter;
sl@0
  1101
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
  1102
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
  1103
						illFormed=ETrue;
sl@0
  1104
						break; 
sl@0
  1105
						}
sl@0
  1106
					i++;
sl@0
  1107
					}
sl@0
  1108
				while (i < sequenceLength);
sl@0
  1109
				}
sl@0
  1110
				
sl@0
  1111
			/* conformance check.  bits of above table for reference.
sl@0
  1112
			 * +----------------------------------------------------------------+
sl@0
  1113
			 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
sl@0
  1114
			 * +--------------------+----------+----------+----------+----------+
sl@0
  1115
			 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, 2nd < 0xA0
sl@0
  1116
			 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, 2nd > 0x9F
sl@0
  1117
			 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, 2nd < 0x90
sl@0
  1118
			 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, 2nd > 0x8F
sl@0
  1119
			 * +--------------------+----------+----------+----------+----------+
sl@0
  1120
			 */
sl@0
  1121
			
sl@0
  1122
			if (currentUnicodeCharacter != replacementcharacter)
sl@0
  1123
				{
sl@0
  1124
				if (sequenceLength == 3)
sl@0
  1125
					{
sl@0
  1126
					if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
sl@0
  1127
						{
sl@0
  1128
						currentUnicodeCharacter = replacementcharacter;
sl@0
  1129
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
  1130
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
  1131
						illFormed=ETrue;
sl@0
  1132
						}
sl@0
  1133
					else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
sl@0
  1134
						{
sl@0
  1135
						currentUnicodeCharacter = replacementcharacter;
sl@0
  1136
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
  1137
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
  1138
						illFormed=ETrue;
sl@0
  1139
						}
sl@0
  1140
					}
sl@0
  1141
				else if (sequenceLength == 4)
sl@0
  1142
					{
sl@0
  1143
					if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
sl@0
  1144
						{
sl@0
  1145
						currentUnicodeCharacter = replacementcharacter;
sl@0
  1146
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
  1147
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
  1148
						illFormed=ETrue;
sl@0
  1149
						}
sl@0
  1150
					else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
sl@0
  1151
						{
sl@0
  1152
						currentUnicodeCharacter = replacementcharacter;
sl@0
  1153
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
  1154
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
  1155
						illFormed=ETrue;
sl@0
  1156
						}
sl@0
  1157
					}
sl@0
  1158
				
sl@0
  1159
				
sl@0
  1160
				/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
sl@0
  1161
				 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code 
sl@0
  1162
				 * points D800..DFFF is ill formed */
sl@0
  1163
				
sl@0
  1164
				if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
sl@0
  1165
					{
sl@0
  1166
					currentUnicodeCharacter = replacementcharacter;
sl@0
  1167
					UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
  1168
							aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
  1169
					illFormed=ETrue;
sl@0
  1170
					}	
sl@0
  1171
				}
sl@0
  1172
				// end conformance check
sl@0
  1173
			}
sl@0
  1174
sl@0
  1175
		// would this character generate a surrogate pair in UTF-16?
sl@0
  1176
		if (currentUnicodeCharacter > 0xFFFF)
sl@0
  1177
			{
sl@0
  1178
			// is there enough space to hold a surrogate pair in the output?
sl@0
  1179
			if (pUnicode >= pLastUnicode)
sl@0
  1180
				{
sl@0
  1181
				break; // no, end processing.
sl@0
  1182
				}
sl@0
  1183
			
sl@0
  1184
			TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
sl@0
  1185
			*pUnicode++ = STATIC_CAST(TUint16, surrogate);
sl@0
  1186
					
sl@0
  1187
			surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
sl@0
  1188
			*pUnicode++ = STATIC_CAST(TUint16, surrogate);			
sl@0
  1189
			}
sl@0
  1190
		else
sl@0
  1191
			{
sl@0
  1192
			*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
sl@0
  1193
			}
sl@0
  1194
		
sl@0
  1195
		// move the input pointer
sl@0
  1196
		if (currentUnicodeCharacter != replacementcharacter)
sl@0
  1197
			{
sl@0
  1198
			pUtf8 += sequenceLength;
sl@0
  1199
			}
sl@0
  1200
		else if(illFormed == EFalse)
sl@0
  1201
			{
sl@0
  1202
			pUtf8 += (sequenceLength);
sl@0
  1203
			}
sl@0
  1204
		else
sl@0
  1205
			{
sl@0
  1206
			// we had a character we didn't recognize (i.e. it was invalid)
sl@0
  1207
			// so move to the next character in the input
sl@0
  1208
			pUtf8++;
sl@0
  1209
			}
sl@0
  1210
		
sl@0
  1211
		if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
sl@0
  1212
			{ 
sl@0
  1213
			break;  // we've either reached the end of the input or the end of output
sl@0
  1214
			}
sl@0
  1215
		}
sl@0
  1216
sl@0
  1217
	aUnicode.SetLength(pUnicode - aUnicode.Ptr());
sl@0
  1218
	return (pLastUtf8 - pUtf8 + 1);
sl@0
  1219
	}
sl@0
  1220
sl@0
  1221
/** Given a sample text this function attempts to determine whether or not
 *  the sample text is encoded using the UTF-8 standard encoding scheme.

@param TInt a confidence level, given at a certain value.  If the given sample
			is UTF-8 this value will not be changed (unless it is > 100, in which
			case it is set to 100).  Otherwise, if the sample isn't UTF-8, it is set to 0.
@param TDesC8 sample text.
UTF-8. The default is EFalse.
@return void
 */
sl@0
  1231
sl@0
  1232
/* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
sl@0
  1233
 * Well formed UTF-8 Byte Sequences, full table.
sl@0
  1234
 * +----------------------------------------------------------------+
sl@0
  1235
 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
sl@0
  1236
 * +--------------------+----------+----------+----------+----------+
sl@0
  1237
 * | U+0000..U+007F     | 00..7F   |          |          |          |  1 byte, ascii
sl@0
  1238
 * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
sl@0
  1239
 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
sl@0
  1240
 * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
sl@0
  1241
 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
sl@0
  1242
 * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
sl@0
  1243
 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
sl@0
  1244
 * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
sl@0
  1245
 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
sl@0
  1246
 * +--------------------+----------+----------+----------+----------+
sl@0
  1247
 * 
sl@0
  1248
 * As a consequence of the well-formedness conditions specified in table 3-7,
sl@0
  1249
 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
sl@0
  1250
 * 
sl@0
  1251
 * Code Rules:
sl@0
  1252
 *   R1: If the string contains any non-UTF-8 characters the returned confidence
sl@0
  1253
 *       is 0.  Valid UTF-8 combinations are listed in the above table.
sl@0
  1254
 *   R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in  
sl@0
  1255
 *       the sample, the returned confidence is 95.
sl@0
  1256
 *   R3: Otherwise the confidence returned is based upon the sample string 
sl@0
  1257
 *       length.
sl@0
  1258
 *   R4: If the sample string is under 75 characters, the confidence is set to 
sl@0
  1259
 *       75.
sl@0
  1260
 */
sl@0
  1261
GLREF_C void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
sl@0
  1262
	{
sl@0
  1263
sl@0
  1264
	TInt sampleLength = aSample.Length();
sl@0
  1265
	
sl@0
  1266
	if (sampleLength == 0)
sl@0
  1267
		{
sl@0
  1268
		aConfidenceLevel = 89;
sl@0
  1269
		return;
sl@0
  1270
		}
sl@0
  1271
	TInt bytesRemaining  = 0;
sl@0
  1272
	TInt sequenceLength  = 0;
sl@0
  1273
	
sl@0
  1274
	aConfidenceLevel = sampleLength;
sl@0
  1275
sl@0
  1276
	const TUint8* buffer = &aSample[0];
sl@0
  1277
sl@0
  1278
	if (sampleLength < 95)
sl@0
  1279
		{
sl@0
  1280
		// check for the BOM
sl@0
  1281
		if ((sampleLength >= 3) && 
sl@0
  1282
			((buffer[0] == 0xEF) &&
sl@0
  1283
			 (buffer[1] == 0xBB) &&
sl@0
  1284
			 (buffer[2] == 0xBF)) 
sl@0
  1285
			) 
sl@0
  1286
			{
sl@0
  1287
			aConfidenceLevel = 95;
sl@0
  1288
			}
sl@0
  1289
		else if (sampleLength < 75)
sl@0
  1290
			{
sl@0
  1291
			aConfidenceLevel = 75;
sl@0
  1292
			}
sl@0
  1293
		}
sl@0
  1294
	
sl@0
  1295
	for (TInt index = 0;index != sampleLength;index++)
sl@0
  1296
		{
sl@0
  1297
		
sl@0
  1298
		if (bytesRemaining > 0)
sl@0
  1299
			{
sl@0
  1300
			// bytesRemaining > 0, means that a byte representing the start of a 
sl@0
  1301
			// multibyte sequence was encountered and the bytesRemaining is the 
sl@0
  1302
			// number of bytes to follow. 
sl@0
  1303
			
sl@0
  1304
			if ((buffer[index] & 0xc0) == 0x80) 
sl@0
  1305
				{
sl@0
  1306
				// need to check for ill-formed sequences -- all are in the 2nd byte
sl@0
  1307
				
sl@0
  1308
				if ((sequenceLength == 3) && (bytesRemaining == 2))
sl@0
  1309
					{
sl@0
  1310
					if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
sl@0
  1311
						{
sl@0
  1312
						aConfidenceLevel = 0;
sl@0
  1313
						break;
sl@0
  1314
						}
sl@0
  1315
					else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
sl@0
  1316
						{
sl@0
  1317
						aConfidenceLevel = 0;
sl@0
  1318
						break;
sl@0
  1319
						}
sl@0
  1320
					}
sl@0
  1321
				else if ((sequenceLength == 4) && (bytesRemaining == 3))
sl@0
  1322
					{
sl@0
  1323
					if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
sl@0
  1324
						{
sl@0
  1325
						aConfidenceLevel = 0;
sl@0
  1326
						break;
sl@0
  1327
						}
sl@0
  1328
					else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
sl@0
  1329
						{
sl@0
  1330
						aConfidenceLevel = 0;
sl@0
  1331
						break;
sl@0
  1332
						}
sl@0
  1333
					}
sl@0
  1334
				
sl@0
  1335
				--bytesRemaining;
sl@0
  1336
				continue;
sl@0
  1337
				}
sl@0
  1338
			else
sl@0
  1339
				{
sl@0
  1340
				aConfidenceLevel = 0;
sl@0
  1341
				break;
sl@0
  1342
				}
sl@0
  1343
			}
sl@0
  1344
		
sl@0
  1345
		if (bytesRemaining == 0)
sl@0
  1346
			{
sl@0
  1347
			if (buffer[index] < 0x80)
sl@0
  1348
				{
sl@0
  1349
				// The value of aSample[index] is in the range 0x00-0x7f
sl@0
  1350
				//UTF8 maintains ASCII transparency. So it's a valid
sl@0
  1351
				//UTF8. Do nothing, check next value.
sl@0
  1352
				continue;
sl@0
  1353
				}
sl@0
  1354
			else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
sl@0
  1355
				{
sl@0
  1356
				// valid start of a 2 byte sequence (see conformance note)
sl@0
  1357
				sequenceLength = 2;
sl@0
  1358
				bytesRemaining = 1;
sl@0
  1359
				}
sl@0
  1360
			else if ((buffer[index] & 0xf0) == 0xe0)
sl@0
  1361
				{
sl@0
  1362
				// valid start of a 3 byte sequence
sl@0
  1363
				sequenceLength = 3;
sl@0
  1364
				bytesRemaining = 2;
sl@0
  1365
				}
sl@0
  1366
			else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
sl@0
  1367
				{
sl@0
  1368
				// valid start of a 4 byte sequence (see conformance note)
sl@0
  1369
				sequenceLength = 4;
sl@0
  1370
				bytesRemaining = 3;
sl@0
  1371
				}	
sl@0
  1372
			else
sl@0
  1373
				{
sl@0
  1374
				// wasn't anything expected so must be an illegal/irregular UTF8 coded value
sl@0
  1375
				aConfidenceLevel = 0;
sl@0
  1376
				break;
sl@0
  1377
				}
sl@0
  1378
			}
sl@0
  1379
		} // for 
sl@0
  1380
	
sl@0
  1381
	aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
sl@0
  1382
	}
sl@0
  1383
sl@0
  1384
/** Estimates how likely it is that the given sample is UTF-7 encoded.

The confidence starts at 70 and is adjusted while scanning the sample:
 - any byte with the top bit set, or a '~', sets the confidence to 0 and
   stops the scan (this includes the byte that follows a '+');
 - 0x1b followed by one of the values in smsExtensionTable lowers the
   confidence by 10;
 - '+' (0x2b) followed by '+', '-', '/', A-Z or a-z raises the confidence
   by 5, any other follower lowers it by 15.
The result is clamped to the range 0..100.

@param aConfidenceLevel On return, the confidence (0..100) that aSample is UTF-7.
@param aSample The sample text to examine.
*/
GLREF_C void IsCharacterSetUTF7(TInt& aConfidenceLevel, const TDesC8& aSample)
	{
	const TInt sampleLength = aSample.Length();
	aConfidenceLevel = 70;
	for (TInt i=0; i<sampleLength; ++i)
		{
		// UTF-7 value ranges only 7 bits 
		if((aSample[i]&0x80)!=0x00)
			{
			aConfidenceLevel= 0;
			break;
			}
	
		// there is no "~" in UTF-7 encoding. So if we find one, it's not UTF-7
		else if (char(aSample[i])=='~')
			{
			aConfidenceLevel = 0; 
			break;
			}

		// The SMS7Bit escape char value is 0x1b. Reduce confidence if it is
		// followed by one of the SMS extension table values.
		// The "i < sampleLength-1" test guarantees that aSample[i+1] exists.
		else if ( (aSample[i]==0x1b) && (i <sampleLength-1) )
			{
			static const TInt smsExtensionTable[11] = 
				{0x0a, 0x14, 0x1b, 0x28, 0x29, 0x2f, 0x3c, 0x3d, 0x3e, 0x40, 0x65};
			const TUint8 next = aSample[i+1];
			for (TInt j=0; j < 11; ++j)
				{
				if (next == smsExtensionTable[j])
					{
					aConfidenceLevel-=10;
					break; // table entries are unique, no further match possible
					}
				}
			// the following byte is not consumed here; the loop examines it next
			}
		// The UTF-7 escape char is 0x2b. The value following the escape char
		// must belong to the modified base64 alphabet or be '-', else it is an
		// ill-formed sequence, so probably not UTF-7.
		else if ( (aSample[i]==0x2b)  && (i <sampleLength-1) )
			{
			const TUint8 next = aSample[i+1];

			// The follower is consumed below, so it never reaches the checks at
			// the top of the loop. Apply them here so that 8-bit data or a '~'
			// hidden behind a '+' still rules out UTF-7.
			if (((next & 0x80) != 0x00) || (char(next) == '~'))
				{
				aConfidenceLevel = 0;
				break;
				}

			// NOTE(review): digits 0-9 are part of the base64 alphabet but are
			// not accepted here. Left unchanged because it may be deliberate
			// (e.g. "+1" is common in plain text) -- confirm.
			if ((next == 0x2b) || (next == 0x2d) || (next == 0x2f) ||
				((next >= 0x41) && (next <= 0x5a)) ||
				((next >= 0x61) && (next <= 0x7a))) 
				{
				aConfidenceLevel+=5;
				}
			else
				{
				aConfidenceLevel-=15;
				}
			i++; // consume the byte that followed the '+'
			}
		} //for
	aConfidenceLevel =(aConfidenceLevel >0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
	}