os/security/securityanddataprivacytools/securitytools/certapp/store--/utf.cpp
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
sl@0
     1
/*
sl@0
     2
* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     3
* All rights reserved.
sl@0
     4
* This component and the accompanying materials are made available
sl@0
     5
* under the terms of the License "Eclipse Public License v1.0"
sl@0
     6
* which accompanies this distribution, and is available
sl@0
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     8
*
sl@0
     9
* Initial Contributors:
sl@0
    10
* Nokia Corporation - initial contribution.
sl@0
    11
*
sl@0
    12
* Contributors:
sl@0
    13
*
sl@0
    14
* Description: 
sl@0
    15
*
sl@0
    16
*/
sl@0
    17
sl@0
    18
sl@0
    19
#include <e32std.h>
sl@0
    20
#include <e32base.h>
sl@0
    21
#include <utf.h>
sl@0
    22
sl@0
    23
#define STATIC_CAST(t,v) static_cast<t>(v)
sl@0
    24
#define CONST_CAST(t,v) const_cast<t>(v)
sl@0
    25
#define FOREVER for(;;)
sl@0
    26
sl@0
    27
const TUint KNotInBase64Alphabet=KMaxTUint;
sl@0
    28
sl@0
    29
/** Panic reasons raised by this file (see Panic()).

The numeric value of each enumerator is the reason code reported with the
panic, so the order must not be changed.  The explicit values at the start of
each group simply spell out what sequential numbering from 1 produces. */
enum TPanic
	{
	// base64 helper
	EPanicBad6BitNumber = 1,

	// UTF-7 buffer pointer consistency (2..12)
	EPanicBadUtf7Pointers1 = 2,
	EPanicBadUtf7Pointers2,
	EPanicBadUtf7Pointers3,
	EPanicBadUtf7Pointers4,
	EPanicBadUtf7Pointers5,
	EPanicBadUtf7Pointers6,
	EPanicBadUtf7Pointers7,
	EPanicBadUtf7Pointers8,
	EPanicBadUtf7Pointers9,
	EPanicBadUtf7Pointers10,
	EPanicBadUtf7Pointers11,

	EPanicNotInBase64Block = 13,

	// Unicode buffer pointer consistency (14..23)
	EPanicBadUnicodePointers1 = 14,
	EPanicBadUnicodePointers2,
	EPanicBadUnicodePointers3,
	EPanicBadUnicodePointers4,
	EPanicBadUnicodePointers5,
	EPanicBadUnicodePointers6,
	EPanicBadUnicodePointers7,
	EPanicBadUnicodePointers8,
	EPanicBadUnicodePointers9,
	EPanicBadUnicodePointers10,

	// bit-buffer state consistency (24..40)
	EPanicBadBitBufferState1 = 24,
	EPanicBadBitBufferState2,
	EPanicBadBitBufferState3,
	EPanicBadBitBufferState4,
	EPanicBadBitBufferState5,
	EPanicBadBitBufferState6,
	EPanicBadBitBufferState7,
	EPanicBadBitBufferState8,
	EPanicBadBitBufferState9,
	EPanicBadBitBufferState10,
	EPanicBadBitBufferState11,
	EPanicBadBitBufferState12,
	EPanicBadBitBufferState13,
	EPanicBadBitBufferState14,
	EPanicBadBitBufferState15,
	EPanicBadBitBufferState16,
	EPanicBadBitBufferState17,

	EPanicUnexpectedNumberOfLoopIterations = 41,
	EPanicInitialEscapeCharacterButNoBase64,
	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,

	// UTF-8 buffer pointer consistency (44..50)
	EPanicBadUtf8Pointers1 = 44,
	EPanicBadUtf8Pointers2,
	EPanicBadUtf8Pointers3,
	EPanicBadUtf8Pointers4,
	EPanicBadUtf8Pointers5,
	EPanicBadUtf8Pointers6,
	EPanicBadUtf8Pointers7,

	EPanicOutOfSyncUtf7Byte1 = 51,
	EPanicOutOfSyncUtf7Byte2,
	EPanicOutOfSyncBase64Decoding
	};
sl@0
    85
sl@0
    86
_LIT(KLitPanicText, "CHARCONV-UTF");
sl@0
    87
sl@0
    88
LOCAL_C void Panic(TPanic aPanic)
sl@0
    89
	{
sl@0
    90
	User::Panic(KLitPanicText, aPanic);
sl@0
    91
	}
sl@0
    92
sl@0
    93
/** Returns the character that opens a base64 block in UTF-7 text.

@param aIsImapUtf7 Non-zero selects the IMAP flavour of UTF-7.
@return '&' for IMAP UTF-7, '+' otherwise. */
inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7)
	{
	if (aIsImapUtf7)
		{
		return '&';
		}
	return '+';
	}
sl@0
    94
sl@0
    95
/** Tests whether any of the low-order bits of a bit buffer are set.

@param aBitBuffer The buffer to inspect.
@param aNumberOfBitsInBuffer How many low-order bits of aBitBuffer are in use.
@return Non-zero if at least one of the aNumberOfBitsInBuffer low-order bits 
of aBitBuffer is 1, zero otherwise. */
inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
	{
	const TInt bitsInTUint = STATIC_CAST(TInt, 8 * sizeof(TUint));
	if (aNumberOfBitsInBuffer >= bitsInTUint)
		{
		// Shifting by the full width (or more) is undefined; every bit is
		// in use, so the whole buffer is tested.
		return aBitBuffer != 0;
		}
	// Build the mask in unsigned arithmetic: a signed "1 << n" overflows once
	// n reaches the sign bit.
	const TUint mask = (STATIC_CAST(TUint, 1) << aNumberOfBitsInBuffer) - 1;
	return (aBitBuffer & mask) != 0;
	}
sl@0
    99
sl@0
   100
sl@0
   101
sl@0
   102
sl@0
   103
sl@0
   104
sl@0
   105
 
sl@0
   106
sl@0
   107
/** Converts Unicode text into UTF-8 encoding.
sl@0
   108
sl@0
   109
@param aUtf8 On return, contains the UTF-8 encoded output string.
sl@0
   110
@param aUnicode The Unicode-encoded input string.
sl@0
   111
@return The number of unconverted characters left at the end of the input 
sl@0
   112
descriptor, or one of the error values defined in TError. */
sl@0
   113
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
sl@0
   114
	{
sl@0
   115
	return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
sl@0
   116
	}
sl@0
   117
sl@0
   118
sl@0
   119
sl@0
   120
/** Converts Unicode text into UTF-8 encoding. 
sl@0
   121
sl@0
   122
Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.
sl@0
   123
sl@0
   124
The variant of UTF-8 used internally by Java differs slightly from standard 
sl@0
   125
UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
sl@0
   126
sl@0
   127
@param aUtf8 On return, contains the UTF-8 encoded output string.
sl@0
   128
@param aUnicode A UCS-2 encoded input string.
sl@0
   129
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
sl@0
   130
UTF-8. The default is EFalse.
sl@0
   131
@return The number of unconverted characters left at the end of the input descriptor, 
sl@0
   132
or one of the error values defined in TError. */
sl@0
   133
TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, 
sl@0
   134
											   const TDesC16& aUnicode, 
sl@0
   135
											   TBool aGenerateJavaConformantUtf8)
sl@0
   136
	{
sl@0
   137
	if (aUnicode.Length() == 0)
sl@0
   138
		{
sl@0
   139
		aUtf8.SetLength(0);
sl@0
   140
		return 0;
sl@0
   141
		}
sl@0
   142
	if (aUtf8.MaxLength() == 0)
sl@0
   143
		{
sl@0
   144
		return aUnicode.Length();
sl@0
   145
		}
sl@0
   146
	
sl@0
   147
	TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
sl@0
   148
	const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
sl@0
   149
	TBool inputIsTruncated = EFalse;
sl@0
   150
	const TUint16* pUnicode = aUnicode.Ptr();
sl@0
   151
	const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
sl@0
   152
	
sl@0
   153
	FOREVER
sl@0
   154
		{
sl@0
   155
		__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
sl@0
   156
		__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
sl@0
   157
	
sl@0
   158
		if (pUnicode[0] < 0x80)
sl@0
   159
			{
sl@0
   160
			// ascii - 1 byte
sl@0
   161
			
sl@0
   162
			// internally java is different since the \x0000 character is 
sl@0
   163
			// translated into \xC0 \x80.
sl@0
   164
			
sl@0
   165
			if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
sl@0
   166
				{
sl@0
   167
				if (pUtf8 == pointerToLastUtf8Byte)
sl@0
   168
					{
sl@0
   169
					pUtf8--;
sl@0
   170
					pUnicode--;
sl@0
   171
					break;			
sl@0
   172
					}
sl@0
   173
				*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
sl@0
   174
				*pUtf8   = STATIC_CAST(TUint8, 0x80);	
sl@0
   175
				}
sl@0
   176
			else
sl@0
   177
				{
sl@0
   178
				*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
sl@0
   179
				}
sl@0
   180
			}
sl@0
   181
		else if (pUnicode[0] < 0x800)
sl@0
   182
			{
sl@0
   183
			// U+0080..U+07FF - 2 bytes
sl@0
   184
			
sl@0
   185
			if (pUtf8 == pointerToLastUtf8Byte)
sl@0
   186
				{
sl@0
   187
				pUtf8--;
sl@0
   188
				pUnicode--;
sl@0
   189
				break;
sl@0
   190
				}
sl@0
   191
			
sl@0
   192
			*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
sl@0
   193
			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
sl@0
   194
			
sl@0
   195
			}
sl@0
   196
sl@0
   197
		// check to see if we have a surrogate in the stream, surrogates encode code points outside
sl@0
   198
		// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.
sl@0
   199
sl@0
   200
		else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
sl@0
   201
			{
sl@0
   202
			// surrogate pair - 4 bytes in utf-8
sl@0
   203
			// U+10000..U+10FFFF
sl@0
   204
			
sl@0
   205
			__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
sl@0
   206
			// is there enough space to hold the character
sl@0
   207
			if ((pointerToLastUtf8Byte - pUtf8) < 3)
sl@0
   208
				{
sl@0
   209
				pUtf8--;
sl@0
   210
				pUnicode--;
sl@0
   211
				break;  // no go to the exit condition
sl@0
   212
				}
sl@0
   213
			
sl@0
   214
			__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
sl@0
   215
			if (pUnicode >= pointerToLastUnicodeCharacter)
sl@0
   216
				{
sl@0
   217
				pUtf8--;
sl@0
   218
				pUnicode--;
sl@0
   219
				inputIsTruncated = ETrue;
sl@0
   220
				break; // middle of a surrogate pair. go to end condition
sl@0
   221
				}
sl@0
   222
			
sl@0
   223
			if ((pUnicode[1] & 0xfc00) != 0xdc00)
sl@0
   224
				{
sl@0
   225
				return EErrorIllFormedInput;
sl@0
   226
				}
sl@0
   227
			
sl@0
   228
			// convert utf-16 surrogate to utf-32
sl@0
   229
			TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
sl@0
   230
			
sl@0
   231
			// convert utf-32 to utf-8
sl@0
   232
            *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));   
sl@0
   233
            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
sl@0
   234
            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
sl@0
   235
            *pUtf8   = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
sl@0
   236
			
sl@0
   237
            // we consumed 2 utf-16 values, move this pointer
sl@0
   238
			pUnicode++;
sl@0
   239
			}		
sl@0
   240
		else
sl@0
   241
			{
sl@0
   242
			// 3 byte - utf-8, U+800..U+FFFF rest of BMP.
sl@0
   243
			
sl@0
   244
			if (pointerToLastUtf8Byte - pUtf8 < 2)
sl@0
   245
				{
sl@0
   246
				pUtf8--;
sl@0
   247
				pUnicode--;
sl@0
   248
				break;
sl@0
   249
				}
sl@0
   250
			*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
sl@0
   251
			*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
sl@0
   252
			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
sl@0
   253
			}
sl@0
   254
		
sl@0
   255
		if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
sl@0
   256
			{
sl@0
   257
			break;
sl@0
   258
			}
sl@0
   259
		
sl@0
   260
		pUtf8++;
sl@0
   261
		pUnicode++;
sl@0
   262
		
sl@0
   263
		}
sl@0
   264
	
sl@0
   265
	if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
sl@0
   266
		{
sl@0
   267
		return EErrorIllFormedInput;
sl@0
   268
		}
sl@0
   269
	
sl@0
   270
	aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
sl@0
   271
	return pointerToLastUnicodeCharacter-pUnicode;
sl@0
   272
	}
sl@0
   273
sl@0
   274
sl@0
   275
sl@0
   276
sl@0
   277
sl@0
   278
 
sl@0
   279
sl@0
   280
sl@0
   281
sl@0
   282
sl@0
   283
sl@0
   284
/** Converts text encoded using the Unicode transformation format UTF-8 into the 
sl@0
   285
Unicode UCS-2 character set.
sl@0
   286
sl@0
   287
@param aUnicode On return, contains the Unicode encoded output string.
sl@0
   288
@param aUtf8 The UTF-8 encoded input string
sl@0
   289
@return The number of unconverted bytes left at the end of the input descriptor, 
sl@0
   290
or one of the error values defined in TError. */
sl@0
   291
EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
sl@0
   292
	{
sl@0
   293
	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
sl@0
   294
	}
sl@0
   295
sl@0
   296
static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
sl@0
   297
		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
sl@0
   298
	{
sl@0
   299
	if (aNumberOfUnconvertibleCharacters<=0)
sl@0
   300
		{
sl@0
   301
		aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
sl@0
   302
		}
sl@0
   303
	++aNumberOfUnconvertibleCharacters;
sl@0
   304
	}
sl@0
   305
sl@0
   306
/** Converts text encoded using the Unicode transformation format UTF-8 into the 
sl@0
   307
Unicode UCS-2 character set.
sl@0
   308
sl@0
   309
@param aUnicode On return, contains the Unicode encoded output string.
sl@0
   310
@param aUtf8 The UTF-8 encoded input string
sl@0
   311
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
sl@0
   312
@return The number of unconverted bytes left at the end of the input descriptor, 
sl@0
   313
or one of the error values defined in TError. */
sl@0
   314
TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
sl@0
   315
	{
sl@0
   316
	TInt dummyUnconverted, dummyUnconvertedIndex;
sl@0
   317
	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
sl@0
   318
	}
sl@0
   319
sl@0
   320
/** Converts text encoded using the Unicode transformation format UTF-8 into the 
sl@0
   321
Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
sl@0
   322
sl@0
   323
The variant of UTF-8 used internally by Java differs slightly from standard 
sl@0
   324
UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
sl@0
   325
sl@0
   326
@param aUnicode On return, contains the Unicode encoded output string.
sl@0
   327
@param aUtf8 The UTF-8 encoded input string
sl@0
   328
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
sl@0
   329
UTF-8. The default is EFalse.
sl@0
   330
@param aNumberOfUnconvertibleCharacters On return, contains the number of bytes 
sl@0
   331
which were not converted.
sl@0
   332
@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index 
sl@0
   333
of the first byte of the first unconvertible character. For instance if the 
sl@0
   334
first character in the input descriptor (aForeign) could not be converted, 
sl@0
   335
then this parameter is set to the first byte of that character, i.e. zero. 
sl@0
   336
A negative value is returned if all the characters were converted.
sl@0
   337
@return The number of unconverted bytes left at the end of the input descriptor, 
sl@0
   338
or one of the error values defined in TError. */
sl@0
   339
sl@0
   340
/* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
sl@0
   341
 * Well formed UTF-8 Byte Sequences, full table.
sl@0
   342
 * +----------------------------------------------------------------+
sl@0
   343
 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
sl@0
   344
 * +--------------------+----------+----------+----------+----------+
sl@0
   345
 * | U+0000..U+007F     | 00..7D   |          |          |          |  1 byte, ascii
sl@0
   346
 * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
sl@0
   347
 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
sl@0
   348
 * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
sl@0
   349
 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
sl@0
   350
 * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
sl@0
   351
 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
sl@0
   352
 * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
sl@0
   353
 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
sl@0
   354
 * +--------------------+----------+----------+----------+----------+
sl@0
   355
 * 
sl@0
   356
 * As a consequence of the well-formedness conditions specified in table 3-7,
sl@0
   357
 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
sl@0
   358
 */
sl@0
   359
TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
sl@0
   360
		TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
sl@0
   361
	{	
sl@0
   362
	aUnicode.SetLength(0);
sl@0
   363
	
sl@0
   364
	if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
sl@0
   365
		{
sl@0
   366
		return aUtf8.Length();
sl@0
   367
		}
sl@0
   368
sl@0
   369
	TUint16*           pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
sl@0
   370
	const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
sl@0
   371
	const TUint8*         pUtf8 = aUtf8.Ptr();   
sl@0
   372
	const TUint8*     pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
sl@0
   373
	const TUint16 replacementcharacter = 0xFFFD;
sl@0
   374
	TUint currentUnicodeCharacter;
sl@0
   375
	TUint sequenceLength;
sl@0
   376
sl@0
   377
	
sl@0
   378
	FOREVER
sl@0
   379
		{
sl@0
   380
		TBool illFormed=EFalse;
sl@0
   381
		
sl@0
   382
		__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
sl@0
   383
		__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
sl@0
   384
		
sl@0
   385
		sequenceLength = 1;
sl@0
   386
		
sl@0
   387
		// ascii - optimisation (i.e. it isn't a sequence)
sl@0
   388
		if (pUtf8[0] < 0x80)
sl@0
   389
			{
sl@0
   390
			currentUnicodeCharacter = pUtf8[0];
sl@0
   391
			}
sl@0
   392
		else
sl@0
   393
			{
sl@0
   394
			// see if well formed utf-8, use table above for reference	
sl@0
   395
			if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
sl@0
   396
				{
sl@0
   397
				// 0xc1-0xc2 are not valid bytes
sl@0
   398
				sequenceLength = 2;
sl@0
   399
				}
sl@0
   400
			else if ((pUtf8[0] & 0xf0) == 0xe0)
sl@0
   401
				{
sl@0
   402
				sequenceLength = 3;
sl@0
   403
				}
sl@0
   404
			else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
sl@0
   405
				{
sl@0
   406
				// 0xf5-0xff, are not valid bytes
sl@0
   407
				sequenceLength = 4;
sl@0
   408
				}
sl@0
   409
			else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
sl@0
   410
				{
sl@0
   411
				if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
sl@0
   412
					{
sl@0
   413
					// either we've split the 0xc0 0x80 (i.e. 0xc0 is
sl@0
   414
					// the last character in the string) or we've
sl@0
   415
					// discovered a valid 0xc0 0x80 sequence.  
sl@0
   416
					sequenceLength = 2;
sl@0
   417
					}
sl@0
   418
				}
sl@0
   419
			
sl@0
   420
			/* checking to see if we got a valid sequence */
sl@0
   421
			if (sequenceLength == 1)
sl@0
   422
				{
sl@0
   423
				// bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example
sl@0
   424
				currentUnicodeCharacter = replacementcharacter;
sl@0
   425
				UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
   426
						aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
   427
				}
sl@0
   428
			else
sl@0
   429
				{
sl@0
   430
				// this is a check to see if the sequence goes beyond the input 
sl@0
   431
				// stream.  if its not the first and only character in the input
sl@0
   432
				// stream this isn't an error, otherwise it is.
sl@0
   433
				if ((pUtf8 + sequenceLength - 1) >  pLastUtf8)
sl@0
   434
					{
sl@0
   435
					// check to see if this sequence was the first character
sl@0
   436
					if ((pUnicode - aUnicode.Ptr()) == 0)
sl@0
   437
						{
sl@0
   438
						return EErrorIllFormedInput;
sl@0
   439
						}
sl@0
   440
					break;
sl@0
   441
					}			
sl@0
   442
				
sl@0
   443
				currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
sl@0
   444
			
sl@0
   445
				/* check the trailing bytes, they should begin with 10 */
sl@0
   446
				TUint i = 1;
sl@0
   447
sl@0
   448
				do
sl@0
   449
					{
sl@0
   450
					if ((pUtf8[i] & 0xc0) == 0x80)
sl@0
   451
						{
sl@0
   452
						// add the trailing 6 bits to the current unicode char
sl@0
   453
						currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
sl@0
   454
						}
sl@0
   455
					else
sl@0
   456
						{
sl@0
   457
						// ill formed character (doesn't have a lead 10)
sl@0
   458
						currentUnicodeCharacter = replacementcharacter;
sl@0
   459
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
   460
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
   461
						illFormed=ETrue;
sl@0
   462
						break; 
sl@0
   463
						}
sl@0
   464
					i++;
sl@0
   465
					}
sl@0
   466
				while (i < sequenceLength);
sl@0
   467
				}
sl@0
   468
				
sl@0
   469
			/* conformance check.  bits of above table for reference.
sl@0
   470
			 * +----------------------------------------------------------------+
sl@0
   471
			 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
sl@0
   472
			 * +--------------------+----------+----------+----------+----------+
sl@0
   473
			 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, 2nd < 0xA0
sl@0
   474
			 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, 2nd > 0x9F
sl@0
   475
			 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, 2nd < 0x90
sl@0
   476
			 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, 2nd > 0x8F
sl@0
   477
			 * +--------------------+----------+----------+----------+----------+
sl@0
   478
			 */
sl@0
   479
			
sl@0
   480
			if (currentUnicodeCharacter != replacementcharacter)
sl@0
   481
				{
sl@0
   482
				if (sequenceLength == 3)
sl@0
   483
					{
sl@0
   484
					if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
sl@0
   485
						{
sl@0
   486
						currentUnicodeCharacter = replacementcharacter;
sl@0
   487
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
   488
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
   489
						illFormed=ETrue;
sl@0
   490
						}
sl@0
   491
					else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
sl@0
   492
						{
sl@0
   493
						currentUnicodeCharacter = replacementcharacter;
sl@0
   494
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
   495
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
   496
						illFormed=ETrue;
sl@0
   497
						}
sl@0
   498
					}
sl@0
   499
				else if (sequenceLength == 4)
sl@0
   500
					{
sl@0
   501
					if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
sl@0
   502
						{
sl@0
   503
						currentUnicodeCharacter = replacementcharacter;
sl@0
   504
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
   505
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
   506
						illFormed=ETrue;
sl@0
   507
						}
sl@0
   508
					else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
sl@0
   509
						{
sl@0
   510
						currentUnicodeCharacter = replacementcharacter;
sl@0
   511
						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
   512
								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
   513
						illFormed=ETrue;
sl@0
   514
						}
sl@0
   515
					}
sl@0
   516
				
sl@0
   517
				
sl@0
   518
				/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
sl@0
   519
				 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code 
sl@0
   520
				 * points D800..DFFF is ill formed */
sl@0
   521
				
sl@0
   522
				if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
sl@0
   523
					{
sl@0
   524
					currentUnicodeCharacter = replacementcharacter;
sl@0
   525
					UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
sl@0
   526
							aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
sl@0
   527
					illFormed=ETrue;
sl@0
   528
					}	
sl@0
   529
				}
sl@0
   530
				// end conformance check
sl@0
   531
			}
sl@0
   532
sl@0
   533
		// would this character generate a surrogate pair in UTF-16?
sl@0
   534
		if (currentUnicodeCharacter > 0xFFFF)
sl@0
   535
			{
sl@0
   536
			// is there enough space to hold a surrogate pair in the output?
sl@0
   537
			if (pUnicode >= pLastUnicode)
sl@0
   538
				{
sl@0
   539
				break; // no, end processing.
sl@0
   540
				}
sl@0
   541
			
sl@0
   542
			TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
sl@0
   543
			*pUnicode++ = STATIC_CAST(TUint16, surrogate);
sl@0
   544
					
sl@0
   545
			surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
sl@0
   546
			*pUnicode++ = STATIC_CAST(TUint16, surrogate);			
sl@0
   547
			}
sl@0
   548
		else
sl@0
   549
			{
sl@0
   550
			*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
sl@0
   551
			}
sl@0
   552
		
sl@0
   553
		// move the input pointer
sl@0
   554
		if (currentUnicodeCharacter != replacementcharacter)
sl@0
   555
			{
sl@0
   556
			pUtf8 += sequenceLength;
sl@0
   557
			}
sl@0
   558
		else if(illFormed == EFalse)
sl@0
   559
			{
sl@0
   560
			pUtf8 += (sequenceLength);
sl@0
   561
			}
sl@0
   562
		else
sl@0
   563
			{
sl@0
   564
			// we had a character we didn't recognize (i.e. it was invalid)
sl@0
   565
			// so move to the next character in the input
sl@0
   566
			pUtf8++;
sl@0
   567
			}
sl@0
   568
		
sl@0
   569
		if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
sl@0
   570
			{ 
sl@0
   571
			break;  // we've either reached the end of the input or the end of output
sl@0
   572
			}
sl@0
   573
		}
sl@0
   574
sl@0
   575
	aUnicode.SetLength(pUnicode - aUnicode.Ptr());
sl@0
   576
	return (pLastUtf8 - pUtf8 + 1);
sl@0
   577
	}
sl@0
   578
sl@0
   579
/** Given a sample text this function attempts to determine whether or not
sl@0
   580
 *  the same text is encoded using the UTF-8 standard encoding scheme.
sl@0
   581
sl@0
   582
@param TInt a confidence level, given at certain value.  if the given sample
sl@0
   583
			is UTF-8 this value will not be changed (unless > 100) then its
sl@0
   584
			set to 100.  Otherwise if the same isn't UTF-8, its set to 0.
sl@0
   585
@param TDesC8 sample text.
sl@0
   586
UTF-8. The default is EFalse.
sl@0
   587
@return void
sl@0
   588
 */
sl@0
   589
sl@0
   590
/* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
sl@0
   591
 * Well formed UTF-8 Byte Sequences, full table.
sl@0
   592
 * +----------------------------------------------------------------+
sl@0
   593
 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
sl@0
   594
 * +--------------------+----------+----------+----------+----------+
sl@0
   595
 * | U+0000..U+007F     | 00..7D   |          |          |          |  1 byte, ascii
sl@0
   596
 * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
sl@0
   597
 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
sl@0
   598
 * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
sl@0
   599
 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
sl@0
   600
 * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
sl@0
   601
 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
sl@0
   602
 * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
sl@0
   603
 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
sl@0
   604
 * +--------------------+----------+----------+----------+----------+
sl@0
   605
 * 
sl@0
   606
 * As a consequence of the well-formedness conditions specified in table 3-7,
sl@0
   607
 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
sl@0
   608
 * 
sl@0
   609
 * Code Rules:
sl@0
   610
 *   R1: If the string contains any non-UTF-8 characters the returned confidence
sl@0
   611
 *       is 0.  Valid UTF-8 combinations are listed in the above table.
sl@0
   612
 *   R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in  
sl@0
   613
 *       the (see ) the returned confidence is 95.
sl@0
   614
 *   R3: Otherwise the confidence returned is based upon the sample string 
sl@0
   615
 *       length.
sl@0
   616
 *   R4: If the sample string is under 75 characters, the confidence is set to 
sl@0
   617
 *       75.
sl@0
   618
 */
sl@0
   619
void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
sl@0
   620
	{
sl@0
   621
sl@0
   622
	TInt sampleLength = aSample.Length();
sl@0
   623
	
sl@0
   624
	if (sampleLength == 0)
sl@0
   625
		{
sl@0
   626
		aConfidenceLevel = 89;
sl@0
   627
		return;
sl@0
   628
		}
sl@0
   629
	TInt bytesRemaining  = 0;
sl@0
   630
	TUint sequenceLength  = 0;
sl@0
   631
	
sl@0
   632
	aConfidenceLevel = sampleLength;
sl@0
   633
sl@0
   634
	const TUint8* buffer = &aSample[0];
sl@0
   635
sl@0
   636
	if (sampleLength < 95)
sl@0
   637
		{
sl@0
   638
		// check for the BOM
sl@0
   639
		if ((sampleLength >= 3) && 
sl@0
   640
			((buffer[0] == 0xEF) &&
sl@0
   641
			 (buffer[1] == 0xBB) &&
sl@0
   642
			 (buffer[2] == 0xBF)) 
sl@0
   643
			) 
sl@0
   644
			{
sl@0
   645
			aConfidenceLevel = 95;
sl@0
   646
			}
sl@0
   647
		else if (sampleLength < 75)
sl@0
   648
			{
sl@0
   649
			aConfidenceLevel = 75;
sl@0
   650
			}
sl@0
   651
		}
sl@0
   652
	
sl@0
   653
	for (TInt index = 0;index != sampleLength;index++)
sl@0
   654
		{
sl@0
   655
		
sl@0
   656
		if (bytesRemaining > 0)
sl@0
   657
			{
sl@0
   658
			// bytesRemaining > 0, means that a byte representing the start of a 
sl@0
   659
			// multibyte sequence was encountered and the bytesRemaining is the 
sl@0
   660
			// number of bytes to follow. 
sl@0
   661
			
sl@0
   662
			if ((buffer[index] & 0xc0) == 0x80) 
sl@0
   663
				{
sl@0
   664
				// need to check for ill-formed sequences -- all are in the 2nd byte
sl@0
   665
				
sl@0
   666
				if ((sequenceLength == 3) && (bytesRemaining == 2))
sl@0
   667
					{
sl@0
   668
					if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
sl@0
   669
						{
sl@0
   670
						aConfidenceLevel = 0;
sl@0
   671
						break;
sl@0
   672
						}
sl@0
   673
					else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
sl@0
   674
						{
sl@0
   675
						aConfidenceLevel = 0;
sl@0
   676
						break;
sl@0
   677
						}
sl@0
   678
					}
sl@0
   679
				else if ((sequenceLength == 4) && (bytesRemaining == 3))
sl@0
   680
					{
sl@0
   681
					if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
sl@0
   682
						{
sl@0
   683
						aConfidenceLevel = 0;
sl@0
   684
						break;
sl@0
   685
						}
sl@0
   686
					else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
sl@0
   687
						{
sl@0
   688
						aConfidenceLevel = 0;
sl@0
   689
						break;
sl@0
   690
						}
sl@0
   691
					}
sl@0
   692
				
sl@0
   693
				--bytesRemaining;
sl@0
   694
				continue;
sl@0
   695
				}
sl@0
   696
			else
sl@0
   697
				{
sl@0
   698
				aConfidenceLevel = 0;
sl@0
   699
				break;
sl@0
   700
				}
sl@0
   701
			}
sl@0
   702
		
sl@0
   703
		if (bytesRemaining == 0)
sl@0
   704
			{
sl@0
   705
			if (buffer[index] < 0x80)
sl@0
   706
				{
sl@0
   707
				// The value of aSample[index] is in the range 0x00-0x7f
sl@0
   708
				//UTF8 maintains ASCII transparency. So it's a valid
sl@0
   709
				//UTF8. Do nothing, check next value.
sl@0
   710
				continue;
sl@0
   711
				}
sl@0
   712
			else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
sl@0
   713
				{
sl@0
   714
				// valid start of a 2 byte sequence (see conformance note)
sl@0
   715
				sequenceLength = 2;
sl@0
   716
				bytesRemaining = 1;
sl@0
   717
				}
sl@0
   718
			else if ((buffer[index] & 0xf0) == 0xe0)
sl@0
   719
				{
sl@0
   720
				// valid start of a 3 byte sequence
sl@0
   721
				sequenceLength = 3;
sl@0
   722
				bytesRemaining = 2;
sl@0
   723
				}
sl@0
   724
			else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
sl@0
   725
				{
sl@0
   726
				// valid start of a 4 byte sequence (see conformance note)
sl@0
   727
				sequenceLength = 4;
sl@0
   728
				bytesRemaining = 3;
sl@0
   729
				}	
sl@0
   730
			else
sl@0
   731
				{
sl@0
   732
				// wasn't anything expected so must be an illegal/irregular UTF8 coded value
sl@0
   733
				aConfidenceLevel = 0;
sl@0
   734
				break;
sl@0
   735
				}
sl@0
   736
			}
sl@0
   737
		} // for 
sl@0
   738
	
sl@0
   739
	aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
sl@0
   740
	}
sl@0
   741
sl@0
   742
// End of file