First public contribution.
1 // Copyright (c) 1998-2010 Nokia Corporation and/or its subsidiary(-ies).
2 // All rights reserved.
3 // This component and the accompanying materials are made available
4 // under the terms of "Eclipse Public License v1.0"
5 // which accompanies this distribution, and is available
6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
8 // Initial Contributors:
9 // Nokia Corporation - initial contribution.
14 // Implementation of the Standard Compression Scheme for Unicode.
15 // This code is compiled only in the Unicode build.
23 const TUint32 TUnicodeCompressionState::iStaticWindow[EStaticWindows] =
26 0x0080, // Latin-1 supplement
27 0x0100, // Latin Extended-A
28 0x0300, // Combining Diacritics
29 0x2000, // General Punctuation
30 0x2080, // Currency Symbols
31 0x2100, // Letterlike Symbols and Number Forms
32 0x3000 // CJK Symbols and Punctuation
35 const TUint32 TUnicodeCompressionState::iDynamicWindowDefault[EDynamicWindows] =
37 0x0080, // Latin-1 supplement
38 0x00C0, // parts of Latin-1 supplement and Latin Extended-A
44 0xFF00 // Fullwidth ASCII
47 const TUint16 TUnicodeCompressionState::iSpecialBase[ESpecialBases] =
49 0x00C0, // Latin 1 letters (not symbols) and some of Extended-A
50 0x0250, // IPA extensions
55 0xFF60 // Halfwidth katakana
58 // Single-byte mode tag values
59 const TUint8 SQ0 = 0x01; // <byte> quote from window 0
60 const TUint8 SDX = 0x0B; // <hbyte> <lbyte> define window in expansion area
61 const TUint8 SQU = 0x0E; // <hbyte> <lbyte> quote Unicode value
62 const TUint8 SCU = 0x0F; // switch to Unicode mode
63 const TUint8 SC0 = 0x10; // select dynamic window 0
64 const TUint8 SD0 = 0x18; // <byte> set dynamic window 0 index to <byte> and select it
66 // Unicode mode tag values
67 const TUint8 UC0 = 0xE0; // select dynamic window 0 and switch to single-byte mode
68 const TUint8 UD0 = 0xE8; // <byte> set dynamic window 0 index to <byte>, select it and switch to single-byte mode
70 const TUint8 UQU = 0xF0; // <hbyte>, <lbyte> quote Unicode value
71 const TUint8 UDX = 0xF1; // <hbyte>, <lbyte> define window in expansion area and switch to single-byte mode
73 TUnicodeCompressionState::TUnicodeCompressionState():
77 iMaxCompressedBytes(0)
82 void TUnicodeCompressionState::Reset()
85 iActiveWindowBase = 0x0080;
86 for (int i = 0; i < EDynamicWindows; i++)
87 iDynamicWindow[i] = iDynamicWindowDefault[i];
91 // Return the index of the static window that contains this code, if any, or -1 if there is none.
92 TInt TUnicodeCompressionState::StaticWindowIndex(TUint16 aCode)
94 for (TInt i = 0; i < EStaticWindows; i++)
95 if (aCode >= iStaticWindow[i] && aCode < iStaticWindow[i] + 128)
101 If aCode can be accommodated in one of the legal dynamic windows, return the index of that window
102 in the offset table. If not, return KErrNotFound.
104 TInt TUnicodeCompressionState::DynamicWindowOffsetIndex(TUint16 aCode)
108 if (aCode >= 0x3400 && aCode <= 0xDFFF)
112 Prefer sections that cross half-block boundaries. These are better adapted to actual text.
113 They are represented by offset indices 0xf9..0xff.
115 for (int i = 0; i < ESpecialBases; i++)
116 if (aCode >= iSpecialBase[i] && aCode < iSpecialBase[i] + 128)
120 Offset indices 0x01..0x67 represent half blocks from 0x0080 to 0x3380 and
121 0x68..0xA7 represent half blocks from 0xE000 to 0xFF80.
128 // Return the base of the window represented by offset index <n>. Return 0 if the offset index is illegal.
129 TUint32 TUnicodeCompressionState::DynamicWindowBase(TInt aOffsetIndex)
131 if (aOffsetIndex >= 0xF9 && aOffsetIndex <= 0xFF)
134 WARNING: don't optimise the following two lines by replacing them with
135 'return iSpecialBase[aOffsetIndex - 0xF9];'. To do so would re-introduce an error
136 in ARM builds caused by optimisation and consequent erroneous fixing up
137 of the array base: see defect EDNGASR-4AGJQX in ER5U defects.
139 int special_base_index = aOffsetIndex - 0xF9;
140 return iSpecialBase[special_base_index];
142 if (aOffsetIndex >= 0x01 && aOffsetIndex <= 0x67)
143 return aOffsetIndex * 0x80;
144 if (aOffsetIndex >= 0x68 && aOffsetIndex <= 0xA7)
145 return aOffsetIndex * 0x80 + 0xAC00;
149 TBool TUnicodeCompressionState::EncodeAsIs(TUint16 aCode)
151 return aCode == 0x0000 || aCode == 0x0009 || aCode == 0x000A || aCode == 0x000D ||
152 (aCode >= 0x0020 && aCode <= 0x007F);
155 #pragma BullseyeCoverage off
157 void TUnicodeCompressionState::Panic(TPanic aPanic)
159 User::Panic(_L("ucmp"),aPanic);
162 #pragma BullseyeCoverage on
164 EXPORT_C TUnicodeCompressor::TUnicodeCompressor():
165 iInputBufferStart(0),
167 iOutputBufferStart(0),
168 iOutputBufferSize(0),
169 iDynamicWindowIndex(0),
171 iOutputPointer(NULL),
176 EXPORT_C void TUnicodeCompressor::CompressL(RWriteStream& aOutput,MUnicodeSource& aInput,
177 TInt aMaxOutputBytes,TInt aMaxInputWords,
178 TInt* aOutputBytes,TInt* aInputWords)
180 DoCompressL(&aOutput,NULL,&aInput,aMaxOutputBytes,aMaxInputWords,aOutputBytes,aInputWords);
183 EXPORT_C void TUnicodeCompressor::CompressL(TUint8* aOutput,MUnicodeSource& aInput,
184 TInt aMaxOutputBytes,TInt aMaxInputWords,
185 TInt* aOutputBytes,TInt* aInputWords)
187 DoCompressL(NULL,aOutput,&aInput,aMaxOutputBytes,aMaxInputWords,aOutputBytes,aInputWords);
190 EXPORT_C TInt TUnicodeCompressor::FlushL(RWriteStream& aOutput,TInt aMaxOutputBytes,TInt& aOutputBytes)
192 DoCompressL(&aOutput,NULL,NULL,aMaxOutputBytes,0,&aOutputBytes,NULL);
193 return iOutputBufferSize;
196 EXPORT_C TInt TUnicodeCompressor::FlushL(TUint8* aOutput,TInt aMaxOutputBytes,TInt& aOutputBytes)
198 DoCompressL(NULL,aOutput,NULL,aMaxOutputBytes,0,&aOutputBytes,NULL);
199 return iOutputBufferSize;
202 EXPORT_C TInt TUnicodeCompressor::CompressedSizeL(MUnicodeSource& aInput,TInt aInputWords)
205 TUnicodeCompressor c;
206 c.DoCompressL(NULL,NULL,&aInput,KMaxTInt,aInputWords,&bytes,NULL);
210 // Compress until input or output is exhausted or an exception occurs.
211 void TUnicodeCompressor::DoCompressL(RWriteStream* aOutputStream,TUint8* aOutputPointer,MUnicodeSource* aInput,
212 TInt aMaxOutputBytes,TInt aMaxInputWords,
213 TInt* aOutputBytes,TInt* aInputWords)
215 iOutputStream = aOutputStream;
216 iOutputPointer = aOutputPointer;
218 iMaxCompressedBytes = aMaxOutputBytes;
219 iMaxUnicodeWords = aMaxInputWords;
220 iCompressedBytes = iUnicodeWords = 0;
221 FlushOutputBufferL();
224 while (iUnicodeWords < iMaxUnicodeWords && iCompressedBytes < iMaxCompressedBytes)
226 TUint16 x = iInput->ReadUnicodeValueL();
228 iInputBuffer[(iInputBufferStart + iInputBufferSize) % EMaxInputBufferSize] = action;
231 if (iInputBufferSize == EMaxInputBufferSize)
237 *aOutputBytes = iCompressedBytes;
239 *aInputWords = iUnicodeWords;
242 TUnicodeCompressor::TAction::TAction(TUint16 aCode):
245 if (TUnicodeCompressionState::EncodeAsIs(aCode))
246 iTreatment = EPlainASCII;
249 iTreatment = TUnicodeCompressionState::DynamicWindowOffsetIndex(aCode);
250 if (iTreatment == -1)
252 iTreatment = TUnicodeCompressionState::StaticWindowIndex(aCode);
253 if (iTreatment == -1)
254 iTreatment = EPlainUnicode;
256 iTreatment += EFirstStatic;
261 void TUnicodeCompressor::WriteCharacterFromBuffer()
263 const TAction& action = iInputBuffer[iInputBufferStart];
265 iInputBufferStart = (iInputBufferStart + 1) % EMaxInputBufferSize;
266 WriteCharacter(action);
269 void TUnicodeCompressor::FlushInputBufferL()
271 while (iInputBufferSize > 0 && iCompressedBytes < iMaxCompressedBytes)
275 void TUnicodeCompressor::WriteRunL()
277 // Write out any leading characters that can be passed through.
279 while (iInputBufferSize > 0)
281 const TAction& action = iInputBuffer[iInputBufferStart];
282 if (action.iTreatment == TAction::EPlainASCII ||
283 (action.iCode >= iActiveWindowBase && action.iCode < iActiveWindowBase + 128))
284 WriteCharacterFromBuffer();
289 // Write a run of characters that cannot be passed through.
291 if (iInputBufferSize > 0)
294 Find a run of characters with the same treatment and select that treatment
295 if the run has more than one character.
297 int treatment = iInputBuffer[iInputBufferStart].iTreatment;
298 int next_treatment = treatment;
300 for (i = 1; i < iInputBufferSize; i++)
302 int index = (iInputBufferStart + i) % EMaxInputBufferSize;
303 next_treatment = iInputBuffer[index].iTreatment;
304 if (next_treatment != treatment)
309 SelectTreatment(treatment);
310 for (i = 0; i < run_size; i++)
311 WriteCharacterFromBuffer();
314 FlushOutputBufferL();
317 void TUnicodeCompressor::FlushOutputBufferL()
319 while (iOutputBufferSize > 0 && iCompressedBytes < iMaxCompressedBytes)
321 TUint8 byte = iOutputBuffer[iOutputBufferStart];
323 *iOutputPointer++ = byte;
324 else if (iOutputStream)
325 iOutputStream->WriteUint8L(byte);
328 iOutputBufferStart = (iOutputBufferStart + 1) % EMaxOutputBufferSize;
332 void TUnicodeCompressor::SelectTreatment(TInt aTreatment)
334 if (aTreatment == TAction::EPlainUnicode)
336 // Switch to Unicode mode if not there already.
345 if (aTreatment == TAction::EPlainASCII)
347 // Switch to single-byte mode, using the current dynamic window, if not there already.
350 WriteByte(UC0 + iDynamicWindowIndex);
351 iUnicodeMode = FALSE;
356 if (aTreatment >= TAction::EFirstDynamic && aTreatment <= TAction::ELastDynamic)
358 TUint32 base = DynamicWindowBase(aTreatment);
360 // Switch to the appropriate dynamic window if it is available; if not, redefine and select dynamic window 4.
361 for (int i = 0; i < EDynamicWindows; i++)
362 if (base == iDynamicWindow[i])
366 else if (i != iDynamicWindowIndex)
368 iUnicodeMode = FALSE;
369 iDynamicWindowIndex = i;
370 iActiveWindowBase = base;
377 iDynamicWindowIndex = 4;
378 iUnicodeMode = FALSE;
379 WriteByte(aTreatment);
380 iDynamicWindow[4] = base;
381 iActiveWindowBase = base;
386 // Write a character without changing mode or window.
387 void TUnicodeCompressor::WriteCharacter(const TAction& aAction)
390 WriteUCharacter(aAction.iCode);
392 WriteSCharacter(aAction);
395 void TUnicodeCompressor::WriteUCharacter(TUint16 aCode)
397 // Emit the 'quote Unicode' tag if the character would conflict with a tag.
398 if (aCode >= 0xE000 && aCode <= 0xF2FF)
401 // Write the Unicode value big-end first.
402 WriteByte((aCode >> 8) & 0xFF);
403 WriteByte(aCode & 0xFF);
406 void TUnicodeCompressor::WriteByte(TUint aByte)
408 if (iOutputBufferSize >= EMaxOutputBufferSize)
409 Panic(EOutputBufferOverflow); //Panic here is ok as this is a programming error
410 iOutputBuffer[(iOutputBufferStart + iOutputBufferSize) % EMaxOutputBufferSize] = (TUint8)aByte;
414 void TUnicodeCompressor::WriteSCharacter(const TAction& aAction)
416 // Characters in the range 0x0020..0x007F, plus nul, tab, cr, and lf, can be emitted as their low bytes.
417 if (aAction.iTreatment == TAction::EPlainASCII)
419 WriteByte(aAction.iCode);
423 // Characters in a static window can be written using SQ<n> plus a byte in the range 0x00-0x7F
424 if (aAction.iTreatment >= TAction::EFirstStatic && aAction.iTreatment <= TAction::ELastStatic)
426 int window = aAction.iTreatment - TAction::EFirstStatic;
427 WriteByte(SQ0 + window);
428 WriteByte(aAction.iCode);
432 // Characters in the current dynamic window can be written as a byte in the range 0x80-0xFF.
433 if (aAction.iCode >= iActiveWindowBase && aAction.iCode < iActiveWindowBase + 128)
435 WriteByte(aAction.iCode - iActiveWindowBase + 0x80);
439 // Characters in another dynamic window can be written using SQ<n> plus a byte in the range 0x80-0xFF
441 for (i = 0; i < EDynamicWindows; i++)
442 if (aAction.iCode >= iDynamicWindow[i] && aAction.iCode < iDynamicWindow[i] + 128)
445 WriteByte(aAction.iCode - iDynamicWindow[i] + 0x80);
449 // Other characters can be quoted.
451 WriteByte((aAction.iCode >> 8) & 0xFF);
452 WriteByte(aAction.iCode & 0xFF);
456 EXPORT_C TUnicodeExpander::TUnicodeExpander():
457 iInputBufferStart(0),
459 iOutputBufferStart(0),
460 iOutputBufferSize(0),
467 EXPORT_C void TUnicodeExpander::ExpandL(MUnicodeSink& aOutput,RReadStream& aInput,
468 TInt aMaxOutputWords,TInt aMaxInputBytes,
469 TInt* aOutputWords,TInt* aInputBytes)
471 DoExpandL(&aOutput,&aInput,NULL,aMaxOutputWords,aMaxInputBytes,aOutputWords,aInputBytes);
474 EXPORT_C void TUnicodeExpander::ExpandL(MUnicodeSink& aOutput,const TUint8* aInput,
475 TInt aMaxOutputWords,TInt aMaxInputBytes,
476 TInt* aOutputWords,TInt* aInputBytes)
478 DoExpandL(&aOutput,NULL,aInput,aMaxOutputWords,aMaxInputBytes,aOutputWords,aInputBytes);
481 EXPORT_C TInt TUnicodeExpander::FlushL(MUnicodeSink& aOutput,TInt aMaxOutputWords,TInt& aOutputWords)
483 DoExpandL(&aOutput,NULL,NULL,aMaxOutputWords,0,&aOutputWords,NULL);
484 return iOutputBufferSize;
487 EXPORT_C TInt TUnicodeExpander::ExpandedSizeL(RReadStream& aInput,TInt aInputBytes)
491 e.DoExpandL(NULL,&aInput,NULL,KMaxTInt,aInputBytes,&words,NULL);
495 EXPORT_C TInt TUnicodeExpander::ExpandedSizeL(const TUint8* aInput,TInt aInputBytes)
499 e.DoExpandL(NULL,NULL,aInput,KMaxTInt,aInputBytes,&words,NULL);
503 // Expand until input or output is exhausted or an exception occurs.
504 void TUnicodeExpander::DoExpandL(MUnicodeSink* aOutput,RReadStream* aInputStream,const TUint8* aInputPointer,
505 TInt aMaxOutputWords,TInt aMaxInputBytes,
506 TInt* aOutputWords,TInt* aInputBytes)
509 iInputStream = aInputStream;
510 iInputPointer = aInputPointer;
511 iMaxUnicodeWords = aMaxOutputWords;
512 iMaxCompressedBytes = aMaxInputBytes;
513 iUnicodeWords = iCompressedBytes = 0;
514 iInputBufferStart = 0;
515 FlushOutputBufferL();
516 if (iInputPointer || iInputStream)
518 while (iUnicodeWords + iOutputBufferSize < iMaxUnicodeWords && iCompressedBytes < iMaxCompressedBytes)
522 *aOutputWords = iUnicodeWords;
524 *aInputBytes = iCompressedBytes;
527 void TUnicodeExpander::HandleByteL()
530 TBool handled = FALSE;
534 handled = HandleUByteL(byte);
536 handled = HandleSByteL(byte);
538 iInputBufferStart = 0;
540 iInputBufferSize = 0;
541 FlushOutputBufferL();
544 void TUnicodeExpander::FlushOutputBufferL()
546 while (iOutputBufferSize > 0 && iUnicodeWords < iMaxUnicodeWords)
549 iOutput->WriteUnicodeValueL(iOutputBuffer[iOutputBufferStart]);
552 iOutputBufferStart = (iOutputBufferStart + 1) % EMaxOutputBufferSize;
556 TBool TUnicodeExpander::HandleSByteL(TUint8 aByte)
558 // 'Pass-through' codes.
559 if (TUnicodeCompressionState::EncodeAsIs(aByte))
565 // Codes 0x80-0xFF select a character from the active window.
568 WriteChar32(iActiveWindowBase + aByte - 0x80);
572 // SQU: quote a Unicode character.
574 return QuoteUnicodeL();
576 // SCU: switch to Unicode mode.
583 // SQn: quote from window n.
584 if (aByte >= SQ0 && aByte <= SQ0 + 7)
586 int window = aByte - SQ0;
592 c += iStaticWindow[window];
594 c += iDynamicWindow[window] - 0x80;
602 // SCn: switch to dynamic window n.
603 if (aByte >= SC0 && aByte <= SC0 + 7)
605 iActiveWindowBase = iDynamicWindow[aByte - SC0];
609 // SDn: define dynamic window n and switch to it.
610 if (aByte >= SD0 && aByte <= SD0 + 7)
611 return DefineWindowL(aByte - SD0);
613 // SDX: define window in the expansion space.
615 return DefineExpansionWindowL();
617 User::Leave(KErrCorrupt);
621 TBool TUnicodeExpander::HandleUByteL(TUint8 aByte)
623 // Plain Unicode; get the low byte and emit the Unicode value.
624 if (aByte <= 0xDF || aByte >= 0xF3)
629 TUint16 c = (TUint16)((aByte << 8) | lo);
637 // Quote a Unicode character that would otherwise conflict with a tag.
639 return QuoteUnicodeL();
641 // UCn: change to single byte mode and select window n.
642 if (aByte >= UC0 && aByte <= UC0 + 7)
644 iUnicodeMode = FALSE;
645 iActiveWindowBase = iDynamicWindow[aByte - UC0];
649 // UDn: define dynamic window n and switch to it.
650 if (aByte >= UD0 && aByte <= UD0 + 7)
651 return DefineWindowL(aByte - UD0);
653 // UDX: define window in the expansion space.
655 return DefineExpansionWindowL();
657 User::Leave(KErrCorrupt);
661 TBool TUnicodeExpander::QuoteUnicodeL()
664 if (ReadByteL(hi) && ReadByteL(lo))
666 TUint16 c = (TUint16)((hi << 8) | lo);
674 TBool TUnicodeExpander::DefineWindowL(TInt aIndex)
677 if (ReadByteL(window))
679 iUnicodeMode = FALSE;
680 iActiveWindowBase = DynamicWindowBase(window);
681 iDynamicWindow[aIndex] = iActiveWindowBase;
688 TBool TUnicodeExpander::DefineExpansionWindowL()
691 if (ReadByteL(hi) && ReadByteL(lo))
693 iUnicodeMode = FALSE;
694 iActiveWindowBase = 0x10000 + (0x80 * ((hi & 0x1F) * 0x100 + lo));
695 iDynamicWindow[hi >> 5] = iActiveWindowBase;
702 // Read either from the buffer (in the case of restarting after source finished in mid-operation) or from the source.
703 TBool TUnicodeExpander::ReadByteL(TUint8& aByte)
705 if (iInputBufferStart < iInputBufferSize)
707 aByte = iInputBuffer[iInputBufferStart++];
710 else if (iCompressedBytes < iMaxCompressedBytes)
713 aByte = *iInputPointer++;
715 aByte = iInputStream->ReadUint8L();
716 iInputBuffer[iInputBufferStart++] = aByte;
717 iInputBufferSize = iInputBufferStart;
725 void TUnicodeExpander::WriteChar(TUint16 aChar)
727 if (iOutputBufferSize >= EMaxOutputBufferSize)
728 Panic(EOutputBufferOverflow); //Panic here is ok since this is a programming error
729 iOutputBuffer[(iOutputBufferStart + iOutputBufferSize) % EMaxOutputBufferSize] = aChar;
733 // Write a Unicode character; write using surrogates if in the range 0x10000..0x10FFFF.
734 void TUnicodeExpander::WriteChar32(TUint aChar)
737 WriteChar((TUint16)aChar);
738 else if (aChar <= 0x10FFFF)
740 aChar -= 0x10000; // reduce to 20-bit value in the range 0x0..0xFFFFF
741 WriteChar((TUint16)(0xD800 + (aChar >> 10))); // first high surrogate + high 10 bits
742 WriteChar((TUint16)(0xDC00 + (aChar & 0x03FF))); // first low surrogate + low 10 bits
745 //The panic is kept here because this case (like the one before) cannot be exercised by a test: the largest value that can be passed in is 0x10FFFF.