os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/generic/tclEncoding.c
Update contrib.
4 * Contains the implementation of the encoding conversion package.
6 * Copyright (c) 1996-1998 Sun Microsystems, Inc.
7 * Portions Copyright (c) 2007-2008 Nokia Corporation and/or its subsidiaries. All rights reserved.
9 * See the file "license.terms" for information on usage and redistribution
10 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
12 * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.14 2007/02/12 19:25:42 andreas_kupries Exp $
17 #if defined(__SYMBIAN32__)
18 #include "tclSymbianGlobals.h"
21 typedef size_t (LengthProc)_ANSI_ARGS_((CONST char *src));
24 * The following data structure represents an encoding, which describes how
25 * to convert between various character sets and UTF-8.
28 typedef struct Encoding {
29 char *name; /* Name of encoding. Malloced because (1)
30 * hash table entry that owns this encoding
31 * may be freed prior to this encoding being
32 * freed, (2) string passed in the
33 * Tcl_EncodingType structure may not be
35 Tcl_EncodingConvertProc *toUtfProc;
36 /* Procedure to convert from external
37 * encoding into UTF-8. */
38 Tcl_EncodingConvertProc *fromUtfProc;
39 /* Procedure to convert from UTF-8 into
40 * external encoding. */
41 Tcl_EncodingFreeProc *freeProc;
42 /* If non-NULL, procedure to call when this
43 * encoding is deleted. */
44 int nullSize; /* Number of 0x00 bytes that signify
45 * end-of-string in this encoding. This
46 * number is used to determine the source
47 * string length when the srcLen argument is
48 * negative. This number can be 1 or 2. */
49 ClientData clientData; /* Arbitrary value associated with encoding
50 * type. Passed to conversion procedures. */
51 LengthProc *lengthProc; /* Function to compute length of
52 * null-terminated strings in this encoding.
53 * If nullSize is 1, this is strlen; if
54 * nullSize is 2, this is a function that
55 * returns the number of bytes in a 0x0000
56 * terminated string. */
57 int refCount; /* Number of uses of this structure. */
58 Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */
62 * The following structure is the clientData for a dynamically-loaded,
63 * table-driven encoding created by LoadTableEncoding(). It maps between
64 * Unicode and a single-byte, double-byte, or multibyte (1 or 2 bytes only)
68 typedef struct TableEncodingData {
69 int fallback; /* Character (in this encoding) to
70 * substitute when this encoding cannot
71 * represent a UTF-8 character. */
72 char prefixBytes[256]; /* If a byte in the input stream is a lead
73 * byte for a 2-byte sequence, the
74 * corresponding entry in this array is 1,
75 * otherwise it is 0. */
76 unsigned short **toUnicode; /* Two dimensional sparse matrix to map
77 * characters from the encoding to Unicode.
78 * Each element of the toUnicode array points
79 * to an array of 256 shorts. If there is no
80 * corresponding character in Unicode, the
81 * value in the matrix is 0x0000. malloc'd. */
82 unsigned short **fromUnicode;
83 /* Two dimensional sparse matrix to map
84 * characters from Unicode to the encoding.
85 * Each element of the fromUnicode array
86 * points to an array of 256 shorts. If there
87 * is no corresponding character the encoding,
88 * the value in the matrix is 0x0000.
93 * The following structures is the clientData for a dynamically-loaded,
94 * escape-driven encoding that is itself comprised of other simpler
95 * encodings. An example is "iso-2022-jp", which uses escape sequences to
96 * switch between ascii, jis0208, jis0212, gb2312, and ksc5601. Note that
97 * "escape-driven" does not necessarily mean that the ESCAPE character is
98 * the character used for switching character sets.
101 typedef struct EscapeSubTable {
102 unsigned int sequenceLen; /* Length of following string. */
103 char sequence[16]; /* Escape code that marks this encoding. */
104 char name[32]; /* Name for encoding. */
105 Encoding *encodingPtr; /* Encoding loaded using above name, or NULL
106 * if this sub-encoding has not been needed
110 typedef struct EscapeEncodingData {
111 int fallback; /* Character (in this encoding) to
112 * substitute when this encoding cannot
113 * represent a UTF-8 character. */
114 unsigned int initLen; /* Length of following string. */
115 char init[16]; /* String to emit or expect before first char
117 unsigned int finalLen; /* Length of following string. */
118 char final[16]; /* String to emit or expect after last char
120 char prefixBytes[256]; /* If a byte in the input stream is the
121 * first character of one of the escape
122 * sequences in the following array, the
123 * corresponding entry in this array is 1,
124 * otherwise it is 0. */
125 int numSubTables; /* Length of following array. */
126 EscapeSubTable subTables[1];/* Information about each EscapeSubTable
127 * used by this encoding type. The actual
128 * size will be as large as necessary to
129 * hold all EscapeSubTables. */
130 } EscapeEncodingData;
133 * Constants used when loading an encoding file to identify the type of the
137 #define ENCODING_SINGLEBYTE 0
138 #define ENCODING_DOUBLEBYTE 1
139 #define ENCODING_MULTIBYTE 2
140 #define ENCODING_ESCAPE 3
142 #if !defined(__SYMBIAN32__) || !defined(__WINSCW__)
144 * Initialize the default encoding directory. If this variable contains
145 * a non NULL value, it will be the first path used to locate the
146 * system encoding files.
149 char *tclDefaultEncodingDir = NULL;
151 static int encodingsInitialized = 0;
154 * Hash table that keeps track of all loaded Encodings. Keys are
155 * the string names that represent the encoding, values are (Encoding *).
158 static Tcl_HashTable encodingTable;
159 TCL_DECLARE_MUTEX(encodingMutex)
162 * The following are used to hold the default and current system encodings.
163 * If NULL is passed to one of the conversion routines, the current setting
164 * of the system encoding will be used to perform the conversion.
167 static Tcl_Encoding defaultEncoding;
168 static Tcl_Encoding systemEncoding;
171 * The following variable is used in the sparse matrix code for a
172 * TableEncoding to represent a page in the table that has no entries.
175 static unsigned short emptyPage[256];
178 * Procedures used only in this module.
181 static int BinaryProc _ANSI_ARGS_((ClientData clientData,
182 CONST char *src, int srcLen, int flags,
183 Tcl_EncodingState *statePtr, char *dst, int dstLen,
184 int *srcReadPtr, int *dstWrotePtr,
186 static void DupEncodingIntRep _ANSI_ARGS_((Tcl_Obj *srcPtr,
188 static void EscapeFreeProc _ANSI_ARGS_((ClientData clientData));
189 static int EscapeFromUtfProc _ANSI_ARGS_((ClientData clientData,
190 CONST char *src, int srcLen, int flags,
191 Tcl_EncodingState *statePtr, char *dst, int dstLen,
192 int *srcReadPtr, int *dstWrotePtr,
194 static int EscapeToUtfProc _ANSI_ARGS_((ClientData clientData,
195 CONST char *src, int srcLen, int flags,
196 Tcl_EncodingState *statePtr, char *dst, int dstLen,
197 int *srcReadPtr, int *dstWrotePtr,
199 static void FreeEncoding _ANSI_ARGS_((Tcl_Encoding encoding));
200 static void FreeEncodingIntRep _ANSI_ARGS_((Tcl_Obj *objPtr));
201 static Encoding * GetTableEncoding _ANSI_ARGS_((
202 EscapeEncodingData *dataPtr, int state));
203 static Tcl_Encoding LoadEncodingFile _ANSI_ARGS_((Tcl_Interp *interp,
205 static Tcl_Encoding LoadTableEncoding _ANSI_ARGS_((Tcl_Interp *interp,
206 CONST char *name, int type, Tcl_Channel chan));
207 static Tcl_Encoding LoadEscapeEncoding _ANSI_ARGS_((CONST char *name,
209 static Tcl_Channel OpenEncodingFile _ANSI_ARGS_((CONST char *dir,
211 static void TableFreeProc _ANSI_ARGS_((ClientData clientData));
212 static int TableFromUtfProc _ANSI_ARGS_((ClientData clientData,
213 CONST char *src, int srcLen, int flags,
214 Tcl_EncodingState *statePtr, char *dst, int dstLen,
215 int *srcReadPtr, int *dstWrotePtr,
217 static int TableToUtfProc _ANSI_ARGS_((ClientData clientData,
218 CONST char *src, int srcLen, int flags,
219 Tcl_EncodingState *statePtr, char *dst, int dstLen,
220 int *srcReadPtr, int *dstWrotePtr,
222 static size_t unilen _ANSI_ARGS_((CONST char *src));
223 static int UnicodeToUtfProc _ANSI_ARGS_((ClientData clientData,
224 CONST char *src, int srcLen, int flags,
225 Tcl_EncodingState *statePtr, char *dst, int dstLen,
226 int *srcReadPtr, int *dstWrotePtr,
228 static int UtfToUnicodeProc _ANSI_ARGS_((ClientData clientData,
229 CONST char *src, int srcLen, int flags,
230 Tcl_EncodingState *statePtr, char *dst, int dstLen,
231 int *srcReadPtr, int *dstWrotePtr,
233 static int UtfToUtfProc _ANSI_ARGS_((ClientData clientData,
234 CONST char *src, int srcLen, int flags,
235 Tcl_EncodingState *statePtr, char *dst, int dstLen,
236 int *srcReadPtr, int *dstWrotePtr,
237 int *dstCharsPtr, int pureNullMode));
238 static int UtfIntToUtfExtProc _ANSI_ARGS_((ClientData clientData,
239 CONST char *src, int srcLen, int flags,
240 Tcl_EncodingState *statePtr, char *dst, int dstLen,
241 int *srcReadPtr, int *dstWrotePtr,
243 static int UtfExtToUtfIntProc _ANSI_ARGS_((ClientData clientData,
244 CONST char *src, int srcLen, int flags,
245 Tcl_EncodingState *statePtr, char *dst, int dstLen,
246 int *srcReadPtr, int *dstWrotePtr,
248 static int TclFindEncodings _ANSI_ARGS_((CONST char *argv0));
251 * A Tcl_ObjType for holding a cached Tcl_Encoding as the intrep.
252 * This should help the lifetime of encodings be more useful.
253 * See concerns raised in [Bug 1077262].
256 static Tcl_ObjType EncodingType = {
257 "encoding", FreeEncodingIntRep, DupEncodingIntRep, NULL, NULL
262 *----------------------------------------------------------------------
264 * TclGetEncodingFromObj --
266 * Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr),
267 * if possible, and returns TCL_OK. If no such encoding exists,
268 * TCL_ERROR is returned, and if interp is non-NULL, an error message
272 * Standard Tcl return code.
275 * Caches the Tcl_Encoding value as the internal rep of (*objPtr).
277 *----------------------------------------------------------------------
280 TclGetEncodingFromObj(interp, objPtr, encodingPtr)
283 Tcl_Encoding *encodingPtr;
285 CONST char *name = Tcl_GetString(objPtr);
286 if (objPtr->typePtr != &EncodingType) {
287 Tcl_Encoding encoding = Tcl_GetEncoding(interp, name);
289 if (encoding == NULL) {
292 if (objPtr->typePtr && objPtr->typePtr->freeIntRepProc) {
293 objPtr->typePtr->freeIntRepProc(objPtr);
295 objPtr->internalRep.otherValuePtr = (VOID *) encoding;
296 objPtr->typePtr = &EncodingType;
298 *encodingPtr = Tcl_GetEncoding(NULL, name);
303 *----------------------------------------------------------------------
305 * FreeEncodingIntRep --
307 * The Tcl_FreeInternalRepProc for the "encoding" Tcl_ObjType.
309 *----------------------------------------------------------------------
312 FreeEncodingIntRep(objPtr)
315 Tcl_FreeEncoding((Tcl_Encoding) objPtr->internalRep.otherValuePtr);
319 *----------------------------------------------------------------------
321 * DupEncodingIntRep --
323 * The Tcl_DupInternalRepProc for the "encoding" Tcl_ObjType.
325 *----------------------------------------------------------------------
328 DupEncodingIntRep(srcPtr, dupPtr)
332 dupPtr->internalRep.otherValuePtr = (VOID *)
333 Tcl_GetEncoding(NULL, srcPtr->bytes);
337 *---------------------------------------------------------------------------
339 * TclInitEncodingSubsystem --
341 * Initialize all resources used by this subsystem on a per-process
348 * Depends on the memory, object, and IO subsystems.
350 *---------------------------------------------------------------------------
354 TclInitEncodingSubsystem()
356 Tcl_EncodingType type;
358 Tcl_MutexLock(&encodingMutex);
359 Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
360 Tcl_MutexUnlock(&encodingMutex);
363 * Create a few initial encodings. Note that the UTF-8 to UTF-8
364 * translation is not a no-op, because it will turn a stream of
365 * improperly formed UTF-8 into a properly formed stream.
368 type.encodingName = "identity";
369 type.toUtfProc = BinaryProc;
370 type.fromUtfProc = BinaryProc;
371 type.freeProc = NULL;
373 type.clientData = NULL;
375 defaultEncoding = Tcl_CreateEncoding(&type);
376 systemEncoding = Tcl_GetEncoding(NULL, type.encodingName);
378 type.encodingName = "utf-8";
379 type.toUtfProc = UtfExtToUtfIntProc;
380 type.fromUtfProc = UtfIntToUtfExtProc;
381 type.freeProc = NULL;
383 type.clientData = NULL;
384 Tcl_CreateEncoding(&type);
386 type.encodingName = "unicode";
387 type.toUtfProc = UnicodeToUtfProc;
388 type.fromUtfProc = UtfToUnicodeProc;
389 type.freeProc = NULL;
391 type.clientData = NULL;
392 Tcl_CreateEncoding(&type);
397 *----------------------------------------------------------------------
399 * TclFinalizeEncodingSubsystem --
401 * Release the state associated with the encoding subsystem.
407 * Frees all of the encodings.
409 *----------------------------------------------------------------------
413 TclFinalizeEncodingSubsystem()
415 Tcl_HashSearch search;
418 Tcl_MutexLock(&encodingMutex);
419 encodingsInitialized = 0;
420 FreeEncoding(systemEncoding);
421 hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
422 while (hPtr != NULL) {
424 * Call FreeEncoding instead of doing it directly to handle refcounts
425 * like escape encodings use. [Bug #524674]
426 * Make sure to call Tcl_FirstHashEntry repeatedly so that all
427 * encodings are eventually cleaned up.
429 FreeEncoding((Tcl_Encoding) Tcl_GetHashValue(hPtr));
430 hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
432 Tcl_DeleteHashTable(&encodingTable);
433 Tcl_MutexUnlock(&encodingMutex);
437 *-------------------------------------------------------------------------
439 * Tcl_GetDefaultEncodingDir --
446 *-------------------------------------------------------------------------
449 EXPORT_C CONST char *
450 Tcl_GetDefaultEncodingDir()
452 return tclDefaultEncodingDir;
456 *-------------------------------------------------------------------------
458 * Tcl_SetDefaultEncodingDir --
465 *-------------------------------------------------------------------------
469 Tcl_SetDefaultEncodingDir(path)
472 tclDefaultEncodingDir = (char *)ckalloc((unsigned) strlen(path) + 1);
473 strcpy(tclDefaultEncodingDir, path);
477 *-------------------------------------------------------------------------
481 * Given the name of a encoding, find the corresponding Tcl_Encoding
482 * token. If the encoding did not already exist, Tcl attempts to
483 * dynamically load an encoding by that name.
486 * Returns a token that represents the encoding. If the name didn't
487 * refer to any known or loadable encoding, NULL is returned. If
488 * NULL was returned, an error message is left in interp's result
489 * object, unless interp was NULL.
492 * The new encoding type is entered into a table visible to all
493 * interpreters, keyed off the encoding's name. For each call to
494 * this procedure, there should eventually be a call to
495 * Tcl_FreeEncoding, so that the database can be cleaned up when
496 * encodings aren't needed anymore.
498 *-------------------------------------------------------------------------
501 EXPORT_C Tcl_Encoding
502 Tcl_GetEncoding(interp, name)
503 Tcl_Interp *interp; /* Interp for error reporting, if not NULL. */
504 CONST char *name; /* The name of the desired encoding. */
507 Encoding *encodingPtr;
509 Tcl_MutexLock(&encodingMutex);
511 encodingPtr = (Encoding *) systemEncoding;
512 encodingPtr->refCount++;
513 Tcl_MutexUnlock(&encodingMutex);
514 return systemEncoding;
517 hPtr = Tcl_FindHashEntry(&encodingTable, name);
519 encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr);
520 encodingPtr->refCount++;
521 Tcl_MutexUnlock(&encodingMutex);
522 return (Tcl_Encoding) encodingPtr;
524 Tcl_MutexUnlock(&encodingMutex);
525 return LoadEncodingFile(interp, name);
529 *---------------------------------------------------------------------------
531 * Tcl_FreeEncoding --
533 * This procedure is called to release an encoding allocated by
534 * Tcl_CreateEncoding() or Tcl_GetEncoding().
540 * The reference count associated with the encoding is decremented
541 * and the encoding may be deleted if nothing is using it anymore.
543 *---------------------------------------------------------------------------
547 Tcl_FreeEncoding(encoding)
548 Tcl_Encoding encoding;
550 Tcl_MutexLock(&encodingMutex);
551 FreeEncoding(encoding);
552 Tcl_MutexUnlock(&encodingMutex);
556 *----------------------------------------------------------------------
560 * This procedure is called to release an encoding by procedures
561 * that already have the encodingMutex.
567 * The reference count associated with the encoding is decremented
568 * and the encoding may be deleted if nothing is using it anymore.
570 *----------------------------------------------------------------------
574 FreeEncoding(encoding)
575 Tcl_Encoding encoding;
577 Encoding *encodingPtr;
579 encodingPtr = (Encoding *) encoding;
580 if (encodingPtr == NULL) {
583 encodingPtr->refCount--;
584 if (encodingPtr->refCount == 0) {
585 if (encodingPtr->freeProc != NULL) {
586 (*encodingPtr->freeProc)(encodingPtr->clientData);
588 if (encodingPtr->hPtr != NULL) {
589 Tcl_DeleteHashEntry(encodingPtr->hPtr);
591 ckfree((char *) encodingPtr->name);
592 ckfree((char *) encodingPtr);
597 *-------------------------------------------------------------------------
599 * Tcl_GetEncodingName --
601 * Given an encoding, return the name that was used to constuct
605 * The name of the encoding.
610 *---------------------------------------------------------------------------
613 EXPORT_C CONST char *
614 Tcl_GetEncodingName(encoding)
615 Tcl_Encoding encoding; /* The encoding whose name to fetch. */
617 Encoding *encodingPtr;
619 if (encoding == NULL) {
620 encoding = systemEncoding;
622 encodingPtr = (Encoding *) encoding;
623 return encodingPtr->name;
627 *-------------------------------------------------------------------------
629 * Tcl_GetEncodingNames --
631 * Get the list of all known encodings, including the ones stored
632 * as files on disk in the encoding path.
635 * Modifies interp's result object to hold a list of all the available
641 *-------------------------------------------------------------------------
645 Tcl_GetEncodingNames(interp)
646 Tcl_Interp *interp; /* Interp to hold result. */
648 Tcl_HashSearch search;
650 Tcl_Obj *pathPtr, *resultPtr;
655 Tcl_MutexLock(&encodingMutex);
656 Tcl_InitHashTable(&table, TCL_STRING_KEYS);
657 hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
658 while (hPtr != NULL) {
659 Encoding *encodingPtr;
661 encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr);
662 Tcl_CreateHashEntry(&table, encodingPtr->name, &dummy);
663 hPtr = Tcl_NextHashEntry(&search);
665 Tcl_MutexUnlock(&encodingMutex);
667 pathPtr = TclGetLibraryPath();
668 if (pathPtr != NULL) {
671 char globArgString[10];
672 Tcl_Obj* encodingObj = Tcl_NewStringObj("encoding",-1);
673 Tcl_IncrRefCount(encodingObj);
676 Tcl_ListObjGetElements(NULL, pathPtr, &objc, &objv);
678 for (i = 0; i < objc; i++) {
682 * Construct the path from the element of pathPtr,
683 * joined with 'encoding'.
685 searchIn = Tcl_FSJoinToPath(objv[i],1,&encodingObj);
686 Tcl_IncrRefCount(searchIn);
687 Tcl_ResetResult(interp);
690 * TclGlob() changes the contents of globArgString, which causes
691 * a segfault if we pass in a pointer to non-writeable memory.
692 * TclGlob() puts its results directly into interp.
695 strcpy(globArgString, "*.enc");
697 * The GLOBMODE_TAILS flag returns just the tail of each file
698 * which is the encoding name with a .enc extension
700 if ((TclGlob(interp, globArgString, searchIn,
701 TCL_GLOBMODE_TAILS, NULL) == TCL_OK)) {
706 Tcl_ListObjGetElements(NULL, Tcl_GetObjResult(interp), &objc2,
709 for (j = 0; j < objc2; j++) {
712 string = Tcl_GetStringFromObj(objv2[j], &length);
715 string[length] = '\0';
716 Tcl_CreateHashEntry(&table, string, &dummy);
717 string[length] = '.';
721 Tcl_DecrRefCount(searchIn);
723 Tcl_DecrRefCount(encodingObj);
727 * Clear any values placed in the result by globbing.
730 Tcl_ResetResult(interp);
731 resultPtr = Tcl_GetObjResult(interp);
733 hPtr = Tcl_FirstHashEntry(&table, &search);
734 while (hPtr != NULL) {
737 strPtr = Tcl_NewStringObj(Tcl_GetHashKey(&table, hPtr), -1);
738 Tcl_ListObjAppendElement(NULL, resultPtr, strPtr);
739 hPtr = Tcl_NextHashEntry(&search);
741 Tcl_DeleteHashTable(&table);
745 *------------------------------------------------------------------------
747 * Tcl_SetSystemEncoding --
749 * Sets the default encoding that should be used whenever the user
750 * passes a NULL value in to one of the conversion routines.
751 * If the supplied name is NULL, the system encoding is reset to the
752 * default system encoding.
755 * The return value is TCL_OK if the system encoding was successfully
756 * set to the encoding specified by name, TCL_ERROR otherwise. If
757 * TCL_ERROR is returned, an error message is left in interp's result
758 * object, unless interp was NULL.
761 * The reference count of the new system encoding is incremented.
762 * The reference count of the old system encoding is decremented and
765 *------------------------------------------------------------------------
769 Tcl_SetSystemEncoding(interp, name)
770 Tcl_Interp *interp; /* Interp for error reporting, if not NULL. */
771 CONST char *name; /* The name of the desired encoding, or NULL
772 * to reset to default encoding. */
774 Tcl_Encoding encoding;
775 Encoding *encodingPtr;
778 Tcl_MutexLock(&encodingMutex);
779 encoding = defaultEncoding;
780 encodingPtr = (Encoding *) encoding;
781 encodingPtr->refCount++;
782 Tcl_MutexUnlock(&encodingMutex);
784 encoding = Tcl_GetEncoding(interp, name);
785 if (encoding == NULL) {
790 Tcl_MutexLock(&encodingMutex);
791 FreeEncoding(systemEncoding);
792 systemEncoding = encoding;
793 Tcl_MutexUnlock(&encodingMutex);
799 *---------------------------------------------------------------------------
801 * Tcl_CreateEncoding --
803 * This procedure is called to define a new encoding and the procedures
804 * that are used to convert between the specified encoding and Unicode.
807 * Returns a token that represents the encoding. If an encoding with
808 * the same name already existed, the old encoding token remains
809 * valid and continues to behave as it used to, and will eventually
810 * be garbage collected when the last reference to it goes away. Any
811 * subsequent calls to Tcl_GetEncoding with the specified name will
812 * retrieve the most recent encoding token.
815 * The new encoding type is entered into a table visible to all
816 * interpreters, keyed off the encoding's name. For each call to
817 * this procedure, there should eventually be a call to
818 * Tcl_FreeEncoding, so that the database can be cleaned up when
819 * encodings aren't needed anymore.
821 *---------------------------------------------------------------------------
824 EXPORT_C Tcl_Encoding
825 Tcl_CreateEncoding(typePtr)
826 Tcl_EncodingType *typePtr; /* The encoding type. */
830 Encoding *encodingPtr;
833 Tcl_MutexLock(&encodingMutex);
834 hPtr = Tcl_CreateHashEntry(&encodingTable, typePtr->encodingName, &new);
837 * Remove old encoding from hash table, but don't delete it until
838 * last reference goes away.
841 encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr);
842 encodingPtr->hPtr = NULL;
845 name = ckalloc((unsigned) strlen(typePtr->encodingName) + 1);
847 encodingPtr = (Encoding *) ckalloc(sizeof(Encoding));
848 encodingPtr->name = strcpy(name, typePtr->encodingName);
849 encodingPtr->toUtfProc = typePtr->toUtfProc;
850 encodingPtr->fromUtfProc = typePtr->fromUtfProc;
851 encodingPtr->freeProc = typePtr->freeProc;
852 encodingPtr->nullSize = typePtr->nullSize;
853 encodingPtr->clientData = typePtr->clientData;
854 if (typePtr->nullSize == 1) {
855 encodingPtr->lengthProc = (LengthProc *) strlen;
857 encodingPtr->lengthProc = (LengthProc *) unilen;
859 encodingPtr->refCount = 1;
860 encodingPtr->hPtr = hPtr;
861 Tcl_SetHashValue(hPtr, encodingPtr);
863 Tcl_MutexUnlock(&encodingMutex);
865 return (Tcl_Encoding) encodingPtr;
869 *-------------------------------------------------------------------------
871 * Tcl_ExternalToUtfDString --
873 * Convert a source buffer from the specified encoding into UTF-8.
874 * If any of the bytes in the source buffer are invalid or cannot
875 * be represented in the target encoding, a default fallback
876 * character will be substituted.
879 * The converted bytes are stored in the DString, which is then NULL
880 * terminated. The return value is a pointer to the value stored
886 *-------------------------------------------------------------------------
890 Tcl_ExternalToUtfDString(encoding, src, srcLen, dstPtr)
891 Tcl_Encoding encoding; /* The encoding for the source string, or
892 * NULL for the default system encoding. */
893 CONST char *src; /* Source string in specified encoding. */
894 int srcLen; /* Source string length in bytes, or < 0 for
895 * encoding-specific string length. */
896 Tcl_DString *dstPtr; /* Uninitialized or free DString in which
897 * the converted string is stored. */
900 Tcl_EncodingState state;
901 Encoding *encodingPtr;
902 int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
904 Tcl_DStringInit(dstPtr);
905 dst = Tcl_DStringValue(dstPtr);
906 dstLen = dstPtr->spaceAvl - 1;
908 if (encoding == NULL) {
909 encoding = systemEncoding;
911 encodingPtr = (Encoding *) encoding;
915 } else if (srcLen < 0) {
916 srcLen = (*encodingPtr->lengthProc)(src);
918 flags = TCL_ENCODING_START | TCL_ENCODING_END;
920 result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src,
921 srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
923 soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
924 if (result != TCL_CONVERT_NOSPACE) {
925 Tcl_DStringSetLength(dstPtr, soFar);
926 return Tcl_DStringValue(dstPtr);
928 flags &= ~TCL_ENCODING_START;
931 if (Tcl_DStringLength(dstPtr) == 0) {
932 Tcl_DStringSetLength(dstPtr, dstLen);
934 Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
935 dst = Tcl_DStringValue(dstPtr) + soFar;
936 dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
941 *-------------------------------------------------------------------------
943 * Tcl_ExternalToUtf --
945 * Convert a source buffer from the specified encoding into UTF-8.
948 * The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE,
949 * TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE,
950 * as documented in tcl.h.
953 * The converted bytes are stored in the output buffer.
955 *-------------------------------------------------------------------------
959 Tcl_ExternalToUtf(interp, encoding, src, srcLen, flags, statePtr, dst,
960 dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr)
961 Tcl_Interp *interp; /* Interp for error return, if not NULL. */
962 Tcl_Encoding encoding; /* The encoding for the source string, or
963 * NULL for the default system encoding. */
964 CONST char *src; /* Source string in specified encoding. */
965 int srcLen; /* Source string length in bytes, or < 0 for
966 * encoding-specific string length. */
967 int flags; /* Conversion control flags. */
968 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
969 * state information used during a piecewise
970 * conversion. Contents of statePtr are
971 * initialized and/or reset by conversion
972 * routine under control of flags argument. */
973 char *dst; /* Output buffer in which converted string
975 int dstLen; /* The maximum length of output buffer in
977 int *srcReadPtr; /* Filled with the number of bytes from the
978 * source string that were converted. This
979 * may be less than the original source length
980 * if there was a problem converting some
981 * source characters. */
982 int *dstWrotePtr; /* Filled with the number of bytes that were
983 * stored in the output buffer as a result of
985 int *dstCharsPtr; /* Filled with the number of characters that
986 * correspond to the bytes stored in the
989 Encoding *encodingPtr;
990 int result, srcRead, dstWrote, dstChars;
991 Tcl_EncodingState state;
993 if (encoding == NULL) {
994 encoding = systemEncoding;
996 encodingPtr = (Encoding *) encoding;
1000 } else if (srcLen < 0) {
1001 srcLen = (*encodingPtr->lengthProc)(src);
1003 if (statePtr == NULL) {
1004 flags |= TCL_ENCODING_START | TCL_ENCODING_END;
1007 if (srcReadPtr == NULL) {
1008 srcReadPtr = &srcRead;
1010 if (dstWrotePtr == NULL) {
1011 dstWrotePtr = &dstWrote;
1013 if (dstCharsPtr == NULL) {
1014 dstCharsPtr = &dstChars;
1018 * If there are any null characters in the middle of the buffer, they will
1019 * converted to the UTF-8 null character (\xC080). To get the actual
1020 * \0 at the end of the destination buffer, we need to append it manually.
1024 result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src, srcLen,
1025 flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
1027 dst[*dstWrotePtr] = '\0';
1032 *-------------------------------------------------------------------------
1034 * Tcl_UtfToExternalDString --
1036 * Convert a source buffer from UTF-8 into the specified encoding.
1037 * If any of the bytes in the source buffer are invalid or cannot
1038 * be represented in the target encoding, a default fallback
1039 * character will be substituted.
1042 * The converted bytes are stored in the DString, which is then
1043 * NULL terminated in an encoding-specific manner. The return value
1044 * is a pointer to the value stored in the DString.
1049 *-------------------------------------------------------------------------
1053 Tcl_UtfToExternalDString(encoding, src, srcLen, dstPtr)
1054 Tcl_Encoding encoding; /* The encoding for the converted string,
1055 * or NULL for the default system encoding. */
1056 CONST char *src; /* Source string in UTF-8. */
1057 int srcLen; /* Source string length in bytes, or < 0 for
1059 Tcl_DString *dstPtr; /* Uninitialized or free DString in which
1060 * the converted string is stored. */
1063 Tcl_EncodingState state;
1064 Encoding *encodingPtr;
1065 int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
1067 Tcl_DStringInit(dstPtr);
1068 dst = Tcl_DStringValue(dstPtr);
1069 dstLen = dstPtr->spaceAvl - 1;
1071 if (encoding == NULL) {
1072 encoding = systemEncoding;
1074 encodingPtr = (Encoding *) encoding;
1078 } else if (srcLen < 0) {
1079 srcLen = strlen(src);
1081 flags = TCL_ENCODING_START | TCL_ENCODING_END;
1083 result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src,
1084 srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
1086 soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
1087 if (result != TCL_CONVERT_NOSPACE) {
1088 if (encodingPtr->nullSize == 2) {
1089 Tcl_DStringSetLength(dstPtr, soFar + 1);
1091 Tcl_DStringSetLength(dstPtr, soFar);
1092 return Tcl_DStringValue(dstPtr);
1094 flags &= ~TCL_ENCODING_START;
1097 if (Tcl_DStringLength(dstPtr) == 0) {
1098 Tcl_DStringSetLength(dstPtr, dstLen);
1100 Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
1101 dst = Tcl_DStringValue(dstPtr) + soFar;
1102 dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
1107 *-------------------------------------------------------------------------
1109 * Tcl_UtfToExternal --
1111 * Convert a buffer from UTF-8 into the specified encoding.
1114 * The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE,
1115 * TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE,
1116 * as documented in tcl.h.
1119 * The converted bytes are stored in the output buffer.
1121 *-------------------------------------------------------------------------
1125 Tcl_UtfToExternal(interp, encoding, src, srcLen, flags, statePtr, dst,
1126 dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr)
1127 Tcl_Interp *interp; /* Interp for error return, if not NULL. */
1128 Tcl_Encoding encoding; /* The encoding for the converted string,
1129 * or NULL for the default system encoding. */
1130 CONST char *src; /* Source string in UTF-8. */
1131 int srcLen; /* Source string length in bytes, or < 0 for
1133 int flags; /* Conversion control flags. */
1134 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1135 * state information used during a piecewise
1136 * conversion. Contents of statePtr are
1137 * initialized and/or reset by conversion
1138 * routine under control of flags argument. */
1139 char *dst; /* Output buffer in which converted string
1141 int dstLen; /* The maximum length of output buffer in
1143 int *srcReadPtr; /* Filled with the number of bytes from the
1144 * source string that were converted. This
1145 * may be less than the original source length
1146 * if there was a problem converting some
1147 * source characters. */
1148 int *dstWrotePtr; /* Filled with the number of bytes that were
1149 * stored in the output buffer as a result of
1150 * the conversion. */
1151 int *dstCharsPtr; /* Filled with the number of characters that
1152 * correspond to the bytes stored in the
1155 Encoding *encodingPtr;
1156 int result, srcRead, dstWrote, dstChars;
1157 Tcl_EncodingState state;
1159 if (encoding == NULL) {
1160 encoding = systemEncoding;
1162 encodingPtr = (Encoding *) encoding;
1166 } else if (srcLen < 0) {
1167 srcLen = strlen(src);
1169 if (statePtr == NULL) {
1170 flags |= TCL_ENCODING_START | TCL_ENCODING_END;
1173 if (srcReadPtr == NULL) {
1174 srcReadPtr = &srcRead;
1176 if (dstWrotePtr == NULL) {
1177 dstWrotePtr = &dstWrote;
1179 if (dstCharsPtr == NULL) {
1180 dstCharsPtr = &dstChars;
1183 dstLen -= encodingPtr->nullSize;
1184 result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src, srcLen,
1185 flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
1187 if (encodingPtr->nullSize == 2) {
1188 dst[*dstWrotePtr + 1] = '\0';
1190 dst[*dstWrotePtr] = '\0';
1196 *---------------------------------------------------------------------------
1198 * Tcl_FindExecutable --
1200 * This procedure computes the absolute path name of the current
1201 * application, given its argv[0] value.
1207 * The variable tclExecutableName gets filled in with the file
1208 * name for the application, if we figured it out. If we couldn't
1209 * figure it out, tclExecutableName is set to NULL.
1211 *---------------------------------------------------------------------------
1215 Tcl_FindExecutable(argv0)
1216 CONST char *argv0; /* The value of the application's argv[0]
1221 Tcl_DString buffer, nameString;
1223 TclInitSubsystems(argv0);
1225 if (argv0 == NULL) {
1228 if (tclExecutableName != NULL) {
1229 ckfree(tclExecutableName);
1230 tclExecutableName = NULL;
1232 if ((name = TclpFindExecutable(argv0)) == NULL) {
1237 * The value returned from TclpNameOfExecutable is a UTF string that
1238 * is possibly dirty depending on when it was initialized.
1239 * TclFindEncodings will indicate whether we must "clean" the UTF (as
1240 * reported by the underlying system). To assure that the UTF string
1241 * is a properly encoded native string for this system, convert the
1242 * UTF string to the default native encoding before the default
1243 * encoding is initialized. Then, convert it back to UTF after the
1244 * system encoding is loaded.
1247 Tcl_UtfToExternalDString(NULL, name, -1, &buffer);
1248 mustCleanUtf = TclFindEncodings(argv0);
1251 * Now it is OK to convert the native string back to UTF and set
1252 * the value of the tclExecutableName.
1256 Tcl_ExternalToUtfDString(NULL, Tcl_DStringValue(&buffer), -1,
1258 tclExecutableName = (char *)
1259 ckalloc((unsigned) (Tcl_DStringLength(&nameString) + 1));
1260 strcpy(tclExecutableName, Tcl_DStringValue(&nameString));
1262 Tcl_DStringFree(&nameString);
1264 tclExecutableName = (char *) ckalloc((unsigned) (strlen(name) + 1));
1265 strcpy(tclExecutableName, name);
1267 Tcl_DStringFree(&buffer);
1271 (void) TclFindEncodings(argv0);
1275 *---------------------------------------------------------------------------
1277 * LoadEncodingFile --
1279 * Read a file that describes an encoding and create a new Encoding
1283 * The return value is the newly loaded Encoding, or NULL if
1284 * the file didn't exist of was in the incorrect format. If NULL was
1285 * returned, an error message is left in interp's result object,
1286 * unless interp was NULL.
1289 * File read from disk.
1291 *---------------------------------------------------------------------------
1295 LoadEncodingFile(interp, name)
1296 Tcl_Interp *interp; /* Interp for error reporting, if not NULL. */
1297 CONST char *name; /* The name of the encoding file on disk
1298 * and also the name for new encoding. */
1304 Tcl_Encoding encoding;
1306 pathPtr = TclGetLibraryPath();
1307 if (pathPtr == NULL) {
1311 Tcl_ListObjGetElements(NULL, pathPtr, &objc, &objv);
1314 for (i = 0; i < objc; i++) {
1315 chan = OpenEncodingFile(Tcl_GetString(objv[i]), name);
1325 Tcl_SetChannelOption(NULL, chan, "-encoding", "utf-8");
1330 Tcl_DStringInit(&ds);
1331 Tcl_Gets(chan, &ds);
1332 ch = Tcl_DStringValue(&ds)[0];
1333 Tcl_DStringFree(&ds);
1342 encoding = LoadTableEncoding(interp, name, ENCODING_SINGLEBYTE,
1347 encoding = LoadTableEncoding(interp, name, ENCODING_DOUBLEBYTE,
1352 encoding = LoadTableEncoding(interp, name, ENCODING_MULTIBYTE,
1357 encoding = LoadEscapeEncoding(name, chan);
1361 if ((encoding == NULL) && (interp != NULL)) {
1362 Tcl_AppendResult(interp, "invalid encoding file \"", name, "\"", NULL);
1364 Tcl_AppendResult(interp, " or missing sub-encoding", NULL);
1367 Tcl_Close(NULL, chan);
1371 if (interp != NULL) {
1372 Tcl_AppendResult(interp, "unknown encoding \"", name, "\"", NULL);
1378 *----------------------------------------------------------------------
1380 * OpenEncodingFile --
1382 * Look for the file encoding/<name>.enc in the specified
1386 * Returns an open file channel if the file exists.
1391 *----------------------------------------------------------------------
1395 OpenEncodingFile(dir, name)
1400 CONST char *argv[3];
1401 Tcl_DString pathString;
1407 argv[1] = "encoding";
1410 Tcl_DStringInit(&pathString);
1411 Tcl_JoinPath(3, argv, &pathString);
1412 path = Tcl_DStringAppend(&pathString, ".enc", -1);
1413 pathPtr = Tcl_NewStringObj(path,-1);
1415 Tcl_IncrRefCount(pathPtr);
1416 chan = Tcl_FSOpenFileChannel(NULL, pathPtr, "r", 0);
1417 Tcl_DecrRefCount(pathPtr);
1419 Tcl_DStringFree(&pathString);
1425 *-------------------------------------------------------------------------
1427 * LoadTableEncoding --
1429 * Helper function for LoadEncodingTable(). Loads a table to that
1430 * converts between Unicode and some other encoding and creates an
1431 * encoding (using a TableEncoding structure) from that information.
1433 * File contains binary data, but begins with a marker to indicate
1434 * byte-ordering, so that same binary file can be read on either
1438 * The return value is the new encoding, or NULL if the encoding
1439 * could not be created (because the file contained invalid data).
1444 *-------------------------------------------------------------------------
1448 LoadTableEncoding(interp, name, type, chan)
1449 Tcl_Interp *interp; /* Interp for temporary obj while reading. */
1450 CONST char *name; /* Name for new encoding. */
1451 int type; /* Type of encoding (ENCODING_?????). */
1452 Tcl_Channel chan; /* File containing new encoding. */
1454 Tcl_DString lineString;
1457 int i, hi, lo, numPages, symbol, fallback;
1458 unsigned char used[256];
1460 TableEncodingData *dataPtr;
1461 unsigned short *pageMemPtr;
1462 Tcl_EncodingType encType;
1465 * Speed over memory. Use a full 256 character table to decode hex
1466 * sequences in the encoding files.
1469 static char staticHex[] = {
1470 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 ... 15 */
1471 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 ... 31 */
1472 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 32 ... 47 */
1473 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 48 ... 63 */
1474 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64 ... 79 */
1475 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 ... 95 */
1476 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96 ... 111 */
1477 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 112 ... 127 */
1478 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128 ... 143 */
1479 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 ... 159 */
1480 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 ... 175 */
1481 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 ... 191 */
1482 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 ... 207 */
1483 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 ... 223 */
1484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 ... 239 */
1485 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 ... 255 */
1488 Tcl_DStringInit(&lineString);
1489 Tcl_Gets(chan, &lineString);
1490 line = Tcl_DStringValue(&lineString);
1492 fallback = (int) strtol(line, &line, 16);
1493 symbol = (int) strtol(line, &line, 10);
1494 numPages = (int) strtol(line, &line, 10);
1495 Tcl_DStringFree(&lineString);
1499 } else if (numPages > 256) {
1503 memset(used, 0, sizeof(used));
1506 #define PAGESIZE (256 * sizeof(unsigned short))
1508 dataPtr = (TableEncodingData *) ckalloc(sizeof(TableEncodingData));
1509 memset(dataPtr, 0, sizeof(TableEncodingData));
1511 dataPtr->fallback = fallback;
1514 * Read the table that maps characters to Unicode. Performs a single
1515 * malloc to get the memory for the array and all the pages needed by
1519 size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE;
1520 dataPtr->toUnicode = (unsigned short **) ckalloc(size);
1521 memset(dataPtr->toUnicode, 0, size);
1522 pageMemPtr = (unsigned short *) (dataPtr->toUnicode + 256);
1524 if (interp == NULL) {
1525 objPtr = Tcl_NewObj();
1527 objPtr = Tcl_GetObjResult(interp);
1529 for (i = 0; i < numPages; i++) {
1533 Tcl_ReadChars(chan, objPtr, 3 + 16 * (16 * 4 + 1), 0);
1534 p = Tcl_GetString(objPtr);
1535 hi = (staticHex[(unsigned int)p[0]] << 4) + staticHex[(unsigned int)p[1]];
1536 dataPtr->toUnicode[hi] = pageMemPtr;
1538 for (lo = 0; lo < 256; lo++) {
1539 if ((lo & 0x0f) == 0) {
1542 ch = (staticHex[(unsigned int)p[0]] << 12) + (staticHex[(unsigned int)p[1]] << 8)
1543 + (staticHex[(unsigned int)p[2]] << 4) + staticHex[(unsigned int)p[3]];
1547 *pageMemPtr = (unsigned short) ch;
1552 if (interp == NULL) {
1553 Tcl_DecrRefCount(objPtr);
1555 Tcl_ResetResult(interp);
1558 if (type == ENCODING_DOUBLEBYTE) {
1559 memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes));
1561 for (hi = 1; hi < 256; hi++) {
1562 if (dataPtr->toUnicode[hi] != NULL) {
1563 dataPtr->prefixBytes[hi] = 1;
1569 * Invert toUnicode array to produce the fromUnicode array. Performs a
1570 * single malloc to get the memory for the array and all the pages
1571 * needed by the array. While reading in the toUnicode array, we
1572 * remembered what pages that would be needed for the fromUnicode array.
1579 for (hi = 0; hi < 256; hi++) {
1584 size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE;
1585 dataPtr->fromUnicode = (unsigned short **) ckalloc(size);
1586 memset(dataPtr->fromUnicode, 0, size);
1587 pageMemPtr = (unsigned short *) (dataPtr->fromUnicode + 256);
1589 for (hi = 0; hi < 256; hi++) {
1590 if (dataPtr->toUnicode[hi] == NULL) {
1591 dataPtr->toUnicode[hi] = emptyPage;
1593 for (lo = 0; lo < 256; lo++) {
1596 ch = dataPtr->toUnicode[hi][lo];
1598 unsigned short *page;
1600 page = dataPtr->fromUnicode[ch >> 8];
1604 dataPtr->fromUnicode[ch >> 8] = page;
1606 page[ch & 0xff] = (unsigned short) ((hi << 8) + lo);
1611 if (type == ENCODING_MULTIBYTE) {
1613 * If multibyte encodings don't have a backslash character, define
1614 * one. Otherwise, on Windows, native file names won't work because
1615 * the backslash in the file name will map to the unknown character
1616 * (question mark) when converting from UTF-8 to external encoding.
1619 if (dataPtr->fromUnicode[0] != NULL) {
1620 if (dataPtr->fromUnicode[0]['\\'] == '\0') {
1621 dataPtr->fromUnicode[0]['\\'] = '\\';
1626 unsigned short *page;
1629 * Make a special symbol encoding that not only maps the symbol
1630 * characters from their Unicode code points down into page 0, but
1631 * also ensure that the characters on page 0 map to themselves.
1632 * This is so that a symbol font can be used to display a simple
1633 * string like "abcd" and have alpha, beta, chi, delta show up,
1634 * rather than have "unknown" chars show up because strictly
1635 * speaking the symbol font doesn't have glyphs for those low ascii
1639 page = dataPtr->fromUnicode[0];
1642 dataPtr->fromUnicode[0] = page;
1644 for (lo = 0; lo < 256; lo++) {
1645 if (dataPtr->toUnicode[0][lo] != 0) {
1646 page[lo] = (unsigned short) lo;
1650 for (hi = 0; hi < 256; hi++) {
1651 if (dataPtr->fromUnicode[hi] == NULL) {
1652 dataPtr->fromUnicode[hi] = emptyPage;
1656 * For trailing 'R'everse encoding, see [Patch #689341]
1658 Tcl_DStringInit(&lineString);
1661 /* skip leading empty lines */
1662 while ((len = Tcl_Gets(chan, &lineString)) == 0)
1667 line = Tcl_DStringValue(&lineString);
1668 if (line[0] != 'R') {
1671 for (Tcl_DStringSetLength(&lineString, 0);
1672 (len = Tcl_Gets(chan, &lineString)) >= 0;
1673 Tcl_DStringSetLength(&lineString, 0)) {
1679 p = (unsigned char*) Tcl_DStringValue(&lineString);
1680 to = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8)
1681 + (staticHex[p[2]] << 4) + staticHex[p[3]];
1685 for (p += 5, len -= 5; len >= 0 && *p; p += 5, len -= 5) {
1686 from = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8)
1687 + (staticHex[p[2]] << 4) + staticHex[p[3]];
1691 dataPtr->fromUnicode[from >> 8][from & 0xff] = to;
1695 Tcl_DStringFree(&lineString);
1697 encType.encodingName = name;
1698 encType.toUtfProc = TableToUtfProc;
1699 encType.fromUtfProc = TableFromUtfProc;
1700 encType.freeProc = TableFreeProc;
1701 encType.nullSize = (type == ENCODING_DOUBLEBYTE) ? 2 : 1;
1702 encType.clientData = (ClientData) dataPtr;
1703 return Tcl_CreateEncoding(&encType);
1707 *-------------------------------------------------------------------------
1709 * LoadEscapeEncoding --
1711 * Helper function for LoadEncodingTable(). Loads a state machine
1712 * that converts between Unicode and some other encoding.
1714 * File contains text data that describes the escape sequences that
1715 * are used to choose an encoding and the associated names for the
1719 * The return value is the new encoding, or NULL if the encoding
1720 * could not be created (because the file contained invalid data).
1725 *-------------------------------------------------------------------------
1729 LoadEscapeEncoding(name, chan)
1730 CONST char *name; /* Name for new encoding. */
1731 Tcl_Channel chan; /* File containing new encoding. */
1733 int i, missingSubEncoding = 0;
1735 Tcl_DString escapeData;
1736 char init[16], final[16];
1737 EscapeEncodingData *dataPtr;
1738 Tcl_EncodingType type;
1742 Tcl_DStringInit(&escapeData);
1748 Tcl_DString lineString;
1750 Tcl_DStringInit(&lineString);
1751 if (Tcl_Gets(chan, &lineString) < 0) {
1754 line = Tcl_DStringValue(&lineString);
1755 if (Tcl_SplitList(NULL, line, &argc, &argv) != TCL_OK) {
1759 if (strcmp(argv[0], "name") == 0) {
1761 } else if (strcmp(argv[0], "init") == 0) {
1762 strncpy(init, argv[1], sizeof(init));
1763 init[sizeof(init) - 1] = '\0';
1764 } else if (strcmp(argv[0], "final") == 0) {
1765 strncpy(final, argv[1], sizeof(final));
1766 final[sizeof(final) - 1] = '\0';
1770 strncpy(est.sequence, argv[1], sizeof(est.sequence));
1771 est.sequence[sizeof(est.sequence) - 1] = '\0';
1772 est.sequenceLen = strlen(est.sequence);
1774 strncpy(est.name, argv[0], sizeof(est.name));
1775 est.name[sizeof(est.name) - 1] = '\0';
1778 * Load the subencodings first so we're never stuck
1779 * trying to use a half-loaded system encoding to
1780 * open/read a *.enc file.
1783 est.encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, est.name);
1784 if ((est.encodingPtr == NULL)
1785 || (est.encodingPtr->toUtfProc != TableToUtfProc)) {
1786 missingSubEncoding = 1;
1788 Tcl_DStringAppend(&escapeData, (char *) &est, sizeof(est));
1791 ckfree((char *) argv);
1792 Tcl_DStringFree(&lineString);
1794 if (missingSubEncoding) {
1795 Tcl_DStringFree(&escapeData);
1799 size = sizeof(EscapeEncodingData)
1800 - sizeof(EscapeSubTable) + Tcl_DStringLength(&escapeData);
1801 dataPtr = (EscapeEncodingData *) ckalloc(size);
1802 dataPtr->initLen = strlen(init);
1803 strcpy(dataPtr->init, init);
1804 dataPtr->finalLen = strlen(final);
1805 strcpy(dataPtr->final, final);
1806 dataPtr->numSubTables = Tcl_DStringLength(&escapeData) / sizeof(EscapeSubTable);
1807 memcpy((VOID *) dataPtr->subTables, (VOID *) Tcl_DStringValue(&escapeData),
1808 (size_t) Tcl_DStringLength(&escapeData));
1809 Tcl_DStringFree(&escapeData);
1811 memset(dataPtr->prefixBytes, 0, sizeof(dataPtr->prefixBytes));
1812 for (i = 0; i < dataPtr->numSubTables; i++) {
1813 dataPtr->prefixBytes[UCHAR(dataPtr->subTables[i].sequence[0])] = 1;
1815 if (dataPtr->init[0] != '\0') {
1816 dataPtr->prefixBytes[UCHAR(dataPtr->init[0])] = 1;
1818 if (dataPtr->final[0] != '\0') {
1819 dataPtr->prefixBytes[UCHAR(dataPtr->final[0])] = 1;
1822 type.encodingName = name;
1823 type.toUtfProc = EscapeToUtfProc;
1824 type.fromUtfProc = EscapeFromUtfProc;
1825 type.freeProc = EscapeFreeProc;
1827 type.clientData = (ClientData) dataPtr;
1829 return Tcl_CreateEncoding(&type);
1833 *-------------------------------------------------------------------------
1837 * The default conversion when no other conversion is specified.
1838 * No translation is done; source bytes are copied directly to
1839 * destination bytes.
1842 * Returns TCL_OK if conversion was successful.
1847 *-------------------------------------------------------------------------
1851 BinaryProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1852 srcReadPtr, dstWrotePtr, dstCharsPtr)
1853 ClientData clientData; /* Not used. */
1854 CONST char *src; /* Source string (unknown encoding). */
1855 int srcLen; /* Source string length in bytes. */
1856 int flags; /* Conversion control flags. */
1857 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1858 * state information used during a piecewise
1859 * conversion. Contents of statePtr are
1860 * initialized and/or reset by conversion
1861 * routine under control of flags argument. */
1862 char *dst; /* Output buffer in which converted string
1864 int dstLen; /* The maximum length of output buffer in
1866 int *srcReadPtr; /* Filled with the number of bytes from the
1867 * source string that were converted. */
1868 int *dstWrotePtr; /* Filled with the number of bytes that were
1869 * stored in the output buffer as a result of
1870 * the conversion. */
1871 int *dstCharsPtr; /* Filled with the number of characters that
1872 * correspond to the bytes stored in the
1878 dstLen -= TCL_UTF_MAX - 1;
1882 if (srcLen > dstLen) {
1884 result = TCL_CONVERT_NOSPACE;
1887 *srcReadPtr = srcLen;
1888 *dstWrotePtr = srcLen;
1889 *dstCharsPtr = srcLen;
1890 memcpy((void *) dst, (void *) src, (size_t) srcLen);
1896 *-------------------------------------------------------------------------
1898 * UtfExtToUtfIntProc --
1900 * Convert from UTF-8 to UTF-8. While converting null-bytes from
1901 * the Tcl's internal representation (0xc0, 0x80) to the official
1902 * representation (0x00). See UtfToUtfProc for details.
1905 * Returns TCL_OK if conversion was successful.
1910 *-------------------------------------------------------------------------
1913 UtfIntToUtfExtProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1914 srcReadPtr, dstWrotePtr, dstCharsPtr)
1915 ClientData clientData; /* Not used. */
1916 CONST char *src; /* Source string in UTF-8. */
1917 int srcLen; /* Source string length in bytes. */
1918 int flags; /* Conversion control flags. */
1919 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1920 * state information used during a piecewise
1921 * conversion. Contents of statePtr are
1922 * initialized and/or reset by conversion
1923 * routine under control of flags argument. */
1924 char *dst; /* Output buffer in which converted string
1926 int dstLen; /* The maximum length of output buffer in
1928 int *srcReadPtr; /* Filled with the number of bytes from the
1929 * source string that were converted. This
1930 * may be less than the original source length
1931 * if there was a problem converting some
1932 * source characters. */
1933 int *dstWrotePtr; /* Filled with the number of bytes that were
1934 * stored in the output buffer as a result of
1935 * the conversion. */
1936 int *dstCharsPtr; /* Filled with the number of characters that
1937 * correspond to the bytes stored in the
1940 return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1941 srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
1945 *-------------------------------------------------------------------------
1947 * UtfExtToUtfIntProc --
1949 * Convert from UTF-8 to UTF-8 while converting null-bytes from
1950 * the official representation (0x00) to Tcl's internal
1951 * representation (0xc0, 0x80). See UtfToUtfProc for details.
1954 * Returns TCL_OK if conversion was successful.
1959 *-------------------------------------------------------------------------
1962 UtfExtToUtfIntProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1963 srcReadPtr, dstWrotePtr, dstCharsPtr)
1964 ClientData clientData; /* Not used. */
1965 CONST char *src; /* Source string in UTF-8. */
1966 int srcLen; /* Source string length in bytes. */
1967 int flags; /* Conversion control flags. */
1968 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1969 * state information used during a piecewise
1970 * conversion. Contents of statePtr are
1971 * initialized and/or reset by conversion
1972 * routine under control of flags argument. */
1973 char *dst; /* Output buffer in which converted string
1975 int dstLen; /* The maximum length of output buffer in
1977 int *srcReadPtr; /* Filled with the number of bytes from the
1978 * source string that were converted. This
1979 * may be less than the original source length
1980 * if there was a problem converting some
1981 * source characters. */
1982 int *dstWrotePtr; /* Filled with the number of bytes that were
1983 * stored in the output buffer as a result of
1984 * the conversion. */
1985 int *dstCharsPtr; /* Filled with the number of characters that
1986 * correspond to the bytes stored in the
1989 return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1990 srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
1994 *-------------------------------------------------------------------------
1998 * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8
1999 * translation is not a no-op, because it will turn a stream of
2000 * improperly formed UTF-8 into a properly formed stream.
2003 * Returns TCL_OK if conversion was successful.
2008 *-------------------------------------------------------------------------
2012 UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2013 srcReadPtr, dstWrotePtr, dstCharsPtr, pureNullMode)
2014 ClientData clientData; /* Not used. */
2015 CONST char *src; /* Source string in UTF-8. */
2016 int srcLen; /* Source string length in bytes. */
2017 int flags; /* Conversion control flags. */
2018 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2019 * state information used during a piecewise
2020 * conversion. Contents of statePtr are
2021 * initialized and/or reset by conversion
2022 * routine under control of flags argument. */
2023 char *dst; /* Output buffer in which converted string
2025 int dstLen; /* The maximum length of output buffer in
2027 int *srcReadPtr; /* Filled with the number of bytes from the
2028 * source string that were converted. This
2029 * may be less than the original source length
2030 * if there was a problem converting some
2031 * source characters. */
2032 int *dstWrotePtr; /* Filled with the number of bytes that were
2033 * stored in the output buffer as a result of
2034 * the conversion. */
2035 int *dstCharsPtr; /* Filled with the number of characters that
2036 * correspond to the bytes stored in the
2038 int pureNullMode; /* Convert embedded nulls from
2039 * internal representation to real
2040 * null-bytes or vice versa */
2043 CONST char *srcStart, *srcEnd, *srcClose;
2044 char *dstStart, *dstEnd;
2045 int result, numChars;
2051 srcEnd = src + srcLen;
2053 if ((flags & TCL_ENCODING_END) == 0) {
2054 srcClose -= TCL_UTF_MAX;
2058 dstEnd = dst + dstLen - TCL_UTF_MAX;
2060 for (numChars = 0; src < srcEnd; numChars++) {
2061 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2063 * If there is more string to follow, this will ensure that the
2064 * last UTF-8 character in the source buffer hasn't been cut off.
2067 result = TCL_CONVERT_MULTIBYTE;
2071 result = TCL_CONVERT_NOSPACE;
2074 if (UCHAR(*src) < 0x80 &&
2075 !(UCHAR(*src) == 0 && pureNullMode == 0)) {
2077 * Copy 7bit chatacters, but skip null-bytes when we are
2078 * in input mode, so that they get converted to 0xc080.
2081 } else if (pureNullMode == 1 &&
2082 UCHAR(*src) == 0xc0 &&
2083 UCHAR(*(src+1)) == 0x80) {
2085 * Convert 0xc080 to real nulls when we are in output mode.
2089 } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
2090 /* Always check before using Tcl_UtfToUniChar. Not doing
2091 * can so cause it run beyond the endof the buffer! If we
2092 * * happen such an incomplete char its byts are made to *
2093 * represent themselves.
2096 ch = (Tcl_UniChar) *src;
2098 dst += Tcl_UniCharToUtf(ch, dst);
2100 src += Tcl_UtfToUniChar(src, &ch);
2101 dst += Tcl_UniCharToUtf(ch, dst);
2105 *srcReadPtr = src - srcStart;
2106 *dstWrotePtr = dst - dstStart;
2107 *dstCharsPtr = numChars;
2112 *-------------------------------------------------------------------------
2114 * UnicodeToUtfProc --
2116 * Convert from Unicode to UTF-8.
2119 * Returns TCL_OK if conversion was successful.
2124 *-------------------------------------------------------------------------
2128 UnicodeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2129 srcReadPtr, dstWrotePtr, dstCharsPtr)
2130 ClientData clientData; /* Not used. */
2131 CONST char *src; /* Source string in Unicode. */
2132 int srcLen; /* Source string length in bytes. */
2133 int flags; /* Conversion control flags. */
2134 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2135 * state information used during a piecewise
2136 * conversion. Contents of statePtr are
2137 * initialized and/or reset by conversion
2138 * routine under control of flags argument. */
2139 char *dst; /* Output buffer in which converted string
2141 int dstLen; /* The maximum length of output buffer in
2143 int *srcReadPtr; /* Filled with the number of bytes from the
2144 * source string that were converted. This
2145 * may be less than the original source length
2146 * if there was a problem converting some
2147 * source characters. */
2148 int *dstWrotePtr; /* Filled with the number of bytes that were
2149 * stored in the output buffer as a result of
2150 * the conversion. */
2151 int *dstCharsPtr; /* Filled with the number of characters that
2152 * correspond to the bytes stored in the
2155 CONST char *srcStart, *srcEnd;
2156 char *dstEnd, *dstStart;
2157 int result, numChars;
2161 if ((srcLen % sizeof(Tcl_UniChar)) != 0) {
2162 result = TCL_CONVERT_MULTIBYTE;
2163 srcLen /= sizeof(Tcl_UniChar);
2164 srcLen *= sizeof(Tcl_UniChar);
2168 srcEnd = src + srcLen;
2171 dstEnd = dst + dstLen - TCL_UTF_MAX;
2173 for (numChars = 0; src < srcEnd; numChars++) {
2175 result = TCL_CONVERT_NOSPACE;
2179 * Special case for 1-byte utf chars for speed. Make sure we
2180 * work with Tcl_UniChar-size data.
2182 ch = *(Tcl_UniChar *)src;
2183 if (ch && ch < 0x80) {
2184 *dst++ = (ch & 0xFF);
2186 dst += Tcl_UniCharToUtf(ch, dst);
2188 src += sizeof(Tcl_UniChar);
2191 *srcReadPtr = src - srcStart;
2192 *dstWrotePtr = dst - dstStart;
2193 *dstCharsPtr = numChars;
2198 *-------------------------------------------------------------------------
2200 * UtfToUnicodeProc --
2202 * Convert from UTF-8 to Unicode.
2205 * Returns TCL_OK if conversion was successful.
2210 *-------------------------------------------------------------------------
2214 UtfToUnicodeProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2215 srcReadPtr, dstWrotePtr, dstCharsPtr)
2216 ClientData clientData; /* TableEncodingData that specifies encoding. */
2217 CONST char *src; /* Source string in UTF-8. */
2218 int srcLen; /* Source string length in bytes. */
2219 int flags; /* Conversion control flags. */
2220 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2221 * state information used during a piecewise
2222 * conversion. Contents of statePtr are
2223 * initialized and/or reset by conversion
2224 * routine under control of flags argument. */
2225 char *dst; /* Output buffer in which converted string
2227 int dstLen; /* The maximum length of output buffer in
2229 int *srcReadPtr; /* Filled with the number of bytes from the
2230 * source string that were converted. This
2231 * may be less than the original source length
2232 * if there was a problem converting some
2233 * source characters. */
2234 int *dstWrotePtr; /* Filled with the number of bytes that were
2235 * stored in the output buffer as a result of
2236 * the conversion. */
2237 int *dstCharsPtr; /* Filled with the number of characters that
2238 * correspond to the bytes stored in the
2241 CONST char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
2242 int result, numChars;
2246 srcEnd = src + srcLen;
2248 if ((flags & TCL_ENCODING_END) == 0) {
2249 srcClose -= TCL_UTF_MAX;
2253 dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
2256 for (numChars = 0; src < srcEnd; numChars++) {
2257 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2259 * If there is more string to follow, this will ensure that the
2260 * last UTF-8 character in the source buffer hasn't been cut off.
2263 result = TCL_CONVERT_MULTIBYTE;
2267 result = TCL_CONVERT_NOSPACE;
2270 src += TclUtfToUniChar(src, &ch);
2272 * Need to handle this in a way that won't cause misalignment
2273 * by casting dst to a Tcl_UniChar. [Bug 1122671]
2274 * XXX: This hard-codes the assumed size of Tcl_UniChar as 2.
2276 #ifdef WORDS_BIGENDIAN
2278 *dst++ = (ch & 0xFF);
2280 *dst++ = (ch & 0xFF);
2284 *srcReadPtr = src - srcStart;
2285 *dstWrotePtr = dst - dstStart;
2286 *dstCharsPtr = numChars;
2291 *-------------------------------------------------------------------------
2295 * Convert from the encoding specified by the TableEncodingData into
2299 * Returns TCL_OK if conversion was successful.
2304 *-------------------------------------------------------------------------
2308 TableToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2309 srcReadPtr, dstWrotePtr, dstCharsPtr)
2310 ClientData clientData; /* TableEncodingData that specifies
2312 CONST char *src; /* Source string in specified encoding. */
2313 int srcLen; /* Source string length in bytes. */
2314 int flags; /* Conversion control flags. */
2315 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2316 * state information used during a piecewise
2317 * conversion. Contents of statePtr are
2318 * initialized and/or reset by conversion
2319 * routine under control of flags argument. */
2320 char *dst; /* Output buffer in which converted string
2322 int dstLen; /* The maximum length of output buffer in
2324 int *srcReadPtr; /* Filled with the number of bytes from the
2325 * source string that were converted. This
2326 * may be less than the original source length
2327 * if there was a problem converting some
2328 * source characters. */
2329 int *dstWrotePtr; /* Filled with the number of bytes that were
2330 * stored in the output buffer as a result of
2331 * the conversion. */
2332 int *dstCharsPtr; /* Filled with the number of characters that
2333 * correspond to the bytes stored in the
2336 CONST char *srcStart, *srcEnd;
2337 char *dstEnd, *dstStart, *prefixBytes;
2338 int result, byte, numChars;
2340 unsigned short **toUnicode;
2341 unsigned short *pageZero;
2342 TableEncodingData *dataPtr;
2345 srcEnd = src + srcLen;
2348 dstEnd = dst + dstLen - TCL_UTF_MAX;
2350 dataPtr = (TableEncodingData *) clientData;
2351 toUnicode = dataPtr->toUnicode;
2352 prefixBytes = dataPtr->prefixBytes;
2353 pageZero = toUnicode[0];
2356 for (numChars = 0; src < srcEnd; numChars++) {
2358 result = TCL_CONVERT_NOSPACE;
2361 byte = *((unsigned char *) src);
2362 if (prefixBytes[byte]) {
2364 if (src >= srcEnd) {
2366 result = TCL_CONVERT_MULTIBYTE;
2369 ch = toUnicode[byte][*((unsigned char *) src)];
2371 ch = pageZero[byte];
2373 if ((ch == 0) && (byte != 0)) {
2374 if (flags & TCL_ENCODING_STOPONERROR) {
2375 result = TCL_CONVERT_SYNTAX;
2378 if (prefixBytes[byte]) {
2381 ch = (Tcl_UniChar) byte;
2384 * Special case for 1-byte utf chars for speed.
2386 if (ch && ch < 0x80) {
2389 dst += Tcl_UniCharToUtf(ch, dst);
2393 *srcReadPtr = src - srcStart;
2394 *dstWrotePtr = dst - dstStart;
2395 *dstCharsPtr = numChars;
2400 *-------------------------------------------------------------------------
2402 * TableFromUtfProc --
2404 * Convert from UTF-8 into the encoding specified by the
2405 * TableEncodingData.
2408 * Returns TCL_OK if conversion was successful.
2413 *-------------------------------------------------------------------------
2417 TableFromUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2418 srcReadPtr, dstWrotePtr, dstCharsPtr)
2419 ClientData clientData; /* TableEncodingData that specifies
2421 CONST char *src; /* Source string in UTF-8. */
2422 int srcLen; /* Source string length in bytes. */
2423 int flags; /* Conversion control flags. */
2424 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2425 * state information used during a piecewise
2426 * conversion. Contents of statePtr are
2427 * initialized and/or reset by conversion
2428 * routine under control of flags argument. */
2429 char *dst; /* Output buffer in which converted string
2431 int dstLen; /* The maximum length of output buffer in
2433 int *srcReadPtr; /* Filled with the number of bytes from the
2434 * source string that were converted. This
2435 * may be less than the original source length
2436 * if there was a problem converting some
2437 * source characters. */
2438 int *dstWrotePtr; /* Filled with the number of bytes that were
2439 * stored in the output buffer as a result of
2440 * the conversion. */
2441 int *dstCharsPtr; /* Filled with the number of characters that
2442 * correspond to the bytes stored in the
2445 CONST char *srcStart, *srcEnd, *srcClose;
2446 char *dstStart, *dstEnd, *prefixBytes;
2448 int result, len, word, numChars;
2449 TableEncodingData *dataPtr;
2450 unsigned short **fromUnicode;
2454 dataPtr = (TableEncodingData *) clientData;
2455 prefixBytes = dataPtr->prefixBytes;
2456 fromUnicode = dataPtr->fromUnicode;
2459 srcEnd = src + srcLen;
2461 if ((flags & TCL_ENCODING_END) == 0) {
2462 srcClose -= TCL_UTF_MAX;
2466 dstEnd = dst + dstLen - 1;
2468 for (numChars = 0; src < srcEnd; numChars++) {
2469 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2471 * If there is more string to follow, this will ensure that the
2472 * last UTF-8 character in the source buffer hasn't been cut off.
2475 result = TCL_CONVERT_MULTIBYTE;
2478 len = TclUtfToUniChar(src, &ch);
2482 * This prevents a crash condition. More evaluation is required
2483 * for full support of int Tcl_UniChar. [Bug 1004065]
2485 if (ch & 0xffff0000) {
2489 word = fromUnicode[(ch >> 8)][ch & 0xff];
2491 if ((word == 0) && (ch != 0)) {
2492 if (flags & TCL_ENCODING_STOPONERROR) {
2493 result = TCL_CONVERT_UNKNOWN;
2496 word = dataPtr->fallback;
2498 if (prefixBytes[(word >> 8)] != 0) {
2499 if (dst + 1 > dstEnd) {
2500 result = TCL_CONVERT_NOSPACE;
2503 dst[0] = (char) (word >> 8);
2504 dst[1] = (char) word;
2508 result = TCL_CONVERT_NOSPACE;
2511 dst[0] = (char) word;
2516 *srcReadPtr = src - srcStart;
2517 *dstWrotePtr = dst - dstStart;
2518 *dstCharsPtr = numChars;
2523 *---------------------------------------------------------------------------
2527 * This procedure is invoked when an encoding is deleted. It deletes
2528 * the memory used by the TableEncodingData.
2536 *---------------------------------------------------------------------------
2540 TableFreeProc(clientData)
2541 ClientData clientData; /* TableEncodingData that specifies
2544 TableEncodingData *dataPtr;
2547 * Make sure we aren't freeing twice on shutdown. [Bug #219314]
2550 dataPtr = (TableEncodingData *) clientData;
2551 ckfree((char *) dataPtr->toUnicode);
2552 ckfree((char *) dataPtr->fromUnicode);
2553 ckfree((char *) dataPtr);
2557 *-------------------------------------------------------------------------
2559 * EscapeToUtfProc --
2561 * Convert from the encoding specified by the EscapeEncodingData into
2565 * Returns TCL_OK if conversion was successful.
2570 *-------------------------------------------------------------------------
2574 EscapeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2575 srcReadPtr, dstWrotePtr, dstCharsPtr)
2576 ClientData clientData; /* EscapeEncodingData that specifies
2578 CONST char *src; /* Source string in specified encoding. */
2579 int srcLen; /* Source string length in bytes. */
2580 int flags; /* Conversion control flags. */
2581 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2582 * state information used during a piecewise
2583 * conversion. Contents of statePtr are
2584 * initialized and/or reset by conversion
2585 * routine under control of flags argument. */
2586 char *dst; /* Output buffer in which converted string
2588 int dstLen; /* The maximum length of output buffer in
2590 int *srcReadPtr; /* Filled with the number of bytes from the
2591 * source string that were converted. This
2592 * may be less than the original source length
2593 * if there was a problem converting some
2594 * source characters. */
2595 int *dstWrotePtr; /* Filled with the number of bytes that were
2596 * stored in the output buffer as a result of
2597 * the conversion. */
2598 int *dstCharsPtr; /* Filled with the number of characters that
2599 * correspond to the bytes stored in the
2602 EscapeEncodingData *dataPtr;
2603 char *prefixBytes, *tablePrefixBytes;
2604 unsigned short **tableToUnicode;
2605 Encoding *encodingPtr;
2606 int state, result, numChars;
2607 CONST char *srcStart, *srcEnd;
2608 char *dstStart, *dstEnd;
2612 tablePrefixBytes = NULL; /* lint. */
2613 tableToUnicode = NULL; /* lint. */
2615 dataPtr = (EscapeEncodingData *) clientData;
2616 prefixBytes = dataPtr->prefixBytes;
2620 srcEnd = src + srcLen;
2623 dstEnd = dst + dstLen - TCL_UTF_MAX;
2625 state = (int) *statePtr;
2626 if (flags & TCL_ENCODING_START) {
2630 for (numChars = 0; src < srcEnd; ) {
2631 int byte, hi, lo, ch;
2634 result = TCL_CONVERT_NOSPACE;
2637 byte = *((unsigned char *) src);
2638 if (prefixBytes[byte]) {
2639 unsigned int left, len, longest;
2641 EscapeSubTable *subTablePtr;
2644 * Saw the beginning of an escape sequence.
2647 left = srcEnd - src;
2648 len = dataPtr->initLen;
2654 (memcmp(src, dataPtr->init, len) == 0)) {
2656 * If we see initialization string, skip it, even if we're
2657 * not at the beginning of the buffer.
2664 len = dataPtr->finalLen;
2665 if (len > longest) {
2671 (memcmp(src, dataPtr->final, len) == 0)) {
2673 * If we see finalization string, skip it, even if we're
2674 * not at the end of the buffer.
2681 subTablePtr = dataPtr->subTables;
2682 for (i = 0; i < dataPtr->numSubTables; i++) {
2683 len = subTablePtr->sequenceLen;
2684 if (len > longest) {
2690 (memcmp(src, subTablePtr->sequence, len) == 0)) {
2700 if (subTablePtr == NULL) {
2702 * A match was found, the escape sequence was consumed, and
2703 * the state was updated.
2710 * We have a split-up or unrecognized escape sequence. If we
2711 * checked all the sequences, then it's a syntax error,
2712 * otherwise we need more bytes to determine a match.
2715 if ((checked == dataPtr->numSubTables + 2)
2716 || (flags & TCL_ENCODING_END)) {
2717 if ((flags & TCL_ENCODING_STOPONERROR) == 0) {
2719 * Skip the unknown escape sequence.
2725 result = TCL_CONVERT_SYNTAX;
2727 result = TCL_CONVERT_MULTIBYTE;
2732 if (encodingPtr == NULL) {
2733 TableEncodingData *tableDataPtr;
2735 encodingPtr = GetTableEncoding(dataPtr, state);
2736 tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
2737 tablePrefixBytes = tableDataPtr->prefixBytes;
2738 tableToUnicode = tableDataPtr->toUnicode;
2740 if (tablePrefixBytes[byte]) {
2742 if (src >= srcEnd) {
2744 result = TCL_CONVERT_MULTIBYTE;
2748 lo = *((unsigned char *) src);
2753 ch = tableToUnicode[hi][lo];
2754 dst += Tcl_UniCharToUtf(ch, dst);
2759 *statePtr = (Tcl_EncodingState) state;
2760 *srcReadPtr = src - srcStart;
2761 *dstWrotePtr = dst - dstStart;
2762 *dstCharsPtr = numChars;
2767 *-------------------------------------------------------------------------
2769 * EscapeFromUtfProc --
2771 * Convert from UTF-8 into the encoding specified by the
2772 * EscapeEncodingData.
2775 * Returns TCL_OK if conversion was successful.
2780 *-------------------------------------------------------------------------
2784 EscapeFromUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2785 srcReadPtr, dstWrotePtr, dstCharsPtr)
2786 ClientData clientData; /* EscapeEncodingData that specifies
2788 CONST char *src; /* Source string in UTF-8. */
2789 int srcLen; /* Source string length in bytes. */
2790 int flags; /* Conversion control flags. */
2791 Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2792 * state information used during a piecewise
2793 * conversion. Contents of statePtr are
2794 * initialized and/or reset by conversion
2795 * routine under control of flags argument. */
2796 char *dst; /* Output buffer in which converted string
2798 int dstLen; /* The maximum length of output buffer in
2800 int *srcReadPtr; /* Filled with the number of bytes from the
2801 * source string that were converted. This
2802 * may be less than the original source length
2803 * if there was a problem converting some
2804 * source characters. */
2805 int *dstWrotePtr; /* Filled with the number of bytes that were
2806 * stored in the output buffer as a result of
2807 * the conversion. */
2808 int *dstCharsPtr; /* Filled with the number of characters that
2809 * correspond to the bytes stored in the
2812 EscapeEncodingData *dataPtr;
2813 Encoding *encodingPtr;
2814 CONST char *srcStart, *srcEnd, *srcClose;
2815 char *dstStart, *dstEnd;
2816 int state, result, numChars;
2817 TableEncodingData *tableDataPtr;
2818 char *tablePrefixBytes;
2819 unsigned short **tableFromUnicode;
2823 dataPtr = (EscapeEncodingData *) clientData;
2826 srcEnd = src + srcLen;
2828 if ((flags & TCL_ENCODING_END) == 0) {
2829 srcClose -= TCL_UTF_MAX;
2833 dstEnd = dst + dstLen - 1;
2836 * RFC1468 states that the text starts in ASCII, and switches to Japanese
2837 * characters, and that the text must end in ASCII. [Patch #474358]
2840 if (flags & TCL_ENCODING_START) {
2842 if ((dst + dataPtr->initLen) > dstEnd) {
2845 return TCL_CONVERT_NOSPACE;
2847 memcpy((VOID *) dst, (VOID *) dataPtr->init,
2848 (size_t) dataPtr->initLen);
2849 dst += dataPtr->initLen;
2851 state = (int) *statePtr;
2854 encodingPtr = GetTableEncoding(dataPtr, state);
2855 tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
2856 tablePrefixBytes = tableDataPtr->prefixBytes;
2857 tableFromUnicode = tableDataPtr->fromUnicode;
2859 for (numChars = 0; src < srcEnd; numChars++) {
2864 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2866 * If there is more string to follow, this will ensure that the
2867 * last UTF-8 character in the source buffer hasn't been cut off.
2870 result = TCL_CONVERT_MULTIBYTE;
2873 len = TclUtfToUniChar(src, &ch);
2874 word = tableFromUnicode[(ch >> 8)][ch & 0xff];
2876 if ((word == 0) && (ch != 0)) {
2878 EscapeSubTable *subTablePtr;
2881 for (state = 0; state < dataPtr->numSubTables; state++) {
2882 encodingPtr = GetTableEncoding(dataPtr, state);
2883 tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
2884 word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xff];
2892 if (flags & TCL_ENCODING_STOPONERROR) {
2893 result = TCL_CONVERT_UNKNOWN;
2896 encodingPtr = GetTableEncoding(dataPtr, state);
2897 tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
2898 word = tableDataPtr->fallback;
2901 tablePrefixBytes = tableDataPtr->prefixBytes;
2902 tableFromUnicode = tableDataPtr->fromUnicode;
2905 * The state variable has the value of oldState when word is 0.
2906 * In this case, the escape sequense should not be copied to dst
2907 * because the current character set is not changed.
2909 if (state != oldState) {
2910 subTablePtr = &dataPtr->subTables[state];
2911 if ((dst + subTablePtr->sequenceLen) > dstEnd) {
2913 * If there is no space to write the escape sequence, the
2914 * state variable must be changed to the value of oldState
2915 * variable because this escape sequence must be written
2916 * in the next conversion.
2919 result = TCL_CONVERT_NOSPACE;
2922 memcpy((VOID *) dst, (VOID *) subTablePtr->sequence,
2923 (size_t) subTablePtr->sequenceLen);
2924 dst += subTablePtr->sequenceLen;
2928 if (tablePrefixBytes[(word >> 8)] != 0) {
2929 if (dst + 1 > dstEnd) {
2930 result = TCL_CONVERT_NOSPACE;
2933 dst[0] = (char) (word >> 8);
2934 dst[1] = (char) word;
2938 result = TCL_CONVERT_NOSPACE;
2941 dst[0] = (char) word;
2947 if ((result == TCL_OK) && (flags & TCL_ENCODING_END)) {
2948 unsigned int len = dataPtr->subTables[0].sequenceLen;
2951 * Certain encodings like iso2022-jp need to write
2952 * an escape sequence after all characters have
2953 * been converted. This logic checks that enough
2954 * room is available in the buffer for the escape bytes.
2955 * The TCL_ENCODING_END flag is cleared after a final
2956 * escape sequence has been added to the buffer so
2957 * that another call to this method does not attempt
2958 * to append escape bytes a second time.
2960 if ((dst + dataPtr->finalLen + (state?len:0)) > dstEnd) {
2961 result = TCL_CONVERT_NOSPACE;
2964 memcpy((VOID *) dst, (VOID *) dataPtr->subTables[0].sequence,
2968 memcpy((VOID *) dst, (VOID *) dataPtr->final,
2969 (size_t) dataPtr->finalLen);
2970 dst += dataPtr->finalLen;
2971 state &= ~TCL_ENCODING_END;
2975 *statePtr = (Tcl_EncodingState) state;
2976 *srcReadPtr = src - srcStart;
2977 *dstWrotePtr = dst - dstStart;
2978 *dstCharsPtr = numChars;
2983 *---------------------------------------------------------------------------
2987 * This procedure is invoked when an EscapeEncodingData encoding is
2988 * deleted. It deletes the memory used by the encoding.
2996 *---------------------------------------------------------------------------
3000 EscapeFreeProc(clientData)
3001 ClientData clientData; /* EscapeEncodingData that specifies encoding. */
3003 EscapeEncodingData *dataPtr;
3004 EscapeSubTable *subTablePtr;
3007 dataPtr = (EscapeEncodingData *) clientData;
3008 if (dataPtr == NULL) {
3011 subTablePtr = dataPtr->subTables;
3012 for (i = 0; i < dataPtr->numSubTables; i++) {
3013 FreeEncoding((Tcl_Encoding) subTablePtr->encodingPtr);
3016 ckfree((char *) dataPtr);
3020 *---------------------------------------------------------------------------
3022 * GetTableEncoding --
3024 * Helper function for the EscapeEncodingData conversions. Gets the
3025 * encoding (of type TextEncodingData) that represents the specified
3029 * The return value is the encoding.
3032 * If the encoding that represents the specified state has not
3033 * already been used by this EscapeEncoding, it will be loaded
3034 * and cached in the dataPtr.
3036 *---------------------------------------------------------------------------
3040 GetTableEncoding(dataPtr, state)
3041 EscapeEncodingData *dataPtr;/* Contains names of encodings. */
3042 int state; /* Index in dataPtr of desired Encoding. */
3044 EscapeSubTable *subTablePtr;
3045 Encoding *encodingPtr;
3047 subTablePtr = &dataPtr->subTables[state];
3048 encodingPtr = subTablePtr->encodingPtr;
3049 if (encodingPtr == NULL) {
3051 * Now that escape encodings load their sub-encodings first, and
3052 * fail to load if any sub-encodings are missing, this branch should
3055 encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, subTablePtr->name);
3056 if ((encodingPtr == NULL)
3057 || (encodingPtr->toUtfProc != TableToUtfProc)) {
3058 panic("EscapeToUtfProc: invalid sub table");
3060 subTablePtr->encodingPtr = encodingPtr;
3066 *---------------------------------------------------------------------------
3070 * A helper function for the Tcl_ExternalToUtf functions. This
3071 * function is similar to strlen for double-byte characters: it
3072 * returns the number of bytes in a 0x0000 terminated string.
3080 *---------------------------------------------------------------------------
3089 p = (unsigned short *) src;
3090 while (*p != 0x0000) {
3093 return (char *) p - src;
3097 *-------------------------------------------------------------------------
3099 * TclFindEncodings --
3101 * Find and load the encoding file for this operating system.
3102 * Before this is called, Tcl makes assumptions about the
3103 * native string representation, but the true encoding is not
3107 * Return result of TclpInitLibraryPath, which reports whether the
3108 * path is clean (0) or dirty (1) UTF.
3111 * Varied, see the respective initialization routines.
3113 *-------------------------------------------------------------------------
3117 TclFindEncodings(argv0)
3118 CONST char *argv0; /* Name of executable from argv[0] to main()
3119 * in native multi-byte encoding. */
3121 int mustCleanUtf = 0;
3123 if (encodingsInitialized == 0) {
3125 * Double check inside the mutex. There may be calls
3126 * back into this routine from some of the procedures below.
3130 if (encodingsInitialized == 0) {
3133 Tcl_DString libPath, buffer;
3136 * Have to set this bit here to avoid deadlock with the
3137 * routines below us that call into TclInitSubsystems.
3140 encodingsInitialized = 1;
3142 native = TclpFindExecutable(argv0);
3143 mustCleanUtf = TclpInitLibraryPath(native);
3146 * The library path was set in the TclpInitLibraryPath routine.
3147 * The string set is a dirty UTF string. To preserve the value
3148 * convert the UTF string back to native before setting the new
3152 pathPtr = TclGetLibraryPath();
3153 if ((pathPtr != NULL) && mustCleanUtf) {
3154 Tcl_UtfToExternalDString(NULL, Tcl_GetString(pathPtr), -1,
3158 TclpSetInitialEncodings();
3161 * Now convert the native string back to UTF.
3164 if ((pathPtr != NULL) && mustCleanUtf) {
3165 Tcl_ExternalToUtfDString(NULL, Tcl_DStringValue(&libPath), -1,
3167 pathPtr = Tcl_NewStringObj(Tcl_DStringValue(&buffer), -1);
3168 TclSetLibraryPath(pathPtr);
3170 Tcl_DStringFree(&libPath);
3171 Tcl_DStringFree(&buffer);
3177 return mustCleanUtf;