os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/generic/tclEncoding.c
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/generic/tclEncoding.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,3178 @@
1.4 +/*
1.5 + * tclEncoding.c --
1.6 + *
1.7 + * Contains the implementation of the encoding conversion package.
1.8 + *
1.9 + * Copyright (c) 1996-1998 Sun Microsystems, Inc.
1.10 + * Portions Copyright (c) 2007-2008 Nokia Corporation and/or its subsidiaries. All rights reserved.
1.11 + *
1.12 + * See the file "license.terms" for information on usage and redistribution
1.13 + * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
1.14 + *
1.15 + * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.14 2007/02/12 19:25:42 andreas_kupries Exp $
1.16 + */
1.17 +
1.18 +#include "tclInt.h"
1.19 +#include "tclPort.h"
1.20 +#if defined(__SYMBIAN32__)
1.21 +#include "tclSymbianGlobals.h"
1.22 +#endif
1.23 +
1.24 +typedef size_t (LengthProc)_ANSI_ARGS_((CONST char *src));
1.25 +
1.26 +/*
1.27 + * The following data structure represents an encoding, which describes how
1.28 + * to convert between various character sets and UTF-8.
1.29 + */
1.30 +
1.31 +typedef struct Encoding {
1.32 + char *name; /* Name of encoding. Malloced because (1)
1.33 + * hash table entry that owns this encoding
1.34 + * may be freed prior to this encoding being
1.35 + * freed, (2) string passed in the
1.36 + * Tcl_EncodingType structure may not be
1.37 + * persistent. */
1.38 + Tcl_EncodingConvertProc *toUtfProc;
1.39 + /* Procedure to convert from external
1.40 + * encoding into UTF-8. */
1.41 + Tcl_EncodingConvertProc *fromUtfProc;
1.42 + /* Procedure to convert from UTF-8 into
1.43 + * external encoding. */
1.44 + Tcl_EncodingFreeProc *freeProc;
1.45 + /* If non-NULL, procedure to call when this
1.46 + * encoding is deleted. */
1.47 + int nullSize; /* Number of 0x00 bytes that signify
1.48 + * end-of-string in this encoding. This
1.49 + * number is used to determine the source
1.50 + * string length when the srcLen argument is
1.51 + * negative. This number can be 1 or 2. */
1.52 + ClientData clientData; /* Arbitrary value associated with encoding
1.53 + * type. Passed to conversion procedures. */
1.54 + LengthProc *lengthProc; /* Function to compute length of
1.55 + * null-terminated strings in this encoding.
1.56 + * If nullSize is 1, this is strlen; if
1.57 + * nullSize is 2, this is a function that
1.58 + * returns the number of bytes in a 0x0000
1.59 + * terminated string. */
1.60 + int refCount; /* Number of uses of this structure. */
1.61 + Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */
1.62 +} Encoding;
1.63 +
/*
 * clientData of a dynamically-loaded, table-driven encoding built by
 * LoadTableEncoding(). The tables translate between Unicode and a
 * single-byte, double-byte, or multibyte (1 or 2 bytes only) encoding.
 */

typedef struct TableEncodingData {
    /*
     * Character (expressed in this encoding) substituted whenever a UTF-8
     * character has no representation in this encoding.
     */
    int fallback;

    /*
     * Indexed by input byte: 1 when that byte is the lead byte of a 2-byte
     * sequence, 0 otherwise.
     */
    char prefixBytes[256];

    /*
     * Sparse two-dimensional matrix from the encoding to Unicode. Each
     * element points to an array of 256 shorts; a cell holding 0x0000 means
     * no corresponding Unicode character. malloc'd.
     */
    unsigned short **toUnicode;

    /*
     * Sparse two-dimensional matrix from Unicode to the encoding. Each
     * element points to an array of 256 shorts; a cell holding 0x0000 means
     * no corresponding character in the encoding. malloc'd.
     */
    unsigned short **fromUnicode;
} TableEncodingData;
1.94 +
1.95 +/*
1.96 + * The following structures is the clientData for a dynamically-loaded,
1.97 + * escape-driven encoding that is itself comprised of other simpler
1.98 + * encodings. An example is "iso-2022-jp", which uses escape sequences to
1.99 + * switch between ascii, jis0208, jis0212, gb2312, and ksc5601. Note that
1.100 + * "escape-driven" does not necessarily mean that the ESCAPE character is
1.101 + * the character used for switching character sets.
1.102 + */
1.103 +
1.104 +typedef struct EscapeSubTable {
1.105 + unsigned int sequenceLen; /* Length of following string. */
1.106 + char sequence[16]; /* Escape code that marks this encoding. */
1.107 + char name[32]; /* Name for encoding. */
1.108 + Encoding *encodingPtr; /* Encoding loaded using above name, or NULL
1.109 + * if this sub-encoding has not been needed
1.110 + * yet. */
1.111 +} EscapeSubTable;
1.112 +
1.113 +typedef struct EscapeEncodingData {
1.114 + int fallback; /* Character (in this encoding) to
1.115 + * substitute when this encoding cannot
1.116 + * represent a UTF-8 character. */
1.117 + unsigned int initLen; /* Length of following string. */
1.118 + char init[16]; /* String to emit or expect before first char
1.119 + * in conversion. */
1.120 + unsigned int finalLen; /* Length of following string. */
1.121 + char final[16]; /* String to emit or expect after last char
1.122 + * in conversion. */
1.123 + char prefixBytes[256]; /* If a byte in the input stream is the
1.124 + * first character of one of the escape
1.125 + * sequences in the following array, the
1.126 + * corresponding entry in this array is 1,
1.127 + * otherwise it is 0. */
1.128 + int numSubTables; /* Length of following array. */
1.129 + EscapeSubTable subTables[1];/* Information about each EscapeSubTable
1.130 + * used by this encoding type. The actual
1.131 + * size will be as large as necessary to
1.132 + * hold all EscapeSubTables. */
1.133 +} EscapeEncodingData;
1.134 +
/*
 * Type codes for encoding files. They identify the kind of file being
 * handled while an encoding file is loaded.
 */

#define ENCODING_SINGLEBYTE     0
#define ENCODING_DOUBLEBYTE     1
#define ENCODING_MULTIBYTE      2
#define ENCODING_ESCAPE         3
1.144 +
1.145 +#if !defined(__SYMBIAN32__) || !defined(__WINSCW__)
1.146 +/*
1.147 + * Initialize the default encoding directory. If this variable contains
1.148 + * a non NULL value, it will be the first path used to locate the
1.149 + * system encoding files.
1.150 + */
1.151 +
1.152 +char *tclDefaultEncodingDir = NULL;
1.153 +
1.154 +static int encodingsInitialized = 0;
1.155 +
1.156 +/*
1.157 + * Hash table that keeps track of all loaded Encodings. Keys are
1.158 + * the string names that represent the encoding, values are (Encoding *).
1.159 + */
1.160 +
1.161 +static Tcl_HashTable encodingTable;
1.162 +TCL_DECLARE_MUTEX(encodingMutex)
1.163 +
1.164 +/*
1.165 + * The following are used to hold the default and current system encodings.
1.166 + * If NULL is passed to one of the conversion routines, the current setting
1.167 + * of the system encoding will be used to perform the conversion.
1.168 + */
1.169 +
1.170 +static Tcl_Encoding defaultEncoding;
1.171 +static Tcl_Encoding systemEncoding;
1.172 +#endif
/*
 * Stand-in page for the TableEncoding sparse matrix code: it represents a
 * page of the table that contains no entries.
 */

static unsigned short emptyPage[256];
1.179 +
1.180 +/*
1.181 + * Procedures used only in this module.
1.182 + */
1.183 +
1.184 +static int BinaryProc _ANSI_ARGS_((ClientData clientData,
1.185 + CONST char *src, int srcLen, int flags,
1.186 + Tcl_EncodingState *statePtr, char *dst, int dstLen,
1.187 + int *srcReadPtr, int *dstWrotePtr,
1.188 + int *dstCharsPtr));
1.189 +static void DupEncodingIntRep _ANSI_ARGS_((Tcl_Obj *srcPtr,
1.190 + Tcl_Obj *dupPtr));
1.191 +static void EscapeFreeProc _ANSI_ARGS_((ClientData clientData));
1.192 +static int EscapeFromUtfProc _ANSI_ARGS_((ClientData clientData,
1.193 + CONST char *src, int srcLen, int flags,
1.194 + Tcl_EncodingState *statePtr, char *dst, int dstLen,
1.195 + int *srcReadPtr, int *dstWrotePtr,
1.196 + int *dstCharsPtr));
1.197 +static int EscapeToUtfProc _ANSI_ARGS_((ClientData clientData,
1.198 + CONST char *src, int srcLen, int flags,
1.199 + Tcl_EncodingState *statePtr, char *dst, int dstLen,
1.200 + int *srcReadPtr, int *dstWrotePtr,
1.201 + int *dstCharsPtr));
1.202 +static void FreeEncoding _ANSI_ARGS_((Tcl_Encoding encoding));
1.203 +static void FreeEncodingIntRep _ANSI_ARGS_((Tcl_Obj *objPtr));
1.204 +static Encoding * GetTableEncoding _ANSI_ARGS_((
1.205 + EscapeEncodingData *dataPtr, int state));
1.206 +static Tcl_Encoding LoadEncodingFile _ANSI_ARGS_((Tcl_Interp *interp,
1.207 + CONST char *name));
1.208 +static Tcl_Encoding LoadTableEncoding _ANSI_ARGS_((Tcl_Interp *interp,
1.209 + CONST char *name, int type, Tcl_Channel chan));
1.210 +static Tcl_Encoding LoadEscapeEncoding _ANSI_ARGS_((CONST char *name,
1.211 + Tcl_Channel chan));
1.212 +static Tcl_Channel OpenEncodingFile _ANSI_ARGS_((CONST char *dir,
1.213 + CONST char *name));
1.214 +static void TableFreeProc _ANSI_ARGS_((ClientData clientData));
1.215 +static int TableFromUtfProc _ANSI_ARGS_((ClientData clientData,
1.216 + CONST char *src, int srcLen, int flags,
1.217 + Tcl_EncodingState *statePtr, char *dst, int dstLen,
1.218 + int *srcReadPtr, int *dstWrotePtr,
1.219 + int *dstCharsPtr));
1.220 +static int TableToUtfProc _ANSI_ARGS_((ClientData clientData,
1.221 + CONST char *src, int srcLen, int flags,
1.222 + Tcl_EncodingState *statePtr, char *dst, int dstLen,
1.223 + int *srcReadPtr, int *dstWrotePtr,
1.224 + int *dstCharsPtr));
1.225 +static size_t unilen _ANSI_ARGS_((CONST char *src));
1.226 +static int UnicodeToUtfProc _ANSI_ARGS_((ClientData clientData,
1.227 + CONST char *src, int srcLen, int flags,
1.228 + Tcl_EncodingState *statePtr, char *dst, int dstLen,
1.229 + int *srcReadPtr, int *dstWrotePtr,
1.230 + int *dstCharsPtr));
1.231 +static int UtfToUnicodeProc _ANSI_ARGS_((ClientData clientData,
1.232 + CONST char *src, int srcLen, int flags,
1.233 + Tcl_EncodingState *statePtr, char *dst, int dstLen,
1.234 + int *srcReadPtr, int *dstWrotePtr,
1.235 + int *dstCharsPtr));
1.236 +static int UtfToUtfProc _ANSI_ARGS_((ClientData clientData,
1.237 + CONST char *src, int srcLen, int flags,
1.238 + Tcl_EncodingState *statePtr, char *dst, int dstLen,
1.239 + int *srcReadPtr, int *dstWrotePtr,
1.240 + int *dstCharsPtr, int pureNullMode));
1.241 +static int UtfIntToUtfExtProc _ANSI_ARGS_((ClientData clientData,
1.242 + CONST char *src, int srcLen, int flags,
1.243 + Tcl_EncodingState *statePtr, char *dst, int dstLen,
1.244 + int *srcReadPtr, int *dstWrotePtr,
1.245 + int *dstCharsPtr));
1.246 +static int UtfExtToUtfIntProc _ANSI_ARGS_((ClientData clientData,
1.247 + CONST char *src, int srcLen, int flags,
1.248 + Tcl_EncodingState *statePtr, char *dst, int dstLen,
1.249 + int *srcReadPtr, int *dstWrotePtr,
1.250 + int *dstCharsPtr));
1.251 +static int TclFindEncodings _ANSI_ARGS_((CONST char *argv0));
1.252 +
1.253 +/*
1.254 + * A Tcl_ObjType for holding a cached Tcl_Encoding as the intrep.
1.255 + * This should help the lifetime of encodings be more useful.
1.256 + * See concerns raised in [Bug 1077262].
1.257 + */
1.258 +
1.259 +static Tcl_ObjType EncodingType = {
1.260 + "encoding", FreeEncodingIntRep, DupEncodingIntRep, NULL, NULL
1.261 +};
1.262 +
1.263 +
1.264 +/*
1.265 + *----------------------------------------------------------------------
1.266 + *
1.267 + * TclGetEncodingFromObj --
1.268 + *
1.269 + * Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr),
1.270 + * if possible, and returns TCL_OK. If no such encoding exists,
1.271 + * TCL_ERROR is returned, and if interp is non-NULL, an error message
1.272 + * is written there.
1.273 + *
1.274 + * Results:
1.275 + * Standard Tcl return code.
1.276 + *
1.277 + * Side effects:
1.278 + * Caches the Tcl_Encoding value as the internal rep of (*objPtr).
1.279 + *
1.280 + *----------------------------------------------------------------------
1.281 + */
1.282 +int
1.283 +TclGetEncodingFromObj(interp, objPtr, encodingPtr)
1.284 + Tcl_Interp *interp;
1.285 + Tcl_Obj *objPtr;
1.286 + Tcl_Encoding *encodingPtr;
1.287 +{
1.288 + CONST char *name = Tcl_GetString(objPtr);
1.289 + if (objPtr->typePtr != &EncodingType) {
1.290 + Tcl_Encoding encoding = Tcl_GetEncoding(interp, name);
1.291 +
1.292 + if (encoding == NULL) {
1.293 + return TCL_ERROR;
1.294 + }
1.295 + if (objPtr->typePtr && objPtr->typePtr->freeIntRepProc) {
1.296 + objPtr->typePtr->freeIntRepProc(objPtr);
1.297 + }
1.298 + objPtr->internalRep.otherValuePtr = (VOID *) encoding;
1.299 + objPtr->typePtr = &EncodingType;
1.300 + }
1.301 + *encodingPtr = Tcl_GetEncoding(NULL, name);
1.302 + return TCL_OK;
1.303 +}
1.304 +
1.305 +/*
1.306 + *----------------------------------------------------------------------
1.307 + *
1.308 + * FreeEncodingIntRep --
1.309 + *
1.310 + * The Tcl_FreeInternalRepProc for the "encoding" Tcl_ObjType.
1.311 + *
1.312 + *----------------------------------------------------------------------
1.313 + */
1.314 +static void
1.315 +FreeEncodingIntRep(objPtr)
1.316 + Tcl_Obj *objPtr;
1.317 +{
1.318 + Tcl_FreeEncoding((Tcl_Encoding) objPtr->internalRep.otherValuePtr);
1.319 +}
1.320 +
1.321 +/*
1.322 + *----------------------------------------------------------------------
1.323 + *
1.324 + * DupEncodingIntRep --
1.325 + *
1.326 + * The Tcl_DupInternalRepProc for the "encoding" Tcl_ObjType.
1.327 + *
1.328 + *----------------------------------------------------------------------
1.329 + */
1.330 +static void
1.331 +DupEncodingIntRep(srcPtr, dupPtr)
1.332 + Tcl_Obj *srcPtr;
1.333 + Tcl_Obj *dupPtr;
1.334 +{
1.335 + dupPtr->internalRep.otherValuePtr = (VOID *)
1.336 + Tcl_GetEncoding(NULL, srcPtr->bytes);
1.337 +}
1.338 +
1.339 +/*
1.340 + *---------------------------------------------------------------------------
1.341 + *
1.342 + * TclInitEncodingSubsystem --
1.343 + *
1.344 + * Initialize all resources used by this subsystem on a per-process
1.345 + * basis.
1.346 + *
1.347 + * Results:
1.348 + * None.
1.349 + *
1.350 + * Side effects:
1.351 + * Depends on the memory, object, and IO subsystems.
1.352 + *
1.353 + *---------------------------------------------------------------------------
1.354 + */
1.355 +
1.356 +void
1.357 +TclInitEncodingSubsystem()
1.358 +{
1.359 + Tcl_EncodingType type;
1.360 +
1.361 + Tcl_MutexLock(&encodingMutex);
1.362 + Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
1.363 + Tcl_MutexUnlock(&encodingMutex);
1.364 +
1.365 + /*
1.366 + * Create a few initial encodings. Note that the UTF-8 to UTF-8
1.367 + * translation is not a no-op, because it will turn a stream of
1.368 + * improperly formed UTF-8 into a properly formed stream.
1.369 + */
1.370 +
1.371 + type.encodingName = "identity";
1.372 + type.toUtfProc = BinaryProc;
1.373 + type.fromUtfProc = BinaryProc;
1.374 + type.freeProc = NULL;
1.375 + type.nullSize = 1;
1.376 + type.clientData = NULL;
1.377 +
1.378 + defaultEncoding = Tcl_CreateEncoding(&type);
1.379 + systemEncoding = Tcl_GetEncoding(NULL, type.encodingName);
1.380 +
1.381 + type.encodingName = "utf-8";
1.382 + type.toUtfProc = UtfExtToUtfIntProc;
1.383 + type.fromUtfProc = UtfIntToUtfExtProc;
1.384 + type.freeProc = NULL;
1.385 + type.nullSize = 1;
1.386 + type.clientData = NULL;
1.387 + Tcl_CreateEncoding(&type);
1.388 +
1.389 + type.encodingName = "unicode";
1.390 + type.toUtfProc = UnicodeToUtfProc;
1.391 + type.fromUtfProc = UtfToUnicodeProc;
1.392 + type.freeProc = NULL;
1.393 + type.nullSize = 2;
1.394 + type.clientData = NULL;
1.395 + Tcl_CreateEncoding(&type);
1.396 +}
1.397 +
1.398 +
1.399 +/*
1.400 + *----------------------------------------------------------------------
1.401 + *
1.402 + * TclFinalizeEncodingSubsystem --
1.403 + *
1.404 + * Release the state associated with the encoding subsystem.
1.405 + *
1.406 + * Results:
1.407 + * None.
1.408 + *
1.409 + * Side effects:
1.410 + * Frees all of the encodings.
1.411 + *
1.412 + *----------------------------------------------------------------------
1.413 + */
1.414 +
1.415 +void
1.416 +TclFinalizeEncodingSubsystem()
1.417 +{
1.418 + Tcl_HashSearch search;
1.419 + Tcl_HashEntry *hPtr;
1.420 +
1.421 + Tcl_MutexLock(&encodingMutex);
1.422 + encodingsInitialized = 0;
1.423 + FreeEncoding(systemEncoding);
1.424 + hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
1.425 + while (hPtr != NULL) {
1.426 + /*
1.427 + * Call FreeEncoding instead of doing it directly to handle refcounts
1.428 + * like escape encodings use. [Bug #524674]
1.429 + * Make sure to call Tcl_FirstHashEntry repeatedly so that all
1.430 + * encodings are eventually cleaned up.
1.431 + */
1.432 + FreeEncoding((Tcl_Encoding) Tcl_GetHashValue(hPtr));
1.433 + hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
1.434 + }
1.435 + Tcl_DeleteHashTable(&encodingTable);
1.436 + Tcl_MutexUnlock(&encodingMutex);
1.437 +}
1.438 +
1.439 +/*
1.440 + *-------------------------------------------------------------------------
1.441 + *
1.442 + * Tcl_GetDefaultEncodingDir --
1.443 + *
1.444 + *
1.445 + * Results:
1.446 + *
1.447 + * Side effects:
1.448 + *
1.449 + *-------------------------------------------------------------------------
1.450 + */
1.451 +
1.452 +EXPORT_C CONST char *
1.453 +Tcl_GetDefaultEncodingDir()
1.454 +{
1.455 + return tclDefaultEncodingDir;
1.456 +}
1.457 +
1.458 +/*
1.459 + *-------------------------------------------------------------------------
1.460 + *
1.461 + * Tcl_SetDefaultEncodingDir --
1.462 + *
1.463 + *
1.464 + * Results:
1.465 + *
1.466 + * Side effects:
1.467 + *
1.468 + *-------------------------------------------------------------------------
1.469 + */
1.470 +
1.471 +EXPORT_C void
1.472 +Tcl_SetDefaultEncodingDir(path)
1.473 + CONST char *path;
1.474 +{
1.475 + tclDefaultEncodingDir = (char *)ckalloc((unsigned) strlen(path) + 1);
1.476 + strcpy(tclDefaultEncodingDir, path);
1.477 +}
1.478 +
1.479 +/*
1.480 + *-------------------------------------------------------------------------
1.481 + *
1.482 + * Tcl_GetEncoding --
1.483 + *
1.484 + * Given the name of a encoding, find the corresponding Tcl_Encoding
1.485 + * token. If the encoding did not already exist, Tcl attempts to
1.486 + * dynamically load an encoding by that name.
1.487 + *
1.488 + * Results:
1.489 + * Returns a token that represents the encoding. If the name didn't
1.490 + * refer to any known or loadable encoding, NULL is returned. If
1.491 + * NULL was returned, an error message is left in interp's result
1.492 + * object, unless interp was NULL.
1.493 + *
1.494 + * Side effects:
1.495 + * The new encoding type is entered into a table visible to all
1.496 + * interpreters, keyed off the encoding's name. For each call to
1.497 + * this procedure, there should eventually be a call to
1.498 + * Tcl_FreeEncoding, so that the database can be cleaned up when
1.499 + * encodings aren't needed anymore.
1.500 + *
1.501 + *-------------------------------------------------------------------------
1.502 + */
1.503 +
1.504 +EXPORT_C Tcl_Encoding
1.505 +Tcl_GetEncoding(interp, name)
1.506 + Tcl_Interp *interp; /* Interp for error reporting, if not NULL. */
1.507 + CONST char *name; /* The name of the desired encoding. */
1.508 +{
1.509 + Tcl_HashEntry *hPtr;
1.510 + Encoding *encodingPtr;
1.511 +
1.512 + Tcl_MutexLock(&encodingMutex);
1.513 + if (name == NULL) {
1.514 + encodingPtr = (Encoding *) systemEncoding;
1.515 + encodingPtr->refCount++;
1.516 + Tcl_MutexUnlock(&encodingMutex);
1.517 + return systemEncoding;
1.518 + }
1.519 +
1.520 + hPtr = Tcl_FindHashEntry(&encodingTable, name);
1.521 + if (hPtr != NULL) {
1.522 + encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr);
1.523 + encodingPtr->refCount++;
1.524 + Tcl_MutexUnlock(&encodingMutex);
1.525 + return (Tcl_Encoding) encodingPtr;
1.526 + }
1.527 + Tcl_MutexUnlock(&encodingMutex);
1.528 + return LoadEncodingFile(interp, name);
1.529 +}
1.530 +
1.531 +/*
1.532 + *---------------------------------------------------------------------------
1.533 + *
1.534 + * Tcl_FreeEncoding --
1.535 + *
1.536 + * This procedure is called to release an encoding allocated by
1.537 + * Tcl_CreateEncoding() or Tcl_GetEncoding().
1.538 + *
1.539 + * Results:
1.540 + * None.
1.541 + *
1.542 + * Side effects:
1.543 + * The reference count associated with the encoding is decremented
1.544 + * and the encoding may be deleted if nothing is using it anymore.
1.545 + *
1.546 + *---------------------------------------------------------------------------
1.547 + */
1.548 +
1.549 +EXPORT_C void
1.550 +Tcl_FreeEncoding(encoding)
1.551 + Tcl_Encoding encoding;
1.552 +{
1.553 + Tcl_MutexLock(&encodingMutex);
1.554 + FreeEncoding(encoding);
1.555 + Tcl_MutexUnlock(&encodingMutex);
1.556 +}
1.557 +
1.558 +/*
1.559 + *----------------------------------------------------------------------
1.560 + *
1.561 + * FreeEncoding --
1.562 + *
1.563 + * This procedure is called to release an encoding by procedures
1.564 + * that already have the encodingMutex.
1.565 + *
1.566 + * Results:
1.567 + * None.
1.568 + *
1.569 + * Side effects:
1.570 + * The reference count associated with the encoding is decremented
1.571 + * and the encoding may be deleted if nothing is using it anymore.
1.572 + *
1.573 + *----------------------------------------------------------------------
1.574 + */
1.575 +
1.576 +static void
1.577 +FreeEncoding(encoding)
1.578 + Tcl_Encoding encoding;
1.579 +{
1.580 + Encoding *encodingPtr;
1.581 +
1.582 + encodingPtr = (Encoding *) encoding;
1.583 + if (encodingPtr == NULL) {
1.584 + return;
1.585 + }
1.586 + encodingPtr->refCount--;
1.587 + if (encodingPtr->refCount == 0) {
1.588 + if (encodingPtr->freeProc != NULL) {
1.589 + (*encodingPtr->freeProc)(encodingPtr->clientData);
1.590 + }
1.591 + if (encodingPtr->hPtr != NULL) {
1.592 + Tcl_DeleteHashEntry(encodingPtr->hPtr);
1.593 + }
1.594 + ckfree((char *) encodingPtr->name);
1.595 + ckfree((char *) encodingPtr);
1.596 + }
1.597 +}
1.598 +
1.599 +/*
1.600 + *-------------------------------------------------------------------------
1.601 + *
1.602 + * Tcl_GetEncodingName --
1.603 + *
1.604 + * Given an encoding, return the name that was used to constuct
1.605 + * the encoding.
1.606 + *
1.607 + * Results:
1.608 + * The name of the encoding.
1.609 + *
1.610 + * Side effects:
1.611 + * None.
1.612 + *
1.613 + *---------------------------------------------------------------------------
1.614 + */
1.615 +
1.616 +EXPORT_C CONST char *
1.617 +Tcl_GetEncodingName(encoding)
1.618 + Tcl_Encoding encoding; /* The encoding whose name to fetch. */
1.619 +{
1.620 + Encoding *encodingPtr;
1.621 +
1.622 + if (encoding == NULL) {
1.623 + encoding = systemEncoding;
1.624 + }
1.625 + encodingPtr = (Encoding *) encoding;
1.626 + return encodingPtr->name;
1.627 +}
1.628 +
1.629 +/*
1.630 + *-------------------------------------------------------------------------
1.631 + *
1.632 + * Tcl_GetEncodingNames --
1.633 + *
1.634 + * Get the list of all known encodings, including the ones stored
1.635 + * as files on disk in the encoding path.
1.636 + *
1.637 + * Results:
1.638 + * Modifies interp's result object to hold a list of all the available
1.639 + * encodings.
1.640 + *
1.641 + * Side effects:
1.642 + * None.
1.643 + *
1.644 + *-------------------------------------------------------------------------
1.645 + */
1.646 +
1.647 +EXPORT_C void
1.648 +Tcl_GetEncodingNames(interp)
1.649 + Tcl_Interp *interp; /* Interp to hold result. */
1.650 +{
1.651 + Tcl_HashSearch search;
1.652 + Tcl_HashEntry *hPtr;
1.653 + Tcl_Obj *pathPtr, *resultPtr;
1.654 + int dummy;
1.655 +
1.656 + Tcl_HashTable table;
1.657 +
1.658 + Tcl_MutexLock(&encodingMutex);
1.659 + Tcl_InitHashTable(&table, TCL_STRING_KEYS);
1.660 + hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
1.661 + while (hPtr != NULL) {
1.662 + Encoding *encodingPtr;
1.663 +
1.664 + encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr);
1.665 + Tcl_CreateHashEntry(&table, encodingPtr->name, &dummy);
1.666 + hPtr = Tcl_NextHashEntry(&search);
1.667 + }
1.668 + Tcl_MutexUnlock(&encodingMutex);
1.669 +
1.670 + pathPtr = TclGetLibraryPath();
1.671 + if (pathPtr != NULL) {
1.672 + int i, objc;
1.673 + Tcl_Obj **objv;
1.674 + char globArgString[10];
1.675 + Tcl_Obj* encodingObj = Tcl_NewStringObj("encoding",-1);
1.676 + Tcl_IncrRefCount(encodingObj);
1.677 +
1.678 + objc = 0;
1.679 + Tcl_ListObjGetElements(NULL, pathPtr, &objc, &objv);
1.680 +
1.681 + for (i = 0; i < objc; i++) {
1.682 + Tcl_Obj *searchIn;
1.683 +
1.684 + /*
1.685 + * Construct the path from the element of pathPtr,
1.686 + * joined with 'encoding'.
1.687 + */
1.688 + searchIn = Tcl_FSJoinToPath(objv[i],1,&encodingObj);
1.689 + Tcl_IncrRefCount(searchIn);
1.690 + Tcl_ResetResult(interp);
1.691 +
1.692 + /*
1.693 + * TclGlob() changes the contents of globArgString, which causes
1.694 + * a segfault if we pass in a pointer to non-writeable memory.
1.695 + * TclGlob() puts its results directly into interp.
1.696 + */
1.697 +
1.698 + strcpy(globArgString, "*.enc");
1.699 + /*
1.700 + * The GLOBMODE_TAILS flag returns just the tail of each file
1.701 + * which is the encoding name with a .enc extension
1.702 + */
1.703 + if ((TclGlob(interp, globArgString, searchIn,
1.704 + TCL_GLOBMODE_TAILS, NULL) == TCL_OK)) {
1.705 + int objc2 = 0;
1.706 + Tcl_Obj **objv2;
1.707 + int j;
1.708 +
1.709 + Tcl_ListObjGetElements(NULL, Tcl_GetObjResult(interp), &objc2,
1.710 + &objv2);
1.711 +
1.712 + for (j = 0; j < objc2; j++) {
1.713 + int length;
1.714 + char *string;
1.715 + string = Tcl_GetStringFromObj(objv2[j], &length);
1.716 + length -= 4;
1.717 + if (length > 0) {
1.718 + string[length] = '\0';
1.719 + Tcl_CreateHashEntry(&table, string, &dummy);
1.720 + string[length] = '.';
1.721 + }
1.722 + }
1.723 + }
1.724 + Tcl_DecrRefCount(searchIn);
1.725 + }
1.726 + Tcl_DecrRefCount(encodingObj);
1.727 + }
1.728 +
1.729 + /*
1.730 + * Clear any values placed in the result by globbing.
1.731 + */
1.732 +
1.733 + Tcl_ResetResult(interp);
1.734 + resultPtr = Tcl_GetObjResult(interp);
1.735 +
1.736 + hPtr = Tcl_FirstHashEntry(&table, &search);
1.737 + while (hPtr != NULL) {
1.738 + Tcl_Obj *strPtr;
1.739 +
1.740 + strPtr = Tcl_NewStringObj(Tcl_GetHashKey(&table, hPtr), -1);
1.741 + Tcl_ListObjAppendElement(NULL, resultPtr, strPtr);
1.742 + hPtr = Tcl_NextHashEntry(&search);
1.743 + }
1.744 + Tcl_DeleteHashTable(&table);
1.745 +}
1.746 +
1.747 +/*
1.748 + *------------------------------------------------------------------------
1.749 + *
1.750 + * Tcl_SetSystemEncoding --
1.751 + *
1.752 + * Sets the default encoding that should be used whenever the user
1.753 + * passes a NULL value in to one of the conversion routines.
1.754 + * If the supplied name is NULL, the system encoding is reset to the
1.755 + * default system encoding.
1.756 + *
1.757 + * Results:
1.758 + * The return value is TCL_OK if the system encoding was successfully
1.759 + * set to the encoding specified by name, TCL_ERROR otherwise. If
1.760 + * TCL_ERROR is returned, an error message is left in interp's result
1.761 + * object, unless interp was NULL.
1.762 + *
1.763 + * Side effects:
1.764 + * The reference count of the new system encoding is incremented.
1.765 + * The reference count of the old system encoding is decremented and
1.766 + * it may be freed.
1.767 + *
1.768 + *------------------------------------------------------------------------
1.769 + */
1.770 +
1.771 +EXPORT_C int
1.772 +Tcl_SetSystemEncoding(interp, name)
1.773 + Tcl_Interp *interp; /* Interp for error reporting, if not NULL. */
1.774 + CONST char *name; /* The name of the desired encoding, or NULL
1.775 + * to reset to default encoding. */
1.776 +{
1.777 + Tcl_Encoding encoding;
1.778 + Encoding *encodingPtr;
1.779 +
1.780 + if (name == NULL) {
1.781 + Tcl_MutexLock(&encodingMutex);
1.782 + encoding = defaultEncoding;
1.783 + encodingPtr = (Encoding *) encoding;
1.784 + encodingPtr->refCount++;
1.785 + Tcl_MutexUnlock(&encodingMutex);
1.786 + } else {
1.787 + encoding = Tcl_GetEncoding(interp, name);
1.788 + if (encoding == NULL) {
1.789 + return TCL_ERROR;
1.790 + }
1.791 + }
1.792 +
1.793 + Tcl_MutexLock(&encodingMutex);
1.794 + FreeEncoding(systemEncoding);
1.795 + systemEncoding = encoding;
1.796 + Tcl_MutexUnlock(&encodingMutex);
1.797 +
1.798 + return TCL_OK;
1.799 +}
1.800 +
1.801 +/*
1.802 + *---------------------------------------------------------------------------
1.803 + *
1.804 + * Tcl_CreateEncoding --
1.805 + *
1.806 + * This procedure is called to define a new encoding and the procedures
1.807 + * that are used to convert between the specified encoding and Unicode.
1.808 + *
1.809 + * Results:
1.810 + * Returns a token that represents the encoding. If an encoding with
1.811 + * the same name already existed, the old encoding token remains
1.812 + * valid and continues to behave as it used to, and will eventually
1.813 + * be garbage collected when the last reference to it goes away. Any
1.814 + * subsequent calls to Tcl_GetEncoding with the specified name will
1.815 + * retrieve the most recent encoding token.
1.816 + *
1.817 + * Side effects:
1.818 + * The new encoding type is entered into a table visible to all
1.819 + * interpreters, keyed off the encoding's name. For each call to
1.820 + * this procedure, there should eventually be a call to
1.821 + * Tcl_FreeEncoding, so that the database can be cleaned up when
1.822 + * encodings aren't needed anymore.
1.823 + *
1.824 + *---------------------------------------------------------------------------
1.825 + */
1.826 +
1.827 +EXPORT_C Tcl_Encoding
1.828 +Tcl_CreateEncoding(typePtr)
1.829 + Tcl_EncodingType *typePtr; /* The encoding type. */
1.830 +{
1.831 + Tcl_HashEntry *hPtr;
1.832 + int new;
1.833 + Encoding *encodingPtr;
1.834 + char *name;
1.835 +
1.836 + Tcl_MutexLock(&encodingMutex);
1.837 + hPtr = Tcl_CreateHashEntry(&encodingTable, typePtr->encodingName, &new);
1.838 + if (new == 0) {
1.839 + /*
1.840 + * Remove old encoding from hash table, but don't delete it until
1.841 + * last reference goes away.
1.842 + */
1.843 +
1.844 + encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr);
1.845 + encodingPtr->hPtr = NULL;
1.846 + }
1.847 +
1.848 + name = ckalloc((unsigned) strlen(typePtr->encodingName) + 1);
1.849 +
1.850 + encodingPtr = (Encoding *) ckalloc(sizeof(Encoding));
1.851 + encodingPtr->name = strcpy(name, typePtr->encodingName);
1.852 + encodingPtr->toUtfProc = typePtr->toUtfProc;
1.853 + encodingPtr->fromUtfProc = typePtr->fromUtfProc;
1.854 + encodingPtr->freeProc = typePtr->freeProc;
1.855 + encodingPtr->nullSize = typePtr->nullSize;
1.856 + encodingPtr->clientData = typePtr->clientData;
1.857 + if (typePtr->nullSize == 1) {
1.858 + encodingPtr->lengthProc = (LengthProc *) strlen;
1.859 + } else {
1.860 + encodingPtr->lengthProc = (LengthProc *) unilen;
1.861 + }
1.862 + encodingPtr->refCount = 1;
1.863 + encodingPtr->hPtr = hPtr;
1.864 + Tcl_SetHashValue(hPtr, encodingPtr);
1.865 +
1.866 + Tcl_MutexUnlock(&encodingMutex);
1.867 +
1.868 + return (Tcl_Encoding) encodingPtr;
1.869 +}
1.870 +
1.871 +/*
1.872 + *-------------------------------------------------------------------------
1.873 + *
1.874 + * Tcl_ExternalToUtfDString --
1.875 + *
1.876 + * Convert a source buffer from the specified encoding into UTF-8.
1.877 + * If any of the bytes in the source buffer are invalid or cannot
1.878 + * be represented in the target encoding, a default fallback
1.879 + * character will be substituted.
1.880 + *
1.881 + * Results:
1.882 + * The converted bytes are stored in the DString, which is then NULL
1.883 + * terminated. The return value is a pointer to the value stored
1.884 + * in the DString.
1.885 + *
1.886 + * Side effects:
1.887 + * None.
1.888 + *
1.889 + *-------------------------------------------------------------------------
1.890 + */
1.891 +
1.892 +EXPORT_C char *
1.893 +Tcl_ExternalToUtfDString(encoding, src, srcLen, dstPtr)
1.894 + Tcl_Encoding encoding; /* The encoding for the source string, or
1.895 + * NULL for the default system encoding. */
1.896 + CONST char *src; /* Source string in specified encoding. */
1.897 + int srcLen; /* Source string length in bytes, or < 0 for
1.898 + * encoding-specific string length. */
1.899 + Tcl_DString *dstPtr; /* Uninitialized or free DString in which
1.900 + * the converted string is stored. */
1.901 +{
1.902 + char *dst;
1.903 + Tcl_EncodingState state;
1.904 + Encoding *encodingPtr;
1.905 + int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
1.906 +
1.907 + Tcl_DStringInit(dstPtr);
1.908 + dst = Tcl_DStringValue(dstPtr);
1.909 + dstLen = dstPtr->spaceAvl - 1;
1.910 +
1.911 + if (encoding == NULL) {
1.912 + encoding = systemEncoding;
1.913 + }
1.914 + encodingPtr = (Encoding *) encoding;
1.915 +
1.916 + if (src == NULL) {
1.917 + srcLen = 0;
1.918 + } else if (srcLen < 0) {
1.919 + srcLen = (*encodingPtr->lengthProc)(src);
1.920 + }
1.921 + flags = TCL_ENCODING_START | TCL_ENCODING_END;
1.922 + while (1) {
1.923 + result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src,
1.924 + srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
1.925 + &dstChars);
1.926 + soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
1.927 + if (result != TCL_CONVERT_NOSPACE) {
1.928 + Tcl_DStringSetLength(dstPtr, soFar);
1.929 + return Tcl_DStringValue(dstPtr);
1.930 + }
1.931 + flags &= ~TCL_ENCODING_START;
1.932 + src += srcRead;
1.933 + srcLen -= srcRead;
1.934 + if (Tcl_DStringLength(dstPtr) == 0) {
1.935 + Tcl_DStringSetLength(dstPtr, dstLen);
1.936 + }
1.937 + Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
1.938 + dst = Tcl_DStringValue(dstPtr) + soFar;
1.939 + dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
1.940 + }
1.941 +}
1.942 +
1.943 +/*
1.944 + *-------------------------------------------------------------------------
1.945 + *
1.946 + * Tcl_ExternalToUtf --
1.947 + *
1.948 + * Convert a source buffer from the specified encoding into UTF-8.
1.949 + *
1.950 + * Results:
1.951 + * The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE,
1.952 + * TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE,
1.953 + * as documented in tcl.h.
1.954 + *
1.955 + * Side effects:
1.956 + * The converted bytes are stored in the output buffer.
1.957 + *
1.958 + *-------------------------------------------------------------------------
1.959 + */
1.960 +
1.961 +EXPORT_C int
1.962 +Tcl_ExternalToUtf(interp, encoding, src, srcLen, flags, statePtr, dst,
1.963 + dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr)
1.964 + Tcl_Interp *interp; /* Interp for error return, if not NULL. */
1.965 + Tcl_Encoding encoding; /* The encoding for the source string, or
1.966 + * NULL for the default system encoding. */
1.967 + CONST char *src; /* Source string in specified encoding. */
1.968 + int srcLen; /* Source string length in bytes, or < 0 for
1.969 + * encoding-specific string length. */
1.970 + int flags; /* Conversion control flags. */
1.971 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.972 + * state information used during a piecewise
1.973 + * conversion. Contents of statePtr are
1.974 + * initialized and/or reset by conversion
1.975 + * routine under control of flags argument. */
1.976 + char *dst; /* Output buffer in which converted string
1.977 + * is stored. */
1.978 + int dstLen; /* The maximum length of output buffer in
1.979 + * bytes. */
1.980 + int *srcReadPtr; /* Filled with the number of bytes from the
1.981 + * source string that were converted. This
1.982 + * may be less than the original source length
1.983 + * if there was a problem converting some
1.984 + * source characters. */
1.985 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.986 + * stored in the output buffer as a result of
1.987 + * the conversion. */
1.988 + int *dstCharsPtr; /* Filled with the number of characters that
1.989 + * correspond to the bytes stored in the
1.990 + * output buffer. */
1.991 +{
1.992 + Encoding *encodingPtr;
1.993 + int result, srcRead, dstWrote, dstChars;
1.994 + Tcl_EncodingState state;
1.995 +
1.996 + if (encoding == NULL) {
1.997 + encoding = systemEncoding;
1.998 + }
1.999 + encodingPtr = (Encoding *) encoding;
1.1000 +
1.1001 + if (src == NULL) {
1.1002 + srcLen = 0;
1.1003 + } else if (srcLen < 0) {
1.1004 + srcLen = (*encodingPtr->lengthProc)(src);
1.1005 + }
1.1006 + if (statePtr == NULL) {
1.1007 + flags |= TCL_ENCODING_START | TCL_ENCODING_END;
1.1008 + statePtr = &state;
1.1009 + }
1.1010 + if (srcReadPtr == NULL) {
1.1011 + srcReadPtr = &srcRead;
1.1012 + }
1.1013 + if (dstWrotePtr == NULL) {
1.1014 + dstWrotePtr = &dstWrote;
1.1015 + }
1.1016 + if (dstCharsPtr == NULL) {
1.1017 + dstCharsPtr = &dstChars;
1.1018 + }
1.1019 +
1.1020 + /*
1.1021 + * If there are any null characters in the middle of the buffer, they will
1.1022 + * converted to the UTF-8 null character (\xC080). To get the actual
1.1023 + * \0 at the end of the destination buffer, we need to append it manually.
1.1024 + */
1.1025 +
1.1026 + dstLen--;
1.1027 + result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src, srcLen,
1.1028 + flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
1.1029 + dstCharsPtr);
1.1030 + dst[*dstWrotePtr] = '\0';
1.1031 + return result;
1.1032 +}
1.1033 +
1.1034 +/*
1.1035 + *-------------------------------------------------------------------------
1.1036 + *
1.1037 + * Tcl_UtfToExternalDString --
1.1038 + *
1.1039 + * Convert a source buffer from UTF-8 into the specified encoding.
1.1040 + * If any of the bytes in the source buffer are invalid or cannot
1.1041 + * be represented in the target encoding, a default fallback
1.1042 + * character will be substituted.
1.1043 + *
1.1044 + * Results:
1.1045 + * The converted bytes are stored in the DString, which is then
1.1046 + * NULL terminated in an encoding-specific manner. The return value
1.1047 + * is a pointer to the value stored in the DString.
1.1048 + *
1.1049 + * Side effects:
1.1050 + * None.
1.1051 + *
1.1052 + *-------------------------------------------------------------------------
1.1053 + */
1.1054 +
1.1055 +EXPORT_C char *
1.1056 +Tcl_UtfToExternalDString(encoding, src, srcLen, dstPtr)
1.1057 + Tcl_Encoding encoding; /* The encoding for the converted string,
1.1058 + * or NULL for the default system encoding. */
1.1059 + CONST char *src; /* Source string in UTF-8. */
1.1060 + int srcLen; /* Source string length in bytes, or < 0 for
1.1061 + * strlen(). */
1.1062 + Tcl_DString *dstPtr; /* Uninitialized or free DString in which
1.1063 + * the converted string is stored. */
1.1064 +{
1.1065 + char *dst;
1.1066 + Tcl_EncodingState state;
1.1067 + Encoding *encodingPtr;
1.1068 + int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
1.1069 +
1.1070 + Tcl_DStringInit(dstPtr);
1.1071 + dst = Tcl_DStringValue(dstPtr);
1.1072 + dstLen = dstPtr->spaceAvl - 1;
1.1073 +
1.1074 + if (encoding == NULL) {
1.1075 + encoding = systemEncoding;
1.1076 + }
1.1077 + encodingPtr = (Encoding *) encoding;
1.1078 +
1.1079 + if (src == NULL) {
1.1080 + srcLen = 0;
1.1081 + } else if (srcLen < 0) {
1.1082 + srcLen = strlen(src);
1.1083 + }
1.1084 + flags = TCL_ENCODING_START | TCL_ENCODING_END;
1.1085 + while (1) {
1.1086 + result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src,
1.1087 + srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
1.1088 + &dstChars);
1.1089 + soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
1.1090 + if (result != TCL_CONVERT_NOSPACE) {
1.1091 + if (encodingPtr->nullSize == 2) {
1.1092 + Tcl_DStringSetLength(dstPtr, soFar + 1);
1.1093 + }
1.1094 + Tcl_DStringSetLength(dstPtr, soFar);
1.1095 + return Tcl_DStringValue(dstPtr);
1.1096 + }
1.1097 + flags &= ~TCL_ENCODING_START;
1.1098 + src += srcRead;
1.1099 + srcLen -= srcRead;
1.1100 + if (Tcl_DStringLength(dstPtr) == 0) {
1.1101 + Tcl_DStringSetLength(dstPtr, dstLen);
1.1102 + }
1.1103 + Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
1.1104 + dst = Tcl_DStringValue(dstPtr) + soFar;
1.1105 + dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
1.1106 + }
1.1107 +}
1.1108 +
1.1109 +/*
1.1110 + *-------------------------------------------------------------------------
1.1111 + *
1.1112 + * Tcl_UtfToExternal --
1.1113 + *
1.1114 + * Convert a buffer from UTF-8 into the specified encoding.
1.1115 + *
1.1116 + * Results:
1.1117 + * The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE,
1.1118 + * TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE,
1.1119 + * as documented in tcl.h.
1.1120 + *
1.1121 + * Side effects:
1.1122 + * The converted bytes are stored in the output buffer.
1.1123 + *
1.1124 + *-------------------------------------------------------------------------
1.1125 + */
1.1126 +
1.1127 +EXPORT_C int
1.1128 +Tcl_UtfToExternal(interp, encoding, src, srcLen, flags, statePtr, dst,
1.1129 + dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr)
1.1130 + Tcl_Interp *interp; /* Interp for error return, if not NULL. */
1.1131 + Tcl_Encoding encoding; /* The encoding for the converted string,
1.1132 + * or NULL for the default system encoding. */
1.1133 + CONST char *src; /* Source string in UTF-8. */
1.1134 + int srcLen; /* Source string length in bytes, or < 0 for
1.1135 + * strlen(). */
1.1136 + int flags; /* Conversion control flags. */
1.1137 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.1138 + * state information used during a piecewise
1.1139 + * conversion. Contents of statePtr are
1.1140 + * initialized and/or reset by conversion
1.1141 + * routine under control of flags argument. */
1.1142 + char *dst; /* Output buffer in which converted string
1.1143 + * is stored. */
1.1144 + int dstLen; /* The maximum length of output buffer in
1.1145 + * bytes. */
1.1146 + int *srcReadPtr; /* Filled with the number of bytes from the
1.1147 + * source string that were converted. This
1.1148 + * may be less than the original source length
1.1149 + * if there was a problem converting some
1.1150 + * source characters. */
1.1151 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.1152 + * stored in the output buffer as a result of
1.1153 + * the conversion. */
1.1154 + int *dstCharsPtr; /* Filled with the number of characters that
1.1155 + * correspond to the bytes stored in the
1.1156 + * output buffer. */
1.1157 +{
1.1158 + Encoding *encodingPtr;
1.1159 + int result, srcRead, dstWrote, dstChars;
1.1160 + Tcl_EncodingState state;
1.1161 +
1.1162 + if (encoding == NULL) {
1.1163 + encoding = systemEncoding;
1.1164 + }
1.1165 + encodingPtr = (Encoding *) encoding;
1.1166 +
1.1167 + if (src == NULL) {
1.1168 + srcLen = 0;
1.1169 + } else if (srcLen < 0) {
1.1170 + srcLen = strlen(src);
1.1171 + }
1.1172 + if (statePtr == NULL) {
1.1173 + flags |= TCL_ENCODING_START | TCL_ENCODING_END;
1.1174 + statePtr = &state;
1.1175 + }
1.1176 + if (srcReadPtr == NULL) {
1.1177 + srcReadPtr = &srcRead;
1.1178 + }
1.1179 + if (dstWrotePtr == NULL) {
1.1180 + dstWrotePtr = &dstWrote;
1.1181 + }
1.1182 + if (dstCharsPtr == NULL) {
1.1183 + dstCharsPtr = &dstChars;
1.1184 + }
1.1185 +
1.1186 + dstLen -= encodingPtr->nullSize;
1.1187 + result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src, srcLen,
1.1188 + flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
1.1189 + dstCharsPtr);
1.1190 + if (encodingPtr->nullSize == 2) {
1.1191 + dst[*dstWrotePtr + 1] = '\0';
1.1192 + }
1.1193 + dst[*dstWrotePtr] = '\0';
1.1194 +
1.1195 + return result;
1.1196 +}
1.1197 +
1.1198 +/*
1.1199 + *---------------------------------------------------------------------------
1.1200 + *
1.1201 + * Tcl_FindExecutable --
1.1202 + *
1.1203 + * This procedure computes the absolute path name of the current
1.1204 + * application, given its argv[0] value.
1.1205 + *
1.1206 + * Results:
1.1207 + * None.
1.1208 + *
1.1209 + * Side effects:
1.1210 + * The variable tclExecutableName gets filled in with the file
1.1211 + * name for the application, if we figured it out. If we couldn't
1.1212 + * figure it out, tclExecutableName is set to NULL.
1.1213 + *
1.1214 + *---------------------------------------------------------------------------
1.1215 + */
1.1216 +
1.1217 +EXPORT_C void
1.1218 +Tcl_FindExecutable(argv0)
1.1219 + CONST char *argv0; /* The value of the application's argv[0]
1.1220 + * (native). */
1.1221 +{
1.1222 + int mustCleanUtf;
1.1223 + CONST char *name;
1.1224 + Tcl_DString buffer, nameString;
1.1225 +
1.1226 + TclInitSubsystems(argv0);
1.1227 +
1.1228 + if (argv0 == NULL) {
1.1229 + goto done;
1.1230 + }
1.1231 + if (tclExecutableName != NULL) {
1.1232 + ckfree(tclExecutableName);
1.1233 + tclExecutableName = NULL;
1.1234 + }
1.1235 + if ((name = TclpFindExecutable(argv0)) == NULL) {
1.1236 + goto done;
1.1237 + }
1.1238 +
1.1239 + /*
1.1240 + * The value returned from TclpNameOfExecutable is a UTF string that
1.1241 + * is possibly dirty depending on when it was initialized.
1.1242 + * TclFindEncodings will indicate whether we must "clean" the UTF (as
1.1243 + * reported by the underlying system). To assure that the UTF string
1.1244 + * is a properly encoded native string for this system, convert the
1.1245 + * UTF string to the default native encoding before the default
1.1246 + * encoding is initialized. Then, convert it back to UTF after the
1.1247 + * system encoding is loaded.
1.1248 + */
1.1249 +
1.1250 + Tcl_UtfToExternalDString(NULL, name, -1, &buffer);
1.1251 + mustCleanUtf = TclFindEncodings(argv0);
1.1252 +
1.1253 + /*
1.1254 + * Now it is OK to convert the native string back to UTF and set
1.1255 + * the value of the tclExecutableName.
1.1256 + */
1.1257 +
1.1258 + if (mustCleanUtf) {
1.1259 + Tcl_ExternalToUtfDString(NULL, Tcl_DStringValue(&buffer), -1,
1.1260 + &nameString);
1.1261 + tclExecutableName = (char *)
1.1262 + ckalloc((unsigned) (Tcl_DStringLength(&nameString) + 1));
1.1263 + strcpy(tclExecutableName, Tcl_DStringValue(&nameString));
1.1264 +
1.1265 + Tcl_DStringFree(&nameString);
1.1266 + } else {
1.1267 + tclExecutableName = (char *) ckalloc((unsigned) (strlen(name) + 1));
1.1268 + strcpy(tclExecutableName, name);
1.1269 + }
1.1270 + Tcl_DStringFree(&buffer);
1.1271 + return;
1.1272 +
1.1273 + done:
1.1274 + (void) TclFindEncodings(argv0);
1.1275 +}
1.1276 +
1.1277 +/*
1.1278 + *---------------------------------------------------------------------------
1.1279 + *
1.1280 + * LoadEncodingFile --
1.1281 + *
1.1282 + * Read a file that describes an encoding and create a new Encoding
1.1283 + * from the data.
1.1284 + *
1.1285 + * Results:
1.1286 + * The return value is the newly loaded Encoding, or NULL if
1.1287 + * the file didn't exist of was in the incorrect format. If NULL was
1.1288 + * returned, an error message is left in interp's result object,
1.1289 + * unless interp was NULL.
1.1290 + *
1.1291 + * Side effects:
1.1292 + * File read from disk.
1.1293 + *
1.1294 + *---------------------------------------------------------------------------
1.1295 + */
1.1296 +
1.1297 +static Tcl_Encoding
1.1298 +LoadEncodingFile(interp, name)
1.1299 + Tcl_Interp *interp; /* Interp for error reporting, if not NULL. */
1.1300 + CONST char *name; /* The name of the encoding file on disk
1.1301 + * and also the name for new encoding. */
1.1302 +{
1.1303 + int objc, i, ch;
1.1304 + Tcl_Obj **objv;
1.1305 + Tcl_Obj *pathPtr;
1.1306 + Tcl_Channel chan;
1.1307 + Tcl_Encoding encoding;
1.1308 +
1.1309 + pathPtr = TclGetLibraryPath();
1.1310 + if (pathPtr == NULL) {
1.1311 + goto unknown;
1.1312 + }
1.1313 + objc = 0;
1.1314 + Tcl_ListObjGetElements(NULL, pathPtr, &objc, &objv);
1.1315 +
1.1316 + chan = NULL;
1.1317 + for (i = 0; i < objc; i++) {
1.1318 + chan = OpenEncodingFile(Tcl_GetString(objv[i]), name);
1.1319 + if (chan != NULL) {
1.1320 + break;
1.1321 + }
1.1322 + }
1.1323 +
1.1324 + if (chan == NULL) {
1.1325 + goto unknown;
1.1326 + }
1.1327 +
1.1328 + Tcl_SetChannelOption(NULL, chan, "-encoding", "utf-8");
1.1329 +
1.1330 + while (1) {
1.1331 + Tcl_DString ds;
1.1332 +
1.1333 + Tcl_DStringInit(&ds);
1.1334 + Tcl_Gets(chan, &ds);
1.1335 + ch = Tcl_DStringValue(&ds)[0];
1.1336 + Tcl_DStringFree(&ds);
1.1337 + if (ch != '#') {
1.1338 + break;
1.1339 + }
1.1340 + }
1.1341 +
1.1342 + encoding = NULL;
1.1343 + switch (ch) {
1.1344 + case 'S': {
1.1345 + encoding = LoadTableEncoding(interp, name, ENCODING_SINGLEBYTE,
1.1346 + chan);
1.1347 + break;
1.1348 + }
1.1349 + case 'D': {
1.1350 + encoding = LoadTableEncoding(interp, name, ENCODING_DOUBLEBYTE,
1.1351 + chan);
1.1352 + break;
1.1353 + }
1.1354 + case 'M': {
1.1355 + encoding = LoadTableEncoding(interp, name, ENCODING_MULTIBYTE,
1.1356 + chan);
1.1357 + break;
1.1358 + }
1.1359 + case 'E': {
1.1360 + encoding = LoadEscapeEncoding(name, chan);
1.1361 + break;
1.1362 + }
1.1363 + }
1.1364 + if ((encoding == NULL) && (interp != NULL)) {
1.1365 + Tcl_AppendResult(interp, "invalid encoding file \"", name, "\"", NULL);
1.1366 + if (ch == 'E') {
1.1367 + Tcl_AppendResult(interp, " or missing sub-encoding", NULL);
1.1368 + }
1.1369 + }
1.1370 + Tcl_Close(NULL, chan);
1.1371 + return encoding;
1.1372 +
1.1373 + unknown:
1.1374 + if (interp != NULL) {
1.1375 + Tcl_AppendResult(interp, "unknown encoding \"", name, "\"", NULL);
1.1376 + }
1.1377 + return NULL;
1.1378 +}
1.1379 +
1.1380 +/*
1.1381 + *----------------------------------------------------------------------
1.1382 + *
1.1383 + * OpenEncodingFile --
1.1384 + *
1.1385 + * Look for the file encoding/<name>.enc in the specified
1.1386 + * directory.
1.1387 + *
1.1388 + * Results:
1.1389 + * Returns an open file channel if the file exists.
1.1390 + *
1.1391 + * Side effects:
1.1392 + * None.
1.1393 + *
1.1394 + *----------------------------------------------------------------------
1.1395 + */
1.1396 +
1.1397 +static Tcl_Channel
1.1398 +OpenEncodingFile(dir, name)
1.1399 + CONST char *dir;
1.1400 + CONST char *name;
1.1401 +
1.1402 +{
1.1403 + CONST char *argv[3];
1.1404 + Tcl_DString pathString;
1.1405 + CONST char *path;
1.1406 + Tcl_Channel chan;
1.1407 + Tcl_Obj *pathPtr;
1.1408 +
1.1409 + argv[0] = dir;
1.1410 + argv[1] = "encoding";
1.1411 + argv[2] = name;
1.1412 +
1.1413 + Tcl_DStringInit(&pathString);
1.1414 + Tcl_JoinPath(3, argv, &pathString);
1.1415 + path = Tcl_DStringAppend(&pathString, ".enc", -1);
1.1416 + pathPtr = Tcl_NewStringObj(path,-1);
1.1417 +
1.1418 + Tcl_IncrRefCount(pathPtr);
1.1419 + chan = Tcl_FSOpenFileChannel(NULL, pathPtr, "r", 0);
1.1420 + Tcl_DecrRefCount(pathPtr);
1.1421 +
1.1422 + Tcl_DStringFree(&pathString);
1.1423 +
1.1424 + return chan;
1.1425 +}
1.1426 +
1.1427 +/*
1.1428 + *-------------------------------------------------------------------------
1.1429 + *
1.1430 + * LoadTableEncoding --
1.1431 + *
1.1432 + * Helper function for LoadEncodingTable(). Loads a table to that
1.1433 + * converts between Unicode and some other encoding and creates an
1.1434 + * encoding (using a TableEncoding structure) from that information.
1.1435 + *
1.1436 + * File contains binary data, but begins with a marker to indicate
1.1437 + * byte-ordering, so that same binary file can be read on either
1.1438 + * endian platforms.
1.1439 + *
1.1440 + * Results:
1.1441 + * The return value is the new encoding, or NULL if the encoding
1.1442 + * could not be created (because the file contained invalid data).
1.1443 + *
1.1444 + * Side effects:
1.1445 + * None.
1.1446 + *
1.1447 + *-------------------------------------------------------------------------
1.1448 + */
1.1449 +
1.1450 +static Tcl_Encoding
1.1451 +LoadTableEncoding(interp, name, type, chan)
1.1452 + Tcl_Interp *interp; /* Interp for temporary obj while reading. */
1.1453 + CONST char *name; /* Name for new encoding. */
1.1454 + int type; /* Type of encoding (ENCODING_?????). */
1.1455 + Tcl_Channel chan; /* File containing new encoding. */
1.1456 +{
1.1457 + Tcl_DString lineString;
1.1458 + Tcl_Obj *objPtr;
1.1459 + char *line;
1.1460 + int i, hi, lo, numPages, symbol, fallback;
1.1461 + unsigned char used[256];
1.1462 + unsigned int size;
1.1463 + TableEncodingData *dataPtr;
1.1464 + unsigned short *pageMemPtr;
1.1465 + Tcl_EncodingType encType;
1.1466 +
1.1467 + /*
1.1468 + * Speed over memory. Use a full 256 character table to decode hex
1.1469 + * sequences in the encoding files.
1.1470 + */
1.1471 +
1.1472 + static char staticHex[] = {
1.1473 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 ... 15 */
1.1474 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 ... 31 */
1.1475 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 32 ... 47 */
1.1476 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 48 ... 63 */
1.1477 + 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64 ... 79 */
1.1478 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 ... 95 */
1.1479 + 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96 ... 111 */
1.1480 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 112 ... 127 */
1.1481 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128 ... 143 */
1.1482 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 ... 159 */
1.1483 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 ... 175 */
1.1484 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 ... 191 */
1.1485 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 ... 207 */
1.1486 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 ... 223 */
1.1487 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 ... 239 */
1.1488 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 ... 255 */
1.1489 + };
1.1490 +
1.1491 + Tcl_DStringInit(&lineString);
1.1492 + Tcl_Gets(chan, &lineString);
1.1493 + line = Tcl_DStringValue(&lineString);
1.1494 +
1.1495 + fallback = (int) strtol(line, &line, 16);
1.1496 + symbol = (int) strtol(line, &line, 10);
1.1497 + numPages = (int) strtol(line, &line, 10);
1.1498 + Tcl_DStringFree(&lineString);
1.1499 +
1.1500 + if (numPages < 0) {
1.1501 + numPages = 0;
1.1502 + } else if (numPages > 256) {
1.1503 + numPages = 256;
1.1504 + }
1.1505 +
1.1506 + memset(used, 0, sizeof(used));
1.1507 +
1.1508 +#undef PAGESIZE
1.1509 +#define PAGESIZE (256 * sizeof(unsigned short))
1.1510 +
1.1511 + dataPtr = (TableEncodingData *) ckalloc(sizeof(TableEncodingData));
1.1512 + memset(dataPtr, 0, sizeof(TableEncodingData));
1.1513 +
1.1514 + dataPtr->fallback = fallback;
1.1515 +
1.1516 + /*
1.1517 + * Read the table that maps characters to Unicode. Performs a single
1.1518 + * malloc to get the memory for the array and all the pages needed by
1.1519 + * the array.
1.1520 + */
1.1521 +
1.1522 + size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE;
1.1523 + dataPtr->toUnicode = (unsigned short **) ckalloc(size);
1.1524 + memset(dataPtr->toUnicode, 0, size);
1.1525 + pageMemPtr = (unsigned short *) (dataPtr->toUnicode + 256);
1.1526 +
1.1527 + if (interp == NULL) {
1.1528 + objPtr = Tcl_NewObj();
1.1529 + } else {
1.1530 + objPtr = Tcl_GetObjResult(interp);
1.1531 + }
1.1532 + for (i = 0; i < numPages; i++) {
1.1533 + int ch;
1.1534 + char *p;
1.1535 +
1.1536 + Tcl_ReadChars(chan, objPtr, 3 + 16 * (16 * 4 + 1), 0);
1.1537 + p = Tcl_GetString(objPtr);
1.1538 + hi = (staticHex[(unsigned int)p[0]] << 4) + staticHex[(unsigned int)p[1]];
1.1539 + dataPtr->toUnicode[hi] = pageMemPtr;
1.1540 + p += 2;
1.1541 + for (lo = 0; lo < 256; lo++) {
1.1542 + if ((lo & 0x0f) == 0) {
1.1543 + p++;
1.1544 + }
1.1545 + ch = (staticHex[(unsigned int)p[0]] << 12) + (staticHex[(unsigned int)p[1]] << 8)
1.1546 + + (staticHex[(unsigned int)p[2]] << 4) + staticHex[(unsigned int)p[3]];
1.1547 + if (ch != 0) {
1.1548 + used[ch >> 8] = 1;
1.1549 + }
1.1550 + *pageMemPtr = (unsigned short) ch;
1.1551 + pageMemPtr++;
1.1552 + p += 4;
1.1553 + }
1.1554 + }
1.1555 + if (interp == NULL) {
1.1556 + Tcl_DecrRefCount(objPtr);
1.1557 + } else {
1.1558 + Tcl_ResetResult(interp);
1.1559 + }
1.1560 +
1.1561 + if (type == ENCODING_DOUBLEBYTE) {
1.1562 + memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes));
1.1563 + } else {
1.1564 + for (hi = 1; hi < 256; hi++) {
1.1565 + if (dataPtr->toUnicode[hi] != NULL) {
1.1566 + dataPtr->prefixBytes[hi] = 1;
1.1567 + }
1.1568 + }
1.1569 + }
1.1570 +
1.1571 + /*
1.1572 + * Invert toUnicode array to produce the fromUnicode array. Performs a
1.1573 + * single malloc to get the memory for the array and all the pages
1.1574 + * needed by the array. While reading in the toUnicode array, we
1.1575 + * remembered what pages that would be needed for the fromUnicode array.
1.1576 + */
1.1577 +
1.1578 + if (symbol) {
1.1579 + used[0] = 1;
1.1580 + }
1.1581 + numPages = 0;
1.1582 + for (hi = 0; hi < 256; hi++) {
1.1583 + if (used[hi]) {
1.1584 + numPages++;
1.1585 + }
1.1586 + }
1.1587 + size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE;
1.1588 + dataPtr->fromUnicode = (unsigned short **) ckalloc(size);
1.1589 + memset(dataPtr->fromUnicode, 0, size);
1.1590 + pageMemPtr = (unsigned short *) (dataPtr->fromUnicode + 256);
1.1591 +
1.1592 + for (hi = 0; hi < 256; hi++) {
1.1593 + if (dataPtr->toUnicode[hi] == NULL) {
1.1594 + dataPtr->toUnicode[hi] = emptyPage;
1.1595 + } else {
1.1596 + for (lo = 0; lo < 256; lo++) {
1.1597 + int ch;
1.1598 +
1.1599 + ch = dataPtr->toUnicode[hi][lo];
1.1600 + if (ch != 0) {
1.1601 + unsigned short *page;
1.1602 +
1.1603 + page = dataPtr->fromUnicode[ch >> 8];
1.1604 + if (page == NULL) {
1.1605 + page = pageMemPtr;
1.1606 + pageMemPtr += 256;
1.1607 + dataPtr->fromUnicode[ch >> 8] = page;
1.1608 + }
1.1609 + page[ch & 0xff] = (unsigned short) ((hi << 8) + lo);
1.1610 + }
1.1611 + }
1.1612 + }
1.1613 + }
1.1614 + if (type == ENCODING_MULTIBYTE) {
1.1615 + /*
1.1616 + * If multibyte encodings don't have a backslash character, define
1.1617 + * one. Otherwise, on Windows, native file names won't work because
1.1618 + * the backslash in the file name will map to the unknown character
1.1619 + * (question mark) when converting from UTF-8 to external encoding.
1.1620 + */
1.1621 +
1.1622 + if (dataPtr->fromUnicode[0] != NULL) {
1.1623 + if (dataPtr->fromUnicode[0]['\\'] == '\0') {
1.1624 + dataPtr->fromUnicode[0]['\\'] = '\\';
1.1625 + }
1.1626 + }
1.1627 + }
1.1628 + if (symbol) {
1.1629 + unsigned short *page;
1.1630 +
1.1631 + /*
1.1632 + * Make a special symbol encoding that not only maps the symbol
1.1633 + * characters from their Unicode code points down into page 0, but
1.1634 + * also ensure that the characters on page 0 map to themselves.
1.1635 + * This is so that a symbol font can be used to display a simple
1.1636 + * string like "abcd" and have alpha, beta, chi, delta show up,
1.1637 + * rather than have "unknown" chars show up because strictly
1.1638 + * speaking the symbol font doesn't have glyphs for those low ascii
1.1639 + * chars.
1.1640 + */
1.1641 +
1.1642 + page = dataPtr->fromUnicode[0];
1.1643 + if (page == NULL) {
1.1644 + page = pageMemPtr;
1.1645 + dataPtr->fromUnicode[0] = page;
1.1646 + }
1.1647 + for (lo = 0; lo < 256; lo++) {
1.1648 + if (dataPtr->toUnicode[0][lo] != 0) {
1.1649 + page[lo] = (unsigned short) lo;
1.1650 + }
1.1651 + }
1.1652 + }
1.1653 + for (hi = 0; hi < 256; hi++) {
1.1654 + if (dataPtr->fromUnicode[hi] == NULL) {
1.1655 + dataPtr->fromUnicode[hi] = emptyPage;
1.1656 + }
1.1657 + }
1.1658 + /*
1.1659 + * For trailing 'R'everse encoding, see [Patch #689341]
1.1660 + */
1.1661 + Tcl_DStringInit(&lineString);
1.1662 + do {
1.1663 + int len;
1.1664 + /* skip leading empty lines */
1.1665 + while ((len = Tcl_Gets(chan, &lineString)) == 0)
1.1666 + ;
1.1667 + if (len < 0) {
1.1668 + break;
1.1669 + }
1.1670 + line = Tcl_DStringValue(&lineString);
1.1671 + if (line[0] != 'R') {
1.1672 + break;
1.1673 + }
1.1674 + for (Tcl_DStringSetLength(&lineString, 0);
1.1675 + (len = Tcl_Gets(chan, &lineString)) >= 0;
1.1676 + Tcl_DStringSetLength(&lineString, 0)) {
1.1677 + unsigned char* p;
1.1678 + int to, from;
1.1679 + if (len < 5) {
1.1680 + continue;
1.1681 + }
1.1682 + p = (unsigned char*) Tcl_DStringValue(&lineString);
1.1683 + to = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8)
1.1684 + + (staticHex[p[2]] << 4) + staticHex[p[3]];
1.1685 + if (to == 0) {
1.1686 + continue;
1.1687 + }
1.1688 + for (p += 5, len -= 5; len >= 0 && *p; p += 5, len -= 5) {
1.1689 + from = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8)
1.1690 + + (staticHex[p[2]] << 4) + staticHex[p[3]];
1.1691 + if (from == 0) {
1.1692 + continue;
1.1693 + }
1.1694 + dataPtr->fromUnicode[from >> 8][from & 0xff] = to;
1.1695 + }
1.1696 + }
1.1697 + } while (0);
1.1698 + Tcl_DStringFree(&lineString);
1.1699 +
1.1700 + encType.encodingName = name;
1.1701 + encType.toUtfProc = TableToUtfProc;
1.1702 + encType.fromUtfProc = TableFromUtfProc;
1.1703 + encType.freeProc = TableFreeProc;
1.1704 + encType.nullSize = (type == ENCODING_DOUBLEBYTE) ? 2 : 1;
1.1705 + encType.clientData = (ClientData) dataPtr;
1.1706 + return Tcl_CreateEncoding(&encType);
1.1707 +}
1.1708 +
1.1709 +/*
1.1710 + *-------------------------------------------------------------------------
1.1711 + *
1.1712 + * LoadEscapeEncoding --
1.1713 + *
1.1714 + * Helper function for LoadEncodingTable(). Loads a state machine
1.1715 + * that converts between Unicode and some other encoding.
1.1716 + *
1.1717 + * File contains text data that describes the escape sequences that
1.1718 + * are used to choose an encoding and the associated names for the
1.1719 + * sub-encodings.
1.1720 + *
1.1721 + * Results:
1.1722 + * The return value is the new encoding, or NULL if the encoding
1.1723 + * could not be created (because the file contained invalid data).
1.1724 + *
1.1725 + * Side effects:
1.1726 + * None.
1.1727 + *
1.1728 + *-------------------------------------------------------------------------
1.1729 + */
1.1730 +
1.1731 +static Tcl_Encoding
1.1732 +LoadEscapeEncoding(name, chan)
1.1733 + CONST char *name; /* Name for new encoding. */
1.1734 + Tcl_Channel chan; /* File containing new encoding. */
1.1735 +{
1.1736 + int i, missingSubEncoding = 0;
1.1737 + unsigned int size;
1.1738 + Tcl_DString escapeData;
1.1739 + char init[16], final[16];
1.1740 + EscapeEncodingData *dataPtr;
1.1741 + Tcl_EncodingType type;
1.1742 +
1.1743 + init[0] = '\0';
1.1744 + final[0] = '\0';
1.1745 + Tcl_DStringInit(&escapeData);
1.1746 +
1.1747 + while (1) {
1.1748 + int argc;
1.1749 + CONST char **argv;
1.1750 + char *line;
1.1751 + Tcl_DString lineString;
1.1752 +
1.1753 + Tcl_DStringInit(&lineString);
1.1754 + if (Tcl_Gets(chan, &lineString) < 0) {
1.1755 + break;
1.1756 + }
1.1757 + line = Tcl_DStringValue(&lineString);
1.1758 + if (Tcl_SplitList(NULL, line, &argc, &argv) != TCL_OK) {
1.1759 + continue;
1.1760 + }
1.1761 + if (argc >= 2) {
1.1762 + if (strcmp(argv[0], "name") == 0) {
1.1763 + ;
1.1764 + } else if (strcmp(argv[0], "init") == 0) {
1.1765 + strncpy(init, argv[1], sizeof(init));
1.1766 + init[sizeof(init) - 1] = '\0';
1.1767 + } else if (strcmp(argv[0], "final") == 0) {
1.1768 + strncpy(final, argv[1], sizeof(final));
1.1769 + final[sizeof(final) - 1] = '\0';
1.1770 + } else {
1.1771 + EscapeSubTable est;
1.1772 +
1.1773 + strncpy(est.sequence, argv[1], sizeof(est.sequence));
1.1774 + est.sequence[sizeof(est.sequence) - 1] = '\0';
1.1775 + est.sequenceLen = strlen(est.sequence);
1.1776 +
1.1777 + strncpy(est.name, argv[0], sizeof(est.name));
1.1778 + est.name[sizeof(est.name) - 1] = '\0';
1.1779 +
1.1780 + /*
1.1781 + * Load the subencodings first so we're never stuck
1.1782 + * trying to use a half-loaded system encoding to
1.1783 + * open/read a *.enc file.
1.1784 + */
1.1785 +
1.1786 + est.encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, est.name);
1.1787 + if ((est.encodingPtr == NULL)
1.1788 + || (est.encodingPtr->toUtfProc != TableToUtfProc)) {
1.1789 + missingSubEncoding = 1;
1.1790 + }
1.1791 + Tcl_DStringAppend(&escapeData, (char *) &est, sizeof(est));
1.1792 + }
1.1793 + }
1.1794 + ckfree((char *) argv);
1.1795 + Tcl_DStringFree(&lineString);
1.1796 + }
1.1797 + if (missingSubEncoding) {
1.1798 + Tcl_DStringFree(&escapeData);
1.1799 + return NULL;
1.1800 + }
1.1801 +
1.1802 + size = sizeof(EscapeEncodingData)
1.1803 + - sizeof(EscapeSubTable) + Tcl_DStringLength(&escapeData);
1.1804 + dataPtr = (EscapeEncodingData *) ckalloc(size);
1.1805 + dataPtr->initLen = strlen(init);
1.1806 + strcpy(dataPtr->init, init);
1.1807 + dataPtr->finalLen = strlen(final);
1.1808 + strcpy(dataPtr->final, final);
1.1809 + dataPtr->numSubTables = Tcl_DStringLength(&escapeData) / sizeof(EscapeSubTable);
1.1810 + memcpy((VOID *) dataPtr->subTables, (VOID *) Tcl_DStringValue(&escapeData),
1.1811 + (size_t) Tcl_DStringLength(&escapeData));
1.1812 + Tcl_DStringFree(&escapeData);
1.1813 +
1.1814 + memset(dataPtr->prefixBytes, 0, sizeof(dataPtr->prefixBytes));
1.1815 + for (i = 0; i < dataPtr->numSubTables; i++) {
1.1816 + dataPtr->prefixBytes[UCHAR(dataPtr->subTables[i].sequence[0])] = 1;
1.1817 + }
1.1818 + if (dataPtr->init[0] != '\0') {
1.1819 + dataPtr->prefixBytes[UCHAR(dataPtr->init[0])] = 1;
1.1820 + }
1.1821 + if (dataPtr->final[0] != '\0') {
1.1822 + dataPtr->prefixBytes[UCHAR(dataPtr->final[0])] = 1;
1.1823 + }
1.1824 +
1.1825 + type.encodingName = name;
1.1826 + type.toUtfProc = EscapeToUtfProc;
1.1827 + type.fromUtfProc = EscapeFromUtfProc;
1.1828 + type.freeProc = EscapeFreeProc;
1.1829 + type.nullSize = 1;
1.1830 + type.clientData = (ClientData) dataPtr;
1.1831 +
1.1832 + return Tcl_CreateEncoding(&type);
1.1833 +}
1.1834 +
1.1835 +/*
1.1836 + *-------------------------------------------------------------------------
1.1837 + *
1.1838 + * BinaryProc --
1.1839 + *
1.1840 + * The default conversion when no other conversion is specified.
1.1841 + * No translation is done; source bytes are copied directly to
1.1842 + * destination bytes.
1.1843 + *
1.1844 + * Results:
1.1845 + * Returns TCL_OK if conversion was successful.
1.1846 + *
1.1847 + * Side effects:
1.1848 + * None.
1.1849 + *
1.1850 + *-------------------------------------------------------------------------
1.1851 + */
1.1852 +
1.1853 +static int
1.1854 +BinaryProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.1855 + srcReadPtr, dstWrotePtr, dstCharsPtr)
1.1856 + ClientData clientData; /* Not used. */
1.1857 + CONST char *src; /* Source string (unknown encoding). */
1.1858 + int srcLen; /* Source string length in bytes. */
1.1859 + int flags; /* Conversion control flags. */
1.1860 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.1861 + * state information used during a piecewise
1.1862 + * conversion. Contents of statePtr are
1.1863 + * initialized and/or reset by conversion
1.1864 + * routine under control of flags argument. */
1.1865 + char *dst; /* Output buffer in which converted string
1.1866 + * is stored. */
1.1867 + int dstLen; /* The maximum length of output buffer in
1.1868 + * bytes. */
1.1869 + int *srcReadPtr; /* Filled with the number of bytes from the
1.1870 + * source string that were converted. */
1.1871 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.1872 + * stored in the output buffer as a result of
1.1873 + * the conversion. */
1.1874 + int *dstCharsPtr; /* Filled with the number of characters that
1.1875 + * correspond to the bytes stored in the
1.1876 + * output buffer. */
1.1877 +{
1.1878 + int result;
1.1879 +
1.1880 + result = TCL_OK;
1.1881 + dstLen -= TCL_UTF_MAX - 1;
1.1882 + if (dstLen < 0) {
1.1883 + dstLen = 0;
1.1884 + }
1.1885 + if (srcLen > dstLen) {
1.1886 + srcLen = dstLen;
1.1887 + result = TCL_CONVERT_NOSPACE;
1.1888 + }
1.1889 +
1.1890 + *srcReadPtr = srcLen;
1.1891 + *dstWrotePtr = srcLen;
1.1892 + *dstCharsPtr = srcLen;
1.1893 + memcpy((void *) dst, (void *) src, (size_t) srcLen);
1.1894 + return result;
1.1895 +}
1.1896 +
1.1897 +
1.1898 +/*
1.1899 + *-------------------------------------------------------------------------
1.1900 + *
1.1901 + * UtfExtToUtfIntProc --
1.1902 + *
1.1903 + * Convert from UTF-8 to UTF-8. While converting null-bytes from
1.1904 + * the Tcl's internal representation (0xc0, 0x80) to the official
1.1905 + * representation (0x00). See UtfToUtfProc for details.
1.1906 + *
1.1907 + * Results:
1.1908 + * Returns TCL_OK if conversion was successful.
1.1909 + *
1.1910 + * Side effects:
1.1911 + * None.
1.1912 + *
1.1913 + *-------------------------------------------------------------------------
1.1914 + */
1.1915 +static int
1.1916 +UtfIntToUtfExtProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.1917 + srcReadPtr, dstWrotePtr, dstCharsPtr)
1.1918 + ClientData clientData; /* Not used. */
1.1919 + CONST char *src; /* Source string in UTF-8. */
1.1920 + int srcLen; /* Source string length in bytes. */
1.1921 + int flags; /* Conversion control flags. */
1.1922 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.1923 + * state information used during a piecewise
1.1924 + * conversion. Contents of statePtr are
1.1925 + * initialized and/or reset by conversion
1.1926 + * routine under control of flags argument. */
1.1927 + char *dst; /* Output buffer in which converted string
1.1928 + * is stored. */
1.1929 + int dstLen; /* The maximum length of output buffer in
1.1930 + * bytes. */
1.1931 + int *srcReadPtr; /* Filled with the number of bytes from the
1.1932 + * source string that were converted. This
1.1933 + * may be less than the original source length
1.1934 + * if there was a problem converting some
1.1935 + * source characters. */
1.1936 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.1937 + * stored in the output buffer as a result of
1.1938 + * the conversion. */
1.1939 + int *dstCharsPtr; /* Filled with the number of characters that
1.1940 + * correspond to the bytes stored in the
1.1941 + * output buffer. */
1.1942 +{
1.1943 + return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.1944 + srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
1.1945 +}
1.1946 +
1.1947 +/*
1.1948 + *-------------------------------------------------------------------------
1.1949 + *
1.1950 + * UtfExtToUtfIntProc --
1.1951 + *
1.1952 + * Convert from UTF-8 to UTF-8 while converting null-bytes from
1.1953 + * the official representation (0x00) to Tcl's internal
1.1954 + * representation (0xc0, 0x80). See UtfToUtfProc for details.
1.1955 + *
1.1956 + * Results:
1.1957 + * Returns TCL_OK if conversion was successful.
1.1958 + *
1.1959 + * Side effects:
1.1960 + * None.
1.1961 + *
1.1962 + *-------------------------------------------------------------------------
1.1963 + */
1.1964 +static int
1.1965 +UtfExtToUtfIntProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.1966 + srcReadPtr, dstWrotePtr, dstCharsPtr)
1.1967 + ClientData clientData; /* Not used. */
1.1968 + CONST char *src; /* Source string in UTF-8. */
1.1969 + int srcLen; /* Source string length in bytes. */
1.1970 + int flags; /* Conversion control flags. */
1.1971 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.1972 + * state information used during a piecewise
1.1973 + * conversion. Contents of statePtr are
1.1974 + * initialized and/or reset by conversion
1.1975 + * routine under control of flags argument. */
1.1976 + char *dst; /* Output buffer in which converted string
1.1977 + * is stored. */
1.1978 + int dstLen; /* The maximum length of output buffer in
1.1979 + * bytes. */
1.1980 + int *srcReadPtr; /* Filled with the number of bytes from the
1.1981 + * source string that were converted. This
1.1982 + * may be less than the original source length
1.1983 + * if there was a problem converting some
1.1984 + * source characters. */
1.1985 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.1986 + * stored in the output buffer as a result of
1.1987 + * the conversion. */
1.1988 + int *dstCharsPtr; /* Filled with the number of characters that
1.1989 + * correspond to the bytes stored in the
1.1990 + * output buffer. */
1.1991 +{
1.1992 + return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.1993 + srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
1.1994 +}
1.1995 +
1.1996 +/*
1.1997 + *-------------------------------------------------------------------------
1.1998 + *
1.1999 + * UtfToUtfProc --
1.2000 + *
1.2001 + * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8
1.2002 + * translation is not a no-op, because it will turn a stream of
1.2003 + * improperly formed UTF-8 into a properly formed stream.
1.2004 + *
1.2005 + * Results:
1.2006 + * Returns TCL_OK if conversion was successful.
1.2007 + *
1.2008 + * Side effects:
1.2009 + * None.
1.2010 + *
1.2011 + *-------------------------------------------------------------------------
1.2012 + */
1.2013 +
1.2014 +static int
1.2015 +UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.2016 + srcReadPtr, dstWrotePtr, dstCharsPtr, pureNullMode)
1.2017 + ClientData clientData; /* Not used. */
1.2018 + CONST char *src; /* Source string in UTF-8. */
1.2019 + int srcLen; /* Source string length in bytes. */
1.2020 + int flags; /* Conversion control flags. */
1.2021 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.2022 + * state information used during a piecewise
1.2023 + * conversion. Contents of statePtr are
1.2024 + * initialized and/or reset by conversion
1.2025 + * routine under control of flags argument. */
1.2026 + char *dst; /* Output buffer in which converted string
1.2027 + * is stored. */
1.2028 + int dstLen; /* The maximum length of output buffer in
1.2029 + * bytes. */
1.2030 + int *srcReadPtr; /* Filled with the number of bytes from the
1.2031 + * source string that were converted. This
1.2032 + * may be less than the original source length
1.2033 + * if there was a problem converting some
1.2034 + * source characters. */
1.2035 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.2036 + * stored in the output buffer as a result of
1.2037 + * the conversion. */
1.2038 + int *dstCharsPtr; /* Filled with the number of characters that
1.2039 + * correspond to the bytes stored in the
1.2040 + * output buffer. */
1.2041 + int pureNullMode; /* Convert embedded nulls from
1.2042 + * internal representation to real
1.2043 + * null-bytes or vice versa */
1.2044 +
1.2045 +{
1.2046 + CONST char *srcStart, *srcEnd, *srcClose;
1.2047 + char *dstStart, *dstEnd;
1.2048 + int result, numChars;
1.2049 + Tcl_UniChar ch;
1.2050 +
1.2051 + result = TCL_OK;
1.2052 +
1.2053 + srcStart = src;
1.2054 + srcEnd = src + srcLen;
1.2055 + srcClose = srcEnd;
1.2056 + if ((flags & TCL_ENCODING_END) == 0) {
1.2057 + srcClose -= TCL_UTF_MAX;
1.2058 + }
1.2059 +
1.2060 + dstStart = dst;
1.2061 + dstEnd = dst + dstLen - TCL_UTF_MAX;
1.2062 +
1.2063 + for (numChars = 0; src < srcEnd; numChars++) {
1.2064 + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
1.2065 + /*
1.2066 + * If there is more string to follow, this will ensure that the
1.2067 + * last UTF-8 character in the source buffer hasn't been cut off.
1.2068 + */
1.2069 +
1.2070 + result = TCL_CONVERT_MULTIBYTE;
1.2071 + break;
1.2072 + }
1.2073 + if (dst > dstEnd) {
1.2074 + result = TCL_CONVERT_NOSPACE;
1.2075 + break;
1.2076 + }
1.2077 + if (UCHAR(*src) < 0x80 &&
1.2078 + !(UCHAR(*src) == 0 && pureNullMode == 0)) {
1.2079 + /*
1.2080 + * Copy 7bit chatacters, but skip null-bytes when we are
1.2081 + * in input mode, so that they get converted to 0xc080.
1.2082 + */
1.2083 + *dst++ = *src++;
1.2084 + } else if (pureNullMode == 1 &&
1.2085 + UCHAR(*src) == 0xc0 &&
1.2086 + UCHAR(*(src+1)) == 0x80) {
1.2087 + /*
1.2088 + * Convert 0xc080 to real nulls when we are in output mode.
1.2089 + */
1.2090 + *dst++ = 0;
1.2091 + src += 2;
1.2092 + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
1.2093 + /* Always check before using Tcl_UtfToUniChar. Not doing
1.2094 + * can so cause it run beyond the endof the buffer! If we
1.2095 + * * happen such an incomplete char its byts are made to *
1.2096 + * represent themselves.
1.2097 + */
1.2098 +
1.2099 + ch = (Tcl_UniChar) *src;
1.2100 + src += 1;
1.2101 + dst += Tcl_UniCharToUtf(ch, dst);
1.2102 + } else {
1.2103 + src += Tcl_UtfToUniChar(src, &ch);
1.2104 + dst += Tcl_UniCharToUtf(ch, dst);
1.2105 + }
1.2106 + }
1.2107 +
1.2108 + *srcReadPtr = src - srcStart;
1.2109 + *dstWrotePtr = dst - dstStart;
1.2110 + *dstCharsPtr = numChars;
1.2111 + return result;
1.2112 +}
1.2113 +
1.2114 +/*
1.2115 + *-------------------------------------------------------------------------
1.2116 + *
1.2117 + * UnicodeToUtfProc --
1.2118 + *
1.2119 + * Convert from Unicode to UTF-8.
1.2120 + *
1.2121 + * Results:
1.2122 + * Returns TCL_OK if conversion was successful.
1.2123 + *
1.2124 + * Side effects:
1.2125 + * None.
1.2126 + *
1.2127 + *-------------------------------------------------------------------------
1.2128 + */
1.2129 +
1.2130 +static int
1.2131 +UnicodeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.2132 + srcReadPtr, dstWrotePtr, dstCharsPtr)
1.2133 + ClientData clientData; /* Not used. */
1.2134 + CONST char *src; /* Source string in Unicode. */
1.2135 + int srcLen; /* Source string length in bytes. */
1.2136 + int flags; /* Conversion control flags. */
1.2137 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.2138 + * state information used during a piecewise
1.2139 + * conversion. Contents of statePtr are
1.2140 + * initialized and/or reset by conversion
1.2141 + * routine under control of flags argument. */
1.2142 + char *dst; /* Output buffer in which converted string
1.2143 + * is stored. */
1.2144 + int dstLen; /* The maximum length of output buffer in
1.2145 + * bytes. */
1.2146 + int *srcReadPtr; /* Filled with the number of bytes from the
1.2147 + * source string that were converted. This
1.2148 + * may be less than the original source length
1.2149 + * if there was a problem converting some
1.2150 + * source characters. */
1.2151 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.2152 + * stored in the output buffer as a result of
1.2153 + * the conversion. */
1.2154 + int *dstCharsPtr; /* Filled with the number of characters that
1.2155 + * correspond to the bytes stored in the
1.2156 + * output buffer. */
1.2157 +{
1.2158 + CONST char *srcStart, *srcEnd;
1.2159 + char *dstEnd, *dstStart;
1.2160 + int result, numChars;
1.2161 + Tcl_UniChar ch;
1.2162 +
1.2163 + result = TCL_OK;
1.2164 + if ((srcLen % sizeof(Tcl_UniChar)) != 0) {
1.2165 + result = TCL_CONVERT_MULTIBYTE;
1.2166 + srcLen /= sizeof(Tcl_UniChar);
1.2167 + srcLen *= sizeof(Tcl_UniChar);
1.2168 + }
1.2169 +
1.2170 + srcStart = src;
1.2171 + srcEnd = src + srcLen;
1.2172 +
1.2173 + dstStart = dst;
1.2174 + dstEnd = dst + dstLen - TCL_UTF_MAX;
1.2175 +
1.2176 + for (numChars = 0; src < srcEnd; numChars++) {
1.2177 + if (dst > dstEnd) {
1.2178 + result = TCL_CONVERT_NOSPACE;
1.2179 + break;
1.2180 + }
1.2181 + /*
1.2182 + * Special case for 1-byte utf chars for speed. Make sure we
1.2183 + * work with Tcl_UniChar-size data.
1.2184 + */
1.2185 + ch = *(Tcl_UniChar *)src;
1.2186 + if (ch && ch < 0x80) {
1.2187 + *dst++ = (ch & 0xFF);
1.2188 + } else {
1.2189 + dst += Tcl_UniCharToUtf(ch, dst);
1.2190 + }
1.2191 + src += sizeof(Tcl_UniChar);
1.2192 + }
1.2193 +
1.2194 + *srcReadPtr = src - srcStart;
1.2195 + *dstWrotePtr = dst - dstStart;
1.2196 + *dstCharsPtr = numChars;
1.2197 + return result;
1.2198 +}
1.2199 +
1.2200 +/*
1.2201 + *-------------------------------------------------------------------------
1.2202 + *
1.2203 + * UtfToUnicodeProc --
1.2204 + *
1.2205 + * Convert from UTF-8 to Unicode.
1.2206 + *
1.2207 + * Results:
1.2208 + * Returns TCL_OK if conversion was successful.
1.2209 + *
1.2210 + * Side effects:
1.2211 + * None.
1.2212 + *
1.2213 + *-------------------------------------------------------------------------
1.2214 + */
1.2215 +
1.2216 +static int
1.2217 +UtfToUnicodeProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.2218 + srcReadPtr, dstWrotePtr, dstCharsPtr)
1.2219 + ClientData clientData; /* TableEncodingData that specifies encoding. */
1.2220 + CONST char *src; /* Source string in UTF-8. */
1.2221 + int srcLen; /* Source string length in bytes. */
1.2222 + int flags; /* Conversion control flags. */
1.2223 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.2224 + * state information used during a piecewise
1.2225 + * conversion. Contents of statePtr are
1.2226 + * initialized and/or reset by conversion
1.2227 + * routine under control of flags argument. */
1.2228 + char *dst; /* Output buffer in which converted string
1.2229 + * is stored. */
1.2230 + int dstLen; /* The maximum length of output buffer in
1.2231 + * bytes. */
1.2232 + int *srcReadPtr; /* Filled with the number of bytes from the
1.2233 + * source string that were converted. This
1.2234 + * may be less than the original source length
1.2235 + * if there was a problem converting some
1.2236 + * source characters. */
1.2237 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.2238 + * stored in the output buffer as a result of
1.2239 + * the conversion. */
1.2240 + int *dstCharsPtr; /* Filled with the number of characters that
1.2241 + * correspond to the bytes stored in the
1.2242 + * output buffer. */
1.2243 +{
1.2244 + CONST char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
1.2245 + int result, numChars;
1.2246 + Tcl_UniChar ch;
1.2247 +
1.2248 + srcStart = src;
1.2249 + srcEnd = src + srcLen;
1.2250 + srcClose = srcEnd;
1.2251 + if ((flags & TCL_ENCODING_END) == 0) {
1.2252 + srcClose -= TCL_UTF_MAX;
1.2253 + }
1.2254 +
1.2255 + dstStart = dst;
1.2256 + dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
1.2257 +
1.2258 + result = TCL_OK;
1.2259 + for (numChars = 0; src < srcEnd; numChars++) {
1.2260 + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
1.2261 + /*
1.2262 + * If there is more string to follow, this will ensure that the
1.2263 + * last UTF-8 character in the source buffer hasn't been cut off.
1.2264 + */
1.2265 +
1.2266 + result = TCL_CONVERT_MULTIBYTE;
1.2267 + break;
1.2268 + }
1.2269 + if (dst > dstEnd) {
1.2270 + result = TCL_CONVERT_NOSPACE;
1.2271 + break;
1.2272 + }
1.2273 + src += TclUtfToUniChar(src, &ch);
1.2274 + /*
1.2275 + * Need to handle this in a way that won't cause misalignment
1.2276 + * by casting dst to a Tcl_UniChar. [Bug 1122671]
1.2277 + * XXX: This hard-codes the assumed size of Tcl_UniChar as 2.
1.2278 + */
1.2279 +#ifdef WORDS_BIGENDIAN
1.2280 + *dst++ = (ch >> 8);
1.2281 + *dst++ = (ch & 0xFF);
1.2282 +#else
1.2283 + *dst++ = (ch & 0xFF);
1.2284 + *dst++ = (ch >> 8);
1.2285 +#endif
1.2286 + }
1.2287 + *srcReadPtr = src - srcStart;
1.2288 + *dstWrotePtr = dst - dstStart;
1.2289 + *dstCharsPtr = numChars;
1.2290 + return result;
1.2291 +}
1.2292 +
1.2293 +/*
1.2294 + *-------------------------------------------------------------------------
1.2295 + *
1.2296 + * TableToUtfProc --
1.2297 + *
1.2298 + * Convert from the encoding specified by the TableEncodingData into
1.2299 + * UTF-8.
1.2300 + *
1.2301 + * Results:
1.2302 + * Returns TCL_OK if conversion was successful.
1.2303 + *
1.2304 + * Side effects:
1.2305 + * None.
1.2306 + *
1.2307 + *-------------------------------------------------------------------------
1.2308 + */
1.2309 +
1.2310 +static int
1.2311 +TableToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.2312 + srcReadPtr, dstWrotePtr, dstCharsPtr)
1.2313 + ClientData clientData; /* TableEncodingData that specifies
1.2314 + * encoding. */
1.2315 + CONST char *src; /* Source string in specified encoding. */
1.2316 + int srcLen; /* Source string length in bytes. */
1.2317 + int flags; /* Conversion control flags. */
1.2318 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.2319 + * state information used during a piecewise
1.2320 + * conversion. Contents of statePtr are
1.2321 + * initialized and/or reset by conversion
1.2322 + * routine under control of flags argument. */
1.2323 + char *dst; /* Output buffer in which converted string
1.2324 + * is stored. */
1.2325 + int dstLen; /* The maximum length of output buffer in
1.2326 + * bytes. */
1.2327 + int *srcReadPtr; /* Filled with the number of bytes from the
1.2328 + * source string that were converted. This
1.2329 + * may be less than the original source length
1.2330 + * if there was a problem converting some
1.2331 + * source characters. */
1.2332 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.2333 + * stored in the output buffer as a result of
1.2334 + * the conversion. */
1.2335 + int *dstCharsPtr; /* Filled with the number of characters that
1.2336 + * correspond to the bytes stored in the
1.2337 + * output buffer. */
1.2338 +{
1.2339 + CONST char *srcStart, *srcEnd;
1.2340 + char *dstEnd, *dstStart, *prefixBytes;
1.2341 + int result, byte, numChars;
1.2342 + Tcl_UniChar ch;
1.2343 + unsigned short **toUnicode;
1.2344 + unsigned short *pageZero;
1.2345 + TableEncodingData *dataPtr;
1.2346 +
1.2347 + srcStart = src;
1.2348 + srcEnd = src + srcLen;
1.2349 +
1.2350 + dstStart = dst;
1.2351 + dstEnd = dst + dstLen - TCL_UTF_MAX;
1.2352 +
1.2353 + dataPtr = (TableEncodingData *) clientData;
1.2354 + toUnicode = dataPtr->toUnicode;
1.2355 + prefixBytes = dataPtr->prefixBytes;
1.2356 + pageZero = toUnicode[0];
1.2357 +
1.2358 + result = TCL_OK;
1.2359 + for (numChars = 0; src < srcEnd; numChars++) {
1.2360 + if (dst > dstEnd) {
1.2361 + result = TCL_CONVERT_NOSPACE;
1.2362 + break;
1.2363 + }
1.2364 + byte = *((unsigned char *) src);
1.2365 + if (prefixBytes[byte]) {
1.2366 + src++;
1.2367 + if (src >= srcEnd) {
1.2368 + src--;
1.2369 + result = TCL_CONVERT_MULTIBYTE;
1.2370 + break;
1.2371 + }
1.2372 + ch = toUnicode[byte][*((unsigned char *) src)];
1.2373 + } else {
1.2374 + ch = pageZero[byte];
1.2375 + }
1.2376 + if ((ch == 0) && (byte != 0)) {
1.2377 + if (flags & TCL_ENCODING_STOPONERROR) {
1.2378 + result = TCL_CONVERT_SYNTAX;
1.2379 + break;
1.2380 + }
1.2381 + if (prefixBytes[byte]) {
1.2382 + src--;
1.2383 + }
1.2384 + ch = (Tcl_UniChar) byte;
1.2385 + }
1.2386 + /*
1.2387 + * Special case for 1-byte utf chars for speed.
1.2388 + */
1.2389 + if (ch && ch < 0x80) {
1.2390 + *dst++ = (char) ch;
1.2391 + } else {
1.2392 + dst += Tcl_UniCharToUtf(ch, dst);
1.2393 + }
1.2394 + src++;
1.2395 + }
1.2396 + *srcReadPtr = src - srcStart;
1.2397 + *dstWrotePtr = dst - dstStart;
1.2398 + *dstCharsPtr = numChars;
1.2399 + return result;
1.2400 +}
1.2401 +
1.2402 +/*
1.2403 + *-------------------------------------------------------------------------
1.2404 + *
1.2405 + * TableFromUtfProc --
1.2406 + *
1.2407 + * Convert from UTF-8 into the encoding specified by the
1.2408 + * TableEncodingData.
1.2409 + *
1.2410 + * Results:
1.2411 + * Returns TCL_OK if conversion was successful.
1.2412 + *
1.2413 + * Side effects:
1.2414 + * None.
1.2415 + *
1.2416 + *-------------------------------------------------------------------------
1.2417 + */
1.2418 +
1.2419 +static int
1.2420 +TableFromUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.2421 + srcReadPtr, dstWrotePtr, dstCharsPtr)
1.2422 + ClientData clientData; /* TableEncodingData that specifies
1.2423 + * encoding. */
1.2424 + CONST char *src; /* Source string in UTF-8. */
1.2425 + int srcLen; /* Source string length in bytes. */
1.2426 + int flags; /* Conversion control flags. */
1.2427 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.2428 + * state information used during a piecewise
1.2429 + * conversion. Contents of statePtr are
1.2430 + * initialized and/or reset by conversion
1.2431 + * routine under control of flags argument. */
1.2432 + char *dst; /* Output buffer in which converted string
1.2433 + * is stored. */
1.2434 + int dstLen; /* The maximum length of output buffer in
1.2435 + * bytes. */
1.2436 + int *srcReadPtr; /* Filled with the number of bytes from the
1.2437 + * source string that were converted. This
1.2438 + * may be less than the original source length
1.2439 + * if there was a problem converting some
1.2440 + * source characters. */
1.2441 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.2442 + * stored in the output buffer as a result of
1.2443 + * the conversion. */
1.2444 + int *dstCharsPtr; /* Filled with the number of characters that
1.2445 + * correspond to the bytes stored in the
1.2446 + * output buffer. */
1.2447 +{
1.2448 + CONST char *srcStart, *srcEnd, *srcClose;
1.2449 + char *dstStart, *dstEnd, *prefixBytes;
1.2450 + Tcl_UniChar ch;
1.2451 + int result, len, word, numChars;
1.2452 + TableEncodingData *dataPtr;
1.2453 + unsigned short **fromUnicode;
1.2454 +
1.2455 + result = TCL_OK;
1.2456 +
1.2457 + dataPtr = (TableEncodingData *) clientData;
1.2458 + prefixBytes = dataPtr->prefixBytes;
1.2459 + fromUnicode = dataPtr->fromUnicode;
1.2460 +
1.2461 + srcStart = src;
1.2462 + srcEnd = src + srcLen;
1.2463 + srcClose = srcEnd;
1.2464 + if ((flags & TCL_ENCODING_END) == 0) {
1.2465 + srcClose -= TCL_UTF_MAX;
1.2466 + }
1.2467 +
1.2468 + dstStart = dst;
1.2469 + dstEnd = dst + dstLen - 1;
1.2470 +
1.2471 + for (numChars = 0; src < srcEnd; numChars++) {
1.2472 + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
1.2473 + /*
1.2474 + * If there is more string to follow, this will ensure that the
1.2475 + * last UTF-8 character in the source buffer hasn't been cut off.
1.2476 + */
1.2477 +
1.2478 + result = TCL_CONVERT_MULTIBYTE;
1.2479 + break;
1.2480 + }
1.2481 + len = TclUtfToUniChar(src, &ch);
1.2482 +
1.2483 +#if TCL_UTF_MAX > 3
1.2484 + /*
1.2485 + * This prevents a crash condition. More evaluation is required
1.2486 + * for full support of int Tcl_UniChar. [Bug 1004065]
1.2487 + */
1.2488 + if (ch & 0xffff0000) {
1.2489 + word = 0;
1.2490 + } else
1.2491 +#endif
1.2492 + word = fromUnicode[(ch >> 8)][ch & 0xff];
1.2493 +
1.2494 + if ((word == 0) && (ch != 0)) {
1.2495 + if (flags & TCL_ENCODING_STOPONERROR) {
1.2496 + result = TCL_CONVERT_UNKNOWN;
1.2497 + break;
1.2498 + }
1.2499 + word = dataPtr->fallback;
1.2500 + }
1.2501 + if (prefixBytes[(word >> 8)] != 0) {
1.2502 + if (dst + 1 > dstEnd) {
1.2503 + result = TCL_CONVERT_NOSPACE;
1.2504 + break;
1.2505 + }
1.2506 + dst[0] = (char) (word >> 8);
1.2507 + dst[1] = (char) word;
1.2508 + dst += 2;
1.2509 + } else {
1.2510 + if (dst > dstEnd) {
1.2511 + result = TCL_CONVERT_NOSPACE;
1.2512 + break;
1.2513 + }
1.2514 + dst[0] = (char) word;
1.2515 + dst++;
1.2516 + }
1.2517 + src += len;
1.2518 + }
1.2519 + *srcReadPtr = src - srcStart;
1.2520 + *dstWrotePtr = dst - dstStart;
1.2521 + *dstCharsPtr = numChars;
1.2522 + return result;
1.2523 +}
1.2524 +
1.2525 +/*
1.2526 + *---------------------------------------------------------------------------
1.2527 + *
1.2528 + * TableFreeProc --
1.2529 + *
1.2530 + * This procedure is invoked when an encoding is deleted. It deletes
1.2531 + * the memory used by the TableEncodingData.
1.2532 + *
1.2533 + * Results:
1.2534 + * None.
1.2535 + *
1.2536 + * Side effects:
1.2537 + * Memory freed.
1.2538 + *
1.2539 + *---------------------------------------------------------------------------
1.2540 + */
1.2541 +
1.2542 +static void
1.2543 +TableFreeProc(clientData)
1.2544 + ClientData clientData; /* TableEncodingData that specifies
1.2545 + * encoding. */
1.2546 +{
1.2547 + TableEncodingData *dataPtr;
1.2548 +
1.2549 + /*
1.2550 + * Make sure we aren't freeing twice on shutdown. [Bug #219314]
1.2551 + */
1.2552 +
1.2553 + dataPtr = (TableEncodingData *) clientData;
1.2554 + ckfree((char *) dataPtr->toUnicode);
1.2555 + ckfree((char *) dataPtr->fromUnicode);
1.2556 + ckfree((char *) dataPtr);
1.2557 +}
1.2558 +
1.2559 +/*
1.2560 + *-------------------------------------------------------------------------
1.2561 + *
1.2562 + * EscapeToUtfProc --
1.2563 + *
1.2564 + * Convert from the encoding specified by the EscapeEncodingData into
1.2565 + * UTF-8.
1.2566 + *
1.2567 + * Results:
1.2568 + * Returns TCL_OK if conversion was successful.
1.2569 + *
1.2570 + * Side effects:
1.2571 + * None.
1.2572 + *
1.2573 + *-------------------------------------------------------------------------
1.2574 + */
1.2575 +
1.2576 +static int
1.2577 +EscapeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.2578 + srcReadPtr, dstWrotePtr, dstCharsPtr)
1.2579 + ClientData clientData; /* EscapeEncodingData that specifies
1.2580 + * encoding. */
1.2581 + CONST char *src; /* Source string in specified encoding. */
1.2582 + int srcLen; /* Source string length in bytes. */
1.2583 + int flags; /* Conversion control flags. */
1.2584 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.2585 + * state information used during a piecewise
1.2586 + * conversion. Contents of statePtr are
1.2587 + * initialized and/or reset by conversion
1.2588 + * routine under control of flags argument. */
1.2589 + char *dst; /* Output buffer in which converted string
1.2590 + * is stored. */
1.2591 + int dstLen; /* The maximum length of output buffer in
1.2592 + * bytes. */
1.2593 + int *srcReadPtr; /* Filled with the number of bytes from the
1.2594 + * source string that were converted. This
1.2595 + * may be less than the original source length
1.2596 + * if there was a problem converting some
1.2597 + * source characters. */
1.2598 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.2599 + * stored in the output buffer as a result of
1.2600 + * the conversion. */
1.2601 + int *dstCharsPtr; /* Filled with the number of characters that
1.2602 + * correspond to the bytes stored in the
1.2603 + * output buffer. */
1.2604 +{
1.2605 + EscapeEncodingData *dataPtr;
1.2606 + char *prefixBytes, *tablePrefixBytes;
1.2607 + unsigned short **tableToUnicode;
1.2608 + Encoding *encodingPtr;
1.2609 + int state, result, numChars;
1.2610 + CONST char *srcStart, *srcEnd;
1.2611 + char *dstStart, *dstEnd;
1.2612 +
1.2613 + result = TCL_OK;
1.2614 +
1.2615 + tablePrefixBytes = NULL; /* lint. */
1.2616 + tableToUnicode = NULL; /* lint. */
1.2617 +
1.2618 + dataPtr = (EscapeEncodingData *) clientData;
1.2619 + prefixBytes = dataPtr->prefixBytes;
1.2620 + encodingPtr = NULL;
1.2621 +
1.2622 + srcStart = src;
1.2623 + srcEnd = src + srcLen;
1.2624 +
1.2625 + dstStart = dst;
1.2626 + dstEnd = dst + dstLen - TCL_UTF_MAX;
1.2627 +
1.2628 + state = (int) *statePtr;
1.2629 + if (flags & TCL_ENCODING_START) {
1.2630 + state = 0;
1.2631 + }
1.2632 +
1.2633 + for (numChars = 0; src < srcEnd; ) {
1.2634 + int byte, hi, lo, ch;
1.2635 +
1.2636 + if (dst > dstEnd) {
1.2637 + result = TCL_CONVERT_NOSPACE;
1.2638 + break;
1.2639 + }
1.2640 + byte = *((unsigned char *) src);
1.2641 + if (prefixBytes[byte]) {
1.2642 + unsigned int left, len, longest;
1.2643 + int checked, i;
1.2644 + EscapeSubTable *subTablePtr;
1.2645 +
1.2646 + /*
1.2647 + * Saw the beginning of an escape sequence.
1.2648 + */
1.2649 +
1.2650 + left = srcEnd - src;
1.2651 + len = dataPtr->initLen;
1.2652 + longest = len;
1.2653 + checked = 0;
1.2654 + if (len <= left) {
1.2655 + checked++;
1.2656 + if ((len > 0) &&
1.2657 + (memcmp(src, dataPtr->init, len) == 0)) {
1.2658 + /*
1.2659 + * If we see initialization string, skip it, even if we're
1.2660 + * not at the beginning of the buffer.
1.2661 + */
1.2662 +
1.2663 + src += len;
1.2664 + continue;
1.2665 + }
1.2666 + }
1.2667 + len = dataPtr->finalLen;
1.2668 + if (len > longest) {
1.2669 + longest = len;
1.2670 + }
1.2671 + if (len <= left) {
1.2672 + checked++;
1.2673 + if ((len > 0) &&
1.2674 + (memcmp(src, dataPtr->final, len) == 0)) {
1.2675 + /*
1.2676 + * If we see finalization string, skip it, even if we're
1.2677 + * not at the end of the buffer.
1.2678 + */
1.2679 +
1.2680 + src += len;
1.2681 + continue;
1.2682 + }
1.2683 + }
1.2684 + subTablePtr = dataPtr->subTables;
1.2685 + for (i = 0; i < dataPtr->numSubTables; i++) {
1.2686 + len = subTablePtr->sequenceLen;
1.2687 + if (len > longest) {
1.2688 + longest = len;
1.2689 + }
1.2690 + if (len <= left) {
1.2691 + checked++;
1.2692 + if ((len > 0) &&
1.2693 + (memcmp(src, subTablePtr->sequence, len) == 0)) {
1.2694 + state = i;
1.2695 + encodingPtr = NULL;
1.2696 + subTablePtr = NULL;
1.2697 + src += len;
1.2698 + break;
1.2699 + }
1.2700 + }
1.2701 + subTablePtr++;
1.2702 + }
1.2703 + if (subTablePtr == NULL) {
1.2704 + /*
1.2705 + * A match was found, the escape sequence was consumed, and
1.2706 + * the state was updated.
1.2707 + */
1.2708 +
1.2709 + continue;
1.2710 + }
1.2711 +
1.2712 + /*
1.2713 + * We have a split-up or unrecognized escape sequence. If we
1.2714 + * checked all the sequences, then it's a syntax error,
1.2715 + * otherwise we need more bytes to determine a match.
1.2716 + */
1.2717 +
1.2718 + if ((checked == dataPtr->numSubTables + 2)
1.2719 + || (flags & TCL_ENCODING_END)) {
1.2720 + if ((flags & TCL_ENCODING_STOPONERROR) == 0) {
1.2721 + /*
1.2722 + * Skip the unknown escape sequence.
1.2723 + */
1.2724 +
1.2725 + src += longest;
1.2726 + continue;
1.2727 + }
1.2728 + result = TCL_CONVERT_SYNTAX;
1.2729 + } else {
1.2730 + result = TCL_CONVERT_MULTIBYTE;
1.2731 + }
1.2732 + break;
1.2733 + }
1.2734 +
1.2735 + if (encodingPtr == NULL) {
1.2736 + TableEncodingData *tableDataPtr;
1.2737 +
1.2738 + encodingPtr = GetTableEncoding(dataPtr, state);
1.2739 + tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
1.2740 + tablePrefixBytes = tableDataPtr->prefixBytes;
1.2741 + tableToUnicode = tableDataPtr->toUnicode;
1.2742 + }
1.2743 + if (tablePrefixBytes[byte]) {
1.2744 + src++;
1.2745 + if (src >= srcEnd) {
1.2746 + src--;
1.2747 + result = TCL_CONVERT_MULTIBYTE;
1.2748 + break;
1.2749 + }
1.2750 + hi = byte;
1.2751 + lo = *((unsigned char *) src);
1.2752 + } else {
1.2753 + hi = 0;
1.2754 + lo = byte;
1.2755 + }
1.2756 + ch = tableToUnicode[hi][lo];
1.2757 + dst += Tcl_UniCharToUtf(ch, dst);
1.2758 + src++;
1.2759 + numChars++;
1.2760 + }
1.2761 +
1.2762 + *statePtr = (Tcl_EncodingState) state;
1.2763 + *srcReadPtr = src - srcStart;
1.2764 + *dstWrotePtr = dst - dstStart;
1.2765 + *dstCharsPtr = numChars;
1.2766 + return result;
1.2767 +}
1.2768 +
1.2769 +/*
1.2770 + *-------------------------------------------------------------------------
1.2771 + *
1.2772 + * EscapeFromUtfProc --
1.2773 + *
1.2774 + * Convert from UTF-8 into the encoding specified by the
1.2775 + * EscapeEncodingData, writing a sub-table's escape sequence each time the output switches to that sub-table.
1.2776 + *
1.2777 + * Results:
1.2778 + * TCL_OK on success; otherwise TCL_CONVERT_NOSPACE, TCL_CONVERT_MULTIBYTE or TCL_CONVERT_UNKNOWN.
1.2779 + *
1.2780 + * Side effects:
1.2781 + * Sub-table encodings are fetched with GetTableEncoding, which may load and cache them.
1.2782 + *
1.2783 + *-------------------------------------------------------------------------
1.2784 + */
1.2785 +
1.2786 +static int
1.2787 +EscapeFromUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1.2788 + srcReadPtr, dstWrotePtr, dstCharsPtr)
1.2789 + ClientData clientData; /* EscapeEncodingData that specifies
1.2790 + * encoding. */
1.2791 + CONST char *src; /* Source string in UTF-8. */
1.2792 + int srcLen; /* Source string length in bytes. */
1.2793 + int flags; /* Conversion control flags. */
1.2794 + Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1.2795 + * state information used during a piecewise
1.2796 + * conversion. Contents of statePtr are
1.2797 + * initialized and/or reset by conversion
1.2798 + * routine under control of flags argument. */
1.2799 + char *dst; /* Output buffer in which converted string
1.2800 + * is stored. */
1.2801 + int dstLen; /* The maximum length of output buffer in
1.2802 + * bytes. */
1.2803 + int *srcReadPtr; /* Filled with the number of bytes from the
1.2804 + * source string that were converted. This
1.2805 + * may be less than the original source length
1.2806 + * if there was a problem converting some
1.2807 + * source characters. */
1.2808 + int *dstWrotePtr; /* Filled with the number of bytes that were
1.2809 + * stored in the output buffer as a result of
1.2810 + * the conversion. */
1.2811 + int *dstCharsPtr; /* Filled with the number of characters that
1.2812 + * correspond to the bytes stored in the
1.2813 + * output buffer. */
1.2814 +{
1.2815 + EscapeEncodingData *dataPtr;
1.2816 + Encoding *encodingPtr;
1.2817 + CONST char *srcStart, *srcEnd, *srcClose;
1.2818 + char *dstStart, *dstEnd;
1.2819 + int state, result, numChars;
1.2820 + TableEncodingData *tableDataPtr;
1.2821 + char *tablePrefixBytes;
1.2822 + unsigned short **tableFromUnicode;
1.2823 +
1.2824 + result = TCL_OK;
1.2825 +
1.2826 + dataPtr = (EscapeEncodingData *) clientData;
1.2827 +
1.2828 + srcStart = src;
1.2829 + srcEnd = src + srcLen;
1.2830 + srcClose = srcEnd;
1.2831 + if ((flags & TCL_ENCODING_END) == 0) {
1.2832 + srcClose -= TCL_UTF_MAX; /* More input may follow: bytes past this point get the completeness check below. */
1.2833 + }
1.2834 +
1.2835 + dstStart = dst;
1.2836 + dstEnd = dst + dstLen - 1; /* Last byte that may be written. */
1.2837 +
1.2838 + /*
1.2839 + * RFC1468 states that the text starts in ASCII, and switches to Japanese
1.2840 + * characters, and that the text must end in ASCII. [Patch #474358]
1.2841 + */
1.2842 +
1.2843 + if (flags & TCL_ENCODING_START) {
1.2844 + state = 0;
1.2845 + if ((dst + dataPtr->initLen) > dstEnd) {
1.2846 + *srcReadPtr = *dstWrotePtr = *dstCharsPtr = 0; /* Nothing read or written; leave no output undefined. */
1.2847 + *statePtr = (Tcl_EncodingState) state; /* START resets the state even when we bail out. */
1.2848 + return TCL_CONVERT_NOSPACE;
1.2849 + }
1.2850 + memcpy((VOID *) dst, (VOID *) dataPtr->init,
1.2851 + (size_t) dataPtr->initLen);
1.2852 + dst += dataPtr->initLen;
1.2853 + } else {
1.2854 + state = (int) *statePtr;
1.2855 + }
1.2856 +
1.2857 + encodingPtr = GetTableEncoding(dataPtr, state);
1.2858 + tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
1.2859 + tablePrefixBytes = tableDataPtr->prefixBytes;
1.2860 + tableFromUnicode = tableDataPtr->fromUnicode;
1.2861 +
1.2862 + for (numChars = 0; src < srcEnd; numChars++) {
1.2863 + unsigned int len;
1.2864 + int word;
1.2865 + Tcl_UniChar ch;
1.2866 +
1.2867 + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
1.2868 + /*
1.2869 + * If there is more string to follow, this will ensure that the
1.2870 + * last UTF-8 character in the source buffer hasn't been cut off.
1.2871 + */
1.2872 +
1.2873 + result = TCL_CONVERT_MULTIBYTE;
1.2874 + break;
1.2875 + }
1.2876 + len = TclUtfToUniChar(src, &ch);
1.2877 + word = tableFromUnicode[(ch >> 8)][ch & 0xff];
1.2878 +
1.2879 + if ((word == 0) && (ch != 0)) { /* No mapping in the current sub-table: try every sub-table in order. */
1.2880 + int oldState;
1.2881 + EscapeSubTable *subTablePtr;
1.2882 +
1.2883 + oldState = state;
1.2884 + for (state = 0; state < dataPtr->numSubTables; state++) {
1.2885 + encodingPtr = GetTableEncoding(dataPtr, state);
1.2886 + tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
1.2887 + word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xff];
1.2888 + if (word != 0) {
1.2889 + break;
1.2890 + }
1.2891 + }
1.2892 +
1.2893 + if (word == 0) {
1.2894 + state = oldState;
1.2895 + if (flags & TCL_ENCODING_STOPONERROR) {
1.2896 + result = TCL_CONVERT_UNKNOWN;
1.2897 + break;
1.2898 + }
1.2899 + encodingPtr = GetTableEncoding(dataPtr, state);
1.2900 + tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
1.2901 + word = tableDataPtr->fallback;
1.2902 + }
1.2903 +
1.2904 + tablePrefixBytes = tableDataPtr->prefixBytes;
1.2905 + tableFromUnicode = tableDataPtr->fromUnicode;
1.2906 +
1.2907 + /*
1.2908 + * The state variable has the value of oldState when word is 0.
1.2909 + * In this case, the escape sequence should not be copied to dst
1.2910 + * because the current character set is not changed.
1.2911 + */
1.2912 + if (state != oldState) {
1.2913 + subTablePtr = &dataPtr->subTables[state];
1.2914 + if ((dst + subTablePtr->sequenceLen) > dstEnd) {
1.2915 + /*
1.2916 + * If there is no space to write the escape sequence, the
1.2917 + * state variable must be changed to the value of oldState
1.2918 + * variable because this escape sequence must be written
1.2919 + * in the next conversion.
1.2920 + */
1.2921 + state = oldState;
1.2922 + result = TCL_CONVERT_NOSPACE;
1.2923 + break;
1.2924 + }
1.2925 + memcpy((VOID *) dst, (VOID *) subTablePtr->sequence,
1.2926 + (size_t) subTablePtr->sequenceLen);
1.2927 + dst += subTablePtr->sequenceLen;
1.2928 + }
1.2929 + }
1.2930 +
1.2931 + if (tablePrefixBytes[(word >> 8)] != 0) { /* High byte is marked as a prefix byte: emit two bytes. */
1.2932 + if (dst + 1 > dstEnd) {
1.2933 + result = TCL_CONVERT_NOSPACE;
1.2934 + break;
1.2935 + }
1.2936 + dst[0] = (char) (word >> 8);
1.2937 + dst[1] = (char) word;
1.2938 + dst += 2;
1.2939 + } else {
1.2940 + if (dst > dstEnd) {
1.2941 + result = TCL_CONVERT_NOSPACE;
1.2942 + break;
1.2943 + }
1.2944 + dst[0] = (char) word;
1.2945 + dst++;
1.2946 + }
1.2947 + src += len;
1.2948 + }
1.2949 +
1.2950 + if ((result == TCL_OK) && (flags & TCL_ENCODING_END)) {
1.2951 + unsigned int len = dataPtr->subTables[0].sequenceLen;
1.2952 + /*
1.2953 + * [Bug 1516109].
1.2954 + * Certain encodings like iso2022-jp need to write
1.2955 + * an escape sequence after all characters have
1.2956 + * been converted. This logic checks that enough
1.2957 + * room is available in the buffer for the escape bytes.
1.2958 + * Once the closing bytes are written the output is back
1.2959 + * in sub-table 0, so the state is reset to 0; a further
1.2960 + * call then has no sub-table escape left to append and
1.2961 + * does not emit it a second time.
1.2962 + */
1.2963 + if ((dst + dataPtr->finalLen + (state?len:0)) > dstEnd) {
1.2964 + result = TCL_CONVERT_NOSPACE;
1.2965 + } else {
1.2966 + if (state) {
1.2967 + memcpy((VOID *) dst, (VOID *) dataPtr->subTables[0].sequence,
1.2968 + (size_t) len);
1.2969 + dst += len;
1.2970 + }
1.2971 + memcpy((VOID *) dst, (VOID *) dataPtr->final,
1.2972 + (size_t) dataPtr->finalLen);
1.2973 + dst += dataPtr->finalLen;
1.2974 + state = 0; /* state is a sub-table index, not a flag word; masking it with a flag bit was wrong. */
1.2975 + }
1.2976 + }
1.2977 +
1.2978 + *statePtr = (Tcl_EncodingState) state;
1.2979 + *srcReadPtr = src - srcStart;
1.2980 + *dstWrotePtr = dst - dstStart;
1.2981 + *dstCharsPtr = numChars;
1.2982 + return result;
1.2983 +}
1.2984 +
1.2985 +/*
1.2986 + *---------------------------------------------------------------------------
1.2987 + *
1.2988 + * EscapeFreeProc --
1.2989 + *
1.2990 + * Tear down an EscapeEncodingData: pass the encoding held by each
1.2991 + * sub-table to FreeEncoding, then free the data block itself.
1.2992 + *
1.2993 + * Results:
1.2994 + * None.
1.2995 + *
1.2996 + * Side effects:
1.2997 + * FreeEncoding runs once per sub-table and the block is ckfree'd; a NULL clientData is ignored.
1.2998 + *
1.2999 + *---------------------------------------------------------------------------
1.3000 + */
1.3001 +
1.3002 +static void
1.3003 +EscapeFreeProc(clientData)
1.3004 + ClientData clientData; /* EscapeEncodingData to dispose of, or NULL. */
1.3005 +{
1.3006 + EscapeEncodingData *escapePtr = (EscapeEncodingData *) clientData;
1.3007 + int index;
1.3008 +
1.3009 + if (escapePtr != NULL) {
1.3010 + /*
1.3011 + * Hand back the encoding referenced by every sub-table first;
1.3012 + * the sub-tables are reached through escapePtr, which goes last.
1.3013 + */
1.3014 + for (index = 0; index < escapePtr->numSubTables; index++) {
1.3015 + FreeEncoding((Tcl_Encoding) escapePtr->subTables[index].encodingPtr);
1.3016 + }
1.3017 +
1.3018 + ckfree((char *) escapePtr);
1.3019 + }
1.3020 +}
1.3021 +
1.3022 +/*
1.3023 + *---------------------------------------------------------------------------
1.3024 + *
1.3025 + * GetTableEncoding --
1.3026 + *
1.3027 + * Used by the escape-encoding converters to map a conversion state
1.3028 + * (a sub-table index) to the Encoding stored for that sub-table; the
1.3029 + * converters treat its clientData as TableEncodingData.
1.3030 + *
1.3031 + * Results:
1.3032 + * Pointer to the Encoding for dataPtr->subTables[state].
1.3033 + *
1.3034 + * Side effects:
1.3035 + * When the sub-table has no Encoding recorded yet, one is obtained
1.3036 + * with Tcl_GetEncoding and stored in the sub-table; panics if that
1.3037 + * lookup fails or yields an encoding whose toUtfProc is not TableToUtfProc.
1.3038 + *
1.3039 + *---------------------------------------------------------------------------
1.3040 + */
1.3041 +
1.3042 +static Encoding *
1.3043 +GetTableEncoding(dataPtr, state)
1.3044 + EscapeEncodingData *dataPtr;/* Escape encoding holding the sub-tables. */
1.3045 + int state; /* Index of the wanted entry in dataPtr->subTables. */
1.3046 +{
1.3047 + EscapeSubTable *entry = &dataPtr->subTables[state];
1.3048 + Encoding *found = entry->encodingPtr;
1.3049 +
1.3050 + if (found != NULL) {
1.3051 + return found;
1.3052 + }
1.3053 +
1.3054 + /*
1.3055 + * Nothing recorded for this sub-table. Escape encodings load all of
1.3056 + * their sub-encodings up front and refuse to load when one is missing,
1.3057 + * so this fallback is not expected to run.
1.3058 + */
1.3059 +
1.3060 + found = (Encoding *) Tcl_GetEncoding(NULL, entry->name);
1.3061 + if (!((found != NULL) && (found->toUtfProc == TableToUtfProc))) {
1.3062 + panic("EscapeToUtfProc: invalid sub table");
1.3063 + }
1.3064 + entry->encodingPtr = found;
1.3065 + return found;
1.3066 +}
1.3067 +
1.3068 +/*
1.3069 + *---------------------------------------------------------------------------
1.3070 + *
1.3071 + * unilen --
1.3072 + *
1.3073 + * A helper function for the Tcl_ExternalToUtf functions. This
1.3074 + * function is similar to strlen for double-byte characters: it
1.3075 + * returns the number of bytes in a 0x0000 terminated string.
1.3076 + *
1.3077 + * Results:
1.3078 + * Byte count up to, but not including, the first two-byte unit whose bytes are both zero.
1.3079 + *
1.3080 + * Side effects:
1.3081 + * None. The string is read one byte at a time, so src need not be 2-byte aligned.
1.3082 + *
1.3083 + *---------------------------------------------------------------------------
1.3084 + */
1.3085 +
1.3086 +static size_t
1.3087 +unilen(src)
1.3088 + CONST char *src; /* String of 2-byte units ended by a unit of two zero bytes. */
1.3089 +{
1.3090 + CONST char *p; /* Start of the unit being examined. */
1.3091 +
1.3092 + p = src;
1.3093 + while ((p[0] != '\0') || (p[1] != '\0')) { /* Byte compares: no alignment or aliasing assumption. */
1.3094 + p += 2;
1.3095 + }
1.3096 + return (size_t) (p - src);
1.3097 +}
1.3098 +
1.3099 +/*
1.3100 + *-------------------------------------------------------------------------
1.3101 + *
1.3102 + * TclFindEncodings --
1.3103 + *
1.3104 + * Find and load the encoding file for this operating system.
1.3105 + * Before this is called, Tcl makes assumptions about the
1.3106 + * native string representation, but the true encoding is not
1.3107 + * assured. The work is done only while encodingsInitialized is 0.
1.3108 + *
1.3109 + * Results:
1.3110 + * Return result of TclpInitLibraryPath, which reports whether the
1.3111 + * path is clean (0) or dirty (1) UTF. Returns 0 when the encodings were already initialized.
1.3112 + *
1.3113 + * Side effects:
1.3114 + * Sets encodingsInitialized and may replace the library path; otherwise varied, see the respective initialization routines.
1.3115 + *
1.3116 + *-------------------------------------------------------------------------
1.3117 + */
1.3118 +
1.3119 +static int
1.3120 +TclFindEncodings(argv0)
1.3121 + CONST char *argv0; /* Name of executable from argv[0] to main()
1.3122 + * in native multi-byte encoding. */
1.3123 +{
1.3124 + int mustCleanUtf = 0; /* Stays 0 unless this call performs the initialization. */
1.3125 +
1.3126 + if (encodingsInitialized == 0) {
1.3127 + /*
1.3128 + * Double check inside the mutex: the flag is tested again after TclpInitLock(). There may be calls
1.3129 + * back into this routine from some of the procedures below.
1.3130 + */
1.3131 +
1.3132 + TclpInitLock();
1.3133 + if (encodingsInitialized == 0) {
1.3134 + char *native;
1.3135 + Tcl_Obj *pathPtr;
1.3136 + Tcl_DString libPath, buffer; /* Both are filled and freed only when the path needs cleaning. */
1.3137 +
1.3138 + /*
1.3139 + * Have to set this bit here to avoid deadlock with the
1.3140 + * routines below us that call into TclInitSubsystems.
1.3141 + */
1.3142 +
1.3143 + encodingsInitialized = 1;
1.3144 +
1.3145 + native = TclpFindExecutable(argv0);
1.3146 + mustCleanUtf = TclpInitLibraryPath(native);
1.3147 +
1.3148 + /*
1.3149 + * The library path was set in the TclpInitLibraryPath routine.
1.3150 + * The string set is a dirty UTF string. To preserve the value
1.3151 + * convert the UTF string back to native before setting the new
1.3152 + * default encoding.
1.3153 + */
1.3154 +
1.3155 + pathPtr = TclGetLibraryPath();
1.3156 + if ((pathPtr != NULL) && mustCleanUtf) {
1.3157 + Tcl_UtfToExternalDString(NULL, Tcl_GetString(pathPtr), -1,
1.3158 + &libPath);
1.3159 + }
1.3160 +
1.3161 + TclpSetInitialEncodings();
1.3162 +
1.3163 + /*
1.3164 + * Now convert the native string back to UTF and install it as the library path. Same condition as above, so libPath is initialized here.
1.3165 + */
1.3166 +
1.3167 + if ((pathPtr != NULL) && mustCleanUtf) {
1.3168 + Tcl_ExternalToUtfDString(NULL, Tcl_DStringValue(&libPath), -1,
1.3169 + &buffer);
1.3170 + pathPtr = Tcl_NewStringObj(Tcl_DStringValue(&buffer), -1);
1.3171 + TclSetLibraryPath(pathPtr);
1.3172 +
1.3173 + Tcl_DStringFree(&libPath);
1.3174 + Tcl_DStringFree(&buffer);
1.3175 + }
1.3176 + }
1.3177 + TclpInitUnlock(); /* Pairs with the TclpInitLock() above, whichever way the inner test went. */
1.3178 + }
1.3179 +
1.3180 + return mustCleanUtf;
1.3181 +}