os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/generic/tclUtf.c
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
sl@0
     1
/*
sl@0
     2
 * tclUtf.c --
sl@0
     3
 *
sl@0
     4
 *	Routines for manipulating UTF-8 strings.
sl@0
     5
 *
sl@0
     6
 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
sl@0
     7
 * Portions Copyright (c) 2007-2008 Nokia Corporation and/or its subsidiaries. All rights reserved.  
sl@0
     8
 *
sl@0
     9
 * See the file "license.terms" for information on usage and redistribution
sl@0
    10
 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
sl@0
    11
 *
sl@0
    12
 * RCS: @(#) $Id: tclUtf.c,v 1.30.2.3 2005/09/07 14:35:56 dgp Exp $
sl@0
    13
 */
sl@0
    14
sl@0
    15
#include "tclInt.h"
sl@0
    16
sl@0
    17
/*
sl@0
    18
 * Include the static character classification tables and macros.
sl@0
    19
 */
sl@0
    20
sl@0
    21
#include "tclUniData.c"
sl@0
    22
sl@0
    23
/*
sl@0
    24
 * The following macros are used for fast character category tests.  The
sl@0
    25
 * x_BITS values are shifted right by the category value to determine whether
sl@0
    26
 * the given category is included in the set.
sl@0
    27
 */ 
sl@0
    28
sl@0
    29
#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
sl@0
    30
    | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
sl@0
    31
sl@0
    32
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
sl@0
    33
sl@0
    34
#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
sl@0
    35
    | (1 << PARAGRAPH_SEPARATOR))
sl@0
    36
sl@0
    37
#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
sl@0
    38
sl@0
    39
#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
sl@0
    40
	    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
sl@0
    41
	    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
sl@0
    42
	    (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
sl@0
    43
	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
sl@0
    44
	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
sl@0
    45
	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
sl@0
    46
	    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
sl@0
    47
	    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
sl@0
    48
sl@0
    49
#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
sl@0
    50
	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
sl@0
    51
	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
sl@0
    52
	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
sl@0
    53
sl@0
    54
/*
sl@0
    55
 * Unicode characters less than this value are represented by themselves 
sl@0
    56
 * in UTF-8 strings. 
sl@0
    57
 */
sl@0
    58
sl@0
    59
#define UNICODE_SELF	0x80
sl@0
    60
sl@0
    61
/*
sl@0
    62
 * The following structures are used when mapping between Unicode (UCS-2)
sl@0
    63
 * and UTF-8.
sl@0
    64
 */
sl@0
    65
sl@0
    66
static CONST unsigned char totalBytes[256] = {
sl@0
    67
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0
    68
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0
    69
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0
    70
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0
    71
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0
    72
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0
    73
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
sl@0
    74
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
sl@0
    75
#if TCL_UTF_MAX > 3
sl@0
    76
    4,4,4,4,4,4,4,4,
sl@0
    77
#else
sl@0
    78
    1,1,1,1,1,1,1,1,
sl@0
    79
#endif
sl@0
    80
#if TCL_UTF_MAX > 4
sl@0
    81
    5,5,5,5,
sl@0
    82
#else
sl@0
    83
    1,1,1,1,
sl@0
    84
#endif
sl@0
    85
#if TCL_UTF_MAX > 5
sl@0
    86
    6,6,6,6
sl@0
    87
#else
sl@0
    88
    1,1,1,1
sl@0
    89
#endif
sl@0
    90
};
sl@0
    91
sl@0
    92
/*
sl@0
    93
 * Procedures used only in this module.
sl@0
    94
 */
sl@0
    95
sl@0
    96
static int UtfCount _ANSI_ARGS_((int ch));
sl@0
    97
sl@0
    98

sl@0
    99
/*
sl@0
   100
 *---------------------------------------------------------------------------
sl@0
   101
 *
sl@0
   102
 * UtfCount --
sl@0
   103
 *
sl@0
   104
 *	Find the number of bytes in the Utf character "ch".
sl@0
   105
 *
sl@0
   106
 * Results:
sl@0
   107
 *	The return values is the number of bytes in the Utf character "ch".
sl@0
   108
 *
sl@0
   109
 * Side effects:
sl@0
   110
 *	None.
sl@0
   111
 *
sl@0
   112
 *---------------------------------------------------------------------------
sl@0
   113
 */
sl@0
   114
 
sl@0
   115
INLINE static int
sl@0
   116
UtfCount(ch)
sl@0
   117
    int ch;			/* The Tcl_UniChar whose size is returned. */
sl@0
   118
{
sl@0
   119
    if ((ch > 0) && (ch < UNICODE_SELF)) {
sl@0
   120
	return 1;
sl@0
   121
    }
sl@0
   122
    if (ch <= 0x7FF) {
sl@0
   123
	return 2;
sl@0
   124
    }
sl@0
   125
    if (ch <= 0xFFFF) {
sl@0
   126
	return 3;
sl@0
   127
    }
sl@0
   128
#if TCL_UTF_MAX > 3
sl@0
   129
    if (ch <= 0x1FFFFF) {
sl@0
   130
	return 4;
sl@0
   131
    }
sl@0
   132
    if (ch <= 0x3FFFFFF) {
sl@0
   133
	return 5;
sl@0
   134
    }
sl@0
   135
    if (ch <= 0x7FFFFFFF) {
sl@0
   136
	return 6;
sl@0
   137
    }
sl@0
   138
#endif
sl@0
   139
    return 3;
sl@0
   140
}
sl@0
   141
sl@0
   142
/*
sl@0
   143
 *---------------------------------------------------------------------------
sl@0
   144
 *
sl@0
   145
 * Tcl_UniCharToUtf --
sl@0
   146
 *
sl@0
   147
 *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
sl@0
   148
 *	provided buffer.  Equivalent to Plan 9 runetochar().
sl@0
   149
 *
sl@0
   150
 * Results:
sl@0
   151
 *	The return values is the number of bytes in the buffer that
sl@0
   152
 *	were consumed.  
sl@0
   153
 *
sl@0
   154
 * Side effects:
sl@0
   155
 *	None.
sl@0
   156
 *
sl@0
   157
 *---------------------------------------------------------------------------
sl@0
   158
 */
sl@0
   159
 
sl@0
   160
EXPORT_C INLINE int
sl@0
   161
Tcl_UniCharToUtf(ch, str)
sl@0
   162
    int ch;			/* The Tcl_UniChar to be stored in the
sl@0
   163
				 * buffer. */
sl@0
   164
    char *str;			/* Buffer in which the UTF-8 representation
sl@0
   165
				 * of the Tcl_UniChar is stored.  Buffer must
sl@0
   166
				 * be large enough to hold the UTF-8 character
sl@0
   167
				 * (at most TCL_UTF_MAX bytes). */
sl@0
   168
{
sl@0
   169
    if ((ch > 0) && (ch < UNICODE_SELF)) {
sl@0
   170
	str[0] = (char) ch;
sl@0
   171
	return 1;
sl@0
   172
    }
sl@0
   173
    if (ch >= 0) {
sl@0
   174
	if (ch <= 0x7FF) {
sl@0
   175
	    str[1] = (char) ((ch | 0x80) & 0xBF);
sl@0
   176
	    str[0] = (char) ((ch >> 6) | 0xC0);
sl@0
   177
	    return 2;
sl@0
   178
	}
sl@0
   179
	if (ch <= 0xFFFF) {
sl@0
   180
	three:
sl@0
   181
	    str[2] = (char) ((ch | 0x80) & 0xBF);
sl@0
   182
	    str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
sl@0
   183
	    str[0] = (char) ((ch >> 12) | 0xE0);
sl@0
   184
	    return 3;
sl@0
   185
	}
sl@0
   186
sl@0
   187
#if TCL_UTF_MAX > 3
sl@0
   188
	if (ch <= 0x1FFFFF) {
sl@0
   189
	    str[3] = (char) ((ch | 0x80) & 0xBF);
sl@0
   190
	    str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
sl@0
   191
	    str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
sl@0
   192
	    str[0] = (char) ((ch >> 18) | 0xF0);
sl@0
   193
	    return 4;
sl@0
   194
	}
sl@0
   195
	if (ch <= 0x3FFFFFF) {
sl@0
   196
	    str[4] = (char) ((ch | 0x80) & 0xBF);
sl@0
   197
	    str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
sl@0
   198
	    str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
sl@0
   199
	    str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
sl@0
   200
	    str[0] = (char) ((ch >> 24) | 0xF8);
sl@0
   201
	    return 5;
sl@0
   202
	}
sl@0
   203
	if (ch <= 0x7FFFFFFF) {
sl@0
   204
	    str[5] = (char) ((ch | 0x80) & 0xBF);
sl@0
   205
	    str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
sl@0
   206
	    str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
sl@0
   207
	    str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
sl@0
   208
	    str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
sl@0
   209
	    str[0] = (char) ((ch >> 30) | 0xFC);
sl@0
   210
	    return 6;
sl@0
   211
	}
sl@0
   212
#endif
sl@0
   213
    }
sl@0
   214
sl@0
   215
    ch = 0xFFFD;
sl@0
   216
    goto three;
sl@0
   217
}
sl@0
   218

sl@0
   219
/*
sl@0
   220
 *---------------------------------------------------------------------------
sl@0
   221
 *
sl@0
   222
 * Tcl_UniCharToUtfDString --
sl@0
   223
 *
sl@0
   224
 *	Convert the given Unicode string to UTF-8.
sl@0
   225
 *
sl@0
   226
 * Results:
sl@0
   227
 *	The return value is a pointer to the UTF-8 representation of the
sl@0
   228
 *	Unicode string.  Storage for the return value is appended to the
sl@0
   229
 *	end of dsPtr.
sl@0
   230
 *
sl@0
   231
 * Side effects:
sl@0
   232
 *	None.
sl@0
   233
 *
sl@0
   234
 *---------------------------------------------------------------------------
sl@0
   235
 */
sl@0
   236
 
sl@0
   237
EXPORT_C char *
sl@0
   238
Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
sl@0
   239
    CONST Tcl_UniChar *wString;	/* Unicode string to convert to UTF-8. */
sl@0
   240
    int numChars;		/* Length of Unicode string in Tcl_UniChars
sl@0
   241
				 * (must be >= 0). */
sl@0
   242
    Tcl_DString *dsPtr;		/* UTF-8 representation of string is
sl@0
   243
				 * appended to this previously initialized
sl@0
   244
				 * DString. */
sl@0
   245
{
sl@0
   246
    CONST Tcl_UniChar *w, *wEnd;
sl@0
   247
    char *p, *string;
sl@0
   248
    int oldLength;
sl@0
   249
sl@0
   250
    /*
sl@0
   251
     * UTF-8 string length in bytes will be <= Unicode string length *
sl@0
   252
     * TCL_UTF_MAX.
sl@0
   253
     */
sl@0
   254
sl@0
   255
    oldLength = Tcl_DStringLength(dsPtr);
sl@0
   256
    Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
sl@0
   257
    string = Tcl_DStringValue(dsPtr) + oldLength;
sl@0
   258
sl@0
   259
    p = string;
sl@0
   260
    wEnd = wString + numChars;
sl@0
   261
    for (w = wString; w < wEnd; ) {
sl@0
   262
	p += Tcl_UniCharToUtf(*w, p);
sl@0
   263
	w++;
sl@0
   264
    }
sl@0
   265
    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
sl@0
   266
sl@0
   267
    return string;
sl@0
   268
}
sl@0
   269

sl@0
   270
/*
sl@0
   271
 *---------------------------------------------------------------------------
sl@0
   272
 *
sl@0
   273
 * Tcl_UtfToUniChar --
sl@0
   274
 *
sl@0
   275
 *	Extract the Tcl_UniChar represented by the UTF-8 string.  Bad
sl@0
   276
 *	UTF-8 sequences are converted to valid Tcl_UniChars and processing
sl@0
   277
 *	continues.  Equivalent to Plan 9 chartorune().
sl@0
   278
 *
sl@0
   279
 *	The caller must ensure that the source buffer is long enough that
sl@0
   280
 *	this routine does not run off the end and dereference non-existent
sl@0
   281
 *	memory looking for trail bytes.  If the source buffer is known to
sl@0
   282
 *	be '\0' terminated, this cannot happen.  Otherwise, the caller
sl@0
   283
 *	should call Tcl_UtfCharComplete() before calling this routine to
sl@0
   284
 *	ensure that enough bytes remain in the string.
sl@0
   285
 *
sl@0
   286
 * Results:
sl@0
   287
 *	*chPtr is filled with the Tcl_UniChar, and the return value is the
sl@0
   288
 *	number of bytes from the UTF-8 string that were consumed.
sl@0
   289
 *
sl@0
   290
 * Side effects:
sl@0
   291
 *	None.
sl@0
   292
 *
sl@0
   293
 *---------------------------------------------------------------------------
sl@0
   294
 */
sl@0
   295
 
sl@0
   296
EXPORT_C int
sl@0
   297
Tcl_UtfToUniChar(str, chPtr)
sl@0
   298
    register CONST char *str;	 /* The UTF-8 string. */
sl@0
   299
    register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
sl@0
   300
				  * by the UTF-8 string. */
sl@0
   301
{
sl@0
   302
    register int byte;
sl@0
   303
    
sl@0
   304
    /*
sl@0
   305
     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
sl@0
   306
     */
sl@0
   307
sl@0
   308
    byte = *((unsigned char *) str);
sl@0
   309
    if (byte < 0xC0) {
sl@0
   310
	/*
sl@0
   311
	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
sl@0
   312
	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
sl@0
   313
	 * characters representing themselves.
sl@0
   314
	 */
sl@0
   315
sl@0
   316
	*chPtr = (Tcl_UniChar) byte;
sl@0
   317
	return 1;
sl@0
   318
    } else if (byte < 0xE0) {
sl@0
   319
	if ((str[1] & 0xC0) == 0x80) {
sl@0
   320
	    /*
sl@0
   321
	     * Two-byte-character lead-byte followed by a trail-byte.
sl@0
   322
	     */
sl@0
   323
sl@0
   324
	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
sl@0
   325
	    return 2;
sl@0
   326
	}
sl@0
   327
	/*
sl@0
   328
	 * A two-byte-character lead-byte not followed by trail-byte
sl@0
   329
	 * represents itself.
sl@0
   330
	 */
sl@0
   331
sl@0
   332
	*chPtr = (Tcl_UniChar) byte;
sl@0
   333
	return 1;
sl@0
   334
    } else if (byte < 0xF0) {
sl@0
   335
	if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
sl@0
   336
	    /*
sl@0
   337
	     * Three-byte-character lead byte followed by two trail bytes.
sl@0
   338
	     */
sl@0
   339
sl@0
   340
	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) 
sl@0
   341
		    | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
sl@0
   342
	    return 3;
sl@0
   343
	}
sl@0
   344
	/*
sl@0
   345
	 * A three-byte-character lead-byte not followed by two trail-bytes
sl@0
   346
	 * represents itself.
sl@0
   347
	 */
sl@0
   348
sl@0
   349
	*chPtr = (Tcl_UniChar) byte;
sl@0
   350
	return 1;
sl@0
   351
    }
sl@0
   352
#if TCL_UTF_MAX > 3
sl@0
   353
    else {
sl@0
   354
	int ch, total, trail;
sl@0
   355
sl@0
   356
	total = totalBytes[byte];
sl@0
   357
	trail = total - 1;
sl@0
   358
	if (trail > 0) {
sl@0
   359
	    ch = byte & (0x3F >> trail);
sl@0
   360
	    do {
sl@0
   361
		str++;
sl@0
   362
		if ((*str & 0xC0) != 0x80) {
sl@0
   363
		    *chPtr = byte;
sl@0
   364
		    return 1;
sl@0
   365
		}
sl@0
   366
		ch <<= 6;
sl@0
   367
		ch |= (*str & 0x3F);
sl@0
   368
		trail--;
sl@0
   369
	    } while (trail > 0);
sl@0
   370
	    *chPtr = ch;
sl@0
   371
	    return total;
sl@0
   372
	}
sl@0
   373
    }
sl@0
   374
#endif
sl@0
   375
sl@0
   376
    *chPtr = (Tcl_UniChar) byte;
sl@0
   377
    return 1;
sl@0
   378
}
sl@0
   379

sl@0
   380
/*
sl@0
   381
 *---------------------------------------------------------------------------
sl@0
   382
 *
sl@0
   383
 * Tcl_UtfToUniCharDString --
sl@0
   384
 *
sl@0
   385
 *	Convert the UTF-8 string to Unicode.
sl@0
   386
 *
sl@0
   387
 * Results:
sl@0
   388
 *	The return value is a pointer to the Unicode representation of the
sl@0
   389
 *	UTF-8 string.  Storage for the return value is appended to the
sl@0
   390
 *	end of dsPtr.  The Unicode string is terminated with a Unicode
sl@0
   391
 *	NULL character.
sl@0
   392
 *
sl@0
   393
 * Side effects:
sl@0
   394
 *	None.
sl@0
   395
 *
sl@0
   396
 *---------------------------------------------------------------------------
sl@0
   397
 */
sl@0
   398
sl@0
   399
EXPORT_C Tcl_UniChar *
sl@0
   400
Tcl_UtfToUniCharDString(string, length, dsPtr)
sl@0
   401
    CONST char *string;		/* UTF-8 string to convert to Unicode. */
sl@0
   402
    int length;			/* Length of UTF-8 string in bytes, or -1
sl@0
   403
				 * for strlen(). */
sl@0
   404
    Tcl_DString *dsPtr;		/* Unicode representation of string is
sl@0
   405
				 * appended to this previously initialized
sl@0
   406
				 * DString. */
sl@0
   407
{
sl@0
   408
    Tcl_UniChar *w, *wString;
sl@0
   409
    CONST char *p, *end;
sl@0
   410
    int oldLength;
sl@0
   411
sl@0
   412
    if (length < 0) {
sl@0
   413
	length = strlen(string);
sl@0
   414
    }
sl@0
   415
sl@0
   416
    /*
sl@0
   417
     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
sl@0
   418
     * in bytes.
sl@0
   419
     */
sl@0
   420
sl@0
   421
    oldLength = Tcl_DStringLength(dsPtr);
sl@0
   422
    Tcl_DStringSetLength(dsPtr,
sl@0
   423
	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
sl@0
   424
    wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
sl@0
   425
sl@0
   426
    w = wString;
sl@0
   427
    end = string + length;
sl@0
   428
    for (p = string; p < end; ) {
sl@0
   429
	p += TclUtfToUniChar(p, w);
sl@0
   430
	w++;
sl@0
   431
    }
sl@0
   432
    *w = '\0';
sl@0
   433
    Tcl_DStringSetLength(dsPtr,
sl@0
   434
	    (oldLength + ((char *) w - (char *) wString)));
sl@0
   435
sl@0
   436
    return wString;
sl@0
   437
}
sl@0
   438

sl@0
   439
/*
sl@0
   440
 *---------------------------------------------------------------------------
sl@0
   441
 *
sl@0
   442
 * Tcl_UtfCharComplete --
sl@0
   443
 *
sl@0
   444
 *	Determine if the UTF-8 string of the given length is long enough
sl@0
   445
 *	to be decoded by Tcl_UtfToUniChar().  This does not ensure that the
sl@0
   446
 *	UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune().
sl@0
   447
 *
sl@0
   448
 * Results:
sl@0
   449
 *	The return value is 0 if the string is not long enough, non-zero
sl@0
   450
 *	otherwise.
sl@0
   451
 *
sl@0
   452
 * Side effects:
sl@0
   453
 *	None.
sl@0
   454
 *
sl@0
   455
 *---------------------------------------------------------------------------
sl@0
   456
 */
sl@0
   457
sl@0
   458
EXPORT_C int
sl@0
   459
Tcl_UtfCharComplete(str, len)
sl@0
   460
    CONST char *str;		/* String to check if first few bytes
sl@0
   461
				 * contain a complete UTF-8 character. */
sl@0
   462
    int len;			/* Length of above string in bytes. */
sl@0
   463
{
sl@0
   464
    int ch;
sl@0
   465
sl@0
   466
    ch = *((unsigned char *) str);
sl@0
   467
    return len >= totalBytes[ch];
sl@0
   468
}
sl@0
   469

sl@0
   470
/*
sl@0
   471
 *---------------------------------------------------------------------------
sl@0
   472
 *
sl@0
   473
 * Tcl_NumUtfChars --
sl@0
   474
 *
sl@0
   475
 *	Returns the number of characters (not bytes) in the UTF-8 string,
sl@0
   476
 *	not including the terminating NULL byte.  This is equivalent to
sl@0
   477
 *	Plan 9 utflen() and utfnlen().
sl@0
   478
 *
sl@0
   479
 * Results:
sl@0
   480
 *	As above.  
sl@0
   481
 *
sl@0
   482
 * Side effects:
sl@0
   483
 *	None.
sl@0
   484
 *
sl@0
   485
 *---------------------------------------------------------------------------
sl@0
   486
 */
sl@0
   487
 
sl@0
   488
EXPORT_C int 
sl@0
   489
Tcl_NumUtfChars(str, len)
sl@0
   490
    register CONST char *str;	/* The UTF-8 string to measure. */
sl@0
   491
    int len;			/* The length of the string in bytes, or -1
sl@0
   492
				 * for strlen(string). */
sl@0
   493
{
sl@0
   494
    Tcl_UniChar ch;
sl@0
   495
    register Tcl_UniChar *chPtr = &ch;
sl@0
   496
    register int i;
sl@0
   497
sl@0
   498
    /*
sl@0
   499
     * The separate implementations are faster.
sl@0
   500
     *
sl@0
   501
     * Since this is a time-sensitive function, we also do the check for
sl@0
   502
     * the single-byte char case specially.
sl@0
   503
     */
sl@0
   504
sl@0
   505
    i = 0;
sl@0
   506
    if (len < 0) {
sl@0
   507
	while (*str != '\0') {
sl@0
   508
	    str += TclUtfToUniChar(str, chPtr);
sl@0
   509
	    i++;
sl@0
   510
	}
sl@0
   511
    } else {
sl@0
   512
	register int n;
sl@0
   513
sl@0
   514
	while (len > 0) {
sl@0
   515
	    if (UCHAR(*str) < 0xC0) {
sl@0
   516
		len--;
sl@0
   517
		str++;
sl@0
   518
	    } else {
sl@0
   519
		n = Tcl_UtfToUniChar(str, chPtr);
sl@0
   520
		len -= n;
sl@0
   521
		str += n;
sl@0
   522
	    }
sl@0
   523
	    i++;
sl@0
   524
	}
sl@0
   525
    }
sl@0
   526
    return i;
sl@0
   527
}
sl@0
   528

sl@0
   529
/*
sl@0
   530
 *---------------------------------------------------------------------------
sl@0
   531
 *
sl@0
   532
 * Tcl_UtfFindFirst --
sl@0
   533
 *
sl@0
   534
 *	Returns a pointer to the first occurance of the given Tcl_UniChar
sl@0
   535
 *	in the NULL-terminated UTF-8 string.  The NULL terminator is
sl@0
   536
 *	considered part of the UTF-8 string.  Equivalent to Plan 9
sl@0
   537
 *	utfrune().
sl@0
   538
 *
sl@0
   539
 * Results:
sl@0
   540
 *	As above.  If the Tcl_UniChar does not exist in the given string,
sl@0
   541
 *	the return value is NULL.
sl@0
   542
 *
sl@0
   543
 * Side effects:
sl@0
   544
 *	None.
sl@0
   545
 *
sl@0
   546
 *---------------------------------------------------------------------------
sl@0
   547
 */
sl@0
   548
EXPORT_C CONST char *
sl@0
   549
Tcl_UtfFindFirst(string, ch)
sl@0
   550
    CONST char *string;		/* The UTF-8 string to be searched. */
sl@0
   551
    int ch;			/* The Tcl_UniChar to search for. */
sl@0
   552
{
sl@0
   553
    int len;
sl@0
   554
    Tcl_UniChar find;
sl@0
   555
    
sl@0
   556
    while (1) {
sl@0
   557
	len = TclUtfToUniChar(string, &find);
sl@0
   558
	if (find == ch) {
sl@0
   559
	    return string;
sl@0
   560
	}
sl@0
   561
	if (*string == '\0') {
sl@0
   562
	    return NULL;
sl@0
   563
	}
sl@0
   564
	string += len;
sl@0
   565
    }
sl@0
   566
}
sl@0
   567

sl@0
   568
/*
sl@0
   569
 *---------------------------------------------------------------------------
sl@0
   570
 *
sl@0
   571
 * Tcl_UtfFindLast --
sl@0
   572
 *
sl@0
   573
 *	Returns a pointer to the last occurance of the given Tcl_UniChar
sl@0
   574
 *	in the NULL-terminated UTF-8 string.  The NULL terminator is
sl@0
   575
 *	considered part of the UTF-8 string.  Equivalent to Plan 9
sl@0
   576
 *	utfrrune().
sl@0
   577
 *
sl@0
   578
 * Results:
sl@0
   579
 *	As above.  If the Tcl_UniChar does not exist in the given string,
sl@0
   580
 *	the return value is NULL.
sl@0
   581
 *
sl@0
   582
 * Side effects:
sl@0
   583
 *	None.
sl@0
   584
 *
sl@0
   585
 *---------------------------------------------------------------------------
sl@0
   586
 */
sl@0
   587
sl@0
   588
EXPORT_C CONST char *
sl@0
   589
Tcl_UtfFindLast(string, ch)
sl@0
   590
    CONST char *string;		/* The UTF-8 string to be searched. */
sl@0
   591
    int ch;			/* The Tcl_UniChar to search for. */
sl@0
   592
{
sl@0
   593
    int len;
sl@0
   594
    Tcl_UniChar find;
sl@0
   595
    CONST char *last;
sl@0
   596
	
sl@0
   597
    last = NULL;
sl@0
   598
    while (1) {
sl@0
   599
	len = TclUtfToUniChar(string, &find);
sl@0
   600
	if (find == ch) {
sl@0
   601
	    last = string;
sl@0
   602
	}
sl@0
   603
	if (*string == '\0') {
sl@0
   604
	    break;
sl@0
   605
	}
sl@0
   606
	string += len;
sl@0
   607
    }
sl@0
   608
    return last;
sl@0
   609
}
sl@0
   610

sl@0
   611
/*
sl@0
   612
 *---------------------------------------------------------------------------
sl@0
   613
 *
sl@0
   614
 * Tcl_UtfNext --
sl@0
   615
 *
sl@0
   616
 *	Given a pointer to some current location in a UTF-8 string,
sl@0
   617
 *	move forward one character.  The caller must ensure that they
sl@0
   618
 *	are not asking for the next character after the last character
sl@0
   619
 *	in the string.
sl@0
   620
 *
sl@0
   621
 * Results:
sl@0
   622
 *	The return value is the pointer to the next character in
sl@0
   623
 *	the UTF-8 string.
sl@0
   624
 *
sl@0
   625
 * Side effects:
sl@0
   626
 *	None.
sl@0
   627
 *
sl@0
   628
 *---------------------------------------------------------------------------
sl@0
   629
 */
sl@0
   630
 
sl@0
   631
EXPORT_C CONST char *
sl@0
   632
Tcl_UtfNext(str) 
sl@0
   633
    CONST char *str;		    /* The current location in the string. */
sl@0
   634
{
sl@0
   635
    Tcl_UniChar ch;
sl@0
   636
sl@0
   637
    return str + TclUtfToUniChar(str, &ch);
sl@0
   638
}
sl@0
   639

sl@0
   640
/*
sl@0
   641
 *---------------------------------------------------------------------------
sl@0
   642
 *
sl@0
   643
 * Tcl_UtfPrev --
sl@0
   644
 *
sl@0
   645
 *	Given a pointer to some current location in a UTF-8 string,
sl@0
   646
 *	move backwards one character.  This works correctly when the
sl@0
   647
 *	pointer is in the middle of a UTF-8 character.
sl@0
   648
 *
sl@0
   649
 * Results:
sl@0
   650
 *	The return value is a pointer to the previous character in the
sl@0
   651
 *	UTF-8 string.  If the current location was already at the
sl@0
   652
 *	beginning of the string, the return value will also be a
sl@0
   653
 *	pointer to the beginning of the string.
sl@0
   654
 *
sl@0
   655
 * Side effects:
sl@0
   656
 *	None.
sl@0
   657
 *
sl@0
   658
 *---------------------------------------------------------------------------
sl@0
   659
 */
sl@0
   660
sl@0
   661
EXPORT_C CONST char *
sl@0
   662
Tcl_UtfPrev(str, start)
sl@0
   663
    CONST char *str;		    /* The current location in the string. */
sl@0
   664
    CONST char *start;		    /* Pointer to the beginning of the
sl@0
   665
				     * string, to avoid going backwards too
sl@0
   666
				     * far. */
sl@0
   667
{
sl@0
   668
    CONST char *look;
sl@0
   669
    int i, byte;
sl@0
   670
    
sl@0
   671
    str--;
sl@0
   672
    look = str;
sl@0
   673
    for (i = 0; i < TCL_UTF_MAX; i++) {
sl@0
   674
	if (look < start) {
sl@0
   675
	    if (str < start) {
sl@0
   676
		str = start;
sl@0
   677
	    }
sl@0
   678
	    break;
sl@0
   679
	}
sl@0
   680
	byte = *((unsigned char *) look);
sl@0
   681
	if (byte < 0x80) {
sl@0
   682
	    break;
sl@0
   683
	}
sl@0
   684
	if (byte >= 0xC0) {
sl@0
   685
	    return look;
sl@0
   686
	}
sl@0
   687
	look--;
sl@0
   688
    }
sl@0
   689
    return str;
sl@0
   690
}
sl@0
   691
	
sl@0
   692
/*
sl@0
   693
 *---------------------------------------------------------------------------
sl@0
   694
 *
sl@0
   695
 * Tcl_UniCharAtIndex --
sl@0
   696
 *
sl@0
   697
 *	Returns the Unicode character represented at the specified
sl@0
   698
 *	character (not byte) position in the UTF-8 string.
sl@0
   699
 *
sl@0
   700
 * Results:
sl@0
   701
 *	As above.
sl@0
   702
 *
sl@0
   703
 * Side effects:
sl@0
   704
 *	None.
sl@0
   705
 *
sl@0
   706
 *---------------------------------------------------------------------------
sl@0
   707
 */
sl@0
   708
 
sl@0
   709
EXPORT_C Tcl_UniChar
sl@0
   710
Tcl_UniCharAtIndex(src, index)
sl@0
   711
    register CONST char *src;	/* The UTF-8 string to dereference. */
sl@0
   712
    register int index;		/* The position of the desired character. */
sl@0
   713
{
sl@0
   714
    Tcl_UniChar ch;
sl@0
   715
sl@0
   716
    while (index >= 0) {
sl@0
   717
	index--;
sl@0
   718
	src += TclUtfToUniChar(src, &ch);
sl@0
   719
    }
sl@0
   720
    return ch;
sl@0
   721
}
sl@0
   722

sl@0
   723
/*
sl@0
   724
 *---------------------------------------------------------------------------
sl@0
   725
 *
sl@0
   726
 * Tcl_UtfAtIndex --
sl@0
   727
 *
sl@0
   728
 *	Returns a pointer to the specified character (not byte) position
sl@0
   729
 *	in the UTF-8 string.
sl@0
   730
 *
sl@0
   731
 * Results:
sl@0
   732
 *	As above.
sl@0
   733
 *
sl@0
   734
 * Side effects:
sl@0
   735
 *	None.
sl@0
   736
 *
sl@0
   737
 *---------------------------------------------------------------------------
sl@0
   738
 */
sl@0
   739
sl@0
   740
EXPORT_C CONST char *
sl@0
   741
Tcl_UtfAtIndex(src, index)
sl@0
   742
    register CONST char *src;	/* The UTF-8 string. */
sl@0
   743
    register int index;		/* The position of the desired character. */
sl@0
   744
{
sl@0
   745
    Tcl_UniChar ch;
sl@0
   746
    
sl@0
   747
    while (index > 0) {
sl@0
   748
	index--;
sl@0
   749
	src += TclUtfToUniChar(src, &ch);
sl@0
   750
    }
sl@0
   751
    return src;
sl@0
   752
}
sl@0
   753

sl@0
   754
/*
sl@0
   755
 *---------------------------------------------------------------------------
sl@0
   756
 *
sl@0
   757
 * Tcl_UtfBackslash --
sl@0
   758
 *
sl@0
   759
 *	Figure out how to handle a backslash sequence.
sl@0
   760
 *
sl@0
   761
 * Results:
sl@0
   762
 *	Stores the bytes represented by the backslash sequence in dst and
sl@0
   763
 *	returns the number of bytes written to dst.  At most TCL_UTF_MAX
sl@0
   764
 *	bytes are written to dst; dst must have been large enough to accept
sl@0
   765
 *	those bytes.  If readPtr isn't NULL then it is filled in with a
sl@0
   766
 *	count of the number of bytes in the backslash sequence.  
sl@0
   767
 *
sl@0
   768
 * Side effects:
sl@0
   769
 *	The maximum number of bytes it takes to represent a Unicode
sl@0
   770
 *	character in UTF-8 is guaranteed to be less than the number of
sl@0
   771
 *	bytes used to express the backslash sequence that represents
sl@0
   772
 *	that Unicode character.  If the target buffer into which the
sl@0
   773
 *	caller is going to store the bytes that represent the Unicode
sl@0
   774
 *	character is at least as large as the source buffer from which
sl@0
   775
 *	the backslashed sequence was extracted, no buffer overruns should
sl@0
   776
 *	occur.
sl@0
   777
 *
sl@0
   778
 *---------------------------------------------------------------------------
sl@0
   779
 */
sl@0
   780
sl@0
   781
EXPORT_C int
sl@0
   782
Tcl_UtfBackslash(src, readPtr, dst)
sl@0
   783
    CONST char *src;		/* Points to the backslash character of
sl@0
   784
				 * a backslash sequence. */
sl@0
   785
    int *readPtr;		/* Fill in with number of characters read
sl@0
   786
				 * from src, unless NULL. */
sl@0
   787
    char *dst;			/* Filled with the bytes represented by the
sl@0
   788
				 * backslash sequence. */
sl@0
   789
{
sl@0
   790
#define LINE_LENGTH 128
sl@0
   791
    int numRead;
sl@0
   792
    int result;
sl@0
   793
sl@0
   794
    result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
sl@0
   795
    if (numRead == LINE_LENGTH) {
sl@0
   796
	/* We ate a whole line.  Pay the price of a strlen() */
sl@0
   797
	result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
sl@0
   798
    }
sl@0
   799
    if (readPtr != NULL) {
sl@0
   800
	*readPtr = numRead;
sl@0
   801
    }
sl@0
   802
    return result;
sl@0
   803
}
sl@0
   804

sl@0
   805
/*
sl@0
   806
 *----------------------------------------------------------------------
sl@0
   807
 *
sl@0
   808
 * Tcl_UtfToUpper --
sl@0
   809
 *
sl@0
   810
 *	Convert lowercase characters to uppercase characters in a UTF
sl@0
   811
 *	string in place.  The conversion may shrink the UTF string.
sl@0
   812
 *
sl@0
   813
 * Results:
sl@0
   814
 *	Returns the number of bytes in the resulting string
sl@0
   815
 *	excluding the trailing null.
sl@0
   816
 *
sl@0
   817
 * Side effects:
sl@0
   818
 *	Writes a terminating null after the last converted character.
sl@0
   819
 *
sl@0
   820
 *----------------------------------------------------------------------
sl@0
   821
 */
sl@0
   822
sl@0
   823
EXPORT_C int
sl@0
   824
Tcl_UtfToUpper(str)
sl@0
   825
    char *str;			/* String to convert in place. */
sl@0
   826
{
sl@0
   827
    Tcl_UniChar ch, upChar;
sl@0
   828
    char *src, *dst;
sl@0
   829
    int bytes;
sl@0
   830
sl@0
   831
    /*
sl@0
   832
     * Iterate over the string until we hit the terminating null.
sl@0
   833
     */
sl@0
   834
sl@0
   835
    src = dst = str;
sl@0
   836
    while (*src) {
sl@0
   837
        bytes = TclUtfToUniChar(src, &ch);
sl@0
   838
	upChar = Tcl_UniCharToUpper(ch);
sl@0
   839
sl@0
   840
	/*
sl@0
   841
	 * To keep badly formed Utf strings from getting inflated by
sl@0
   842
	 * the conversion (thereby causing a segfault), only copy the
sl@0
   843
	 * upper case char to dst if its size is <= the original char.
sl@0
   844
	 */
sl@0
   845
	
sl@0
   846
	if (bytes < UtfCount(upChar)) {
sl@0
   847
	    memcpy(dst, src, (size_t) bytes);
sl@0
   848
	    dst += bytes;
sl@0
   849
	} else {
sl@0
   850
	    dst += Tcl_UniCharToUtf(upChar, dst);
sl@0
   851
	}
sl@0
   852
	src += bytes;
sl@0
   853
    }
sl@0
   854
    *dst = '\0';
sl@0
   855
    return (dst - str);
sl@0
   856
}
sl@0
   857

sl@0
   858
/*
sl@0
   859
 *----------------------------------------------------------------------
sl@0
   860
 *
sl@0
   861
 * Tcl_UtfToLower --
sl@0
   862
 *
sl@0
   863
 *	Convert uppercase characters to lowercase characters in a UTF
sl@0
   864
 *	string in place.  The conversion may shrink the UTF string.
sl@0
   865
 *
sl@0
   866
 * Results:
sl@0
   867
 *	Returns the number of bytes in the resulting string
sl@0
   868
 *	excluding the trailing null.
sl@0
   869
 *
sl@0
   870
 * Side effects:
sl@0
   871
 *	Writes a terminating null after the last converted character.
sl@0
   872
 *
sl@0
   873
 *----------------------------------------------------------------------
sl@0
   874
 */
sl@0
   875
sl@0
   876
EXPORT_C int
sl@0
   877
Tcl_UtfToLower(str)
sl@0
   878
    char *str;			/* String to convert in place. */
sl@0
   879
{
sl@0
   880
    Tcl_UniChar ch, lowChar;
sl@0
   881
    char *src, *dst;
sl@0
   882
    int bytes;
sl@0
   883
    
sl@0
   884
    /*
sl@0
   885
     * Iterate over the string until we hit the terminating null.
sl@0
   886
     */
sl@0
   887
sl@0
   888
    src = dst = str;
sl@0
   889
    while (*src) {
sl@0
   890
	bytes = TclUtfToUniChar(src, &ch);
sl@0
   891
	lowChar = Tcl_UniCharToLower(ch);
sl@0
   892
sl@0
   893
	/*
sl@0
   894
	 * To keep badly formed Utf strings from getting inflated by
sl@0
   895
	 * the conversion (thereby causing a segfault), only copy the
sl@0
   896
	 * lower case char to dst if its size is <= the original char.
sl@0
   897
	 */
sl@0
   898
	
sl@0
   899
	if (bytes < UtfCount(lowChar)) {
sl@0
   900
	    memcpy(dst, src, (size_t) bytes);
sl@0
   901
	    dst += bytes;
sl@0
   902
	} else {
sl@0
   903
	    dst += Tcl_UniCharToUtf(lowChar, dst);
sl@0
   904
	}
sl@0
   905
	src += bytes;
sl@0
   906
    }
sl@0
   907
    *dst = '\0';
sl@0
   908
    return (dst - str);
sl@0
   909
}
sl@0
   910

sl@0
   911
/*
sl@0
   912
 *----------------------------------------------------------------------
sl@0
   913
 *
sl@0
   914
 * Tcl_UtfToTitle --
sl@0
   915
 *
sl@0
   916
 *	Changes the first character of a UTF string to title case or
sl@0
   917
 *	uppercase and the rest of the string to lowercase.  The
sl@0
   918
 *	conversion happens in place and may shrink the UTF string.
sl@0
   919
 *
sl@0
   920
 * Results:
sl@0
   921
 *	Returns the number of bytes in the resulting string
sl@0
   922
 *	excluding the trailing null.
sl@0
   923
 *
sl@0
   924
 * Side effects:
sl@0
   925
 *	Writes a terminating null after the last converted character.
sl@0
   926
 *
sl@0
   927
 *----------------------------------------------------------------------
sl@0
   928
 */
sl@0
   929
sl@0
   930
EXPORT_C int
sl@0
   931
Tcl_UtfToTitle(str)
sl@0
   932
    char *str;			/* String to convert in place. */
sl@0
   933
{
sl@0
   934
    Tcl_UniChar ch, titleChar, lowChar;
sl@0
   935
    char *src, *dst;
sl@0
   936
    int bytes;
sl@0
   937
    
sl@0
   938
    /*
sl@0
   939
     * Capitalize the first character and then lowercase the rest of the
sl@0
   940
     * characters until we get to a null.
sl@0
   941
     */
sl@0
   942
sl@0
   943
    src = dst = str;
sl@0
   944
sl@0
   945
    if (*src) {
sl@0
   946
	bytes = TclUtfToUniChar(src, &ch);
sl@0
   947
	titleChar = Tcl_UniCharToTitle(ch);
sl@0
   948
sl@0
   949
	if (bytes < UtfCount(titleChar)) {
sl@0
   950
	    memcpy(dst, src, (size_t) bytes);
sl@0
   951
	    dst += bytes;
sl@0
   952
	} else {
sl@0
   953
	    dst += Tcl_UniCharToUtf(titleChar, dst);
sl@0
   954
	}
sl@0
   955
	src += bytes;
sl@0
   956
    }
sl@0
   957
    while (*src) {
sl@0
   958
	bytes = TclUtfToUniChar(src, &ch);
sl@0
   959
	lowChar = Tcl_UniCharToLower(ch);
sl@0
   960
sl@0
   961
	if (bytes < UtfCount(lowChar)) {
sl@0
   962
	    memcpy(dst, src, (size_t) bytes);
sl@0
   963
	    dst += bytes;
sl@0
   964
	} else {
sl@0
   965
	    dst += Tcl_UniCharToUtf(lowChar, dst);
sl@0
   966
	}
sl@0
   967
	src += bytes;
sl@0
   968
    }
sl@0
   969
    *dst = '\0';
sl@0
   970
    return (dst - str);
sl@0
   971
}
sl@0
   972

sl@0
   973
/*
sl@0
   974
 *----------------------------------------------------------------------
sl@0
   975
 *
sl@0
   976
 * TclpUtfNcmp2 --
sl@0
   977
 *
sl@0
   978
 *	Compare at most n bytes of utf-8 strings cs and ct.  Both cs
sl@0
   979
 *	and ct are assumed to be at least n bytes long.
sl@0
   980
 *
sl@0
   981
 * Results:
sl@0
   982
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0
   983
 *
sl@0
   984
 * Side effects:
sl@0
   985
 *	None.
sl@0
   986
 *
sl@0
   987
 *----------------------------------------------------------------------
sl@0
   988
 */
sl@0
   989
sl@0
   990
int
sl@0
   991
TclpUtfNcmp2(cs, ct, n)
sl@0
   992
    CONST char *cs;		/* UTF string to compare to ct. */
sl@0
   993
    CONST char *ct;		/* UTF string cs is compared to. */
sl@0
   994
    unsigned long n;		/* Number of *bytes* to compare. */
sl@0
   995
{
sl@0
   996
    /*
sl@0
   997
     * We can't simply call 'memcmp(cs, ct, n);' because we need to check
sl@0
   998
     * for Tcl's \xC0\x80 non-utf-8 null encoding.
sl@0
   999
     * Otherwise utf-8 lexes fine in the strcmp manner.
sl@0
  1000
     */
sl@0
  1001
    register int result = 0;
sl@0
  1002
sl@0
  1003
    for ( ; n != 0; n--, cs++, ct++) {
sl@0
  1004
	if (*cs != *ct) {
sl@0
  1005
	    result = UCHAR(*cs) - UCHAR(*ct);
sl@0
  1006
	    break;
sl@0
  1007
	}
sl@0
  1008
    }
sl@0
  1009
    if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
sl@0
  1010
	unsigned char c1, c2;
sl@0
  1011
	c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
sl@0
  1012
	c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
sl@0
  1013
	result = (c1 - c2);
sl@0
  1014
    }
sl@0
  1015
    return result;
sl@0
  1016
}
sl@0
  1017

sl@0
  1018
/*
sl@0
  1019
 *----------------------------------------------------------------------
sl@0
  1020
 *
sl@0
  1021
 * Tcl_UtfNcmp --
sl@0
  1022
 *
sl@0
  1023
 *	Compare at most n UTF chars of string cs to string ct.  Both cs
sl@0
  1024
 *	and ct are assumed to be at least n UTF chars long.
sl@0
  1025
 *
sl@0
  1026
 * Results:
sl@0
  1027
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0
  1028
 *
sl@0
  1029
 * Side effects:
sl@0
  1030
 *	None.
sl@0
  1031
 *
sl@0
  1032
 *----------------------------------------------------------------------
sl@0
  1033
 */
sl@0
  1034
sl@0
  1035
EXPORT_C int
sl@0
  1036
Tcl_UtfNcmp(cs, ct, n)
sl@0
  1037
    CONST char *cs;		/* UTF string to compare to ct. */
sl@0
  1038
    CONST char *ct;		/* UTF string cs is compared to. */
sl@0
  1039
    unsigned long n;		/* Number of UTF chars to compare. */
sl@0
  1040
{
sl@0
  1041
    Tcl_UniChar ch1, ch2;
sl@0
  1042
    /*
sl@0
  1043
     * Cannot use 'memcmp(cs, ct, n);' as byte representation of
sl@0
  1044
     * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
sl@0
  1045
     * representation of \u0001 (the byte 0x01.)
sl@0
  1046
     */
sl@0
  1047
    while (n-- > 0) {
sl@0
  1048
	/*
sl@0
  1049
	 * n must be interpreted as chars, not bytes.
sl@0
  1050
	 * This should be called only when both strings are of
sl@0
  1051
	 * at least n chars long (no need for \0 check)
sl@0
  1052
	 */
sl@0
  1053
	cs += TclUtfToUniChar(cs, &ch1);
sl@0
  1054
	ct += TclUtfToUniChar(ct, &ch2);
sl@0
  1055
	if (ch1 != ch2) {
sl@0
  1056
	    return (ch1 - ch2);
sl@0
  1057
	}
sl@0
  1058
    }
sl@0
  1059
    return 0;
sl@0
  1060
}
sl@0
  1061

sl@0
  1062
/*
sl@0
  1063
 *----------------------------------------------------------------------
sl@0
  1064
 *
sl@0
  1065
 * Tcl_UtfNcasecmp --
sl@0
  1066
 *
sl@0
  1067
 *	Compare at most n UTF chars of string cs to string ct case
sl@0
  1068
 *	insensitive.  Both cs and ct are assumed to be at least n
sl@0
  1069
 *	UTF chars long.
sl@0
  1070
 *
sl@0
  1071
 * Results:
sl@0
  1072
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0
  1073
 *
sl@0
  1074
 * Side effects:
sl@0
  1075
 *	None.
sl@0
  1076
 *
sl@0
  1077
 *----------------------------------------------------------------------
sl@0
  1078
 */
sl@0
  1079
sl@0
  1080
EXPORT_C int
sl@0
  1081
Tcl_UtfNcasecmp(cs, ct, n)
sl@0
  1082
    CONST char *cs;		/* UTF string to compare to ct. */
sl@0
  1083
    CONST char *ct;		/* UTF string cs is compared to. */
sl@0
  1084
    unsigned long n;			/* Number of UTF chars to compare. */
sl@0
  1085
{
sl@0
  1086
    Tcl_UniChar ch1, ch2;
sl@0
  1087
    while (n-- > 0) {
sl@0
  1088
	/*
sl@0
  1089
	 * n must be interpreted as chars, not bytes.
sl@0
  1090
	 * This should be called only when both strings are of
sl@0
  1091
	 * at least n chars long (no need for \0 check)
sl@0
  1092
	 */
sl@0
  1093
	cs += TclUtfToUniChar(cs, &ch1);
sl@0
  1094
	ct += TclUtfToUniChar(ct, &ch2);
sl@0
  1095
	if (ch1 != ch2) {
sl@0
  1096
	    ch1 = Tcl_UniCharToLower(ch1);
sl@0
  1097
	    ch2 = Tcl_UniCharToLower(ch2);
sl@0
  1098
	    if (ch1 != ch2) {
sl@0
  1099
		return (ch1 - ch2);
sl@0
  1100
	    }
sl@0
  1101
	}
sl@0
  1102
    }
sl@0
  1103
    return 0;
sl@0
  1104
}
sl@0
  1105

sl@0
  1106
/*
sl@0
  1107
 *----------------------------------------------------------------------
sl@0
  1108
 *
sl@0
  1109
 * Tcl_UniCharToUpper --
sl@0
  1110
 *
sl@0
  1111
 *	Compute the uppercase equivalent of the given Unicode character.
sl@0
  1112
 *
sl@0
  1113
 * Results:
sl@0
  1114
 *	Returns the uppercase Unicode character.
sl@0
  1115
 *
sl@0
  1116
 * Side effects:
sl@0
  1117
 *	None.
sl@0
  1118
 *
sl@0
  1119
 *----------------------------------------------------------------------
sl@0
  1120
 */
sl@0
  1121
sl@0
  1122
EXPORT_C Tcl_UniChar
sl@0
  1123
Tcl_UniCharToUpper(ch)
sl@0
  1124
    int ch;			/* Unicode character to convert. */
sl@0
  1125
{
sl@0
  1126
    int info = GetUniCharInfo(ch);
sl@0
  1127
sl@0
  1128
    if (GetCaseType(info) & 0x04) {
sl@0
  1129
	return (Tcl_UniChar) (ch - GetDelta(info));
sl@0
  1130
    } else {
sl@0
  1131
	return ch;
sl@0
  1132
    }
sl@0
  1133
}
sl@0
  1134

sl@0
  1135
/*
sl@0
  1136
 *----------------------------------------------------------------------
sl@0
  1137
 *
sl@0
  1138
 * Tcl_UniCharToLower --
sl@0
  1139
 *
sl@0
  1140
 *	Compute the lowercase equivalent of the given Unicode character.
sl@0
  1141
 *
sl@0
  1142
 * Results:
sl@0
  1143
 *	Returns the lowercase Unicode character.
sl@0
  1144
 *
sl@0
  1145
 * Side effects:
sl@0
  1146
 *	None.
sl@0
  1147
 *
sl@0
  1148
 *----------------------------------------------------------------------
sl@0
  1149
 */
sl@0
  1150
sl@0
  1151
EXPORT_C Tcl_UniChar
sl@0
  1152
Tcl_UniCharToLower(ch)
sl@0
  1153
    int ch;			/* Unicode character to convert. */
sl@0
  1154
{
sl@0
  1155
    int info = GetUniCharInfo(ch);
sl@0
  1156
sl@0
  1157
    if (GetCaseType(info) & 0x02) {
sl@0
  1158
	return (Tcl_UniChar) (ch + GetDelta(info));
sl@0
  1159
    } else {
sl@0
  1160
	return ch;
sl@0
  1161
    }
sl@0
  1162
}
sl@0
  1163

sl@0
  1164
/*
sl@0
  1165
 *----------------------------------------------------------------------
sl@0
  1166
 *
sl@0
  1167
 * Tcl_UniCharToTitle --
sl@0
  1168
 *
sl@0
  1169
 *	Compute the titlecase equivalent of the given Unicode character.
sl@0
  1170
 *
sl@0
  1171
 * Results:
sl@0
  1172
 *	Returns the titlecase Unicode character.
sl@0
  1173
 *
sl@0
  1174
 * Side effects:
sl@0
  1175
 *	None.
sl@0
  1176
 *
sl@0
  1177
 *----------------------------------------------------------------------
sl@0
  1178
 */
sl@0
  1179
sl@0
  1180
EXPORT_C Tcl_UniChar
sl@0
  1181
Tcl_UniCharToTitle(ch)
sl@0
  1182
    int ch;			/* Unicode character to convert. */
sl@0
  1183
{
sl@0
  1184
    int info = GetUniCharInfo(ch);
sl@0
  1185
    int mode = GetCaseType(info);
sl@0
  1186
sl@0
  1187
    if (mode & 0x1) {
sl@0
  1188
	/*
sl@0
  1189
	 * Subtract or add one depending on the original case.
sl@0
  1190
	 */
sl@0
  1191
sl@0
  1192
	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
sl@0
  1193
    } else if (mode == 0x4) {
sl@0
  1194
	return (Tcl_UniChar) (ch - GetDelta(info));
sl@0
  1195
    } else {
sl@0
  1196
	return ch;
sl@0
  1197
    }
sl@0
  1198
}
sl@0
  1199

sl@0
  1200
/*
sl@0
  1201
 *----------------------------------------------------------------------
sl@0
  1202
 *
sl@0
  1203
 * Tcl_UniCharLen --
sl@0
  1204
 *
sl@0
  1205
 *	Find the length of a UniChar string.  The str input must be null
sl@0
  1206
 *	terminated.
sl@0
  1207
 *
sl@0
  1208
 * Results:
sl@0
  1209
 *	Returns the length of str in UniChars (not bytes).
sl@0
  1210
 *
sl@0
  1211
 * Side effects:
sl@0
  1212
 *	None.
sl@0
  1213
 *
sl@0
  1214
 *----------------------------------------------------------------------
sl@0
  1215
 */
sl@0
  1216
sl@0
  1217
EXPORT_C int
sl@0
  1218
Tcl_UniCharLen(str)
sl@0
  1219
    CONST Tcl_UniChar *str;	/* Unicode string to find length of. */
sl@0
  1220
{
sl@0
  1221
    int len = 0;
sl@0
  1222
    
sl@0
  1223
    while (*str != '\0') {
sl@0
  1224
	len++;
sl@0
  1225
	str++;
sl@0
  1226
    }
sl@0
  1227
    return len;
sl@0
  1228
}
sl@0
  1229

sl@0
  1230
/*
sl@0
  1231
 *----------------------------------------------------------------------
sl@0
  1232
 *
sl@0
  1233
 * Tcl_UniCharNcmp --
sl@0
  1234
 *
sl@0
  1235
 *	Compare at most n unichars of string cs to string ct.  Both cs
sl@0
  1236
 *	and ct are assumed to be at least n unichars long.
sl@0
  1237
 *
sl@0
  1238
 * Results:
sl@0
  1239
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0
  1240
 *
sl@0
  1241
 * Side effects:
sl@0
  1242
 *	None.
sl@0
  1243
 *
sl@0
  1244
 *----------------------------------------------------------------------
sl@0
  1245
 */
sl@0
  1246
sl@0
  1247
EXPORT_C int
sl@0
  1248
Tcl_UniCharNcmp(cs, ct, n)
sl@0
  1249
    CONST Tcl_UniChar *cs;		/* Unicode string to compare to ct. */
sl@0
  1250
    CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */
sl@0
  1251
    unsigned long n;			/* Number of unichars to compare. */
sl@0
  1252
{
sl@0
  1253
#ifdef WORDS_BIGENDIAN
sl@0
  1254
    /*
sl@0
  1255
     * We are definitely on a big-endian machine; memcmp() is safe
sl@0
  1256
     */
sl@0
  1257
    return memcmp(cs, ct, n*sizeof(Tcl_UniChar));
sl@0
  1258
sl@0
  1259
#else /* !WORDS_BIGENDIAN */
sl@0
  1260
    /*
sl@0
  1261
     * We can't simply call memcmp() because that is not lexically correct.
sl@0
  1262
     */
sl@0
  1263
    for ( ; n != 0; cs++, ct++, n--) {
sl@0
  1264
	if (*cs != *ct) {
sl@0
  1265
	    return (*cs - *ct);
sl@0
  1266
	}
sl@0
  1267
    }
sl@0
  1268
    return 0;
sl@0
  1269
#endif /* WORDS_BIGENDIAN */
sl@0
  1270
}
sl@0
  1271

sl@0
  1272
/*
sl@0
  1273
 *----------------------------------------------------------------------
sl@0
  1274
 *
sl@0
  1275
 * Tcl_UniCharNcasecmp --
sl@0
  1276
 *
sl@0
  1277
 *	Compare at most n unichars of string cs to string ct case
sl@0
  1278
 *	insensitive.  Both cs and ct are assumed to be at least n
sl@0
  1279
 *	unichars long.
sl@0
  1280
 *
sl@0
  1281
 * Results:
sl@0
  1282
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0
  1283
 *
sl@0
  1284
 * Side effects:
sl@0
  1285
 *	None.
sl@0
  1286
 *
sl@0
  1287
 *----------------------------------------------------------------------
sl@0
  1288
 */
sl@0
  1289
sl@0
  1290
EXPORT_C int
sl@0
  1291
Tcl_UniCharNcasecmp(cs, ct, n)
sl@0
  1292
    CONST Tcl_UniChar *cs;		/* Unicode string to compare to ct. */
sl@0
  1293
    CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */
sl@0
  1294
    unsigned long n;			/* Number of unichars to compare. */
sl@0
  1295
{
sl@0
  1296
    for ( ; n != 0; n--, cs++, ct++) {
sl@0
  1297
	if (*cs != *ct) {
sl@0
  1298
	    Tcl_UniChar lcs = Tcl_UniCharToLower(*cs);
sl@0
  1299
	    Tcl_UniChar lct = Tcl_UniCharToLower(*ct);
sl@0
  1300
	    if (lcs != lct) {
sl@0
  1301
		return (lcs - lct);
sl@0
  1302
	    }
sl@0
  1303
	}
sl@0
  1304
    }
sl@0
  1305
    return 0;
sl@0
  1306
}
sl@0
  1307

sl@0
  1308
/*
sl@0
  1309
 *----------------------------------------------------------------------
sl@0
  1310
 *
sl@0
  1311
 * Tcl_UniCharIsAlnum --
sl@0
  1312
 *
sl@0
  1313
 *	Test if a character is an alphanumeric Unicode character.
sl@0
  1314
 *
sl@0
  1315
 * Results:
sl@0
  1316
 *	Returns 1 if character is alphanumeric.
sl@0
  1317
 *
sl@0
  1318
 * Side effects:
sl@0
  1319
 *	None.
sl@0
  1320
 *
sl@0
  1321
 *----------------------------------------------------------------------
sl@0
  1322
 */
sl@0
  1323
sl@0
  1324
EXPORT_C int
sl@0
  1325
Tcl_UniCharIsAlnum(ch)
sl@0
  1326
    int ch;			/* Unicode character to test. */
sl@0
  1327
{
sl@0
  1328
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0
  1329
sl@0
  1330
    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
sl@0
  1331
}
sl@0
  1332

sl@0
  1333
/*
sl@0
  1334
 *----------------------------------------------------------------------
sl@0
  1335
 *
sl@0
  1336
 * Tcl_UniCharIsAlpha --
sl@0
  1337
 *
sl@0
  1338
 *	Test if a character is an alphabetic Unicode character.
sl@0
  1339
 *
sl@0
  1340
 * Results:
sl@0
  1341
 *	Returns 1 if character is alphabetic.
sl@0
  1342
 *
sl@0
  1343
 * Side effects:
sl@0
  1344
 *	None.
sl@0
  1345
 *
sl@0
  1346
 *----------------------------------------------------------------------
sl@0
  1347
 */
sl@0
  1348
sl@0
  1349
EXPORT_C int
sl@0
  1350
Tcl_UniCharIsAlpha(ch)
sl@0
  1351
    int ch;			/* Unicode character to test. */
sl@0
  1352
{
sl@0
  1353
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0
  1354
    return ((ALPHA_BITS >> category) & 1);
sl@0
  1355
}
sl@0
  1356

sl@0
  1357
/*
sl@0
  1358
 *----------------------------------------------------------------------
sl@0
  1359
 *
sl@0
  1360
 * Tcl_UniCharIsControl --
sl@0
  1361
 *
sl@0
  1362
 *	Test if a character is a Unicode control character.
sl@0
  1363
 *
sl@0
  1364
 * Results:
sl@0
  1365
 *	Returns non-zero if character is a control.
sl@0
  1366
 *
sl@0
  1367
 * Side effects:
sl@0
  1368
 *	None.
sl@0
  1369
 *
sl@0
  1370
 *----------------------------------------------------------------------
sl@0
  1371
 */
sl@0
  1372
sl@0
  1373
EXPORT_C int
sl@0
  1374
Tcl_UniCharIsControl(ch)
sl@0
  1375
    int ch;			/* Unicode character to test. */
sl@0
  1376
{
sl@0
  1377
    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
sl@0
  1378
}
sl@0
  1379

sl@0
  1380
/*
sl@0
  1381
 *----------------------------------------------------------------------
sl@0
  1382
 *
sl@0
  1383
 * Tcl_UniCharIsDigit --
sl@0
  1384
 *
sl@0
  1385
 *	Test if a character is a numeric Unicode character.
sl@0
  1386
 *
sl@0
  1387
 * Results:
sl@0
  1388
 *	Returns non-zero if character is a digit.
sl@0
  1389
 *
sl@0
  1390
 * Side effects:
sl@0
  1391
 *	None.
sl@0
  1392
 *
sl@0
  1393
 *----------------------------------------------------------------------
sl@0
  1394
 */
sl@0
  1395
sl@0
  1396
EXPORT_C int
sl@0
  1397
Tcl_UniCharIsDigit(ch)
sl@0
  1398
    int ch;			/* Unicode character to test. */
sl@0
  1399
{
sl@0
  1400
    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
sl@0
  1401
	    == DECIMAL_DIGIT_NUMBER);
sl@0
  1402
}
sl@0
  1403

sl@0
  1404
/*
sl@0
  1405
 *----------------------------------------------------------------------
sl@0
  1406
 *
sl@0
  1407
 * Tcl_UniCharIsGraph --
sl@0
  1408
 *
sl@0
  1409
 *	Test if a character is any Unicode print character except space.
sl@0
  1410
 *
sl@0
  1411
 * Results:
sl@0
  1412
 *	Returns non-zero if character is printable, but not space.
sl@0
  1413
 *
sl@0
  1414
 * Side effects:
sl@0
  1415
 *	None.
sl@0
  1416
 *
sl@0
  1417
 *----------------------------------------------------------------------
sl@0
  1418
 */
sl@0
  1419
sl@0
  1420
EXPORT_C int
sl@0
  1421
Tcl_UniCharIsGraph(ch)
sl@0
  1422
    int ch;			/* Unicode character to test. */
sl@0
  1423
{
sl@0
  1424
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0
  1425
    return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
sl@0
  1426
}
sl@0
  1427

sl@0
  1428
/*
sl@0
  1429
 *----------------------------------------------------------------------
sl@0
  1430
 *
sl@0
  1431
 * Tcl_UniCharIsLower --
sl@0
  1432
 *
sl@0
  1433
 *	Test if a character is a lowercase Unicode character.
sl@0
  1434
 *
sl@0
  1435
 * Results:
sl@0
  1436
 *	Returns non-zero if character is lowercase.
sl@0
  1437
 *
sl@0
  1438
 * Side effects:
sl@0
  1439
 *	None.
sl@0
  1440
 *
sl@0
  1441
 *----------------------------------------------------------------------
sl@0
  1442
 */
sl@0
  1443
sl@0
  1444
EXPORT_C int
sl@0
  1445
Tcl_UniCharIsLower(ch)
sl@0
  1446
    int ch;			/* Unicode character to test. */
sl@0
  1447
{
sl@0
  1448
    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
sl@0
  1449
}
sl@0
  1450

sl@0
  1451
/*
sl@0
  1452
 *----------------------------------------------------------------------
sl@0
  1453
 *
sl@0
  1454
 * Tcl_UniCharIsPrint --
sl@0
  1455
 *
sl@0
  1456
 *	Test if a character is a Unicode print character.
sl@0
  1457
 *
sl@0
  1458
 * Results:
sl@0
  1459
 *	Returns non-zero if character is printable.
sl@0
  1460
 *
sl@0
  1461
 * Side effects:
sl@0
  1462
 *	None.
sl@0
  1463
 *
sl@0
  1464
 *----------------------------------------------------------------------
sl@0
  1465
 */
sl@0
  1466
sl@0
  1467
EXPORT_C int
sl@0
  1468
Tcl_UniCharIsPrint(ch)
sl@0
  1469
    int ch;			/* Unicode character to test. */
sl@0
  1470
{
sl@0
  1471
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0
  1472
    return ((PRINT_BITS >> category) & 1);
sl@0
  1473
}
sl@0
  1474

sl@0
  1475
/*
sl@0
  1476
 *----------------------------------------------------------------------
sl@0
  1477
 *
sl@0
  1478
 * Tcl_UniCharIsPunct --
sl@0
  1479
 *
sl@0
  1480
 *	Test if a character is a Unicode punctuation character.
sl@0
  1481
 *
sl@0
  1482
 * Results:
sl@0
  1483
 *	Returns non-zero if character is punct.
sl@0
  1484
 *
sl@0
  1485
 * Side effects:
sl@0
  1486
 *	None.
sl@0
  1487
 *
sl@0
  1488
 *----------------------------------------------------------------------
sl@0
  1489
 */
sl@0
  1490
sl@0
  1491
EXPORT_C int
sl@0
  1492
Tcl_UniCharIsPunct(ch)
sl@0
  1493
    int ch;			/* Unicode character to test. */
sl@0
  1494
{
sl@0
  1495
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0
  1496
    return ((PUNCT_BITS >> category) & 1);
sl@0
  1497
}
sl@0
  1498

sl@0
  1499
/*
sl@0
  1500
 *----------------------------------------------------------------------
sl@0
  1501
 *
sl@0
  1502
 * Tcl_UniCharIsSpace --
sl@0
  1503
 *
sl@0
  1504
 *	Test if a character is a whitespace Unicode character.
sl@0
  1505
 *
sl@0
  1506
 * Results:
sl@0
  1507
 *	Returns non-zero if character is a space.
sl@0
  1508
 *
sl@0
  1509
 * Side effects:
sl@0
  1510
 *	None.
sl@0
  1511
 *
sl@0
  1512
 *----------------------------------------------------------------------
sl@0
  1513
 */
sl@0
  1514
sl@0
  1515
EXPORT_C int
sl@0
  1516
Tcl_UniCharIsSpace(ch)
sl@0
  1517
    int ch;			/* Unicode character to test. */
sl@0
  1518
{
sl@0
  1519
    register int category;
sl@0
  1520
sl@0
  1521
    /*
sl@0
  1522
     * If the character is within the first 127 characters, just use the
sl@0
  1523
     * standard C function, otherwise consult the Unicode table.
sl@0
  1524
     */
sl@0
  1525
sl@0
  1526
    if (ch < 0x80) {
sl@0
  1527
	return isspace(UCHAR(ch)); /* INTL: ISO space */
sl@0
  1528
    } else {
sl@0
  1529
	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0
  1530
	return ((SPACE_BITS >> category) & 1);
sl@0
  1531
    }
sl@0
  1532
}
sl@0
  1533

sl@0
  1534
/*
sl@0
  1535
 *----------------------------------------------------------------------
sl@0
  1536
 *
sl@0
  1537
 * Tcl_UniCharIsUpper --
sl@0
  1538
 *
sl@0
  1539
 *	Test if a character is a uppercase Unicode character.
sl@0
  1540
 *
sl@0
  1541
 * Results:
sl@0
  1542
 *	Returns non-zero if character is uppercase.
sl@0
  1543
 *
sl@0
  1544
 * Side effects:
sl@0
  1545
 *	None.
sl@0
  1546
 *
sl@0
  1547
 *----------------------------------------------------------------------
sl@0
  1548
 */
sl@0
  1549
sl@0
  1550
EXPORT_C int
sl@0
  1551
Tcl_UniCharIsUpper(ch)
sl@0
  1552
    int ch;			/* Unicode character to test. */
sl@0
  1553
{
sl@0
  1554
    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
sl@0
  1555
}
sl@0
  1556

sl@0
  1557
/*
sl@0
  1558
 *----------------------------------------------------------------------
sl@0
  1559
 *
sl@0
  1560
 * Tcl_UniCharIsWordChar --
sl@0
  1561
 *
sl@0
  1562
 *	Test if a character is alphanumeric or a connector punctuation
sl@0
  1563
 *	mark.
sl@0
  1564
 *
sl@0
  1565
 * Results:
sl@0
  1566
 *	Returns 1 if character is a word character.
sl@0
  1567
 *
sl@0
  1568
 * Side effects:
sl@0
  1569
 *	None.
sl@0
  1570
 *
sl@0
  1571
 *----------------------------------------------------------------------
sl@0
  1572
 */
sl@0
  1573
sl@0
  1574
EXPORT_C int
sl@0
  1575
Tcl_UniCharIsWordChar(ch)
sl@0
  1576
    int ch;			/* Unicode character to test. */
sl@0
  1577
{
sl@0
  1578
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0
  1579
sl@0
  1580
    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
sl@0
  1581
}
sl@0
  1582

sl@0
  1583
/*
sl@0
  1584
 *----------------------------------------------------------------------
sl@0
  1585
 *
sl@0
  1586
 * Tcl_UniCharCaseMatch --
sl@0
  1587
 *
sl@0
  1588
 *	See if a particular Unicode string matches a particular pattern.
sl@0
  1589
 *	Allows case insensitivity.  This is the Unicode equivalent of
sl@0
  1590
 *	the char* Tcl_StringCaseMatch.  The UniChar strings must be
sl@0
  1591
 *	NULL-terminated.  This has no provision for counted UniChar
sl@0
  1592
 *	strings, thus should not be used where NULLs are expected in the
sl@0
  1593
 *	UniChar string.  Use TclUniCharMatch where possible.
sl@0
  1594
 *
sl@0
  1595
 * Results:
sl@0
  1596
 *	The return value is 1 if string matches pattern, and
sl@0
  1597
 *	0 otherwise.  The matching operation permits the following
sl@0
  1598
 *	special characters in the pattern: *?\[] (see the manual
sl@0
  1599
 *	entry for details on what these mean).
sl@0
  1600
 *
sl@0
  1601
 * Side effects:
sl@0
  1602
 *	None.
sl@0
  1603
 *
sl@0
  1604
 *----------------------------------------------------------------------
sl@0
  1605
 */
sl@0
  1606
sl@0
  1607
EXPORT_C int
sl@0
  1608
Tcl_UniCharCaseMatch(string, pattern, nocase)
sl@0
  1609
    CONST Tcl_UniChar *string;	/* Unicode String. */
sl@0
  1610
    CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special
sl@0
  1611
				 * characters. */
sl@0
  1612
    int nocase;			/* 0 for case sensitive, 1 for insensitive */
sl@0
  1613
{
sl@0
  1614
    Tcl_UniChar ch1, p;
sl@0
  1615
    
sl@0
  1616
    while (1) {
sl@0
  1617
	p = *pattern;
sl@0
  1618
	
sl@0
  1619
	/*
sl@0
  1620
	 * See if we're at the end of both the pattern and the string.  If
sl@0
  1621
	 * so, we succeeded.  If we're at the end of the pattern but not at
sl@0
  1622
	 * the end of the string, we failed.
sl@0
  1623
	 */
sl@0
  1624
	
sl@0
  1625
	if (p == 0) {
sl@0
  1626
	    return (*string == 0);
sl@0
  1627
	}
sl@0
  1628
	if ((*string == 0) && (p != '*')) {
sl@0
  1629
	    return 0;
sl@0
  1630
	}
sl@0
  1631
sl@0
  1632
	/*
sl@0
  1633
	 * Check for a "*" as the next pattern character.  It matches any
sl@0
  1634
	 * substring.  We handle this by skipping all the characters up to the
sl@0
  1635
	 * next matching one in the pattern, and then calling ourselves
sl@0
  1636
	 * recursively for each postfix of string, until either we match or we
sl@0
  1637
	 * reach the end of the string.
sl@0
  1638
	 */
sl@0
  1639
	
sl@0
  1640
	if (p == '*') {
sl@0
  1641
	    /*
sl@0
  1642
	     * Skip all successive *'s in the pattern
sl@0
  1643
	     */
sl@0
  1644
	    while (*(++pattern) == '*') {}
sl@0
  1645
	    p = *pattern;
sl@0
  1646
	    if (p == 0) {
sl@0
  1647
		return 1;
sl@0
  1648
	    }
sl@0
  1649
	    if (nocase) {
sl@0
  1650
		p = Tcl_UniCharToLower(p);
sl@0
  1651
	    }
sl@0
  1652
	    while (1) {
sl@0
  1653
		/*
sl@0
  1654
		 * Optimization for matching - cruise through the string
sl@0
  1655
		 * quickly if the next char in the pattern isn't a special
sl@0
  1656
		 * character
sl@0
  1657
		 */
sl@0
  1658
		if ((p != '[') && (p != '?') && (p != '\\')) {
sl@0
  1659
		    if (nocase) {
sl@0
  1660
			while (*string && (p != *string)
sl@0
  1661
				&& (p != Tcl_UniCharToLower(*string))) {
sl@0
  1662
			    string++;
sl@0
  1663
			}
sl@0
  1664
		    } else {
sl@0
  1665
			while (*string && (p != *string)) { string++; }
sl@0
  1666
		    }
sl@0
  1667
		}
sl@0
  1668
		if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {
sl@0
  1669
		    return 1;
sl@0
  1670
		}
sl@0
  1671
		if (*string == 0) {
sl@0
  1672
		    return 0;
sl@0
  1673
		}
sl@0
  1674
		string++;
sl@0
  1675
	    }
sl@0
  1676
	}
sl@0
  1677
sl@0
  1678
	/*
sl@0
  1679
	 * Check for a "?" as the next pattern character.  It matches
sl@0
  1680
	 * any single character.
sl@0
  1681
	 */
sl@0
  1682
sl@0
  1683
	if (p == '?') {
sl@0
  1684
	    pattern++;
sl@0
  1685
	    string++;
sl@0
  1686
	    continue;
sl@0
  1687
	}
sl@0
  1688
sl@0
  1689
	/*
sl@0
  1690
	 * Check for a "[" as the next pattern character.  It is followed
sl@0
  1691
	 * by a list of characters that are acceptable, or by a range
sl@0
  1692
	 * (two characters separated by "-").
sl@0
  1693
	 */
sl@0
  1694
	
sl@0
  1695
	if (p == '[') {
sl@0
  1696
	    Tcl_UniChar startChar, endChar;
sl@0
  1697
sl@0
  1698
	    pattern++;
sl@0
  1699
	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
sl@0
  1700
	    string++;
sl@0
  1701
	    while (1) {
sl@0
  1702
		if ((*pattern == ']') || (*pattern == 0)) {
sl@0
  1703
		    return 0;
sl@0
  1704
		}
sl@0
  1705
		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
sl@0
  1706
		pattern++;
sl@0
  1707
		if (*pattern == '-') {
sl@0
  1708
		    pattern++;
sl@0
  1709
		    if (*pattern == 0) {
sl@0
  1710
			return 0;
sl@0
  1711
		    }
sl@0
  1712
		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
sl@0
  1713
			    : *pattern);
sl@0
  1714
		    pattern++;
sl@0
  1715
		    if (((startChar <= ch1) && (ch1 <= endChar))
sl@0
  1716
			    || ((endChar <= ch1) && (ch1 <= startChar))) {
sl@0
  1717
			/*
sl@0
  1718
			 * Matches ranges of form [a-z] or [z-a].
sl@0
  1719
			 */
sl@0
  1720
			break;
sl@0
  1721
		    }
sl@0
  1722
		} else if (startChar == ch1) {
sl@0
  1723
		    break;
sl@0
  1724
		}
sl@0
  1725
	    }
sl@0
  1726
	    while (*pattern != ']') {
sl@0
  1727
		if (*pattern == 0) {
sl@0
  1728
		    pattern--;
sl@0
  1729
		    break;
sl@0
  1730
		}
sl@0
  1731
		pattern++;
sl@0
  1732
	    }
sl@0
  1733
	    pattern++;
sl@0
  1734
	    continue;
sl@0
  1735
	}
sl@0
  1736
sl@0
  1737
	/*
sl@0
  1738
	 * If the next pattern character is '\', just strip off the '\'
sl@0
  1739
	 * so we do exact matching on the character that follows.
sl@0
  1740
	 */
sl@0
  1741
sl@0
  1742
	if (p == '\\') {
sl@0
  1743
	    if (*(++pattern) == '\0') {
sl@0
  1744
		return 0;
sl@0
  1745
	    }
sl@0
  1746
	}
sl@0
  1747
sl@0
  1748
	/*
sl@0
  1749
	 * There's no special character.  Just make sure that the next
sl@0
  1750
	 * bytes of each string match.
sl@0
  1751
	 */
sl@0
  1752
sl@0
  1753
	if (nocase) {
sl@0
  1754
	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
sl@0
  1755
		return 0;
sl@0
  1756
	    }
sl@0
  1757
	} else if (*string != *pattern) {
sl@0
  1758
	    return 0;
sl@0
  1759
	}
sl@0
  1760
	string++;
sl@0
  1761
	pattern++;
sl@0
  1762
    }
sl@0
  1763
}
sl@0
  1764

sl@0
  1765
/*
sl@0
  1766
 *----------------------------------------------------------------------
sl@0
  1767
 *
sl@0
  1768
 * TclUniCharMatch --
sl@0
  1769
 *
sl@0
  1770
 *	See if a particular Unicode string matches a particular pattern.
sl@0
  1771
 *	Allows case insensitivity.  This is the Unicode equivalent of the
sl@0
  1772
 *	char* Tcl_StringCaseMatch.  This variant of Tcl_UniCharCaseMatch
sl@0
  1773
 *	uses counted Strings, so embedded NULLs are allowed.
sl@0
  1774
 *
sl@0
  1775
 * Results:
sl@0
  1776
 *	The return value is 1 if string matches pattern, and
sl@0
  1777
 *	0 otherwise.  The matching operation permits the following
sl@0
  1778
 *	special characters in the pattern: *?\[] (see the manual
sl@0
  1779
 *	entry for details on what these mean).
sl@0
  1780
 *
sl@0
  1781
 * Side effects:
sl@0
  1782
 *	None.
sl@0
  1783
 *
sl@0
  1784
 *----------------------------------------------------------------------
sl@0
  1785
 */
sl@0
  1786
sl@0
  1787
int
sl@0
  1788
TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
sl@0
  1789
    CONST Tcl_UniChar *string;	/* Unicode String. */
sl@0
  1790
    int strLen;			/* length of String */
sl@0
  1791
    CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special
sl@0
  1792
				 * characters. */
sl@0
  1793
    int ptnLen;			/* length of Pattern */
sl@0
  1794
    int nocase;			/* 0 for case sensitive, 1 for insensitive */
sl@0
  1795
{
sl@0
  1796
    CONST Tcl_UniChar *stringEnd, *patternEnd;
sl@0
  1797
    Tcl_UniChar p;
sl@0
  1798
sl@0
  1799
    stringEnd  = string + strLen;
sl@0
  1800
    patternEnd = pattern + ptnLen;
sl@0
  1801
sl@0
  1802
    while (1) {
sl@0
  1803
	/*
sl@0
  1804
	 * See if we're at the end of both the pattern and the string.  If
sl@0
  1805
	 * so, we succeeded.  If we're at the end of the pattern but not at
sl@0
  1806
	 * the end of the string, we failed.
sl@0
  1807
	 */
sl@0
  1808
sl@0
  1809
	if (pattern == patternEnd) {
sl@0
  1810
	    return (string == stringEnd);
sl@0
  1811
	}
sl@0
  1812
	p = *pattern;
sl@0
  1813
	if ((string == stringEnd) && (p != '*')) {
sl@0
  1814
	    return 0;
sl@0
  1815
	}
sl@0
  1816
sl@0
  1817
	/*
sl@0
  1818
	 * Check for a "*" as the next pattern character.  It matches any
sl@0
  1819
	 * substring.  We handle this by skipping all the characters up to the
sl@0
  1820
	 * next matching one in the pattern, and then calling ourselves
sl@0
  1821
	 * recursively for each postfix of string, until either we match or we
sl@0
  1822
	 * reach the end of the string.
sl@0
  1823
	 */
sl@0
  1824
	
sl@0
  1825
	if (p == '*') {
sl@0
  1826
	    /*
sl@0
  1827
	     * Skip all successive *'s in the pattern
sl@0
  1828
	     */
sl@0
  1829
	    while (*(++pattern) == '*') {}
sl@0
  1830
	    if (pattern == patternEnd) {
sl@0
  1831
		return 1;
sl@0
  1832
	    }
sl@0
  1833
	    p = *pattern;
sl@0
  1834
	    if (nocase) {
sl@0
  1835
		p = Tcl_UniCharToLower(p);
sl@0
  1836
	    }
sl@0
  1837
	    while (1) {
sl@0
  1838
		/*
sl@0
  1839
		 * Optimization for matching - cruise through the string
sl@0
  1840
		 * quickly if the next char in the pattern isn't a special
sl@0
  1841
		 * character
sl@0
  1842
		 */
sl@0
  1843
		if ((p != '[') && (p != '?') && (p != '\\')) {
sl@0
  1844
		    if (nocase) {
sl@0
  1845
			while ((string < stringEnd) && (p != *string)
sl@0
  1846
				&& (p != Tcl_UniCharToLower(*string))) {
sl@0
  1847
			    string++;
sl@0
  1848
			}
sl@0
  1849
		    } else {
sl@0
  1850
			while ((string < stringEnd) && (p != *string)) {
sl@0
  1851
			    string++;
sl@0
  1852
			}
sl@0
  1853
		    }
sl@0
  1854
		}
sl@0
  1855
		if (TclUniCharMatch(string, stringEnd - string,
sl@0
  1856
			pattern, patternEnd - pattern, nocase)) {
sl@0
  1857
		    return 1;
sl@0
  1858
		}
sl@0
  1859
		if (string == stringEnd) {
sl@0
  1860
		    return 0;
sl@0
  1861
		}
sl@0
  1862
		string++;
sl@0
  1863
	    }
sl@0
  1864
	}
sl@0
  1865
sl@0
  1866
	/*
sl@0
  1867
	 * Check for a "?" as the next pattern character.  It matches
sl@0
  1868
	 * any single character.
sl@0
  1869
	 */
sl@0
  1870
sl@0
  1871
	if (p == '?') {
sl@0
  1872
	    pattern++;
sl@0
  1873
	    string++;
sl@0
  1874
	    continue;
sl@0
  1875
	}
sl@0
  1876
sl@0
  1877
	/*
sl@0
  1878
	 * Check for a "[" as the next pattern character.  It is followed
sl@0
  1879
	 * by a list of characters that are acceptable, or by a range
sl@0
  1880
	 * (two characters separated by "-").
sl@0
  1881
	 */
sl@0
  1882
	
sl@0
  1883
	if (p == '[') {
sl@0
  1884
	    Tcl_UniChar ch1, startChar, endChar;
sl@0
  1885
sl@0
  1886
	    pattern++;
sl@0
  1887
	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
sl@0
  1888
	    string++;
sl@0
  1889
	    while (1) {
sl@0
  1890
		if ((*pattern == ']') || (pattern == patternEnd)) {
sl@0
  1891
		    return 0;
sl@0
  1892
		}
sl@0
  1893
		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
sl@0
  1894
		pattern++;
sl@0
  1895
		if (*pattern == '-') {
sl@0
  1896
		    pattern++;
sl@0
  1897
		    if (pattern == patternEnd) {
sl@0
  1898
			return 0;
sl@0
  1899
		    }
sl@0
  1900
		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
sl@0
  1901
			    : *pattern);
sl@0
  1902
		    pattern++;
sl@0
  1903
		    if (((startChar <= ch1) && (ch1 <= endChar))
sl@0
  1904
			    || ((endChar <= ch1) && (ch1 <= startChar))) {
sl@0
  1905
			/*
sl@0
  1906
			 * Matches ranges of form [a-z] or [z-a].
sl@0
  1907
			 */
sl@0
  1908
			break;
sl@0
  1909
		    }
sl@0
  1910
		} else if (startChar == ch1) {
sl@0
  1911
		    break;
sl@0
  1912
		}
sl@0
  1913
	    }
sl@0
  1914
	    while (*pattern != ']') {
sl@0
  1915
		if (pattern == patternEnd) {
sl@0
  1916
		    pattern--;
sl@0
  1917
		    break;
sl@0
  1918
		}
sl@0
  1919
		pattern++;
sl@0
  1920
	    }
sl@0
  1921
	    pattern++;
sl@0
  1922
	    continue;
sl@0
  1923
	}
sl@0
  1924
sl@0
  1925
	/*
sl@0
  1926
	 * If the next pattern character is '\', just strip off the '\'
sl@0
  1927
	 * so we do exact matching on the character that follows.
sl@0
  1928
	 */
sl@0
  1929
sl@0
  1930
	if (p == '\\') {
sl@0
  1931
	    if (++pattern == patternEnd) {
sl@0
  1932
		return 0;
sl@0
  1933
	    }
sl@0
  1934
	}
sl@0
  1935
sl@0
  1936
	/*
sl@0
  1937
	 * There's no special character.  Just make sure that the next
sl@0
  1938
	 * bytes of each string match.
sl@0
  1939
	 */
sl@0
  1940
sl@0
  1941
	if (nocase) {
sl@0
  1942
	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
sl@0
  1943
		return 0;
sl@0
  1944
	    }
sl@0
  1945
	} else if (*string != *pattern) {
sl@0
  1946
	    return 0;
sl@0
  1947
	}
sl@0
  1948
	string++;
sl@0
  1949
	pattern++;
sl@0
  1950
    }
sl@0
  1951
}