os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/generic/tclUtf.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2  * tclUtf.c --
     3  *
     4  *	Routines for manipulating UTF-8 strings.
     5  *
     6  * Copyright (c) 1997-1998 Sun Microsystems, Inc.
     7  * Portions Copyright (c) 2007-2008 Nokia Corporation and/or its subsidiaries. All rights reserved.  
     8  *
     9  * See the file "license.terms" for information on usage and redistribution
    10  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
    11  *
    12  * RCS: @(#) $Id: tclUtf.c,v 1.30.2.3 2005/09/07 14:35:56 dgp Exp $
    13  */
    14 
    15 #include "tclInt.h"
    16 
    17 /*
    18  * Include the static character classification tables and macros.
    19  */
    20 
    21 #include "tclUniData.c"
    22 
    23 /*
    24  * The following macros are used for fast character category tests.  The
    25  * x_BITS values are shifted right by the category value to determine whether
    26  * the given category is included in the set.
    27  */ 
    28 
    29 #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
    30     | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
    31 
    32 #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
    33 
    34 #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
    35     | (1 << PARAGRAPH_SEPARATOR))
    36 
    37 #define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
    38 
    39 #define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
    40 	    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
    41 	    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
    42 	    (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
    43 	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
    44 	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
    45 	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
    46 	    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
    47 	    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
    48 
    49 #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
    50 	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
    51 	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
    52 	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
    53 
    54 /*
    55  * Unicode characters less than this value are represented by themselves 
    56  * in UTF-8 strings. 
    57  */
    58 
    59 #define UNICODE_SELF	0x80
    60 
    61 /*
    62  * The following structures are used when mapping between Unicode (UCS-2)
    63  * and UTF-8.
    64  */
    65 
    66 static CONST unsigned char totalBytes[256] = {
    67     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    68     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    69     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    70     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    71     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    72     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    73     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    74     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    75 #if TCL_UTF_MAX > 3
    76     4,4,4,4,4,4,4,4,
    77 #else
    78     1,1,1,1,1,1,1,1,
    79 #endif
    80 #if TCL_UTF_MAX > 4
    81     5,5,5,5,
    82 #else
    83     1,1,1,1,
    84 #endif
    85 #if TCL_UTF_MAX > 5
    86     6,6,6,6
    87 #else
    88     1,1,1,1
    89 #endif
    90 };
    91 
    92 /*
    93  * Procedures used only in this module.
    94  */
    95 
    96 static int UtfCount _ANSI_ARGS_((int ch));
    97 
    98 
    99 /*
   100  *---------------------------------------------------------------------------
   101  *
   102  * UtfCount --
   103  *
   104  *	Find the number of bytes in the Utf character "ch".
   105  *
   106  * Results:
   107  *	The return values is the number of bytes in the Utf character "ch".
   108  *
   109  * Side effects:
   110  *	None.
   111  *
   112  *---------------------------------------------------------------------------
   113  */
   114  
   115 INLINE static int
   116 UtfCount(ch)
   117     int ch;			/* The Tcl_UniChar whose size is returned. */
   118 {
   119     if ((ch > 0) && (ch < UNICODE_SELF)) {
   120 	return 1;
   121     }
   122     if (ch <= 0x7FF) {
   123 	return 2;
   124     }
   125     if (ch <= 0xFFFF) {
   126 	return 3;
   127     }
   128 #if TCL_UTF_MAX > 3
   129     if (ch <= 0x1FFFFF) {
   130 	return 4;
   131     }
   132     if (ch <= 0x3FFFFFF) {
   133 	return 5;
   134     }
   135     if (ch <= 0x7FFFFFFF) {
   136 	return 6;
   137     }
   138 #endif
   139     return 3;
   140 }
   141 
   142 /*
   143  *---------------------------------------------------------------------------
   144  *
   145  * Tcl_UniCharToUtf --
   146  *
   147  *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
   148  *	provided buffer.  Equivalent to Plan 9 runetochar().
   149  *
   150  * Results:
   151  *	The return values is the number of bytes in the buffer that
   152  *	were consumed.  
   153  *
   154  * Side effects:
   155  *	None.
   156  *
   157  *---------------------------------------------------------------------------
   158  */
   159  
   160 EXPORT_C INLINE int
   161 Tcl_UniCharToUtf(ch, str)
   162     int ch;			/* The Tcl_UniChar to be stored in the
   163 				 * buffer. */
   164     char *str;			/* Buffer in which the UTF-8 representation
   165 				 * of the Tcl_UniChar is stored.  Buffer must
   166 				 * be large enough to hold the UTF-8 character
   167 				 * (at most TCL_UTF_MAX bytes). */
   168 {
   169     if ((ch > 0) && (ch < UNICODE_SELF)) {
   170 	str[0] = (char) ch;
   171 	return 1;
   172     }
   173     if (ch >= 0) {
   174 	if (ch <= 0x7FF) {
   175 	    str[1] = (char) ((ch | 0x80) & 0xBF);
   176 	    str[0] = (char) ((ch >> 6) | 0xC0);
   177 	    return 2;
   178 	}
   179 	if (ch <= 0xFFFF) {
   180 	three:
   181 	    str[2] = (char) ((ch | 0x80) & 0xBF);
   182 	    str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
   183 	    str[0] = (char) ((ch >> 12) | 0xE0);
   184 	    return 3;
   185 	}
   186 
   187 #if TCL_UTF_MAX > 3
   188 	if (ch <= 0x1FFFFF) {
   189 	    str[3] = (char) ((ch | 0x80) & 0xBF);
   190 	    str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
   191 	    str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
   192 	    str[0] = (char) ((ch >> 18) | 0xF0);
   193 	    return 4;
   194 	}
   195 	if (ch <= 0x3FFFFFF) {
   196 	    str[4] = (char) ((ch | 0x80) & 0xBF);
   197 	    str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
   198 	    str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
   199 	    str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
   200 	    str[0] = (char) ((ch >> 24) | 0xF8);
   201 	    return 5;
   202 	}
   203 	if (ch <= 0x7FFFFFFF) {
   204 	    str[5] = (char) ((ch | 0x80) & 0xBF);
   205 	    str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
   206 	    str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
   207 	    str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
   208 	    str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
   209 	    str[0] = (char) ((ch >> 30) | 0xFC);
   210 	    return 6;
   211 	}
   212 #endif
   213     }
   214 
   215     ch = 0xFFFD;
   216     goto three;
   217 }
   218 
   219 /*
   220  *---------------------------------------------------------------------------
   221  *
   222  * Tcl_UniCharToUtfDString --
   223  *
   224  *	Convert the given Unicode string to UTF-8.
   225  *
   226  * Results:
   227  *	The return value is a pointer to the UTF-8 representation of the
   228  *	Unicode string.  Storage for the return value is appended to the
   229  *	end of dsPtr.
   230  *
   231  * Side effects:
   232  *	None.
   233  *
   234  *---------------------------------------------------------------------------
   235  */
   236  
   237 EXPORT_C char *
   238 Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
   239     CONST Tcl_UniChar *wString;	/* Unicode string to convert to UTF-8. */
   240     int numChars;		/* Length of Unicode string in Tcl_UniChars
   241 				 * (must be >= 0). */
   242     Tcl_DString *dsPtr;		/* UTF-8 representation of string is
   243 				 * appended to this previously initialized
   244 				 * DString. */
   245 {
   246     CONST Tcl_UniChar *w, *wEnd;
   247     char *p, *string;
   248     int oldLength;
   249 
   250     /*
   251      * UTF-8 string length in bytes will be <= Unicode string length *
   252      * TCL_UTF_MAX.
   253      */
   254 
   255     oldLength = Tcl_DStringLength(dsPtr);
   256     Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
   257     string = Tcl_DStringValue(dsPtr) + oldLength;
   258 
   259     p = string;
   260     wEnd = wString + numChars;
   261     for (w = wString; w < wEnd; ) {
   262 	p += Tcl_UniCharToUtf(*w, p);
   263 	w++;
   264     }
   265     Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
   266 
   267     return string;
   268 }
   269 
   270 /*
   271  *---------------------------------------------------------------------------
   272  *
   273  * Tcl_UtfToUniChar --
   274  *
   275  *	Extract the Tcl_UniChar represented by the UTF-8 string.  Bad
   276  *	UTF-8 sequences are converted to valid Tcl_UniChars and processing
   277  *	continues.  Equivalent to Plan 9 chartorune().
   278  *
   279  *	The caller must ensure that the source buffer is long enough that
   280  *	this routine does not run off the end and dereference non-existent
   281  *	memory looking for trail bytes.  If the source buffer is known to
   282  *	be '\0' terminated, this cannot happen.  Otherwise, the caller
   283  *	should call Tcl_UtfCharComplete() before calling this routine to
   284  *	ensure that enough bytes remain in the string.
   285  *
   286  * Results:
   287  *	*chPtr is filled with the Tcl_UniChar, and the return value is the
   288  *	number of bytes from the UTF-8 string that were consumed.
   289  *
   290  * Side effects:
   291  *	None.
   292  *
   293  *---------------------------------------------------------------------------
   294  */
   295  
   296 EXPORT_C int
   297 Tcl_UtfToUniChar(str, chPtr)
   298     register CONST char *str;	 /* The UTF-8 string. */
   299     register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
   300 				  * by the UTF-8 string. */
   301 {
   302     register int byte;
   303     
   304     /*
   305      * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
   306      */
   307 
   308     byte = *((unsigned char *) str);
   309     if (byte < 0xC0) {
   310 	/*
   311 	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
   312 	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
   313 	 * characters representing themselves.
   314 	 */
   315 
   316 	*chPtr = (Tcl_UniChar) byte;
   317 	return 1;
   318     } else if (byte < 0xE0) {
   319 	if ((str[1] & 0xC0) == 0x80) {
   320 	    /*
   321 	     * Two-byte-character lead-byte followed by a trail-byte.
   322 	     */
   323 
   324 	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
   325 	    return 2;
   326 	}
   327 	/*
   328 	 * A two-byte-character lead-byte not followed by trail-byte
   329 	 * represents itself.
   330 	 */
   331 
   332 	*chPtr = (Tcl_UniChar) byte;
   333 	return 1;
   334     } else if (byte < 0xF0) {
   335 	if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
   336 	    /*
   337 	     * Three-byte-character lead byte followed by two trail bytes.
   338 	     */
   339 
   340 	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) 
   341 		    | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
   342 	    return 3;
   343 	}
   344 	/*
   345 	 * A three-byte-character lead-byte not followed by two trail-bytes
   346 	 * represents itself.
   347 	 */
   348 
   349 	*chPtr = (Tcl_UniChar) byte;
   350 	return 1;
   351     }
   352 #if TCL_UTF_MAX > 3
   353     else {
   354 	int ch, total, trail;
   355 
   356 	total = totalBytes[byte];
   357 	trail = total - 1;
   358 	if (trail > 0) {
   359 	    ch = byte & (0x3F >> trail);
   360 	    do {
   361 		str++;
   362 		if ((*str & 0xC0) != 0x80) {
   363 		    *chPtr = byte;
   364 		    return 1;
   365 		}
   366 		ch <<= 6;
   367 		ch |= (*str & 0x3F);
   368 		trail--;
   369 	    } while (trail > 0);
   370 	    *chPtr = ch;
   371 	    return total;
   372 	}
   373     }
   374 #endif
   375 
   376     *chPtr = (Tcl_UniChar) byte;
   377     return 1;
   378 }
   379 
   380 /*
   381  *---------------------------------------------------------------------------
   382  *
   383  * Tcl_UtfToUniCharDString --
   384  *
   385  *	Convert the UTF-8 string to Unicode.
   386  *
   387  * Results:
   388  *	The return value is a pointer to the Unicode representation of the
   389  *	UTF-8 string.  Storage for the return value is appended to the
   390  *	end of dsPtr.  The Unicode string is terminated with a Unicode
   391  *	NULL character.
   392  *
   393  * Side effects:
   394  *	None.
   395  *
   396  *---------------------------------------------------------------------------
   397  */
   398 
   399 EXPORT_C Tcl_UniChar *
   400 Tcl_UtfToUniCharDString(string, length, dsPtr)
   401     CONST char *string;		/* UTF-8 string to convert to Unicode. */
   402     int length;			/* Length of UTF-8 string in bytes, or -1
   403 				 * for strlen(). */
   404     Tcl_DString *dsPtr;		/* Unicode representation of string is
   405 				 * appended to this previously initialized
   406 				 * DString. */
   407 {
   408     Tcl_UniChar *w, *wString;
   409     CONST char *p, *end;
   410     int oldLength;
   411 
   412     if (length < 0) {
   413 	length = strlen(string);
   414     }
   415 
   416     /*
   417      * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
   418      * in bytes.
   419      */
   420 
   421     oldLength = Tcl_DStringLength(dsPtr);
   422     Tcl_DStringSetLength(dsPtr,
   423 	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
   424     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
   425 
   426     w = wString;
   427     end = string + length;
   428     for (p = string; p < end; ) {
   429 	p += TclUtfToUniChar(p, w);
   430 	w++;
   431     }
   432     *w = '\0';
   433     Tcl_DStringSetLength(dsPtr,
   434 	    (oldLength + ((char *) w - (char *) wString)));
   435 
   436     return wString;
   437 }
   438 
   439 /*
   440  *---------------------------------------------------------------------------
   441  *
   442  * Tcl_UtfCharComplete --
   443  *
   444  *	Determine if the UTF-8 string of the given length is long enough
   445  *	to be decoded by Tcl_UtfToUniChar().  This does not ensure that the
   446  *	UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune().
   447  *
   448  * Results:
   449  *	The return value is 0 if the string is not long enough, non-zero
   450  *	otherwise.
   451  *
   452  * Side effects:
   453  *	None.
   454  *
   455  *---------------------------------------------------------------------------
   456  */
   457 
   458 EXPORT_C int
   459 Tcl_UtfCharComplete(str, len)
   460     CONST char *str;		/* String to check if first few bytes
   461 				 * contain a complete UTF-8 character. */
   462     int len;			/* Length of above string in bytes. */
   463 {
   464     int ch;
   465 
   466     ch = *((unsigned char *) str);
   467     return len >= totalBytes[ch];
   468 }
   469 
   470 /*
   471  *---------------------------------------------------------------------------
   472  *
   473  * Tcl_NumUtfChars --
   474  *
   475  *	Returns the number of characters (not bytes) in the UTF-8 string,
   476  *	not including the terminating NULL byte.  This is equivalent to
   477  *	Plan 9 utflen() and utfnlen().
   478  *
   479  * Results:
   480  *	As above.  
   481  *
   482  * Side effects:
   483  *	None.
   484  *
   485  *---------------------------------------------------------------------------
   486  */
   487  
   488 EXPORT_C int 
   489 Tcl_NumUtfChars(str, len)
   490     register CONST char *str;	/* The UTF-8 string to measure. */
   491     int len;			/* The length of the string in bytes, or -1
   492 				 * for strlen(string). */
   493 {
   494     Tcl_UniChar ch;
   495     register Tcl_UniChar *chPtr = &ch;
   496     register int i;
   497 
   498     /*
   499      * The separate implementations are faster.
   500      *
   501      * Since this is a time-sensitive function, we also do the check for
   502      * the single-byte char case specially.
   503      */
   504 
   505     i = 0;
   506     if (len < 0) {
   507 	while (*str != '\0') {
   508 	    str += TclUtfToUniChar(str, chPtr);
   509 	    i++;
   510 	}
   511     } else {
   512 	register int n;
   513 
   514 	while (len > 0) {
   515 	    if (UCHAR(*str) < 0xC0) {
   516 		len--;
   517 		str++;
   518 	    } else {
   519 		n = Tcl_UtfToUniChar(str, chPtr);
   520 		len -= n;
   521 		str += n;
   522 	    }
   523 	    i++;
   524 	}
   525     }
   526     return i;
   527 }
   528 
   529 /*
   530  *---------------------------------------------------------------------------
   531  *
   532  * Tcl_UtfFindFirst --
   533  *
   534  *	Returns a pointer to the first occurance of the given Tcl_UniChar
   535  *	in the NULL-terminated UTF-8 string.  The NULL terminator is
   536  *	considered part of the UTF-8 string.  Equivalent to Plan 9
   537  *	utfrune().
   538  *
   539  * Results:
   540  *	As above.  If the Tcl_UniChar does not exist in the given string,
   541  *	the return value is NULL.
   542  *
   543  * Side effects:
   544  *	None.
   545  *
   546  *---------------------------------------------------------------------------
   547  */
   548 EXPORT_C CONST char *
   549 Tcl_UtfFindFirst(string, ch)
   550     CONST char *string;		/* The UTF-8 string to be searched. */
   551     int ch;			/* The Tcl_UniChar to search for. */
   552 {
   553     int len;
   554     Tcl_UniChar find;
   555     
   556     while (1) {
   557 	len = TclUtfToUniChar(string, &find);
   558 	if (find == ch) {
   559 	    return string;
   560 	}
   561 	if (*string == '\0') {
   562 	    return NULL;
   563 	}
   564 	string += len;
   565     }
   566 }
   567 
   568 /*
   569  *---------------------------------------------------------------------------
   570  *
   571  * Tcl_UtfFindLast --
   572  *
   573  *	Returns a pointer to the last occurance of the given Tcl_UniChar
   574  *	in the NULL-terminated UTF-8 string.  The NULL terminator is
   575  *	considered part of the UTF-8 string.  Equivalent to Plan 9
   576  *	utfrrune().
   577  *
   578  * Results:
   579  *	As above.  If the Tcl_UniChar does not exist in the given string,
   580  *	the return value is NULL.
   581  *
   582  * Side effects:
   583  *	None.
   584  *
   585  *---------------------------------------------------------------------------
   586  */
   587 
   588 EXPORT_C CONST char *
   589 Tcl_UtfFindLast(string, ch)
   590     CONST char *string;		/* The UTF-8 string to be searched. */
   591     int ch;			/* The Tcl_UniChar to search for. */
   592 {
   593     int len;
   594     Tcl_UniChar find;
   595     CONST char *last;
   596 	
   597     last = NULL;
   598     while (1) {
   599 	len = TclUtfToUniChar(string, &find);
   600 	if (find == ch) {
   601 	    last = string;
   602 	}
   603 	if (*string == '\0') {
   604 	    break;
   605 	}
   606 	string += len;
   607     }
   608     return last;
   609 }
   610 
   611 /*
   612  *---------------------------------------------------------------------------
   613  *
   614  * Tcl_UtfNext --
   615  *
   616  *	Given a pointer to some current location in a UTF-8 string,
   617  *	move forward one character.  The caller must ensure that they
   618  *	are not asking for the next character after the last character
   619  *	in the string.
   620  *
   621  * Results:
   622  *	The return value is the pointer to the next character in
   623  *	the UTF-8 string.
   624  *
   625  * Side effects:
   626  *	None.
   627  *
   628  *---------------------------------------------------------------------------
   629  */
   630  
   631 EXPORT_C CONST char *
   632 Tcl_UtfNext(str) 
   633     CONST char *str;		    /* The current location in the string. */
   634 {
   635     Tcl_UniChar ch;
   636 
   637     return str + TclUtfToUniChar(str, &ch);
   638 }
   639 
   640 /*
   641  *---------------------------------------------------------------------------
   642  *
   643  * Tcl_UtfPrev --
   644  *
   645  *	Given a pointer to some current location in a UTF-8 string,
   646  *	move backwards one character.  This works correctly when the
   647  *	pointer is in the middle of a UTF-8 character.
   648  *
   649  * Results:
   650  *	The return value is a pointer to the previous character in the
   651  *	UTF-8 string.  If the current location was already at the
   652  *	beginning of the string, the return value will also be a
   653  *	pointer to the beginning of the string.
   654  *
   655  * Side effects:
   656  *	None.
   657  *
   658  *---------------------------------------------------------------------------
   659  */
   660 
   661 EXPORT_C CONST char *
   662 Tcl_UtfPrev(str, start)
   663     CONST char *str;		    /* The current location in the string. */
   664     CONST char *start;		    /* Pointer to the beginning of the
   665 				     * string, to avoid going backwards too
   666 				     * far. */
   667 {
   668     CONST char *look;
   669     int i, byte;
   670     
   671     str--;
   672     look = str;
   673     for (i = 0; i < TCL_UTF_MAX; i++) {
   674 	if (look < start) {
   675 	    if (str < start) {
   676 		str = start;
   677 	    }
   678 	    break;
   679 	}
   680 	byte = *((unsigned char *) look);
   681 	if (byte < 0x80) {
   682 	    break;
   683 	}
   684 	if (byte >= 0xC0) {
   685 	    return look;
   686 	}
   687 	look--;
   688     }
   689     return str;
   690 }
   691 	
   692 /*
   693  *---------------------------------------------------------------------------
   694  *
   695  * Tcl_UniCharAtIndex --
   696  *
   697  *	Returns the Unicode character represented at the specified
   698  *	character (not byte) position in the UTF-8 string.
   699  *
   700  * Results:
   701  *	As above.
   702  *
   703  * Side effects:
   704  *	None.
   705  *
   706  *---------------------------------------------------------------------------
   707  */
   708  
   709 EXPORT_C Tcl_UniChar
   710 Tcl_UniCharAtIndex(src, index)
   711     register CONST char *src;	/* The UTF-8 string to dereference. */
   712     register int index;		/* The position of the desired character. */
   713 {
   714     Tcl_UniChar ch;
   715 
   716     while (index >= 0) {
   717 	index--;
   718 	src += TclUtfToUniChar(src, &ch);
   719     }
   720     return ch;
   721 }
   722 
   723 /*
   724  *---------------------------------------------------------------------------
   725  *
   726  * Tcl_UtfAtIndex --
   727  *
   728  *	Returns a pointer to the specified character (not byte) position
   729  *	in the UTF-8 string.
   730  *
   731  * Results:
   732  *	As above.
   733  *
   734  * Side effects:
   735  *	None.
   736  *
   737  *---------------------------------------------------------------------------
   738  */
   739 
   740 EXPORT_C CONST char *
   741 Tcl_UtfAtIndex(src, index)
   742     register CONST char *src;	/* The UTF-8 string. */
   743     register int index;		/* The position of the desired character. */
   744 {
   745     Tcl_UniChar ch;
   746     
   747     while (index > 0) {
   748 	index--;
   749 	src += TclUtfToUniChar(src, &ch);
   750     }
   751     return src;
   752 }
   753 
   754 /*
   755  *---------------------------------------------------------------------------
   756  *
   757  * Tcl_UtfBackslash --
   758  *
   759  *	Figure out how to handle a backslash sequence.
   760  *
   761  * Results:
   762  *	Stores the bytes represented by the backslash sequence in dst and
   763  *	returns the number of bytes written to dst.  At most TCL_UTF_MAX
   764  *	bytes are written to dst; dst must have been large enough to accept
   765  *	those bytes.  If readPtr isn't NULL then it is filled in with a
   766  *	count of the number of bytes in the backslash sequence.  
   767  *
   768  * Side effects:
   769  *	The maximum number of bytes it takes to represent a Unicode
   770  *	character in UTF-8 is guaranteed to be less than the number of
   771  *	bytes used to express the backslash sequence that represents
   772  *	that Unicode character.  If the target buffer into which the
   773  *	caller is going to store the bytes that represent the Unicode
   774  *	character is at least as large as the source buffer from which
   775  *	the backslashed sequence was extracted, no buffer overruns should
   776  *	occur.
   777  *
   778  *---------------------------------------------------------------------------
   779  */
   780 
   781 EXPORT_C int
   782 Tcl_UtfBackslash(src, readPtr, dst)
   783     CONST char *src;		/* Points to the backslash character of
   784 				 * a backslash sequence. */
   785     int *readPtr;		/* Fill in with number of characters read
   786 				 * from src, unless NULL. */
   787     char *dst;			/* Filled with the bytes represented by the
   788 				 * backslash sequence. */
   789 {
   790 #define LINE_LENGTH 128
   791     int numRead;
   792     int result;
   793 
   794     result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
   795     if (numRead == LINE_LENGTH) {
   796 	/* We ate a whole line.  Pay the price of a strlen() */
   797 	result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
   798     }
   799     if (readPtr != NULL) {
   800 	*readPtr = numRead;
   801     }
   802     return result;
   803 }
   804 
   805 /*
   806  *----------------------------------------------------------------------
   807  *
   808  * Tcl_UtfToUpper --
   809  *
   810  *	Convert lowercase characters to uppercase characters in a UTF
   811  *	string in place.  The conversion may shrink the UTF string.
   812  *
   813  * Results:
   814  *	Returns the number of bytes in the resulting string
   815  *	excluding the trailing null.
   816  *
   817  * Side effects:
   818  *	Writes a terminating null after the last converted character.
   819  *
   820  *----------------------------------------------------------------------
   821  */
   822 
   823 EXPORT_C int
   824 Tcl_UtfToUpper(str)
   825     char *str;			/* String to convert in place. */
   826 {
   827     Tcl_UniChar ch, upChar;
   828     char *src, *dst;
   829     int bytes;
   830 
   831     /*
   832      * Iterate over the string until we hit the terminating null.
   833      */
   834 
   835     src = dst = str;
   836     while (*src) {
   837         bytes = TclUtfToUniChar(src, &ch);
   838 	upChar = Tcl_UniCharToUpper(ch);
   839 
   840 	/*
   841 	 * To keep badly formed Utf strings from getting inflated by
   842 	 * the conversion (thereby causing a segfault), only copy the
   843 	 * upper case char to dst if its size is <= the original char.
   844 	 */
   845 	
   846 	if (bytes < UtfCount(upChar)) {
   847 	    memcpy(dst, src, (size_t) bytes);
   848 	    dst += bytes;
   849 	} else {
   850 	    dst += Tcl_UniCharToUtf(upChar, dst);
   851 	}
   852 	src += bytes;
   853     }
   854     *dst = '\0';
   855     return (dst - str);
   856 }
   857 
   858 /*
   859  *----------------------------------------------------------------------
   860  *
   861  * Tcl_UtfToLower --
   862  *
   863  *	Convert uppercase characters to lowercase characters in a UTF
   864  *	string in place.  The conversion may shrink the UTF string.
   865  *
   866  * Results:
   867  *	Returns the number of bytes in the resulting string
   868  *	excluding the trailing null.
   869  *
   870  * Side effects:
   871  *	Writes a terminating null after the last converted character.
   872  *
   873  *----------------------------------------------------------------------
   874  */
   875 
   876 EXPORT_C int
   877 Tcl_UtfToLower(str)
   878     char *str;			/* String to convert in place. */
   879 {
   880     Tcl_UniChar ch, lowChar;
   881     char *src, *dst;
   882     int bytes;
   883     
   884     /*
   885      * Iterate over the string until we hit the terminating null.
   886      */
   887 
   888     src = dst = str;
   889     while (*src) {
   890 	bytes = TclUtfToUniChar(src, &ch);
   891 	lowChar = Tcl_UniCharToLower(ch);
   892 
   893 	/*
   894 	 * To keep badly formed Utf strings from getting inflated by
   895 	 * the conversion (thereby causing a segfault), only copy the
   896 	 * lower case char to dst if its size is <= the original char.
   897 	 */
   898 	
   899 	if (bytes < UtfCount(lowChar)) {
   900 	    memcpy(dst, src, (size_t) bytes);
   901 	    dst += bytes;
   902 	} else {
   903 	    dst += Tcl_UniCharToUtf(lowChar, dst);
   904 	}
   905 	src += bytes;
   906     }
   907     *dst = '\0';
   908     return (dst - str);
   909 }
   910 
   911 /*
   912  *----------------------------------------------------------------------
   913  *
   914  * Tcl_UtfToTitle --
   915  *
   916  *	Changes the first character of a UTF string to title case or
   917  *	uppercase and the rest of the string to lowercase.  The
   918  *	conversion happens in place and may shrink the UTF string.
   919  *
   920  * Results:
   921  *	Returns the number of bytes in the resulting string
   922  *	excluding the trailing null.
   923  *
   924  * Side effects:
   925  *	Writes a terminating null after the last converted character.
   926  *
   927  *----------------------------------------------------------------------
   928  */
   929 
   930 EXPORT_C int
   931 Tcl_UtfToTitle(str)
   932     char *str;			/* String to convert in place. */
   933 {
   934     Tcl_UniChar ch, titleChar, lowChar;
   935     char *src, *dst;
   936     int bytes;
   937     
   938     /*
   939      * Capitalize the first character and then lowercase the rest of the
   940      * characters until we get to a null.
   941      */
   942 
   943     src = dst = str;
   944 
   945     if (*src) {
   946 	bytes = TclUtfToUniChar(src, &ch);
   947 	titleChar = Tcl_UniCharToTitle(ch);
   948 
   949 	if (bytes < UtfCount(titleChar)) {
   950 	    memcpy(dst, src, (size_t) bytes);
   951 	    dst += bytes;
   952 	} else {
   953 	    dst += Tcl_UniCharToUtf(titleChar, dst);
   954 	}
   955 	src += bytes;
   956     }
   957     while (*src) {
   958 	bytes = TclUtfToUniChar(src, &ch);
   959 	lowChar = Tcl_UniCharToLower(ch);
   960 
   961 	if (bytes < UtfCount(lowChar)) {
   962 	    memcpy(dst, src, (size_t) bytes);
   963 	    dst += bytes;
   964 	} else {
   965 	    dst += Tcl_UniCharToUtf(lowChar, dst);
   966 	}
   967 	src += bytes;
   968     }
   969     *dst = '\0';
   970     return (dst - str);
   971 }
   972 
   973 /*
   974  *----------------------------------------------------------------------
   975  *
   976  * TclpUtfNcmp2 --
   977  *
   978  *	Compare at most n bytes of utf-8 strings cs and ct.  Both cs
   979  *	and ct are assumed to be at least n bytes long.
   980  *
   981  * Results:
   982  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
   983  *
   984  * Side effects:
   985  *	None.
   986  *
   987  *----------------------------------------------------------------------
   988  */
   989 
   990 int
   991 TclpUtfNcmp2(cs, ct, n)
   992     CONST char *cs;		/* UTF string to compare to ct. */
   993     CONST char *ct;		/* UTF string cs is compared to. */
   994     unsigned long n;		/* Number of *bytes* to compare. */
   995 {
   996     /*
   997      * We can't simply call 'memcmp(cs, ct, n);' because we need to check
   998      * for Tcl's \xC0\x80 non-utf-8 null encoding.
   999      * Otherwise utf-8 lexes fine in the strcmp manner.
  1000      */
  1001     register int result = 0;
  1002 
  1003     for ( ; n != 0; n--, cs++, ct++) {
  1004 	if (*cs != *ct) {
  1005 	    result = UCHAR(*cs) - UCHAR(*ct);
  1006 	    break;
  1007 	}
  1008     }
  1009     if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
  1010 	unsigned char c1, c2;
  1011 	c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
  1012 	c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
  1013 	result = (c1 - c2);
  1014     }
  1015     return result;
  1016 }
  1017 
  1018 /*
  1019  *----------------------------------------------------------------------
  1020  *
  1021  * Tcl_UtfNcmp --
  1022  *
  1023  *	Compare at most n UTF chars of string cs to string ct.  Both cs
  1024  *	and ct are assumed to be at least n UTF chars long.
  1025  *
  1026  * Results:
  1027  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
  1028  *
  1029  * Side effects:
  1030  *	None.
  1031  *
  1032  *----------------------------------------------------------------------
  1033  */
  1034 
  1035 EXPORT_C int
  1036 Tcl_UtfNcmp(cs, ct, n)
  1037     CONST char *cs;		/* UTF string to compare to ct. */
  1038     CONST char *ct;		/* UTF string cs is compared to. */
  1039     unsigned long n;		/* Number of UTF chars to compare. */
  1040 {
  1041     Tcl_UniChar ch1, ch2;
  1042     /*
  1043      * Cannot use 'memcmp(cs, ct, n);' as byte representation of
  1044      * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
  1045      * representation of \u0001 (the byte 0x01.)
  1046      */
  1047     while (n-- > 0) {
  1048 	/*
  1049 	 * n must be interpreted as chars, not bytes.
  1050 	 * This should be called only when both strings are of
  1051 	 * at least n chars long (no need for \0 check)
  1052 	 */
  1053 	cs += TclUtfToUniChar(cs, &ch1);
  1054 	ct += TclUtfToUniChar(ct, &ch2);
  1055 	if (ch1 != ch2) {
  1056 	    return (ch1 - ch2);
  1057 	}
  1058     }
  1059     return 0;
  1060 }
  1061 
  1062 /*
  1063  *----------------------------------------------------------------------
  1064  *
  1065  * Tcl_UtfNcasecmp --
  1066  *
  1067  *	Compare at most n UTF chars of string cs to string ct case
  1068  *	insensitive.  Both cs and ct are assumed to be at least n
  1069  *	UTF chars long.
  1070  *
  1071  * Results:
  1072  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
  1073  *
  1074  * Side effects:
  1075  *	None.
  1076  *
  1077  *----------------------------------------------------------------------
  1078  */
  1079 
  1080 EXPORT_C int
  1081 Tcl_UtfNcasecmp(cs, ct, n)
  1082     CONST char *cs;		/* UTF string to compare to ct. */
  1083     CONST char *ct;		/* UTF string cs is compared to. */
  1084     unsigned long n;			/* Number of UTF chars to compare. */
  1085 {
  1086     Tcl_UniChar ch1, ch2;
  1087     while (n-- > 0) {
  1088 	/*
  1089 	 * n must be interpreted as chars, not bytes.
  1090 	 * This should be called only when both strings are of
  1091 	 * at least n chars long (no need for \0 check)
  1092 	 */
  1093 	cs += TclUtfToUniChar(cs, &ch1);
  1094 	ct += TclUtfToUniChar(ct, &ch2);
  1095 	if (ch1 != ch2) {
  1096 	    ch1 = Tcl_UniCharToLower(ch1);
  1097 	    ch2 = Tcl_UniCharToLower(ch2);
  1098 	    if (ch1 != ch2) {
  1099 		return (ch1 - ch2);
  1100 	    }
  1101 	}
  1102     }
  1103     return 0;
  1104 }
  1105 
  1106 /*
  1107  *----------------------------------------------------------------------
  1108  *
  1109  * Tcl_UniCharToUpper --
  1110  *
  1111  *	Compute the uppercase equivalent of the given Unicode character.
  1112  *
  1113  * Results:
  1114  *	Returns the uppercase Unicode character.
  1115  *
  1116  * Side effects:
  1117  *	None.
  1118  *
  1119  *----------------------------------------------------------------------
  1120  */
  1121 
  1122 EXPORT_C Tcl_UniChar
  1123 Tcl_UniCharToUpper(ch)
  1124     int ch;			/* Unicode character to convert. */
  1125 {
  1126     int info = GetUniCharInfo(ch);
  1127 
  1128     if (GetCaseType(info) & 0x04) {
  1129 	return (Tcl_UniChar) (ch - GetDelta(info));
  1130     } else {
  1131 	return ch;
  1132     }
  1133 }
  1134 
  1135 /*
  1136  *----------------------------------------------------------------------
  1137  *
  1138  * Tcl_UniCharToLower --
  1139  *
  1140  *	Compute the lowercase equivalent of the given Unicode character.
  1141  *
  1142  * Results:
  1143  *	Returns the lowercase Unicode character.
  1144  *
  1145  * Side effects:
  1146  *	None.
  1147  *
  1148  *----------------------------------------------------------------------
  1149  */
  1150 
  1151 EXPORT_C Tcl_UniChar
  1152 Tcl_UniCharToLower(ch)
  1153     int ch;			/* Unicode character to convert. */
  1154 {
  1155     int info = GetUniCharInfo(ch);
  1156 
  1157     if (GetCaseType(info) & 0x02) {
  1158 	return (Tcl_UniChar) (ch + GetDelta(info));
  1159     } else {
  1160 	return ch;
  1161     }
  1162 }
  1163 
  1164 /*
  1165  *----------------------------------------------------------------------
  1166  *
  1167  * Tcl_UniCharToTitle --
  1168  *
  1169  *	Compute the titlecase equivalent of the given Unicode character.
  1170  *
  1171  * Results:
  1172  *	Returns the titlecase Unicode character.
  1173  *
  1174  * Side effects:
  1175  *	None.
  1176  *
  1177  *----------------------------------------------------------------------
  1178  */
  1179 
  1180 EXPORT_C Tcl_UniChar
  1181 Tcl_UniCharToTitle(ch)
  1182     int ch;			/* Unicode character to convert. */
  1183 {
  1184     int info = GetUniCharInfo(ch);
  1185     int mode = GetCaseType(info);
  1186 
  1187     if (mode & 0x1) {
  1188 	/*
  1189 	 * Subtract or add one depending on the original case.
  1190 	 */
  1191 
  1192 	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
  1193     } else if (mode == 0x4) {
  1194 	return (Tcl_UniChar) (ch - GetDelta(info));
  1195     } else {
  1196 	return ch;
  1197     }
  1198 }
  1199 
  1200 /*
  1201  *----------------------------------------------------------------------
  1202  *
  1203  * Tcl_UniCharLen --
  1204  *
  1205  *	Find the length of a UniChar string.  The str input must be null
  1206  *	terminated.
  1207  *
  1208  * Results:
  1209  *	Returns the length of str in UniChars (not bytes).
  1210  *
  1211  * Side effects:
  1212  *	None.
  1213  *
  1214  *----------------------------------------------------------------------
  1215  */
  1216 
  1217 EXPORT_C int
  1218 Tcl_UniCharLen(str)
  1219     CONST Tcl_UniChar *str;	/* Unicode string to find length of. */
  1220 {
  1221     int len = 0;
  1222     
  1223     while (*str != '\0') {
  1224 	len++;
  1225 	str++;
  1226     }
  1227     return len;
  1228 }
  1229 
  1230 /*
  1231  *----------------------------------------------------------------------
  1232  *
  1233  * Tcl_UniCharNcmp --
  1234  *
  1235  *	Compare at most n unichars of string cs to string ct.  Both cs
  1236  *	and ct are assumed to be at least n unichars long.
  1237  *
  1238  * Results:
  1239  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
  1240  *
  1241  * Side effects:
  1242  *	None.
  1243  *
  1244  *----------------------------------------------------------------------
  1245  */
  1246 
  1247 EXPORT_C int
  1248 Tcl_UniCharNcmp(cs, ct, n)
  1249     CONST Tcl_UniChar *cs;		/* Unicode string to compare to ct. */
  1250     CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */
  1251     unsigned long n;			/* Number of unichars to compare. */
  1252 {
  1253 #ifdef WORDS_BIGENDIAN
  1254     /*
  1255      * We are definitely on a big-endian machine; memcmp() is safe
  1256      */
  1257     return memcmp(cs, ct, n*sizeof(Tcl_UniChar));
  1258 
  1259 #else /* !WORDS_BIGENDIAN */
  1260     /*
  1261      * We can't simply call memcmp() because that is not lexically correct.
  1262      */
  1263     for ( ; n != 0; cs++, ct++, n--) {
  1264 	if (*cs != *ct) {
  1265 	    return (*cs - *ct);
  1266 	}
  1267     }
  1268     return 0;
  1269 #endif /* WORDS_BIGENDIAN */
  1270 }
  1271 
  1272 /*
  1273  *----------------------------------------------------------------------
  1274  *
  1275  * Tcl_UniCharNcasecmp --
  1276  *
  1277  *	Compare at most n unichars of string cs to string ct case
  1278  *	insensitive.  Both cs and ct are assumed to be at least n
  1279  *	unichars long.
  1280  *
  1281  * Results:
  1282  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
  1283  *
  1284  * Side effects:
  1285  *	None.
  1286  *
  1287  *----------------------------------------------------------------------
  1288  */
  1289 
  1290 EXPORT_C int
  1291 Tcl_UniCharNcasecmp(cs, ct, n)
  1292     CONST Tcl_UniChar *cs;		/* Unicode string to compare to ct. */
  1293     CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */
  1294     unsigned long n;			/* Number of unichars to compare. */
  1295 {
  1296     for ( ; n != 0; n--, cs++, ct++) {
  1297 	if (*cs != *ct) {
  1298 	    Tcl_UniChar lcs = Tcl_UniCharToLower(*cs);
  1299 	    Tcl_UniChar lct = Tcl_UniCharToLower(*ct);
  1300 	    if (lcs != lct) {
  1301 		return (lcs - lct);
  1302 	    }
  1303 	}
  1304     }
  1305     return 0;
  1306 }
  1307 
  1308 /*
  1309  *----------------------------------------------------------------------
  1310  *
  1311  * Tcl_UniCharIsAlnum --
  1312  *
  1313  *	Test if a character is an alphanumeric Unicode character.
  1314  *
  1315  * Results:
  1316  *	Returns 1 if character is alphanumeric.
  1317  *
  1318  * Side effects:
  1319  *	None.
  1320  *
  1321  *----------------------------------------------------------------------
  1322  */
  1323 
  1324 EXPORT_C int
  1325 Tcl_UniCharIsAlnum(ch)
  1326     int ch;			/* Unicode character to test. */
  1327 {
  1328     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
  1329 
  1330     return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
  1331 }
  1332 
  1333 /*
  1334  *----------------------------------------------------------------------
  1335  *
  1336  * Tcl_UniCharIsAlpha --
  1337  *
  1338  *	Test if a character is an alphabetic Unicode character.
  1339  *
  1340  * Results:
  1341  *	Returns 1 if character is alphabetic.
  1342  *
  1343  * Side effects:
  1344  *	None.
  1345  *
  1346  *----------------------------------------------------------------------
  1347  */
  1348 
  1349 EXPORT_C int
  1350 Tcl_UniCharIsAlpha(ch)
  1351     int ch;			/* Unicode character to test. */
  1352 {
  1353     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
  1354     return ((ALPHA_BITS >> category) & 1);
  1355 }
  1356 
  1357 /*
  1358  *----------------------------------------------------------------------
  1359  *
  1360  * Tcl_UniCharIsControl --
  1361  *
  1362  *	Test if a character is a Unicode control character.
  1363  *
  1364  * Results:
  1365  *	Returns non-zero if character is a control.
  1366  *
  1367  * Side effects:
  1368  *	None.
  1369  *
  1370  *----------------------------------------------------------------------
  1371  */
  1372 
  1373 EXPORT_C int
  1374 Tcl_UniCharIsControl(ch)
  1375     int ch;			/* Unicode character to test. */
  1376 {
  1377     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
  1378 }
  1379 
  1380 /*
  1381  *----------------------------------------------------------------------
  1382  *
  1383  * Tcl_UniCharIsDigit --
  1384  *
  1385  *	Test if a character is a numeric Unicode character.
  1386  *
  1387  * Results:
  1388  *	Returns non-zero if character is a digit.
  1389  *
  1390  * Side effects:
  1391  *	None.
  1392  *
  1393  *----------------------------------------------------------------------
  1394  */
  1395 
  1396 EXPORT_C int
  1397 Tcl_UniCharIsDigit(ch)
  1398     int ch;			/* Unicode character to test. */
  1399 {
  1400     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
  1401 	    == DECIMAL_DIGIT_NUMBER);
  1402 }
  1403 
  1404 /*
  1405  *----------------------------------------------------------------------
  1406  *
  1407  * Tcl_UniCharIsGraph --
  1408  *
  1409  *	Test if a character is any Unicode print character except space.
  1410  *
  1411  * Results:
  1412  *	Returns non-zero if character is printable, but not space.
  1413  *
  1414  * Side effects:
  1415  *	None.
  1416  *
  1417  *----------------------------------------------------------------------
  1418  */
  1419 
  1420 EXPORT_C int
  1421 Tcl_UniCharIsGraph(ch)
  1422     int ch;			/* Unicode character to test. */
  1423 {
  1424     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
  1425     return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
  1426 }
  1427 
  1428 /*
  1429  *----------------------------------------------------------------------
  1430  *
  1431  * Tcl_UniCharIsLower --
  1432  *
  1433  *	Test if a character is a lowercase Unicode character.
  1434  *
  1435  * Results:
  1436  *	Returns non-zero if character is lowercase.
  1437  *
  1438  * Side effects:
  1439  *	None.
  1440  *
  1441  *----------------------------------------------------------------------
  1442  */
  1443 
  1444 EXPORT_C int
  1445 Tcl_UniCharIsLower(ch)
  1446     int ch;			/* Unicode character to test. */
  1447 {
  1448     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
  1449 }
  1450 
  1451 /*
  1452  *----------------------------------------------------------------------
  1453  *
  1454  * Tcl_UniCharIsPrint --
  1455  *
  1456  *	Test if a character is a Unicode print character.
  1457  *
  1458  * Results:
  1459  *	Returns non-zero if character is printable.
  1460  *
  1461  * Side effects:
  1462  *	None.
  1463  *
  1464  *----------------------------------------------------------------------
  1465  */
  1466 
  1467 EXPORT_C int
  1468 Tcl_UniCharIsPrint(ch)
  1469     int ch;			/* Unicode character to test. */
  1470 {
  1471     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
  1472     return ((PRINT_BITS >> category) & 1);
  1473 }
  1474 
  1475 /*
  1476  *----------------------------------------------------------------------
  1477  *
  1478  * Tcl_UniCharIsPunct --
  1479  *
  1480  *	Test if a character is a Unicode punctuation character.
  1481  *
  1482  * Results:
  1483  *	Returns non-zero if character is punct.
  1484  *
  1485  * Side effects:
  1486  *	None.
  1487  *
  1488  *----------------------------------------------------------------------
  1489  */
  1490 
  1491 EXPORT_C int
  1492 Tcl_UniCharIsPunct(ch)
  1493     int ch;			/* Unicode character to test. */
  1494 {
  1495     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
  1496     return ((PUNCT_BITS >> category) & 1);
  1497 }
  1498 
  1499 /*
  1500  *----------------------------------------------------------------------
  1501  *
  1502  * Tcl_UniCharIsSpace --
  1503  *
  1504  *	Test if a character is a whitespace Unicode character.
  1505  *
  1506  * Results:
  1507  *	Returns non-zero if character is a space.
  1508  *
  1509  * Side effects:
  1510  *	None.
  1511  *
  1512  *----------------------------------------------------------------------
  1513  */
  1514 
  1515 EXPORT_C int
  1516 Tcl_UniCharIsSpace(ch)
  1517     int ch;			/* Unicode character to test. */
  1518 {
  1519     register int category;
  1520 
  1521     /*
  1522      * If the character is within the first 127 characters, just use the
  1523      * standard C function, otherwise consult the Unicode table.
  1524      */
  1525 
  1526     if (ch < 0x80) {
  1527 	return isspace(UCHAR(ch)); /* INTL: ISO space */
  1528     } else {
  1529 	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
  1530 	return ((SPACE_BITS >> category) & 1);
  1531     }
  1532 }
  1533 
  1534 /*
  1535  *----------------------------------------------------------------------
  1536  *
  1537  * Tcl_UniCharIsUpper --
  1538  *
  1539  *	Test if a character is a uppercase Unicode character.
  1540  *
  1541  * Results:
  1542  *	Returns non-zero if character is uppercase.
  1543  *
  1544  * Side effects:
  1545  *	None.
  1546  *
  1547  *----------------------------------------------------------------------
  1548  */
  1549 
  1550 EXPORT_C int
  1551 Tcl_UniCharIsUpper(ch)
  1552     int ch;			/* Unicode character to test. */
  1553 {
  1554     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
  1555 }
  1556 
  1557 /*
  1558  *----------------------------------------------------------------------
  1559  *
  1560  * Tcl_UniCharIsWordChar --
  1561  *
  1562  *	Test if a character is alphanumeric or a connector punctuation
  1563  *	mark.
  1564  *
  1565  * Results:
  1566  *	Returns 1 if character is a word character.
  1567  *
  1568  * Side effects:
  1569  *	None.
  1570  *
  1571  *----------------------------------------------------------------------
  1572  */
  1573 
  1574 EXPORT_C int
  1575 Tcl_UniCharIsWordChar(ch)
  1576     int ch;			/* Unicode character to test. */
  1577 {
  1578     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
  1579 
  1580     return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
  1581 }
  1582 
  1583 /*
  1584  *----------------------------------------------------------------------
  1585  *
  1586  * Tcl_UniCharCaseMatch --
  1587  *
  1588  *	See if a particular Unicode string matches a particular pattern.
  1589  *	Allows case insensitivity.  This is the Unicode equivalent of
  1590  *	the char* Tcl_StringCaseMatch.  The UniChar strings must be
  1591  *	NULL-terminated.  This has no provision for counted UniChar
  1592  *	strings, thus should not be used where NULLs are expected in the
  1593  *	UniChar string.  Use TclUniCharMatch where possible.
  1594  *
  1595  * Results:
  1596  *	The return value is 1 if string matches pattern, and
  1597  *	0 otherwise.  The matching operation permits the following
  1598  *	special characters in the pattern: *?\[] (see the manual
  1599  *	entry for details on what these mean).
  1600  *
  1601  * Side effects:
  1602  *	None.
  1603  *
  1604  *----------------------------------------------------------------------
  1605  */
  1606 
  1607 EXPORT_C int
  1608 Tcl_UniCharCaseMatch(string, pattern, nocase)
  1609     CONST Tcl_UniChar *string;	/* Unicode String. */
  1610     CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special
  1611 				 * characters. */
  1612     int nocase;			/* 0 for case sensitive, 1 for insensitive */
  1613 {
  1614     Tcl_UniChar ch1, p;
  1615     
  1616     while (1) {
  1617 	p = *pattern;
  1618 	
  1619 	/*
  1620 	 * See if we're at the end of both the pattern and the string.  If
  1621 	 * so, we succeeded.  If we're at the end of the pattern but not at
  1622 	 * the end of the string, we failed.
  1623 	 */
  1624 	
  1625 	if (p == 0) {
  1626 	    return (*string == 0);
  1627 	}
  1628 	if ((*string == 0) && (p != '*')) {
  1629 	    return 0;
  1630 	}
  1631 
  1632 	/*
  1633 	 * Check for a "*" as the next pattern character.  It matches any
  1634 	 * substring.  We handle this by skipping all the characters up to the
  1635 	 * next matching one in the pattern, and then calling ourselves
  1636 	 * recursively for each postfix of string, until either we match or we
  1637 	 * reach the end of the string.
  1638 	 */
  1639 	
  1640 	if (p == '*') {
  1641 	    /*
  1642 	     * Skip all successive *'s in the pattern
  1643 	     */
  1644 	    while (*(++pattern) == '*') {}
  1645 	    p = *pattern;
  1646 	    if (p == 0) {
  1647 		return 1;
  1648 	    }
  1649 	    if (nocase) {
  1650 		p = Tcl_UniCharToLower(p);
  1651 	    }
  1652 	    while (1) {
  1653 		/*
  1654 		 * Optimization for matching - cruise through the string
  1655 		 * quickly if the next char in the pattern isn't a special
  1656 		 * character
  1657 		 */
  1658 		if ((p != '[') && (p != '?') && (p != '\\')) {
  1659 		    if (nocase) {
  1660 			while (*string && (p != *string)
  1661 				&& (p != Tcl_UniCharToLower(*string))) {
  1662 			    string++;
  1663 			}
  1664 		    } else {
  1665 			while (*string && (p != *string)) { string++; }
  1666 		    }
  1667 		}
  1668 		if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {
  1669 		    return 1;
  1670 		}
  1671 		if (*string == 0) {
  1672 		    return 0;
  1673 		}
  1674 		string++;
  1675 	    }
  1676 	}
  1677 
  1678 	/*
  1679 	 * Check for a "?" as the next pattern character.  It matches
  1680 	 * any single character.
  1681 	 */
  1682 
  1683 	if (p == '?') {
  1684 	    pattern++;
  1685 	    string++;
  1686 	    continue;
  1687 	}
  1688 
  1689 	/*
  1690 	 * Check for a "[" as the next pattern character.  It is followed
  1691 	 * by a list of characters that are acceptable, or by a range
  1692 	 * (two characters separated by "-").
  1693 	 */
  1694 	
  1695 	if (p == '[') {
  1696 	    Tcl_UniChar startChar, endChar;
  1697 
  1698 	    pattern++;
  1699 	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
  1700 	    string++;
  1701 	    while (1) {
  1702 		if ((*pattern == ']') || (*pattern == 0)) {
  1703 		    return 0;
  1704 		}
  1705 		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
  1706 		pattern++;
  1707 		if (*pattern == '-') {
  1708 		    pattern++;
  1709 		    if (*pattern == 0) {
  1710 			return 0;
  1711 		    }
  1712 		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
  1713 			    : *pattern);
  1714 		    pattern++;
  1715 		    if (((startChar <= ch1) && (ch1 <= endChar))
  1716 			    || ((endChar <= ch1) && (ch1 <= startChar))) {
  1717 			/*
  1718 			 * Matches ranges of form [a-z] or [z-a].
  1719 			 */
  1720 			break;
  1721 		    }
  1722 		} else if (startChar == ch1) {
  1723 		    break;
  1724 		}
  1725 	    }
  1726 	    while (*pattern != ']') {
  1727 		if (*pattern == 0) {
  1728 		    pattern--;
  1729 		    break;
  1730 		}
  1731 		pattern++;
  1732 	    }
  1733 	    pattern++;
  1734 	    continue;
  1735 	}
  1736 
  1737 	/*
  1738 	 * If the next pattern character is '\', just strip off the '\'
  1739 	 * so we do exact matching on the character that follows.
  1740 	 */
  1741 
  1742 	if (p == '\\') {
  1743 	    if (*(++pattern) == '\0') {
  1744 		return 0;
  1745 	    }
  1746 	}
  1747 
  1748 	/*
  1749 	 * There's no special character.  Just make sure that the next
  1750 	 * bytes of each string match.
  1751 	 */
  1752 
  1753 	if (nocase) {
  1754 	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
  1755 		return 0;
  1756 	    }
  1757 	} else if (*string != *pattern) {
  1758 	    return 0;
  1759 	}
  1760 	string++;
  1761 	pattern++;
  1762     }
  1763 }
  1764 
  1765 /*
  1766  *----------------------------------------------------------------------
  1767  *
  1768  * TclUniCharMatch --
  1769  *
  1770  *	See if a particular Unicode string matches a particular pattern.
  1771  *	Allows case insensitivity.  This is the Unicode equivalent of the
  1772  *	char* Tcl_StringCaseMatch.  This variant of Tcl_UniCharCaseMatch
  1773  *	uses counted Strings, so embedded NULLs are allowed.
  1774  *
  1775  * Results:
  1776  *	The return value is 1 if string matches pattern, and
  1777  *	0 otherwise.  The matching operation permits the following
  1778  *	special characters in the pattern: *?\[] (see the manual
  1779  *	entry for details on what these mean).
  1780  *
  1781  * Side effects:
  1782  *	None.
  1783  *
  1784  *----------------------------------------------------------------------
  1785  */
  1786 
  1787 int
  1788 TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
  1789     CONST Tcl_UniChar *string;	/* Unicode String. */
  1790     int strLen;			/* length of String */
  1791     CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special
  1792 				 * characters. */
  1793     int ptnLen;			/* length of Pattern */
  1794     int nocase;			/* 0 for case sensitive, 1 for insensitive */
  1795 {
  1796     CONST Tcl_UniChar *stringEnd, *patternEnd;
  1797     Tcl_UniChar p;
  1798 
  1799     stringEnd  = string + strLen;
  1800     patternEnd = pattern + ptnLen;
  1801 
  1802     while (1) {
  1803 	/*
  1804 	 * See if we're at the end of both the pattern and the string.  If
  1805 	 * so, we succeeded.  If we're at the end of the pattern but not at
  1806 	 * the end of the string, we failed.
  1807 	 */
  1808 
  1809 	if (pattern == patternEnd) {
  1810 	    return (string == stringEnd);
  1811 	}
  1812 	p = *pattern;
  1813 	if ((string == stringEnd) && (p != '*')) {
  1814 	    return 0;
  1815 	}
  1816 
  1817 	/*
  1818 	 * Check for a "*" as the next pattern character.  It matches any
  1819 	 * substring.  We handle this by skipping all the characters up to the
  1820 	 * next matching one in the pattern, and then calling ourselves
  1821 	 * recursively for each postfix of string, until either we match or we
  1822 	 * reach the end of the string.
  1823 	 */
  1824 	
  1825 	if (p == '*') {
  1826 	    /*
  1827 	     * Skip all successive *'s in the pattern
  1828 	     */
  1829 	    while (*(++pattern) == '*') {}
  1830 	    if (pattern == patternEnd) {
  1831 		return 1;
  1832 	    }
  1833 	    p = *pattern;
  1834 	    if (nocase) {
  1835 		p = Tcl_UniCharToLower(p);
  1836 	    }
  1837 	    while (1) {
  1838 		/*
  1839 		 * Optimization for matching - cruise through the string
  1840 		 * quickly if the next char in the pattern isn't a special
  1841 		 * character
  1842 		 */
  1843 		if ((p != '[') && (p != '?') && (p != '\\')) {
  1844 		    if (nocase) {
  1845 			while ((string < stringEnd) && (p != *string)
  1846 				&& (p != Tcl_UniCharToLower(*string))) {
  1847 			    string++;
  1848 			}
  1849 		    } else {
  1850 			while ((string < stringEnd) && (p != *string)) {
  1851 			    string++;
  1852 			}
  1853 		    }
  1854 		}
  1855 		if (TclUniCharMatch(string, stringEnd - string,
  1856 			pattern, patternEnd - pattern, nocase)) {
  1857 		    return 1;
  1858 		}
  1859 		if (string == stringEnd) {
  1860 		    return 0;
  1861 		}
  1862 		string++;
  1863 	    }
  1864 	}
  1865 
  1866 	/*
  1867 	 * Check for a "?" as the next pattern character.  It matches
  1868 	 * any single character.
  1869 	 */
  1870 
  1871 	if (p == '?') {
  1872 	    pattern++;
  1873 	    string++;
  1874 	    continue;
  1875 	}
  1876 
  1877 	/*
  1878 	 * Check for a "[" as the next pattern character.  It is followed
  1879 	 * by a list of characters that are acceptable, or by a range
  1880 	 * (two characters separated by "-").
  1881 	 */
  1882 	
  1883 	if (p == '[') {
  1884 	    Tcl_UniChar ch1, startChar, endChar;
  1885 
  1886 	    pattern++;
  1887 	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
  1888 	    string++;
  1889 	    while (1) {
  1890 		if ((*pattern == ']') || (pattern == patternEnd)) {
  1891 		    return 0;
  1892 		}
  1893 		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
  1894 		pattern++;
  1895 		if (*pattern == '-') {
  1896 		    pattern++;
  1897 		    if (pattern == patternEnd) {
  1898 			return 0;
  1899 		    }
  1900 		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
  1901 			    : *pattern);
  1902 		    pattern++;
  1903 		    if (((startChar <= ch1) && (ch1 <= endChar))
  1904 			    || ((endChar <= ch1) && (ch1 <= startChar))) {
  1905 			/*
  1906 			 * Matches ranges of form [a-z] or [z-a].
  1907 			 */
  1908 			break;
  1909 		    }
  1910 		} else if (startChar == ch1) {
  1911 		    break;
  1912 		}
  1913 	    }
  1914 	    while (*pattern != ']') {
  1915 		if (pattern == patternEnd) {
  1916 		    pattern--;
  1917 		    break;
  1918 		}
  1919 		pattern++;
  1920 	    }
  1921 	    pattern++;
  1922 	    continue;
  1923 	}
  1924 
  1925 	/*
  1926 	 * If the next pattern character is '\', just strip off the '\'
  1927 	 * so we do exact matching on the character that follows.
  1928 	 */
  1929 
  1930 	if (p == '\\') {
  1931 	    if (++pattern == patternEnd) {
  1932 		return 0;
  1933 	    }
  1934 	}
  1935 
  1936 	/*
  1937 	 * There's no special character.  Just make sure that the next
  1938 	 * bytes of each string match.
  1939 	 */
  1940 
  1941 	if (nocase) {
  1942 	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
  1943 		return 0;
  1944 	    }
  1945 	} else if (*string != *pattern) {
  1946 	    return 0;
  1947 	}
  1948 	string++;
  1949 	pattern++;
  1950     }
  1951 }