Symaptic: os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/generic/tclUtf.c@260cb5ec6c19

     1 /*

     2  * tclUtf.c --

     3  *

     4  *	Routines for manipulating UTF-8 strings.

     5  *

     6  * Copyright (c) 1997-1998 Sun Microsystems, Inc.

     7  * Portions Copyright (c) 2007-2008 Nokia Corporation and/or its subsidiaries. All rights reserved.

     8  *

     9  * See the file "license.terms" for information on usage and redistribution

    10  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.

    11  *

    12  * RCS: @(#) $Id: tclUtf.c,v 1.30.2.3 2005/09/07 14:35:56 dgp Exp $

    13  */

    15 #include "tclInt.h"

    17 /*

    18  * Include the static character classification tables and macros.

    19  */

    21 #include "tclUniData.c"

    23 /*

    24  * The following macros are used for fast character category tests.  The

    25  * x_BITS values are shifted right by the category value to determine whether

    26  * the given category is included in the set.

    27  */

    29 #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \

    30     | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))

    32 #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

    34 #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \

    35     | (1 << PARAGRAPH_SEPARATOR))

    37 #define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

    39 #define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \

    40 	    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \

    41 	    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \

    42 	    (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \

    43 	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \

    44 	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \

    45 	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \

    46 	    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \

    47 	    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

    49 #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \

    50 	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \

    51 	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \

    52 	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))

    54 /*

    55  * Unicode characters less than this value are represented by themselves

    56  * in UTF-8 strings.

    57  */

    59 #define UNICODE_SELF	0x80

    61 /*

    62  * The following structures are used when mapping between Unicode (UCS-2)

    63  * and UTF-8.

    64  */

    66 static CONST unsigned char totalBytes[256] = {

    67     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    68     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    69     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    70     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    71     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    72     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    73     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,

    74     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

    75 #if TCL_UTF_MAX > 3

    76     4,4,4,4,4,4,4,4,

    77 #else

    78     1,1,1,1,1,1,1,1,

    79 #endif

    80 #if TCL_UTF_MAX > 4

    81     5,5,5,5,

    82 #else

    83     1,1,1,1,

    84 #endif

    85 #if TCL_UTF_MAX > 5

    86     6,6,6,6

    87 #else

    88     1,1,1,1

    89 #endif

    90 };

    92 /*

    93  * Procedures used only in this module.

    94  */

    96 static int UtfCount _ANSI_ARGS_((int ch));

    99 /*

   100  *---------------------------------------------------------------------------

   101  *

   102  * UtfCount --

   103  *

   104  *	Find the number of bytes in the Utf character "ch".

   105  *

   106  * Results:

   107  *	The return values is the number of bytes in the Utf character "ch".

   108  *

   109  * Side effects:

   110  *	None.

   111  *

   112  *---------------------------------------------------------------------------

   113  */

   115 INLINE static int

   116 UtfCount(ch)

   117     int ch;			/* The Tcl_UniChar whose size is returned. */

   118 {

   119     if ((ch > 0) && (ch < UNICODE_SELF)) {

   120 	return 1;

   121     }

   122     if (ch <= 0x7FF) {

   123 	return 2;

   124     }

   125     if (ch <= 0xFFFF) {

   126 	return 3;

   127     }

   128 #if TCL_UTF_MAX > 3

   129     if (ch <= 0x1FFFFF) {

   130 	return 4;

   131     }

   132     if (ch <= 0x3FFFFFF) {

   133 	return 5;

   134     }

   135     if (ch <= 0x7FFFFFFF) {

   136 	return 6;

   137     }

   138 #endif

   139     return 3;

   140 }

   142 /*

   143  *---------------------------------------------------------------------------

   144  *

   145  * Tcl_UniCharToUtf --

   146  *

   147  *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the

   148  *	provided buffer.  Equivalent to Plan 9 runetochar().

   149  *

   150  * Results:

   151  *	The return values is the number of bytes in the buffer that

   152  *	were consumed.

   153  *

   154  * Side effects:

   155  *	None.

   156  *

   157  *---------------------------------------------------------------------------

   158  */

   160 EXPORT_C INLINE int

   161 Tcl_UniCharToUtf(ch, str)

   162     int ch;			/* The Tcl_UniChar to be stored in the

   163 				 * buffer. */

   164     char *str;			/* Buffer in which the UTF-8 representation

   165 				 * of the Tcl_UniChar is stored.  Buffer must

   166 				 * be large enough to hold the UTF-8 character

   167 				 * (at most TCL_UTF_MAX bytes). */

   168 {

   169     if ((ch > 0) && (ch < UNICODE_SELF)) {

   170 	str[0] = (char) ch;

   171 	return 1;

   172     }

   173     if (ch >= 0) {

   174 	if (ch <= 0x7FF) {

   175 	    str[1] = (char) ((ch | 0x80) & 0xBF);

   176 	    str[0] = (char) ((ch >> 6) | 0xC0);

   177 	    return 2;

   178 	}

   179 	if (ch <= 0xFFFF) {

   180 	three:

   181 	    str[2] = (char) ((ch | 0x80) & 0xBF);

   182 	    str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);

   183 	    str[0] = (char) ((ch >> 12) | 0xE0);

   184 	    return 3;

   185 	}

   187 #if TCL_UTF_MAX > 3

   188 	if (ch <= 0x1FFFFF) {

   189 	    str[3] = (char) ((ch | 0x80) & 0xBF);

   190 	    str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);

   191 	    str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);

   192 	    str[0] = (char) ((ch >> 18) | 0xF0);

   193 	    return 4;

   194 	}

   195 	if (ch <= 0x3FFFFFF) {

   196 	    str[4] = (char) ((ch | 0x80) & 0xBF);

   197 	    str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);

   198 	    str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);

   199 	    str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);

   200 	    str[0] = (char) ((ch >> 24) | 0xF8);

   201 	    return 5;

   202 	}

   203 	if (ch <= 0x7FFFFFFF) {

   204 	    str[5] = (char) ((ch | 0x80) & 0xBF);

   205 	    str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);

   206 	    str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);

   207 	    str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);

   208 	    str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);

   209 	    str[0] = (char) ((ch >> 30) | 0xFC);

   210 	    return 6;

   211 	}

   212 #endif

   213     }

   215     ch = 0xFFFD;

   216     goto three;

   217 }

   219 /*

   220  *---------------------------------------------------------------------------

   221  *

   222  * Tcl_UniCharToUtfDString --

   223  *

   224  *	Convert the given Unicode string to UTF-8.

   225  *

   226  * Results:

   227  *	The return value is a pointer to the UTF-8 representation of the

   228  *	Unicode string.  Storage for the return value is appended to the

   229  *	end of dsPtr.

   230  *

   231  * Side effects:

   232  *	None.

   233  *

   234  *---------------------------------------------------------------------------

   235  */

   237 EXPORT_C char *

   238 Tcl_UniCharToUtfDString(wString, numChars, dsPtr)

   239     CONST Tcl_UniChar *wString;	/* Unicode string to convert to UTF-8. */

   240     int numChars;		/* Length of Unicode string in Tcl_UniChars

   241 				 * (must be >= 0). */

   242     Tcl_DString *dsPtr;		/* UTF-8 representation of string is

   243 				 * appended to this previously initialized

   244 				 * DString. */

   245 {

   246     CONST Tcl_UniChar *w, *wEnd;

   247     char *p, *string;

   248     int oldLength;

   250     /*

   251      * UTF-8 string length in bytes will be <= Unicode string length *

   252      * TCL_UTF_MAX.

   253      */

   255     oldLength = Tcl_DStringLength(dsPtr);

   256     Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);

   257     string = Tcl_DStringValue(dsPtr) + oldLength;

   259     p = string;

   260     wEnd = wString + numChars;

   261     for (w = wString; w < wEnd; ) {

   262 	p += Tcl_UniCharToUtf(*w, p);

   263 	w++;

   264     }

   265     Tcl_DStringSetLength(dsPtr, oldLength + (p - string));

   267     return string;

   268 }

   270 /*

   271  *---------------------------------------------------------------------------

   272  *

   273  * Tcl_UtfToUniChar --

   274  *

   275  *	Extract the Tcl_UniChar represented by the UTF-8 string.  Bad

   276  *	UTF-8 sequences are converted to valid Tcl_UniChars and processing

   277  *	continues.  Equivalent to Plan 9 chartorune().

   278  *

   279  *	The caller must ensure that the source buffer is long enough that

   280  *	this routine does not run off the end and dereference non-existent

   281  *	memory looking for trail bytes.  If the source buffer is known to

   282  *	be '\0' terminated, this cannot happen.  Otherwise, the caller

   283  *	should call Tcl_UtfCharComplete() before calling this routine to

   284  *	ensure that enough bytes remain in the string.

   285  *

   286  * Results:

   287  *	*chPtr is filled with the Tcl_UniChar, and the return value is the

   288  *	number of bytes from the UTF-8 string that were consumed.

   289  *

   290  * Side effects:

   291  *	None.

   292  *

   293  *---------------------------------------------------------------------------

   294  */

   296 EXPORT_C int

   297 Tcl_UtfToUniChar(str, chPtr)

   298     register CONST char *str;	 /* The UTF-8 string. */

   299     register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented

   300 				  * by the UTF-8 string. */

   301 {

   302     register int byte;

   304     /*

   305      * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.

   306      */

   308     byte = *((unsigned char *) str);

   309     if (byte < 0xC0) {

   310 	/*

   311 	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.

   312 	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid

   313 	 * characters representing themselves.

   314 	 */

   316 	*chPtr = (Tcl_UniChar) byte;

   317 	return 1;

   318     } else if (byte < 0xE0) {

   319 	if ((str[1] & 0xC0) == 0x80) {

   320 	    /*

   321 	     * Two-byte-character lead-byte followed by a trail-byte.

   322 	     */

   324 	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));

   325 	    return 2;

   326 	}

   327 	/*

   328 	 * A two-byte-character lead-byte not followed by trail-byte

   329 	 * represents itself.

   330 	 */

   332 	*chPtr = (Tcl_UniChar) byte;

   333 	return 1;

   334     } else if (byte < 0xF0) {

   335 	if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {

   336 	    /*

   337 	     * Three-byte-character lead byte followed by two trail bytes.

   338 	     */

   340 	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)

   341 		    | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));

   342 	    return 3;

   343 	}

   344 	/*

   345 	 * A three-byte-character lead-byte not followed by two trail-bytes

   346 	 * represents itself.

   347 	 */

   349 	*chPtr = (Tcl_UniChar) byte;

   350 	return 1;

   351     }

   352 #if TCL_UTF_MAX > 3

   353     else {

   354 	int ch, total, trail;

   356 	total = totalBytes[byte];

   357 	trail = total - 1;

   358 	if (trail > 0) {

   359 	    ch = byte & (0x3F >> trail);

   360 	    do {

   361 		str++;

   362 		if ((*str & 0xC0) != 0x80) {

   363 		    *chPtr = byte;

   364 		    return 1;

   365 		}

   366 		ch <<= 6;

   367 		ch |= (*str & 0x3F);

   368 		trail--;

   369 	    } while (trail > 0);

   370 	    *chPtr = ch;

   371 	    return total;

   372 	}

   373     }

   374 #endif

   376     *chPtr = (Tcl_UniChar) byte;

   377     return 1;

   378 }

   380 /*

   381  *---------------------------------------------------------------------------

   382  *

   383  * Tcl_UtfToUniCharDString --

   384  *

   385  *	Convert the UTF-8 string to Unicode.

   386  *

   387  * Results:

   388  *	The return value is a pointer to the Unicode representation of the

   389  *	UTF-8 string.  Storage for the return value is appended to the

   390  *	end of dsPtr.  The Unicode string is terminated with a Unicode

   391  *	NULL character.

   392  *

   393  * Side effects:

   394  *	None.

   395  *

   396  *---------------------------------------------------------------------------

   397  */

   399 EXPORT_C Tcl_UniChar *

   400 Tcl_UtfToUniCharDString(string, length, dsPtr)

   401     CONST char *string;		/* UTF-8 string to convert to Unicode. */

   402     int length;			/* Length of UTF-8 string in bytes, or -1

   403 				 * for strlen(). */

   404     Tcl_DString *dsPtr;		/* Unicode representation of string is

   405 				 * appended to this previously initialized

   406 				 * DString. */

   407 {

   408     Tcl_UniChar *w, *wString;

   409     CONST char *p, *end;

   410     int oldLength;

   412     if (length < 0) {

   413 	length = strlen(string);

   414     }

   416     /*

   417      * Unicode string length in Tcl_UniChars will be <= UTF-8 string length

   418      * in bytes.

   419      */

   421     oldLength = Tcl_DStringLength(dsPtr);

   422     Tcl_DStringSetLength(dsPtr,

   423 	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));

   424     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);

   426     w = wString;

   427     end = string + length;

   428     for (p = string; p < end; ) {

   429 	p += TclUtfToUniChar(p, w);

   430 	w++;

   431     }

   432     *w = '\0';

   433     Tcl_DStringSetLength(dsPtr,

   434 	    (oldLength + ((char *) w - (char *) wString)));

   436     return wString;

   437 }

   439 /*

   440  *---------------------------------------------------------------------------

   441  *

   442  * Tcl_UtfCharComplete --

   443  *

   444  *	Determine if the UTF-8 string of the given length is long enough

   445  *	to be decoded by Tcl_UtfToUniChar().  This does not ensure that the

   446  *	UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune().

   447  *

   448  * Results:

   449  *	The return value is 0 if the string is not long enough, non-zero

   450  *	otherwise.

   451  *

   452  * Side effects:

   453  *	None.

   454  *

   455  *---------------------------------------------------------------------------

   456  */

   458 EXPORT_C int

   459 Tcl_UtfCharComplete(str, len)

   460     CONST char *str;		/* String to check if first few bytes

   461 				 * contain a complete UTF-8 character. */

   462     int len;			/* Length of above string in bytes. */

   463 {

   464     int ch;

   466     ch = *((unsigned char *) str);

   467     return len >= totalBytes[ch];

   468 }

   470 /*

   471  *---------------------------------------------------------------------------

   472  *

   473  * Tcl_NumUtfChars --

   474  *

   475  *	Returns the number of characters (not bytes) in the UTF-8 string,

   476  *	not including the terminating NULL byte.  This is equivalent to

   477  *	Plan 9 utflen() and utfnlen().

   478  *

   479  * Results:

   480  *	As above.

   481  *

   482  * Side effects:

   483  *	None.

   484  *

   485  *---------------------------------------------------------------------------

   486  */

   488 EXPORT_C int

   489 Tcl_NumUtfChars(str, len)

   490     register CONST char *str;	/* The UTF-8 string to measure. */

   491     int len;			/* The length of the string in bytes, or -1

   492 				 * for strlen(string). */

   493 {

   494     Tcl_UniChar ch;

   495     register Tcl_UniChar *chPtr = &ch;

   496     register int i;

   498     /*

   499      * The separate implementations are faster.

   500      *

   501      * Since this is a time-sensitive function, we also do the check for

   502      * the single-byte char case specially.

   503      */

   505     i = 0;

   506     if (len < 0) {

   507 	while (*str != '\0') {

   508 	    str += TclUtfToUniChar(str, chPtr);

   509 	    i++;

   510 	}

   511     } else {

   512 	register int n;

   514 	while (len > 0) {

   515 	    if (UCHAR(*str) < 0xC0) {

   516 		len--;

   517 		str++;

   518 	    } else {

   519 		n = Tcl_UtfToUniChar(str, chPtr);

   520 		len -= n;

   521 		str += n;

   522 	    }

   523 	    i++;

   524 	}

   525     }

   526     return i;

   527 }

   529 /*

   530  *---------------------------------------------------------------------------

   531  *

   532  * Tcl_UtfFindFirst --

   533  *

   534  *	Returns a pointer to the first occurance of the given Tcl_UniChar

   535  *	in the NULL-terminated UTF-8 string.  The NULL terminator is

   536  *	considered part of the UTF-8 string.  Equivalent to Plan 9

   537  *	utfrune().

   538  *

   539  * Results:

   540  *	As above.  If the Tcl_UniChar does not exist in the given string,

   541  *	the return value is NULL.

   542  *

   543  * Side effects:

   544  *	None.

   545  *

   546  *---------------------------------------------------------------------------

   547  */

   548 EXPORT_C CONST char *

   549 Tcl_UtfFindFirst(string, ch)

   550     CONST char *string;		/* The UTF-8 string to be searched. */

   551     int ch;			/* The Tcl_UniChar to search for. */

   552 {

   553     int len;

   554     Tcl_UniChar find;

   556     while (1) {

   557 	len = TclUtfToUniChar(string, &find);

   558 	if (find == ch) {

   559 	    return string;

   560 	}

   561 	if (*string == '\0') {

   562 	    return NULL;

   563 	}

   564 	string += len;

   565     }

   566 }

   568 /*

   569  *---------------------------------------------------------------------------

   570  *

   571  * Tcl_UtfFindLast --

   572  *

   573  *	Returns a pointer to the last occurance of the given Tcl_UniChar

   574  *	in the NULL-terminated UTF-8 string.  The NULL terminator is

   575  *	considered part of the UTF-8 string.  Equivalent to Plan 9

   576  *	utfrrune().

   577  *

   578  * Results:

   579  *	As above.  If the Tcl_UniChar does not exist in the given string,

   580  *	the return value is NULL.

   581  *

   582  * Side effects:

   583  *	None.

   584  *

   585  *---------------------------------------------------------------------------

   586  */

   588 EXPORT_C CONST char *

   589 Tcl_UtfFindLast(string, ch)

   590     CONST char *string;		/* The UTF-8 string to be searched. */

   591     int ch;			/* The Tcl_UniChar to search for. */

   592 {

   593     int len;

   594     Tcl_UniChar find;

   595     CONST char *last;

   597     last = NULL;

   598     while (1) {

   599 	len = TclUtfToUniChar(string, &find);

   600 	if (find == ch) {

   601 	    last = string;

   602 	}

   603 	if (*string == '\0') {

   604 	    break;

   605 	}

   606 	string += len;

   607     }

   608     return last;

   609 }

   611 /*

   612  *---------------------------------------------------------------------------

   613  *

   614  * Tcl_UtfNext --

   615  *

   616  *	Given a pointer to some current location in a UTF-8 string,

   617  *	move forward one character.  The caller must ensure that they

   618  *	are not asking for the next character after the last character

   619  *	in the string.

   620  *

   621  * Results:

   622  *	The return value is the pointer to the next character in

   623  *	the UTF-8 string.

   624  *

   625  * Side effects:

   626  *	None.

   627  *

   628  *---------------------------------------------------------------------------

   629  */

   631 EXPORT_C CONST char *

   632 Tcl_UtfNext(str)

   633     CONST char *str;		    /* The current location in the string. */

   634 {

   635     Tcl_UniChar ch;

   637     return str + TclUtfToUniChar(str, &ch);

   638 }

   640 /*

   641  *---------------------------------------------------------------------------

   642  *

   643  * Tcl_UtfPrev --

   644  *

   645  *	Given a pointer to some current location in a UTF-8 string,

   646  *	move backwards one character.  This works correctly when the

   647  *	pointer is in the middle of a UTF-8 character.

   648  *

   649  * Results:

   650  *	The return value is a pointer to the previous character in the

   651  *	UTF-8 string.  If the current location was already at the

   652  *	beginning of the string, the return value will also be a

   653  *	pointer to the beginning of the string.

   654  *

   655  * Side effects:

   656  *	None.

   657  *

   658  *---------------------------------------------------------------------------

   659  */

   661 EXPORT_C CONST char *

   662 Tcl_UtfPrev(str, start)

   663     CONST char *str;		    /* The current location in the string. */

   664     CONST char *start;		    /* Pointer to the beginning of the

   665 				     * string, to avoid going backwards too

   666 				     * far. */

   667 {

   668     CONST char *look;

   669     int i, byte;

   671     str--;

   672     look = str;

   673     for (i = 0; i < TCL_UTF_MAX; i++) {

   674 	if (look < start) {

   675 	    if (str < start) {

   676 		str = start;

   677 	    }

   678 	    break;

   679 	}

   680 	byte = *((unsigned char *) look);

   681 	if (byte < 0x80) {

   682 	    break;

   683 	}

   684 	if (byte >= 0xC0) {

   685 	    return look;

   686 	}

   687 	look--;

   688     }

   689     return str;

   690 }

   692 /*

   693  *---------------------------------------------------------------------------

   694  *

   695  * Tcl_UniCharAtIndex --

   696  *

   697  *	Returns the Unicode character represented at the specified

   698  *	character (not byte) position in the UTF-8 string.

   699  *

   700  * Results:

   701  *	As above.

   702  *

   703  * Side effects:

   704  *	None.

   705  *

   706  *---------------------------------------------------------------------------

   707  */

   709 EXPORT_C Tcl_UniChar

   710 Tcl_UniCharAtIndex(src, index)

   711     register CONST char *src;	/* The UTF-8 string to dereference. */

   712     register int index;		/* The position of the desired character. */

   713 {

   714     Tcl_UniChar ch;

   716     while (index >= 0) {

   717 	index--;

   718 	src += TclUtfToUniChar(src, &ch);

   719     }

   720     return ch;

   721 }

   723 /*

   724  *---------------------------------------------------------------------------

   725  *

   726  * Tcl_UtfAtIndex --

   727  *

   728  *	Returns a pointer to the specified character (not byte) position

   729  *	in the UTF-8 string.

   730  *

   731  * Results:

   732  *	As above.

   733  *

   734  * Side effects:

   735  *	None.

   736  *

   737  *---------------------------------------------------------------------------

   738  */

   740 EXPORT_C CONST char *

   741 Tcl_UtfAtIndex(src, index)

   742     register CONST char *src;	/* The UTF-8 string. */

   743     register int index;		/* The position of the desired character. */

   744 {

   745     Tcl_UniChar ch;

   747     while (index > 0) {

   748 	index--;

   749 	src += TclUtfToUniChar(src, &ch);

   750     }

   751     return src;

   752 }

   754 /*

   755  *---------------------------------------------------------------------------

   756  *

   757  * Tcl_UtfBackslash --

   758  *

   759  *	Figure out how to handle a backslash sequence.

   760  *

   761  * Results:

   762  *	Stores the bytes represented by the backslash sequence in dst and

   763  *	returns the number of bytes written to dst.  At most TCL_UTF_MAX

   764  *	bytes are written to dst; dst must have been large enough to accept

   765  *	those bytes.  If readPtr isn't NULL then it is filled in with a

   766  *	count of the number of bytes in the backslash sequence.

   767  *

   768  * Side effects:

   769  *	The maximum number of bytes it takes to represent a Unicode

   770  *	character in UTF-8 is guaranteed to be less than the number of

   771  *	bytes used to express the backslash sequence that represents

   772  *	that Unicode character.  If the target buffer into which the

   773  *	caller is going to store the bytes that represent the Unicode

   774  *	character is at least as large as the source buffer from which

   775  *	the backslashed sequence was extracted, no buffer overruns should

   776  *	occur.

   777  *

   778  *---------------------------------------------------------------------------

   779  */

   781 EXPORT_C int

   782 Tcl_UtfBackslash(src, readPtr, dst)

   783     CONST char *src;		/* Points to the backslash character of

   784 				 * a backslash sequence. */

   785     int *readPtr;		/* Fill in with number of characters read

   786 				 * from src, unless NULL. */

   787     char *dst;			/* Filled with the bytes represented by the

   788 				 * backslash sequence. */

   789 {

   790 #define LINE_LENGTH 128

   791     int numRead;

   792     int result;

   794     result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);

   795     if (numRead == LINE_LENGTH) {

   796 	/* We ate a whole line.  Pay the price of a strlen() */

   797 	result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);

   798     }

   799     if (readPtr != NULL) {

   800 	*readPtr = numRead;

   801     }

   802     return result;

   803 }

   805 /*

   806  *----------------------------------------------------------------------

   807  *

   808  * Tcl_UtfToUpper --

   809  *

   810  *	Convert lowercase characters to uppercase characters in a UTF

   811  *	string in place.  The conversion may shrink the UTF string.

   812  *

   813  * Results:

   814  *	Returns the number of bytes in the resulting string

   815  *	excluding the trailing null.

   816  *

   817  * Side effects:

   818  *	Writes a terminating null after the last converted character.

   819  *

   820  *----------------------------------------------------------------------

   821  */

   823 EXPORT_C int

   824 Tcl_UtfToUpper(str)

   825     char *str;			/* String to convert in place. */

   826 {

   827     Tcl_UniChar ch, upChar;

   828     char *src, *dst;

   829     int bytes;

   831     /*

   832      * Iterate over the string until we hit the terminating null.

   833      */

   835     src = dst = str;

   836     while (*src) {

   837         bytes = TclUtfToUniChar(src, &ch);

   838 	upChar = Tcl_UniCharToUpper(ch);

   840 	/*

   841 	 * To keep badly formed Utf strings from getting inflated by

   842 	 * the conversion (thereby causing a segfault), only copy the

   843 	 * upper case char to dst if its size is <= the original char.

   844 	 */

   846 	if (bytes < UtfCount(upChar)) {

   847 	    memcpy(dst, src, (size_t) bytes);

   848 	    dst += bytes;

   849 	} else {

   850 	    dst += Tcl_UniCharToUtf(upChar, dst);

   851 	}

   852 	src += bytes;

   853     }

   854     *dst = '\0';

   855     return (dst - str);

   856 }

   858 /*

   859  *----------------------------------------------------------------------

   860  *

   861  * Tcl_UtfToLower --

   862  *

   863  *	Convert uppercase characters to lowercase characters in a UTF

   864  *	string in place.  The conversion may shrink the UTF string.

   865  *

   866  * Results:

   867  *	Returns the number of bytes in the resulting string

   868  *	excluding the trailing null.

   869  *

   870  * Side effects:

   871  *	Writes a terminating null after the last converted character.

   872  *

   873  *----------------------------------------------------------------------

   874  */

   876 EXPORT_C int

   877 Tcl_UtfToLower(str)

   878     char *str;			/* String to convert in place. */

   879 {

   880     Tcl_UniChar ch, lowChar;

   881     char *src, *dst;

   882     int bytes;

   884     /*

   885      * Iterate over the string until we hit the terminating null.

   886      */

   888     src = dst = str;

   889     while (*src) {

   890 	bytes = TclUtfToUniChar(src, &ch);

   891 	lowChar = Tcl_UniCharToLower(ch);

   893 	/*

   894 	 * To keep badly formed Utf strings from getting inflated by

   895 	 * the conversion (thereby causing a segfault), only copy the

   896 	 * lower case char to dst if its size is <= the original char.

   897 	 */

   899 	if (bytes < UtfCount(lowChar)) {

   900 	    memcpy(dst, src, (size_t) bytes);

   901 	    dst += bytes;

   902 	} else {

   903 	    dst += Tcl_UniCharToUtf(lowChar, dst);

   904 	}

   905 	src += bytes;

   906     }

   907     *dst = '\0';

   908     return (dst - str);

   909 }

   911 /*

   912  *----------------------------------------------------------------------

   913  *

   914  * Tcl_UtfToTitle --

   915  *

   916  *	Changes the first character of a UTF string to title case or

   917  *	uppercase and the rest of the string to lowercase.  The

   918  *	conversion happens in place and may shrink the UTF string.

   919  *

   920  * Results:

   921  *	Returns the number of bytes in the resulting string

   922  *	excluding the trailing null.

   923  *

   924  * Side effects:

   925  *	Writes a terminating null after the last converted character.

   926  *

   927  *----------------------------------------------------------------------

   928  */

   930 EXPORT_C int

   931 Tcl_UtfToTitle(str)

   932     char *str;			/* String to convert in place. */

   933 {

   934     Tcl_UniChar ch, titleChar, lowChar;

   935     char *src, *dst;

   936     int bytes;

   938     /*

   939      * Capitalize the first character and then lowercase the rest of the

   940      * characters until we get to a null.

   941      */

   943     src = dst = str;

   945     if (*src) {

   946 	bytes = TclUtfToUniChar(src, &ch);

   947 	titleChar = Tcl_UniCharToTitle(ch);

   949 	if (bytes < UtfCount(titleChar)) {

   950 	    memcpy(dst, src, (size_t) bytes);

   951 	    dst += bytes;

   952 	} else {

   953 	    dst += Tcl_UniCharToUtf(titleChar, dst);

   954 	}

   955 	src += bytes;

   956     }

   957     while (*src) {

   958 	bytes = TclUtfToUniChar(src, &ch);

   959 	lowChar = Tcl_UniCharToLower(ch);

   961 	if (bytes < UtfCount(lowChar)) {

   962 	    memcpy(dst, src, (size_t) bytes);

   963 	    dst += bytes;

   964 	} else {

   965 	    dst += Tcl_UniCharToUtf(lowChar, dst);

   966 	}

   967 	src += bytes;

   968     }

   969     *dst = '\0';

   970     return (dst - str);

   971 }

   973 /*

   974  *----------------------------------------------------------------------

   975  *

   976  * TclpUtfNcmp2 --

   977  *

   978  *	Compare at most n bytes of utf-8 strings cs and ct.  Both cs

   979  *	and ct are assumed to be at least n bytes long.

   980  *

   981  * Results:

   982  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.

   983  *

   984  * Side effects:

   985  *	None.

   986  *

   987  *----------------------------------------------------------------------

   988  */

   990 int

   991 TclpUtfNcmp2(cs, ct, n)

   992     CONST char *cs;		/* UTF string to compare to ct. */

   993     CONST char *ct;		/* UTF string cs is compared to. */

   994     unsigned long n;		/* Number of *bytes* to compare. */

   995 {

   996     /*

   997      * We can't simply call 'memcmp(cs, ct, n);' because we need to check

   998      * for Tcl's \xC0\x80 non-utf-8 null encoding.

   999      * Otherwise utf-8 lexes fine in the strcmp manner.

  1000      */

  1001     register int result = 0;

  1003     for ( ; n != 0; n--, cs++, ct++) {

  1004 	if (*cs != *ct) {

  1005 	    result = UCHAR(*cs) - UCHAR(*ct);

  1006 	    break;

  1007 	}

  1008     }

  1009     if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {

  1010 	unsigned char c1, c2;

  1011 	c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);

  1012 	c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);

  1013 	result = (c1 - c2);

  1014     }

  1015     return result;

  1016 }

  1018 /*

  1019  *----------------------------------------------------------------------

  1020  *

  1021  * Tcl_UtfNcmp --

  1022  *

  1023  *	Compare at most n UTF chars of string cs to string ct.  Both cs

  1024  *	and ct are assumed to be at least n UTF chars long.

  1025  *

  1026  * Results:

  1027  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.

  1028  *

  1029  * Side effects:

  1030  *	None.

  1031  *

  1032  *----------------------------------------------------------------------

  1033  */

  1035 EXPORT_C int

  1036 Tcl_UtfNcmp(cs, ct, n)

  1037     CONST char *cs;		/* UTF string to compare to ct. */

  1038     CONST char *ct;		/* UTF string cs is compared to. */

  1039     unsigned long n;		/* Number of UTF chars to compare. */

  1040 {

  1041     Tcl_UniChar ch1, ch2;

  1042     /*

  1043      * Cannot use 'memcmp(cs, ct, n);' as byte representation of

  1044      * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte

  1045      * representation of \u0001 (the byte 0x01.)

  1046      */

  1047     while (n-- > 0) {

  1048 	/*

  1049 	 * n must be interpreted as chars, not bytes.

  1050 	 * This should be called only when both strings are of

  1051 	 * at least n chars long (no need for \0 check)

  1052 	 */

  1053 	cs += TclUtfToUniChar(cs, &ch1);

  1054 	ct += TclUtfToUniChar(ct, &ch2);

  1055 	if (ch1 != ch2) {

  1056 	    return (ch1 - ch2);

  1057 	}

  1058     }

  1059     return 0;

  1060 }

  1062 /*

  1063  *----------------------------------------------------------------------

  1064  *

  1065  * Tcl_UtfNcasecmp --

  1066  *

  1067  *	Compare at most n UTF chars of string cs to string ct case

  1068  *	insensitive.  Both cs and ct are assumed to be at least n

  1069  *	UTF chars long.

  1070  *

  1071  * Results:

  1072  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.

  1073  *

  1074  * Side effects:

  1075  *	None.

  1076  *

  1077  *----------------------------------------------------------------------

  1078  */

  1080 EXPORT_C int

  1081 Tcl_UtfNcasecmp(cs, ct, n)

  1082     CONST char *cs;		/* UTF string to compare to ct. */

  1083     CONST char *ct;		/* UTF string cs is compared to. */

  1084     unsigned long n;			/* Number of UTF chars to compare. */

  1085 {

  1086     Tcl_UniChar ch1, ch2;

  1087     while (n-- > 0) {

  1088 	/*

  1089 	 * n must be interpreted as chars, not bytes.

  1090 	 * This should be called only when both strings are of

  1091 	 * at least n chars long (no need for \0 check)

  1092 	 */

  1093 	cs += TclUtfToUniChar(cs, &ch1);

  1094 	ct += TclUtfToUniChar(ct, &ch2);

  1095 	if (ch1 != ch2) {

  1096 	    ch1 = Tcl_UniCharToLower(ch1);

  1097 	    ch2 = Tcl_UniCharToLower(ch2);

  1098 	    if (ch1 != ch2) {

  1099 		return (ch1 - ch2);

  1100 	    }

  1101 	}

  1102     }

  1103     return 0;

  1104 }

  1106 /*

  1107  *----------------------------------------------------------------------

  1108  *

  1109  * Tcl_UniCharToUpper --

  1110  *

  1111  *	Compute the uppercase equivalent of the given Unicode character.

  1112  *

  1113  * Results:

  1114  *	Returns the uppercase Unicode character.

  1115  *

  1116  * Side effects:

  1117  *	None.

  1118  *

  1119  *----------------------------------------------------------------------

  1120  */

  1122 EXPORT_C Tcl_UniChar

  1123 Tcl_UniCharToUpper(ch)

  1124     int ch;			/* Unicode character to convert. */

  1125 {

  1126     int info = GetUniCharInfo(ch);

  1128     if (GetCaseType(info) & 0x04) {

  1129 	return (Tcl_UniChar) (ch - GetDelta(info));

  1130     } else {

  1131 	return ch;

  1132     }

  1133 }

  1135 /*

  1136  *----------------------------------------------------------------------

  1137  *

  1138  * Tcl_UniCharToLower --

  1139  *

  1140  *	Compute the lowercase equivalent of the given Unicode character.

  1141  *

  1142  * Results:

  1143  *	Returns the lowercase Unicode character.

  1144  *

  1145  * Side effects:

  1146  *	None.

  1147  *

  1148  *----------------------------------------------------------------------

  1149  */

  1151 EXPORT_C Tcl_UniChar

  1152 Tcl_UniCharToLower(ch)

  1153     int ch;			/* Unicode character to convert. */

  1154 {

  1155     int info = GetUniCharInfo(ch);

  1157     if (GetCaseType(info) & 0x02) {

  1158 	return (Tcl_UniChar) (ch + GetDelta(info));

  1159     } else {

  1160 	return ch;

  1161     }

  1162 }

  1164 /*

  1165  *----------------------------------------------------------------------

  1166  *

  1167  * Tcl_UniCharToTitle --

  1168  *

  1169  *	Compute the titlecase equivalent of the given Unicode character.

  1170  *

  1171  * Results:

  1172  *	Returns the titlecase Unicode character.

  1173  *

  1174  * Side effects:

  1175  *	None.

  1176  *

  1177  *----------------------------------------------------------------------

  1178  */

  1180 EXPORT_C Tcl_UniChar

  1181 Tcl_UniCharToTitle(ch)

  1182     int ch;			/* Unicode character to convert. */

  1183 {

  1184     int info = GetUniCharInfo(ch);

  1185     int mode = GetCaseType(info);

  1187     if (mode & 0x1) {

  1188 	/*

  1189 	 * Subtract or add one depending on the original case.

  1190 	 */

  1192 	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));

  1193     } else if (mode == 0x4) {

  1194 	return (Tcl_UniChar) (ch - GetDelta(info));

  1195     } else {

  1196 	return ch;

  1197     }

  1198 }

  1200 /*

  1201  *----------------------------------------------------------------------

  1202  *

  1203  * Tcl_UniCharLen --

  1204  *

  1205  *	Find the length of a UniChar string.  The str input must be null

  1206  *	terminated.

  1207  *

  1208  * Results:

  1209  *	Returns the length of str in UniChars (not bytes).

  1210  *

  1211  * Side effects:

  1212  *	None.

  1213  *

  1214  *----------------------------------------------------------------------

  1215  */

  1217 EXPORT_C int

  1218 Tcl_UniCharLen(str)

  1219     CONST Tcl_UniChar *str;	/* Unicode string to find length of. */

  1220 {

  1221     int len = 0;

  1223     while (*str != '\0') {

  1224 	len++;

  1225 	str++;

  1226     }

  1227     return len;

  1228 }

  1230 /*

  1231  *----------------------------------------------------------------------

  1232  *

  1233  * Tcl_UniCharNcmp --

  1234  *

  1235  *	Compare at most n unichars of string cs to string ct.  Both cs

  1236  *	and ct are assumed to be at least n unichars long.

  1237  *

  1238  * Results:

  1239  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.

  1240  *

  1241  * Side effects:

  1242  *	None.

  1243  *

  1244  *----------------------------------------------------------------------

  1245  */

  1247 EXPORT_C int

  1248 Tcl_UniCharNcmp(cs, ct, n)

  1249     CONST Tcl_UniChar *cs;		/* Unicode string to compare to ct. */

  1250     CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */

  1251     unsigned long n;			/* Number of unichars to compare. */

  1252 {

  1253 #ifdef WORDS_BIGENDIAN

  1254     /*

  1255      * We are definitely on a big-endian machine; memcmp() is safe

  1256      */

  1257     return memcmp(cs, ct, n*sizeof(Tcl_UniChar));

  1259 #else /* !WORDS_BIGENDIAN */

  1260     /*

  1261      * We can't simply call memcmp() because that is not lexically correct.

  1262      */

  1263     for ( ; n != 0; cs++, ct++, n--) {

  1264 	if (*cs != *ct) {

  1265 	    return (*cs - *ct);

  1266 	}

  1267     }

  1268     return 0;

  1269 #endif /* WORDS_BIGENDIAN */

  1270 }

  1272 /*

  1273  *----------------------------------------------------------------------

  1274  *

  1275  * Tcl_UniCharNcasecmp --

  1276  *

  1277  *	Compare at most n unichars of string cs to string ct case

  1278  *	insensitive.  Both cs and ct are assumed to be at least n

  1279  *	unichars long.

  1280  *

  1281  * Results:

  1282  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.

  1283  *

  1284  * Side effects:

  1285  *	None.

  1286  *

  1287  *----------------------------------------------------------------------

  1288  */

  1290 EXPORT_C int

  1291 Tcl_UniCharNcasecmp(cs, ct, n)

  1292     CONST Tcl_UniChar *cs;		/* Unicode string to compare to ct. */

  1293     CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */

  1294     unsigned long n;			/* Number of unichars to compare. */

  1295 {

  1296     for ( ; n != 0; n--, cs++, ct++) {

  1297 	if (*cs != *ct) {

  1298 	    Tcl_UniChar lcs = Tcl_UniCharToLower(*cs);

  1299 	    Tcl_UniChar lct = Tcl_UniCharToLower(*ct);

  1300 	    if (lcs != lct) {

  1301 		return (lcs - lct);

  1302 	    }

  1303 	}

  1304     }

  1305     return 0;

  1306 }

  1308 /*

  1309  *----------------------------------------------------------------------

  1310  *

  1311  * Tcl_UniCharIsAlnum --

  1312  *

  1313  *	Test if a character is an alphanumeric Unicode character.

  1314  *

  1315  * Results:

  1316  *	Returns 1 if character is alphanumeric.

  1317  *

  1318  * Side effects:

  1319  *	None.

  1320  *

  1321  *----------------------------------------------------------------------

  1322  */

  1324 EXPORT_C int

  1325 Tcl_UniCharIsAlnum(ch)

  1326     int ch;			/* Unicode character to test. */

  1327 {

  1328     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

  1330     return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);

  1331 }

  1333 /*

  1334  *----------------------------------------------------------------------

  1335  *

  1336  * Tcl_UniCharIsAlpha --

  1337  *

  1338  *	Test if a character is an alphabetic Unicode character.

  1339  *

  1340  * Results:

  1341  *	Returns 1 if character is alphabetic.

  1342  *

  1343  * Side effects:

  1344  *	None.

  1345  *

  1346  *----------------------------------------------------------------------

  1347  */

  1349 EXPORT_C int

  1350 Tcl_UniCharIsAlpha(ch)

  1351     int ch;			/* Unicode character to test. */

  1352 {

  1353     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

  1354     return ((ALPHA_BITS >> category) & 1);

  1355 }

  1357 /*

  1358  *----------------------------------------------------------------------

  1359  *

  1360  * Tcl_UniCharIsControl --

  1361  *

  1362  *	Test if a character is a Unicode control character.

  1363  *

  1364  * Results:

  1365  *	Returns non-zero if character is a control.

  1366  *

  1367  * Side effects:

  1368  *	None.

  1369  *

  1370  *----------------------------------------------------------------------

  1371  */

  1373 EXPORT_C int

  1374 Tcl_UniCharIsControl(ch)

  1375     int ch;			/* Unicode character to test. */

  1376 {

  1377     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);

  1378 }

  1380 /*

  1381  *----------------------------------------------------------------------

  1382  *

  1383  * Tcl_UniCharIsDigit --

  1384  *

  1385  *	Test if a character is a numeric Unicode character.

  1386  *

  1387  * Results:

  1388  *	Returns non-zero if character is a digit.

  1389  *

  1390  * Side effects:

  1391  *	None.

  1392  *

  1393  *----------------------------------------------------------------------

  1394  */

  1396 EXPORT_C int

  1397 Tcl_UniCharIsDigit(ch)

  1398     int ch;			/* Unicode character to test. */

  1399 {

  1400     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)

  1401 	    == DECIMAL_DIGIT_NUMBER);

  1402 }

  1404 /*

  1405  *----------------------------------------------------------------------

  1406  *

  1407  * Tcl_UniCharIsGraph --

  1408  *

  1409  *	Test if a character is any Unicode print character except space.

  1410  *

  1411  * Results:

  1412  *	Returns non-zero if character is printable, but not space.

  1413  *

  1414  * Side effects:

  1415  *	None.

  1416  *

  1417  *----------------------------------------------------------------------

  1418  */

  1420 EXPORT_C int

  1421 Tcl_UniCharIsGraph(ch)

  1422     int ch;			/* Unicode character to test. */

  1423 {

  1424     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

  1425     return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));

  1426 }

  1428 /*

  1429  *----------------------------------------------------------------------

  1430  *

  1431  * Tcl_UniCharIsLower --

  1432  *

  1433  *	Test if a character is a lowercase Unicode character.

  1434  *

  1435  * Results:

  1436  *	Returns non-zero if character is lowercase.

  1437  *

  1438  * Side effects:

  1439  *	None.

  1440  *

  1441  *----------------------------------------------------------------------

  1442  */

  1444 EXPORT_C int

  1445 Tcl_UniCharIsLower(ch)

  1446     int ch;			/* Unicode character to test. */

  1447 {

  1448     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);

  1449 }

  1451 /*

  1452  *----------------------------------------------------------------------

  1453  *

  1454  * Tcl_UniCharIsPrint --

  1455  *

  1456  *	Test if a character is a Unicode print character.

  1457  *

  1458  * Results:

  1459  *	Returns non-zero if character is printable.

  1460  *

  1461  * Side effects:

  1462  *	None.

  1463  *

  1464  *----------------------------------------------------------------------

  1465  */

  1467 EXPORT_C int

  1468 Tcl_UniCharIsPrint(ch)

  1469     int ch;			/* Unicode character to test. */

  1470 {

  1471     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

  1472     return ((PRINT_BITS >> category) & 1);

  1473 }

  1475 /*

  1476  *----------------------------------------------------------------------

  1477  *

  1478  * Tcl_UniCharIsPunct --

  1479  *

  1480  *	Test if a character is a Unicode punctuation character.

  1481  *

  1482  * Results:

  1483  *	Returns non-zero if character is punct.

  1484  *

  1485  * Side effects:

  1486  *	None.

  1487  *

  1488  *----------------------------------------------------------------------

  1489  */

  1491 EXPORT_C int

  1492 Tcl_UniCharIsPunct(ch)

  1493     int ch;			/* Unicode character to test. */

  1494 {

  1495     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

  1496     return ((PUNCT_BITS >> category) & 1);

  1497 }

  1499 /*

  1500  *----------------------------------------------------------------------

  1501  *

  1502  * Tcl_UniCharIsSpace --

  1503  *

  1504  *	Test if a character is a whitespace Unicode character.

  1505  *

  1506  * Results:

  1507  *	Returns non-zero if character is a space.

  1508  *

  1509  * Side effects:

  1510  *	None.

  1511  *

  1512  *----------------------------------------------------------------------

  1513  */

  1515 EXPORT_C int

  1516 Tcl_UniCharIsSpace(ch)

  1517     int ch;			/* Unicode character to test. */

  1518 {

  1519     register int category;

  1521     /*

  1522      * If the character is within the first 127 characters, just use the

  1523      * standard C function, otherwise consult the Unicode table.

  1524      */

  1526     if (ch < 0x80) {

  1527 	return isspace(UCHAR(ch)); /* INTL: ISO space */

  1528     } else {

  1529 	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

  1530 	return ((SPACE_BITS >> category) & 1);

  1531     }

  1532 }

  1534 /*

  1535  *----------------------------------------------------------------------

  1536  *

  1537  * Tcl_UniCharIsUpper --

  1538  *

  1539  *	Test if a character is a uppercase Unicode character.

  1540  *

  1541  * Results:

  1542  *	Returns non-zero if character is uppercase.

  1543  *

  1544  * Side effects:

  1545  *	None.

  1546  *

  1547  *----------------------------------------------------------------------

  1548  */

  1550 EXPORT_C int

  1551 Tcl_UniCharIsUpper(ch)

  1552     int ch;			/* Unicode character to test. */

  1553 {

  1554     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);

  1555 }

  1557 /*

  1558  *----------------------------------------------------------------------

  1559  *

  1560  * Tcl_UniCharIsWordChar --

  1561  *

  1562  *	Test if a character is alphanumeric or a connector punctuation

  1563  *	mark.

  1564  *

  1565  * Results:

  1566  *	Returns 1 if character is a word character.

  1567  *

  1568  * Side effects:

  1569  *	None.

  1570  *

  1571  *----------------------------------------------------------------------

  1572  */

  1574 EXPORT_C int

  1575 Tcl_UniCharIsWordChar(ch)

  1576     int ch;			/* Unicode character to test. */

  1577 {

  1578     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

  1580     return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);

  1581 }

  1583 /*

  1584  *----------------------------------------------------------------------

  1585  *

  1586  * Tcl_UniCharCaseMatch --

  1587  *

  1588  *	See if a particular Unicode string matches a particular pattern.

  1589  *	Allows case insensitivity.  This is the Unicode equivalent of

  1590  *	the char* Tcl_StringCaseMatch.  The UniChar strings must be

  1591  *	NULL-terminated.  This has no provision for counted UniChar

  1592  *	strings, thus should not be used where NULLs are expected in the

  1593  *	UniChar string.  Use TclUniCharMatch where possible.

  1594  *

  1595  * Results:

  1596  *	The return value is 1 if string matches pattern, and

  1597  *	0 otherwise.  The matching operation permits the following

  1598  *	special characters in the pattern: *?\[] (see the manual

  1599  *	entry for details on what these mean).

  1600  *

  1601  * Side effects:

  1602  *	None.

  1603  *

  1604  *----------------------------------------------------------------------

  1605  */

  1607 EXPORT_C int

  1608 Tcl_UniCharCaseMatch(string, pattern, nocase)

  1609     CONST Tcl_UniChar *string;	/* Unicode String. */

  1610     CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special

  1611 				 * characters. */

  1612     int nocase;			/* 0 for case sensitive, 1 for insensitive */

  1613 {

  1614     Tcl_UniChar ch1, p;

  1616     while (1) {

  1617 	p = *pattern;

  1619 	/*

  1620 	 * See if we're at the end of both the pattern and the string.  If

  1621 	 * so, we succeeded.  If we're at the end of the pattern but not at

  1622 	 * the end of the string, we failed.

  1623 	 */

  1625 	if (p == 0) {

  1626 	    return (*string == 0);

  1627 	}

  1628 	if ((*string == 0) && (p != '*')) {

  1629 	    return 0;

  1630 	}

  1632 	/*

  1633 	 * Check for a "*" as the next pattern character.  It matches any

  1634 	 * substring.  We handle this by skipping all the characters up to the

  1635 	 * next matching one in the pattern, and then calling ourselves

  1636 	 * recursively for each postfix of string, until either we match or we

  1637 	 * reach the end of the string.

  1638 	 */

  1640 	if (p == '*') {

  1641 	    /*

  1642 	     * Skip all successive *'s in the pattern

  1643 	     */

  1644 	    while (*(++pattern) == '*') {}

  1645 	    p = *pattern;

  1646 	    if (p == 0) {

  1647 		return 1;

  1648 	    }

  1649 	    if (nocase) {

  1650 		p = Tcl_UniCharToLower(p);

  1651 	    }

  1652 	    while (1) {

  1653 		/*

  1654 		 * Optimization for matching - cruise through the string

  1655 		 * quickly if the next char in the pattern isn't a special

  1656 		 * character

  1657 		 */

  1658 		if ((p != '[') && (p != '?') && (p != '\\')) {

  1659 		    if (nocase) {

  1660 			while (*string && (p != *string)

  1661 				&& (p != Tcl_UniCharToLower(*string))) {

  1662 			    string++;

  1663 			}

  1664 		    } else {

  1665 			while (*string && (p != *string)) { string++; }

  1666 		    }

  1667 		}

  1668 		if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {

  1669 		    return 1;

  1670 		}

  1671 		if (*string == 0) {

  1672 		    return 0;

  1673 		}

  1674 		string++;

  1675 	    }

  1676 	}

  1678 	/*

  1679 	 * Check for a "?" as the next pattern character.  It matches

  1680 	 * any single character.

  1681 	 */

  1683 	if (p == '?') {

  1684 	    pattern++;

  1685 	    string++;

  1686 	    continue;

  1687 	}

  1689 	/*

  1690 	 * Check for a "[" as the next pattern character.  It is followed

  1691 	 * by a list of characters that are acceptable, or by a range

  1692 	 * (two characters separated by "-").

  1693 	 */

  1695 	if (p == '[') {

  1696 	    Tcl_UniChar startChar, endChar;

  1698 	    pattern++;

  1699 	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);

  1700 	    string++;

  1701 	    while (1) {

  1702 		if ((*pattern == ']') || (*pattern == 0)) {

  1703 		    return 0;

  1704 		}

  1705 		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);

  1706 		pattern++;

  1707 		if (*pattern == '-') {

  1708 		    pattern++;

  1709 		    if (*pattern == 0) {

  1710 			return 0;

  1711 		    }

  1712 		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)

  1713 			    : *pattern);

  1714 		    pattern++;

  1715 		    if (((startChar <= ch1) && (ch1 <= endChar))

  1716 			    || ((endChar <= ch1) && (ch1 <= startChar))) {

  1717 			/*

  1718 			 * Matches ranges of form [a-z] or [z-a].

  1719 			 */

  1720 			break;

  1721 		    }

  1722 		} else if (startChar == ch1) {

  1723 		    break;

  1724 		}

  1725 	    }

  1726 	    while (*pattern != ']') {

  1727 		if (*pattern == 0) {

  1728 		    pattern--;

  1729 		    break;

  1730 		}

  1731 		pattern++;

  1732 	    }

  1733 	    pattern++;

  1734 	    continue;

  1735 	}

  1737 	/*

  1738 	 * If the next pattern character is '\', just strip off the '\'

  1739 	 * so we do exact matching on the character that follows.

  1740 	 */

  1742 	if (p == '\\') {

  1743 	    if (*(++pattern) == '\0') {

  1744 		return 0;

  1745 	    }

  1746 	}

  1748 	/*

  1749 	 * There's no special character.  Just make sure that the next

  1750 	 * bytes of each string match.

  1751 	 */

  1753 	if (nocase) {

  1754 	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {

  1755 		return 0;

  1756 	    }

  1757 	} else if (*string != *pattern) {

  1758 	    return 0;

  1759 	}

  1760 	string++;

  1761 	pattern++;

  1762     }

  1763 }

  1765 /*

  1766  *----------------------------------------------------------------------

  1767  *

  1768  * TclUniCharMatch --

  1769  *

  1770  *	See if a particular Unicode string matches a particular pattern.

  1771  *	Allows case insensitivity.  This is the Unicode equivalent of the

  1772  *	char* Tcl_StringCaseMatch.  This variant of Tcl_UniCharCaseMatch

  1773  *	uses counted Strings, so embedded NULLs are allowed.

  1774  *

  1775  * Results:

  1776  *	The return value is 1 if string matches pattern, and

  1777  *	0 otherwise.  The matching operation permits the following

  1778  *	special characters in the pattern: *?\[] (see the manual

  1779  *	entry for details on what these mean).

  1780  *

  1781  * Side effects:

  1782  *	None.

  1783  *

  1784  *----------------------------------------------------------------------

  1785  */

  1787 int

  1788 TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)

  1789     CONST Tcl_UniChar *string;	/* Unicode String. */

  1790     int strLen;			/* length of String */

  1791     CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special

  1792 				 * characters. */

  1793     int ptnLen;			/* length of Pattern */

  1794     int nocase;			/* 0 for case sensitive, 1 for insensitive */

  1795 {

  1796     CONST Tcl_UniChar *stringEnd, *patternEnd;

  1797     Tcl_UniChar p;

  1799     stringEnd  = string + strLen;

  1800     patternEnd = pattern + ptnLen;

  1802     while (1) {

  1803 	/*

  1804 	 * See if we're at the end of both the pattern and the string.  If

  1805 	 * so, we succeeded.  If we're at the end of the pattern but not at

  1806 	 * the end of the string, we failed.

  1807 	 */

  1809 	if (pattern == patternEnd) {

  1810 	    return (string == stringEnd);

  1811 	}

  1812 	p = *pattern;

  1813 	if ((string == stringEnd) && (p != '*')) {

  1814 	    return 0;

  1815 	}

  1817 	/*

  1818 	 * Check for a "*" as the next pattern character.  It matches any

  1819 	 * substring.  We handle this by skipping all the characters up to the

  1820 	 * next matching one in the pattern, and then calling ourselves

  1821 	 * recursively for each postfix of string, until either we match or we

  1822 	 * reach the end of the string.

  1823 	 */

  1825 	if (p == '*') {

  1826 	    /*

  1827 	     * Skip all successive *'s in the pattern

  1828 	     */

  1829 	    while (*(++pattern) == '*') {}

  1830 	    if (pattern == patternEnd) {

  1831 		return 1;

  1832 	    }

  1833 	    p = *pattern;

  1834 	    if (nocase) {

  1835 		p = Tcl_UniCharToLower(p);

  1836 	    }

  1837 	    while (1) {

  1838 		/*

  1839 		 * Optimization for matching - cruise through the string

  1840 		 * quickly if the next char in the pattern isn't a special

  1841 		 * character

  1842 		 */

  1843 		if ((p != '[') && (p != '?') && (p != '\\')) {

  1844 		    if (nocase) {

  1845 			while ((string < stringEnd) && (p != *string)

  1846 				&& (p != Tcl_UniCharToLower(*string))) {

  1847 			    string++;

  1848 			}

  1849 		    } else {

  1850 			while ((string < stringEnd) && (p != *string)) {

  1851 			    string++;

  1852 			}

  1853 		    }

  1854 		}

  1855 		if (TclUniCharMatch(string, stringEnd - string,

  1856 			pattern, patternEnd - pattern, nocase)) {

  1857 		    return 1;

  1858 		}

  1859 		if (string == stringEnd) {

  1860 		    return 0;

  1861 		}

  1862 		string++;

  1863 	    }

  1864 	}

  1866 	/*

  1867 	 * Check for a "?" as the next pattern character.  It matches

  1868 	 * any single character.

  1869 	 */

  1871 	if (p == '?') {

  1872 	    pattern++;

  1873 	    string++;

  1874 	    continue;

  1875 	}

  1877 	/*

  1878 	 * Check for a "[" as the next pattern character.  It is followed

  1879 	 * by a list of characters that are acceptable, or by a range

  1880 	 * (two characters separated by "-").

  1881 	 */

  1883 	if (p == '[') {

  1884 	    Tcl_UniChar ch1, startChar, endChar;

  1886 	    pattern++;

  1887 	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);

  1888 	    string++;

  1889 	    while (1) {

  1890 		if ((*pattern == ']') || (pattern == patternEnd)) {

  1891 		    return 0;

  1892 		}

  1893 		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);

  1894 		pattern++;

  1895 		if (*pattern == '-') {

  1896 		    pattern++;

  1897 		    if (pattern == patternEnd) {

  1898 			return 0;

  1899 		    }

  1900 		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)

  1901 			    : *pattern);

  1902 		    pattern++;

  1903 		    if (((startChar <= ch1) && (ch1 <= endChar))

  1904 			    || ((endChar <= ch1) && (ch1 <= startChar))) {

  1905 			/*

  1906 			 * Matches ranges of form [a-z] or [z-a].

  1907 			 */

  1908 			break;

  1909 		    }

  1910 		} else if (startChar == ch1) {

  1911 		    break;

  1912 		}

  1913 	    }

  1914 	    while (*pattern != ']') {

  1915 		if (pattern == patternEnd) {

  1916 		    pattern--;

  1917 		    break;

  1918 		}

  1919 		pattern++;

  1920 	    }

  1921 	    pattern++;

  1922 	    continue;

  1923 	}

  1925 	/*

  1926 	 * If the next pattern character is '\', just strip off the '\'

  1927 	 * so we do exact matching on the character that follows.

  1928 	 */

  1930 	if (p == '\\') {

  1931 	    if (++pattern == patternEnd) {

  1932 		return 0;

  1933 	    }

  1934 	}

  1936 	/*

  1937 	 * There's no special character.  Just make sure that the next

  1938 	 * bytes of each string match.

  1939 	 */

  1941 	if (nocase) {

  1942 	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {

  1943 		return 0;

  1944 	    }

  1945 	} else if (*string != *pattern) {

  1946 	    return 0;

  1947 	}

  1948 	string++;

  1949 	pattern++;

  1950     }

  1951 }

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--