sl@0: /*
sl@0:  * tclUtf.c --
sl@0:  *
sl@0:  *	Routines for manipulating UTF-8 strings.
sl@0:  *
sl@0:  * Copyright (c) 1997-1998 Sun Microsystems, Inc.
sl@0:  * Portions Copyright (c) 2007-2008 Nokia Corporation and/or its subsidiaries. All rights reserved.  
sl@0:  *
sl@0:  * See the file "license.terms" for information on usage and redistribution
sl@0:  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
sl@0:  *
sl@0:  * RCS: @(#) $Id: tclUtf.c,v 1.30.2.3 2005/09/07 14:35:56 dgp Exp $
sl@0:  */
sl@0: 
sl@0: #include "tclInt.h"
sl@0: 
sl@0: /*
sl@0:  * Include the static character classification tables and macros.
sl@0:  */
sl@0: 
sl@0: #include "tclUniData.c"
sl@0: 
sl@0: /*
sl@0:  * The following macros are used for fast character category tests.  Each
sl@0:  * x_BITS value is a bit mask with one bit per character category: bit N is
sl@0:  * set when category N belongs to the class.  The x_BITS values are shifted
sl@0:  * right by the category value to determine whether the given category is
sl@0:  * included in the set.  The category constants (UPPERCASE_LETTER and so on)
sl@0:  * are not defined in this part of the file; presumably they come from
sl@0:  * tclUniData.c, included above.
sl@0:  */ 
sl@0: 
sl@0: /* Letter categories: upper, lower and title case, modifier, other. */
sl@0: #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
sl@0:     | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
sl@0: 
sl@0: /* Decimal digits only. */
sl@0: #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
sl@0: 
sl@0: /* Space, line and paragraph separators. */
sl@0: #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
sl@0:     | (1 << PARAGRAPH_SEPARATOR))
sl@0: 
sl@0: /* Connector punctuation only. */
sl@0: #define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
sl@0: 
sl@0: /*
sl@0:  * Everything in ALPHA_BITS, DIGIT_BITS and SPACE_BITS, plus the mark,
sl@0:  * remaining number, punctuation and symbol categories.
sl@0:  */
sl@0: #define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
sl@0: 	    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
sl@0: 	    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
sl@0: 	    (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
sl@0: 	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
sl@0: 	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
sl@0: 	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
sl@0: 	    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
sl@0: 	    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
sl@0: 
sl@0: /* All seven punctuation categories. */
sl@0: #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
sl@0: 	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
sl@0: 	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
sl@0: 	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
sl@0: 
sl@0: /*
sl@0:  * Unicode characters less than this value are represented by themselves 
sl@0:  * in UTF-8 strings.  The encoder in this file makes one exception: the
sl@0:  * value 0 is written as a two-byte sequence rather than as a zero byte.
sl@0:  */
sl@0: 
sl@0: #define UNICODE_SELF	0x80
sl@0: 
sl@0: /*
sl@0:  * The following structures are used when mapping between Unicode (UCS-2)
sl@0:  * and UTF-8.
sl@0:  *
sl@0:  * totalBytes is indexed by the first byte of a UTF-8 sequence and gives
sl@0:  * the number of bytes that sequence is expected to occupy:
sl@0:  *
sl@0:  *	0x00-0xBF	1  (single bytes and naked trail bytes)
sl@0:  *	0xC0-0xDF	2
sl@0:  *	0xE0-0xEF	3
sl@0:  *	0xF0-0xF7	4 when TCL_UTF_MAX > 3, otherwise 1
sl@0:  *	0xF8-0xFB	5 when TCL_UTF_MAX > 4, otherwise 1
sl@0:  *	0xFC-0xFF	6 when TCL_UTF_MAX > 5, otherwise 1
sl@0:  *
sl@0:  * A lead byte announcing a sequence longer than TCL_UTF_MAX therefore
sl@0:  * counts as a single byte.
sl@0:  */
sl@0: 
sl@0: static CONST unsigned char totalBytes[256] = {
sl@0:     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0:     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0:     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0:     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0:     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0:     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0:     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
sl@0:     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
sl@0: #if TCL_UTF_MAX > 3
sl@0:     4,4,4,4,4,4,4,4,
sl@0: #else
sl@0:     1,1,1,1,1,1,1,1,
sl@0: #endif
sl@0: #if TCL_UTF_MAX > 4
sl@0:     5,5,5,5,
sl@0: #else
sl@0:     1,1,1,1,
sl@0: #endif
sl@0: #if TCL_UTF_MAX > 5
sl@0:     6,6,6,6
sl@0: #else
sl@0:     1,1,1,1
sl@0: #endif
sl@0: };
sl@0: 
sl@0: /*
sl@0:  * Procedures used only in this module.
sl@0:  */
sl@0: 
sl@0: static int UtfCount _ANSI_ARGS_((int ch));
sl@0: 
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * UtfCount --
sl@0:  *
sl@0:  *	Find the number of bytes that Tcl_UniCharToUtf() writes for the
sl@0:  *	character "ch".
sl@0:  *
sl@0:  * Results:
sl@0:  *	The return value is the number of bytes in the UTF-8 encoding of
sl@0:  *	"ch".  The value 0 counts as 2 bytes because it is stored as a
sl@0:  *	two-byte sequence.  A negative value counts as 3 bytes because
sl@0:  *	Tcl_UniCharToUtf() stores 0xFFFD in its place.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0:  
sl@0: INLINE static int
sl@0: UtfCount(ch)
sl@0:     int ch;			/* The Tcl_UniChar whose size is returned. */
sl@0: {
sl@0:     if ((ch > 0) && (ch < UNICODE_SELF)) {
sl@0: 	return 1;
sl@0:     }
sl@0:     if (ch < 0) {
sl@0: 	/*
sl@0: 	 * Must agree with Tcl_UniCharToUtf(), which replaces negative
sl@0: 	 * values with the three-byte encoding of 0xFFFD.  Reporting a
sl@0: 	 * smaller size would let in-place converters grow the string.
sl@0: 	 */
sl@0: 
sl@0: 	return 3;
sl@0:     }
sl@0:     if (ch <= 0x7FF) {
sl@0: 	return 2;
sl@0:     }
sl@0:     if (ch <= 0xFFFF) {
sl@0: 	return 3;
sl@0:     }
sl@0: #if TCL_UTF_MAX > 3
sl@0:     if (ch <= 0x1FFFFF) {
sl@0: 	return 4;
sl@0:     }
sl@0:     if (ch <= 0x3FFFFFF) {
sl@0: 	return 5;
sl@0:     }
sl@0:     if (ch <= 0x7FFFFFFF) {
sl@0: 	return 6;
sl@0:     }
sl@0: #endif
sl@0: 
sl@0:     /*
sl@0:      * Anything else is also replaced by 0xFFFD (three bytes).
sl@0:      */
sl@0: 
sl@0:     return 3;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharToUtf --
sl@0:  *
sl@0:  *	Encode one Tcl_UniChar as UTF-8 bytes in the caller's buffer.
sl@0:  *	Equivalent to Plan 9 runetochar().
sl@0:  *
sl@0:  *	Values 0x01-0x7F are stored as a single byte.  The value 0 is
sl@0:  *	stored as the two bytes 0xC0 0x80.  Negative values, and values
sl@0:  *	too large for the configured TCL_UTF_MAX, are stored as the
sl@0:  *	three-byte encoding of 0xFFFD.
sl@0:  *
sl@0:  * Results:
sl@0:  *	The return value is the number of bytes written to the buffer.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0:  
sl@0: EXPORT_C INLINE int
sl@0: Tcl_UniCharToUtf(ch, str)
sl@0:     int ch;			/* The Tcl_UniChar to be stored in the
sl@0: 				 * buffer. */
sl@0:     char *str;			/* Buffer in which the UTF-8 representation
sl@0: 				 * of the Tcl_UniChar is stored.  Buffer must
sl@0: 				 * be large enough to hold the UTF-8 character
sl@0: 				 * (at most TCL_UTF_MAX bytes). */
sl@0: {
sl@0:     /*
sl@0:      * One byte: the character represents itself.
sl@0:      */
sl@0: 
sl@0:     if ((ch > 0) && (ch < UNICODE_SELF)) {
sl@0: 	*str = (char) ch;
sl@0: 	return 1;
sl@0:     }
sl@0: 
sl@0:     /*
sl@0:      * Two bytes; this range includes the value 0.
sl@0:      */
sl@0: 
sl@0:     if ((ch >= 0) && (ch <= 0x7FF)) {
sl@0: 	str[0] = (char) (0xC0 | (ch >> 6));
sl@0: 	str[1] = (char) (0x80 | (ch & 0x3F));
sl@0: 	return 2;
sl@0:     }
sl@0: 
sl@0: #if TCL_UTF_MAX > 3
sl@0:     /*
sl@0:      * Four to six bytes, available only in wide builds.
sl@0:      */
sl@0: 
sl@0:     if (ch > 0xFFFF) {
sl@0: 	if (ch <= 0x1FFFFF) {
sl@0: 	    str[0] = (char) (0xF0 | (ch >> 18));
sl@0: 	    str[1] = (char) (0x80 | ((ch >> 12) & 0x3F));
sl@0: 	    str[2] = (char) (0x80 | ((ch >> 6) & 0x3F));
sl@0: 	    str[3] = (char) (0x80 | (ch & 0x3F));
sl@0: 	    return 4;
sl@0: 	}
sl@0: 	if (ch <= 0x3FFFFFF) {
sl@0: 	    str[0] = (char) (0xF8 | (ch >> 24));
sl@0: 	    str[1] = (char) (0x80 | ((ch >> 18) & 0x3F));
sl@0: 	    str[2] = (char) (0x80 | ((ch >> 12) & 0x3F));
sl@0: 	    str[3] = (char) (0x80 | ((ch >> 6) & 0x3F));
sl@0: 	    str[4] = (char) (0x80 | (ch & 0x3F));
sl@0: 	    return 5;
sl@0: 	}
sl@0: 	if (ch <= 0x7FFFFFFF) {
sl@0: 	    str[0] = (char) (0xFC | (ch >> 30));
sl@0: 	    str[1] = (char) (0x80 | ((ch >> 24) & 0x3F));
sl@0: 	    str[2] = (char) (0x80 | ((ch >> 18) & 0x3F));
sl@0: 	    str[3] = (char) (0x80 | ((ch >> 12) & 0x3F));
sl@0: 	    str[4] = (char) (0x80 | ((ch >> 6) & 0x3F));
sl@0: 	    str[5] = (char) (0x80 | (ch & 0x3F));
sl@0: 	    return 6;
sl@0: 	}
sl@0:     }
sl@0: #endif
sl@0: 
sl@0:     /*
sl@0:      * Three bytes.  Anything that reaches this point without being in
sl@0:      * the range 0x800-0xFFFF is replaced by 0xFFFD.
sl@0:      */
sl@0: 
sl@0:     if ((ch < 0) || (ch > 0xFFFF)) {
sl@0: 	ch = 0xFFFD;
sl@0:     }
sl@0:     str[0] = (char) (0xE0 | (ch >> 12));
sl@0:     str[1] = (char) (0x80 | ((ch >> 6) & 0x3F));
sl@0:     str[2] = (char) (0x80 | (ch & 0x3F));
sl@0:     return 3;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharToUtfDString --
sl@0:  *
sl@0:  *	Convert the given Unicode string to UTF-8.
sl@0:  *
sl@0:  * Results:
sl@0:  *	The return value is a pointer to the UTF-8 representation of the
sl@0:  *	Unicode string.  Storage for the return value is appended to the
sl@0:  *	end of dsPtr.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	The length of dsPtr grows by the number of UTF-8 bytes produced.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0:  
sl@0: EXPORT_C char *
sl@0: Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
sl@0:     CONST Tcl_UniChar *wString;	/* Unicode string to convert to UTF-8. */
sl@0:     int numChars;		/* Length of Unicode string in Tcl_UniChars
sl@0: 				 * (must be >= 0). */
sl@0:     Tcl_DString *dsPtr;		/* UTF-8 representation of string is
sl@0: 				 * appended to this previously initialized
sl@0: 				 * DString. */
sl@0: {
sl@0:     CONST Tcl_UniChar *w, *wEnd;
sl@0:     char *p, *string;
sl@0:     int oldLength;
sl@0: 
sl@0:     /*
sl@0:      * UTF-8 string length in bytes will be <= Unicode string length *
sl@0:      * TCL_UTF_MAX.  Only the appended part needs that worst-case room;
sl@0:      * the bytes already in the DString keep their size, so oldLength is
sl@0:      * added rather than multiplied.
sl@0:      */
sl@0: 
sl@0:     oldLength = Tcl_DStringLength(dsPtr);
sl@0:     Tcl_DStringSetLength(dsPtr, oldLength + (numChars + 1) * TCL_UTF_MAX);
sl@0:     string = Tcl_DStringValue(dsPtr) + oldLength;
sl@0: 
sl@0:     p = string;
sl@0:     wEnd = wString + numChars;
sl@0:     for (w = wString; w < wEnd; ) {
sl@0: 	p += Tcl_UniCharToUtf(*w, p);
sl@0: 	w++;
sl@0:     }
sl@0: 
sl@0:     /*
sl@0:      * Trim the DString back to the bytes actually written.
sl@0:      */
sl@0: 
sl@0:     Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
sl@0: 
sl@0:     return string;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfToUniChar --
sl@0:  *
sl@0:  *	Decode the Tcl_UniChar at the start of a UTF-8 string.  A lead
sl@0:  *	byte that is not followed by the trail bytes it announces is not
sl@0:  *	an error: the lead byte is returned as a character on its own and
sl@0:  *	decoding can continue with the next byte.  Equivalent to Plan 9
sl@0:  *	chartorune().
sl@0:  *
sl@0:  *	The caller must ensure that the source buffer is long enough that
sl@0:  *	this routine does not run off the end and dereference non-existent
sl@0:  *	memory looking for trail bytes.  If the source buffer is known to
sl@0:  *	be '\0' terminated, this cannot happen.  Otherwise, the caller
sl@0:  *	should call Tcl_UtfCharComplete() before calling this routine to
sl@0:  *	ensure that enough bytes remain in the string.
sl@0:  *
sl@0:  * Results:
sl@0:  *	*chPtr is filled with the Tcl_UniChar, and the return value is the
sl@0:  *	number of bytes from the UTF-8 string that were consumed.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0:  
sl@0: EXPORT_C int
sl@0: Tcl_UtfToUniChar(str, chPtr)
sl@0:     register CONST char *str;	 /* The UTF-8 string. */
sl@0:     register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
sl@0: 				  * by the UTF-8 string. */
sl@0: {
sl@0:     register int lead;
sl@0: 
sl@0:     lead = *((unsigned char *) str);
sl@0: 
sl@0:     /*
sl@0:      * Bytes below 0xC0 (ASCII, '\0' and naked trail bytes) represent
sl@0:      * themselves and fall straight through to the single-byte exit.
sl@0:      * Two and three byte sequences are handled inline; longer ones use
sl@0:      * a loop.
sl@0:      */
sl@0: 
sl@0:     if (lead >= 0xC0) {
sl@0: 	if (lead < 0xE0) {
sl@0: 	    /*
sl@0: 	     * Two-byte lead byte: needs one trail byte.
sl@0: 	     */
sl@0: 
sl@0: 	    if ((str[1] & 0xC0) == 0x80) {
sl@0: 		*chPtr = (Tcl_UniChar) (((lead & 0x1F) << 6)
sl@0: 			| (str[1] & 0x3F));
sl@0: 		return 2;
sl@0: 	    }
sl@0: 	} else if (lead < 0xF0) {
sl@0: 	    /*
sl@0: 	     * Three-byte lead byte: needs two trail bytes.  The second
sl@0: 	     * trail byte is only examined when the first one is valid.
sl@0: 	     */
sl@0: 
sl@0: 	    if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
sl@0: 		*chPtr = (Tcl_UniChar) (((lead & 0x0F) << 12)
sl@0: 			| ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
sl@0: 		return 3;
sl@0: 	    }
sl@0: 	}
sl@0: #if TCL_UTF_MAX > 3
sl@0: 	else {
sl@0: 	    int total = totalBytes[lead];
sl@0: 	    int extra = total - 1;
sl@0: 
sl@0: 	    if (extra > 0) {
sl@0: 		int ch = lead & (0x3F >> extra);
sl@0: 		int i;
sl@0: 
sl@0: 		/*
sl@0: 		 * Accumulate six bits per trail byte; stop at the first
sl@0: 		 * byte that is not a trail byte.
sl@0: 		 */
sl@0: 
sl@0: 		for (i = 1; i <= extra; i++) {
sl@0: 		    if ((str[i] & 0xC0) != 0x80) {
sl@0: 			break;
sl@0: 		    }
sl@0: 		    ch = (ch << 6) | (str[i] & 0x3F);
sl@0: 		}
sl@0: 		if (i > extra) {
sl@0: 		    *chPtr = ch;
sl@0: 		    return total;
sl@0: 		}
sl@0: 	    }
sl@0: 	}
sl@0: #endif
sl@0:     }
sl@0: 
sl@0:     /*
sl@0:      * Single byte, or a lead byte without its trail bytes: the byte
sl@0:      * represents itself.
sl@0:      */
sl@0: 
sl@0:     *chPtr = (Tcl_UniChar) lead;
sl@0:     return 1;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfToUniCharDString --
sl@0:  *
sl@0:  *	Convert the UTF-8 string to Unicode.
sl@0:  *
sl@0:  * Results:
sl@0:  *	The return value is a pointer to the Unicode representation of the
sl@0:  *	UTF-8 string.  Storage for the return value is appended to the
sl@0:  *	end of dsPtr.  The Unicode string is terminated with a Unicode
sl@0:  *	NULL character.  A multi-byte sequence that is cut short by the
sl@0:  *	end of the input is converted one byte at a time, each byte
sl@0:  *	representing itself.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	The length of dsPtr grows by the size in bytes of the Tcl_UniChars
sl@0:  *	produced, not counting the terminating NULL character.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C Tcl_UniChar *
sl@0: Tcl_UtfToUniCharDString(string, length, dsPtr)
sl@0:     CONST char *string;		/* UTF-8 string to convert to Unicode. */
sl@0:     int length;			/* Length of UTF-8 string in bytes, or -1
sl@0: 				 * for strlen(). */
sl@0:     Tcl_DString *dsPtr;		/* Unicode representation of string is
sl@0: 				 * appended to this previously initialized
sl@0: 				 * DString. */
sl@0: {
sl@0:     Tcl_UniChar *w, *wString;
sl@0:     CONST char *p, *end;
sl@0:     int oldLength;
sl@0: 
sl@0:     if (length < 0) {
sl@0: 	length = strlen(string);
sl@0:     }
sl@0: 
sl@0:     /*
sl@0:      * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
sl@0:      * in bytes.  oldLength is already a byte count, so only the new
sl@0:      * characters (plus the terminator) are scaled by sizeof(Tcl_UniChar).
sl@0:      */
sl@0: 
sl@0:     oldLength = Tcl_DStringLength(dsPtr);
sl@0:     Tcl_DStringSetLength(dsPtr,
sl@0: 	    oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar)));
sl@0:     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
sl@0: 
sl@0:     w = wString;
sl@0:     end = string + length;
sl@0:     for (p = string; p < end; ) {
sl@0: 	if (Tcl_UtfCharComplete(p, (int) (end - p))) {
sl@0: 	    p += TclUtfToUniChar(p, w);
sl@0: 	} else {
sl@0: 	    /*
sl@0: 	     * The lead byte announces more bytes than remain before
sl@0: 	     * "end".  Decoding it would read past the caller's buffer,
sl@0: 	     * so the byte is stored as a character of its own.
sl@0: 	     */
sl@0: 
sl@0: 	    *w = (Tcl_UniChar) UCHAR(*p);
sl@0: 	    p++;
sl@0: 	}
sl@0: 	w++;
sl@0:     }
sl@0:     *w = '\0';
sl@0:     Tcl_DStringSetLength(dsPtr,
sl@0: 	    (oldLength + ((char *) w - (char *) wString)));
sl@0: 
sl@0:     return wString;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfCharComplete --
sl@0:  *
sl@0:  *	Report whether "len" bytes are enough for Tcl_UtfToUniChar() to
sl@0:  *	decode the character that starts at "str".  Only the first byte
sl@0:  *	is examined, so this does not ensure that the UTF-8 string is
sl@0:  *	properly formed.  Equivalent to Plan 9 fullrune().
sl@0:  *
sl@0:  * Results:
sl@0:  *	The return value is 0 if the string is not long enough, non-zero
sl@0:  *	otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UtfCharComplete(str, len)
sl@0:     CONST char *str;		/* String to check if first few bytes
sl@0: 				 * contain a complete UTF-8 character. */
sl@0:     int len;			/* Length of above string in bytes. */
sl@0: {
sl@0:     /*
sl@0:      * The table gives the sequence length announced by the lead byte.
sl@0:      */
sl@0: 
sl@0:     int needed = totalBytes[*((unsigned char *) str)];
sl@0: 
sl@0:     return len >= needed;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_NumUtfChars --
sl@0:  *
sl@0:  *	Returns the number of characters (not bytes) in the UTF-8 string,
sl@0:  *	not including the terminating NULL byte.  This is equivalent to
sl@0:  *	Plan 9 utflen() and utfnlen().
sl@0:  *
sl@0:  * Results:
sl@0:  *	As above.  When a length is given and the string ends in the
sl@0:  *	middle of a multi-byte sequence, each byte of that partial
sl@0:  *	sequence is counted as one character.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0:  
sl@0: EXPORT_C int 
sl@0: Tcl_NumUtfChars(str, len)
sl@0:     register CONST char *str;	/* The UTF-8 string to measure. */
sl@0:     int len;			/* The length of the string in bytes, or -1
sl@0: 				 * for strlen(string). */
sl@0: {
sl@0:     Tcl_UniChar ch;
sl@0:     register Tcl_UniChar *chPtr = &ch;
sl@0:     register int i;
sl@0: 
sl@0:     /*
sl@0:      * The separate implementations are faster.
sl@0:      *
sl@0:      * Since this is a time-sensitive function, we also do the check for
sl@0:      * the single-byte char case specially.
sl@0:      */
sl@0: 
sl@0:     i = 0;
sl@0:     if (len < 0) {
sl@0: 	/*
sl@0: 	 * NULL-terminated input: the terminator stops any search for
sl@0: 	 * trail bytes, so no length check is needed.
sl@0: 	 */
sl@0: 
sl@0: 	while (*str != '\0') {
sl@0: 	    str += TclUtfToUniChar(str, chPtr);
sl@0: 	    i++;
sl@0: 	}
sl@0:     } else {
sl@0: 	register int n;
sl@0: 
sl@0: 	while (len > 0) {
sl@0: 	    if ((UCHAR(*str) < 0xC0) || !Tcl_UtfCharComplete(str, len)) {
sl@0: 		/*
sl@0: 		 * Either a single-byte character, or a lead byte whose
sl@0: 		 * sequence would extend past the "len" bytes the caller
sl@0: 		 * gave us.  Decoding the latter would read beyond the
sl@0: 		 * buffer, so it is counted as one character by itself.
sl@0: 		 */
sl@0: 
sl@0: 		len--;
sl@0: 		str++;
sl@0: 	    } else {
sl@0: 		n = Tcl_UtfToUniChar(str, chPtr);
sl@0: 		len -= n;
sl@0: 		str += n;
sl@0: 	    }
sl@0: 	    i++;
sl@0: 	}
sl@0:     }
sl@0:     return i;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfFindFirst --
sl@0:  *
sl@0:  *	Returns a pointer to the first occurrence of the given Tcl_UniChar
sl@0:  *	in the NULL-terminated UTF-8 string.  The NULL terminator is
sl@0:  *	considered part of the UTF-8 string, so searching for 0 finds
sl@0:  *	the terminator.  Equivalent to Plan 9 utfrune().
sl@0:  *
sl@0:  * Results:
sl@0:  *	As above.  If the Tcl_UniChar does not exist in the given string,
sl@0:  *	the return value is NULL.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0: EXPORT_C CONST char *
sl@0: Tcl_UtfFindFirst(string, ch)
sl@0:     CONST char *string;		/* The UTF-8 string to be searched. */
sl@0:     int ch;			/* The Tcl_UniChar to search for. */
sl@0: {
sl@0:     CONST char *p;
sl@0:     Tcl_UniChar current;
sl@0:     int width;
sl@0: 
sl@0:     for (p = string; ; p += width) {
sl@0: 	width = TclUtfToUniChar(p, &current);
sl@0: 	if (current == ch) {
sl@0: 	    return p;
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * The terminator is tested after the comparison so that it can
sl@0: 	 * itself be found.
sl@0: 	 */
sl@0: 
sl@0: 	if (*p == '\0') {
sl@0: 	    break;
sl@0: 	}
sl@0:     }
sl@0:     return NULL;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfFindLast --
sl@0:  *
sl@0:  *	Returns a pointer to the last occurrence of the given Tcl_UniChar
sl@0:  *	in the NULL-terminated UTF-8 string.  The NULL terminator is
sl@0:  *	considered part of the UTF-8 string, so searching for 0 finds
sl@0:  *	the terminator.  Equivalent to Plan 9 utfrrune().
sl@0:  *
sl@0:  * Results:
sl@0:  *	As above.  If the Tcl_UniChar does not exist in the given string,
sl@0:  *	the return value is NULL.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C CONST char *
sl@0: Tcl_UtfFindLast(string, ch)
sl@0:     CONST char *string;		/* The UTF-8 string to be searched. */
sl@0:     int ch;			/* The Tcl_UniChar to search for. */
sl@0: {
sl@0:     CONST char *p;
sl@0:     CONST char *match = NULL;
sl@0:     Tcl_UniChar current;
sl@0:     int width;
sl@0: 
sl@0:     /*
sl@0:      * Walk the whole string, terminator included, remembering the most
sl@0:      * recent position at which the character was seen.
sl@0:      */
sl@0: 
sl@0:     for (p = string; ; p += width) {
sl@0: 	width = TclUtfToUniChar(p, &current);
sl@0: 	if (current == ch) {
sl@0: 	    match = p;
sl@0: 	}
sl@0: 	if (*p == '\0') {
sl@0: 	    break;
sl@0: 	}
sl@0:     }
sl@0:     return match;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfNext --
sl@0:  *
sl@0:  *	Step forward over the character at the given location in a UTF-8
sl@0:  *	string.  The caller must ensure that they are not asking for the
sl@0:  *	next character after the last character in the string.
sl@0:  *
sl@0:  * Results:
sl@0:  *	The return value is the pointer to the next character in
sl@0:  *	the UTF-8 string.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0:  
sl@0: EXPORT_C CONST char *
sl@0: Tcl_UtfNext(str) 
sl@0:     CONST char *str;		    /* The current location in the string. */
sl@0: {
sl@0:     Tcl_UniChar ignored;	    /* Decoded value is not needed, only
sl@0: 				     * the number of bytes it occupied. */
sl@0:     int width;
sl@0: 
sl@0:     width = TclUtfToUniChar(str, &ignored);
sl@0:     return str + width;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfPrev --
sl@0:  *
sl@0:  *	Given a pointer to some current location in a UTF-8 string,
sl@0:  *	move backwards one character.  This works correctly when the
sl@0:  *	pointer is in the middle of a UTF-8 character.
sl@0:  *
sl@0:  *	The search looks back over at most TCL_UTF_MAX bytes for a lead
sl@0:  *	byte (0xC0 or above).  If it meets an ASCII byte or the start of
sl@0:  *	the string first, or runs out of bytes to inspect, it simply
sl@0:  *	steps back one byte.
sl@0:  *
sl@0:  * Results:
sl@0:  *	The return value is a pointer to the previous character in the
sl@0:  *	UTF-8 string.  If the current location was already at the
sl@0:  *	beginning of the string, the return value will also be a
sl@0:  *	pointer to the beginning of the string.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C CONST char *
sl@0: Tcl_UtfPrev(str, start)
sl@0:     CONST char *str;		    /* The current location in the string. */
sl@0:     CONST char *start;		    /* Pointer to the beginning of the
sl@0: 				     * string, to avoid going backwards too
sl@0: 				     * far. */
sl@0: {
sl@0:     CONST char *oneBack = str - 1;  /* Answer when no lead byte is found. */
sl@0:     CONST char *look = oneBack;
sl@0:     int remaining;
sl@0: 
sl@0:     for (remaining = TCL_UTF_MAX; remaining > 0; remaining--, look--) {
sl@0: 	int byte;
sl@0: 
sl@0: 	if (look < start) {
sl@0: 	    /*
sl@0: 	     * Ran off the front of the string; never return a pointer
sl@0: 	     * before "start".
sl@0: 	     */
sl@0: 
sl@0: 	    return (oneBack < start) ? start : oneBack;
sl@0: 	}
sl@0: 	byte = *((unsigned char *) look);
sl@0: 	if (byte >= 0xC0) {
sl@0: 	    /*
sl@0: 	     * Found a lead byte: the previous character starts here.
sl@0: 	     */
sl@0: 
sl@0: 	    return look;
sl@0: 	}
sl@0: 	if (byte < 0x80) {
sl@0: 	    /*
sl@0: 	     * An ASCII byte ends the search.
sl@0: 	     */
sl@0: 
sl@0: 	    break;
sl@0: 	}
sl@0:     }
sl@0:     return oneBack;
sl@0: }
sl@0: 	
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharAtIndex --
sl@0:  *
sl@0:  *	Returns the Unicode character represented at the specified
sl@0:  *	character (not byte) position in the UTF-8 string.  The string
sl@0:  *	must contain at least index + 1 characters; no end-of-string
sl@0:  *	check is made while stepping forward.
sl@0:  *
sl@0:  * Results:
sl@0:  *	As above.  If index is negative the string is not examined and
sl@0:  *	the return value is 0.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0:  
sl@0: EXPORT_C Tcl_UniChar
sl@0: Tcl_UniCharAtIndex(src, index)
sl@0:     register CONST char *src;	/* The UTF-8 string to dereference. */
sl@0:     register int index;		/* The position of the desired character. */
sl@0: {
sl@0:     /*
sl@0:      * Initialized so that a negative index, for which the loop body
sl@0:      * never runs, yields a defined result instead of an uninitialized
sl@0:      * value.
sl@0:      */
sl@0: 
sl@0:     Tcl_UniChar ch = 0;
sl@0: 
sl@0:     /*
sl@0:      * Decode index + 1 characters; the last one decoded is the answer.
sl@0:      */
sl@0: 
sl@0:     while (index >= 0) {
sl@0: 	index--;
sl@0: 	src += TclUtfToUniChar(src, &ch);
sl@0:     }
sl@0:     return ch;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfAtIndex --
sl@0:  *
sl@0:  *	Returns a pointer to the specified character (not byte) position
sl@0:  *	in the UTF-8 string.  No end-of-string check is made while
sl@0:  *	stepping forward.
sl@0:  *
sl@0:  * Results:
sl@0:  *	As above.  If index is zero or negative the return value is src
sl@0:  *	itself.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C CONST char *
sl@0: Tcl_UtfAtIndex(src, index)
sl@0:     register CONST char *src;	/* The UTF-8 string. */
sl@0:     register int index;		/* The position of the desired character. */
sl@0: {
sl@0:     Tcl_UniChar ignored;	/* Only the byte counts matter here. */
sl@0: 
sl@0:     for ( ; index > 0; index--) {
sl@0: 	src += TclUtfToUniChar(src, &ignored);
sl@0:     }
sl@0:     return src;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *---------------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfBackslash --
sl@0:  *
sl@0:  *	Figure out how to handle a backslash sequence.  The work is done
sl@0:  *	by TclParseBackslash(); this wrapper only chooses how many bytes
sl@0:  *	of src that routine is told it may look at.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Stores the bytes represented by the backslash sequence in dst and
sl@0:  *	returns the number of bytes written to dst.  At most TCL_UTF_MAX
sl@0:  *	bytes are written to dst; dst must have been large enough to accept
sl@0:  *	those bytes.  If readPtr isn't NULL then it is filled in with a
sl@0:  *	count of the number of bytes in the backslash sequence.  
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	The maximum number of bytes it takes to represent a Unicode
sl@0:  *	character in UTF-8 is guaranteed to be less than the number of
sl@0:  *	bytes used to express the backslash sequence that represents
sl@0:  *	that Unicode character.  If the target buffer into which the
sl@0:  *	caller is going to store the bytes that represent the Unicode
sl@0:  *	character is at least as large as the source buffer from which
sl@0:  *	the backslashed sequence was extracted, no buffer overruns should
sl@0:  *	occur.
sl@0:  *
sl@0:  *---------------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UtfBackslash(src, readPtr, dst)
sl@0:     CONST char *src;		/* Points to the backslash character of
sl@0: 				 * a backslash sequence. */
sl@0:     int *readPtr;		/* Fill in with number of characters read
sl@0: 				 * from src, unless NULL. */
sl@0:     char *dst;			/* Filled with the bytes represented by the
sl@0: 				 * backslash sequence. */
sl@0: {
sl@0: #define LINE_LENGTH 128
sl@0:     int consumed;
sl@0:     int written;
sl@0: 
sl@0:     /*
sl@0:      * First try with a fixed window, which avoids measuring src.
sl@0:      */
sl@0: 
sl@0:     written = TclParseBackslash(src, LINE_LENGTH, &consumed, dst);
sl@0: 
sl@0:     /*
sl@0:      * If the whole window was consumed the sequence may extend beyond
sl@0:      * it, so pay the price of a strlen() and parse again with the real
sl@0:      * length.
sl@0:      */
sl@0: 
sl@0:     if (consumed == LINE_LENGTH) {
sl@0: 	written = TclParseBackslash(src, (int)strlen(src), &consumed, dst);
sl@0:     }
sl@0: 
sl@0:     if (readPtr != NULL) {
sl@0: 	*readPtr = consumed;
sl@0:     }
sl@0:     return written;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfToUpper --
sl@0:  *
sl@0:  *	Convert lowercase characters to uppercase characters in a UTF
sl@0:  *	string in place.  The conversion may shrink the UTF string.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns the number of bytes in the resulting string
sl@0:  *	excluding the trailing null.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	Writes a terminating null after the last converted character.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UtfToUpper(str)
sl@0:     char *str;			/* String to convert in place. */
sl@0: {
sl@0:     Tcl_UniChar ch, upChar;
sl@0:     char *src, *dst;
sl@0:     int bytes;
sl@0: 
sl@0:     /*
sl@0:      * Iterate over the string until we hit the terminating null.  "src"
sl@0:      * reads the original text and "dst" writes the result into the same
sl@0:      * buffer; dst never gets ahead of src.
sl@0:      */
sl@0: 
sl@0:     src = dst = str;
sl@0:     while (*src) {
sl@0:         bytes = TclUtfToUniChar(src, &ch);
sl@0: 	upChar = Tcl_UniCharToUpper(ch);
sl@0: 
sl@0: 	/*
sl@0: 	 * To keep badly formed Utf strings from getting inflated by
sl@0: 	 * the conversion (thereby causing a segfault), only copy the
sl@0: 	 * upper case char to dst if its size is <= the original char.
sl@0: 	 * Otherwise the original bytes are kept.  memmove() is required
sl@0: 	 * for that copy because src and dst point into the same buffer
sl@0: 	 * and may be equal or overlap.
sl@0: 	 */
sl@0: 	
sl@0: 	if (bytes < UtfCount(upChar)) {
sl@0: 	    memmove(dst, src, (size_t) bytes);
sl@0: 	    dst += bytes;
sl@0: 	} else {
sl@0: 	    dst += Tcl_UniCharToUtf(upChar, dst);
sl@0: 	}
sl@0: 	src += bytes;
sl@0:     }
sl@0:     *dst = '\0';
sl@0:     return (dst - str);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfToLower --
sl@0:  *
sl@0:  *	Convert uppercase characters to lowercase characters in a UTF
sl@0:  *	string in place.  The conversion may shrink the UTF string.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns the number of bytes in the resulting string
sl@0:  *	excluding the trailing null.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	Writes a terminating null after the last converted character.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UtfToLower(str)
sl@0:     char *str;			/* String to convert in place. */
sl@0: {
sl@0:     Tcl_UniChar ch, lowChar;
sl@0:     char *src, *dst;
sl@0:     int bytes;
sl@0:     
sl@0:     /*
sl@0:      * Iterate over the string until we hit the terminating null.  "src"
sl@0:      * reads the original text and "dst" writes the result into the same
sl@0:      * buffer; dst never gets ahead of src.
sl@0:      */
sl@0: 
sl@0:     src = dst = str;
sl@0:     while (*src) {
sl@0: 	bytes = TclUtfToUniChar(src, &ch);
sl@0: 	lowChar = Tcl_UniCharToLower(ch);
sl@0: 
sl@0: 	/*
sl@0: 	 * To keep badly formed Utf strings from getting inflated by
sl@0: 	 * the conversion (thereby causing a segfault), only copy the
sl@0: 	 * lower case char to dst if its size is <= the original char.
sl@0: 	 * Otherwise the original bytes are kept.  memmove() is required
sl@0: 	 * for that copy because src and dst point into the same buffer
sl@0: 	 * and may be equal or overlap.
sl@0: 	 */
sl@0: 	
sl@0: 	if (bytes < UtfCount(lowChar)) {
sl@0: 	    memmove(dst, src, (size_t) bytes);
sl@0: 	    dst += bytes;
sl@0: 	} else {
sl@0: 	    dst += Tcl_UniCharToUtf(lowChar, dst);
sl@0: 	}
sl@0: 	src += bytes;
sl@0:     }
sl@0:     *dst = '\0';
sl@0:     return (dst - str);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfToTitle --
sl@0:  *
sl@0:  *	Changes the first character of a UTF string to title case or
sl@0:  *	uppercase and the rest of the string to lowercase.  The
sl@0:  *	conversion happens in place and may shrink the UTF string.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns the number of bytes in the resulting string
sl@0:  *	excluding the trailing null.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	Writes a terminating null after the last converted character.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UtfToTitle(str)
sl@0:     char *str;			/* String to convert in place. */
sl@0: {
sl@0:     Tcl_UniChar ch, titleChar, lowChar;
sl@0:     char *src, *dst;
sl@0:     int bytes;
sl@0:     
sl@0:     /*
sl@0:      * Capitalize the first character and then lowercase the rest of the
sl@0:      * characters until we get to a null.
sl@0:      *
sl@0:      * In both steps a converted character is stored only when it is no
sl@0:      * longer than the original; otherwise the original bytes are kept so
sl@0:      * that the string cannot grow.  memmove() is required for that copy
sl@0:      * because src and dst point into the same buffer and may be equal
sl@0:      * or overlap.
sl@0:      */
sl@0: 
sl@0:     src = dst = str;
sl@0: 
sl@0:     if (*src) {
sl@0: 	bytes = TclUtfToUniChar(src, &ch);
sl@0: 	titleChar = Tcl_UniCharToTitle(ch);
sl@0: 
sl@0: 	if (bytes < UtfCount(titleChar)) {
sl@0: 	    memmove(dst, src, (size_t) bytes);
sl@0: 	    dst += bytes;
sl@0: 	} else {
sl@0: 	    dst += Tcl_UniCharToUtf(titleChar, dst);
sl@0: 	}
sl@0: 	src += bytes;
sl@0:     }
sl@0:     while (*src) {
sl@0: 	bytes = TclUtfToUniChar(src, &ch);
sl@0: 	lowChar = Tcl_UniCharToLower(ch);
sl@0: 
sl@0: 	if (bytes < UtfCount(lowChar)) {
sl@0: 	    memmove(dst, src, (size_t) bytes);
sl@0: 	    dst += bytes;
sl@0: 	} else {
sl@0: 	    dst += Tcl_UniCharToUtf(lowChar, dst);
sl@0: 	}
sl@0: 	src += bytes;
sl@0:     }
sl@0:     *dst = '\0';
sl@0:     return (dst - str);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * TclpUtfNcmp2 --
sl@0:  *
sl@0:  *	Compare at most n bytes of utf-8 strings cs and ct.  Both cs
sl@0:  *	and ct are assumed to be at least n bytes long.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: int
sl@0: TclpUtfNcmp2(cs, ct, n)
sl@0:     CONST char *cs;		/* UTF string to compare to ct. */
sl@0:     CONST char *ct;		/* UTF string cs is compared to. */
sl@0:     unsigned long n;		/* Number of *bytes* to compare. */
sl@0: {
sl@0:     int left, right;
sl@0: 
sl@0:     /*
sl@0:      * We can't simply call 'memcmp(cs, ct, n);' because we need to check
sl@0:      * for Tcl's \xC0\x80 non-utf-8 null encoding.
sl@0:      * Otherwise utf-8 lexes fine in the strcmp manner.
sl@0:      *
sl@0:      * Skip the common prefix first.
sl@0:      */
sl@0: 
sl@0:     while ((n != 0) && (*cs == *ct)) {
sl@0: 	n--;
sl@0: 	cs++;
sl@0: 	ct++;
sl@0:     }
sl@0:     if (n == 0) {
sl@0: 	return 0;
sl@0:     }
sl@0: 
sl@0:     /*
sl@0:      * The strings differ at this byte.  A 0xC0 that begins the pair
sl@0:      * \xC0\x80 stands for the null character and must compare as 0.
sl@0:      */
sl@0: 
sl@0:     left = UCHAR(*cs);
sl@0:     right = UCHAR(*ct);
sl@0:     if ((left == 0xC0) && (UCHAR(cs[1]) == 0x80)) {
sl@0: 	left = 0;
sl@0:     }
sl@0:     if ((right == 0xC0) && (UCHAR(ct[1]) == 0x80)) {
sl@0: 	right = 0;
sl@0:     }
sl@0:     return left - right;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfNcmp --
sl@0:  *
sl@0:  *	Compare at most n UTF chars of string cs to string ct.  Both cs
sl@0:  *	and ct are assumed to be at least n UTF chars long.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UtfNcmp(cs, ct, n)
sl@0:     CONST char *cs;		/* UTF string to compare to ct. */
sl@0:     CONST char *ct;		/* UTF string cs is compared to. */
sl@0:     unsigned long n;		/* Number of UTF chars to compare. */
sl@0: {
sl@0:     Tcl_UniChar left, right;
sl@0: 
sl@0:     /*
sl@0:      * Cannot use 'memcmp(cs, ct, n);' as byte representation of
sl@0:      * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
sl@0:      * representation of \u0001 (the byte 0x01.)
sl@0:      *
sl@0:      * n counts characters, not bytes, so each string is decoded one
sl@0:      * character at a time.  Both strings are required to hold at least
sl@0:      * n characters, which is why there is no check for \0.
sl@0:      */
sl@0: 
sl@0:     for ( ; n > 0; n--) {
sl@0: 	cs += TclUtfToUniChar(cs, &left);
sl@0: 	ct += TclUtfToUniChar(ct, &right);
sl@0: 	if (left != right) {
sl@0: 	    return (left - right);
sl@0: 	}
sl@0:     }
sl@0:     return 0;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UtfNcasecmp --
sl@0:  *
sl@0:  *	Case-insensitive comparison of the first n UTF characters (not
sl@0:  *	bytes) of cs and ct.  The caller guarantees that both strings
sl@0:  *	contain at least n characters; no end-of-string check is made.
sl@0:  *
sl@0:  * Results:
sl@0:  *	0 if the first n characters are equal once lowercased, otherwise
sl@0:  *	the difference between the lowercase forms of the first pair of
sl@0:  *	differing characters (<0 if cs < ct, >0 if cs > ct).
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UtfNcasecmp(cs, ct, n)
sl@0:     CONST char *cs;		/* UTF string to compare to ct. */
sl@0:     CONST char *ct;		/* UTF string cs is compared to. */
sl@0:     unsigned long n;			/* Number of UTF chars to compare. */
sl@0: {
sl@0:     Tcl_UniChar left, right;
sl@0: 
sl@0:     for ( ; n != 0; n--) {
sl@0: 	cs += TclUtfToUniChar(cs, &left);
sl@0: 	ct += TclUtfToUniChar(ct, &right);
sl@0: 	if (left == right) {
sl@0: 	    continue;
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * Only characters that differ as written need to be case folded.
sl@0: 	 */
sl@0: 
sl@0: 	left = Tcl_UniCharToLower(left);
sl@0: 	right = Tcl_UniCharToLower(right);
sl@0: 	if (left != right) {
sl@0: 	    return (left - right);
sl@0: 	}
sl@0:     }
sl@0:     return 0;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharToUpper --
sl@0:  *
sl@0:  *	Map a Unicode character to its uppercase form using the
sl@0:  *	character information table.
sl@0:  *
sl@0:  * Results:
sl@0:  *	When bit 0x04 of the character's case type is set, ch minus the
sl@0:  *	table delta; otherwise ch itself.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C Tcl_UniChar
sl@0: Tcl_UniCharToUpper(ch)
sl@0:     int ch;			/* Unicode character to convert. */
sl@0: {
sl@0:     int info = GetUniCharInfo(ch);
sl@0: 
sl@0:     if (!(GetCaseType(info) & 0x04)) {
sl@0: 	return ch;
sl@0:     }
sl@0:     return (Tcl_UniChar) (ch - GetDelta(info));
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharToLower --
sl@0:  *
sl@0:  *	Map a Unicode character to its lowercase form using the
sl@0:  *	character information table.
sl@0:  *
sl@0:  * Results:
sl@0:  *	When bit 0x02 of the character's case type is set, ch plus the
sl@0:  *	table delta; otherwise ch itself.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C Tcl_UniChar
sl@0: Tcl_UniCharToLower(ch)
sl@0:     int ch;			/* Unicode character to convert. */
sl@0: {
sl@0:     int info = GetUniCharInfo(ch);
sl@0: 
sl@0:     if (!(GetCaseType(info) & 0x02)) {
sl@0: 	return ch;
sl@0:     }
sl@0:     return (Tcl_UniChar) (ch + GetDelta(info));
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharToTitle --
sl@0:  *
sl@0:  *	Map a Unicode character to its titlecase form using the
sl@0:  *	character information table.
sl@0:  *
sl@0:  * Results:
sl@0:  *	If bit 0x1 of the case type is set, the neighbouring code point:
sl@0:  *	ch - 1 when bit 0x4 is also set, ch + 1 otherwise.  If the case
sl@0:  *	type is exactly 0x4, ch minus the table delta.  In every other
sl@0:  *	case ch itself.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C Tcl_UniChar
sl@0: Tcl_UniCharToTitle(ch)
sl@0:     int ch;			/* Unicode character to convert. */
sl@0: {
sl@0:     int info = GetUniCharInfo(ch);
sl@0:     int caseType = GetCaseType(info);
sl@0: 
sl@0:     if (caseType & 0x1) {
sl@0: 	/*
sl@0: 	 * The titlecase form sits right next to ch; the direction depends
sl@0: 	 * on the case of the original character.
sl@0: 	 */
sl@0: 
sl@0: 	if (caseType & 0x4) {
sl@0: 	    return (Tcl_UniChar) (ch - 1);
sl@0: 	}
sl@0: 	return (Tcl_UniChar) (ch + 1);
sl@0:     }
sl@0:     if (caseType == 0x4) {
sl@0: 	return (Tcl_UniChar) (ch - GetDelta(info));
sl@0:     }
sl@0:     return ch;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharLen --
sl@0:  *
sl@0:  *	Count the UniChars that precede the terminating zero UniChar of
sl@0:  *	str.  The input must be zero terminated.
sl@0:  *
sl@0:  * Results:
sl@0:  *	The length of str in UniChars (not bytes), terminator excluded.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharLen(str)
sl@0:     CONST Tcl_UniChar *str;	/* Unicode string to find length of. */
sl@0: {
sl@0:     int count;
sl@0: 
sl@0:     for (count = 0; str[count] != 0; count++) {
sl@0: 	/* empty body */
sl@0:     }
sl@0:     return count;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharNcmp --
sl@0:  *
sl@0:  *	Compare the first n unichars of cs and ct.  The caller
sl@0:  *	guarantees that both strings hold at least n unichars.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.  When
sl@0:  *	WORDS_BIGENDIAN is defined the value comes from memcmp();
sl@0:  *	otherwise it is the difference of the first differing unichars.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharNcmp(cs, ct, n)
sl@0:     CONST Tcl_UniChar *cs;		/* Unicode string to compare to ct. */
sl@0:     CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */
sl@0:     unsigned long n;			/* Number of unichars to compare. */
sl@0: {
sl@0: #ifdef WORDS_BIGENDIAN
sl@0:     /*
sl@0:      * Known big-endian machine: the byte order in memory agrees with the
sl@0:      * numeric order of the unichars, so memcmp() can be used directly.
sl@0:      */
sl@0: 
sl@0:     return memcmp(cs, ct, n * sizeof(Tcl_UniChar));
sl@0: 
sl@0: #else /* !WORDS_BIGENDIAN */
sl@0:     /*
sl@0:      * memcmp() would not order the strings lexically here, so compare
sl@0:      * one unichar at a time.
sl@0:      */
sl@0: 
sl@0:     while (n != 0) {
sl@0: 	if (*cs != *ct) {
sl@0: 	    return (*cs - *ct);
sl@0: 	}
sl@0: 	cs++;
sl@0: 	ct++;
sl@0: 	n--;
sl@0:     }
sl@0:     return 0;
sl@0: #endif /* WORDS_BIGENDIAN */
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharNcasecmp --
sl@0:  *
sl@0:  *	Case-insensitive comparison of the first n unichars of cs and
sl@0:  *	ct.  The caller guarantees that both strings hold at least n
sl@0:  *	unichars.
sl@0:  *
sl@0:  * Results:
sl@0:  *	0 if the first n unichars are equal once lowercased, otherwise
sl@0:  *	the difference between the lowercase forms of the first pair of
sl@0:  *	differing unichars (<0 if cs < ct, >0 if cs > ct).
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharNcasecmp(cs, ct, n)
sl@0:     CONST Tcl_UniChar *cs;		/* Unicode string to compare to ct. */
sl@0:     CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */
sl@0:     unsigned long n;			/* Number of unichars to compare. */
sl@0: {
sl@0:     Tcl_UniChar left, right;
sl@0: 
sl@0:     while (n != 0) {
sl@0: 	if (*cs != *ct) {
sl@0: 	    /*
sl@0: 	     * Fold case only for unichars that differ as written.
sl@0: 	     */
sl@0: 
sl@0: 	    left = Tcl_UniCharToLower(*cs);
sl@0: 	    right = Tcl_UniCharToLower(*ct);
sl@0: 	    if (left != right) {
sl@0: 		return (left - right);
sl@0: 	    }
sl@0: 	}
sl@0: 	cs++;
sl@0: 	ct++;
sl@0: 	n--;
sl@0:     }
sl@0:     return 0;
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsAlnum --
sl@0:  *
sl@0:  *	Test whether the Unicode category of a character is one of the
sl@0:  *	letter categories (ALPHA_BITS) or the decimal digit category
sl@0:  *	(DIGIT_BITS).
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns 1 if the character is alphanumeric, 0 otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsAlnum(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
sl@0:     int alnumSet = ALPHA_BITS | DIGIT_BITS;
sl@0: 
sl@0:     /* Bit number "category" of the set tells whether it is a member. */
sl@0:     return ((alnumSet >> category) & 1);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsAlpha --
sl@0:  *
sl@0:  *	Test whether the Unicode category of a character is one of the
sl@0:  *	letter categories collected in ALPHA_BITS.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns 1 if the character is alphabetic, 0 otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsAlpha(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
sl@0:     int alphaSet = ALPHA_BITS;
sl@0: 
sl@0:     /* Bit number "category" of the set tells whether it is a member. */
sl@0:     return ((alphaSet >> category) & 1);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsControl --
sl@0:  *
sl@0:  *	Test whether the Unicode category of a character is CONTROL.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns 1 if the character is a control character, 0 otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsControl(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
sl@0: 
sl@0:     return (category == CONTROL);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsDigit --
sl@0:  *
sl@0:  *	Test whether the Unicode category of a character is
sl@0:  *	DECIMAL_DIGIT_NUMBER.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns 1 if the character is a decimal digit, 0 otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsDigit(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
sl@0: 
sl@0:     return (category == DECIMAL_DIGIT_NUMBER);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsGraph --
sl@0:  *
sl@0:  *	Test if a character is any Unicode print character except the
sl@0:  *	space character U+0020.  A character is a print character when
sl@0:  *	its Unicode category is one of those collected in PRINT_BITS.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns 1 if the character is printable and is not ' ',
sl@0:  *	0 otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsGraph(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0: 
sl@0:     /*
sl@0:      * Compare the complete code point with ' '.  Narrowing ch to an
sl@0:      * unsigned char first would make every character whose low byte is
sl@0:      * 0x20 (U+0120, U+2020, ...) look like a space and be rejected.
sl@0:      */
sl@0: 
sl@0:     return (((PRINT_BITS >> category) & 1) && (ch != ' '));
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsLower --
sl@0:  *
sl@0:  *	Test whether the Unicode category of a character is
sl@0:  *	LOWERCASE_LETTER.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns 1 if the character is lowercase, 0 otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsLower(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
sl@0: 
sl@0:     return (category == LOWERCASE_LETTER);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsPrint --
sl@0:  *
sl@0:  *	Test whether the Unicode category of a character is one of the
sl@0:  *	printable categories collected in PRINT_BITS.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns 1 if the character is printable, 0 otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsPrint(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
sl@0:     int printSet = PRINT_BITS;
sl@0: 
sl@0:     /* Bit number "category" of the set tells whether it is a member. */
sl@0:     return ((printSet >> category) & 1);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsPunct --
sl@0:  *
sl@0:  *	Test whether the Unicode category of a character is one of the
sl@0:  *	punctuation categories collected in PUNCT_BITS.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns 1 if the character is punctuation, 0 otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsPunct(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
sl@0:     int punctSet = PUNCT_BITS;
sl@0: 
sl@0:     /* Bit number "category" of the set tells whether it is a member. */
sl@0:     return ((punctSet >> category) & 1);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsSpace --
sl@0:  *
sl@0:  *	Test if a character is a whitespace Unicode character.  Values
sl@0:  *	below 0x80 are classified by the C library's isspace(); all
sl@0:  *	others by checking their Unicode category against SPACE_BITS.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns non-zero if character is a space.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsSpace(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     int category;
sl@0: 
sl@0:     /*
sl@0:      * Take the short cut through the standard C function when possible.
sl@0:      */
sl@0: 
sl@0:     if (ch < 0x80) {
sl@0: 	return isspace(UCHAR(ch)); /* INTL: ISO space */
sl@0:     }
sl@0: 
sl@0:     /*
sl@0:      * Everything else is looked up in the Unicode table.
sl@0:      */
sl@0: 
sl@0:     category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
sl@0:     return ((SPACE_BITS >> category) & 1);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsUpper --
sl@0:  *
sl@0:  *	Test whether the Unicode category of a character is
sl@0:  *	UPPERCASE_LETTER.
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns 1 if the character is uppercase, 0 otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsUpper(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
sl@0: 
sl@0:     return (category == UPPERCASE_LETTER);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharIsWordChar --
sl@0:  *
sl@0:  *	Test whether the Unicode category of a character is a letter
sl@0:  *	category (ALPHA_BITS), the decimal digit category (DIGIT_BITS)
sl@0:  *	or connector punctuation (CONNECTOR_BITS).
sl@0:  *
sl@0:  * Results:
sl@0:  *	Returns 1 if the character is a word character, 0 otherwise.
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharIsWordChar(ch)
sl@0:     int ch;			/* Unicode character to test. */
sl@0: {
sl@0:     int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
sl@0:     int wordSet = ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS;
sl@0: 
sl@0:     /* Bit number "category" of the set tells whether it is a member. */
sl@0:     return ((wordSet >> category) & 1);
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * Tcl_UniCharCaseMatch --
sl@0:  *
sl@0:  *	See if a particular Unicode string matches a particular pattern.
sl@0:  *	Allows case insensitivity.  This is the Unicode equivalent of
sl@0:  *	the char* Tcl_StringCaseMatch.  The UniChar strings must be
sl@0:  *	NULL-terminated.  This has no provision for counted UniChar
sl@0:  *	strings, thus should not be used where NULLs are expected in the
sl@0:  *	UniChar string.  Use TclUniCharMatch where possible.
sl@0:  *
sl@0:  * Results:
sl@0:  *	The return value is 1 if string matches pattern, and
sl@0:  *	0 otherwise.  The matching operation permits the following
sl@0:  *	special characters in the pattern: *?\[] (see the manual
sl@0:  *	entry for details on what these mean).
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: EXPORT_C int
sl@0: Tcl_UniCharCaseMatch(string, pattern, nocase)
sl@0:     CONST Tcl_UniChar *string;	/* Unicode String. */
sl@0:     CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special
sl@0: 				 * characters. */
sl@0:     int nocase;			/* 0 for case sensitive, 1 for insensitive */
sl@0: {
sl@0:     Tcl_UniChar ch1, p;
sl@0:     
sl@0:     while (1) {
sl@0: 	p = *pattern;
sl@0: 	
sl@0: 	/*
sl@0: 	 * See if we're at the end of both the pattern and the string.  If
sl@0: 	 * so, we succeeded.  If we're at the end of the pattern but not at
sl@0: 	 * the end of the string, we failed.
sl@0: 	 */
sl@0: 	
sl@0: 	if (p == 0) {
sl@0: 	    return (*string == 0);
sl@0: 	}
sl@0: 	if ((*string == 0) && (p != '*')) {
sl@0: 	    return 0;
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * Check for a "*" as the next pattern character.  It matches any
sl@0: 	 * substring.  We handle this by skipping all the characters up to the
sl@0: 	 * next matching one in the pattern, and then calling ourselves
sl@0: 	 * recursively for each postfix of string, until either we match or we
sl@0: 	 * reach the end of the string.
sl@0: 	 */
sl@0: 	
sl@0: 	if (p == '*') {
sl@0: 	    /*
sl@0: 	     * Skip all successive *'s in the pattern
sl@0: 	     */
sl@0: 	    while (*(++pattern) == '*') {}
sl@0: 	    p = *pattern;
sl@0: 	    if (p == 0) {
sl@0: 		return 1;
sl@0: 	    }
sl@0: 	    if (nocase) {
sl@0: 		p = Tcl_UniCharToLower(p);
sl@0: 	    }
sl@0: 	    while (1) {
sl@0: 		/*
sl@0: 		 * Optimization for matching - cruise through the string
sl@0: 		 * quickly if the next char in the pattern isn't a special
sl@0: 		 * character
sl@0: 		 */
sl@0: 		if ((p != '[') && (p != '?') && (p != '\\')) {
sl@0: 		    if (nocase) {
sl@0: 			while (*string && (p != *string)
sl@0: 				&& (p != Tcl_UniCharToLower(*string))) {
sl@0: 			    string++;
sl@0: 			}
sl@0: 		    } else {
sl@0: 			while (*string && (p != *string)) { string++; }
sl@0: 		    }
sl@0: 		}
sl@0: 		if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {
sl@0: 		    return 1;
sl@0: 		}
sl@0: 		if (*string == 0) {
sl@0: 		    return 0;
sl@0: 		}
sl@0: 		string++;
sl@0: 	    }
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * Check for a "?" as the next pattern character.  It matches
sl@0: 	 * any single character.
sl@0: 	 */
sl@0: 
sl@0: 	if (p == '?') {
sl@0: 	    pattern++;
sl@0: 	    string++;
sl@0: 	    continue;
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * Check for a "[" as the next pattern character.  It is followed
sl@0: 	 * by a list of characters that are acceptable, or by a range
sl@0: 	 * (two characters separated by "-").
sl@0: 	 */
sl@0: 	
sl@0: 	if (p == '[') {
sl@0: 	    Tcl_UniChar startChar, endChar;
sl@0: 
sl@0: 	    pattern++;
sl@0: 	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
sl@0: 	    string++;
sl@0: 	    while (1) {
sl@0: 		if ((*pattern == ']') || (*pattern == 0)) {
sl@0: 		    return 0;
sl@0: 		}
sl@0: 		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
sl@0: 		pattern++;
sl@0: 		if (*pattern == '-') {
sl@0: 		    pattern++;
sl@0: 		    if (*pattern == 0) {
sl@0: 			return 0;
sl@0: 		    }
sl@0: 		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
sl@0: 			    : *pattern);
sl@0: 		    pattern++;
sl@0: 		    if (((startChar <= ch1) && (ch1 <= endChar))
sl@0: 			    || ((endChar <= ch1) && (ch1 <= startChar))) {
sl@0: 			/*
sl@0: 			 * Matches ranges of form [a-z] or [z-a].
sl@0: 			 */
sl@0: 			break;
sl@0: 		    }
sl@0: 		} else if (startChar == ch1) {
sl@0: 		    break;
sl@0: 		}
sl@0: 	    }
sl@0: 	    while (*pattern != ']') {
sl@0: 		if (*pattern == 0) {
sl@0: 		    pattern--;
sl@0: 		    break;
sl@0: 		}
sl@0: 		pattern++;
sl@0: 	    }
sl@0: 	    pattern++;
sl@0: 	    continue;
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * If the next pattern character is '\', just strip off the '\'
sl@0: 	 * so we do exact matching on the character that follows.
sl@0: 	 */
sl@0: 
sl@0: 	if (p == '\\') {
sl@0: 	    if (*(++pattern) == '\0') {
sl@0: 		return 0;
sl@0: 	    }
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * There's no special character.  Just make sure that the next
sl@0: 	 * bytes of each string match.
sl@0: 	 */
sl@0: 
sl@0: 	if (nocase) {
sl@0: 	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
sl@0: 		return 0;
sl@0: 	    }
sl@0: 	} else if (*string != *pattern) {
sl@0: 	    return 0;
sl@0: 	}
sl@0: 	string++;
sl@0: 	pattern++;
sl@0:     }
sl@0: }
sl@0: 
sl@0: /*
sl@0:  *----------------------------------------------------------------------
sl@0:  *
sl@0:  * TclUniCharMatch --
sl@0:  *
sl@0:  *	See if a particular Unicode string matches a particular pattern.
sl@0:  *	Allows case insensitivity.  This is the Unicode equivalent of the
sl@0:  *	char* Tcl_StringCaseMatch.  This variant of Tcl_UniCharCaseMatch
sl@0:  *	uses counted strings, so embedded zero UniChars are allowed.
sl@0:  *	Neither the string nor the pattern is read at or beyond its
sl@0:  *	counted end, so neither has to be terminated.
sl@0:  *
sl@0:  * Results:
sl@0:  *	The return value is 1 if string matches pattern, and
sl@0:  *	0 otherwise.  The matching operation permits the following
sl@0:  *	special characters in the pattern: *?\[] (see the manual
sl@0:  *	entry for details on what these mean).
sl@0:  *
sl@0:  * Side effects:
sl@0:  *	None.
sl@0:  *
sl@0:  *----------------------------------------------------------------------
sl@0:  */
sl@0: 
sl@0: int
sl@0: TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
sl@0:     CONST Tcl_UniChar *string;	/* Unicode String. */
sl@0:     int strLen;			/* length of String */
sl@0:     CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special
sl@0: 				 * characters. */
sl@0:     int ptnLen;			/* length of Pattern */
sl@0:     int nocase;			/* 0 for case sensitive, 1 for insensitive */
sl@0: {
sl@0:     CONST Tcl_UniChar *stringEnd, *patternEnd;
sl@0:     Tcl_UniChar p;
sl@0: 
sl@0:     stringEnd  = string + strLen;
sl@0:     patternEnd = pattern + ptnLen;
sl@0: 
sl@0:     while (1) {
sl@0: 	/*
sl@0: 	 * See if we're at the end of both the pattern and the string.  If
sl@0: 	 * so, we succeeded.  If we're at the end of the pattern but not at
sl@0: 	 * the end of the string, we failed.
sl@0: 	 */
sl@0: 
sl@0: 	if (pattern == patternEnd) {
sl@0: 	    return (string == stringEnd);
sl@0: 	}
sl@0: 	p = *pattern;
sl@0: 	if ((string == stringEnd) && (p != '*')) {
sl@0: 	    return 0;
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * Check for a "*" as the next pattern character.  It matches any
sl@0: 	 * substring.  We handle this by skipping all the characters up to the
sl@0: 	 * next matching one in the pattern, and then calling ourselves
sl@0: 	 * recursively for each postfix of string, until either we match or we
sl@0: 	 * reach the end of the string.
sl@0: 	 */
sl@0: 
sl@0: 	if (p == '*') {
sl@0: 	    /*
sl@0: 	     * Skip all successive *'s in the pattern.  The bound is tested
sl@0: 	     * before the dereference so that the unit following the counted
sl@0: 	     * pattern is never read (it need not be a terminator).
sl@0: 	     */
sl@0: 
sl@0: 	    while ((++pattern < patternEnd) && (*pattern == '*')) {
sl@0: 		/* empty body */
sl@0: 	    }
sl@0: 	    if (pattern == patternEnd) {
sl@0: 		return 1;
sl@0: 	    }
sl@0: 	    p = *pattern;
sl@0: 	    if (nocase) {
sl@0: 		p = Tcl_UniCharToLower(p);
sl@0: 	    }
sl@0: 	    while (1) {
sl@0: 		/*
sl@0: 		 * Optimization for matching - cruise through the string
sl@0: 		 * quickly if the next char in the pattern isn't a special
sl@0: 		 * character
sl@0: 		 */
sl@0: 
sl@0: 		if ((p != '[') && (p != '?') && (p != '\\')) {
sl@0: 		    if (nocase) {
sl@0: 			while ((string < stringEnd) && (p != *string)
sl@0: 				&& (p != Tcl_UniCharToLower(*string))) {
sl@0: 			    string++;
sl@0: 			}
sl@0: 		    } else {
sl@0: 			while ((string < stringEnd) && (p != *string)) {
sl@0: 			    string++;
sl@0: 			}
sl@0: 		    }
sl@0: 		}
sl@0: 		if (TclUniCharMatch(string, stringEnd - string,
sl@0: 			pattern, patternEnd - pattern, nocase)) {
sl@0: 		    return 1;
sl@0: 		}
sl@0: 		if (string == stringEnd) {
sl@0: 		    return 0;
sl@0: 		}
sl@0: 		string++;
sl@0: 	    }
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * Check for a "?" as the next pattern character.  It matches
sl@0: 	 * any single character.
sl@0: 	 */
sl@0: 
sl@0: 	if (p == '?') {
sl@0: 	    pattern++;
sl@0: 	    string++;
sl@0: 	    continue;
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * Check for a "[" as the next pattern character.  It is followed
sl@0: 	 * by a list of characters that are acceptable, or by a range
sl@0: 	 * (two characters separated by "-").
sl@0: 	 */
sl@0: 
sl@0: 	if (p == '[') {
sl@0: 	    Tcl_UniChar ch1, startChar, endChar;
sl@0: 
sl@0: 	    pattern++;
sl@0: 	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
sl@0: 	    string++;
sl@0: 	    while (1) {
sl@0: 		/*
sl@0: 		 * End of the set, or of the pattern, without a matching
sl@0: 		 * member: no match.  Test the bound first so *pattern is
sl@0: 		 * only read inside the counted pattern.
sl@0: 		 */
sl@0: 
sl@0: 		if ((pattern == patternEnd) || (*pattern == ']')) {
sl@0: 		    return 0;
sl@0: 		}
sl@0: 		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
sl@0: 		pattern++;
sl@0: 		if ((pattern < patternEnd) && (*pattern == '-')) {
sl@0: 		    pattern++;
sl@0: 		    if (pattern == patternEnd) {
sl@0: 			return 0;
sl@0: 		    }
sl@0: 		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
sl@0: 			    : *pattern);
sl@0: 		    pattern++;
sl@0: 		    if (((startChar <= ch1) && (ch1 <= endChar))
sl@0: 			    || ((endChar <= ch1) && (ch1 <= startChar))) {
sl@0: 			/*
sl@0: 			 * Matches ranges of form [a-z] or [z-a].
sl@0: 			 */
sl@0: 			break;
sl@0: 		    }
sl@0: 		} else if (startChar == ch1) {
sl@0: 		    break;
sl@0: 		}
sl@0: 	    }
sl@0: 
sl@0: 	    /*
sl@0: 	     * A member matched.  Skip the rest of the set: stop just after
sl@0: 	     * its "]", or exactly at patternEnd when the set is never
sl@0: 	     * closed.
sl@0: 	     */
sl@0: 
sl@0: 	    while ((pattern < patternEnd) && (*pattern != ']')) {
sl@0: 		pattern++;
sl@0: 	    }
sl@0: 	    if (pattern < patternEnd) {
sl@0: 		pattern++;
sl@0: 	    }
sl@0: 	    continue;
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * If the next pattern character is '\', just strip off the '\'
sl@0: 	 * so we do exact matching on the character that follows.
sl@0: 	 */
sl@0: 
sl@0: 	if (p == '\\') {
sl@0: 	    if (++pattern == patternEnd) {
sl@0: 		return 0;
sl@0: 	    }
sl@0: 	}
sl@0: 
sl@0: 	/*
sl@0: 	 * There's no special character.  Just make sure that the next
sl@0: 	 * characters of each string match.
sl@0: 	 */
sl@0: 
sl@0: 	if (nocase) {
sl@0: 	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
sl@0: 		return 0;
sl@0: 	    }
sl@0: 	} else if (*string != *pattern) {
sl@0: 	    return 0;
sl@0: 	}
sl@0: 	string++;
sl@0: 	pattern++;
sl@0:     }
sl@0: }