sl@0: /* sl@0: * tclUtf.c -- sl@0: * sl@0: * Routines for manipulating UTF-8 strings. sl@0: * sl@0: * Copyright (c) 1997-1998 Sun Microsystems, Inc. sl@0: * Portions Copyright (c) 2007-2008 Nokia Corporation and/or its subsidiaries. All rights reserved. sl@0: * sl@0: * See the file "license.terms" for information on usage and redistribution sl@0: * of this file, and for a DISCLAIMER OF ALL WARRANTIES. sl@0: * sl@0: * RCS: @(#) $Id: tclUtf.c,v 1.30.2.3 2005/09/07 14:35:56 dgp Exp $ sl@0: */ sl@0: sl@0: #include "tclInt.h" sl@0: sl@0: /* sl@0: * Include the static character classification tables and macros. sl@0: */ sl@0: sl@0: #include "tclUniData.c" sl@0: sl@0: /* sl@0: * The following macros are used for fast character category tests. The sl@0: * x_BITS values are shifted right by the category value to determine whether sl@0: * the given category is included in the set. sl@0: */ sl@0: sl@0: #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ sl@0: | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER)) sl@0: sl@0: #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) sl@0: sl@0: #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ sl@0: | (1 << PARAGRAPH_SEPARATOR)) sl@0: sl@0: #define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) sl@0: sl@0: #define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \ sl@0: (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ sl@0: (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ sl@0: (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \ sl@0: (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ sl@0: (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ sl@0: (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \ sl@0: (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ sl@0: (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) sl@0: sl@0: #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ sl@0: (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ sl@0: (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ sl@0: (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) sl@0: sl@0: /* sl@0: * Unicode characters less than this value are represented by themselves sl@0: * in UTF-8 strings. sl@0: */ sl@0: sl@0: #define UNICODE_SELF 0x80 sl@0: sl@0: /* sl@0: * The following structures are used when mapping between Unicode (UCS-2) sl@0: * and UTF-8. sl@0: */ sl@0: sl@0: static CONST unsigned char totalBytes[256] = { sl@0: 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, sl@0: 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, sl@0: 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, sl@0: 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, sl@0: 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, sl@0: 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, sl@0: 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, sl@0: 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, sl@0: #if TCL_UTF_MAX > 3 sl@0: 4,4,4,4,4,4,4,4, sl@0: #else sl@0: 1,1,1,1,1,1,1,1, sl@0: #endif sl@0: #if TCL_UTF_MAX > 4 sl@0: 5,5,5,5, sl@0: #else sl@0: 1,1,1,1, sl@0: #endif sl@0: #if TCL_UTF_MAX > 5 sl@0: 6,6,6,6 sl@0: #else sl@0: 1,1,1,1 sl@0: #endif sl@0: }; sl@0: sl@0: /* sl@0: * Procedures used only in this module. sl@0: */ sl@0: sl@0: static int UtfCount _ANSI_ARGS_((int ch)); sl@0: sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * UtfCount -- sl@0: * sl@0: * Find the number of bytes in the Utf character "ch". sl@0: * sl@0: * Results: sl@0: * The return values is the number of bytes in the Utf character "ch". sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: INLINE static int sl@0: UtfCount(ch) sl@0: int ch; /* The Tcl_UniChar whose size is returned. */ sl@0: { sl@0: if ((ch > 0) && (ch < UNICODE_SELF)) { sl@0: return 1; sl@0: } sl@0: if (ch <= 0x7FF) { sl@0: return 2; sl@0: } sl@0: if (ch <= 0xFFFF) { sl@0: return 3; sl@0: } sl@0: #if TCL_UTF_MAX > 3 sl@0: if (ch <= 0x1FFFFF) { sl@0: return 4; sl@0: } sl@0: if (ch <= 0x3FFFFFF) { sl@0: return 5; sl@0: } sl@0: if (ch <= 0x7FFFFFFF) { sl@0: return 6; sl@0: } sl@0: #endif sl@0: return 3; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharToUtf -- sl@0: * sl@0: * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the sl@0: * provided buffer. Equivalent to Plan 9 runetochar(). sl@0: * sl@0: * Results: sl@0: * The return values is the number of bytes in the buffer that sl@0: * were consumed. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C INLINE int sl@0: Tcl_UniCharToUtf(ch, str) sl@0: int ch; /* The Tcl_UniChar to be stored in the sl@0: * buffer. */ sl@0: char *str; /* Buffer in which the UTF-8 representation sl@0: * of the Tcl_UniChar is stored. Buffer must sl@0: * be large enough to hold the UTF-8 character sl@0: * (at most TCL_UTF_MAX bytes). */ sl@0: { sl@0: if ((ch > 0) && (ch < UNICODE_SELF)) { sl@0: str[0] = (char) ch; sl@0: return 1; sl@0: } sl@0: if (ch >= 0) { sl@0: if (ch <= 0x7FF) { sl@0: str[1] = (char) ((ch | 0x80) & 0xBF); sl@0: str[0] = (char) ((ch >> 6) | 0xC0); sl@0: return 2; sl@0: } sl@0: if (ch <= 0xFFFF) { sl@0: three: sl@0: str[2] = (char) ((ch | 0x80) & 0xBF); sl@0: str[1] = (char) (((ch >> 6) | 0x80) & 0xBF); sl@0: str[0] = (char) ((ch >> 12) | 0xE0); sl@0: return 3; sl@0: } sl@0: sl@0: #if TCL_UTF_MAX > 3 sl@0: if (ch <= 0x1FFFFF) { sl@0: str[3] = (char) ((ch | 0x80) & 0xBF); sl@0: str[2] = (char) (((ch >> 6) | 0x80) & 0xBF); sl@0: str[1] = (char) (((ch >> 12) | 0x80) & 0xBF); sl@0: str[0] = (char) ((ch >> 18) | 0xF0); sl@0: return 4; sl@0: } sl@0: if (ch <= 0x3FFFFFF) { sl@0: str[4] = (char) ((ch | 0x80) & 0xBF); sl@0: str[3] = (char) (((ch >> 6) | 0x80) & 0xBF); sl@0: str[2] = (char) (((ch >> 12) | 0x80) & 0xBF); sl@0: str[1] = (char) (((ch >> 18) | 0x80) & 0xBF); sl@0: str[0] = (char) ((ch >> 24) | 0xF8); sl@0: return 5; sl@0: } sl@0: if (ch <= 0x7FFFFFFF) { sl@0: str[5] = (char) ((ch | 0x80) & 0xBF); sl@0: str[4] = (char) (((ch >> 6) | 0x80) & 0xBF); sl@0: str[3] = (char) (((ch >> 12) | 0x80) & 0xBF); sl@0: str[2] = (char) (((ch >> 18) | 0x80) & 0xBF); sl@0: str[1] = (char) (((ch >> 24) | 0x80) & 0xBF); sl@0: str[0] = (char) ((ch >> 30) | 0xFC); sl@0: return 6; sl@0: } sl@0: #endif sl@0: } sl@0: sl@0: ch = 0xFFFD; sl@0: goto three; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharToUtfDString -- sl@0: * sl@0: * Convert the given Unicode string to UTF-8. sl@0: * sl@0: * Results: sl@0: * The return value is a pointer to the UTF-8 representation of the sl@0: * Unicode string. Storage for the return value is appended to the sl@0: * end of dsPtr. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C char * sl@0: Tcl_UniCharToUtfDString(wString, numChars, dsPtr) sl@0: CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */ sl@0: int numChars; /* Length of Unicode string in Tcl_UniChars sl@0: * (must be >= 0). */ sl@0: Tcl_DString *dsPtr; /* UTF-8 representation of string is sl@0: * appended to this previously initialized sl@0: * DString. */ sl@0: { sl@0: CONST Tcl_UniChar *w, *wEnd; sl@0: char *p, *string; sl@0: int oldLength; sl@0: sl@0: /* sl@0: * UTF-8 string length in bytes will be <= Unicode string length * sl@0: * TCL_UTF_MAX. sl@0: */ sl@0: sl@0: oldLength = Tcl_DStringLength(dsPtr); sl@0: Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX); sl@0: string = Tcl_DStringValue(dsPtr) + oldLength; sl@0: sl@0: p = string; sl@0: wEnd = wString + numChars; sl@0: for (w = wString; w < wEnd; ) { sl@0: p += Tcl_UniCharToUtf(*w, p); sl@0: w++; sl@0: } sl@0: Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); sl@0: sl@0: return string; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfToUniChar -- sl@0: * sl@0: * Extract the Tcl_UniChar represented by the UTF-8 string. Bad sl@0: * UTF-8 sequences are converted to valid Tcl_UniChars and processing sl@0: * continues. Equivalent to Plan 9 chartorune(). sl@0: * sl@0: * The caller must ensure that the source buffer is long enough that sl@0: * this routine does not run off the end and dereference non-existent sl@0: * memory looking for trail bytes. If the source buffer is known to sl@0: * be '\0' terminated, this cannot happen. Otherwise, the caller sl@0: * should call Tcl_UtfCharComplete() before calling this routine to sl@0: * ensure that enough bytes remain in the string. sl@0: * sl@0: * Results: sl@0: * *chPtr is filled with the Tcl_UniChar, and the return value is the sl@0: * number of bytes from the UTF-8 string that were consumed. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UtfToUniChar(str, chPtr) sl@0: register CONST char *str; /* The UTF-8 string. */ sl@0: register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented sl@0: * by the UTF-8 string. */ sl@0: { sl@0: register int byte; sl@0: sl@0: /* sl@0: * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. sl@0: */ sl@0: sl@0: byte = *((unsigned char *) str); sl@0: if (byte < 0xC0) { sl@0: /* sl@0: * Handles properly formed UTF-8 characters between 0x01 and 0x7F. sl@0: * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid sl@0: * characters representing themselves. sl@0: */ sl@0: sl@0: *chPtr = (Tcl_UniChar) byte; sl@0: return 1; sl@0: } else if (byte < 0xE0) { sl@0: if ((str[1] & 0xC0) == 0x80) { sl@0: /* sl@0: * Two-byte-character lead-byte followed by a trail-byte. sl@0: */ sl@0: sl@0: *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); sl@0: return 2; sl@0: } sl@0: /* sl@0: * A two-byte-character lead-byte not followed by trail-byte sl@0: * represents itself. sl@0: */ sl@0: sl@0: *chPtr = (Tcl_UniChar) byte; sl@0: return 1; sl@0: } else if (byte < 0xF0) { sl@0: if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) { sl@0: /* sl@0: * Three-byte-character lead byte followed by two trail bytes. sl@0: */ sl@0: sl@0: *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) sl@0: | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); sl@0: return 3; sl@0: } sl@0: /* sl@0: * A three-byte-character lead-byte not followed by two trail-bytes sl@0: * represents itself. sl@0: */ sl@0: sl@0: *chPtr = (Tcl_UniChar) byte; sl@0: return 1; sl@0: } sl@0: #if TCL_UTF_MAX > 3 sl@0: else { sl@0: int ch, total, trail; sl@0: sl@0: total = totalBytes[byte]; sl@0: trail = total - 1; sl@0: if (trail > 0) { sl@0: ch = byte & (0x3F >> trail); sl@0: do { sl@0: str++; sl@0: if ((*str & 0xC0) != 0x80) { sl@0: *chPtr = byte; sl@0: return 1; sl@0: } sl@0: ch <<= 6; sl@0: ch |= (*str & 0x3F); sl@0: trail--; sl@0: } while (trail > 0); sl@0: *chPtr = ch; sl@0: return total; sl@0: } sl@0: } sl@0: #endif sl@0: sl@0: *chPtr = (Tcl_UniChar) byte; sl@0: return 1; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfToUniCharDString -- sl@0: * sl@0: * Convert the UTF-8 string to Unicode. sl@0: * sl@0: * Results: sl@0: * The return value is a pointer to the Unicode representation of the sl@0: * UTF-8 string. Storage for the return value is appended to the sl@0: * end of dsPtr. The Unicode string is terminated with a Unicode sl@0: * NULL character. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C Tcl_UniChar * sl@0: Tcl_UtfToUniCharDString(string, length, dsPtr) sl@0: CONST char *string; /* UTF-8 string to convert to Unicode. */ sl@0: int length; /* Length of UTF-8 string in bytes, or -1 sl@0: * for strlen(). */ sl@0: Tcl_DString *dsPtr; /* Unicode representation of string is sl@0: * appended to this previously initialized sl@0: * DString. */ sl@0: { sl@0: Tcl_UniChar *w, *wString; sl@0: CONST char *p, *end; sl@0: int oldLength; sl@0: sl@0: if (length < 0) { sl@0: length = strlen(string); sl@0: } sl@0: sl@0: /* sl@0: * Unicode string length in Tcl_UniChars will be <= UTF-8 string length sl@0: * in bytes. sl@0: */ sl@0: sl@0: oldLength = Tcl_DStringLength(dsPtr); sl@0: Tcl_DStringSetLength(dsPtr, sl@0: (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); sl@0: wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); sl@0: sl@0: w = wString; sl@0: end = string + length; sl@0: for (p = string; p < end; ) { sl@0: p += TclUtfToUniChar(p, w); sl@0: w++; sl@0: } sl@0: *w = '\0'; sl@0: Tcl_DStringSetLength(dsPtr, sl@0: (oldLength + ((char *) w - (char *) wString))); sl@0: sl@0: return wString; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfCharComplete -- sl@0: * sl@0: * Determine if the UTF-8 string of the given length is long enough sl@0: * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the sl@0: * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune(). sl@0: * sl@0: * Results: sl@0: * The return value is 0 if the string is not long enough, non-zero sl@0: * otherwise. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UtfCharComplete(str, len) sl@0: CONST char *str; /* String to check if first few bytes sl@0: * contain a complete UTF-8 character. */ sl@0: int len; /* Length of above string in bytes. */ sl@0: { sl@0: int ch; sl@0: sl@0: ch = *((unsigned char *) str); sl@0: return len >= totalBytes[ch]; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_NumUtfChars -- sl@0: * sl@0: * Returns the number of characters (not bytes) in the UTF-8 string, sl@0: * not including the terminating NULL byte. This is equivalent to sl@0: * Plan 9 utflen() and utfnlen(). sl@0: * sl@0: * Results: sl@0: * As above. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_NumUtfChars(str, len) sl@0: register CONST char *str; /* The UTF-8 string to measure. */ sl@0: int len; /* The length of the string in bytes, or -1 sl@0: * for strlen(string). */ sl@0: { sl@0: Tcl_UniChar ch; sl@0: register Tcl_UniChar *chPtr = &ch; sl@0: register int i; sl@0: sl@0: /* sl@0: * The separate implementations are faster. sl@0: * sl@0: * Since this is a time-sensitive function, we also do the check for sl@0: * the single-byte char case specially. sl@0: */ sl@0: sl@0: i = 0; sl@0: if (len < 0) { sl@0: while (*str != '\0') { sl@0: str += TclUtfToUniChar(str, chPtr); sl@0: i++; sl@0: } sl@0: } else { sl@0: register int n; sl@0: sl@0: while (len > 0) { sl@0: if (UCHAR(*str) < 0xC0) { sl@0: len--; sl@0: str++; sl@0: } else { sl@0: n = Tcl_UtfToUniChar(str, chPtr); sl@0: len -= n; sl@0: str += n; sl@0: } sl@0: i++; sl@0: } sl@0: } sl@0: return i; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfFindFirst -- sl@0: * sl@0: * Returns a pointer to the first occurance of the given Tcl_UniChar sl@0: * in the NULL-terminated UTF-8 string. The NULL terminator is sl@0: * considered part of the UTF-8 string. Equivalent to Plan 9 sl@0: * utfrune(). sl@0: * sl@0: * Results: sl@0: * As above. If the Tcl_UniChar does not exist in the given string, sl@0: * the return value is NULL. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: EXPORT_C CONST char * sl@0: Tcl_UtfFindFirst(string, ch) sl@0: CONST char *string; /* The UTF-8 string to be searched. */ sl@0: int ch; /* The Tcl_UniChar to search for. */ sl@0: { sl@0: int len; sl@0: Tcl_UniChar find; sl@0: sl@0: while (1) { sl@0: len = TclUtfToUniChar(string, &find); sl@0: if (find == ch) { sl@0: return string; sl@0: } sl@0: if (*string == '\0') { sl@0: return NULL; sl@0: } sl@0: string += len; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfFindLast -- sl@0: * sl@0: * Returns a pointer to the last occurance of the given Tcl_UniChar sl@0: * in the NULL-terminated UTF-8 string. The NULL terminator is sl@0: * considered part of the UTF-8 string. Equivalent to Plan 9 sl@0: * utfrrune(). sl@0: * sl@0: * Results: sl@0: * As above. If the Tcl_UniChar does not exist in the given string, sl@0: * the return value is NULL. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C CONST char * sl@0: Tcl_UtfFindLast(string, ch) sl@0: CONST char *string; /* The UTF-8 string to be searched. */ sl@0: int ch; /* The Tcl_UniChar to search for. */ sl@0: { sl@0: int len; sl@0: Tcl_UniChar find; sl@0: CONST char *last; sl@0: sl@0: last = NULL; sl@0: while (1) { sl@0: len = TclUtfToUniChar(string, &find); sl@0: if (find == ch) { sl@0: last = string; sl@0: } sl@0: if (*string == '\0') { sl@0: break; sl@0: } sl@0: string += len; sl@0: } sl@0: return last; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfNext -- sl@0: * sl@0: * Given a pointer to some current location in a UTF-8 string, sl@0: * move forward one character. The caller must ensure that they sl@0: * are not asking for the next character after the last character sl@0: * in the string. sl@0: * sl@0: * Results: sl@0: * The return value is the pointer to the next character in sl@0: * the UTF-8 string. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C CONST char * sl@0: Tcl_UtfNext(str) sl@0: CONST char *str; /* The current location in the string. */ sl@0: { sl@0: Tcl_UniChar ch; sl@0: sl@0: return str + TclUtfToUniChar(str, &ch); sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfPrev -- sl@0: * sl@0: * Given a pointer to some current location in a UTF-8 string, sl@0: * move backwards one character. This works correctly when the sl@0: * pointer is in the middle of a UTF-8 character. sl@0: * sl@0: * Results: sl@0: * The return value is a pointer to the previous character in the sl@0: * UTF-8 string. If the current location was already at the sl@0: * beginning of the string, the return value will also be a sl@0: * pointer to the beginning of the string. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C CONST char * sl@0: Tcl_UtfPrev(str, start) sl@0: CONST char *str; /* The current location in the string. */ sl@0: CONST char *start; /* Pointer to the beginning of the sl@0: * string, to avoid going backwards too sl@0: * far. */ sl@0: { sl@0: CONST char *look; sl@0: int i, byte; sl@0: sl@0: str--; sl@0: look = str; sl@0: for (i = 0; i < TCL_UTF_MAX; i++) { sl@0: if (look < start) { sl@0: if (str < start) { sl@0: str = start; sl@0: } sl@0: break; sl@0: } sl@0: byte = *((unsigned char *) look); sl@0: if (byte < 0x80) { sl@0: break; sl@0: } sl@0: if (byte >= 0xC0) { sl@0: return look; sl@0: } sl@0: look--; sl@0: } sl@0: return str; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharAtIndex -- sl@0: * sl@0: * Returns the Unicode character represented at the specified sl@0: * character (not byte) position in the UTF-8 string. sl@0: * sl@0: * Results: sl@0: * As above. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C Tcl_UniChar sl@0: Tcl_UniCharAtIndex(src, index) sl@0: register CONST char *src; /* The UTF-8 string to dereference. */ sl@0: register int index; /* The position of the desired character. */ sl@0: { sl@0: Tcl_UniChar ch; sl@0: sl@0: while (index >= 0) { sl@0: index--; sl@0: src += TclUtfToUniChar(src, &ch); sl@0: } sl@0: return ch; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfAtIndex -- sl@0: * sl@0: * Returns a pointer to the specified character (not byte) position sl@0: * in the UTF-8 string. sl@0: * sl@0: * Results: sl@0: * As above. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C CONST char * sl@0: Tcl_UtfAtIndex(src, index) sl@0: register CONST char *src; /* The UTF-8 string. */ sl@0: register int index; /* The position of the desired character. */ sl@0: { sl@0: Tcl_UniChar ch; sl@0: sl@0: while (index > 0) { sl@0: index--; sl@0: src += TclUtfToUniChar(src, &ch); sl@0: } sl@0: return src; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfBackslash -- sl@0: * sl@0: * Figure out how to handle a backslash sequence. sl@0: * sl@0: * Results: sl@0: * Stores the bytes represented by the backslash sequence in dst and sl@0: * returns the number of bytes written to dst. At most TCL_UTF_MAX sl@0: * bytes are written to dst; dst must have been large enough to accept sl@0: * those bytes. If readPtr isn't NULL then it is filled in with a sl@0: * count of the number of bytes in the backslash sequence. sl@0: * sl@0: * Side effects: sl@0: * The maximum number of bytes it takes to represent a Unicode sl@0: * character in UTF-8 is guaranteed to be less than the number of sl@0: * bytes used to express the backslash sequence that represents sl@0: * that Unicode character. If the target buffer into which the sl@0: * caller is going to store the bytes that represent the Unicode sl@0: * character is at least as large as the source buffer from which sl@0: * the backslashed sequence was extracted, no buffer overruns should sl@0: * occur. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UtfBackslash(src, readPtr, dst) sl@0: CONST char *src; /* Points to the backslash character of sl@0: * a backslash sequence. */ sl@0: int *readPtr; /* Fill in with number of characters read sl@0: * from src, unless NULL. */ sl@0: char *dst; /* Filled with the bytes represented by the sl@0: * backslash sequence. */ sl@0: { sl@0: #define LINE_LENGTH 128 sl@0: int numRead; sl@0: int result; sl@0: sl@0: result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst); sl@0: if (numRead == LINE_LENGTH) { sl@0: /* We ate a whole line. Pay the price of a strlen() */ sl@0: result = TclParseBackslash(src, (int)strlen(src), &numRead, dst); sl@0: } sl@0: if (readPtr != NULL) { sl@0: *readPtr = numRead; sl@0: } sl@0: return result; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfToUpper -- sl@0: * sl@0: * Convert lowercase characters to uppercase characters in a UTF sl@0: * string in place. The conversion may shrink the UTF string. sl@0: * sl@0: * Results: sl@0: * Returns the number of bytes in the resulting string sl@0: * excluding the trailing null. sl@0: * sl@0: * Side effects: sl@0: * Writes a terminating null after the last converted character. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UtfToUpper(str) sl@0: char *str; /* String to convert in place. */ sl@0: { sl@0: Tcl_UniChar ch, upChar; sl@0: char *src, *dst; sl@0: int bytes; sl@0: sl@0: /* sl@0: * Iterate over the string until we hit the terminating null. sl@0: */ sl@0: sl@0: src = dst = str; sl@0: while (*src) { sl@0: bytes = TclUtfToUniChar(src, &ch); sl@0: upChar = Tcl_UniCharToUpper(ch); sl@0: sl@0: /* sl@0: * To keep badly formed Utf strings from getting inflated by sl@0: * the conversion (thereby causing a segfault), only copy the sl@0: * upper case char to dst if its size is <= the original char. sl@0: */ sl@0: sl@0: if (bytes < UtfCount(upChar)) { sl@0: memcpy(dst, src, (size_t) bytes); sl@0: dst += bytes; sl@0: } else { sl@0: dst += Tcl_UniCharToUtf(upChar, dst); sl@0: } sl@0: src += bytes; sl@0: } sl@0: *dst = '\0'; sl@0: return (dst - str); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfToLower -- sl@0: * sl@0: * Convert uppercase characters to lowercase characters in a UTF sl@0: * string in place. The conversion may shrink the UTF string. sl@0: * sl@0: * Results: sl@0: * Returns the number of bytes in the resulting string sl@0: * excluding the trailing null. sl@0: * sl@0: * Side effects: sl@0: * Writes a terminating null after the last converted character. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UtfToLower(str) sl@0: char *str; /* String to convert in place. */ sl@0: { sl@0: Tcl_UniChar ch, lowChar; sl@0: char *src, *dst; sl@0: int bytes; sl@0: sl@0: /* sl@0: * Iterate over the string until we hit the terminating null. sl@0: */ sl@0: sl@0: src = dst = str; sl@0: while (*src) { sl@0: bytes = TclUtfToUniChar(src, &ch); sl@0: lowChar = Tcl_UniCharToLower(ch); sl@0: sl@0: /* sl@0: * To keep badly formed Utf strings from getting inflated by sl@0: * the conversion (thereby causing a segfault), only copy the sl@0: * lower case char to dst if its size is <= the original char. sl@0: */ sl@0: sl@0: if (bytes < UtfCount(lowChar)) { sl@0: memcpy(dst, src, (size_t) bytes); sl@0: dst += bytes; sl@0: } else { sl@0: dst += Tcl_UniCharToUtf(lowChar, dst); sl@0: } sl@0: src += bytes; sl@0: } sl@0: *dst = '\0'; sl@0: return (dst - str); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfToTitle -- sl@0: * sl@0: * Changes the first character of a UTF string to title case or sl@0: * uppercase and the rest of the string to lowercase. The sl@0: * conversion happens in place and may shrink the UTF string. sl@0: * sl@0: * Results: sl@0: * Returns the number of bytes in the resulting string sl@0: * excluding the trailing null. sl@0: * sl@0: * Side effects: sl@0: * Writes a terminating null after the last converted character. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UtfToTitle(str) sl@0: char *str; /* String to convert in place. */ sl@0: { sl@0: Tcl_UniChar ch, titleChar, lowChar; sl@0: char *src, *dst; sl@0: int bytes; sl@0: sl@0: /* sl@0: * Capitalize the first character and then lowercase the rest of the sl@0: * characters until we get to a null. sl@0: */ sl@0: sl@0: src = dst = str; sl@0: sl@0: if (*src) { sl@0: bytes = TclUtfToUniChar(src, &ch); sl@0: titleChar = Tcl_UniCharToTitle(ch); sl@0: sl@0: if (bytes < UtfCount(titleChar)) { sl@0: memcpy(dst, src, (size_t) bytes); sl@0: dst += bytes; sl@0: } else { sl@0: dst += Tcl_UniCharToUtf(titleChar, dst); sl@0: } sl@0: src += bytes; sl@0: } sl@0: while (*src) { sl@0: bytes = TclUtfToUniChar(src, &ch); sl@0: lowChar = Tcl_UniCharToLower(ch); sl@0: sl@0: if (bytes < UtfCount(lowChar)) { sl@0: memcpy(dst, src, (size_t) bytes); sl@0: dst += bytes; sl@0: } else { sl@0: dst += Tcl_UniCharToUtf(lowChar, dst); sl@0: } sl@0: src += bytes; sl@0: } sl@0: *dst = '\0'; sl@0: return (dst - str); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * TclpUtfNcmp2 -- sl@0: * sl@0: * Compare at most n bytes of utf-8 strings cs and ct. Both cs sl@0: * and ct are assumed to be at least n bytes long. sl@0: * sl@0: * Results: sl@0: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: int sl@0: TclpUtfNcmp2(cs, ct, n) sl@0: CONST char *cs; /* UTF string to compare to ct. */ sl@0: CONST char *ct; /* UTF string cs is compared to. */ sl@0: unsigned long n; /* Number of *bytes* to compare. */ sl@0: { sl@0: /* sl@0: * We can't simply call 'memcmp(cs, ct, n);' because we need to check sl@0: * for Tcl's \xC0\x80 non-utf-8 null encoding. sl@0: * Otherwise utf-8 lexes fine in the strcmp manner. sl@0: */ sl@0: register int result = 0; sl@0: sl@0: for ( ; n != 0; n--, cs++, ct++) { sl@0: if (*cs != *ct) { sl@0: result = UCHAR(*cs) - UCHAR(*ct); sl@0: break; sl@0: } sl@0: } sl@0: if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { sl@0: unsigned char c1, c2; sl@0: c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); sl@0: c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct); sl@0: result = (c1 - c2); sl@0: } sl@0: return result; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfNcmp -- sl@0: * sl@0: * Compare at most n UTF chars of string cs to string ct. Both cs sl@0: * and ct are assumed to be at least n UTF chars long. sl@0: * sl@0: * Results: sl@0: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UtfNcmp(cs, ct, n) sl@0: CONST char *cs; /* UTF string to compare to ct. */ sl@0: CONST char *ct; /* UTF string cs is compared to. */ sl@0: unsigned long n; /* Number of UTF chars to compare. */ sl@0: { sl@0: Tcl_UniChar ch1, ch2; sl@0: /* sl@0: * Cannot use 'memcmp(cs, ct, n);' as byte representation of sl@0: * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte sl@0: * representation of \u0001 (the byte 0x01.) sl@0: */ sl@0: while (n-- > 0) { sl@0: /* sl@0: * n must be interpreted as chars, not bytes. sl@0: * This should be called only when both strings are of sl@0: * at least n chars long (no need for \0 check) sl@0: */ sl@0: cs += TclUtfToUniChar(cs, &ch1); sl@0: ct += TclUtfToUniChar(ct, &ch2); sl@0: if (ch1 != ch2) { sl@0: return (ch1 - ch2); sl@0: } sl@0: } sl@0: return 0; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UtfNcasecmp -- sl@0: * sl@0: * Compare at most n UTF chars of string cs to string ct case sl@0: * insensitive. Both cs and ct are assumed to be at least n sl@0: * UTF chars long. sl@0: * sl@0: * Results: sl@0: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UtfNcasecmp(cs, ct, n) sl@0: CONST char *cs; /* UTF string to compare to ct. */ sl@0: CONST char *ct; /* UTF string cs is compared to. */ sl@0: unsigned long n; /* Number of UTF chars to compare. */ sl@0: { sl@0: Tcl_UniChar ch1, ch2; sl@0: while (n-- > 0) { sl@0: /* sl@0: * n must be interpreted as chars, not bytes. sl@0: * This should be called only when both strings are of sl@0: * at least n chars long (no need for \0 check) sl@0: */ sl@0: cs += TclUtfToUniChar(cs, &ch1); sl@0: ct += TclUtfToUniChar(ct, &ch2); sl@0: if (ch1 != ch2) { sl@0: ch1 = Tcl_UniCharToLower(ch1); sl@0: ch2 = Tcl_UniCharToLower(ch2); sl@0: if (ch1 != ch2) { sl@0: return (ch1 - ch2); sl@0: } sl@0: } sl@0: } sl@0: return 0; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharToUpper -- sl@0: * sl@0: * Compute the uppercase equivalent of the given Unicode character. sl@0: * sl@0: * Results: sl@0: * Returns the uppercase Unicode character. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C Tcl_UniChar sl@0: Tcl_UniCharToUpper(ch) sl@0: int ch; /* Unicode character to convert. */ sl@0: { sl@0: int info = GetUniCharInfo(ch); sl@0: sl@0: if (GetCaseType(info) & 0x04) { sl@0: return (Tcl_UniChar) (ch - GetDelta(info)); sl@0: } else { sl@0: return ch; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharToLower -- sl@0: * sl@0: * Compute the lowercase equivalent of the given Unicode character. sl@0: * sl@0: * Results: sl@0: * Returns the lowercase Unicode character. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C Tcl_UniChar sl@0: Tcl_UniCharToLower(ch) sl@0: int ch; /* Unicode character to convert. */ sl@0: { sl@0: int info = GetUniCharInfo(ch); sl@0: sl@0: if (GetCaseType(info) & 0x02) { sl@0: return (Tcl_UniChar) (ch + GetDelta(info)); sl@0: } else { sl@0: return ch; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharToTitle -- sl@0: * sl@0: * Compute the titlecase equivalent of the given Unicode character. sl@0: * sl@0: * Results: sl@0: * Returns the titlecase Unicode character. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C Tcl_UniChar sl@0: Tcl_UniCharToTitle(ch) sl@0: int ch; /* Unicode character to convert. */ sl@0: { sl@0: int info = GetUniCharInfo(ch); sl@0: int mode = GetCaseType(info); sl@0: sl@0: if (mode & 0x1) { sl@0: /* sl@0: * Subtract or add one depending on the original case. sl@0: */ sl@0: sl@0: return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); sl@0: } else if (mode == 0x4) { sl@0: return (Tcl_UniChar) (ch - GetDelta(info)); sl@0: } else { sl@0: return ch; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharLen -- sl@0: * sl@0: * Find the length of a UniChar string. The str input must be null sl@0: * terminated. sl@0: * sl@0: * Results: sl@0: * Returns the length of str in UniChars (not bytes). sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharLen(str) sl@0: CONST Tcl_UniChar *str; /* Unicode string to find length of. */ sl@0: { sl@0: int len = 0; sl@0: sl@0: while (*str != '\0') { sl@0: len++; sl@0: str++; sl@0: } sl@0: return len; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharNcmp -- sl@0: * sl@0: * Compare at most n unichars of string cs to string ct. Both cs sl@0: * and ct are assumed to be at least n unichars long. sl@0: * sl@0: * Results: sl@0: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharNcmp(cs, ct, n) sl@0: CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */ sl@0: CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ sl@0: unsigned long n; /* Number of unichars to compare. */ sl@0: { sl@0: #ifdef WORDS_BIGENDIAN sl@0: /* sl@0: * We are definitely on a big-endian machine; memcmp() is safe sl@0: */ sl@0: return memcmp(cs, ct, n*sizeof(Tcl_UniChar)); sl@0: sl@0: #else /* !WORDS_BIGENDIAN */ sl@0: /* sl@0: * We can't simply call memcmp() because that is not lexically correct. sl@0: */ sl@0: for ( ; n != 0; cs++, ct++, n--) { sl@0: if (*cs != *ct) { sl@0: return (*cs - *ct); sl@0: } sl@0: } sl@0: return 0; sl@0: #endif /* WORDS_BIGENDIAN */ sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharNcasecmp -- sl@0: * sl@0: * Compare at most n unichars of string cs to string ct case sl@0: * insensitive. Both cs and ct are assumed to be at least n sl@0: * unichars long. sl@0: * sl@0: * Results: sl@0: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharNcasecmp(cs, ct, n) sl@0: CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */ sl@0: CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ sl@0: unsigned long n; /* Number of unichars to compare. */ sl@0: { sl@0: for ( ; n != 0; n--, cs++, ct++) { sl@0: if (*cs != *ct) { sl@0: Tcl_UniChar lcs = Tcl_UniCharToLower(*cs); sl@0: Tcl_UniChar lct = Tcl_UniCharToLower(*ct); sl@0: if (lcs != lct) { sl@0: return (lcs - lct); sl@0: } sl@0: } sl@0: } sl@0: return 0; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsAlnum -- sl@0: * sl@0: * Test if a character is an alphanumeric Unicode character. sl@0: * sl@0: * Results: sl@0: * Returns 1 if character is alphanumeric. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsAlnum(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); sl@0: sl@0: return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsAlpha -- sl@0: * sl@0: * Test if a character is an alphabetic Unicode character. sl@0: * sl@0: * Results: sl@0: * Returns 1 if character is alphabetic. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsAlpha(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); sl@0: return ((ALPHA_BITS >> category) & 1); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsControl -- sl@0: * sl@0: * Test if a character is a Unicode control character. sl@0: * sl@0: * Results: sl@0: * Returns non-zero if character is a control. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsControl(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsDigit -- sl@0: * sl@0: * Test if a character is a numeric Unicode character. sl@0: * sl@0: * Results: sl@0: * Returns non-zero if character is a digit. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsDigit(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) sl@0: == DECIMAL_DIGIT_NUMBER); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsGraph -- sl@0: * sl@0: * Test if a character is any Unicode print character except space. sl@0: * sl@0: * Results: sl@0: * Returns non-zero if character is printable, but not space. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsGraph(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); sl@0: return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsLower -- sl@0: * sl@0: * Test if a character is a lowercase Unicode character. sl@0: * sl@0: * Results: sl@0: * Returns non-zero if character is lowercase. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsLower(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsPrint -- sl@0: * sl@0: * Test if a character is a Unicode print character. sl@0: * sl@0: * Results: sl@0: * Returns non-zero if character is printable. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsPrint(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); sl@0: return ((PRINT_BITS >> category) & 1); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsPunct -- sl@0: * sl@0: * Test if a character is a Unicode punctuation character. sl@0: * sl@0: * Results: sl@0: * Returns non-zero if character is punct. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsPunct(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); sl@0: return ((PUNCT_BITS >> category) & 1); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsSpace -- sl@0: * sl@0: * Test if a character is a whitespace Unicode character. sl@0: * sl@0: * Results: sl@0: * Returns non-zero if character is a space. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsSpace(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: register int category; sl@0: sl@0: /* sl@0: * If the character is within the first 127 characters, just use the sl@0: * standard C function, otherwise consult the Unicode table. sl@0: */ sl@0: sl@0: if (ch < 0x80) { sl@0: return isspace(UCHAR(ch)); /* INTL: ISO space */ sl@0: } else { sl@0: category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); sl@0: return ((SPACE_BITS >> category) & 1); sl@0: } sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsUpper -- sl@0: * sl@0: * Test if a character is a uppercase Unicode character. sl@0: * sl@0: * Results: sl@0: * Returns non-zero if character is uppercase. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsUpper(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharIsWordChar -- sl@0: * sl@0: * Test if a character is alphanumeric or a connector punctuation sl@0: * mark. sl@0: * sl@0: * Results: sl@0: * Returns 1 if character is a word character. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharIsWordChar(ch) sl@0: int ch; /* Unicode character to test. */ sl@0: { sl@0: register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); sl@0: sl@0: return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_UniCharCaseMatch -- sl@0: * sl@0: * See if a particular Unicode string matches a particular pattern. sl@0: * Allows case insensitivity. This is the Unicode equivalent of sl@0: * the char* Tcl_StringCaseMatch. The UniChar strings must be sl@0: * NULL-terminated. This has no provision for counted UniChar sl@0: * strings, thus should not be used where NULLs are expected in the sl@0: * UniChar string. Use TclUniCharMatch where possible. sl@0: * sl@0: * Results: sl@0: * The return value is 1 if string matches pattern, and sl@0: * 0 otherwise. The matching operation permits the following sl@0: * special characters in the pattern: *?\[] (see the manual sl@0: * entry for details on what these mean). sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_UniCharCaseMatch(string, pattern, nocase) sl@0: CONST Tcl_UniChar *string; /* Unicode String. */ sl@0: CONST Tcl_UniChar *pattern; /* Pattern, which may contain special sl@0: * characters. */ sl@0: int nocase; /* 0 for case sensitive, 1 for insensitive */ sl@0: { sl@0: Tcl_UniChar ch1, p; sl@0: sl@0: while (1) { sl@0: p = *pattern; sl@0: sl@0: /* sl@0: * See if we're at the end of both the pattern and the string. If sl@0: * so, we succeeded. If we're at the end of the pattern but not at sl@0: * the end of the string, we failed. sl@0: */ sl@0: sl@0: if (p == 0) { sl@0: return (*string == 0); sl@0: } sl@0: if ((*string == 0) && (p != '*')) { sl@0: return 0; sl@0: } sl@0: sl@0: /* sl@0: * Check for a "*" as the next pattern character. It matches any sl@0: * substring. We handle this by skipping all the characters up to the sl@0: * next matching one in the pattern, and then calling ourselves sl@0: * recursively for each postfix of string, until either we match or we sl@0: * reach the end of the string. sl@0: */ sl@0: sl@0: if (p == '*') { sl@0: /* sl@0: * Skip all successive *'s in the pattern sl@0: */ sl@0: while (*(++pattern) == '*') {} sl@0: p = *pattern; sl@0: if (p == 0) { sl@0: return 1; sl@0: } sl@0: if (nocase) { sl@0: p = Tcl_UniCharToLower(p); sl@0: } sl@0: while (1) { sl@0: /* sl@0: * Optimization for matching - cruise through the string sl@0: * quickly if the next char in the pattern isn't a special sl@0: * character sl@0: */ sl@0: if ((p != '[') && (p != '?') && (p != '\\')) { sl@0: if (nocase) { sl@0: while (*string && (p != *string) sl@0: && (p != Tcl_UniCharToLower(*string))) { sl@0: string++; sl@0: } sl@0: } else { sl@0: while (*string && (p != *string)) { string++; } sl@0: } sl@0: } sl@0: if (Tcl_UniCharCaseMatch(string, pattern, nocase)) { sl@0: return 1; sl@0: } sl@0: if (*string == 0) { sl@0: return 0; sl@0: } sl@0: string++; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: * Check for a "?" as the next pattern character. It matches sl@0: * any single character. sl@0: */ sl@0: sl@0: if (p == '?') { sl@0: pattern++; sl@0: string++; sl@0: continue; sl@0: } sl@0: sl@0: /* sl@0: * Check for a "[" as the next pattern character. It is followed sl@0: * by a list of characters that are acceptable, or by a range sl@0: * (two characters separated by "-"). sl@0: */ sl@0: sl@0: if (p == '[') { sl@0: Tcl_UniChar startChar, endChar; sl@0: sl@0: pattern++; sl@0: ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); sl@0: string++; sl@0: while (1) { sl@0: if ((*pattern == ']') || (*pattern == 0)) { sl@0: return 0; sl@0: } sl@0: startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); sl@0: pattern++; sl@0: if (*pattern == '-') { sl@0: pattern++; sl@0: if (*pattern == 0) { sl@0: return 0; sl@0: } sl@0: endChar = (nocase ? Tcl_UniCharToLower(*pattern) sl@0: : *pattern); sl@0: pattern++; sl@0: if (((startChar <= ch1) && (ch1 <= endChar)) sl@0: || ((endChar <= ch1) && (ch1 <= startChar))) { sl@0: /* sl@0: * Matches ranges of form [a-z] or [z-a]. sl@0: */ sl@0: break; sl@0: } sl@0: } else if (startChar == ch1) { sl@0: break; sl@0: } sl@0: } sl@0: while (*pattern != ']') { sl@0: if (*pattern == 0) { sl@0: pattern--; sl@0: break; sl@0: } sl@0: pattern++; sl@0: } sl@0: pattern++; sl@0: continue; sl@0: } sl@0: sl@0: /* sl@0: * If the next pattern character is '\', just strip off the '\' sl@0: * so we do exact matching on the character that follows. sl@0: */ sl@0: sl@0: if (p == '\\') { sl@0: if (*(++pattern) == '\0') { sl@0: return 0; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: * There's no special character. Just make sure that the next sl@0: * bytes of each string match. sl@0: */ sl@0: sl@0: if (nocase) { sl@0: if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { sl@0: return 0; sl@0: } sl@0: } else if (*string != *pattern) { sl@0: return 0; sl@0: } sl@0: string++; sl@0: pattern++; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * TclUniCharMatch -- sl@0: * sl@0: * See if a particular Unicode string matches a particular pattern. sl@0: * Allows case insensitivity. This is the Unicode equivalent of the sl@0: * char* Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch sl@0: * uses counted Strings, so embedded NULLs are allowed. sl@0: * sl@0: * Results: sl@0: * The return value is 1 if string matches pattern, and sl@0: * 0 otherwise. The matching operation permits the following sl@0: * special characters in the pattern: *?\[] (see the manual sl@0: * entry for details on what these mean). sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: int sl@0: TclUniCharMatch(string, strLen, pattern, ptnLen, nocase) sl@0: CONST Tcl_UniChar *string; /* Unicode String. */ sl@0: int strLen; /* length of String */ sl@0: CONST Tcl_UniChar *pattern; /* Pattern, which may contain special sl@0: * characters. */ sl@0: int ptnLen; /* length of Pattern */ sl@0: int nocase; /* 0 for case sensitive, 1 for insensitive */ sl@0: { sl@0: CONST Tcl_UniChar *stringEnd, *patternEnd; sl@0: Tcl_UniChar p; sl@0: sl@0: stringEnd = string + strLen; sl@0: patternEnd = pattern + ptnLen; sl@0: sl@0: while (1) { sl@0: /* sl@0: * See if we're at the end of both the pattern and the string. If sl@0: * so, we succeeded. If we're at the end of the pattern but not at sl@0: * the end of the string, we failed. sl@0: */ sl@0: sl@0: if (pattern == patternEnd) { sl@0: return (string == stringEnd); sl@0: } sl@0: p = *pattern; sl@0: if ((string == stringEnd) && (p != '*')) { sl@0: return 0; sl@0: } sl@0: sl@0: /* sl@0: * Check for a "*" as the next pattern character. It matches any sl@0: * substring. We handle this by skipping all the characters up to the sl@0: * next matching one in the pattern, and then calling ourselves sl@0: * recursively for each postfix of string, until either we match or we sl@0: * reach the end of the string. sl@0: */ sl@0: sl@0: if (p == '*') { sl@0: /* sl@0: * Skip all successive *'s in the pattern sl@0: */ sl@0: while (*(++pattern) == '*') {} sl@0: if (pattern == patternEnd) { sl@0: return 1; sl@0: } sl@0: p = *pattern; sl@0: if (nocase) { sl@0: p = Tcl_UniCharToLower(p); sl@0: } sl@0: while (1) { sl@0: /* sl@0: * Optimization for matching - cruise through the string sl@0: * quickly if the next char in the pattern isn't a special sl@0: * character sl@0: */ sl@0: if ((p != '[') && (p != '?') && (p != '\\')) { sl@0: if (nocase) { sl@0: while ((string < stringEnd) && (p != *string) sl@0: && (p != Tcl_UniCharToLower(*string))) { sl@0: string++; sl@0: } sl@0: } else { sl@0: while ((string < stringEnd) && (p != *string)) { sl@0: string++; sl@0: } sl@0: } sl@0: } sl@0: if (TclUniCharMatch(string, stringEnd - string, sl@0: pattern, patternEnd - pattern, nocase)) { sl@0: return 1; sl@0: } sl@0: if (string == stringEnd) { sl@0: return 0; sl@0: } sl@0: string++; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: * Check for a "?" as the next pattern character. It matches sl@0: * any single character. sl@0: */ sl@0: sl@0: if (p == '?') { sl@0: pattern++; sl@0: string++; sl@0: continue; sl@0: } sl@0: sl@0: /* sl@0: * Check for a "[" as the next pattern character. It is followed sl@0: * by a list of characters that are acceptable, or by a range sl@0: * (two characters separated by "-"). sl@0: */ sl@0: sl@0: if (p == '[') { sl@0: Tcl_UniChar ch1, startChar, endChar; sl@0: sl@0: pattern++; sl@0: ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); sl@0: string++; sl@0: while (1) { sl@0: if ((*pattern == ']') || (pattern == patternEnd)) { sl@0: return 0; sl@0: } sl@0: startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); sl@0: pattern++; sl@0: if (*pattern == '-') { sl@0: pattern++; sl@0: if (pattern == patternEnd) { sl@0: return 0; sl@0: } sl@0: endChar = (nocase ? Tcl_UniCharToLower(*pattern) sl@0: : *pattern); sl@0: pattern++; sl@0: if (((startChar <= ch1) && (ch1 <= endChar)) sl@0: || ((endChar <= ch1) && (ch1 <= startChar))) { sl@0: /* sl@0: * Matches ranges of form [a-z] or [z-a]. sl@0: */ sl@0: break; sl@0: } sl@0: } else if (startChar == ch1) { sl@0: break; sl@0: } sl@0: } sl@0: while (*pattern != ']') { sl@0: if (pattern == patternEnd) { sl@0: pattern--; sl@0: break; sl@0: } sl@0: pattern++; sl@0: } sl@0: pattern++; sl@0: continue; sl@0: } sl@0: sl@0: /* sl@0: * If the next pattern character is '\', just strip off the '\' sl@0: * so we do exact matching on the character that follows. sl@0: */ sl@0: sl@0: if (p == '\\') { sl@0: if (++pattern == patternEnd) { sl@0: return 0; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: * There's no special character. Just make sure that the next sl@0: * bytes of each string match. sl@0: */ sl@0: sl@0: if (nocase) { sl@0: if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { sl@0: return 0; sl@0: } sl@0: } else if (*string != *pattern) { sl@0: return 0; sl@0: } sl@0: string++; sl@0: pattern++; sl@0: } sl@0: }