os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/generic/tclUtf.c
Update contrib.
4 * Routines for manipulating UTF-8 strings.
6 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
7 * Portions Copyright (c) 2007-2008 Nokia Corporation and/or its subsidiaries. All rights reserved.
9 * See the file "license.terms" for information on usage and redistribution
10 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
12 * RCS: @(#) $Id: tclUtf.c,v 1.30.2.3 2005/09/07 14:35:56 dgp Exp $
18 * Include the static character classification tables and macros.
21 #include "tclUniData.c"
24 * The following macros are used for fast character category tests. The
25 * x_BITS values are shifted right by the category value to determine whether
26 * the given category is included in the set.
29 #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
30 | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
32 #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
34 #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
35 | (1 << PARAGRAPH_SEPARATOR))
37 #define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
39 #define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
40 (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
41 (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
42 (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
43 (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
44 (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
45 (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
46 (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
47 (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
49 #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
50 (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
51 (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
52 (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
55 * Unicode characters less than this value are represented by themselves
59 #define UNICODE_SELF 0x80
62 * The following structures are used when mapping between Unicode (UCS-2)
66 static CONST unsigned char totalBytes[256] = {
67 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
68 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
69 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
70 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
71 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
72 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
73 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
74 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
93 * Procedures used only in this module.
96 static int UtfCount _ANSI_ARGS_((int ch));
100 *---------------------------------------------------------------------------
104 * Find the number of bytes in the Utf character "ch".
107 * The return value is the number of bytes in the Utf character "ch".
112 *---------------------------------------------------------------------------
117 int ch; /* The Tcl_UniChar whose size is returned. */
119 if ((ch > 0) && (ch < UNICODE_SELF)) {
129 if (ch <= 0x1FFFFF) {
132 if (ch <= 0x3FFFFFF) {
135 if (ch <= 0x7FFFFFFF) {
143 *---------------------------------------------------------------------------
145 * Tcl_UniCharToUtf --
147 * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
148 * provided buffer. Equivalent to Plan 9 runetochar().
151 * The return value is the number of bytes in the buffer that
157 *---------------------------------------------------------------------------
161 Tcl_UniCharToUtf(ch, str)
162 int ch; /* The Tcl_UniChar to be stored in the
164 char *str; /* Buffer in which the UTF-8 representation
165 * of the Tcl_UniChar is stored. Buffer must
166 * be large enough to hold the UTF-8 character
167 * (at most TCL_UTF_MAX bytes). */
169 if ((ch > 0) && (ch < UNICODE_SELF)) {
175 str[1] = (char) ((ch | 0x80) & 0xBF);
176 str[0] = (char) ((ch >> 6) | 0xC0);
181 str[2] = (char) ((ch | 0x80) & 0xBF);
182 str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
183 str[0] = (char) ((ch >> 12) | 0xE0);
188 if (ch <= 0x1FFFFF) {
189 str[3] = (char) ((ch | 0x80) & 0xBF);
190 str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
191 str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
192 str[0] = (char) ((ch >> 18) | 0xF0);
195 if (ch <= 0x3FFFFFF) {
196 str[4] = (char) ((ch | 0x80) & 0xBF);
197 str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
198 str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
199 str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
200 str[0] = (char) ((ch >> 24) | 0xF8);
203 if (ch <= 0x7FFFFFFF) {
204 str[5] = (char) ((ch | 0x80) & 0xBF);
205 str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
206 str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
207 str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
208 str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
209 str[0] = (char) ((ch >> 30) | 0xFC);
220 *---------------------------------------------------------------------------
222 * Tcl_UniCharToUtfDString --
224 * Convert the given Unicode string to UTF-8.
227 * The return value is a pointer to the UTF-8 representation of the
228 * Unicode string. Storage for the return value is appended to the
234 *---------------------------------------------------------------------------
238 Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
239 CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
240 int numChars; /* Length of Unicode string in Tcl_UniChars
242 Tcl_DString *dsPtr; /* UTF-8 representation of string is
243 * appended to this previously initialized
246 CONST Tcl_UniChar *w, *wEnd;
251 * UTF-8 string length in bytes will be <= Unicode string length *
255 oldLength = Tcl_DStringLength(dsPtr);
256 Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
257 string = Tcl_DStringValue(dsPtr) + oldLength;
260 wEnd = wString + numChars;
261 for (w = wString; w < wEnd; ) {
262 p += Tcl_UniCharToUtf(*w, p);
265 Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
271 *---------------------------------------------------------------------------
273 * Tcl_UtfToUniChar --
275 * Extract the Tcl_UniChar represented by the UTF-8 string. Bad
276 * UTF-8 sequences are converted to valid Tcl_UniChars and processing
277 * continues. Equivalent to Plan 9 chartorune().
279 * The caller must ensure that the source buffer is long enough that
280 * this routine does not run off the end and dereference non-existent
281 * memory looking for trail bytes. If the source buffer is known to
282 * be '\0' terminated, this cannot happen. Otherwise, the caller
283 * should call Tcl_UtfCharComplete() before calling this routine to
284 * ensure that enough bytes remain in the string.
287 * *chPtr is filled with the Tcl_UniChar, and the return value is the
288 * number of bytes from the UTF-8 string that were consumed.
293 *---------------------------------------------------------------------------
297 Tcl_UtfToUniChar(str, chPtr)
298 register CONST char *str; /* The UTF-8 string. */
299 register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
300 * by the UTF-8 string. */
305 * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
308 byte = *((unsigned char *) str);
311 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
312 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
313 * characters representing themselves.
316 *chPtr = (Tcl_UniChar) byte;
318 } else if (byte < 0xE0) {
319 if ((str[1] & 0xC0) == 0x80) {
321 * Two-byte-character lead-byte followed by a trail-byte.
324 *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
328 * A two-byte-character lead-byte not followed by trail-byte
332 *chPtr = (Tcl_UniChar) byte;
334 } else if (byte < 0xF0) {
335 if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
337 * Three-byte-character lead byte followed by two trail bytes.
340 *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
341 | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
345 * A three-byte-character lead-byte not followed by two trail-bytes
349 *chPtr = (Tcl_UniChar) byte;
354 int ch, total, trail;
356 total = totalBytes[byte];
359 ch = byte & (0x3F >> trail);
362 if ((*str & 0xC0) != 0x80) {
376 *chPtr = (Tcl_UniChar) byte;
381 *---------------------------------------------------------------------------
383 * Tcl_UtfToUniCharDString --
385 * Convert the UTF-8 string to Unicode.
388 * The return value is a pointer to the Unicode representation of the
389 * UTF-8 string. Storage for the return value is appended to the
390 * end of dsPtr. The Unicode string is terminated with a Unicode
396 *---------------------------------------------------------------------------
399 EXPORT_C Tcl_UniChar *
400 Tcl_UtfToUniCharDString(string, length, dsPtr)
401 CONST char *string; /* UTF-8 string to convert to Unicode. */
402 int length; /* Length of UTF-8 string in bytes, or -1
404 Tcl_DString *dsPtr; /* Unicode representation of string is
405 * appended to this previously initialized
408 Tcl_UniChar *w, *wString;
413 length = strlen(string);
417 * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
421 oldLength = Tcl_DStringLength(dsPtr);
422 Tcl_DStringSetLength(dsPtr,
423 (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
424 wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
427 end = string + length;
428 for (p = string; p < end; ) {
429 p += TclUtfToUniChar(p, w);
433 Tcl_DStringSetLength(dsPtr,
434 (oldLength + ((char *) w - (char *) wString)));
440 *---------------------------------------------------------------------------
442 * Tcl_UtfCharComplete --
444 * Determine if the UTF-8 string of the given length is long enough
445 * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the
446 * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune().
449 * The return value is 0 if the string is not long enough, non-zero
455 *---------------------------------------------------------------------------
459 Tcl_UtfCharComplete(str, len)
460 CONST char *str; /* String to check if first few bytes
461 * contain a complete UTF-8 character. */
462 int len; /* Length of above string in bytes. */
466 ch = *((unsigned char *) str);
467 return len >= totalBytes[ch];
471 *---------------------------------------------------------------------------
475 * Returns the number of characters (not bytes) in the UTF-8 string,
476 * not including the terminating NULL byte. This is equivalent to
477 * Plan 9 utflen() and utfnlen().
485 *---------------------------------------------------------------------------
489 Tcl_NumUtfChars(str, len)
490 register CONST char *str; /* The UTF-8 string to measure. */
491 int len; /* The length of the string in bytes, or -1
492 * for strlen(string). */
495 register Tcl_UniChar *chPtr = &ch;
499 * The separate implementations are faster.
501 * Since this is a time-sensitive function, we also do the check for
502 * the single-byte char case specially.
507 while (*str != '\0') {
508 str += TclUtfToUniChar(str, chPtr);
515 if (UCHAR(*str) < 0xC0) {
519 n = Tcl_UtfToUniChar(str, chPtr);
530 *---------------------------------------------------------------------------
532 * Tcl_UtfFindFirst --
534 * Returns a pointer to the first occurrence of the given Tcl_UniChar
535 * in the NULL-terminated UTF-8 string. The NULL terminator is
536 * considered part of the UTF-8 string. Equivalent to Plan 9
540 * As above. If the Tcl_UniChar does not exist in the given string,
541 * the return value is NULL.
546 *---------------------------------------------------------------------------
548 EXPORT_C CONST char *
549 Tcl_UtfFindFirst(string, ch)
550 CONST char *string; /* The UTF-8 string to be searched. */
551 int ch; /* The Tcl_UniChar to search for. */
557 len = TclUtfToUniChar(string, &find);
561 if (*string == '\0') {
569 *---------------------------------------------------------------------------
573 * Returns a pointer to the last occurrence of the given Tcl_UniChar
574 * in the NULL-terminated UTF-8 string. The NULL terminator is
575 * considered part of the UTF-8 string. Equivalent to Plan 9
579 * As above. If the Tcl_UniChar does not exist in the given string,
580 * the return value is NULL.
585 *---------------------------------------------------------------------------
588 EXPORT_C CONST char *
589 Tcl_UtfFindLast(string, ch)
590 CONST char *string; /* The UTF-8 string to be searched. */
591 int ch; /* The Tcl_UniChar to search for. */
599 len = TclUtfToUniChar(string, &find);
603 if (*string == '\0') {
612 *---------------------------------------------------------------------------
616 * Given a pointer to some current location in a UTF-8 string,
617 * move forward one character. The caller must ensure that they
618 * are not asking for the next character after the last character
622 * The return value is the pointer to the next character in
628 *---------------------------------------------------------------------------
631 EXPORT_C CONST char *
633 CONST char *str; /* The current location in the string. */
637 return str + TclUtfToUniChar(str, &ch);
641 *---------------------------------------------------------------------------
645 * Given a pointer to some current location in a UTF-8 string,
646 * move backwards one character. This works correctly when the
647 * pointer is in the middle of a UTF-8 character.
650 * The return value is a pointer to the previous character in the
651 * UTF-8 string. If the current location was already at the
652 * beginning of the string, the return value will also be a
653 * pointer to the beginning of the string.
658 *---------------------------------------------------------------------------
661 EXPORT_C CONST char *
662 Tcl_UtfPrev(str, start)
663 CONST char *str; /* The current location in the string. */
664 CONST char *start; /* Pointer to the beginning of the
665 * string, to avoid going backwards too
673 for (i = 0; i < TCL_UTF_MAX; i++) {
680 byte = *((unsigned char *) look);
693 *---------------------------------------------------------------------------
695 * Tcl_UniCharAtIndex --
697 * Returns the Unicode character represented at the specified
698 * character (not byte) position in the UTF-8 string.
706 *---------------------------------------------------------------------------
710 Tcl_UniCharAtIndex(src, index)
711 register CONST char *src; /* The UTF-8 string to dereference. */
712 register int index; /* The position of the desired character. */
718 src += TclUtfToUniChar(src, &ch);
724 *---------------------------------------------------------------------------
728 * Returns a pointer to the specified character (not byte) position
729 * in the UTF-8 string.
737 *---------------------------------------------------------------------------
740 EXPORT_C CONST char *
741 Tcl_UtfAtIndex(src, index)
742 register CONST char *src; /* The UTF-8 string. */
743 register int index; /* The position of the desired character. */
749 src += TclUtfToUniChar(src, &ch);
755 *---------------------------------------------------------------------------
757 * Tcl_UtfBackslash --
759 * Figure out how to handle a backslash sequence.
762 * Stores the bytes represented by the backslash sequence in dst and
763 * returns the number of bytes written to dst. At most TCL_UTF_MAX
764 * bytes are written to dst; dst must have been large enough to accept
765 * those bytes. If readPtr isn't NULL then it is filled in with a
766 * count of the number of bytes in the backslash sequence.
769 * The maximum number of bytes it takes to represent a Unicode
770 * character in UTF-8 is guaranteed to be less than the number of
771 * bytes used to express the backslash sequence that represents
772 * that Unicode character. If the target buffer into which the
773 * caller is going to store the bytes that represent the Unicode
774 * character is at least as large as the source buffer from which
775 * the backslashed sequence was extracted, no buffer overruns should
778 *---------------------------------------------------------------------------
782 Tcl_UtfBackslash(src, readPtr, dst)
783 CONST char *src; /* Points to the backslash character of
784 * a backslash sequence. */
785 int *readPtr; /* Fill in with number of characters read
786 * from src, unless NULL. */
787 char *dst; /* Filled with the bytes represented by the
788 * backslash sequence. */
790 #define LINE_LENGTH 128
794 result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
795 if (numRead == LINE_LENGTH) {
796 /* We ate a whole line. Pay the price of a strlen() */
797 result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
799 if (readPtr != NULL) {
806 *----------------------------------------------------------------------
810 * Convert lowercase characters to uppercase characters in a UTF
811 * string in place. The conversion may shrink the UTF string.
814 * Returns the number of bytes in the resulting string
815 * excluding the trailing null.
818 * Writes a terminating null after the last converted character.
820 *----------------------------------------------------------------------
825 char *str; /* String to convert in place. */
827 Tcl_UniChar ch, upChar;
832 * Iterate over the string until we hit the terminating null.
837 bytes = TclUtfToUniChar(src, &ch);
838 upChar = Tcl_UniCharToUpper(ch);
841 * To keep badly formed Utf strings from getting inflated by
842 * the conversion (thereby causing a segfault), only copy the
843 * upper case char to dst if its size is <= the original char.
846 if (bytes < UtfCount(upChar)) {
847 memcpy(dst, src, (size_t) bytes);
850 dst += Tcl_UniCharToUtf(upChar, dst);
859 *----------------------------------------------------------------------
863 * Convert uppercase characters to lowercase characters in a UTF
864 * string in place. The conversion may shrink the UTF string.
867 * Returns the number of bytes in the resulting string
868 * excluding the trailing null.
871 * Writes a terminating null after the last converted character.
873 *----------------------------------------------------------------------
878 char *str; /* String to convert in place. */
880 Tcl_UniChar ch, lowChar;
885 * Iterate over the string until we hit the terminating null.
890 bytes = TclUtfToUniChar(src, &ch);
891 lowChar = Tcl_UniCharToLower(ch);
894 * To keep badly formed Utf strings from getting inflated by
895 * the conversion (thereby causing a segfault), only copy the
896 * lower case char to dst if its size is <= the original char.
899 if (bytes < UtfCount(lowChar)) {
900 memcpy(dst, src, (size_t) bytes);
903 dst += Tcl_UniCharToUtf(lowChar, dst);
912 *----------------------------------------------------------------------
916 * Changes the first character of a UTF string to title case or
917 * uppercase and the rest of the string to lowercase. The
918 * conversion happens in place and may shrink the UTF string.
921 * Returns the number of bytes in the resulting string
922 * excluding the trailing null.
925 * Writes a terminating null after the last converted character.
927 *----------------------------------------------------------------------
932 char *str; /* String to convert in place. */
934 Tcl_UniChar ch, titleChar, lowChar;
939 * Capitalize the first character and then lowercase the rest of the
940 * characters until we get to a null.
946 bytes = TclUtfToUniChar(src, &ch);
947 titleChar = Tcl_UniCharToTitle(ch);
949 if (bytes < UtfCount(titleChar)) {
950 memcpy(dst, src, (size_t) bytes);
953 dst += Tcl_UniCharToUtf(titleChar, dst);
958 bytes = TclUtfToUniChar(src, &ch);
959 lowChar = Tcl_UniCharToLower(ch);
961 if (bytes < UtfCount(lowChar)) {
962 memcpy(dst, src, (size_t) bytes);
965 dst += Tcl_UniCharToUtf(lowChar, dst);
974 *----------------------------------------------------------------------
978 * Compare at most n bytes of utf-8 strings cs and ct. Both cs
979 * and ct are assumed to be at least n bytes long.
982 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
987 *----------------------------------------------------------------------
991 TclpUtfNcmp2(cs, ct, n)
992 CONST char *cs; /* UTF string to compare to ct. */
993 CONST char *ct; /* UTF string cs is compared to. */
994 unsigned long n; /* Number of *bytes* to compare. */
997 * We can't simply call 'memcmp(cs, ct, n);' because we need to check
998 * for Tcl's \xC0\x80 non-utf-8 null encoding.
999 * Otherwise utf-8 lexes fine in the strcmp manner.
1001 register int result = 0;
1003 for ( ; n != 0; n--, cs++, ct++) {
1005 result = UCHAR(*cs) - UCHAR(*ct);
1009 if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
1010 unsigned char c1, c2;
1011 c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
1012 c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
1019 *----------------------------------------------------------------------
1023 * Compare at most n UTF chars of string cs to string ct. Both cs
1024 * and ct are assumed to be at least n UTF chars long.
1027 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1032 *----------------------------------------------------------------------
1036 Tcl_UtfNcmp(cs, ct, n)
1037 CONST char *cs; /* UTF string to compare to ct. */
1038 CONST char *ct; /* UTF string cs is compared to. */
1039 unsigned long n; /* Number of UTF chars to compare. */
1041 Tcl_UniChar ch1, ch2;
1043 * Cannot use 'memcmp(cs, ct, n);' as byte representation of
1044 * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
1045 * representation of \u0001 (the byte 0x01.)
1049 * n must be interpreted as chars, not bytes.
1050 * This should be called only when both strings are of
1051 * at least n chars long (no need for \0 check)
1053 cs += TclUtfToUniChar(cs, &ch1);
1054 ct += TclUtfToUniChar(ct, &ch2);
1063 *----------------------------------------------------------------------
1065 * Tcl_UtfNcasecmp --
1067 * Compare at most n UTF chars of string cs to string ct case
1068 * insensitive. Both cs and ct are assumed to be at least n
1072 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1077 *----------------------------------------------------------------------
1081 Tcl_UtfNcasecmp(cs, ct, n)
1082 CONST char *cs; /* UTF string to compare to ct. */
1083 CONST char *ct; /* UTF string cs is compared to. */
1084 unsigned long n; /* Number of UTF chars to compare. */
1086 Tcl_UniChar ch1, ch2;
1089 * n must be interpreted as chars, not bytes.
1090 * This should be called only when both strings are of
1091 * at least n chars long (no need for \0 check)
1093 cs += TclUtfToUniChar(cs, &ch1);
1094 ct += TclUtfToUniChar(ct, &ch2);
1096 ch1 = Tcl_UniCharToLower(ch1);
1097 ch2 = Tcl_UniCharToLower(ch2);
1107 *----------------------------------------------------------------------
1109 * Tcl_UniCharToUpper --
1111 * Compute the uppercase equivalent of the given Unicode character.
1114 * Returns the uppercase Unicode character.
1119 *----------------------------------------------------------------------
1122 EXPORT_C Tcl_UniChar
1123 Tcl_UniCharToUpper(ch)
1124 int ch; /* Unicode character to convert. */
1126 int info = GetUniCharInfo(ch);
1128 if (GetCaseType(info) & 0x04) {
1129 return (Tcl_UniChar) (ch - GetDelta(info));
1136 *----------------------------------------------------------------------
1138 * Tcl_UniCharToLower --
1140 * Compute the lowercase equivalent of the given Unicode character.
1143 * Returns the lowercase Unicode character.
1148 *----------------------------------------------------------------------
1151 EXPORT_C Tcl_UniChar
1152 Tcl_UniCharToLower(ch)
1153 int ch; /* Unicode character to convert. */
1155 int info = GetUniCharInfo(ch);
1157 if (GetCaseType(info) & 0x02) {
1158 return (Tcl_UniChar) (ch + GetDelta(info));
1165 *----------------------------------------------------------------------
1167 * Tcl_UniCharToTitle --
1169 * Compute the titlecase equivalent of the given Unicode character.
1172 * Returns the titlecase Unicode character.
1177 *----------------------------------------------------------------------
1180 EXPORT_C Tcl_UniChar
1181 Tcl_UniCharToTitle(ch)
1182 int ch; /* Unicode character to convert. */
1184 int info = GetUniCharInfo(ch);
1185 int mode = GetCaseType(info);
1189 * Subtract or add one depending on the original case.
1192 return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1193 } else if (mode == 0x4) {
1194 return (Tcl_UniChar) (ch - GetDelta(info));
1201 *----------------------------------------------------------------------
1205 * Find the length of a UniChar string. The str input must be null
1209 * Returns the length of str in UniChars (not bytes).
1214 *----------------------------------------------------------------------
1219 CONST Tcl_UniChar *str; /* Unicode string to find length of. */
1223 while (*str != '\0') {
1231 *----------------------------------------------------------------------
1233 * Tcl_UniCharNcmp --
1235 * Compare at most n unichars of string cs to string ct. Both cs
1236 * and ct are assumed to be at least n unichars long.
1239 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1244 *----------------------------------------------------------------------
1248 Tcl_UniCharNcmp(cs, ct, n)
1249 CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
1250 CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
1251 unsigned long n; /* Number of unichars to compare. */
1253 #ifdef WORDS_BIGENDIAN
1255 * We are definitely on a big-endian machine; memcmp() is safe
1257 return memcmp(cs, ct, n*sizeof(Tcl_UniChar));
1259 #else /* !WORDS_BIGENDIAN */
1261 * We can't simply call memcmp() because that is not lexically correct.
1263 for ( ; n != 0; cs++, ct++, n--) {
1269 #endif /* WORDS_BIGENDIAN */
1273 *----------------------------------------------------------------------
1275 * Tcl_UniCharNcasecmp --
1277 * Compare at most n unichars of string cs to string ct case
1278 * insensitive. Both cs and ct are assumed to be at least n
1282 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1287 *----------------------------------------------------------------------
1291 Tcl_UniCharNcasecmp(cs, ct, n)
1292 CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
1293 CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
1294 unsigned long n; /* Number of unichars to compare. */
1296 for ( ; n != 0; n--, cs++, ct++) {
1298 Tcl_UniChar lcs = Tcl_UniCharToLower(*cs);
1299 Tcl_UniChar lct = Tcl_UniCharToLower(*ct);
1309 *----------------------------------------------------------------------
1311 * Tcl_UniCharIsAlnum --
1313 * Test if a character is an alphanumeric Unicode character.
1316 * Returns 1 if character is alphanumeric.
1321 *----------------------------------------------------------------------
1325 Tcl_UniCharIsAlnum(ch)
1326 int ch; /* Unicode character to test. */
1328 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1330 return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1334 *----------------------------------------------------------------------
1336 * Tcl_UniCharIsAlpha --
1338 * Test if a character is an alphabetic Unicode character.
1341 * Returns 1 if character is alphabetic.
1346 *----------------------------------------------------------------------
1350 Tcl_UniCharIsAlpha(ch)
1351 int ch; /* Unicode character to test. */
1353 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1354 return ((ALPHA_BITS >> category) & 1);
1358 *----------------------------------------------------------------------
1360 * Tcl_UniCharIsControl --
1362 * Test if a character is a Unicode control character.
1365 * Returns non-zero if character is a control.
1370 *----------------------------------------------------------------------
1374 Tcl_UniCharIsControl(ch)
1375 int ch; /* Unicode character to test. */
1377 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1381 *----------------------------------------------------------------------
1383 * Tcl_UniCharIsDigit --
1385 * Test if a character is a decimal digit Unicode character.
1388 * Returns non-zero if character is a digit.
1393 *----------------------------------------------------------------------
1397 Tcl_UniCharIsDigit(ch)
1398 int ch; /* Unicode character to test. */
1400 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
1401 == DECIMAL_DIGIT_NUMBER);
1405 *----------------------------------------------------------------------
1407 * Tcl_UniCharIsGraph --
1409 * Test if a character is any Unicode print character except space.
1412 * Returns non-zero if character is printable, but not space.
1417 *----------------------------------------------------------------------
1421 Tcl_UniCharIsGraph(ch)
1422 int ch; /* Unicode character to test. */
1424 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1425 return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1429 *----------------------------------------------------------------------
1431 * Tcl_UniCharIsLower --
1433 * Test if a character is a lowercase Unicode character.
1436 * Returns non-zero if character is lowercase.
1441 *----------------------------------------------------------------------
1445 Tcl_UniCharIsLower(ch)
1446 int ch; /* Unicode character to test. */
1448 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1452 *----------------------------------------------------------------------
1454 * Tcl_UniCharIsPrint --
1456 * Test if a character is a Unicode print character.
1459 * Returns non-zero if character is printable.
1464 *----------------------------------------------------------------------
1468 Tcl_UniCharIsPrint(ch)
1469 int ch; /* Unicode character to test. */
1471 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1472 return ((PRINT_BITS >> category) & 1);
1476 *----------------------------------------------------------------------
1478 * Tcl_UniCharIsPunct --
1480 * Test if a character is a Unicode punctuation character.
1483 * Returns non-zero if character is punct.
1488 *----------------------------------------------------------------------
1492 Tcl_UniCharIsPunct(ch)
1493 int ch; /* Unicode character to test. */
1495 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1496 return ((PUNCT_BITS >> category) & 1);
1500 *----------------------------------------------------------------------
1502 * Tcl_UniCharIsSpace --
1504 * Test if a character is a whitespace Unicode character.
1507 * Returns non-zero if character is a space.
1512 *----------------------------------------------------------------------
1516 Tcl_UniCharIsSpace(ch)
1517 int ch; /* Unicode character to test. */
1519 register int category;
1522 * If the character is within the first 127 characters, just use the
1523 * standard C function, otherwise consult the Unicode table.
1527 return isspace(UCHAR(ch)); /* INTL: ISO space */
1529 category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1530 return ((SPACE_BITS >> category) & 1);
1535 *----------------------------------------------------------------------
1537 * Tcl_UniCharIsUpper --
1539 * Test if a character is an uppercase Unicode character.
1542 * Returns non-zero if character is uppercase.
1547 *----------------------------------------------------------------------
1551 Tcl_UniCharIsUpper(ch)
1552 int ch; /* Unicode character to test. */
1554 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1558 *----------------------------------------------------------------------
1560 * Tcl_UniCharIsWordChar --
1562 * Test if a character is alphanumeric or a connector punctuation
1566 * Returns 1 if character is a word character.
1571 *----------------------------------------------------------------------
1575 Tcl_UniCharIsWordChar(ch)
1576 int ch; /* Unicode character to test. */
1578 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1580 return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1584 *----------------------------------------------------------------------
1586 * Tcl_UniCharCaseMatch --
1588 * See if a particular Unicode string matches a particular pattern.
1589 * Allows case insensitivity. This is the Unicode equivalent of
1590 * the char* Tcl_StringCaseMatch. The UniChar strings must be
1591 * NULL-terminated. This has no provision for counted UniChar
1592 * strings, thus should not be used where NULLs are expected in the
1593 * UniChar string. Use TclUniCharMatch where possible.
1596 * The return value is 1 if string matches pattern, and
1597 * 0 otherwise. The matching operation permits the following
1598 * special characters in the pattern: *?\[] (see the manual
1599 * entry for details on what these mean).
1604 *----------------------------------------------------------------------
1608 Tcl_UniCharCaseMatch(string, pattern, nocase)
1609 CONST Tcl_UniChar *string; /* Unicode String. */
1610 CONST Tcl_UniChar *pattern; /* Pattern, which may contain special
1612 int nocase; /* 0 for case sensitive, 1 for insensitive */
1620 * See if we're at the end of both the pattern and the string. If
1621 * so, we succeeded. If we're at the end of the pattern but not at
1622 * the end of the string, we failed.
1626 return (*string == 0);
1628 if ((*string == 0) && (p != '*')) {
1633 * Check for a "*" as the next pattern character. It matches any
1634 * substring. We handle this by skipping all the characters up to the
1635 * next matching one in the pattern, and then calling ourselves
1636 * recursively for each postfix of string, until either we match or we
1637 * reach the end of the string.
1642 * Skip all successive *'s in the pattern
1644 while (*(++pattern) == '*') {}
1650 p = Tcl_UniCharToLower(p);
1654 * Optimization for matching - cruise through the string
1655 * quickly if the next char in the pattern isn't a special
1658 if ((p != '[') && (p != '?') && (p != '\\')) {
1660 while (*string && (p != *string)
1661 && (p != Tcl_UniCharToLower(*string))) {
1665 while (*string && (p != *string)) { string++; }
1668 if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {
1679 * Check for a "?" as the next pattern character. It matches
1680 * any single character.
1690 * Check for a "[" as the next pattern character. It is followed
1691 * by a list of characters that are acceptable, or by a range
1692 * (two characters separated by "-").
1696 Tcl_UniChar startChar, endChar;
1699 ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
1702 if ((*pattern == ']') || (*pattern == 0)) {
1705 startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
1707 if (*pattern == '-') {
1709 if (*pattern == 0) {
1712 endChar = (nocase ? Tcl_UniCharToLower(*pattern)
1715 if (((startChar <= ch1) && (ch1 <= endChar))
1716 || ((endChar <= ch1) && (ch1 <= startChar))) {
1718 * Matches ranges of form [a-z] or [z-a].
1722 } else if (startChar == ch1) {
1726 while (*pattern != ']') {
1727 if (*pattern == 0) {
1738 * If the next pattern character is '\', just strip off the '\'
1739 * so we do exact matching on the character that follows.
1743 if (*(++pattern) == '\0') {
1749 * There's no special character. Just make sure that the next
1750 * bytes of each string match.
1754 if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
1757 } else if (*string != *pattern) {
1766 *----------------------------------------------------------------------
1768 * TclUniCharMatch --
1770 * See if a particular Unicode string matches a particular pattern.
1771 * Allows case insensitivity. This is the Unicode equivalent of the
1772 * char* Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch
1773 * uses counted Strings, so embedded NULLs are allowed.
1776 * The return value is 1 if string matches pattern, and
1777 * 0 otherwise. The matching operation permits the following
1778 * special characters in the pattern: *?\[] (see the manual
1779 * entry for details on what these mean).
1784 *----------------------------------------------------------------------
1788 TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
1789 CONST Tcl_UniChar *string; /* Unicode String. */
1790 int strLen; /* length of String */
1791 CONST Tcl_UniChar *pattern; /* Pattern, which may contain special
1793 int ptnLen; /* length of Pattern */
1794 int nocase; /* 0 for case sensitive, 1 for insensitive */
1796 CONST Tcl_UniChar *stringEnd, *patternEnd;
1799 stringEnd = string + strLen;
1800 patternEnd = pattern + ptnLen;
1804 * See if we're at the end of both the pattern and the string. If
1805 * so, we succeeded. If we're at the end of the pattern but not at
1806 * the end of the string, we failed.
1809 if (pattern == patternEnd) {
1810 return (string == stringEnd);
1813 if ((string == stringEnd) && (p != '*')) {
1818 * Check for a "*" as the next pattern character. It matches any
1819 * substring. We handle this by skipping all the characters up to the
1820 * next matching one in the pattern, and then calling ourselves
1821 * recursively for each postfix of string, until either we match or we
1822 * reach the end of the string.
1827 * Skip all successive *'s in the pattern
1829 while (*(++pattern) == '*') {}
1830 if (pattern == patternEnd) {
1835 p = Tcl_UniCharToLower(p);
1839 * Optimization for matching - cruise through the string
1840 * quickly if the next char in the pattern isn't a special
1843 if ((p != '[') && (p != '?') && (p != '\\')) {
1845 while ((string < stringEnd) && (p != *string)
1846 && (p != Tcl_UniCharToLower(*string))) {
1850 while ((string < stringEnd) && (p != *string)) {
1855 if (TclUniCharMatch(string, stringEnd - string,
1856 pattern, patternEnd - pattern, nocase)) {
1859 if (string == stringEnd) {
1867 * Check for a "?" as the next pattern character. It matches
1868 * any single character.
1878 * Check for a "[" as the next pattern character. It is followed
1879 * by a list of characters that are acceptable, or by a range
1880 * (two characters separated by "-").
1884 Tcl_UniChar ch1, startChar, endChar;
1887 ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
1890 if ((*pattern == ']') || (pattern == patternEnd)) {
1893 startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
1895 if (*pattern == '-') {
1897 if (pattern == patternEnd) {
1900 endChar = (nocase ? Tcl_UniCharToLower(*pattern)
1903 if (((startChar <= ch1) && (ch1 <= endChar))
1904 || ((endChar <= ch1) && (ch1 <= startChar))) {
1906 * Matches ranges of form [a-z] or [z-a].
1910 } else if (startChar == ch1) {
1914 while (*pattern != ']') {
1915 if (pattern == patternEnd) {
1926 * If the next pattern character is '\', just strip off the '\'
1927 * so we do exact matching on the character that follows.
1931 if (++pattern == patternEnd) {
1937 * There's no special character. Just make sure that the next
1938 * bytes of each string match.
1942 if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
1945 } else if (*string != *pattern) {