Symaptic: os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/generic/tclUtf.c@bde4ae8d615e (annotated)

sl@0	1	/*
sl@0	2	* tclUtf.c --
sl@0	3	*
sl@0	4	* Routines for manipulating UTF-8 strings.
sl@0	5	*
sl@0	6	* Copyright (c) 1997-1998 Sun Microsystems, Inc.
sl@0	7	* Portions Copyright (c) 2007-2008 Nokia Corporation and/or its subsidiaries. All rights reserved.
sl@0	8	*
sl@0	9	* See the file "license.terms" for information on usage and redistribution
sl@0	10	* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
sl@0	11	*
sl@0	12	* RCS: @(#) $Id: tclUtf.c,v 1.30.2.3 2005/09/07 14:35:56 dgp Exp $
sl@0	13	*/
sl@0	14
sl@0	15	#include "tclInt.h"
sl@0	16
sl@0	17	/*
sl@0	18	* Include the static character classification tables and macros.
sl@0	19	*/
sl@0	20
sl@0	21	#include "tclUniData.c"
sl@0	22
sl@0	23	/*
sl@0	24	* The following macros are used for fast character category tests. The
sl@0	25	* x_BITS values are shifted right by the category value to determine whether
sl@0	26	* the given category is included in the set.
sl@0	27	*/
sl@0	28
sl@0	29	#define ALPHA_BITS ((1 << UPPERCASE_LETTER) \| (1 << LOWERCASE_LETTER) \
sl@0	30	\| (1 << TITLECASE_LETTER) \| (1 << MODIFIER_LETTER) \| (1 << OTHER_LETTER))
sl@0	31
sl@0	32	#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
sl@0	33
sl@0	34	#define SPACE_BITS ((1 << SPACE_SEPARATOR) \| (1 << LINE_SEPARATOR) \
sl@0	35	\| (1 << PARAGRAPH_SEPARATOR))
sl@0	36
sl@0	37	#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
sl@0	38
sl@0	39	#define PRINT_BITS (ALPHA_BITS \| DIGIT_BITS \| SPACE_BITS \| \
sl@0	40	(1 << NON_SPACING_MARK) \| (1 << ENCLOSING_MARK) \| \
sl@0	41	(1 << COMBINING_SPACING_MARK) \| (1 << LETTER_NUMBER) \| \
sl@0	42	(1 << OTHER_NUMBER) \| (1 << CONNECTOR_PUNCTUATION) \| \
sl@0	43	(1 << DASH_PUNCTUATION) \| (1 << OPEN_PUNCTUATION) \| \
sl@0	44	(1 << CLOSE_PUNCTUATION) \| (1 << INITIAL_QUOTE_PUNCTUATION) \| \
sl@0	45	(1 << FINAL_QUOTE_PUNCTUATION) \| (1 << OTHER_PUNCTUATION) \| \
sl@0	46	(1 << MATH_SYMBOL) \| (1 << CURRENCY_SYMBOL) \| \
sl@0	47	(1 << MODIFIER_SYMBOL) \| (1 << OTHER_SYMBOL))
sl@0	48
sl@0	49	#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) \| \
sl@0	50	(1 << DASH_PUNCTUATION) \| (1 << OPEN_PUNCTUATION) \| \
sl@0	51	(1 << CLOSE_PUNCTUATION) \| (1 << INITIAL_QUOTE_PUNCTUATION) \| \
sl@0	52	(1 << FINAL_QUOTE_PUNCTUATION) \| (1 << OTHER_PUNCTUATION))
sl@0	53
sl@0	54	/*
sl@0	55	* Unicode characters less than this value are represented by themselves
sl@0	56	* in UTF-8 strings.
sl@0	57	*/
sl@0	58
sl@0	59	#define UNICODE_SELF 0x80
sl@0	60
sl@0	61	/*
sl@0	62	* The following structures are used when mapping between Unicode (UCS-2)
sl@0	63	* and UTF-8.
sl@0	64	*/
sl@0	65
sl@0	66	static CONST unsigned char totalBytes[256] = {
sl@0	67	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0	68	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0	69	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0	70	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0	71	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0	72	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
sl@0	73	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
sl@0	74	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
sl@0	75	#if TCL_UTF_MAX > 3
sl@0	76	4,4,4,4,4,4,4,4,
sl@0	77	#else
sl@0	78	1,1,1,1,1,1,1,1,
sl@0	79	#endif
sl@0	80	#if TCL_UTF_MAX > 4
sl@0	81	5,5,5,5,
sl@0	82	#else
sl@0	83	1,1,1,1,
sl@0	84	#endif
sl@0	85	#if TCL_UTF_MAX > 5
sl@0	86	6,6,6,6
sl@0	87	#else
sl@0	88	1,1,1,1
sl@0	89	#endif
sl@0	90	};
sl@0	91
sl@0	92	/*
sl@0	93	* Procedures used only in this module.
sl@0	94	*/
sl@0	95
sl@0	96	static int UtfCount _ANSI_ARGS_((int ch));
sl@0	97
sl@0	98
sl@0	99	/*
sl@0	100	*---------------------------------------------------------------------------
sl@0	101	*
sl@0	102	* UtfCount --
sl@0	103	*
sl@0	104	* Find the number of bytes in the Utf character "ch".
sl@0	105	*
sl@0	106	* Results:
sl@0	107	* The return values is the number of bytes in the Utf character "ch".
sl@0	108	*
sl@0	109	* Side effects:
sl@0	110	* None.
sl@0	111	*
sl@0	112	*---------------------------------------------------------------------------
sl@0	113	*/
sl@0	114
sl@0	115	INLINE static int
sl@0	116	UtfCount(ch)
sl@0	117	int ch; /* The Tcl_UniChar whose size is returned. */
sl@0	118	{
sl@0	119	if ((ch > 0) && (ch < UNICODE_SELF)) {
sl@0	120	return 1;
sl@0	121	}
sl@0	122	if (ch <= 0x7FF) {
sl@0	123	return 2;
sl@0	124	}
sl@0	125	if (ch <= 0xFFFF) {
sl@0	126	return 3;
sl@0	127	}
sl@0	128	#if TCL_UTF_MAX > 3
sl@0	129	if (ch <= 0x1FFFFF) {
sl@0	130	return 4;
sl@0	131	}
sl@0	132	if (ch <= 0x3FFFFFF) {
sl@0	133	return 5;
sl@0	134	}
sl@0	135	if (ch <= 0x7FFFFFFF) {
sl@0	136	return 6;
sl@0	137	}
sl@0	138	#endif
sl@0	139	return 3;
sl@0	140	}
sl@0	141
sl@0	142	/*
sl@0	143	*---------------------------------------------------------------------------
sl@0	144	*
sl@0	145	* Tcl_UniCharToUtf --
sl@0	146	*
sl@0	147	* Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
sl@0	148	* provided buffer. Equivalent to Plan 9 runetochar().
sl@0	149	*
sl@0	150	* Results:
sl@0	151	* The return values is the number of bytes in the buffer that
sl@0	152	* were consumed.
sl@0	153	*
sl@0	154	* Side effects:
sl@0	155	* None.
sl@0	156	*
sl@0	157	*---------------------------------------------------------------------------
sl@0	158	*/
sl@0	159
sl@0	160	EXPORT_C INLINE int
sl@0	161	Tcl_UniCharToUtf(ch, str)
sl@0	162	int ch; /* The Tcl_UniChar to be stored in the
sl@0	163	* buffer. */
sl@0	164	char str; / Buffer in which the UTF-8 representation
sl@0	165	* of the Tcl_UniChar is stored. Buffer must
sl@0	166	* be large enough to hold the UTF-8 character
sl@0	167	* (at most TCL_UTF_MAX bytes). */
sl@0	168	{
sl@0	169	if ((ch > 0) && (ch < UNICODE_SELF)) {
sl@0	170	str[0] = (char) ch;
sl@0	171	return 1;
sl@0	172	}
sl@0	173	if (ch >= 0) {
sl@0	174	if (ch <= 0x7FF) {
sl@0	175	str[1] = (char) ((ch \| 0x80) & 0xBF);
sl@0	176	str[0] = (char) ((ch >> 6) \| 0xC0);
sl@0	177	return 2;
sl@0	178	}
sl@0	179	if (ch <= 0xFFFF) {
sl@0	180	three:
sl@0	181	str[2] = (char) ((ch \| 0x80) & 0xBF);
sl@0	182	str[1] = (char) (((ch >> 6) \| 0x80) & 0xBF);
sl@0	183	str[0] = (char) ((ch >> 12) \| 0xE0);
sl@0	184	return 3;
sl@0	185	}
sl@0	186
sl@0	187	#if TCL_UTF_MAX > 3
sl@0	188	if (ch <= 0x1FFFFF) {
sl@0	189	str[3] = (char) ((ch \| 0x80) & 0xBF);
sl@0	190	str[2] = (char) (((ch >> 6) \| 0x80) & 0xBF);
sl@0	191	str[1] = (char) (((ch >> 12) \| 0x80) & 0xBF);
sl@0	192	str[0] = (char) ((ch >> 18) \| 0xF0);
sl@0	193	return 4;
sl@0	194	}
sl@0	195	if (ch <= 0x3FFFFFF) {
sl@0	196	str[4] = (char) ((ch \| 0x80) & 0xBF);
sl@0	197	str[3] = (char) (((ch >> 6) \| 0x80) & 0xBF);
sl@0	198	str[2] = (char) (((ch >> 12) \| 0x80) & 0xBF);
sl@0	199	str[1] = (char) (((ch >> 18) \| 0x80) & 0xBF);
sl@0	200	str[0] = (char) ((ch >> 24) \| 0xF8);
sl@0	201	return 5;
sl@0	202	}
sl@0	203	if (ch <= 0x7FFFFFFF) {
sl@0	204	str[5] = (char) ((ch \| 0x80) & 0xBF);
sl@0	205	str[4] = (char) (((ch >> 6) \| 0x80) & 0xBF);
sl@0	206	str[3] = (char) (((ch >> 12) \| 0x80) & 0xBF);
sl@0	207	str[2] = (char) (((ch >> 18) \| 0x80) & 0xBF);
sl@0	208	str[1] = (char) (((ch >> 24) \| 0x80) & 0xBF);
sl@0	209	str[0] = (char) ((ch >> 30) \| 0xFC);
sl@0	210	return 6;
sl@0	211	}
sl@0	212	#endif
sl@0	213	}
sl@0	214
sl@0	215	ch = 0xFFFD;
sl@0	216	goto three;
sl@0	217	}
sl@0	218
sl@0	219	/*
sl@0	220	*---------------------------------------------------------------------------
sl@0	221	*
sl@0	222	* Tcl_UniCharToUtfDString --
sl@0	223	*
sl@0	224	* Convert the given Unicode string to UTF-8.
sl@0	225	*
sl@0	226	* Results:
sl@0	227	* The return value is a pointer to the UTF-8 representation of the
sl@0	228	* Unicode string. Storage for the return value is appended to the
sl@0	229	* end of dsPtr.
sl@0	230	*
sl@0	231	* Side effects:
sl@0	232	* None.
sl@0	233	*
sl@0	234	*---------------------------------------------------------------------------
sl@0	235	*/
sl@0	236
sl@0	237	EXPORT_C char *
sl@0	238	Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
sl@0	239	CONST Tcl_UniChar wString; / Unicode string to convert to UTF-8. */
sl@0	240	int numChars; /* Length of Unicode string in Tcl_UniChars
sl@0	241	* (must be >= 0). */
sl@0	242	Tcl_DString dsPtr; / UTF-8 representation of string is
sl@0	243	* appended to this previously initialized
sl@0	244	* DString. */
sl@0	245	{
sl@0	246	CONST Tcl_UniChar w, wEnd;
sl@0	247	char p, string;
sl@0	248	int oldLength;
sl@0	249
sl@0	250	/*
sl@0	251	* UTF-8 string length in bytes will be <= Unicode string length *
sl@0	252	* TCL_UTF_MAX.
sl@0	253	*/
sl@0	254
sl@0	255	oldLength = Tcl_DStringLength(dsPtr);
sl@0	256	Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
sl@0	257	string = Tcl_DStringValue(dsPtr) + oldLength;
sl@0	258
sl@0	259	p = string;
sl@0	260	wEnd = wString + numChars;
sl@0	261	for (w = wString; w < wEnd; ) {
sl@0	262	p += Tcl_UniCharToUtf(*w, p);
sl@0	263	w++;
sl@0	264	}
sl@0	265	Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
sl@0	266
sl@0	267	return string;
sl@0	268	}
sl@0	269
sl@0	270	/*
sl@0	271	*---------------------------------------------------------------------------
sl@0	272	*
sl@0	273	* Tcl_UtfToUniChar --
sl@0	274	*
sl@0	275	* Extract the Tcl_UniChar represented by the UTF-8 string. Bad
sl@0	276	* UTF-8 sequences are converted to valid Tcl_UniChars and processing
sl@0	277	* continues. Equivalent to Plan 9 chartorune().
sl@0	278	*
sl@0	279	* The caller must ensure that the source buffer is long enough that
sl@0	280	* this routine does not run off the end and dereference non-existent
sl@0	281	* memory looking for trail bytes. If the source buffer is known to
sl@0	282	* be '\0' terminated, this cannot happen. Otherwise, the caller
sl@0	283	* should call Tcl_UtfCharComplete() before calling this routine to
sl@0	284	* ensure that enough bytes remain in the string.
sl@0	285	*
sl@0	286	* Results:
sl@0	287	* *chPtr is filled with the Tcl_UniChar, and the return value is the
sl@0	288	* number of bytes from the UTF-8 string that were consumed.
sl@0	289	*
sl@0	290	* Side effects:
sl@0	291	* None.
sl@0	292	*
sl@0	293	*---------------------------------------------------------------------------
sl@0	294	*/
sl@0	295
sl@0	296	EXPORT_C int
sl@0	297	Tcl_UtfToUniChar(str, chPtr)
sl@0	298	register CONST char str; / The UTF-8 string. */
sl@0	299	register Tcl_UniChar chPtr; / Filled with the Tcl_UniChar represented
sl@0	300	* by the UTF-8 string. */
sl@0	301	{
sl@0	302	register int byte;
sl@0	303
sl@0	304	/*
sl@0	305	* Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
sl@0	306	*/
sl@0	307
sl@0	308	byte = ((unsigned char ) str);
sl@0	309	if (byte < 0xC0) {
sl@0	310	/*
sl@0	311	* Handles properly formed UTF-8 characters between 0x01 and 0x7F.
sl@0	312	* Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
sl@0	313	* characters representing themselves.
sl@0	314	*/
sl@0	315
sl@0	316	*chPtr = (Tcl_UniChar) byte;
sl@0	317	return 1;
sl@0	318	} else if (byte < 0xE0) {
sl@0	319	if ((str[1] & 0xC0) == 0x80) {
sl@0	320	/*
sl@0	321	* Two-byte-character lead-byte followed by a trail-byte.
sl@0	322	*/
sl@0	323
sl@0	324	*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) \| (str[1] & 0x3F));
sl@0	325	return 2;
sl@0	326	}
sl@0	327	/*
sl@0	328	* A two-byte-character lead-byte not followed by trail-byte
sl@0	329	* represents itself.
sl@0	330	*/
sl@0	331
sl@0	332	*chPtr = (Tcl_UniChar) byte;
sl@0	333	return 1;
sl@0	334	} else if (byte < 0xF0) {
sl@0	335	if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
sl@0	336	/*
sl@0	337	* Three-byte-character lead byte followed by two trail bytes.
sl@0	338	*/
sl@0	339
sl@0	340	*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
sl@0	341	\| ((str[1] & 0x3F) << 6) \| (str[2] & 0x3F));
sl@0	342	return 3;
sl@0	343	}
sl@0	344	/*
sl@0	345	* A three-byte-character lead-byte not followed by two trail-bytes
sl@0	346	* represents itself.
sl@0	347	*/
sl@0	348
sl@0	349	*chPtr = (Tcl_UniChar) byte;
sl@0	350	return 1;
sl@0	351	}
sl@0	352	#if TCL_UTF_MAX > 3
sl@0	353	else {
sl@0	354	int ch, total, trail;
sl@0	355
sl@0	356	total = totalBytes[byte];
sl@0	357	trail = total - 1;
sl@0	358	if (trail > 0) {
sl@0	359	ch = byte & (0x3F >> trail);
sl@0	360	do {
sl@0	361	str++;
sl@0	362	if ((*str & 0xC0) != 0x80) {
sl@0	363	*chPtr = byte;
sl@0	364	return 1;
sl@0	365	}
sl@0	366	ch <<= 6;
sl@0	367	ch \|= (*str & 0x3F);
sl@0	368	trail--;
sl@0	369	} while (trail > 0);
sl@0	370	*chPtr = ch;
sl@0	371	return total;
sl@0	372	}
sl@0	373	}
sl@0	374	#endif
sl@0	375
sl@0	376	*chPtr = (Tcl_UniChar) byte;
sl@0	377	return 1;
sl@0	378	}
sl@0	379
sl@0	380	/*
sl@0	381	*---------------------------------------------------------------------------
sl@0	382	*
sl@0	383	* Tcl_UtfToUniCharDString --
sl@0	384	*
sl@0	385	* Convert the UTF-8 string to Unicode.
sl@0	386	*
sl@0	387	* Results:
sl@0	388	* The return value is a pointer to the Unicode representation of the
sl@0	389	* UTF-8 string. Storage for the return value is appended to the
sl@0	390	* end of dsPtr. The Unicode string is terminated with a Unicode
sl@0	391	* NULL character.
sl@0	392	*
sl@0	393	* Side effects:
sl@0	394	* None.
sl@0	395	*
sl@0	396	*---------------------------------------------------------------------------
sl@0	397	*/
sl@0	398
sl@0	399	EXPORT_C Tcl_UniChar *
sl@0	400	Tcl_UtfToUniCharDString(string, length, dsPtr)
sl@0	401	CONST char string; / UTF-8 string to convert to Unicode. */
sl@0	402	int length; /* Length of UTF-8 string in bytes, or -1
sl@0	403	* for strlen(). */
sl@0	404	Tcl_DString dsPtr; / Unicode representation of string is
sl@0	405	* appended to this previously initialized
sl@0	406	* DString. */
sl@0	407	{
sl@0	408	Tcl_UniChar w, wString;
sl@0	409	CONST char p, end;
sl@0	410	int oldLength;
sl@0	411
sl@0	412	if (length < 0) {
sl@0	413	length = strlen(string);
sl@0	414	}
sl@0	415
sl@0	416	/*
sl@0	417	* Unicode string length in Tcl_UniChars will be <= UTF-8 string length
sl@0	418	* in bytes.
sl@0	419	*/
sl@0	420
sl@0	421	oldLength = Tcl_DStringLength(dsPtr);
sl@0	422	Tcl_DStringSetLength(dsPtr,
sl@0	423	(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
sl@0	424	wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
sl@0	425
sl@0	426	w = wString;
sl@0	427	end = string + length;
sl@0	428	for (p = string; p < end; ) {
sl@0	429	p += TclUtfToUniChar(p, w);
sl@0	430	w++;
sl@0	431	}
sl@0	432	*w = '\0';
sl@0	433	Tcl_DStringSetLength(dsPtr,
sl@0	434	(oldLength + ((char ) w - (char ) wString)));
sl@0	435
sl@0	436	return wString;
sl@0	437	}
sl@0	438
sl@0	439	/*
sl@0	440	*---------------------------------------------------------------------------
sl@0	441	*
sl@0	442	* Tcl_UtfCharComplete --
sl@0	443	*
sl@0	444	* Determine if the UTF-8 string of the given length is long enough
sl@0	445	* to be decoded by Tcl_UtfToUniChar(). This does not ensure that the
sl@0	446	* UTF-8 string is properly formed. Equivalent to Plan 9 fullrune().
sl@0	447	*
sl@0	448	* Results:
sl@0	449	* The return value is 0 if the string is not long enough, non-zero
sl@0	450	* otherwise.
sl@0	451	*
sl@0	452	* Side effects:
sl@0	453	* None.
sl@0	454	*
sl@0	455	*---------------------------------------------------------------------------
sl@0	456	*/
sl@0	457
sl@0	458	EXPORT_C int
sl@0	459	Tcl_UtfCharComplete(str, len)
sl@0	460	CONST char str; / String to check if first few bytes
sl@0	461	* contain a complete UTF-8 character. */
sl@0	462	int len; /* Length of above string in bytes. */
sl@0	463	{
sl@0	464	int ch;
sl@0	465
sl@0	466	ch = ((unsigned char ) str);
sl@0	467	return len >= totalBytes[ch];
sl@0	468	}
sl@0	469
sl@0	470	/*
sl@0	471	*---------------------------------------------------------------------------
sl@0	472	*
sl@0	473	* Tcl_NumUtfChars --
sl@0	474	*
sl@0	475	* Returns the number of characters (not bytes) in the UTF-8 string,
sl@0	476	* not including the terminating NULL byte. This is equivalent to
sl@0	477	* Plan 9 utflen() and utfnlen().
sl@0	478	*
sl@0	479	* Results:
sl@0	480	* As above.
sl@0	481	*
sl@0	482	* Side effects:
sl@0	483	* None.
sl@0	484	*
sl@0	485	*---------------------------------------------------------------------------
sl@0	486	*/
sl@0	487
sl@0	488	EXPORT_C int
sl@0	489	Tcl_NumUtfChars(str, len)
sl@0	490	register CONST char str; / The UTF-8 string to measure. */
sl@0	491	int len; /* The length of the string in bytes, or -1
sl@0	492	* for strlen(string). */
sl@0	493	{
sl@0	494	Tcl_UniChar ch;
sl@0	495	register Tcl_UniChar *chPtr = &ch;
sl@0	496	register int i;
sl@0	497
sl@0	498	/*
sl@0	499	* The separate implementations are faster.
sl@0	500	*
sl@0	501	* Since this is a time-sensitive function, we also do the check for
sl@0	502	* the single-byte char case specially.
sl@0	503	*/
sl@0	504
sl@0	505	i = 0;
sl@0	506	if (len < 0) {
sl@0	507	while (*str != '\0') {
sl@0	508	str += TclUtfToUniChar(str, chPtr);
sl@0	509	i++;
sl@0	510	}
sl@0	511	} else {
sl@0	512	register int n;
sl@0	513
sl@0	514	while (len > 0) {
sl@0	515	if (UCHAR(*str) < 0xC0) {
sl@0	516	len--;
sl@0	517	str++;
sl@0	518	} else {
sl@0	519	n = Tcl_UtfToUniChar(str, chPtr);
sl@0	520	len -= n;
sl@0	521	str += n;
sl@0	522	}
sl@0	523	i++;
sl@0	524	}
sl@0	525	}
sl@0	526	return i;
sl@0	527	}
sl@0	528
sl@0	529	/*
sl@0	530	*---------------------------------------------------------------------------
sl@0	531	*
sl@0	532	* Tcl_UtfFindFirst --
sl@0	533	*
sl@0	534	* Returns a pointer to the first occurance of the given Tcl_UniChar
sl@0	535	* in the NULL-terminated UTF-8 string. The NULL terminator is
sl@0	536	* considered part of the UTF-8 string. Equivalent to Plan 9
sl@0	537	* utfrune().
sl@0	538	*
sl@0	539	* Results:
sl@0	540	* As above. If the Tcl_UniChar does not exist in the given string,
sl@0	541	* the return value is NULL.
sl@0	542	*
sl@0	543	* Side effects:
sl@0	544	* None.
sl@0	545	*
sl@0	546	*---------------------------------------------------------------------------
sl@0	547	*/
sl@0	548	EXPORT_C CONST char *
sl@0	549	Tcl_UtfFindFirst(string, ch)
sl@0	550	CONST char string; / The UTF-8 string to be searched. */
sl@0	551	int ch; /* The Tcl_UniChar to search for. */
sl@0	552	{
sl@0	553	int len;
sl@0	554	Tcl_UniChar find;
sl@0	555
sl@0	556	while (1) {
sl@0	557	len = TclUtfToUniChar(string, &find);
sl@0	558	if (find == ch) {
sl@0	559	return string;
sl@0	560	}
sl@0	561	if (*string == '\0') {
sl@0	562	return NULL;
sl@0	563	}
sl@0	564	string += len;
sl@0	565	}
sl@0	566	}
sl@0	567
sl@0	568	/*
sl@0	569	*---------------------------------------------------------------------------
sl@0	570	*
sl@0	571	* Tcl_UtfFindLast --
sl@0	572	*
sl@0	573	* Returns a pointer to the last occurance of the given Tcl_UniChar
sl@0	574	* in the NULL-terminated UTF-8 string. The NULL terminator is
sl@0	575	* considered part of the UTF-8 string. Equivalent to Plan 9
sl@0	576	* utfrrune().
sl@0	577	*
sl@0	578	* Results:
sl@0	579	* As above. If the Tcl_UniChar does not exist in the given string,
sl@0	580	* the return value is NULL.
sl@0	581	*
sl@0	582	* Side effects:
sl@0	583	* None.
sl@0	584	*
sl@0	585	*---------------------------------------------------------------------------
sl@0	586	*/
sl@0	587
sl@0	588	EXPORT_C CONST char *
sl@0	589	Tcl_UtfFindLast(string, ch)
sl@0	590	CONST char string; / The UTF-8 string to be searched. */
sl@0	591	int ch; /* The Tcl_UniChar to search for. */
sl@0	592	{
sl@0	593	int len;
sl@0	594	Tcl_UniChar find;
sl@0	595	CONST char *last;
sl@0	596
sl@0	597	last = NULL;
sl@0	598	while (1) {
sl@0	599	len = TclUtfToUniChar(string, &find);
sl@0	600	if (find == ch) {
sl@0	601	last = string;
sl@0	602	}
sl@0	603	if (*string == '\0') {
sl@0	604	break;
sl@0	605	}
sl@0	606	string += len;
sl@0	607	}
sl@0	608	return last;
sl@0	609	}
sl@0	610
sl@0	611	/*
sl@0	612	*---------------------------------------------------------------------------
sl@0	613	*
sl@0	614	* Tcl_UtfNext --
sl@0	615	*
sl@0	616	* Given a pointer to some current location in a UTF-8 string,
sl@0	617	* move forward one character. The caller must ensure that they
sl@0	618	* are not asking for the next character after the last character
sl@0	619	* in the string.
sl@0	620	*
sl@0	621	* Results:
sl@0	622	* The return value is the pointer to the next character in
sl@0	623	* the UTF-8 string.
sl@0	624	*
sl@0	625	* Side effects:
sl@0	626	* None.
sl@0	627	*
sl@0	628	*---------------------------------------------------------------------------
sl@0	629	*/
sl@0	630
sl@0	631	EXPORT_C CONST char *
sl@0	632	Tcl_UtfNext(str)
sl@0	633	CONST char str; / The current location in the string. */
sl@0	634	{
sl@0	635	Tcl_UniChar ch;
sl@0	636
sl@0	637	return str + TclUtfToUniChar(str, &ch);
sl@0	638	}
sl@0	639
sl@0	640	/*
sl@0	641	*---------------------------------------------------------------------------
sl@0	642	*
sl@0	643	* Tcl_UtfPrev --
sl@0	644	*
sl@0	645	* Given a pointer to some current location in a UTF-8 string,
sl@0	646	* move backwards one character. This works correctly when the
sl@0	647	* pointer is in the middle of a UTF-8 character.
sl@0	648	*
sl@0	649	* Results:
sl@0	650	* The return value is a pointer to the previous character in the
sl@0	651	* UTF-8 string. If the current location was already at the
sl@0	652	* beginning of the string, the return value will also be a
sl@0	653	* pointer to the beginning of the string.
sl@0	654	*
sl@0	655	* Side effects:
sl@0	656	* None.
sl@0	657	*
sl@0	658	*---------------------------------------------------------------------------
sl@0	659	*/
sl@0	660
sl@0	661	EXPORT_C CONST char *
sl@0	662	Tcl_UtfPrev(str, start)
sl@0	663	CONST char str; / The current location in the string. */
sl@0	664	CONST char start; / Pointer to the beginning of the
sl@0	665	* string, to avoid going backwards too
sl@0	666	* far. */
sl@0	667	{
sl@0	668	CONST char *look;
sl@0	669	int i, byte;
sl@0	670
sl@0	671	str--;
sl@0	672	look = str;
sl@0	673	for (i = 0; i < TCL_UTF_MAX; i++) {
sl@0	674	if (look < start) {
sl@0	675	if (str < start) {
sl@0	676	str = start;
sl@0	677	}
sl@0	678	break;
sl@0	679	}
sl@0	680	byte = ((unsigned char ) look);
sl@0	681	if (byte < 0x80) {
sl@0	682	break;
sl@0	683	}
sl@0	684	if (byte >= 0xC0) {
sl@0	685	return look;
sl@0	686	}
sl@0	687	look--;
sl@0	688	}
sl@0	689	return str;
sl@0	690	}
sl@0	691
sl@0	692	/*
sl@0	693	*---------------------------------------------------------------------------
sl@0	694	*
sl@0	695	* Tcl_UniCharAtIndex --
sl@0	696	*
sl@0	697	* Returns the Unicode character represented at the specified
sl@0	698	* character (not byte) position in the UTF-8 string.
sl@0	699	*
sl@0	700	* Results:
sl@0	701	* As above.
sl@0	702	*
sl@0	703	* Side effects:
sl@0	704	* None.
sl@0	705	*
sl@0	706	*---------------------------------------------------------------------------
sl@0	707	*/
sl@0	708
sl@0	709	EXPORT_C Tcl_UniChar
sl@0	710	Tcl_UniCharAtIndex(src, index)
sl@0	711	register CONST char src; / The UTF-8 string to dereference. */
sl@0	712	register int index; /* The position of the desired character. */
sl@0	713	{
sl@0	714	Tcl_UniChar ch;
sl@0	715
sl@0	716	while (index >= 0) {
sl@0	717	index--;
sl@0	718	src += TclUtfToUniChar(src, &ch);
sl@0	719	}
sl@0	720	return ch;
sl@0	721	}
sl@0	722
sl@0	723	/*
sl@0	724	*---------------------------------------------------------------------------
sl@0	725	*
sl@0	726	* Tcl_UtfAtIndex --
sl@0	727	*
sl@0	728	* Returns a pointer to the specified character (not byte) position
sl@0	729	* in the UTF-8 string.
sl@0	730	*
sl@0	731	* Results:
sl@0	732	* As above.
sl@0	733	*
sl@0	734	* Side effects:
sl@0	735	* None.
sl@0	736	*
sl@0	737	*---------------------------------------------------------------------------
sl@0	738	*/
sl@0	739
sl@0	740	EXPORT_C CONST char *
sl@0	741	Tcl_UtfAtIndex(src, index)
sl@0	742	register CONST char src; / The UTF-8 string. */
sl@0	743	register int index; /* The position of the desired character. */
sl@0	744	{
sl@0	745	Tcl_UniChar ch;
sl@0	746
sl@0	747	while (index > 0) {
sl@0	748	index--;
sl@0	749	src += TclUtfToUniChar(src, &ch);
sl@0	750	}
sl@0	751	return src;
sl@0	752	}
sl@0	753
sl@0	754	/*
sl@0	755	*---------------------------------------------------------------------------
sl@0	756	*
sl@0	757	* Tcl_UtfBackslash --
sl@0	758	*
sl@0	759	* Figure out how to handle a backslash sequence.
sl@0	760	*
sl@0	761	* Results:
sl@0	762	* Stores the bytes represented by the backslash sequence in dst and
sl@0	763	* returns the number of bytes written to dst. At most TCL_UTF_MAX
sl@0	764	* bytes are written to dst; dst must have been large enough to accept
sl@0	765	* those bytes. If readPtr isn't NULL then it is filled in with a
sl@0	766	* count of the number of bytes in the backslash sequence.
sl@0	767	*
sl@0	768	* Side effects:
sl@0	769	* The maximum number of bytes it takes to represent a Unicode
sl@0	770	* character in UTF-8 is guaranteed to be less than the number of
sl@0	771	* bytes used to express the backslash sequence that represents
sl@0	772	* that Unicode character. If the target buffer into which the
sl@0	773	* caller is going to store the bytes that represent the Unicode
sl@0	774	* character is at least as large as the source buffer from which
sl@0	775	* the backslashed sequence was extracted, no buffer overruns should
sl@0	776	* occur.
sl@0	777	*
sl@0	778	*---------------------------------------------------------------------------
sl@0	779	*/
sl@0	780
sl@0	781	EXPORT_C int
sl@0	782	Tcl_UtfBackslash(src, readPtr, dst)
sl@0	783	CONST char src; / Points to the backslash character of
sl@0	784	* a backslash sequence. */
sl@0	785	int readPtr; / Fill in with number of characters read
sl@0	786	* from src, unless NULL. */
sl@0	787	char dst; / Filled with the bytes represented by the
sl@0	788	* backslash sequence. */
sl@0	789	{
sl@0	790	#define LINE_LENGTH 128
sl@0	791	int numRead;
sl@0	792	int result;
sl@0	793
sl@0	794	result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
sl@0	795	if (numRead == LINE_LENGTH) {
sl@0	796	/* We ate a whole line. Pay the price of a strlen() */
sl@0	797	result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
sl@0	798	}
sl@0	799	if (readPtr != NULL) {
sl@0	800	*readPtr = numRead;
sl@0	801	}
sl@0	802	return result;
sl@0	803	}
sl@0	804
sl@0	805	/*
sl@0	806	*----------------------------------------------------------------------
sl@0	807	*
sl@0	808	* Tcl_UtfToUpper --
sl@0	809	*
sl@0	810	* Convert lowercase characters to uppercase characters in a UTF
sl@0	811	* string in place. The conversion may shrink the UTF string.
sl@0	812	*
sl@0	813	* Results:
sl@0	814	* Returns the number of bytes in the resulting string
sl@0	815	* excluding the trailing null.
sl@0	816	*
sl@0	817	* Side effects:
sl@0	818	* Writes a terminating null after the last converted character.
sl@0	819	*
sl@0	820	*----------------------------------------------------------------------
sl@0	821	*/
sl@0	822
sl@0	823	EXPORT_C int
sl@0	824	Tcl_UtfToUpper(str)
sl@0	825	char str; / String to convert in place. */
sl@0	826	{
sl@0	827	Tcl_UniChar ch, upChar;
sl@0	828	char src, dst;
sl@0	829	int bytes;
sl@0	830
sl@0	831	/*
sl@0	832	* Iterate over the string until we hit the terminating null.
sl@0	833	*/
sl@0	834
sl@0	835	src = dst = str;
sl@0	836	while (*src) {
sl@0	837	bytes = TclUtfToUniChar(src, &ch);
sl@0	838	upChar = Tcl_UniCharToUpper(ch);
sl@0	839
sl@0	840	/*
sl@0	841	* To keep badly formed Utf strings from getting inflated by
sl@0	842	* the conversion (thereby causing a segfault), only copy the
sl@0	843	* upper case char to dst if its size is <= the original char.
sl@0	844	*/
sl@0	845
sl@0	846	if (bytes < UtfCount(upChar)) {
sl@0	847	memcpy(dst, src, (size_t) bytes);
sl@0	848	dst += bytes;
sl@0	849	} else {
sl@0	850	dst += Tcl_UniCharToUtf(upChar, dst);
sl@0	851	}
sl@0	852	src += bytes;
sl@0	853	}
sl@0	854	*dst = '\0';
sl@0	855	return (dst - str);
sl@0	856	}
sl@0	857
sl@0	858	/*
sl@0	859	*----------------------------------------------------------------------
sl@0	860	*
sl@0	861	* Tcl_UtfToLower --
sl@0	862	*
sl@0	863	* Convert uppercase characters to lowercase characters in a UTF
sl@0	864	* string in place. The conversion may shrink the UTF string.
sl@0	865	*
sl@0	866	* Results:
sl@0	867	* Returns the number of bytes in the resulting string
sl@0	868	* excluding the trailing null.
sl@0	869	*
sl@0	870	* Side effects:
sl@0	871	* Writes a terminating null after the last converted character.
sl@0	872	*
sl@0	873	*----------------------------------------------------------------------
sl@0	874	*/
sl@0	875
sl@0	876	EXPORT_C int
sl@0	877	Tcl_UtfToLower(str)
sl@0	878	char str; / String to convert in place. */
sl@0	879	{
sl@0	880	Tcl_UniChar ch, lowChar;
sl@0	881	char src, dst;
sl@0	882	int bytes;
sl@0	883
sl@0	884	/*
sl@0	885	* Iterate over the string until we hit the terminating null.
sl@0	886	*/
sl@0	887
sl@0	888	src = dst = str;
sl@0	889	while (*src) {
sl@0	890	bytes = TclUtfToUniChar(src, &ch);
sl@0	891	lowChar = Tcl_UniCharToLower(ch);
sl@0	892
sl@0	893	/*
sl@0	894	* To keep badly formed Utf strings from getting inflated by
sl@0	895	* the conversion (thereby causing a segfault), only copy the
sl@0	896	* lower case char to dst if its size is <= the original char.
sl@0	897	*/
sl@0	898
sl@0	899	if (bytes < UtfCount(lowChar)) {
sl@0	900	memcpy(dst, src, (size_t) bytes);
sl@0	901	dst += bytes;
sl@0	902	} else {
sl@0	903	dst += Tcl_UniCharToUtf(lowChar, dst);
sl@0	904	}
sl@0	905	src += bytes;
sl@0	906	}
sl@0	907	*dst = '\0';
sl@0	908	return (dst - str);
sl@0	909	}
sl@0	910
sl@0	911	/*
sl@0	912	*----------------------------------------------------------------------
sl@0	913	*
sl@0	914	* Tcl_UtfToTitle --
sl@0	915	*
sl@0	916	* Changes the first character of a UTF string to title case or
sl@0	917	* uppercase and the rest of the string to lowercase. The
sl@0	918	* conversion happens in place and may shrink the UTF string.
sl@0	919	*
sl@0	920	* Results:
sl@0	921	* Returns the number of bytes in the resulting string
sl@0	922	* excluding the trailing null.
sl@0	923	*
sl@0	924	* Side effects:
sl@0	925	* Writes a terminating null after the last converted character.
sl@0	926	*
sl@0	927	*----------------------------------------------------------------------
sl@0	928	*/
sl@0	929
sl@0	930	EXPORT_C int
sl@0	931	Tcl_UtfToTitle(str)
sl@0	932	char str; / String to convert in place. */
sl@0	933	{
sl@0	934	Tcl_UniChar ch, titleChar, lowChar;
sl@0	935	char src, dst;
sl@0	936	int bytes;
sl@0	937
sl@0	938	/*
sl@0	939	* Capitalize the first character and then lowercase the rest of the
sl@0	940	* characters until we get to a null.
sl@0	941	*/
sl@0	942
sl@0	943	src = dst = str;
sl@0	944
sl@0	945	if (*src) {
sl@0	946	bytes = TclUtfToUniChar(src, &ch);
sl@0	947	titleChar = Tcl_UniCharToTitle(ch);
sl@0	948
sl@0	949	if (bytes < UtfCount(titleChar)) {
sl@0	950	memcpy(dst, src, (size_t) bytes);
sl@0	951	dst += bytes;
sl@0	952	} else {
sl@0	953	dst += Tcl_UniCharToUtf(titleChar, dst);
sl@0	954	}
sl@0	955	src += bytes;
sl@0	956	}
sl@0	957	while (*src) {
sl@0	958	bytes = TclUtfToUniChar(src, &ch);
sl@0	959	lowChar = Tcl_UniCharToLower(ch);
sl@0	960
sl@0	961	if (bytes < UtfCount(lowChar)) {
sl@0	962	memcpy(dst, src, (size_t) bytes);
sl@0	963	dst += bytes;
sl@0	964	} else {
sl@0	965	dst += Tcl_UniCharToUtf(lowChar, dst);
sl@0	966	}
sl@0	967	src += bytes;
sl@0	968	}
sl@0	969	*dst = '\0';
sl@0	970	return (dst - str);
sl@0	971	}
sl@0	972
sl@0	973	/*
sl@0	974	*----------------------------------------------------------------------
sl@0	975	*
sl@0	976	* TclpUtfNcmp2 --
sl@0	977	*
sl@0	978	* Compare at most n bytes of utf-8 strings cs and ct. Both cs
sl@0	979	* and ct are assumed to be at least n bytes long.
sl@0	980	*
sl@0	981	* Results:
sl@0	982	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0	983	*
sl@0	984	* Side effects:
sl@0	985	* None.
sl@0	986	*
sl@0	987	*----------------------------------------------------------------------
sl@0	988	*/
sl@0	989
sl@0	990	int
sl@0	991	TclpUtfNcmp2(cs, ct, n)
sl@0	992	CONST char cs; / UTF string to compare to ct. */
sl@0	993	CONST char ct; / UTF string cs is compared to. */
sl@0	994	unsigned long n; /* Number of bytes to compare. */
sl@0	995	{
sl@0	996	/*
sl@0	997	* We can't simply call 'memcmp(cs, ct, n);' because we need to check
sl@0	998	* for Tcl's \xC0\x80 non-utf-8 null encoding.
sl@0	999	* Otherwise utf-8 lexes fine in the strcmp manner.
sl@0	1000	*/
sl@0	1001	register int result = 0;
sl@0	1002
sl@0	1003	for ( ; n != 0; n--, cs++, ct++) {
sl@0	1004	if (cs != ct) {
sl@0	1005	result = UCHAR(cs) - UCHAR(ct);
sl@0	1006	break;
sl@0	1007	}
sl@0	1008	}
sl@0	1009	if (n && ((UCHAR(cs) == 0xC0) \|\| (UCHAR(ct) == 0xC0))) {
sl@0	1010	unsigned char c1, c2;
sl@0	1011	c1 = ((UCHAR(cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(cs);
sl@0	1012	c2 = ((UCHAR(ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(ct);
sl@0	1013	result = (c1 - c2);
sl@0	1014	}
sl@0	1015	return result;
sl@0	1016	}
sl@0	1017
sl@0	1018	/*
sl@0	1019	*----------------------------------------------------------------------
sl@0	1020	*
sl@0	1021	* Tcl_UtfNcmp --
sl@0	1022	*
sl@0	1023	* Compare at most n UTF chars of string cs to string ct. Both cs
sl@0	1024	* and ct are assumed to be at least n UTF chars long.
sl@0	1025	*
sl@0	1026	* Results:
sl@0	1027	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0	1028	*
sl@0	1029	* Side effects:
sl@0	1030	* None.
sl@0	1031	*
sl@0	1032	*----------------------------------------------------------------------
sl@0	1033	*/
sl@0	1034
sl@0	1035	EXPORT_C int
sl@0	1036	Tcl_UtfNcmp(cs, ct, n)
sl@0	1037	CONST char cs; / UTF string to compare to ct. */
sl@0	1038	CONST char ct; / UTF string cs is compared to. */
sl@0	1039	unsigned long n; /* Number of UTF chars to compare. */
sl@0	1040	{
sl@0	1041	Tcl_UniChar ch1, ch2;
sl@0	1042	/*
sl@0	1043	* Cannot use 'memcmp(cs, ct, n);' as byte representation of
sl@0	1044	* \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
sl@0	1045	* representation of \u0001 (the byte 0x01.)
sl@0	1046	*/
sl@0	1047	while (n-- > 0) {
sl@0	1048	/*
sl@0	1049	* n must be interpreted as chars, not bytes.
sl@0	1050	* This should be called only when both strings are of
sl@0	1051	* at least n chars long (no need for \0 check)
sl@0	1052	*/
sl@0	1053	cs += TclUtfToUniChar(cs, &ch1);
sl@0	1054	ct += TclUtfToUniChar(ct, &ch2);
sl@0	1055	if (ch1 != ch2) {
sl@0	1056	return (ch1 - ch2);
sl@0	1057	}
sl@0	1058	}
sl@0	1059	return 0;
sl@0	1060	}
sl@0	1061
sl@0	1062	/*
sl@0	1063	*----------------------------------------------------------------------
sl@0	1064	*
sl@0	1065	* Tcl_UtfNcasecmp --
sl@0	1066	*
sl@0	1067	* Compare at most n UTF chars of string cs to string ct case
sl@0	1068	* insensitive. Both cs and ct are assumed to be at least n
sl@0	1069	* UTF chars long.
sl@0	1070	*
sl@0	1071	* Results:
sl@0	1072	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0	1073	*
sl@0	1074	* Side effects:
sl@0	1075	* None.
sl@0	1076	*
sl@0	1077	*----------------------------------------------------------------------
sl@0	1078	*/
sl@0	1079
sl@0	1080	EXPORT_C int
sl@0	1081	Tcl_UtfNcasecmp(cs, ct, n)
sl@0	1082	CONST char cs; / UTF string to compare to ct. */
sl@0	1083	CONST char ct; / UTF string cs is compared to. */
sl@0	1084	unsigned long n; /* Number of UTF chars to compare. */
sl@0	1085	{
sl@0	1086	Tcl_UniChar ch1, ch2;
sl@0	1087	while (n-- > 0) {
sl@0	1088	/*
sl@0	1089	* n must be interpreted as chars, not bytes.
sl@0	1090	* This should be called only when both strings are of
sl@0	1091	* at least n chars long (no need for \0 check)
sl@0	1092	*/
sl@0	1093	cs += TclUtfToUniChar(cs, &ch1);
sl@0	1094	ct += TclUtfToUniChar(ct, &ch2);
sl@0	1095	if (ch1 != ch2) {
sl@0	1096	ch1 = Tcl_UniCharToLower(ch1);
sl@0	1097	ch2 = Tcl_UniCharToLower(ch2);
sl@0	1098	if (ch1 != ch2) {
sl@0	1099	return (ch1 - ch2);
sl@0	1100	}
sl@0	1101	}
sl@0	1102	}
sl@0	1103	return 0;
sl@0	1104	}
sl@0	1105
sl@0	1106	/*
sl@0	1107	*----------------------------------------------------------------------
sl@0	1108	*
sl@0	1109	* Tcl_UniCharToUpper --
sl@0	1110	*
sl@0	1111	* Compute the uppercase equivalent of the given Unicode character.
sl@0	1112	*
sl@0	1113	* Results:
sl@0	1114	* Returns the uppercase Unicode character.
sl@0	1115	*
sl@0	1116	* Side effects:
sl@0	1117	* None.
sl@0	1118	*
sl@0	1119	*----------------------------------------------------------------------
sl@0	1120	*/
sl@0	1121
sl@0	1122	EXPORT_C Tcl_UniChar
sl@0	1123	Tcl_UniCharToUpper(ch)
sl@0	1124	int ch; /* Unicode character to convert. */
sl@0	1125	{
sl@0	1126	int info = GetUniCharInfo(ch);
sl@0	1127
sl@0	1128	if (GetCaseType(info) & 0x04) {
sl@0	1129	return (Tcl_UniChar) (ch - GetDelta(info));
sl@0	1130	} else {
sl@0	1131	return ch;
sl@0	1132	}
sl@0	1133	}
sl@0	1134
sl@0	1135	/*
sl@0	1136	*----------------------------------------------------------------------
sl@0	1137	*
sl@0	1138	* Tcl_UniCharToLower --
sl@0	1139	*
sl@0	1140	* Compute the lowercase equivalent of the given Unicode character.
sl@0	1141	*
sl@0	1142	* Results:
sl@0	1143	* Returns the lowercase Unicode character.
sl@0	1144	*
sl@0	1145	* Side effects:
sl@0	1146	* None.
sl@0	1147	*
sl@0	1148	*----------------------------------------------------------------------
sl@0	1149	*/
sl@0	1150
sl@0	1151	EXPORT_C Tcl_UniChar
sl@0	1152	Tcl_UniCharToLower(ch)
sl@0	1153	int ch; /* Unicode character to convert. */
sl@0	1154	{
sl@0	1155	int info = GetUniCharInfo(ch);
sl@0	1156
sl@0	1157	if (GetCaseType(info) & 0x02) {
sl@0	1158	return (Tcl_UniChar) (ch + GetDelta(info));
sl@0	1159	} else {
sl@0	1160	return ch;
sl@0	1161	}
sl@0	1162	}
sl@0	1163
sl@0	1164	/*
sl@0	1165	*----------------------------------------------------------------------
sl@0	1166	*
sl@0	1167	* Tcl_UniCharToTitle --
sl@0	1168	*
sl@0	1169	* Compute the titlecase equivalent of the given Unicode character.
sl@0	1170	*
sl@0	1171	* Results:
sl@0	1172	* Returns the titlecase Unicode character.
sl@0	1173	*
sl@0	1174	* Side effects:
sl@0	1175	* None.
sl@0	1176	*
sl@0	1177	*----------------------------------------------------------------------
sl@0	1178	*/
sl@0	1179
sl@0	1180	EXPORT_C Tcl_UniChar
sl@0	1181	Tcl_UniCharToTitle(ch)
sl@0	1182	int ch; /* Unicode character to convert. */
sl@0	1183	{
sl@0	1184	int info = GetUniCharInfo(ch);
sl@0	1185	int mode = GetCaseType(info);
sl@0	1186
sl@0	1187	if (mode & 0x1) {
sl@0	1188	/*
sl@0	1189	* Subtract or add one depending on the original case.
sl@0	1190	*/
sl@0	1191
sl@0	1192	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
sl@0	1193	} else if (mode == 0x4) {
sl@0	1194	return (Tcl_UniChar) (ch - GetDelta(info));
sl@0	1195	} else {
sl@0	1196	return ch;
sl@0	1197	}
sl@0	1198	}
sl@0	1199
sl@0	1200	/*
sl@0	1201	*----------------------------------------------------------------------
sl@0	1202	*
sl@0	1203	* Tcl_UniCharLen --
sl@0	1204	*
sl@0	1205	* Find the length of a UniChar string. The str input must be null
sl@0	1206	* terminated.
sl@0	1207	*
sl@0	1208	* Results:
sl@0	1209	* Returns the length of str in UniChars (not bytes).
sl@0	1210	*
sl@0	1211	* Side effects:
sl@0	1212	* None.
sl@0	1213	*
sl@0	1214	*----------------------------------------------------------------------
sl@0	1215	*/
sl@0	1216
sl@0	1217	EXPORT_C int
sl@0	1218	Tcl_UniCharLen(str)
sl@0	1219	CONST Tcl_UniChar str; / Unicode string to find length of. */
sl@0	1220	{
sl@0	1221	int len = 0;
sl@0	1222
sl@0	1223	while (*str != '\0') {
sl@0	1224	len++;
sl@0	1225	str++;
sl@0	1226	}
sl@0	1227	return len;
sl@0	1228	}
sl@0	1229
sl@0	1230	/*
sl@0	1231	*----------------------------------------------------------------------
sl@0	1232	*
sl@0	1233	* Tcl_UniCharNcmp --
sl@0	1234	*
sl@0	1235	* Compare at most n unichars of string cs to string ct. Both cs
sl@0	1236	* and ct are assumed to be at least n unichars long.
sl@0	1237	*
sl@0	1238	* Results:
sl@0	1239	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0	1240	*
sl@0	1241	* Side effects:
sl@0	1242	* None.
sl@0	1243	*
sl@0	1244	*----------------------------------------------------------------------
sl@0	1245	*/
sl@0	1246
sl@0	1247	EXPORT_C int
sl@0	1248	Tcl_UniCharNcmp(cs, ct, n)
sl@0	1249	CONST Tcl_UniChar cs; / Unicode string to compare to ct. */
sl@0	1250	CONST Tcl_UniChar ct; / Unicode string cs is compared to. */
sl@0	1251	unsigned long n; /* Number of unichars to compare. */
sl@0	1252	{
sl@0	1253	#ifdef WORDS_BIGENDIAN
sl@0	1254	/*
sl@0	1255	* We are definitely on a big-endian machine; memcmp() is safe
sl@0	1256	*/
sl@0	1257	return memcmp(cs, ct, n*sizeof(Tcl_UniChar));
sl@0	1258
sl@0	1259	#else /* !WORDS_BIGENDIAN */
sl@0	1260	/*
sl@0	1261	* We can't simply call memcmp() because that is not lexically correct.
sl@0	1262	*/
sl@0	1263	for ( ; n != 0; cs++, ct++, n--) {
sl@0	1264	if (cs != ct) {
sl@0	1265	return (cs - ct);
sl@0	1266	}
sl@0	1267	}
sl@0	1268	return 0;
sl@0	1269	#endif /* WORDS_BIGENDIAN */
sl@0	1270	}
sl@0	1271
sl@0	1272	/*
sl@0	1273	*----------------------------------------------------------------------
sl@0	1274	*
sl@0	1275	* Tcl_UniCharNcasecmp --
sl@0	1276	*
sl@0	1277	* Compare at most n unichars of string cs to string ct case
sl@0	1278	* insensitive. Both cs and ct are assumed to be at least n
sl@0	1279	* unichars long.
sl@0	1280	*
sl@0	1281	* Results:
sl@0	1282	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
sl@0	1283	*
sl@0	1284	* Side effects:
sl@0	1285	* None.
sl@0	1286	*
sl@0	1287	*----------------------------------------------------------------------
sl@0	1288	*/
sl@0	1289
sl@0	1290	EXPORT_C int
sl@0	1291	Tcl_UniCharNcasecmp(cs, ct, n)
sl@0	1292	CONST Tcl_UniChar cs; / Unicode string to compare to ct. */
sl@0	1293	CONST Tcl_UniChar ct; / Unicode string cs is compared to. */
sl@0	1294	unsigned long n; /* Number of unichars to compare. */
sl@0	1295	{
sl@0	1296	for ( ; n != 0; n--, cs++, ct++) {
sl@0	1297	if (cs != ct) {
sl@0	1298	Tcl_UniChar lcs = Tcl_UniCharToLower(*cs);
sl@0	1299	Tcl_UniChar lct = Tcl_UniCharToLower(*ct);
sl@0	1300	if (lcs != lct) {
sl@0	1301	return (lcs - lct);
sl@0	1302	}
sl@0	1303	}
sl@0	1304	}
sl@0	1305	return 0;
sl@0	1306	}
sl@0	1307
sl@0	1308	/*
sl@0	1309	*----------------------------------------------------------------------
sl@0	1310	*
sl@0	1311	* Tcl_UniCharIsAlnum --
sl@0	1312	*
sl@0	1313	* Test if a character is an alphanumeric Unicode character.
sl@0	1314	*
sl@0	1315	* Results:
sl@0	1316	* Returns 1 if character is alphanumeric.
sl@0	1317	*
sl@0	1318	* Side effects:
sl@0	1319	* None.
sl@0	1320	*
sl@0	1321	*----------------------------------------------------------------------
sl@0	1322	*/
sl@0	1323
sl@0	1324	EXPORT_C int
sl@0	1325	Tcl_UniCharIsAlnum(ch)
sl@0	1326	int ch; /* Unicode character to test. */
sl@0	1327	{
sl@0	1328	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0	1329
sl@0	1330	return (((ALPHA_BITS \| DIGIT_BITS) >> category) & 1);
sl@0	1331	}
sl@0	1332
sl@0	1333	/*
sl@0	1334	*----------------------------------------------------------------------
sl@0	1335	*
sl@0	1336	* Tcl_UniCharIsAlpha --
sl@0	1337	*
sl@0	1338	* Test if a character is an alphabetic Unicode character.
sl@0	1339	*
sl@0	1340	* Results:
sl@0	1341	* Returns 1 if character is alphabetic.
sl@0	1342	*
sl@0	1343	* Side effects:
sl@0	1344	* None.
sl@0	1345	*
sl@0	1346	*----------------------------------------------------------------------
sl@0	1347	*/
sl@0	1348
sl@0	1349	EXPORT_C int
sl@0	1350	Tcl_UniCharIsAlpha(ch)
sl@0	1351	int ch; /* Unicode character to test. */
sl@0	1352	{
sl@0	1353	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0	1354	return ((ALPHA_BITS >> category) & 1);
sl@0	1355	}
sl@0	1356
sl@0	1357	/*
sl@0	1358	*----------------------------------------------------------------------
sl@0	1359	*
sl@0	1360	* Tcl_UniCharIsControl --
sl@0	1361	*
sl@0	1362	* Test if a character is a Unicode control character.
sl@0	1363	*
sl@0	1364	* Results:
sl@0	1365	* Returns non-zero if character is a control.
sl@0	1366	*
sl@0	1367	* Side effects:
sl@0	1368	* None.
sl@0	1369	*
sl@0	1370	*----------------------------------------------------------------------
sl@0	1371	*/
sl@0	1372
sl@0	1373	EXPORT_C int
sl@0	1374	Tcl_UniCharIsControl(ch)
sl@0	1375	int ch; /* Unicode character to test. */
sl@0	1376	{
sl@0	1377	return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
sl@0	1378	}
sl@0	1379
sl@0	1380	/*
sl@0	1381	*----------------------------------------------------------------------
sl@0	1382	*
sl@0	1383	* Tcl_UniCharIsDigit --
sl@0	1384	*
sl@0	1385	* Test if a character is a numeric Unicode character.
sl@0	1386	*
sl@0	1387	* Results:
sl@0	1388	* Returns non-zero if character is a digit.
sl@0	1389	*
sl@0	1390	* Side effects:
sl@0	1391	* None.
sl@0	1392	*
sl@0	1393	*----------------------------------------------------------------------
sl@0	1394	*/
sl@0	1395
sl@0	1396	EXPORT_C int
sl@0	1397	Tcl_UniCharIsDigit(ch)
sl@0	1398	int ch; /* Unicode character to test. */
sl@0	1399	{
sl@0	1400	return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
sl@0	1401	== DECIMAL_DIGIT_NUMBER);
sl@0	1402	}
sl@0	1403
sl@0	1404	/*
sl@0	1405	*----------------------------------------------------------------------
sl@0	1406	*
sl@0	1407	* Tcl_UniCharIsGraph --
sl@0	1408	*
sl@0	1409	* Test if a character is any Unicode print character except space.
sl@0	1410	*
sl@0	1411	* Results:
sl@0	1412	* Returns non-zero if character is printable, but not space.
sl@0	1413	*
sl@0	1414	* Side effects:
sl@0	1415	* None.
sl@0	1416	*
sl@0	1417	*----------------------------------------------------------------------
sl@0	1418	*/
sl@0	1419
sl@0	1420	EXPORT_C int
sl@0	1421	Tcl_UniCharIsGraph(ch)
sl@0	1422	int ch; /* Unicode character to test. */
sl@0	1423	{
sl@0	1424	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0	1425	return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
sl@0	1426	}
sl@0	1427
sl@0	1428	/*
sl@0	1429	*----------------------------------------------------------------------
sl@0	1430	*
sl@0	1431	* Tcl_UniCharIsLower --
sl@0	1432	*
sl@0	1433	* Test if a character is a lowercase Unicode character.
sl@0	1434	*
sl@0	1435	* Results:
sl@0	1436	* Returns non-zero if character is lowercase.
sl@0	1437	*
sl@0	1438	* Side effects:
sl@0	1439	* None.
sl@0	1440	*
sl@0	1441	*----------------------------------------------------------------------
sl@0	1442	*/
sl@0	1443
sl@0	1444	EXPORT_C int
sl@0	1445	Tcl_UniCharIsLower(ch)
sl@0	1446	int ch; /* Unicode character to test. */
sl@0	1447	{
sl@0	1448	return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
sl@0	1449	}
sl@0	1450
sl@0	1451	/*
sl@0	1452	*----------------------------------------------------------------------
sl@0	1453	*
sl@0	1454	* Tcl_UniCharIsPrint --
sl@0	1455	*
sl@0	1456	* Test if a character is a Unicode print character.
sl@0	1457	*
sl@0	1458	* Results:
sl@0	1459	* Returns non-zero if character is printable.
sl@0	1460	*
sl@0	1461	* Side effects:
sl@0	1462	* None.
sl@0	1463	*
sl@0	1464	*----------------------------------------------------------------------
sl@0	1465	*/
sl@0	1466
sl@0	1467	EXPORT_C int
sl@0	1468	Tcl_UniCharIsPrint(ch)
sl@0	1469	int ch; /* Unicode character to test. */
sl@0	1470	{
sl@0	1471	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0	1472	return ((PRINT_BITS >> category) & 1);
sl@0	1473	}
sl@0	1474
sl@0	1475	/*
sl@0	1476	*----------------------------------------------------------------------
sl@0	1477	*
sl@0	1478	* Tcl_UniCharIsPunct --
sl@0	1479	*
sl@0	1480	* Test if a character is a Unicode punctuation character.
sl@0	1481	*
sl@0	1482	* Results:
sl@0	1483	* Returns non-zero if character is punct.
sl@0	1484	*
sl@0	1485	* Side effects:
sl@0	1486	* None.
sl@0	1487	*
sl@0	1488	*----------------------------------------------------------------------
sl@0	1489	*/
sl@0	1490
sl@0	1491	EXPORT_C int
sl@0	1492	Tcl_UniCharIsPunct(ch)
sl@0	1493	int ch; /* Unicode character to test. */
sl@0	1494	{
sl@0	1495	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0	1496	return ((PUNCT_BITS >> category) & 1);
sl@0	1497	}
sl@0	1498
sl@0	1499	/*
sl@0	1500	*----------------------------------------------------------------------
sl@0	1501	*
sl@0	1502	* Tcl_UniCharIsSpace --
sl@0	1503	*
sl@0	1504	* Test if a character is a whitespace Unicode character.
sl@0	1505	*
sl@0	1506	* Results:
sl@0	1507	* Returns non-zero if character is a space.
sl@0	1508	*
sl@0	1509	* Side effects:
sl@0	1510	* None.
sl@0	1511	*
sl@0	1512	*----------------------------------------------------------------------
sl@0	1513	*/
sl@0	1514
sl@0	1515	EXPORT_C int
sl@0	1516	Tcl_UniCharIsSpace(ch)
sl@0	1517	int ch; /* Unicode character to test. */
sl@0	1518	{
sl@0	1519	register int category;
sl@0	1520
sl@0	1521	/*
sl@0	1522	* If the character is within the first 127 characters, just use the
sl@0	1523	* standard C function, otherwise consult the Unicode table.
sl@0	1524	*/
sl@0	1525
sl@0	1526	if (ch < 0x80) {
sl@0	1527	return isspace(UCHAR(ch)); /* INTL: ISO space */
sl@0	1528	} else {
sl@0	1529	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0	1530	return ((SPACE_BITS >> category) & 1);
sl@0	1531	}
sl@0	1532	}
sl@0	1533
sl@0	1534	/*
sl@0	1535	*----------------------------------------------------------------------
sl@0	1536	*
sl@0	1537	* Tcl_UniCharIsUpper --
sl@0	1538	*
sl@0	1539	* Test if a character is a uppercase Unicode character.
sl@0	1540	*
sl@0	1541	* Results:
sl@0	1542	* Returns non-zero if character is uppercase.
sl@0	1543	*
sl@0	1544	* Side effects:
sl@0	1545	* None.
sl@0	1546	*
sl@0	1547	*----------------------------------------------------------------------
sl@0	1548	*/
sl@0	1549
sl@0	1550	EXPORT_C int
sl@0	1551	Tcl_UniCharIsUpper(ch)
sl@0	1552	int ch; /* Unicode character to test. */
sl@0	1553	{
sl@0	1554	return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
sl@0	1555	}
sl@0	1556
sl@0	1557	/*
sl@0	1558	*----------------------------------------------------------------------
sl@0	1559	*
sl@0	1560	* Tcl_UniCharIsWordChar --
sl@0	1561	*
sl@0	1562	* Test if a character is alphanumeric or a connector punctuation
sl@0	1563	* mark.
sl@0	1564	*
sl@0	1565	* Results:
sl@0	1566	* Returns 1 if character is a word character.
sl@0	1567	*
sl@0	1568	* Side effects:
sl@0	1569	* None.
sl@0	1570	*
sl@0	1571	*----------------------------------------------------------------------
sl@0	1572	*/
sl@0	1573
sl@0	1574	EXPORT_C int
sl@0	1575	Tcl_UniCharIsWordChar(ch)
sl@0	1576	int ch; /* Unicode character to test. */
sl@0	1577	{
sl@0	1578	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
sl@0	1579
sl@0	1580	return (((ALPHA_BITS \| DIGIT_BITS \| CONNECTOR_BITS) >> category) & 1);
sl@0	1581	}
sl@0	1582
sl@0	1583	/*
sl@0	1584	*----------------------------------------------------------------------
sl@0	1585	*
sl@0	1586	* Tcl_UniCharCaseMatch --
sl@0	1587	*
sl@0	1588	* See if a particular Unicode string matches a particular pattern.
sl@0	1589	* Allows case insensitivity. This is the Unicode equivalent of
sl@0	1590	* the char* Tcl_StringCaseMatch. The UniChar strings must be
sl@0	1591	* NULL-terminated. This has no provision for counted UniChar
sl@0	1592	* strings, thus should not be used where NULLs are expected in the
sl@0	1593	* UniChar string. Use TclUniCharMatch where possible.
sl@0	1594	*
sl@0	1595	* Results:
sl@0	1596	* The return value is 1 if string matches pattern, and
sl@0	1597	* 0 otherwise. The matching operation permits the following
sl@0	1598	* special characters in the pattern: *?\[] (see the manual
sl@0	1599	* entry for details on what these mean).
sl@0	1600	*
sl@0	1601	* Side effects:
sl@0	1602	* None.
sl@0	1603	*
sl@0	1604	*----------------------------------------------------------------------
sl@0	1605	*/
sl@0	1606
sl@0	1607	EXPORT_C int
sl@0	1608	Tcl_UniCharCaseMatch(string, pattern, nocase)
sl@0	1609	CONST Tcl_UniChar string; / Unicode String. */
sl@0	1610	CONST Tcl_UniChar pattern; / Pattern, which may contain special
sl@0	1611	* characters. */
sl@0	1612	int nocase; /* 0 for case sensitive, 1 for insensitive */
sl@0	1613	{
sl@0	1614	Tcl_UniChar ch1, p;
sl@0	1615
sl@0	1616	while (1) {
sl@0	1617	p = *pattern;
sl@0	1618
sl@0	1619	/*
sl@0	1620	* See if we're at the end of both the pattern and the string. If
sl@0	1621	* so, we succeeded. If we're at the end of the pattern but not at
sl@0	1622	* the end of the string, we failed.
sl@0	1623	*/
sl@0	1624
sl@0	1625	if (p == 0) {
sl@0	1626	return (*string == 0);
sl@0	1627	}
sl@0	1628	if ((string == 0) && (p != '')) {
sl@0	1629	return 0;
sl@0	1630	}
sl@0	1631
sl@0	1632	/*
sl@0	1633	* Check for a "*" as the next pattern character. It matches any
sl@0	1634	* substring. We handle this by skipping all the characters up to the
sl@0	1635	* next matching one in the pattern, and then calling ourselves
sl@0	1636	* recursively for each postfix of string, until either we match or we
sl@0	1637	* reach the end of the string.
sl@0	1638	*/
sl@0	1639
sl@0	1640	if (p == '*') {
sl@0	1641	/*
sl@0	1642	* Skip all successive *'s in the pattern
sl@0	1643	*/
sl@0	1644	while ((++pattern) == '') {}
sl@0	1645	p = *pattern;
sl@0	1646	if (p == 0) {
sl@0	1647	return 1;
sl@0	1648	}
sl@0	1649	if (nocase) {
sl@0	1650	p = Tcl_UniCharToLower(p);
sl@0	1651	}
sl@0	1652	while (1) {
sl@0	1653	/*
sl@0	1654	* Optimization for matching - cruise through the string
sl@0	1655	* quickly if the next char in the pattern isn't a special
sl@0	1656	* character
sl@0	1657	*/
sl@0	1658	if ((p != '[') && (p != '?') && (p != '\\')) {
sl@0	1659	if (nocase) {
sl@0	1660	while (string && (p != string)
sl@0	1661	&& (p != Tcl_UniCharToLower(*string))) {
sl@0	1662	string++;
sl@0	1663	}
sl@0	1664	} else {
sl@0	1665	while (string && (p != string)) { string++; }
sl@0	1666	}
sl@0	1667	}
sl@0	1668	if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {
sl@0	1669	return 1;
sl@0	1670	}
sl@0	1671	if (*string == 0) {
sl@0	1672	return 0;
sl@0	1673	}
sl@0	1674	string++;
sl@0	1675	}
sl@0	1676	}
sl@0	1677
sl@0	1678	/*
sl@0	1679	* Check for a "?" as the next pattern character. It matches
sl@0	1680	* any single character.
sl@0	1681	*/
sl@0	1682
sl@0	1683	if (p == '?') {
sl@0	1684	pattern++;
sl@0	1685	string++;
sl@0	1686	continue;
sl@0	1687	}
sl@0	1688
sl@0	1689	/*
sl@0	1690	* Check for a "[" as the next pattern character. It is followed
sl@0	1691	* by a list of characters that are acceptable, or by a range
sl@0	1692	* (two characters separated by "-").
sl@0	1693	*/
sl@0	1694
sl@0	1695	if (p == '[') {
sl@0	1696	Tcl_UniChar startChar, endChar;
sl@0	1697
sl@0	1698	pattern++;
sl@0	1699	ch1 = (nocase ? Tcl_UniCharToLower(string) : string);
sl@0	1700	string++;
sl@0	1701	while (1) {
sl@0	1702	if ((pattern == ']') \|\| (pattern == 0)) {
sl@0	1703	return 0;
sl@0	1704	}
sl@0	1705	startChar = (nocase ? Tcl_UniCharToLower(pattern) : pattern);
sl@0	1706	pattern++;
sl@0	1707	if (*pattern == '-') {
sl@0	1708	pattern++;
sl@0	1709	if (*pattern == 0) {
sl@0	1710	return 0;
sl@0	1711	}
sl@0	1712	endChar = (nocase ? Tcl_UniCharToLower(*pattern)
sl@0	1713	: *pattern);
sl@0	1714	pattern++;
sl@0	1715	if (((startChar <= ch1) && (ch1 <= endChar))
sl@0	1716	\|\| ((endChar <= ch1) && (ch1 <= startChar))) {
sl@0	1717	/*
sl@0	1718	* Matches ranges of form [a-z] or [z-a].
sl@0	1719	*/
sl@0	1720	break;
sl@0	1721	}
sl@0	1722	} else if (startChar == ch1) {
sl@0	1723	break;
sl@0	1724	}
sl@0	1725	}
sl@0	1726	while (*pattern != ']') {
sl@0	1727	if (*pattern == 0) {
sl@0	1728	pattern--;
sl@0	1729	break;
sl@0	1730	}
sl@0	1731	pattern++;
sl@0	1732	}
sl@0	1733	pattern++;
sl@0	1734	continue;
sl@0	1735	}
sl@0	1736
sl@0	1737	/*
sl@0	1738	* If the next pattern character is '\', just strip off the '\'
sl@0	1739	* so we do exact matching on the character that follows.
sl@0	1740	*/
sl@0	1741
sl@0	1742	if (p == '\\') {
sl@0	1743	if (*(++pattern) == '\0') {
sl@0	1744	return 0;
sl@0	1745	}
sl@0	1746	}
sl@0	1747
sl@0	1748	/*
sl@0	1749	* There's no special character. Just make sure that the next
sl@0	1750	* bytes of each string match.
sl@0	1751	*/
sl@0	1752
sl@0	1753	if (nocase) {
sl@0	1754	if (Tcl_UniCharToLower(string) != Tcl_UniCharToLower(pattern)) {
sl@0	1755	return 0;
sl@0	1756	}
sl@0	1757	} else if (string != pattern) {
sl@0	1758	return 0;
sl@0	1759	}
sl@0	1760	string++;
sl@0	1761	pattern++;
sl@0	1762	}
sl@0	1763	}
sl@0	1764
sl@0	1765	/*
sl@0	1766	*----------------------------------------------------------------------
sl@0	1767	*
sl@0	1768	* TclUniCharMatch --
sl@0	1769	*
sl@0	1770	* See if a particular Unicode string matches a particular pattern.
sl@0	1771	* Allows case insensitivity. This is the Unicode equivalent of the
sl@0	1772	* char* Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch
sl@0	1773	* uses counted Strings, so embedded NULLs are allowed.
sl@0	1774	*
sl@0	1775	* Results:
sl@0	1776	* The return value is 1 if string matches pattern, and
sl@0	1777	* 0 otherwise. The matching operation permits the following
sl@0	1778	* special characters in the pattern: *?\[] (see the manual
sl@0	1779	* entry for details on what these mean).
sl@0	1780	*
sl@0	1781	* Side effects:
sl@0	1782	* None.
sl@0	1783	*
sl@0	1784	*----------------------------------------------------------------------
sl@0	1785	*/
sl@0	1786
sl@0	1787	int
sl@0	1788	TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
sl@0	1789	CONST Tcl_UniChar string; / Unicode String. */
sl@0	1790	int strLen; /* length of String */
sl@0	1791	CONST Tcl_UniChar pattern; / Pattern, which may contain special
sl@0	1792	* characters. */
sl@0	1793	int ptnLen; /* length of Pattern */
sl@0	1794	int nocase; /* 0 for case sensitive, 1 for insensitive */
sl@0	1795	{
sl@0	1796	CONST Tcl_UniChar stringEnd, patternEnd;
sl@0	1797	Tcl_UniChar p;
sl@0	1798
sl@0	1799	stringEnd = string + strLen;
sl@0	1800	patternEnd = pattern + ptnLen;
sl@0	1801
sl@0	1802	while (1) {
sl@0	1803	/*
sl@0	1804	* See if we're at the end of both the pattern and the string. If
sl@0	1805	* so, we succeeded. If we're at the end of the pattern but not at
sl@0	1806	* the end of the string, we failed.
sl@0	1807	*/
sl@0	1808
sl@0	1809	if (pattern == patternEnd) {
sl@0	1810	return (string == stringEnd);
sl@0	1811	}
sl@0	1812	p = *pattern;
sl@0	1813	if ((string == stringEnd) && (p != '*')) {
sl@0	1814	return 0;
sl@0	1815	}
sl@0	1816
sl@0	1817	/*
sl@0	1818	* Check for a "*" as the next pattern character. It matches any
sl@0	1819	* substring. We handle this by skipping all the characters up to the
sl@0	1820	* next matching one in the pattern, and then calling ourselves
sl@0	1821	* recursively for each postfix of string, until either we match or we
sl@0	1822	* reach the end of the string.
sl@0	1823	*/
sl@0	1824
sl@0	1825	if (p == '*') {
sl@0	1826	/*
sl@0	1827	* Skip all successive *'s in the pattern
sl@0	1828	*/
sl@0	1829	while ((++pattern) == '') {}
sl@0	1830	if (pattern == patternEnd) {
sl@0	1831	return 1;
sl@0	1832	}
sl@0	1833	p = *pattern;
sl@0	1834	if (nocase) {
sl@0	1835	p = Tcl_UniCharToLower(p);
sl@0	1836	}
sl@0	1837	while (1) {
sl@0	1838	/*
sl@0	1839	* Optimization for matching - cruise through the string
sl@0	1840	* quickly if the next char in the pattern isn't a special
sl@0	1841	* character
sl@0	1842	*/
sl@0	1843	if ((p != '[') && (p != '?') && (p != '\\')) {
sl@0	1844	if (nocase) {
sl@0	1845	while ((string < stringEnd) && (p != *string)
sl@0	1846	&& (p != Tcl_UniCharToLower(*string))) {
sl@0	1847	string++;
sl@0	1848	}
sl@0	1849	} else {
sl@0	1850	while ((string < stringEnd) && (p != *string)) {
sl@0	1851	string++;
sl@0	1852	}
sl@0	1853	}
sl@0	1854	}
sl@0	1855	if (TclUniCharMatch(string, stringEnd - string,
sl@0	1856	pattern, patternEnd - pattern, nocase)) {
sl@0	1857	return 1;
sl@0	1858	}
sl@0	1859	if (string == stringEnd) {
sl@0	1860	return 0;
sl@0	1861	}
sl@0	1862	string++;
sl@0	1863	}
sl@0	1864	}
sl@0	1865
sl@0	1866	/*
sl@0	1867	* Check for a "?" as the next pattern character. It matches
sl@0	1868	* any single character.
sl@0	1869	*/
sl@0	1870
sl@0	1871	if (p == '?') {
sl@0	1872	pattern++;
sl@0	1873	string++;
sl@0	1874	continue;
sl@0	1875	}
sl@0	1876
sl@0	1877	/*
sl@0	1878	* Check for a "[" as the next pattern character. It is followed
sl@0	1879	* by a list of characters that are acceptable, or by a range
sl@0	1880	* (two characters separated by "-").
sl@0	1881	*/
sl@0	1882
sl@0	1883	if (p == '[') {
sl@0	1884	Tcl_UniChar ch1, startChar, endChar;
sl@0	1885
sl@0	1886	pattern++;
sl@0	1887	ch1 = (nocase ? Tcl_UniCharToLower(string) : string);
sl@0	1888	string++;
sl@0	1889	while (1) {
sl@0	1890	if ((*pattern == ']') \|\| (pattern == patternEnd)) {
sl@0	1891	return 0;
sl@0	1892	}
sl@0	1893	startChar = (nocase ? Tcl_UniCharToLower(pattern) : pattern);
sl@0	1894	pattern++;
sl@0	1895	if (*pattern == '-') {
sl@0	1896	pattern++;
sl@0	1897	if (pattern == patternEnd) {
sl@0	1898	return 0;
sl@0	1899	}
sl@0	1900	endChar = (nocase ? Tcl_UniCharToLower(*pattern)
sl@0	1901	: *pattern);
sl@0	1902	pattern++;
sl@0	1903	if (((startChar <= ch1) && (ch1 <= endChar))
sl@0	1904	\|\| ((endChar <= ch1) && (ch1 <= startChar))) {
sl@0	1905	/*
sl@0	1906	* Matches ranges of form [a-z] or [z-a].
sl@0	1907	*/
sl@0	1908	break;
sl@0	1909	}
sl@0	1910	} else if (startChar == ch1) {
sl@0	1911	break;
sl@0	1912	}
sl@0	1913	}
sl@0	1914	while (*pattern != ']') {
sl@0	1915	if (pattern == patternEnd) {
sl@0	1916	pattern--;
sl@0	1917	break;
sl@0	1918	}
sl@0	1919	pattern++;
sl@0	1920	}
sl@0	1921	pattern++;
sl@0	1922	continue;
sl@0	1923	}
sl@0	1924
sl@0	1925	/*
sl@0	1926	* If the next pattern character is '\', just strip off the '\'
sl@0	1927	* so we do exact matching on the character that follows.
sl@0	1928	*/
sl@0	1929
sl@0	1930	if (p == '\\') {
sl@0	1931	if (++pattern == patternEnd) {
sl@0	1932	return 0;
sl@0	1933	}
sl@0	1934	}
sl@0	1935
sl@0	1936	/*
sl@0	1937	* There's no special character. Just make sure that the next
sl@0	1938	* bytes of each string match.
sl@0	1939	*/
sl@0	1940
sl@0	1941	if (nocase) {
sl@0	1942	if (Tcl_UniCharToLower(string) != Tcl_UniCharToLower(pattern)) {
sl@0	1943	return 0;
sl@0	1944	}
sl@0	1945	} else if (string != pattern) {
sl@0	1946	return 0;
sl@0	1947	}
sl@0	1948	string++;
sl@0	1949	pattern++;
sl@0	1950	}
sl@0	1951	}

author	sl@SLION-WIN7.fritz.box
	Fri, 15 Jun 2012 03:10:57 +0200
changeset 0	bde4ae8d615e
permissions	-rw-r--r--