sl@0: /* sl@0: * tclRegexp.c -- sl@0: * sl@0: * This file contains the public interfaces to the Tcl regular sl@0: * expression mechanism. sl@0: * sl@0: * Copyright (c) 1998 by Sun Microsystems, Inc. sl@0: * Copyright (c) 1998-1999 by Scriptics Corporation. sl@0: * Portions Copyright (c) 2007-2008 Nokia Corporation and/or its subsidiaries. All rights reserved. sl@0: * sl@0: * See the file "license.terms" for information on usage and redistribution sl@0: * of this file, and for a DISCLAIMER OF ALL WARRANTIES. sl@0: * sl@0: * RCS: @(#) $Id: tclRegexp.c,v 1.14.4.2 2006/04/07 01:14:28 hobbs Exp $ sl@0: */ sl@0: sl@0: #include "tclInt.h" sl@0: #include "tclPort.h" sl@0: #include "tclRegexp.h" sl@0: #if defined(__SYMBIAN32__) && defined(__WINSCW__) sl@0: #include "tclSymbianGlobals.h" sl@0: #define dataKey getdataKey(6) sl@0: #endif sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * The routines in this file use Henry Spencer's regular expression sl@0: * package contained in the following additional source files: sl@0: * sl@0: * regc_color.c regc_cvec.c regc_lex.c sl@0: * regc_nfa.c regcomp.c regcustom.h sl@0: * rege_dfa.c regerror.c regerrs.h sl@0: * regex.h regexec.c regfree.c sl@0: * regfronts.c regguts.h sl@0: * sl@0: * Copyright (c) 1998 Henry Spencer. All rights reserved. sl@0: * sl@0: * Development of this software was funded, in part, by Cray Research Inc., sl@0: * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics sl@0: * Corporation, none of whom are responsible for the results. The author sl@0: * thanks all of them. sl@0: * sl@0: * Redistribution and use in source and binary forms -- with or without sl@0: * modification -- are permitted for any purpose, provided that sl@0: * redistributions in source form retain this entire copyright notice and sl@0: * indicate the origin and nature of any modifications. sl@0: * sl@0: * I'd appreciate being given credit for this package in the documentation sl@0: * of software which uses it, but that is not a requirement. sl@0: * sl@0: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, sl@0: * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY sl@0: * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL sl@0: * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, sl@0: * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, sl@0: * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; sl@0: * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, sl@0: * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR sl@0: * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF sl@0: * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sl@0: * sl@0: * *** NOTE: this code has been altered slightly for use in Tcl: *** sl@0: * *** 1. Names have been changed, e.g. from re_comp to *** sl@0: * *** TclRegComp, to avoid clashes with other *** sl@0: * *** regexp implementations used by applications. *** sl@0: */ sl@0: sl@0: /* sl@0: * Thread local storage used to maintain a per-thread cache of compiled sl@0: * regular expressions. sl@0: */ sl@0: sl@0: #define NUM_REGEXPS 30 sl@0: sl@0: typedef struct ThreadSpecificData { sl@0: int initialized; /* Set to 1 when the module is initialized. */ sl@0: char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled sl@0: * regular expression patterns. NULL sl@0: * means that this slot isn't used. sl@0: * Malloc-ed. */ sl@0: int patLengths[NUM_REGEXPS];/* Number of non-null characters in sl@0: * corresponding entry in patterns. sl@0: * -1 means entry isn't used. */ sl@0: struct TclRegexp *regexps[NUM_REGEXPS]; sl@0: /* Compiled forms of above strings. Also sl@0: * malloc-ed, or NULL if not in use yet. */ sl@0: } ThreadSpecificData; sl@0: sl@0: #if !defined(__SYMBIAN32__) || !defined(__WINSCW__) sl@0: static Tcl_ThreadDataKey dataKey; sl@0: #endif sl@0: sl@0: /* sl@0: * Declarations for functions used only in this file. sl@0: */ sl@0: sl@0: static TclRegexp * CompileRegexp _ANSI_ARGS_((Tcl_Interp *interp, sl@0: CONST char *pattern, int length, int flags)); sl@0: static void DupRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *srcPtr, sl@0: Tcl_Obj *copyPtr)); sl@0: static void FinalizeRegexp _ANSI_ARGS_((ClientData clientData)); sl@0: static void FreeRegexp _ANSI_ARGS_((TclRegexp *regexpPtr)); sl@0: static void FreeRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *objPtr)); sl@0: static int RegExpExecUniChar _ANSI_ARGS_((Tcl_Interp *interp, sl@0: Tcl_RegExp re, CONST Tcl_UniChar *uniString, sl@0: int numChars, int nmatches, int flags)); sl@0: static int SetRegexpFromAny _ANSI_ARGS_((Tcl_Interp *interp, sl@0: Tcl_Obj *objPtr)); sl@0: sl@0: /* sl@0: * The regular expression Tcl object type. This serves as a cache sl@0: * of the compiled form of the regular expression. sl@0: */ sl@0: sl@0: static Tcl_ObjType tclRegexpType = { sl@0: "regexp", /* name */ sl@0: FreeRegexpInternalRep, /* freeIntRepProc */ sl@0: DupRegexpInternalRep, /* dupIntRepProc */ sl@0: NULL, /* updateStringProc */ sl@0: SetRegexpFromAny /* setFromAnyProc */ sl@0: }; sl@0: sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_RegExpCompile -- sl@0: * sl@0: * Compile a regular expression into a form suitable for fast sl@0: * matching. This procedure is DEPRECATED in favor of the sl@0: * object version of the command. sl@0: * sl@0: * Results: sl@0: * The return value is a pointer to the compiled form of string, sl@0: * suitable for passing to Tcl_RegExpExec. This compiled form sl@0: * is only valid up until the next call to this procedure, so sl@0: * don't keep these around for a long time! If an error occurred sl@0: * while compiling the pattern, then NULL is returned and an error sl@0: * message is left in the interp's result. sl@0: * sl@0: * Side effects: sl@0: * Updates the cache of compiled regexps. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C Tcl_RegExp sl@0: Tcl_RegExpCompile(interp, string) sl@0: Tcl_Interp *interp; /* For use in error reporting and sl@0: * to access the interp regexp cache. */ sl@0: CONST char *string; /* String for which to produce sl@0: * compiled regular expression. */ sl@0: { sl@0: return (Tcl_RegExp) CompileRegexp(interp, string, (int) strlen(string), sl@0: REG_ADVANCED); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_RegExpExec -- sl@0: * sl@0: * Execute the regular expression matcher using a compiled form sl@0: * of a regular expression and save information about any match sl@0: * that is found. sl@0: * sl@0: * Results: sl@0: * If an error occurs during the matching operation then -1 sl@0: * is returned and the interp's result contains an error message. sl@0: * Otherwise the return value is 1 if a matching range is sl@0: * found and 0 if there is no matching range. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_RegExpExec(interp, re, string, start) sl@0: Tcl_Interp *interp; /* Interpreter to use for error reporting. */ sl@0: Tcl_RegExp re; /* Compiled regular expression; must have sl@0: * been returned by previous call to sl@0: * Tcl_GetRegExpFromObj. */ sl@0: CONST char *string; /* String against which to match re. */ sl@0: CONST char *start; /* If string is part of a larger string, sl@0: * this identifies beginning of larger sl@0: * string, so that "^" won't match. */ sl@0: { sl@0: int flags, result, numChars; sl@0: TclRegexp *regexp = (TclRegexp *)re; sl@0: Tcl_DString ds; sl@0: CONST Tcl_UniChar *ustr; sl@0: sl@0: /* sl@0: * If the starting point is offset from the beginning of the buffer, sl@0: * then we need to tell the regexp engine not to match "^". sl@0: */ sl@0: sl@0: if (string > start) { sl@0: flags = REG_NOTBOL; sl@0: } else { sl@0: flags = 0; sl@0: } sl@0: sl@0: /* sl@0: * Remember the string for use by Tcl_RegExpRange(). sl@0: */ sl@0: sl@0: regexp->string = string; sl@0: regexp->objPtr = NULL; sl@0: sl@0: /* sl@0: * Convert the string to Unicode and perform the match. sl@0: */ sl@0: sl@0: Tcl_DStringInit(&ds); sl@0: ustr = Tcl_UtfToUniCharDString(string, -1, &ds); sl@0: numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar); sl@0: result = RegExpExecUniChar(interp, re, ustr, numChars, sl@0: -1 /* nmatches */, flags); sl@0: Tcl_DStringFree(&ds); sl@0: sl@0: return result; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * Tcl_RegExpRange -- sl@0: * sl@0: * Returns pointers describing the range of a regular expression match, sl@0: * or one of the subranges within the match. sl@0: * sl@0: * Results: sl@0: * The variables at *startPtr and *endPtr are modified to hold the sl@0: * addresses of the endpoints of the range given by index. If the sl@0: * specified range doesn't exist then NULLs are returned. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C void sl@0: Tcl_RegExpRange(re, index, startPtr, endPtr) sl@0: Tcl_RegExp re; /* Compiled regular expression that has sl@0: * been passed to Tcl_RegExpExec. */ sl@0: int index; /* 0 means give the range of the entire sl@0: * match, > 0 means give the range of sl@0: * a matching subrange. */ sl@0: CONST char **startPtr; /* Store address of first character in sl@0: * (sub-) range here. */ sl@0: CONST char **endPtr; /* Store address of character just after last sl@0: * in (sub-) range here. */ sl@0: { sl@0: TclRegexp *regexpPtr = (TclRegexp *) re; sl@0: CONST char *string; sl@0: sl@0: if ((size_t) index > regexpPtr->re.re_nsub) { sl@0: *startPtr = *endPtr = NULL; sl@0: } else if (regexpPtr->matches[index].rm_so < 0) { sl@0: *startPtr = *endPtr = NULL; sl@0: } else { sl@0: if (regexpPtr->objPtr) { sl@0: string = Tcl_GetString(regexpPtr->objPtr); sl@0: } else { sl@0: string = regexpPtr->string; sl@0: } sl@0: *startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so); sl@0: *endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo); sl@0: } sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * RegExpExecUniChar -- sl@0: * sl@0: * Execute the regular expression matcher using a compiled form of a sl@0: * regular expression and save information about any match that is sl@0: * found. sl@0: * sl@0: * Results: sl@0: * If an error occurs during the matching operation then -1 is sl@0: * returned and an error message is left in interp's result. sl@0: * Otherwise the return value is 1 if a matching range was found or sl@0: * 0 if there was no matching range. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: static int sl@0: RegExpExecUniChar(interp, re, wString, numChars, nmatches, flags) sl@0: Tcl_Interp *interp; /* Interpreter to use for error reporting. */ sl@0: Tcl_RegExp re; /* Compiled regular expression; returned by sl@0: * a previous call to Tcl_GetRegExpFromObj */ sl@0: CONST Tcl_UniChar *wString; /* String against which to match re. */ sl@0: int numChars; /* Length of Tcl_UniChar string (must sl@0: * be >= 0). */ sl@0: int nmatches; /* How many subexpression matches (counting sl@0: * the whole match as subexpression 0) are sl@0: * of interest. -1 means "don't know". */ sl@0: int flags; /* Regular expression flags. */ sl@0: { sl@0: int status; sl@0: TclRegexp *regexpPtr = (TclRegexp *) re; sl@0: size_t last = regexpPtr->re.re_nsub + 1; sl@0: size_t nm = last; sl@0: sl@0: if (nmatches >= 0 && (size_t) nmatches < nm) { sl@0: nm = (size_t) nmatches; sl@0: } sl@0: sl@0: status = TclReExec(®expPtr->re, wString, (size_t) numChars, sl@0: ®expPtr->details, nm, regexpPtr->matches, flags); sl@0: sl@0: /* sl@0: * Check for errors. sl@0: */ sl@0: sl@0: if (status != REG_OKAY) { sl@0: if (status == REG_NOMATCH) { sl@0: return 0; sl@0: } sl@0: if (interp != NULL) { sl@0: TclRegError(interp, "error while matching regular expression: ", sl@0: status); sl@0: } sl@0: return -1; sl@0: } sl@0: return 1; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * TclRegExpRangeUniChar -- sl@0: * sl@0: * Returns pointers describing the range of a regular expression match, sl@0: * or one of the subranges within the match, or the hypothetical range sl@0: * represented by the rm_extend field of the rm_detail_t. sl@0: * sl@0: * Results: sl@0: * The variables at *startPtr and *endPtr are modified to hold the sl@0: * offsets of the endpoints of the range given by index. If the sl@0: * specified range doesn't exist then -1s are supplied. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *--------------------------------------------------------------------------- sl@0: */ sl@0: sl@0: void sl@0: TclRegExpRangeUniChar(re, index, startPtr, endPtr) sl@0: Tcl_RegExp re; /* Compiled regular expression that has sl@0: * been passed to Tcl_RegExpExec. */ sl@0: int index; /* 0 means give the range of the entire sl@0: * match, > 0 means give the range of sl@0: * a matching subrange, -1 means the sl@0: * range of the rm_extend field. */ sl@0: int *startPtr; /* Store address of first character in sl@0: * (sub-) range here. */ sl@0: int *endPtr; /* Store address of character just after last sl@0: * in (sub-) range here. */ sl@0: { sl@0: TclRegexp *regexpPtr = (TclRegexp *) re; sl@0: sl@0: if ((regexpPtr->flags®_EXPECT) && index == -1) { sl@0: *startPtr = regexpPtr->details.rm_extend.rm_so; sl@0: *endPtr = regexpPtr->details.rm_extend.rm_eo; sl@0: } else if ((size_t) index > regexpPtr->re.re_nsub) { sl@0: *startPtr = -1; sl@0: *endPtr = -1; sl@0: } else { sl@0: *startPtr = regexpPtr->matches[index].rm_so; sl@0: *endPtr = regexpPtr->matches[index].rm_eo; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_RegExpMatch -- sl@0: * sl@0: * See if a string matches a regular expression. sl@0: * sl@0: * Results: sl@0: * If an error occurs during the matching operation then -1 sl@0: * is returned and the interp's result contains an error message. sl@0: * Otherwise the return value is 1 if "string" matches "pattern" sl@0: * and 0 otherwise. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_RegExpMatch(interp, string, pattern) sl@0: Tcl_Interp *interp; /* Used for error reporting. May be NULL. */ sl@0: CONST char *string; /* String. */ sl@0: CONST char *pattern; /* Regular expression to match against sl@0: * string. */ sl@0: { sl@0: Tcl_RegExp re; sl@0: sl@0: re = Tcl_RegExpCompile(interp, pattern); sl@0: if (re == NULL) { sl@0: return -1; sl@0: } sl@0: return Tcl_RegExpExec(interp, re, string, string); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_RegExpExecObj -- sl@0: * sl@0: * Execute a precompiled regexp against the given object. sl@0: * sl@0: * Results: sl@0: * If an error occurs during the matching operation then -1 sl@0: * is returned and the interp's result contains an error message. sl@0: * Otherwise the return value is 1 if "string" matches "pattern" sl@0: * and 0 otherwise. sl@0: * sl@0: * Side effects: sl@0: * Converts the object to a Unicode object. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_RegExpExecObj(interp, re, objPtr, offset, nmatches, flags) sl@0: Tcl_Interp *interp; /* Interpreter to use for error reporting. */ sl@0: Tcl_RegExp re; /* Compiled regular expression; must have sl@0: * been returned by previous call to sl@0: * Tcl_GetRegExpFromObj. */ sl@0: Tcl_Obj *objPtr; /* String against which to match re. */ sl@0: int offset; /* Character index that marks where matching sl@0: * should begin. */ sl@0: int nmatches; /* How many subexpression matches (counting sl@0: * the whole match as subexpression 0) are sl@0: * of interest. -1 means all of them. */ sl@0: int flags; /* Regular expression execution flags. */ sl@0: { sl@0: TclRegexp *regexpPtr = (TclRegexp *) re; sl@0: Tcl_UniChar *udata; sl@0: int length; sl@0: sl@0: /* sl@0: * Save the target object so we can extract strings from it later. sl@0: */ sl@0: sl@0: regexpPtr->string = NULL; sl@0: regexpPtr->objPtr = objPtr; sl@0: sl@0: udata = Tcl_GetUnicodeFromObj(objPtr, &length); sl@0: sl@0: if (offset > length) { sl@0: offset = length; sl@0: } sl@0: udata += offset; sl@0: length -= offset; sl@0: sl@0: return RegExpExecUniChar(interp, re, udata, length, nmatches, flags); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_RegExpMatchObj -- sl@0: * sl@0: * See if an object matches a regular expression. sl@0: * sl@0: * Results: sl@0: * If an error occurs during the matching operation then -1 sl@0: * is returned and the interp's result contains an error message. sl@0: * Otherwise the return value is 1 if "string" matches "pattern" sl@0: * and 0 otherwise. sl@0: * sl@0: * Side effects: sl@0: * Changes the internal rep of the pattern and string objects. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C int sl@0: Tcl_RegExpMatchObj(interp, stringObj, patternObj) sl@0: Tcl_Interp *interp; /* Used for error reporting. May be NULL. */ sl@0: Tcl_Obj *stringObj; /* Object containing the String to search. */ sl@0: Tcl_Obj *patternObj; /* Regular expression to match against sl@0: * string. */ sl@0: { sl@0: Tcl_RegExp re; sl@0: sl@0: re = Tcl_GetRegExpFromObj(interp, patternObj, sl@0: TCL_REG_ADVANCED | TCL_REG_NOSUB); sl@0: if (re == NULL) { sl@0: return -1; sl@0: } sl@0: return Tcl_RegExpExecObj(interp, re, stringObj, 0 /* offset */, sl@0: 0 /* nmatches */, 0 /* flags */); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_RegExpGetInfo -- sl@0: * sl@0: * Retrieve information about the current match. sl@0: * sl@0: * Results: sl@0: * None. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C void sl@0: Tcl_RegExpGetInfo(regexp, infoPtr) sl@0: Tcl_RegExp regexp; /* Pattern from which to get subexpressions. */ sl@0: Tcl_RegExpInfo *infoPtr; /* Match information is stored here. */ sl@0: { sl@0: TclRegexp *regexpPtr = (TclRegexp *) regexp; sl@0: sl@0: infoPtr->nsubs = regexpPtr->re.re_nsub; sl@0: infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches; sl@0: infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * Tcl_GetRegExpFromObj -- sl@0: * sl@0: * Compile a regular expression into a form suitable for fast sl@0: * matching. This procedure caches the result in a Tcl_Obj. sl@0: * sl@0: * Results: sl@0: * The return value is a pointer to the compiled form of string, sl@0: * suitable for passing to Tcl_RegExpExec. If an error occurred sl@0: * while compiling the pattern, then NULL is returned and an error sl@0: * message is left in the interp's result. sl@0: * sl@0: * Side effects: sl@0: * Updates the native rep of the Tcl_Obj. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: EXPORT_C Tcl_RegExp sl@0: Tcl_GetRegExpFromObj(interp, objPtr, flags) sl@0: Tcl_Interp *interp; /* For use in error reporting, and to access sl@0: * the interp regexp cache. */ sl@0: Tcl_Obj *objPtr; /* Object whose string rep contains regular sl@0: * expression pattern. Internal rep will be sl@0: * changed to compiled form of this regular sl@0: * expression. */ sl@0: int flags; /* Regular expression compilation flags. */ sl@0: { sl@0: int length; sl@0: Tcl_ObjType *typePtr; sl@0: TclRegexp *regexpPtr; sl@0: char *pattern; sl@0: sl@0: typePtr = objPtr->typePtr; sl@0: regexpPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; sl@0: sl@0: if ((typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) { sl@0: pattern = Tcl_GetStringFromObj(objPtr, &length); sl@0: sl@0: regexpPtr = CompileRegexp(interp, pattern, length, flags); sl@0: if (regexpPtr == NULL) { sl@0: return NULL; sl@0: } sl@0: sl@0: /* sl@0: * Add a reference to the regexp so it will persist even if it is sl@0: * pushed out of the current thread's regexp cache. This reference sl@0: * will be removed when the object's internal rep is freed. sl@0: */ sl@0: sl@0: regexpPtr->refCount++; sl@0: sl@0: /* sl@0: * Free the old representation and set our type. sl@0: */ sl@0: sl@0: if ((typePtr != NULL) && (typePtr->freeIntRepProc != NULL)) { sl@0: (*typePtr->freeIntRepProc)(objPtr); sl@0: } sl@0: objPtr->internalRep.otherValuePtr = (VOID *) regexpPtr; sl@0: objPtr->typePtr = &tclRegexpType; sl@0: } sl@0: return (Tcl_RegExp) regexpPtr; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * TclRegAbout -- sl@0: * sl@0: * Return information about a compiled regular expression. sl@0: * sl@0: * Results: sl@0: * The return value is -1 for failure, 0 for success, although at sl@0: * the moment there's nothing that could fail. On success, a list sl@0: * is left in the interp's result: first element is the subexpression sl@0: * count, second is a list of re_info bit names. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: int sl@0: TclRegAbout(interp, re) sl@0: Tcl_Interp *interp; /* For use in variable assignment. */ sl@0: Tcl_RegExp re; /* The compiled regular expression. */ sl@0: { sl@0: TclRegexp *regexpPtr = (TclRegexp *)re; sl@0: char buf[TCL_INTEGER_SPACE]; sl@0: static struct infoname { sl@0: int bit; sl@0: char *text; sl@0: } infonames[] = { sl@0: {REG_UBACKREF, "REG_UBACKREF"}, sl@0: {REG_ULOOKAHEAD, "REG_ULOOKAHEAD"}, sl@0: {REG_UBOUNDS, "REG_UBOUNDS"}, sl@0: {REG_UBRACES, "REG_UBRACES"}, sl@0: {REG_UBSALNUM, "REG_UBSALNUM"}, sl@0: {REG_UPBOTCH, "REG_UPBOTCH"}, sl@0: {REG_UBBS, "REG_UBBS"}, sl@0: {REG_UNONPOSIX, "REG_UNONPOSIX"}, sl@0: {REG_UUNSPEC, "REG_UUNSPEC"}, sl@0: {REG_UUNPORT, "REG_UUNPORT"}, sl@0: {REG_ULOCALE, "REG_ULOCALE"}, sl@0: {REG_UEMPTYMATCH, "REG_UEMPTYMATCH"}, sl@0: {REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE"}, sl@0: {REG_USHORTEST, "REG_USHORTEST"}, sl@0: {0, ""} sl@0: }; sl@0: struct infoname *inf; sl@0: int n; sl@0: sl@0: Tcl_ResetResult(interp); sl@0: sl@0: sprintf(buf, "%u", (unsigned)(regexpPtr->re.re_nsub)); sl@0: Tcl_AppendElement(interp, buf); sl@0: sl@0: /* sl@0: * Must count bits before generating list, because we must know sl@0: * whether {} are needed before we start appending names. sl@0: */ sl@0: n = 0; sl@0: for (inf = infonames; inf->bit != 0; inf++) { sl@0: if (regexpPtr->re.re_info&inf->bit) { sl@0: n++; sl@0: } sl@0: } sl@0: if (n != 1) { sl@0: Tcl_AppendResult(interp, " {", NULL); sl@0: } sl@0: for (inf = infonames; inf->bit != 0; inf++) { sl@0: if (regexpPtr->re.re_info&inf->bit) { sl@0: Tcl_AppendElement(interp, inf->text); sl@0: } sl@0: } sl@0: if (n != 1) { sl@0: Tcl_AppendResult(interp, "}", NULL); sl@0: } sl@0: sl@0: return 0; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * TclRegError -- sl@0: * sl@0: * Generate an error message based on the regexp status code. sl@0: * sl@0: * Results: sl@0: * Places an error in the interpreter. sl@0: * sl@0: * Side effects: sl@0: * Sets errorCode as well. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: void sl@0: TclRegError(interp, msg, status) sl@0: Tcl_Interp *interp; /* Interpreter for error reporting. */ sl@0: CONST char *msg; /* Message to prepend to error. */ sl@0: int status; /* Status code to report. */ sl@0: { sl@0: char buf[100]; /* ample in practice */ sl@0: char cbuf[100]; /* lots in practice */ sl@0: size_t n; sl@0: char *p; sl@0: sl@0: Tcl_ResetResult(interp); sl@0: n = TclReError(status, (regex_t *)NULL, buf, sizeof(buf)); sl@0: p = (n > sizeof(buf)) ? "..." : ""; sl@0: Tcl_AppendResult(interp, msg, buf, p, NULL); sl@0: sl@0: sprintf(cbuf, "%d", status); sl@0: (VOID) TclReError(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf)); sl@0: Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL); sl@0: } sl@0: sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * FreeRegexpInternalRep -- sl@0: * sl@0: * Deallocate the storage associated with a regexp object's internal sl@0: * representation. sl@0: * sl@0: * Results: sl@0: * None. sl@0: * sl@0: * Side effects: sl@0: * Frees the compiled regular expression. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: static void sl@0: FreeRegexpInternalRep(objPtr) sl@0: Tcl_Obj *objPtr; /* Regexp object with internal rep to free. */ sl@0: { sl@0: TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; sl@0: sl@0: /* sl@0: * If this is the last reference to the regexp, free it. sl@0: */ sl@0: sl@0: if (--(regexpRepPtr->refCount) <= 0) { sl@0: FreeRegexp(regexpRepPtr); sl@0: } sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * DupRegexpInternalRep -- sl@0: * sl@0: * We copy the reference to the compiled regexp and bump its sl@0: * reference count. sl@0: * sl@0: * Results: sl@0: * None. sl@0: * sl@0: * Side effects: sl@0: * Increments the reference count of the regexp. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: static void sl@0: DupRegexpInternalRep(srcPtr, copyPtr) sl@0: Tcl_Obj *srcPtr; /* Object with internal rep to copy. */ sl@0: Tcl_Obj *copyPtr; /* Object with internal rep to set. */ sl@0: { sl@0: TclRegexp *regexpPtr = (TclRegexp *) srcPtr->internalRep.otherValuePtr; sl@0: regexpPtr->refCount++; sl@0: copyPtr->internalRep.otherValuePtr = srcPtr->internalRep.otherValuePtr; sl@0: copyPtr->typePtr = &tclRegexpType; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * SetRegexpFromAny -- sl@0: * sl@0: * Attempt to generate a compiled regular expression for the Tcl object sl@0: * "objPtr". sl@0: * sl@0: * Results: sl@0: * The return value is TCL_OK or TCL_ERROR. If an error occurs during sl@0: * conversion, an error message is left in the interpreter's result sl@0: * unless "interp" is NULL. sl@0: * sl@0: * Side effects: sl@0: * If no error occurs, a regular expression is stored as "objPtr"s sl@0: * internal representation. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: static int sl@0: SetRegexpFromAny(interp, objPtr) sl@0: Tcl_Interp *interp; /* Used for error reporting if not NULL. */ sl@0: Tcl_Obj *objPtr; /* The object to convert. */ sl@0: { sl@0: if (Tcl_GetRegExpFromObj(interp, objPtr, REG_ADVANCED) == NULL) { sl@0: return TCL_ERROR; sl@0: } sl@0: return TCL_OK; sl@0: } sl@0: sl@0: /* sl@0: *--------------------------------------------------------------------------- sl@0: * sl@0: * CompileRegexp -- sl@0: * sl@0: * Attempt to compile the given regexp pattern. If the compiled sl@0: * regular expression can be found in the per-thread cache, it sl@0: * will be used instead of compiling a new copy. sl@0: * sl@0: * Results: sl@0: * The return value is a pointer to a newly allocated TclRegexp sl@0: * that represents the compiled pattern, or NULL if the pattern sl@0: * could not be compiled. If NULL is returned, an error message is sl@0: * left in the interp's result. sl@0: * sl@0: * Side effects: sl@0: * The thread-local regexp cache is updated and a new TclRegexp may sl@0: * be allocated. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: static TclRegexp * sl@0: CompileRegexp(interp, string, length, flags) sl@0: Tcl_Interp *interp; /* Used for error reporting if not NULL. */ sl@0: CONST char *string; /* The regexp to compile (UTF-8). */ sl@0: int length; /* The length of the string in bytes. */ sl@0: int flags; /* Compilation flags. */ sl@0: { sl@0: TclRegexp *regexpPtr; sl@0: CONST Tcl_UniChar *uniString; sl@0: int numChars; sl@0: Tcl_DString stringBuf; sl@0: int status, i; sl@0: ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); sl@0: sl@0: if (!tsdPtr->initialized) { sl@0: tsdPtr->initialized = 1; sl@0: Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL); sl@0: } sl@0: sl@0: /* sl@0: * This routine maintains a second-level regular expression cache in sl@0: * addition to the per-object regexp cache. The per-thread cache is needed sl@0: * to handle the case where for various reasons the object is lost between sl@0: * invocations of the regexp command, but the literal pattern is the same. sl@0: */ sl@0: sl@0: /* sl@0: * Check the per-thread compiled regexp cache. We can only reuse sl@0: * a regexp if it has the same pattern and the same flags. sl@0: */ sl@0: sl@0: for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) { sl@0: if ((length == tsdPtr->patLengths[i]) sl@0: && (tsdPtr->regexps[i]->flags == flags) sl@0: && (strcmp(string, tsdPtr->patterns[i]) == 0)) { sl@0: /* sl@0: * Move the matched pattern to the first slot in the sl@0: * cache and shift the other patterns down one position. sl@0: */ sl@0: sl@0: if (i != 0) { sl@0: int j; sl@0: char *cachedString; sl@0: sl@0: cachedString = tsdPtr->patterns[i]; sl@0: regexpPtr = tsdPtr->regexps[i]; sl@0: for (j = i-1; j >= 0; j--) { sl@0: tsdPtr->patterns[j+1] = tsdPtr->patterns[j]; sl@0: tsdPtr->patLengths[j+1] = tsdPtr->patLengths[j]; sl@0: tsdPtr->regexps[j+1] = tsdPtr->regexps[j]; sl@0: } sl@0: tsdPtr->patterns[0] = cachedString; sl@0: tsdPtr->patLengths[0] = length; sl@0: tsdPtr->regexps[0] = regexpPtr; sl@0: } sl@0: return tsdPtr->regexps[0]; sl@0: } sl@0: } sl@0: sl@0: /* sl@0: * This is a new expression, so compile it and add it to the cache. sl@0: */ sl@0: sl@0: regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp)); sl@0: regexpPtr->objPtr = NULL; sl@0: regexpPtr->string = NULL; sl@0: regexpPtr->details.rm_extend.rm_so = -1; sl@0: regexpPtr->details.rm_extend.rm_eo = -1; sl@0: sl@0: /* sl@0: * Get the up-to-date string representation and map to unicode. sl@0: */ sl@0: sl@0: Tcl_DStringInit(&stringBuf); sl@0: uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf); sl@0: numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar); sl@0: sl@0: /* sl@0: * Compile the string and check for errors. sl@0: */ sl@0: sl@0: regexpPtr->flags = flags; sl@0: status = TclReComp(®expPtr->re, uniString, (size_t) numChars, flags); sl@0: Tcl_DStringFree(&stringBuf); sl@0: sl@0: if (status != REG_OKAY) { sl@0: /* sl@0: * Clean up and report errors in the interpreter, if possible. sl@0: */ sl@0: sl@0: ckfree((char *)regexpPtr); sl@0: if (interp) { sl@0: TclRegError(interp, sl@0: "couldn't compile regular expression pattern: ", sl@0: status); sl@0: } sl@0: return NULL; sl@0: } sl@0: sl@0: /* sl@0: * Allocate enough space for all of the subexpressions, plus one sl@0: * extra for the entire pattern. sl@0: */ sl@0: sl@0: regexpPtr->matches = (regmatch_t *) ckalloc( sl@0: sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1)); sl@0: sl@0: /* sl@0: * Initialize the refcount to one initially, since it is in the cache. sl@0: */ sl@0: sl@0: regexpPtr->refCount = 1; sl@0: sl@0: /* sl@0: * Free the last regexp, if necessary, and make room at the head of the sl@0: * list for the new regexp. sl@0: */ sl@0: sl@0: if (tsdPtr->patterns[NUM_REGEXPS-1] != NULL) { sl@0: TclRegexp *oldRegexpPtr = tsdPtr->regexps[NUM_REGEXPS-1]; sl@0: if (--(oldRegexpPtr->refCount) <= 0) { sl@0: FreeRegexp(oldRegexpPtr); sl@0: } sl@0: ckfree(tsdPtr->patterns[NUM_REGEXPS-1]); sl@0: } sl@0: for (i = NUM_REGEXPS - 2; i >= 0; i--) { sl@0: tsdPtr->patterns[i+1] = tsdPtr->patterns[i]; sl@0: tsdPtr->patLengths[i+1] = tsdPtr->patLengths[i]; sl@0: tsdPtr->regexps[i+1] = tsdPtr->regexps[i]; sl@0: } sl@0: tsdPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1)); sl@0: strcpy(tsdPtr->patterns[0], string); sl@0: tsdPtr->patLengths[0] = length; sl@0: tsdPtr->regexps[0] = regexpPtr; sl@0: sl@0: return regexpPtr; sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * FreeRegexp -- sl@0: * sl@0: * Release the storage associated with a TclRegexp. sl@0: * sl@0: * Results: sl@0: * None. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: static void sl@0: FreeRegexp(regexpPtr) sl@0: TclRegexp *regexpPtr; /* Compiled regular expression to free. */ sl@0: { sl@0: TclReFree(®expPtr->re); sl@0: if (regexpPtr->matches) { sl@0: ckfree((char *) regexpPtr->matches); sl@0: } sl@0: ckfree((char *) regexpPtr); sl@0: } sl@0: sl@0: /* sl@0: *---------------------------------------------------------------------- sl@0: * sl@0: * FinalizeRegexp -- sl@0: * sl@0: * Release the storage associated with the per-thread regexp sl@0: * cache. sl@0: * sl@0: * Results: sl@0: * None. sl@0: * sl@0: * Side effects: sl@0: * None. sl@0: * sl@0: *---------------------------------------------------------------------- sl@0: */ sl@0: sl@0: static void sl@0: FinalizeRegexp(clientData) sl@0: ClientData clientData; /* Not used. */ sl@0: { sl@0: int i; sl@0: TclRegexp *regexpPtr; sl@0: ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); sl@0: sl@0: for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) { sl@0: regexpPtr = tsdPtr->regexps[i]; sl@0: if (--(regexpPtr->refCount) <= 0) { sl@0: FreeRegexp(regexpPtr); sl@0: } sl@0: ckfree(tsdPtr->patterns[i]); sl@0: tsdPtr->patterns[i] = NULL; sl@0: } sl@0: /* sl@0: * We may find ourselves reinitialized if another finalization routine sl@0: * invokes regexps. sl@0: */ sl@0: tsdPtr->initialized = 0; sl@0: }