os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/generic/regcomp.c
Update contrib.
2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 * forward declarations, up here so forward datatypes etc. are defined early
38 /* =====^!^===== begin forwards =====^!^===== */
39 /* automatically gathered by fwd; do not hand-edit */
40 /* === regcomp.c === */
41 int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int));
42 static VOID moresubs _ANSI_ARGS_((struct vars *, int));
43 static int freev _ANSI_ARGS_((struct vars *, int));
44 static VOID makesearch _ANSI_ARGS_((struct vars *, struct nfa *));
45 static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
46 static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int));
47 static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *));
48 static VOID nonword _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
49 static VOID word _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
50 static int scannum _ANSI_ARGS_((struct vars *));
51 static VOID repeat _ANSI_ARGS_((struct vars *, struct state *, struct state *, int, int));
52 static VOID bracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
53 static VOID cbracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
54 static VOID brackpart _ANSI_ARGS_((struct vars *, struct state *, struct state *));
55 static chr *scanplain _ANSI_ARGS_((struct vars *));
56 static VOID leaders _ANSI_ARGS_((struct vars *, struct cvec *));
57 static VOID onechr _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
58 static VOID dovec _ANSI_ARGS_((struct vars *, struct cvec *, struct state *, struct state *));
59 static celt nextleader _ANSI_ARGS_((struct vars *, pchr, pchr));
60 static VOID wordchrs _ANSI_ARGS_((struct vars *));
61 static struct subre *subre _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
62 static VOID freesubre _ANSI_ARGS_((struct vars *, struct subre *));
63 static VOID freesrnode _ANSI_ARGS_((struct vars *, struct subre *));
64 static VOID optst _ANSI_ARGS_((struct vars *, struct subre *));
65 static int numst _ANSI_ARGS_((struct subre *, int));
66 static VOID markst _ANSI_ARGS_((struct subre *));
67 static VOID cleanst _ANSI_ARGS_((struct vars *));
68 static long nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
69 static long nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
70 static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int));
71 static VOID freelacons _ANSI_ARGS_((struct subre *, int));
72 static VOID rfree _ANSI_ARGS_((regex_t *));
73 static VOID dump _ANSI_ARGS_((regex_t *, FILE *));
74 static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int));
75 static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int));
76 static char *stid _ANSI_ARGS_((struct subre *, char *, size_t));
77 /* === regc_lex.c === */
78 static VOID lexstart _ANSI_ARGS_((struct vars *));
79 static VOID prefixes _ANSI_ARGS_((struct vars *));
80 static VOID lexnest _ANSI_ARGS_((struct vars *, chr *, chr *));
81 static VOID lexword _ANSI_ARGS_((struct vars *));
82 static int next _ANSI_ARGS_((struct vars *));
83 static int lexescape _ANSI_ARGS_((struct vars *));
84 static chr lexdigits _ANSI_ARGS_((struct vars *, int, int, int));
85 static int brenext _ANSI_ARGS_((struct vars *, pchr));
86 static VOID skip _ANSI_ARGS_((struct vars *));
87 static chr newline _ANSI_ARGS_((NOPARMS));
89 static chr *ch _ANSI_ARGS_((NOPARMS));
91 static chr chrnamed _ANSI_ARGS_((struct vars *, chr *, chr *, pchr));
92 /* === regc_color.c === */
93 static VOID initcm _ANSI_ARGS_((struct vars *, struct colormap *));
94 static VOID freecm _ANSI_ARGS_((struct colormap *));
95 static VOID cmtreefree _ANSI_ARGS_((struct colormap *, union tree *, int));
96 static color setcolor _ANSI_ARGS_((struct colormap *, pchr, pcolor));
97 static color maxcolor _ANSI_ARGS_((struct colormap *));
98 static color newcolor _ANSI_ARGS_((struct colormap *));
99 static VOID freecolor _ANSI_ARGS_((struct colormap *, pcolor));
100 static color pseudocolor _ANSI_ARGS_((struct colormap *));
101 static color subcolor _ANSI_ARGS_((struct colormap *, pchr c));
102 static color newsub _ANSI_ARGS_((struct colormap *, pcolor));
103 static VOID subrange _ANSI_ARGS_((struct vars *, pchr, pchr, struct state *, struct state *));
104 static VOID subblock _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
105 static VOID okcolors _ANSI_ARGS_((struct nfa *, struct colormap *));
106 static VOID colorchain _ANSI_ARGS_((struct colormap *, struct arc *));
107 static VOID uncolorchain _ANSI_ARGS_((struct colormap *, struct arc *));
108 static int singleton _ANSI_ARGS_((struct colormap *, pchr c));
109 static VOID rainbow _ANSI_ARGS_((struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *));
110 static VOID colorcomplement _ANSI_ARGS_((struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *));
112 static VOID dumpcolors _ANSI_ARGS_((struct colormap *, FILE *));
113 static VOID fillcheck _ANSI_ARGS_((struct colormap *, union tree *, int, FILE *));
114 static VOID dumpchr _ANSI_ARGS_((pchr, FILE *));
116 /* === regc_nfa.c === */
117 static struct nfa *newnfa _ANSI_ARGS_((struct vars *, struct colormap *, struct nfa *));
118 static VOID freenfa _ANSI_ARGS_((struct nfa *));
119 static struct state *newstate _ANSI_ARGS_((struct nfa *));
120 static struct state *newfstate _ANSI_ARGS_((struct nfa *, int flag));
121 static VOID dropstate _ANSI_ARGS_((struct nfa *, struct state *));
122 static VOID freestate _ANSI_ARGS_((struct nfa *, struct state *));
123 static VOID destroystate _ANSI_ARGS_((struct nfa *, struct state *));
124 static VOID newarc _ANSI_ARGS_((struct nfa *, int, pcolor, struct state *, struct state *));
125 static struct arc *allocarc _ANSI_ARGS_((struct nfa *, struct state *));
126 static VOID freearc _ANSI_ARGS_((struct nfa *, struct arc *));
127 static struct arc *findarc _ANSI_ARGS_((struct state *, int, pcolor));
128 static VOID cparc _ANSI_ARGS_((struct nfa *, struct arc *, struct state *, struct state *));
129 static VOID moveins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
130 static VOID copyins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
131 static VOID moveouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
132 static VOID copyouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
133 static VOID cloneouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, int));
134 static VOID delsub _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
135 static VOID deltraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
136 static VOID dupnfa _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, struct state *));
137 static VOID duptraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
138 static VOID cleartraverse _ANSI_ARGS_((struct nfa *, struct state *));
139 static VOID specialcolors _ANSI_ARGS_((struct nfa *));
140 static long optimize _ANSI_ARGS_((struct nfa *, FILE *));
141 static VOID pullback _ANSI_ARGS_((struct nfa *, FILE *));
142 static int pull _ANSI_ARGS_((struct nfa *, struct arc *));
143 static VOID pushfwd _ANSI_ARGS_((struct nfa *, FILE *));
144 static int push _ANSI_ARGS_((struct nfa *, struct arc *));
145 #define INCOMPATIBLE 1 /* destroys arc */
146 #define SATISFIED 2 /* constraint satisfied */
147 #define COMPATIBLE 3 /* compatible but not satisfied yet */
148 static int combine _ANSI_ARGS_((struct arc *, struct arc *));
149 static VOID fixempties _ANSI_ARGS_((struct nfa *, FILE *));
150 static int unempty _ANSI_ARGS_((struct nfa *, struct arc *));
151 static VOID cleanup _ANSI_ARGS_((struct nfa *));
152 static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
153 static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
154 static long analyze _ANSI_ARGS_((struct nfa *));
155 static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *));
156 static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *));
157 static VOID freecnfa _ANSI_ARGS_((struct cnfa *));
158 static VOID dumpnfa _ANSI_ARGS_((struct nfa *, FILE *));
160 static VOID dumpstate _ANSI_ARGS_((struct state *, FILE *));
161 static VOID dumparcs _ANSI_ARGS_((struct state *, FILE *));
162 static int dumprarcs _ANSI_ARGS_((struct arc *, struct state *, FILE *, int));
163 static VOID dumparc _ANSI_ARGS_((struct arc *, struct state *, FILE *));
165 static VOID dumpcnfa _ANSI_ARGS_((struct cnfa *, FILE *));
167 static VOID dumpcstate _ANSI_ARGS_((int, struct carc *, struct cnfa *, FILE *));
169 /* === regc_cvec.c === */
170 static struct cvec *newcvec _ANSI_ARGS_((int, int, int));
171 static struct cvec *clearcvec _ANSI_ARGS_((struct cvec *));
172 static VOID addchr _ANSI_ARGS_((struct cvec *, pchr));
173 static VOID addrange _ANSI_ARGS_((struct cvec *, pchr, pchr));
174 static VOID addmcce _ANSI_ARGS_((struct cvec *, chr *, chr *));
175 static int haschr _ANSI_ARGS_((struct cvec *, pchr));
176 static struct cvec *getcvec _ANSI_ARGS_((struct vars *, int, int, int));
177 static VOID freecvec _ANSI_ARGS_((struct cvec *));
178 /* === regc_locale.c === */
179 static int nmcces _ANSI_ARGS_((struct vars *));
180 static int nleaders _ANSI_ARGS_((struct vars *));
181 static struct cvec *allmcces _ANSI_ARGS_((struct vars *, struct cvec *));
182 static celt element _ANSI_ARGS_((struct vars *, chr *, chr *));
183 static struct cvec *range _ANSI_ARGS_((struct vars *, celt, celt, int));
184 static int before _ANSI_ARGS_((celt, celt));
185 static struct cvec *eclass _ANSI_ARGS_((struct vars *, celt, int));
186 static struct cvec *cclass _ANSI_ARGS_((struct vars *, chr *, chr *, int));
187 static struct cvec *allcases _ANSI_ARGS_((struct vars *, pchr));
188 static int cmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
189 static int casecmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
190 /* automatically gathered by fwd; do not hand-edit */
191 /* =====^!^===== end forwards =====^!^===== */
195 /* internal variables, bundled for easy passing around */
198 chr *now; /* scan pointer into string */
199 chr *stop; /* end of string */
200 chr *savenow; /* saved now and stop for "subroutine call" */
202 int err; /* error code (0 if none) */
203 int cflags; /* copy of compile flags */
204 int lasttype; /* type of previous token */
205 int nexttype; /* type of next token */
206 chr nextvalue; /* value (if any) of next token */
207 int lexcon; /* lexical context type (see lex.c) */
208 int nsubexp; /* subexpression count */
209 struct subre **subs; /* subRE pointer vector */
210 size_t nsubs; /* length of vector */
211 struct subre *sub10[10]; /* initial vector, enough for most */
212 struct nfa *nfa; /* the NFA */
213 struct colormap *cm; /* character color map */
214 color nlcolor; /* color of newline */
215 struct state *wordchrs; /* state in nfa holding word-char outarcs */
216 struct subre *tree; /* subexpression tree */
217 struct subre *treechain; /* all tree nodes allocated */
218 struct subre *treefree; /* any free tree nodes */
219 int ntree; /* number of tree nodes */
220 struct cvec *cv; /* interface cvec */
221 struct cvec *cv2; /* utility cvec */
222 struct cvec *mcces; /* collating-element information */
223 # define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
224 struct state *mccepbegin; /* in nfa, start of MCCE prototypes */
225 struct state *mccepend; /* in nfa, end of MCCE prototypes */
226 struct subre *lacons; /* lookahead-constraint vector */
227 int nlacons; /* size of lacons */
230 /* parsing macros; most know that `v' is the struct vars pointer */
231 #define NEXT() (next(v)) /* advance by one token */
232 #define SEE(t) (v->nexttype == (t)) /* is next token this? */
233 #define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
234 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
235 #define ISERR() VISERR(v)
236 #define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
238 #define ERR(e) VERR(v, e) /* record an error */
239 #define NOERR() {if (ISERR()) return;} /* if error seen, return */
240 #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
241 #define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
242 #define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */
243 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
244 #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
246 /* token type codes, some also used as NFA arc types */
247 #define EMPTY 'n' /* no token present */
248 #define EOS 'e' /* end of string */
249 #define PLAIN 'p' /* ordinary character */
250 #define DIGIT 'd' /* digit (in bound) */
251 #define BACKREF 'b' /* back reference */
252 #define COLLEL 'I' /* start of [. */
253 #define ECLASS 'E' /* start of [= */
254 #define CCLASS 'C' /* start of [: */
255 #define END 'X' /* end of [. [= [: */
256 #define RANGE 'R' /* - within [] which might be range delim. */
257 #define LACON 'L' /* lookahead constraint subRE */
258 #define AHEAD 'a' /* color-lookahead arc */
259 #define BEHIND 'r' /* color-lookbehind arc */
260 #define WBDRY 'w' /* word boundary constraint */
261 #define NWBDRY 'W' /* non-word-boundary constraint */
262 #define SBEGIN 'A' /* beginning of string (even if not BOL) */
263 #define SEND 'Z' /* end of string (even if not EOL) */
264 #define PREFER 'P' /* length preference */
266 /* is an arc colored, and hence on a color chain? */
267 #define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
272 /* static function list */
273 static struct fns functions = {
274 rfree, /* regfree insides */
280 - compile - compile regular expression
281 ^ int compile(regex_t *, CONST chr *, size_t, int);
284 compile(re, string, len, flags)
291 struct vars *v = &var;
295 FILE *debug = (flags®_PROGRESS) ? stdout : (FILE *)NULL;
296 # define CNOERR() { if (ISERR()) return freev(v, v->err); }
300 if (re == NULL || string == NULL)
302 if ((flags®_QUOTE) &&
303 (flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)))
305 if (!(flags®_EXTENDED) && (flags®_ADVF))
308 /* initial setup (after which freev() is callable) */
310 v->now = (chr *)string;
311 v->stop = v->now + len;
312 v->savenow = v->savestop = NULL;
318 for (j = 0; j < v->nsubs; j++)
322 v->nlcolor = COLORLESS;
332 re->re_magic = REMAGIC;
333 re->re_info = 0; /* bits get set during parse */
334 re->re_csize = sizeof(chr);
336 re->re_fns = VS(&functions);
338 /* more complex setup, malloced things */
339 re->re_guts = VS(MALLOC(sizeof(struct guts)));
340 if (re->re_guts == NULL)
341 return freev(v, REG_ESPACE);
342 g = (struct guts *)re->re_guts;
349 v->nfa = newnfa(v, v->cm, (struct nfa *)NULL);
351 v->cv = newcvec(100, 20, 10);
353 return freev(v, REG_ESPACE);
356 v->mcces = newcvec(nleaders(v), 0, i);
358 v->mcces = allmcces(v, v->mcces);
359 leaders(v, v->mcces);
360 addmcce(v->mcces, (chr *)NULL, (chr *)NULL); /* dummy */
365 lexstart(v); /* also handles prefixes */
366 if ((v->cflags®_NLSTOP) || (v->cflags®_NLANCH)) {
367 /* assign newline a unique color */
368 v->nlcolor = subcolor(v->cm, newline());
369 okcolors(v->nfa, v->cm);
372 v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
373 assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
375 assert(v->tree != NULL);
377 /* finish setup of nfa and its subre tree */
378 specialcolors(v->nfa);
381 fprintf(debug, "\n\n\n========= RAW ==========\n");
382 dumpnfa(v->nfa, debug);
383 dumpst(v->tree, debug, 1);
386 v->ntree = numst(v->tree, 1);
390 fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
391 dumpst(v->tree, debug, 1);
394 /* build compacted NFAs for tree and lacons */
395 re->re_info |= nfatree(v, v->tree, debug);
397 assert(v->nlacons == 0 || v->lacons != NULL);
398 for (i = 1; i < v->nlacons; i++) {
400 fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
401 nfanode(v, &v->lacons[i], debug);
404 if (v->tree->flags&SHORTER)
407 /* build compacted NFAs for tree, lacons, fast search */
409 fprintf(debug, "\n\n\n========= SEARCH ==========\n");
410 /* can sacrifice main NFA now, so use it as work area */
411 (DISCARD)optimize(v->nfa, debug);
413 makesearch(v, v->nfa);
415 compact(v->nfa, &g->search);
418 /* looks okay, package it up */
419 re->re_nsub = v->nsubexp;
420 v->re = NULL; /* freev no longer frees re */
421 g->magic = GUTSMAGIC;
422 g->cflags = v->cflags;
423 g->info = re->re_info;
424 g->nsub = re->re_nsub;
428 g->compare = (v->cflags®_ICASE) ? casecmp : cmp;
429 g->lacons = v->lacons;
431 g->nlacons = v->nlacons;
441 - moresubs - enlarge subRE vector
442 ^ static VOID moresubs(struct vars *, int);
447 int wanted; /* want enough room for this one */
452 assert(wanted > 0 && (size_t)wanted >= v->nsubs);
453 n = (size_t)wanted * 3 / 2 + 1;
454 if (v->subs == v->sub10) {
455 p = (struct subre **)MALLOC(n * sizeof(struct subre *));
457 memcpy(VS(p), VS(v->subs),
458 v->nsubs * sizeof(struct subre *));
460 p = (struct subre **)REALLOC(v->subs, n*sizeof(struct subre *));
466 for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
468 assert(v->nsubs == n);
469 assert((size_t)wanted < v->nsubs);
473 - freev - free vars struct's substructures where necessary
474 * Optionally does error-number setting, and always returns error code
475 * (if any), to make error-handling code terser.
476 ^ static int freev(struct vars *, int);
485 if (v->subs != v->sub10)
490 freesubre(v, v->tree);
491 if (v->treechain != NULL)
497 if (v->mcces != NULL)
499 if (v->lacons != NULL)
500 freelacons(v->lacons, v->nlacons);
501 ERR(err); /* nop if err==0 */
507 - makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
508 * NFA must have been optimize()d already.
509 ^ static VOID makesearch(struct vars *, struct nfa *);
518 struct state *pre = nfa->pre;
523 /* no loops are needed if it's anchored */
524 for (a = pre->outs; a != NULL; a = a->outchain) {
525 assert(a->type == PLAIN);
526 if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
530 /* add implicit .* in front */
531 rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
533 /* and ^* and \A* too -- not always necessary, but harmless */
534 newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
535 newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
539 * Now here's the subtle part. Because many REs have no lookback
540 * constraints, often knowing when you were in the pre state tells
541 * you little; it's the next state(s) that are informative. But
542 * some of them may have other inarcs, i.e. it may be possible to
543 * make actual progress and then return to one of them. We must
544 * de-optimize such cases, splitting each such state into progress
545 * and no-progress states.
548 /* first, make a list of the states */
550 for (a = pre->outs; a != NULL; a = a->outchain) {
552 for (b = s->ins; b != NULL; b = b->inchain)
555 if (b != NULL) { /* must be split */
556 if (s->tmp == NULL) { /* if not already in the list */
557 /* (fixes bugs 505048, 230589, */
558 /* 840258, 504785) */
566 for (s = slist; s != NULL; s = s2) {
568 copyouts(nfa, s, s2);
569 for (a = s->ins; a != NULL; a = b) {
571 if (a->from != pre) {
572 cparc(nfa, a, a->from, s2);
577 s->tmp = NULL; /* clean up while we're at it */
582 - parse - parse an RE
583 * This is actually just the top level, which parses a bunch of branches
584 * tied together with '|'. They appear in the tree as the left children
585 * of a chain of '|' subres.
586 ^ static struct subre *parse(struct vars *, int, int, struct state *,
589 static struct subre *
590 parse(v, stopper, type, init, final)
592 int stopper; /* EOS or ')' */
593 int type; /* LACON (lookahead subRE) or PLAIN */
594 struct state *init; /* initial state */
595 struct state *final; /* final state */
597 struct state *left; /* scaffolding for branch */
599 struct subre *branches; /* top level */
600 struct subre *branch; /* current branch */
601 struct subre *t; /* temporary */
602 int firstbranch; /* is this the first branch? */
604 assert(stopper == ')' || stopper == EOS);
606 branches = subre(v, '|', LONGER, init, final);
612 /* need a place to hang it */
613 branch->right = subre(v, '|', LONGER, init, final);
615 branch = branch->right;
618 left = newstate(v->nfa);
619 right = newstate(v->nfa);
621 EMPTYARC(init, left);
622 EMPTYARC(right, final);
624 branch->left = parsebranch(v, stopper, type, left, right, 0);
626 branch->flags |= UP(branch->flags | branch->left->flags);
627 if ((branch->flags &~ branches->flags) != 0) /* new flags */
628 for (t = branches; t != branch; t = t->right)
629 t->flags |= branch->flags;
631 assert(SEE(stopper) || SEE(EOS));
634 assert(stopper == ')' && SEE(EOS));
638 /* optimize out simple cases */
639 if (branch == branches) { /* only one branch */
640 assert(branch->right == NULL);
643 freesubre(v, branches);
645 } else if (!MESSY(branches->flags)) { /* no interesting innards */
646 freesubre(v, branches->left);
647 branches->left = NULL;
648 freesubre(v, branches->right);
649 branches->right = NULL;
657 - parsebranch - parse one branch of an RE
658 * This mostly manages concatenation, working closely with parseqatom().
659 * Concatenated things are bundled up as much as possible, with separate
660 * ',' nodes introduced only when necessary due to substructure.
661 ^ static struct subre *parsebranch(struct vars *, int, int, struct state *,
662 ^ struct state *, int);
664 static struct subre *
665 parsebranch(v, stopper, type, left, right, partial)
667 int stopper; /* EOS or ')' */
668 int type; /* LACON (lookahead subRE) or PLAIN */
669 struct state *left; /* leftmost state */
670 struct state *right; /* rightmost state */
671 int partial; /* is this only part of a branch? */
673 struct state *lp; /* left end of current construct */
674 int seencontent; /* is there anything in this branch yet? */
679 t = subre(v, '=', 0, left, right); /* op '=' is tentative */
681 while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {
682 if (seencontent) { /* implicit concat operator */
683 lp = newstate(v->nfa);
685 moveins(v->nfa, right, lp);
689 /* NB, recursion in parseqatom() may swallow rest of branch */
690 parseqatom(v, stopper, type, lp, right, t);
693 if (!seencontent) { /* empty branch */
697 EMPTYARC(left, right);
704 - parseqatom - parse one quantified atom or constraint of an RE
705 * The bookkeeping near the end cooperates very closely with parsebranch();
706 * in particular, it contains a recursion that can involve parsing the rest
707 * of the branch, making this function's name somewhat inaccurate.
708 ^ static VOID parseqatom(struct vars *, int, int, struct state *,
709 ^ struct state *, struct subre *);
712 parseqatom(v, stopper, type, lp, rp, top)
714 int stopper; /* EOS or ')' */
715 int type; /* LACON (lookahead subRE) or PLAIN */
716 struct state *lp; /* left state to hang it on */
717 struct state *rp; /* right state to hang it on */
718 struct subre *top; /* subtree top */
720 struct state *s; /* temporaries for new states */
722 # define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
724 struct subre *atom; /* atom's subtree */
726 int cap; /* capturing parens? */
727 int pos; /* positive lookahead? */
728 int subno; /* capturing-parens or backref number */
730 int qprefer; /* quantifier short/long preference */
732 struct subre **atomp; /* where the pointer to atom is */
734 /* initial bookkeeping */
736 assert(lp->nouts == 0); /* must string new code */
737 assert(rp->nins == 0); /* between lp and rp */
738 subno = 0; /* just to shut lint up */
740 /* an atom or constraint... */
741 atomtype = v->nexttype;
743 /* first, constraints, which end by returning */
746 if (v->cflags®_NLANCH)
747 ARCV(BEHIND, v->nlcolor);
753 if (v->cflags®_NLANCH)
754 ARCV(AHEAD, v->nlcolor);
759 ARCV('^', 1); /* BOL */
760 ARCV('^', 0); /* or BOS */
765 ARCV('$', 1); /* EOL */
766 ARCV('$', 0); /* or EOS */
771 wordchrs(v); /* does NEXT() */
772 s = newstate(v->nfa);
774 nonword(v, BEHIND, lp, s);
775 word(v, AHEAD, s, rp);
779 wordchrs(v); /* does NEXT() */
780 s = newstate(v->nfa);
782 word(v, BEHIND, lp, s);
783 nonword(v, AHEAD, s, rp);
787 wordchrs(v); /* does NEXT() */
788 s = newstate(v->nfa);
790 nonword(v, BEHIND, lp, s);
791 word(v, AHEAD, s, rp);
792 s = newstate(v->nfa);
794 word(v, BEHIND, lp, s);
795 nonword(v, AHEAD, s, rp);
799 wordchrs(v); /* does NEXT() */
800 s = newstate(v->nfa);
802 word(v, BEHIND, lp, s);
803 word(v, AHEAD, s, rp);
804 s = newstate(v->nfa);
806 nonword(v, BEHIND, lp, s);
807 nonword(v, AHEAD, s, rp);
810 case LACON: /* lookahead constraint */
813 s = newstate(v->nfa);
814 s2 = newstate(v->nfa);
816 t = parse(v, ')', LACON, s, s2);
817 freesubre(v, t); /* internal structure irrelevant */
818 assert(SEE(')') || ISERR());
820 n = newlacon(v, s, s2, pos);
825 /* then errors, to get them out of the way */
837 /* then plain characters, and minor variants on that theme */
838 case ')': /* unbalanced paren */
839 if ((v->cflags®_ADVANCED) != REG_EXTENDED) {
843 /* legal in EREs due to specification botch */
845 /* fallthrough into case PLAIN */
847 onechr(v, v->nextvalue, lp, rp);
848 okcolors(v->nfa, v->cm);
853 if (v->nextvalue == 1)
857 assert(SEE(']') || ISERR());
861 rainbow(v->nfa, v->cm, PLAIN,
862 (v->cflags®_NLSTOP) ? v->nlcolor : COLORLESS,
866 /* and finally the ugly stuff */
867 case '(': /* value flags as capturing or non */
868 cap = (type == LACON) ? 0 : v->nextvalue;
872 if ((size_t)subno >= v->nsubs)
874 assert((size_t)subno < v->nsubs);
876 atomtype = PLAIN; /* something that's not '(' */
878 /* need new endpoints because tree will contain pointers */
879 s = newstate(v->nfa);
880 s2 = newstate(v->nfa);
885 atom = parse(v, ')', PLAIN, s, s2);
886 assert(SEE(')') || ISERR());
890 v->subs[subno] = atom;
891 t = subre(v, '(', atom->flags|CAP, lp, rp);
897 /* postpone everything else pending possible {0} */
899 case BACKREF: /* the Feature From The Black Lagoon */
900 INSIST(type != LACON, REG_ESUBREG);
901 INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
902 INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
904 assert(v->nextvalue > 0);
905 atom = subre(v, 'b', BACKR, lp, rp);
906 subno = v->nextvalue;
908 EMPTYARC(lp, rp); /* temporarily, so there's something */
913 /* ...and an atom may be followed by a quantifier */
914 switch (v->nexttype) {
918 qprefer = (v->nextvalue) ? LONGER : SHORTER;
924 qprefer = (v->nextvalue) ? LONGER : SHORTER;
930 qprefer = (v->nextvalue) ? LONGER : SHORTER;
945 /* {m,n} exercises preference, even if it's {m,m} */
946 qprefer = (v->nextvalue) ? LONGER : SHORTER;
949 /* {m} passes operand's preference through */
952 if (!SEE('}')) { /* catches errors too */
958 default: /* no quantifier */
964 /* annoying special case: {0} or {0,0} cancels everything */
965 if (m == 0 && n == 0) {
969 v->subs[subno] = NULL;
970 delsub(v->nfa, lp, rp);
975 /* if not a messy case, avoid hard part */
976 assert(!MESSY(top->flags));
977 f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
978 if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {
979 if (!(m == 1 && n == 1))
980 repeat(v, lp, rp, m, n);
988 * hard part: something messy
989 * That is, capturing parens, back reference, short/long clash, or
990 * an atom with substructure containing one of those.
993 /* now we'll need a subre for the contents even if they're boring */
995 atom = subre(v, '=', 0, lp, rp);
1000 * prepare a general-purpose state skeleton
1002 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1004 * [lp] ----> [s2] ----bypass---------------------
1006 * where bypass is an empty, and prefix is some repetitions of atom
1008 s = newstate(v->nfa); /* first, new endpoints for the atom */
1009 s2 = newstate(v->nfa);
1011 moveouts(v->nfa, lp, s);
1012 moveins(v->nfa, rp, s2);
1016 s = newstate(v->nfa); /* and spots for prefix and bypass */
1017 s2 = newstate(v->nfa);
1023 /* break remaining subRE into x{...} and what follows */
1024 t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1027 /* here we should recurse... but we must postpone that to the end */
1029 /* split top into prefix and remaining */
1030 assert(top->op == '=' && top->left == NULL && top->right == NULL);
1031 top->left = subre(v, '=', top->flags, top->begin, lp);
1035 /* if it's a backref, now is the time to replicate the subNFA */
1036 if (atomtype == BACKREF) {
1037 assert(atom->begin->nouts == 1); /* just the EMPTY */
1038 delsub(v->nfa, atom->begin, atom->end);
1039 assert(v->subs[subno] != NULL);
1040 /* and here's why the recursion got postponed: it must */
1041 /* wait until the skeleton is filled in, because it may */
1042 /* hit a backref that wants to copy the filled-in skeleton */
1043 dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1044 atom->begin, atom->end);
1048 /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1050 EMPTYARC(s2, atom->end); /* the bypass */
1051 assert(PREF(qprefer) != 0);
1052 f = COMBINE(qprefer, atom->flags);
1053 t = subre(v, '|', f, lp, atom->end);
1056 t->right = subre(v, '|', PREF(f), s2, atom->end);
1058 t->right->left = subre(v, '=', 0, s2, atom->end);
1065 /* deal with the rest of the quantifier */
1066 if (atomtype == BACKREF) {
1067 /* special case: backrefs have internal quantifiers */
1068 EMPTYARC(s, atom->begin); /* empty prefix */
1069 /* just stuff everything into atom */
1070 repeat(v, atom->begin, atom->end, m, n);
1071 atom->min = (short)m;
1072 atom->max = (short)n;
1073 atom->flags |= COMBINE(qprefer, atom->flags);
1074 } else if (m == 1 && n == 1) {
1075 /* no/vacuous quantifier: done */
1076 EMPTYARC(s, atom->begin); /* empty prefix */
1078 /* turn x{m,n} into x{m-1,n-1}x, with capturing */
1079 /* parens in only second x */
1080 dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1081 assert(m >= 1 && m != INFINITY && n >= 1);
1082 repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
1083 f = COMBINE(qprefer, atom->flags);
1084 t = subre(v, '.', f, s, atom->end); /* prefix and atom */
1086 t->left = subre(v, '=', PREF(f), s, atom->begin);
1092 /* and finally, look after that postponed recursion */
1094 if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1095 t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1097 EMPTYARC(atom->end, rp);
1098 t->right = subre(v, '=', 0, atom->end, rp);
1100 assert(SEE('|') || SEE(stopper) || SEE(EOS));
1101 t->flags |= COMBINE(t->flags, t->right->flags);
1102 top->flags |= COMBINE(top->flags, t->flags);
1106 - nonword - generate arcs for non-word-character ahead or behind
1107 ^ static VOID nonword(struct vars *, int, struct state *, struct state *);
1110 nonword(v, dir, lp, rp)
1112 int dir; /* AHEAD or BEHIND */
1116 int anchor = (dir == AHEAD) ? '$' : '^';
1118 assert(dir == AHEAD || dir == BEHIND);
1119 newarc(v->nfa, anchor, 1, lp, rp);
1120 newarc(v->nfa, anchor, 0, lp, rp);
1121 colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1122 /* (no need for special attention to \n) */
1126 - word - generate arcs for word character ahead or behind
1127 ^ static VOID word(struct vars *, int, struct state *, struct state *);
1130 word(v, dir, lp, rp)
1132 int dir; /* AHEAD or BEHIND */
1136 assert(dir == AHEAD || dir == BEHIND);
1137 cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1138 /* (no need for special attention to \n) */
1142 - scannum - scan a number
1143 ^ static int scannum(struct vars *);
1145 static int /* value, <= DUPMAX */
1151 while (SEE(DIGIT) && n < DUPMAX) {
1152 n = n*10 + v->nextvalue;
1155 if (SEE(DIGIT) || n > DUPMAX) {
1163 - repeat - replicate subNFA for quantifiers
1164 * The duplication sequences used here are chosen carefully so that any
1165 * pointers starting out pointing into the subexpression end up pointing into
1166 * the last occurrence. (Note that it may not be strung between the same
1167 * left and right end states, however!) This used to be important for the
1168 * subRE tree, although the important bits are now handled by the in-line
1169 * code in parse(), and when this is called, it doesn't matter any more.
1170 ^ static VOID repeat(struct vars *, struct state *, struct state *, int, int);
1173 repeat(v, lp, rp, m, n)
1182 # define PAIR(x, y) ((x)*4 + (y))
1183 # define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1184 CONST int rm = REDUCE(m);
1185 CONST int rn = REDUCE(n);
1189 switch (PAIR(rm, rn)) {
1190 case PAIR(0, 0): /* empty string */
1191 delsub(v->nfa, lp, rp);
1194 case PAIR(0, 1): /* do as x| */
1197 case PAIR(0, SOME): /* do as x{1,n}| */
1198 repeat(v, lp, rp, 1, n);
1202 case PAIR(0, INF): /* loop x around */
1203 s = newstate(v->nfa);
1205 moveouts(v->nfa, lp, s);
1206 moveins(v->nfa, rp, s);
1210 case PAIR(1, 1): /* no action required */
1212 case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */
1213 s = newstate(v->nfa);
1215 moveouts(v->nfa, lp, s);
1216 dupnfa(v->nfa, s, rp, lp, s);
1218 repeat(v, lp, s, 1, n-1);
1222 case PAIR(1, INF): /* add loopback arc */
1223 s = newstate(v->nfa);
1224 s2 = newstate(v->nfa);
1226 moveouts(v->nfa, lp, s);
1227 moveins(v->nfa, rp, s2);
1232 case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */
1233 s = newstate(v->nfa);
1235 moveouts(v->nfa, lp, s);
1236 dupnfa(v->nfa, s, rp, lp, s);
1238 repeat(v, lp, s, m-1, n-1);
1240 case PAIR(SOME, INF): /* do as x{m-1,}x */
1241 s = newstate(v->nfa);
1243 moveouts(v->nfa, lp, s);
1244 dupnfa(v->nfa, s, rp, lp, s);
1246 repeat(v, lp, s, m-1, n);
1255 - bracket - handle non-complemented bracket expression
1256 * Also called from cbracket for complemented bracket expressions.
1257 ^ static VOID bracket(struct vars *, struct state *, struct state *);
1267 while (!SEE(']') && !SEE(EOS))
1268 brackpart(v, lp, rp);
1269 assert(SEE(']') || ISERR());
1270 okcolors(v->nfa, v->cm);
1274 - cbracket - handle complemented bracket expression
1275 * We do it by calling bracket() with dummy endpoints, and then complementing
1276 * the result. The alternative would be to invoke rainbow(), and then delete
1277 * arcs as the b.e. is seen... but that gets messy.
1278 ^ static VOID cbracket(struct vars *, struct state *, struct state *);
1286 struct state *left = newstate(v->nfa);
1287 struct state *right = newstate(v->nfa);
1289 struct arc *a; /* arc from lp */
1290 struct arc *ba; /* arc from left, from bracket() */
1291 struct arc *pa; /* MCCE-prototype arc */
1297 bracket(v, left, right);
1298 if (v->cflags®_NLSTOP)
1299 newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1302 assert(lp->nouts == 0); /* all outarcs will be ours */
1304 /* easy part of complementing */
1305 colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1307 if (v->mcces == NULL) { /* no MCCEs -- we're done */
1308 dropstate(v->nfa, left);
1309 assert(right->nins == 0);
1310 freestate(v->nfa, right);
1314 /* but complementing gets messy in the presence of MCCEs... */
1316 for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--) {
1317 co = GETCOLOR(v->cm, *p);
1318 a = findarc(lp, PLAIN, co);
1319 ba = findarc(left, PLAIN, co);
1326 s = newstate(v->nfa);
1328 newarc(v->nfa, PLAIN, co, lp, s);
1330 pa = findarc(v->mccepbegin, PLAIN, co);
1332 if (ba == NULL) { /* easy case, need all of them */
1333 cloneouts(v->nfa, pa->to, s, rp, PLAIN);
1334 newarc(v->nfa, '$', 1, s, rp);
1335 newarc(v->nfa, '$', 0, s, rp);
1336 colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp);
1337 } else { /* must be selective */
1338 if (findarc(ba->to, '$', 1) == NULL) {
1339 newarc(v->nfa, '$', 1, s, rp);
1340 newarc(v->nfa, '$', 0, s, rp);
1341 colorcomplement(v->nfa, v->cm, AHEAD, pa->to,
1344 for (pa = pa->to->outs; pa != NULL; pa = pa->outchain)
1345 if (findarc(ba->to, PLAIN, pa->co) == NULL)
1346 newarc(v->nfa, PLAIN, pa->co, s, rp);
1347 if (s->nouts == 0) /* limit of selectivity: none */
1348 dropstate(v->nfa, s); /* frees arc too */
1353 delsub(v->nfa, left, right);
1354 assert(left->nouts == 0);
1355 freestate(v->nfa, left);
1356 assert(right->nins == 0);
1357 freestate(v->nfa, right);
1361 - brackpart - handle one item (or range) within a bracket expression
1362 ^ static VOID brackpart(struct vars *, struct state *, struct state *);
1365 brackpart(v, lp, rp)
1377 /* parse something, get rid of special cases, take shortcuts */
1378 switch (v->nexttype) {
1379 case RANGE: /* a-b-c or other botch */
1384 c[0] = v->nextvalue;
1386 /* shortcut for ordinary chr (not range, not MCCE leader) */
1387 if (!SEE(RANGE) && !ISCELEADER(v, c[0])) {
1388 onechr(v, c[0], lp, rp);
1391 startc = element(v, c, c+1);
1396 endp = scanplain(v);
1397 INSIST(startp < endp, REG_ECOLLATE);
1399 startc = element(v, startp, endp);
1404 endp = scanplain(v);
1405 INSIST(startp < endp, REG_ECOLLATE);
1407 startc = element(v, startp, endp);
1409 cv = eclass(v, startc, (v->cflags®_ICASE));
1411 dovec(v, cv, lp, rp);
1416 endp = scanplain(v);
1417 INSIST(startp < endp, REG_ECTYPE);
1419 cv = cclass(v, startp, endp, (v->cflags®_ICASE));
1421 dovec(v, cv, lp, rp);
1432 switch (v->nexttype) {
1435 c[0] = v->nextvalue;
1437 endc = element(v, c, c+1);
1442 endp = scanplain(v);
1443 INSIST(startp < endp, REG_ECOLLATE);
1445 endc = element(v, startp, endp);
1457 * Ranges are unportable. Actually, standard C does
1458 * guarantee that digits are contiguous, but making
1459 * that an exception is just too complicated.
1463 cv = range(v, startc, endc, (v->cflags®_ICASE));
1465 dovec(v, cv, lp, rp);
1469 - scanplain - scan PLAIN contents of [. etc.
1470 * Certain bits of trickery in lex.c know that this code does not try
1471 * to look past the final bracket of the [. etc.
1472 ^ static chr *scanplain(struct vars *);
1474 static chr * /* just after end of sequence */
1480 assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1484 while (SEE(PLAIN)) {
1489 assert(SEE(END) || ISERR());
1496 - leaders - process a cvec of collating elements to also include leaders
1497 * Also gives all characters involved their own colors, which is almost
1498 * certainly necessary, and sets up little disconnected subNFA.
1499 ^ static VOID leaders(struct vars *, struct cvec *);
1512 v->mccepbegin = newstate(v->nfa);
1513 v->mccepend = newstate(v->nfa);
1516 for (mcce = 0; mcce < cv->nmcces; mcce++) {
1517 p = cv->mcces[mcce];
1519 if (!haschr(cv, leader)) {
1521 s = newstate(v->nfa);
1522 newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
1524 okcolors(v->nfa, v->cm);
1526 a = findarc(v->mccepbegin, PLAIN,
1527 GETCOLOR(v->cm, leader));
1530 assert(s != v->mccepend);
1533 assert(*p != 0 && *(p+1) == 0); /* only 2-char MCCEs for now */
1534 newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
1535 okcolors(v->nfa, v->cm);
1540 - onechr - fill in arcs for a plain character, and possible case complements
1541 * This is mostly a shortcut for efficient handling of the common case.
1542 ^ static VOID onechr(struct vars *, pchr, struct state *, struct state *);
1545 onechr(v, c, lp, rp)
1551 if (!(v->cflags®_ICASE)) {
1552 newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
1556 /* rats, need general case anyway... */
1557 dovec(v, allcases(v, c), lp, rp);
1561 - dovec - fill in arcs for each element of a cvec
1562 * This one has to handle the messy cases, like MCCEs and MCCE leaders.
1563 ^ static VOID dovec(struct vars *, struct cvec *, struct state *,
1567 dovec(v, cv, lp, rp)
1580 struct arc *pa; /* arc in prototype */
1582 struct state *ps; /* state in prototype */
1584 /* need a place to store leaders, if any */
1585 if (nmcces(v) > 0) {
1586 assert(v->mcces != NULL);
1587 if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs) {
1590 v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
1594 leads = clearcvec(v->cv2);
1598 /* first, get the ordinary characters out of the way */
1599 for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
1601 if (!ISCELEADER(v, ch))
1602 newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
1604 assert(singleton(v->cm, ch));
1605 assert(leads != NULL);
1606 if (!haschr(leads, ch))
1611 /* and the ranges */
1612 for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
1615 while (from <= to && (ce = nextleader(v, from, to)) != NOCELT) {
1617 subrange(v, from, ce - 1, lp, rp);
1618 assert(singleton(v->cm, ce));
1619 assert(leads != NULL);
1620 if (!haschr(leads, ce))
1625 subrange(v, from, to, lp, rp);
1628 if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
1631 /* deal with the MCCE leaders */
1633 for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--) {
1634 co = GETCOLOR(v->cm, *p);
1635 a = findarc(lp, PLAIN, co);
1639 s = newstate(v->nfa);
1641 newarc(v->nfa, PLAIN, co, lp, s);
1644 pa = findarc(v->mccepbegin, PLAIN, co);
1647 newarc(v->nfa, '$', 1, s, rp);
1648 newarc(v->nfa, '$', 0, s, rp);
1649 colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp);
1654 for (i = 0; i < cv->nmcces; i++) {
1656 assert(singleton(v->cm, *p));
1657 if (!singleton(v->cm, *p)) {
1662 co = GETCOLOR(v->cm, ch);
1663 a = findarc(lp, PLAIN, co);
1667 s = newstate(v->nfa);
1669 newarc(v->nfa, PLAIN, co, lp, s);
1672 assert(*p != 0); /* at least two chars */
1673 assert(singleton(v->cm, *p));
1675 co = GETCOLOR(v->cm, ch);
1676 assert(*p == 0); /* and only two, for now */
1677 newarc(v->nfa, PLAIN, co, s, rp);
1683 - nextleader - find next MCCE leader within range
1684 ^ static celt nextleader(struct vars *, pchr, pchr);
1686 static celt /* NOCELT means none */
1687 nextleader(v, from, to)
1697 if (v->mcces == NULL)
1700 for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++) {
1702 if (from <= ch && ch <= to)
1703 if (it == NOCELT || ch < it)
1710 - wordchrs - set up word-chr list for word-boundary stuff, if needed
1711 * The list is kept as a bunch of arcs between two dummy states; it's
1712 * disposed of by the unreachable-states sweep in NFA optimization.
1713 * Does NEXT(). Must not be called from any unusual lexical context.
1714 * This should be reconciled with the \w etc. handling in lex.c, and
1715 * should be cleaned up to reduce dependencies on input scanning.
1716 ^ static VOID wordchrs(struct vars *);
1723 struct state *right;
1725 if (v->wordchrs != NULL) {
1726 NEXT(); /* for consistency */
1730 left = newstate(v->nfa);
1731 right = newstate(v->nfa);
1733 /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
1736 assert(v->savenow != NULL && SEE('['));
1737 bracket(v, left, right);
1738 assert((v->savenow != NULL && SEE(']')) || ISERR());
1745 - subre - allocate a subre
1746 ^ static struct subre *subre(struct vars *, int, int, struct state *,
1749 static struct subre *
1750 subre(v, op, flags, begin, end)
1754 struct state *begin;
1761 v->treefree = ret->left;
1763 ret = (struct subre *)MALLOC(sizeof(struct subre));
1768 ret->chain = v->treechain;
1772 assert(strchr("|.b(=", op) != NULL);
1778 ret->min = ret->max = 1;
1789 - freesubre - free a subRE subtree
1790 ^ static VOID freesubre(struct vars *, struct subre *);
1794 struct vars *v; /* might be NULL */
1800 if (sr->left != NULL)
1801 freesubre(v, sr->left);
1802 if (sr->right != NULL)
1803 freesubre(v, sr->right);
1809 - freesrnode - free one node in a subRE subtree
1810 ^ static VOID freesrnode(struct vars *, struct subre *);
1814 struct vars *v; /* might be NULL */
1820 if (!NULLCNFA(sr->cnfa))
1821 freecnfa(&sr->cnfa);
1825 sr->left = v->treefree;
1832 - optst - optimize a subRE subtree
1833 ^ static VOID optst(struct vars *, struct subre *);
1843 /* recurse through children */
1844 if (t->left != NULL)
1846 if (t->right != NULL)
1851 - numst - number tree nodes (assigning retry indexes)
1852 ^ static int numst(struct subre *, int);
1854 static int /* next number */
1857 int start; /* starting point for subtree numbers */
1864 t->retry = (short)i++;
1865 if (t->left != NULL)
1866 i = numst(t->left, i);
1867 if (t->right != NULL)
1868 i = numst(t->right, i);
1873 - markst - mark tree nodes as INUSE
1874 ^ static VOID markst(struct subre *);
1883 if (t->left != NULL)
1885 if (t->right != NULL)
1890 - cleanst - free any tree nodes not marked INUSE
1891 ^ static VOID cleanst(struct vars *);
1900 for (t = v->treechain; t != NULL; t = next) {
1902 if (!(t->flags&INUSE))
1905 v->treechain = NULL;
1906 v->treefree = NULL; /* just on general principles */
1910 - nfatree - turn a subRE subtree into a tree of compacted NFAs
1911 ^ static long nfatree(struct vars *, struct subre *, FILE *);
1913 static long /* optimize results from top node */
1917 FILE *f; /* for debug output */
1919 assert(t != NULL && t->begin != NULL);
1921 if (t->left != NULL)
1922 (DISCARD)nfatree(v, t->left, f);
1923 if (t->right != NULL)
1924 (DISCARD)nfatree(v, t->right, f);
1926 return nfanode(v, t, f);
1930 - nfanode - do one NFA for nfatree
1931 ^ static long nfanode(struct vars *, struct subre *, FILE *);
1933 static long /* optimize results */
1937 FILE *f; /* for debug output */
1943 assert(t->begin != NULL);
1946 fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
1947 stid(t, idbuf, sizeof(idbuf)));
1948 nfa = newnfa(v, v->cm, v->nfa);
1950 dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
1953 ret = optimize(nfa, f);
1956 compact(nfa, &t->cnfa);
1963 - newlacon - allocate a lookahead-constraint subRE
1964 ^ static int newlacon(struct vars *, struct state *, struct state *, int);
1966 static int /* lacon number */
1967 newlacon(v, begin, end, pos)
1969 struct state *begin;
1976 if (v->nlacons == 0) {
1977 v->lacons = (struct subre *)MALLOC(2 * sizeof(struct subre));
1978 n = 1; /* skip 0th */
1981 v->lacons = (struct subre *)REALLOC(v->lacons,
1982 (v->nlacons+1)*sizeof(struct subre));
1985 if (v->lacons == NULL) {
1989 sub = &v->lacons[n];
1998 - freelacons - free lookahead-constraint subRE vector
1999 ^ static VOID freelacons(struct subre *, int);
2010 for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */
2011 if (!NULLCNFA(sub->cnfa))
2012 freecnfa(&sub->cnfa);
2017 - rfree - free a whole RE (insides of regfree)
2018 ^ static VOID rfree(regex_t *);
2026 if (re == NULL || re->re_magic != REMAGIC)
2029 re->re_magic = 0; /* invalidate RE */
2030 g = (struct guts *)re->re_guts;
2035 if (g->tree != NULL)
2036 freesubre((struct vars *)NULL, g->tree);
2037 if (g->lacons != NULL)
2038 freelacons(g->lacons, g->nlacons);
2039 if (!NULLCNFA(g->search))
2040 freecnfa(&g->search);
2045 - dump - dump an RE in human-readable form
2046 ^ static VOID dump(regex_t *, FILE *);
2057 if (re->re_magic != REMAGIC)
2058 fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
2060 if (re->re_guts == NULL) {
2061 fprintf(f, "NULL guts!!!\n");
2064 g = (struct guts *)re->re_guts;
2065 if (g->magic != GUTSMAGIC)
2066 fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
2069 fprintf(f, "\n\n\n========= DUMP ==========\n");
2070 fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2071 re->re_nsub, re->re_info, re->re_csize, g->ntree);
2073 dumpcolors(&g->cmap, f);
2074 if (!NULLCNFA(g->search)) {
2075 printf("\nsearch:\n");
2076 dumpcnfa(&g->search, f);
2078 for (i = 1; i < g->nlacons; i++) {
2079 fprintf(f, "\nla%d (%s):\n", i,
2080 (g->lacons[i].subno) ? "positive" : "negative");
2081 dumpcnfa(&g->lacons[i].cnfa, f);
2084 dumpst(g->tree, f, 0);
2089 - dumpst - dump a subRE tree
2090 ^ static VOID dumpst(struct subre *, FILE *, int);
2093 dumpst(t, f, nfapresent)
2096 int nfapresent; /* is the original NFA still around? */
2099 fprintf(f, "null tree\n");
2101 stdump(t, f, nfapresent);
2106 - stdump - recursive guts of dumpst
2107 ^ static VOID stdump(struct subre *, FILE *, int);
2110 stdump(t, f, nfapresent)
2113 int nfapresent; /* is the original NFA still around? */
2117 fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2118 if (t->flags&LONGER)
2119 fprintf(f, " longest");
2120 if (t->flags&SHORTER)
2121 fprintf(f, " shortest");
2123 fprintf(f, " hasmixed");
2125 fprintf(f, " hascapture");
2127 fprintf(f, " hasbackref");
2128 if (!(t->flags&INUSE))
2129 fprintf(f, " UNUSED");
2131 fprintf(f, " (#%d)", t->subno);
2132 if (t->min != 1 || t->max != 1) {
2133 fprintf(f, " {%d,", t->min);
2134 if (t->max != INFINITY)
2135 fprintf(f, "%d", t->max);
2139 fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
2140 if (t->left != NULL)
2141 fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2142 if (t->right != NULL)
2143 fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2144 if (!NULLCNFA(t->cnfa)) {
2146 dumpcnfa(&t->cnfa, f);
2149 if (t->left != NULL)
2150 stdump(t->left, f, nfapresent);
2151 if (t->right != NULL)
2152 stdump(t->right, f, nfapresent);
2156 - stid - identify a subtree node for dumping
2157 ^ static char *stid(struct subre *, char *, size_t);
2159 static char * /* points to buf or constant string */
2160 stid(t, buf, bufsize)
2165 /* big enough for hex int or decimal t->retry? */
2166 if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1)
2169 sprintf(buf, "%d", t->retry);
2171 sprintf(buf, "%p", t);
2175 #include "regc_lex.c"
2176 #include "regc_color.c"
2177 #include "regc_nfa.c"
2178 #include "regc_cvec.c"
2179 #include "regc_locale.c"