]> sourceware.org Git - newlib-cygwin.git/blame - winsup/cygwin/regex/regcomp.c
* Merge in cygwin-64bit-branch.
[newlib-cygwin.git] / winsup / cygwin / regex / regcomp.c
CommitLineData
e1e595a6
CV
1/*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
34 */
35
36#if defined(LIBC_SCCS) && !defined(lint)
37static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
38#endif /* LIBC_SCCS and not lint */
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.36 2007/06/11 03:05:54 delphij Exp $");
41
42#ifdef __CYGWIN__
48beacf6 43#include "winsup.h"
e1e595a6 44#endif
48beacf6
CF
45#include <sys/types.h>
46#include <stdio.h>
47#include <string.h>
48#include <ctype.h>
49#include <limits.h>
50#include <stdlib.h>
e1e595a6
CV
51#include <regex.h>
52#ifndef __CYGWIN__
53#include <runetype.h>
54#endif
55#include <wchar.h>
56#include <wctype.h>
57
e1e595a6 58#include "collate.h"
48beacf6
CF
59
60#include "utils.h"
61#include "regex2.h"
62
48beacf6
CF
63#include "cname.h"
64
e1e595a6 65#ifdef __CYGWIN__
e1e595a6
CV
66/* These are defined in nlsfuncs.cc. */
67extern LCID collate_lcid;
68extern char collate_charset[];
69#endif
70
48beacf6
CF
71/*
72 * parse structure, passed up and down to avoid global variables and
73 * other clumsinesses
74 */
75struct parse {
76 char *next; /* next character in RE */
77 char *end; /* end of string (-> NUL normally) */
78 int error; /* has an error been seen? */
79 sop *strip; /* malloced strip */
80 sopno ssize; /* malloced strip size (allocated) */
81 sopno slen; /* malloced strip length (used) */
82 int ncsalloc; /* number of csets allocated */
83 struct re_guts *g;
84# define NPAREN 10 /* we need to remember () 1-9 for back refs */
85 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
86 sopno pend[NPAREN]; /* -> ) ([0] unused) */
87};
88
e1e595a6
CV
89/* ========= begin header generated by ./mkh ========= */
90#ifdef __cplusplus
91extern "C" {
92#endif
93
94/* === regcomp.c === */
95#ifdef __CYGWIN__ /* Defined below `int stop'. Our gcc chokes on that. */
96static void p_ere(struct parse *p, int stop);
97#else
98static void p_ere(struct parse *p, wint_t stop);
99#endif
100static void p_ere_exp(struct parse *p);
101static void p_str(struct parse *p);
102#ifdef __CYGWIN__ /* Defined below `int end1/end2'. Our gcc chokes on that. */
103static void p_bre(struct parse *p, int end1, int end2);
104#else
105static void p_bre(struct parse *p, wint_t end1, wint_t end2);
106#endif
107static int p_simp_re(struct parse *p, int starordinary);
108static int p_count(struct parse *p);
109static void p_bracket(struct parse *p);
110static void p_b_term(struct parse *p, cset *cs);
111static void p_b_cclass(struct parse *p, cset *cs);
112static void p_b_eclass(struct parse *p, cset *cs);
113static wint_t p_b_symbol(struct parse *p);
114static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
115static wint_t othercase(wint_t ch);
116static void bothcases(struct parse *p, wint_t ch);
117static void ordinary(struct parse *p, wint_t ch);
118static void nonnewline(struct parse *p);
119static void repeat(struct parse *p, sopno start, int from, int to);
120static int seterr(struct parse *p, int e);
121static cset *allocset(struct parse *p);
122static void freeset(struct parse *p, cset *cs);
123static void CHadd(struct parse *p, cset *cs, wint_t ch);
124static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
125static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
126static wint_t singleton(cset *cs);
127static sopno dupl(struct parse *p, sopno start, sopno finish);
128static void doemit(struct parse *p, sop op, size_t opnd);
129static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
130static void dofwd(struct parse *p, sopno pos, sop value);
131static void enlarge(struct parse *p, sopno size);
132static void stripsnug(struct parse *p, struct re_guts *g);
133static void findmust(struct parse *p, struct re_guts *g);
134static int altoffset(sop *scan, int offset);
135static void computejumps(struct parse *p, struct re_guts *g);
136static void computematchjumps(struct parse *p, struct re_guts *g);
137static sopno pluscount(struct parse *p, struct re_guts *g);
138static wint_t wgetnext(struct parse *p);
7bd2296c 139static size_t xwcrtomb (char *s, wint_t wc, mbstate_t *ps);
e1e595a6
CV
140
141#ifdef __cplusplus
142}
143#endif
144/* ========= end header generated by ./mkh ========= */
48beacf6
CF
145
146static char nuls[10]; /* place to point scanner in event of error */
147
148/*
149 * macros for use with parse structure
150 * BEWARE: these know that the parse structure is named `p' !!!
151 */
152#define PEEK() (*p->next)
153#define PEEK2() (*(p->next+1))
154#define MORE() (p->next < p->end)
155#define MORE2() (p->next+1 < p->end)
156#define SEE(c) (MORE() && PEEK() == (c))
157#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
158#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
159#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
160#define NEXT() (p->next++)
161#define NEXT2() (p->next += 2)
162#define NEXTn(n) (p->next += (n))
163#define GETNEXT() (*p->next++)
e1e595a6 164#define WGETNEXT() wgetnext(p)
48beacf6 165#define SETERROR(e) seterr(p, (e))
e1e595a6 166#define REQUIRE(co, e) ((co) || SETERROR(e))
48beacf6
CF
167#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
168#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
169#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
170#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
171#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
172#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
173#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
174#define HERE() (p->slen)
175#define THERE() (p->slen - 1)
176#define THERETHERE() (p->slen - 2)
177#define DROP(n) (p->slen -= (n))
178
179#ifndef NDEBUG
180static int never = 0; /* for use in asserts; shuts lint up */
181#else
182#define never 0 /* some <assert.h>s have bugs too */
183#endif
184
e1e595a6
CV
185/* Macro used by computejump()/computematchjump() */
186#define MIN(a,b) ((a)<(b)?(a):(b))
187
48beacf6
CF
188/*
189 - regcomp - interface for parser and compilation
190 = extern int regcomp(regex_t *, const char *, int);
191 = #define REG_BASIC 0000
192 = #define REG_EXTENDED 0001
193 = #define REG_ICASE 0002
194 = #define REG_NOSUB 0004
195 = #define REG_NEWLINE 0010
196 = #define REG_NOSPEC 0020
197 = #define REG_PEND 0040
198 = #define REG_DUMP 0200
199 */
200int /* 0 success, otherwise REG_something */
e1e595a6
CV
201regcomp(regex_t * __restrict preg,
202 const char * __restrict pattern,
203 int cflags)
48beacf6
CF
204{
205 struct parse pa;
e1e595a6
CV
206 struct re_guts *g;
207 struct parse *p = &pa;
208 int i;
209 size_t len;
48beacf6
CF
210#ifdef REDEBUG
211# define GOODFLAGS(f) (f)
212#else
213# define GOODFLAGS(f) ((f)&~REG_DUMP)
214#endif
215
216 cflags = GOODFLAGS(cflags);
217 if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
218 return(REG_INVARG);
219
220 if (cflags&REG_PEND) {
221 if (preg->re_endp < pattern)
222 return(REG_INVARG);
223 len = preg->re_endp - pattern;
224 } else
225 len = strlen((char *)pattern);
226
227 /* do the mallocs early so failure handling is easy */
e1e595a6 228 g = (struct re_guts *)malloc(sizeof(struct re_guts));
48beacf6
CF
229 if (g == NULL)
230 return(REG_ESPACE);
231 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
232 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
233 p->slen = 0;
234 if (p->strip == NULL) {
235 free((char *)g);
236 return(REG_ESPACE);
237 }
238
239 /* set things up */
240 p->g = g;
241 p->next = (char *)pattern; /* convenience; we do not modify it */
242 p->end = p->next + len;
243 p->error = 0;
244 p->ncsalloc = 0;
245 for (i = 0; i < NPAREN; i++) {
246 p->pbegin[i] = 0;
247 p->pend[i] = 0;
248 }
48beacf6 249 g->sets = NULL;
48beacf6
CF
250 g->ncsets = 0;
251 g->cflags = cflags;
252 g->iflags = 0;
253 g->nbol = 0;
254 g->neol = 0;
255 g->must = NULL;
e1e595a6
CV
256 g->moffset = -1;
257 g->charjump = NULL;
258 g->matchjump = NULL;
48beacf6
CF
259 g->mlen = 0;
260 g->nsub = 0;
48beacf6
CF
261 g->backrefs = 0;
262
263 /* do it */
264 EMIT(OEND, 0);
265 g->firststate = THERE();
266 if (cflags&REG_EXTENDED)
267 p_ere(p, OUT);
268 else if (cflags&REG_NOSPEC)
269 p_str(p);
270 else
271 p_bre(p, OUT, OUT);
272 EMIT(OEND, 0);
273 g->laststate = THERE();
274
275 /* tidy up loose ends and fill things in */
48beacf6
CF
276 stripsnug(p, g);
277 findmust(p, g);
e1e595a6
CV
278 /* only use Boyer-Moore algorithm if the pattern is bigger
279 * than three characters
280 */
281 if(g->mlen > 3) {
282 computejumps(p, g);
283 computematchjumps(p, g);
284 if(g->matchjump == NULL && g->charjump != NULL) {
285 free(g->charjump);
286 g->charjump = NULL;
287 }
288 }
48beacf6
CF
289 g->nplus = pluscount(p, g);
290 g->magic = MAGIC2;
291 preg->re_nsub = g->nsub;
292 preg->re_g = g;
293 preg->re_magic = MAGIC1;
294#ifndef REDEBUG
295 /* not debugging, so can't rely on the assert() in regexec() */
296 if (g->iflags&BAD)
297 SETERROR(REG_ASSERT);
298#endif
299
300 /* win or lose, we're done */
301 if (p->error != 0) /* lose */
302 regfree(preg);
303 return(p->error);
304}
305
306/*
307 - p_ere - ERE parser top level, concatenation and alternation
e1e595a6 308 == static void p_ere(struct parse *p, int stop);
48beacf6
CF
309 */
310static void
e1e595a6
CV
311p_ere(struct parse *p,
312 int stop) /* character this ERE should end at */
48beacf6 313{
e1e595a6 314 char c;
6e3f2c62
CF
315 sopno prevback = 0;
316 sopno prevfwd = 0;
e1e595a6
CV
317 sopno conc;
318 int first = 1; /* is this the first alternative? */
48beacf6
CF
319
320 for (;;) {
321 /* do a bunch of concatenated expressions */
322 conc = HERE();
323 while (MORE() && (c = PEEK()) != '|' && c != stop)
324 p_ere_exp(p);
3e5a48af
YS
325#ifndef __CYGWIN__
326 /* undefined behaviour according to POSIX; allowed by glibc */
e1e595a6 327 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
3e5a48af 328#endif
48beacf6
CF
329
330 if (!EAT('|'))
331 break; /* NOTE BREAK OUT */
332
333 if (first) {
334 INSERT(OCH_, conc); /* offset is wrong */
335 prevfwd = conc;
336 prevback = conc;
337 first = 0;
338 }
339 ASTERN(OOR1, prevback);
340 prevback = THERE();
341 AHEAD(prevfwd); /* fix previous offset */
342 prevfwd = HERE();
343 EMIT(OOR2, 0); /* offset is very wrong */
344 }
345
346 if (!first) { /* tail-end fixups */
347 AHEAD(prevfwd);
348 ASTERN(O_CH, prevback);
349 }
350
351 assert(!MORE() || SEE(stop));
352}
353
354/*
355 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
e1e595a6 356 == static void p_ere_exp(struct parse *p);
48beacf6
CF
357 */
358static void
e1e595a6 359p_ere_exp(struct parse *p)
48beacf6 360{
e1e595a6
CV
361 char c;
362 wint_t wc;
363 sopno pos;
364 int count;
365 int count2;
366 sopno subno;
48beacf6
CF
367 int wascaret = 0;
368
369 assert(MORE()); /* caller should have ensured this */
370 c = GETNEXT();
371
372 pos = HERE();
373 switch (c) {
374 case '(':
e1e595a6 375 (void)REQUIRE(MORE(), REG_EPAREN);
48beacf6
CF
376 p->g->nsub++;
377 subno = p->g->nsub;
378 if (subno < NPAREN)
379 p->pbegin[subno] = HERE();
380 EMIT(OLPAREN, subno);
381 if (!SEE(')'))
382 p_ere(p, ')');
383 if (subno < NPAREN) {
384 p->pend[subno] = HERE();
385 assert(p->pend[subno] != 0);
386 }
387 EMIT(ORPAREN, subno);
e1e595a6 388 (void)MUSTEAT(')', REG_EPAREN);
48beacf6
CF
389 break;
390#ifndef POSIX_MISTAKE
391 case ')': /* happens only if no current unmatched ( */
392 /*
393 * You may ask, why the ifndef? Because I didn't notice
394 * this until slightly too late for 1003.2, and none of the
395 * other 1003.2 regular-expression reviewers noticed it at
396 * all. So an unmatched ) is legal POSIX, at least until
397 * we can get it fixed.
398 */
399 SETERROR(REG_EPAREN);
400 break;
401#endif
402 case '^':
403 EMIT(OBOL, 0);
404 p->g->iflags |= USEBOL;
405 p->g->nbol++;
406 wascaret = 1;
407 break;
408 case '$':
409 EMIT(OEOL, 0);
410 p->g->iflags |= USEEOL;
411 p->g->neol++;
412 break;
413 case '|':
414 SETERROR(REG_EMPTY);
415 break;
416 case '*':
417 case '+':
418 case '?':
419 SETERROR(REG_BADRPT);
420 break;
421 case '.':
422 if (p->g->cflags&REG_NEWLINE)
423 nonnewline(p);
424 else
425 EMIT(OANY, 0);
426 break;
427 case '[':
428 p_bracket(p);
429 break;
430 case '\\':
e1e595a6
CV
431 (void)REQUIRE(MORE(), REG_EESCAPE);
432 wc = WGETNEXT();
433#ifdef __CYGWIN__
434 /* \< and \> are the GNU equivalents to [[:<:]] and [[:>:]] */
435 switch (wc)
436 {
437 case L'<':
438 EMIT(OBOW, 0);
439 break;
440 case L'>':
441 EMIT(OEOW, 0);
442 break;
443 default:
444 ordinary(p, wc);
445 break;
446 }
447#else
448 ordinary(p, wc);
449#endif
48beacf6
CF
450 break;
451 case '{': /* okay as ordinary except if digit follows */
e1e595a6 452 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
48beacf6
CF
453 /* FALLTHROUGH */
454 default:
e1e595a6
CV
455 p->next--;
456 wc = WGETNEXT();
457 ordinary(p, wc);
48beacf6
CF
458 break;
459 }
460
461 if (!MORE())
462 return;
463 c = PEEK();
464 /* we call { a repetition if followed by a digit */
465 if (!( c == '*' || c == '+' || c == '?' ||
e1e595a6 466 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
48beacf6
CF
467 return; /* no repetition, we're done */
468 NEXT();
469
e1e595a6 470 (void)REQUIRE(!wascaret, REG_BADRPT);
48beacf6
CF
471 switch (c) {
472 case '*': /* implemented as +? */
473 /* this case does not require the (y|) trick, noKLUDGE */
474 INSERT(OPLUS_, pos);
475 ASTERN(O_PLUS, pos);
476 INSERT(OQUEST_, pos);
477 ASTERN(O_QUEST, pos);
478 break;
479 case '+':
480 INSERT(OPLUS_, pos);
481 ASTERN(O_PLUS, pos);
482 break;
483 case '?':
484 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
485 INSERT(OCH_, pos); /* offset slightly wrong */
486 ASTERN(OOR1, pos); /* this one's right */
487 AHEAD(pos); /* fix the OCH_ */
488 EMIT(OOR2, 0); /* offset very wrong... */
489 AHEAD(THERE()); /* ...so fix it */
490 ASTERN(O_CH, THERETHERE());
491 break;
492 case '{':
493 count = p_count(p);
494 if (EAT(',')) {
e1e595a6 495 if (isdigit((uch)PEEK())) {
48beacf6 496 count2 = p_count(p);
e1e595a6 497 (void)REQUIRE(count <= count2, REG_BADBR);
48beacf6
CF
498 } else /* single number with comma */
499 count2 = INFINITY;
500 } else /* just a single number */
501 count2 = count;
502 repeat(p, pos, count, count2);
503 if (!EAT('}')) { /* error heuristics */
504 while (MORE() && PEEK() != '}')
505 NEXT();
e1e595a6 506 (void)REQUIRE(MORE(), REG_EBRACE);
48beacf6
CF
507 SETERROR(REG_BADBR);
508 }
509 break;
510 }
511
512 if (!MORE())
513 return;
514 c = PEEK();
515 if (!( c == '*' || c == '+' || c == '?' ||
e1e595a6 516 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
48beacf6
CF
517 return;
518 SETERROR(REG_BADRPT);
519}
520
521/*
522 - p_str - string (no metacharacters) "parser"
e1e595a6 523 == static void p_str(struct parse *p);
48beacf6
CF
524 */
525static void
e1e595a6 526p_str(struct parse *p)
48beacf6 527{
e1e595a6 528 (void)REQUIRE(MORE(), REG_EMPTY);
48beacf6 529 while (MORE())
e1e595a6 530 ordinary(p, WGETNEXT());
48beacf6
CF
531}
532
533/*
534 - p_bre - BRE parser top level, anchoring and concatenation
e1e595a6
CV
535 == static void p_bre(struct parse *p, int end1, \
536 == int end2);
48beacf6
CF
537 * Giving end1 as OUT essentially eliminates the end1/end2 check.
538 *
539 * This implementation is a bit of a kludge, in that a trailing $ is first
e1e595a6 540 * taken as an ordinary character and then revised to be an anchor.
48beacf6
CF
541 * The amount of lookahead needed to avoid this kludge is excessive.
542 */
543static void
e1e595a6
CV
544p_bre(struct parse *p,
545 int end1, /* first terminating character */
546 int end2) /* second terminating character */
48beacf6 547{
e1e595a6
CV
548 sopno start = HERE();
549 int first = 1; /* first subexpression? */
550 int wasdollar = 0;
48beacf6
CF
551
552 if (EAT('^')) {
553 EMIT(OBOL, 0);
554 p->g->iflags |= USEBOL;
555 p->g->nbol++;
556 }
557 while (MORE() && !SEETWO(end1, end2)) {
558 wasdollar = p_simp_re(p, first);
559 first = 0;
560 }
561 if (wasdollar) { /* oops, that was a trailing anchor */
562 DROP(1);
563 EMIT(OEOL, 0);
564 p->g->iflags |= USEEOL;
565 p->g->neol++;
566 }
567
e1e595a6 568 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
48beacf6
CF
569}
570
571/*
572 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
e1e595a6 573 == static int p_simp_re(struct parse *p, int starordinary);
48beacf6
CF
574 */
575static int /* was the simple RE an unbackslashed $? */
e1e595a6
CV
576p_simp_re(struct parse *p,
577 int starordinary) /* is a leading * an ordinary character? */
48beacf6 578{
e1e595a6
CV
579 int c;
580 int count;
581 int count2;
582 sopno pos;
583 int i;
584 wint_t wc;
585 sopno subno;
48beacf6
CF
586# define BACKSL (1<<CHAR_BIT)
587
588 pos = HERE(); /* repetion op, if any, covers from here */
589
590 assert(MORE()); /* caller should have ensured this */
591 c = GETNEXT();
592 if (c == '\\') {
e1e595a6
CV
593 (void)REQUIRE(MORE(), REG_EESCAPE);
594 c = BACKSL | GETNEXT();
48beacf6
CF
595 }
596 switch (c) {
597 case '.':
598 if (p->g->cflags&REG_NEWLINE)
599 nonnewline(p);
600 else
601 EMIT(OANY, 0);
602 break;
603 case '[':
604 p_bracket(p);
605 break;
e1e595a6
CV
606#ifdef __CYGWIN__
607 case BACKSL|'<':
608 /* \< is the GNU equivalents to [[:<:]] */
609 EMIT(OBOW, 0);
610 break;
611 case BACKSL|'>':
612 /* \> is the GNU equivalents to [[:>:]] */
613 EMIT(OEOW, 0);
614 break;
615#endif
48beacf6
CF
616 case BACKSL|'{':
617 SETERROR(REG_BADRPT);
618 break;
619 case BACKSL|'(':
620 p->g->nsub++;
621 subno = p->g->nsub;
622 if (subno < NPAREN)
623 p->pbegin[subno] = HERE();
624 EMIT(OLPAREN, subno);
625 /* the MORE here is an error heuristic */
626 if (MORE() && !SEETWO('\\', ')'))
627 p_bre(p, '\\', ')');
628 if (subno < NPAREN) {
629 p->pend[subno] = HERE();
630 assert(p->pend[subno] != 0);
631 }
632 EMIT(ORPAREN, subno);
e1e595a6 633 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
48beacf6
CF
634 break;
635 case BACKSL|')': /* should not get here -- must be user */
636 case BACKSL|'}':
637 SETERROR(REG_EPAREN);
638 break;
639 case BACKSL|'1':
640 case BACKSL|'2':
641 case BACKSL|'3':
642 case BACKSL|'4':
643 case BACKSL|'5':
644 case BACKSL|'6':
645 case BACKSL|'7':
646 case BACKSL|'8':
647 case BACKSL|'9':
648 i = (c&~BACKSL) - '0';
649 assert(i < NPAREN);
650 if (p->pend[i] != 0) {
651 assert(i <= p->g->nsub);
652 EMIT(OBACK_, i);
653 assert(p->pbegin[i] != 0);
654 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
655 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
656 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
657 EMIT(O_BACK, i);
658 } else
659 SETERROR(REG_ESUBREG);
660 p->g->backrefs = 1;
661 break;
662 case '*':
e1e595a6 663 (void)REQUIRE(starordinary, REG_BADRPT);
48beacf6
CF
664 /* FALLTHROUGH */
665 default:
e1e595a6
CV
666 p->next--;
667 wc = WGETNEXT();
668 ordinary(p, wc);
48beacf6
CF
669 break;
670 }
671
672 if (EAT('*')) { /* implemented as +? */
673 /* this case does not require the (y|) trick, noKLUDGE */
674 INSERT(OPLUS_, pos);
675 ASTERN(O_PLUS, pos);
676 INSERT(OQUEST_, pos);
677 ASTERN(O_QUEST, pos);
678 } else if (EATTWO('\\', '{')) {
679 count = p_count(p);
680 if (EAT(',')) {
e1e595a6 681 if (MORE() && isdigit((uch)PEEK())) {
48beacf6 682 count2 = p_count(p);
e1e595a6 683 (void)REQUIRE(count <= count2, REG_BADBR);
48beacf6
CF
684 } else /* single number with comma */
685 count2 = INFINITY;
686 } else /* just a single number */
687 count2 = count;
688 repeat(p, pos, count, count2);
689 if (!EATTWO('\\', '}')) { /* error heuristics */
690 while (MORE() && !SEETWO('\\', '}'))
691 NEXT();
e1e595a6 692 (void)REQUIRE(MORE(), REG_EBRACE);
48beacf6
CF
693 SETERROR(REG_BADBR);
694 }
e1e595a6 695 } else if (c == '$') /* $ (but not \$) ends it */
48beacf6
CF
696 return(1);
697
698 return(0);
699}
700
701/*
702 - p_count - parse a repetition count
e1e595a6 703 == static int p_count(struct parse *p);
48beacf6
CF
704 */
705static int /* the value */
e1e595a6 706p_count(struct parse *p)
48beacf6 707{
e1e595a6
CV
708 int count = 0;
709 int ndigits = 0;
48beacf6 710
e1e595a6 711 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
48beacf6
CF
712 count = count*10 + (GETNEXT() - '0');
713 ndigits++;
714 }
715
e1e595a6 716 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
48beacf6
CF
717 return(count);
718}
719
720/*
721 - p_bracket - parse a bracketed character list
e1e595a6 722 == static void p_bracket(struct parse *p);
48beacf6
CF
723 */
724static void
e1e595a6 725p_bracket(struct parse *p)
48beacf6 726{
e1e595a6
CV
727 cset *cs;
728 wint_t ch;
48beacf6
CF
729
730 /* Dept of Truly Sickening Special-Case Kludges */
731 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
732 EMIT(OBOW, 0);
733 NEXTn(6);
734 return;
735 }
736 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
737 EMIT(OEOW, 0);
738 NEXTn(6);
739 return;
740 }
741
e1e595a6
CV
742 if ((cs = allocset(p)) == NULL)
743 return;
744
745 if (p->g->cflags&REG_ICASE)
746 cs->icase = 1;
48beacf6 747 if (EAT('^'))
e1e595a6 748 cs->invert = 1;
48beacf6 749 if (EAT(']'))
e1e595a6 750 CHadd(p, cs, ']');
48beacf6 751 else if (EAT('-'))
e1e595a6 752 CHadd(p, cs, '-');
48beacf6
CF
753 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
754 p_b_term(p, cs);
755 if (EAT('-'))
e1e595a6
CV
756 CHadd(p, cs, '-');
757 (void)MUSTEAT(']', REG_EBRACK);
48beacf6
CF
758
759 if (p->error != 0) /* don't mess things up further */
760 return;
761
e1e595a6
CV
762 if (cs->invert && p->g->cflags&REG_NEWLINE)
763 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
48beacf6 764
44caccfc
CV
765 if ((ch = singleton(cs)) != OUT /* optimize singleton sets */
766 && cs->invert == 0) { /* But not in invert case. */
e1e595a6 767 ordinary(p, ch);
48beacf6
CF
768 freeset(p, cs);
769 } else
e1e595a6 770 EMIT(OANYOF, (int)(cs - p->g->sets));
48beacf6
CF
771}
772
773/*
774 - p_b_term - parse one term of a bracketed character list
e1e595a6 775 == static void p_b_term(struct parse *p, cset *cs);
48beacf6
CF
776 */
777static void
e1e595a6 778p_b_term(struct parse *p, cset *cs)
48beacf6 779{
e1e595a6
CV
780 char c;
781 wint_t start, finish;
782 wint_t i;
48beacf6
CF
783
784 /* classify what we've got */
785 switch ((MORE()) ? PEEK() : '\0') {
786 case '[':
787 c = (MORE2()) ? PEEK2() : '\0';
788 break;
789 case '-':
790 SETERROR(REG_ERANGE);
791 return; /* NOTE RETURN */
792 break;
793 default:
794 c = '\0';
795 break;
796 }
797
798 switch (c) {
799 case ':': /* character class */
800 NEXT2();
e1e595a6 801 (void)REQUIRE(MORE(), REG_EBRACK);
48beacf6 802 c = PEEK();
e1e595a6 803 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
48beacf6 804 p_b_cclass(p, cs);
e1e595a6
CV
805 (void)REQUIRE(MORE(), REG_EBRACK);
806 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
48beacf6
CF
807 break;
808 case '=': /* equivalence class */
809 NEXT2();
e1e595a6 810 (void)REQUIRE(MORE(), REG_EBRACK);
48beacf6 811 c = PEEK();
e1e595a6 812 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
48beacf6 813 p_b_eclass(p, cs);
e1e595a6
CV
814 (void)REQUIRE(MORE(), REG_EBRACK);
815 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
48beacf6
CF
816 break;
817 default: /* symbol, ordinary character, or range */
48beacf6
CF
818 start = p_b_symbol(p);
819 if (SEE('-') && MORE2() && PEEK2() != ']') {
820 /* range */
821 NEXT();
822 if (EAT('-'))
823 finish = '-';
824 else
825 finish = p_b_symbol(p);
44caccfc
CV
826 } else if (SEE('-') && !MORE2()) {
827 SETERROR(REG_EBRACK);
828 return;
48beacf6
CF
829 } else
830 finish = start;
e1e595a6
CV
831 if (start == finish)
832 CHadd(p, cs, start);
833 else {
834#ifdef __CYGWIN__
835 if (!collate_lcid) {
836#else
837 if (__collate_load_error) {
838#endif
839 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
840 CHaddrange(p, cs, start, finish);
841 } else {
842 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
843 for (i = 0; i <= UCHAR_MAX; i++) {
844 if ( __collate_range_cmp(start, i) <= 0
845 && __collate_range_cmp(i, finish) <= 0
846 )
847 CHadd(p, cs, i);
848 }
849 }
850 }
48beacf6
CF
851 break;
852 }
853}
854
855/*
856 - p_b_cclass - parse a character-class name and deal with it
e1e595a6 857 == static void p_b_cclass(struct parse *p, cset *cs);
48beacf6
CF
858 */
859static void
e1e595a6 860p_b_cclass(struct parse *p, cset *cs)
48beacf6 861{
e1e595a6
CV
862 char *sp = p->next;
863 size_t len;
864 wctype_t wct;
865 char clname[16];
48beacf6 866
e1e595a6 867 while (MORE() && isalpha((uch)PEEK()))
48beacf6
CF
868 NEXT();
869 len = p->next - sp;
e1e595a6 870 if (len >= sizeof(clname) - 1) {
48beacf6
CF
871 SETERROR(REG_ECTYPE);
872 return;
873 }
e1e595a6
CV
874 memcpy(clname, sp, len);
875 clname[len] = '\0';
876 if ((wct = wctype(clname)) == 0) {
877 SETERROR(REG_ECTYPE);
878 return;
879 }
880 CHaddtype(p, cs, wct);
48beacf6
CF
881}
882
883/*
884 - p_b_eclass - parse an equivalence-class name and deal with it
e1e595a6 885 == static void p_b_eclass(struct parse *p, cset *cs);
48beacf6
CF
886 *
887 * This implementation is incomplete. xxx
888 */
889static void
e1e595a6 890p_b_eclass(struct parse *p, cset *cs)
48beacf6 891{
e1e595a6 892 wint_t c;
48beacf6
CF
893
894 c = p_b_coll_elem(p, '=');
e1e595a6 895 CHadd(p, cs, c);
48beacf6
CF
896}
897
898/*
899 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
e1e595a6 900 == static char p_b_symbol(struct parse *p);
48beacf6 901 */
e1e595a6
CV
902static wint_t /* value of symbol */
903p_b_symbol(struct parse *p)
48beacf6 904{
e1e595a6 905 wint_t value;
48beacf6 906
e1e595a6 907 (void)REQUIRE(MORE(), REG_EBRACK);
48beacf6 908 if (!EATTWO('[', '.'))
e1e595a6 909 return(WGETNEXT());
48beacf6
CF
910
911 /* collating symbol */
912 value = p_b_coll_elem(p, '.');
e1e595a6 913 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
48beacf6
CF
914 return(value);
915}
916
917/*
918 - p_b_coll_elem - parse a collating-element name and look it up
e1e595a6 919 == static char p_b_coll_elem(struct parse *p, int endc);
48beacf6 920 */
e1e595a6
CV
921static wint_t /* value of collating element */
922p_b_coll_elem(struct parse *p,
923 wint_t endc) /* name ended by endc,']' */
48beacf6 924{
e1e595a6
CV
925 char *sp = p->next;
926 struct cname *cp;
927 int len;
928 mbstate_t mbs;
929 wchar_t wc;
930 size_t clen;
48beacf6
CF
931
932 while (MORE() && !SEETWO(endc, ']'))
933 NEXT();
934 if (!MORE()) {
935 SETERROR(REG_EBRACK);
936 return(0);
937 }
938 len = p->next - sp;
939 for (cp = cnames; cp->name != NULL; cp++)
940 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
941 return(cp->code); /* known name */
e1e595a6
CV
942 memset(&mbs, 0, sizeof(mbs));
943 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
944 return (wc); /* single character */
945 else if (clen == (size_t)-1 || clen == (size_t)-2)
946 SETERROR(REG_ILLSEQ);
947 else
948 SETERROR(REG_ECOLLATE); /* neither */
48beacf6
CF
949 return(0);
950}
951
952/*
953 - othercase - return the case counterpart of an alphabetic
954 == static char othercase(int ch);
955 */
e1e595a6
CV
956static wint_t /* if no counterpart, return ch */
957othercase(wint_t ch)
48beacf6 958{
e1e595a6
CV
959 assert(iswalpha(ch));
960 if (iswupper(ch))
961 return(towlower(ch));
962 else if (iswlower(ch))
963 return(towupper(ch));
48beacf6
CF
964 else /* peculiar, but could happen */
965 return(ch);
966}
967
968/*
969 - bothcases - emit a dualcase version of a two-case character
e1e595a6 970 == static void bothcases(struct parse *p, int ch);
48beacf6
CF
971 *
972 * Boy, is this implementation ever a kludge...
973 */
974static void
e1e595a6 975bothcases(struct parse *p, wint_t ch)
48beacf6 976{
e1e595a6
CV
977 char *oldnext = p->next;
978 char *oldend = p->end;
979 char bracket[3 + MB_LEN_MAX];
980 size_t n;
981 mbstate_t mbs;
48beacf6
CF
982
983 assert(othercase(ch) != ch); /* p_bracket() would recurse */
984 p->next = bracket;
e1e595a6 985 memset(&mbs, 0, sizeof(mbs));
7bd2296c 986 n = xwcrtomb(bracket, ch, &mbs);
e1e595a6
CV
987 assert(n != (size_t)-1);
988 bracket[n] = ']';
989 bracket[n + 1] = '\0';
990 p->end = bracket+n+1;
48beacf6 991 p_bracket(p);
e1e595a6 992 assert(p->next == p->end);
48beacf6
CF
993 p->next = oldnext;
994 p->end = oldend;
995}
996
997/*
998 - ordinary - emit an ordinary character
e1e595a6 999 == static void ordinary(struct parse *p, int ch);
48beacf6
CF
1000 */
1001static void
e1e595a6 1002ordinary(struct parse *p, wint_t ch)
48beacf6 1003{
e1e595a6 1004 cset *cs;
48beacf6 1005
e1e595a6 1006 if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
48beacf6 1007 bothcases(p, ch);
e1e595a6
CV
1008 else if ((ch & OPDMASK) == ch)
1009 EMIT(OCHAR, ch);
48beacf6 1010 else {
e1e595a6
CV
1011 /*
1012 * Kludge: character is too big to fit into an OCHAR operand.
1013 * Emit a singleton set.
1014 */
1015 if ((cs = allocset(p)) == NULL)
1016 return;
1017 CHadd(p, cs, ch);
1018 EMIT(OANYOF, (int)(cs - p->g->sets));
48beacf6
CF
1019 }
1020}
1021
1022/*
1023 - nonnewline - emit REG_NEWLINE version of OANY
e1e595a6 1024 == static void nonnewline(struct parse *p);
48beacf6
CF
1025 *
1026 * Boy, is this implementation ever a kludge...
1027 */
1028static void
e1e595a6 1029nonnewline(struct parse *p)
48beacf6 1030{
e1e595a6
CV
1031 char *oldnext = p->next;
1032 char *oldend = p->end;
48beacf6
CF
1033 char bracket[4];
1034
1035 p->next = bracket;
1036 p->end = bracket+3;
1037 bracket[0] = '^';
1038 bracket[1] = '\n';
1039 bracket[2] = ']';
1040 bracket[3] = '\0';
1041 p_bracket(p);
1042 assert(p->next == bracket+3);
1043 p->next = oldnext;
1044 p->end = oldend;
1045}
1046
1047/*
1048 - repeat - generate code for a bounded repetition, recursively if needed
e1e595a6 1049 == static void repeat(struct parse *p, sopno start, int from, int to);
48beacf6
CF
1050 */
1051static void
e1e595a6
CV
1052repeat(struct parse *p,
1053 sopno start, /* operand from here to end of strip */
1054 int from, /* repeated from this number */
1055 int to) /* to this number of times (maybe INFINITY) */
48beacf6 1056{
e1e595a6 1057 sopno finish = HERE();
48beacf6
CF
1058# define N 2
1059# define INF 3
1060# define REP(f, t) ((f)*8 + (t))
1061# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
e1e595a6 1062 sopno copy;
48beacf6
CF
1063
1064 if (p->error != 0) /* head off possible runaway recursion */
1065 return;
1066
1067 assert(from <= to);
1068
1069 switch (REP(MAP(from), MAP(to))) {
1070 case REP(0, 0): /* must be user doing this */
1071 DROP(finish-start); /* drop the operand */
1072 break;
1073 case REP(0, 1): /* as x{1,1}? */
1074 case REP(0, N): /* as x{1,n}? */
1075 case REP(0, INF): /* as x{1,}? */
1076 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1077 INSERT(OCH_, start); /* offset is wrong... */
1078 repeat(p, start+1, 1, to);
1079 ASTERN(OOR1, start);
1080 AHEAD(start); /* ... fix it */
1081 EMIT(OOR2, 0);
1082 AHEAD(THERE());
1083 ASTERN(O_CH, THERETHERE());
1084 break;
1085 case REP(1, 1): /* trivial case */
1086 /* done */
1087 break;
1088 case REP(1, N): /* as x?x{1,n-1} */
1089 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1090 INSERT(OCH_, start);
1091 ASTERN(OOR1, start);
1092 AHEAD(start);
1093 EMIT(OOR2, 0); /* offset very wrong... */
1094 AHEAD(THERE()); /* ...so fix it */
1095 ASTERN(O_CH, THERETHERE());
1096 copy = dupl(p, start+1, finish+1);
1097 assert(copy == finish+4);
1098 repeat(p, copy, 1, to-1);
1099 break;
1100 case REP(1, INF): /* as x+ */
1101 INSERT(OPLUS_, start);
1102 ASTERN(O_PLUS, start);
1103 break;
1104 case REP(N, N): /* as xx{m-1,n-1} */
1105 copy = dupl(p, start, finish);
1106 repeat(p, copy, from-1, to-1);
1107 break;
1108 case REP(N, INF): /* as xx{n-1,INF} */
1109 copy = dupl(p, start, finish);
1110 repeat(p, copy, from-1, to);
1111 break;
1112 default: /* "can't happen" */
1113 SETERROR(REG_ASSERT); /* just in case */
1114 break;
1115 }
1116}
1117
e1e595a6
CV
1118/*
1119 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1120 - character from the parse struct, signals a REG_ILLSEQ error if the
1121 - character can't be converted. Returns the number of bytes consumed.
1122 */
1123static wint_t
1124wgetnext(struct parse *p)
1125{
1126 mbstate_t mbs;
1127 wchar_t wc;
7bd2296c 1128 wint_t ret;
e1e595a6
CV
1129 size_t n;
1130
1131 memset(&mbs, 0, sizeof(mbs));
1132 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1133 if (n == (size_t)-1 || n == (size_t)-2) {
1134 SETERROR(REG_ILLSEQ);
1135 return (0);
1136 }
7bd2296c 1137 ret = wc;
e1e595a6
CV
1138 if (n == 0)
1139 n = 1;
7bd2296c
CV
1140 else if (sizeof (wchar_t) == 2 && wc >= 0xd800 && wc <= 0xdbff) {
1141 /* UTF-16 surrogate pair. Fetch second half and
1142 compute UTF-32 value */
6b3f923f
CV
1143 size_t n2 = mbrtowc(&wc, p->next + n,
1144 p->end - p->next - n, &mbs);
7bd2296c
CV
1145 if (n2 == 0 || n2 == (size_t)-1 || n2 == (size_t)-2) {
1146 SETERROR(REG_ILLSEQ);
1147 return (0);
1148 }
1149 ret = (((ret & 0x3ff) << 10) | (wc & 0x3ff))
1150 + 0x10000;
1151 n += n2;
1152 }
e1e595a6 1153 p->next += n;
7bd2296c 1154 return (ret);
e1e595a6
CV
1155}
1156
7bd2296c
CV
1157static size_t
1158xwcrtomb (char *s, wint_t wc, mbstate_t *ps)
1159{
1160 if (sizeof (wchar_t) == 2 && wc >= 0x10000)
1161 {
15a9e176
CV
1162 /* UTF-16 wcrtomb can't handle these values directly. The rest of the
1163 code isn't surrogate pair aware, so we handle this here. Convert
1164 value to UTF-16 surrogate and call wcsrtombs to convert the "string"
1165 to the correct multibyte representation, if any. */
d67a6ce4
CF
1166 wchar_t ws[2];
1167 const wchar_t *wsp = ws;
15a9e176
CV
1168
1169 wc -= 0x10000;
1170 ws[0] = 0xd800 | (wc >> 10);
1171 ws[1] = 0xdc00 | (wc & 0x3ff);
1172 return wcsnrtombs (s, &wsp, 2, MB_CUR_MAX, ps);
7bd2296c
CV
1173 }
1174 return wcrtomb (s, wc, ps);
1175}
1176
1177
48beacf6
CF
1178/*
1179 - seterr - set an error condition
e1e595a6 1180 == static int seterr(struct parse *p, int e);
48beacf6
CF
1181 */
1182static int /* useless but makes type checking happy */
e1e595a6 1183seterr(struct parse *p, int e)
48beacf6
CF
1184{
1185 if (p->error == 0) /* keep earliest error condition */
1186 p->error = e;
1187 p->next = nuls; /* try to bring things to a halt */
1188 p->end = nuls;
1189 return(0); /* make the return value well-defined */
1190}
1191
1192/*
1193 - allocset - allocate a set of characters for []
e1e595a6 1194 == static cset *allocset(struct parse *p);
48beacf6
CF
1195 */
1196static cset *
e1e595a6 1197allocset(struct parse *p)
48beacf6 1198{
e1e595a6 1199 cset *cs, *ncs;
48beacf6 1200
e1e595a6
CV
1201 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
1202 if (ncs == NULL) {
1203 SETERROR(REG_ESPACE);
1204 return (NULL);
1205 }
1206 p->g->sets = ncs;
1207 cs = &p->g->sets[p->g->ncsets++];
1208 memset(cs, 0, sizeof(*cs));
48beacf6
CF
1209
1210 return(cs);
1211}
1212
1213/*
1214 - freeset - free a now-unused set
e1e595a6 1215 == static void freeset(struct parse *p, cset *cs);
48beacf6
CF
1216 */
1217static void
e1e595a6 1218freeset(struct parse *p, cset *cs)
48beacf6 1219{
e1e595a6 1220 cset *top = &p->g->sets[p->g->ncsets];
48beacf6 1221
e1e595a6
CV
1222 free(cs->wides);
1223 free(cs->ranges);
1224 free(cs->types);
1225 memset(cs, 0, sizeof(*cs));
48beacf6
CF
1226 if (cs == top-1) /* recover only the easy case */
1227 p->g->ncsets--;
1228}
1229
1230/*
e1e595a6
CV
1231 - singleton - Determine whether a set contains only one character,
1232 - returning it if so, otherwise returning OUT.
48beacf6 1233 */
e1e595a6
CV
1234static wint_t
1235singleton(cset *cs)
48beacf6 1236{
e1e595a6 1237 wint_t i, s, n;
48beacf6 1238
e1e595a6
CV
1239 for (i = n = 0; i < NC; i++)
1240 if (CHIN(cs, i)) {
48beacf6 1241 n++;
e1e595a6
CV
1242 s = i;
1243 }
44caccfc 1244 if (n == 1 && cs->nwides == 0)
e1e595a6 1245 return (s);
44caccfc 1246 if (n == 0 && cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
e1e595a6
CV
1247 cs->icase == 0)
1248 return (cs->wides[0]);
1249 /* Don't bother handling the other cases. */
1250 return (OUT);
48beacf6
CF
1251}
1252
1253/*
e1e595a6 1254 - CHadd - add character to character set.
48beacf6
CF
1255 */
1256static void
e1e595a6 1257CHadd(struct parse *p, cset *cs, wint_t ch)
48beacf6 1258{
e1e595a6
CV
1259 wint_t nch, *newwides;
1260 assert(ch >= 0);
1261 if (ch < NC)
1262 cs->bmp[ch >> 3] |= 1 << (ch & 7);
1263 else {
1264 newwides = realloc(cs->wides, (cs->nwides + 1) *
1265 sizeof(*cs->wides));
1266 if (newwides == NULL) {
1267 SETERROR(REG_ESPACE);
1268 return;
1269 }
1270 cs->wides = newwides;
1271 cs->wides[cs->nwides++] = ch;
1272 }
1273 if (cs->icase) {
1274 if ((nch = towlower(ch)) < NC)
1275 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1276 if ((nch = towupper(ch)) < NC)
1277 cs->bmp[nch >> 3] |= 1 << (nch & 7);
48beacf6 1278 }
48beacf6
CF
1279}
1280
1281/*
e1e595a6 1282 - CHaddrange - add all characters in the range [min,max] to a character set.
48beacf6
CF
1283 */
1284static void
e1e595a6 1285CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
48beacf6 1286{
e1e595a6 1287 crange *newranges;
48beacf6 1288
e1e595a6
CV
1289 for (; min < NC && min <= max; min++)
1290 CHadd(p, cs, min);
1291 if (min >= max)
1292 return;
1293 newranges = realloc(cs->ranges, (cs->nranges + 1) *
1294 sizeof(*cs->ranges));
1295 if (newranges == NULL) {
1296 SETERROR(REG_ESPACE);
1297 return;
1298 }
1299 cs->ranges = newranges;
1300 cs->ranges[cs->nranges].min = min;
1301 cs->ranges[cs->nranges].min = max;
1302 cs->nranges++;
48beacf6
CF
1303}
1304
1305/*
e1e595a6 1306 - CHaddtype - add all characters of a certain type to a character set.
48beacf6
CF
1307 */
1308static void
e1e595a6 1309CHaddtype(struct parse *p, cset *cs, wctype_t wct)
48beacf6 1310{
e1e595a6
CV
1311 wint_t i;
1312 wctype_t *newtypes;
1313
1314 for (i = 0; i < NC; i++)
1315 if (iswctype(i, wct))
1316 CHadd(p, cs, i);
1317 newtypes = realloc(cs->types, (cs->ntypes + 1) *
1318 sizeof(*cs->types));
1319 if (newtypes == NULL) {
1320 SETERROR(REG_ESPACE);
48beacf6 1321 return;
e1e595a6
CV
1322 }
1323 cs->types = newtypes;
1324 cs->types[cs->ntypes++] = wct;
48beacf6
CF
1325}
1326
1327/*
1328 - dupl - emit a duplicate of a bunch of sops
e1e595a6 1329 == static sopno dupl(struct parse *p, sopno start, sopno finish);
48beacf6
CF
1330 */
1331static sopno /* start of duplicate */
e1e595a6
CV
1332dupl(struct parse *p,
1333 sopno start, /* from here */
1334 sopno finish) /* to this less one */
48beacf6 1335{
e1e595a6
CV
1336 sopno ret = HERE();
1337 sopno len = finish - start;
48beacf6
CF
1338
1339 assert(finish >= start);
1340 if (len == 0)
1341 return(ret);
1342 enlarge(p, p->ssize + len); /* this many unexpected additions */
1343 assert(p->ssize >= p->slen + len);
1344 (void) memcpy((char *)(p->strip + p->slen),
1345 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1346 p->slen += len;
1347 return(ret);
1348}
1349
1350/*
1351 - doemit - emit a strip operator
e1e595a6 1352 == static void doemit(struct parse *p, sop op, size_t opnd);
48beacf6
CF
1353 *
1354 * It might seem better to implement this as a macro with a function as
1355 * hard-case backup, but it's just too big and messy unless there are
1356 * some changes to the data structures. Maybe later.
1357 */
1358static void
e1e595a6 1359doemit(struct parse *p, sop op, size_t opnd)
48beacf6
CF
1360{
1361 /* avoid making error situations worse */
1362 if (p->error != 0)
1363 return;
1364
1365 /* deal with oversize operands ("can't happen", more or less) */
1366 assert(opnd < 1<<OPSHIFT);
1367
1368 /* deal with undersized strip */
1369 if (p->slen >= p->ssize)
1370 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1371 assert(p->slen < p->ssize);
1372
1373 /* finally, it's all reduced to the easy case */
1374 p->strip[p->slen++] = SOP(op, opnd);
1375}
1376
1377/*
1378 - doinsert - insert a sop into the strip
e1e595a6 1379 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
48beacf6
CF
1380 */
1381static void
e1e595a6 1382doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
48beacf6 1383{
e1e595a6
CV
1384 sopno sn;
1385 sop s;
1386 int i;
48beacf6
CF
1387
1388 /* avoid making error situations worse */
1389 if (p->error != 0)
1390 return;
1391
1392 sn = HERE();
1393 EMIT(op, opnd); /* do checks, ensure space */
1394 assert(HERE() == sn+1);
1395 s = p->strip[sn];
1396
1397 /* adjust paren pointers */
1398 assert(pos > 0);
1399 for (i = 1; i < NPAREN; i++) {
1400 if (p->pbegin[i] >= pos) {
1401 p->pbegin[i]++;
1402 }
1403 if (p->pend[i] >= pos) {
1404 p->pend[i]++;
1405 }
1406 }
1407
1408 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1409 (HERE()-pos-1)*sizeof(sop));
1410 p->strip[pos] = s;
1411}
1412
1413/*
1414 - dofwd - complete a forward reference
e1e595a6 1415 == static void dofwd(struct parse *p, sopno pos, sop value);
48beacf6
CF
1416 */
1417static void
e1e595a6 1418dofwd(struct parse *p, sopno pos, sop value)
48beacf6
CF
1419{
1420 /* avoid making error situations worse */
1421 if (p->error != 0)
1422 return;
1423
1424 assert(value < 1<<OPSHIFT);
1425 p->strip[pos] = OP(p->strip[pos]) | value;
1426}
1427
1428/*
1429 - enlarge - enlarge the strip
e1e595a6 1430 == static void enlarge(struct parse *p, sopno size);
48beacf6
CF
1431 */
1432static void
e1e595a6 1433enlarge(struct parse *p, sopno size)
48beacf6 1434{
e1e595a6 1435 sop *sp;
48beacf6
CF
1436
1437 if (p->ssize >= size)
1438 return;
1439
1440 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1441 if (sp == NULL) {
1442 SETERROR(REG_ESPACE);
1443 return;
1444 }
1445 p->strip = sp;
1446 p->ssize = size;
1447}
1448
1449/*
1450 - stripsnug - compact the strip
e1e595a6 1451 == static void stripsnug(struct parse *p, struct re_guts *g);
48beacf6
CF
1452 */
1453static void
e1e595a6 1454stripsnug(struct parse *p, struct re_guts *g)
48beacf6
CF
1455{
1456 g->nstates = p->slen;
1457 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1458 if (g->strip == NULL) {
1459 SETERROR(REG_ESPACE);
1460 g->strip = p->strip;
1461 }
1462}
1463
1464/*
1465 - findmust - fill in must and mlen with longest mandatory literal string
e1e595a6 1466 == static void findmust(struct parse *p, struct re_guts *g);
48beacf6
CF
1467 *
1468 * This algorithm could do fancy things like analyzing the operands of |
1469 * for common subsequences. Someday. This code is simple and finds most
1470 * of the interesting cases.
1471 *
1472 * Note that must and mlen got initialized during setup.
1473 */
1474static void
e1e595a6 1475findmust(struct parse *p, struct re_guts *g)
48beacf6 1476{
e1e595a6 1477 sop *scan;
61522196
CV
1478 sop *start = NULL;
1479 sop *newstart = NULL;
e1e595a6
CV
1480 sopno newlen;
1481 sop s;
1482 char *cp;
1483 int offset;
1484 char buf[MB_LEN_MAX];
1485 size_t clen;
1486 mbstate_t mbs;
48beacf6
CF
1487
1488 /* avoid making error situations worse */
1489 if (p->error != 0)
1490 return;
1491
e1e595a6
CV
1492 /*
1493 * It's not generally safe to do a ``char'' substring search on
1494 * multibyte character strings, but it's safe for at least
1495 * UTF-8 (see RFC 3629).
1496 */
1497 if (MB_CUR_MAX > 1 &&
1498#ifdef __CYGWIN__
44caccfc 1499 strcmp(__locale_charset (), "UTF-8") != 0)
e1e595a6
CV
1500#else
1501 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1502#endif
1503 return;
1504
48beacf6
CF
1505 /* find the longest OCHAR sequence in strip */
1506 newlen = 0;
e1e595a6
CV
1507 offset = 0;
1508 g->moffset = 0;
48beacf6
CF
1509 scan = g->strip + 1;
1510 do {
1511 s = *scan++;
1512 switch (OP(s)) {
1513 case OCHAR: /* sequence member */
e1e595a6
CV
1514 if (newlen == 0) { /* new sequence */
1515 memset(&mbs, 0, sizeof(mbs));
48beacf6 1516 newstart = scan - 1;
e1e595a6 1517 }
7bd2296c 1518 clen = xwcrtomb(buf, OPND(s), &mbs);
e1e595a6
CV
1519 if (clen == (size_t)-1)
1520 goto toohard;
1521 newlen += clen;
48beacf6
CF
1522 break;
1523 case OPLUS_: /* things that don't break one */
1524 case OLPAREN:
1525 case ORPAREN:
1526 break;
1527 case OQUEST_: /* things that must be skipped */
1528 case OCH_:
e1e595a6 1529 offset = altoffset(scan, offset);
48beacf6
CF
1530 scan--;
1531 do {
1532 scan += OPND(s);
1533 s = *scan;
1534 /* assert() interferes w debug printouts */
1535 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1536 OP(s) != OOR2) {
1537 g->iflags |= BAD;
1538 return;
1539 }
1540 } while (OP(s) != O_QUEST && OP(s) != O_CH);
e1e595a6
CV
1541 /* FALLTHROUGH */
1542 case OBOW: /* things that break a sequence */
1543 case OEOW:
1544 case OBOL:
1545 case OEOL:
1546 case O_QUEST:
1547 case O_CH:
1548 case OEND:
1549 if (newlen > g->mlen) { /* ends one */
1550 start = newstart;
1551 g->mlen = newlen;
1552 if (offset > -1) {
1553 g->moffset += offset;
1554 offset = newlen;
1555 } else
1556 g->moffset = offset;
1557 } else {
1558 if (offset > -1)
1559 offset += newlen;
1560 }
1561 newlen = 0;
1562 break;
1563 case OANY:
48beacf6
CF
1564 if (newlen > g->mlen) { /* ends one */
1565 start = newstart;
1566 g->mlen = newlen;
e1e595a6
CV
1567 if (offset > -1) {
1568 g->moffset += offset;
1569 offset = newlen;
1570 } else
1571 g->moffset = offset;
1572 } else {
1573 if (offset > -1)
1574 offset += newlen;
48beacf6 1575 }
e1e595a6
CV
1576 if (offset > -1)
1577 offset++;
1578 newlen = 0;
1579 break;
1580 case OANYOF: /* may or may not invalidate offset */
1581 /* First, everything as OANY */
1582 if (newlen > g->mlen) { /* ends one */
1583 start = newstart;
1584 g->mlen = newlen;
1585 if (offset > -1) {
1586 g->moffset += offset;
1587 offset = newlen;
1588 } else
1589 g->moffset = offset;
1590 } else {
1591 if (offset > -1)
1592 offset += newlen;
1593 }
1594 if (offset > -1)
1595 offset++;
1596 newlen = 0;
1597 break;
1598 toohard:
1599 default:
1600 /* Anything here makes it impossible or too hard
1601 * to calculate the offset -- so we give up;
1602 * save the last known good offset, in case the
1603 * must sequence doesn't occur later.
1604 */
1605 if (newlen > g->mlen) { /* ends one */
1606 start = newstart;
1607 g->mlen = newlen;
1608 if (offset > -1)
1609 g->moffset += offset;
1610 else
1611 g->moffset = offset;
1612 }
1613 offset = -1;
48beacf6
CF
1614 newlen = 0;
1615 break;
1616 }
1617 } while (OP(s) != OEND);
1618
e1e595a6
CV
1619 if (g->mlen == 0) { /* there isn't one */
1620 g->moffset = -1;
48beacf6 1621 return;
e1e595a6 1622 }
48beacf6
CF
1623
1624 /* turn it into a character string */
1625 g->must = malloc((size_t)g->mlen + 1);
1626 if (g->must == NULL) { /* argh; just forget it */
1627 g->mlen = 0;
e1e595a6 1628 g->moffset = -1;
48beacf6
CF
1629 return;
1630 }
1631 cp = g->must;
1632 scan = start;
e1e595a6
CV
1633 memset(&mbs, 0, sizeof(mbs));
1634 while (cp < g->must + g->mlen) {
48beacf6
CF
1635 while (OP(s = *scan++) != OCHAR)
1636 continue;
7bd2296c 1637 clen = xwcrtomb(cp, OPND(s), &mbs);
e1e595a6
CV
1638 assert(clen != (size_t)-1);
1639 cp += clen;
48beacf6
CF
1640 }
1641 assert(cp == g->must + g->mlen);
1642 *cp++ = '\0'; /* just on general principles */
1643}
1644
e1e595a6
CV
1645/*
1646 - altoffset - choose biggest offset among multiple choices
1647 == static int altoffset(sop *scan, int offset);
1648 *
1649 * Compute, recursively if necessary, the largest offset among multiple
1650 * re paths.
1651 */
1652static int
1653altoffset(sop *scan, int offset)
1654{
1655 int largest;
1656 int try;
1657 sop s;
1658
1659 /* If we gave up already on offsets, return */
1660 if (offset == -1)
1661 return -1;
1662
1663 largest = 0;
1664 try = 0;
1665 s = *scan++;
1666 while (OP(s) != O_QUEST && OP(s) != O_CH) {
1667 switch (OP(s)) {
1668 case OOR1:
1669 if (try > largest)
1670 largest = try;
1671 try = 0;
1672 break;
1673 case OQUEST_:
1674 case OCH_:
1675 try = altoffset(scan, try);
1676 if (try == -1)
1677 return -1;
1678 scan--;
1679 do {
1680 scan += OPND(s);
1681 s = *scan;
1682 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1683 OP(s) != OOR2)
1684 return -1;
1685 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1686 /* We must skip to the next position, or we'll
1687 * leave altoffset() too early.
1688 */
1689 scan++;
1690 break;
1691 case OANYOF:
1692 case OCHAR:
1693 case OANY:
1694 try++;
1695 case OBOW:
1696 case OEOW:
1697 case OLPAREN:
1698 case ORPAREN:
1699 case OOR2:
1700 break;
1701 default:
1702 try = -1;
1703 break;
1704 }
1705 if (try == -1)
1706 return -1;
1707 s = *scan++;
1708 }
1709
1710 if (try > largest)
1711 largest = try;
1712
1713 return largest+offset;
1714}
1715
1716/*
1717 - computejumps - compute char jumps for BM scan
1718 == static void computejumps(struct parse *p, struct re_guts *g);
1719 *
1720 * This algorithm assumes g->must exists and is has size greater than
1721 * zero. It's based on the algorithm found on Computer Algorithms by
1722 * Sara Baase.
1723 *
1724 * A char jump is the number of characters one needs to jump based on
1725 * the value of the character from the text that was mismatched.
1726 */
1727static void
1728computejumps(struct parse *p, struct re_guts *g)
1729{
1730 int ch;
1731 int mindex;
1732
1733 /* Avoid making errors worse */
1734 if (p->error != 0)
1735 return;
1736
1737 g->charjump = (int*) malloc((NC + 1) * sizeof(int));
1738 if (g->charjump == NULL) /* Not a fatal error */
1739 return;
1740 /* Adjust for signed chars, if necessary */
1741 g->charjump = &g->charjump[-(CHAR_MIN)];
1742
1743 /* If the character does not exist in the pattern, the jump
1744 * is equal to the number of characters in the pattern.
1745 */
1746 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1747 g->charjump[ch] = g->mlen;
1748
1749 /* If the character does exist, compute the jump that would
1750 * take us to the last character in the pattern equal to it
1751 * (notice that we match right to left, so that last character
1752 * is the first one that would be matched).
1753 */
1754 for (mindex = 0; mindex < g->mlen; mindex++)
1755 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1756}
1757
1758/*
1759 - computematchjumps - compute match jumps for BM scan
1760 == static void computematchjumps(struct parse *p, struct re_guts *g);
1761 *
1762 * This algorithm assumes g->must exists and is has size greater than
1763 * zero. It's based on the algorithm found on Computer Algorithms by
1764 * Sara Baase.
1765 *
1766 * A match jump is the number of characters one needs to advance based
1767 * on the already-matched suffix.
1768 * Notice that all values here are minus (g->mlen-1), because of the way
1769 * the search algorithm works.
1770 */
1771static void
1772computematchjumps(struct parse *p, struct re_guts *g)
1773{
1774 int mindex; /* General "must" iterator */
1775 int suffix; /* Keeps track of matching suffix */
1776 int ssuffix; /* Keeps track of suffixes' suffix */
1777 int* pmatches; /* pmatches[k] points to the next i
1778 * such that i+1...mlen is a substring
1779 * of k+1...k+mlen-i-1
1780 */
1781
1782 /* Avoid making errors worse */
1783 if (p->error != 0)
1784 return;
1785
1786 pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
1787 if (pmatches == NULL) {
1788 g->matchjump = NULL;
1789 return;
1790 }
1791
1792 g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
1793 if (g->matchjump == NULL) /* Not a fatal error */
1794 return;
1795
1796 /* Set maximum possible jump for each character in the pattern */
1797 for (mindex = 0; mindex < g->mlen; mindex++)
1798 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1799
1800 /* Compute pmatches[] */
1801 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1802 mindex--, suffix--) {
1803 pmatches[mindex] = suffix;
1804
1805 /* If a mismatch is found, interrupting the substring,
1806 * compute the matchjump for that position. If no
1807 * mismatch is found, then a text substring mismatched
1808 * against the suffix will also mismatch against the
1809 * substring.
1810 */
1811 while (suffix < g->mlen
1812 && g->must[mindex] != g->must[suffix]) {
1813 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1814 g->mlen - mindex - 1);
1815 suffix = pmatches[suffix];
1816 }
1817 }
1818
1819 /* Compute the matchjump up to the last substring found to jump
1820 * to the beginning of the largest must pattern prefix matching
1821 * it's own suffix.
1822 */
1823 for (mindex = 0; mindex <= suffix; mindex++)
1824 g->matchjump[mindex] = MIN(g->matchjump[mindex],
1825 g->mlen + suffix - mindex);
1826
1827 ssuffix = pmatches[suffix];
1828 while (suffix < g->mlen) {
1829 while (suffix <= ssuffix && suffix < g->mlen) {
1830 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1831 g->mlen + ssuffix - suffix);
1832 suffix++;
1833 }
1834 if (suffix < g->mlen)
1835 ssuffix = pmatches[ssuffix];
1836 }
1837
1838 free(pmatches);
1839}
1840
48beacf6
CF
1841/*
1842 - pluscount - count + nesting
e1e595a6 1843 == static sopno pluscount(struct parse *p, struct re_guts *g);
48beacf6
CF
1844 */
1845static sopno /* nesting depth */
e1e595a6 1846pluscount(struct parse *p, struct re_guts *g)
48beacf6 1847{
e1e595a6
CV
1848 sop *scan;
1849 sop s;
1850 sopno plusnest = 0;
1851 sopno maxnest = 0;
48beacf6
CF
1852
1853 if (p->error != 0)
1854 return(0); /* there may not be an OEND */
1855
1856 scan = g->strip + 1;
1857 do {
1858 s = *scan++;
1859 switch (OP(s)) {
1860 case OPLUS_:
1861 plusnest++;
1862 break;
1863 case O_PLUS:
1864 if (plusnest > maxnest)
1865 maxnest = plusnest;
1866 plusnest--;
1867 break;
1868 }
1869 } while (OP(s) != OEND);
1870 if (plusnest != 0)
1871 g->iflags |= BAD;
1872 return(maxnest);
1873}
This page took 0.61377 seconds and 5 git commands to generate.