]> sourceware.org Git - glibc.git/blame - posix/regcomp.c
Update.
[glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
ca3c505e 2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
3b0bdc72 21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
15a7d175 22 int length, reg_syntax_t syntax);
3b0bdc72 23static void re_compile_fastmap_iter (regex_t *bufp,
15a7d175
UD
24 const re_dfastate_t *init_state,
25 char *fastmap);
3b0bdc72 26static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
a9388965 27static reg_errcode_t init_word_char (re_dfa_t *dfa);
c0a0f9a3 28#ifdef RE_ENABLE_I18N
3b0bdc72 29static void free_charset (re_charset_t *cset);
c0a0f9a3 30#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
31static void free_workarea_compile (regex_t *preg);
32static reg_errcode_t create_initial_state (re_dfa_t *dfa);
14744156
UD
33#ifdef RE_ENABLE_I18N
34static void optimize_utf8 (re_dfa_t *dfa);
35#endif
3b0bdc72
UD
36static reg_errcode_t analyze (re_dfa_t *dfa);
37static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
485d775d 41static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
15a7d175
UD
42 int top_clone_node, int root_node,
43 unsigned int constraint);
a9388965 44static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
15a7d175 45 unsigned int constraint);
a7d5c291
UD
46static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
3b0bdc72 48static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965 49static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
15a7d175 50 int node, int root);
3b0bdc72
UD
51static void calc_inveclosure (re_dfa_t *dfa);
52static int fetch_number (re_string_t *input, re_token_t *token,
15a7d175 53 reg_syntax_t syntax);
3b0bdc72
UD
54static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
55static int peek_token (re_token_t *token, re_string_t *input,
15a7d175 56 reg_syntax_t syntax);
3b0bdc72 57static int peek_token_bracket (re_token_t *token, re_string_t *input,
15a7d175 58 reg_syntax_t syntax);
3b0bdc72 59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
15a7d175 60 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
3b0bdc72 64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
15a7d175
UD
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
3b0bdc72 67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
15a7d175
UD
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
3b0bdc72 70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
3b0bdc72 73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
15a7d175
UD
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
15a7d175
UD
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
3b0bdc72 79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
15a7d175
UD
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax);
3b0bdc72 84static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
15a7d175
UD
85 re_string_t *regexp,
86 re_token_t *token);
434d3784 87#ifndef _LIBC
c0a0f9a3
UD
88# ifdef RE_ENABLE_I18N
89static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
15a7d175
UD
90 re_charset_t *mbcset, int *range_alloc,
91 bracket_elem_t *start_elem,
92 bracket_elem_t *end_elem);
c0a0f9a3 93static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
15a7d175
UD
94 re_charset_t *mbcset,
95 int *coll_sym_alloc,
96 const unsigned char *name);
c0a0f9a3
UD
97# else /* not RE_ENABLE_I18N */
98static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
15a7d175
UD
99 bracket_elem_t *start_elem,
100 bracket_elem_t *end_elem);
c0a0f9a3 101static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
15a7d175 102 const unsigned char *name);
c0a0f9a3 103# endif /* not RE_ENABLE_I18N */
434d3784 104#endif /* not _LIBC */
c0a0f9a3
UD
105#ifdef RE_ENABLE_I18N
106static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
15a7d175
UD
107 re_charset_t *mbcset,
108 int *equiv_class_alloc,
109 const unsigned char *name);
66b110e8
UD
110static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
111 re_bitset_ptr_t sbcset,
15a7d175
UD
112 re_charset_t *mbcset,
113 int *char_class_alloc,
114 const unsigned char *class_name,
115 reg_syntax_t syntax);
c0a0f9a3
UD
116#else /* not RE_ENABLE_I18N */
117static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
15a7d175 118 const unsigned char *name);
66b110e8
UD
119static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
120 re_bitset_ptr_t sbcset,
15a7d175
UD
121 const unsigned char *class_name,
122 reg_syntax_t syntax);
c0a0f9a3 123#endif /* not RE_ENABLE_I18N */
e2b6bfa3 124static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
06e8303a 125 const unsigned char *class_name,
e2b6bfa3
UD
126 const unsigned char *extra, int not,
127 reg_errcode_t *err);
3b0bdc72
UD
128static void free_bin_tree (bin_tree_t *tree);
129static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
15a7d175 130 re_token_type_t type, int index);
3b0bdc72
UD
131static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
132\f
133/* This table gives an error message for each of the error codes listed
134 in regex.h. Obviously the order here has to be same as there.
135 POSIX doesn't require that we do anything for REG_NOERROR,
136 but why not be nice? */
137
6455d255 138const char __re_error_msgid[] attribute_hidden =
3b0bdc72
UD
139 {
140#define REG_NOERROR_IDX 0
141 gettext_noop ("Success") /* REG_NOERROR */
142 "\0"
143#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
144 gettext_noop ("No match") /* REG_NOMATCH */
145 "\0"
146#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
147 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
148 "\0"
149#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
150 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
151 "\0"
152#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
153 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
154 "\0"
155#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
156 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
157 "\0"
158#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
159 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
160 "\0"
161#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
162 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
163 "\0"
164#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
165 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
166 "\0"
167#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
168 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
169 "\0"
170#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
171 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
172 "\0"
173#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
174 gettext_noop ("Invalid range end") /* REG_ERANGE */
175 "\0"
176#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
177 gettext_noop ("Memory exhausted") /* REG_ESPACE */
178 "\0"
179#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
180 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
181 "\0"
182#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
183 gettext_noop ("Premature end of regular expression") /* REG_EEND */
184 "\0"
185#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
186 gettext_noop ("Regular expression too big") /* REG_ESIZE */
187 "\0"
188#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
189 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
190 };
191
6455d255 192const size_t __re_error_msgid_idx[] attribute_hidden =
3b0bdc72
UD
193 {
194 REG_NOERROR_IDX,
195 REG_NOMATCH_IDX,
196 REG_BADPAT_IDX,
197 REG_ECOLLATE_IDX,
198 REG_ECTYPE_IDX,
199 REG_EESCAPE_IDX,
200 REG_ESUBREG_IDX,
201 REG_EBRACK_IDX,
202 REG_EPAREN_IDX,
203 REG_EBRACE_IDX,
204 REG_BADBR_IDX,
205 REG_ERANGE_IDX,
206 REG_ESPACE_IDX,
207 REG_BADRPT_IDX,
208 REG_EEND_IDX,
209 REG_ESIZE_IDX,
210 REG_ERPAREN_IDX
211 };
212\f
213/* Entry points for GNU code. */
214
215/* re_compile_pattern is the GNU regular expression compiler: it
ac3d553b 216 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
3b0bdc72
UD
217 Returns 0 if the pattern was valid, otherwise an error string.
218
219 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
220 are set in BUFP on entry. */
221
222const char *
223re_compile_pattern (pattern, length, bufp)
224 const char *pattern;
225 size_t length;
226 struct re_pattern_buffer *bufp;
227{
228 reg_errcode_t ret;
229
3b0bdc72
UD
230 /* And GNU code determines whether or not to get register information
231 by passing null for the REGS argument to re_match, etc., not by
232 setting no_sub. */
233 bufp->no_sub = 0;
234
235 /* Match anchors at newline. */
236 bufp->newline_anchor = 1;
237
75e4a282 238 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
3b0bdc72
UD
239
240 if (!ret)
241 return NULL;
6455d255 242 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72
UD
243}
244#ifdef _LIBC
245weak_alias (__re_compile_pattern, re_compile_pattern)
246#endif
247
248/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
249 also be assigned to arbitrarily: each pattern buffer stores its own
250 syntax, so it can be changed between regex compilations. */
251/* This has no initializer because initialized variables in Emacs
252 become read-only after dumping. */
253reg_syntax_t re_syntax_options;
254
255
256/* Specify the precise syntax of regexps for compilation. This provides
257 for compatibility for various utilities which historically have
258 different, incompatible syntaxes.
259
260 The argument SYNTAX is a bit mask comprised of the various bits
261 defined in regex.h. We return the old syntax. */
262
263reg_syntax_t
264re_set_syntax (syntax)
265 reg_syntax_t syntax;
266{
267 reg_syntax_t ret = re_syntax_options;
268
269 re_syntax_options = syntax;
270 return ret;
271}
272#ifdef _LIBC
273weak_alias (__re_set_syntax, re_set_syntax)
274#endif
275
276int
277re_compile_fastmap (bufp)
278 struct re_pattern_buffer *bufp;
279{
280 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
281 char *fastmap = bufp->fastmap;
282
283 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
284 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
285 if (dfa->init_state != dfa->init_state_word)
286 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
287 if (dfa->init_state != dfa->init_state_nl)
288 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
289 if (dfa->init_state != dfa->init_state_begbuf)
290 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
291 bufp->fastmap_accurate = 1;
292 return 0;
293}
294#ifdef _LIBC
295weak_alias (__re_compile_fastmap, re_compile_fastmap)
296#endif
297
1b2c2628 298static inline void
25337753 299__attribute ((always_inline))
1b2c2628
UD
300re_set_fastmap (char *fastmap, int icase, int ch)
301{
302 fastmap[ch] = 1;
303 if (icase)
304 fastmap[tolower (ch)] = 1;
305}
306
3b0bdc72
UD
307/* Helper function for re_compile_fastmap.
308 Compile fastmap for the initial_state INIT_STATE. */
309
310static void
311re_compile_fastmap_iter (bufp, init_state, fastmap)
312 regex_t *bufp;
313 const re_dfastate_t *init_state;
314 char *fastmap;
315{
316 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
317 int node_cnt;
3c0fb574 318 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
3b0bdc72
UD
319 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
320 {
321 int node = init_state->nodes.elems[node_cnt];
322 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72
UD
323
324 if (type == CHARACTER)
74e12fbc
UD
325 {
326 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
327#ifdef RE_ENABLE_I18N
14744156 328 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc 329 {
3c0fb574 330 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
74e12fbc
UD
331 wchar_t wc;
332 mbstate_t state;
333
334 p = buf;
335 *p++ = dfa->nodes[node].opr.c;
336 while (++node < dfa->nodes_len
337 && dfa->nodes[node].type == CHARACTER
338 && dfa->nodes[node].mb_partial)
339 *p++ = dfa->nodes[node].opr.c;
340 memset (&state, 0, sizeof (state));
341 if (mbrtowc (&wc, (const char *) buf, p - buf,
342 &state) == p - buf
343 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
344 re_set_fastmap (fastmap, 0, buf[0]);
345 }
346#endif
347 }
3b0bdc72 348 else if (type == SIMPLE_BRACKET)
15a7d175
UD
349 {
350 int i, j, ch;
351 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
352 for (j = 0; j < UINT_BITS; ++j, ++ch)
353 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
354 re_set_fastmap (fastmap, icase, ch);
355 }
c0a0f9a3 356#ifdef RE_ENABLE_I18N
3b0bdc72 357 else if (type == COMPLEX_BRACKET)
15a7d175
UD
358 {
359 int i;
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
362 || cset->nranges || cset->nchar_classes)
363 {
c0a0f9a3 364# ifdef _LIBC
15a7d175
UD
365 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
366 {
367 /* In this case we want to catch the bytes which are
368 the first byte of any collation elements.
369 e.g. In da_DK, we want to catch 'a' since "aa"
370 is a valid collation element, and don't catch
371 'b' since 'b' is the only collation element
372 which starts from 'b'. */
373 int j, ch;
374 const int32_t *table = (const int32_t *)
375 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
376 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
377 for (j = 0; j < UINT_BITS; ++j, ++ch)
378 if (table[ch] < 0)
379 re_set_fastmap (fastmap, icase, ch);
380 }
c0a0f9a3 381# else
3c0fb574 382 if (dfa->mb_cur_max > 1)
15a7d175
UD
383 for (i = 0; i < SBC_MAX; ++i)
384 if (__btowc (i) == WEOF)
385 re_set_fastmap (fastmap, icase, i);
c0a0f9a3 386# endif /* not _LIBC */
15a7d175
UD
387 }
388 for (i = 0; i < cset->nmbchars; ++i)
389 {
390 char buf[256];
391 mbstate_t state;
392 memset (&state, '\0', sizeof (state));
393 __wcrtomb (buf, cset->mbchars[i], &state);
394 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
14744156 395 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc
UD
396 {
397 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
398 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
399 }
15a7d175
UD
400 }
401 }
c0a0f9a3 402#endif /* RE_ENABLE_I18N */
1b2c2628 403 else if (type == END_OF_RE || type == OP_PERIOD)
15a7d175
UD
404 {
405 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
406 if (type == END_OF_RE)
407 bufp->can_be_null = 1;
408 return;
409 }
3b0bdc72
UD
410 }
411}
412\f
413/* Entry point for POSIX code. */
414/* regcomp takes a regular expression as a string and compiles it.
415
416 PREG is a regex_t *. We do not expect any fields to be initialized,
417 since POSIX says we shouldn't. Thus, we set
418
419 `buffer' to the compiled pattern;
420 `used' to the length of the compiled pattern;
421 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
422 REG_EXTENDED bit in CFLAGS is set; otherwise, to
423 RE_SYNTAX_POSIX_BASIC;
424 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
425 `fastmap' to an allocated space for the fastmap;
426 `fastmap_accurate' to zero;
427 `re_nsub' to the number of subexpressions in PATTERN.
428
429 PATTERN is the address of the pattern string.
430
431 CFLAGS is a series of bits which affect compilation.
432
433 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
434 use POSIX basic syntax.
435
436 If REG_NEWLINE is set, then . and [^...] don't match newline.
437 Also, regexec will try a match beginning after every newline.
438
439 If REG_ICASE is set, then we considers upper- and lowercase
440 versions of letters to be equivalent when matching.
441
442 If REG_NOSUB is set, then when PREG is passed to regexec, that
443 routine will report only success or failure, and nothing about the
444 registers.
445
446 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
447 the return codes and their meanings.) */
448
449int
450regcomp (preg, pattern, cflags)
75e4a282
UD
451 regex_t *__restrict preg;
452 const char *__restrict pattern;
3b0bdc72
UD
453 int cflags;
454{
455 reg_errcode_t ret;
456 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
15a7d175 457 : RE_SYNTAX_POSIX_BASIC);
3b0bdc72
UD
458
459 preg->buffer = NULL;
460 preg->allocated = 0;
461 preg->used = 0;
462
463 /* Try to allocate space for the fastmap. */
464 preg->fastmap = re_malloc (char, SBC_MAX);
bc15410e 465 if (BE (preg->fastmap == NULL, 0))
3b0bdc72
UD
466 return REG_ESPACE;
467
468 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
469
470 /* If REG_NEWLINE is set, newlines are treated differently. */
471 if (cflags & REG_NEWLINE)
472 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
473 syntax &= ~RE_DOT_NEWLINE;
474 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
475 /* It also changes the matching behavior. */
476 preg->newline_anchor = 1;
477 }
478 else
479 preg->newline_anchor = 0;
480 preg->no_sub = !!(cflags & REG_NOSUB);
481 preg->translate = NULL;
482
483 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
484
485 /* POSIX doesn't distinguish between an unmatched open-group and an
486 unmatched close-group: both are REG_EPAREN. */
487 if (ret == REG_ERPAREN)
488 ret = REG_EPAREN;
489
a9388965 490 /* We have already checked preg->fastmap != NULL. */
bc15410e 491 if (BE (ret == REG_NOERROR, 1))
71ccd330
UD
492 /* Compute the fastmap now, since regexec cannot modify the pattern
493 buffer. This function nevers fails in this implementation. */
83b038f2 494 (void) re_compile_fastmap (preg);
71ccd330 495 else
3b0bdc72 496 {
71ccd330
UD
497 /* Some error occurred while compiling the expression. */
498 re_free (preg->fastmap);
499 preg->fastmap = NULL;
3b0bdc72
UD
500 }
501
502 return (int) ret;
503}
504#ifdef _LIBC
505weak_alias (__regcomp, regcomp)
506#endif
507
508/* Returns a message corresponding to an error code, ERRCODE, returned
509 from either regcomp or regexec. We don't use PREG here. */
510
511size_t
512regerror (errcode, preg, errbuf, errbuf_size)
513 int errcode;
514 const regex_t *preg;
515 char *errbuf;
516 size_t errbuf_size;
517{
518 const char *msg;
519 size_t msg_size;
520
bc15410e 521 if (BE (errcode < 0
15a7d175
UD
522 || errcode >= (int) (sizeof (__re_error_msgid_idx)
523 / sizeof (__re_error_msgid_idx[0])), 0))
3b0bdc72
UD
524 /* Only error codes returned by the rest of the code should be passed
525 to this routine. If we are given anything else, or if other regex
526 code generates an invalid error code, then the program has a bug.
527 Dump core so we can fix it. */
528 abort ();
529
6455d255 530 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
531
532 msg_size = strlen (msg) + 1; /* Includes the null. */
533
bc15410e 534 if (BE (errbuf_size != 0, 1))
3b0bdc72 535 {
bc15410e 536 if (BE (msg_size > errbuf_size, 0))
15a7d175 537 {
3b0bdc72
UD
538#if defined HAVE_MEMPCPY || defined _LIBC
539 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
540#else
15a7d175
UD
541 memcpy (errbuf, msg, errbuf_size - 1);
542 errbuf[errbuf_size - 1] = 0;
3b0bdc72 543#endif
15a7d175 544 }
3b0bdc72 545 else
15a7d175 546 memcpy (errbuf, msg, msg_size);
3b0bdc72
UD
547 }
548
549 return msg_size;
550}
551#ifdef _LIBC
552weak_alias (__regerror, regerror)
553#endif
554
3b0bdc72 555
71ccd330
UD
556static void
557free_dfa_content (re_dfa_t *dfa)
3b0bdc72
UD
558{
559 int i, j;
3b0bdc72 560
71ccd330
UD
561 re_free (dfa->subexps);
562
563 for (i = 0; i < dfa->nodes_len; ++i)
564 {
565 re_token_t *node = dfa->nodes + i;
c0a0f9a3 566#ifdef RE_ENABLE_I18N
71ccd330
UD
567 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
568 free_charset (node->opr.mbcset);
569 else
c0a0f9a3 570#endif /* RE_ENABLE_I18N */
71ccd330
UD
571 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
572 re_free (node->opr.sbcset);
573 }
574 re_free (dfa->nexts);
575 for (i = 0; i < dfa->nodes_len; ++i)
576 {
577 if (dfa->eclosures != NULL)
578 re_node_set_free (dfa->eclosures + i);
579 if (dfa->inveclosures != NULL)
580 re_node_set_free (dfa->inveclosures + i);
581 if (dfa->edests != NULL)
582 re_node_set_free (dfa->edests + i);
583 }
584 re_free (dfa->edests);
585 re_free (dfa->eclosures);
586 re_free (dfa->inveclosures);
587 re_free (dfa->nodes);
3b0bdc72 588
71ccd330
UD
589 for (i = 0; i <= dfa->state_hash_mask; ++i)
590 {
591 struct re_state_table_entry *entry = dfa->state_table + i;
592 for (j = 0; j < entry->num; ++j)
593 {
594 re_dfastate_t *state = entry->array[j];
b14993a2 595 free_state (state);
71ccd330
UD
596 }
597 re_free (entry->array);
598 }
599 re_free (dfa->state_table);
3b0bdc72 600
71ccd330
UD
601 if (dfa->word_char != NULL)
602 re_free (dfa->word_char);
0742e48e 603#ifdef DEBUG
71ccd330 604 re_free (dfa->re_str);
0742e48e 605#endif
71ccd330
UD
606
607 re_free (dfa);
608}
609
610
611/* Free dynamically allocated space used by PREG. */
612
613void
614regfree (preg)
615 regex_t *preg;
616{
617 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
618 if (BE (dfa != NULL, 1))
619 free_dfa_content (dfa);
620
3b0bdc72
UD
621 re_free (preg->fastmap);
622}
623#ifdef _LIBC
624weak_alias (__regfree, regfree)
625#endif
626\f
627/* Entry points compatible with 4.2 BSD regex library. We don't define
628 them unless specifically requested. */
629
630#if defined _REGEX_RE_COMP || defined _LIBC
631
632/* BSD has one and only one pattern buffer. */
633static struct re_pattern_buffer re_comp_buf;
634
635char *
636# ifdef _LIBC
637/* Make these definitions weak in libc, so POSIX programs can redefine
638 these names if they don't use our functions, and still use
639 regcomp/regexec above without link errors. */
640weak_function
641# endif
642re_comp (s)
643 const char *s;
644{
645 reg_errcode_t ret;
240e87c2 646 char *fastmap;
3b0bdc72
UD
647
648 if (!s)
649 {
650 if (!re_comp_buf.buffer)
651 return gettext ("No previous regular expression");
652 return 0;
653 }
654
240e87c2
RM
655 if (re_comp_buf.buffer)
656 {
657 fastmap = re_comp_buf.fastmap;
658 re_comp_buf.fastmap = NULL;
659 __regfree (&re_comp_buf);
1b2c2628 660 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
240e87c2
RM
661 re_comp_buf.fastmap = fastmap;
662 }
663
664 if (re_comp_buf.fastmap == NULL)
3b0bdc72
UD
665 {
666 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
667 if (re_comp_buf.fastmap == NULL)
6455d255
UD
668 return (char *) gettext (__re_error_msgid
669 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
670 }
671
672 /* Since `re_exec' always passes NULL for the `regs' argument, we
673 don't need to initialize the pattern buffer fields which affect it. */
674
675 /* Match anchors at newlines. */
676 re_comp_buf.newline_anchor = 1;
677
678 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
679
680 if (!ret)
681 return NULL;
682
683 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6455d255 684 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 685}
240e87c2
RM
686
687#ifdef _LIBC
c877418f 688libc_freeres_fn (free_mem)
240e87c2
RM
689{
690 __regfree (&re_comp_buf);
691}
240e87c2
RM
692#endif
693
3b0bdc72
UD
694#endif /* _REGEX_RE_COMP */
695\f
696/* Internal entry point.
697 Compile the regular expression PATTERN, whose length is LENGTH.
698 SYNTAX indicate regular expression's syntax. */
699
700static reg_errcode_t
701re_compile_internal (preg, pattern, length, syntax)
702 regex_t *preg;
703 const char * pattern;
704 int length;
705 reg_syntax_t syntax;
706{
707 reg_errcode_t err = REG_NOERROR;
708 re_dfa_t *dfa;
709 re_string_t regexp;
710
711 /* Initialize the pattern buffer. */
712 preg->fastmap_accurate = 0;
713 preg->syntax = syntax;
714 preg->not_bol = preg->not_eol = 0;
715 preg->used = 0;
716 preg->re_nsub = 0;
1b2c2628
UD
717 preg->can_be_null = 0;
718 preg->regs_allocated = REGS_UNALLOCATED;
3b0bdc72
UD
719
720 /* Initialize the dfa. */
721 dfa = (re_dfa_t *) preg->buffer;
722 if (preg->allocated < sizeof (re_dfa_t))
723 {
724 /* If zero allocated, but buffer is non-null, try to realloc
725 enough space. This loses if buffer's address is bogus, but
726 that is the user's responsibility. If ->buffer is NULL this
727 is a simple allocation. */
728 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
729 if (dfa == NULL)
730 return REG_ESPACE;
3b0bdc72
UD
731 preg->allocated = sizeof (re_dfa_t);
732 }
733 preg->buffer = (unsigned char *) dfa;
734 preg->used = sizeof (re_dfa_t);
735
736 err = init_dfa (dfa, length);
bc15410e 737 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
738 {
739 re_free (dfa);
740 preg->buffer = NULL;
ca3c505e 741 preg->allocated = 0;
3b0bdc72
UD
742 return err;
743 }
0742e48e
UD
744#ifdef DEBUG
745 dfa->re_str = re_malloc (char, length + 1);
746 strncpy (dfa->re_str, pattern, length + 1);
747#endif
3b0bdc72 748
612546c6 749 err = re_string_construct (&regexp, pattern, length, preg->translate,
3c0fb574
UD
750 syntax & RE_ICASE, dfa->mb_cur_max,
751 dfa->is_utf8);
bc15410e 752 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
753 {
754 re_free (dfa);
755 preg->buffer = NULL;
ca3c505e 756 preg->allocated = 0;
3b0bdc72
UD
757 return err;
758 }
759
760 /* Parse the regular expression, and build a structure tree. */
761 preg->re_nsub = 0;
762 dfa->str_tree = parse (&regexp, preg, syntax, &err);
bc15410e 763 if (BE (dfa->str_tree == NULL, 0))
3b0bdc72
UD
764 goto re_compile_internal_free_return;
765
14744156
UD
766#ifdef RE_ENABLE_I18N
767 /* If possible, do searching in single byte encoding to speed things up. */
768 if (dfa->is_utf8 && !(syntax & RE_ICASE))
769 optimize_utf8 (dfa);
770#endif
771
3b0bdc72
UD
772 /* Analyze the tree and collect information which is necessary to
773 create the dfa. */
774 err = analyze (dfa);
bc15410e 775 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
776 goto re_compile_internal_free_return;
777
778 /* Then create the initial state of the dfa. */
779 err = create_initial_state (dfa);
3b0bdc72 780
3b0bdc72
UD
781 /* Release work areas. */
782 free_workarea_compile (preg);
783 re_string_destruct (&regexp);
784
71ccd330
UD
785 if (BE (err != REG_NOERROR, 0))
786 {
787 re_compile_internal_free_return:
788 free_dfa_content (dfa);
789 preg->buffer = NULL;
ca3c505e 790 preg->allocated = 0;
71ccd330
UD
791 }
792
3b0bdc72
UD
793 return err;
794}
795
796/* Initialize DFA. We use the length of the regular expression PAT_LEN
797 as the initial length of some arrays. */
798
799static reg_errcode_t
800init_dfa (dfa, pat_len)
801 re_dfa_t *dfa;
802 int pat_len;
803{
804 int table_size;
81c64d40
UD
805
806 memset (dfa, '\0', sizeof (re_dfa_t));
807
3b0bdc72
UD
808 dfa->nodes_alloc = pat_len + 1;
809 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
810
811 dfa->states_alloc = pat_len + 1;
812
813 /* table_size = 2 ^ ceil(log pat_len) */
814 for (table_size = 1; table_size > 0; table_size <<= 1)
815 if (table_size > pat_len)
816 break;
817
818 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
819 dfa->state_hash_mask = table_size - 1;
820
821 dfa->subexps_alloc = 1;
822 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
823 dfa->word_char = NULL;
824
3c0fb574
UD
825 dfa->mb_cur_max = MB_CUR_MAX;
826#ifdef _LIBC
827 if (dfa->mb_cur_max > 1
828 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
829 dfa->is_utf8 = 1;
830#endif
831
bc15410e 832 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
15a7d175 833 || dfa->subexps == NULL, 0))
3b0bdc72
UD
834 {
835 /* We don't bother to free anything which was allocated. Very
836 soon the process will go down anyway. */
837 dfa->subexps = NULL;
838 dfa->state_table = NULL;
839 dfa->nodes = NULL;
840 return REG_ESPACE;
841 }
842 return REG_NOERROR;
843}
844
845/* Initialize WORD_CHAR table, which indicate which character is
846 "word". In this case "word" means that it is the word construction
847 character used by some operators like "\<", "\>", etc. */
848
a9388965 849static reg_errcode_t
3b0bdc72
UD
850init_word_char (dfa)
851 re_dfa_t *dfa;
852{
853 int i, j, ch;
854 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
bc15410e 855 if (BE (dfa->word_char == NULL, 0))
a9388965 856 return REG_ESPACE;
3b0bdc72
UD
857 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
858 for (j = 0; j < UINT_BITS; ++j, ++ch)
859 if (isalnum (ch) || ch == '_')
15a7d175 860 dfa->word_char[i] |= 1 << j;
a9388965 861 return REG_NOERROR;
3b0bdc72
UD
862}
863
864/* Free the work area which are only used while compiling. */
865
866static void
867free_workarea_compile (preg)
868 regex_t *preg;
869{
870 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
871 free_bin_tree (dfa->str_tree);
872 dfa->str_tree = NULL;
a7d5c291
UD
873 re_free (dfa->org_indices);
874 dfa->org_indices = NULL;
3b0bdc72
UD
875}
876
877/* Create initial states for all contexts. */
878
879static reg_errcode_t
880create_initial_state (dfa)
881 re_dfa_t *dfa;
882{
883 int first, i;
884 reg_errcode_t err;
885 re_node_set init_nodes;
886
887 /* Initial states have the epsilon closure of the node which is
888 the first node of the regular expression. */
889 first = dfa->str_tree->first;
890 dfa->init_node = first;
891 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
bc15410e 892 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
893 return err;
894
895 /* The back-references which are in initial states can epsilon transit,
896 since in this case all of the subexpressions can be null.
897 Then we add epsilon closures of the nodes which are the next nodes of
898 the back-references. */
899 if (dfa->nbackref > 0)
900 for (i = 0; i < init_nodes.nelem; ++i)
901 {
15a7d175
UD
902 int node_idx = init_nodes.elems[i];
903 re_token_type_t type = dfa->nodes[node_idx].type;
904
905 int clexp_idx;
906 if (type != OP_BACK_REF)
907 continue;
908 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
909 {
910 re_token_t *clexp_node;
911 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
912 if (clexp_node->type == OP_CLOSE_SUBEXP
913 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
914 break;
915 }
916 if (clexp_idx == init_nodes.nelem)
917 continue;
918
919 if (type == OP_BACK_REF)
920 {
921 int dest_idx = dfa->edests[node_idx].elems[0];
922 if (!re_node_set_contains (&init_nodes, dest_idx))
923 {
924 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
925 i = 0;
926 }
927 }
3b0bdc72
UD
928 }
929
930 /* It must be the first time to invoke acquire_state. */
a9388965
UD
931 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
932 /* We don't check ERR here, since the initial state must not be NULL. */
bc15410e 933 if (BE (dfa->init_state == NULL, 0))
a9388965 934 return err;
3b0bdc72
UD
935 if (dfa->init_state->has_constraint)
936 {
a9388965 937 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 938 CONTEXT_WORD);
a9388965 939 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 940 CONTEXT_NEWLINE);
a9388965 941 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
15a7d175
UD
942 &init_nodes,
943 CONTEXT_NEWLINE
944 | CONTEXT_BEGBUF);
bc15410e 945 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
15a7d175
UD
946 || dfa->init_state_begbuf == NULL, 0))
947 return err;
3b0bdc72
UD
948 }
949 else
950 dfa->init_state_word = dfa->init_state_nl
951 = dfa->init_state_begbuf = dfa->init_state;
952
3b0bdc72
UD
953 re_node_set_free (&init_nodes);
954 return REG_NOERROR;
955}
956\f
14744156
UD
957#ifdef RE_ENABLE_I18N
958/* If it is possible to do searching in single byte encoding instead of UTF-8
959 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
960 DFA nodes where needed. */
961
962static void
963optimize_utf8 (dfa)
964 re_dfa_t *dfa;
965{
966 int node;
967
968 for (node = 0; node < dfa->nodes_len; ++node)
969 switch (dfa->nodes[node].type)
970 {
971 case CHARACTER:
972 /* Chars >= 0x80 are optimizable in some cases (e.g. when not
973 followed by DUP operator, not in bracket etc.).
974 For now punt on them all. */
975 if (dfa->nodes[node].opr.c >= 0x80)
976 return;
977 break;
978 case ANCHOR:
979 switch (dfa->nodes[node].opr.idx)
980 {
981 case LINE_FIRST:
982 case LINE_LAST:
983 case BUF_FIRST:
984 case BUF_LAST:
985 break;
986 default:
987 /* Word anchors etc. cannot be handled. */
988 return;
989 }
990 break;
991 case OP_BACK_REF:
992 case OP_ALT:
993 case END_OF_RE:
994 case BACK_SLASH:
995 case OP_DUP_ASTERISK:
996 case OP_DUP_QUESTION:
997 case OP_DUP_PLUS:
998 case OP_OPEN_SUBEXP:
999 case OP_CLOSE_SUBEXP:
1000 break;
1001 default:
1002 return;
1003 }
1004
1005 /* The search can be in single byte locale. */
1006 dfa->mb_cur_max = 1;
1007 dfa->is_utf8 = 0;
1008 dfa->has_mb_node = dfa->nbackref > 0;
1009}
1010#endif
1011\f
3b0bdc72
UD
1012/* Analyze the structure tree, and calculate "first", "next", "edest",
1013 "eclosure", and "inveclosure". */
1014
1015static reg_errcode_t
1016analyze (dfa)
1017 re_dfa_t *dfa;
1018{
1019 int i;
1020 reg_errcode_t ret;
1021
1022 /* Allocate arrays. */
3b0bdc72 1023 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
a7d5c291 1024 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3b0bdc72
UD
1025 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1026 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1027 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
a7d5c291 1028 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
15a7d175 1029 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
3b0bdc72
UD
1030 return REG_ESPACE;
1031 /* Initialize them. */
1032 for (i = 0; i < dfa->nodes_len; ++i)
1033 {
3b0bdc72
UD
1034 dfa->nexts[i] = -1;
1035 re_node_set_init_empty (dfa->edests + i);
1036 re_node_set_init_empty (dfa->eclosures + i);
1037 re_node_set_init_empty (dfa->inveclosures + i);
1038 }
1039
1040 ret = analyze_tree (dfa, dfa->str_tree);
bc15410e 1041 if (BE (ret == REG_NOERROR, 1))
3b0bdc72
UD
1042 {
1043 ret = calc_eclosure (dfa);
1044 if (ret == REG_NOERROR)
1045 calc_inveclosure (dfa);
1046 }
1047 return ret;
1048}
1049
1050/* Helper functions for analyze.
1051 This function calculate "first", "next", and "edest" for the subtree
1052 whose root is NODE. */
1053
1054static reg_errcode_t
1055analyze_tree (dfa, node)
1056 re_dfa_t *dfa;
1057 bin_tree_t *node;
1058{
1059 reg_errcode_t ret;
1060 if (node->first == -1)
1061 calc_first (dfa, node);
1062 if (node->next == -1)
1063 calc_next (dfa, node);
1064 if (node->eclosure.nelem == 0)
1065 calc_epsdest (dfa, node);
1066 /* Calculate "first" etc. for the left child. */
1067 if (node->left != NULL)
1068 {
1069 ret = analyze_tree (dfa, node->left);
bc15410e 1070 if (BE (ret != REG_NOERROR, 0))
15a7d175 1071 return ret;
3b0bdc72
UD
1072 }
1073 /* Calculate "first" etc. for the right child. */
1074 if (node->right != NULL)
1075 {
1076 ret = analyze_tree (dfa, node->right);
bc15410e 1077 if (BE (ret != REG_NOERROR, 0))
15a7d175 1078 return ret;
3b0bdc72
UD
1079 }
1080 return REG_NOERROR;
1081}
1082
1083/* Calculate "first" for the node NODE. */
1084static void
1085calc_first (dfa, node)
1086 re_dfa_t *dfa;
1087 bin_tree_t *node;
1088{
1089 int idx, type;
1090 idx = node->node_idx;
1091 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1092
1093 switch (type)
1094 {
1095#ifdef DEBUG
3b0bdc72
UD
1096 case OP_OPEN_BRACKET:
1097 case OP_CLOSE_BRACKET:
1098 case OP_OPEN_DUP_NUM:
1099 case OP_CLOSE_DUP_NUM:
1100 case OP_NON_MATCH_LIST:
1101 case OP_OPEN_COLL_ELEM:
1102 case OP_CLOSE_COLL_ELEM:
1103 case OP_OPEN_EQUIV_CLASS:
1104 case OP_CLOSE_EQUIV_CLASS:
1105 case OP_OPEN_CHAR_CLASS:
1106 case OP_CLOSE_CHAR_CLASS:
1107 /* These must not be appeared here. */
1108 assert (0);
1109#endif
1110 case END_OF_RE:
1111 case CHARACTER:
1112 case OP_PERIOD:
1113 case OP_DUP_ASTERISK:
1114 case OP_DUP_QUESTION:
c0a0f9a3 1115#ifdef RE_ENABLE_I18N
3b0bdc72 1116 case COMPLEX_BRACKET:
c0a0f9a3 1117#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
1118 case SIMPLE_BRACKET:
1119 case OP_BACK_REF:
1120 case ANCHOR:
81c64d40
UD
1121 case OP_OPEN_SUBEXP:
1122 case OP_CLOSE_SUBEXP:
3b0bdc72
UD
1123 node->first = idx;
1124 break;
1125 case OP_DUP_PLUS:
1126#ifdef DEBUG
1127 assert (node->left != NULL);
1128#endif
1129 if (node->left->first == -1)
15a7d175 1130 calc_first (dfa, node->left);
3b0bdc72
UD
1131 node->first = node->left->first;
1132 break;
1133 case OP_ALT:
1134 node->first = idx;
1135 break;
3b0bdc72
UD
1136 /* else fall through */
1137 default:
1138#ifdef DEBUG
1139 assert (node->left != NULL);
1140#endif
1141 if (node->left->first == -1)
15a7d175 1142 calc_first (dfa, node->left);
3b0bdc72
UD
1143 node->first = node->left->first;
1144 break;
1145 }
3b0bdc72
UD
1146}
1147
1148/* Calculate "next" for the node NODE. */
1149
1150static void
1151calc_next (dfa, node)
1152 re_dfa_t *dfa;
1153 bin_tree_t *node;
1154{
1155 int idx, type;
1156 bin_tree_t *parent = node->parent;
1157 if (parent == NULL)
1158 {
1159 node->next = -1;
1160 idx = node->node_idx;
1161 if (node->type == 0)
15a7d175 1162 dfa->nexts[idx] = node->next;
3b0bdc72
UD
1163 return;
1164 }
1165
1166 idx = parent->node_idx;
1167 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1168
1169 switch (type)
1170 {
1171 case OP_DUP_ASTERISK:
1172 case OP_DUP_PLUS:
1173 node->next = idx;
1174 break;
1175 case CONCAT:
1176 if (parent->left == node)
15a7d175
UD
1177 {
1178 if (parent->right->first == -1)
1179 calc_first (dfa, parent->right);
1180 node->next = parent->right->first;
1181 break;
1182 }
3b0bdc72
UD
1183 /* else fall through */
1184 default:
1185 if (parent->next == -1)
15a7d175 1186 calc_next (dfa, parent);
3b0bdc72
UD
1187 node->next = parent->next;
1188 break;
1189 }
1190 idx = node->node_idx;
1191 if (node->type == 0)
1192 dfa->nexts[idx] = node->next;
1193}
1194
1195/* Calculate "edest" for the node NODE. */
1196
1197static void
1198calc_epsdest (dfa, node)
1199 re_dfa_t *dfa;
1200 bin_tree_t *node;
1201{
1202 int idx;
1203 idx = node->node_idx;
1204 if (node->type == 0)
1205 {
1206 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
15a7d175
UD
1207 || dfa->nodes[idx].type == OP_DUP_PLUS
1208 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1209 {
1210 if (node->left->first == -1)
1211 calc_first (dfa, node->left);
1212 if (node->next == -1)
1213 calc_next (dfa, node);
1214 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1215 node->next);
1216 }
3b0bdc72 1217 else if (dfa->nodes[idx].type == OP_ALT)
15a7d175
UD
1218 {
1219 int left, right;
1220 if (node->left != NULL)
1221 {
1222 if (node->left->first == -1)
1223 calc_first (dfa, node->left);
1224 left = node->left->first;
1225 }
1226 else
1227 {
1228 if (node->next == -1)
1229 calc_next (dfa, node);
1230 left = node->next;
1231 }
1232 if (node->right != NULL)
1233 {
1234 if (node->right->first == -1)
1235 calc_first (dfa, node->right);
1236 right = node->right->first;
1237 }
1238 else
1239 {
1240 if (node->next == -1)
1241 calc_next (dfa, node);
1242 right = node->next;
1243 }
1244 re_node_set_init_2 (dfa->edests + idx, left, right);
1245 }
81c64d40 1246 else if (dfa->nodes[idx].type == ANCHOR
15a7d175
UD
1247 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1248 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1249 || dfa->nodes[idx].type == OP_BACK_REF)
1250 re_node_set_init_1 (dfa->edests + idx, node->next);
1769a73f
UD
1251 else
1252 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
3b0bdc72
UD
1253 }
1254}
1255
485d775d
UD
1256/* Duplicate the epsilon closure of the node ROOT_NODE.
1257 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1258 to their own constraint. */
1259
1260static reg_errcode_t
1261duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
15a7d175 1262 init_constraint)
485d775d
UD
1263 re_dfa_t *dfa;
1264 int top_org_node, top_clone_node, root_node;
1265 unsigned int init_constraint;
1266{
1267 reg_errcode_t err;
1268 int org_node, clone_node, ret;
1269 unsigned int constraint = init_constraint;
1270 for (org_node = top_org_node, clone_node = top_clone_node;;)
1271 {
1272 int org_dest, clone_dest;
1273 if (dfa->nodes[org_node].type == OP_BACK_REF)
15a7d175 1274 {
485d775d
UD
1275 /* If the back reference epsilon-transit, its destination must
1276 also have the constraint. Then duplicate the epsilon closure
1277 of the destination of the back reference, and store it in
1278 edests of the back reference. */
15a7d175
UD
1279 org_dest = dfa->nexts[org_node];
1280 re_node_set_empty (dfa->edests + clone_node);
1281 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1282 if (BE (err != REG_NOERROR, 0))
1283 return err;
1284 dfa->nexts[clone_node] = dfa->nexts[org_node];
1285 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1286 if (BE (ret < 0, 0))
1287 return REG_ESPACE;
1288 }
485d775d 1289 else if (dfa->edests[org_node].nelem == 0)
15a7d175 1290 {
485d775d
UD
1291 /* In case of the node can't epsilon-transit, don't duplicate the
1292 destination and store the original destination as the
1293 destination of the node. */
15a7d175
UD
1294 dfa->nexts[clone_node] = dfa->nexts[org_node];
1295 break;
1296 }
485d775d 1297 else if (dfa->edests[org_node].nelem == 1)
15a7d175 1298 {
485d775d
UD
1299 /* In case of the node can epsilon-transit, and it has only one
1300 destination. */
15a7d175
UD
1301 org_dest = dfa->edests[org_node].elems[0];
1302 re_node_set_empty (dfa->edests + clone_node);
1303 if (dfa->nodes[org_node].type == ANCHOR)
1304 {
485d775d 1305 /* In case of the node has another constraint, append it. */
15a7d175
UD
1306 if (org_node == root_node && clone_node != org_node)
1307 {
485d775d
UD
1308 /* ...but if the node is root_node itself, it means the
1309 epsilon closure have a loop, then tie it to the
1310 destination of the root_node. */
15a7d175
UD
1311 ret = re_node_set_insert (dfa->edests + clone_node,
1312 org_dest);
1313 if (BE (ret < 0, 0))
1314 return REG_ESPACE;
1315 break;
1316 }
1317 constraint |= dfa->nodes[org_node].opr.ctx_type;
1318 }
1319 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1320 if (BE (err != REG_NOERROR, 0))
1321 return err;
1322 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1323 if (BE (ret < 0, 0))
1324 return REG_ESPACE;
1325 }
485d775d 1326 else /* dfa->edests[org_node].nelem == 2 */
15a7d175 1327 {
485d775d 1328 /* In case of the node can epsilon-transit, and it has two
a7d5c291 1329 destinations. E.g. '|', '*', '+', '?'. */
15a7d175
UD
1330 org_dest = dfa->edests[org_node].elems[0];
1331 re_node_set_empty (dfa->edests + clone_node);
a7d5c291
UD
1332 /* Search for a duplicated node which satisfies the constraint. */
1333 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1334 if (clone_dest == -1)
1335 {
1336 /* There are no such a duplicated node, create a new one. */
1337 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1338 if (BE (err != REG_NOERROR, 0))
1339 return err;
1340 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1341 if (BE (ret < 0, 0))
1342 return REG_ESPACE;
1343 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1344 root_node, constraint);
1345 if (BE (err != REG_NOERROR, 0))
1346 return err;
1347 }
1348 else
1349 {
1350 /* There are a duplicated node which satisfy the constraint,
1351 use it to avoid infinite loop. */
1352 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1353 if (BE (ret < 0, 0))
1354 return REG_ESPACE;
1355 }
15a7d175
UD
1356
1357 org_dest = dfa->edests[org_node].elems[1];
1358 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1359 if (BE (err != REG_NOERROR, 0))
1360 return err;
1361 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1362 if (BE (ret < 0, 0))
1363 return REG_ESPACE;
1364 }
485d775d
UD
1365 org_node = org_dest;
1366 clone_node = clone_dest;
1367 }
1368 return REG_NOERROR;
1369}
1370
a7d5c291
UD
1371/* Search for a node which is duplicated from the node ORG_NODE, and
1372 satisfies the constraint CONSTRAINT. */
1373
1374static int
1375search_duplicated_node (dfa, org_node, constraint)
1376 re_dfa_t *dfa;
1377 int org_node;
1378 unsigned int constraint;
1379{
1380 int idx;
1381 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1382 {
1383 if (org_node == dfa->org_indices[idx]
1384 && constraint == dfa->nodes[idx].constraint)
1385 return idx; /* Found. */
1386 }
1387 return -1; /* Not found. */
1388}
1389
a9388965
UD
1390/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1391 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1392 otherwise return the error code. */
1393
1394static reg_errcode_t
1395duplicate_node (new_idx, dfa, org_idx, constraint)
3b0bdc72 1396 re_dfa_t *dfa;
a9388965 1397 int *new_idx, org_idx;
3b0bdc72
UD
1398 unsigned int constraint;
1399{
1400 re_token_t dup;
1401 int dup_idx;
1402
485d775d 1403 dup = dfa->nodes[org_idx];
a9388965 1404 dup_idx = re_dfa_add_node (dfa, dup, 1);
485d775d 1405 if (BE (dup_idx == -1, 0))
a9388965 1406 return REG_ESPACE;
485d775d
UD
1407 dfa->nodes[dup_idx].constraint = constraint;
1408 if (dfa->nodes[org_idx].type == ANCHOR)
1409 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
a9388965 1410 dfa->nodes[dup_idx].duplicated = 1;
485d775d
UD
1411 re_node_set_init_empty (dfa->edests + dup_idx);
1412 re_node_set_init_empty (dfa->eclosures + dup_idx);
1413 re_node_set_init_empty (dfa->inveclosures + dup_idx);
3b0bdc72 1414
a7d5c291
UD
1415 /* Store the index of the original node. */
1416 dfa->org_indices[dup_idx] = org_idx;
a9388965
UD
1417 *new_idx = dup_idx;
1418 return REG_NOERROR;
3b0bdc72
UD
1419}
1420
1421static void
1422calc_inveclosure (dfa)
1423 re_dfa_t *dfa;
1424{
485d775d 1425 int src, idx, dest;
3b0bdc72
UD
1426 for (src = 0; src < dfa->nodes_len; ++src)
1427 {
1428 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
15a7d175
UD
1429 {
1430 dest = dfa->eclosures[src].elems[idx];
1431 re_node_set_insert (dfa->inveclosures + dest, src);
1432 }
3b0bdc72
UD
1433 }
1434}
1435
1436/* Calculate "eclosure" for all the node in DFA. */
1437
1438static reg_errcode_t
1439calc_eclosure (dfa)
1440 re_dfa_t *dfa;
1441{
485d775d 1442 int node_idx, incomplete;
3b0bdc72
UD
1443#ifdef DEBUG
1444 assert (dfa->nodes_len > 0);
1445#endif
485d775d 1446 incomplete = 0;
3b0bdc72 1447 /* For each nodes, calculate epsilon closure. */
485d775d 1448 for (node_idx = 0; ; ++node_idx)
3b0bdc72 1449 {
a9388965 1450 reg_errcode_t err;
3b0bdc72 1451 re_node_set eclosure_elem;
485d775d 1452 if (node_idx == dfa->nodes_len)
15a7d175
UD
1453 {
1454 if (!incomplete)
1455 break;
1456 incomplete = 0;
1457 node_idx = 0;
1458 }
3b0bdc72
UD
1459
1460#ifdef DEBUG
3b0bdc72
UD
1461 assert (dfa->eclosures[node_idx].nelem != -1);
1462#endif
1463 /* If we have already calculated, skip it. */
1464 if (dfa->eclosures[node_idx].nelem != 0)
15a7d175 1465 continue;
3b0bdc72 1466 /* Calculate epsilon closure of `node_idx'. */
a9388965 1467 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
bc15410e 1468 if (BE (err != REG_NOERROR, 0))
15a7d175 1469 return err;
3b0bdc72
UD
1470
1471 if (dfa->eclosures[node_idx].nelem == 0)
15a7d175
UD
1472 {
1473 incomplete = 1;
1474 re_node_set_free (&eclosure_elem);
1475 }
3b0bdc72 1476 }
3b0bdc72
UD
1477 return REG_NOERROR;
1478}
1479
1480/* Calculate epsilon closure of NODE. */
1481
a9388965
UD
1482static reg_errcode_t
1483calc_eclosure_iter (new_set, dfa, node, root)
1484 re_node_set *new_set;
3b0bdc72
UD
1485 re_dfa_t *dfa;
1486 int node, root;
1487{
a9388965 1488 reg_errcode_t err;
3b0bdc72 1489 unsigned int constraint;
485d775d 1490 int i, incomplete;
3b0bdc72 1491 re_node_set eclosure;
485d775d 1492 incomplete = 0;
a9388965 1493 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
bc15410e 1494 if (BE (err != REG_NOERROR, 0))
a9388965 1495 return err;
3b0bdc72
UD
1496
1497 /* This indicates that we are calculating this node now.
1498 We reference this value to avoid infinite loop. */
1499 dfa->eclosures[node].nelem = -1;
1500
1501 constraint = ((dfa->nodes[node].type == ANCHOR)
15a7d175 1502 ? dfa->nodes[node].opr.ctx_type : 0);
485d775d
UD
1503 /* If the current node has constraints, duplicate all nodes.
1504 Since they must inherit the constraints. */
1505 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1506 {
1507 int org_node, cur_node;
1508 org_node = cur_node = node;
1509 err = duplicate_node_closure (dfa, node, node, node, constraint);
1510 if (BE (err != REG_NOERROR, 0))
15a7d175 1511 return err;
485d775d 1512 }
3b0bdc72
UD
1513
1514 /* Expand each epsilon destination nodes. */
485d775d 1515 if (IS_EPSILON_NODE(dfa->nodes[node].type))
3b0bdc72
UD
1516 for (i = 0; i < dfa->edests[node].nelem; ++i)
1517 {
15a7d175
UD
1518 re_node_set eclosure_elem;
1519 int edest = dfa->edests[node].elems[i];
1520 /* If calculating the epsilon closure of `edest' is in progress,
1521 return intermediate result. */
1522 if (dfa->eclosures[edest].nelem == -1)
1523 {
1524 incomplete = 1;
1525 continue;
1526 }
1527 /* If we haven't calculated the epsilon closure of `edest' yet,
1528 calculate now. Otherwise use calculated epsilon closure. */
1529 if (dfa->eclosures[edest].nelem == 0)
1530 {
1531 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1532 if (BE (err != REG_NOERROR, 0))
1533 return err;
1534 }
1535 else
1536 eclosure_elem = dfa->eclosures[edest];
1537 /* Merge the epsilon closure of `edest'. */
1538 re_node_set_merge (&eclosure, &eclosure_elem);
1539 /* If the epsilon closure of `edest' is incomplete,
1540 the epsilon closure of this node is also incomplete. */
1541 if (dfa->eclosures[edest].nelem == 0)
1542 {
1543 incomplete = 1;
1544 re_node_set_free (&eclosure_elem);
1545 }
3b0bdc72
UD
1546 }
1547
3b0bdc72
UD
1548 /* Epsilon closures include itself. */
1549 re_node_set_insert (&eclosure, node);
1550 if (incomplete && !root)
1551 dfa->eclosures[node].nelem = 0;
1552 else
1553 dfa->eclosures[node] = eclosure;
a9388965
UD
1554 *new_set = eclosure;
1555 return REG_NOERROR;
3b0bdc72
UD
1556}
1557\f
1558/* Functions for token which are used in the parser. */
1559
1560/* Fetch a token from INPUT.
1561 We must not use this function inside bracket expressions. */
1562
1563static re_token_t
1564fetch_token (input, syntax)
1565 re_string_t *input;
1566 reg_syntax_t syntax;
1567{
1568 re_token_t token;
1569 int consumed_byte;
1570 consumed_byte = peek_token (&token, input, syntax);
1571 re_string_skip_bytes (input, consumed_byte);
1572 return token;
1573}
1574
1575/* Peek a token from INPUT, and return the length of the token.
1576 We must not use this function inside bracket expressions. */
1577
1578static int
1579peek_token (token, input, syntax)
1580 re_token_t *token;
1581 re_string_t *input;
1582 reg_syntax_t syntax;
1583{
1584 unsigned char c;
1585
1586 if (re_string_eoi (input))
1587 {
1588 token->type = END_OF_RE;
1589 return 0;
1590 }
1591
1592 c = re_string_peek_byte (input, 0);
1593 token->opr.c = c;
1594
1595#ifdef RE_ENABLE_I18N
1596 token->mb_partial = 0;
3c0fb574 1597 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1598 !re_string_first_byte (input, re_string_cur_idx (input)))
1599 {
1600 token->type = CHARACTER;
1601 token->mb_partial = 1;
1602 return 1;
1603 }
1604#endif
1605 if (c == '\\')
1606 {
1607 unsigned char c2;
1608 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
15a7d175
UD
1609 {
1610 token->type = BACK_SLASH;
1611 return 1;
1612 }
3b0bdc72
UD
1613
1614 c2 = re_string_peek_byte_case (input, 1);
1615 token->opr.c = c2;
1616 token->type = CHARACTER;
1617 switch (c2)
15a7d175
UD
1618 {
1619 case '|':
1620 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1621 token->type = OP_ALT;
1622 break;
1623 case '1': case '2': case '3': case '4': case '5':
1624 case '6': case '7': case '8': case '9':
1625 if (!(syntax & RE_NO_BK_REFS))
1626 {
1627 token->type = OP_BACK_REF;
1628 token->opr.idx = c2 - '0';
1629 }
1630 break;
1631 case '<':
1632 if (!(syntax & RE_NO_GNU_OPS))
1633 {
1634 token->type = ANCHOR;
1635 token->opr.idx = WORD_FIRST;
1636 }
1637 break;
1638 case '>':
1639 if (!(syntax & RE_NO_GNU_OPS))
1640 {
1641 token->type = ANCHOR;
1642 token->opr.idx = WORD_LAST;
1643 }
1644 break;
1645 case 'b':
1646 if (!(syntax & RE_NO_GNU_OPS))
1647 {
1648 token->type = ANCHOR;
1649 token->opr.idx = WORD_DELIM;
1650 }
1651 break;
1652 case 'B':
1653 if (!(syntax & RE_NO_GNU_OPS))
1654 {
1655 token->type = ANCHOR;
1656 token->opr.idx = INSIDE_WORD;
1657 }
1658 break;
1659 case 'w':
1660 if (!(syntax & RE_NO_GNU_OPS))
1661 token->type = OP_WORD;
1662 break;
1663 case 'W':
1664 if (!(syntax & RE_NO_GNU_OPS))
1665 token->type = OP_NOTWORD;
1666 break;
e2b6bfa3
UD
1667 case 's':
1668 if (!(syntax & RE_NO_GNU_OPS))
1669 token->type = OP_SPACE;
1670 break;
1671 case 'S':
1672 if (!(syntax & RE_NO_GNU_OPS))
1673 token->type = OP_NOTSPACE;
1674 break;
15a7d175
UD
1675 case '`':
1676 if (!(syntax & RE_NO_GNU_OPS))
1677 {
1678 token->type = ANCHOR;
1679 token->opr.idx = BUF_FIRST;
1680 }
1681 break;
1682 case '\'':
1683 if (!(syntax & RE_NO_GNU_OPS))
1684 {
1685 token->type = ANCHOR;
1686 token->opr.idx = BUF_LAST;
1687 }
1688 break;
1689 case '(':
1690 if (!(syntax & RE_NO_BK_PARENS))
1691 token->type = OP_OPEN_SUBEXP;
1692 break;
1693 case ')':
1694 if (!(syntax & RE_NO_BK_PARENS))
1695 token->type = OP_CLOSE_SUBEXP;
1696 break;
1697 case '+':
1698 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1699 token->type = OP_DUP_PLUS;
1700 break;
1701 case '?':
1702 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1703 token->type = OP_DUP_QUESTION;
1704 break;
1705 case '{':
1706 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1707 token->type = OP_OPEN_DUP_NUM;
1708 break;
1709 case '}':
1710 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1711 token->type = OP_CLOSE_DUP_NUM;
1712 break;
1713 default:
1714 break;
1715 }
3b0bdc72
UD
1716 return 2;
1717 }
1718
1719 token->type = CHARACTER;
1720 switch (c)
1721 {
1722 case '\n':
1723 if (syntax & RE_NEWLINE_ALT)
15a7d175 1724 token->type = OP_ALT;
3b0bdc72
UD
1725 break;
1726 case '|':
1727 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
15a7d175 1728 token->type = OP_ALT;
3b0bdc72
UD
1729 break;
1730 case '*':
1731 token->type = OP_DUP_ASTERISK;
1732 break;
1733 case '+':
1734 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1735 token->type = OP_DUP_PLUS;
3b0bdc72
UD
1736 break;
1737 case '?':
1738 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1739 token->type = OP_DUP_QUESTION;
3b0bdc72
UD
1740 break;
1741 case '{':
1742 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1743 token->type = OP_OPEN_DUP_NUM;
3b0bdc72
UD
1744 break;
1745 case '}':
1746 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1747 token->type = OP_CLOSE_DUP_NUM;
3b0bdc72
UD
1748 break;
1749 case '(':
1750 if (syntax & RE_NO_BK_PARENS)
15a7d175 1751 token->type = OP_OPEN_SUBEXP;
3b0bdc72
UD
1752 break;
1753 case ')':
1754 if (syntax & RE_NO_BK_PARENS)
15a7d175 1755 token->type = OP_CLOSE_SUBEXP;
3b0bdc72
UD
1756 break;
1757 case '[':
1758 token->type = OP_OPEN_BRACKET;
1759 break;
1760 case '.':
1761 token->type = OP_PERIOD;
1762 break;
1763 case '^':
134abcb5 1764 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
15a7d175
UD
1765 re_string_cur_idx (input) != 0)
1766 {
1767 char prev = re_string_peek_byte (input, -1);
134abcb5 1768 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
15a7d175
UD
1769 break;
1770 }
3b0bdc72
UD
1771 token->type = ANCHOR;
1772 token->opr.idx = LINE_FIRST;
1773 break;
1774 case '$':
1775 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
15a7d175
UD
1776 re_string_cur_idx (input) + 1 != re_string_length (input))
1777 {
1778 re_token_t next;
1779 re_string_skip_bytes (input, 1);
1780 peek_token (&next, input, syntax);
1781 re_string_skip_bytes (input, -1);
1782 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1783 break;
1784 }
3b0bdc72
UD
1785 token->type = ANCHOR;
1786 token->opr.idx = LINE_LAST;
1787 break;
1788 default:
1789 break;
1790 }
1791 return 1;
1792}
1793
1794/* Peek a token from INPUT, and return the length of the token.
1795 We must not use this function out of bracket expressions. */
1796
1797static int
1798peek_token_bracket (token, input, syntax)
1799 re_token_t *token;
1800 re_string_t *input;
1801 reg_syntax_t syntax;
1802{
1803 unsigned char c;
1804 if (re_string_eoi (input))
1805 {
1806 token->type = END_OF_RE;
1807 return 0;
1808 }
1809 c = re_string_peek_byte (input, 0);
1810 token->opr.c = c;
1811
1812#ifdef RE_ENABLE_I18N
3c0fb574 1813 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1814 !re_string_first_byte (input, re_string_cur_idx (input)))
1815 {
1816 token->type = CHARACTER;
1817 return 1;
1818 }
1819#endif /* RE_ENABLE_I18N */
1820
1821 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1822 {
1823 /* In this case, '\' escape a character. */
1824 unsigned char c2;
82bbb29e
UD
1825 re_string_skip_bytes (input, 1);
1826 c2 = re_string_peek_byte (input, 0);
3b0bdc72
UD
1827 token->opr.c = c2;
1828 token->type = CHARACTER;
1829 return 1;
1830 }
1831 if (c == '[') /* '[' is a special char in a bracket exps. */
1832 {
1833 unsigned char c2;
1834 int token_len;
1835 c2 = re_string_peek_byte (input, 1);
1836 token->opr.c = c2;
1837 token_len = 2;
1838 switch (c2)
15a7d175
UD
1839 {
1840 case '.':
1841 token->type = OP_OPEN_COLL_ELEM;
1842 break;
1843 case '=':
1844 token->type = OP_OPEN_EQUIV_CLASS;
1845 break;
1846 case ':':
1847 if (syntax & RE_CHAR_CLASSES)
1848 {
1849 token->type = OP_OPEN_CHAR_CLASS;
1850 break;
1851 }
1852 /* else fall through. */
1853 default:
1854 token->type = CHARACTER;
1855 token->opr.c = c;
1856 token_len = 1;
1857 break;
1858 }
3b0bdc72
UD
1859 return token_len;
1860 }
1861 switch (c)
1862 {
1863 case '-':
1864 token->type = OP_CHARSET_RANGE;
1865 break;
1866 case ']':
1867 token->type = OP_CLOSE_BRACKET;
1868 break;
1869 case '^':
1870 token->type = OP_NON_MATCH_LIST;
1871 break;
1872 default:
1873 token->type = CHARACTER;
1874 }
1875 return 1;
1876}
1877\f
1878/* Functions for parser. */
1879
1880/* Entry point of the parser.
1881 Parse the regular expression REGEXP and return the structure tree.
1882 If an error is occured, ERR is set by error code, and return NULL.
1883 This function build the following tree, from regular expression <reg_exp>:
15a7d175
UD
1884 CAT
1885 / \
1886 / \
3b0bdc72
UD
1887 <reg_exp> EOR
1888
1889 CAT means concatenation.
1890 EOR means end of regular expression. */
1891
1892static bin_tree_t *
1893parse (regexp, preg, syntax, err)
1894 re_string_t *regexp;
1895 regex_t *preg;
1896 reg_syntax_t syntax;
1897 reg_errcode_t *err;
1898{
1899 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1900 bin_tree_t *tree, *eor, *root;
1901 re_token_t current_token;
1902 int new_idx;
134abcb5 1903 current_token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 1904 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
bc15410e 1905 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1906 return NULL;
1907 new_idx = re_dfa_add_node (dfa, current_token, 0);
1908 eor = create_tree (NULL, NULL, 0, new_idx);
1909 if (tree != NULL)
1910 root = create_tree (tree, eor, CONCAT, 0);
1911 else
1912 root = eor;
bc15410e 1913 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
485d775d
UD
1914 {
1915 *err = REG_ESPACE;
1916 return NULL;
1917 }
3b0bdc72
UD
1918 return root;
1919}
1920
1921/* This function build the following tree, from regular expression
1922 <branch1>|<branch2>:
15a7d175
UD
1923 ALT
1924 / \
1925 / \
3b0bdc72
UD
1926 <branch1> <branch2>
1927
1928 ALT means alternative, which represents the operator `|'. */
1929
1930static bin_tree_t *
1931parse_reg_exp (regexp, preg, token, syntax, nest, err)
1932 re_string_t *regexp;
1933 regex_t *preg;
1934 re_token_t *token;
1935 reg_syntax_t syntax;
1936 int nest;
1937 reg_errcode_t *err;
1938{
1939 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1940 bin_tree_t *tree, *branch = NULL;
1941 int new_idx;
1942 tree = parse_branch (regexp, preg, token, syntax, nest, err);
bc15410e 1943 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1944 return NULL;
1945
1946 while (token->type == OP_ALT)
1947 {
1948 re_token_t alt_token = *token;
1949 new_idx = re_dfa_add_node (dfa, alt_token, 0);
134abcb5 1950 *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 1951 if (token->type != OP_ALT && token->type != END_OF_RE
15a7d175
UD
1952 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1953 {
1954 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1955 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1956 {
1957 free_bin_tree (tree);
1958 return NULL;
1959 }
1960 }
9b88fc16
UD
1961 else
1962 branch = NULL;
3b0bdc72 1963 tree = create_tree (tree, branch, 0, new_idx);
bc15410e 1964 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
1965 {
1966 *err = REG_ESPACE;
1967 return NULL;
1968 }
0742e48e 1969 dfa->has_plural_match = 1;
3b0bdc72
UD
1970 }
1971 return tree;
1972}
1973
1974/* This function build the following tree, from regular expression
1975 <exp1><exp2>:
15a7d175
UD
1976 CAT
1977 / \
3b0bdc72
UD
1978 / \
1979 <exp1> <exp2>
1980
1981 CAT means concatenation. */
1982
1983static bin_tree_t *
1984parse_branch (regexp, preg, token, syntax, nest, err)
1985 re_string_t *regexp;
1986 regex_t *preg;
1987 re_token_t *token;
1988 reg_syntax_t syntax;
1989 int nest;
1990 reg_errcode_t *err;
1991{
1992 bin_tree_t *tree, *exp;
1993 tree = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 1994 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1995 return NULL;
1996
1997 while (token->type != OP_ALT && token->type != END_OF_RE
15a7d175 1998 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
3b0bdc72
UD
1999 {
2000 exp = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2001 if (BE (*err != REG_NOERROR && exp == NULL, 0))
15a7d175
UD
2002 {
2003 free_bin_tree (tree);
2004 return NULL;
2005 }
3b0bdc72 2006 if (tree != NULL && exp != NULL)
15a7d175
UD
2007 {
2008 tree = create_tree (tree, exp, CONCAT, 0);
2009 if (tree == NULL)
2010 {
2011 *err = REG_ESPACE;
2012 return NULL;
2013 }
2014 }
3b0bdc72 2015 else if (tree == NULL)
15a7d175 2016 tree = exp;
3b0bdc72
UD
2017 /* Otherwise exp == NULL, we don't need to create new tree. */
2018 }
2019 return tree;
2020}
2021
2022/* This function build the following tree, from regular expression a*:
15a7d175
UD
2023 *
2024 |
2025 a
3b0bdc72
UD
2026*/
2027
2028static bin_tree_t *
2029parse_expression (regexp, preg, token, syntax, nest, err)
2030 re_string_t *regexp;
2031 regex_t *preg;
2032 re_token_t *token;
2033 reg_syntax_t syntax;
2034 int nest;
2035 reg_errcode_t *err;
2036{
2037 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2038 bin_tree_t *tree;
2039 int new_idx;
2040 switch (token->type)
2041 {
2042 case CHARACTER:
2043 new_idx = re_dfa_add_node (dfa, *token, 0);
2044 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 2045 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
2046 {
2047 *err = REG_ESPACE;
2048 return NULL;
2049 }
3b0bdc72 2050#ifdef RE_ENABLE_I18N
3c0fb574 2051 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
2052 {
2053 while (!re_string_eoi (regexp)
2054 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2055 {
15a7d175
UD
2056 bin_tree_t *mbc_remain;
2057 *token = fetch_token (regexp, syntax);
2058 new_idx = re_dfa_add_node (dfa, *token, 0);
2059 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
2060 tree = create_tree (tree, mbc_remain, CONCAT, 0);
2061 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
54e1cabc
UD
2062 {
2063 *err = REG_ESPACE;
2064 return NULL;
2065 }
15a7d175 2066 }
3b0bdc72
UD
2067 }
2068#endif
2069 break;
2070 case OP_OPEN_SUBEXP:
2071 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
bc15410e 2072 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2073 return NULL;
3b0bdc72
UD
2074 break;
2075 case OP_OPEN_BRACKET:
2076 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
bc15410e 2077 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2078 return NULL;
3b0bdc72
UD
2079 break;
2080 case OP_BACK_REF:
bc15410e 2081 if (BE (preg->re_nsub < token->opr.idx
15a7d175
UD
2082 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2083 {
2084 *err = REG_ESUBREG;
2085 return NULL;
2086 }
6291ee3c 2087 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
3b0bdc72
UD
2088 new_idx = re_dfa_add_node (dfa, *token, 0);
2089 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 2090 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
2091 {
2092 *err = REG_ESPACE;
2093 return NULL;
2094 }
3b0bdc72
UD
2095 ++dfa->nbackref;
2096 dfa->has_mb_node = 1;
2097 break;
06e8303a
UD
2098 case OP_OPEN_DUP_NUM:
2099 if (syntax & RE_CONTEXT_INVALID_DUP)
2100 {
2101 *err = REG_BADRPT;
2102 return NULL;
2103 }
2104 /* FALLTHROUGH */
3b0bdc72
UD
2105 case OP_DUP_ASTERISK:
2106 case OP_DUP_PLUS:
2107 case OP_DUP_QUESTION:
3b0bdc72 2108 if (syntax & RE_CONTEXT_INVALID_OPS)
15a7d175
UD
2109 {
2110 *err = REG_BADRPT;
2111 return NULL;
2112 }
3b0bdc72 2113 else if (syntax & RE_CONTEXT_INDEP_OPS)
15a7d175
UD
2114 {
2115 *token = fetch_token (regexp, syntax);
2116 return parse_expression (regexp, preg, token, syntax, nest, err);
2117 }
3b0bdc72
UD
2118 /* else fall through */
2119 case OP_CLOSE_SUBEXP:
2120 if ((token->type == OP_CLOSE_SUBEXP) &&
15a7d175
UD
2121 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2122 {
2123 *err = REG_ERPAREN;
2124 return NULL;
2125 }
3b0bdc72
UD
2126 /* else fall through */
2127 case OP_CLOSE_DUP_NUM:
2128 /* We treat it as a normal character. */
2129
2130 /* Then we can these characters as normal characters. */
2131 token->type = CHARACTER;
2132 new_idx = re_dfa_add_node (dfa, *token, 0);
2133 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 2134 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
2135 {
2136 *err = REG_ESPACE;
2137 return NULL;
2138 }
3b0bdc72
UD
2139 break;
2140 case ANCHOR:
2141 if (dfa->word_char == NULL)
15a7d175
UD
2142 {
2143 *err = init_word_char (dfa);
2144 if (BE (*err != REG_NOERROR, 0))
2145 return NULL;
2146 }
3b0bdc72 2147 if (token->opr.ctx_type == WORD_DELIM)
15a7d175
UD
2148 {
2149 bin_tree_t *tree_first, *tree_last;
2150 int idx_first, idx_last;
2151 token->opr.ctx_type = WORD_FIRST;
2152 idx_first = re_dfa_add_node (dfa, *token, 0);
2153 tree_first = create_tree (NULL, NULL, 0, idx_first);
2154 token->opr.ctx_type = WORD_LAST;
2155 idx_last = re_dfa_add_node (dfa, *token, 0);
2156 tree_last = create_tree (NULL, NULL, 0, idx_last);
2157 token->type = OP_ALT;
2158 new_idx = re_dfa_add_node (dfa, *token, 0);
2159 tree = create_tree (tree_first, tree_last, 0, new_idx);
2160 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2161 || tree_first == NULL || tree_last == NULL
2162 || tree == NULL, 0))
2163 {
2164 *err = REG_ESPACE;
2165 return NULL;
2166 }
2167 }
3b0bdc72 2168 else
15a7d175
UD
2169 {
2170 new_idx = re_dfa_add_node (dfa, *token, 0);
2171 tree = create_tree (NULL, NULL, 0, new_idx);
2172 if (BE (new_idx == -1 || tree == NULL, 0))
54e1cabc
UD
2173 {
2174 *err = REG_ESPACE;
2175 return NULL;
2176 }
15a7d175 2177 }
3b0bdc72 2178 /* We must return here, since ANCHORs can't be followed
15a7d175
UD
2179 by repetition operators.
2180 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2181 it must not be "<ANCHOR(^)><REPEAT(*)>". */
3b0bdc72
UD
2182 *token = fetch_token (regexp, syntax);
2183 return tree;
2184 case OP_PERIOD:
2185 new_idx = re_dfa_add_node (dfa, *token, 0);
2186 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 2187 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
2188 {
2189 *err = REG_ESPACE;
2190 return NULL;
2191 }
3c0fb574 2192 if (dfa->mb_cur_max > 1)
15a7d175 2193 dfa->has_mb_node = 1;
3b0bdc72
UD
2194 break;
2195 case OP_WORD:
e2b6bfa3 2196 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
bc15410e 2197 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2198 return NULL;
3b0bdc72
UD
2199 break;
2200 case OP_NOTWORD:
e2b6bfa3
UD
2201 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 1, err);
2202 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2203 return NULL;
2204 break;
2205 case OP_SPACE:
2206 tree = build_charclass_op (dfa, regexp->trans, "space", "", 0, err);
2207 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2208 return NULL;
2209 break;
2210 case OP_NOTSPACE:
2211 tree = build_charclass_op (dfa, regexp->trans, "space", "", 1, err);
bc15410e 2212 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2213 return NULL;
3b0bdc72
UD
2214 break;
2215 case OP_ALT:
2216 case END_OF_RE:
2217 return NULL;
2218 case BACK_SLASH:
2219 *err = REG_EESCAPE;
2220 return NULL;
2221 default:
2222 /* Must not happen? */
2223#ifdef DEBUG
2224 assert (0);
2225#endif
2226 return NULL;
2227 }
2228 *token = fetch_token (regexp, syntax);
2229
2230 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
15a7d175 2231 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
3b0bdc72
UD
2232 {
2233 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
bc15410e 2234 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2235 return NULL;
0742e48e 2236 dfa->has_plural_match = 1;
3b0bdc72
UD
2237 }
2238
2239 return tree;
2240}
2241
2242/* This function build the following tree, from regular expression
2243 (<reg_exp>):
15a7d175
UD
2244 SUBEXP
2245 |
2246 <reg_exp>
3b0bdc72
UD
2247*/
2248
2249static bin_tree_t *
2250parse_sub_exp (regexp, preg, token, syntax, nest, err)
2251 re_string_t *regexp;
2252 regex_t *preg;
2253 re_token_t *token;
2254 reg_syntax_t syntax;
2255 int nest;
2256 reg_errcode_t *err;
2257{
2258 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
81c64d40 2259 bin_tree_t *tree, *left_par, *right_par;
3b0bdc72 2260 size_t cur_nsub;
81c64d40 2261 int new_idx;
3b0bdc72
UD
2262 cur_nsub = preg->re_nsub++;
2263 if (dfa->subexps_alloc < preg->re_nsub)
2264 {
2265 re_subexp_t *new_array;
2266 dfa->subexps_alloc *= 2;
2267 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
bc15410e 2268 if (BE (new_array == NULL, 0))
3b0bdc72
UD
2269 {
2270 dfa->subexps_alloc /= 2;
2271 *err = REG_ESPACE;
2272 return NULL;
2273 }
2274 dfa->subexps = new_array;
2275 }
2276 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2277 dfa->subexps[cur_nsub].end = -1;
81c64d40
UD
2278
2279 new_idx = re_dfa_add_node (dfa, *token, 0);
2280 left_par = create_tree (NULL, NULL, 0, new_idx);
2281 if (BE (new_idx == -1 || left_par == NULL, 0))
485d775d
UD
2282 {
2283 *err = REG_ESPACE;
2284 return NULL;
2285 }
81c64d40 2286 dfa->nodes[new_idx].opr.idx = cur_nsub;
134abcb5 2287 *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72
UD
2288
2289 /* The subexpression may be a null string. */
2290 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2291 tree = NULL;
3b0bdc72
UD
2292 else
2293 {
2294 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
bc15410e 2295 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2296 return NULL;
3b0bdc72 2297 }
81c64d40
UD
2298 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2299 {
2300 free_bin_tree (tree);
06e8303a 2301 *err = REG_EPAREN;
81c64d40
UD
2302 return NULL;
2303 }
2304 new_idx = re_dfa_add_node (dfa, *token, 0);
2305 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2306 right_par = create_tree (NULL, NULL, 0, new_idx);
2307 tree = ((tree == NULL) ? right_par
15a7d175 2308 : create_tree (tree, right_par, CONCAT, 0));
81c64d40
UD
2309 tree = create_tree (left_par, tree, CONCAT, 0);
2310 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
485d775d
UD
2311 {
2312 *err = REG_ESPACE;
2313 return NULL;
2314 }
81c64d40
UD
2315 dfa->nodes[new_idx].opr.idx = cur_nsub;
2316
3b0bdc72
UD
2317 return tree;
2318}
2319
2320/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2321
2322static bin_tree_t *
2323parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2324 bin_tree_t *dup_elem;
2325 re_string_t *regexp;
2326 re_dfa_t *dfa;
2327 re_token_t *token;
2328 reg_syntax_t syntax;
2329 reg_errcode_t *err;
2330{
2331 re_token_t dup_token;
2332 bin_tree_t *tree = dup_elem, *work_tree;
2333 int new_idx, start_idx = re_string_cur_idx (regexp);
2334 re_token_t start_token = *token;
2335 if (token->type == OP_OPEN_DUP_NUM)
2336 {
602c2f9d
UD
2337 int i;
2338 int end = 0;
2339 int start = fetch_number (regexp, token, syntax);
3b0bdc72
UD
2340 bin_tree_t *elem;
2341 if (start == -1)
15a7d175
UD
2342 {
2343 if (token->type == CHARACTER && token->opr.c == ',')
2344 start = 0; /* We treat "{,m}" as "{0,m}". */
2345 else
2346 {
2347 *err = REG_BADBR; /* <re>{} is invalid. */
2348 return NULL;
2349 }
2350 }
602c2f9d 2351 if (BE (start != -2, 1))
15a7d175
UD
2352 {
2353 /* We treat "{n}" as "{n,n}". */
2354 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2355 : ((token->type == CHARACTER && token->opr.c == ',')
2356 ? fetch_number (regexp, token, syntax) : -2));
2357 }
602c2f9d 2358 if (BE (start == -2 || end == -2, 0))
15a7d175
UD
2359 {
2360 /* Invalid sequence. */
2361 if (token->type == OP_CLOSE_DUP_NUM)
2362 goto parse_dup_op_invalid_interval;
2363 else
2364 goto parse_dup_op_ebrace;
2365 }
602c2f9d 2366 if (BE (start == 0 && end == 0, 0))
15a7d175
UD
2367 {
2368 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2369 *token = fetch_token (regexp, syntax);
2370 free_bin_tree (dup_elem);
2371 return NULL;
2372 }
602c2f9d 2373
3b0bdc72
UD
2374 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2375 elem = tree;
2376 for (i = 0; i < start; ++i)
15a7d175
UD
2377 if (i != 0)
2378 {
2379 work_tree = duplicate_tree (elem, dfa);
2380 tree = create_tree (tree, work_tree, CONCAT, 0);
2381 if (BE (work_tree == NULL || tree == NULL, 0))
2382 goto parse_dup_op_espace;
2383 }
3b0bdc72
UD
2384
2385 if (end == -1)
15a7d175
UD
2386 {
2387 /* We treat "<re>{0,}" as "<re>*". */
2388 dup_token.type = OP_DUP_ASTERISK;
2389 if (start > 0)
2390 {
2391 elem = duplicate_tree (elem, dfa);
2392 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2393 work_tree = create_tree (elem, NULL, 0, new_idx);
2394 tree = create_tree (tree, work_tree, CONCAT, 0);
2395 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2396 || tree == NULL, 0))
2397 goto parse_dup_op_espace;
2398 }
2399 else
2400 {
2401 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2402 tree = create_tree (elem, NULL, 0, new_idx);
2403 if (BE (new_idx == -1 || tree == NULL, 0))
2404 goto parse_dup_op_espace;
2405 }
2406 }
3b0bdc72 2407 else if (end - start > 0)
15a7d175
UD
2408 {
2409 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2410 dup_token.type = OP_DUP_QUESTION;
2411 if (start > 0)
2412 {
2413 elem = duplicate_tree (elem, dfa);
2414 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2415 elem = create_tree (elem, NULL, 0, new_idx);
2416 tree = create_tree (tree, elem, CONCAT, 0);
2417 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2418 goto parse_dup_op_espace;
2419 }
2420 else
2421 {
2422 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2423 tree = elem = create_tree (elem, NULL, 0, new_idx);
2424 if (BE (new_idx == -1 || tree == NULL, 0))
2425 goto parse_dup_op_espace;
2426 }
2427 for (i = 1; i < end - start; ++i)
2428 {
2429 work_tree = duplicate_tree (elem, dfa);
2430 tree = create_tree (tree, work_tree, CONCAT, 0);
2431 if (BE (work_tree == NULL || tree == NULL, 0))
2432 {
2433 *err = REG_ESPACE;
2434 return NULL;
2435 }
2436 }
2437 }
3b0bdc72
UD
2438 }
2439 else
2440 {
2441 new_idx = re_dfa_add_node (dfa, *token, 0);
2442 tree = create_tree (tree, NULL, 0, new_idx);
bc15410e 2443 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
2444 {
2445 *err = REG_ESPACE;
2446 return NULL;
2447 }
3b0bdc72
UD
2448 }
2449 *token = fetch_token (regexp, syntax);
2450 return tree;
2451
2452 parse_dup_op_espace:
2453 free_bin_tree (tree);
2454 *err = REG_ESPACE;
2455 return NULL;
2456
602c2f9d 2457 parse_dup_op_ebrace:
bc15410e 2458 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
3b0bdc72
UD
2459 {
2460 *err = REG_EBRACE;
2461 return NULL;
2462 }
602c2f9d
UD
2463 goto parse_dup_op_rollback;
2464 parse_dup_op_invalid_interval:
2465 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2466 {
2467 *err = REG_BADBR;
2468 return NULL;
2469 }
2470 parse_dup_op_rollback:
3b0bdc72
UD
2471 re_string_set_index (regexp, start_idx);
2472 *token = start_token;
2473 token->type = CHARACTER;
2474 return dup_elem;
2475}
2476
2477/* Size of the names for collating symbol/equivalence_class/character_class.
2478 I'm not sure, but maybe enough. */
2479#define BRACKET_NAME_BUF_SIZE 32
2480
434d3784
UD
2481#ifndef _LIBC
2482 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2483 Build the range expression which starts from START_ELEM, and ends
2484 at END_ELEM. The result are written to MBCSET and SBCSET.
2485 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2486 mbcset->range_ends, is a pointer argument sinse we may
2487 update it. */
2488
2489static reg_errcode_t
c0a0f9a3
UD
2490# ifdef RE_ENABLE_I18N
2491build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
434d3784 2492 re_charset_t *mbcset;
434d3784 2493 int *range_alloc;
c0a0f9a3
UD
2494# else /* not RE_ENABLE_I18N */
2495build_range_exp (sbcset, start_elem, end_elem)
2496# endif /* not RE_ENABLE_I18N */
2497 re_bitset_ptr_t sbcset;
434d3784
UD
2498 bracket_elem_t *start_elem, *end_elem;
2499{
2500 unsigned int start_ch, end_ch;
2501 /* Equivalence Classes and Character Classes can't be a range start/end. */
2502 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2503 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2504 0))
434d3784
UD
2505 return REG_ERANGE;
2506
2507 /* We can handle no multi character collating elements without libc
2508 support. */
62439eac 2509 if (BE ((start_elem->type == COLL_SYM
15a7d175
UD
2510 && strlen ((char *) start_elem->opr.name) > 1)
2511 || (end_elem->type == COLL_SYM
2512 && strlen ((char *) end_elem->opr.name) > 1), 0))
434d3784
UD
2513 return REG_ECOLLATE;
2514
2515# ifdef RE_ENABLE_I18N
2516 {
2517 wchar_t wc, start_wc, end_wc;
2518 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2519
2520 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
15a7d175
UD
2521 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2522 : 0));
434d3784 2523 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
15a7d175
UD
2524 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2525 : 0));
434d3784 2526 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
15a7d175 2527 ? __btowc (start_ch) : start_elem->opr.wch);
434d3784 2528 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
15a7d175 2529 ? __btowc (end_ch) : end_elem->opr.wch);
434d3784
UD
2530 cmp_buf[0] = start_wc;
2531 cmp_buf[4] = end_wc;
2532 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2533 return REG_ERANGE;
2534
2535 /* Check the space of the arrays. */
2536 if (*range_alloc == mbcset->nranges)
2537 {
15a7d175
UD
2538 /* There are not enough space, need realloc. */
2539 wchar_t *new_array_start, *new_array_end;
2540 int new_nranges;
2541
2542 /* +1 in case of mbcset->nranges is 0. */
2543 new_nranges = 2 * mbcset->nranges + 1;
2544 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2545 are NULL if *range_alloc == 0. */
2546 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2547 new_nranges);
2548 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2549 new_nranges);
2550
2551 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2552 return REG_ESPACE;
2553
2554 mbcset->range_starts = new_array_start;
2555 mbcset->range_ends = new_array_end;
2556 *range_alloc = new_nranges;
434d3784
UD
2557 }
2558
2559 mbcset->range_starts[mbcset->nranges] = start_wc;
2560 mbcset->range_ends[mbcset->nranges++] = end_wc;
2561
2562 /* Build the table for single byte characters. */
2563 for (wc = 0; wc <= SBC_MAX; ++wc)
2564 {
15a7d175
UD
2565 cmp_buf[2] = wc;
2566 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2567 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2568 bitset_set (sbcset, wc);
434d3784
UD
2569 }
2570 }
2571# else /* not RE_ENABLE_I18N */
2572 {
2573 unsigned int ch;
2574 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
15a7d175
UD
2575 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2576 : 0));
434d3784 2577 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
15a7d175
UD
2578 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2579 : 0));
434d3784
UD
2580 if (start_ch > end_ch)
2581 return REG_ERANGE;
2582 /* Build the table for single byte characters. */
2583 for (ch = 0; ch <= SBC_MAX; ++ch)
2584 if (start_ch <= ch && ch <= end_ch)
15a7d175 2585 bitset_set (sbcset, ch);
434d3784
UD
2586 }
2587# endif /* not RE_ENABLE_I18N */
2588 return REG_NOERROR;
2589}
2590#endif /* not _LIBC */
2591
2592#ifndef _LIBC
2593/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2594 Build the collating element which is represented by NAME.
2595 The result are written to MBCSET and SBCSET.
2596 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2597 pointer argument since we may update it. */
2598
2599static reg_errcode_t
c0a0f9a3
UD
2600# ifdef RE_ENABLE_I18N
2601build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
434d3784 2602 re_charset_t *mbcset;
434d3784 2603 int *coll_sym_alloc;
c0a0f9a3
UD
2604# else /* not RE_ENABLE_I18N */
2605build_collating_symbol (sbcset, name)
2606# endif /* not RE_ENABLE_I18N */
2607 re_bitset_ptr_t sbcset;
62439eac 2608 const unsigned char *name;
434d3784 2609{
62439eac
UD
2610 size_t name_len = strlen ((const char *) name);
2611 if (BE (name_len != 1, 0))
434d3784
UD
2612 return REG_ECOLLATE;
2613 else
2614 {
62439eac 2615 bitset_set (sbcset, name[0]);
434d3784
UD
2616 return REG_NOERROR;
2617 }
2618}
2619#endif /* not _LIBC */
2620
3b0bdc72
UD
2621/* This function parse bracket expression like "[abc]", "[a-c]",
2622 "[[.a-a.]]" etc. */
2623
2624static bin_tree_t *
2625parse_bracket_exp (regexp, dfa, token, syntax, err)
2626 re_string_t *regexp;
2627 re_dfa_t *dfa;
2628 re_token_t *token;
2629 reg_syntax_t syntax;
2630 reg_errcode_t *err;
2631{
2632#ifdef _LIBC
62439eac
UD
2633 const unsigned char *collseqmb;
2634 const char *collseqwc;
3b0bdc72
UD
2635 uint32_t nrules;
2636 int32_t table_size;
2637 const int32_t *symb_table;
2638 const unsigned char *extra;
2639
434d3784 2640 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2641 Seek the collating symbol entry correspondings to NAME.
2642 Return the index of the symbol in the SYMB_TABLE. */
2643
2644 static inline int32_t
25337753 2645 __attribute ((always_inline))
3b0bdc72 2646 seek_collating_symbol_entry (name, name_len)
15a7d175
UD
2647 const unsigned char *name;
2648 size_t name_len;
3b0bdc72 2649 {
62439eac 2650 int32_t hash = elem_hash ((const char *) name, name_len);
3b0bdc72
UD
2651 int32_t elem = hash % table_size;
2652 int32_t second = hash % (table_size - 2);
2653 while (symb_table[2 * elem] != 0)
15a7d175
UD
2654 {
2655 /* First compare the hashing value. */
2656 if (symb_table[2 * elem] == hash
2657 /* Compare the length of the name. */
2658 && name_len == extra[symb_table[2 * elem + 1]]
2659 /* Compare the name. */
2660 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2661 name_len) == 0)
2662 {
2663 /* Yep, this is the entry. */
2664 break;
2665 }
2666
2667 /* Next entry. */
2668 elem += second;
2669 }
3b0bdc72
UD
2670 return elem;
2671 }
2672
434d3784 2673 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2674 Look up the collation sequence value of BR_ELEM.
2675 Return the value if succeeded, UINT_MAX otherwise. */
2676
2677 static inline unsigned int
25337753 2678 __attribute ((always_inline))
3b0bdc72 2679 lookup_collation_sequence_value (br_elem)
15a7d175 2680 bracket_elem_t *br_elem;
3b0bdc72
UD
2681 {
2682 if (br_elem->type == SB_CHAR)
15a7d175
UD
2683 {
2684 /*
2685 if (MB_CUR_MAX == 1)
2686 */
2687 if (nrules == 0)
2688 return collseqmb[br_elem->opr.ch];
2689 else
2690 {
2691 wint_t wc = __btowc (br_elem->opr.ch);
25337753 2692 return __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
2693 }
2694 }
3b0bdc72 2695 else if (br_elem->type == MB_CHAR)
15a7d175 2696 {
25337753 2697 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
15a7d175 2698 }
3b0bdc72 2699 else if (br_elem->type == COLL_SYM)
15a7d175
UD
2700 {
2701 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2702 if (nrules != 0)
2703 {
2704 int32_t elem, idx;
2705 elem = seek_collating_symbol_entry (br_elem->opr.name,
2706 sym_name_len);
2707 if (symb_table[2 * elem] != 0)
2708 {
2709 /* We found the entry. */
2710 idx = symb_table[2 * elem + 1];
2711 /* Skip the name of collating element name. */
2712 idx += 1 + extra[idx];
2713 /* Skip the byte sequence of the collating element. */
2714 idx += 1 + extra[idx];
2715 /* Adjust for the alignment. */
2716 idx = (idx + 3) & ~3;
2717 /* Skip the multibyte collation sequence value. */
2718 idx += sizeof (unsigned int);
2719 /* Skip the wide char sequence of the collating element. */
2720 idx += sizeof (unsigned int) *
2721 (1 + *(unsigned int *) (extra + idx));
2722 /* Return the collation sequence value. */
2723 return *(unsigned int *) (extra + idx);
2724 }
2725 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2726 {
2727 /* No valid character. Match it as a single byte
2728 character. */
2729 return collseqmb[br_elem->opr.name[0]];
2730 }
2731 }
2732 else if (sym_name_len == 1)
2733 return collseqmb[br_elem->opr.name[0]];
2734 }
3b0bdc72
UD
2735 return UINT_MAX;
2736 }
2737
434d3784 2738 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2739 Build the range expression which starts from START_ELEM, and ends
2740 at END_ELEM. The result are written to MBCSET and SBCSET.
2741 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2742 mbcset->range_ends, is a pointer argument sinse we may
2743 update it. */
2744
2745 static inline reg_errcode_t
25337753 2746 __attribute ((always_inline))
c0a0f9a3
UD
2747# ifdef RE_ENABLE_I18N
2748 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
15a7d175
UD
2749 re_charset_t *mbcset;
2750 int *range_alloc;
c0a0f9a3
UD
2751# else /* not RE_ENABLE_I18N */
2752 build_range_exp (sbcset, start_elem, end_elem)
2753# endif /* not RE_ENABLE_I18N */
15a7d175
UD
2754 re_bitset_ptr_t sbcset;
2755 bracket_elem_t *start_elem, *end_elem;
3b0bdc72
UD
2756 {
2757 unsigned int ch;
2758 uint32_t start_collseq;
2759 uint32_t end_collseq;
2760
c0a0f9a3 2761# ifdef RE_ENABLE_I18N
3b0bdc72
UD
2762 /* Check the space of the arrays. */
2763 if (*range_alloc == mbcset->nranges)
15a7d175
UD
2764 {
2765 /* There are not enough space, need realloc. */
2766 uint32_t *new_array_start;
2767 uint32_t *new_array_end;
3b0bdc72
UD
2768 int new_nranges;
2769
15a7d175
UD
2770 /* +1 in case of mbcset->nranges is 0. */
2771 new_nranges = 2 * mbcset->nranges + 1;
a9388965 2772 /* Use realloc since mbcset->range_starts and mbcset->range_ends
15a7d175
UD
2773 are NULL if *range_alloc == 0. */
2774 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2775 new_nranges);
2776 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2777 new_nranges);
a9388965 2778
15a7d175
UD
2779 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2780 return REG_ESPACE;
3b0bdc72 2781
15a7d175
UD
2782 mbcset->range_starts = new_array_start;
2783 mbcset->range_ends = new_array_end;
3b0bdc72 2784 *range_alloc = new_nranges;
15a7d175 2785 }
c0a0f9a3 2786# endif /* RE_ENABLE_I18N */
3b0bdc72 2787
434d3784 2788 /* Equivalence Classes and Character Classes can't be a range
15a7d175 2789 start/end. */
bc15410e 2790 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2791 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2792 0))
2793 return REG_ERANGE;
3b0bdc72
UD
2794
2795 start_collseq = lookup_collation_sequence_value (start_elem);
2796 end_collseq = lookup_collation_sequence_value (end_elem);
2797 /* Check start/end collation sequence values. */
bc15410e 2798 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
15a7d175 2799 return REG_ECOLLATE;
bc15410e 2800 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
15a7d175 2801 return REG_ERANGE;
3b0bdc72 2802
c0a0f9a3 2803# ifdef RE_ENABLE_I18N
3b0bdc72
UD
2804 /* Got valid collation sequence values, add them as a new entry. */
2805 mbcset->range_starts[mbcset->nranges] = start_collseq;
2806 mbcset->range_ends[mbcset->nranges++] = end_collseq;
c0a0f9a3 2807# endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2808
2809 /* Build the table for single byte characters. */
2810 for (ch = 0; ch <= SBC_MAX; ch++)
15a7d175
UD
2811 {
2812 uint32_t ch_collseq;
2813 /*
2814 if (MB_CUR_MAX == 1)
2815 */
2816 if (nrules == 0)
2817 ch_collseq = collseqmb[ch];
2818 else
25337753 2819 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
15a7d175
UD
2820 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2821 bitset_set (sbcset, ch);
2822 }
3b0bdc72
UD
2823 return REG_NOERROR;
2824 }
3b0bdc72 2825
434d3784 2826 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2827 Build the collating element which is represented by NAME.
2828 The result are written to MBCSET and SBCSET.
2829 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2830 pointer argument sinse we may update it. */
2831
2832 static inline reg_errcode_t
25337753 2833 __attribute ((always_inline))
c0a0f9a3
UD
2834# ifdef RE_ENABLE_I18N
2835 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
15a7d175
UD
2836 re_charset_t *mbcset;
2837 int *coll_sym_alloc;
c0a0f9a3
UD
2838# else /* not RE_ENABLE_I18N */
2839 build_collating_symbol (sbcset, name)
2840# endif /* not RE_ENABLE_I18N */
15a7d175
UD
2841 re_bitset_ptr_t sbcset;
2842 const unsigned char *name;
3b0bdc72 2843 {
3b0bdc72 2844 int32_t elem, idx;
62439eac 2845 size_t name_len = strlen ((const char *) name);
3b0bdc72 2846 if (nrules != 0)
15a7d175
UD
2847 {
2848 elem = seek_collating_symbol_entry (name, name_len);
2849 if (symb_table[2 * elem] != 0)
2850 {
2851 /* We found the entry. */
2852 idx = symb_table[2 * elem + 1];
2853 /* Skip the name of collating element name. */
2854 idx += 1 + extra[idx];
2855 }
2856 else if (symb_table[2 * elem] == 0 && name_len == 1)
2857 {
2858 /* No valid character, treat it as a normal
2859 character. */
2860 bitset_set (sbcset, name[0]);
2861 return REG_NOERROR;
2862 }
2863 else
2864 return REG_ECOLLATE;
3b0bdc72 2865
c0a0f9a3 2866# ifdef RE_ENABLE_I18N
15a7d175
UD
2867 /* Got valid collation sequence, add it as a new entry. */
2868 /* Check the space of the arrays. */
2869 if (*coll_sym_alloc == mbcset->ncoll_syms)
2870 {
2871 /* Not enough, realloc it. */
2872 /* +1 in case of mbcset->ncoll_syms is 0. */
2873 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2874 /* Use realloc since mbcset->coll_syms is NULL
2875 if *alloc == 0. */
2876 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2877 *coll_sym_alloc);
2878 if (BE (mbcset->coll_syms == NULL, 0))
2879 return REG_ESPACE;
2880 }
2881 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
c0a0f9a3 2882# endif /* RE_ENABLE_I18N */
15a7d175
UD
2883 return REG_NOERROR;
2884 }
3b0bdc72 2885 else
15a7d175
UD
2886 {
2887 if (BE (name_len != 1, 0))
2888 return REG_ECOLLATE;
2889 else
2890 {
2891 bitset_set (sbcset, name[0]);
2892 return REG_NOERROR;
2893 }
2894 }
3b0bdc72 2895 }
434d3784
UD
2896#endif
2897
3b0bdc72
UD
2898 re_token_t br_token;
2899 re_bitset_ptr_t sbcset;
c0a0f9a3 2900#ifdef RE_ENABLE_I18N
3b0bdc72 2901 re_charset_t *mbcset;
3b0bdc72
UD
2902 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2903 int equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3
UD
2904#else /* not RE_ENABLE_I18N */
2905 int non_match = 0;
2906#endif /* not RE_ENABLE_I18N */
2907 bin_tree_t *work_tree;
2908 int token_len, new_idx;
3b0bdc72 2909#ifdef _LIBC
62439eac
UD
2910 collseqmb = (const unsigned char *)
2911 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3b0bdc72
UD
2912 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2913 if (nrules)
2914 {
2915 /*
2916 if (MB_CUR_MAX > 1)
2917 */
15a7d175 2918 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3b0bdc72
UD
2919 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2920 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 2921 _NL_COLLATE_SYMB_TABLEMB);
3b0bdc72 2922 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 2923 _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
2924 }
2925#endif
2926 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 2927#ifdef RE_ENABLE_I18N
3b0bdc72 2928 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
2929#endif /* RE_ENABLE_I18N */
2930#ifdef RE_ENABLE_I18N
bc15410e 2931 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
2932#else
2933 if (BE (sbcset == NULL, 0))
2934#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2935 {
2936 *err = REG_ESPACE;
2937 return NULL;
2938 }
2939
2940 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2941 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 2942 {
3b0bdc72 2943 *err = REG_BADPAT;
434d3784 2944 goto parse_bracket_exp_free_return;
3b0bdc72
UD
2945 }
2946 if (token->type == OP_NON_MATCH_LIST)
2947 {
c0a0f9a3 2948#ifdef RE_ENABLE_I18N
3b0bdc72
UD
2949 int i;
2950 mbcset->non_match = 1;
c0a0f9a3
UD
2951#else /* not RE_ENABLE_I18N */
2952 non_match = 1;
2953#endif /* not RE_ENABLE_I18N */
3b0bdc72 2954 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
15a7d175 2955 bitset_set (sbcset, '\0');
3b0bdc72
UD
2956 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2957 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2958 if (BE (token->type == END_OF_RE, 0))
15a7d175
UD
2959 {
2960 *err = REG_BADPAT;
2961 goto parse_bracket_exp_free_return;
2962 }
c0a0f9a3 2963#ifdef RE_ENABLE_I18N
3c0fb574 2964 if (dfa->mb_cur_max > 1)
15a7d175
UD
2965 for (i = 0; i < SBC_MAX; ++i)
2966 if (__btowc (i) == WEOF)
2967 bitset_set (sbcset, i);
c0a0f9a3 2968#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2969 }
2970
2971 /* We treat the first ']' as a normal character. */
2972 if (token->type == OP_CLOSE_BRACKET)
2973 token->type = CHARACTER;
2974
2975 while (1)
2976 {
2977 bracket_elem_t start_elem, end_elem;
62439eac
UD
2978 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2979 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3b0bdc72
UD
2980 reg_errcode_t ret;
2981 int token_len2 = 0, is_range_exp = 0;
2982 re_token_t token2;
2983
2984 start_elem.opr.name = start_name_buf;
2985 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
15a7d175 2986 syntax);
bc15410e 2987 if (BE (ret != REG_NOERROR, 0))
15a7d175
UD
2988 {
2989 *err = ret;
2990 goto parse_bracket_exp_free_return;
2991 }
3b0bdc72
UD
2992
2993 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2994 if (BE (token->type == END_OF_RE, 0))
15a7d175
UD
2995 {
2996 *err = REG_BADPAT;
2997 goto parse_bracket_exp_free_return;
2998 }
3b0bdc72 2999 if (token->type == OP_CHARSET_RANGE)
15a7d175
UD
3000 {
3001 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3002 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3003 if (BE (token->type == END_OF_RE, 0))
3004 {
3005 *err = REG_BADPAT;
3006 goto parse_bracket_exp_free_return;
3007 }
3008 if (token2.type == OP_CLOSE_BRACKET)
3009 {
3010 /* We treat the last '-' as a normal character. */
3011 re_string_skip_bytes (regexp, -token_len);
3012 token->type = CHARACTER;
3013 }
3014 else
3015 is_range_exp = 1;
3016 }
3b0bdc72
UD
3017
3018 if (is_range_exp == 1)
15a7d175
UD
3019 {
3020 end_elem.opr.name = end_name_buf;
3021 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3022 dfa, syntax);
3023 if (BE (ret != REG_NOERROR, 0))
3024 {
3025 *err = ret;
3026 goto parse_bracket_exp_free_return;
3027 }
3028
3029 token_len = peek_token_bracket (token, regexp, syntax);
3030 if (BE (token->type == END_OF_RE, 0))
3031 {
3032 *err = REG_BADPAT;
3033 goto parse_bracket_exp_free_return;
3034 }
3035 *err = build_range_exp (sbcset,
c0a0f9a3 3036#ifdef RE_ENABLE_I18N
15a7d175 3037 mbcset, &range_alloc,
c0a0f9a3 3038#endif /* RE_ENABLE_I18N */
15a7d175
UD
3039 &start_elem, &end_elem);
3040 if (BE (*err != REG_NOERROR, 0))
3041 goto parse_bracket_exp_free_return;
3042 }
3b0bdc72 3043 else
15a7d175
UD
3044 {
3045 switch (start_elem.type)
3046 {
3047 case SB_CHAR:
3048 bitset_set (sbcset, start_elem.opr.ch);
3049 break;
c0a0f9a3 3050#ifdef RE_ENABLE_I18N
15a7d175
UD
3051 case MB_CHAR:
3052 /* Check whether the array has enough space. */
3053 if (mbchar_alloc == mbcset->nmbchars)
3054 {
3055 /* Not enough, realloc it. */
3056 /* +1 in case of mbcset->nmbchars is 0. */
3057 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3058 /* Use realloc since array is NULL if *alloc == 0. */
3059 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
3060 mbchar_alloc);
3061 if (BE (mbcset->mbchars == NULL, 0))
3062 goto parse_bracket_exp_espace;
3063 }
3064 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3065 break;
c0a0f9a3 3066#endif /* RE_ENABLE_I18N */
15a7d175
UD
3067 case EQUIV_CLASS:
3068 *err = build_equiv_class (sbcset,
c0a0f9a3 3069#ifdef RE_ENABLE_I18N
15a7d175 3070 mbcset, &equiv_class_alloc,
c0a0f9a3 3071#endif /* RE_ENABLE_I18N */
3b0bdc72 3072 start_elem.opr.name);
15a7d175
UD
3073 if (BE (*err != REG_NOERROR, 0))
3074 goto parse_bracket_exp_free_return;
3075 break;
3076 case COLL_SYM:
3077 *err = build_collating_symbol (sbcset,
c0a0f9a3 3078#ifdef RE_ENABLE_I18N
15a7d175 3079 mbcset, &coll_sym_alloc,
c0a0f9a3 3080#endif /* RE_ENABLE_I18N */
3b0bdc72 3081 start_elem.opr.name);
15a7d175
UD
3082 if (BE (*err != REG_NOERROR, 0))
3083 goto parse_bracket_exp_free_return;
3084 break;
3085 case CHAR_CLASS:
66b110e8 3086 *err = build_charclass (regexp->trans, sbcset,
c0a0f9a3 3087#ifdef RE_ENABLE_I18N
7b7b9e70 3088 mbcset, &char_class_alloc,
c0a0f9a3 3089#endif /* RE_ENABLE_I18N */
7b7b9e70
UD
3090 start_elem.opr.name, syntax);
3091 if (BE (*err != REG_NOERROR, 0))
3092 goto parse_bracket_exp_free_return;
15a7d175
UD
3093 break;
3094 default:
3095 assert (0);
3096 break;
3097 }
3098 }
3b0bdc72 3099 if (token->type == OP_CLOSE_BRACKET)
15a7d175 3100 break;
3b0bdc72
UD
3101 }
3102
3103 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3104
3105 /* If it is non-matching list. */
c0a0f9a3 3106#ifdef RE_ENABLE_I18N
3b0bdc72 3107 if (mbcset->non_match)
c0a0f9a3
UD
3108#else /* not RE_ENABLE_I18N */
3109 if (non_match)
3110#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3111 bitset_not (sbcset);
3112
3113 /* Build a tree for simple bracket. */
3114 br_token.type = SIMPLE_BRACKET;
3115 br_token.opr.sbcset = sbcset;
3116 new_idx = re_dfa_add_node (dfa, br_token, 0);
3117 work_tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 3118 if (BE (new_idx == -1 || work_tree == NULL, 0))
3b0bdc72
UD
3119 goto parse_bracket_exp_espace;
3120
c0a0f9a3 3121#ifdef RE_ENABLE_I18N
3b0bdc72 3122 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3c0fb574
UD
3123 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3124 || mbcset->non_match)))
3b0bdc72
UD
3125 {
3126 re_token_t alt_token;
3127 bin_tree_t *mbc_tree;
3128 /* Build a tree for complex bracket. */
3129 br_token.type = COMPLEX_BRACKET;
3130 br_token.opr.mbcset = mbcset;
3131 dfa->has_mb_node = 1;
3132 new_idx = re_dfa_add_node (dfa, br_token, 0);
3133 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 3134 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
15a7d175 3135 goto parse_bracket_exp_espace;
3b0bdc72 3136 /* Then join them by ALT node. */
0742e48e 3137 dfa->has_plural_match = 1;
3b0bdc72
UD
3138 alt_token.type = OP_ALT;
3139 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3140 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
bc15410e 3141 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
15a7d175 3142 return work_tree;
3b0bdc72
UD
3143 }
3144 else
3145 {
3146 free_charset (mbcset);
3147 return work_tree;
3148 }
c0a0f9a3
UD
3149#else /* not RE_ENABLE_I18N */
3150 return work_tree;
3151#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3152
3153 parse_bracket_exp_espace:
3b0bdc72 3154 *err = REG_ESPACE;
434d3784
UD
3155 parse_bracket_exp_free_return:
3156 re_free (sbcset);
c0a0f9a3 3157#ifdef RE_ENABLE_I18N
434d3784 3158 free_charset (mbcset);
c0a0f9a3 3159#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3160 return NULL;
3161}
3162
434d3784
UD
3163/* Parse an element in the bracket expression. */
3164
3b0bdc72
UD
3165static reg_errcode_t
3166parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3167 bracket_elem_t *elem;
3168 re_string_t *regexp;
3169 re_token_t *token;
3170 int token_len;
3171 re_dfa_t *dfa;
3172 reg_syntax_t syntax;
3173{
3174#ifdef RE_ENABLE_I18N
3175 int cur_char_size;
3176 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3177 if (cur_char_size > 1)
3178 {
3179 elem->type = MB_CHAR;
3180 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3181 re_string_skip_bytes (regexp, cur_char_size);
3182 return REG_NOERROR;
3183 }
3184#endif /* RE_ENABLE_I18N */
3185 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3186 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3187 || token->type == OP_OPEN_EQUIV_CLASS)
3188 return parse_bracket_symbol (elem, regexp, token);
3189 elem->type = SB_CHAR;
3190 elem->opr.ch = token->opr.c;
3191 return REG_NOERROR;
3192}
3193
434d3784
UD
3194/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3195 such as [:<character_class>:], [.<collating_element>.], and
3196 [=<equivalent_class>=]. */
3197
3b0bdc72
UD
3198static reg_errcode_t
3199parse_bracket_symbol (elem, regexp, token)
3200 bracket_elem_t *elem;
3201 re_string_t *regexp;
3202 re_token_t *token;
3203{
3204 unsigned char ch, delim = token->opr.c;
3205 int i = 0;
602c2f9d 3206 for (;; ++i)
3b0bdc72 3207 {
602c2f9d 3208 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
15a7d175 3209 return REG_EBRACK;
3b0bdc72 3210 if (token->type == OP_OPEN_CHAR_CLASS)
15a7d175 3211 ch = re_string_fetch_byte_case (regexp);
3b0bdc72 3212 else
15a7d175 3213 ch = re_string_fetch_byte (regexp);
3b0bdc72 3214 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
15a7d175 3215 break;
3b0bdc72
UD
3216 elem->opr.name[i] = ch;
3217 }
3218 re_string_skip_bytes (regexp, 1);
3219 elem->opr.name[i] = '\0';
3220 switch (token->type)
3221 {
3222 case OP_OPEN_COLL_ELEM:
3223 elem->type = COLL_SYM;
3224 break;
3225 case OP_OPEN_EQUIV_CLASS:
3226 elem->type = EQUIV_CLASS;
3227 break;
3228 case OP_OPEN_CHAR_CLASS:
3229 elem->type = CHAR_CLASS;
3230 break;
3231 default:
3232 break;
3233 }
3234 return REG_NOERROR;
3235}
3236
3237 /* Helper function for parse_bracket_exp.
3238 Build the equivalence class which is represented by NAME.
3239 The result are written to MBCSET and SBCSET.
3240 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3241 is a pointer argument sinse we may update it. */
3242
3243static reg_errcode_t
c0a0f9a3
UD
3244#ifdef RE_ENABLE_I18N
3245build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3b0bdc72 3246 re_charset_t *mbcset;
3b0bdc72 3247 int *equiv_class_alloc;
c0a0f9a3
UD
3248#else /* not RE_ENABLE_I18N */
3249build_equiv_class (sbcset, name)
3250#endif /* not RE_ENABLE_I18N */
3251 re_bitset_ptr_t sbcset;
62439eac 3252 const unsigned char *name;
3b0bdc72 3253{
c0a0f9a3 3254#if defined _LIBC && defined RE_ENABLE_I18N
3b0bdc72
UD
3255 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3256 if (nrules != 0)
3257 {
3258 const int32_t *table, *indirect;
3259 const unsigned char *weights, *extra, *cp;
62439eac 3260 unsigned char char_buf[2];
3b0bdc72
UD
3261 int32_t idx1, idx2;
3262 unsigned int ch;
3263 size_t len;
3264 /* This #include defines a local function! */
3265# include <locale/weight.h>
3266 /* Calculate the index for equivalence class. */
62439eac 3267 cp = name;
3b0bdc72
UD
3268 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3269 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3270 _NL_COLLATE_WEIGHTMB);
3271 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3272 _NL_COLLATE_EXTRAMB);
3b0bdc72 3273 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3274 _NL_COLLATE_INDIRECTMB);
3b0bdc72 3275 idx1 = findidx (&cp);
62439eac 3276 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
15a7d175
UD
3277 /* This isn't a valid character. */
3278 return REG_ECOLLATE;
3b0bdc72
UD
3279
3280 /* Build single byte matcing table for this equivalence class. */
62439eac 3281 char_buf[1] = (unsigned char) '\0';
3b0bdc72
UD
3282 len = weights[idx1];
3283 for (ch = 0; ch < SBC_MAX; ++ch)
15a7d175
UD
3284 {
3285 char_buf[0] = ch;
3286 cp = char_buf;
3287 idx2 = findidx (&cp);
3b0bdc72 3288/*
15a7d175 3289 idx2 = table[ch];
3b0bdc72 3290*/
15a7d175
UD
3291 if (idx2 == 0)
3292 /* This isn't a valid character. */
3293 continue;
3294 if (len == weights[idx2])
3295 {
3296 int cnt = 0;
3297 while (cnt <= len &&
3298 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3299 ++cnt;
3300
3301 if (cnt > len)
3302 bitset_set (sbcset, ch);
3303 }
3304 }
a9388965
UD
3305 /* Check whether the array has enough space. */
3306 if (*equiv_class_alloc == mbcset->nequiv_classes)
15a7d175
UD
3307 {
3308 /* Not enough, realloc it. */
3309 /* +1 in case of mbcset->nequiv_classes is 0. */
3310 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3311 /* Use realloc since the array is NULL if *alloc == 0. */
3312 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3313 *equiv_class_alloc);
3314 if (BE (mbcset->equiv_classes == NULL, 0))
3315 return REG_ESPACE;
3316 }
3b0bdc72
UD
3317 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3318 }
3319 else
c0a0f9a3 3320#endif /* _LIBC && RE_ENABLE_I18N */
3b0bdc72 3321 {
62439eac 3322 if (BE (strlen ((const char *) name) != 1, 0))
15a7d175 3323 return REG_ECOLLATE;
62439eac 3324 bitset_set (sbcset, *name);
3b0bdc72
UD
3325 }
3326 return REG_NOERROR;
3327}
3328
3329 /* Helper function for parse_bracket_exp.
3330 Build the character class which is represented by NAME.
3331 The result are written to MBCSET and SBCSET.
3332 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3333 is a pointer argument sinse we may update it. */
3334
3335static reg_errcode_t
c0a0f9a3 3336#ifdef RE_ENABLE_I18N
66b110e8 3337build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3b0bdc72 3338 re_charset_t *mbcset;
3b0bdc72 3339 int *char_class_alloc;
c0a0f9a3 3340#else /* not RE_ENABLE_I18N */
66b110e8 3341build_charclass (trans, sbcset, class_name, syntax)
c0a0f9a3 3342#endif /* not RE_ENABLE_I18N */
66b110e8 3343 RE_TRANSLATE_TYPE trans;
c0a0f9a3 3344 re_bitset_ptr_t sbcset;
62439eac 3345 const unsigned char *class_name;
602c2f9d 3346 reg_syntax_t syntax;
3b0bdc72
UD
3347{
3348 int i;
62439eac 3349 const char *name = (const char *) class_name;
c0a0f9a3
UD
3350
3351 /* In case of REG_ICASE "upper" and "lower" match the both of
3352 upper and lower cases. */
3353 if ((syntax & RE_ICASE)
3354 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3355 name = "alpha";
3356
3357#ifdef RE_ENABLE_I18N
3b0bdc72 3358 /* Check the space of the arrays. */
a9388965
UD
3359 if (*char_class_alloc == mbcset->nchar_classes)
3360 {
3361 /* Not enough, realloc it. */
3362 /* +1 in case of mbcset->nchar_classes is 0. */
3363 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3364 /* Use realloc since array is NULL if *alloc == 0. */
3365 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
15a7d175 3366 *char_class_alloc);
bc15410e 3367 if (BE (mbcset->char_classes == NULL, 0))
15a7d175 3368 return REG_ESPACE;
a9388965 3369 }
3b0bdc72 3370 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3371#endif /* RE_ENABLE_I18N */
3b0bdc72 3372
66b110e8
UD
3373#define BUILD_CHARCLASS_LOOP(ctype_func) \
3374 for (i = 0; i < SBC_MAX; ++i) \
3375 { \
3376 if (ctype_func (i)) \
3377 { \
3378 int ch = trans ? trans[i] : i; \
3379 bitset_set (sbcset, ch); \
3380 } \
3b0bdc72
UD
3381 }
3382
3383 if (strcmp (name, "alnum") == 0)
3384 BUILD_CHARCLASS_LOOP (isalnum)
3385 else if (strcmp (name, "cntrl") == 0)
3386 BUILD_CHARCLASS_LOOP (iscntrl)
3387 else if (strcmp (name, "lower") == 0)
3388 BUILD_CHARCLASS_LOOP (islower)
3389 else if (strcmp (name, "space") == 0)
3390 BUILD_CHARCLASS_LOOP (isspace)
3391 else if (strcmp (name, "alpha") == 0)
3392 BUILD_CHARCLASS_LOOP (isalpha)
3393 else if (strcmp (name, "digit") == 0)
3394 BUILD_CHARCLASS_LOOP (isdigit)
3395 else if (strcmp (name, "print") == 0)
3396 BUILD_CHARCLASS_LOOP (isprint)
3397 else if (strcmp (name, "upper") == 0)
3398 BUILD_CHARCLASS_LOOP (isupper)
3399 else if (strcmp (name, "blank") == 0)
3400 BUILD_CHARCLASS_LOOP (isblank)
3401 else if (strcmp (name, "graph") == 0)
3402 BUILD_CHARCLASS_LOOP (isgraph)
3403 else if (strcmp (name, "punct") == 0)
3404 BUILD_CHARCLASS_LOOP (ispunct)
3405 else if (strcmp (name, "xdigit") == 0)
3406 BUILD_CHARCLASS_LOOP (isxdigit)
3407 else
3408 return REG_ECTYPE;
3409
3410 return REG_NOERROR;
3411}
3412
3413static bin_tree_t *
e2b6bfa3 3414build_charclass_op (dfa, trans, class_name, extra, not, err)
3b0bdc72 3415 re_dfa_t *dfa;
66b110e8 3416 RE_TRANSLATE_TYPE trans;
e2b6bfa3
UD
3417 const unsigned char *class_name;
3418 const unsigned char *extra;
3b0bdc72
UD
3419 int not;
3420 reg_errcode_t *err;
3421{
3422 re_bitset_ptr_t sbcset;
c0a0f9a3 3423#ifdef RE_ENABLE_I18N
3b0bdc72 3424 re_charset_t *mbcset;
c0a0f9a3
UD
3425 int alloc = 0;
3426#else /* not RE_ENABLE_I18N */
3427 int non_match = 0;
3428#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3429 reg_errcode_t ret;
3430 re_token_t br_token;
3431 bin_tree_t *tree;
c0a0f9a3 3432 int new_idx;
3b0bdc72
UD
3433
3434 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 3435#ifdef RE_ENABLE_I18N
3b0bdc72 3436 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3437#endif /* RE_ENABLE_I18N */
3438
3439#ifdef RE_ENABLE_I18N
bc15410e 3440 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3441#else /* not RE_ENABLE_I18N */
3442 if (BE (sbcset == NULL, 0))
3443#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3444 {
3445 *err = REG_ESPACE;
3446 return NULL;
3447 }
3448
3449 if (not)
3450 {
c0a0f9a3 3451#ifdef RE_ENABLE_I18N
3b0bdc72 3452 int i;
3b0bdc72
UD
3453 /*
3454 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
15a7d175 3455 bitset_set(cset->sbcset, '\0');
3b0bdc72 3456 */
c0a0f9a3 3457 mbcset->non_match = 1;
3c0fb574 3458 if (dfa->mb_cur_max > 1)
15a7d175
UD
3459 for (i = 0; i < SBC_MAX; ++i)
3460 if (__btowc (i) == WEOF)
3461 bitset_set (sbcset, i);
c0a0f9a3
UD
3462#else /* not RE_ENABLE_I18N */
3463 non_match = 1;
3464#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3465 }
3466
602c2f9d 3467 /* We don't care the syntax in this case. */
66b110e8 3468 ret = build_charclass (trans, sbcset,
c0a0f9a3 3469#ifdef RE_ENABLE_I18N
15a7d175 3470 mbcset, &alloc,
c0a0f9a3 3471#endif /* RE_ENABLE_I18N */
e2b6bfa3 3472 class_name, 0);
c0a0f9a3 3473
bc15410e 3474 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
3475 {
3476 re_free (sbcset);
c0a0f9a3 3477#ifdef RE_ENABLE_I18N
3b0bdc72 3478 free_charset (mbcset);
c0a0f9a3 3479#endif /* RE_ENABLE_I18N */
7b7b9e70 3480 *err = ret;
3b0bdc72
UD
3481 return NULL;
3482 }
434d3784 3483 /* \w match '_' also. */
e2b6bfa3
UD
3484 for (; *extra; extra++)
3485 bitset_set (sbcset, *extra);
3b0bdc72
UD
3486
3487 /* If it is non-matching list. */
c0a0f9a3 3488#ifdef RE_ENABLE_I18N
3b0bdc72 3489 if (mbcset->non_match)
c0a0f9a3
UD
3490#else /* not RE_ENABLE_I18N */
3491 if (non_match)
3492#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3493 bitset_not (sbcset);
3494
3495 /* Build a tree for simple bracket. */
3496 br_token.type = SIMPLE_BRACKET;
3497 br_token.opr.sbcset = sbcset;
3498 new_idx = re_dfa_add_node (dfa, br_token, 0);
3499 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 3500 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
3501 goto build_word_op_espace;
3502
c0a0f9a3 3503#ifdef RE_ENABLE_I18N
3c0fb574 3504 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
3505 {
3506 re_token_t alt_token;
3507 bin_tree_t *mbc_tree;
3508 /* Build a tree for complex bracket. */
3509 br_token.type = COMPLEX_BRACKET;
3510 br_token.opr.mbcset = mbcset;
3511 dfa->has_mb_node = 1;
3512 new_idx = re_dfa_add_node (dfa, br_token, 0);
3513 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 3514 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
15a7d175 3515 goto build_word_op_espace;
3b0bdc72
UD
3516 /* Then join them by ALT node. */
3517 alt_token.type = OP_ALT;
3518 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3519 tree = create_tree (tree, mbc_tree, 0, new_idx);
bc15410e 3520 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
15a7d175 3521 return tree;
3b0bdc72
UD
3522 }
3523 else
3524 {
3525 free_charset (mbcset);
3526 return tree;
3527 }
c0a0f9a3
UD
3528#else /* not RE_ENABLE_I18N */
3529 return tree;
3530#endif /* not RE_ENABLE_I18N */
3531
3b0bdc72
UD
3532 build_word_op_espace:
3533 re_free (sbcset);
c0a0f9a3 3534#ifdef RE_ENABLE_I18N
3b0bdc72 3535 free_charset (mbcset);
c0a0f9a3 3536#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3537 *err = REG_ESPACE;
3538 return NULL;
3539}
3540
3541/* This is intended for the expressions like "a{1,3}".
3542 Fetch a number from `input', and return the number.
3543 Return -1, if the number field is empty like "{,1}".
3544 Return -2, If an error is occured. */
3545
3546static int
3547fetch_number (input, token, syntax)
3548 re_string_t *input;
3549 re_token_t *token;
3550 reg_syntax_t syntax;
3551{
3552 int num = -1;
3553 unsigned char c;
3554 while (1)
3555 {
3556 *token = fetch_token (input, syntax);
3557 c = token->opr.c;
602c2f9d 3558 if (BE (token->type == END_OF_RE, 0))
15a7d175 3559 return -2;
3b0bdc72 3560 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
15a7d175 3561 break;
602c2f9d 3562 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
15a7d175 3563 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
602c2f9d 3564 num = (num > RE_DUP_MAX) ? -2 : num;
3b0bdc72 3565 }
3b0bdc72
UD
3566 return num;
3567}
3568\f
c0a0f9a3 3569#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3570static void
3571free_charset (re_charset_t *cset)
3572{
3573 re_free (cset->mbchars);
c0a0f9a3 3574# ifdef _LIBC
3b0bdc72
UD
3575 re_free (cset->coll_syms);
3576 re_free (cset->equiv_classes);
3577 re_free (cset->range_starts);
3578 re_free (cset->range_ends);
c0a0f9a3 3579# endif
3b0bdc72
UD
3580 re_free (cset->char_classes);
3581 re_free (cset);
3582}
c0a0f9a3 3583#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3584\f
3585/* Functions for binary tree operation. */
3586
3587/* Create a node of tree.
3588 Note: This function automatically free left and right if malloc fails. */
3589
3590static bin_tree_t *
3591create_tree (left, right, type, index)
3592 bin_tree_t *left;
3593 bin_tree_t *right;
3594 re_token_type_t type;
3595 int index;
3596{
3597 bin_tree_t *tree;
3598 tree = re_malloc (bin_tree_t, 1);
bc15410e 3599 if (BE (tree == NULL, 0))
3b0bdc72
UD
3600 {
3601 free_bin_tree (left);
3602 free_bin_tree (right);
3603 return NULL;
3604 }
3605 tree->parent = NULL;
3606 tree->left = left;
3607 tree->right = right;
3608 tree->type = type;
3609 tree->node_idx = index;
3610 tree->first = -1;
3611 tree->next = -1;
3612 re_node_set_init_empty (&tree->eclosure);
3613
3614 if (left != NULL)
3615 left->parent = tree;
3616 if (right != NULL)
3617 right->parent = tree;
3618 return tree;
3619}
3620
3621/* Free the sub tree pointed by TREE. */
3622
3623static void
3624free_bin_tree (tree)
3625 bin_tree_t *tree;
3626{
3627 if (tree == NULL)
3628 return;
3629 /*re_node_set_free (&tree->eclosure);*/
3630 free_bin_tree (tree->left);
3631 free_bin_tree (tree->right);
3632 re_free (tree);
3633}
3634
3635/* Duplicate the node SRC, and return new node. */
3636
3637static bin_tree_t *
3638duplicate_tree (src, dfa)
3639 const bin_tree_t *src;
3640 re_dfa_t *dfa;
3641{
3642 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3643 int new_node_idx;
3644 /* Since node indies must be according to Post-order of the tree,
3645 we must duplicate the left at first. */
3646 if (src->left != NULL)
3647 {
3648 left = duplicate_tree (src->left, dfa);
3649 if (left == NULL)
15a7d175 3650 return NULL;
3b0bdc72
UD
3651 }
3652
3653 /* Secondaly, duplicate the right. */
3654 if (src->right != NULL)
3655 {
3656 right = duplicate_tree (src->right, dfa);
3657 if (right == NULL)
15a7d175
UD
3658 {
3659 free_bin_tree (left);
3660 return NULL;
3661 }
3b0bdc72
UD
3662 }
3663
3664 /* At last, duplicate itself. */
3665 if (src->type == NON_TYPE)
3666 {
3667 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3668 dfa->nodes[new_node_idx].duplicated = 1;
bc15410e 3669 if (BE (new_node_idx == -1, 0))
15a7d175
UD
3670 {
3671 free_bin_tree (left);
3672 free_bin_tree (right);
3673 return NULL;
3674 }
3b0bdc72
UD
3675 }
3676 else
3677 new_node_idx = src->type;
3678
3679 new_tree = create_tree (left, right, src->type, new_node_idx);
bc15410e 3680 if (BE (new_tree == NULL, 0))
3b0bdc72
UD
3681 {
3682 free_bin_tree (left);
3683 free_bin_tree (right);
3684 }
3685 return new_tree;
3686}
This page took 0.525899 seconds and 5 git commands to generate.