]> sourceware.org Git - glibc.git/blob - posix/regcomp.c
Upate.
[glibc.git] / posix / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static reg_errcode_t init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static reg_errcode_t analyze (re_dfa_t *dfa);
34 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
35 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
36 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
37 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
38 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
39 int top_clone_node, int root_node,
40 unsigned int constraint);
41 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
42 unsigned int constraint);
43 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
44 unsigned int constraint);
45 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
46 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
47 int node, int root);
48 static void calc_inveclosure (re_dfa_t *dfa);
49 static int fetch_number (re_string_t *input, re_token_t *token,
50 reg_syntax_t syntax);
51 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
52 static int peek_token (re_token_t *token, re_string_t *input,
53 reg_syntax_t syntax);
54 static int peek_token_bracket (re_token_t *token, re_string_t *input,
55 reg_syntax_t syntax);
56 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
57 reg_syntax_t syntax, reg_errcode_t *err);
58 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
59 re_token_t *token, reg_syntax_t syntax,
60 int nest, reg_errcode_t *err);
61 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
71 re_dfa_t *dfa, re_token_t *token,
72 reg_syntax_t syntax, reg_errcode_t *err);
73 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
74 re_token_t *token, reg_syntax_t syntax,
75 reg_errcode_t *err);
76 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
77 re_string_t *regexp,
78 re_token_t *token, int token_len,
79 re_dfa_t *dfa,
80 reg_syntax_t syntax);
81 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
82 re_string_t *regexp,
83 re_token_t *token);
84 #ifndef _LIBC
85 # ifdef RE_ENABLE_I18N
86 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
87 re_charset_t *mbcset, int *range_alloc,
88 bracket_elem_t *start_elem,
89 bracket_elem_t *end_elem);
90 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
91 re_charset_t *mbcset,
92 int *coll_sym_alloc,
93 const unsigned char *name);
94 # else /* not RE_ENABLE_I18N */
95 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
96 bracket_elem_t *start_elem,
97 bracket_elem_t *end_elem);
98 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
99 const unsigned char *name);
100 # endif /* not RE_ENABLE_I18N */
101 #endif /* not _LIBC */
102 #ifdef RE_ENABLE_I18N
103 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
104 re_charset_t *mbcset,
105 int *equiv_class_alloc,
106 const unsigned char *name);
107 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
108 re_bitset_ptr_t sbcset,
109 re_charset_t *mbcset,
110 int *char_class_alloc,
111 const unsigned char *class_name,
112 reg_syntax_t syntax);
113 #else /* not RE_ENABLE_I18N */
114 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
115 const unsigned char *name);
116 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
117 re_bitset_ptr_t sbcset,
118 const unsigned char *class_name,
119 reg_syntax_t syntax);
120 #endif /* not RE_ENABLE_I18N */
121 static bin_tree_t *build_word_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
122 int not, reg_errcode_t *err);
123 static void free_bin_tree (bin_tree_t *tree);
124 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
125 re_token_type_t type, int index);
126 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
127 \f
128 /* This table gives an error message for each of the error codes listed
129 in regex.h. Obviously the order here has to be same as there.
130 POSIX doesn't require that we do anything for REG_NOERROR,
131 but why not be nice? */
132
133 const char __re_error_msgid[] attribute_hidden =
134 {
135 #define REG_NOERROR_IDX 0
136 gettext_noop ("Success") /* REG_NOERROR */
137 "\0"
138 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
139 gettext_noop ("No match") /* REG_NOMATCH */
140 "\0"
141 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
142 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143 "\0"
144 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
145 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146 "\0"
147 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
148 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149 "\0"
150 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
151 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152 "\0"
153 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
154 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155 "\0"
156 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
157 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
158 "\0"
159 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
160 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161 "\0"
162 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
163 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164 "\0"
165 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
166 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167 "\0"
168 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
169 gettext_noop ("Invalid range end") /* REG_ERANGE */
170 "\0"
171 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
172 gettext_noop ("Memory exhausted") /* REG_ESPACE */
173 "\0"
174 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
175 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176 "\0"
177 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
178 gettext_noop ("Premature end of regular expression") /* REG_EEND */
179 "\0"
180 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
181 gettext_noop ("Regular expression too big") /* REG_ESIZE */
182 "\0"
183 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
184 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
185 };
186
187 const size_t __re_error_msgid_idx[] attribute_hidden =
188 {
189 REG_NOERROR_IDX,
190 REG_NOMATCH_IDX,
191 REG_BADPAT_IDX,
192 REG_ECOLLATE_IDX,
193 REG_ECTYPE_IDX,
194 REG_EESCAPE_IDX,
195 REG_ESUBREG_IDX,
196 REG_EBRACK_IDX,
197 REG_EPAREN_IDX,
198 REG_EBRACE_IDX,
199 REG_BADBR_IDX,
200 REG_ERANGE_IDX,
201 REG_ESPACE_IDX,
202 REG_BADRPT_IDX,
203 REG_EEND_IDX,
204 REG_ESIZE_IDX,
205 REG_ERPAREN_IDX
206 };
207 \f
208 /* Entry points for GNU code. */
209
210 /* re_compile_pattern is the GNU regular expression compiler: it
211 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
212 Returns 0 if the pattern was valid, otherwise an error string.
213
214 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
215 are set in BUFP on entry. */
216
217 const char *
218 re_compile_pattern (pattern, length, bufp)
219 const char *pattern;
220 size_t length;
221 struct re_pattern_buffer *bufp;
222 {
223 reg_errcode_t ret;
224
225 /* And GNU code determines whether or not to get register information
226 by passing null for the REGS argument to re_match, etc., not by
227 setting no_sub. */
228 bufp->no_sub = 0;
229
230 /* Match anchors at newline. */
231 bufp->newline_anchor = 1;
232
233 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234
235 if (!ret)
236 return NULL;
237 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
238 }
239 #ifdef _LIBC
240 weak_alias (__re_compile_pattern, re_compile_pattern)
241 #endif
242
243 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
244 also be assigned to arbitrarily: each pattern buffer stores its own
245 syntax, so it can be changed between regex compilations. */
246 /* This has no initializer because initialized variables in Emacs
247 become read-only after dumping. */
248 reg_syntax_t re_syntax_options;
249
250
251 /* Specify the precise syntax of regexps for compilation. This provides
252 for compatibility for various utilities which historically have
253 different, incompatible syntaxes.
254
255 The argument SYNTAX is a bit mask comprised of the various bits
256 defined in regex.h. We return the old syntax. */
257
258 reg_syntax_t
259 re_set_syntax (syntax)
260 reg_syntax_t syntax;
261 {
262 reg_syntax_t ret = re_syntax_options;
263
264 re_syntax_options = syntax;
265 return ret;
266 }
267 #ifdef _LIBC
268 weak_alias (__re_set_syntax, re_set_syntax)
269 #endif
270
271 int
272 re_compile_fastmap (bufp)
273 struct re_pattern_buffer *bufp;
274 {
275 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
276 char *fastmap = bufp->fastmap;
277
278 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
279 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
280 if (dfa->init_state != dfa->init_state_word)
281 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
282 if (dfa->init_state != dfa->init_state_nl)
283 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
284 if (dfa->init_state != dfa->init_state_begbuf)
285 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
286 bufp->fastmap_accurate = 1;
287 return 0;
288 }
289 #ifdef _LIBC
290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
291 #endif
292
293 static inline void
294 __attribute ((always_inline))
295 re_set_fastmap (char *fastmap, int icase, int ch)
296 {
297 fastmap[ch] = 1;
298 if (icase)
299 fastmap[tolower (ch)] = 1;
300 }
301
302 /* Helper function for re_compile_fastmap.
303 Compile fastmap for the initial_state INIT_STATE. */
304
305 static void
306 re_compile_fastmap_iter (bufp, init_state, fastmap)
307 regex_t *bufp;
308 const re_dfastate_t *init_state;
309 char *fastmap;
310 {
311 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
312 int node_cnt;
313 int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
314 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
315 {
316 int node = init_state->nodes.elems[node_cnt];
317 re_token_type_t type = dfa->nodes[node].type;
318
319 if (type == CHARACTER)
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 else if (type == SIMPLE_BRACKET)
322 {
323 int i, j, ch;
324 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
325 for (j = 0; j < UINT_BITS; ++j, ++ch)
326 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
327 re_set_fastmap (fastmap, icase, ch);
328 }
329 #ifdef RE_ENABLE_I18N
330 else if (type == COMPLEX_BRACKET)
331 {
332 int i;
333 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
334 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
335 || cset->nranges || cset->nchar_classes)
336 {
337 # ifdef _LIBC
338 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
339 {
340 /* In this case we want to catch the bytes which are
341 the first byte of any collation elements.
342 e.g. In da_DK, we want to catch 'a' since "aa"
343 is a valid collation element, and don't catch
344 'b' since 'b' is the only collation element
345 which starts from 'b'. */
346 int j, ch;
347 const int32_t *table = (const int32_t *)
348 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
349 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
350 for (j = 0; j < UINT_BITS; ++j, ++ch)
351 if (table[ch] < 0)
352 re_set_fastmap (fastmap, icase, ch);
353 }
354 # else
355 if (MB_CUR_MAX > 1)
356 for (i = 0; i < SBC_MAX; ++i)
357 if (__btowc (i) == WEOF)
358 re_set_fastmap (fastmap, icase, i);
359 # endif /* not _LIBC */
360 }
361 for (i = 0; i < cset->nmbchars; ++i)
362 {
363 char buf[256];
364 mbstate_t state;
365 memset (&state, '\0', sizeof (state));
366 __wcrtomb (buf, cset->mbchars[i], &state);
367 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
368 }
369 }
370 #endif /* RE_ENABLE_I18N */
371 else if (type == END_OF_RE || type == OP_PERIOD)
372 {
373 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
374 if (type == END_OF_RE)
375 bufp->can_be_null = 1;
376 return;
377 }
378 }
379 }
380 \f
381 /* Entry point for POSIX code. */
382 /* regcomp takes a regular expression as a string and compiles it.
383
384 PREG is a regex_t *. We do not expect any fields to be initialized,
385 since POSIX says we shouldn't. Thus, we set
386
387 `buffer' to the compiled pattern;
388 `used' to the length of the compiled pattern;
389 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
390 REG_EXTENDED bit in CFLAGS is set; otherwise, to
391 RE_SYNTAX_POSIX_BASIC;
392 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
393 `fastmap' to an allocated space for the fastmap;
394 `fastmap_accurate' to zero;
395 `re_nsub' to the number of subexpressions in PATTERN.
396
397 PATTERN is the address of the pattern string.
398
399 CFLAGS is a series of bits which affect compilation.
400
401 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
402 use POSIX basic syntax.
403
404 If REG_NEWLINE is set, then . and [^...] don't match newline.
405 Also, regexec will try a match beginning after every newline.
406
407 If REG_ICASE is set, then we considers upper- and lowercase
408 versions of letters to be equivalent when matching.
409
410 If REG_NOSUB is set, then when PREG is passed to regexec, that
411 routine will report only success or failure, and nothing about the
412 registers.
413
414 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
415 the return codes and their meanings.) */
416
417 int
418 regcomp (preg, pattern, cflags)
419 regex_t *__restrict preg;
420 const char *__restrict pattern;
421 int cflags;
422 {
423 reg_errcode_t ret;
424 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
425 : RE_SYNTAX_POSIX_BASIC);
426
427 preg->buffer = NULL;
428 preg->allocated = 0;
429 preg->used = 0;
430
431 /* Try to allocate space for the fastmap. */
432 preg->fastmap = re_malloc (char, SBC_MAX);
433 if (BE (preg->fastmap == NULL, 0))
434 return REG_ESPACE;
435
436 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
437
438 /* If REG_NEWLINE is set, newlines are treated differently. */
439 if (cflags & REG_NEWLINE)
440 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
441 syntax &= ~RE_DOT_NEWLINE;
442 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
443 /* It also changes the matching behavior. */
444 preg->newline_anchor = 1;
445 }
446 else
447 preg->newline_anchor = 0;
448 preg->no_sub = !!(cflags & REG_NOSUB);
449 preg->translate = NULL;
450
451 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
452
453 /* POSIX doesn't distinguish between an unmatched open-group and an
454 unmatched close-group: both are REG_EPAREN. */
455 if (ret == REG_ERPAREN)
456 ret = REG_EPAREN;
457
458 /* We have already checked preg->fastmap != NULL. */
459 if (BE (ret == REG_NOERROR, 1))
460 /* Compute the fastmap now, since regexec cannot modify the pattern
461 buffer. This function nevers fails in this implementation. */
462 (void) re_compile_fastmap (preg);
463 else
464 {
465 /* Some error occurred while compiling the expression. */
466 re_free (preg->fastmap);
467 preg->fastmap = NULL;
468 }
469
470 return (int) ret;
471 }
472 #ifdef _LIBC
473 weak_alias (__regcomp, regcomp)
474 #endif
475
476 /* Returns a message corresponding to an error code, ERRCODE, returned
477 from either regcomp or regexec. We don't use PREG here. */
478
479 size_t
480 regerror (errcode, preg, errbuf, errbuf_size)
481 int errcode;
482 const regex_t *preg;
483 char *errbuf;
484 size_t errbuf_size;
485 {
486 const char *msg;
487 size_t msg_size;
488
489 if (BE (errcode < 0
490 || errcode >= (int) (sizeof (__re_error_msgid_idx)
491 / sizeof (__re_error_msgid_idx[0])), 0))
492 /* Only error codes returned by the rest of the code should be passed
493 to this routine. If we are given anything else, or if other regex
494 code generates an invalid error code, then the program has a bug.
495 Dump core so we can fix it. */
496 abort ();
497
498 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
499
500 msg_size = strlen (msg) + 1; /* Includes the null. */
501
502 if (BE (errbuf_size != 0, 1))
503 {
504 if (BE (msg_size > errbuf_size, 0))
505 {
506 #if defined HAVE_MEMPCPY || defined _LIBC
507 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
508 #else
509 memcpy (errbuf, msg, errbuf_size - 1);
510 errbuf[errbuf_size - 1] = 0;
511 #endif
512 }
513 else
514 memcpy (errbuf, msg, msg_size);
515 }
516
517 return msg_size;
518 }
519 #ifdef _LIBC
520 weak_alias (__regerror, regerror)
521 #endif
522
523
524 static void
525 free_dfa_content (re_dfa_t *dfa)
526 {
527 int i, j;
528
529 re_free (dfa->subexps);
530
531 for (i = 0; i < dfa->nodes_len; ++i)
532 {
533 re_token_t *node = dfa->nodes + i;
534 #ifdef RE_ENABLE_I18N
535 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
536 free_charset (node->opr.mbcset);
537 else
538 #endif /* RE_ENABLE_I18N */
539 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
540 re_free (node->opr.sbcset);
541 }
542 re_free (dfa->nexts);
543 for (i = 0; i < dfa->nodes_len; ++i)
544 {
545 if (dfa->eclosures != NULL)
546 re_node_set_free (dfa->eclosures + i);
547 if (dfa->inveclosures != NULL)
548 re_node_set_free (dfa->inveclosures + i);
549 if (dfa->edests != NULL)
550 re_node_set_free (dfa->edests + i);
551 }
552 re_free (dfa->edests);
553 re_free (dfa->eclosures);
554 re_free (dfa->inveclosures);
555 re_free (dfa->nodes);
556
557 for (i = 0; i <= dfa->state_hash_mask; ++i)
558 {
559 struct re_state_table_entry *entry = dfa->state_table + i;
560 for (j = 0; j < entry->num; ++j)
561 {
562 re_dfastate_t *state = entry->array[j];
563 free_state (state);
564 }
565 re_free (entry->array);
566 }
567 re_free (dfa->state_table);
568
569 if (dfa->word_char != NULL)
570 re_free (dfa->word_char);
571 #ifdef DEBUG
572 re_free (dfa->re_str);
573 #endif
574
575 re_free (dfa);
576 }
577
578
579 /* Free dynamically allocated space used by PREG. */
580
581 void
582 regfree (preg)
583 regex_t *preg;
584 {
585 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
586 if (BE (dfa != NULL, 1))
587 free_dfa_content (dfa);
588
589 re_free (preg->fastmap);
590 }
591 #ifdef _LIBC
592 weak_alias (__regfree, regfree)
593 #endif
594 \f
595 /* Entry points compatible with 4.2 BSD regex library. We don't define
596 them unless specifically requested. */
597
598 #if defined _REGEX_RE_COMP || defined _LIBC
599
600 /* BSD has one and only one pattern buffer. */
601 static struct re_pattern_buffer re_comp_buf;
602
603 char *
604 # ifdef _LIBC
605 /* Make these definitions weak in libc, so POSIX programs can redefine
606 these names if they don't use our functions, and still use
607 regcomp/regexec above without link errors. */
608 weak_function
609 # endif
610 re_comp (s)
611 const char *s;
612 {
613 reg_errcode_t ret;
614 char *fastmap;
615
616 if (!s)
617 {
618 if (!re_comp_buf.buffer)
619 return gettext ("No previous regular expression");
620 return 0;
621 }
622
623 if (re_comp_buf.buffer)
624 {
625 fastmap = re_comp_buf.fastmap;
626 re_comp_buf.fastmap = NULL;
627 __regfree (&re_comp_buf);
628 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
629 re_comp_buf.fastmap = fastmap;
630 }
631
632 if (re_comp_buf.fastmap == NULL)
633 {
634 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
635 if (re_comp_buf.fastmap == NULL)
636 return (char *) gettext (__re_error_msgid
637 + __re_error_msgid_idx[(int) REG_ESPACE]);
638 }
639
640 /* Since `re_exec' always passes NULL for the `regs' argument, we
641 don't need to initialize the pattern buffer fields which affect it. */
642
643 /* Match anchors at newlines. */
644 re_comp_buf.newline_anchor = 1;
645
646 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
647
648 if (!ret)
649 return NULL;
650
651 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
652 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
653 }
654
655 #ifdef _LIBC
656 libc_freeres_fn (free_mem)
657 {
658 __regfree (&re_comp_buf);
659 }
660 #endif
661
662 #endif /* _REGEX_RE_COMP */
663 \f
664 /* Internal entry point.
665 Compile the regular expression PATTERN, whose length is LENGTH.
666 SYNTAX indicate regular expression's syntax. */
667
668 static reg_errcode_t
669 re_compile_internal (preg, pattern, length, syntax)
670 regex_t *preg;
671 const char * pattern;
672 int length;
673 reg_syntax_t syntax;
674 {
675 reg_errcode_t err = REG_NOERROR;
676 re_dfa_t *dfa;
677 re_string_t regexp;
678
679 /* Initialize the pattern buffer. */
680 preg->fastmap_accurate = 0;
681 preg->syntax = syntax;
682 preg->not_bol = preg->not_eol = 0;
683 preg->used = 0;
684 preg->re_nsub = 0;
685 preg->can_be_null = 0;
686 preg->regs_allocated = REGS_UNALLOCATED;
687
688 /* Initialize the dfa. */
689 dfa = (re_dfa_t *) preg->buffer;
690 if (preg->allocated < sizeof (re_dfa_t))
691 {
692 /* If zero allocated, but buffer is non-null, try to realloc
693 enough space. This loses if buffer's address is bogus, but
694 that is the user's responsibility. If ->buffer is NULL this
695 is a simple allocation. */
696 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
697 if (dfa == NULL)
698 return REG_ESPACE;
699 preg->allocated = sizeof (re_dfa_t);
700 }
701 preg->buffer = (unsigned char *) dfa;
702 preg->used = sizeof (re_dfa_t);
703
704 err = init_dfa (dfa, length);
705 if (BE (err != REG_NOERROR, 0))
706 {
707 re_free (dfa);
708 preg->buffer = NULL;
709 preg->allocated = 0;
710 return err;
711 }
712 #ifdef DEBUG
713 dfa->re_str = re_malloc (char, length + 1);
714 strncpy (dfa->re_str, pattern, length + 1);
715 #endif
716
717 err = re_string_construct (&regexp, pattern, length, preg->translate,
718 syntax & RE_ICASE);
719 if (BE (err != REG_NOERROR, 0))
720 {
721 re_free (dfa);
722 preg->buffer = NULL;
723 preg->allocated = 0;
724 return err;
725 }
726
727 /* Parse the regular expression, and build a structure tree. */
728 preg->re_nsub = 0;
729 dfa->str_tree = parse (&regexp, preg, syntax, &err);
730 if (BE (dfa->str_tree == NULL, 0))
731 goto re_compile_internal_free_return;
732
733 /* Analyze the tree and collect information which is necessary to
734 create the dfa. */
735 err = analyze (dfa);
736 if (BE (err != REG_NOERROR, 0))
737 goto re_compile_internal_free_return;
738
739 /* Then create the initial state of the dfa. */
740 err = create_initial_state (dfa);
741
742 /* Release work areas. */
743 free_workarea_compile (preg);
744 re_string_destruct (&regexp);
745
746 if (BE (err != REG_NOERROR, 0))
747 {
748 re_compile_internal_free_return:
749 free_dfa_content (dfa);
750 preg->buffer = NULL;
751 preg->allocated = 0;
752 }
753
754 return err;
755 }
756
757 /* Initialize DFA. We use the length of the regular expression PAT_LEN
758 as the initial length of some arrays. */
759
760 static reg_errcode_t
761 init_dfa (dfa, pat_len)
762 re_dfa_t *dfa;
763 int pat_len;
764 {
765 int table_size;
766
767 memset (dfa, '\0', sizeof (re_dfa_t));
768
769 dfa->nodes_alloc = pat_len + 1;
770 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
771
772 dfa->states_alloc = pat_len + 1;
773
774 /* table_size = 2 ^ ceil(log pat_len) */
775 for (table_size = 1; table_size > 0; table_size <<= 1)
776 if (table_size > pat_len)
777 break;
778
779 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
780 dfa->state_hash_mask = table_size - 1;
781
782 dfa->subexps_alloc = 1;
783 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
784 dfa->word_char = NULL;
785
786 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
787 || dfa->subexps == NULL, 0))
788 {
789 /* We don't bother to free anything which was allocated. Very
790 soon the process will go down anyway. */
791 dfa->subexps = NULL;
792 dfa->state_table = NULL;
793 dfa->nodes = NULL;
794 return REG_ESPACE;
795 }
796 return REG_NOERROR;
797 }
798
799 /* Initialize WORD_CHAR table, which indicate which character is
800 "word". In this case "word" means that it is the word construction
801 character used by some operators like "\<", "\>", etc. */
802
803 static reg_errcode_t
804 init_word_char (dfa)
805 re_dfa_t *dfa;
806 {
807 int i, j, ch;
808 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
809 if (BE (dfa->word_char == NULL, 0))
810 return REG_ESPACE;
811 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
812 for (j = 0; j < UINT_BITS; ++j, ++ch)
813 if (isalnum (ch) || ch == '_')
814 dfa->word_char[i] |= 1 << j;
815 return REG_NOERROR;
816 }
817
818 /* Free the work area which are only used while compiling. */
819
820 static void
821 free_workarea_compile (preg)
822 regex_t *preg;
823 {
824 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
825 free_bin_tree (dfa->str_tree);
826 dfa->str_tree = NULL;
827 re_free (dfa->org_indices);
828 dfa->org_indices = NULL;
829 }
830
831 /* Create initial states for all contexts. */
832
833 static reg_errcode_t
834 create_initial_state (dfa)
835 re_dfa_t *dfa;
836 {
837 int first, i;
838 reg_errcode_t err;
839 re_node_set init_nodes;
840
841 /* Initial states have the epsilon closure of the node which is
842 the first node of the regular expression. */
843 first = dfa->str_tree->first;
844 dfa->init_node = first;
845 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
846 if (BE (err != REG_NOERROR, 0))
847 return err;
848
849 /* The back-references which are in initial states can epsilon transit,
850 since in this case all of the subexpressions can be null.
851 Then we add epsilon closures of the nodes which are the next nodes of
852 the back-references. */
853 if (dfa->nbackref > 0)
854 for (i = 0; i < init_nodes.nelem; ++i)
855 {
856 int node_idx = init_nodes.elems[i];
857 re_token_type_t type = dfa->nodes[node_idx].type;
858
859 int clexp_idx;
860 if (type != OP_BACK_REF)
861 continue;
862 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
863 {
864 re_token_t *clexp_node;
865 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
866 if (clexp_node->type == OP_CLOSE_SUBEXP
867 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
868 break;
869 }
870 if (clexp_idx == init_nodes.nelem)
871 continue;
872
873 if (type == OP_BACK_REF)
874 {
875 int dest_idx = dfa->edests[node_idx].elems[0];
876 if (!re_node_set_contains (&init_nodes, dest_idx))
877 {
878 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
879 i = 0;
880 }
881 }
882 }
883
884 /* It must be the first time to invoke acquire_state. */
885 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
886 /* We don't check ERR here, since the initial state must not be NULL. */
887 if (BE (dfa->init_state == NULL, 0))
888 return err;
889 if (dfa->init_state->has_constraint)
890 {
891 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
892 CONTEXT_WORD);
893 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
894 CONTEXT_NEWLINE);
895 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
896 &init_nodes,
897 CONTEXT_NEWLINE
898 | CONTEXT_BEGBUF);
899 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
900 || dfa->init_state_begbuf == NULL, 0))
901 return err;
902 }
903 else
904 dfa->init_state_word = dfa->init_state_nl
905 = dfa->init_state_begbuf = dfa->init_state;
906
907 re_node_set_free (&init_nodes);
908 return REG_NOERROR;
909 }
910 \f
911 /* Analyze the structure tree, and calculate "first", "next", "edest",
912 "eclosure", and "inveclosure". */
913
914 static reg_errcode_t
915 analyze (dfa)
916 re_dfa_t *dfa;
917 {
918 int i;
919 reg_errcode_t ret;
920
921 /* Allocate arrays. */
922 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
923 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
924 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
925 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
926 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
927 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
928 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
929 return REG_ESPACE;
930 /* Initialize them. */
931 for (i = 0; i < dfa->nodes_len; ++i)
932 {
933 dfa->nexts[i] = -1;
934 re_node_set_init_empty (dfa->edests + i);
935 re_node_set_init_empty (dfa->eclosures + i);
936 re_node_set_init_empty (dfa->inveclosures + i);
937 }
938
939 ret = analyze_tree (dfa, dfa->str_tree);
940 if (BE (ret == REG_NOERROR, 1))
941 {
942 ret = calc_eclosure (dfa);
943 if (ret == REG_NOERROR)
944 calc_inveclosure (dfa);
945 }
946 return ret;
947 }
948
949 /* Helper functions for analyze.
950 This function calculate "first", "next", and "edest" for the subtree
951 whose root is NODE. */
952
953 static reg_errcode_t
954 analyze_tree (dfa, node)
955 re_dfa_t *dfa;
956 bin_tree_t *node;
957 {
958 reg_errcode_t ret;
959 if (node->first == -1)
960 calc_first (dfa, node);
961 if (node->next == -1)
962 calc_next (dfa, node);
963 if (node->eclosure.nelem == 0)
964 calc_epsdest (dfa, node);
965 /* Calculate "first" etc. for the left child. */
966 if (node->left != NULL)
967 {
968 ret = analyze_tree (dfa, node->left);
969 if (BE (ret != REG_NOERROR, 0))
970 return ret;
971 }
972 /* Calculate "first" etc. for the right child. */
973 if (node->right != NULL)
974 {
975 ret = analyze_tree (dfa, node->right);
976 if (BE (ret != REG_NOERROR, 0))
977 return ret;
978 }
979 return REG_NOERROR;
980 }
981
982 /* Calculate "first" for the node NODE. */
983 static void
984 calc_first (dfa, node)
985 re_dfa_t *dfa;
986 bin_tree_t *node;
987 {
988 int idx, type;
989 idx = node->node_idx;
990 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
991
992 switch (type)
993 {
994 #ifdef DEBUG
995 case OP_OPEN_BRACKET:
996 case OP_CLOSE_BRACKET:
997 case OP_OPEN_DUP_NUM:
998 case OP_CLOSE_DUP_NUM:
999 case OP_NON_MATCH_LIST:
1000 case OP_OPEN_COLL_ELEM:
1001 case OP_CLOSE_COLL_ELEM:
1002 case OP_OPEN_EQUIV_CLASS:
1003 case OP_CLOSE_EQUIV_CLASS:
1004 case OP_OPEN_CHAR_CLASS:
1005 case OP_CLOSE_CHAR_CLASS:
1006 /* These must not be appeared here. */
1007 assert (0);
1008 #endif
1009 case END_OF_RE:
1010 case CHARACTER:
1011 case OP_PERIOD:
1012 case OP_DUP_ASTERISK:
1013 case OP_DUP_QUESTION:
1014 #ifdef RE_ENABLE_I18N
1015 case COMPLEX_BRACKET:
1016 #endif /* RE_ENABLE_I18N */
1017 case SIMPLE_BRACKET:
1018 case OP_BACK_REF:
1019 case ANCHOR:
1020 case OP_OPEN_SUBEXP:
1021 case OP_CLOSE_SUBEXP:
1022 node->first = idx;
1023 break;
1024 case OP_DUP_PLUS:
1025 #ifdef DEBUG
1026 assert (node->left != NULL);
1027 #endif
1028 if (node->left->first == -1)
1029 calc_first (dfa, node->left);
1030 node->first = node->left->first;
1031 break;
1032 case OP_ALT:
1033 node->first = idx;
1034 break;
1035 /* else fall through */
1036 default:
1037 #ifdef DEBUG
1038 assert (node->left != NULL);
1039 #endif
1040 if (node->left->first == -1)
1041 calc_first (dfa, node->left);
1042 node->first = node->left->first;
1043 break;
1044 }
1045 }
1046
1047 /* Calculate "next" for the node NODE. */
1048
1049 static void
1050 calc_next (dfa, node)
1051 re_dfa_t *dfa;
1052 bin_tree_t *node;
1053 {
1054 int idx, type;
1055 bin_tree_t *parent = node->parent;
1056 if (parent == NULL)
1057 {
1058 node->next = -1;
1059 idx = node->node_idx;
1060 if (node->type == 0)
1061 dfa->nexts[idx] = node->next;
1062 return;
1063 }
1064
1065 idx = parent->node_idx;
1066 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1067
1068 switch (type)
1069 {
1070 case OP_DUP_ASTERISK:
1071 case OP_DUP_PLUS:
1072 node->next = idx;
1073 break;
1074 case CONCAT:
1075 if (parent->left == node)
1076 {
1077 if (parent->right->first == -1)
1078 calc_first (dfa, parent->right);
1079 node->next = parent->right->first;
1080 break;
1081 }
1082 /* else fall through */
1083 default:
1084 if (parent->next == -1)
1085 calc_next (dfa, parent);
1086 node->next = parent->next;
1087 break;
1088 }
1089 idx = node->node_idx;
1090 if (node->type == 0)
1091 dfa->nexts[idx] = node->next;
1092 }
1093
1094 /* Calculate "edest" for the node NODE. */
1095
1096 static void
1097 calc_epsdest (dfa, node)
1098 re_dfa_t *dfa;
1099 bin_tree_t *node;
1100 {
1101 int idx;
1102 idx = node->node_idx;
1103 if (node->type == 0)
1104 {
1105 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1106 || dfa->nodes[idx].type == OP_DUP_PLUS
1107 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1108 {
1109 if (node->left->first == -1)
1110 calc_first (dfa, node->left);
1111 if (node->next == -1)
1112 calc_next (dfa, node);
1113 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1114 node->next);
1115 }
1116 else if (dfa->nodes[idx].type == OP_ALT)
1117 {
1118 int left, right;
1119 if (node->left != NULL)
1120 {
1121 if (node->left->first == -1)
1122 calc_first (dfa, node->left);
1123 left = node->left->first;
1124 }
1125 else
1126 {
1127 if (node->next == -1)
1128 calc_next (dfa, node);
1129 left = node->next;
1130 }
1131 if (node->right != NULL)
1132 {
1133 if (node->right->first == -1)
1134 calc_first (dfa, node->right);
1135 right = node->right->first;
1136 }
1137 else
1138 {
1139 if (node->next == -1)
1140 calc_next (dfa, node);
1141 right = node->next;
1142 }
1143 re_node_set_init_2 (dfa->edests + idx, left, right);
1144 }
1145 else if (dfa->nodes[idx].type == ANCHOR
1146 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1147 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1148 || dfa->nodes[idx].type == OP_BACK_REF)
1149 re_node_set_init_1 (dfa->edests + idx, node->next);
1150 }
1151 }
1152
1153 /* Duplicate the epsilon closure of the node ROOT_NODE.
1154 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1155 to their own constraint. */
1156
1157 static reg_errcode_t
1158 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1159 init_constraint)
1160 re_dfa_t *dfa;
1161 int top_org_node, top_clone_node, root_node;
1162 unsigned int init_constraint;
1163 {
1164 reg_errcode_t err;
1165 int org_node, clone_node, ret;
1166 unsigned int constraint = init_constraint;
1167 for (org_node = top_org_node, clone_node = top_clone_node;;)
1168 {
1169 int org_dest, clone_dest;
1170 if (dfa->nodes[org_node].type == OP_BACK_REF)
1171 {
1172 /* If the back reference epsilon-transit, its destination must
1173 also have the constraint. Then duplicate the epsilon closure
1174 of the destination of the back reference, and store it in
1175 edests of the back reference. */
1176 org_dest = dfa->nexts[org_node];
1177 re_node_set_empty (dfa->edests + clone_node);
1178 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1179 if (BE (err != REG_NOERROR, 0))
1180 return err;
1181 dfa->nexts[clone_node] = dfa->nexts[org_node];
1182 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1183 if (BE (ret < 0, 0))
1184 return REG_ESPACE;
1185 }
1186 else if (dfa->edests[org_node].nelem == 0)
1187 {
1188 /* In case of the node can't epsilon-transit, don't duplicate the
1189 destination and store the original destination as the
1190 destination of the node. */
1191 dfa->nexts[clone_node] = dfa->nexts[org_node];
1192 break;
1193 }
1194 else if (dfa->edests[org_node].nelem == 1)
1195 {
1196 /* In case of the node can epsilon-transit, and it has only one
1197 destination. */
1198 org_dest = dfa->edests[org_node].elems[0];
1199 re_node_set_empty (dfa->edests + clone_node);
1200 if (dfa->nodes[org_node].type == ANCHOR)
1201 {
1202 /* In case of the node has another constraint, append it. */
1203 if (org_node == root_node && clone_node != org_node)
1204 {
1205 /* ...but if the node is root_node itself, it means the
1206 epsilon closure have a loop, then tie it to the
1207 destination of the root_node. */
1208 ret = re_node_set_insert (dfa->edests + clone_node,
1209 org_dest);
1210 if (BE (ret < 0, 0))
1211 return REG_ESPACE;
1212 break;
1213 }
1214 constraint |= dfa->nodes[org_node].opr.ctx_type;
1215 }
1216 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1217 if (BE (err != REG_NOERROR, 0))
1218 return err;
1219 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1220 if (BE (ret < 0, 0))
1221 return REG_ESPACE;
1222 }
1223 else /* dfa->edests[org_node].nelem == 2 */
1224 {
1225 /* In case of the node can epsilon-transit, and it has two
1226 destinations. E.g. '|', '*', '+', '?'. */
1227 org_dest = dfa->edests[org_node].elems[0];
1228 re_node_set_empty (dfa->edests + clone_node);
1229 /* Search for a duplicated node which satisfies the constraint. */
1230 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1231 if (clone_dest == -1)
1232 {
1233 /* There are no such a duplicated node, create a new one. */
1234 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1235 if (BE (err != REG_NOERROR, 0))
1236 return err;
1237 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1238 if (BE (ret < 0, 0))
1239 return REG_ESPACE;
1240 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1241 root_node, constraint);
1242 if (BE (err != REG_NOERROR, 0))
1243 return err;
1244 }
1245 else
1246 {
1247 /* There are a duplicated node which satisfy the constraint,
1248 use it to avoid infinite loop. */
1249 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1250 if (BE (ret < 0, 0))
1251 return REG_ESPACE;
1252 }
1253
1254 org_dest = dfa->edests[org_node].elems[1];
1255 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1256 if (BE (err != REG_NOERROR, 0))
1257 return err;
1258 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1259 if (BE (ret < 0, 0))
1260 return REG_ESPACE;
1261 }
1262 org_node = org_dest;
1263 clone_node = clone_dest;
1264 }
1265 return REG_NOERROR;
1266 }
1267
1268 /* Search for a node which is duplicated from the node ORG_NODE, and
1269 satisfies the constraint CONSTRAINT. */
1270
1271 static int
1272 search_duplicated_node (dfa, org_node, constraint)
1273 re_dfa_t *dfa;
1274 int org_node;
1275 unsigned int constraint;
1276 {
1277 int idx;
1278 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1279 {
1280 if (org_node == dfa->org_indices[idx]
1281 && constraint == dfa->nodes[idx].constraint)
1282 return idx; /* Found. */
1283 }
1284 return -1; /* Not found. */
1285 }
1286
1287 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1288 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1289 otherwise return the error code. */
1290
1291 static reg_errcode_t
1292 duplicate_node (new_idx, dfa, org_idx, constraint)
1293 re_dfa_t *dfa;
1294 int *new_idx, org_idx;
1295 unsigned int constraint;
1296 {
1297 re_token_t dup;
1298 int dup_idx;
1299
1300 dup = dfa->nodes[org_idx];
1301 dup_idx = re_dfa_add_node (dfa, dup, 1);
1302 if (BE (dup_idx == -1, 0))
1303 return REG_ESPACE;
1304 dfa->nodes[dup_idx].constraint = constraint;
1305 if (dfa->nodes[org_idx].type == ANCHOR)
1306 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1307 dfa->nodes[dup_idx].duplicated = 1;
1308 re_node_set_init_empty (dfa->edests + dup_idx);
1309 re_node_set_init_empty (dfa->eclosures + dup_idx);
1310 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1311
1312 /* Store the index of the original node. */
1313 dfa->org_indices[dup_idx] = org_idx;
1314 *new_idx = dup_idx;
1315 return REG_NOERROR;
1316 }
1317
1318 static void
1319 calc_inveclosure (dfa)
1320 re_dfa_t *dfa;
1321 {
1322 int src, idx, dest;
1323 for (src = 0; src < dfa->nodes_len; ++src)
1324 {
1325 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1326 {
1327 dest = dfa->eclosures[src].elems[idx];
1328 re_node_set_insert (dfa->inveclosures + dest, src);
1329 }
1330 }
1331 }
1332
1333 /* Calculate "eclosure" for all the node in DFA. */
1334
1335 static reg_errcode_t
1336 calc_eclosure (dfa)
1337 re_dfa_t *dfa;
1338 {
1339 int node_idx, incomplete;
1340 #ifdef DEBUG
1341 assert (dfa->nodes_len > 0);
1342 #endif
1343 incomplete = 0;
1344 /* For each nodes, calculate epsilon closure. */
1345 for (node_idx = 0; ; ++node_idx)
1346 {
1347 reg_errcode_t err;
1348 re_node_set eclosure_elem;
1349 if (node_idx == dfa->nodes_len)
1350 {
1351 if (!incomplete)
1352 break;
1353 incomplete = 0;
1354 node_idx = 0;
1355 }
1356
1357 #ifdef DEBUG
1358 assert (dfa->eclosures[node_idx].nelem != -1);
1359 #endif
1360 /* If we have already calculated, skip it. */
1361 if (dfa->eclosures[node_idx].nelem != 0)
1362 continue;
1363 /* Calculate epsilon closure of `node_idx'. */
1364 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1365 if (BE (err != REG_NOERROR, 0))
1366 return err;
1367
1368 if (dfa->eclosures[node_idx].nelem == 0)
1369 {
1370 incomplete = 1;
1371 re_node_set_free (&eclosure_elem);
1372 }
1373 }
1374 return REG_NOERROR;
1375 }
1376
1377 /* Calculate epsilon closure of NODE. */
1378
1379 static reg_errcode_t
1380 calc_eclosure_iter (new_set, dfa, node, root)
1381 re_node_set *new_set;
1382 re_dfa_t *dfa;
1383 int node, root;
1384 {
1385 reg_errcode_t err;
1386 unsigned int constraint;
1387 int i, incomplete;
1388 re_node_set eclosure;
1389 incomplete = 0;
1390 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1391 if (BE (err != REG_NOERROR, 0))
1392 return err;
1393
1394 /* This indicates that we are calculating this node now.
1395 We reference this value to avoid infinite loop. */
1396 dfa->eclosures[node].nelem = -1;
1397
1398 constraint = ((dfa->nodes[node].type == ANCHOR)
1399 ? dfa->nodes[node].opr.ctx_type : 0);
1400 /* If the current node has constraints, duplicate all nodes.
1401 Since they must inherit the constraints. */
1402 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1403 {
1404 int org_node, cur_node;
1405 org_node = cur_node = node;
1406 err = duplicate_node_closure (dfa, node, node, node, constraint);
1407 if (BE (err != REG_NOERROR, 0))
1408 return err;
1409 }
1410
1411 /* Expand each epsilon destination nodes. */
1412 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1413 for (i = 0; i < dfa->edests[node].nelem; ++i)
1414 {
1415 re_node_set eclosure_elem;
1416 int edest = dfa->edests[node].elems[i];
1417 /* If calculating the epsilon closure of `edest' is in progress,
1418 return intermediate result. */
1419 if (dfa->eclosures[edest].nelem == -1)
1420 {
1421 incomplete = 1;
1422 continue;
1423 }
1424 /* If we haven't calculated the epsilon closure of `edest' yet,
1425 calculate now. Otherwise use calculated epsilon closure. */
1426 if (dfa->eclosures[edest].nelem == 0)
1427 {
1428 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1429 if (BE (err != REG_NOERROR, 0))
1430 return err;
1431 }
1432 else
1433 eclosure_elem = dfa->eclosures[edest];
1434 /* Merge the epsilon closure of `edest'. */
1435 re_node_set_merge (&eclosure, &eclosure_elem);
1436 /* If the epsilon closure of `edest' is incomplete,
1437 the epsilon closure of this node is also incomplete. */
1438 if (dfa->eclosures[edest].nelem == 0)
1439 {
1440 incomplete = 1;
1441 re_node_set_free (&eclosure_elem);
1442 }
1443 }
1444
1445 /* Epsilon closures include itself. */
1446 re_node_set_insert (&eclosure, node);
1447 if (incomplete && !root)
1448 dfa->eclosures[node].nelem = 0;
1449 else
1450 dfa->eclosures[node] = eclosure;
1451 *new_set = eclosure;
1452 return REG_NOERROR;
1453 }
1454 \f
1455 /* Functions for token which are used in the parser. */
1456
1457 /* Fetch a token from INPUT.
1458 We must not use this function inside bracket expressions. */
1459
1460 static re_token_t
1461 fetch_token (input, syntax)
1462 re_string_t *input;
1463 reg_syntax_t syntax;
1464 {
1465 re_token_t token;
1466 int consumed_byte;
1467 consumed_byte = peek_token (&token, input, syntax);
1468 re_string_skip_bytes (input, consumed_byte);
1469 return token;
1470 }
1471
1472 /* Peek a token from INPUT, and return the length of the token.
1473 We must not use this function inside bracket expressions. */
1474
1475 static int
1476 peek_token (token, input, syntax)
1477 re_token_t *token;
1478 re_string_t *input;
1479 reg_syntax_t syntax;
1480 {
1481 unsigned char c;
1482
1483 if (re_string_eoi (input))
1484 {
1485 token->type = END_OF_RE;
1486 return 0;
1487 }
1488
1489 c = re_string_peek_byte (input, 0);
1490 token->opr.c = c;
1491
1492 #ifdef RE_ENABLE_I18N
1493 token->mb_partial = 0;
1494 if (MB_CUR_MAX > 1 &&
1495 !re_string_first_byte (input, re_string_cur_idx (input)))
1496 {
1497 token->type = CHARACTER;
1498 token->mb_partial = 1;
1499 return 1;
1500 }
1501 #endif
1502 if (c == '\\')
1503 {
1504 unsigned char c2;
1505 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1506 {
1507 token->type = BACK_SLASH;
1508 return 1;
1509 }
1510
1511 c2 = re_string_peek_byte_case (input, 1);
1512 token->opr.c = c2;
1513 token->type = CHARACTER;
1514 switch (c2)
1515 {
1516 case '|':
1517 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1518 token->type = OP_ALT;
1519 break;
1520 case '1': case '2': case '3': case '4': case '5':
1521 case '6': case '7': case '8': case '9':
1522 if (!(syntax & RE_NO_BK_REFS))
1523 {
1524 token->type = OP_BACK_REF;
1525 token->opr.idx = c2 - '0';
1526 }
1527 break;
1528 case '<':
1529 if (!(syntax & RE_NO_GNU_OPS))
1530 {
1531 token->type = ANCHOR;
1532 token->opr.idx = WORD_FIRST;
1533 }
1534 break;
1535 case '>':
1536 if (!(syntax & RE_NO_GNU_OPS))
1537 {
1538 token->type = ANCHOR;
1539 token->opr.idx = WORD_LAST;
1540 }
1541 break;
1542 case 'b':
1543 if (!(syntax & RE_NO_GNU_OPS))
1544 {
1545 token->type = ANCHOR;
1546 token->opr.idx = WORD_DELIM;
1547 }
1548 break;
1549 case 'B':
1550 if (!(syntax & RE_NO_GNU_OPS))
1551 {
1552 token->type = ANCHOR;
1553 token->opr.idx = INSIDE_WORD;
1554 }
1555 break;
1556 case 'w':
1557 if (!(syntax & RE_NO_GNU_OPS))
1558 token->type = OP_WORD;
1559 break;
1560 case 'W':
1561 if (!(syntax & RE_NO_GNU_OPS))
1562 token->type = OP_NOTWORD;
1563 break;
1564 case '`':
1565 if (!(syntax & RE_NO_GNU_OPS))
1566 {
1567 token->type = ANCHOR;
1568 token->opr.idx = BUF_FIRST;
1569 }
1570 break;
1571 case '\'':
1572 if (!(syntax & RE_NO_GNU_OPS))
1573 {
1574 token->type = ANCHOR;
1575 token->opr.idx = BUF_LAST;
1576 }
1577 break;
1578 case '(':
1579 if (!(syntax & RE_NO_BK_PARENS))
1580 token->type = OP_OPEN_SUBEXP;
1581 break;
1582 case ')':
1583 if (!(syntax & RE_NO_BK_PARENS))
1584 token->type = OP_CLOSE_SUBEXP;
1585 break;
1586 case '+':
1587 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1588 token->type = OP_DUP_PLUS;
1589 break;
1590 case '?':
1591 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1592 token->type = OP_DUP_QUESTION;
1593 break;
1594 case '{':
1595 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1596 token->type = OP_OPEN_DUP_NUM;
1597 break;
1598 case '}':
1599 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1600 token->type = OP_CLOSE_DUP_NUM;
1601 break;
1602 default:
1603 break;
1604 }
1605 return 2;
1606 }
1607
1608 token->type = CHARACTER;
1609 switch (c)
1610 {
1611 case '\n':
1612 if (syntax & RE_NEWLINE_ALT)
1613 token->type = OP_ALT;
1614 break;
1615 case '|':
1616 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1617 token->type = OP_ALT;
1618 break;
1619 case '*':
1620 token->type = OP_DUP_ASTERISK;
1621 break;
1622 case '+':
1623 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1624 token->type = OP_DUP_PLUS;
1625 break;
1626 case '?':
1627 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1628 token->type = OP_DUP_QUESTION;
1629 break;
1630 case '{':
1631 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1632 token->type = OP_OPEN_DUP_NUM;
1633 break;
1634 case '}':
1635 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1636 token->type = OP_CLOSE_DUP_NUM;
1637 break;
1638 case '(':
1639 if (syntax & RE_NO_BK_PARENS)
1640 token->type = OP_OPEN_SUBEXP;
1641 break;
1642 case ')':
1643 if (syntax & RE_NO_BK_PARENS)
1644 token->type = OP_CLOSE_SUBEXP;
1645 break;
1646 case '[':
1647 token->type = OP_OPEN_BRACKET;
1648 break;
1649 case '.':
1650 token->type = OP_PERIOD;
1651 break;
1652 case '^':
1653 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1654 re_string_cur_idx (input) != 0)
1655 {
1656 char prev = re_string_peek_byte (input, -1);
1657 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1658 break;
1659 }
1660 token->type = ANCHOR;
1661 token->opr.idx = LINE_FIRST;
1662 break;
1663 case '$':
1664 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1665 re_string_cur_idx (input) + 1 != re_string_length (input))
1666 {
1667 re_token_t next;
1668 re_string_skip_bytes (input, 1);
1669 peek_token (&next, input, syntax);
1670 re_string_skip_bytes (input, -1);
1671 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1672 break;
1673 }
1674 token->type = ANCHOR;
1675 token->opr.idx = LINE_LAST;
1676 break;
1677 default:
1678 break;
1679 }
1680 return 1;
1681 }
1682
1683 /* Peek a token from INPUT, and return the length of the token.
1684 We must not use this function out of bracket expressions. */
1685
1686 static int
1687 peek_token_bracket (token, input, syntax)
1688 re_token_t *token;
1689 re_string_t *input;
1690 reg_syntax_t syntax;
1691 {
1692 unsigned char c;
1693 if (re_string_eoi (input))
1694 {
1695 token->type = END_OF_RE;
1696 return 0;
1697 }
1698 c = re_string_peek_byte (input, 0);
1699 token->opr.c = c;
1700
1701 #ifdef RE_ENABLE_I18N
1702 if (MB_CUR_MAX > 1 &&
1703 !re_string_first_byte (input, re_string_cur_idx (input)))
1704 {
1705 token->type = CHARACTER;
1706 return 1;
1707 }
1708 #endif /* RE_ENABLE_I18N */
1709
1710 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1711 {
1712 /* In this case, '\' escape a character. */
1713 unsigned char c2;
1714 re_string_skip_bytes (input, 1);
1715 c2 = re_string_peek_byte (input, 0);
1716 token->opr.c = c2;
1717 token->type = CHARACTER;
1718 return 1;
1719 }
1720 if (c == '[') /* '[' is a special char in a bracket exps. */
1721 {
1722 unsigned char c2;
1723 int token_len;
1724 c2 = re_string_peek_byte (input, 1);
1725 token->opr.c = c2;
1726 token_len = 2;
1727 switch (c2)
1728 {
1729 case '.':
1730 token->type = OP_OPEN_COLL_ELEM;
1731 break;
1732 case '=':
1733 token->type = OP_OPEN_EQUIV_CLASS;
1734 break;
1735 case ':':
1736 if (syntax & RE_CHAR_CLASSES)
1737 {
1738 token->type = OP_OPEN_CHAR_CLASS;
1739 break;
1740 }
1741 /* else fall through. */
1742 default:
1743 token->type = CHARACTER;
1744 token->opr.c = c;
1745 token_len = 1;
1746 break;
1747 }
1748 return token_len;
1749 }
1750 switch (c)
1751 {
1752 case '-':
1753 token->type = OP_CHARSET_RANGE;
1754 break;
1755 case ']':
1756 token->type = OP_CLOSE_BRACKET;
1757 break;
1758 case '^':
1759 token->type = OP_NON_MATCH_LIST;
1760 break;
1761 default:
1762 token->type = CHARACTER;
1763 }
1764 return 1;
1765 }
1766 \f
1767 /* Functions for parser. */
1768
1769 /* Entry point of the parser.
1770 Parse the regular expression REGEXP and return the structure tree.
1771 If an error is occured, ERR is set by error code, and return NULL.
1772 This function build the following tree, from regular expression <reg_exp>:
1773 CAT
1774 / \
1775 / \
1776 <reg_exp> EOR
1777
1778 CAT means concatenation.
1779 EOR means end of regular expression. */
1780
1781 static bin_tree_t *
1782 parse (regexp, preg, syntax, err)
1783 re_string_t *regexp;
1784 regex_t *preg;
1785 reg_syntax_t syntax;
1786 reg_errcode_t *err;
1787 {
1788 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1789 bin_tree_t *tree, *eor, *root;
1790 re_token_t current_token;
1791 int new_idx;
1792 current_token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1793 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1794 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1795 return NULL;
1796 new_idx = re_dfa_add_node (dfa, current_token, 0);
1797 eor = create_tree (NULL, NULL, 0, new_idx);
1798 if (tree != NULL)
1799 root = create_tree (tree, eor, CONCAT, 0);
1800 else
1801 root = eor;
1802 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1803 {
1804 *err = REG_ESPACE;
1805 return NULL;
1806 }
1807 return root;
1808 }
1809
1810 /* This function build the following tree, from regular expression
1811 <branch1>|<branch2>:
1812 ALT
1813 / \
1814 / \
1815 <branch1> <branch2>
1816
1817 ALT means alternative, which represents the operator `|'. */
1818
1819 static bin_tree_t *
1820 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1821 re_string_t *regexp;
1822 regex_t *preg;
1823 re_token_t *token;
1824 reg_syntax_t syntax;
1825 int nest;
1826 reg_errcode_t *err;
1827 {
1828 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1829 bin_tree_t *tree, *branch = NULL;
1830 int new_idx;
1831 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1832 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1833 return NULL;
1834
1835 while (token->type == OP_ALT)
1836 {
1837 re_token_t alt_token = *token;
1838 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1839 *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1840 if (token->type != OP_ALT && token->type != END_OF_RE
1841 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1842 {
1843 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1844 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1845 {
1846 free_bin_tree (tree);
1847 return NULL;
1848 }
1849 }
1850 else
1851 branch = NULL;
1852 tree = create_tree (tree, branch, 0, new_idx);
1853 if (BE (new_idx == -1 || tree == NULL, 0))
1854 {
1855 *err = REG_ESPACE;
1856 return NULL;
1857 }
1858 dfa->has_plural_match = 1;
1859 }
1860 return tree;
1861 }
1862
1863 /* This function build the following tree, from regular expression
1864 <exp1><exp2>:
1865 CAT
1866 / \
1867 / \
1868 <exp1> <exp2>
1869
1870 CAT means concatenation. */
1871
1872 static bin_tree_t *
1873 parse_branch (regexp, preg, token, syntax, nest, err)
1874 re_string_t *regexp;
1875 regex_t *preg;
1876 re_token_t *token;
1877 reg_syntax_t syntax;
1878 int nest;
1879 reg_errcode_t *err;
1880 {
1881 bin_tree_t *tree, *exp;
1882 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1883 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1884 return NULL;
1885
1886 while (token->type != OP_ALT && token->type != END_OF_RE
1887 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1888 {
1889 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1890 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1891 {
1892 free_bin_tree (tree);
1893 return NULL;
1894 }
1895 if (tree != NULL && exp != NULL)
1896 {
1897 tree = create_tree (tree, exp, CONCAT, 0);
1898 if (tree == NULL)
1899 {
1900 *err = REG_ESPACE;
1901 return NULL;
1902 }
1903 }
1904 else if (tree == NULL)
1905 tree = exp;
1906 /* Otherwise exp == NULL, we don't need to create new tree. */
1907 }
1908 return tree;
1909 }
1910
1911 /* This function build the following tree, from regular expression a*:
1912 *
1913 |
1914 a
1915 */
1916
1917 static bin_tree_t *
1918 parse_expression (regexp, preg, token, syntax, nest, err)
1919 re_string_t *regexp;
1920 regex_t *preg;
1921 re_token_t *token;
1922 reg_syntax_t syntax;
1923 int nest;
1924 reg_errcode_t *err;
1925 {
1926 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1927 bin_tree_t *tree;
1928 int new_idx;
1929 switch (token->type)
1930 {
1931 case CHARACTER:
1932 new_idx = re_dfa_add_node (dfa, *token, 0);
1933 tree = create_tree (NULL, NULL, 0, new_idx);
1934 if (BE (new_idx == -1 || tree == NULL, 0))
1935 {
1936 *err = REG_ESPACE;
1937 return NULL;
1938 }
1939 #ifdef RE_ENABLE_I18N
1940 if (MB_CUR_MAX > 1)
1941 {
1942 while (!re_string_eoi (regexp)
1943 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1944 {
1945 bin_tree_t *mbc_remain;
1946 *token = fetch_token (regexp, syntax);
1947 new_idx = re_dfa_add_node (dfa, *token, 0);
1948 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1949 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1950 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1951 {
1952 *err = REG_ESPACE;
1953 return NULL;
1954 }
1955 }
1956 }
1957 #endif
1958 break;
1959 case OP_OPEN_SUBEXP:
1960 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1961 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1962 return NULL;
1963 break;
1964 case OP_OPEN_BRACKET:
1965 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1966 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1967 return NULL;
1968 break;
1969 case OP_BACK_REF:
1970 if (BE (preg->re_nsub < token->opr.idx
1971 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1972 {
1973 *err = REG_ESUBREG;
1974 return NULL;
1975 }
1976 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
1977 new_idx = re_dfa_add_node (dfa, *token, 0);
1978 tree = create_tree (NULL, NULL, 0, new_idx);
1979 if (BE (new_idx == -1 || tree == NULL, 0))
1980 {
1981 *err = REG_ESPACE;
1982 return NULL;
1983 }
1984 ++dfa->nbackref;
1985 dfa->has_mb_node = 1;
1986 break;
1987 case OP_DUP_ASTERISK:
1988 case OP_DUP_PLUS:
1989 case OP_DUP_QUESTION:
1990 case OP_OPEN_DUP_NUM:
1991 if (syntax & RE_CONTEXT_INVALID_OPS)
1992 {
1993 *err = REG_BADRPT;
1994 return NULL;
1995 }
1996 else if (syntax & RE_CONTEXT_INDEP_OPS)
1997 {
1998 *token = fetch_token (regexp, syntax);
1999 return parse_expression (regexp, preg, token, syntax, nest, err);
2000 }
2001 /* else fall through */
2002 case OP_CLOSE_SUBEXP:
2003 if ((token->type == OP_CLOSE_SUBEXP) &&
2004 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2005 {
2006 *err = REG_ERPAREN;
2007 return NULL;
2008 }
2009 /* else fall through */
2010 case OP_CLOSE_DUP_NUM:
2011 /* We treat it as a normal character. */
2012
2013 /* Then we can these characters as normal characters. */
2014 token->type = CHARACTER;
2015 new_idx = re_dfa_add_node (dfa, *token, 0);
2016 tree = create_tree (NULL, NULL, 0, new_idx);
2017 if (BE (new_idx == -1 || tree == NULL, 0))
2018 {
2019 *err = REG_ESPACE;
2020 return NULL;
2021 }
2022 break;
2023 case ANCHOR:
2024 if (dfa->word_char == NULL)
2025 {
2026 *err = init_word_char (dfa);
2027 if (BE (*err != REG_NOERROR, 0))
2028 return NULL;
2029 }
2030 if (token->opr.ctx_type == WORD_DELIM)
2031 {
2032 bin_tree_t *tree_first, *tree_last;
2033 int idx_first, idx_last;
2034 token->opr.ctx_type = WORD_FIRST;
2035 idx_first = re_dfa_add_node (dfa, *token, 0);
2036 tree_first = create_tree (NULL, NULL, 0, idx_first);
2037 token->opr.ctx_type = WORD_LAST;
2038 idx_last = re_dfa_add_node (dfa, *token, 0);
2039 tree_last = create_tree (NULL, NULL, 0, idx_last);
2040 token->type = OP_ALT;
2041 new_idx = re_dfa_add_node (dfa, *token, 0);
2042 tree = create_tree (tree_first, tree_last, 0, new_idx);
2043 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2044 || tree_first == NULL || tree_last == NULL
2045 || tree == NULL, 0))
2046 {
2047 *err = REG_ESPACE;
2048 return NULL;
2049 }
2050 }
2051 else
2052 {
2053 new_idx = re_dfa_add_node (dfa, *token, 0);
2054 tree = create_tree (NULL, NULL, 0, new_idx);
2055 if (BE (new_idx == -1 || tree == NULL, 0))
2056 {
2057 *err = REG_ESPACE;
2058 return NULL;
2059 }
2060 }
2061 /* We must return here, since ANCHORs can't be followed
2062 by repetition operators.
2063 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2064 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2065 *token = fetch_token (regexp, syntax);
2066 return tree;
2067 case OP_PERIOD:
2068 new_idx = re_dfa_add_node (dfa, *token, 0);
2069 tree = create_tree (NULL, NULL, 0, new_idx);
2070 if (BE (new_idx == -1 || tree == NULL, 0))
2071 {
2072 *err = REG_ESPACE;
2073 return NULL;
2074 }
2075 if (MB_CUR_MAX > 1)
2076 dfa->has_mb_node = 1;
2077 break;
2078 case OP_WORD:
2079 tree = build_word_op (dfa, regexp->trans, 0, err);
2080 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2081 return NULL;
2082 break;
2083 case OP_NOTWORD:
2084 tree = build_word_op (dfa, regexp->trans, 1, err);
2085 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2086 return NULL;
2087 break;
2088 case OP_ALT:
2089 case END_OF_RE:
2090 return NULL;
2091 case BACK_SLASH:
2092 *err = REG_EESCAPE;
2093 return NULL;
2094 default:
2095 /* Must not happen? */
2096 #ifdef DEBUG
2097 assert (0);
2098 #endif
2099 return NULL;
2100 }
2101 *token = fetch_token (regexp, syntax);
2102
2103 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2104 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2105 {
2106 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2107 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2108 return NULL;
2109 dfa->has_plural_match = 1;
2110 }
2111
2112 return tree;
2113 }
2114
2115 /* This function build the following tree, from regular expression
2116 (<reg_exp>):
2117 SUBEXP
2118 |
2119 <reg_exp>
2120 */
2121
2122 static bin_tree_t *
2123 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2124 re_string_t *regexp;
2125 regex_t *preg;
2126 re_token_t *token;
2127 reg_syntax_t syntax;
2128 int nest;
2129 reg_errcode_t *err;
2130 {
2131 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2132 bin_tree_t *tree, *left_par, *right_par;
2133 size_t cur_nsub;
2134 int new_idx;
2135 cur_nsub = preg->re_nsub++;
2136 if (dfa->subexps_alloc < preg->re_nsub)
2137 {
2138 re_subexp_t *new_array;
2139 dfa->subexps_alloc *= 2;
2140 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2141 if (BE (new_array == NULL, 0))
2142 {
2143 dfa->subexps_alloc /= 2;
2144 *err = REG_ESPACE;
2145 return NULL;
2146 }
2147 dfa->subexps = new_array;
2148 }
2149 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2150 dfa->subexps[cur_nsub].end = -1;
2151
2152 new_idx = re_dfa_add_node (dfa, *token, 0);
2153 left_par = create_tree (NULL, NULL, 0, new_idx);
2154 if (BE (new_idx == -1 || left_par == NULL, 0))
2155 {
2156 *err = REG_ESPACE;
2157 return NULL;
2158 }
2159 dfa->nodes[new_idx].opr.idx = cur_nsub;
2160 *token = fetch_token (regexp, syntax);
2161
2162 /* The subexpression may be a null string. */
2163 if (token->type == OP_CLOSE_SUBEXP)
2164 tree = NULL;
2165 else
2166 {
2167 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2168 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2169 return NULL;
2170 }
2171 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2172 {
2173 free_bin_tree (tree);
2174 *err = REG_BADPAT;
2175 return NULL;
2176 }
2177 new_idx = re_dfa_add_node (dfa, *token, 0);
2178 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2179 right_par = create_tree (NULL, NULL, 0, new_idx);
2180 tree = ((tree == NULL) ? right_par
2181 : create_tree (tree, right_par, CONCAT, 0));
2182 tree = create_tree (left_par, tree, CONCAT, 0);
2183 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2184 {
2185 *err = REG_ESPACE;
2186 return NULL;
2187 }
2188 dfa->nodes[new_idx].opr.idx = cur_nsub;
2189
2190 return tree;
2191 }
2192
2193 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2194
2195 static bin_tree_t *
2196 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2197 bin_tree_t *dup_elem;
2198 re_string_t *regexp;
2199 re_dfa_t *dfa;
2200 re_token_t *token;
2201 reg_syntax_t syntax;
2202 reg_errcode_t *err;
2203 {
2204 re_token_t dup_token;
2205 bin_tree_t *tree = dup_elem, *work_tree;
2206 int new_idx, start_idx = re_string_cur_idx (regexp);
2207 re_token_t start_token = *token;
2208 if (token->type == OP_OPEN_DUP_NUM)
2209 {
2210 int i;
2211 int end = 0;
2212 int start = fetch_number (regexp, token, syntax);
2213 bin_tree_t *elem;
2214 if (start == -1)
2215 {
2216 if (token->type == CHARACTER && token->opr.c == ',')
2217 start = 0; /* We treat "{,m}" as "{0,m}". */
2218 else
2219 {
2220 *err = REG_BADBR; /* <re>{} is invalid. */
2221 return NULL;
2222 }
2223 }
2224 if (BE (start != -2, 1))
2225 {
2226 /* We treat "{n}" as "{n,n}". */
2227 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2228 : ((token->type == CHARACTER && token->opr.c == ',')
2229 ? fetch_number (regexp, token, syntax) : -2));
2230 }
2231 if (BE (start == -2 || end == -2, 0))
2232 {
2233 /* Invalid sequence. */
2234 if (token->type == OP_CLOSE_DUP_NUM)
2235 goto parse_dup_op_invalid_interval;
2236 else
2237 goto parse_dup_op_ebrace;
2238 }
2239 if (BE (start == 0 && end == 0, 0))
2240 {
2241 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2242 *token = fetch_token (regexp, syntax);
2243 free_bin_tree (dup_elem);
2244 return NULL;
2245 }
2246
2247 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2248 elem = tree;
2249 for (i = 0; i < start; ++i)
2250 if (i != 0)
2251 {
2252 work_tree = duplicate_tree (elem, dfa);
2253 tree = create_tree (tree, work_tree, CONCAT, 0);
2254 if (BE (work_tree == NULL || tree == NULL, 0))
2255 goto parse_dup_op_espace;
2256 }
2257
2258 if (end == -1)
2259 {
2260 /* We treat "<re>{0,}" as "<re>*". */
2261 dup_token.type = OP_DUP_ASTERISK;
2262 if (start > 0)
2263 {
2264 elem = duplicate_tree (elem, dfa);
2265 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2266 work_tree = create_tree (elem, NULL, 0, new_idx);
2267 tree = create_tree (tree, work_tree, CONCAT, 0);
2268 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2269 || tree == NULL, 0))
2270 goto parse_dup_op_espace;
2271 }
2272 else
2273 {
2274 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2275 tree = create_tree (elem, NULL, 0, new_idx);
2276 if (BE (new_idx == -1 || tree == NULL, 0))
2277 goto parse_dup_op_espace;
2278 }
2279 }
2280 else if (end - start > 0)
2281 {
2282 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2283 dup_token.type = OP_DUP_QUESTION;
2284 if (start > 0)
2285 {
2286 elem = duplicate_tree (elem, dfa);
2287 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2288 elem = create_tree (elem, NULL, 0, new_idx);
2289 tree = create_tree (tree, elem, CONCAT, 0);
2290 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2291 goto parse_dup_op_espace;
2292 }
2293 else
2294 {
2295 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2296 tree = elem = create_tree (elem, NULL, 0, new_idx);
2297 if (BE (new_idx == -1 || tree == NULL, 0))
2298 goto parse_dup_op_espace;
2299 }
2300 for (i = 1; i < end - start; ++i)
2301 {
2302 work_tree = duplicate_tree (elem, dfa);
2303 tree = create_tree (tree, work_tree, CONCAT, 0);
2304 if (BE (work_tree == NULL || tree == NULL, 0))
2305 {
2306 *err = REG_ESPACE;
2307 return NULL;
2308 }
2309 }
2310 }
2311 }
2312 else
2313 {
2314 new_idx = re_dfa_add_node (dfa, *token, 0);
2315 tree = create_tree (tree, NULL, 0, new_idx);
2316 if (BE (new_idx == -1 || tree == NULL, 0))
2317 {
2318 *err = REG_ESPACE;
2319 return NULL;
2320 }
2321 }
2322 *token = fetch_token (regexp, syntax);
2323 return tree;
2324
2325 parse_dup_op_espace:
2326 free_bin_tree (tree);
2327 *err = REG_ESPACE;
2328 return NULL;
2329
2330 parse_dup_op_ebrace:
2331 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2332 {
2333 *err = REG_EBRACE;
2334 return NULL;
2335 }
2336 goto parse_dup_op_rollback;
2337 parse_dup_op_invalid_interval:
2338 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2339 {
2340 *err = REG_BADBR;
2341 return NULL;
2342 }
2343 parse_dup_op_rollback:
2344 re_string_set_index (regexp, start_idx);
2345 *token = start_token;
2346 token->type = CHARACTER;
2347 return dup_elem;
2348 }
2349
2350 /* Size of the names for collating symbol/equivalence_class/character_class.
2351 I'm not sure, but maybe enough. */
2352 #define BRACKET_NAME_BUF_SIZE 32
2353
2354 #ifndef _LIBC
2355 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2356 Build the range expression which starts from START_ELEM, and ends
2357 at END_ELEM. The result are written to MBCSET and SBCSET.
2358 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2359 mbcset->range_ends, is a pointer argument sinse we may
2360 update it. */
2361
2362 static reg_errcode_t
2363 # ifdef RE_ENABLE_I18N
2364 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2365 re_charset_t *mbcset;
2366 int *range_alloc;
2367 # else /* not RE_ENABLE_I18N */
2368 build_range_exp (sbcset, start_elem, end_elem)
2369 # endif /* not RE_ENABLE_I18N */
2370 re_bitset_ptr_t sbcset;
2371 bracket_elem_t *start_elem, *end_elem;
2372 {
2373 unsigned int start_ch, end_ch;
2374 /* Equivalence Classes and Character Classes can't be a range start/end. */
2375 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2376 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2377 0))
2378 return REG_ERANGE;
2379
2380 /* We can handle no multi character collating elements without libc
2381 support. */
2382 if (BE ((start_elem->type == COLL_SYM
2383 && strlen ((char *) start_elem->opr.name) > 1)
2384 || (end_elem->type == COLL_SYM
2385 && strlen ((char *) end_elem->opr.name) > 1), 0))
2386 return REG_ECOLLATE;
2387
2388 # ifdef RE_ENABLE_I18N
2389 {
2390 wchar_t wc, start_wc, end_wc;
2391 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2392
2393 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2394 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2395 : 0));
2396 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2397 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2398 : 0));
2399 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2400 ? __btowc (start_ch) : start_elem->opr.wch);
2401 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2402 ? __btowc (end_ch) : end_elem->opr.wch);
2403 cmp_buf[0] = start_wc;
2404 cmp_buf[4] = end_wc;
2405 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2406 return REG_ERANGE;
2407
2408 /* Check the space of the arrays. */
2409 if (*range_alloc == mbcset->nranges)
2410 {
2411 /* There are not enough space, need realloc. */
2412 wchar_t *new_array_start, *new_array_end;
2413 int new_nranges;
2414
2415 /* +1 in case of mbcset->nranges is 0. */
2416 new_nranges = 2 * mbcset->nranges + 1;
2417 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2418 are NULL if *range_alloc == 0. */
2419 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2420 new_nranges);
2421 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2422 new_nranges);
2423
2424 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2425 return REG_ESPACE;
2426
2427 mbcset->range_starts = new_array_start;
2428 mbcset->range_ends = new_array_end;
2429 *range_alloc = new_nranges;
2430 }
2431
2432 mbcset->range_starts[mbcset->nranges] = start_wc;
2433 mbcset->range_ends[mbcset->nranges++] = end_wc;
2434
2435 /* Build the table for single byte characters. */
2436 for (wc = 0; wc <= SBC_MAX; ++wc)
2437 {
2438 cmp_buf[2] = wc;
2439 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2440 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2441 bitset_set (sbcset, wc);
2442 }
2443 }
2444 # else /* not RE_ENABLE_I18N */
2445 {
2446 unsigned int ch;
2447 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2448 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2449 : 0));
2450 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2451 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2452 : 0));
2453 if (start_ch > end_ch)
2454 return REG_ERANGE;
2455 /* Build the table for single byte characters. */
2456 for (ch = 0; ch <= SBC_MAX; ++ch)
2457 if (start_ch <= ch && ch <= end_ch)
2458 bitset_set (sbcset, ch);
2459 }
2460 # endif /* not RE_ENABLE_I18N */
2461 return REG_NOERROR;
2462 }
2463 #endif /* not _LIBC */
2464
2465 #ifndef _LIBC
2466 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2467 Build the collating element which is represented by NAME.
2468 The result are written to MBCSET and SBCSET.
2469 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2470 pointer argument since we may update it. */
2471
2472 static reg_errcode_t
2473 # ifdef RE_ENABLE_I18N
2474 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2475 re_charset_t *mbcset;
2476 int *coll_sym_alloc;
2477 # else /* not RE_ENABLE_I18N */
2478 build_collating_symbol (sbcset, name)
2479 # endif /* not RE_ENABLE_I18N */
2480 re_bitset_ptr_t sbcset;
2481 const unsigned char *name;
2482 {
2483 size_t name_len = strlen ((const char *) name);
2484 if (BE (name_len != 1, 0))
2485 return REG_ECOLLATE;
2486 else
2487 {
2488 bitset_set (sbcset, name[0]);
2489 return REG_NOERROR;
2490 }
2491 }
2492 #endif /* not _LIBC */
2493
2494 /* This function parse bracket expression like "[abc]", "[a-c]",
2495 "[[.a-a.]]" etc. */
2496
2497 static bin_tree_t *
2498 parse_bracket_exp (regexp, dfa, token, syntax, err)
2499 re_string_t *regexp;
2500 re_dfa_t *dfa;
2501 re_token_t *token;
2502 reg_syntax_t syntax;
2503 reg_errcode_t *err;
2504 {
2505 #ifdef _LIBC
2506 const unsigned char *collseqmb;
2507 const char *collseqwc;
2508 uint32_t nrules;
2509 int32_t table_size;
2510 const int32_t *symb_table;
2511 const unsigned char *extra;
2512
2513 /* Local function for parse_bracket_exp used in _LIBC environement.
2514 Seek the collating symbol entry correspondings to NAME.
2515 Return the index of the symbol in the SYMB_TABLE. */
2516
2517 static inline int32_t
2518 __attribute ((always_inline))
2519 seek_collating_symbol_entry (name, name_len)
2520 const unsigned char *name;
2521 size_t name_len;
2522 {
2523 int32_t hash = elem_hash ((const char *) name, name_len);
2524 int32_t elem = hash % table_size;
2525 int32_t second = hash % (table_size - 2);
2526 while (symb_table[2 * elem] != 0)
2527 {
2528 /* First compare the hashing value. */
2529 if (symb_table[2 * elem] == hash
2530 /* Compare the length of the name. */
2531 && name_len == extra[symb_table[2 * elem + 1]]
2532 /* Compare the name. */
2533 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2534 name_len) == 0)
2535 {
2536 /* Yep, this is the entry. */
2537 break;
2538 }
2539
2540 /* Next entry. */
2541 elem += second;
2542 }
2543 return elem;
2544 }
2545
2546 /* Local function for parse_bracket_exp used in _LIBC environement.
2547 Look up the collation sequence value of BR_ELEM.
2548 Return the value if succeeded, UINT_MAX otherwise. */
2549
2550 static inline unsigned int
2551 __attribute ((always_inline))
2552 lookup_collation_sequence_value (br_elem)
2553 bracket_elem_t *br_elem;
2554 {
2555 if (br_elem->type == SB_CHAR)
2556 {
2557 /*
2558 if (MB_CUR_MAX == 1)
2559 */
2560 if (nrules == 0)
2561 return collseqmb[br_elem->opr.ch];
2562 else
2563 {
2564 wint_t wc = __btowc (br_elem->opr.ch);
2565 return __collseq_table_lookup (collseqwc, wc);
2566 }
2567 }
2568 else if (br_elem->type == MB_CHAR)
2569 {
2570 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2571 }
2572 else if (br_elem->type == COLL_SYM)
2573 {
2574 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2575 if (nrules != 0)
2576 {
2577 int32_t elem, idx;
2578 elem = seek_collating_symbol_entry (br_elem->opr.name,
2579 sym_name_len);
2580 if (symb_table[2 * elem] != 0)
2581 {
2582 /* We found the entry. */
2583 idx = symb_table[2 * elem + 1];
2584 /* Skip the name of collating element name. */
2585 idx += 1 + extra[idx];
2586 /* Skip the byte sequence of the collating element. */
2587 idx += 1 + extra[idx];
2588 /* Adjust for the alignment. */
2589 idx = (idx + 3) & ~3;
2590 /* Skip the multibyte collation sequence value. */
2591 idx += sizeof (unsigned int);
2592 /* Skip the wide char sequence of the collating element. */
2593 idx += sizeof (unsigned int) *
2594 (1 + *(unsigned int *) (extra + idx));
2595 /* Return the collation sequence value. */
2596 return *(unsigned int *) (extra + idx);
2597 }
2598 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2599 {
2600 /* No valid character. Match it as a single byte
2601 character. */
2602 return collseqmb[br_elem->opr.name[0]];
2603 }
2604 }
2605 else if (sym_name_len == 1)
2606 return collseqmb[br_elem->opr.name[0]];
2607 }
2608 return UINT_MAX;
2609 }
2610
2611 /* Local function for parse_bracket_exp used in _LIBC environement.
2612 Build the range expression which starts from START_ELEM, and ends
2613 at END_ELEM. The result are written to MBCSET and SBCSET.
2614 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2615 mbcset->range_ends, is a pointer argument sinse we may
2616 update it. */
2617
2618 static inline reg_errcode_t
2619 __attribute ((always_inline))
2620 # ifdef RE_ENABLE_I18N
2621 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2622 re_charset_t *mbcset;
2623 int *range_alloc;
2624 # else /* not RE_ENABLE_I18N */
2625 build_range_exp (sbcset, start_elem, end_elem)
2626 # endif /* not RE_ENABLE_I18N */
2627 re_bitset_ptr_t sbcset;
2628 bracket_elem_t *start_elem, *end_elem;
2629 {
2630 unsigned int ch;
2631 uint32_t start_collseq;
2632 uint32_t end_collseq;
2633
2634 # ifdef RE_ENABLE_I18N
2635 /* Check the space of the arrays. */
2636 if (*range_alloc == mbcset->nranges)
2637 {
2638 /* There are not enough space, need realloc. */
2639 uint32_t *new_array_start;
2640 uint32_t *new_array_end;
2641 int new_nranges;
2642
2643 /* +1 in case of mbcset->nranges is 0. */
2644 new_nranges = 2 * mbcset->nranges + 1;
2645 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2646 are NULL if *range_alloc == 0. */
2647 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2648 new_nranges);
2649 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2650 new_nranges);
2651
2652 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2653 return REG_ESPACE;
2654
2655 mbcset->range_starts = new_array_start;
2656 mbcset->range_ends = new_array_end;
2657 *range_alloc = new_nranges;
2658 }
2659 # endif /* RE_ENABLE_I18N */
2660
2661 /* Equivalence Classes and Character Classes can't be a range
2662 start/end. */
2663 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2664 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2665 0))
2666 return REG_ERANGE;
2667
2668 start_collseq = lookup_collation_sequence_value (start_elem);
2669 end_collseq = lookup_collation_sequence_value (end_elem);
2670 /* Check start/end collation sequence values. */
2671 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2672 return REG_ECOLLATE;
2673 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2674 return REG_ERANGE;
2675
2676 # ifdef RE_ENABLE_I18N
2677 /* Got valid collation sequence values, add them as a new entry. */
2678 mbcset->range_starts[mbcset->nranges] = start_collseq;
2679 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2680 # endif /* RE_ENABLE_I18N */
2681
2682 /* Build the table for single byte characters. */
2683 for (ch = 0; ch <= SBC_MAX; ch++)
2684 {
2685 uint32_t ch_collseq;
2686 /*
2687 if (MB_CUR_MAX == 1)
2688 */
2689 if (nrules == 0)
2690 ch_collseq = collseqmb[ch];
2691 else
2692 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2693 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2694 bitset_set (sbcset, ch);
2695 }
2696 return REG_NOERROR;
2697 }
2698
2699 /* Local function for parse_bracket_exp used in _LIBC environement.
2700 Build the collating element which is represented by NAME.
2701 The result are written to MBCSET and SBCSET.
2702 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2703 pointer argument sinse we may update it. */
2704
2705 static inline reg_errcode_t
2706 __attribute ((always_inline))
2707 # ifdef RE_ENABLE_I18N
2708 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2709 re_charset_t *mbcset;
2710 int *coll_sym_alloc;
2711 # else /* not RE_ENABLE_I18N */
2712 build_collating_symbol (sbcset, name)
2713 # endif /* not RE_ENABLE_I18N */
2714 re_bitset_ptr_t sbcset;
2715 const unsigned char *name;
2716 {
2717 int32_t elem, idx;
2718 size_t name_len = strlen ((const char *) name);
2719 if (nrules != 0)
2720 {
2721 elem = seek_collating_symbol_entry (name, name_len);
2722 if (symb_table[2 * elem] != 0)
2723 {
2724 /* We found the entry. */
2725 idx = symb_table[2 * elem + 1];
2726 /* Skip the name of collating element name. */
2727 idx += 1 + extra[idx];
2728 }
2729 else if (symb_table[2 * elem] == 0 && name_len == 1)
2730 {
2731 /* No valid character, treat it as a normal
2732 character. */
2733 bitset_set (sbcset, name[0]);
2734 return REG_NOERROR;
2735 }
2736 else
2737 return REG_ECOLLATE;
2738
2739 # ifdef RE_ENABLE_I18N
2740 /* Got valid collation sequence, add it as a new entry. */
2741 /* Check the space of the arrays. */
2742 if (*coll_sym_alloc == mbcset->ncoll_syms)
2743 {
2744 /* Not enough, realloc it. */
2745 /* +1 in case of mbcset->ncoll_syms is 0. */
2746 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2747 /* Use realloc since mbcset->coll_syms is NULL
2748 if *alloc == 0. */
2749 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2750 *coll_sym_alloc);
2751 if (BE (mbcset->coll_syms == NULL, 0))
2752 return REG_ESPACE;
2753 }
2754 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2755 # endif /* RE_ENABLE_I18N */
2756 return REG_NOERROR;
2757 }
2758 else
2759 {
2760 if (BE (name_len != 1, 0))
2761 return REG_ECOLLATE;
2762 else
2763 {
2764 bitset_set (sbcset, name[0]);
2765 return REG_NOERROR;
2766 }
2767 }
2768 }
2769 #endif
2770
2771 re_token_t br_token;
2772 re_bitset_ptr_t sbcset;
2773 #ifdef RE_ENABLE_I18N
2774 re_charset_t *mbcset;
2775 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2776 int equiv_class_alloc = 0, char_class_alloc = 0;
2777 #else /* not RE_ENABLE_I18N */
2778 int non_match = 0;
2779 #endif /* not RE_ENABLE_I18N */
2780 bin_tree_t *work_tree;
2781 int token_len, new_idx;
2782 #ifdef _LIBC
2783 collseqmb = (const unsigned char *)
2784 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2785 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2786 if (nrules)
2787 {
2788 /*
2789 if (MB_CUR_MAX > 1)
2790 */
2791 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2792 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2793 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2794 _NL_COLLATE_SYMB_TABLEMB);
2795 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2796 _NL_COLLATE_SYMB_EXTRAMB);
2797 }
2798 #endif
2799 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2800 #ifdef RE_ENABLE_I18N
2801 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2802 #endif /* RE_ENABLE_I18N */
2803 #ifdef RE_ENABLE_I18N
2804 if (BE (sbcset == NULL || mbcset == NULL, 0))
2805 #else
2806 if (BE (sbcset == NULL, 0))
2807 #endif /* RE_ENABLE_I18N */
2808 {
2809 *err = REG_ESPACE;
2810 return NULL;
2811 }
2812
2813 token_len = peek_token_bracket (token, regexp, syntax);
2814 if (BE (token->type == END_OF_RE, 0))
2815 {
2816 *err = REG_BADPAT;
2817 goto parse_bracket_exp_free_return;
2818 }
2819 if (token->type == OP_NON_MATCH_LIST)
2820 {
2821 #ifdef RE_ENABLE_I18N
2822 int i;
2823 mbcset->non_match = 1;
2824 #else /* not RE_ENABLE_I18N */
2825 non_match = 1;
2826 #endif /* not RE_ENABLE_I18N */
2827 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2828 bitset_set (sbcset, '\0');
2829 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2830 token_len = peek_token_bracket (token, regexp, syntax);
2831 if (BE (token->type == END_OF_RE, 0))
2832 {
2833 *err = REG_BADPAT;
2834 goto parse_bracket_exp_free_return;
2835 }
2836 #ifdef RE_ENABLE_I18N
2837 if (MB_CUR_MAX > 1)
2838 for (i = 0; i < SBC_MAX; ++i)
2839 if (__btowc (i) == WEOF)
2840 bitset_set (sbcset, i);
2841 #endif /* RE_ENABLE_I18N */
2842 }
2843
2844 /* We treat the first ']' as a normal character. */
2845 if (token->type == OP_CLOSE_BRACKET)
2846 token->type = CHARACTER;
2847
2848 while (1)
2849 {
2850 bracket_elem_t start_elem, end_elem;
2851 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2852 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2853 reg_errcode_t ret;
2854 int token_len2 = 0, is_range_exp = 0;
2855 re_token_t token2;
2856
2857 start_elem.opr.name = start_name_buf;
2858 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2859 syntax);
2860 if (BE (ret != REG_NOERROR, 0))
2861 {
2862 *err = ret;
2863 goto parse_bracket_exp_free_return;
2864 }
2865
2866 token_len = peek_token_bracket (token, regexp, syntax);
2867 if (BE (token->type == END_OF_RE, 0))
2868 {
2869 *err = REG_BADPAT;
2870 goto parse_bracket_exp_free_return;
2871 }
2872 if (token->type == OP_CHARSET_RANGE)
2873 {
2874 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2875 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2876 if (BE (token->type == END_OF_RE, 0))
2877 {
2878 *err = REG_BADPAT;
2879 goto parse_bracket_exp_free_return;
2880 }
2881 if (token2.type == OP_CLOSE_BRACKET)
2882 {
2883 /* We treat the last '-' as a normal character. */
2884 re_string_skip_bytes (regexp, -token_len);
2885 token->type = CHARACTER;
2886 }
2887 else
2888 is_range_exp = 1;
2889 }
2890
2891 if (is_range_exp == 1)
2892 {
2893 end_elem.opr.name = end_name_buf;
2894 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2895 dfa, syntax);
2896 if (BE (ret != REG_NOERROR, 0))
2897 {
2898 *err = ret;
2899 goto parse_bracket_exp_free_return;
2900 }
2901
2902 token_len = peek_token_bracket (token, regexp, syntax);
2903 if (BE (token->type == END_OF_RE, 0))
2904 {
2905 *err = REG_BADPAT;
2906 goto parse_bracket_exp_free_return;
2907 }
2908 *err = build_range_exp (sbcset,
2909 #ifdef RE_ENABLE_I18N
2910 mbcset, &range_alloc,
2911 #endif /* RE_ENABLE_I18N */
2912 &start_elem, &end_elem);
2913 if (BE (*err != REG_NOERROR, 0))
2914 goto parse_bracket_exp_free_return;
2915 }
2916 else
2917 {
2918 switch (start_elem.type)
2919 {
2920 case SB_CHAR:
2921 bitset_set (sbcset, start_elem.opr.ch);
2922 break;
2923 #ifdef RE_ENABLE_I18N
2924 case MB_CHAR:
2925 /* Check whether the array has enough space. */
2926 if (mbchar_alloc == mbcset->nmbchars)
2927 {
2928 /* Not enough, realloc it. */
2929 /* +1 in case of mbcset->nmbchars is 0. */
2930 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2931 /* Use realloc since array is NULL if *alloc == 0. */
2932 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2933 mbchar_alloc);
2934 if (BE (mbcset->mbchars == NULL, 0))
2935 goto parse_bracket_exp_espace;
2936 }
2937 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2938 break;
2939 #endif /* RE_ENABLE_I18N */
2940 case EQUIV_CLASS:
2941 *err = build_equiv_class (sbcset,
2942 #ifdef RE_ENABLE_I18N
2943 mbcset, &equiv_class_alloc,
2944 #endif /* RE_ENABLE_I18N */
2945 start_elem.opr.name);
2946 if (BE (*err != REG_NOERROR, 0))
2947 goto parse_bracket_exp_free_return;
2948 break;
2949 case COLL_SYM:
2950 *err = build_collating_symbol (sbcset,
2951 #ifdef RE_ENABLE_I18N
2952 mbcset, &coll_sym_alloc,
2953 #endif /* RE_ENABLE_I18N */
2954 start_elem.opr.name);
2955 if (BE (*err != REG_NOERROR, 0))
2956 goto parse_bracket_exp_free_return;
2957 break;
2958 case CHAR_CLASS:
2959 *err = build_charclass (regexp->trans, sbcset,
2960 #ifdef RE_ENABLE_I18N
2961 mbcset, &char_class_alloc,
2962 #endif /* RE_ENABLE_I18N */
2963 start_elem.opr.name, syntax);
2964 if (BE (*err != REG_NOERROR, 0))
2965 goto parse_bracket_exp_free_return;
2966 break;
2967 default:
2968 assert (0);
2969 break;
2970 }
2971 }
2972 if (token->type == OP_CLOSE_BRACKET)
2973 break;
2974 }
2975
2976 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2977
2978 /* If it is non-matching list. */
2979 #ifdef RE_ENABLE_I18N
2980 if (mbcset->non_match)
2981 #else /* not RE_ENABLE_I18N */
2982 if (non_match)
2983 #endif /* not RE_ENABLE_I18N */
2984 bitset_not (sbcset);
2985
2986 /* Build a tree for simple bracket. */
2987 br_token.type = SIMPLE_BRACKET;
2988 br_token.opr.sbcset = sbcset;
2989 new_idx = re_dfa_add_node (dfa, br_token, 0);
2990 work_tree = create_tree (NULL, NULL, 0, new_idx);
2991 if (BE (new_idx == -1 || work_tree == NULL, 0))
2992 goto parse_bracket_exp_espace;
2993
2994 #ifdef RE_ENABLE_I18N
2995 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2996 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2997 || mbcset->non_match)))
2998 {
2999 re_token_t alt_token;
3000 bin_tree_t *mbc_tree;
3001 /* Build a tree for complex bracket. */
3002 br_token.type = COMPLEX_BRACKET;
3003 br_token.opr.mbcset = mbcset;
3004 dfa->has_mb_node = 1;
3005 new_idx = re_dfa_add_node (dfa, br_token, 0);
3006 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3007 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3008 goto parse_bracket_exp_espace;
3009 /* Then join them by ALT node. */
3010 dfa->has_plural_match = 1;
3011 alt_token.type = OP_ALT;
3012 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3013 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3014 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3015 return work_tree;
3016 }
3017 else
3018 {
3019 free_charset (mbcset);
3020 return work_tree;
3021 }
3022 #else /* not RE_ENABLE_I18N */
3023 return work_tree;
3024 #endif /* not RE_ENABLE_I18N */
3025
3026 parse_bracket_exp_espace:
3027 *err = REG_ESPACE;
3028 parse_bracket_exp_free_return:
3029 re_free (sbcset);
3030 #ifdef RE_ENABLE_I18N
3031 free_charset (mbcset);
3032 #endif /* RE_ENABLE_I18N */
3033 return NULL;
3034 }
3035
3036 /* Parse an element in the bracket expression. */
3037
3038 static reg_errcode_t
3039 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3040 bracket_elem_t *elem;
3041 re_string_t *regexp;
3042 re_token_t *token;
3043 int token_len;
3044 re_dfa_t *dfa;
3045 reg_syntax_t syntax;
3046 {
3047 #ifdef RE_ENABLE_I18N
3048 int cur_char_size;
3049 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3050 if (cur_char_size > 1)
3051 {
3052 elem->type = MB_CHAR;
3053 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3054 re_string_skip_bytes (regexp, cur_char_size);
3055 return REG_NOERROR;
3056 }
3057 #endif /* RE_ENABLE_I18N */
3058 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3059 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3060 || token->type == OP_OPEN_EQUIV_CLASS)
3061 return parse_bracket_symbol (elem, regexp, token);
3062 elem->type = SB_CHAR;
3063 elem->opr.ch = token->opr.c;
3064 return REG_NOERROR;
3065 }
3066
3067 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3068 such as [:<character_class>:], [.<collating_element>.], and
3069 [=<equivalent_class>=]. */
3070
3071 static reg_errcode_t
3072 parse_bracket_symbol (elem, regexp, token)
3073 bracket_elem_t *elem;
3074 re_string_t *regexp;
3075 re_token_t *token;
3076 {
3077 unsigned char ch, delim = token->opr.c;
3078 int i = 0;
3079 for (;; ++i)
3080 {
3081 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3082 return REG_EBRACK;
3083 if (token->type == OP_OPEN_CHAR_CLASS)
3084 ch = re_string_fetch_byte_case (regexp);
3085 else
3086 ch = re_string_fetch_byte (regexp);
3087 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3088 break;
3089 elem->opr.name[i] = ch;
3090 }
3091 re_string_skip_bytes (regexp, 1);
3092 elem->opr.name[i] = '\0';
3093 switch (token->type)
3094 {
3095 case OP_OPEN_COLL_ELEM:
3096 elem->type = COLL_SYM;
3097 break;
3098 case OP_OPEN_EQUIV_CLASS:
3099 elem->type = EQUIV_CLASS;
3100 break;
3101 case OP_OPEN_CHAR_CLASS:
3102 elem->type = CHAR_CLASS;
3103 break;
3104 default:
3105 break;
3106 }
3107 return REG_NOERROR;
3108 }
3109
3110 /* Helper function for parse_bracket_exp.
3111 Build the equivalence class which is represented by NAME.
3112 The result are written to MBCSET and SBCSET.
3113 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3114 is a pointer argument sinse we may update it. */
3115
3116 static reg_errcode_t
3117 #ifdef RE_ENABLE_I18N
3118 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3119 re_charset_t *mbcset;
3120 int *equiv_class_alloc;
3121 #else /* not RE_ENABLE_I18N */
3122 build_equiv_class (sbcset, name)
3123 #endif /* not RE_ENABLE_I18N */
3124 re_bitset_ptr_t sbcset;
3125 const unsigned char *name;
3126 {
3127 #if defined _LIBC && defined RE_ENABLE_I18N
3128 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3129 if (nrules != 0)
3130 {
3131 const int32_t *table, *indirect;
3132 const unsigned char *weights, *extra, *cp;
3133 unsigned char char_buf[2];
3134 int32_t idx1, idx2;
3135 unsigned int ch;
3136 size_t len;
3137 /* This #include defines a local function! */
3138 # include <locale/weight.h>
3139 /* Calculate the index for equivalence class. */
3140 cp = name;
3141 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3142 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3143 _NL_COLLATE_WEIGHTMB);
3144 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3145 _NL_COLLATE_EXTRAMB);
3146 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3147 _NL_COLLATE_INDIRECTMB);
3148 idx1 = findidx (&cp);
3149 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3150 /* This isn't a valid character. */
3151 return REG_ECOLLATE;
3152
3153 /* Build single byte matcing table for this equivalence class. */
3154 char_buf[1] = (unsigned char) '\0';
3155 len = weights[idx1];
3156 for (ch = 0; ch < SBC_MAX; ++ch)
3157 {
3158 char_buf[0] = ch;
3159 cp = char_buf;
3160 idx2 = findidx (&cp);
3161 /*
3162 idx2 = table[ch];
3163 */
3164 if (idx2 == 0)
3165 /* This isn't a valid character. */
3166 continue;
3167 if (len == weights[idx2])
3168 {
3169 int cnt = 0;
3170 while (cnt <= len &&
3171 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3172 ++cnt;
3173
3174 if (cnt > len)
3175 bitset_set (sbcset, ch);
3176 }
3177 }
3178 /* Check whether the array has enough space. */
3179 if (*equiv_class_alloc == mbcset->nequiv_classes)
3180 {
3181 /* Not enough, realloc it. */
3182 /* +1 in case of mbcset->nequiv_classes is 0. */
3183 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3184 /* Use realloc since the array is NULL if *alloc == 0. */
3185 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3186 *equiv_class_alloc);
3187 if (BE (mbcset->equiv_classes == NULL, 0))
3188 return REG_ESPACE;
3189 }
3190 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3191 }
3192 else
3193 #endif /* _LIBC && RE_ENABLE_I18N */
3194 {
3195 if (BE (strlen ((const char *) name) != 1, 0))
3196 return REG_ECOLLATE;
3197 bitset_set (sbcset, *name);
3198 }
3199 return REG_NOERROR;
3200 }
3201
3202 /* Helper function for parse_bracket_exp.
3203 Build the character class which is represented by NAME.
3204 The result are written to MBCSET and SBCSET.
3205 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3206 is a pointer argument sinse we may update it. */
3207
3208 static reg_errcode_t
3209 #ifdef RE_ENABLE_I18N
3210 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3211 re_charset_t *mbcset;
3212 int *char_class_alloc;
3213 #else /* not RE_ENABLE_I18N */
3214 build_charclass (trans, sbcset, class_name, syntax)
3215 #endif /* not RE_ENABLE_I18N */
3216 RE_TRANSLATE_TYPE trans;
3217 re_bitset_ptr_t sbcset;
3218 const unsigned char *class_name;
3219 reg_syntax_t syntax;
3220 {
3221 int i;
3222 const char *name = (const char *) class_name;
3223
3224 /* In case of REG_ICASE "upper" and "lower" match the both of
3225 upper and lower cases. */
3226 if ((syntax & RE_ICASE)
3227 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3228 name = "alpha";
3229
3230 #ifdef RE_ENABLE_I18N
3231 /* Check the space of the arrays. */
3232 if (*char_class_alloc == mbcset->nchar_classes)
3233 {
3234 /* Not enough, realloc it. */
3235 /* +1 in case of mbcset->nchar_classes is 0. */
3236 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3237 /* Use realloc since array is NULL if *alloc == 0. */
3238 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3239 *char_class_alloc);
3240 if (BE (mbcset->char_classes == NULL, 0))
3241 return REG_ESPACE;
3242 }
3243 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3244 #endif /* RE_ENABLE_I18N */
3245
3246 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3247 for (i = 0; i < SBC_MAX; ++i) \
3248 { \
3249 if (ctype_func (i)) \
3250 { \
3251 int ch = trans ? trans[i] : i; \
3252 bitset_set (sbcset, ch); \
3253 } \
3254 }
3255
3256 if (strcmp (name, "alnum") == 0)
3257 BUILD_CHARCLASS_LOOP (isalnum)
3258 else if (strcmp (name, "cntrl") == 0)
3259 BUILD_CHARCLASS_LOOP (iscntrl)
3260 else if (strcmp (name, "lower") == 0)
3261 BUILD_CHARCLASS_LOOP (islower)
3262 else if (strcmp (name, "space") == 0)
3263 BUILD_CHARCLASS_LOOP (isspace)
3264 else if (strcmp (name, "alpha") == 0)
3265 BUILD_CHARCLASS_LOOP (isalpha)
3266 else if (strcmp (name, "digit") == 0)
3267 BUILD_CHARCLASS_LOOP (isdigit)
3268 else if (strcmp (name, "print") == 0)
3269 BUILD_CHARCLASS_LOOP (isprint)
3270 else if (strcmp (name, "upper") == 0)
3271 BUILD_CHARCLASS_LOOP (isupper)
3272 else if (strcmp (name, "blank") == 0)
3273 BUILD_CHARCLASS_LOOP (isblank)
3274 else if (strcmp (name, "graph") == 0)
3275 BUILD_CHARCLASS_LOOP (isgraph)
3276 else if (strcmp (name, "punct") == 0)
3277 BUILD_CHARCLASS_LOOP (ispunct)
3278 else if (strcmp (name, "xdigit") == 0)
3279 BUILD_CHARCLASS_LOOP (isxdigit)
3280 else
3281 return REG_ECTYPE;
3282
3283 return REG_NOERROR;
3284 }
3285
3286 static bin_tree_t *
3287 build_word_op (dfa, trans, not, err)
3288 re_dfa_t *dfa;
3289 RE_TRANSLATE_TYPE trans;
3290 int not;
3291 reg_errcode_t *err;
3292 {
3293 re_bitset_ptr_t sbcset;
3294 #ifdef RE_ENABLE_I18N
3295 re_charset_t *mbcset;
3296 int alloc = 0;
3297 #else /* not RE_ENABLE_I18N */
3298 int non_match = 0;
3299 #endif /* not RE_ENABLE_I18N */
3300 reg_errcode_t ret;
3301 re_token_t br_token;
3302 bin_tree_t *tree;
3303 int new_idx;
3304
3305 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3306 #ifdef RE_ENABLE_I18N
3307 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3308 #endif /* RE_ENABLE_I18N */
3309
3310 #ifdef RE_ENABLE_I18N
3311 if (BE (sbcset == NULL || mbcset == NULL, 0))
3312 #else /* not RE_ENABLE_I18N */
3313 if (BE (sbcset == NULL, 0))
3314 #endif /* not RE_ENABLE_I18N */
3315 {
3316 *err = REG_ESPACE;
3317 return NULL;
3318 }
3319
3320 if (not)
3321 {
3322 #ifdef RE_ENABLE_I18N
3323 int i;
3324 /*
3325 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3326 bitset_set(cset->sbcset, '\0');
3327 */
3328 mbcset->non_match = 1;
3329 if (MB_CUR_MAX > 1)
3330 for (i = 0; i < SBC_MAX; ++i)
3331 if (__btowc (i) == WEOF)
3332 bitset_set (sbcset, i);
3333 #else /* not RE_ENABLE_I18N */
3334 non_match = 1;
3335 #endif /* not RE_ENABLE_I18N */
3336 }
3337
3338 /* We don't care the syntax in this case. */
3339 ret = build_charclass (trans, sbcset,
3340 #ifdef RE_ENABLE_I18N
3341 mbcset, &alloc,
3342 #endif /* RE_ENABLE_I18N */
3343 (const unsigned char *) "alnum", 0);
3344
3345 if (BE (ret != REG_NOERROR, 0))
3346 {
3347 re_free (sbcset);
3348 #ifdef RE_ENABLE_I18N
3349 free_charset (mbcset);
3350 #endif /* RE_ENABLE_I18N */
3351 *err = ret;
3352 return NULL;
3353 }
3354 /* \w match '_' also. */
3355 bitset_set (sbcset, '_');
3356
3357 /* If it is non-matching list. */
3358 #ifdef RE_ENABLE_I18N
3359 if (mbcset->non_match)
3360 #else /* not RE_ENABLE_I18N */
3361 if (non_match)
3362 #endif /* not RE_ENABLE_I18N */
3363 bitset_not (sbcset);
3364
3365 /* Build a tree for simple bracket. */
3366 br_token.type = SIMPLE_BRACKET;
3367 br_token.opr.sbcset = sbcset;
3368 new_idx = re_dfa_add_node (dfa, br_token, 0);
3369 tree = create_tree (NULL, NULL, 0, new_idx);
3370 if (BE (new_idx == -1 || tree == NULL, 0))
3371 goto build_word_op_espace;
3372
3373 #ifdef RE_ENABLE_I18N
3374 if (MB_CUR_MAX > 1)
3375 {
3376 re_token_t alt_token;
3377 bin_tree_t *mbc_tree;
3378 /* Build a tree for complex bracket. */
3379 br_token.type = COMPLEX_BRACKET;
3380 br_token.opr.mbcset = mbcset;
3381 dfa->has_mb_node = 1;
3382 new_idx = re_dfa_add_node (dfa, br_token, 0);
3383 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3384 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3385 goto build_word_op_espace;
3386 /* Then join them by ALT node. */
3387 alt_token.type = OP_ALT;
3388 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3389 tree = create_tree (tree, mbc_tree, 0, new_idx);
3390 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3391 return tree;
3392 }
3393 else
3394 {
3395 free_charset (mbcset);
3396 return tree;
3397 }
3398 #else /* not RE_ENABLE_I18N */
3399 return tree;
3400 #endif /* not RE_ENABLE_I18N */
3401
3402 build_word_op_espace:
3403 re_free (sbcset);
3404 #ifdef RE_ENABLE_I18N
3405 free_charset (mbcset);
3406 #endif /* RE_ENABLE_I18N */
3407 *err = REG_ESPACE;
3408 return NULL;
3409 }
3410
3411 /* This is intended for the expressions like "a{1,3}".
3412 Fetch a number from `input', and return the number.
3413 Return -1, if the number field is empty like "{,1}".
3414 Return -2, If an error is occured. */
3415
3416 static int
3417 fetch_number (input, token, syntax)
3418 re_string_t *input;
3419 re_token_t *token;
3420 reg_syntax_t syntax;
3421 {
3422 int num = -1;
3423 unsigned char c;
3424 while (1)
3425 {
3426 *token = fetch_token (input, syntax);
3427 c = token->opr.c;
3428 if (BE (token->type == END_OF_RE, 0))
3429 return -2;
3430 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3431 break;
3432 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3433 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3434 num = (num > RE_DUP_MAX) ? -2 : num;
3435 }
3436 return num;
3437 }
3438 \f
3439 #ifdef RE_ENABLE_I18N
3440 static void
3441 free_charset (re_charset_t *cset)
3442 {
3443 re_free (cset->mbchars);
3444 # ifdef _LIBC
3445 re_free (cset->coll_syms);
3446 re_free (cset->equiv_classes);
3447 re_free (cset->range_starts);
3448 re_free (cset->range_ends);
3449 # endif
3450 re_free (cset->char_classes);
3451 re_free (cset);
3452 }
3453 #endif /* RE_ENABLE_I18N */
3454 \f
3455 /* Functions for binary tree operation. */
3456
3457 /* Create a node of tree.
3458 Note: This function automatically free left and right if malloc fails. */
3459
3460 static bin_tree_t *
3461 create_tree (left, right, type, index)
3462 bin_tree_t *left;
3463 bin_tree_t *right;
3464 re_token_type_t type;
3465 int index;
3466 {
3467 bin_tree_t *tree;
3468 tree = re_malloc (bin_tree_t, 1);
3469 if (BE (tree == NULL, 0))
3470 {
3471 free_bin_tree (left);
3472 free_bin_tree (right);
3473 return NULL;
3474 }
3475 tree->parent = NULL;
3476 tree->left = left;
3477 tree->right = right;
3478 tree->type = type;
3479 tree->node_idx = index;
3480 tree->first = -1;
3481 tree->next = -1;
3482 re_node_set_init_empty (&tree->eclosure);
3483
3484 if (left != NULL)
3485 left->parent = tree;
3486 if (right != NULL)
3487 right->parent = tree;
3488 return tree;
3489 }
3490
3491 /* Free the sub tree pointed by TREE. */
3492
3493 static void
3494 free_bin_tree (tree)
3495 bin_tree_t *tree;
3496 {
3497 if (tree == NULL)
3498 return;
3499 /*re_node_set_free (&tree->eclosure);*/
3500 free_bin_tree (tree->left);
3501 free_bin_tree (tree->right);
3502 re_free (tree);
3503 }
3504
3505 /* Duplicate the node SRC, and return new node. */
3506
3507 static bin_tree_t *
3508 duplicate_tree (src, dfa)
3509 const bin_tree_t *src;
3510 re_dfa_t *dfa;
3511 {
3512 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3513 int new_node_idx;
3514 /* Since node indies must be according to Post-order of the tree,
3515 we must duplicate the left at first. */
3516 if (src->left != NULL)
3517 {
3518 left = duplicate_tree (src->left, dfa);
3519 if (left == NULL)
3520 return NULL;
3521 }
3522
3523 /* Secondaly, duplicate the right. */
3524 if (src->right != NULL)
3525 {
3526 right = duplicate_tree (src->right, dfa);
3527 if (right == NULL)
3528 {
3529 free_bin_tree (left);
3530 return NULL;
3531 }
3532 }
3533
3534 /* At last, duplicate itself. */
3535 if (src->type == NON_TYPE)
3536 {
3537 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3538 dfa->nodes[new_node_idx].duplicated = 1;
3539 if (BE (new_node_idx == -1, 0))
3540 {
3541 free_bin_tree (left);
3542 free_bin_tree (right);
3543 return NULL;
3544 }
3545 }
3546 else
3547 new_node_idx = src->type;
3548
3549 new_tree = create_tree (left, right, src->type, new_node_idx);
3550 if (BE (new_tree == NULL, 0))
3551 {
3552 free_bin_tree (left);
3553 free_bin_tree (right);
3554 }
3555 return new_tree;
3556 }
This page took 0.197907 seconds and 5 git commands to generate.