1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 int length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
27 static reg_errcode_t
init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
33 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
34 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
35 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
36 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
37 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
38 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
39 int top_clone_node
, int root_node
,
40 unsigned int constraint
);
41 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
42 unsigned int constraint
);
43 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
44 unsigned int constraint
);
45 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
46 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
48 static void calc_inveclosure (re_dfa_t
*dfa
);
49 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
51 static re_token_t
fetch_token (re_string_t
*input
, reg_syntax_t syntax
);
52 static int peek_token (re_token_t
*token
, re_string_t
*input
,
54 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
56 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
57 reg_syntax_t syntax
, reg_errcode_t
*err
);
58 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
59 re_token_t
*token
, reg_syntax_t syntax
,
60 int nest
, reg_errcode_t
*err
);
61 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
62 re_token_t
*token
, reg_syntax_t syntax
,
63 int nest
, reg_errcode_t
*err
);
64 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
65 re_token_t
*token
, reg_syntax_t syntax
,
66 int nest
, reg_errcode_t
*err
);
67 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
68 re_token_t
*token
, reg_syntax_t syntax
,
69 int nest
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
71 re_dfa_t
*dfa
, re_token_t
*token
,
72 reg_syntax_t syntax
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
74 re_token_t
*token
, reg_syntax_t syntax
,
76 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
78 re_token_t
*token
, int token_len
,
81 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
85 # ifdef RE_ENABLE_I18N
86 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
87 re_charset_t
*mbcset
, int *range_alloc
,
88 bracket_elem_t
*start_elem
,
89 bracket_elem_t
*end_elem
);
90 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
93 const unsigned char *name
);
94 # else /* not RE_ENABLE_I18N */
95 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
96 bracket_elem_t
*start_elem
,
97 bracket_elem_t
*end_elem
);
98 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
99 const unsigned char *name
);
100 # endif /* not RE_ENABLE_I18N */
101 #endif /* not _LIBC */
102 #ifdef RE_ENABLE_I18N
103 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
104 re_charset_t
*mbcset
,
105 int *equiv_class_alloc
,
106 const unsigned char *name
);
107 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
108 re_bitset_ptr_t sbcset
,
109 re_charset_t
*mbcset
,
110 int *char_class_alloc
,
111 const unsigned char *class_name
,
112 reg_syntax_t syntax
);
113 #else /* not RE_ENABLE_I18N */
114 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
115 const unsigned char *name
);
116 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
117 re_bitset_ptr_t sbcset
,
118 const unsigned char *class_name
,
119 reg_syntax_t syntax
);
120 #endif /* not RE_ENABLE_I18N */
121 static bin_tree_t
*build_word_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
122 int not, reg_errcode_t
*err
);
123 static void free_bin_tree (bin_tree_t
*tree
);
124 static bin_tree_t
*create_tree (bin_tree_t
*left
, bin_tree_t
*right
,
125 re_token_type_t type
, int index
);
126 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
128 /* This table gives an error message for each of the error codes listed
129 in regex.h. Obviously the order here has to be same as there.
130 POSIX doesn't require that we do anything for REG_NOERROR,
131 but why not be nice? */
133 const char __re_error_msgid
[] attribute_hidden
=
135 #define REG_NOERROR_IDX 0
136 gettext_noop ("Success") /* REG_NOERROR */
138 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
139 gettext_noop ("No match") /* REG_NOMATCH */
141 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
142 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
144 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
145 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
147 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
148 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
150 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
151 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
153 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
154 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
156 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
157 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
159 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
160 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
162 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
163 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
165 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
166 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
168 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
169 gettext_noop ("Invalid range end") /* REG_ERANGE */
171 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
172 gettext_noop ("Memory exhausted") /* REG_ESPACE */
174 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
175 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
177 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
178 gettext_noop ("Premature end of regular expression") /* REG_EEND */
180 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
181 gettext_noop ("Regular expression too big") /* REG_ESIZE */
183 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
184 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
187 const size_t __re_error_msgid_idx
[] attribute_hidden
=
208 /* Entry points for GNU code. */
210 /* re_compile_pattern is the GNU regular expression compiler: it
211 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
212 Returns 0 if the pattern was valid, otherwise an error string.
214 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
215 are set in BUFP on entry. */
218 re_compile_pattern (pattern
, length
, bufp
)
221 struct re_pattern_buffer
*bufp
;
225 /* And GNU code determines whether or not to get register information
226 by passing null for the REGS argument to re_match, etc., not by
230 /* Match anchors at newline. */
231 bufp
->newline_anchor
= 1;
233 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
237 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
240 weak_alias (__re_compile_pattern
, re_compile_pattern
)
243 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
244 also be assigned to arbitrarily: each pattern buffer stores its own
245 syntax, so it can be changed between regex compilations. */
246 /* This has no initializer because initialized variables in Emacs
247 become read-only after dumping. */
248 reg_syntax_t re_syntax_options
;
251 /* Specify the precise syntax of regexps for compilation. This provides
252 for compatibility for various utilities which historically have
253 different, incompatible syntaxes.
255 The argument SYNTAX is a bit mask comprised of the various bits
256 defined in regex.h. We return the old syntax. */
259 re_set_syntax (syntax
)
262 reg_syntax_t ret
= re_syntax_options
;
264 re_syntax_options
= syntax
;
268 weak_alias (__re_set_syntax
, re_set_syntax
)
272 re_compile_fastmap (bufp
)
273 struct re_pattern_buffer
*bufp
;
275 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
276 char *fastmap
= bufp
->fastmap
;
278 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
279 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
280 if (dfa
->init_state
!= dfa
->init_state_word
)
281 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
282 if (dfa
->init_state
!= dfa
->init_state_nl
)
283 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
284 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
285 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
286 bufp
->fastmap_accurate
= 1;
290 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
294 __attribute ((always_inline
))
295 re_set_fastmap (char *fastmap
, int icase
, int ch
)
299 fastmap
[tolower (ch
)] = 1;
302 /* Helper function for re_compile_fastmap.
303 Compile fastmap for the initial_state INIT_STATE. */
306 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
308 const re_dfastate_t
*init_state
;
311 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
313 int icase
= (MB_CUR_MAX
== 1 && (bufp
->syntax
& RE_ICASE
));
314 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
316 int node
= init_state
->nodes
.elems
[node_cnt
];
317 re_token_type_t type
= dfa
->nodes
[node
].type
;
319 if (type
== CHARACTER
)
320 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
321 else if (type
== SIMPLE_BRACKET
)
324 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
325 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
326 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
327 re_set_fastmap (fastmap
, icase
, ch
);
329 #ifdef RE_ENABLE_I18N
330 else if (type
== COMPLEX_BRACKET
)
333 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
334 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
335 || cset
->nranges
|| cset
->nchar_classes
)
338 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
340 /* In this case we want to catch the bytes which are
341 the first byte of any collation elements.
342 e.g. In da_DK, we want to catch 'a' since "aa"
343 is a valid collation element, and don't catch
344 'b' since 'b' is the only collation element
345 which starts from 'b'. */
347 const int32_t *table
= (const int32_t *)
348 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
349 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
350 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
352 re_set_fastmap (fastmap
, icase
, ch
);
356 for (i
= 0; i
< SBC_MAX
; ++i
)
357 if (__btowc (i
) == WEOF
)
358 re_set_fastmap (fastmap
, icase
, i
);
359 # endif /* not _LIBC */
361 for (i
= 0; i
< cset
->nmbchars
; ++i
)
365 memset (&state
, '\0', sizeof (state
));
366 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
367 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
370 #endif /* RE_ENABLE_I18N */
371 else if (type
== END_OF_RE
|| type
== OP_PERIOD
)
373 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
374 if (type
== END_OF_RE
)
375 bufp
->can_be_null
= 1;
381 /* Entry point for POSIX code. */
382 /* regcomp takes a regular expression as a string and compiles it.
384 PREG is a regex_t *. We do not expect any fields to be initialized,
385 since POSIX says we shouldn't. Thus, we set
387 `buffer' to the compiled pattern;
388 `used' to the length of the compiled pattern;
389 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
390 REG_EXTENDED bit in CFLAGS is set; otherwise, to
391 RE_SYNTAX_POSIX_BASIC;
392 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
393 `fastmap' to an allocated space for the fastmap;
394 `fastmap_accurate' to zero;
395 `re_nsub' to the number of subexpressions in PATTERN.
397 PATTERN is the address of the pattern string.
399 CFLAGS is a series of bits which affect compilation.
401 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
402 use POSIX basic syntax.
404 If REG_NEWLINE is set, then . and [^...] don't match newline.
405 Also, regexec will try a match beginning after every newline.
407 If REG_ICASE is set, then we considers upper- and lowercase
408 versions of letters to be equivalent when matching.
410 If REG_NOSUB is set, then when PREG is passed to regexec, that
411 routine will report only success or failure, and nothing about the
414 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
415 the return codes and their meanings.) */
418 regcomp (preg
, pattern
, cflags
)
419 regex_t
*__restrict preg
;
420 const char *__restrict pattern
;
424 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
425 : RE_SYNTAX_POSIX_BASIC
);
431 /* Try to allocate space for the fastmap. */
432 preg
->fastmap
= re_malloc (char, SBC_MAX
);
433 if (BE (preg
->fastmap
== NULL
, 0))
436 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
438 /* If REG_NEWLINE is set, newlines are treated differently. */
439 if (cflags
& REG_NEWLINE
)
440 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
441 syntax
&= ~RE_DOT_NEWLINE
;
442 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
443 /* It also changes the matching behavior. */
444 preg
->newline_anchor
= 1;
447 preg
->newline_anchor
= 0;
448 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
449 preg
->translate
= NULL
;
451 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
453 /* POSIX doesn't distinguish between an unmatched open-group and an
454 unmatched close-group: both are REG_EPAREN. */
455 if (ret
== REG_ERPAREN
)
458 /* We have already checked preg->fastmap != NULL. */
459 if (BE (ret
== REG_NOERROR
, 1))
460 /* Compute the fastmap now, since regexec cannot modify the pattern
461 buffer. This function nevers fails in this implementation. */
462 (void) re_compile_fastmap (preg
);
465 /* Some error occurred while compiling the expression. */
466 re_free (preg
->fastmap
);
467 preg
->fastmap
= NULL
;
473 weak_alias (__regcomp
, regcomp
)
476 /* Returns a message corresponding to an error code, ERRCODE, returned
477 from either regcomp or regexec. We don't use PREG here. */
480 regerror (errcode
, preg
, errbuf
, errbuf_size
)
490 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
491 / sizeof (__re_error_msgid_idx
[0])), 0))
492 /* Only error codes returned by the rest of the code should be passed
493 to this routine. If we are given anything else, or if other regex
494 code generates an invalid error code, then the program has a bug.
495 Dump core so we can fix it. */
498 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
500 msg_size
= strlen (msg
) + 1; /* Includes the null. */
502 if (BE (errbuf_size
!= 0, 1))
504 if (BE (msg_size
> errbuf_size
, 0))
506 #if defined HAVE_MEMPCPY || defined _LIBC
507 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
509 memcpy (errbuf
, msg
, errbuf_size
- 1);
510 errbuf
[errbuf_size
- 1] = 0;
514 memcpy (errbuf
, msg
, msg_size
);
520 weak_alias (__regerror
, regerror
)
525 free_dfa_content (re_dfa_t
*dfa
)
529 re_free (dfa
->subexps
);
531 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
533 re_token_t
*node
= dfa
->nodes
+ i
;
534 #ifdef RE_ENABLE_I18N
535 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
536 free_charset (node
->opr
.mbcset
);
538 #endif /* RE_ENABLE_I18N */
539 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
540 re_free (node
->opr
.sbcset
);
542 re_free (dfa
->nexts
);
543 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
545 if (dfa
->eclosures
!= NULL
)
546 re_node_set_free (dfa
->eclosures
+ i
);
547 if (dfa
->inveclosures
!= NULL
)
548 re_node_set_free (dfa
->inveclosures
+ i
);
549 if (dfa
->edests
!= NULL
)
550 re_node_set_free (dfa
->edests
+ i
);
552 re_free (dfa
->edests
);
553 re_free (dfa
->eclosures
);
554 re_free (dfa
->inveclosures
);
555 re_free (dfa
->nodes
);
557 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
559 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
560 for (j
= 0; j
< entry
->num
; ++j
)
562 re_dfastate_t
*state
= entry
->array
[j
];
565 re_free (entry
->array
);
567 re_free (dfa
->state_table
);
569 if (dfa
->word_char
!= NULL
)
570 re_free (dfa
->word_char
);
572 re_free (dfa
->re_str
);
579 /* Free dynamically allocated space used by PREG. */
585 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
586 if (BE (dfa
!= NULL
, 1))
587 free_dfa_content (dfa
);
589 re_free (preg
->fastmap
);
592 weak_alias (__regfree
, regfree
)
595 /* Entry points compatible with 4.2 BSD regex library. We don't define
596 them unless specifically requested. */
598 #if defined _REGEX_RE_COMP || defined _LIBC
600 /* BSD has one and only one pattern buffer. */
601 static struct re_pattern_buffer re_comp_buf
;
605 /* Make these definitions weak in libc, so POSIX programs can redefine
606 these names if they don't use our functions, and still use
607 regcomp/regexec above without link errors. */
618 if (!re_comp_buf
.buffer
)
619 return gettext ("No previous regular expression");
623 if (re_comp_buf
.buffer
)
625 fastmap
= re_comp_buf
.fastmap
;
626 re_comp_buf
.fastmap
= NULL
;
627 __regfree (&re_comp_buf
);
628 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
629 re_comp_buf
.fastmap
= fastmap
;
632 if (re_comp_buf
.fastmap
== NULL
)
634 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
635 if (re_comp_buf
.fastmap
== NULL
)
636 return (char *) gettext (__re_error_msgid
637 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
640 /* Since `re_exec' always passes NULL for the `regs' argument, we
641 don't need to initialize the pattern buffer fields which affect it. */
643 /* Match anchors at newlines. */
644 re_comp_buf
.newline_anchor
= 1;
646 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
651 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
652 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
656 libc_freeres_fn (free_mem
)
658 __regfree (&re_comp_buf
);
662 #endif /* _REGEX_RE_COMP */
664 /* Internal entry point.
665 Compile the regular expression PATTERN, whose length is LENGTH.
666 SYNTAX indicate regular expression's syntax. */
669 re_compile_internal (preg
, pattern
, length
, syntax
)
671 const char * pattern
;
675 reg_errcode_t err
= REG_NOERROR
;
679 /* Initialize the pattern buffer. */
680 preg
->fastmap_accurate
= 0;
681 preg
->syntax
= syntax
;
682 preg
->not_bol
= preg
->not_eol
= 0;
685 preg
->can_be_null
= 0;
686 preg
->regs_allocated
= REGS_UNALLOCATED
;
688 /* Initialize the dfa. */
689 dfa
= (re_dfa_t
*) preg
->buffer
;
690 if (preg
->allocated
< sizeof (re_dfa_t
))
692 /* If zero allocated, but buffer is non-null, try to realloc
693 enough space. This loses if buffer's address is bogus, but
694 that is the user's responsibility. If ->buffer is NULL this
695 is a simple allocation. */
696 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
699 preg
->allocated
= sizeof (re_dfa_t
);
701 preg
->buffer
= (unsigned char *) dfa
;
702 preg
->used
= sizeof (re_dfa_t
);
704 err
= init_dfa (dfa
, length
);
705 if (BE (err
!= REG_NOERROR
, 0))
713 dfa
->re_str
= re_malloc (char, length
+ 1);
714 strncpy (dfa
->re_str
, pattern
, length
+ 1);
717 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
719 if (BE (err
!= REG_NOERROR
, 0))
727 /* Parse the regular expression, and build a structure tree. */
729 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
730 if (BE (dfa
->str_tree
== NULL
, 0))
731 goto re_compile_internal_free_return
;
733 /* Analyze the tree and collect information which is necessary to
736 if (BE (err
!= REG_NOERROR
, 0))
737 goto re_compile_internal_free_return
;
739 /* Then create the initial state of the dfa. */
740 err
= create_initial_state (dfa
);
742 /* Release work areas. */
743 free_workarea_compile (preg
);
744 re_string_destruct (®exp
);
746 if (BE (err
!= REG_NOERROR
, 0))
748 re_compile_internal_free_return
:
749 free_dfa_content (dfa
);
757 /* Initialize DFA. We use the length of the regular expression PAT_LEN
758 as the initial length of some arrays. */
761 init_dfa (dfa
, pat_len
)
767 memset (dfa
, '\0', sizeof (re_dfa_t
));
769 dfa
->nodes_alloc
= pat_len
+ 1;
770 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
772 dfa
->states_alloc
= pat_len
+ 1;
774 /* table_size = 2 ^ ceil(log pat_len) */
775 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
776 if (table_size
> pat_len
)
779 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
780 dfa
->state_hash_mask
= table_size
- 1;
782 dfa
->subexps_alloc
= 1;
783 dfa
->subexps
= re_malloc (re_subexp_t
, dfa
->subexps_alloc
);
784 dfa
->word_char
= NULL
;
786 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
787 || dfa
->subexps
== NULL
, 0))
789 /* We don't bother to free anything which was allocated. Very
790 soon the process will go down anyway. */
792 dfa
->state_table
= NULL
;
799 /* Initialize WORD_CHAR table, which indicate which character is
800 "word". In this case "word" means that it is the word construction
801 character used by some operators like "\<", "\>", etc. */
808 dfa
->word_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
809 if (BE (dfa
->word_char
== NULL
, 0))
811 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
812 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
813 if (isalnum (ch
) || ch
== '_')
814 dfa
->word_char
[i
] |= 1 << j
;
818 /* Free the work area which are only used while compiling. */
821 free_workarea_compile (preg
)
824 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
825 free_bin_tree (dfa
->str_tree
);
826 dfa
->str_tree
= NULL
;
827 re_free (dfa
->org_indices
);
828 dfa
->org_indices
= NULL
;
831 /* Create initial states for all contexts. */
834 create_initial_state (dfa
)
839 re_node_set init_nodes
;
841 /* Initial states have the epsilon closure of the node which is
842 the first node of the regular expression. */
843 first
= dfa
->str_tree
->first
;
844 dfa
->init_node
= first
;
845 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
846 if (BE (err
!= REG_NOERROR
, 0))
849 /* The back-references which are in initial states can epsilon transit,
850 since in this case all of the subexpressions can be null.
851 Then we add epsilon closures of the nodes which are the next nodes of
852 the back-references. */
853 if (dfa
->nbackref
> 0)
854 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
856 int node_idx
= init_nodes
.elems
[i
];
857 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
860 if (type
!= OP_BACK_REF
)
862 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
864 re_token_t
*clexp_node
;
865 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
866 if (clexp_node
->type
== OP_CLOSE_SUBEXP
867 && clexp_node
->opr
.idx
+ 1 == dfa
->nodes
[node_idx
].opr
.idx
)
870 if (clexp_idx
== init_nodes
.nelem
)
873 if (type
== OP_BACK_REF
)
875 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
876 if (!re_node_set_contains (&init_nodes
, dest_idx
))
878 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
884 /* It must be the first time to invoke acquire_state. */
885 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
886 /* We don't check ERR here, since the initial state must not be NULL. */
887 if (BE (dfa
->init_state
== NULL
, 0))
889 if (dfa
->init_state
->has_constraint
)
891 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
893 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
895 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
899 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
900 || dfa
->init_state_begbuf
== NULL
, 0))
904 dfa
->init_state_word
= dfa
->init_state_nl
905 = dfa
->init_state_begbuf
= dfa
->init_state
;
907 re_node_set_free (&init_nodes
);
911 /* Analyze the structure tree, and calculate "first", "next", "edest",
912 "eclosure", and "inveclosure". */
921 /* Allocate arrays. */
922 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
923 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
924 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
925 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
926 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
927 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
928 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
930 /* Initialize them. */
931 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
934 re_node_set_init_empty (dfa
->edests
+ i
);
935 re_node_set_init_empty (dfa
->eclosures
+ i
);
936 re_node_set_init_empty (dfa
->inveclosures
+ i
);
939 ret
= analyze_tree (dfa
, dfa
->str_tree
);
940 if (BE (ret
== REG_NOERROR
, 1))
942 ret
= calc_eclosure (dfa
);
943 if (ret
== REG_NOERROR
)
944 calc_inveclosure (dfa
);
949 /* Helper functions for analyze.
950 This function calculate "first", "next", and "edest" for the subtree
951 whose root is NODE. */
954 analyze_tree (dfa
, node
)
959 if (node
->first
== -1)
960 calc_first (dfa
, node
);
961 if (node
->next
== -1)
962 calc_next (dfa
, node
);
963 if (node
->eclosure
.nelem
== 0)
964 calc_epsdest (dfa
, node
);
965 /* Calculate "first" etc. for the left child. */
966 if (node
->left
!= NULL
)
968 ret
= analyze_tree (dfa
, node
->left
);
969 if (BE (ret
!= REG_NOERROR
, 0))
972 /* Calculate "first" etc. for the right child. */
973 if (node
->right
!= NULL
)
975 ret
= analyze_tree (dfa
, node
->right
);
976 if (BE (ret
!= REG_NOERROR
, 0))
982 /* Calculate "first" for the node NODE. */
984 calc_first (dfa
, node
)
989 idx
= node
->node_idx
;
990 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
995 case OP_OPEN_BRACKET
:
996 case OP_CLOSE_BRACKET
:
997 case OP_OPEN_DUP_NUM
:
998 case OP_CLOSE_DUP_NUM
:
999 case OP_NON_MATCH_LIST
:
1000 case OP_OPEN_COLL_ELEM
:
1001 case OP_CLOSE_COLL_ELEM
:
1002 case OP_OPEN_EQUIV_CLASS
:
1003 case OP_CLOSE_EQUIV_CLASS
:
1004 case OP_OPEN_CHAR_CLASS
:
1005 case OP_CLOSE_CHAR_CLASS
:
1006 /* These must not be appeared here. */
1012 case OP_DUP_ASTERISK
:
1013 case OP_DUP_QUESTION
:
1014 #ifdef RE_ENABLE_I18N
1015 case COMPLEX_BRACKET
:
1016 #endif /* RE_ENABLE_I18N */
1017 case SIMPLE_BRACKET
:
1020 case OP_OPEN_SUBEXP
:
1021 case OP_CLOSE_SUBEXP
:
1026 assert (node
->left
!= NULL
);
1028 if (node
->left
->first
== -1)
1029 calc_first (dfa
, node
->left
);
1030 node
->first
= node
->left
->first
;
1035 /* else fall through */
1038 assert (node
->left
!= NULL
);
1040 if (node
->left
->first
== -1)
1041 calc_first (dfa
, node
->left
);
1042 node
->first
= node
->left
->first
;
1047 /* Calculate "next" for the node NODE. */
1050 calc_next (dfa
, node
)
1055 bin_tree_t
*parent
= node
->parent
;
1059 idx
= node
->node_idx
;
1060 if (node
->type
== 0)
1061 dfa
->nexts
[idx
] = node
->next
;
1065 idx
= parent
->node_idx
;
1066 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1070 case OP_DUP_ASTERISK
:
1075 if (parent
->left
== node
)
1077 if (parent
->right
->first
== -1)
1078 calc_first (dfa
, parent
->right
);
1079 node
->next
= parent
->right
->first
;
1082 /* else fall through */
1084 if (parent
->next
== -1)
1085 calc_next (dfa
, parent
);
1086 node
->next
= parent
->next
;
1089 idx
= node
->node_idx
;
1090 if (node
->type
== 0)
1091 dfa
->nexts
[idx
] = node
->next
;
1094 /* Calculate "edest" for the node NODE. */
1097 calc_epsdest (dfa
, node
)
1102 idx
= node
->node_idx
;
1103 if (node
->type
== 0)
1105 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1106 || dfa
->nodes
[idx
].type
== OP_DUP_PLUS
1107 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1109 if (node
->left
->first
== -1)
1110 calc_first (dfa
, node
->left
);
1111 if (node
->next
== -1)
1112 calc_next (dfa
, node
);
1113 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1116 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1119 if (node
->left
!= NULL
)
1121 if (node
->left
->first
== -1)
1122 calc_first (dfa
, node
->left
);
1123 left
= node
->left
->first
;
1127 if (node
->next
== -1)
1128 calc_next (dfa
, node
);
1131 if (node
->right
!= NULL
)
1133 if (node
->right
->first
== -1)
1134 calc_first (dfa
, node
->right
);
1135 right
= node
->right
->first
;
1139 if (node
->next
== -1)
1140 calc_next (dfa
, node
);
1143 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1145 else if (dfa
->nodes
[idx
].type
== ANCHOR
1146 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1147 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
1148 || dfa
->nodes
[idx
].type
== OP_BACK_REF
)
1149 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1153 /* Duplicate the epsilon closure of the node ROOT_NODE.
1154 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1155 to their own constraint. */
1157 static reg_errcode_t
1158 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1161 int top_org_node
, top_clone_node
, root_node
;
1162 unsigned int init_constraint
;
1165 int org_node
, clone_node
, ret
;
1166 unsigned int constraint
= init_constraint
;
1167 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1169 int org_dest
, clone_dest
;
1170 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1172 /* If the back reference epsilon-transit, its destination must
1173 also have the constraint. Then duplicate the epsilon closure
1174 of the destination of the back reference, and store it in
1175 edests of the back reference. */
1176 org_dest
= dfa
->nexts
[org_node
];
1177 re_node_set_empty (dfa
->edests
+ clone_node
);
1178 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1179 if (BE (err
!= REG_NOERROR
, 0))
1181 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1182 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1183 if (BE (ret
< 0, 0))
1186 else if (dfa
->edests
[org_node
].nelem
== 0)
1188 /* In case of the node can't epsilon-transit, don't duplicate the
1189 destination and store the original destination as the
1190 destination of the node. */
1191 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1194 else if (dfa
->edests
[org_node
].nelem
== 1)
1196 /* In case of the node can epsilon-transit, and it has only one
1198 org_dest
= dfa
->edests
[org_node
].elems
[0];
1199 re_node_set_empty (dfa
->edests
+ clone_node
);
1200 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1202 /* In case of the node has another constraint, append it. */
1203 if (org_node
== root_node
&& clone_node
!= org_node
)
1205 /* ...but if the node is root_node itself, it means the
1206 epsilon closure have a loop, then tie it to the
1207 destination of the root_node. */
1208 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1210 if (BE (ret
< 0, 0))
1214 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1216 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1217 if (BE (err
!= REG_NOERROR
, 0))
1219 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1220 if (BE (ret
< 0, 0))
1223 else /* dfa->edests[org_node].nelem == 2 */
1225 /* In case of the node can epsilon-transit, and it has two
1226 destinations. E.g. '|', '*', '+', '?'. */
1227 org_dest
= dfa
->edests
[org_node
].elems
[0];
1228 re_node_set_empty (dfa
->edests
+ clone_node
);
1229 /* Search for a duplicated node which satisfies the constraint. */
1230 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1231 if (clone_dest
== -1)
1233 /* There are no such a duplicated node, create a new one. */
1234 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1235 if (BE (err
!= REG_NOERROR
, 0))
1237 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1238 if (BE (ret
< 0, 0))
1240 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1241 root_node
, constraint
);
1242 if (BE (err
!= REG_NOERROR
, 0))
1247 /* There are a duplicated node which satisfy the constraint,
1248 use it to avoid infinite loop. */
1249 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1250 if (BE (ret
< 0, 0))
1254 org_dest
= dfa
->edests
[org_node
].elems
[1];
1255 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1256 if (BE (err
!= REG_NOERROR
, 0))
1258 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1259 if (BE (ret
< 0, 0))
1262 org_node
= org_dest
;
1263 clone_node
= clone_dest
;
1268 /* Search for a node which is duplicated from the node ORG_NODE, and
1269 satisfies the constraint CONSTRAINT. */
1272 search_duplicated_node (dfa
, org_node
, constraint
)
1275 unsigned int constraint
;
1278 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1280 if (org_node
== dfa
->org_indices
[idx
]
1281 && constraint
== dfa
->nodes
[idx
].constraint
)
1282 return idx
; /* Found. */
1284 return -1; /* Not found. */
1287 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1288 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1289 otherwise return the error code. */
1291 static reg_errcode_t
1292 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1294 int *new_idx
, org_idx
;
1295 unsigned int constraint
;
1300 dup
= dfa
->nodes
[org_idx
];
1301 dup_idx
= re_dfa_add_node (dfa
, dup
, 1);
1302 if (BE (dup_idx
== -1, 0))
1304 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1305 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1306 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1307 dfa
->nodes
[dup_idx
].duplicated
= 1;
1308 re_node_set_init_empty (dfa
->edests
+ dup_idx
);
1309 re_node_set_init_empty (dfa
->eclosures
+ dup_idx
);
1310 re_node_set_init_empty (dfa
->inveclosures
+ dup_idx
);
1312 /* Store the index of the original node. */
1313 dfa
->org_indices
[dup_idx
] = org_idx
;
1319 calc_inveclosure (dfa
)
1323 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1325 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1327 dest
= dfa
->eclosures
[src
].elems
[idx
];
1328 re_node_set_insert (dfa
->inveclosures
+ dest
, src
);
1333 /* Calculate "eclosure" for all the node in DFA. */
1335 static reg_errcode_t
1339 int node_idx
, incomplete
;
1341 assert (dfa
->nodes_len
> 0);
1344 /* For each nodes, calculate epsilon closure. */
1345 for (node_idx
= 0; ; ++node_idx
)
1348 re_node_set eclosure_elem
;
1349 if (node_idx
== dfa
->nodes_len
)
1358 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1360 /* If we have already calculated, skip it. */
1361 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1363 /* Calculate epsilon closure of `node_idx'. */
1364 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1365 if (BE (err
!= REG_NOERROR
, 0))
1368 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1371 re_node_set_free (&eclosure_elem
);
1377 /* Calculate epsilon closure of NODE. */
1379 static reg_errcode_t
1380 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1381 re_node_set
*new_set
;
1386 unsigned int constraint
;
1388 re_node_set eclosure
;
1390 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1391 if (BE (err
!= REG_NOERROR
, 0))
1394 /* This indicates that we are calculating this node now.
1395 We reference this value to avoid infinite loop. */
1396 dfa
->eclosures
[node
].nelem
= -1;
1398 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1399 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1400 /* If the current node has constraints, duplicate all nodes.
1401 Since they must inherit the constraints. */
1402 if (constraint
&& !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1404 int org_node
, cur_node
;
1405 org_node
= cur_node
= node
;
1406 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1407 if (BE (err
!= REG_NOERROR
, 0))
1411 /* Expand each epsilon destination nodes. */
1412 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1413 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1415 re_node_set eclosure_elem
;
1416 int edest
= dfa
->edests
[node
].elems
[i
];
1417 /* If calculating the epsilon closure of `edest' is in progress,
1418 return intermediate result. */
1419 if (dfa
->eclosures
[edest
].nelem
== -1)
1424 /* If we haven't calculated the epsilon closure of `edest' yet,
1425 calculate now. Otherwise use calculated epsilon closure. */
1426 if (dfa
->eclosures
[edest
].nelem
== 0)
1428 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1429 if (BE (err
!= REG_NOERROR
, 0))
1433 eclosure_elem
= dfa
->eclosures
[edest
];
1434 /* Merge the epsilon closure of `edest'. */
1435 re_node_set_merge (&eclosure
, &eclosure_elem
);
1436 /* If the epsilon closure of `edest' is incomplete,
1437 the epsilon closure of this node is also incomplete. */
1438 if (dfa
->eclosures
[edest
].nelem
== 0)
1441 re_node_set_free (&eclosure_elem
);
1445 /* Epsilon closures include itself. */
1446 re_node_set_insert (&eclosure
, node
);
1447 if (incomplete
&& !root
)
1448 dfa
->eclosures
[node
].nelem
= 0;
1450 dfa
->eclosures
[node
] = eclosure
;
1451 *new_set
= eclosure
;
1455 /* Functions for token which are used in the parser. */
1457 /* Fetch a token from INPUT.
1458 We must not use this function inside bracket expressions. */
1461 fetch_token (input
, syntax
)
1463 reg_syntax_t syntax
;
1467 consumed_byte
= peek_token (&token
, input
, syntax
);
1468 re_string_skip_bytes (input
, consumed_byte
);
1472 /* Peek a token from INPUT, and return the length of the token.
1473 We must not use this function inside bracket expressions. */
1476 peek_token (token
, input
, syntax
)
1479 reg_syntax_t syntax
;
1483 if (re_string_eoi (input
))
1485 token
->type
= END_OF_RE
;
1489 c
= re_string_peek_byte (input
, 0);
1492 #ifdef RE_ENABLE_I18N
1493 token
->mb_partial
= 0;
1494 if (MB_CUR_MAX
> 1 &&
1495 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1497 token
->type
= CHARACTER
;
1498 token
->mb_partial
= 1;
1505 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1507 token
->type
= BACK_SLASH
;
1511 c2
= re_string_peek_byte_case (input
, 1);
1513 token
->type
= CHARACTER
;
1517 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1518 token
->type
= OP_ALT
;
1520 case '1': case '2': case '3': case '4': case '5':
1521 case '6': case '7': case '8': case '9':
1522 if (!(syntax
& RE_NO_BK_REFS
))
1524 token
->type
= OP_BACK_REF
;
1525 token
->opr
.idx
= c2
- '0';
1529 if (!(syntax
& RE_NO_GNU_OPS
))
1531 token
->type
= ANCHOR
;
1532 token
->opr
.idx
= WORD_FIRST
;
1536 if (!(syntax
& RE_NO_GNU_OPS
))
1538 token
->type
= ANCHOR
;
1539 token
->opr
.idx
= WORD_LAST
;
1543 if (!(syntax
& RE_NO_GNU_OPS
))
1545 token
->type
= ANCHOR
;
1546 token
->opr
.idx
= WORD_DELIM
;
1550 if (!(syntax
& RE_NO_GNU_OPS
))
1552 token
->type
= ANCHOR
;
1553 token
->opr
.idx
= INSIDE_WORD
;
1557 if (!(syntax
& RE_NO_GNU_OPS
))
1558 token
->type
= OP_WORD
;
1561 if (!(syntax
& RE_NO_GNU_OPS
))
1562 token
->type
= OP_NOTWORD
;
1565 if (!(syntax
& RE_NO_GNU_OPS
))
1567 token
->type
= ANCHOR
;
1568 token
->opr
.idx
= BUF_FIRST
;
1572 if (!(syntax
& RE_NO_GNU_OPS
))
1574 token
->type
= ANCHOR
;
1575 token
->opr
.idx
= BUF_LAST
;
1579 if (!(syntax
& RE_NO_BK_PARENS
))
1580 token
->type
= OP_OPEN_SUBEXP
;
1583 if (!(syntax
& RE_NO_BK_PARENS
))
1584 token
->type
= OP_CLOSE_SUBEXP
;
1587 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1588 token
->type
= OP_DUP_PLUS
;
1591 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1592 token
->type
= OP_DUP_QUESTION
;
1595 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1596 token
->type
= OP_OPEN_DUP_NUM
;
1599 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1600 token
->type
= OP_CLOSE_DUP_NUM
;
1608 token
->type
= CHARACTER
;
1612 if (syntax
& RE_NEWLINE_ALT
)
1613 token
->type
= OP_ALT
;
1616 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1617 token
->type
= OP_ALT
;
1620 token
->type
= OP_DUP_ASTERISK
;
1623 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1624 token
->type
= OP_DUP_PLUS
;
1627 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1628 token
->type
= OP_DUP_QUESTION
;
1631 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1632 token
->type
= OP_OPEN_DUP_NUM
;
1635 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1636 token
->type
= OP_CLOSE_DUP_NUM
;
1639 if (syntax
& RE_NO_BK_PARENS
)
1640 token
->type
= OP_OPEN_SUBEXP
;
1643 if (syntax
& RE_NO_BK_PARENS
)
1644 token
->type
= OP_CLOSE_SUBEXP
;
1647 token
->type
= OP_OPEN_BRACKET
;
1650 token
->type
= OP_PERIOD
;
1653 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1654 re_string_cur_idx (input
) != 0)
1656 char prev
= re_string_peek_byte (input
, -1);
1657 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1660 token
->type
= ANCHOR
;
1661 token
->opr
.idx
= LINE_FIRST
;
1664 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1665 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1668 re_string_skip_bytes (input
, 1);
1669 peek_token (&next
, input
, syntax
);
1670 re_string_skip_bytes (input
, -1);
1671 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1674 token
->type
= ANCHOR
;
1675 token
->opr
.idx
= LINE_LAST
;
1683 /* Peek a token from INPUT, and return the length of the token.
1684 We must not use this function out of bracket expressions. */
1687 peek_token_bracket (token
, input
, syntax
)
1690 reg_syntax_t syntax
;
1693 if (re_string_eoi (input
))
1695 token
->type
= END_OF_RE
;
1698 c
= re_string_peek_byte (input
, 0);
1701 #ifdef RE_ENABLE_I18N
1702 if (MB_CUR_MAX
> 1 &&
1703 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1705 token
->type
= CHARACTER
;
1708 #endif /* RE_ENABLE_I18N */
1710 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
1712 /* In this case, '\' escape a character. */
1714 re_string_skip_bytes (input
, 1);
1715 c2
= re_string_peek_byte (input
, 0);
1717 token
->type
= CHARACTER
;
1720 if (c
== '[') /* '[' is a special char in a bracket exps. */
1724 c2
= re_string_peek_byte (input
, 1);
1730 token
->type
= OP_OPEN_COLL_ELEM
;
1733 token
->type
= OP_OPEN_EQUIV_CLASS
;
1736 if (syntax
& RE_CHAR_CLASSES
)
1738 token
->type
= OP_OPEN_CHAR_CLASS
;
1741 /* else fall through. */
1743 token
->type
= CHARACTER
;
1753 token
->type
= OP_CHARSET_RANGE
;
1756 token
->type
= OP_CLOSE_BRACKET
;
1759 token
->type
= OP_NON_MATCH_LIST
;
1762 token
->type
= CHARACTER
;
1767 /* Functions for parser. */
1769 /* Entry point of the parser.
1770 Parse the regular expression REGEXP and return the structure tree.
1771 If an error is occured, ERR is set by error code, and return NULL.
1772 This function build the following tree, from regular expression <reg_exp>:
1778 CAT means concatenation.
1779 EOR means end of regular expression. */
1782 parse (regexp
, preg
, syntax
, err
)
1783 re_string_t
*regexp
;
1785 reg_syntax_t syntax
;
1788 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1789 bin_tree_t
*tree
, *eor
, *root
;
1790 re_token_t current_token
;
1792 current_token
= fetch_token (regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
1793 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
1794 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1796 new_idx
= re_dfa_add_node (dfa
, current_token
, 0);
1797 eor
= create_tree (NULL
, NULL
, 0, new_idx
);
1799 root
= create_tree (tree
, eor
, CONCAT
, 0);
1802 if (BE (new_idx
== -1 || eor
== NULL
|| root
== NULL
, 0))
1810 /* This function build the following tree, from regular expression
1811 <branch1>|<branch2>:
1817 ALT means alternative, which represents the operator `|'. */
1820 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
1821 re_string_t
*regexp
;
1824 reg_syntax_t syntax
;
1828 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1829 bin_tree_t
*tree
, *branch
= NULL
;
1831 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1832 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1835 while (token
->type
== OP_ALT
)
1837 re_token_t alt_token
= *token
;
1838 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
1839 *token
= fetch_token (regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
1840 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1841 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1843 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1844 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
1846 free_bin_tree (tree
);
1852 tree
= create_tree (tree
, branch
, 0, new_idx
);
1853 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1858 dfa
->has_plural_match
= 1;
1863 /* This function build the following tree, from regular expression
1870 CAT means concatenation. */
1873 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
1874 re_string_t
*regexp
;
1877 reg_syntax_t syntax
;
1881 bin_tree_t
*tree
, *exp
;
1882 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1883 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1886 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1887 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1889 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
1890 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
1892 free_bin_tree (tree
);
1895 if (tree
!= NULL
&& exp
!= NULL
)
1897 tree
= create_tree (tree
, exp
, CONCAT
, 0);
1904 else if (tree
== NULL
)
1906 /* Otherwise exp == NULL, we don't need to create new tree. */
1911 /* This function build the following tree, from regular expression a*:
1918 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
1919 re_string_t
*regexp
;
1922 reg_syntax_t syntax
;
1926 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1929 switch (token
->type
)
1932 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1933 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1934 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1939 #ifdef RE_ENABLE_I18N
1942 while (!re_string_eoi (regexp
)
1943 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
1945 bin_tree_t
*mbc_remain
;
1946 *token
= fetch_token (regexp
, syntax
);
1947 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1948 mbc_remain
= create_tree (NULL
, NULL
, 0, new_idx
);
1949 tree
= create_tree (tree
, mbc_remain
, CONCAT
, 0);
1950 if (BE (new_idx
== -1 || mbc_remain
== NULL
|| tree
== NULL
, 0))
1959 case OP_OPEN_SUBEXP
:
1960 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
1961 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1964 case OP_OPEN_BRACKET
:
1965 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
1966 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1970 if (BE (preg
->re_nsub
< token
->opr
.idx
1971 || dfa
->subexps
[token
->opr
.idx
- 1].end
== -1, 0))
1976 dfa
->used_bkref_map
|= 1 << (token
->opr
.idx
- 1);
1977 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
1978 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
1979 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1985 dfa
->has_mb_node
= 1;
1987 case OP_DUP_ASTERISK
:
1989 case OP_DUP_QUESTION
:
1990 case OP_OPEN_DUP_NUM
:
1991 if (syntax
& RE_CONTEXT_INVALID_OPS
)
1996 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
1998 *token
= fetch_token (regexp
, syntax
);
1999 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2001 /* else fall through */
2002 case OP_CLOSE_SUBEXP
:
2003 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2004 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2009 /* else fall through */
2010 case OP_CLOSE_DUP_NUM
:
2011 /* We treat it as a normal character. */
2013 /* Then we can these characters as normal characters. */
2014 token
->type
= CHARACTER
;
2015 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2016 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2017 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2024 if (dfa
->word_char
== NULL
)
2026 *err
= init_word_char (dfa
);
2027 if (BE (*err
!= REG_NOERROR
, 0))
2030 if (token
->opr
.ctx_type
== WORD_DELIM
)
2032 bin_tree_t
*tree_first
, *tree_last
;
2033 int idx_first
, idx_last
;
2034 token
->opr
.ctx_type
= WORD_FIRST
;
2035 idx_first
= re_dfa_add_node (dfa
, *token
, 0);
2036 tree_first
= create_tree (NULL
, NULL
, 0, idx_first
);
2037 token
->opr
.ctx_type
= WORD_LAST
;
2038 idx_last
= re_dfa_add_node (dfa
, *token
, 0);
2039 tree_last
= create_tree (NULL
, NULL
, 0, idx_last
);
2040 token
->type
= OP_ALT
;
2041 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2042 tree
= create_tree (tree_first
, tree_last
, 0, new_idx
);
2043 if (BE (idx_first
== -1 || idx_last
== -1 || new_idx
== -1
2044 || tree_first
== NULL
|| tree_last
== NULL
2045 || tree
== NULL
, 0))
2053 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2054 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2055 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2061 /* We must return here, since ANCHORs can't be followed
2062 by repetition operators.
2063 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2064 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2065 *token
= fetch_token (regexp
, syntax
);
2068 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2069 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2070 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2076 dfa
->has_mb_node
= 1;
2079 tree
= build_word_op (dfa
, regexp
->trans
, 0, err
);
2080 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2084 tree
= build_word_op (dfa
, regexp
->trans
, 1, err
);
2085 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2095 /* Must not happen? */
2101 *token
= fetch_token (regexp
, syntax
);
2103 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2104 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2106 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2107 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2109 dfa
->has_plural_match
= 1;
2115 /* This function build the following tree, from regular expression
2123 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2124 re_string_t
*regexp
;
2127 reg_syntax_t syntax
;
2131 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2132 bin_tree_t
*tree
, *left_par
, *right_par
;
2135 cur_nsub
= preg
->re_nsub
++;
2136 if (dfa
->subexps_alloc
< preg
->re_nsub
)
2138 re_subexp_t
*new_array
;
2139 dfa
->subexps_alloc
*= 2;
2140 new_array
= re_realloc (dfa
->subexps
, re_subexp_t
, dfa
->subexps_alloc
);
2141 if (BE (new_array
== NULL
, 0))
2143 dfa
->subexps_alloc
/= 2;
2147 dfa
->subexps
= new_array
;
2149 dfa
->subexps
[cur_nsub
].start
= dfa
->nodes_len
;
2150 dfa
->subexps
[cur_nsub
].end
= -1;
2152 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2153 left_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2154 if (BE (new_idx
== -1 || left_par
== NULL
, 0))
2159 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2160 *token
= fetch_token (regexp
, syntax
);
2162 /* The subexpression may be a null string. */
2163 if (token
->type
== OP_CLOSE_SUBEXP
)
2167 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2168 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2171 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2173 free_bin_tree (tree
);
2177 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2178 dfa
->subexps
[cur_nsub
].end
= dfa
->nodes_len
;
2179 right_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2180 tree
= ((tree
== NULL
) ? right_par
2181 : create_tree (tree
, right_par
, CONCAT
, 0));
2182 tree
= create_tree (left_par
, tree
, CONCAT
, 0);
2183 if (BE (new_idx
== -1 || right_par
== NULL
|| tree
== NULL
, 0))
2188 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2193 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2196 parse_dup_op (dup_elem
, regexp
, dfa
, token
, syntax
, err
)
2197 bin_tree_t
*dup_elem
;
2198 re_string_t
*regexp
;
2201 reg_syntax_t syntax
;
2204 re_token_t dup_token
;
2205 bin_tree_t
*tree
= dup_elem
, *work_tree
;
2206 int new_idx
, start_idx
= re_string_cur_idx (regexp
);
2207 re_token_t start_token
= *token
;
2208 if (token
->type
== OP_OPEN_DUP_NUM
)
2212 int start
= fetch_number (regexp
, token
, syntax
);
2216 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2217 start
= 0; /* We treat "{,m}" as "{0,m}". */
2220 *err
= REG_BADBR
; /* <re>{} is invalid. */
2224 if (BE (start
!= -2, 1))
2226 /* We treat "{n}" as "{n,n}". */
2227 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2228 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2229 ? fetch_number (regexp
, token
, syntax
) : -2));
2231 if (BE (start
== -2 || end
== -2, 0))
2233 /* Invalid sequence. */
2234 if (token
->type
== OP_CLOSE_DUP_NUM
)
2235 goto parse_dup_op_invalid_interval
;
2237 goto parse_dup_op_ebrace
;
2239 if (BE (start
== 0 && end
== 0, 0))
2241 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2242 *token
= fetch_token (regexp
, syntax
);
2243 free_bin_tree (dup_elem
);
2247 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2249 for (i
= 0; i
< start
; ++i
)
2252 work_tree
= duplicate_tree (elem
, dfa
);
2253 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2254 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2255 goto parse_dup_op_espace
;
2260 /* We treat "<re>{0,}" as "<re>*". */
2261 dup_token
.type
= OP_DUP_ASTERISK
;
2264 elem
= duplicate_tree (elem
, dfa
);
2265 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2266 work_tree
= create_tree (elem
, NULL
, 0, new_idx
);
2267 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2268 if (BE (elem
== NULL
|| new_idx
== -1 || work_tree
== NULL
2269 || tree
== NULL
, 0))
2270 goto parse_dup_op_espace
;
2274 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2275 tree
= create_tree (elem
, NULL
, 0, new_idx
);
2276 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2277 goto parse_dup_op_espace
;
2280 else if (end
- start
> 0)
2282 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2283 dup_token
.type
= OP_DUP_QUESTION
;
2286 elem
= duplicate_tree (elem
, dfa
);
2287 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2288 elem
= create_tree (elem
, NULL
, 0, new_idx
);
2289 tree
= create_tree (tree
, elem
, CONCAT
, 0);
2290 if (BE (elem
== NULL
|| new_idx
== -1 || tree
== NULL
, 0))
2291 goto parse_dup_op_espace
;
2295 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2296 tree
= elem
= create_tree (elem
, NULL
, 0, new_idx
);
2297 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2298 goto parse_dup_op_espace
;
2300 for (i
= 1; i
< end
- start
; ++i
)
2302 work_tree
= duplicate_tree (elem
, dfa
);
2303 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2304 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2314 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2315 tree
= create_tree (tree
, NULL
, 0, new_idx
);
2316 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2322 *token
= fetch_token (regexp
, syntax
);
2325 parse_dup_op_espace
:
2326 free_bin_tree (tree
);
2330 parse_dup_op_ebrace
:
2331 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2336 goto parse_dup_op_rollback
;
2337 parse_dup_op_invalid_interval
:
2338 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2343 parse_dup_op_rollback
:
2344 re_string_set_index (regexp
, start_idx
);
2345 *token
= start_token
;
2346 token
->type
= CHARACTER
;
2350 /* Size of the names for collating symbol/equivalence_class/character_class.
2351 I'm not sure, but maybe enough. */
2352 #define BRACKET_NAME_BUF_SIZE 32
2355 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2356 Build the range expression which starts from START_ELEM, and ends
2357 at END_ELEM. The result are written to MBCSET and SBCSET.
2358 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2359 mbcset->range_ends, is a pointer argument sinse we may
2362 static reg_errcode_t
2363 # ifdef RE_ENABLE_I18N
2364 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2365 re_charset_t
*mbcset
;
2367 # else /* not RE_ENABLE_I18N */
2368 build_range_exp (sbcset
, start_elem
, end_elem
)
2369 # endif /* not RE_ENABLE_I18N */
2370 re_bitset_ptr_t sbcset
;
2371 bracket_elem_t
*start_elem
, *end_elem
;
2373 unsigned int start_ch
, end_ch
;
2374 /* Equivalence Classes and Character Classes can't be a range start/end. */
2375 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2376 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2380 /* We can handle no multi character collating elements without libc
2382 if (BE ((start_elem
->type
== COLL_SYM
2383 && strlen ((char *) start_elem
->opr
.name
) > 1)
2384 || (end_elem
->type
== COLL_SYM
2385 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2386 return REG_ECOLLATE
;
2388 # ifdef RE_ENABLE_I18N
2390 wchar_t wc
, start_wc
, end_wc
;
2391 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2393 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2394 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2396 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2397 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2399 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2400 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2401 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2402 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2403 cmp_buf
[0] = start_wc
;
2404 cmp_buf
[4] = end_wc
;
2405 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2408 /* Check the space of the arrays. */
2409 if (*range_alloc
== mbcset
->nranges
)
2411 /* There are not enough space, need realloc. */
2412 wchar_t *new_array_start
, *new_array_end
;
2415 /* +1 in case of mbcset->nranges is 0. */
2416 new_nranges
= 2 * mbcset
->nranges
+ 1;
2417 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2418 are NULL if *range_alloc == 0. */
2419 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2421 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2424 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2427 mbcset
->range_starts
= new_array_start
;
2428 mbcset
->range_ends
= new_array_end
;
2429 *range_alloc
= new_nranges
;
2432 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2433 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2435 /* Build the table for single byte characters. */
2436 for (wc
= 0; wc
<= SBC_MAX
; ++wc
)
2439 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2440 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2441 bitset_set (sbcset
, wc
);
2444 # else /* not RE_ENABLE_I18N */
2447 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2448 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2450 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2451 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2453 if (start_ch
> end_ch
)
2455 /* Build the table for single byte characters. */
2456 for (ch
= 0; ch
<= SBC_MAX
; ++ch
)
2457 if (start_ch
<= ch
&& ch
<= end_ch
)
2458 bitset_set (sbcset
, ch
);
2460 # endif /* not RE_ENABLE_I18N */
2463 #endif /* not _LIBC */
2466 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2467 Build the collating element which is represented by NAME.
2468 The result are written to MBCSET and SBCSET.
2469 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2470 pointer argument since we may update it. */
2472 static reg_errcode_t
2473 # ifdef RE_ENABLE_I18N
2474 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2475 re_charset_t
*mbcset
;
2476 int *coll_sym_alloc
;
2477 # else /* not RE_ENABLE_I18N */
2478 build_collating_symbol (sbcset
, name
)
2479 # endif /* not RE_ENABLE_I18N */
2480 re_bitset_ptr_t sbcset
;
2481 const unsigned char *name
;
2483 size_t name_len
= strlen ((const char *) name
);
2484 if (BE (name_len
!= 1, 0))
2485 return REG_ECOLLATE
;
2488 bitset_set (sbcset
, name
[0]);
2492 #endif /* not _LIBC */
2494 /* This function parse bracket expression like "[abc]", "[a-c]",
2498 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2499 re_string_t
*regexp
;
2502 reg_syntax_t syntax
;
2506 const unsigned char *collseqmb
;
2507 const char *collseqwc
;
2510 const int32_t *symb_table
;
2511 const unsigned char *extra
;
2513 /* Local function for parse_bracket_exp used in _LIBC environement.
2514 Seek the collating symbol entry correspondings to NAME.
2515 Return the index of the symbol in the SYMB_TABLE. */
2517 static inline int32_t
2518 __attribute ((always_inline
))
2519 seek_collating_symbol_entry (name
, name_len
)
2520 const unsigned char *name
;
2523 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2524 int32_t elem
= hash
% table_size
;
2525 int32_t second
= hash
% (table_size
- 2);
2526 while (symb_table
[2 * elem
] != 0)
2528 /* First compare the hashing value. */
2529 if (symb_table
[2 * elem
] == hash
2530 /* Compare the length of the name. */
2531 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2532 /* Compare the name. */
2533 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2536 /* Yep, this is the entry. */
2546 /* Local function for parse_bracket_exp used in _LIBC environement.
2547 Look up the collation sequence value of BR_ELEM.
2548 Return the value if succeeded, UINT_MAX otherwise. */
2550 static inline unsigned int
2551 __attribute ((always_inline
))
2552 lookup_collation_sequence_value (br_elem
)
2553 bracket_elem_t
*br_elem
;
2555 if (br_elem
->type
== SB_CHAR
)
2558 if (MB_CUR_MAX == 1)
2561 return collseqmb
[br_elem
->opr
.ch
];
2564 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2565 return __collseq_table_lookup (collseqwc
, wc
);
2568 else if (br_elem
->type
== MB_CHAR
)
2570 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2572 else if (br_elem
->type
== COLL_SYM
)
2574 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2578 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2580 if (symb_table
[2 * elem
] != 0)
2582 /* We found the entry. */
2583 idx
= symb_table
[2 * elem
+ 1];
2584 /* Skip the name of collating element name. */
2585 idx
+= 1 + extra
[idx
];
2586 /* Skip the byte sequence of the collating element. */
2587 idx
+= 1 + extra
[idx
];
2588 /* Adjust for the alignment. */
2589 idx
= (idx
+ 3) & ~3;
2590 /* Skip the multibyte collation sequence value. */
2591 idx
+= sizeof (unsigned int);
2592 /* Skip the wide char sequence of the collating element. */
2593 idx
+= sizeof (unsigned int) *
2594 (1 + *(unsigned int *) (extra
+ idx
));
2595 /* Return the collation sequence value. */
2596 return *(unsigned int *) (extra
+ idx
);
2598 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2600 /* No valid character. Match it as a single byte
2602 return collseqmb
[br_elem
->opr
.name
[0]];
2605 else if (sym_name_len
== 1)
2606 return collseqmb
[br_elem
->opr
.name
[0]];
2611 /* Local function for parse_bracket_exp used in _LIBC environement.
2612 Build the range expression which starts from START_ELEM, and ends
2613 at END_ELEM. The result are written to MBCSET and SBCSET.
2614 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2615 mbcset->range_ends, is a pointer argument sinse we may
2618 static inline reg_errcode_t
2619 __attribute ((always_inline
))
2620 # ifdef RE_ENABLE_I18N
2621 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2622 re_charset_t
*mbcset
;
2624 # else /* not RE_ENABLE_I18N */
2625 build_range_exp (sbcset
, start_elem
, end_elem
)
2626 # endif /* not RE_ENABLE_I18N */
2627 re_bitset_ptr_t sbcset
;
2628 bracket_elem_t
*start_elem
, *end_elem
;
2631 uint32_t start_collseq
;
2632 uint32_t end_collseq
;
2634 # ifdef RE_ENABLE_I18N
2635 /* Check the space of the arrays. */
2636 if (*range_alloc
== mbcset
->nranges
)
2638 /* There are not enough space, need realloc. */
2639 uint32_t *new_array_start
;
2640 uint32_t *new_array_end
;
2643 /* +1 in case of mbcset->nranges is 0. */
2644 new_nranges
= 2 * mbcset
->nranges
+ 1;
2645 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2646 are NULL if *range_alloc == 0. */
2647 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2649 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2652 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2655 mbcset
->range_starts
= new_array_start
;
2656 mbcset
->range_ends
= new_array_end
;
2657 *range_alloc
= new_nranges
;
2659 # endif /* RE_ENABLE_I18N */
2661 /* Equivalence Classes and Character Classes can't be a range
2663 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2664 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2668 start_collseq
= lookup_collation_sequence_value (start_elem
);
2669 end_collseq
= lookup_collation_sequence_value (end_elem
);
2670 /* Check start/end collation sequence values. */
2671 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2672 return REG_ECOLLATE
;
2673 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2676 # ifdef RE_ENABLE_I18N
2677 /* Got valid collation sequence values, add them as a new entry. */
2678 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2679 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2680 # endif /* RE_ENABLE_I18N */
2682 /* Build the table for single byte characters. */
2683 for (ch
= 0; ch
<= SBC_MAX
; ch
++)
2685 uint32_t ch_collseq
;
2687 if (MB_CUR_MAX == 1)
2690 ch_collseq
= collseqmb
[ch
];
2692 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2693 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2694 bitset_set (sbcset
, ch
);
2699 /* Local function for parse_bracket_exp used in _LIBC environement.
2700 Build the collating element which is represented by NAME.
2701 The result are written to MBCSET and SBCSET.
2702 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2703 pointer argument sinse we may update it. */
2705 static inline reg_errcode_t
2706 __attribute ((always_inline
))
2707 # ifdef RE_ENABLE_I18N
2708 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2709 re_charset_t
*mbcset
;
2710 int *coll_sym_alloc
;
2711 # else /* not RE_ENABLE_I18N */
2712 build_collating_symbol (sbcset
, name
)
2713 # endif /* not RE_ENABLE_I18N */
2714 re_bitset_ptr_t sbcset
;
2715 const unsigned char *name
;
2718 size_t name_len
= strlen ((const char *) name
);
2721 elem
= seek_collating_symbol_entry (name
, name_len
);
2722 if (symb_table
[2 * elem
] != 0)
2724 /* We found the entry. */
2725 idx
= symb_table
[2 * elem
+ 1];
2726 /* Skip the name of collating element name. */
2727 idx
+= 1 + extra
[idx
];
2729 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2731 /* No valid character, treat it as a normal
2733 bitset_set (sbcset
, name
[0]);
2737 return REG_ECOLLATE
;
2739 # ifdef RE_ENABLE_I18N
2740 /* Got valid collation sequence, add it as a new entry. */
2741 /* Check the space of the arrays. */
2742 if (*coll_sym_alloc
== mbcset
->ncoll_syms
)
2744 /* Not enough, realloc it. */
2745 /* +1 in case of mbcset->ncoll_syms is 0. */
2746 *coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2747 /* Use realloc since mbcset->coll_syms is NULL
2749 mbcset
->coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2751 if (BE (mbcset
->coll_syms
== NULL
, 0))
2754 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2755 # endif /* RE_ENABLE_I18N */
2760 if (BE (name_len
!= 1, 0))
2761 return REG_ECOLLATE
;
2764 bitset_set (sbcset
, name
[0]);
2771 re_token_t br_token
;
2772 re_bitset_ptr_t sbcset
;
2773 #ifdef RE_ENABLE_I18N
2774 re_charset_t
*mbcset
;
2775 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2776 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2777 #else /* not RE_ENABLE_I18N */
2779 #endif /* not RE_ENABLE_I18N */
2780 bin_tree_t
*work_tree
;
2781 int token_len
, new_idx
;
2783 collseqmb
= (const unsigned char *)
2784 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2785 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2791 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2792 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2793 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2794 _NL_COLLATE_SYMB_TABLEMB
);
2795 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2796 _NL_COLLATE_SYMB_EXTRAMB
);
2799 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
2800 #ifdef RE_ENABLE_I18N
2801 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
2802 #endif /* RE_ENABLE_I18N */
2803 #ifdef RE_ENABLE_I18N
2804 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
2806 if (BE (sbcset
== NULL
, 0))
2807 #endif /* RE_ENABLE_I18N */
2813 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2814 if (BE (token
->type
== END_OF_RE
, 0))
2817 goto parse_bracket_exp_free_return
;
2819 if (token
->type
== OP_NON_MATCH_LIST
)
2821 #ifdef RE_ENABLE_I18N
2823 mbcset
->non_match
= 1;
2824 #else /* not RE_ENABLE_I18N */
2826 #endif /* not RE_ENABLE_I18N */
2827 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
2828 bitset_set (sbcset
, '\0');
2829 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2830 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2831 if (BE (token
->type
== END_OF_RE
, 0))
2834 goto parse_bracket_exp_free_return
;
2836 #ifdef RE_ENABLE_I18N
2838 for (i
= 0; i
< SBC_MAX
; ++i
)
2839 if (__btowc (i
) == WEOF
)
2840 bitset_set (sbcset
, i
);
2841 #endif /* RE_ENABLE_I18N */
2844 /* We treat the first ']' as a normal character. */
2845 if (token
->type
== OP_CLOSE_BRACKET
)
2846 token
->type
= CHARACTER
;
2850 bracket_elem_t start_elem
, end_elem
;
2851 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
2852 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
2854 int token_len2
= 0, is_range_exp
= 0;
2857 start_elem
.opr
.name
= start_name_buf
;
2858 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
2860 if (BE (ret
!= REG_NOERROR
, 0))
2863 goto parse_bracket_exp_free_return
;
2866 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2867 if (BE (token
->type
== END_OF_RE
, 0))
2870 goto parse_bracket_exp_free_return
;
2872 if (token
->type
== OP_CHARSET_RANGE
)
2874 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
2875 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
2876 if (BE (token
->type
== END_OF_RE
, 0))
2879 goto parse_bracket_exp_free_return
;
2881 if (token2
.type
== OP_CLOSE_BRACKET
)
2883 /* We treat the last '-' as a normal character. */
2884 re_string_skip_bytes (regexp
, -token_len
);
2885 token
->type
= CHARACTER
;
2891 if (is_range_exp
== 1)
2893 end_elem
.opr
.name
= end_name_buf
;
2894 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
2896 if (BE (ret
!= REG_NOERROR
, 0))
2899 goto parse_bracket_exp_free_return
;
2902 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2903 if (BE (token
->type
== END_OF_RE
, 0))
2906 goto parse_bracket_exp_free_return
;
2908 *err
= build_range_exp (sbcset
,
2909 #ifdef RE_ENABLE_I18N
2910 mbcset
, &range_alloc
,
2911 #endif /* RE_ENABLE_I18N */
2912 &start_elem
, &end_elem
);
2913 if (BE (*err
!= REG_NOERROR
, 0))
2914 goto parse_bracket_exp_free_return
;
2918 switch (start_elem
.type
)
2921 bitset_set (sbcset
, start_elem
.opr
.ch
);
2923 #ifdef RE_ENABLE_I18N
2925 /* Check whether the array has enough space. */
2926 if (mbchar_alloc
== mbcset
->nmbchars
)
2928 /* Not enough, realloc it. */
2929 /* +1 in case of mbcset->nmbchars is 0. */
2930 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
2931 /* Use realloc since array is NULL if *alloc == 0. */
2932 mbcset
->mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
2934 if (BE (mbcset
->mbchars
== NULL
, 0))
2935 goto parse_bracket_exp_espace
;
2937 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
2939 #endif /* RE_ENABLE_I18N */
2941 *err
= build_equiv_class (sbcset
,
2942 #ifdef RE_ENABLE_I18N
2943 mbcset
, &equiv_class_alloc
,
2944 #endif /* RE_ENABLE_I18N */
2945 start_elem
.opr
.name
);
2946 if (BE (*err
!= REG_NOERROR
, 0))
2947 goto parse_bracket_exp_free_return
;
2950 *err
= build_collating_symbol (sbcset
,
2951 #ifdef RE_ENABLE_I18N
2952 mbcset
, &coll_sym_alloc
,
2953 #endif /* RE_ENABLE_I18N */
2954 start_elem
.opr
.name
);
2955 if (BE (*err
!= REG_NOERROR
, 0))
2956 goto parse_bracket_exp_free_return
;
2959 *err
= build_charclass (regexp
->trans
, sbcset
,
2960 #ifdef RE_ENABLE_I18N
2961 mbcset
, &char_class_alloc
,
2962 #endif /* RE_ENABLE_I18N */
2963 start_elem
.opr
.name
, syntax
);
2964 if (BE (*err
!= REG_NOERROR
, 0))
2965 goto parse_bracket_exp_free_return
;
2972 if (token
->type
== OP_CLOSE_BRACKET
)
2976 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2978 /* If it is non-matching list. */
2979 #ifdef RE_ENABLE_I18N
2980 if (mbcset
->non_match
)
2981 #else /* not RE_ENABLE_I18N */
2983 #endif /* not RE_ENABLE_I18N */
2984 bitset_not (sbcset
);
2986 /* Build a tree for simple bracket. */
2987 br_token
.type
= SIMPLE_BRACKET
;
2988 br_token
.opr
.sbcset
= sbcset
;
2989 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
2990 work_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2991 if (BE (new_idx
== -1 || work_tree
== NULL
, 0))
2992 goto parse_bracket_exp_espace
;
2994 #ifdef RE_ENABLE_I18N
2995 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
2996 || mbcset
->nranges
|| (MB_CUR_MAX
> 1 && (mbcset
->nchar_classes
2997 || mbcset
->non_match
)))
2999 re_token_t alt_token
;
3000 bin_tree_t
*mbc_tree
;
3001 /* Build a tree for complex bracket. */
3002 br_token
.type
= COMPLEX_BRACKET
;
3003 br_token
.opr
.mbcset
= mbcset
;
3004 dfa
->has_mb_node
= 1;
3005 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3006 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3007 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3008 goto parse_bracket_exp_espace
;
3009 /* Then join them by ALT node. */
3010 dfa
->has_plural_match
= 1;
3011 alt_token
.type
= OP_ALT
;
3012 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3013 work_tree
= create_tree (work_tree
, mbc_tree
, 0, new_idx
);
3014 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3019 free_charset (mbcset
);
3022 #else /* not RE_ENABLE_I18N */
3024 #endif /* not RE_ENABLE_I18N */
3026 parse_bracket_exp_espace
:
3028 parse_bracket_exp_free_return
:
3030 #ifdef RE_ENABLE_I18N
3031 free_charset (mbcset
);
3032 #endif /* RE_ENABLE_I18N */
3036 /* Parse an element in the bracket expression. */
3038 static reg_errcode_t
3039 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
)
3040 bracket_elem_t
*elem
;
3041 re_string_t
*regexp
;
3045 reg_syntax_t syntax
;
3047 #ifdef RE_ENABLE_I18N
3049 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3050 if (cur_char_size
> 1)
3052 elem
->type
= MB_CHAR
;
3053 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3054 re_string_skip_bytes (regexp
, cur_char_size
);
3057 #endif /* RE_ENABLE_I18N */
3058 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3059 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3060 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3061 return parse_bracket_symbol (elem
, regexp
, token
);
3062 elem
->type
= SB_CHAR
;
3063 elem
->opr
.ch
= token
->opr
.c
;
3067 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3068 such as [:<character_class>:], [.<collating_element>.], and
3069 [=<equivalent_class>=]. */
3071 static reg_errcode_t
3072 parse_bracket_symbol (elem
, regexp
, token
)
3073 bracket_elem_t
*elem
;
3074 re_string_t
*regexp
;
3077 unsigned char ch
, delim
= token
->opr
.c
;
3081 if (re_string_eoi(regexp
) || i
>= BRACKET_NAME_BUF_SIZE
)
3083 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3084 ch
= re_string_fetch_byte_case (regexp
);
3086 ch
= re_string_fetch_byte (regexp
);
3087 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3089 elem
->opr
.name
[i
] = ch
;
3091 re_string_skip_bytes (regexp
, 1);
3092 elem
->opr
.name
[i
] = '\0';
3093 switch (token
->type
)
3095 case OP_OPEN_COLL_ELEM
:
3096 elem
->type
= COLL_SYM
;
3098 case OP_OPEN_EQUIV_CLASS
:
3099 elem
->type
= EQUIV_CLASS
;
3101 case OP_OPEN_CHAR_CLASS
:
3102 elem
->type
= CHAR_CLASS
;
3110 /* Helper function for parse_bracket_exp.
3111 Build the equivalence class which is represented by NAME.
3112 The result are written to MBCSET and SBCSET.
3113 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3114 is a pointer argument sinse we may update it. */
3116 static reg_errcode_t
3117 #ifdef RE_ENABLE_I18N
3118 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3119 re_charset_t
*mbcset
;
3120 int *equiv_class_alloc
;
3121 #else /* not RE_ENABLE_I18N */
3122 build_equiv_class (sbcset
, name
)
3123 #endif /* not RE_ENABLE_I18N */
3124 re_bitset_ptr_t sbcset
;
3125 const unsigned char *name
;
3127 #if defined _LIBC && defined RE_ENABLE_I18N
3128 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3131 const int32_t *table
, *indirect
;
3132 const unsigned char *weights
, *extra
, *cp
;
3133 unsigned char char_buf
[2];
3137 /* This #include defines a local function! */
3138 # include <locale/weight.h>
3139 /* Calculate the index for equivalence class. */
3141 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3142 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3143 _NL_COLLATE_WEIGHTMB
);
3144 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3145 _NL_COLLATE_EXTRAMB
);
3146 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3147 _NL_COLLATE_INDIRECTMB
);
3148 idx1
= findidx (&cp
);
3149 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3150 /* This isn't a valid character. */
3151 return REG_ECOLLATE
;
3153 /* Build single byte matcing table for this equivalence class. */
3154 char_buf
[1] = (unsigned char) '\0';
3155 len
= weights
[idx1
];
3156 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3160 idx2
= findidx (&cp
);
3165 /* This isn't a valid character. */
3167 if (len
== weights
[idx2
])
3170 while (cnt
<= len
&&
3171 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3175 bitset_set (sbcset
, ch
);
3178 /* Check whether the array has enough space. */
3179 if (*equiv_class_alloc
== mbcset
->nequiv_classes
)
3181 /* Not enough, realloc it. */
3182 /* +1 in case of mbcset->nequiv_classes is 0. */
3183 *equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3184 /* Use realloc since the array is NULL if *alloc == 0. */
3185 mbcset
->equiv_classes
= re_realloc (mbcset
->equiv_classes
, int32_t,
3186 *equiv_class_alloc
);
3187 if (BE (mbcset
->equiv_classes
== NULL
, 0))
3190 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3193 #endif /* _LIBC && RE_ENABLE_I18N */
3195 if (BE (strlen ((const char *) name
) != 1, 0))
3196 return REG_ECOLLATE
;
3197 bitset_set (sbcset
, *name
);
3202 /* Helper function for parse_bracket_exp.
3203 Build the character class which is represented by NAME.
3204 The result are written to MBCSET and SBCSET.
3205 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3206 is a pointer argument sinse we may update it. */
3208 static reg_errcode_t
3209 #ifdef RE_ENABLE_I18N
3210 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3211 re_charset_t
*mbcset
;
3212 int *char_class_alloc
;
3213 #else /* not RE_ENABLE_I18N */
3214 build_charclass (trans
, sbcset
, class_name
, syntax
)
3215 #endif /* not RE_ENABLE_I18N */
3216 RE_TRANSLATE_TYPE trans
;
3217 re_bitset_ptr_t sbcset
;
3218 const unsigned char *class_name
;
3219 reg_syntax_t syntax
;
3222 const char *name
= (const char *) class_name
;
3224 /* In case of REG_ICASE "upper" and "lower" match the both of
3225 upper and lower cases. */
3226 if ((syntax
& RE_ICASE
)
3227 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3230 #ifdef RE_ENABLE_I18N
3231 /* Check the space of the arrays. */
3232 if (*char_class_alloc
== mbcset
->nchar_classes
)
3234 /* Not enough, realloc it. */
3235 /* +1 in case of mbcset->nchar_classes is 0. */
3236 *char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3237 /* Use realloc since array is NULL if *alloc == 0. */
3238 mbcset
->char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3240 if (BE (mbcset
->char_classes
== NULL
, 0))
3243 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3244 #endif /* RE_ENABLE_I18N */
3246 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3247 for (i = 0; i < SBC_MAX; ++i) \
3249 if (ctype_func (i)) \
3251 int ch = trans ? trans[i] : i; \
3252 bitset_set (sbcset, ch); \
3256 if (strcmp (name
, "alnum") == 0)
3257 BUILD_CHARCLASS_LOOP (isalnum
)
3258 else if (strcmp (name
, "cntrl") == 0)
3259 BUILD_CHARCLASS_LOOP (iscntrl
)
3260 else if (strcmp (name
, "lower") == 0)
3261 BUILD_CHARCLASS_LOOP (islower
)
3262 else if (strcmp (name
, "space") == 0)
3263 BUILD_CHARCLASS_LOOP (isspace
)
3264 else if (strcmp (name
, "alpha") == 0)
3265 BUILD_CHARCLASS_LOOP (isalpha
)
3266 else if (strcmp (name
, "digit") == 0)
3267 BUILD_CHARCLASS_LOOP (isdigit
)
3268 else if (strcmp (name
, "print") == 0)
3269 BUILD_CHARCLASS_LOOP (isprint
)
3270 else if (strcmp (name
, "upper") == 0)
3271 BUILD_CHARCLASS_LOOP (isupper
)
3272 else if (strcmp (name
, "blank") == 0)
3273 BUILD_CHARCLASS_LOOP (isblank
)
3274 else if (strcmp (name
, "graph") == 0)
3275 BUILD_CHARCLASS_LOOP (isgraph
)
3276 else if (strcmp (name
, "punct") == 0)
3277 BUILD_CHARCLASS_LOOP (ispunct
)
3278 else if (strcmp (name
, "xdigit") == 0)
3279 BUILD_CHARCLASS_LOOP (isxdigit
)
3287 build_word_op (dfa
, trans
, not, err
)
3289 RE_TRANSLATE_TYPE trans
;
3293 re_bitset_ptr_t sbcset
;
3294 #ifdef RE_ENABLE_I18N
3295 re_charset_t
*mbcset
;
3297 #else /* not RE_ENABLE_I18N */
3299 #endif /* not RE_ENABLE_I18N */
3301 re_token_t br_token
;
3305 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3306 #ifdef RE_ENABLE_I18N
3307 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3308 #endif /* RE_ENABLE_I18N */
3310 #ifdef RE_ENABLE_I18N
3311 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3312 #else /* not RE_ENABLE_I18N */
3313 if (BE (sbcset
== NULL
, 0))
3314 #endif /* not RE_ENABLE_I18N */
3322 #ifdef RE_ENABLE_I18N
3325 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3326 bitset_set(cset->sbcset, '\0');
3328 mbcset
->non_match
= 1;
3330 for (i
= 0; i
< SBC_MAX
; ++i
)
3331 if (__btowc (i
) == WEOF
)
3332 bitset_set (sbcset
, i
);
3333 #else /* not RE_ENABLE_I18N */
3335 #endif /* not RE_ENABLE_I18N */
3338 /* We don't care the syntax in this case. */
3339 ret
= build_charclass (trans
, sbcset
,
3340 #ifdef RE_ENABLE_I18N
3342 #endif /* RE_ENABLE_I18N */
3343 (const unsigned char *) "alnum", 0);
3345 if (BE (ret
!= REG_NOERROR
, 0))
3348 #ifdef RE_ENABLE_I18N
3349 free_charset (mbcset
);
3350 #endif /* RE_ENABLE_I18N */
3354 /* \w match '_' also. */
3355 bitset_set (sbcset
, '_');
3357 /* If it is non-matching list. */
3358 #ifdef RE_ENABLE_I18N
3359 if (mbcset
->non_match
)
3360 #else /* not RE_ENABLE_I18N */
3362 #endif /* not RE_ENABLE_I18N */
3363 bitset_not (sbcset
);
3365 /* Build a tree for simple bracket. */
3366 br_token
.type
= SIMPLE_BRACKET
;
3367 br_token
.opr
.sbcset
= sbcset
;
3368 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3369 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3370 if (BE (new_idx
== -1 || tree
== NULL
, 0))
3371 goto build_word_op_espace
;
3373 #ifdef RE_ENABLE_I18N
3376 re_token_t alt_token
;
3377 bin_tree_t
*mbc_tree
;
3378 /* Build a tree for complex bracket. */
3379 br_token
.type
= COMPLEX_BRACKET
;
3380 br_token
.opr
.mbcset
= mbcset
;
3381 dfa
->has_mb_node
= 1;
3382 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3383 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3384 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3385 goto build_word_op_espace
;
3386 /* Then join them by ALT node. */
3387 alt_token
.type
= OP_ALT
;
3388 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3389 tree
= create_tree (tree
, mbc_tree
, 0, new_idx
);
3390 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3395 free_charset (mbcset
);
3398 #else /* not RE_ENABLE_I18N */
3400 #endif /* not RE_ENABLE_I18N */
3402 build_word_op_espace
:
3404 #ifdef RE_ENABLE_I18N
3405 free_charset (mbcset
);
3406 #endif /* RE_ENABLE_I18N */
3411 /* This is intended for the expressions like "a{1,3}".
3412 Fetch a number from `input', and return the number.
3413 Return -1, if the number field is empty like "{,1}".
3414 Return -2, If an error is occured. */
3417 fetch_number (input
, token
, syntax
)
3420 reg_syntax_t syntax
;
3426 *token
= fetch_token (input
, syntax
);
3428 if (BE (token
->type
== END_OF_RE
, 0))
3430 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3432 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3433 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3434 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3439 #ifdef RE_ENABLE_I18N
3441 free_charset (re_charset_t
*cset
)
3443 re_free (cset
->mbchars
);
3445 re_free (cset
->coll_syms
);
3446 re_free (cset
->equiv_classes
);
3447 re_free (cset
->range_starts
);
3448 re_free (cset
->range_ends
);
3450 re_free (cset
->char_classes
);
3453 #endif /* RE_ENABLE_I18N */
3455 /* Functions for binary tree operation. */
3457 /* Create a node of tree.
3458 Note: This function automatically free left and right if malloc fails. */
3461 create_tree (left
, right
, type
, index
)
3464 re_token_type_t type
;
3468 tree
= re_malloc (bin_tree_t
, 1);
3469 if (BE (tree
== NULL
, 0))
3471 free_bin_tree (left
);
3472 free_bin_tree (right
);
3475 tree
->parent
= NULL
;
3477 tree
->right
= right
;
3479 tree
->node_idx
= index
;
3482 re_node_set_init_empty (&tree
->eclosure
);
3485 left
->parent
= tree
;
3487 right
->parent
= tree
;
3491 /* Free the sub tree pointed by TREE. */
3494 free_bin_tree (tree
)
3499 /*re_node_set_free (&tree->eclosure);*/
3500 free_bin_tree (tree
->left
);
3501 free_bin_tree (tree
->right
);
3505 /* Duplicate the node SRC, and return new node. */
3508 duplicate_tree (src
, dfa
)
3509 const bin_tree_t
*src
;
3512 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3514 /* Since node indies must be according to Post-order of the tree,
3515 we must duplicate the left at first. */
3516 if (src
->left
!= NULL
)
3518 left
= duplicate_tree (src
->left
, dfa
);
3523 /* Secondaly, duplicate the right. */
3524 if (src
->right
!= NULL
)
3526 right
= duplicate_tree (src
->right
, dfa
);
3529 free_bin_tree (left
);
3534 /* At last, duplicate itself. */
3535 if (src
->type
== NON_TYPE
)
3537 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3538 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3539 if (BE (new_node_idx
== -1, 0))
3541 free_bin_tree (left
);
3542 free_bin_tree (right
);
3547 new_node_idx
= src
->type
;
3549 new_tree
= create_tree (left
, right
, src
->type
, new_node_idx
);
3550 if (BE (new_tree
== NULL
, 0))
3552 free_bin_tree (left
);
3553 free_bin_tree (right
);