]> sourceware.org Git - glibc.git/blame - posix/regexec.c
Upate.
[glibc.git] / posix / regexec.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
54e1cabc 2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
a9388965 21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
612546c6 22 re_string_t *input, int n);
6291ee3c 23static void match_ctx_clean (re_match_context_t *mctx);
3b0bdc72 24static void match_ctx_free (re_match_context_t *cache);
6291ee3c 25static void match_ctx_free_subtops (re_match_context_t *mctx);
a9388965 26static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
15a7d175 27 int str_idx, int from, int to);
6291ee3c 28static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx);
0742e48e 29static void match_ctx_clear_flag (re_match_context_t *mctx);
6291ee3c
UD
30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx);
32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx);
0742e48e 34static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
15a7d175
UD
35 re_dfastate_t **limited_sts, int last_node,
36 int last_str_idx, int check_subexp);
a9388965 37static reg_errcode_t re_search_internal (const regex_t *preg,
15a7d175
UD
38 const char *string, int length,
39 int start, int range, int stop,
40 size_t nmatch, regmatch_t pmatch[],
41 int eflags);
ac3d553b 42static int re_search_2_stub (struct re_pattern_buffer *bufp,
15a7d175
UD
43 const char *string1, int length1,
44 const char *string2, int length2,
45 int start, int range, struct re_registers *regs,
46 int stop, int ret_len);
ac3d553b 47static int re_search_stub (struct re_pattern_buffer *bufp,
15a7d175
UD
48 const char *string, int length, int start,
49 int range, int stop, struct re_registers *regs,
50 int ret_len);
ac3d553b 51static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
15a7d175 52 int nregs, int regs_allocated);
25337753
UD
53static re_dfastate_t *acquire_init_state_context (reg_errcode_t *err,
54 const regex_t *preg,
55 const re_match_context_t *mctx,
56 int idx);
6291ee3c
UD
57static reg_errcode_t prune_impossible_nodes (const regex_t *preg,
58 re_match_context_t *mctx);
612546c6 59static int check_matching (const regex_t *preg, re_match_context_t *mctx,
15a7d175 60 int fl_search, int fl_longest_match);
3b0bdc72 61static int check_halt_node_context (const re_dfa_t *dfa, int node,
15a7d175 62 unsigned int context);
3b0bdc72 63static int check_halt_state_context (const regex_t *preg,
15a7d175
UD
64 const re_dfastate_t *state,
65 const re_match_context_t *mctx, int idx);
81c64d40 66static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
15a7d175 67 int cur_idx, int nmatch);
a3022b82 68static int proceed_next_node (const regex_t *preg, int nregs, regmatch_t *regs,
15a7d175
UD
69 const re_match_context_t *mctx,
70 int *pidx, int node, re_node_set *eps_via_nodes,
71 struct re_fail_stack_t *fs);
1b2c2628 72static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
15a7d175
UD
73 int str_idx, int *dests, int nregs,
74 regmatch_t *regs,
75 re_node_set *eps_via_nodes);
a3022b82 76static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
15a7d175 77 regmatch_t *regs, re_node_set *eps_via_nodes);
612546c6 78static reg_errcode_t set_regs (const regex_t *preg,
15a7d175
UD
79 const re_match_context_t *mctx,
80 size_t nmatch, regmatch_t *pmatch,
81 int fl_backtrack);
485d775d
UD
82static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
83
434d3784 84#ifdef RE_ENABLE_I18N
612546c6 85static int sift_states_iter_mb (const regex_t *preg,
15a7d175
UD
86 const re_match_context_t *mctx,
87 re_sift_context_t *sctx,
88 int node_idx, int str_idx, int max_str_idx);
434d3784 89#endif /* RE_ENABLE_I18N */
a9388965 90static reg_errcode_t sift_states_backward (const regex_t *preg,
15a7d175
UD
91 re_match_context_t *mctx,
92 re_sift_context_t *sctx);
0742e48e 93static reg_errcode_t update_cur_sifted_state (const regex_t *preg,
15a7d175
UD
94 re_match_context_t *mctx,
95 re_sift_context_t *sctx,
96 int str_idx,
97 re_node_set *dest_nodes);
0742e48e 98static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
15a7d175
UD
99 re_node_set *dest_nodes,
100 const re_node_set *candidates);
0742e48e 101static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
15a7d175
UD
102 re_node_set *dest_nodes,
103 const re_node_set *and_nodes);
a3022b82 104static int check_dst_limits (re_dfa_t *dfa, re_node_set *limits,
15a7d175
UD
105 re_match_context_t *mctx, int dst_node,
106 int dst_idx, int src_node, int src_idx);
a3022b82 107static int check_dst_limits_calc_pos (re_dfa_t *dfa, re_match_context_t *mctx,
15a7d175
UD
108 int limit, re_node_set *eclosures,
109 int subexp_idx, int node, int str_idx);
0742e48e 110static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
15a7d175
UD
111 re_node_set *dest_nodes,
112 const re_node_set *candidates,
113 re_node_set *limits,
114 struct re_backref_cache_entry *bkref_ents,
115 int str_idx);
0742e48e 116static reg_errcode_t sift_states_bkref (const regex_t *preg,
15a7d175
UD
117 re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, re_node_set *dest_nodes);
612546c6 120static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx,
15a7d175 121 int next_state_log_idx);
0742e48e 122static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
15a7d175 123 re_dfastate_t **src, int num);
a9388965 124static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
15a7d175
UD
125 re_match_context_t *mctx,
126 re_dfastate_t *state, int fl_search);
6291ee3c
UD
127static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa,
128 re_match_context_t *mctx,
129 re_node_set *cur_nodes,
130 int str_idx);
a9388965 131static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
15a7d175
UD
132 re_dfastate_t *pstate,
133 int fl_search,
134 re_match_context_t *mctx);
434d3784 135#ifdef RE_ENABLE_I18N
a9388965 136static reg_errcode_t transit_state_mb (const regex_t *preg,
15a7d175
UD
137 re_dfastate_t *pstate,
138 re_match_context_t *mctx);
434d3784 139#endif /* RE_ENABLE_I18N */
a9388965 140static reg_errcode_t transit_state_bkref (const regex_t *preg,
6291ee3c 141 re_node_set *nodes,
15a7d175 142 re_match_context_t *mctx);
6291ee3c
UD
143static reg_errcode_t get_subexp (const regex_t *preg, re_match_context_t *mctx,
144 int bkref_node, int bkref_str_idx);
145static reg_errcode_t get_subexp_sub (const regex_t *preg,
146 re_match_context_t *mctx,
147 re_sub_match_top_t *sub_top,
148 re_sub_match_last_t *sub_last,
149 int bkref_node, int bkref_str);
150static int find_subexp_node (re_dfa_t *dfa, re_node_set *nodes,
151 int subexp_idx, int fl_open);
54e1cabc 152static reg_errcode_t check_arrival (const regex_t *preg,
6291ee3c
UD
153 re_match_context_t *mctx,
154 state_array_t *path, int top_node,
155 int top_str, int last_node, int last_str,
156 int fl_open);
157static reg_errcode_t check_arrival_add_next_nodes (const regex_t *preg,
158 re_dfa_t *dfa,
159 re_match_context_t *mctx,
160 int str_idx,
161 re_node_set *cur_nodes,
162 re_node_set *next_nodes);
163static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
164 re_node_set *cur_nodes,
165 int ex_subexp, int fl_open);
166static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
167 re_node_set *dst_nodes,
168 int target, int ex_subexp,
169 int fl_open);
170static reg_errcode_t expand_bkref_cache (const regex_t *preg,
171 re_match_context_t *mctx,
172 re_node_set *cur_nodes, int cur_str,
173 int last_str, int subexp_num,
174 int fl_open);
3b0bdc72 175static re_dfastate_t **build_trtable (const regex_t *dfa,
15a7d175
UD
176 const re_dfastate_t *state,
177 int fl_search);
434d3784 178#ifdef RE_ENABLE_I18N
3b0bdc72 179static int check_node_accept_bytes (const regex_t *preg, int node_idx,
15a7d175 180 const re_string_t *input, int idx);
434d3784 181# ifdef _LIBC
c202c2c5 182static unsigned int find_collation_sequence_value (const unsigned char *mbs,
15a7d175 183 size_t name_len);
434d3784
UD
184# endif /* _LIBC */
185#endif /* RE_ENABLE_I18N */
3b0bdc72 186static int group_nodes_into_DFAstates (const regex_t *dfa,
15a7d175
UD
187 const re_dfastate_t *state,
188 re_node_set *states_node,
189 bitset *states_ch);
3b0bdc72 190static int check_node_accept (const regex_t *preg, const re_token_t *node,
15a7d175 191 const re_match_context_t *mctx, int idx);
612546c6 192static reg_errcode_t extend_buffers (re_match_context_t *mctx);
3b0bdc72
UD
193\f
194/* Entry point for POSIX code. */
195
196/* regexec searches for a given pattern, specified by PREG, in the
197 string STRING.
198
199 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
200 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
201 least NMATCH elements, and we set them to the offsets of the
202 corresponding matched substrings.
203
204 EFLAGS specifies `execution flags' which affect matching: if
205 REG_NOTBOL is set, then ^ does not match at the beginning of the
206 string; if REG_NOTEOL is set, then $ does not match at the end.
207
208 We return 0 if we find a match and REG_NOMATCH if not. */
209
210int
211regexec (preg, string, nmatch, pmatch, eflags)
92b27c74
UD
212 const regex_t *__restrict preg;
213 const char *__restrict string;
3b0bdc72
UD
214 size_t nmatch;
215 regmatch_t pmatch[];
216 int eflags;
217{
a9388965 218 reg_errcode_t err;
3b0bdc72
UD
219 int length = strlen (string);
220 if (preg->no_sub)
ac3d553b 221 err = re_search_internal (preg, string, length, 0, length, length, 0,
15a7d175 222 NULL, eflags);
3b0bdc72 223 else
ac3d553b 224 err = re_search_internal (preg, string, length, 0, length, length, nmatch,
15a7d175 225 pmatch, eflags);
a9388965 226 return err != REG_NOERROR;
3b0bdc72
UD
227}
228#ifdef _LIBC
229weak_alias (__regexec, regexec)
230#endif
231
232/* Entry points for GNU code. */
233
ac3d553b
UD
234/* re_match, re_search, re_match_2, re_search_2
235
236 The former two functions operate on STRING with length LENGTH,
237 while the later two operate on concatenation of STRING1 and STRING2
238 with lengths LENGTH1 and LENGTH2, respectively.
239
240 re_match() matches the compiled pattern in BUFP against the string,
241 starting at index START.
242
243 re_search() first tries matching at index START, then it tries to match
244 starting from index START + 1, and so on. The last start position tried
245 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
246 way as re_match().)
247
248 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
249 the first STOP characters of the concatenation of the strings should be
250 concerned.
251
252 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
253 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
254 computed relative to the concatenation, not relative to the individual
255 strings.)
256
257 On success, re_match* functions return the length of the match, re_search*
258 return the position of the start of the match. Return value -1 means no
259 match was found and -2 indicates an internal error. */
3b0bdc72
UD
260
261int
ac3d553b
UD
262re_match (bufp, string, length, start, regs)
263 struct re_pattern_buffer *bufp;
3b0bdc72
UD
264 const char *string;
265 int length, start;
266 struct re_registers *regs;
267{
ac3d553b 268 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
3b0bdc72
UD
269}
270#ifdef _LIBC
271weak_alias (__re_match, re_match)
272#endif
273
ac3d553b
UD
274int
275re_search (bufp, string, length, start, range, regs)
276 struct re_pattern_buffer *bufp;
277 const char *string;
278 int length, start, range;
279 struct re_registers *regs;
280{
281 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
282}
283#ifdef _LIBC
284weak_alias (__re_search, re_search)
285#endif
3b0bdc72
UD
286
287int
ac3d553b
UD
288re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
289 struct re_pattern_buffer *bufp;
290 const char *string1, *string2;
291 int length1, length2, start, stop;
292 struct re_registers *regs;
3b0bdc72 293{
ac3d553b 294 return re_search_2_stub (bufp, string1, length1, string2, length2,
15a7d175 295 start, 0, regs, stop, 1);
3b0bdc72
UD
296}
297#ifdef _LIBC
298weak_alias (__re_match_2, re_match_2)
299#endif
300
3b0bdc72 301int
ac3d553b
UD
302re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
303 struct re_pattern_buffer *bufp;
304 const char *string1, *string2;
305 int length1, length2, start, range, stop;
306 struct re_registers *regs;
307{
308 return re_search_2_stub (bufp, string1, length1, string2, length2,
15a7d175 309 start, range, regs, stop, 0);
ac3d553b
UD
310}
311#ifdef _LIBC
312weak_alias (__re_search_2, re_search_2)
313#endif
314
315static int
316re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
15a7d175 317 stop, ret_len)
ac3d553b
UD
318 struct re_pattern_buffer *bufp;
319 const char *string1, *string2;
320 int length1, length2, start, range, stop, ret_len;
321 struct re_registers *regs;
322{
323 const char *str;
324 int rval;
325 int len = length1 + length2;
326 int free_str = 0;
327
328 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
329 return -2;
330
331 /* Concatenate the strings. */
332 if (length2 > 0)
333 if (length1 > 0)
334 {
15a7d175
UD
335 char *s = re_malloc (char, len);
336
337 if (BE (s == NULL, 0))
338 return -2;
339 memcpy (s, string1, length1);
340 memcpy (s + length1, string2, length2);
341 str = s;
342 free_str = 1;
ac3d553b
UD
343 }
344 else
345 str = string2;
346 else
347 str = string1;
348
349 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
15a7d175 350 ret_len);
ac3d553b 351 if (free_str)
1b2c2628 352 re_free ((char *) str);
ac3d553b
UD
353 return rval;
354}
355
356/* The parameters have the same meaning as those of re_search.
357 Additional parameters:
358 If RET_LEN is nonzero the length of the match is returned (re_match style);
359 otherwise the position of the match is returned. */
360
361static int
362re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
363 struct re_pattern_buffer *bufp;
364 const char *string;
365 int length, start, range, stop, ret_len;
366 struct re_registers *regs;
3b0bdc72 367{
a9388965 368 reg_errcode_t result;
3b0bdc72 369 regmatch_t *pmatch;
ac3d553b
UD
370 int nregs, rval;
371 int eflags = 0;
372
373 /* Check for out-of-range. */
a877402c 374 if (BE (start < 0 || start > length, 0))
ac3d553b
UD
375 return -1;
376 if (BE (start + range > length, 0))
377 range = length - start;
a877402c
UD
378 else if (BE (start + range < 0, 0))
379 range = -start;
3b0bdc72
UD
380
381 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
382 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
383
ac3d553b
UD
384 /* Compile fastmap if we haven't yet. */
385 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
386 re_compile_fastmap (bufp);
387
388 if (BE (bufp->no_sub, 0))
389 regs = NULL;
3b0bdc72
UD
390
391 /* We need at least 1 register. */
ac3d553b
UD
392 if (regs == NULL)
393 nregs = 1;
394 else if (BE (bufp->regs_allocated == REGS_FIXED &&
15a7d175 395 regs->num_regs < bufp->re_nsub + 1, 0))
ac3d553b
UD
396 {
397 nregs = regs->num_regs;
398 if (BE (nregs < 1, 0))
15a7d175
UD
399 {
400 /* Nothing can be copied to regs. */
401 regs = NULL;
402 nregs = 1;
403 }
ac3d553b
UD
404 }
405 else
406 nregs = bufp->re_nsub + 1;
3b0bdc72 407 pmatch = re_malloc (regmatch_t, nregs);
bc15410e
UD
408 if (BE (pmatch == NULL, 0))
409 return -2;
3b0bdc72 410
ac3d553b 411 result = re_search_internal (bufp, string, length, start, range, stop,
15a7d175 412 nregs, pmatch, eflags);
3b0bdc72 413
ac3d553b
UD
414 rval = 0;
415
416 /* I hope we needn't fill ther regs with -1's when no match was found. */
417 if (result != REG_NOERROR)
418 rval = -1;
419 else if (regs != NULL)
3b0bdc72 420 {
ac3d553b
UD
421 /* If caller wants register contents data back, copy them. */
422 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
15a7d175 423 bufp->regs_allocated);
ac3d553b 424 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
15a7d175 425 rval = -2;
3b0bdc72
UD
426 }
427
ac3d553b 428 if (BE (rval == 0, 1))
3b0bdc72 429 {
ac3d553b 430 if (ret_len)
15a7d175
UD
431 {
432 assert (pmatch[0].rm_so == start);
433 rval = pmatch[0].rm_eo - start;
434 }
ac3d553b 435 else
15a7d175 436 rval = pmatch[0].rm_so;
3b0bdc72 437 }
3b0bdc72
UD
438 re_free (pmatch);
439 return rval;
440}
3b0bdc72 441
ac3d553b
UD
442static unsigned
443re_copy_regs (regs, pmatch, nregs, regs_allocated)
3b0bdc72 444 struct re_registers *regs;
ac3d553b
UD
445 regmatch_t *pmatch;
446 int nregs, regs_allocated;
3b0bdc72 447{
ac3d553b
UD
448 int rval = REGS_REALLOCATE;
449 int i;
450 int need_regs = nregs + 1;
451 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
452 uses. */
453
454 /* Have the register data arrays been allocated? */
455 if (regs_allocated == REGS_UNALLOCATED)
456 { /* No. So allocate them with malloc. */
457 regs->start = re_malloc (regoff_t, need_regs);
458 if (BE (regs->start == NULL, 0))
15a7d175 459 return REGS_UNALLOCATED;
ac3d553b
UD
460 regs->end = re_malloc (regoff_t, need_regs);
461 if (BE (regs->end == NULL, 0))
15a7d175
UD
462 {
463 re_free (regs->start);
464 return REGS_UNALLOCATED;
465 }
ac3d553b
UD
466 regs->num_regs = need_regs;
467 }
468 else if (regs_allocated == REGS_REALLOCATE)
469 { /* Yes. If we need more elements than were already
15a7d175
UD
470 allocated, reallocate them. If we need fewer, just
471 leave it alone. */
ac3d553b 472 if (need_regs > regs->num_regs)
15a7d175
UD
473 {
474 regs->start = re_realloc (regs->start, regoff_t, need_regs);
475 if (BE (regs->start == NULL, 0))
476 {
477 if (regs->end != NULL)
478 re_free (regs->end);
479 return REGS_UNALLOCATED;
480 }
481 regs->end = re_realloc (regs->end, regoff_t, need_regs);
482 if (BE (regs->end == NULL, 0))
483 {
484 re_free (regs->start);
485 return REGS_UNALLOCATED;
486 }
487 regs->num_regs = need_regs;
488 }
ac3d553b
UD
489 }
490 else
491 {
492 assert (regs_allocated == REGS_FIXED);
493 /* This function may not be called with REGS_FIXED and nregs too big. */
494 assert (regs->num_regs >= nregs);
495 rval = REGS_FIXED;
496 }
497
498 /* Copy the regs. */
499 for (i = 0; i < nregs; ++i)
500 {
501 regs->start[i] = pmatch[i].rm_so;
502 regs->end[i] = pmatch[i].rm_eo;
503 }
504 for ( ; i < regs->num_regs; ++i)
505 regs->start[i] = regs->end[i] = -1;
506
507 return rval;
3b0bdc72 508}
3b0bdc72
UD
509
510/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
511 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
512 this memory for recording register information. STARTS and ENDS
513 must be allocated using the malloc library routine, and must each
514 be at least NUM_REGS * sizeof (regoff_t) bytes long.
515
516 If NUM_REGS == 0, then subsequent matches should allocate their own
517 register data.
518
519 Unless this function is called, the first search or match using
520 PATTERN_BUFFER will allocate its own register data, without
521 freeing the old data. */
522
523void
524re_set_registers (bufp, regs, num_regs, starts, ends)
525 struct re_pattern_buffer *bufp;
526 struct re_registers *regs;
527 unsigned num_regs;
528 regoff_t *starts, *ends;
529{
530 if (num_regs)
531 {
532 bufp->regs_allocated = REGS_REALLOCATE;
533 regs->num_regs = num_regs;
534 regs->start = starts;
535 regs->end = ends;
536 }
537 else
538 {
539 bufp->regs_allocated = REGS_UNALLOCATED;
540 regs->num_regs = 0;
541 regs->start = regs->end = (regoff_t *) 0;
542 }
543}
544#ifdef _LIBC
545weak_alias (__re_set_registers, re_set_registers)
546#endif
547\f
548/* Entry points compatible with 4.2 BSD regex library. We don't define
549 them unless specifically requested. */
550
551#if defined _REGEX_RE_COMP || defined _LIBC
552int
553# ifdef _LIBC
554weak_function
555# endif
556re_exec (s)
557 const char *s;
558{
559 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
560}
561#endif /* _REGEX_RE_COMP */
562\f
563static re_node_set empty_set;
564
565/* Internal entry point. */
566
567/* Searches for a compiled pattern PREG in the string STRING, whose
568 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
569 mingings with regexec. START, and RANGE have the same meanings
570 with re_search.
a9388965
UD
571 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
572 otherwise return the error code.
3b0bdc72
UD
573 Note: We assume front end functions already check ranges.
574 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
575
a9388965 576static reg_errcode_t
ac3d553b 577re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
15a7d175 578 eflags)
3b0bdc72
UD
579 const regex_t *preg;
580 const char *string;
ac3d553b 581 int length, start, range, stop, eflags;
3b0bdc72
UD
582 size_t nmatch;
583 regmatch_t pmatch[];
584{
a9388965 585 reg_errcode_t err;
3b0bdc72
UD
586 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
587 re_string_t input;
612546c6 588 int left_lim, right_lim, incr;
3b0bdc72 589 int fl_longest_match, match_first, match_last = -1;
1b2c2628 590 int fast_translate, sb;
3b0bdc72 591 re_match_context_t mctx;
1b2c2628
UD
592 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
593 && range && !preg->can_be_null) ? preg->fastmap : NULL);
3b0bdc72
UD
594
595 /* Check if the DFA haven't been compiled. */
bc15410e 596 if (BE (preg->used == 0 || dfa->init_state == NULL
15a7d175
UD
597 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
598 || dfa->init_state_begbuf == NULL, 0))
a9388965 599 return REG_NOMATCH;
3b0bdc72
UD
600
601 re_node_set_init_empty (&empty_set);
1b2c2628 602 memset (&mctx, '\0', sizeof (re_match_context_t));
3b0bdc72
UD
603
604 /* We must check the longest matching, if nmatch > 0. */
6291ee3c 605 fl_longest_match = (nmatch != 0 || dfa->nbackref);
3b0bdc72 606
612546c6 607 err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
15a7d175 608 preg->translate, preg->syntax & RE_ICASE);
612546c6 609 if (BE (err != REG_NOERROR, 0))
1b2c2628 610 goto free_return;
ac3d553b 611 input.stop = stop;
612546c6
UD
612
613 err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
614 if (BE (err != REG_NOERROR, 0))
1b2c2628 615 goto free_return;
612546c6 616
3b0bdc72
UD
617 /* We will log all the DFA states through which the dfa pass,
618 if nmatch > 1, or this dfa has "multibyte node", which is a
619 back-reference or a node which can accept multibyte character or
620 multi character collating element. */
621 if (nmatch > 1 || dfa->has_mb_node)
a9388965 622 {
612546c6
UD
623 mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1);
624 if (BE (mctx.state_log == NULL, 0))
15a7d175
UD
625 {
626 err = REG_ESPACE;
627 goto free_return;
628 }
a9388965 629 }
3b0bdc72 630 else
612546c6 631 mctx.state_log = NULL;
3b0bdc72
UD
632
633#ifdef DEBUG
634 /* We assume front-end functions already check them. */
635 assert (start + range >= 0 && start + range <= length);
636#endif
637
612546c6
UD
638 match_first = start;
639 input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
15a7d175 640 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
612546c6 641
3b0bdc72 642 /* Check incrementally whether of not the input string match. */
612546c6
UD
643 incr = (range < 0) ? -1 : 1;
644 left_lim = (range < 0) ? start + range : start;
645 right_lim = (range < 0) ? start : start + range;
1b2c2628
UD
646 sb = MB_CUR_MAX == 1;
647 fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
612546c6
UD
648
649 for (;;)
3b0bdc72 650 {
612546c6 651 /* At first get the current byte from input string. */
1b2c2628
UD
652 if (fastmap)
653 {
654 if (BE (fast_translate, 1))
655 {
656 unsigned RE_TRANSLATE_TYPE t
657 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
658 if (BE (range >= 0, 1))
659 {
660 if (BE (t != NULL, 0))
661 {
662 while (BE (match_first < right_lim, 1)
663 && !fastmap[t[(unsigned char) string[match_first]]])
664 ++match_first;
665 }
666 else
667 {
668 while (BE (match_first < right_lim, 1)
669 && !fastmap[(unsigned char) string[match_first]])
670 ++match_first;
671 }
672 if (BE (match_first == right_lim, 0))
673 {
674 int ch = match_first >= length
675 ? 0 : (unsigned char) string[match_first];
676 if (!fastmap[t ? t[ch] : ch])
677 break;
678 }
679 }
680 else
681 {
682 while (match_first >= left_lim)
683 {
684 int ch = match_first >= length
685 ? 0 : (unsigned char) string[match_first];
686 if (fastmap[t ? t[ch] : ch])
687 break;
688 --match_first;
689 }
690 if (match_first < left_lim)
691 break;
692 }
693 }
694 else
695 {
696 int ch;
697
698 do
699 {
700 /* In this case, we can't determine easily the current byte,
701 since it might be a component byte of a multibyte
702 character. Then we use the constructed buffer
703 instead. */
704 /* If MATCH_FIRST is out of the valid range, reconstruct the
705 buffers. */
706 if (input.raw_mbs_idx + input.valid_len <= match_first
707 || match_first < input.raw_mbs_idx)
708 {
709 err = re_string_reconstruct (&input, match_first, eflags,
710 preg->newline_anchor);
711 if (BE (err != REG_NOERROR, 0))
712 goto free_return;
713 }
714 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
715 Note that MATCH_FIRST must not be smaller than 0. */
716 ch = ((match_first >= length) ? 0
717 : re_string_byte_at (&input,
718 match_first - input.raw_mbs_idx));
719 if (fastmap[ch])
720 break;
721 match_first += incr;
722 }
723 while (match_first >= left_lim && match_first <= right_lim);
724 if (! fastmap[ch])
725 break;
726 }
727 }
612546c6 728
1b2c2628 729 /* Reconstruct the buffers so that the matcher can assume that
15a7d175 730 the matching starts from the begining of the buffer. */
1b2c2628
UD
731 err = re_string_reconstruct (&input, match_first, eflags,
732 preg->newline_anchor);
733 if (BE (err != REG_NOERROR, 0))
734 goto free_return;
3b0bdc72 735#ifdef RE_ENABLE_I18N
1b2c2628 736 /* Eliminate it when it is a component of a multibyte character
15a7d175 737 and isn't the head of a multibyte character. */
1b2c2628 738 if (sb || re_string_first_byte (&input, 0))
3b0bdc72 739#endif
1b2c2628
UD
740 {
741 /* It seems to be appropriate one, then use the matcher. */
742 /* We assume that the matching starts from 0. */
743 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
744 match_last = check_matching (preg, &mctx, 0, fl_longest_match);
745 if (match_last != -1)
746 {
747 if (BE (match_last == -2, 0))
748 {
749 err = REG_ESPACE;
750 goto free_return;
751 }
752 else
6291ee3c
UD
753 {
754 mctx.match_last = match_last;
755 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
756 {
757 re_dfastate_t *pstate = mctx.state_log[match_last];
758 mctx.last_node = check_halt_state_context (preg, pstate,
759 &mctx, match_last);
760 }
761 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
762 || dfa->nbackref)
763 {
764 err = prune_impossible_nodes (preg, &mctx);
765 if (err == REG_NOERROR)
766 break;
767 if (BE (err != REG_NOMATCH, 0))
768 goto free_return;
769 }
770 else
771 break; /* We found a matching. */
772 }
1b2c2628 773 }
6291ee3c 774 match_ctx_clean (&mctx);
1b2c2628 775 }
3b0bdc72 776 /* Update counter. */
612546c6
UD
777 match_first += incr;
778 if (match_first < left_lim || right_lim < match_first)
15a7d175 779 break;
3b0bdc72
UD
780 }
781
782 /* Set pmatch[] if we need. */
783 if (match_last != -1 && nmatch > 0)
784 {
785 int reg_idx;
786
787 /* Initialize registers. */
788 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
15a7d175 789 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
3b0bdc72
UD
790
791 /* Set the points where matching start/end. */
612546c6 792 pmatch[0].rm_so = 0;
6291ee3c 793 pmatch[0].rm_eo = mctx.match_last;
3b0bdc72
UD
794
795 if (!preg->no_sub && nmatch > 1)
15a7d175 796 {
15a7d175
UD
797 err = set_regs (preg, &mctx, nmatch, pmatch,
798 dfa->has_plural_match && dfa->nbackref > 0);
799 if (BE (err != REG_NOERROR, 0))
800 goto free_return;
801 }
612546c6
UD
802
803 /* At last, add the offset to the each registers, since we slided
15a7d175 804 the buffers so that We can assume that the matching starts from 0. */
612546c6 805 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
15a7d175
UD
806 if (pmatch[reg_idx].rm_so != -1)
807 {
808 pmatch[reg_idx].rm_so += match_first;
809 pmatch[reg_idx].rm_eo += match_first;
810 }
3b0bdc72 811 }
1b2c2628
UD
812 err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
813 free_return:
612546c6 814 re_free (mctx.state_log);
3b0bdc72
UD
815 if (dfa->nbackref)
816 match_ctx_free (&mctx);
817 re_string_destruct (&input);
1b2c2628 818 return err;
3b0bdc72
UD
819}
820
6291ee3c
UD
821static reg_errcode_t
822prune_impossible_nodes (preg, mctx)
823 const regex_t *preg;
824 re_match_context_t *mctx;
825{
826 int halt_node, match_last;
827 reg_errcode_t ret;
828 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
829 re_dfastate_t **sifted_states;
830 re_dfastate_t **lim_states = NULL;
831 re_sift_context_t sctx;
832#ifdef DEBUG
833 assert (mctx->state_log != NULL);
834#endif
835 match_last = mctx->match_last;
836 halt_node = mctx->last_node;
837 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
838 if (BE (sifted_states == NULL, 0))
839 {
840 ret = REG_ESPACE;
841 goto free_return;
842 }
843 if (dfa->nbackref)
844 {
845 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
846 if (BE (lim_states == NULL, 0))
847 {
848 ret = REG_ESPACE;
849 goto free_return;
850 }
851 while (1)
852 {
853 memset (lim_states, '\0',
854 sizeof (re_dfastate_t *) * (match_last + 1));
855 match_ctx_clear_flag (mctx);
856 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
857 match_last, 0);
858 ret = sift_states_backward (preg, mctx, &sctx);
859 re_node_set_free (&sctx.limits);
860 if (BE (ret != REG_NOERROR, 0))
861 goto free_return;
862 if (sifted_states[0] != NULL || lim_states[0] != NULL)
863 break;
864 do
865 {
866 --match_last;
867 if (match_last < 0)
868 {
869 ret = REG_NOMATCH;
870 goto free_return;
871 }
872 } while (!mctx->state_log[match_last]->halt);
873 halt_node = check_halt_state_context (preg,
874 mctx->state_log[match_last],
875 mctx, match_last);
876 }
877 ret = merge_state_array (dfa, sifted_states, lim_states,
878 match_last + 1);
879 re_free (lim_states);
880 lim_states = NULL;
881 if (BE (ret != REG_NOERROR, 0))
882 goto free_return;
883 }
884 else
885 {
886 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
887 match_last, 0);
888 ret = sift_states_backward (preg, mctx, &sctx);
889 re_node_set_free (&sctx.limits);
890 if (BE (ret != REG_NOERROR, 0))
891 goto free_return;
892 }
893 re_free (mctx->state_log);
894 mctx->state_log = sifted_states;
895 sifted_states = NULL;
896 mctx->last_node = halt_node;
897 mctx->match_last = match_last;
898 ret = REG_NOERROR;
899 free_return:
900 re_free (sifted_states);
901 re_free (lim_states);
902 return ret;
903}
904
a9388965 905/* Acquire an initial state and return it.
3b0bdc72
UD
906 We must select appropriate initial state depending on the context,
907 since initial states may have constraints like "\<", "^", etc.. */
908
25337753 909static re_dfastate_t *
612546c6 910acquire_init_state_context (err, preg, mctx, idx)
a9388965
UD
911 reg_errcode_t *err;
912 const regex_t *preg;
612546c6
UD
913 const re_match_context_t *mctx;
914 int idx;
3b0bdc72
UD
915{
916 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
917
a9388965 918 *err = REG_NOERROR;
3b0bdc72
UD
919 if (dfa->init_state->has_constraint)
920 {
921 unsigned int context;
612546c6 922 context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
15a7d175 923 preg->newline_anchor);
3b0bdc72 924 if (IS_WORD_CONTEXT (context))
15a7d175 925 return dfa->init_state_word;
3b0bdc72 926 else if (IS_ORDINARY_CONTEXT (context))
15a7d175 927 return dfa->init_state;
3b0bdc72 928 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
15a7d175 929 return dfa->init_state_begbuf;
3b0bdc72 930 else if (IS_NEWLINE_CONTEXT (context))
15a7d175 931 return dfa->init_state_nl;
3b0bdc72 932 else if (IS_BEGBUF_CONTEXT (context))
15a7d175
UD
933 {
934 /* It is relatively rare case, then calculate on demand. */
935 return re_acquire_state_context (err, dfa,
936 dfa->init_state->entrance_nodes,
937 context);
938 }
3b0bdc72 939 else
15a7d175
UD
940 /* Must not happen? */
941 return dfa->init_state;
3b0bdc72
UD
942 }
943 else
944 return dfa->init_state;
945}
946
947/* Check whether the regular expression match input string INPUT or not,
a9388965
UD
948 and return the index where the matching end, return -1 if not match,
949 or return -2 in case of an error.
3b0bdc72 950 FL_SEARCH means we must search where the matching starts,
612546c6
UD
951 FL_LONGEST_MATCH means we want the POSIX longest matching.
952 Note that the matcher assume that the maching starts from the current
953 index of the buffer. */
3b0bdc72
UD
954
955static int
612546c6 956check_matching (preg, mctx, fl_search, fl_longest_match)
3b0bdc72 957 const regex_t *preg;
3b0bdc72 958 re_match_context_t *mctx;
612546c6 959 int fl_search, fl_longest_match;
3b0bdc72 960{
6291ee3c 961 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
a9388965 962 reg_errcode_t err;
612546c6
UD
963 int match = 0;
964 int match_last = -1;
965 int cur_str_idx = re_string_cur_idx (mctx->input);
3b0bdc72
UD
966 re_dfastate_t *cur_state;
967
612546c6 968 cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx);
a9388965 969 /* An initial state must not be NULL(invalid state). */
bc15410e 970 if (BE (cur_state == NULL, 0))
a9388965 971 return -2;
612546c6
UD
972 if (mctx->state_log != NULL)
973 mctx->state_log[cur_str_idx] = cur_state;
0742e48e 974
6291ee3c
UD
975 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
976 later. E.g. Processing back references. */
977 if (dfa->nbackref)
978 {
979 err = check_subexp_matching_top (dfa, mctx, &cur_state->nodes, 0);
980 if (BE (err != REG_NOERROR, 0))
981 return err;
982 }
983
0742e48e
UD
984 if (cur_state->has_backref)
985 {
6291ee3c
UD
986 err = transit_state_bkref (preg, &cur_state->nodes, mctx);
987 if (BE (err != REG_NOERROR, 0))
988 return err;
0742e48e
UD
989 }
990
3b0bdc72
UD
991 /* If the RE accepts NULL string. */
992 if (cur_state->halt)
993 {
994 if (!cur_state->has_constraint
15a7d175
UD
995 || check_halt_state_context (preg, cur_state, mctx, cur_str_idx))
996 {
997 if (!fl_longest_match)
998 return cur_str_idx;
999 else
1000 {
1001 match_last = cur_str_idx;
1002 match = 1;
1003 }
1004 }
3b0bdc72
UD
1005 }
1006
612546c6 1007 while (!re_string_eoi (mctx->input))
3b0bdc72 1008 {
612546c6 1009 cur_state = transit_state (&err, preg, mctx, cur_state,
15a7d175 1010 fl_search && !match);
a9388965 1011 if (cur_state == NULL) /* Reached at the invalid state or an error. */
15a7d175
UD
1012 {
1013 cur_str_idx = re_string_cur_idx (mctx->input);
1014 if (BE (err != REG_NOERROR, 0))
1015 return -2;
1016 if (fl_search && !match)
1017 {
1018 /* Restart from initial state, since we are searching
1019 the point from where matching start. */
3b0bdc72 1020#ifdef RE_ENABLE_I18N
15a7d175
UD
1021 if (MB_CUR_MAX == 1
1022 || re_string_first_byte (mctx->input, cur_str_idx))
3b0bdc72 1023#endif /* RE_ENABLE_I18N */
15a7d175
UD
1024 cur_state = acquire_init_state_context (&err, preg, mctx,
1025 cur_str_idx);
1026 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
1027 return -2;
1028 if (mctx->state_log != NULL)
1029 mctx->state_log[cur_str_idx] = cur_state;
1030 }
1031 else if (!fl_longest_match && match)
1032 break;
1033 else /* (fl_longest_match && match) || (!fl_search && !match) */
1034 {
1035 if (mctx->state_log == NULL)
1036 break;
1037 else
1038 {
1039 int max = mctx->state_log_top;
1040 for (; cur_str_idx <= max; ++cur_str_idx)
1041 if (mctx->state_log[cur_str_idx] != NULL)
1042 break;
1043 if (cur_str_idx > max)
1044 break;
1045 }
1046 }
1047 }
3b0bdc72
UD
1048
1049 if (cur_state != NULL && cur_state->halt)
15a7d175
UD
1050 {
1051 /* Reached at a halt state.
1052 Check the halt state can satisfy the current context. */
1053 if (!cur_state->has_constraint
1054 || check_halt_state_context (preg, cur_state, mctx,
1055 re_string_cur_idx (mctx->input)))
1056 {
1057 /* We found an appropriate halt state. */
1058 match_last = re_string_cur_idx (mctx->input);
1059 match = 1;
1060 if (!fl_longest_match)
1061 break;
1062 }
1063 }
3b0bdc72
UD
1064 }
1065 return match_last;
1066}
1067
1068/* Check NODE match the current context. */
1069
1070static int check_halt_node_context (dfa, node, context)
1071 const re_dfa_t *dfa;
1072 int node;
1073 unsigned int context;
1074{
3b0bdc72 1075 re_token_type_t type = dfa->nodes[node].type;
485d775d
UD
1076 unsigned int constraint = dfa->nodes[node].constraint;
1077 if (type != END_OF_RE)
3b0bdc72 1078 return 0;
485d775d
UD
1079 if (!constraint)
1080 return 1;
1081 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
3b0bdc72
UD
1082 return 0;
1083 return 1;
1084}
1085
1086/* Check the halt state STATE match the current context.
1087 Return 0 if not match, if the node, STATE has, is a halt node and
1088 match the context, return the node. */
1089
1090static int
612546c6 1091check_halt_state_context (preg, state, mctx, idx)
3b0bdc72
UD
1092 const regex_t *preg;
1093 const re_dfastate_t *state;
612546c6
UD
1094 const re_match_context_t *mctx;
1095 int idx;
3b0bdc72
UD
1096{
1097 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1098 int i;
1099 unsigned int context;
1100#ifdef DEBUG
1101 assert (state->halt);
1102#endif
612546c6 1103 context = re_string_context_at (mctx->input, idx, mctx->eflags,
15a7d175 1104 preg->newline_anchor);
3b0bdc72
UD
1105 for (i = 0; i < state->nodes.nelem; ++i)
1106 if (check_halt_node_context (dfa, state->nodes.elems[i], context))
1107 return state->nodes.elems[i];
1108 return 0;
1109}
1110
a9388965
UD
1111/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1112 corresponding to the DFA).
1113 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1114 of errors. */
3b0bdc72
UD
1115
1116static int
a3022b82 1117proceed_next_node (preg, nregs, regs, mctx, pidx, node, eps_via_nodes, fs)
3b0bdc72 1118 const regex_t *preg;
0742e48e 1119 regmatch_t *regs;
3b0bdc72 1120 const re_match_context_t *mctx;
a3022b82 1121 int nregs, *pidx, node;
3b0bdc72 1122 re_node_set *eps_via_nodes;
a3022b82 1123 struct re_fail_stack_t *fs;
3b0bdc72
UD
1124{
1125 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
485d775d 1126 int i, err, dest_node;
81c64d40 1127 dest_node = -1;
3b0bdc72
UD
1128 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1129 {
485d775d
UD
1130 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1131 int ndest, dest_nodes[2];
a9388965 1132 err = re_node_set_insert (eps_via_nodes, node);
bc15410e 1133 if (BE (err < 0, 0))
15a7d175 1134 return -1;
a3022b82 1135 /* Pick up valid destinations. */
485d775d 1136 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
15a7d175
UD
1137 {
1138 int candidate = dfa->edests[node].elems[i];
1139 if (!re_node_set_contains (cur_nodes, candidate))
1140 continue;
1141 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1142 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1143 ++ndest;
1144 }
a3022b82 1145 if (ndest <= 1)
15a7d175 1146 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
a3022b82
UD
1147 /* In order to avoid infinite loop like "(a*)*". */
1148 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
15a7d175 1149 return dest_nodes[1];
a3022b82 1150 if (fs != NULL)
15a7d175 1151 push_fail_stack (fs, *pidx, dest_nodes, nregs, regs, eps_via_nodes);
a3022b82 1152 return dest_nodes[0];
3b0bdc72
UD
1153 }
1154 else
1155 {
485d775d 1156 int naccepted = 0;
3b0bdc72 1157 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72 1158
434d3784 1159#ifdef RE_ENABLE_I18N
3b0bdc72 1160 if (ACCEPT_MB_NODE (type))
15a7d175 1161 naccepted = check_node_accept_bytes (preg, node, mctx->input, *pidx);
434d3784
UD
1162 else
1163#endif /* RE_ENABLE_I18N */
1164 if (type == OP_BACK_REF)
15a7d175
UD
1165 {
1166 int subexp_idx = dfa->nodes[node].opr.idx;
1167 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1168 if (fs != NULL)
1169 {
1170 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1171 return -1;
1172 else if (naccepted)
1173 {
85c54a32 1174 char *buf = (char *) re_string_get_buffer (mctx->input);
6291ee3c
UD
1175 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1176 naccepted) != 0)
15a7d175
UD
1177 return -1;
1178 }
1179 }
1180
1181 if (naccepted == 0)
1182 {
1183 err = re_node_set_insert (eps_via_nodes, node);
1184 if (BE (err < 0, 0))
1185 return -2;
1186 dest_node = dfa->edests[node].elems[0];
1187 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1188 dest_node))
1189 return dest_node;
1190 }
1191 }
3b0bdc72
UD
1192
1193 if (naccepted != 0
15a7d175
UD
1194 || check_node_accept (preg, dfa->nodes + node, mctx, *pidx))
1195 {
1196 dest_node = dfa->nexts[node];
1197 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1198 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1199 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1200 dest_node)))
1201 return -1;
1202 re_node_set_empty (eps_via_nodes);
1203 return dest_node;
1204 }
3b0bdc72 1205 }
a3022b82
UD
1206 return -1;
1207}
1208
1209static reg_errcode_t
1210push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1211 struct re_fail_stack_t *fs;
1212 int str_idx, *dests, nregs;
1213 regmatch_t *regs;
1214 re_node_set *eps_via_nodes;
1215{
1216 reg_errcode_t err;
1217 int num = fs->num++;
1218 if (fs->num == fs->alloc)
1219 {
1b2c2628 1220 struct re_fail_stack_ent_t *new_array;
a3022b82 1221 fs->alloc *= 2;
1b2c2628 1222 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
15a7d175 1223 * fs->alloc));
1b2c2628 1224 if (new_array == NULL)
15a7d175 1225 return REG_ESPACE;
1b2c2628 1226 fs->stack = new_array;
a3022b82
UD
1227 }
1228 fs->stack[num].idx = str_idx;
1229 fs->stack[num].node = dests[1];
1230 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1231 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1232 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1233 return err;
1234}
1b2c2628 1235
a3022b82
UD
1236static int
1237pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1238 struct re_fail_stack_t *fs;
1239 int *pidx, nregs;
1240 regmatch_t *regs;
1241 re_node_set *eps_via_nodes;
1242{
1243 int num = --fs->num;
1244 assert (num >= 0);
1245 *pidx = fs->stack[num].idx;
1246 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1247 re_node_set_free (eps_via_nodes);
485d775d 1248 re_free (fs->stack[num].regs);
a3022b82
UD
1249 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1250 return fs->stack[num].node;
3b0bdc72
UD
1251}
1252
1253/* Set the positions where the subexpressions are starts/ends to registers
1254 PMATCH.
1255 Note: We assume that pmatch[0] is already set, and
1256 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1257
a9388965 1258static reg_errcode_t
a3022b82
UD
1259set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1260 const regex_t *preg;
1261 const re_match_context_t *mctx;
1262 size_t nmatch;
1263 regmatch_t *pmatch;
1264 int fl_backtrack;
3b0bdc72
UD
1265{
1266 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
81c64d40 1267 int idx, cur_node, real_nmatch;
3b0bdc72 1268 re_node_set eps_via_nodes;
a3022b82
UD
1269 struct re_fail_stack_t *fs;
1270 struct re_fail_stack_t fs_body = {0, 2, NULL};
3b0bdc72
UD
1271#ifdef DEBUG
1272 assert (nmatch > 1);
612546c6 1273 assert (mctx->state_log != NULL);
3b0bdc72 1274#endif
a3022b82
UD
1275 if (fl_backtrack)
1276 {
1277 fs = &fs_body;
1278 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1279 }
1280 else
1281 fs = NULL;
3b0bdc72
UD
1282 cur_node = dfa->init_node;
1283 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1284 re_node_set_init_empty (&eps_via_nodes);
1285 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1286 {
81c64d40 1287 update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
a3022b82 1288 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
15a7d175
UD
1289 {
1290 int reg_idx;
1291 if (fs)
1292 {
1293 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1294 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1295 break;
1296 if (reg_idx == nmatch)
1297 {
1298 re_node_set_free (&eps_via_nodes);
1299 return free_fail_stack_return (fs);
1300 }
1301 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1302 &eps_via_nodes);
1303 }
1304 else
1305 {
1306 re_node_set_free (&eps_via_nodes);
1307 return REG_NOERROR;
1308 }
1309 }
3b0bdc72
UD
1310
1311 /* Proceed to next node. */
a3022b82 1312 cur_node = proceed_next_node (preg, nmatch, pmatch, mctx, &idx, cur_node,
15a7d175 1313 &eps_via_nodes, fs);
a3022b82 1314
bc15410e 1315 if (BE (cur_node < 0, 0))
15a7d175
UD
1316 {
1317 if (cur_node == -2)
1318 return REG_ESPACE;
1319 if (fs)
1320 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1321 &eps_via_nodes);
1322 else
1323 {
1324 re_node_set_free (&eps_via_nodes);
1325 return REG_NOMATCH;
1326 }
1327 }
3b0bdc72
UD
1328 }
1329 re_node_set_free (&eps_via_nodes);
485d775d
UD
1330 return free_fail_stack_return (fs);
1331}
1332
1333static reg_errcode_t
1334free_fail_stack_return (fs)
1335 struct re_fail_stack_t *fs;
1336{
1337 if (fs)
1338 {
1339 int fs_idx;
1340 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
15a7d175
UD
1341 {
1342 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1343 re_free (fs->stack[fs_idx].regs);
1344 }
485d775d
UD
1345 re_free (fs->stack);
1346 }
a9388965 1347 return REG_NOERROR;
3b0bdc72
UD
1348}
1349
81c64d40
UD
1350static void
1351update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
1352 re_dfa_t *dfa;
1353 regmatch_t *pmatch;
1354 int cur_node, cur_idx, nmatch;
1355{
1356 int type = dfa->nodes[cur_node].type;
1357 int reg_num;
1358 if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
1359 return;
1360 reg_num = dfa->nodes[cur_node].opr.idx + 1;
1361 if (reg_num >= nmatch)
1362 return;
1363 if (type == OP_OPEN_SUBEXP)
1364 {
1365 /* We are at the first node of this sub expression. */
1366 pmatch[reg_num].rm_so = cur_idx;
1367 pmatch[reg_num].rm_eo = -1;
1368 }
1369 else if (type == OP_CLOSE_SUBEXP)
1370 /* We are at the first node of this sub expression. */
1371 pmatch[reg_num].rm_eo = cur_idx;
0742e48e 1372}
81c64d40 1373
3b0bdc72
UD
1374#define NUMBER_OF_STATE 1
1375
0742e48e 1376/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
612546c6
UD
1377 and sift the nodes in each states according to the following rules.
1378 Updated state_log will be wrote to STATE_LOG.
3b0bdc72
UD
1379
1380 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1381 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
15a7d175
UD
1382 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1383 the LAST_NODE, we throw away the node `a'.
612546c6 1384 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
15a7d175
UD
1385 string `s' and transit to `b':
1386 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1387 away the node `a'.
1388 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1389 throwed away, we throw away the node `a'.
3b0bdc72 1390 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
15a7d175
UD
1391 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1392 node `a'.
1393 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1394 we throw away the node `a'. */
3b0bdc72
UD
1395
1396#define STATE_NODE_CONTAINS(state,node) \
1397 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1398
a9388965 1399static reg_errcode_t
0742e48e
UD
1400sift_states_backward (preg, mctx, sctx)
1401 const regex_t *preg;
1402 re_match_context_t *mctx;
1403 re_sift_context_t *sctx;
3b0bdc72 1404{
a9388965 1405 reg_errcode_t err;
3b0bdc72 1406 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
0742e48e
UD
1407 int null_cnt = 0;
1408 int str_idx = sctx->last_str_idx;
1409 re_node_set cur_dest;
1410 re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
3b0bdc72
UD
1411
1412#ifdef DEBUG
612546c6 1413 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
3b0bdc72 1414#endif
0742e48e 1415 cur_src = &mctx->state_log[str_idx]->nodes;
3b0bdc72
UD
1416
1417 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1418 transit to the last_node and the last_node itself. */
0742e48e 1419 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
bc15410e 1420 if (BE (err != REG_NOERROR, 0))
a9388965 1421 return err;
0742e48e
UD
1422 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1423 if (BE (err != REG_NOERROR, 0))
1b2c2628 1424 goto free_return;
3b0bdc72
UD
1425
1426 /* Then check each states in the state_log. */
612546c6 1427 while (str_idx > 0)
3b0bdc72 1428 {
0742e48e 1429 int i, ret;
3b0bdc72 1430 /* Update counters. */
0742e48e
UD
1431 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1432 if (null_cnt > mctx->max_mb_elem_len)
15a7d175
UD
1433 {
1434 memset (sctx->sifted_states, '\0',
1435 sizeof (re_dfastate_t *) * str_idx);
1436 re_node_set_free (&cur_dest);
1437 return REG_NOERROR;
1438 }
0742e48e 1439 re_node_set_empty (&cur_dest);
3b0bdc72 1440 --str_idx;
0742e48e 1441 cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1442 : &mctx->state_log[str_idx]->nodes);
3b0bdc72
UD
1443
1444 /* Then build the next sifted state.
15a7d175
UD
1445 We build the next sifted state on `cur_dest', and update
1446 `sifted_states[str_idx]' with `cur_dest'.
1447 Note:
1448 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1449 `cur_src' points the node_set of the old `state_log[str_idx]'. */
0742e48e 1450 for (i = 0; i < cur_src->nelem; i++)
15a7d175
UD
1451 {
1452 int prev_node = cur_src->elems[i];
1453 int naccepted = 0;
1454 re_token_type_t type = dfa->nodes[prev_node].type;
0742e48e 1455
15a7d175
UD
1456 if (IS_EPSILON_NODE(type))
1457 continue;
434d3784 1458#ifdef RE_ENABLE_I18N
15a7d175
UD
1459 /* If the node may accept `multi byte'. */
1460 if (ACCEPT_MB_NODE (type))
1461 naccepted = sift_states_iter_mb (preg, mctx, sctx, prev_node,
1462 str_idx, sctx->last_str_idx);
3b0bdc72 1463
434d3784 1464#endif /* RE_ENABLE_I18N */
15a7d175
UD
1465 /* We don't check backreferences here.
1466 See update_cur_sifted_state(). */
1467
1468 if (!naccepted
1469 && check_node_accept (preg, dfa->nodes + prev_node, mctx,
1470 str_idx)
1471 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1472 dfa->nexts[prev_node]))
1473 naccepted = 1;
1474
1475 if (naccepted == 0)
1476 continue;
1477
1478 if (sctx->limits.nelem)
1479 {
1480 int to_idx = str_idx + naccepted;
1481 if (check_dst_limits (dfa, &sctx->limits, mctx,
1482 dfa->nexts[prev_node], to_idx,
1483 prev_node, str_idx))
1484 continue;
1485 }
1486 ret = re_node_set_insert (&cur_dest, prev_node);
1487 if (BE (ret == -1, 0))
1488 {
1489 err = REG_ESPACE;
1490 goto free_return;
1491 }
1492 }
3b0bdc72 1493
0742e48e 1494 /* Add all the nodes which satisfy the following conditions:
15a7d175
UD
1495 - It can epsilon transit to a node in CUR_DEST.
1496 - It is in CUR_SRC.
1497 And update state_log. */
0742e48e
UD
1498 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1499 if (BE (err != REG_NOERROR, 0))
15a7d175 1500 goto free_return;
3b0bdc72 1501 }
1b2c2628
UD
1502 err = REG_NOERROR;
1503 free_return:
0742e48e 1504 re_node_set_free (&cur_dest);
1b2c2628 1505 return err;
3b0bdc72
UD
1506}
1507
1508/* Helper functions. */
1509
25337753 1510static reg_errcode_t
612546c6 1511clean_state_log_if_need (mctx, next_state_log_idx)
3b0bdc72
UD
1512 re_match_context_t *mctx;
1513 int next_state_log_idx;
1514{
1515 int top = mctx->state_log_top;
612546c6
UD
1516
1517 if (next_state_log_idx >= mctx->input->bufs_len
1518 || (next_state_log_idx >= mctx->input->valid_len
15a7d175 1519 && mctx->input->valid_len < mctx->input->len))
612546c6
UD
1520 {
1521 reg_errcode_t err;
1522 err = extend_buffers (mctx);
1523 if (BE (err != REG_NOERROR, 0))
15a7d175 1524 return err;
612546c6
UD
1525 }
1526
3b0bdc72
UD
1527 if (top < next_state_log_idx)
1528 {
612546c6 1529 memset (mctx->state_log + top + 1, '\0',
15a7d175 1530 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
3b0bdc72
UD
1531 mctx->state_log_top = next_state_log_idx;
1532 }
612546c6 1533 return REG_NOERROR;
3b0bdc72
UD
1534}
1535
1b2c2628
UD
1536static reg_errcode_t
1537merge_state_array (dfa, dst, src, num)
0742e48e
UD
1538 re_dfa_t *dfa;
1539 re_dfastate_t **dst;
1540 re_dfastate_t **src;
1541 int num;
3b0bdc72 1542{
0742e48e
UD
1543 int st_idx;
1544 reg_errcode_t err;
1545 for (st_idx = 0; st_idx < num; ++st_idx)
1546 {
1547 if (dst[st_idx] == NULL)
15a7d175 1548 dst[st_idx] = src[st_idx];
0742e48e 1549 else if (src[st_idx] != NULL)
15a7d175
UD
1550 {
1551 re_node_set merged_set;
1552 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1553 &src[st_idx]->nodes);
1554 if (BE (err != REG_NOERROR, 0))
1555 return err;
1556 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1557 re_node_set_free (&merged_set);
1558 if (BE (err != REG_NOERROR, 0))
1559 return err;
1560 }
0742e48e
UD
1561 }
1562 return REG_NOERROR;
3b0bdc72
UD
1563}
1564
0742e48e
UD
1565static reg_errcode_t
1566update_cur_sifted_state (preg, mctx, sctx, str_idx, dest_nodes)
1567 const regex_t *preg;
1568 re_match_context_t *mctx;
1569 re_sift_context_t *sctx;
1570 int str_idx;
1571 re_node_set *dest_nodes;
3b0bdc72 1572{
0742e48e
UD
1573 reg_errcode_t err;
1574 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1575 const re_node_set *candidates;
1576 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1577 : &mctx->state_log[str_idx]->nodes);
0742e48e
UD
1578
1579 /* At first, add the nodes which can epsilon transit to a node in
1580 DEST_NODE. */
485d775d
UD
1581 if (dest_nodes->nelem)
1582 {
1583 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1584 if (BE (err != REG_NOERROR, 0))
1585 return err;
1586 }
0742e48e
UD
1587
1588 /* Then, check the limitations in the current sift_context. */
485d775d 1589 if (dest_nodes->nelem && sctx->limits.nelem)
0742e48e
UD
1590 {
1591 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
15a7d175 1592 mctx->bkref_ents, str_idx);
0742e48e 1593 if (BE (err != REG_NOERROR, 0))
15a7d175 1594 return err;
0742e48e
UD
1595 }
1596
1597 /* Update state_log. */
1598 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1599 if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1600 return err;
1601
0742e48e
UD
1602 if ((mctx->state_log[str_idx] != NULL
1603 && mctx->state_log[str_idx]->has_backref))
1604 {
1605 err = sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes);
1606 if (BE (err != REG_NOERROR, 0))
15a7d175 1607 return err;
0742e48e
UD
1608 }
1609 return REG_NOERROR;
3b0bdc72
UD
1610}
1611
a9388965 1612static reg_errcode_t
0742e48e
UD
1613add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1614 re_dfa_t *dfa;
1615 re_node_set *dest_nodes;
1616 const re_node_set *candidates;
3b0bdc72 1617{
0742e48e
UD
1618 reg_errcode_t err;
1619 int src_idx;
1620 re_node_set src_copy;
1621
1622 err = re_node_set_init_copy (&src_copy, dest_nodes);
1623 if (BE (err != REG_NOERROR, 0))
1624 return err;
1625 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
3b0bdc72 1626 {
0742e48e 1627 err = re_node_set_add_intersect (dest_nodes, candidates,
15a7d175
UD
1628 dfa->inveclosures
1629 + src_copy.elems[src_idx]);
0742e48e 1630 if (BE (err != REG_NOERROR, 0))
15a7d175
UD
1631 {
1632 re_node_set_free (&src_copy);
1633 return err;
1634 }
0742e48e
UD
1635 }
1636 re_node_set_free (&src_copy);
1637 return REG_NOERROR;
1638}
3b0bdc72 1639
0742e48e
UD
1640static reg_errcode_t
1641sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1642 re_dfa_t *dfa;
1643 int node;
1644 re_node_set *dest_nodes;
1645 const re_node_set *candidates;
1646{
1647 int ecl_idx;
1648 reg_errcode_t err;
1649 re_node_set *inv_eclosure = dfa->inveclosures + node;
1650 re_node_set except_nodes;
1651 re_node_set_init_empty (&except_nodes);
1652 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1653 {
15a7d175
UD
1654 int cur_node = inv_eclosure->elems[ecl_idx];
1655 if (cur_node == node)
1656 continue;
1657 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1658 {
1659 int edst1 = dfa->edests[cur_node].elems[0];
1660 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1661 ? dfa->edests[cur_node].elems[1] : -1);
1662 if ((!re_node_set_contains (inv_eclosure, edst1)
1663 && re_node_set_contains (dest_nodes, edst1))
1664 || (edst2 > 0
1665 && !re_node_set_contains (inv_eclosure, edst2)
1666 && re_node_set_contains (dest_nodes, edst2)))
1667 {
1668 err = re_node_set_add_intersect (&except_nodes, candidates,
1669 dfa->inveclosures + cur_node);
1670 if (BE (err != REG_NOERROR, 0))
1671 {
1672 re_node_set_free (&except_nodes);
1673 return err;
1674 }
1675 }
1676 }
0742e48e
UD
1677 }
1678 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1679 {
15a7d175
UD
1680 int cur_node = inv_eclosure->elems[ecl_idx];
1681 if (!re_node_set_contains (&except_nodes, cur_node))
1682 {
1683 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1684 re_node_set_remove_at (dest_nodes, idx);
1685 }
0742e48e
UD
1686 }
1687 re_node_set_free (&except_nodes);
1688 return REG_NOERROR;
1689}
1690
a3022b82
UD
1691static int
1692check_dst_limits (dfa, limits, mctx, dst_node, dst_idx, src_node, src_idx)
1693 re_dfa_t *dfa;
1694 re_node_set *limits;
1695 re_match_context_t *mctx;
1696 int dst_node, dst_idx, src_node, src_idx;
1697{
1698 int lim_idx, src_pos, dst_pos;
1699
1700 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1701 {
485d775d 1702 int subexp_idx;
a3022b82
UD
1703 struct re_backref_cache_entry *ent;
1704 ent = mctx->bkref_ents + limits->elems[lim_idx];
485d775d 1705 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
a3022b82
UD
1706
1707 dst_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
15a7d175
UD
1708 dfa->eclosures + dst_node,
1709 subexp_idx, dst_node, dst_idx);
a3022b82 1710 src_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
15a7d175
UD
1711 dfa->eclosures + src_node,
1712 subexp_idx, src_node, src_idx);
a3022b82
UD
1713
1714 /* In case of:
15a7d175
UD
1715 <src> <dst> ( <subexp> )
1716 ( <subexp> ) <src> <dst>
1717 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
a3022b82 1718 if (src_pos == dst_pos)
15a7d175 1719 continue; /* This is unrelated limitation. */
a3022b82 1720 else
15a7d175 1721 return 1;
a3022b82
UD
1722 }
1723 return 0;
1724}
1725
1726static int
1727check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, node,
15a7d175 1728 str_idx)
a3022b82
UD
1729 re_dfa_t *dfa;
1730 re_match_context_t *mctx;
1731 re_node_set *eclosures;
1732 int limit, subexp_idx, node, str_idx;
1733{
1734 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1735 int pos = (str_idx < lim->subexp_from ? -1
15a7d175 1736 : (lim->subexp_to < str_idx ? 1 : 0));
a3022b82
UD
1737 if (pos == 0
1738 && (str_idx == lim->subexp_from || str_idx == lim->subexp_to))
1739 {
1740 int node_idx;
1741 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
15a7d175
UD
1742 {
1743 int node = eclosures->elems[node_idx];
1744 re_token_type_t type= dfa->nodes[node].type;
1745 if (type == OP_BACK_REF)
1746 {
6291ee3c
UD
1747 int bi = search_cur_bkref_entry (mctx, str_idx);
1748 for (; bi < mctx->nbkref_ents; ++bi)
15a7d175
UD
1749 {
1750 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
6291ee3c
UD
1751 if (ent->str_idx > str_idx)
1752 break;
1753 if (ent->node == node && ent->subexp_from == ent->subexp_to)
15a7d175
UD
1754 {
1755 int cpos, dst;
1756 dst = dfa->edests[node].elems[0];
1757 cpos = check_dst_limits_calc_pos (dfa, mctx, limit,
1758 dfa->eclosures + dst,
1759 subexp_idx, dst,
1760 str_idx);
1761 if ((str_idx == lim->subexp_from && cpos == -1)
1762 || (str_idx == lim->subexp_to && cpos == 0))
1763 return cpos;
1764 }
1765 }
1766 }
1767 if (type == OP_OPEN_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
1768 && str_idx == lim->subexp_from)
1769 {
1770 pos = -1;
1771 break;
1772 }
1773 if (type == OP_CLOSE_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
1774 && str_idx == lim->subexp_to)
1775 break;
1776 }
a3022b82 1777 if (node_idx == eclosures->nelem && str_idx == lim->subexp_to)
15a7d175 1778 pos = 1;
a3022b82
UD
1779 }
1780 return pos;
1781}
1782
0742e48e
UD
1783/* Check the limitations of sub expressions LIMITS, and remove the nodes
1784 which are against limitations from DEST_NODES. */
1785
1786static reg_errcode_t
1787check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1788 re_dfa_t *dfa;
1789 re_node_set *dest_nodes;
1790 const re_node_set *candidates;
1791 re_node_set *limits;
1792 struct re_backref_cache_entry *bkref_ents;
1793 int str_idx;
1794{
1795 reg_errcode_t err;
1796 int node_idx, lim_idx;
1797
1798 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1799 {
485d775d 1800 int subexp_idx;
0742e48e
UD
1801 struct re_backref_cache_entry *ent;
1802 ent = bkref_ents + limits->elems[lim_idx];
1803
1804 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
15a7d175 1805 continue; /* This is unrelated limitation. */
0742e48e 1806
485d775d 1807 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
0742e48e 1808 if (ent->subexp_to == str_idx)
15a7d175
UD
1809 {
1810 int ops_node = -1;
1811 int cls_node = -1;
1812 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1813 {
1814 int node = dest_nodes->elems[node_idx];
1815 re_token_type_t type= dfa->nodes[node].type;
1816 if (type == OP_OPEN_SUBEXP
1817 && subexp_idx == dfa->nodes[node].opr.idx)
1818 ops_node = node;
1819 else if (type == OP_CLOSE_SUBEXP
1820 && subexp_idx == dfa->nodes[node].opr.idx)
1821 cls_node = node;
1822 }
1823
1824 /* Check the limitation of the open subexpression. */
1825 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1826 if (ops_node >= 0)
1827 {
1828 err = sub_epsilon_src_nodes(dfa, ops_node, dest_nodes,
1829 candidates);
1830 if (BE (err != REG_NOERROR, 0))
1831 return err;
1832 }
1833 /* Check the limitation of the close subexpression. */
1834 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1835 {
1836 int node = dest_nodes->elems[node_idx];
1837 if (!re_node_set_contains (dfa->inveclosures + node, cls_node)
1838 && !re_node_set_contains (dfa->eclosures + node, cls_node))
1839 {
1840 /* It is against this limitation.
1841 Remove it form the current sifted state. */
1842 err = sub_epsilon_src_nodes(dfa, node, dest_nodes,
1843 candidates);
1844 if (BE (err != REG_NOERROR, 0))
1845 return err;
1846 --node_idx;
1847 }
1848 }
1849 }
0742e48e 1850 else /* (ent->subexp_to != str_idx) */
15a7d175
UD
1851 {
1852 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1853 {
1854 int node = dest_nodes->elems[node_idx];
1855 re_token_type_t type= dfa->nodes[node].type;
1856 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1857 {
1858 if (subexp_idx != dfa->nodes[node].opr.idx)
1859 continue;
1860 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1861 || (type == OP_OPEN_SUBEXP))
1862 {
1863 /* It is against this limitation.
1864 Remove it form the current sifted state. */
1865 err = sub_epsilon_src_nodes(dfa, node, dest_nodes,
1866 candidates);
1867 if (BE (err != REG_NOERROR, 0))
1868 return err;
1869 }
1870 }
1871 }
1872 }
0742e48e
UD
1873 }
1874 return REG_NOERROR;
1875}
1876
0742e48e
UD
1877static reg_errcode_t
1878sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes)
1879 const regex_t *preg;
1880 re_match_context_t *mctx;
1881 re_sift_context_t *sctx;
1882 int str_idx;
1883 re_node_set *dest_nodes;
1884{
1885 reg_errcode_t err;
1886 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1887 int node_idx, node;
1888 re_sift_context_t local_sctx;
1889 const re_node_set *candidates;
1890 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1891 : &mctx->state_log[str_idx]->nodes);
0742e48e
UD
1892 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
1893
1894 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
1895 {
0742e48e
UD
1896 int cur_bkref_idx = re_string_cur_idx (mctx->input);
1897 re_token_type_t type;
1898 node = candidates->elems[node_idx];
1899 type = dfa->nodes[node].type;
0742e48e 1900 if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
15a7d175 1901 continue;
0742e48e
UD
1902 /* Avoid infinite loop for the REs like "()\1+". */
1903 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
15a7d175 1904 continue;
0742e48e 1905 if (type == OP_BACK_REF)
15a7d175 1906 {
6291ee3c
UD
1907 int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
1908 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
15a7d175
UD
1909 {
1910 int disabled_idx, subexp_len, to_idx, dst_node;
1911 struct re_backref_cache_entry *entry;
1912 entry = mctx->bkref_ents + enabled_idx;
6291ee3c
UD
1913 if (entry->str_idx > str_idx)
1914 break;
1915 if (entry->node != node)
1916 continue;
15a7d175
UD
1917 subexp_len = entry->subexp_to - entry->subexp_from;
1918 to_idx = str_idx + subexp_len;
1919 dst_node = (subexp_len ? dfa->nexts[node]
1920 : dfa->edests[node].elems[0]);
1921
6291ee3c
UD
1922 if (to_idx > sctx->last_str_idx
1923 || sctx->sifted_states[to_idx] == NULL
1924 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
1925 dst_node)
1926 || check_dst_limits (dfa, &sctx->limits, mctx, node,
1927 str_idx, dst_node, to_idx))
15a7d175 1928 continue;
15a7d175
UD
1929 {
1930 re_dfastate_t *cur_state;
1931 entry->flag = 0;
1932 for (disabled_idx = enabled_idx + 1;
1933 disabled_idx < mctx->nbkref_ents; ++disabled_idx)
1934 {
1935 struct re_backref_cache_entry *entry2;
1936 entry2 = mctx->bkref_ents + disabled_idx;
6291ee3c
UD
1937 if (entry2->str_idx > str_idx)
1938 break;
1939 entry2->flag = (entry2->node == node) ? 1 : entry2->flag;
15a7d175
UD
1940 }
1941
1942 if (local_sctx.sifted_states == NULL)
1943 {
1944 local_sctx = *sctx;
1945 err = re_node_set_init_copy (&local_sctx.limits,
1946 &sctx->limits);
1947 if (BE (err != REG_NOERROR, 0))
1b2c2628 1948 goto free_return;
15a7d175
UD
1949 }
1950 local_sctx.last_node = node;
1951 local_sctx.last_str_idx = str_idx;
1952 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
1953 if (BE (err < 0, 0))
1b2c2628
UD
1954 {
1955 err = REG_ESPACE;
1956 goto free_return;
1957 }
15a7d175
UD
1958 cur_state = local_sctx.sifted_states[str_idx];
1959 err = sift_states_backward (preg, mctx, &local_sctx);
1960 if (BE (err != REG_NOERROR, 0))
1b2c2628 1961 goto free_return;
15a7d175
UD
1962 if (sctx->limited_states != NULL)
1963 {
1964 err = merge_state_array (dfa, sctx->limited_states,
1965 local_sctx.sifted_states,
1966 str_idx + 1);
1967 if (BE (err != REG_NOERROR, 0))
1968 goto free_return;
1969 }
1970 local_sctx.sifted_states[str_idx] = cur_state;
1971 re_node_set_remove (&local_sctx.limits, enabled_idx);
485d775d
UD
1972 /* We must not use the variable entry here, since
1973 mctx->bkref_ents might be realloced. */
1974 mctx->bkref_ents[enabled_idx].flag = 1;
15a7d175
UD
1975 }
1976 }
6291ee3c
UD
1977 enabled_idx = search_cur_bkref_entry (mctx, str_idx);
1978 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
15a7d175
UD
1979 {
1980 struct re_backref_cache_entry *entry;
1981 entry = mctx->bkref_ents + enabled_idx;
6291ee3c
UD
1982 if (entry->str_idx > str_idx)
1983 break;
1984 if (entry->node == node)
15a7d175
UD
1985 entry->flag = 0;
1986 }
1987 }
0742e48e 1988 }
1b2c2628
UD
1989 err = REG_NOERROR;
1990 free_return:
0742e48e
UD
1991 if (local_sctx.sifted_states != NULL)
1992 {
0742e48e
UD
1993 re_node_set_free (&local_sctx.limits);
1994 }
1995
1b2c2628 1996 return err;
0742e48e
UD
1997}
1998
1999
2000#ifdef RE_ENABLE_I18N
2001static int
2002sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx)
2003 const regex_t *preg;
2004 const re_match_context_t *mctx;
2005 re_sift_context_t *sctx;
2006 int node_idx, str_idx, max_str_idx;
2007{
2008 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2009 int naccepted;
2010 /* Check the node can accept `multi byte'. */
2011 naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx);
2012 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2013 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
15a7d175 2014 dfa->nexts[node_idx]))
0742e48e
UD
2015 /* The node can't accept the `multi byte', or the
2016 destination was already throwed away, then the node
2017 could't accept the current input `multi byte'. */
2018 naccepted = 0;
2019 /* Otherwise, it is sure that the node could accept
2020 `naccepted' bytes input. */
2021 return naccepted;
2022}
2023#endif /* RE_ENABLE_I18N */
2024
3b0bdc72
UD
2025\f
2026/* Functions for state transition. */
2027
2028/* Return the next state to which the current state STATE will transit by
2029 accepting the current input byte, and update STATE_LOG if necessary.
2030 If STATE can accept a multibyte char/collating element/back reference
2031 update the destination of STATE_LOG. */
2032
2033static re_dfastate_t *
612546c6 2034transit_state (err, preg, mctx, state, fl_search)
a9388965
UD
2035 reg_errcode_t *err;
2036 const regex_t *preg;
a9388965 2037 re_match_context_t *mctx;
612546c6
UD
2038 re_dfastate_t *state;
2039 int fl_search;
3b0bdc72
UD
2040{
2041 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2042 re_dfastate_t **trtable, *next_state;
2043 unsigned char ch;
6291ee3c 2044 int cur_idx;
3b0bdc72 2045
612546c6
UD
2046 if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len
2047 || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len
15a7d175 2048 && mctx->input->valid_len < mctx->input->len))
612546c6
UD
2049 {
2050 *err = extend_buffers (mctx);
2051 if (BE (*err != REG_NOERROR, 0))
15a7d175 2052 return NULL;
612546c6
UD
2053 }
2054
a9388965 2055 *err = REG_NOERROR;
3b0bdc72
UD
2056 if (state == NULL)
2057 {
2058 next_state = state;
612546c6 2059 re_string_skip_bytes (mctx->input, 1);
3b0bdc72
UD
2060 }
2061 else
2062 {
434d3784 2063#ifdef RE_ENABLE_I18N
3b0bdc72
UD
2064 /* If the current state can accept multibyte. */
2065 if (state->accept_mb)
15a7d175
UD
2066 {
2067 *err = transit_state_mb (preg, state, mctx);
2068 if (BE (*err != REG_NOERROR, 0))
2069 return NULL;
2070 }
434d3784 2071#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2072
2073 /* Then decide the next state with the single byte. */
2074 if (1)
15a7d175
UD
2075 {
2076 /* Use transition table */
2077 ch = re_string_fetch_byte (mctx->input);
2078 trtable = fl_search ? state->trtable_search : state->trtable;
2079 if (trtable == NULL)
2080 {
2081 trtable = build_trtable (preg, state, fl_search);
2082 if (fl_search)
2083 state->trtable_search = trtable;
2084 else
2085 state->trtable = trtable;
2086 }
2087 next_state = trtable[ch];
2088 }
3b0bdc72 2089 else
15a7d175
UD
2090 {
2091 /* don't use transition table */
2092 next_state = transit_state_sb (err, preg, state, fl_search, mctx);
2093 if (BE (next_state == NULL && err != REG_NOERROR, 0))
2094 return NULL;
2095 }
3b0bdc72
UD
2096 }
2097
6291ee3c 2098 cur_idx = re_string_cur_idx (mctx->input);
3b0bdc72 2099 /* Update the state_log if we need. */
612546c6 2100 if (mctx->state_log != NULL)
3b0bdc72 2101 {
3b0bdc72 2102 if (cur_idx > mctx->state_log_top)
15a7d175
UD
2103 {
2104 mctx->state_log[cur_idx] = next_state;
2105 mctx->state_log_top = cur_idx;
2106 }
612546c6 2107 else if (mctx->state_log[cur_idx] == 0)
15a7d175
UD
2108 {
2109 mctx->state_log[cur_idx] = next_state;
2110 }
3b0bdc72 2111 else
15a7d175
UD
2112 {
2113 re_dfastate_t *pstate;
2114 unsigned int context;
2115 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2116 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2117 the destination of a multibyte char/collating element/
2118 back reference. Then the next state is the union set of
2119 these destinations and the results of the transition table. */
2120 pstate = mctx->state_log[cur_idx];
2121 log_nodes = pstate->entrance_nodes;
2122 if (next_state != NULL)
2123 {
2124 table_nodes = next_state->entrance_nodes;
2125 *err = re_node_set_init_union (&next_nodes, table_nodes,
2126 log_nodes);
2127 if (BE (*err != REG_NOERROR, 0))
2128 return NULL;
2129 }
2130 else
2131 next_nodes = *log_nodes;
2132 /* Note: We already add the nodes of the initial state,
2133 then we don't need to add them here. */
2134
2135 context = re_string_context_at (mctx->input,
2136 re_string_cur_idx (mctx->input) - 1,
2137 mctx->eflags, preg->newline_anchor);
2138 next_state = mctx->state_log[cur_idx]
2139 = re_acquire_state_context (err, dfa, &next_nodes, context);
2140 /* We don't need to check errors here, since the return value of
2141 this function is next_state and ERR is already set. */
2142
2143 if (table_nodes != NULL)
2144 re_node_set_free (&next_nodes);
2145 }
6291ee3c
UD
2146 }
2147
2148 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2149 later. We must check them here, since the back references in the
2150 next state might use them. */
2151 if (dfa->nbackref && next_state/* && fl_process_bkref */)
2152 {
2153 *err = check_subexp_matching_top (dfa, mctx, &next_state->nodes,
2154 cur_idx);
2155 if (BE (*err != REG_NOERROR, 0))
2156 return NULL;
2157 }
2158
2159 /* If the next state has back references. */
2160 if (next_state != NULL && next_state->has_backref)
2161 {
2162 *err = transit_state_bkref (preg, &next_state->nodes, mctx);
2163 if (BE (*err != REG_NOERROR, 0))
2164 return NULL;
2165 next_state = mctx->state_log[cur_idx];
3b0bdc72
UD
2166 }
2167 return next_state;
2168}
2169
2170/* Helper functions for transit_state. */
2171
6291ee3c
UD
2172/* From the node set CUR_NODES, pick up the nodes whose types are
2173 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2174 expression. And register them to use them later for evaluating the
2175 correspoding back references. */
2176
2177static reg_errcode_t
2178check_subexp_matching_top (dfa, mctx, cur_nodes, str_idx)
2179 re_dfa_t *dfa;
2180 re_match_context_t *mctx;
2181 re_node_set *cur_nodes;
2182 int str_idx;
2183{
2184 int node_idx;
2185 reg_errcode_t err;
2186
2187 /* TODO: This isn't efficient.
2188 Because there might be more than one nodes whose types are
2189 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2190 nodes.
2191 E.g. RE: (a){2} */
2192 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2193 {
2194 int node = cur_nodes->elems[node_idx];
2195 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
ce859332 2196 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
6291ee3c
UD
2197 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2198 {
2199 err = match_ctx_add_subtop (mctx, node, str_idx);
2200 if (BE (err != REG_NOERROR, 0))
2201 return err;
2202 }
2203 }
2204 return REG_NOERROR;
2205}
2206
3b0bdc72
UD
2207/* Return the next state to which the current state STATE will transit by
2208 accepting the current input byte. */
2209
2210static re_dfastate_t *
612546c6 2211transit_state_sb (err, preg, state, fl_search, mctx)
a9388965
UD
2212 reg_errcode_t *err;
2213 const regex_t *preg;
2214 re_dfastate_t *state;
a9388965
UD
2215 int fl_search;
2216 re_match_context_t *mctx;
3b0bdc72
UD
2217{
2218 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2219 re_node_set next_nodes;
2220 re_dfastate_t *next_state;
612546c6 2221 int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input);
3b0bdc72
UD
2222 unsigned int context;
2223
a9388965 2224 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
bc15410e 2225 if (BE (*err != REG_NOERROR, 0))
a9388965 2226 return NULL;
3b0bdc72
UD
2227 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2228 {
2229 int cur_node = state->nodes.elems[node_cnt];
612546c6 2230 if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx))
15a7d175
UD
2231 {
2232 *err = re_node_set_merge (&next_nodes,
2233 dfa->eclosures + dfa->nexts[cur_node]);
2234 if (BE (*err != REG_NOERROR, 0))
2235 {
2236 re_node_set_free (&next_nodes);
2237 return NULL;
2238 }
2239 }
3b0bdc72
UD
2240 }
2241 if (fl_search)
2242 {
2243#ifdef RE_ENABLE_I18N
2244 int not_initial = 0;
2245 if (MB_CUR_MAX > 1)
15a7d175
UD
2246 for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt)
2247 if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER)
2248 {
2249 not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial;
2250 break;
2251 }
3b0bdc72
UD
2252 if (!not_initial)
2253#endif
15a7d175
UD
2254 {
2255 *err = re_node_set_merge (&next_nodes,
2256 dfa->init_state->entrance_nodes);
2257 if (BE (*err != REG_NOERROR, 0))
2258 {
2259 re_node_set_free (&next_nodes);
2260 return NULL;
2261 }
2262 }
3b0bdc72 2263 }
612546c6 2264 context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
15a7d175 2265 preg->newline_anchor);
a9388965
UD
2266 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2267 /* We don't need to check errors here, since the return value of
2268 this function is next_state and ERR is already set. */
2269
3b0bdc72 2270 re_node_set_free (&next_nodes);
612546c6 2271 re_string_skip_bytes (mctx->input, 1);
3b0bdc72
UD
2272 return next_state;
2273}
2274
434d3784 2275#ifdef RE_ENABLE_I18N
a9388965 2276static reg_errcode_t
612546c6 2277transit_state_mb (preg, pstate, mctx)
3b0bdc72 2278 const regex_t *preg;
612546c6 2279 re_dfastate_t *pstate;
3b0bdc72
UD
2280 re_match_context_t *mctx;
2281{
a9388965 2282 reg_errcode_t err;
3b0bdc72
UD
2283 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2284 int i;
2285
2286 for (i = 0; i < pstate->nodes.nelem; ++i)
2287 {
2288 re_node_set dest_nodes, *new_nodes;
2289 int cur_node_idx = pstate->nodes.elems[i];
2290 int naccepted = 0, dest_idx;
2291 unsigned int context;
2292 re_dfastate_t *dest_state;
2293
485d775d 2294 if (dfa->nodes[cur_node_idx].constraint)
15a7d175
UD
2295 {
2296 context = re_string_context_at (mctx->input,
2297 re_string_cur_idx (mctx->input),
2298 mctx->eflags, preg->newline_anchor);
2299 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2300 context))
2301 continue;
2302 }
3b0bdc72
UD
2303
2304 /* How many bytes the node can accepts? */
2305 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
15a7d175
UD
2306 naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input,
2307 re_string_cur_idx (mctx->input));
3b0bdc72 2308 if (naccepted == 0)
15a7d175 2309 continue;
3b0bdc72
UD
2310
2311 /* The node can accepts `naccepted' bytes. */
612546c6 2312 dest_idx = re_string_cur_idx (mctx->input) + naccepted;
0742e48e 2313 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
15a7d175 2314 : mctx->max_mb_elem_len);
612546c6
UD
2315 err = clean_state_log_if_need (mctx, dest_idx);
2316 if (BE (err != REG_NOERROR, 0))
15a7d175 2317 return err;
3b0bdc72
UD
2318#ifdef DEBUG
2319 assert (dfa->nexts[cur_node_idx] != -1);
2320#endif
2321 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
15a7d175 2322 then we use pstate->nodes.elems[i] instead. */
3b0bdc72
UD
2323 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2324
612546c6 2325 dest_state = mctx->state_log[dest_idx];
3b0bdc72 2326 if (dest_state == NULL)
15a7d175 2327 dest_nodes = *new_nodes;
3b0bdc72 2328 else
15a7d175
UD
2329 {
2330 err = re_node_set_init_union (&dest_nodes,
2331 dest_state->entrance_nodes, new_nodes);
2332 if (BE (err != REG_NOERROR, 0))
2333 return err;
2334 }
612546c6 2335 context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
15a7d175 2336 preg->newline_anchor);
612546c6 2337 mctx->state_log[dest_idx]
15a7d175 2338 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
3b0bdc72 2339 if (dest_state != NULL)
15a7d175 2340 re_node_set_free (&dest_nodes);
1b2c2628 2341 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
15a7d175 2342 return err;
3b0bdc72 2343 }
a9388965 2344 return REG_NOERROR;
3b0bdc72 2345}
434d3784 2346#endif /* RE_ENABLE_I18N */
3b0bdc72 2347
a9388965 2348static reg_errcode_t
6291ee3c 2349transit_state_bkref (preg, nodes, mctx)
3b0bdc72 2350 const regex_t *preg;
3b0bdc72 2351 re_node_set *nodes;
3b0bdc72
UD
2352 re_match_context_t *mctx;
2353{
a9388965 2354 reg_errcode_t err;
3b0bdc72 2355 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
0742e48e 2356 int i;
612546c6 2357 int cur_str_idx = re_string_cur_idx (mctx->input);
3b0bdc72
UD
2358
2359 for (i = 0; i < nodes->nelem; ++i)
2360 {
6291ee3c 2361 int dest_str_idx, prev_nelem, bkc_idx;
3b0bdc72
UD
2362 int node_idx = nodes->elems[i];
2363 unsigned int context;
2364 re_token_t *node = dfa->nodes + node_idx;
3b0bdc72
UD
2365 re_node_set *new_dest_nodes;
2366
2367 /* Check whether `node' is a backreference or not. */
6291ee3c 2368 if (node->type != OP_BACK_REF)
15a7d175 2369 continue;
485d775d
UD
2370
2371 if (node->constraint)
15a7d175
UD
2372 {
2373 context = re_string_context_at (mctx->input, cur_str_idx,
2374 mctx->eflags, preg->newline_anchor);
2375 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2376 continue;
2377 }
3b0bdc72
UD
2378
2379 /* `node' is a backreference.
15a7d175 2380 Check the substring which the substring matched. */
6291ee3c
UD
2381 bkc_idx = mctx->nbkref_ents;
2382 err = get_subexp (preg, mctx, node_idx, cur_str_idx);
612546c6 2383 if (BE (err != REG_NOERROR, 0))
15a7d175 2384 goto free_return;
3b0bdc72
UD
2385
2386 /* And add the epsilon closures (which is `new_dest_nodes') of
15a7d175 2387 the backreference to appropriate state_log. */
3b0bdc72
UD
2388#ifdef DEBUG
2389 assert (dfa->nexts[node_idx] != -1);
2390#endif
6291ee3c 2391 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
15a7d175
UD
2392 {
2393 int subexp_len;
2394 re_dfastate_t *dest_state;
2395 struct re_backref_cache_entry *bkref_ent;
2396 bkref_ent = mctx->bkref_ents + bkc_idx;
2397 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2398 continue;
2399 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2400 new_dest_nodes = (subexp_len == 0
2401 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2402 : dfa->eclosures + dfa->nexts[node_idx]);
2403 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2404 - bkref_ent->subexp_from);
6291ee3c
UD
2405 context = re_string_context_at (mctx->input, dest_str_idx - 1,
2406 mctx->eflags, preg->newline_anchor);
15a7d175
UD
2407 dest_state = mctx->state_log[dest_str_idx];
2408 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2409 : mctx->state_log[cur_str_idx]->nodes.nelem);
2410 /* Add `new_dest_node' to state_log. */
2411 if (dest_state == NULL)
2412 {
2413 mctx->state_log[dest_str_idx]
2414 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2415 context);
2416 if (BE (mctx->state_log[dest_str_idx] == NULL
2417 && err != REG_NOERROR, 0))
2418 goto free_return;
2419 }
2420 else
2421 {
2422 re_node_set dest_nodes;
2423 err = re_node_set_init_union (&dest_nodes,
2424 dest_state->entrance_nodes,
2425 new_dest_nodes);
2426 if (BE (err != REG_NOERROR, 0))
2427 {
2428 re_node_set_free (&dest_nodes);
2429 goto free_return;
2430 }
2431 mctx->state_log[dest_str_idx]
2432 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2433 re_node_set_free (&dest_nodes);
2434 if (BE (mctx->state_log[dest_str_idx] == NULL
2435 && err != REG_NOERROR, 0))
2436 goto free_return;
2437 }
2438 /* We need to check recursively if the backreference can epsilon
2439 transit. */
2440 if (subexp_len == 0
2441 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2442 {
6291ee3c
UD
2443 err = check_subexp_matching_top (dfa, mctx, new_dest_nodes,
2444 cur_str_idx);
2445 if (BE (err != REG_NOERROR, 0))
2446 goto free_return;
2447 err = transit_state_bkref (preg, new_dest_nodes, mctx);
15a7d175
UD
2448 if (BE (err != REG_NOERROR, 0))
2449 goto free_return;
2450 }
2451 }
3b0bdc72 2452 }
1b2c2628
UD
2453 err = REG_NOERROR;
2454 free_return:
1b2c2628 2455 return err;
3b0bdc72
UD
2456}
2457
6291ee3c
UD
2458/* Enumerate all the candidates which the backreference BKREF_NODE can match
2459 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2460 Note that we might collect inappropriate candidates here.
2461 However, the cost of checking them strictly here is too high, then we
2462 delay these checking for prune_impossible_nodes(). */
3b0bdc72 2463
6291ee3c
UD
2464static reg_errcode_t
2465get_subexp (preg, mctx, bkref_node, bkref_str_idx)
2466 const regex_t *preg;
2467 re_match_context_t *mctx;
2468 int bkref_node, bkref_str_idx;
3b0bdc72 2469{
6291ee3c 2470 int subexp_num, sub_top_idx;
3b0bdc72 2471 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
85c54a32 2472 char *buf = (char *) re_string_get_buffer (mctx->input);
6291ee3c
UD
2473 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2474 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2475 for (; cache_idx < mctx->nbkref_ents; ++cache_idx)
2476 {
2477 struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2478 if (entry->str_idx > bkref_str_idx)
2479 break;
2480 if (entry->node == bkref_node)
2481 return REG_NOERROR; /* We already checked it. */
2482 }
2483 subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2484
2485 /* For each sub expression */
2486 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2487 {
2488 reg_errcode_t err;
2489 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2490 re_sub_match_last_t *sub_last;
2491 int sub_last_idx, sl_str;
2492 char *bkref_str;
2493
2494 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2495 continue; /* It isn't related. */
2496
2497 sl_str = sub_top->str_idx;
2498 bkref_str = buf + bkref_str_idx;
2499 /* At first, check the last node of sub expressions we already
2500 evaluated. */
2501 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2502 {
2503 int sl_str_diff;
2504 sub_last = sub_top->lasts[sub_last_idx];
2505 sl_str_diff = sub_last->str_idx - sl_str;
2506 /* The matched string by the sub expression match with the substring
2507 at the back reference? */
2508 if (sl_str_diff > 0
2509 && memcmp (bkref_str, buf + sl_str, sl_str_diff) != 0)
2510 break; /* We don't need to search this sub expression any more. */
2511 bkref_str += sl_str_diff;
2512 sl_str += sl_str_diff;
2513 err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node,
2514 bkref_str_idx);
2515 if (err == REG_NOMATCH)
2516 continue;
2517 if (BE (err != REG_NOERROR, 0))
2518 return err;
2519 }
2520 if (sub_last_idx < sub_top->nlasts)
2521 continue;
2522 if (sub_last_idx > 0)
2523 ++sl_str;
2524 /* Then, search for the other last nodes of the sub expression. */
2525 for (; sl_str <= bkref_str_idx; ++sl_str)
2526 {
2527 int cls_node, sl_str_off;
2528 re_node_set *nodes;
2529 sl_str_off = sl_str - sub_top->str_idx;
2530 /* The matched string by the sub expression match with the substring
2531 at the back reference? */
2532 if (sl_str_off > 0
2533 && memcmp (bkref_str++, buf + sl_str - 1, 1) != 0)
2534 break; /* We don't need to search this sub expression any more. */
2535 if (mctx->state_log[sl_str] == NULL)
2536 continue;
2537 /* Does this state have a ')' of the sub expression? */
2538 nodes = &mctx->state_log[sl_str]->nodes;
2539 cls_node = find_subexp_node (dfa, nodes, subexp_num, 0);
2540 if (cls_node == -1)
2541 continue; /* No. */
2542 if (sub_top->path == NULL)
2543 {
2544 sub_top->path = calloc (sizeof (state_array_t),
2545 sl_str - sub_top->str_idx + 1);
2546 if (sub_top->path == NULL)
2547 return REG_ESPACE;
2548 }
2549 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2550 in the current context? */
2551 err = check_arrival (preg, mctx, sub_top->path, sub_top->node,
2552 sub_top->str_idx, cls_node, sl_str, 0);
2553 if (err == REG_NOMATCH)
2554 continue;
2555 if (BE (err != REG_NOERROR, 0))
2556 return err;
2557 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2558 if (BE (sub_last == NULL, 0))
2559 return REG_ESPACE;
2560 err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node,
2561 bkref_str_idx);
2562 if (err == REG_NOMATCH)
2563 continue;
2564 }
2565 }
2566 return REG_NOERROR;
2567}
2568
2569/* Helper functions for get_subexp(). */
2570
2571/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2572 If it can arrive, register the sub expression expressed with SUB_TOP
2573 and SUB_LAST. */
2574
2575static reg_errcode_t
2576get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, bkref_str)
2577 const regex_t *preg;
2578 re_match_context_t *mctx;
2579 re_sub_match_top_t *sub_top;
2580 re_sub_match_last_t *sub_last;
2581 int bkref_node, bkref_str;
2582{
2583 reg_errcode_t err;
2584 int to_idx;
2585 /* Can the subexpression arrive the back reference? */
2586 err = check_arrival (preg, mctx, &sub_last->path, sub_last->node,
2587 sub_last->str_idx, bkref_node, bkref_str, 1);
2588 if (err != REG_NOERROR)
2589 return err;
2590 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2591 sub_last->str_idx);
2592 if (BE (err != REG_NOERROR, 0))
2593 return err;
2594 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2595 clean_state_log_if_need (mctx, to_idx);
2596 return REG_NOERROR;
2597}
2598
2599/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2600 Search '(' if FL_OPEN, or search ')' otherwise.
2601 TODO: This function isn't efficient...
2602 Because there might be more than one nodes whose types are
2603 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2604 nodes.
2605 E.g. RE: (a){2} */
2606
2607static int
2608find_subexp_node (dfa, nodes, subexp_idx, fl_open)
2609 re_dfa_t *dfa;
2610 re_node_set *nodes;
2611 int subexp_idx, fl_open;
2612{
2613 int cls_idx;
2614 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2615 {
2616 int cls_node = nodes->elems[cls_idx];
2617 re_token_t *node = dfa->nodes + cls_node;
2618 if (((fl_open && node->type == OP_OPEN_SUBEXP)
2619 || (!fl_open && node->type == OP_CLOSE_SUBEXP))
2620 && node->opr.idx == subexp_idx)
2621 return cls_node;
2622 }
2623 return -1;
2624}
2625
2626/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2627 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2628 heavily reused.
2629 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2630
2631static reg_errcode_t
2632check_arrival (preg, mctx, path, top_node, top_str, last_node, last_str,
2633 fl_open)
2634 const regex_t *preg;
2635 re_match_context_t *mctx;
2636 state_array_t *path;
2637 int top_node, top_str, last_node, last_str, fl_open;
2638{
2639 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2640 reg_errcode_t err;
2641 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2642 re_dfastate_t *cur_state = NULL;
2643 re_node_set *cur_nodes, next_nodes;
2644 re_dfastate_t **backup_state_log;
2645 unsigned int context;
2646
2647 subexp_num = dfa->nodes[top_node].opr.idx;
2648 /* Extend the buffer if we need. */
2649 if (path->alloc < last_str + mctx->max_mb_elem_len + 1)
2650 {
2651 re_dfastate_t **new_array;
2652 int old_alloc = path->alloc;
2653 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2654 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2655 if (new_array == NULL)
2656 return REG_ESPACE;
2657 path->array = new_array;
2658 memset (new_array + old_alloc, '\0',
2659 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2660 }
2661
2662 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2663
2664 /* Temporary modify MCTX. */
2665 backup_state_log = mctx->state_log;
2666 backup_cur_idx = mctx->input->cur_idx;
2667 mctx->state_log = path->array;
2668 mctx->input->cur_idx = str_idx;
2669
2670 /* Setup initial node set. */
2671 context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
2672 preg->newline_anchor);
2673 if (str_idx == top_str)
2674 {
2675 err = re_node_set_init_1 (&next_nodes, top_node);
2676 if (BE (err != REG_NOERROR, 0))
2677 return err;
2678 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, fl_open);
2679 if (BE (err != REG_NOERROR, 0))
2680 {
2681 re_node_set_free (&next_nodes);
2682 return err;
2683 }
2684 }
2685 else
2686 {
2687 cur_state = mctx->state_log[str_idx];
2688 if (cur_state && cur_state->has_backref)
2689 {
2690 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2691 if (BE ( err != REG_NOERROR, 0))
2692 return err;
2693 }
2694 else
2695 re_node_set_init_empty (&next_nodes);
2696 }
2697 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2698 {
2699 if (next_nodes.nelem)
2700 {
2701 err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str,
2702 subexp_num, fl_open);
2703 if (BE ( err != REG_NOERROR, 0))
2704 {
2705 re_node_set_free (&next_nodes);
2706 return err;
2707 }
2708 }
2709 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2710 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2711 {
2712 re_node_set_free (&next_nodes);
2713 return err;
2714 }
2715 mctx->state_log[str_idx] = cur_state;
2716 }
2717
2718 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2719 {
2720 re_node_set_empty (&next_nodes);
2721 if (mctx->state_log[str_idx + 1])
2722 {
2723 err = re_node_set_merge (&next_nodes,
2724 &mctx->state_log[str_idx + 1]->nodes);
2725 if (BE (err != REG_NOERROR, 0))
2726 {
2727 re_node_set_free (&next_nodes);
2728 return err;
2729 }
2730 }
2731 if (cur_state)
2732 {
2733 err = check_arrival_add_next_nodes(preg, dfa, mctx, str_idx,
2734 &cur_state->nodes, &next_nodes);
2735 if (BE (err != REG_NOERROR, 0))
2736 {
2737 re_node_set_free (&next_nodes);
2738 return err;
2739 }
2740 }
2741 ++str_idx;
2742 if (next_nodes.nelem)
2743 {
2744 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num,
2745 fl_open);
2746 if (BE (err != REG_NOERROR, 0))
2747 {
2748 re_node_set_free (&next_nodes);
2749 return err;
2750 }
2751 err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str,
2752 subexp_num, fl_open);
2753 if (BE ( err != REG_NOERROR, 0))
2754 {
2755 re_node_set_free (&next_nodes);
2756 return err;
2757 }
2758 }
2759 context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
2760 preg->newline_anchor);
2761 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2762 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2763 {
2764 re_node_set_free (&next_nodes);
2765 return err;
2766 }
2767 mctx->state_log[str_idx] = cur_state;
2768 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2769 }
2770 re_node_set_free (&next_nodes);
2771 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2772 : &mctx->state_log[last_str]->nodes);
2773 path->next_idx = str_idx;
2774
2775 /* Fix MCTX. */
2776 mctx->state_log = backup_state_log;
2777 mctx->input->cur_idx = backup_cur_idx;
2778
2779 if (cur_nodes == NULL)
2780 return REG_NOMATCH;
2781 /* Then check the current node set has the node LAST_NODE. */
2782 return (re_node_set_contains (cur_nodes, last_node)
2783 || re_node_set_contains (cur_nodes, last_node) ? REG_NOERROR
2784 : REG_NOMATCH);
2785}
2786
2787/* Helper functions for check_arrival. */
2788
2789/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2790 to NEXT_NODES.
2791 TODO: This function is similar to the functions transit_state*(),
2792 however this function has many additional works.
2793 Can't we unify them? */
2794
2795static reg_errcode_t
2796check_arrival_add_next_nodes (preg, dfa, mctx, str_idx, cur_nodes, next_nodes)
2797 const regex_t *preg;
2798 re_dfa_t *dfa;
2799 re_match_context_t *mctx;
2800 int str_idx;
2801 re_node_set *cur_nodes, *next_nodes;
2802{
2803 int cur_idx;
2804 reg_errcode_t err;
2805 re_node_set union_set;
2806 re_node_set_init_empty (&union_set);
2807 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2808 {
2809 int naccepted = 0;
2810 int cur_node = cur_nodes->elems[cur_idx];
2811 re_token_type_t type = dfa->nodes[cur_node].type;
2812 if (IS_EPSILON_NODE(type))
2813 continue;
2814#ifdef RE_ENABLE_I18N
2815 /* If the node may accept `multi byte'. */
2816 if (ACCEPT_MB_NODE (type))
2817 {
2818 naccepted = check_node_accept_bytes (preg, cur_node, mctx->input,
2819 str_idx);
2820 if (naccepted > 1)
2821 {
2822 re_dfastate_t *dest_state;
2823 int next_node = dfa->nexts[cur_node];
2824 int next_idx = str_idx + naccepted;
2825 dest_state = mctx->state_log[next_idx];
2826 re_node_set_empty (&union_set);
2827 if (dest_state)
2828 {
2829 err = re_node_set_merge (&union_set, &dest_state->nodes);
2830 if (BE (err != REG_NOERROR, 0))
2831 {
2832 re_node_set_free (&union_set);
2833 return err;
2834 }
2835 err = re_node_set_insert (&union_set, next_node);
2836 if (BE (err < 0, 0))
2837 {
2838 re_node_set_free (&union_set);
2839 return REG_ESPACE;
2840 }
2841 }
2842 else
2843 {
2844 err = re_node_set_insert (&union_set, next_node);
2845 if (BE (err < 0, 0))
2846 {
2847 re_node_set_free (&union_set);
2848 return REG_ESPACE;
2849 }
2850 }
2851 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2852 &union_set);
2853 if (BE (mctx->state_log[next_idx] == NULL
2854 && err != REG_NOERROR, 0))
2855 {
2856 re_node_set_free (&union_set);
2857 return err;
2858 }
2859 }
2860 }
2861#endif /* RE_ENABLE_I18N */
2862 if (naccepted
2863 || check_node_accept (preg, dfa->nodes + cur_node, mctx,
2864 str_idx))
2865 {
2866 err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
2867 if (BE (err < 0, 0))
2868 {
2869 re_node_set_free (&union_set);
2870 return REG_ESPACE;
2871 }
2872 }
2873 }
2874 re_node_set_free (&union_set);
2875 return REG_NOERROR;
2876}
2877
2878/* For all the nodes in CUR_NODES, add the epsilon closures of them to
2879 CUR_NODES, however exclude the nodes which are:
2880 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
2881 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
2882*/
2883
2884static reg_errcode_t
2885check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, fl_open)
2886 re_dfa_t *dfa;
2887 re_node_set *cur_nodes;
2888 int ex_subexp, fl_open;
2889{
2890 reg_errcode_t err;
2891 int idx, outside_node;
2892 re_node_set new_nodes;
2893#ifdef DEBUG
2894 assert (cur_nodes->nelem);
2895#endif
2896 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
2897 if (BE (err != REG_NOERROR, 0))
2898 return err;
2899 /* Create a new node set NEW_NODES with the nodes which are epsilon
2900 closures of the node in CUR_NODES. */
2901
2902 for (idx = 0; idx < cur_nodes->nelem; ++idx)
2903 {
2904 int cur_node = cur_nodes->elems[idx];
2905 re_node_set *eclosure = dfa->eclosures + cur_node;
2906 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, fl_open);
2907 if (outside_node == -1)
2908 {
2909 /* There are no problematic nodes, just merge them. */
2910 err = re_node_set_merge (&new_nodes, eclosure);
2911 if (BE (err != REG_NOERROR, 0))
2912 {
2913 re_node_set_free (&new_nodes);
2914 return err;
2915 }
2916 }
2917 else
2918 {
2919 /* There are problematic nodes, re-calculate incrementally. */
2920 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
2921 ex_subexp, fl_open);
2922 if (BE (err != REG_NOERROR, 0))
2923 {
2924 re_node_set_free (&new_nodes);
2925 return err;
2926 }
2927 }
2928 }
2929 re_node_set_free (cur_nodes);
2930 *cur_nodes = new_nodes;
2931 return REG_NOERROR;
2932}
2933
2934/* Helper function for check_arrival_expand_ecl.
2935 Check incrementally the epsilon closure of TARGET, and if it isn't
2936 problematic append it to DST_NODES. */
2937
2938static reg_errcode_t
2939check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, fl_open)
2940 re_dfa_t *dfa;
2941 int target, ex_subexp, fl_open;
2942 re_node_set *dst_nodes;
2943{
2944 int cur_node, type;
2945 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
2946 {
2947 int err;
2948 type = dfa->nodes[cur_node].type;
2949
2950 if (((type == OP_OPEN_SUBEXP && fl_open)
2951 || (type == OP_CLOSE_SUBEXP && !fl_open))
2952 && dfa->nodes[cur_node].opr.idx == ex_subexp)
2953 {
2954 if (!fl_open)
2955 {
2956 err = re_node_set_insert (dst_nodes, cur_node);
2957 if (BE (err == -1, 0))
2958 return REG_ESPACE;
2959 }
2960 break;
2961 }
2962 err = re_node_set_insert (dst_nodes, cur_node);
2963 if (BE (err == -1, 0))
2964 return REG_ESPACE;
2965 if (dfa->edests[cur_node].nelem == 0)
2966 break;
2967 if (dfa->edests[cur_node].nelem == 2)
2968 {
2969 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
2970 dfa->edests[cur_node].elems[1],
2971 ex_subexp, fl_open);
2972 if (BE (err != REG_NOERROR, 0))
2973 return err;
2974 }
2975 cur_node = dfa->edests[cur_node].elems[0];
2976 }
2977 return REG_NOERROR;
2978}
2979
2980
2981/* For all the back references in the current state, calculate the
2982 destination of the back references by the appropriate entry
2983 in MCTX->BKREF_ENTS. */
2984
2985static reg_errcode_t
2986expand_bkref_cache (preg, mctx, cur_nodes, cur_str, last_str, subexp_num,
2987 fl_open)
2988 const regex_t *preg;
2989 re_match_context_t *mctx;
2990 int cur_str, last_str, subexp_num, fl_open;
2991 re_node_set *cur_nodes;
2992{
2993 reg_errcode_t err;
2994 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2995 int cache_idx, cache_idx_start;
2996 /* The current state. */
2997
2998 cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
2999 for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx)
3000 {
3001 int to_idx, next_node;
3002 struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
3003 if (ent->str_idx > cur_str)
3004 break;
3005 /* Is this entry ENT is appropriate? */
3006 if (!re_node_set_contains (cur_nodes, ent->node))
3007 continue; /* No. */
3008
3009 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3010 /* Calculate the destination of the back reference, and append it
3011 to MCTX->STATE_LOG. */
3012 if (to_idx == cur_str)
3013 {
3014 /* The backreference did epsilon transit, we must re-check all the
3015 node in the current state. */
3016 re_node_set new_dests;
3017 reg_errcode_t err2, err3;
3018 next_node = dfa->edests[ent->node].elems[0];
3019 if (re_node_set_contains (cur_nodes, next_node))
3020 continue;
3021 err = re_node_set_init_1 (&new_dests, next_node);
3022 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num,
3023 fl_open);
3024 err3 = re_node_set_merge (cur_nodes, &new_dests);
3025 re_node_set_free (&new_dests);
3026 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3027 || err3 != REG_NOERROR, 0))
3028 {
3029 err = (err != REG_NOERROR ? err
3030 : (err2 != REG_NOERROR ? err2 : err3));
3031 return err;
3032 }
3033 /* TODO: It is still inefficient... */
3034 cache_idx = cache_idx_start - 1;
3035 continue;
3036 }
3037 else
3038 {
3039 re_node_set union_set;
3040 next_node = dfa->nexts[ent->node];
3041 if (mctx->state_log[to_idx])
3042 {
3043 int ret;
3044 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3045 next_node))
3046 continue;
3047 err = re_node_set_init_copy (&union_set,
3048 &mctx->state_log[to_idx]->nodes);
3049 ret = re_node_set_insert (&union_set, next_node);
3050 if (BE (err != REG_NOERROR || ret < 0, 0))
3051 {
3052 re_node_set_free (&union_set);
3053 err = err != REG_NOERROR ? err : REG_ESPACE;
3054 return err;
3055 }
3056 }
3057 else
3058 {
3059 err = re_node_set_init_1 (&union_set, next_node);
3060 if (BE (err != REG_NOERROR, 0))
3061 return err;
3062 }
3063 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3064 re_node_set_free (&union_set);
3065 if (BE (mctx->state_log[to_idx] == NULL
3066 && err != REG_NOERROR, 0))
3067 return err;
3068 }
3069 }
3070 return REG_NOERROR;
3071}
3072
3073/* Build transition table for the state.
3074 Return the new table if succeeded, otherwise return NULL. */
3075
3076static re_dfastate_t **
3077build_trtable (preg, state, fl_search)
3078 const regex_t *preg;
3079 const re_dfastate_t *state;
3080 int fl_search;
3081{
3082 reg_errcode_t err;
3083 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3084 int i, j, k, ch;
3085 int dests_node_malloced = 0, dest_states_malloced = 0;
3086 int ndests; /* Number of the destination states from `state'. */
3087 re_dfastate_t **trtable;
3088 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3089 re_node_set follows, *dests_node;
3090 bitset *dests_ch;
3091 bitset acceptable;
3b0bdc72
UD
3092
3093 /* We build DFA states which corresponds to the destination nodes
3094 from `state'. `dests_node[i]' represents the nodes which i-th
3095 destination state contains, and `dests_ch[i]' represents the
3096 characters which i-th destination state accepts. */
05dab910
RM
3097#ifdef _LIBC
3098 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3099 dests_node = (re_node_set *)
3100 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3101 else
3102#endif
3103 {
3104 dests_node = (re_node_set *)
3105 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3106 if (BE (dests_node == NULL, 0))
3107 return NULL;
3108 dests_node_malloced = 1;
3109 }
3110 dests_ch = (bitset *) (dests_node + SBC_MAX);
3b0bdc72
UD
3111
3112 /* Initialize transiton table. */
3113 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
05dab910
RM
3114 if (BE (trtable == NULL, 0))
3115 {
3116 if (dests_node_malloced)
3117 free (dests_node);
3118 return NULL;
3119 }
3b0bdc72
UD
3120
3121 /* At first, group all nodes belonging to `state' into several
3122 destinations. */
3123 ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch);
bc15410e 3124 if (BE (ndests <= 0, 0))
3b0bdc72 3125 {
05dab910
RM
3126 if (dests_node_malloced)
3127 free (dests_node);
a9388965 3128 /* Return NULL in case of an error, trtable otherwise. */
05dab910
RM
3129 if (ndests == 0)
3130 return trtable;
3131 free (trtable);
3132 return NULL;
3b0bdc72
UD
3133 }
3134
a9388965 3135 err = re_node_set_alloc (&follows, ndests + 1);
05dab910
RM
3136 if (BE (err != REG_NOERROR, 0))
3137 goto out_free;
3138
3139#ifdef _LIBC
3140 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3141 + ndests * 3 * sizeof (re_dfastate_t *)))
3142 dest_states = (re_dfastate_t **)
3143 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3144 else
3145#endif
3146 {
3147 dest_states = (re_dfastate_t **)
3148 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3149 if (BE (dest_states == NULL, 0))
3150 {
3151out_free:
3152 if (dest_states_malloced)
3153 free (dest_states);
3154 re_node_set_free (&follows);
3155 for (i = 0; i < ndests; ++i)
3156 re_node_set_free (dests_node + i);
3157 free (trtable);
3158 if (dests_node_malloced)
3159 free (dests_node);
3160 return NULL;
3161 }
3162 dest_states_malloced = 1;
3163 }
3164 dest_states_word = dest_states + ndests;
3165 dest_states_nl = dest_states_word + ndests;
3166 bitset_empty (acceptable);
a9388965 3167
3b0bdc72
UD
3168 /* Then build the states for all destinations. */
3169 for (i = 0; i < ndests; ++i)
3170 {
3171 int next_node;
3172 re_node_set_empty (&follows);
3173 /* Merge the follows of this destination states. */
3174 for (j = 0; j < dests_node[i].nelem; ++j)
15a7d175
UD
3175 {
3176 next_node = dfa->nexts[dests_node[i].elems[j]];
3177 if (next_node != -1)
3178 {
3179 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3180 if (BE (err != REG_NOERROR, 0))
05dab910 3181 goto out_free;
15a7d175
UD
3182 }
3183 }
3b0bdc72
UD
3184 /* If search flag is set, merge the initial state. */
3185 if (fl_search)
15a7d175 3186 {
3b0bdc72 3187#ifdef RE_ENABLE_I18N
15a7d175
UD
3188 int not_initial = 0;
3189 for (j = 0; j < follows.nelem; ++j)
3190 if (dfa->nodes[follows.elems[j]].type == CHARACTER)
3191 {
3192 not_initial = dfa->nodes[follows.elems[j]].mb_partial;
3193 break;
3194 }
3195 if (!not_initial)
3b0bdc72 3196#endif
15a7d175
UD
3197 {
3198 err = re_node_set_merge (&follows,
3199 dfa->init_state->entrance_nodes);
3200 if (BE (err != REG_NOERROR, 0))
3201 goto out_free;
3202 }
3203 }
a9388965 3204 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
bc15410e 3205 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
15a7d175 3206 goto out_free;
3b0bdc72 3207 /* If the new state has context constraint,
15a7d175 3208 build appropriate states for these contexts. */
3b0bdc72 3209 if (dest_states[i]->has_constraint)
15a7d175
UD
3210 {
3211 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3212 CONTEXT_WORD);
3213 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3214 goto out_free;
3215 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3216 CONTEXT_NEWLINE);
3217 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3218 goto out_free;
3219 }
3b0bdc72 3220 else
15a7d175
UD
3221 {
3222 dest_states_word[i] = dest_states[i];
3223 dest_states_nl[i] = dest_states[i];
3224 }
3b0bdc72
UD
3225 bitset_merge (acceptable, dests_ch[i]);
3226 }
3227
3228 /* Update the transition table. */
c202c2c5 3229 /* For all characters ch...: */
3b0bdc72
UD
3230 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
3231 for (j = 0; j < UINT_BITS; ++j, ++ch)
3232 if ((acceptable[i] >> j) & 1)
15a7d175
UD
3233 {
3234 /* The current state accepts the character ch. */
3235 if (IS_WORD_CHAR (ch))
3236 {
3237 for (k = 0; k < ndests; ++k)
3238 if ((dests_ch[k][i] >> j) & 1)
3239 {
3240 /* k-th destination accepts the word character ch. */
3241 trtable[ch] = dest_states_word[k];
3242 /* There must be only one destination which accepts
3243 character ch. See group_nodes_into_DFAstates. */
3244 break;
3245 }
3246 }
3247 else /* not WORD_CHAR */
3248 {
3249 for (k = 0; k < ndests; ++k)
3250 if ((dests_ch[k][i] >> j) & 1)
3251 {
3252 /* k-th destination accepts the non-word character ch. */
3253 trtable[ch] = dest_states[k];
3254 /* There must be only one destination which accepts
3255 character ch. See group_nodes_into_DFAstates. */
3256 break;
3257 }
3258 }
3259 }
3b0bdc72 3260 /* new line */
c202c2c5
UD
3261 if (bitset_contain (acceptable, NEWLINE_CHAR))
3262 {
3263 /* The current state accepts newline character. */
3264 for (k = 0; k < ndests; ++k)
15a7d175
UD
3265 if (bitset_contain (dests_ch[k], NEWLINE_CHAR))
3266 {
3267 /* k-th destination accepts newline character. */
3268 trtable[NEWLINE_CHAR] = dest_states_nl[k];
3269 /* There must be only one destination which accepts
3270 newline. See group_nodes_into_DFAstates. */
3271 break;
3272 }
c202c2c5 3273 }
3b0bdc72 3274
05dab910
RM
3275 if (dest_states_malloced)
3276 free (dest_states);
3b0bdc72
UD
3277
3278 re_node_set_free (&follows);
3279 for (i = 0; i < ndests; ++i)
3280 re_node_set_free (dests_node + i);
3281
05dab910
RM
3282 if (dests_node_malloced)
3283 free (dests_node);
3b0bdc72
UD
3284
3285 return trtable;
3286}
3287
3288/* Group all nodes belonging to STATE into several destinations.
3289 Then for all destinations, set the nodes belonging to the destination
3290 to DESTS_NODE[i] and set the characters accepted by the destination
3291 to DEST_CH[i]. This function return the number of destinations. */
3292
3293static int
3294group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
3295 const regex_t *preg;
3296 const re_dfastate_t *state;
3297 re_node_set *dests_node;
3298 bitset *dests_ch;
3299{
a9388965 3300 reg_errcode_t err;
3b0bdc72
UD
3301 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3302 int i, j, k;
3303 int ndests; /* Number of the destinations from `state'. */
3304 bitset accepts; /* Characters a node can accept. */
3305 const re_node_set *cur_nodes = &state->nodes;
3306 bitset_empty (accepts);
3307 ndests = 0;
3308
3309 /* For all the nodes belonging to `state', */
3310 for (i = 0; i < cur_nodes->nelem; ++i)
3311 {
3b0bdc72
UD
3312 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3313 re_token_type_t type = node->type;
485d775d 3314 unsigned int constraint = node->constraint;
3b0bdc72
UD
3315
3316 /* Enumerate all single byte character this node can accept. */
3317 if (type == CHARACTER)
15a7d175 3318 bitset_set (accepts, node->opr.c);
3b0bdc72 3319 else if (type == SIMPLE_BRACKET)
15a7d175
UD
3320 {
3321 bitset_merge (accepts, node->opr.sbcset);
3322 }
3b0bdc72 3323 else if (type == OP_PERIOD)
15a7d175
UD
3324 {
3325 bitset_set_all (accepts);
3326 if (!(preg->syntax & RE_DOT_NEWLINE))
3327 bitset_clear (accepts, '\n');
3328 if (preg->syntax & RE_DOT_NOT_NULL)
3329 bitset_clear (accepts, '\0');
3330 }
3b0bdc72 3331 else
15a7d175 3332 continue;
3b0bdc72
UD
3333
3334 /* Check the `accepts' and sift the characters which are not
15a7d175 3335 match it the context. */
3b0bdc72 3336 if (constraint)
15a7d175 3337 {
15a7d175
UD
3338 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3339 {
3340 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3341 bitset_empty (accepts);
3342 if (accepts_newline)
3343 bitset_set (accepts, NEWLINE_CHAR);
3344 else
3345 continue;
3346 }
66b110e8
UD
3347 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3348 {
3349 bitset_empty (accepts);
3350 continue;
3351 }
3352 if (constraint & NEXT_WORD_CONSTRAINT)
3353 for (j = 0; j < BITSET_UINTS; ++j)
3354 accepts[j] &= dfa->word_char[j];
3355 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3356 for (j = 0; j < BITSET_UINTS; ++j)
3357 accepts[j] &= ~dfa->word_char[j];
15a7d175 3358 }
3b0bdc72
UD
3359
3360 /* Then divide `accepts' into DFA states, or create a new
15a7d175 3361 state. */
3b0bdc72 3362 for (j = 0; j < ndests; ++j)
15a7d175
UD
3363 {
3364 bitset intersec; /* Intersection sets, see below. */
3365 bitset remains;
3366 /* Flags, see below. */
3367 int has_intersec, not_subset, not_consumed;
3368
3369 /* Optimization, skip if this state doesn't accept the character. */
3370 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3371 continue;
3372
3373 /* Enumerate the intersection set of this state and `accepts'. */
3374 has_intersec = 0;
3375 for (k = 0; k < BITSET_UINTS; ++k)
3376 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3377 /* And skip if the intersection set is empty. */
3378 if (!has_intersec)
3379 continue;
3380
3381 /* Then check if this state is a subset of `accepts'. */
3382 not_subset = not_consumed = 0;
3383 for (k = 0; k < BITSET_UINTS; ++k)
3384 {
3385 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3386 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3387 }
3388
3389 /* If this state isn't a subset of `accepts', create a
3390 new group state, which has the `remains'. */
3391 if (not_subset)
3392 {
3393 bitset_copy (dests_ch[ndests], remains);
3394 bitset_copy (dests_ch[j], intersec);
3395 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3396 if (BE (err != REG_NOERROR, 0))
3397 goto error_return;
3398 ++ndests;
3399 }
3400
3401 /* Put the position in the current group. */
3402 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3403 if (BE (err < 0, 0))
3404 goto error_return;
3405
3406 /* If all characters are consumed, go to next node. */
3407 if (!not_consumed)
3408 break;
3409 }
3b0bdc72
UD
3410 /* Some characters remain, create a new group. */
3411 if (j == ndests)
15a7d175
UD
3412 {
3413 bitset_copy (dests_ch[ndests], accepts);
3414 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3415 if (BE (err != REG_NOERROR, 0))
3416 goto error_return;
3417 ++ndests;
3418 bitset_empty (accepts);
3419 }
3b0bdc72
UD
3420 }
3421 return ndests;
1b2c2628
UD
3422 error_return:
3423 for (j = 0; j < ndests; ++j)
3424 re_node_set_free (dests_node + j);
3425 return -1;
3b0bdc72
UD
3426}
3427
434d3784
UD
3428#ifdef RE_ENABLE_I18N
3429/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3430 Return the number of the bytes the node accepts.
3431 STR_IDX is the current index of the input string.
3432
3433 This function handles the nodes which can accept one character, or
3434 one collating element like '.', '[a-z]', opposite to the other nodes
3435 can only accept one byte. */
3b0bdc72
UD
3436
3437static int
3438check_node_accept_bytes (preg, node_idx, input, str_idx)
3439 const regex_t *preg;
3440 int node_idx, str_idx;
3441 const re_string_t *input;
3442{
3443 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3444 const re_token_t *node = dfa->nodes + node_idx;
3445 int elem_len = re_string_elem_size_at (input, str_idx);
3446 int char_len = re_string_char_size_at (input, str_idx);
434d3784
UD
3447 int i;
3448# ifdef _LIBC
3449 int j;
3b0bdc72 3450 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
434d3784 3451# endif /* _LIBC */
3b0bdc72
UD
3452 if (elem_len <= 1 && char_len <= 1)
3453 return 0;
3454 if (node->type == OP_PERIOD)
3455 {
434d3784 3456 /* '.' accepts any one character except the following two cases. */
3b0bdc72 3457 if ((!(preg->syntax & RE_DOT_NEWLINE) &&
15a7d175
UD
3458 re_string_byte_at (input, str_idx) == '\n') ||
3459 ((preg->syntax & RE_DOT_NOT_NULL) &&
3460 re_string_byte_at (input, str_idx) == '\0'))
3461 return 0;
3b0bdc72
UD
3462 return char_len;
3463 }
3464 else if (node->type == COMPLEX_BRACKET)
3465 {
3466 const re_charset_t *cset = node->opr.mbcset;
434d3784 3467# ifdef _LIBC
85c54a32
UD
3468 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3469 + str_idx);
434d3784
UD
3470# endif /* _LIBC */
3471 int match_len = 0;
3472 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
15a7d175 3473 ? re_string_wchar_at (input, str_idx) : 0);
434d3784
UD
3474
3475 /* match with multibyte character? */
3476 for (i = 0; i < cset->nmbchars; ++i)
15a7d175
UD
3477 if (wc == cset->mbchars[i])
3478 {
3479 match_len = char_len;
3480 goto check_node_accept_bytes_match;
3481 }
434d3784
UD
3482 /* match with character_class? */
3483 for (i = 0; i < cset->nchar_classes; ++i)
15a7d175
UD
3484 {
3485 wctype_t wt = cset->char_classes[i];
3486 if (__iswctype (wc, wt))
3487 {
3488 match_len = char_len;
3489 goto check_node_accept_bytes_match;
3490 }
3491 }
434d3784
UD
3492
3493# ifdef _LIBC
3b0bdc72 3494 if (nrules != 0)
15a7d175
UD
3495 {
3496 unsigned int in_collseq = 0;
3497 const int32_t *table, *indirect;
3498 const unsigned char *weights, *extra;
3499 const char *collseqwc;
3500 int32_t idx;
3501 /* This #include defines a local function! */
434d3784 3502# include <locale/weight.h>
3b0bdc72 3503
15a7d175
UD
3504 /* match with collating_symbol? */
3505 if (cset->ncoll_syms)
3506 extra = (const unsigned char *)
3507 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3508 for (i = 0; i < cset->ncoll_syms; ++i)
3509 {
3510 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3511 /* Compare the length of input collating element and
3512 the length of current collating element. */
3513 if (*coll_sym != elem_len)
3514 continue;
3515 /* Compare each bytes. */
3516 for (j = 0; j < *coll_sym; j++)
3517 if (pin[j] != coll_sym[1 + j])
3518 break;
3519 if (j == *coll_sym)
3520 {
3521 /* Match if every bytes is equal. */
3522 match_len = j;
3523 goto check_node_accept_bytes_match;
3524 }
3525 }
3526
3527 if (cset->nranges)
3528 {
3529 if (elem_len <= char_len)
3530 {
3531 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
25337753 3532 in_collseq = __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
3533 }
3534 else
3535 in_collseq = find_collation_sequence_value (pin, elem_len);
3536 }
3537 /* match with range expression? */
3538 for (i = 0; i < cset->nranges; ++i)
3539 if (cset->range_starts[i] <= in_collseq
3540 && in_collseq <= cset->range_ends[i])
3541 {
3542 match_len = elem_len;
3543 goto check_node_accept_bytes_match;
3544 }
3545
3546 /* match with equivalence_class? */
3547 if (cset->nequiv_classes)
3548 {
3549 const unsigned char *cp = pin;
3550 table = (const int32_t *)
3551 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3552 weights = (const unsigned char *)
3553 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3554 extra = (const unsigned char *)
3555 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3556 indirect = (const int32_t *)
3557 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3558 idx = findidx (&cp);
3559 if (idx > 0)
3560 for (i = 0; i < cset->nequiv_classes; ++i)
3561 {
3562 int32_t equiv_class_idx = cset->equiv_classes[i];
3563 size_t weight_len = weights[idx];
3564 if (weight_len == weights[equiv_class_idx])
3565 {
3566 int cnt = 0;
3567 while (cnt <= weight_len
3568 && (weights[equiv_class_idx + 1 + cnt]
3569 == weights[idx + 1 + cnt]))
3570 ++cnt;
3571 if (cnt > weight_len)
3572 {
3573 match_len = elem_len;
3574 goto check_node_accept_bytes_match;
3575 }
3576 }
3577 }
3578 }
3579 }
434d3784
UD
3580 else
3581# endif /* _LIBC */
15a7d175
UD
3582 {
3583 /* match with range expression? */
92b27c74 3584#if __GNUC__ >= 2
15a7d175 3585 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
92b27c74 3586#else
15a7d175
UD
3587 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3588 cmp_buf[2] = wc;
92b27c74 3589#endif
15a7d175
UD
3590 for (i = 0; i < cset->nranges; ++i)
3591 {
3592 cmp_buf[0] = cset->range_starts[i];
3593 cmp_buf[4] = cset->range_ends[i];
3594 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3595 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3596 {
3597 match_len = char_len;
3598 goto check_node_accept_bytes_match;
3599 }
3600 }
3601 }
434d3784
UD
3602 check_node_accept_bytes_match:
3603 if (!cset->non_match)
15a7d175 3604 return match_len;
434d3784 3605 else
15a7d175
UD
3606 {
3607 if (match_len > 0)
3608 return 0;
3609 else
3610 return (elem_len > char_len) ? elem_len : char_len;
3611 }
3b0bdc72
UD
3612 }
3613 return 0;
3614}
3615
434d3784 3616# ifdef _LIBC
3b0bdc72
UD
3617static unsigned int
3618find_collation_sequence_value (mbs, mbs_len)
c202c2c5 3619 const unsigned char *mbs;
3b0bdc72
UD
3620 size_t mbs_len;
3621{
3622 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3623 if (nrules == 0)
3624 {
3625 if (mbs_len == 1)
15a7d175
UD
3626 {
3627 /* No valid character. Match it as a single byte character. */
3628 const unsigned char *collseq = (const unsigned char *)
3629 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3630 return collseq[mbs[0]];
3631 }
3b0bdc72
UD
3632 return UINT_MAX;
3633 }
3634 else
3635 {
3636 int32_t idx;
3637 const unsigned char *extra = (const unsigned char *)
15a7d175 3638 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
3639
3640 for (idx = 0; ;)
15a7d175
UD
3641 {
3642 int mbs_cnt, found = 0;
3643 int32_t elem_mbs_len;
3644 /* Skip the name of collating element name. */
3645 idx = idx + extra[idx] + 1;
3646 elem_mbs_len = extra[idx++];
3647 if (mbs_len == elem_mbs_len)
3648 {
3649 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3650 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3651 break;
3652 if (mbs_cnt == elem_mbs_len)
3653 /* Found the entry. */
3654 found = 1;
3655 }
3656 /* Skip the byte sequence of the collating element. */
3657 idx += elem_mbs_len;
3658 /* Adjust for the alignment. */
3659 idx = (idx + 3) & ~3;
3660 /* Skip the collation sequence value. */
3661 idx += sizeof (uint32_t);
3662 /* Skip the wide char sequence of the collating element. */
3663 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3664 /* If we found the entry, return the sequence value. */
3665 if (found)
3666 return *(uint32_t *) (extra + idx);
3667 /* Skip the collation sequence value. */
3668 idx += sizeof (uint32_t);
3669 }
3b0bdc72
UD
3670 }
3671}
434d3784
UD
3672# endif /* _LIBC */
3673#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3674
3675/* Check whether the node accepts the byte which is IDX-th
3676 byte of the INPUT. */
3677
3678static int
612546c6 3679check_node_accept (preg, node, mctx, idx)
3b0bdc72
UD
3680 const regex_t *preg;
3681 const re_token_t *node;
612546c6
UD
3682 const re_match_context_t *mctx;
3683 int idx;
3b0bdc72 3684{
3b0bdc72 3685 unsigned char ch;
485d775d 3686 if (node->constraint)
3b0bdc72
UD
3687 {
3688 /* The node has constraints. Check whether the current context
15a7d175 3689 satisfies the constraints. */
612546c6 3690 unsigned int context = re_string_context_at (mctx->input, idx,
15a7d175
UD
3691 mctx->eflags,
3692 preg->newline_anchor);
3b0bdc72 3693 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
15a7d175 3694 return 0;
3b0bdc72 3695 }
612546c6 3696 ch = re_string_byte_at (mctx->input, idx);
485d775d
UD
3697 if (node->type == CHARACTER)
3698 return node->opr.c == ch;
3699 else if (node->type == SIMPLE_BRACKET)
3700 return bitset_contain (node->opr.sbcset, ch);
3701 else if (node->type == OP_PERIOD)
3b0bdc72 3702 return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
15a7d175 3703 || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
3b0bdc72
UD
3704 else
3705 return 0;
3706}
612546c6
UD
3707
3708/* Extend the buffers, if the buffers have run out. */
3709
3710static reg_errcode_t
3711extend_buffers (mctx)
3712 re_match_context_t *mctx;
3713{
3714 reg_errcode_t ret;
3715 re_string_t *pstr = mctx->input;
3716
3717 /* Double the lengthes of the buffers. */
3718 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3719 if (BE (ret != REG_NOERROR, 0))
3720 return ret;
3721
3722 if (mctx->state_log != NULL)
3723 {
3724 /* And double the length of state_log. */
1b2c2628
UD
3725 re_dfastate_t **new_array;
3726 new_array = re_realloc (mctx->state_log, re_dfastate_t *,
15a7d175 3727 pstr->bufs_len * 2);
1b2c2628 3728 if (BE (new_array == NULL, 0))
15a7d175 3729 return REG_ESPACE;
1b2c2628 3730 mctx->state_log = new_array;
612546c6
UD
3731 }
3732
3733 /* Then reconstruct the buffers. */
3734 if (pstr->icase)
3735 {
3736#ifdef RE_ENABLE_I18N
3737 if (MB_CUR_MAX > 1)
15a7d175 3738 build_wcs_upper_buffer (pstr);
612546c6
UD
3739 else
3740#endif /* RE_ENABLE_I18N */
15a7d175 3741 build_upper_buffer (pstr);
612546c6
UD
3742 }
3743 else
3744 {
3745#ifdef RE_ENABLE_I18N
3746 if (MB_CUR_MAX > 1)
15a7d175 3747 build_wcs_buffer (pstr);
612546c6
UD
3748 else
3749#endif /* RE_ENABLE_I18N */
15a7d175
UD
3750 {
3751 if (pstr->trans != NULL)
3752 re_string_translate_buffer (pstr);
3753 else
3754 pstr->valid_len = pstr->bufs_len;
3755 }
612546c6
UD
3756 }
3757 return REG_NOERROR;
3758}
3759
3b0bdc72
UD
3760\f
3761/* Functions for matching context. */
3762
6291ee3c
UD
3763/* Initialize MCTX. */
3764
a9388965 3765static reg_errcode_t
612546c6 3766match_ctx_init (mctx, eflags, input, n)
3b0bdc72 3767 re_match_context_t *mctx;
612546c6
UD
3768 int eflags, n;
3769 re_string_t *input;
3b0bdc72
UD
3770{
3771 mctx->eflags = eflags;
612546c6
UD
3772 mctx->input = input;
3773 mctx->match_last = -1;
3b0bdc72 3774 if (n > 0)
a9388965
UD
3775 {
3776 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
6291ee3c
UD
3777 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
3778 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
15a7d175 3779 return REG_ESPACE;
a9388965 3780 }
3b0bdc72
UD
3781 else
3782 mctx->bkref_ents = NULL;
3783 mctx->nbkref_ents = 0;
3784 mctx->abkref_ents = n;
6291ee3c
UD
3785 mctx->max_mb_elem_len = 1;
3786 mctx->nsub_tops = 0;
3787 mctx->asub_tops = n;
a9388965 3788 return REG_NOERROR;
3b0bdc72
UD
3789}
3790
6291ee3c
UD
3791/* Clean the entries which depend on the current input in MCTX.
3792 This function must be invoked when the matcher changes the start index
3793 of the input, or changes the input string. */
3794
3795static void
3796match_ctx_clean (mctx)
3797 re_match_context_t *mctx;
3798{
3799 match_ctx_free_subtops (mctx);
3800 mctx->nsub_tops = 0;
3801 mctx->nbkref_ents = 0;
3802}
3803
3804/* Free all the memory associated with MCTX. */
3805
3b0bdc72
UD
3806static void
3807match_ctx_free (mctx)
3808 re_match_context_t *mctx;
3809{
6291ee3c
UD
3810 match_ctx_free_subtops (mctx);
3811 re_free (mctx->sub_tops);
3b0bdc72
UD
3812 re_free (mctx->bkref_ents);
3813}
3814
6291ee3c
UD
3815/* Free all the memory associated with MCTX->SUB_TOPS. */
3816
3817static void
3818match_ctx_free_subtops (mctx)
3819 re_match_context_t *mctx;
3820{
3821 int st_idx;
3822 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
3823 {
3824 int sl_idx;
3825 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
3826 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
3827 {
3828 re_sub_match_last_t *last = top->lasts[sl_idx];
3829 re_free (last->path.array);
3830 re_free (last);
3831 }
3832 re_free (top->lasts);
3833 if (top->path)
3834 {
3835 re_free (top->path->array);
3836 re_free (top->path);
3837 }
3838 free (top);
3839 }
3840}
3841
3842/* Add a new backreference entry to MCTX.
3843 Note that we assume that caller never call this function with duplicate
3844 entry, and call with STR_IDX which isn't smaller than any existing entry.
3845*/
3b0bdc72 3846
a9388965 3847static reg_errcode_t
0742e48e 3848match_ctx_add_entry (mctx, node, str_idx, from, to)
6291ee3c
UD
3849 re_match_context_t *mctx;
3850 int node, str_idx, from, to;
3b0bdc72
UD
3851{
3852 if (mctx->nbkref_ents >= mctx->abkref_ents)
3853 {
1b2c2628
UD
3854 struct re_backref_cache_entry* new_entry;
3855 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
15a7d175 3856 mctx->abkref_ents * 2);
1b2c2628 3857 if (BE (new_entry == NULL, 0))
15a7d175
UD
3858 {
3859 re_free (mctx->bkref_ents);
3860 return REG_ESPACE;
3861 }
1b2c2628 3862 mctx->bkref_ents = new_entry;
3b0bdc72 3863 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
15a7d175 3864 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
3b0bdc72
UD
3865 mctx->abkref_ents *= 2;
3866 }
3867 mctx->bkref_ents[mctx->nbkref_ents].node = node;
0742e48e
UD
3868 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
3869 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
3870 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
3871 mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
3872 if (mctx->max_mb_elem_len < to - from)
3873 mctx->max_mb_elem_len = to - from;
a9388965 3874 return REG_NOERROR;
3b0bdc72 3875}
0742e48e 3876
6291ee3c
UD
3877/* Search for the first entry which has the same str_idx.
3878 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
3879
3880static int
3881search_cur_bkref_entry (mctx, str_idx)
3882 re_match_context_t *mctx;
3883 int str_idx;
3884{
3885 int left, right, mid;
3886 right = mctx->nbkref_ents;
3887 for (left = 0; left < right;)
3888 {
3889 mid = (left + right) / 2;
3890 if (mctx->bkref_ents[mid].str_idx < str_idx)
3891 left = mid + 1;
3892 else
3893 right = mid;
3894 }
3895 return left;
3896}
3897
0742e48e
UD
3898static void
3899match_ctx_clear_flag (mctx)
3900 re_match_context_t *mctx;
3901{
3902 int i;
3903 for (i = 0; i < mctx->nbkref_ents; ++i)
3904 {
3905 mctx->bkref_ents[i].flag = 0;
3906 }
3907}
3908
6291ee3c
UD
3909/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
3910 at STR_IDX. */
3911
3912static reg_errcode_t
3913match_ctx_add_subtop (mctx, node, str_idx)
3914 re_match_context_t *mctx;
3915 int node, str_idx;
3916{
3917#ifdef DEBUG
3918 assert (mctx->sub_tops != NULL);
3919 assert (mctx->asub_tops > 0);
3920#endif
3921 if (mctx->nsub_tops == mctx->asub_tops)
3922 {
3923 re_sub_match_top_t **new_array;
3924 mctx->asub_tops *= 2;
3925 new_array = re_realloc (mctx->sub_tops, re_sub_match_top_t *,
3926 mctx->asub_tops);
3927 if (BE (new_array == NULL, 0))
3928 return REG_ESPACE;
3929 mctx->sub_tops = new_array;
3930 }
3931 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
3932 if (mctx->sub_tops[mctx->nsub_tops] == NULL)
3933 return REG_ESPACE;
3934 mctx->sub_tops[mctx->nsub_tops]->node = node;
3935 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
3936 return REG_NOERROR;
3937}
3938
3939/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
3940 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
3941
3942static re_sub_match_last_t *
3943match_ctx_add_sublast (subtop, node, str_idx)
3944 re_sub_match_top_t *subtop;
3945 int node, str_idx;
3946{
3947 re_sub_match_last_t *new_entry;
3948 if (subtop->nlasts == subtop->alasts)
3949 {
3950 re_sub_match_last_t **new_array;
3951 subtop->alasts = 2 * subtop->alasts + 1;
3952 new_array = re_realloc (subtop->lasts, re_sub_match_last_t *,
3953 subtop->alasts);
3954 if (BE (new_array == NULL, 0))
3955 return NULL;
3956 subtop->lasts = new_array;
3957 }
3958 new_entry = calloc (1, sizeof (re_sub_match_last_t));
3959 if (BE (new_entry == NULL, 0))
3960 return NULL;
3961 subtop->lasts[subtop->nlasts] = new_entry;
3962 new_entry->node = node;
3963 new_entry->str_idx = str_idx;
3964 ++subtop->nlasts;
3965 return new_entry;
3966}
3967
0742e48e
UD
3968static void
3969sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
15a7d175 3970 check_subexp)
0742e48e
UD
3971 re_sift_context_t *sctx;
3972 re_dfastate_t **sifted_sts, **limited_sts;
3973 int last_node, last_str_idx, check_subexp;
3974{
3975 sctx->sifted_states = sifted_sts;
3976 sctx->limited_states = limited_sts;
3977 sctx->last_node = last_node;
3978 sctx->last_str_idx = last_str_idx;
3979 sctx->check_subexp = check_subexp;
485d775d
UD
3980 sctx->cur_bkref = -1;
3981 sctx->cls_subexp_idx = -1;
0742e48e
UD
3982 re_node_set_init_empty (&sctx->limits);
3983}
This page took 0.52322 seconds and 5 git commands to generate.