]> sourceware.org Git - glibc.git/blame - posix/regex_internal.c
* posix/regex_internal.h (re_sub_match_top_t): Remove unused member
[glibc.git] / posix / regex_internal.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
1c99f950 2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
92b27c74 21static void re_string_construct_common (const char *str, int len,
1b2c2628 22 re_string_t *pstr,
3c0fb574 23 RE_TRANSLATE_TYPE trans, int icase,
8cae99db 24 const re_dfa_t *dfa) internal_function;
a334319f
UD
25#ifdef RE_ENABLE_I18N
26static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27 wint_t *last_wc) internal_function;
28#endif /* RE_ENABLE_I18N */
29static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
30 unsigned int hash) internal_function;
31static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
1b2c2628 32 const re_node_set *nodes,
8cae99db 33 unsigned int hash) internal_function;
a334319f 34static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
1b2c2628
UD
35 const re_node_set *nodes,
36 unsigned int context,
8cae99db 37 unsigned int hash) internal_function;
2d87db5b 38static inline unsigned int calc_state_hash (const re_node_set *nodes,
a334319f 39 unsigned int context) internal_function;
3b0bdc72
UD
40\f
41/* Functions for string operation. */
42
612546c6
UD
43/* This function allocate the buffers. It is necessary to call
44 re_string_reconstruct before using the object. */
45
3b0bdc72 46static reg_errcode_t
a334319f
UD
47re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
48 re_string_t *pstr;
49 const char *str;
50 int len, init_len, icase;
51 RE_TRANSLATE_TYPE trans;
52 const re_dfa_t *dfa;
3b0bdc72
UD
53{
54 reg_errcode_t ret;
97fd3a30
UD
55 int init_buf_len;
56
57 /* Ensure at least one character fits into the buffers. */
58 if (init_len < dfa->mb_cur_max)
59 init_len = dfa->mb_cur_max;
60 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
f0c7c524 61 re_string_construct_common (str, len, pstr, trans, icase, dfa);
612546c6
UD
62
63 ret = re_string_realloc_buffers (pstr, init_buf_len);
64 if (BE (ret != REG_NOERROR, 0))
65 return ret;
66
56b168be
UD
67 pstr->word_char = dfa->word_char;
68 pstr->word_ops_used = dfa->word_ops_used;
37369d1c
UD
69 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
70 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
71 pstr->valid_raw_len = pstr->valid_len;
612546c6
UD
72 return REG_NOERROR;
73}
74
75/* This function allocate the buffers, and initialize them. */
76
77static reg_errcode_t
a334319f
UD
78re_string_construct (pstr, str, len, trans, icase, dfa)
79 re_string_t *pstr;
80 const char *str;
81 int len, icase;
82 RE_TRANSLATE_TYPE trans;
83 const re_dfa_t *dfa;
612546c6
UD
84{
85 reg_errcode_t ret;
56b168be 86 memset (pstr, '\0', sizeof (re_string_t));
f0c7c524 87 re_string_construct_common (str, len, pstr, trans, icase, dfa);
612546c6
UD
88
89 if (len > 0)
3b0bdc72 90 {
612546c6 91 ret = re_string_realloc_buffers (pstr, len + 1);
bc15410e 92 if (BE (ret != REG_NOERROR, 0))
1b2c2628 93 return ret;
3b0bdc72 94 }
37369d1c 95 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
612546c6
UD
96
97 if (icase)
98 {
99#ifdef RE_ENABLE_I18N
f0c7c524 100 if (dfa->mb_cur_max > 1)
37369d1c
UD
101 {
102 while (1)
103 {
104 ret = build_wcs_upper_buffer (pstr);
105 if (BE (ret != REG_NOERROR, 0))
106 return ret;
107 if (pstr->valid_raw_len >= len)
108 break;
109 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
110 break;
111 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
112 if (BE (ret != REG_NOERROR, 0))
113 return ret;
114 }
115 }
612546c6 116 else
3b0bdc72 117#endif /* RE_ENABLE_I18N */
1b2c2628 118 build_upper_buffer (pstr);
612546c6
UD
119 }
120 else
3b0bdc72 121 {
612546c6 122#ifdef RE_ENABLE_I18N
f0c7c524 123 if (dfa->mb_cur_max > 1)
1b2c2628 124 build_wcs_buffer (pstr);
612546c6
UD
125 else
126#endif /* RE_ENABLE_I18N */
1b2c2628
UD
127 {
128 if (trans != NULL)
129 re_string_translate_buffer (pstr);
130 else
37369d1c
UD
131 {
132 pstr->valid_len = pstr->bufs_len;
133 pstr->valid_raw_len = pstr->bufs_len;
134 }
1b2c2628 135 }
3b0bdc72 136 }
612546c6 137
3b0bdc72
UD
138 return REG_NOERROR;
139}
140
612546c6 141/* Helper functions for re_string_allocate, and re_string_construct. */
3b0bdc72
UD
142
143static reg_errcode_t
a334319f
UD
144re_string_realloc_buffers (pstr, new_buf_len)
145 re_string_t *pstr;
146 int new_buf_len;
3b0bdc72 147{
3b0bdc72 148#ifdef RE_ENABLE_I18N
3c0fb574 149 if (pstr->mb_cur_max > 1)
3b0bdc72 150 {
2d87db5b
UD
151 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
152 if (BE (new_wcs == NULL, 0))
1b2c2628 153 return REG_ESPACE;
2d87db5b 154 pstr->wcs = new_wcs;
37369d1c
UD
155 if (pstr->offsets != NULL)
156 {
2d87db5b
UD
157 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
158 if (BE (new_offsets == NULL, 0))
37369d1c 159 return REG_ESPACE;
2d87db5b 160 pstr->offsets = new_offsets;
37369d1c 161 }
3b0bdc72 162 }
3b0bdc72 163#endif /* RE_ENABLE_I18N */
37369d1c 164 if (pstr->mbs_allocated)
3b0bdc72 165 {
2d87db5b
UD
166 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
167 new_buf_len);
168 if (BE (new_mbs == NULL, 0))
1b2c2628 169 return REG_ESPACE;
2d87db5b 170 pstr->mbs = new_mbs;
3b0bdc72 171 }
612546c6 172 pstr->bufs_len = new_buf_len;
3b0bdc72
UD
173 return REG_NOERROR;
174}
175
612546c6 176
3b0bdc72 177static void
a334319f
UD
178re_string_construct_common (str, len, pstr, trans, icase, dfa)
179 const char *str;
180 int len;
181 re_string_t *pstr;
182 RE_TRANSLATE_TYPE trans;
183 int icase;
184 const re_dfa_t *dfa;
3b0bdc72 185{
c202c2c5 186 pstr->raw_mbs = (const unsigned char *) str;
3b0bdc72 187 pstr->len = len;
37369d1c 188 pstr->raw_len = len;
a334319f 189 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
612546c6 190 pstr->icase = icase ? 1 : 0;
37369d1c 191 pstr->mbs_allocated = (trans != NULL || icase);
f0c7c524
UD
192 pstr->mb_cur_max = dfa->mb_cur_max;
193 pstr->is_utf8 = dfa->is_utf8;
194 pstr->map_notascii = dfa->map_notascii;
37369d1c
UD
195 pstr->stop = pstr->len;
196 pstr->raw_stop = pstr->stop;
3b0bdc72
UD
197}
198
199#ifdef RE_ENABLE_I18N
200
612546c6 201/* Build wide character buffer PSTR->WCS.
3b0bdc72
UD
202 If the byte sequence of the string are:
203 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
204 Then wide character buffer will be:
205 <wc1> , WEOF , <wc2> , WEOF , <wc3>
206 We use WEOF for padding, they indicate that the position isn't
612546c6 207 a first byte of a multibyte character.
3b0bdc72 208
612546c6
UD
209 Note that this function assumes PSTR->VALID_LEN elements are already
210 built and starts from PSTR->VALID_LEN. */
211
212static void
a334319f
UD
213build_wcs_buffer (pstr)
214 re_string_t *pstr;
3b0bdc72 215{
37369d1c 216#ifdef _LIBC
ec73fd87
UD
217 unsigned char buf[MB_LEN_MAX];
218 assert (MB_LEN_MAX >= pstr->mb_cur_max);
37369d1c
UD
219#else
220 unsigned char buf[64];
221#endif
612546c6 222 mbstate_t prev_st;
88764ae2
UD
223 int byte_idx, end_idx, remain_len;
224 size_t mbclen;
37369d1c 225
612546c6
UD
226 /* Build the buffers from pstr->valid_len to either pstr->len or
227 pstr->bufs_len. */
37369d1c 228 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
612546c6
UD
229 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
230 {
231 wchar_t wc;
37369d1c
UD
232 const char *p;
233
612546c6
UD
234 remain_len = end_idx - byte_idx;
235 prev_st = pstr->cur_state;
37369d1c
UD
236 /* Apply the translation if we need. */
237 if (BE (pstr->trans != NULL, 0))
238 {
239 int i, ch;
240
241 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
242 {
243 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
d40eb37a 244 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
37369d1c
UD
245 }
246 p = (const char *) buf;
247 }
248 else
249 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
250 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
612546c6 251 if (BE (mbclen == (size_t) -2, 0))
1b2c2628
UD
252 {
253 /* The buffer doesn't have enough space, finish to build. */
254 pstr->cur_state = prev_st;
255 break;
256 }
612546c6 257 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
1b2c2628
UD
258 {
259 /* We treat these cases as a singlebyte character. */
260 mbclen = 1;
261 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
37369d1c
UD
262 if (BE (pstr->trans != NULL, 0))
263 wc = pstr->trans[wc];
1b2c2628
UD
264 pstr->cur_state = prev_st;
265 }
612546c6 266
3b0bdc72 267 /* Write wide character and padding. */
612546c6
UD
268 pstr->wcs[byte_idx++] = wc;
269 /* Write paddings. */
270 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1b2c2628 271 pstr->wcs[byte_idx++] = WEOF;
3b0bdc72 272 }
612546c6 273 pstr->valid_len = byte_idx;
37369d1c 274 pstr->valid_raw_len = byte_idx;
3b0bdc72
UD
275}
276
612546c6
UD
277/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
278 but for REG_ICASE. */
279
a334319f
UD
280static int
281build_wcs_upper_buffer (pstr)
282 re_string_t *pstr;
3b0bdc72 283{
612546c6 284 mbstate_t prev_st;
88764ae2
UD
285 int src_idx, byte_idx, end_idx, remain_len;
286 size_t mbclen;
37369d1c 287#ifdef _LIBC
ec73fd87
UD
288 char buf[MB_LEN_MAX];
289 assert (MB_LEN_MAX >= pstr->mb_cur_max);
37369d1c 290#else
1c99f950 291 char buf[64];
37369d1c
UD
292#endif
293
294 byte_idx = pstr->valid_len;
295 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
f0c7c524 296
e40a38b3
UD
297 /* The following optimization assumes that ASCII characters can be
298 mapped to wide characters with a simple cast. */
37369d1c
UD
299 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
300 {
301 while (byte_idx < end_idx)
1b2c2628 302 {
f0c7c524 303 wchar_t wc;
37369d1c
UD
304
305 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
306 && mbsinit (&pstr->cur_state))
307 {
308 /* In case of a singlebyte character. */
309 pstr->mbs[byte_idx]
310 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
311 /* The next step uses the assumption that wchar_t is encoded
e40a38b3 312 ASCII-safe: all ASCII values can be converted like this. */
37369d1c
UD
313 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
314 ++byte_idx;
315 continue;
316 }
317
f0c7c524
UD
318 remain_len = end_idx - byte_idx;
319 prev_st = pstr->cur_state;
320 mbclen = mbrtowc (&wc,
321 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
322 + byte_idx), remain_len, &pstr->cur_state);
88764ae2 323 if (BE (mbclen + 2 > 2, 1))
1b2c2628 324 {
37369d1c 325 wchar_t wcu = wc;
f0c7c524 326 if (iswlower (wc))
37369d1c 327 {
88764ae2 328 size_t mbcdlen;
37369d1c
UD
329
330 wcu = towupper (wc);
331 mbcdlen = wcrtomb (buf, wcu, &prev_st);
332 if (BE (mbclen == mbcdlen, 1))
333 memcpy (pstr->mbs + byte_idx, buf, mbclen);
334 else
335 {
336 src_idx = byte_idx;
337 goto offsets_needed;
338 }
339 }
f0c7c524
UD
340 else
341 memcpy (pstr->mbs + byte_idx,
342 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
37369d1c 343 pstr->wcs[byte_idx++] = wcu;
f0c7c524
UD
344 /* Write paddings. */
345 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
346 pstr->wcs[byte_idx++] = WEOF;
347 }
348 else if (mbclen == (size_t) -1 || mbclen == 0)
349 {
37369d1c 350 /* It is an invalid character or '\0'. Just use the byte. */
f0c7c524 351 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
02b50340
UD
352 pstr->mbs[byte_idx] = ch;
353 /* And also cast it to wide char. */
354 pstr->wcs[byte_idx++] = (wchar_t) ch;
f0c7c524
UD
355 if (BE (mbclen == (size_t) -1, 0))
356 pstr->cur_state = prev_st;
1b2c2628 357 }
1b2c2628 358 else
f0c7c524
UD
359 {
360 /* The buffer doesn't have enough space, finish to build. */
361 pstr->cur_state = prev_st;
362 break;
363 }
1b2c2628 364 }
37369d1c
UD
365 pstr->valid_len = byte_idx;
366 pstr->valid_raw_len = byte_idx;
367 return REG_NOERROR;
368 }
f0c7c524 369 else
37369d1c 370 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
f0c7c524
UD
371 {
372 wchar_t wc;
37369d1c 373 const char *p;
e40a38b3 374 offsets_needed:
f0c7c524
UD
375 remain_len = end_idx - byte_idx;
376 prev_st = pstr->cur_state;
37369d1c 377 if (BE (pstr->trans != NULL, 0))
f0c7c524 378 {
37369d1c
UD
379 int i, ch;
380
381 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
f0c7c524 382 {
37369d1c
UD
383 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
384 buf[i] = pstr->trans[ch];
f0c7c524 385 }
37369d1c 386 p = (const char *) buf;
f0c7c524 387 }
37369d1c
UD
388 else
389 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
390 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
88764ae2 391 if (BE (mbclen + 2 > 2, 1))
f0c7c524 392 {
37369d1c 393 wchar_t wcu = wc;
f0c7c524 394 if (iswlower (wc))
37369d1c 395 {
88764ae2 396 size_t mbcdlen;
37369d1c
UD
397
398 wcu = towupper (wc);
17568537 399 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
37369d1c
UD
400 if (BE (mbclen == mbcdlen, 1))
401 memcpy (pstr->mbs + byte_idx, buf, mbclen);
88764ae2 402 else if (mbcdlen != (size_t) -1)
37369d1c 403 {
88764ae2 404 size_t i;
37369d1c
UD
405
406 if (byte_idx + mbcdlen > pstr->bufs_len)
407 {
408 pstr->cur_state = prev_st;
409 break;
410 }
411
412 if (pstr->offsets == NULL)
413 {
414 pstr->offsets = re_malloc (int, pstr->bufs_len);
415
416 if (pstr->offsets == NULL)
417 return REG_ESPACE;
418 }
419 if (!pstr->offsets_needed)
420 {
88764ae2 421 for (i = 0; i < (size_t) byte_idx; ++i)
37369d1c
UD
422 pstr->offsets[i] = i;
423 pstr->offsets_needed = 1;
424 }
425
426 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
427 pstr->wcs[byte_idx] = wcu;
428 pstr->offsets[byte_idx] = src_idx;
429 for (i = 1; i < mbcdlen; ++i)
430 {
431 pstr->offsets[byte_idx + i]
432 = src_idx + (i < mbclen ? i : mbclen - 1);
433 pstr->wcs[byte_idx + i] = WEOF;
434 }
435 pstr->len += mbcdlen - mbclen;
436 if (pstr->raw_stop > src_idx)
437 pstr->stop += mbcdlen - mbclen;
438 end_idx = (pstr->bufs_len > pstr->len)
439 ? pstr->len : pstr->bufs_len;
440 byte_idx += mbcdlen;
441 src_idx += mbclen;
442 continue;
443 }
88764ae2
UD
444 else
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
37369d1c 446 }
f0c7c524 447 else
37369d1c
UD
448 memcpy (pstr->mbs + byte_idx, p, mbclen);
449
450 if (BE (pstr->offsets_needed != 0, 0))
451 {
88764ae2 452 size_t i;
37369d1c
UD
453 for (i = 0; i < mbclen; ++i)
454 pstr->offsets[byte_idx + i] = src_idx + i;
455 }
456 src_idx += mbclen;
457
458 pstr->wcs[byte_idx++] = wcu;
f0c7c524
UD
459 /* Write paddings. */
460 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
461 pstr->wcs[byte_idx++] = WEOF;
462 }
02b50340
UD
463 else if (mbclen == (size_t) -1 || mbclen == 0)
464 {
37369d1c
UD
465 /* It is an invalid character or '\0'. Just use the byte. */
466 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
467
468 if (BE (pstr->trans != NULL, 0))
469 ch = pstr->trans [ch];
02b50340 470 pstr->mbs[byte_idx] = ch;
37369d1c
UD
471
472 if (BE (pstr->offsets_needed != 0, 0))
473 pstr->offsets[byte_idx] = src_idx;
474 ++src_idx;
475
02b50340
UD
476 /* And also cast it to wide char. */
477 pstr->wcs[byte_idx++] = (wchar_t) ch;
478 if (BE (mbclen == (size_t) -1, 0))
479 pstr->cur_state = prev_st;
480 }
f0c7c524
UD
481 else
482 {
483 /* The buffer doesn't have enough space, finish to build. */
484 pstr->cur_state = prev_st;
485 break;
486 }
487 }
612546c6 488 pstr->valid_len = byte_idx;
37369d1c
UD
489 pstr->valid_raw_len = src_idx;
490 return REG_NOERROR;
612546c6
UD
491}
492
493/* Skip characters until the index becomes greater than NEW_RAW_IDX.
494 Return the index. */
495
496static int
a334319f
UD
497re_string_skip_chars (pstr, new_raw_idx, last_wc)
498 re_string_t *pstr;
499 int new_raw_idx;
500 wint_t *last_wc;
612546c6
UD
501{
502 mbstate_t prev_st;
88764ae2
UD
503 int rawbuf_idx;
504 size_t mbclen;
a334319f 505 wchar_t wc = 0;
612546c6
UD
506
507 /* Skip the characters which are not necessary to check. */
37369d1c 508 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
612546c6
UD
509 rawbuf_idx < new_raw_idx;)
510 {
1843975c
RM
511 int remain_len;
512 remain_len = pstr->len - rawbuf_idx;
612546c6 513 prev_st = pstr->cur_state;
1843975c
RM
514 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
515 remain_len, &pstr->cur_state);
612546c6 516 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
1b2c2628 517 {
a334319f 518 /* We treat these cases as a singlebyte character. */
1b2c2628
UD
519 mbclen = 1;
520 pstr->cur_state = prev_st;
521 }
612546c6
UD
522 /* Then proceed the next character. */
523 rawbuf_idx += mbclen;
524 }
1843975c 525 *last_wc = (wint_t) wc;
612546c6 526 return rawbuf_idx;
3b0bdc72
UD
527}
528#endif /* RE_ENABLE_I18N */
529
1b2c2628 530/* Build the buffer PSTR->MBS, and apply the translation if we need.
612546c6
UD
531 This function is used in case of REG_ICASE. */
532
533static void
a334319f
UD
534build_upper_buffer (pstr)
535 re_string_t *pstr;
3b0bdc72 536{
612546c6
UD
537 int char_idx, end_idx;
538 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
3b0bdc72 539
612546c6 540 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
3b0bdc72 541 {
612546c6 542 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
37369d1c
UD
543 if (BE (pstr->trans != NULL, 0))
544 ch = pstr->trans[ch];
612546c6 545 if (islower (ch))
1b2c2628 546 pstr->mbs[char_idx] = toupper (ch);
3b0bdc72 547 else
1b2c2628 548 pstr->mbs[char_idx] = ch;
3b0bdc72 549 }
612546c6 550 pstr->valid_len = char_idx;
37369d1c 551 pstr->valid_raw_len = char_idx;
3b0bdc72
UD
552}
553
612546c6 554/* Apply TRANS to the buffer in PSTR. */
3b0bdc72 555
612546c6 556static void
a334319f
UD
557re_string_translate_buffer (pstr)
558 re_string_t *pstr;
3b0bdc72 559{
612546c6
UD
560 int buf_idx, end_idx;
561 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
562
563 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
3b0bdc72 564 {
612546c6 565 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
37369d1c 566 pstr->mbs[buf_idx] = pstr->trans[ch];
3b0bdc72 567 }
612546c6
UD
568
569 pstr->valid_len = buf_idx;
37369d1c 570 pstr->valid_raw_len = buf_idx;
612546c6
UD
571}
572
573/* This function re-construct the buffers.
3c0fb574 574 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
612546c6
UD
575 convert to upper case in case of REG_ICASE, apply translation. */
576
577static reg_errcode_t
a334319f
UD
578re_string_reconstruct (pstr, idx, eflags)
579 re_string_t *pstr;
580 int idx, eflags;
612546c6
UD
581{
582 int offset = idx - pstr->raw_mbs_idx;
bb677c95 583 if (BE (offset < 0, 0))
612546c6
UD
584 {
585 /* Reset buffer. */
434d3784 586#ifdef RE_ENABLE_I18N
3c0fb574 587 if (pstr->mb_cur_max > 1)
1b2c2628 588 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
434d3784 589#endif /* RE_ENABLE_I18N */
37369d1c
UD
590 pstr->len = pstr->raw_len;
591 pstr->stop = pstr->raw_stop;
592 pstr->valid_len = 0;
593 pstr->raw_mbs_idx = 0;
594 pstr->valid_raw_len = 0;
595 pstr->offsets_needed = 0;
612546c6 596 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
1b2c2628 597 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
37369d1c 598 if (!pstr->mbs_allocated)
1b2c2628 599 pstr->mbs = (unsigned char *) pstr->raw_mbs;
612546c6
UD
600 offset = idx;
601 }
602
bb677c95 603 if (BE (offset != 0, 1))
612546c6 604 {
a334319f
UD
605 /* Are the characters which are already checked remain? */
606 if (BE (offset < pstr->valid_raw_len, 1)
3b0bdc72 607#ifdef RE_ENABLE_I18N
a334319f
UD
608 /* Handling this would enlarge the code too much.
609 Accept a slowdown in that case. */
610 && pstr->offsets_needed == 0
0ecb606c 611#endif
a334319f
UD
612 )
613 {
614 /* Yes, move them to the front of the buffer. */
615 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
0ecb606c 616#ifdef RE_ENABLE_I18N
a334319f
UD
617 if (pstr->mb_cur_max > 1)
618 memmove (pstr->wcs, pstr->wcs + offset,
619 (pstr->valid_len - offset) * sizeof (wint_t));
612546c6 620#endif /* RE_ENABLE_I18N */
a334319f
UD
621 if (BE (pstr->mbs_allocated, 0))
622 memmove (pstr->mbs, pstr->mbs + offset,
623 pstr->valid_len - offset);
624 pstr->valid_len -= offset;
625 pstr->valid_raw_len -= offset;
612546c6 626#if DEBUG
a334319f 627 assert (pstr->valid_len > 0);
3b0bdc72 628#endif
1b2c2628 629 }
612546c6 630 else
1b2c2628
UD
631 {
632 /* No, skip all characters until IDX. */
37369d1c
UD
633#ifdef RE_ENABLE_I18N
634 if (BE (pstr->offsets_needed, 0))
635 {
636 pstr->len = pstr->raw_len - idx + offset;
637 pstr->stop = pstr->raw_stop - idx + offset;
638 pstr->offsets_needed = 0;
639 }
640#endif
1b2c2628 641 pstr->valid_len = 0;
a334319f 642 pstr->valid_raw_len = 0;
3b0bdc72 643#ifdef RE_ENABLE_I18N
3c0fb574 644 if (pstr->mb_cur_max > 1)
1b2c2628
UD
645 {
646 int wcs_idx;
3c0fb574
UD
647 wint_t wc = WEOF;
648
649 if (pstr->is_utf8)
650 {
37369d1c 651 const unsigned char *raw, *p, *q, *end;
3c0fb574
UD
652
653 /* Special case UTF-8. Multi-byte chars start with any
654 byte other than 0x80 - 0xbf. */
655 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
97fd3a30 656 end = raw + (offset - pstr->mb_cur_max);
a334319f
UD
657 for (p = raw + offset - 1; p >= end; --p)
658 if ((*p & 0xc0) != 0x80)
659 {
660 mbstate_t cur_state;
661 wchar_t wc2;
662 int mlen = raw + pstr->len - p;
663 unsigned char buf[6];
664
665 q = p;
666 if (BE (pstr->trans != NULL, 0))
667 {
668 int i = mlen < 6 ? mlen : 6;
669 while (--i >= 0)
670 buf[i] = pstr->trans[p[i]];
671 q = buf;
672 }
673 /* XXX Don't use mbrtowc, we know which conversion
674 to use (UTF-8 -> UCS4). */
675 memset (&cur_state, 0, sizeof (cur_state));
1c99f950
UD
676 mlen = (mbrtowc (&wc2, (const char *) p, mlen,
677 &cur_state)
678 - (raw + offset - p));
a334319f
UD
679 if (mlen >= 0)
680 {
681 memset (&pstr->cur_state, '\0',
682 sizeof (mbstate_t));
683 pstr->valid_len = mlen;
684 wc = wc2;
685 }
686 break;
687 }
3c0fb574 688 }
e40a38b3 689
3c0fb574 690 if (wc == WEOF)
97fd3a30 691 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
37369d1c
UD
692 if (BE (pstr->valid_len, 0))
693 {
694 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
695 pstr->wcs[wcs_idx] = WEOF;
696 if (pstr->mbs_allocated)
697 memset (pstr->mbs, 255, pstr->valid_len);
698 }
699 pstr->valid_raw_len = pstr->valid_len;
a334319f
UD
700 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
701 && IS_WIDE_WORD_CHAR (wc))
702 ? CONTEXT_WORD
703 : ((IS_WIDE_NEWLINE (wc)
704 && pstr->newline_anchor)
705 ? CONTEXT_NEWLINE : 0));
1b2c2628 706 }
1843975c 707 else
612546c6 708#endif /* RE_ENABLE_I18N */
1843975c
RM
709 {
710 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
711 if (pstr->trans)
712 c = pstr->trans[c];
56b168be
UD
713 pstr->tip_context = (bitset_contain (pstr->word_char, c)
714 ? CONTEXT_WORD
715 : ((IS_NEWLINE (c) && pstr->newline_anchor)
1843975c
RM
716 ? CONTEXT_NEWLINE : 0));
717 }
1b2c2628 718 }
bb677c95 719 if (!BE (pstr->mbs_allocated, 0))
37369d1c 720 pstr->mbs += offset;
3b0bdc72 721 }
612546c6
UD
722 pstr->raw_mbs_idx = idx;
723 pstr->len -= offset;
ac3d553b 724 pstr->stop -= offset;
612546c6
UD
725
726 /* Then build the buffers. */
727#ifdef RE_ENABLE_I18N
3c0fb574 728 if (pstr->mb_cur_max > 1)
3b0bdc72 729 {
612546c6 730 if (pstr->icase)
37369d1c 731 {
a334319f 732 int ret = build_wcs_upper_buffer (pstr);
37369d1c
UD
733 if (BE (ret != REG_NOERROR, 0))
734 return ret;
735 }
612546c6 736 else
1b2c2628 737 build_wcs_buffer (pstr);
3b0bdc72
UD
738 }
739 else
612546c6 740#endif /* RE_ENABLE_I18N */
a334319f
UD
741 if (BE (pstr->mbs_allocated, 0))
742 {
743 if (pstr->icase)
744 build_upper_buffer (pstr);
745 else if (pstr->trans != NULL)
746 re_string_translate_buffer (pstr);
747 }
748 else
749 pstr->valid_len = pstr->len;
612546c6 750
bb677c95 751 pstr->cur_idx = 0;
3b0bdc72
UD
752 return REG_NOERROR;
753}
754
37369d1c 755static unsigned char
a334319f
UD
756re_string_peek_byte_case (pstr, idx)
757 const re_string_t *pstr;
758 int idx;
37369d1c
UD
759{
760 int ch, off;
761
762 /* Handle the common (easiest) cases first. */
f39eef7b 763 if (BE (!pstr->mbs_allocated, 1))
37369d1c
UD
764 return re_string_peek_byte (pstr, idx);
765
766#ifdef RE_ENABLE_I18N
767 if (pstr->mb_cur_max > 1
768 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
769 return re_string_peek_byte (pstr, idx);
770#endif
771
772 off = pstr->cur_idx + idx;
773#ifdef RE_ENABLE_I18N
774 if (pstr->offsets_needed)
775 off = pstr->offsets[off];
776#endif
777
778 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
37369d1c
UD
779
780#ifdef RE_ENABLE_I18N
781 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
782 this function returns CAPITAL LETTER I instead of first byte of
783 DOTLESS SMALL LETTER I. The latter would confuse the parser,
784 since peek_byte_case doesn't advance cur_idx in any way. */
785 if (pstr->offsets_needed && !isascii (ch))
786 return re_string_peek_byte (pstr, idx);
787#endif
788
789 return ch;
790}
791
792static unsigned char
a334319f
UD
793re_string_fetch_byte_case (pstr)
794 re_string_t *pstr;
37369d1c 795{
f39eef7b 796 if (BE (!pstr->mbs_allocated, 1))
37369d1c
UD
797 return re_string_fetch_byte (pstr);
798
799#ifdef RE_ENABLE_I18N
800 if (pstr->offsets_needed)
801 {
f39eef7b 802 int off, ch;
c0d5034e 803
37369d1c
UD
804 /* For tr_TR.UTF-8 [[:islower:]] there is
805 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
806 in that case the whole multi-byte character and return
807 the original letter. On the other side, with
808 [[: DOTLESS SMALL LETTER I return [[:I, as doing
809 anything else would complicate things too much. */
810
811 if (!re_string_first_byte (pstr, pstr->cur_idx))
812 return re_string_fetch_byte (pstr);
813
814 off = pstr->offsets[pstr->cur_idx];
815 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
37369d1c
UD
816
817 if (! isascii (ch))
818 return re_string_fetch_byte (pstr);
819
820 re_string_skip_bytes (pstr,
821 re_string_char_size_at (pstr, pstr->cur_idx));
822 return ch;
823 }
824#endif
825
f39eef7b 826 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
37369d1c
UD
827}
828
3b0bdc72 829static void
a334319f
UD
830re_string_destruct (pstr)
831 re_string_t *pstr;
3b0bdc72
UD
832{
833#ifdef RE_ENABLE_I18N
834 re_free (pstr->wcs);
37369d1c 835 re_free (pstr->offsets);
3b0bdc72 836#endif /* RE_ENABLE_I18N */
37369d1c 837 if (pstr->mbs_allocated)
612546c6 838 re_free (pstr->mbs);
3b0bdc72
UD
839}
840
841/* Return the context at IDX in INPUT. */
612546c6 842
3b0bdc72 843static unsigned int
a334319f
UD
844re_string_context_at (input, idx, eflags)
845 const re_string_t *input;
846 int idx, eflags;
3b0bdc72
UD
847{
848 int c;
bb677c95
UD
849 if (BE (idx < 0, 0))
850 /* In this case, we use the value stored in input->tip_context,
851 since we can't know the character in input->mbs[-1] here. */
852 return input->tip_context;
853 if (BE (idx == input->len, 0))
854 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
855 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
a0756231 856#ifdef RE_ENABLE_I18N
3c0fb574 857 if (input->mb_cur_max > 1)
1843975c
RM
858 {
859 wint_t wc;
860 int wc_idx = idx;
861 while(input->wcs[wc_idx] == WEOF)
862 {
863#ifdef DEBUG
864 /* It must not happen. */
865 assert (wc_idx >= 0);
866#endif
867 --wc_idx;
868 if (wc_idx < 0)
869 return input->tip_context;
870 }
871 wc = input->wcs[wc_idx];
56b168be 872 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
1843975c 873 return CONTEXT_WORD;
56b168be
UD
874 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
875 ? CONTEXT_NEWLINE : 0);
1843975c 876 }
a0756231
RM
877 else
878#endif
879 {
880 c = re_string_byte_at (input, idx);
56b168be 881 if (bitset_contain (input->word_char, c))
a0756231 882 return CONTEXT_WORD;
56b168be 883 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
a0756231 884 }
3b0bdc72
UD
885}
886\f
887/* Functions for set operation. */
888
889static reg_errcode_t
a334319f
UD
890re_node_set_alloc (set, size)
891 re_node_set *set;
892 int size;
3b0bdc72
UD
893{
894 set->alloc = size;
895 set->nelem = 0;
896 set->elems = re_malloc (int, size);
bc15410e 897 if (BE (set->elems == NULL, 0))
3b0bdc72
UD
898 return REG_ESPACE;
899 return REG_NOERROR;
900}
901
902static reg_errcode_t
a334319f
UD
903re_node_set_init_1 (set, elem)
904 re_node_set *set;
905 int elem;
3b0bdc72
UD
906{
907 set->alloc = 1;
908 set->nelem = 1;
909 set->elems = re_malloc (int, 1);
bc15410e 910 if (BE (set->elems == NULL, 0))
6291ee3c
UD
911 {
912 set->alloc = set->nelem = 0;
913 return REG_ESPACE;
914 }
3b0bdc72
UD
915 set->elems[0] = elem;
916 return REG_NOERROR;
917}
918
919static reg_errcode_t
a334319f
UD
920re_node_set_init_2 (set, elem1, elem2)
921 re_node_set *set;
922 int elem1, elem2;
3b0bdc72
UD
923{
924 set->alloc = 2;
925 set->elems = re_malloc (int, 2);
bc15410e 926 if (BE (set->elems == NULL, 0))
3b0bdc72
UD
927 return REG_ESPACE;
928 if (elem1 == elem2)
929 {
930 set->nelem = 1;
931 set->elems[0] = elem1;
932 }
933 else
934 {
935 set->nelem = 2;
936 if (elem1 < elem2)
1b2c2628
UD
937 {
938 set->elems[0] = elem1;
939 set->elems[1] = elem2;
940 }
3b0bdc72 941 else
1b2c2628
UD
942 {
943 set->elems[0] = elem2;
944 set->elems[1] = elem1;
945 }
3b0bdc72
UD
946 }
947 return REG_NOERROR;
948}
949
950static reg_errcode_t
a334319f
UD
951re_node_set_init_copy (dest, src)
952 re_node_set *dest;
953 const re_node_set *src;
3b0bdc72
UD
954{
955 dest->nelem = src->nelem;
956 if (src->nelem > 0)
957 {
958 dest->alloc = dest->nelem;
959 dest->elems = re_malloc (int, dest->alloc);
bc15410e 960 if (BE (dest->elems == NULL, 0))
6291ee3c
UD
961 {
962 dest->alloc = dest->nelem = 0;
963 return REG_ESPACE;
964 }
3b0bdc72
UD
965 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
966 }
967 else
968 re_node_set_init_empty (dest);
969 return REG_NOERROR;
970}
971
a9388965
UD
972/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
973 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
974 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
975
3b0bdc72 976static reg_errcode_t
a334319f
UD
977re_node_set_add_intersect (dest, src1, src2)
978 re_node_set *dest;
979 const re_node_set *src1, *src2;
3b0bdc72 980{
59e7ebcc
UD
981 int i1, i2, is, id, delta, sbase;
982 if (src1->nelem == 0 || src2->nelem == 0)
983 return REG_NOERROR;
984
985 /* We need dest->nelem + 2 * elems_in_intersection; this is a
986 conservative estimate. */
987 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
3b0bdc72 988 {
59e7ebcc
UD
989 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
990 int *new_elems = re_realloc (dest->elems, int, new_alloc);
991 if (BE (new_elems == NULL, 0))
992 return REG_ESPACE;
993 dest->elems = new_elems;
994 dest->alloc = new_alloc;
3b0bdc72 995 }
3b0bdc72 996
59e7ebcc
UD
997 /* Find the items in the intersection of SRC1 and SRC2, and copy
998 into the top of DEST those that are not already in DEST itself. */
999 sbase = dest->nelem + src1->nelem + src2->nelem;
1000 i1 = src1->nelem - 1;
1001 i2 = src2->nelem - 1;
1002 id = dest->nelem - 1;
1003 for (;;)
3b0bdc72 1004 {
59e7ebcc 1005 if (src1->elems[i1] == src2->elems[i2])
1b2c2628 1006 {
59e7ebcc
UD
1007 /* Try to find the item in DEST. Maybe we could binary search? */
1008 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1009 --id;
1010
1011 if (id < 0 || dest->elems[id] != src1->elems[i1])
1012 dest->elems[--sbase] = src1->elems[i1];
1013
1014 if (--i1 < 0 || --i2 < 0)
1015 break;
1b2c2628 1016 }
59e7ebcc
UD
1017
1018 /* Lower the highest of the two items. */
1019 else if (src1->elems[i1] < src2->elems[i2])
1b2c2628 1020 {
59e7ebcc
UD
1021 if (--i2 < 0)
1022 break;
1023 }
1024 else
1025 {
1026 if (--i1 < 0)
1027 break;
1b2c2628 1028 }
3b0bdc72 1029 }
59e7ebcc
UD
1030
1031 id = dest->nelem - 1;
1032 is = dest->nelem + src1->nelem + src2->nelem - 1;
1033 delta = is - sbase + 1;
1034
1035 /* Now copy. When DELTA becomes zero, the remaining
1036 DEST elements are already in place; this is more or
1037 less the same loop that is in re_node_set_merge. */
1038 dest->nelem += delta;
1039 if (delta > 0 && id >= 0)
1040 for (;;)
1041 {
1042 if (dest->elems[is] > dest->elems[id])
1043 {
1044 /* Copy from the top. */
1045 dest->elems[id + delta--] = dest->elems[is--];
1046 if (delta == 0)
1047 break;
1048 }
1049 else
1050 {
1051 /* Slide from the bottom. */
1052 dest->elems[id + delta] = dest->elems[id];
1053 if (--id < 0)
1054 break;
1055 }
1056 }
1057
1058 /* Copy remaining SRC elements. */
1059 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1060
3b0bdc72
UD
1061 return REG_NOERROR;
1062}
1063
a9388965
UD
1064/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1065 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1066
3b0bdc72 1067static reg_errcode_t
a334319f
UD
1068re_node_set_init_union (dest, src1, src2)
1069 re_node_set *dest;
1070 const re_node_set *src1, *src2;
3b0bdc72
UD
1071{
1072 int i1, i2, id;
1073 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1074 {
1075 dest->alloc = src1->nelem + src2->nelem;
1076 dest->elems = re_malloc (int, dest->alloc);
bc15410e 1077 if (BE (dest->elems == NULL, 0))
1b2c2628 1078 return REG_ESPACE;
3b0bdc72
UD
1079 }
1080 else
1081 {
1082 if (src1 != NULL && src1->nelem > 0)
1b2c2628 1083 return re_node_set_init_copy (dest, src1);
3b0bdc72 1084 else if (src2 != NULL && src2->nelem > 0)
1b2c2628 1085 return re_node_set_init_copy (dest, src2);
3b0bdc72 1086 else
1b2c2628 1087 re_node_set_init_empty (dest);
3b0bdc72
UD
1088 return REG_NOERROR;
1089 }
1090 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1091 {
1092 if (src1->elems[i1] > src2->elems[i2])
1b2c2628
UD
1093 {
1094 dest->elems[id++] = src2->elems[i2++];
1095 continue;
1096 }
3b0bdc72 1097 if (src1->elems[i1] == src2->elems[i2])
1b2c2628 1098 ++i2;
3b0bdc72
UD
1099 dest->elems[id++] = src1->elems[i1++];
1100 }
1101 if (i1 < src1->nelem)
1102 {
1103 memcpy (dest->elems + id, src1->elems + i1,
1b2c2628 1104 (src1->nelem - i1) * sizeof (int));
3b0bdc72
UD
1105 id += src1->nelem - i1;
1106 }
1107 else if (i2 < src2->nelem)
1108 {
1109 memcpy (dest->elems + id, src2->elems + i2,
1b2c2628 1110 (src2->nelem - i2) * sizeof (int));
3b0bdc72
UD
1111 id += src2->nelem - i2;
1112 }
1113 dest->nelem = id;
1114 return REG_NOERROR;
1115}
1116
a9388965
UD
1117/* Calculate the union set of the sets DEST and SRC. And store it to
1118 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1119
3b0bdc72 1120static reg_errcode_t
a334319f
UD
1121re_node_set_merge (dest, src)
1122 re_node_set *dest;
1123 const re_node_set *src;
3b0bdc72 1124{
59e7ebcc 1125 int is, id, sbase, delta;
3b0bdc72
UD
1126 if (src == NULL || src->nelem == 0)
1127 return REG_NOERROR;
59e7ebcc 1128 if (dest->alloc < 2 * src->nelem + dest->nelem)
3b0bdc72 1129 {
951d6408
UD
1130 int new_alloc = 2 * (src->nelem + dest->alloc);
1131 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1b2c2628
UD
1132 if (BE (new_buffer == NULL, 0))
1133 return REG_ESPACE;
1134 dest->elems = new_buffer;
951d6408 1135 dest->alloc = new_alloc;
3b0bdc72
UD
1136 }
1137
59e7ebcc 1138 if (BE (dest->nelem == 0, 0))
3b0bdc72 1139 {
59e7ebcc
UD
1140 dest->nelem = src->nelem;
1141 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1142 return REG_NOERROR;
1143 }
3b0bdc72 1144
59e7ebcc
UD
1145 /* Copy into the top of DEST the items of SRC that are not
1146 found in DEST. Maybe we could binary search in DEST? */
1147 for (sbase = dest->nelem + 2 * src->nelem,
1148 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1149 {
1150 if (dest->elems[id] == src->elems[is])
1151 is--, id--;
1152 else if (dest->elems[id] < src->elems[is])
1153 dest->elems[--sbase] = src->elems[is--];
1154 else /* if (dest->elems[id] > src->elems[is]) */
1155 --id;
1156 }
3b0bdc72 1157
59e7ebcc
UD
1158 if (is >= 0)
1159 {
1160 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1161 sbase -= is + 1;
1162 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
3b0bdc72
UD
1163 }
1164
59e7ebcc
UD
1165 id = dest->nelem - 1;
1166 is = dest->nelem + 2 * src->nelem - 1;
1167 delta = is - sbase + 1;
1168 if (delta == 0)
1169 return REG_NOERROR;
1170
1171 /* Now copy. When DELTA becomes zero, the remaining
1172 DEST elements are already in place. */
1173 dest->nelem += delta;
1174 for (;;)
3b0bdc72 1175 {
59e7ebcc
UD
1176 if (dest->elems[is] > dest->elems[id])
1177 {
1178 /* Copy from the top. */
1179 dest->elems[id + delta--] = dest->elems[is--];
1180 if (delta == 0)
1181 break;
1182 }
1183 else
1184 {
1185 /* Slide from the bottom. */
1186 dest->elems[id + delta] = dest->elems[id];
1187 if (--id < 0)
1188 {
1189 /* Copy remaining SRC elements. */
1190 memcpy (dest->elems, dest->elems + sbase,
1191 delta * sizeof (int));
1192 break;
1193 }
1194 }
3b0bdc72 1195 }
59e7ebcc 1196
3b0bdc72
UD
1197 return REG_NOERROR;
1198}
1199
1200/* Insert the new element ELEM to the re_node_set* SET.
8503c987 1201 SET should not already have ELEM.
3b0bdc72
UD
1202 return -1 if an error is occured, return 1 otherwise. */
1203
1204static int
a334319f
UD
1205re_node_set_insert (set, elem)
1206 re_node_set *set;
1207 int elem;
3b0bdc72 1208{
56b168be 1209 int idx;
8503c987
UD
1210 /* In case the set is empty. */
1211 if (set->alloc == 0)
3b0bdc72 1212 {
bc15410e 1213 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1b2c2628 1214 return 1;
3b0bdc72 1215 else
1b2c2628 1216 return -1;
3b0bdc72
UD
1217 }
1218
8503c987 1219 if (BE (set->nelem, 0) == 0)
3b0bdc72 1220 {
8503c987
UD
1221 /* We already guaranteed above that set->alloc != 0. */
1222 set->elems[0] = elem;
1223 ++set->nelem;
1224 return 1;
3b0bdc72
UD
1225 }
1226
1227 /* Realloc if we need. */
8503c987 1228 if (set->alloc == set->nelem)
3b0bdc72 1229 {
2d87db5b 1230 int *new_elems;
3b0bdc72 1231 set->alloc = set->alloc * 2;
2d87db5b
UD
1232 new_elems = re_realloc (set->elems, int, set->alloc);
1233 if (BE (new_elems == NULL, 0))
1b2c2628 1234 return -1;
2d87db5b 1235 set->elems = new_elems;
3b0bdc72 1236 }
8503c987
UD
1237
1238 /* Move the elements which follows the new element. Test the
1239 first element separately to skip a check in the inner loop. */
1240 if (elem < set->elems[0])
1241 {
1242 idx = 0;
59e7ebcc
UD
1243 for (idx = set->nelem; idx > 0; idx--)
1244 set->elems[idx] = set->elems[idx - 1];
8503c987 1245 }
3b0bdc72
UD
1246 else
1247 {
8503c987
UD
1248 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1249 set->elems[idx] = set->elems[idx - 1];
3b0bdc72 1250 }
8503c987 1251
3b0bdc72
UD
1252 /* Insert the new element. */
1253 set->elems[idx] = elem;
1254 ++set->nelem;
1255 return 1;
1256}
1257
20dc2f79
UD
1258/* Insert the new element ELEM to the re_node_set* SET.
1259 SET should not already have any element greater than or equal to ELEM.
1260 Return -1 if an error is occured, return 1 otherwise. */
1261
1262static int
a334319f
UD
1263re_node_set_insert_last (set, elem)
1264 re_node_set *set;
1265 int elem;
20dc2f79
UD
1266{
1267 /* Realloc if we need. */
1268 if (set->alloc == set->nelem)
1269 {
2d87db5b 1270 int *new_elems;
20dc2f79 1271 set->alloc = (set->alloc + 1) * 2;
2d87db5b
UD
1272 new_elems = re_realloc (set->elems, int, set->alloc);
1273 if (BE (new_elems == NULL, 0))
20dc2f79 1274 return -1;
2d87db5b 1275 set->elems = new_elems;
20dc2f79
UD
1276 }
1277
1278 /* Insert the new element. */
1279 set->elems[set->nelem++] = elem;
1280 return 1;
1281}
1282
3b0bdc72 1283/* Compare two node sets SET1 and SET2.
56b168be 1284 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
3b0bdc72
UD
1285
1286static int
a334319f
UD
1287re_node_set_compare (set1, set2)
1288 const re_node_set *set1, *set2;
3b0bdc72
UD
1289{
1290 int i;
1291 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1292 return 0;
59e7ebcc 1293 for (i = set1->nelem ; --i >= 0 ; )
3b0bdc72
UD
1294 if (set1->elems[i] != set2->elems[i])
1295 return 0;
1296 return 1;
1297}
1298
0742e48e 1299/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
3b0bdc72
UD
1300
1301static int
a334319f
UD
1302re_node_set_contains (set, elem)
1303 const re_node_set *set;
1304 int elem;
3b0bdc72 1305{
5cf1ec52 1306 unsigned int idx, right, mid;
3b0bdc72
UD
1307 if (set->nelem <= 0)
1308 return 0;
1309
1310 /* Binary search the element. */
1311 idx = 0;
1312 right = set->nelem - 1;
1313 while (idx < right)
1314 {
1315 mid = (idx + right) / 2;
1316 if (set->elems[mid] < elem)
1b2c2628 1317 idx = mid + 1;
3b0bdc72 1318 else
1b2c2628 1319 right = mid;
3b0bdc72 1320 }
0742e48e 1321 return set->elems[idx] == elem ? idx + 1 : 0;
3b0bdc72
UD
1322}
1323
1324static void
a334319f
UD
1325re_node_set_remove_at (set, idx)
1326 re_node_set *set;
1327 int idx;
3b0bdc72
UD
1328{
1329 if (idx < 0 || idx >= set->nelem)
1330 return;
3b0bdc72 1331 --set->nelem;
59e7ebcc
UD
1332 for (; idx < set->nelem; idx++)
1333 set->elems[idx] = set->elems[idx + 1];
3b0bdc72
UD
1334}
1335\f
1336
1337/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1338 Or return -1, if an error will be occured. */
1339
1340static int
02f3550c 1341re_dfa_add_node (dfa, token)
a334319f
UD
1342 re_dfa_t *dfa;
1343 re_token_t token;
3b0bdc72 1344{
02f3550c 1345 int type = token.type;
18e3dc56 1346 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
3b0bdc72 1347 {
a334319f 1348 int new_nodes_alloc = dfa->nodes_alloc * 2;
02f3550c 1349 int *new_nexts, *new_indices;
963d8d78 1350 re_node_set *new_edests, *new_eclosures;
02f3550c 1351
2d87db5b 1352 re_token_t *new_nodes = re_realloc (dfa->nodes, re_token_t,
a334319f 1353 new_nodes_alloc);
2d87db5b 1354 if (BE (new_nodes == NULL, 0))
0ecb606c 1355 return -1;
2d87db5b 1356 dfa->nodes = new_nodes;
02f3550c
UD
1357 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1358 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1359 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1360 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
02f3550c 1361 if (BE (new_nexts == NULL || new_indices == NULL
963d8d78 1362 || new_edests == NULL || new_eclosures == NULL, 0))
02f3550c
UD
1363 return -1;
1364 dfa->nexts = new_nexts;
1365 dfa->org_indices = new_indices;
1366 dfa->edests = new_edests;
1367 dfa->eclosures = new_eclosures;
951d6408 1368 dfa->nodes_alloc = new_nodes_alloc;
3b0bdc72
UD
1369 }
1370 dfa->nodes[dfa->nodes_len] = token;
485d775d 1371 dfa->nodes[dfa->nodes_len].constraint = 0;
02f3550c
UD
1372#ifdef RE_ENABLE_I18N
1373 dfa->nodes[dfa->nodes_len].accept_mb =
1374 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1375#endif
1376 dfa->nexts[dfa->nodes_len] = -1;
1377 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1378 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
3b0bdc72
UD
1379 return dfa->nodes_len++;
1380}
1381
2d87db5b 1382static inline unsigned int
a334319f
UD
1383calc_state_hash (nodes, context)
1384 const re_node_set *nodes;
1385 unsigned int context;
3b0bdc72
UD
1386{
1387 unsigned int hash = nodes->nelem + context;
1388 int i;
1389 for (i = 0 ; i < nodes->nelem ; i++)
1390 hash += nodes->elems[i];
1391 return hash;
1392}
1393
1394/* Search for the state whose node_set is equivalent to NODES.
1395 Return the pointer to the state, if we found it in the DFA.
a9388965
UD
1396 Otherwise create the new one and return it. In case of an error
1397 return NULL and set the error code in ERR.
1398 Note: - We assume NULL as the invalid state, then it is possible that
1b2c2628
UD
1399 return value is NULL and ERR is REG_NOERROR.
1400 - We never return non-NULL value in case of any errors, it is for
1401 optimization. */
a9388965 1402
a334319f
UD
1403static re_dfastate_t*
1404re_acquire_state (err, dfa, nodes)
1405 reg_errcode_t *err;
1406 re_dfa_t *dfa;
1407 const re_node_set *nodes;
3b0bdc72
UD
1408{
1409 unsigned int hash;
a9388965 1410 re_dfastate_t *new_state;
3b0bdc72
UD
1411 struct re_state_table_entry *spot;
1412 int i;
bc15410e 1413 if (BE (nodes->nelem == 0, 0))
a9388965
UD
1414 {
1415 *err = REG_NOERROR;
1416 return NULL;
1417 }
3b0bdc72
UD
1418 hash = calc_state_hash (nodes, 0);
1419 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1420
bc15410e 1421 for (i = 0 ; i < spot->num ; i++)
3b0bdc72 1422 {
bc15410e
UD
1423 re_dfastate_t *state = spot->array[i];
1424 if (hash != state->hash)
1b2c2628 1425 continue;
bc15410e 1426 if (re_node_set_compare (&state->nodes, nodes))
1b2c2628 1427 return state;
3b0bdc72 1428 }
3b0bdc72
UD
1429
1430 /* There are no appropriate state in the dfa, create the new one. */
a9388965 1431 new_state = create_ci_newstate (dfa, nodes, hash);
2d87db5b
UD
1432 if (BE (new_state == NULL, 0))
1433 *err = REG_ESPACE;
1434
1435 return new_state;
3b0bdc72
UD
1436}
1437
1438/* Search for the state whose node_set is equivalent to NODES and
1439 whose context is equivalent to CONTEXT.
1440 Return the pointer to the state, if we found it in the DFA.
a9388965
UD
1441 Otherwise create the new one and return it. In case of an error
1442 return NULL and set the error code in ERR.
1443 Note: - We assume NULL as the invalid state, then it is possible that
1b2c2628
UD
1444 return value is NULL and ERR is REG_NOERROR.
1445 - We never return non-NULL value in case of any errors, it is for
1446 optimization. */
a9388965 1447
a334319f
UD
1448static re_dfastate_t*
1449re_acquire_state_context (err, dfa, nodes, context)
1450 reg_errcode_t *err;
1451 re_dfa_t *dfa;
1452 const re_node_set *nodes;
1453 unsigned int context;
3b0bdc72
UD
1454{
1455 unsigned int hash;
a9388965 1456 re_dfastate_t *new_state;
3b0bdc72
UD
1457 struct re_state_table_entry *spot;
1458 int i;
1459 if (nodes->nelem == 0)
a9388965
UD
1460 {
1461 *err = REG_NOERROR;
1462 return NULL;
1463 }
3b0bdc72
UD
1464 hash = calc_state_hash (nodes, context);
1465 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1466
bc15410e 1467 for (i = 0 ; i < spot->num ; i++)
3b0bdc72 1468 {
bc15410e 1469 re_dfastate_t *state = spot->array[i];
a0a8461c
UD
1470 if (state->hash == hash
1471 && state->context == context
1472 && re_node_set_compare (state->entrance_nodes, nodes))
1b2c2628 1473 return state;
3b0bdc72 1474 }
3b0bdc72 1475 /* There are no appropriate state in `dfa', create the new one. */
a9388965 1476 new_state = create_cd_newstate (dfa, nodes, context, hash);
2d87db5b
UD
1477 if (BE (new_state == NULL, 0))
1478 *err = REG_ESPACE;
1479
1480 return new_state;
3b0bdc72
UD
1481}
1482
5cf1ec52
UD
1483/* Finish initialization of the new state NEWSTATE, and using its hash value
1484 HASH put in the appropriate bucket of DFA's state table. Return value
1485 indicates the error code if failed. */
a9388965 1486
5cf1ec52 1487static reg_errcode_t
a334319f
UD
1488register_state (dfa, newstate, hash)
1489 re_dfa_t *dfa;
1490 re_dfastate_t *newstate;
1491 unsigned int hash;
3b0bdc72 1492{
5cf1ec52 1493 struct re_state_table_entry *spot;
1b2c2628 1494 reg_errcode_t err;
5cf1ec52
UD
1495 int i;
1496
1497 newstate->hash = hash;
1498 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1b2c2628 1499 if (BE (err != REG_NOERROR, 0))
5cf1ec52
UD
1500 return REG_ESPACE;
1501 for (i = 0; i < newstate->nodes.nelem; i++)
1b2c2628 1502 {
5cf1ec52
UD
1503 int elem = newstate->nodes.elems[i];
1504 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1505 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1b2c2628 1506 }
3b0bdc72 1507
3b0bdc72 1508 spot = dfa->state_table + (hash & dfa->state_hash_mask);
951d6408 1509 if (BE (spot->alloc <= spot->num, 0))
3b0bdc72 1510 {
951d6408
UD
1511 int new_alloc = 2 * spot->num + 2;
1512 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1513 new_alloc);
1b2c2628
UD
1514 if (BE (new_array == NULL, 0))
1515 return REG_ESPACE;
1516 spot->array = new_array;
951d6408 1517 spot->alloc = new_alloc;
3b0bdc72 1518 }
bc15410e 1519 spot->array[spot->num++] = newstate;
a9388965 1520 return REG_NOERROR;
3b0bdc72
UD
1521}
1522
a9388965
UD
1523/* Create the new state which is independ of contexts.
1524 Return the new state if succeeded, otherwise return NULL. */
1525
3b0bdc72 1526static re_dfastate_t *
a334319f
UD
1527create_ci_newstate (dfa, nodes, hash)
1528 re_dfa_t *dfa;
1529 const re_node_set *nodes;
1530 unsigned int hash;
3b0bdc72
UD
1531{
1532 int i;
a9388965 1533 reg_errcode_t err;
3b0bdc72 1534 re_dfastate_t *newstate;
5cf1ec52
UD
1535
1536 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
bc15410e 1537 if (BE (newstate == NULL, 0))
a9388965 1538 return NULL;
5cf1ec52
UD
1539 err = re_node_set_init_copy (&newstate->nodes, nodes);
1540 if (BE (err != REG_NOERROR, 0))
1541 {
1542 re_free (newstate);
1543 return NULL;
1544 }
3b0bdc72 1545
5cf1ec52 1546 newstate->entrance_nodes = &newstate->nodes;
3b0bdc72
UD
1547 for (i = 0 ; i < nodes->nelem ; i++)
1548 {
1549 re_token_t *node = dfa->nodes + nodes->elems[i];
1550 re_token_type_t type = node->type;
485d775d 1551 if (type == CHARACTER && !node->constraint)
1b2c2628 1552 continue;
02f3550c
UD
1553#ifdef RE_ENABLE_I18N
1554 newstate->accept_mb |= node->accept_mb;
1555#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
1556
1557 /* If the state has the halt node, the state is a halt state. */
02f3550c 1558 if (type == END_OF_RE)
1b2c2628 1559 newstate->halt = 1;
3b0bdc72 1560 else if (type == OP_BACK_REF)
1b2c2628 1561 newstate->has_backref = 1;
485d775d 1562 else if (type == ANCHOR || node->constraint)
1b2c2628 1563 newstate->has_constraint = 1;
3b0bdc72 1564 }
a9388965 1565 err = register_state (dfa, newstate, hash);
1b2c2628
UD
1566 if (BE (err != REG_NOERROR, 0))
1567 {
1568 free_state (newstate);
1569 newstate = NULL;
1570 }
1571 return newstate;
3b0bdc72
UD
1572}
1573
a9388965
UD
1574/* Create the new state which is depend on the context CONTEXT.
1575 Return the new state if succeeded, otherwise return NULL. */
1576
3b0bdc72 1577static re_dfastate_t *
a334319f
UD
1578create_cd_newstate (dfa, nodes, context, hash)
1579 re_dfa_t *dfa;
1580 const re_node_set *nodes;
1581 unsigned int context, hash;
3b0bdc72
UD
1582{
1583 int i, nctx_nodes = 0;
a9388965 1584 reg_errcode_t err;
3b0bdc72
UD
1585 re_dfastate_t *newstate;
1586
5cf1ec52 1587 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
bc15410e 1588 if (BE (newstate == NULL, 0))
a9388965 1589 return NULL;
5cf1ec52
UD
1590 err = re_node_set_init_copy (&newstate->nodes, nodes);
1591 if (BE (err != REG_NOERROR, 0))
1592 {
1593 re_free (newstate);
1594 return NULL;
1595 }
1596
3b0bdc72
UD
1597 newstate->context = context;
1598 newstate->entrance_nodes = &newstate->nodes;
1599
1600 for (i = 0 ; i < nodes->nelem ; i++)
1601 {
1602 unsigned int constraint = 0;
1603 re_token_t *node = dfa->nodes + nodes->elems[i];
1604 re_token_type_t type = node->type;
485d775d 1605 if (node->constraint)
1b2c2628 1606 constraint = node->constraint;
3b0bdc72 1607
485d775d 1608 if (type == CHARACTER && !constraint)
1b2c2628 1609 continue;
a334319f 1610#ifdef RE_ENABLE_I18N
02f3550c 1611 newstate->accept_mb |= node->accept_mb;
a334319f 1612#endif /* RE_ENABLE_I18N */
02f3550c
UD
1613
1614 /* If the state has the halt node, the state is a halt state. */
1615 if (type == END_OF_RE)
1616 newstate->halt = 1;
3b0bdc72 1617 else if (type == OP_BACK_REF)
1b2c2628 1618 newstate->has_backref = 1;
3b0bdc72 1619 else if (type == ANCHOR)
1b2c2628 1620 constraint = node->opr.ctx_type;
3b0bdc72
UD
1621
1622 if (constraint)
1b2c2628
UD
1623 {
1624 if (newstate->entrance_nodes == &newstate->nodes)
1625 {
1626 newstate->entrance_nodes = re_malloc (re_node_set, 1);
bc15410e 1627 if (BE (newstate->entrance_nodes == NULL, 0))
1b2c2628
UD
1628 {
1629 free_state (newstate);
1630 return NULL;
1631 }
1632 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1633 nctx_nodes = 0;
1634 newstate->has_constraint = 1;
1635 }
1636
1637 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1638 {
1639 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1640 ++nctx_nodes;
1641 }
1642 }
3b0bdc72 1643 }
a9388965 1644 err = register_state (dfa, newstate, hash);
1b2c2628
UD
1645 if (BE (err != REG_NOERROR, 0))
1646 {
1647 free_state (newstate);
1648 newstate = NULL;
1649 }
1650 return newstate;
1651}
a334319f
UD
1652
1653static void
1654free_state (state)
1655 re_dfastate_t *state;
1656{
1657 re_node_set_free (&state->non_eps_nodes);
1658 re_node_set_free (&state->inveclosure);
1659 if (state->entrance_nodes != &state->nodes)
1660 {
1661 re_node_set_free (state->entrance_nodes);
1662 re_free (state->entrance_nodes);
1663 }
1664 re_node_set_free (&state->nodes);
5cf53cc2 1665 re_free (state->word_trtable);
a334319f
UD
1666 re_free (state->trtable);
1667 re_free (state);
1668}
This page took 0.348202 seconds and 5 git commands to generate.