[PATCH] Regex word char speedup
Jakub Jelinek
jakub@redhat.com
Thu Jan 1 16:29:00 GMT 2004
Hi!
Below is a patch which implements one of the alternatives I talked about.
Another thing which could help could be to copy preg->syntax and
preg->re_nsub in re_search_internal into re_string_t as well
and then kill all passing of preg around in routines called (indirectly)
from re_search_internal. We could pass re_dfa_t * directly, or remove the
argument altogether depending on function. That way we'd avoid the
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
in almost every routine in regexec.c.
Also, perhaps re_string_t and re_match_context_t types could be merged,
whether by adding re_match_context_t fields to re_string_t or adding
re_string_t as first field of re_match_context_t instead of the re_string_t
pointer.
2004-01-01 Jakub Jelinek <jakub@redhat.com>
Paolo Bonzini <bonzini@gnu.org>
* posix/regex_internal.h (re_string_t): Add newline_anchor
and word_char fields.
(re_string_context_at, re_string_reconstruct): Remove last argument.
* posix/regex_internal.c (re_string_allocate): Initialize
pstr->word_char.
(re_string_context_at): Remove newline_anchor argument.
Use input->newline_anchor instead, swap && conditions.
Only use IS_WIDE_WORD_CHAR if input->word_char != NULL.
Use input->word_char bitmap instead of IS_WORD_CHAR.
(re_string_reconstruct): Likewise.
Adjust re_string_context_at caller.
* posix/regexec.c (acquire_init_state_context,
check_halt_state_context, transit_state, transit_state_sb,
transit_state_mb, transit_state_bkref, check_arrival,
check_node_accept): Adjust re_string_context_at and
re_string_reconstruct callers.
(re_search_internal): Likewise. Set input.newline_anchor.
(build_trtable): Use dfa->word_char bitmap instead of IS_WORD_CHAR.
--- libc/posix/regexec.c.jj 2004-01-01 15:44:58.000000000 +0100
+++ libc/posix/regexec.c 2004-01-01 16:07:59.000000000 +0100
@@ -1,5 +1,5 @@
/* Extended regular expression matching and search library.
- Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
@@ -616,6 +616,7 @@ re_search_internal (preg, string, length
goto free_return;
input.stop = stop;
input.raw_stop = stop;
+ input.newline_anchor = preg->newline_anchor;
err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
if (BE (err != REG_NOERROR, 0))
@@ -708,8 +709,7 @@ re_search_internal (preg, string, length
if (input.raw_mbs_idx + input.valid_raw_len <= match_first
|| match_first < input.raw_mbs_idx)
{
- err = re_string_reconstruct (&input, match_first, eflags,
- preg->newline_anchor);
+ err = re_string_reconstruct (&input, match_first, eflags);
if (BE (err != REG_NOERROR, 0))
goto free_return;
}
@@ -730,8 +730,7 @@ re_search_internal (preg, string, length
/* Reconstruct the buffers so that the matcher can assume that
the matching starts from the beginning of the buffer. */
- err = re_string_reconstruct (&input, match_first, eflags,
- preg->newline_anchor);
+ err = re_string_reconstruct (&input, match_first, eflags);
if (BE (err != REG_NOERROR, 0))
goto free_return;
#ifdef RE_ENABLE_I18N
@@ -939,8 +938,7 @@ acquire_init_state_context (err, preg, m
if (dfa->init_state->has_constraint)
{
unsigned int context;
- context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
- preg->newline_anchor);
+ context = re_string_context_at (mctx->input, idx - 1, mctx->eflags);
if (IS_WORD_CONTEXT (context))
return dfa->init_state_word;
else if (IS_ORDINARY_CONTEXT (context))
@@ -1103,8 +1101,7 @@ check_halt_state_context (preg, state, m
#ifdef DEBUG
assert (state->halt);
#endif
- context = re_string_context_at (mctx->input, idx, mctx->eflags,
- preg->newline_anchor);
+ context = re_string_context_at (mctx->input, idx, mctx->eflags);
for (i = 0; i < state->nodes.nelem; ++i)
if (check_halt_node_context (dfa, state->nodes.elems[i], context))
return state->nodes.elems[i];
@@ -2177,7 +2174,7 @@ transit_state (err, preg, mctx, state)
context
= re_string_context_at (mctx->input,
re_string_cur_idx (mctx->input) - 1,
- mctx->eflags, preg->newline_anchor);
+ mctx->eflags);
if (IS_WORD_CONTEXT (context))
next_state = trtable[ch + SBC_MAX];
else
@@ -2236,7 +2233,7 @@ transit_state (err, preg, mctx, state)
context = re_string_context_at (mctx->input,
re_string_cur_idx (mctx->input) - 1,
- mctx->eflags, preg->newline_anchor);
+ mctx->eflags);
next_state = mctx->state_log[cur_idx]
= re_acquire_state_context (err, dfa, &next_nodes, context);
/* We don't need to check errors here, since the return value of
@@ -2340,8 +2337,7 @@ transit_state_sb (err, preg, state, mctx
}
}
}
- context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
- preg->newline_anchor);
+ context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags);
next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
/* We don't need to check errors here, since the return value of
this function is next_state and ERR is already set. */
@@ -2375,7 +2371,7 @@ transit_state_mb (preg, pstate, mctx)
{
context = re_string_context_at (mctx->input,
re_string_cur_idx (mctx->input),
- mctx->eflags, preg->newline_anchor);
+ mctx->eflags);
if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
context))
continue;
@@ -2412,8 +2408,7 @@ transit_state_mb (preg, pstate, mctx)
if (BE (err != REG_NOERROR, 0))
return err;
}
- context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
- preg->newline_anchor);
+ context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags);
mctx->state_log[dest_idx]
= re_acquire_state_context (&err, dfa, &dest_nodes, context);
if (dest_state != NULL)
@@ -2451,7 +2446,7 @@ transit_state_bkref (preg, nodes, mctx)
if (node->constraint)
{
context = re_string_context_at (mctx->input, cur_str_idx,
- mctx->eflags, preg->newline_anchor);
+ mctx->eflags);
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
continue;
}
@@ -2483,7 +2478,7 @@ transit_state_bkref (preg, nodes, mctx)
dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
- bkref_ent->subexp_from);
context = re_string_context_at (mctx->input, dest_str_idx - 1,
- mctx->eflags, preg->newline_anchor);
+ mctx->eflags);
dest_state = mctx->state_log[dest_str_idx];
prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
: mctx->state_log[cur_str_idx]->nodes.nelem);
@@ -2757,8 +2752,7 @@ check_arrival (preg, mctx, path, top_nod
mctx->input->cur_idx = str_idx;
/* Setup initial node set. */
- context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
- preg->newline_anchor);
+ context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags);
if (str_idx == top_str)
{
err = re_node_set_init_1 (&next_nodes, top_node);
@@ -2844,8 +2838,7 @@ check_arrival (preg, mctx, path, top_nod
return err;
}
}
- context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
- preg->newline_anchor);
+ context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags);
cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
if (BE (cur_state == NULL && err != REG_NOERROR, 0))
{
@@ -3304,7 +3297,8 @@ out_free:
;
/* j-th destination accepts the word character ch. */
- if (IS_WORD_CHAR (ch))
+ if (BE (dfa->word_char != NULL, 0)
+ && (dfa->word_char[i] & mask))
trtable[ch] = dest_states_word[j];
else
trtable[ch] = dest_states[j];
@@ -3880,8 +3874,7 @@ check_node_accept (preg, node, mctx, idx
/* The node has constraints. Check whether the current context
satisfies the constraints. */
unsigned int context = re_string_context_at (mctx->input, idx,
- mctx->eflags,
- preg->newline_anchor);
+ mctx->eflags);
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
return 0;
}
--- libc/posix/regex_internal.h.jj 2003-12-28 00:34:45.000000000 +0100
+++ libc/posix/regex_internal.h 2004-01-01 15:53:28.000000000 +0100
@@ -1,5 +1,5 @@
/* Extended regular expression matching and search library.
- Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
@@ -337,12 +337,15 @@ struct re_string_t
unsigned int tip_context;
/* The translation passed as a part of an argument of re_compile_pattern. */
RE_TRANSLATE_TYPE trans;
+ /* Copy of re_dfa_t's word_char. */
+ re_bitset_ptr_t word_char;
/* 1 if REG_ICASE. */
unsigned char icase;
unsigned char is_utf8;
unsigned char map_notascii;
unsigned char mbs_allocated;
unsigned char offsets_needed;
+ unsigned char newline_anchor;
int mb_cur_max;
};
typedef struct re_string_t re_string_t;
@@ -368,7 +371,7 @@ static reg_errcode_t re_string_construct
int len, RE_TRANSLATE_TYPE trans,
int icase, const re_dfa_t *dfa) internal_function;
static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx,
- int eflags, int newline) internal_function;
+ int eflags) internal_function;
static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
int new_buf_len) internal_function;
# ifdef RE_ENABLE_I18N
@@ -384,7 +387,7 @@ static inline int re_string_char_size_at
static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx) internal_function;
# endif /* RE_ENABLE_I18N */
static unsigned int re_string_context_at (const re_string_t *input, int idx,
- int eflags, int newline_anchor) internal_function;
+ int eflags) internal_function;
static unsigned char re_string_peek_byte_case (const re_string_t *pstr,
int idx) internal_function;
static unsigned char re_string_fetch_byte_case (re_string_t *pstr) internal_function;
--- libc/posix/regex_internal.c.jj 2003-12-28 00:34:31.000000000 +0100
+++ libc/posix/regex_internal.c 2004-01-01 16:22:22.000000000 +0100
@@ -1,5 +1,5 @@
/* Extended regular expression matching and search library.
- Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
@@ -67,6 +67,7 @@ re_string_allocate (pstr, str, len, init
if (BE (ret != REG_NOERROR, 0))
return ret;
+ pstr->word_char = dfa->word_char;
pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
pstr->valid_raw_len = pstr->valid_len;
@@ -572,9 +573,9 @@ re_string_translate_buffer (pstr)
convert to upper case in case of REG_ICASE, apply translation. */
static reg_errcode_t
-re_string_reconstruct (pstr, idx, eflags, newline)
+re_string_reconstruct (pstr, idx, eflags)
re_string_t *pstr;
- int idx, eflags, newline;
+ int idx, eflags;
{
int offset = idx - pstr->raw_mbs_idx;
if (offset < 0)
@@ -609,8 +610,7 @@ re_string_reconstruct (pstr, idx, eflags
)
{
/* Yes, move them to the front of the buffer. */
- pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
- newline);
+ pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
#ifdef RE_ENABLE_I18N
if (pstr->mb_cur_max > 1)
memmove (pstr->wcs, pstr->wcs + offset,
@@ -695,9 +695,12 @@ re_string_reconstruct (pstr, idx, eflags
memset (pstr->mbs, 255, pstr->valid_len);
}
pstr->valid_raw_len = pstr->valid_len;
- pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
- : ((newline && IS_WIDE_NEWLINE (wc))
- ? CONTEXT_NEWLINE : 0));
+ pstr->tip_context = (BE (pstr->word_char != NULL, 0)
+ && IS_WIDE_WORD_CHAR (wc))
+ ? CONTEXT_WORD
+ : ((IS_WIDE_NEWLINE (wc)
+ && pstr->newline_anchor)
+ ? CONTEXT_NEWLINE : 0);
}
else
#endif /* RE_ENABLE_I18N */
@@ -705,9 +708,11 @@ re_string_reconstruct (pstr, idx, eflags
int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
if (pstr->trans)
c = pstr->trans[c];
- pstr->tip_context = (IS_WORD_CHAR (c) ? CONTEXT_WORD
- : ((newline && IS_NEWLINE (c))
- ? CONTEXT_NEWLINE : 0));
+ pstr->tip_context = (BE (pstr->word_char != NULL, 0)
+ && bitset_contain (pstr->word_char, c))
+ ? CONTEXT_WORD
+ : ((IS_NEWLINE (c) && pstr->newline_anchor)
+ ? CONTEXT_NEWLINE : 0);
}
}
if (!pstr->mbs_allocated)
@@ -843,9 +848,9 @@ re_string_destruct (pstr)
/* Return the context at IDX in INPUT. */
static unsigned int
-re_string_context_at (input, idx, eflags, newline_anchor)
+re_string_context_at (input, idx, eflags)
const re_string_t *input;
- int idx, eflags, newline_anchor;
+ int idx, eflags;
{
int c;
if (idx < 0 || idx == input->len)
@@ -874,17 +879,18 @@ re_string_context_at (input, idx, eflags
return input->tip_context;
}
wc = input->wcs[wc_idx];
- if (IS_WIDE_WORD_CHAR (wc))
+ if (BE (input->word_char != NULL, 0) && IS_WIDE_WORD_CHAR (wc))
return CONTEXT_WORD;
- return (newline_anchor && IS_WIDE_NEWLINE (wc)) ? CONTEXT_NEWLINE : 0;
+ return (IS_WIDE_NEWLINE (wc) && input->newline_anchor)
+ ? CONTEXT_NEWLINE : 0;
}
else
#endif
{
c = re_string_byte_at (input, idx);
- if (IS_WORD_CHAR (c))
+ if (BE (input->word_char != NULL, 0) && bitset_contain (input->word_char, c))
return CONTEXT_WORD;
- return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
+ return (IS_NEWLINE (c) && input->newline_anchor) ? CONTEXT_NEWLINE : 0;
}
}
Jakub
More information about the Libc-alpha
mailing list