This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
[regex] BZ #522 Enable UTF-8 and ASCII optimizations outside glibc, take 2
- From: Paolo Bonzini <paolo dot bonzini at lu dot unisi dot ch>
- To: libc-alpha at sources dot redhat dot com, bruno at clisp dot org
- Date: Mon, 08 Nov 2004 11:11:18 +0100
- Subject: [regex] BZ #522 Enable UTF-8 and ASCII optimizations outside glibc, take 2
This is a simpler version of the patch I submitted on October 27. It
avoids reimplementing locale_charset, because the only charset name it
has to check against is UTF-8: the four spellings UTF-8, UTF8, utf-8
and utf8 should cover most OSes, and in any case they are the same ones
that the larger patch covered.
This patch also avoids checking against a list of known ASCII-superset
character sets, because I think it is enough to check that "btowc (x) ==
(wchar_t) x" for 0 <= x <= 127. We already have a loop that calls btowc
256 times, so the cost of this additional, more robust check is small.
As I told Bruno in private mail, I'm quite reluctant to make sed
slower or faster depending on the presence of a file other than /bin/sed,
so using localcharset.c is not my preferred option. Bruno, are you OK
with this (or do you at least dislike it less)?
Paolo
2004-09-08 Paolo Bonzini <bonzini@gnu.org>
* regcomp.c (init_dfa): Get the codeset name outside glibc as
well. Check if it is spelled UTF8 as well as UTF-8, and check
case-insensitively. Set dfa->map_notascii manually when outside
glibc.
* regex_internal.c (build_wcs_upper_buffer) [!_LIBC]: Enable
optimizations based on map_notascii.
* regex_internal.h [HAVE_LANGINFO_H || HAVE_LANGINFO_CODESET
|| _LIBC]: Include langinfo.h.
--- orig/lib/regcomp.c
+++ mod/lib/regcomp.c
@@ -824,6 +824,7 @@ init_dfa (dfa, pat_len)
int pat_len;
{
int table_size;
+ char *codeset_name;
memset (dfa, '\0', sizeof (re_dfa_t));
@@ -847,13 +848,36 @@ init_dfa (dfa, pat_len)
dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
dfa->mb_cur_max = MB_CUR_MAX;
-#ifdef _LIBC
+#if defined _LIBC
+ codeset_name = _NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME);
+#elif defined HAVE_LANGINFO_CODESET
+ codeset_name = nl_langinfo (CODESET);
+#else
+ codeset_name = getenv ("LC_ALL");
+ if (codeset_name == NULL || codeset[0] == '\0')
+ codeset_name = getenv ("LC_CTYPE");
+ if (codeset_name == NULL || codeset[0] == '\0')
+ codeset_name = getenv ("LANG");
+ if (codeset_name == NULL)
+ codeset_name = "";
+ else if (strchr (codeset_name, '.')
+ codeset_name = strchr (codeset_name, '.') + 1;
+#endif
+
if (dfa->mb_cur_max == 6
- && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
+ && (strcasecmp (codeset_name, "UTF-8") == 0
+ || strcasecmp (codeset_name, "UTF8") == 0))
dfa->is_utf8 = 1;
+
+#ifdef _LIBC
dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
!= 0);
+#else
+ /* We check exhaustively in the loop below if this charset is a
+ superset of ASCII. */
+ dfa->map_notascii = 0;
#endif
+
#ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max > 1)
{
@@ -867,8 +891,15 @@ init_dfa (dfa, pat_len)
else
for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
for (j = 0; j < UINT_BITS; ++j, ++ch)
- if (__btowc (ch) != WEOF)
- dfa->sb_char[i] |= 1 << j;
+ {
+ wchar_t wch = __btowc (ch);
+ if (wch != WEOF)
+ dfa->sb_char[i] |= 1 << j;
+#ifndef _LIBC
+ if (ch <= 127 && wch != (wchar_t) ch)
+ dfa->map_notascii = 1;
+#endif
+ }
}
#endif
--- orig/lib/regex_internal.c
+++ mod/lib/regex_internal.c
@@ -293,7 +293,6 @@ build_wcs_upper_buffer (pstr)
byte_idx = pstr->valid_len;
end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
-#ifdef _LIBC
/* The following optimization assumes that the wchar_t encoding is
always ISO 10646. */
if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
@@ -368,14 +367,11 @@ build_wcs_upper_buffer (pstr)
return REG_NOERROR;
}
else
-#endif
for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
{
wchar_t wc;
const char *p;
-#ifdef _LIBC
offsets_needed:
-#endif
remain_len = end_idx - byte_idx;
prev_st = pstr->cur_state;
if (BE (pstr->trans != NULL, 0))
@@ -647,7 +643,6 @@ re_string_reconstruct (pstr, idx, eflags
int wcs_idx;
wint_t wc = WEOF;
-#ifdef _LIBC
if (pstr->is_utf8)
{
const unsigned char *raw, *p, *q, *end;
@@ -687,7 +682,6 @@ re_string_reconstruct (pstr, idx, eflags
break;
}
}
-#endif
if (wc == WEOF)
pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
if (BE (pstr->valid_len, 0))
--- orig/lib/regex_internal.h
+++ mod/lib/regex_internal.h
@@ -27,6 +27,9 @@
#include <stdlib.h>
#include <string.h>
+#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
+# include <langinfo.h>
+#endif
#if defined HAVE_LOCALE_H || defined _LIBC
# include <locale.h>
#endif