This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[regex] BZ #522 Enable UTF-8 and ASCII optimizations outside glibc, take 2

From: Paolo Bonzini <paolo dot bonzini at lu dot unisi dot ch>
To: libc-alpha at sources dot redhat dot com, bruno at clisp dot org
Date: Mon, 08 Nov 2004 11:11:18 +0100
Subject: [regex] BZ #522 Enable UTF-8 and ASCII optimizations outside glibc, take 2

This is a simpler version of the patch I submitted on October 27. It avoids reimplementing locale_charset, because the only charset name it has to check against is UTF-8: the four spellings UTF-8, UTF8, utf-8 and utf8 should cover most OSes, and they are in any case the same ones that the bigger patch covered.

This patch avoids checking against a list of known ASCII-superset character sets, because I think it is enough to check that "btowc (x) == (wchar_t) x" for 0<=x<=127. We already have a loop executing btowc 256 times, so the cost of this additional, more robust check is small.

As I told Bruno in private mail, I'm quite reluctant to make sed slower/faster depending on the presence of a file other than /bin/sed, so using localcharset.c is not my favorite option. Bruno, are you ok with this (or do you dislike it less)?

Paolo

2004-09-08  Paolo Bonzini  <bonzini@gnu.org>

	* regcomp.c (init_dfa): Get the codeset name outside glibc as
	well.  Check if it is spelled UTF8 as well as UTF-8, and check
	case-insensitively.  Set dfa->map_notascii manually when outside
	glibc.
	* regex_internal.c (build_wcs_upper_buffer) [!_LIBC]: Enable
	optimizations based on map_notascii.
	* regex_internal.h [HAVE_LANGINFO_H || HAVE_LANGINFO_CODESET
	|| _LIBC]: Include langinfo.h.

--- orig/lib/regcomp.c
+++ mod/lib/regcomp.c
@@ -824,6 +824,7 @@ init_dfa (dfa, pat_len)
      int pat_len;
 {
   int table_size;
+  char *codeset_name;
 
   memset (dfa, '\0', sizeof (re_dfa_t));
 
@@ -847,13 +848,36 @@ init_dfa (dfa, pat_len)
   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 
   dfa->mb_cur_max = MB_CUR_MAX;
-#ifdef _LIBC
+#if defined _LIBC
+  codeset_name = _NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME);
+#elif defined HAVE_LANGINFO_CODESET
+  codeset_name = nl_langinfo (CODESET);
+#else
+  codeset_name = getenv ("LC_ALL");
+  if (codeset_name == NULL || codeset[0] == '\0')
+    codeset_name = getenv ("LC_CTYPE");
+  if (codeset_name == NULL || codeset[0] == '\0')
+    codeset_name = getenv ("LANG");
+  if (codeset_name == NULL)
+    codeset_name = "";
+  else if (strchr (codeset_name, '.')
+    codeset_name = strchr (codeset_name, '.') + 1;
+#endif
+
   if (dfa->mb_cur_max == 6
-      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
+      && (strcasecmp (codeset_name, "UTF-8") == 0
+	  || strcasecmp (codeset_name, "UTF8") == 0))
     dfa->is_utf8 = 1;
+
+#ifdef _LIBC
   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 		       != 0);
+#else
+  /* We check exhaustively in the loop below if this charset is a
+     superset of ASCII.  */
+  dfa->map_notascii = 0;
 #endif
+
 #ifdef RE_ENABLE_I18N
   if (dfa->mb_cur_max > 1)
     {
@@ -867,8 +891,15 @@ init_dfa (dfa, pat_len)
       else
 	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 	  for (j = 0; j < UINT_BITS; ++j, ++ch)
-	    if (__btowc (ch) != WEOF)
-	      dfa->sb_char[i] |= 1 << j;
+	    {
+	      wchar_t wch = __btowc (ch);
+	      if (wch != WEOF)
+	        dfa->sb_char[i] |= 1 << j;
+#ifndef _LIBC
+	      if (ch <= 127 && wch != (wchar_t) ch)
+	        dfa->map_notascii = 1;
+#endif
+	    }
     }
 #endif
 


--- orig/lib/regex_internal.c
+++ mod/lib/regex_internal.c
@@ -293,7 +293,6 @@ build_wcs_upper_buffer (pstr)
   byte_idx = pstr->valid_len;
   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 
-#ifdef _LIBC
   /* The following optimization assumes that the wchar_t encoding is
      always ISO 10646.  */
   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
@@ -368,14 +367,11 @@ build_wcs_upper_buffer (pstr)
       return REG_NOERROR;
     }
   else
-#endif
     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
       {
 	wchar_t wc;
 	const char *p;
-#ifdef _LIBC
 offsets_needed:
-#endif
 	remain_len = end_idx - byte_idx;
 	prev_st = pstr->cur_state;
 	if (BE (pstr->trans != NULL, 0))
@@ -647,7 +643,6 @@ re_string_reconstruct (pstr, idx, eflags
 	      int wcs_idx;
 	      wint_t wc = WEOF;
 
-#ifdef _LIBC
 	      if (pstr->is_utf8)
 		{
 		  const unsigned char *raw, *p, *q, *end;
@@ -687,7 +682,6 @@ re_string_reconstruct (pstr, idx, eflags
 			break;
 		      }
 		}
-#endif
 	      if (wc == WEOF)
 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 	      if (BE (pstr->valid_len, 0))


--- orig/lib/regex_internal.h
+++ mod/lib/regex_internal.h
@@ -27,6 +27,9 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
+# include <langinfo.h>
+#endif
 #if defined HAVE_LOCALE_H || defined _LIBC
 # include <locale.h>
 #endif

Follow-Ups:
- Re: [regex] BZ #522 Enable UTF-8 and ASCII optimizations outside glibc, take 2
  - From: Ulrich Drepper

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]