This is the mail archive of the libc-hacker@sourceware.org mailing list for the glibc project.

Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[PATCH] Fix re_search with multibyte locales other than UTF-8

From: Jakub Jelinek <jakub at redhat dot com>
To: Ulrich Drepper <drepper at redhat dot com>
Cc: Glibc hackers <libc-hacker at sources dot redhat dot com>
Date: Fri, 2 Jun 2006 16:39:30 +0200
Subject: [PATCH] Fix re_search with multibyte locales other than UTF-8
Reply-to: Jakub Jelinek <jakub at redhat dot com>

Hi!

As the attached testcase shows, we have some issues with non-UTF-8 multibyte
charset handling.  One bug (the one that causes the failure) is that
re_string_reconstruct clears valid_raw_len, but re_string_skip_chars (which
has just this single caller) then uses it.  This means that we can try to
convert bytes from the middle of a multibyte character and return a non-zero
valid_len when a multibyte character starts at idx.  Another problem is that
if mbrtowc failed, wc would be undefined.  Also, if idx is only slightly past
the end of the current valid raw string, there might not be any complete
characters for re_string_skip_chars to skip over, and in that case we want to
use the context from the end of the previous valid buffer.

2006-06-02  Jakub Jelinek  <jakub@redhat.com>

	* posix/regex_internal.c (re_string_skip_chars): If no character has been
	converted at all, set *last_wc to WEOF.  If mbrtowc failed, set wc to the
	byte which couldn't be converted.
	(re_string_reconstruct): Don't clear valid_raw_len before calling
	re_string_skip_chars.  If wc is WEOF after re_string_skip_chars, set
	tip_context using re_string_context_at.
	* posix/Makefile: Add rules to build and run bug-regex25 test.
	* posix/bug-regex25.c: New test.

--- libc/posix/regex_internal.c.jj	2006-06-02 16:19:33.000000000 +0200
+++ libc/posix/regex_internal.c	2006-06-02 16:19:42.000000000 +0200
@@ -482,7 +482,7 @@ re_string_skip_chars (re_string_t *pstr,
   mbstate_t prev_st;
   int rawbuf_idx;
   size_t mbclen;
-  wchar_t wc = 0;
+  wchar_t wc = WEOF;
 
   /* Skip the characters which are not necessary to check.  */
   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
@@ -495,7 +495,11 @@ re_string_skip_chars (re_string_t *pstr,
 			remain_len, &pstr->cur_state);
       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 	{
-	  /* We treat these cases as a singlebyte character.  */
+	  /* We treat these cases as a single byte character.  */
+	  if (mbclen == 0 || remain_len == 0)
+	    wc = L'\0';
+	  else
+	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 	  mbclen = 1;
 	  pstr->cur_state = prev_st;
 	}
@@ -618,7 +622,6 @@ re_string_reconstruct (re_string_t *pstr
 	    }
 #endif
 	  pstr->valid_len = 0;
-	  pstr->valid_raw_len = 0;
 #ifdef RE_ENABLE_I18N
 	  if (pstr->mb_cur_max > 1)
 	    {
@@ -681,6 +684,16 @@ re_string_reconstruct (re_string_t *pstr
 
 	      if (wc == WEOF)
 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
+	      if (wc == WEOF)
+		pstr->tip_context
+		  = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
+	      else
+		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
+				      && IS_WIDE_WORD_CHAR (wc))
+				     ? CONTEXT_WORD
+				     : ((IS_WIDE_NEWLINE (wc)
+					 && pstr->newline_anchor)
+					? CONTEXT_NEWLINE : 0));
 	      if (BE (pstr->valid_len, 0))
 		{
 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
@@ -689,17 +702,12 @@ re_string_reconstruct (re_string_t *pstr
 		    memset (pstr->mbs, 255, pstr->valid_len);
 		}
 	      pstr->valid_raw_len = pstr->valid_len;
-	      pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
-				    && IS_WIDE_WORD_CHAR (wc))
-				   ? CONTEXT_WORD
-				   : ((IS_WIDE_NEWLINE (wc)
-				       && pstr->newline_anchor)
-				      ? CONTEXT_NEWLINE : 0));
 	    }
 	  else
 #endif /* RE_ENABLE_I18N */
 	    {
 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
+	      pstr->valid_raw_len = 0;
 	      if (pstr->trans)
 		c = pstr->trans[c];
 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
--- libc/posix/Makefile.jj	2006-05-03 21:38:02.000000000 +0200
+++ libc/posix/Makefile	2006-06-02 16:20:27.000000000 +0200
@@ -81,7 +81,7 @@ tests		:= tstgetopt testfnm runtests run
 		   bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
 		   bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
 		   bug-regex21 bug-regex22 bug-regex23 bug-regex24 \
-		   tst-nice tst-nanosleep tst-regex2 \
+		   bug-regex25 tst-nice tst-nanosleep tst-regex2 \
 		   transbug tst-rxspencer tst-pcre tst-boost \
 		   bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \
 		   tst-getaddrinfo2 bug-glob1 bug-glob2 tst-sysconf \
@@ -188,6 +188,7 @@ bug-regex19-ENV = LOCPATH=$(common-objpf
 bug-regex20-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex22-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex23-ENV = LOCPATH=$(common-objpfx)localedata
+bug-regex25-ENV = LOCPATH=$(common-objpfx)localedata
 tst-rxspencer-ARGS = --utf8 rxspencer/tests
 tst-rxspencer-ENV = LOCPATH=$(common-objpfx)localedata
 tst-pcre-ARGS = PCRE.tests
--- libc/posix/bug-regex25.c.jj	2006-06-02 16:14:35.000000000 +0200
+++ libc/posix/bug-regex25.c	2006-06-02 16:18:49.000000000 +0200
@@ -0,0 +1,57 @@
+/* Test re_search in multibyte locale other than UTF-8.
+   Copyright (C) 2006 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Jakub Jelinek <jakub@redhat.com>, 2006.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <locale.h>
+#include <regex.h>
+#include <stdio.h>
+#include <string.h>
+
+const char *str1 = "\xa3\xd8\xa3\xc9\xa3\xc9";
+const char *str2 = "\xa3\xd8\xa3\xc9";
+
+int
+main (void)
+{
+  setlocale (LC_ALL, "ja_JP.eucJP");
+
+  re_set_syntax (RE_SYNTAX_SED);
+
+  struct re_pattern_buffer re;
+  memset (&re, 0, sizeof (re));
+
+  struct re_registers regs;
+  memset (&regs, 0, sizeof (regs));
+
+  re_compile_pattern ("$", 1, &re);
+
+  int ret = 0, r = re_search (&re, str1, 4, 0, 4, &regs);
+  if (r != 4)
+    {
+      printf ("First re_search returned %d\n", r);
+      ret = 1;
+    }
+  r = re_search (&re, str2, 4, 0, 4, &regs);
+  if (r != 4)
+    {
+      printf ("Second re_search returned %d\n", r);
+      ret = 1;
+    }
+  return ret;
+}

	Jakub

Follow-Ups:
- Re: [PATCH] Fix re_search with multibyte locales other than UTF-8
  - From: Ulrich Drepper

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]