This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] fix fall-out from Jakub's patches


This patch makes regex compile again when RE_ENABLE_I18N is not defined.

WRT the optional subexpressions, the second patch fixed the segmentation violation caused by the first alone; I had simply forgotten a hunk. However, running the glibc testsuite exposed further cases which need a more complicated approach.

In particular, consider a(b|((c)*))?d. With my approach, \1 and \3 are set correctly because they are repeated, but \2 is set incorrectly because it is not inside a repeated subexpression. Avoiding all empty matches inside a repeated subexpression is blatantly wrong because of (Z())*. The right fix probably involves fiddling with the epsilon closure.

I have now embedded the glibc regex testsuite into the sed testsuite, which will help me in making my changes.

Thanks for accepting the patches,

Paolo
Index: regcomp.c
===================================================================
RCS file: /cvs/glibc/libc/posix/regcomp.c,v
retrieving revision 1.67
diff -u -p -r1.67 regcomp.c
--- regcomp.c	29 Nov 2003 03:36:20 -0000	1.67
+++ regcomp.c	15 Dec 2003 09:38:10 -0000
@@ -406,8 +409,11 @@ re_compile_fastmap_iter (bufp, init_stat
 	    }
 	}
 #endif /* RE_ENABLE_I18N */
-      else if (type == END_OF_RE || type == OP_PERIOD
-	       || type == OP_UTF8_PERIOD)
+      else if (type == OP_PERIOD
+#ifdef RE_ENABLE_I18N
+	       || type == OP_UTF8_PERIOD
+#endif /* RE_ENABLE_I18N */
+	       || type == END_OF_RE)
 	{
 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 	  if (type == END_OF_RE)
Index: regex_internal.c
===================================================================
RCS file: /cvs/glibc/libc/posix/regex_internal.c,v
retrieving revision 1.32
diff -u -p -r1.32 regex_internal.c
--- regex_internal.c	15 Dec 2003 00:45:07 -0000	1.32
+++ regex_internal.c	15 Dec 2003 09:38:11 -0000
@@ -786,7 +786,7 @@ re_string_peek_byte_case (const re_strin
 static unsigned char
 re_string_fetch_byte_case (re_string_t *pstr)
 {
-  int ch, off;
+  int ch;
 
   if (BE (!pstr->icase, 1))
     return re_string_fetch_byte (pstr);
@@ -794,6 +794,8 @@ re_string_fetch_byte_case (re_string_t *
 #ifdef RE_ENABLE_I18N
   if (pstr->offsets_needed)
     {
+      int off;
+
       /* For tr_TR.UTF-8 [[:islower:]] there is
 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 	 in that case the whole multi-byte character and return
Index: regexec.c
===================================================================
RCS file: /cvs/glibc/libc/posix/regexec.c,v
retrieving revision 1.42
diff -u -p -r1.42 regexec.c
--- regexec.c	14 Dec 2003 23:40:44 -0000	1.42
+++ regexec.c	15 Dec 2003 09:38:11 -0000
@@ -807,6 +807,7 @@ re_search_internal (preg, string, length
       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 	if (pmatch[reg_idx].rm_so != -1)
 	  {
+#ifdef RE_ENABLE_I18N
 	    if (BE (input.offsets_needed != 0, 0))
 	      {
 		if (pmatch[reg_idx].rm_so == input.valid_len)
@@ -818,6 +819,9 @@ re_search_internal (preg, string, length
 		else
 		  pmatch[reg_idx].rm_eo = input.offsets[pmatch[reg_idx].rm_eo];
 	      }
+#else
+	    assert (input.offsets_needed == 0);
+#endif
 	    pmatch[reg_idx].rm_so += match_first;
 	    pmatch[reg_idx].rm_eo += match_first;
 	  }
@@ -3365,6 +3401,7 @@ group_nodes_into_DFAstates (preg, state,
 	  if (preg->syntax & RE_DOT_NOT_NULL)
 	    bitset_clear (accepts, '\0');
 	}
+#ifdef RE_ENABLE_I18N
       else if (type == OP_UTF8_PERIOD)
         {
 	  memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
@@ -3373,6 +3410,7 @@ group_nodes_into_DFAstates (preg, state,
 	  if (preg->syntax & RE_DOT_NOT_NULL)
 	    bitset_clear (accepts, '\0');
         }
+#endif
       else
 	continue;
 
@@ -3820,10 +3858,12 @@ check_node_accept (preg, node, mctx, idx
       return node->opr.c == ch;
     case SIMPLE_BRACKET:
       return bitset_contain (node->opr.sbcset, ch);
+#ifdef RE_ENABLE_I18N
     case OP_UTF8_PERIOD:
       if (ch >= 0x80)
         return 0;
       /* FALLTHROUGH */
+#endif
     case OP_PERIOD:
       return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
 	       || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]