This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.

Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Change \B to match previous regex implementation (as well as dfa.c and perl regex)


Hi!

As http://sources.redhat.com/bugzilla/show_bug.cgi?id=693
shows, the old GNU regex, as well as dfa.c and perl regex,
all implement \B as the negation of \b (although regex.texi
still documents it as the empty string inside of a word, rather
than the empty string either inside of a word or surrounded by
non-word characters).  The patch below changes \B in the current
implementation to match that behavior.

2005-01-26  Jakub Jelinek  <jakub@redhat.com>

	[BZ #693]
	* posix/regex_internal.h (DUMMY_CONSTRAINT): Rename to...
	(WORD_DELIM_CONSTRAINT): ...this.
	(NOT_WORD_DELIM_CONSTRAINT): Define.
	(re_context_type): Add INSIDE_NOTWORD and NOT_WORD_DELIM,
	change WORD_DELIM to use WORD_DELIM_CONSTRAINT.
	* posix/regcomp.c (peek_token): For \B create NOT_WORD_DELIM
	anchor instead of INSIDE_WORD.
	(parse_expression): Handle NOT_WORD_DELIM constraint.
	* posix/bug-regex19.c (tests): Adjust tests that relied on \B
	being inside word instead of not word delim.
	* posix/tst-rxspencer.c (mb_frob_pattern): Don't frob escaped
	characters.
	* posix/rxspencer/tests: Add some new tests.

--- libc/posix/regex_internal.h.jj	2005-01-08 16:50:24.000000000 +0100
+++ libc/posix/regex_internal.h	2005-01-26 14:56:20.210528592 +0100
@@ -1,5 +1,5 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
 
@@ -143,18 +143,21 @@ static inline void bitset_mask (bitset d
 #define NEXT_NEWLINE_CONSTRAINT 0x0020
 #define PREV_BEGBUF_CONSTRAINT 0x0040
 #define NEXT_ENDBUF_CONSTRAINT 0x0080
-#define DUMMY_CONSTRAINT 0x0100
+#define WORD_DELIM_CONSTRAINT 0x0100
+#define NOT_WORD_DELIM_CONSTRAINT 0x0200
 
 typedef enum
 {
   INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
   WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
   WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
+  INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
   LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
   LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
   BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
   BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
-  WORD_DELIM = DUMMY_CONSTRAINT
+  WORD_DELIM = WORD_DELIM_CONSTRAINT,
+  NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
 } re_context_type;
 
 typedef struct
--- libc/posix/regcomp.c.jj	2005-01-19 14:13:00.000000000 +0100
+++ libc/posix/regcomp.c	2005-01-26 15:01:58.715941197 +0100
@@ -1859,7 +1859,7 @@ peek_token (token, input, syntax)
 	  if (!(syntax & RE_NO_GNU_OPS))
 	    {
 	      token->type = ANCHOR;
-	      token->opr.ctx_type = INSIDE_WORD;
+	      token->opr.ctx_type = NOT_WORD_DELIM;
 	    }
 	  break;
 	case 'w':
@@ -2349,15 +2349,25 @@ parse_expression (regexp, preg, token, s
       break;
     case ANCHOR:
       if ((token->opr.ctx_type
-	   & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
+	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
 	  && dfa->word_ops_used == 0)
 	init_word_char (dfa);
-      if (token->opr.ctx_type == WORD_DELIM)
+      if (token->opr.ctx_type == WORD_DELIM
+          || token->opr.ctx_type == NOT_WORD_DELIM)
 	{
 	  bin_tree_t *tree_first, *tree_last;
-	  token->opr.ctx_type = WORD_FIRST;
-	  tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
-	  token->opr.ctx_type = WORD_LAST;
+	  if (token->opr.ctx_type == WORD_DELIM)
+	    {
+	      token->opr.ctx_type = WORD_FIRST;
+	      tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
+	      token->opr.ctx_type = WORD_LAST;
+            }
+          else
+            {
+	      token->opr.ctx_type = INSIDE_WORD;
+	      tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
+	      token->opr.ctx_type = INSIDE_NOTWORD;
+            }
 	  tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
 	  token->type = OP_ALT;
 	  tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
--- libc/posix/bug-regex19.c.jj	2003-12-22 13:38:11.000000000 +0100
+++ libc/posix/bug-regex19.c	2005-01-26 15:10:51.334648249 +0100
@@ -1,5 +1,5 @@
 /* Regular expression tests.
-   Copyright (C) 2003 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2005 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
 
@@ -170,22 +170,22 @@ static struct test_s
   {ERE, "[^k]\\B[^k]", "kBk", 0, -1},
   {ERE, "[^C]\\B[^C]", "CCCABA", 0, 3},
   {ERE, "[^C]\\B[^C]", "CBC", 0, -1},
-  {ERE, ".(\\b|\\B).", "=~AB", 0, 1},
+  {ERE, ".(\\b|\\B).", "=~AB", 0, 0},
   {ERE, ".(\\b|\\B).", "A=C", 0, 0},
   {ERE, ".(\\b|\\B).", "ABC", 0, 0},
-  {ERE, ".(\\b|\\B).", "=~\\!", 0, -1},
-  {ERE, "[^k](\\b|\\B)[^k]", "=~AB", 0, 1},
+  {ERE, ".(\\b|\\B).", "=~\\!", 0, 0},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~AB", 0, 0},
   {ERE, "[^k](\\b|\\B)[^k]", "A=C", 0, 0},
   {ERE, "[^k](\\b|\\B)[^k]", "ABC", 0, 0},
-  {ERE, "[^k](\\b|\\B)[^k]", "=~kBD", 0, 3},
-  {ERE, "[^k](\\b|\\B)[^k]", "=~\\!", 0, -1},
-  {ERE, "[^k](\\b|\\B)[^k]", "=~kB", 0, -1},
-  {ERE, "[^C](\\b|\\B)[^C]", "=~AB", 0, 1},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~kBD", 0, 0},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~\\!", 0, 0},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~kB", 0, 0},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~AB", 0, 0},
   {ERE, "[^C](\\b|\\B)[^C]", "A=C", 0, 0},
   {ERE, "[^C](\\b|\\B)[^C]", "ABC", 0, 0},
-  {ERE, "[^C](\\b|\\B)[^C]", "=~CBD", 0, 3},
-  {ERE, "[^C](\\b|\\B)[^C]", "=~\\!", 0, -1},
-  {ERE, "[^C](\\b|\\B)[^C]", "=~CB", 0, -1},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~CBD", 0, 0},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~\\!", 0, 0},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~CB", 0, 0},
   {ERE, "\\b([A]|[!]|.B)", "A=AC", 0, 0},
   {ERE, "\\b([A]|[!]|.B)", "=AC", 0, 1},
   {ERE, "\\b([A]|[!]|.B)", "!AC", 0, 1},
--- libc/posix/tst-rxspencer.c.jj	2003-12-29 15:01:28.000000000 +0100
+++ libc/posix/tst-rxspencer.c	2005-01-26 17:01:31.234511142 +0100
@@ -1,5 +1,5 @@
 /* Regular expression tests.
-   Copyright (C) 2003 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2005 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
 
@@ -127,14 +127,15 @@ mb_frob_string (const char *str, const c
 }
 
 /* Like mb_frob_string, but don't replace anything between
-   [: and :], [. and .] or [= and =].  */
+   [: and :], [. and .] or [= and =] or characters escaped
+   with a backslash.  */
 
 static char *
 mb_frob_pattern (const char *str, const char *letters)
 {
   char *ret, *dst;
   const char *src;
-  int in_class = 0;
+  int in_class = 0, escaped = 0;
 
   if (str == NULL)
     return NULL;
@@ -144,7 +145,18 @@ mb_frob_pattern (const char *str, const 
     return NULL;
 
   for (src = str, dst = ret; *src; ++src)
-    if (!in_class && strchr (letters, *src))
+    if (*src == '\\')
+      {
+	escaped ^= 1;
+	*dst++ = *src;
+      }
+    else if (escaped)
+      {
+	escaped = 0;
+	*dst++ = *src;
+	continue;
+      }
+    else if (!in_class && strchr (letters, *src))
       dst = mb_replace (dst, *src);
     else
       {
--- libc/posix/rxspencer/tests.jj	2004-11-19 13:38:46.000000000 +0100
+++ libc/posix/rxspencer/tests	2005-01-26 16:35:32.596730739 +0100
@@ -526,3 +526,12 @@ a((b+|((c)*)))+d	-	abcd	abcd	c,c,c,c
 (((\b))){0}	-	x	@x	-,-,-
 a(((.*)))b((\2)){0}c	-	abc	abc	@bc,@bc,@bc,-,-
 a(((.*)))b((\1)){0}c	-	axbc	axbc	x,x,x,-,-
+
+\b	&	SaT	@aT
+\b	&	aT	@aT
+a.*\b	&	abT	ab
+\b	&	STSS
+\B	&	abc	@bc
+\B	&	aSbTc
+\B	&	SaT	@SaT
+\B	&	aSTSb	@TSb

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]