This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Kill regexp.h


<regexp.h> (not to be confused with <regex.h>) is an obsolete and
frankly horrible regular expression-matching API.  It was part of SVID
and the X/Open specification, but was withdrawn in Issue 5 of the
latter (for reference, we're on Issue 7 now).
It doesn't do anything you can't do with <regex.h>, and using it
involves defining a bunch of macros before including the header.
Moreover, the code in regexp.h that uses those macros has been buggy
since its creation (in 1996) and no one has noticed, which indicates
to me that there are no users.  (Specifically, RETURN() is used in a
whole bunch of cases where it should have been ERROR().)

Therefore, this patch stubs out the header and demotes the
implementation to compatibility symbols.  I hope this is not too late
to squeeze into glibc 2.22.  (Note: there isn't any predefined macro
to put .bss symbols in a compatibility subsection.  Using
'attribute_data_compat_section' would make the libc on disk bigger,
which seems like a move in the wrong direction.  It's only three
pointers' worth of junk, anyway...)

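The ABI checker does not appear to cover these symbols, so I manually
tested the effect of the patch as follows.  (In the readelf output,
note the change from default '@@' symbol versions to compat-only '@'
versions; the final link against the patched libc is expected to fail
with undefined references.)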

# glibc 2.19 (Debian)
$ readelf --dyn-syms /lib/x86_64-linux-gnu/libc.so.6 | grep -E 'advance|step|loc[12s]'
   540: 00000000000e4da0   104 FUNC    WEAK   DEFAULT   12 step@@GLIBC_2.2.5
  1364: 00000000000e4e10    84 FUNC    WEAK   DEFAULT   12 advance@@GLIBC_2.2.5
  2051: 00000000003a85e8     8 OBJECT  GLOBAL DEFAULT   32 loc1@@GLIBC_2.2.5
  2054: 00000000003a85f0     8 OBJECT  GLOBAL DEFAULT   32 loc2@@GLIBC_2.2.5
  2198: 00000000003a85f8     8 OBJECT  GLOBAL DEFAULT   32 locs@@GLIBC_2.2.5

# patched libc
$ readelf --dyn-syms ./libc.so.6 | grep -E 'advance|step|loc[12s]'
   541: 000000000011eb80   104 FUNC    WEAK   DEFAULT   12 step@GLIBC_2.2.5
  1374: 000000000011ebf0    84 FUNC    WEAK   DEFAULT   12 advance@GLIBC_2.2.5
  2065: 00000000003a35e0     8 OBJECT  GLOBAL DEFAULT   32 loc1@GLIBC_2.2.5
  2068: 00000000003a35e8     8 OBJECT  GLOBAL DEFAULT   32 loc2@GLIBC_2.2.5
  2211: 00000000003a35d8     8 OBJECT  GLOBAL DEFAULT   32 locs@GLIBC_2.2.5

$ cat test.c
#include <inttypes.h>

extern char *loc1;
extern char *loc2;
extern char *locs;
extern int step();
extern int advance();

int main(void)
{
  return (int)((intptr_t)step + (intptr_t)advance + (intptr_t)&locs + (intptr_t)&loc1 + (intptr_t)&loc2);
}

$ gcc-5 test.c; echo $?
0
$ ./testrun.sh ./a.out; echo $?
136
$ gcc-5 -nostdlib -nostartfiles csu/crt1.o csu/crti.o `gcc-5 --print-file-name=crtbegin.o` test.c libc.so.6 libc_nonshared.a -lgcc `gcc-5 --print-file-name=crtend.o` csu/crtn.o
/tmp/ccC6lyqC.o: In function `main':
test.c:(.text+0x5): undefined reference to `locs'
test.c:(.text+0xc): undefined reference to `step'
test.c:(.text+0x13): undefined reference to `loc2'
test.c:(.text+0x1c): undefined reference to `advance'
test.c:(.text+0x23): undefined reference to `loc1'
collect2: error: ld returned 1 exit status

---

N.B. I believe there *is* a past-and-future-changes copyright
assignment on file for me for glibc, but it was filed long, long
ago; if I need to do new paperwork, that's OK.

zw

---
2015-07-12  Zack Weinberg  <zackw@panix.com>

	* misc/regexp.h: This interface is no longer supported.
	Remove all contents, leaving only an #error directive.
	* misc/regexp.c: Do not include regexp.h.
	(loc1, loc2, locs, step, advance): Demote to compatibility symbols.

 NEWS          |   4 ++
 misc/regexp.c |  37 +++++++----
 misc/regexp.h | 207 ++--------------------------------------------------------
 3 files changed, 34 insertions(+), 214 deletions(-)

diff --git a/NEWS b/NEWS
index 2cfc43e..c00dba1 100644
--- a/NEWS
+++ b/NEWS
@@ -71,6 +71,10 @@ Version 2.22
   compliance. The new implementation fixes the following long-standing
   issues: BZ#6544, BZ#11216, BZ#12836, BZ#13151, BZ#13152, and BZ#14292. The
   old implementation is still present for use be by existing binaries.
+
+* The obsolete header <regexp.h> (from SUSv2) has been replaced with a
+  stub.  Programs that require this header must be updated to use
+  <regex.h> instead.
 
 Version 2.21
 
diff --git a/misc/regexp.c b/misc/regexp.c
index 3b83203..d35888f 100644
--- a/misc/regexp.c
+++ b/misc/regexp.c
@@ -1,4 +1,4 @@
-/* Define function and variables for the obsolete <regexp.h> interface.
+/* Compatibility symbols for the obsolete <regexp.h> interface.
    Copyright (C) 1996-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -17,24 +17,32 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#define __DO_NOT_DEFINE_COMPILE
-#include <regexp.h>
+/* regexp.h now contains only an #error directive, so it cannot be
+   used in this file.
+
+   The function that would produce an 'expbuf' to use as the second
+   argument to 'step' and 'advance' was defined only in regexp.h,
+   as its definition depended on macros defined by the user.  */
+
+#include <regex.h>
+#include <shlib-compat.h>
+
+#if SHLIB_COMPAT(libc, GLIBC_2_0, GLIBC_2_22)
 
-/* Define the variables used for the interface.  */
 char *loc1;
 char *loc2;
-
-/* Although we do not support the use we define this variable as well.  */
 char *locs;
-
+compat_symbol(libc, loc1, loc1, GLIBC_2_0);
+compat_symbol(libc, loc2, loc2, GLIBC_2_0);
+compat_symbol(libc, locs, locs, GLIBC_2_0);
 
 /* Find the next match in STRING.  The compiled regular expression is
    found in the buffer starting at EXPBUF.  `loc1' will return the
    first character matched and `loc2' points to the next unmatched
    character.  */
-extern int __step (const char *string, const char *expbuf);
 int
-__step (const char *string, const char *expbuf)
+weak_function attribute_compat_text_section
+step (const char *string, const char *expbuf)
 {
   regmatch_t match;	/* We only need info about the full match.  */
 
@@ -49,15 +57,14 @@ __step (const char *string, const char *expbuf)
   loc2 = (char *) string + match.rm_eo;
   return 1;
 }
-weak_alias (__step, step)
-
+compat_symbol (libc, step, step, GLIBC_2_0);
 
 /* Match the beginning of STRING with the compiled regular expression
    in EXPBUF.  If the match is successful `loc2' will contain the
    position of the first unmatched character.  */
-extern int __advance (const char *string, const char *expbuf);
 int
-__advance (const char *string, const char *expbuf)
+weak_function attribute_compat_text_section
+advance (const char *string, const char *expbuf)
 {
   regmatch_t match;	/* We only need info about the full match.  */
 
@@ -74,4 +81,6 @@ __advance (const char *string, const char *expbuf)
   loc2 = (char *) string + match.rm_eo;
   return 1;
 }
-weak_alias (__advance, advance)
+compat_symbol (libc, advance, advance, GLIBC_2_0);
+
+#endif /* SHLIB_COMPAT(2.0, 2.22) */
diff --git a/misc/regexp.h b/misc/regexp.h
index 3fc0bc5..43c6e10 100644
--- a/misc/regexp.h
+++ b/misc/regexp.h
@@ -19,208 +19,15 @@
 #ifndef _REGEXP_H
 #define _REGEXP_H	1
 
-/* The contents of this header file was first standardized in X/Open
+/* The contents of this header file were first standardized in X/Open
    System Interface and Headers Issue 2, originally coming from SysV.
-   In issue 4, version 2, it is marked as TO BE WITDRAWN, and it has
-   been withdrawn in SUSv3.
+   In issue 4, version 2, it was marked as TO BE WITHDRAWN, and it was
+   duly withdrawn in issue 5.
 
-   This code shouldn't be used in any newly written code.  It is
-   included only for compatibility reasons.  Use the POSIX definition
-   in <regex.h> for portable applications and a reasonable interface.  */
+   As of GNU libc 2.22, the interfaces in this header have been removed.
+   Use the <regex.h> interfaces instead. */
 
-#include <features.h>
-#include <alloca.h>
-#include <regex.h>
-#include <stdlib.h>
-#include <string.h>
-
-/* The implementation provided here emulates the needed functionality
-   by mapping to the POSIX regular expression matcher.  The interface
-   for the here included function is weird (this really is a harmless
-   word).
-
-   The user has to provide six macros before this header file can be
-   included:
-
-   INIT		Declarations vor variables which can be used by the
-		other macros.
-
-   GETC()	Return the value of the next character in the regular
-		expression pattern.  Successive calls should return
-		successive characters.
-
-   PEEKC()	Return the value of the next character in the regular
-		expression pattern.  Immediately successive calls to
-		PEEKC() should return the same character which should
-		also be the next character returned by GETC().
-
-   UNGETC(c)	Cause `c' to be returned by the next call to GETC() and
-		PEEKC().
-
-   RETURN(ptr)	Used for normal exit of the `compile' function.  `ptr'
-		is a pointer to the character after the last character of
-		the compiled regular expression.
-
-   ERROR(val)	Used for abnormal return from `compile'.  `val' is the
-		error number.  The error codes are:
-		11	Range endpoint too large.
-		16	Bad number.
-		25	\digit out of range.
-		36	Illegal or missing delimiter.
-		41	No remembered search string.
-		42	\( \) imbalance.
-		43	Too many \(.
-		44	More tan two numbers given in \{ \}.
-		45	} expected after \.
-		46	First number exceeds second in \{ \}.
-		49	[ ] imbalance.
-		50	Regular expression overflow.
-
-  */
-
-__BEGIN_DECLS
-
-/* Interface variables.  They contain the results of the successful
-   calls to `setp' and `advance'.  */
-extern char *loc1;
-extern char *loc2;
-
-/* The use of this variable in the `advance' function is not
-   supported.  */
-extern char *locs;
-
-
-#ifndef __DO_NOT_DEFINE_COMPILE
-/* Get and compile the user supplied pattern up to end of line or
-   string or until EOF is seen, whatever happens first.  The result is
-   placed in the buffer starting at EXPBUF and delimited by ENDBUF.
-
-   This function cannot be defined in the libc itself since it depends
-   on the macros.  */
-char *
-compile (char *__restrict instring, char *__restrict expbuf,
-	 const char *__restrict endbuf, int eof)
-{
-  char *__input_buffer = NULL;
-  size_t __input_size = 0;
-  size_t __current_size = 0;
-  int __ch;
-  int __error;
-  INIT
-
-  /* Align the expression buffer according to the needs for an object
-     of type `regex_t'.  Then check for minimum size of the buffer for
-     the compiled regular expression.  */
-  regex_t *__expr_ptr;
-# if defined __GNUC__ && __GNUC__ >= 2
-  const size_t __req = __alignof__ (regex_t *);
-# else
-  /* How shall we find out?  We simply guess it and can change it is
-     this really proofs to be wrong.  */
-  const size_t __req = 8;
-# endif
-  expbuf += __req;
-  expbuf -= (expbuf - ((char *) 0)) % __req;
-  if (endbuf < expbuf + sizeof (regex_t))
-    {
-      ERROR (50);
-    }
-  __expr_ptr = (regex_t *) expbuf;
-  /* The remaining space in the buffer can be used for the compiled
-     pattern.  */
-  __expr_ptr->__REPB_PREFIX (buffer) = expbuf + sizeof (regex_t);
-  __expr_ptr->__REPB_PREFIX (allocated)
-    = endbuf - (char *) __expr_ptr->__REPB_PREFIX (buffer);
-
-  while ((__ch = (GETC ())) != eof)
-    {
-      if (__ch == '\0' || __ch == '\n')
-	{
-	  UNGETC (__ch);
-	  break;
-	}
-
-      if (__current_size + 1 >= __input_size)
-	{
-	  size_t __new_size = __input_size ? 2 * __input_size : 128;
-	  char *__new_room = (char *) alloca (__new_size);
-	  /* See whether we can use the old buffer.  */
-	  if (__new_room + __new_size == __input_buffer)
-	    {
-	      __input_size += __new_size;
-	      __input_buffer = (char *) memcpy (__new_room, __input_buffer,
-					       __current_size);
-	    }
-	  else if (__input_buffer + __input_size == __new_room)
-	    __input_size += __new_size;
-	  else
-	    {
-	      __input_size = __new_size;
-	      __input_buffer = (char *) memcpy (__new_room, __input_buffer,
-						__current_size);
-	    }
-	}
-      __input_buffer[__current_size++] = __ch;
-    }
-  if (__current_size)
-    __input_buffer[__current_size++] = '\0';
-  else
-    __input_buffer = "";
-
-  /* Now compile the pattern.  */
-  __error = regcomp (__expr_ptr, __input_buffer, REG_NEWLINE);
-  if (__error != 0)
-    /* Oh well, we have to translate POSIX error codes.  */
-    switch (__error)
-      {
-      case REG_BADPAT:
-      case REG_ECOLLATE:
-      case REG_ECTYPE:
-      case REG_EESCAPE:
-      case REG_BADRPT:
-      case REG_EEND:
-      case REG_ERPAREN:
-      default:
-	/* There is no matching error code.  */
-	RETURN (36);
-      case REG_ESUBREG:
-	RETURN (25);
-      case REG_EBRACK:
-	RETURN (49);
-      case REG_EPAREN:
-	RETURN (42);
-      case REG_EBRACE:
-	RETURN (44);
-      case REG_BADBR:
-	RETURN (46);
-      case REG_ERANGE:
-	RETURN (11);
-      case REG_ESPACE:
-      case REG_ESIZE:
-	ERROR (50);
-      }
-
-  /* Everything is ok.  */
-  RETURN ((char *) (__expr_ptr->__REPB_PREFIX (buffer)
-		    + __expr_ptr->__REPB_PREFIX (used)));
-}
-#endif
-
-
-/* Find the next match in STRING.  The compiled regular expression is
-   found in the buffer starting at EXPBUF.  `loc1' will return the
-   first character matched and `loc2' points to the next unmatched
-   character.  */
-extern int step (const char *__restrict __string,
-		 const char *__restrict __expbuf) __THROW;
-
-/* Match the beginning of STRING with the compiled regular expression
-   in EXPBUF.  If the match is successful `loc2' will contain the
-   position of the first unmatched character.  */
-extern int advance (const char *__restrict __string,
-		    const char *__restrict __expbuf) __THROW;
-
-
-__END_DECLS
+#error "GNU libc no longer implements <regexp.h>."
+#error "Revise your code to use <regex.h> (no P)."
 
 #endif /* regexp.h */
-- 
2.1.4


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]