This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.
Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
Hi! The following patch speeds up UTF-8 handling in regex (and perhaps other MB charsets if start of mb character can be determined, though for them I haven't implemented the hooks yet). The extended tst-regex test can last several hours without this patch and finish within seconds with it. From real-world tests, e.g.: time LC_ALL=en_US.UTF-8 LD_LIBRARY_PATH=/usr/src/libc.old/obj /bin/sed 's/./x/g' /etc/termcap > /dev/null real 0m8.884s user 0m8.880s sys 0m0.010s time LC_ALL=en_US.UTF-8 LD_LIBRARY_PATH=/usr/src/libc/obj /bin/sed 's/./x/g' /etc/termcap > /dev/null real 0m3.121s user 0m3.100s sys 0m0.020s (where the only difference between those 2 libcs is this patch). 2003-11-11 Jakub Jelinek <jakub@redhat.com> * iconv/gconv.h (__gconv_prevmb_fct): New typedef. (struct __gconv_step): New field __prevmb_fct. * iconv/gconv_int.h (__gconv_prevmb_ascii): New declaration. * iconv/gconv_simple.c (BUILTIN_TRANSFORMATION): Add PrevMbFct argument. (__gconv_prevmb_ascii): New function. * iconv/gconv_builtin.h: Add PrevMbFct argument to all BUILTIN_TRANSFORMATION invocations. * iconv/gconv_conf.c (BUILTIN_TRANSFORMATION): Add PrevMbFct argument. * iconv/iconvconfig.c (BUILTIN_TRANSFORMATION): Likewise. * iconv/gconv_builtin.c (map): New field prevmb_fct. (BUILTIN_TRANSFORMATION): Add PrevMbFct argument. Use it to initialize prevmb_fct field. (__gconv_get_builtin_trans): Initialize __prevmb_fct field. * iconv/gconv_cache.c (find_module): Initialize __prevmb_fct field. * iconv/gconv_db.c (gen_steps, increment_counter): Likewise. * iconv/skeleton.c: Document FROM_PREVMB. (gconv_init): Initialize __prevmb_fct field. Undefine FROM_PREVMB at the end. * iconv/loop.c: Document PREVMB_BODY. (gconv_prevmb, FROM_PREVMB): Define if PREVMB_BODY is defined. Undefine PREVMB_BODY at the end. * posix/regex_internal.c [_LIBC]: Include wcsmbs/wcsmbsload.h and dlfcn.h. (re_string_reconstruct) [_LIBC]: Use __prevmb_fct if available. * posix/tst-regex.c (umemlen): New variable. 
(test_expr): Add expectedicase argument. Test case insensitive searches as well as backwards searches (case sensitive and insensitive) too. (run_test): Add icase argument. Use it to compute regcomp flags. (run_test_backwards): New function. (main): Cast read to size_t to avoid warning. Set umemlen. Add expectedicase arguments to test_expr. --- libc/iconv/gconv_conf.c.jj 2003-09-14 20:13:39.000000000 +0200 +++ libc/iconv/gconv_conf.c 2003-11-11 11:44:23.000000000 +0100 @@ -62,7 +62,7 @@ static const char gconv_module_ext[] = M static struct gconv_module builtin_modules[] = { #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \ - MinF, MaxF, MinT, MaxT) \ + PrevMbFct, MinF, MaxF, MinT, MaxT) \ { \ .from_string = From, \ .to_string = To, \ @@ -81,7 +81,7 @@ static struct gconv_module builtin_modul static const char *builtin_aliases[] = { #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \ - MinF, MaxF, MinT, MaxT) + PrevMbFct, MinF, MaxF, MinT, MaxT) #define BUILTIN_ALIAS(From, To) From " " To, #include "gconv_builtin.h" --- libc/iconv/gconv.h.jj 2002-12-02 22:44:26.000000000 +0100 +++ libc/iconv/gconv.h 2003-11-11 12:05:21.000000000 +0100 @@ -74,6 +74,13 @@ typedef int (*__gconv_fct) (struct __gco /* Type of a specialized conversion function for a single byte to INTERNAL. */ typedef wint_t (*__gconv_btowc_fct) (struct __gconv_step *, unsigned char); +/* Type of a specialized function to return starting byte of a multi-byte + character. Searching starts from ptr-1 backwards. If no starting byte + of a multi-byte character is found even at the byte pointed by first, + the function returns NULL. */ +typedef __const unsigned char *(*__gconv_prevmb_fct) (__const unsigned char *, + __const unsigned char *); + /* Constructor and destructor for local data for conversion step. 
*/ typedef int (*__gconv_init_fct) (struct __gconv_step *); typedef void (*__gconv_end_fct) (struct __gconv_step *); @@ -124,6 +131,7 @@ struct __gconv_step __gconv_fct __fct; __gconv_btowc_fct __btowc_fct; + __gconv_prevmb_fct __prevmb_fct; __gconv_init_fct __init_fct; __gconv_end_fct __end_fct; --- libc/iconv/gconv_builtin.c.jj 2002-12-02 22:48:08.000000000 +0100 +++ libc/iconv/gconv_builtin.c 2003-11-11 11:43:00.000000000 +0100 @@ -31,6 +31,7 @@ static struct builtin_map { const char *name; __gconv_fct fct; + __gconv_prevmb_fct prevmb_fct; __gconv_btowc_fct btowc_fct; int min_needed_from; @@ -41,11 +42,12 @@ static struct builtin_map } map[] = { #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \ - MinF, MaxF, MinT, MaxT) \ + PrevMbFct, MinF, MaxF, MinT, MaxT) \ { \ .name = Name, \ .fct = Fct, \ .btowc_fct = BtowcFct, \ + .prevmb_fct = PrevMbFct, \ \ .min_needed_from = MinF, \ .max_needed_from = MaxF, \ @@ -72,6 +74,7 @@ __gconv_get_builtin_trans (const char *n step->__fct = map[cnt].fct; step->__btowc_fct = map[cnt].btowc_fct; + step->__prevmb_fct = map[cnt].prevmb_fct; step->__init_fct = NULL; step->__end_fct = NULL; step->__shlib_handle = NULL; --- libc/iconv/gconv_int.h.jj 2003-06-11 23:33:21.000000000 +0200 +++ libc/iconv/gconv_int.h 2003-11-11 12:14:58.000000000 +0100 @@ -297,6 +297,14 @@ __BUILTIN_TRANSFORM (__gconv_transform_u only ASCII characters. */ extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c); +/* Specialized function to return starting byte of a multi-byte + character for encodings where only ASCII characters start multi-byte + sequences. Searching starts from ptr-1 backwards. If no starting byte + of a multi-byte character is found even at the byte pointed by first, + the function returns NULL. 
*/ +extern const unsigned char *__gconv_prevmb_ascii (const unsigned char *ptr, + const unsigned char *first); + #endif __END_DECLS --- libc/iconv/gconv_simple.c.jj 2003-06-11 23:36:37.000000000 +0200 +++ libc/iconv/gconv_simple.c 2003-11-11 12:14:37.000000000 +0100 @@ -32,7 +32,7 @@ #define BUILTIN_ALIAS(s1, s2) /* nothing */ #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \ - MinF, MaxF, MinT, MaxT) \ + PrevMbFct, MinF, MaxF, MinT, MaxT) \ extern int Fct (struct __gconv_step *, struct __gconv_step_data *, \ __const unsigned char **, __const unsigned char *, \ unsigned char **, size_t *, int, int); @@ -56,6 +56,22 @@ __gconv_btwoc_ascii (struct __gconv_step } +/* Specialized function to return starting byte of a multi-byte + character for encodings where only ASCII characters start multi-byte + sequences. Searching starts from ptr-1 backwards. If no starting byte + of a multi-byte character is found even at the byte pointed by first, + the function returns NULL. */ +const unsigned char * +__gconv_prevmb_ascii (const unsigned char *ptr, + const unsigned char *first) +{ + while (--ptr >= first) + if (*ptr < 0x80) + return ptr; + return NULL; +} + + /* Transform from the internal, UCS4-like format, to UCS4. The difference between the internal ucs4 format and the real UCS4 format is, if any, the endianess. 
The Unicode/ISO 10646 says that --- libc/iconv/iconvconfig.c.jj 2003-06-11 23:38:47.000000000 +0200 +++ libc/iconv/iconvconfig.c 2003-11-11 11:45:22.000000000 +0100 @@ -202,7 +202,7 @@ static struct #define BUILTIN_ALIAS(alias, real) \ { .from = alias, .to = real }, #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \ - MinF, MaxF, MinT, MaxT) + PrevMbFct, MinF, MaxF, MinT, MaxT) #include <gconv_builtin.h> }; #undef BUILTIN_ALIAS @@ -219,7 +219,7 @@ static struct { #define BUILTIN_ALIAS(alias, real) #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \ - MinF, MaxF, MinT, MaxT) \ + PrevMbFct, MinF, MaxF, MinT, MaxT) \ { .from = From, .to = To, .module = Name, .cost = Cost }, #include <gconv_builtin.h> }; --- libc/iconv/gconv_builtin.h.jj 2002-12-02 22:46:00.000000000 +0100 +++ libc/iconv/gconv_builtin.h 2003-11-11 12:15:19.000000000 +0100 @@ -30,14 +30,18 @@ BUILTIN_ALIAS ("OSF00010105//", "ISO-106 BUILTIN_ALIAS ("OSF00010106//", "ISO-10646/UCS4/") /* level 3 */ BUILTIN_TRANSFORMATION ("INTERNAL", "ISO-10646/UCS4/", 1, "=INTERNAL->ucs4", - __gconv_transform_internal_ucs4, NULL, 4, 4, 4, 4) + __gconv_transform_internal_ucs4, NULL, NULL, + 4, 4, 4, 4) BUILTIN_TRANSFORMATION ("ISO-10646/UCS4/", "INTERNAL", 1, "=ucs4->INTERNAL", - __gconv_transform_ucs4_internal, NULL, 4, 4, 4, 4) + __gconv_transform_ucs4_internal, NULL, NULL, + 4, 4, 4, 4) BUILTIN_TRANSFORMATION ("INTERNAL", "UCS-4LE//", 1, "=INTERNAL->ucs4le", - __gconv_transform_internal_ucs4le, NULL, 4, 4, 4, 4) + __gconv_transform_internal_ucs4le, NULL, NULL, + 4, 4, 4, 4) BUILTIN_TRANSFORMATION ("UCS-4LE//", "INTERNAL", 1, "=ucs4le->INTERNAL", - __gconv_transform_ucs4le_internal, NULL, 4, 4, 4, 4) + __gconv_transform_ucs4le_internal, NULL, NULL, + 4, 4, 4, 4) BUILTIN_ALIAS ("WCHAR_T//", "INTERNAL") @@ -48,11 +52,12 @@ BUILTIN_ALIAS ("OSF05010001//", "ISO-106 BUILTIN_ALIAS ("ISO-10646/UTF-8/", "ISO-10646/UTF8/") BUILTIN_TRANSFORMATION ("INTERNAL", "ISO-10646/UTF8/", 1, 
"=INTERNAL->utf8", - __gconv_transform_internal_utf8, NULL, 4, 4, 1, 6) + __gconv_transform_internal_utf8, NULL, NULL, + 4, 4, 1, 6) BUILTIN_TRANSFORMATION ("ISO-10646/UTF8/", "INTERNAL", 1, "=utf8->INTERNAL", __gconv_transform_utf8_internal, __gconv_btwoc_ascii, - 1, 6, 4, 4) + __gconv_prevmb_ascii, 1, 6, 4, 4) BUILTIN_ALIAS ("UCS2//", "ISO-10646/UCS2/") BUILTIN_ALIAS ("UCS-2//", "ISO-10646/UCS2/") @@ -61,10 +66,12 @@ BUILTIN_ALIAS ("OSF00010101//", "ISO-106 BUILTIN_ALIAS ("OSF00010102//", "ISO-10646/UCS2/") /* level 3 */ BUILTIN_TRANSFORMATION ("ISO-10646/UCS2/", "INTERNAL", 1, "=ucs2->INTERNAL", - __gconv_transform_ucs2_internal, NULL, 2, 2, 4, 4) + __gconv_transform_ucs2_internal, NULL, NULL, + 2, 2, 4, 4) BUILTIN_TRANSFORMATION ("INTERNAL", "ISO-10646/UCS2/", 1, "=INTERNAL->ucs2", - __gconv_transform_internal_ucs2, NULL, 4, 4, 2, 2) + __gconv_transform_internal_ucs2, NULL, NULL, + 4, 4, 2, 2) BUILTIN_ALIAS ("ANSI_X3.4//", "ANSI_X3.4-1968//") @@ -82,10 +89,11 @@ BUILTIN_ALIAS ("OSF00010020//", "ANSI_X3 BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "INTERNAL", 1, "=ascii->INTERNAL", __gconv_transform_ascii_internal, __gconv_btwoc_ascii, - 4, 4, 1, 1) + NULL, 4, 4, 1, 1) BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii", - __gconv_transform_internal_ascii, NULL, 4, 4, 1, 1) + __gconv_transform_internal_ascii, NULL, NULL, + 4, 4, 1, 1) #if BYTE_ORDER == BIG_ENDIAN @@ -96,12 +104,12 @@ BUILTIN_ALIAS ("UCS-2LE//", "UNICODELITT BUILTIN_TRANSFORMATION ("UNICODELITTLE//", "INTERNAL", 1, "=ucs2reverse->INTERNAL", - __gconv_transform_ucs2reverse_internal, NULL, + __gconv_transform_ucs2reverse_internal, NULL, NULL, 2, 2, 4, 4) BUILTIN_TRANSFORMATION ("INTERNAL", "UNICODELITTLE//", 1, "=INTERNAL->ucs2reverse", - __gconv_transform_internal_ucs2reverse, NULL, + __gconv_transform_internal_ucs2reverse, NULL, NULL, 4, 4, 2, 2) #else BUILTIN_ALIAS ("UNICODELITTLE//", "ISO-10646/UCS2/") @@ -111,11 +119,11 @@ BUILTIN_ALIAS ("UCS-2BE//", "UNICODEBIG/ 
BUILTIN_TRANSFORMATION ("UNICODEBIG//", "INTERNAL", 1, "=ucs2reverse->INTERNAL", - __gconv_transform_ucs2reverse_internal, NULL, + __gconv_transform_ucs2reverse_internal, NULL, NULL, 2, 2, 4, 4) BUILTIN_TRANSFORMATION ("INTERNAL", "UNICODEBIG//", 1, "=INTERNAL->ucs2reverse", - __gconv_transform_internal_ucs2reverse, NULL, + __gconv_transform_internal_ucs2reverse, NULL, NULL, 4, 4, 2, 2) #endif --- libc/iconv/skeleton.c.jj 2002-12-02 22:49:35.000000000 +0100 +++ libc/iconv/skeleton.c 2003-11-11 11:56:37.000000000 +0100 @@ -339,6 +339,9 @@ gconv_init (struct __gconv_step *step) #ifdef FROM_ONEBYTE step->__btowc_fct = FROM_ONEBYTE; #endif +#ifdef FROM_PREVMB + step->__prevmb_fct = FROM_PREVMB; +#endif } else if (__builtin_expect (strcmp (step->__to_name, CHARSET_NAME), 0) == 0) { --- libc/iconv/loop.c.jj 2003-06-11 23:38:13.000000000 +0200 +++ libc/iconv/loop.c 2003-11-11 12:05:59.000000000 +0100 @@ -46,6 +46,8 @@ ONEBYTE_BODY body of the specialized conversion function for a single byte from the current character set to INTERNAL. + PREVMB_BODY body of the specialized function for searching backwards + for start of a multi-byte character. */ #include <assert.h> @@ -471,6 +473,14 @@ gconv_btowc (struct __gconv_step *step, #endif +#ifdef PREVMB_BODY +static const unsigned char * +gconv_prevmb (const unsigned char *ptr, const unsigned char *first) + PREVMB_BODY +# define FROM_PREVMB gconv_prevmb +#endif + + /* We remove the macro definitions so that we can include this file again for the definition of another function. 
*/ #undef MIN_NEEDED_INPUT @@ -484,6 +494,7 @@ gconv_btowc (struct __gconv_step *step, #undef INIT_PARAMS #undef UPDATE_PARAMS #undef ONEBYTE_BODY +#undef PREVMB_BODY #undef UNPACK_BYTES #undef CLEAR_STATE #undef LOOP_NEED_STATE --- libc/iconv/gconv_cache.c.jj 2003-06-11 23:38:47.000000000 +0200 +++ libc/iconv/gconv_cache.c 2003-11-11 19:52:25.000000000 +0100 @@ -205,6 +205,7 @@ find_module (const char *directory, cons /* These settings can be overridden by the init function. */ result->__btowc_fct = NULL; + result->__prevmb_fct = NULL; result->__data = NULL; /* Call the init function. */ --- libc/iconv/gconv_db.c.jj 2003-06-11 23:31:59.000000000 +0200 +++ libc/iconv/gconv_db.c 2003-11-11 19:53:39.000000000 +0100 @@ -269,6 +269,7 @@ gen_steps (struct derivation_step *best, /* These settings can be overridden by the init function. */ result[step_cnt].__btowc_fct = NULL; + result[step_cnt].__prevmb_fct = NULL; /* Call the init function. */ if (result[step_cnt].__init_fct != NULL) @@ -358,6 +359,7 @@ increment_counter (struct __gconv_step * /* These settings can be overridden by the init function. */ step->__btowc_fct = NULL; + step->__prevmb_fct = NULL; } /* Call the init function. */ --- libc/posix/tst-regex.c.jj 2001-07-06 06:55:38.000000000 +0200 +++ libc/posix/tst-regex.c 2003-11-11 18:57:30.000000000 +0100 @@ -1,4 +1,4 @@ -/* Copyright (C) 2001 Free Software Foundation, Inc. +/* Copyright (C) 2001, 2003 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -44,10 +44,13 @@ static iconv_t cd; static char *mem; static char *umem; static size_t memlen; +static size_t umemlen; -static int test_expr (const char *expr, int expected); +static int test_expr (const char *expr, int expected, int expectedicase); static int run_test (const char *expr, const char *mem, size_t memlen, - int expected); + int icase, int expected); +static int run_test_backwards (const char *expr, const char *mem, + size_t memlen, int icase, int expected); int @@ -78,7 +81,7 @@ main (void) if (mem == NULL) error (EXIT_FAILURE, errno, "while allocating buffer"); - if (read (fd, mem, memlen) != memlen) + if ((size_t) read (fd, mem, memlen) != memlen) error (EXIT_FAILURE, 0, "cannot read entire file"); mem[memlen] = '\0'; @@ -102,6 +105,7 @@ main (void) outmem = umem; outlen = 2 * memlen - 1; iconv (cd, &inmem, &inlen, &outmem, &outlen); + umemlen = outmem - umem; if (inlen != 0) error (EXIT_FAILURE, errno, "cannot convert buffer"); @@ -116,11 +120,11 @@ main (void) /* Run the actual tests. All tests are run in a single-byte and a multi-byte locale. */ - result = test_expr ("[äáàâéèêíìîñöóòôüúùû]", 2); - result |= test_expr ("G.ran", 2); - result |= test_expr ("G.\\{1\\}ran", 2); - result |= test_expr ("G.*ran", 3); - result |= test_expr ("[äáàâ]", 0); + result = test_expr ("[äáàâéèêíìîñöóòôüúùû]", 2, 2); + result |= test_expr ("G.ran", 2, 3); + result |= test_expr ("G.\\{1\\}ran", 2, 3); + result |= test_expr ("G.*ran", 3, 44); + result |= test_expr ("[äáàâ]", 0, 0); /* Free the resources. 
*/ free (umem); @@ -132,7 +136,7 @@ main (void) static int -test_expr (const char *expr, int expected) +test_expr (const char *expr, int expected, int expectedicase) { int result; char *inmem; @@ -146,7 +150,14 @@ test_expr (const char *expr, int expecte error (EXIT_FAILURE, 0, "cannot set locale de_DE.ISO-8859-1"); printf ("\nTest \"%s\" with 8-bit locale\n", expr); - result = run_test (expr, mem, memlen, expected); + result = run_test (expr, mem, memlen, 0, expected); + printf ("\nTest \"%s\" with 8-bit locale, case insensitive\n", expr); + result |= run_test (expr, mem, memlen, 1, expectedicase); + printf ("\nTest \"%s\" backwards with 8-bit locale\n", expr); + result |= run_test_backwards (expr, mem, memlen, 0, expected); + printf ("\nTest \"%s\" backwards with 8-bit locale, case insensitive\n", + expr); + result |= run_test_backwards (expr, mem, memlen, 1, expectedicase); /* Second test: search with an UTF-8 locale. */ if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL) @@ -163,14 +174,22 @@ test_expr (const char *expr, int expecte /* Run the tests. 
*/ printf ("\nTest \"%s\" with multi-byte locale\n", expr); - result |= run_test (uexpr, umem, 2 * memlen - outlen, expected); + result |= run_test (uexpr, umem, umemlen, 0, expected); + printf ("\nTest \"%s\" with multi-byte locale, case insensitive\n", expr); + result |= run_test (uexpr, umem, umemlen, 1, expectedicase); + printf ("\nTest \"%s\" backwards with multi-byte locale\n", expr); + result |= run_test_backwards (uexpr, umem, umemlen, 0, expected); + printf ("\nTest \"%s\" backwards with multi-byte locale, case insensitive\n", + expr); + result |= run_test_backwards (uexpr, umem, umemlen, 1, expectedicase); return result; } static int -run_test (const char *expr, const char *mem, size_t memlen, int expected) +run_test (const char *expr, const char *mem, size_t memlen, int icase, + int expected) { #ifdef _POSIX_CPUTIME struct timespec start; @@ -186,7 +205,7 @@ run_test (const char *expr, const char * use_clock = clock_gettime (cl, &start) == 0; #endif - err = regcomp (&re, expr, REG_NEWLINE); + err = regcomp (&re, expr, REG_NEWLINE | (icase ? REG_ICASE : 0)); if (err != REG_NOERROR) { char buf[200]; @@ -257,3 +276,97 @@ run_test (const char *expr, const char * expect. */ return cnt != expected; } + + +static int +run_test_backwards (const char *expr, const char *mem, size_t memlen, + int icase, int expected) +{ +#ifdef _POSIX_CPUTIME + struct timespec start; + struct timespec finish; +#endif + struct re_pattern_buffer re; + const char *err; + size_t offset; + int cnt; + +#ifdef _POSIX_CPUTIME + if (use_clock) + use_clock = clock_gettime (cl, &start) == 0; +#endif + + re_set_syntax ((RE_SYNTAX_POSIX_BASIC & ~RE_DOT_NEWLINE) + | RE_HAT_LISTS_NOT_NEWLINE + | (icase ? 
RE_ICASE : 0)); + + memset (&re, 0, sizeof (re)); + re.fastmap = malloc (256); + if (re.fastmap == NULL) + error (EXIT_FAILURE, errno, "cannot allocate fastmap"); + + err = re_compile_pattern (expr, strlen (expr), &re); + if (err != NULL) + error (EXIT_FAILURE, 0, "cannot compile expression: %s", err); + + if (re_compile_fastmap (&re)) + error (EXIT_FAILURE, 0, "couldn't compile fastmap"); + + cnt = 0; + offset = memlen; + assert (mem[memlen] == '\0'); + while (offset <= memlen) + { + int start; + const char *sp; + const char *ep; + + start = re_search (&re, mem, memlen, offset, -offset, NULL); + if (start == -1) + break; + + if (start == -2) + error (EXIT_FAILURE, 0, "internal error in re_search"); + + sp = mem + start; + while (sp > mem && sp[-1] != '\n') + --sp; + + ep = mem + start; + while (*ep != '\0' && *ep != '\n') + ++ep; + + printf ("match %d: \"%.*s\"\n", ++cnt, (int) (ep - sp), sp); + + offset = sp - 1 - mem; + } + + regfree (&re); + +#ifdef _POSIX_CPUTIME + if (use_clock) + { + use_clock = clock_gettime (cl, &finish) == 0; + if (use_clock) + { + if (finish.tv_nsec < start.tv_nsec) + { + finish.tv_nsec -= start.tv_nsec - 1000000000; + finish.tv_sec -= 1 + start.tv_sec; + } + else + { + finish.tv_nsec -= start.tv_nsec; + finish.tv_sec -= start.tv_sec; + } + + printf ("elapsed time: %ld.%09ld sec\n", + finish.tv_sec, finish.tv_nsec); + } + } +#endif + + /* Return an error if the number of matches found is not match we + expect. */ + return cnt != expected; +} --- libc/posix/regex_internal.c.jj 2003-11-11 17:35:49.000000000 +0100 +++ libc/posix/regex_internal.c 2003-11-11 19:29:19.000000000 +0100 @@ -18,6 +18,11 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
*/ +#ifdef _LIBC +# include <wcsmbs/wcsmbsload.h> +# include <dlfcn.h> +#endif + static void re_string_construct_common (const char *str, int len, re_string_t *pstr, RE_TRANSLATE_TYPE trans, int icase); @@ -432,10 +437,42 @@ re_string_reconstruct (pstr, idx, eflags if (MB_CUR_MAX > 1) { int wcs_idx; - wint_t wc; - pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; - for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) - pstr->wcs[wcs_idx] = WEOF; + wint_t wc = WEOF; +# ifdef _LIBC + const struct gconv_fcts *fcts; + + /* Get the conversion functions. */ + fcts = get_gconv_fcts (_NL_CURRENT_DATA (LC_CTYPE)); + + if (__builtin_expect (fcts->towc_nsteps == 1, 1) + && __builtin_expect (fcts->towc->__prevmb_fct != NULL, 1)) + { + /* Use the shortcut function. */ + const char *prev, *raw; + raw = pstr->raw_mbs + pstr->raw_mbs_idx; + prev = DL_CALL_FCT (fcts->towc->__prevmb_fct, + (raw + offset, raw + pstr->valid_len)); + if (prev != NULL) + { + mbstate_t cur_state; + wchar_t wc2; + + memset (&cur_state, 0, sizeof (cur_state)); + if (mbrtowc (&wc2, prev, raw + offset - prev, &cur_state) + == raw + offset - prev) + { + memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); + wc = wc2; + } + } + } +# endif + if (wc == WEOF) + { + pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; + for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) + pstr->wcs[wcs_idx] = WEOF; + } if (pstr->trans && wc <= 0xff) wc = pstr->trans[wc]; pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD Jakub
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |