From ac8295d23b59e34d2f7c5757ea71336eab2c9e6e Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 31 Dec 1999 22:21:25 +0000 Subject: [PATCH] (collate_output): Update. * locale/programs/ld-collate.c (collate_output): Emit correct information for collation elements. Don't write over end of array idx. * posix/regex.c: Handle also collation elements at end of range. * posix/PTESTS: Fix a few typos. --- ChangeLog | 7 +++ locale/programs/ld-collate.c | 45 ++++++++++-------- posix/PTESTS | 12 ++--- posix/ptestcases.h | 12 ++--- posix/regex.c | 91 +++++++++++++++++------------------- 5 files changed, 86 insertions(+), 81 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8c10f3a301..0192430ee2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,12 @@ 1999-12-31 Ulrich Drepper + * locale/programs/ld-collate.c (collate_output): Emit correct + information for collation elements. + Don't write over end of array idx. + * posix/regex.c: Handle also collation elements at end of range. + + * posix/PTESTS: Fix a few typos. + * posix/bits/posix2_lim.h: Remove _POSIX2_EQUIV_CLASS_MAX. I have no idea where this came from. * sysdeps/posix/sysconf.c: Remove _POSIX2_EQUIV_CLASS_MAX diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c index 8eb47d7f8e..2cbea388b2 100644 --- a/locale/programs/ld-collate.c +++ b/locale/programs/ld-collate.c @@ -91,8 +91,6 @@ struct element_t unsigned int used_in_level; struct element_list_t *weights; - /* Index in the `weight' table in the output file for the character. */ - int32_t weights_idx; /* Nonzero if this is a real character definition. */ int is_character; @@ -301,7 +299,6 @@ new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen, /* Will be allocated later. */ newp->weights = NULL; - newp->weights_idx = 0; newp->file = NULL; newp->line = 0; @@ -1809,9 +1806,6 @@ output_weight (struct obstack *pool, struct locale_collate_t *collate, obstack_grow (pool, buf, len); } - /* Remember the index. 
*/ - elem->weights_idx = retval; - return retval | ((elem->section->ruleidx & 0x7f) << 24); } @@ -1899,11 +1893,26 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, /* If we have no LC_COLLATE data emit only the number of rules as zero. */ if (collate == NULL) { + int32_t dummy = 0; + while (cnt < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE)) { - iov[2 + cnt].iov_base = (char *) ""; - iov[2 + cnt].iov_len = 0; - idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; + /* The words have to be handled specially. */ + if (cnt == _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE) + || cnt == _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS) + || cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB)) + { + iov[2 + cnt].iov_base = &dummy; + iov[2 + cnt].iov_len = sizeof (int32_t); + } + else + { + iov[2 + cnt].iov_base = (char *) ""; + iov[2 + cnt].iov_len = 0; + } + + if (cnt + 1 < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE)) + idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; ++cnt; } @@ -2453,23 +2462,20 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, elem_table[idx * 2] = hash; elem_table[idx * 2 + 1] = obstack_object_size (&extrapool); - /* Now add the index into the weights table. We know the - address is always 32bit aligned. */ - if (sizeof (int) == sizeof (int32_t)) - obstack_int_grow (&extrapool, runp->weights_idx); - else - obstack_grow (&extrapool, &runp->weights_idx, - sizeof (int32_t)); - /* The the string itself including length. */ obstack_1grow (&extrapool, namelen); obstack_grow (&extrapool, runp->name, namelen); + /* And the multibyte representation. */ + obstack_1grow (&extrapool, runp->nmbs); + obstack_grow (&extrapool, runp->mbs, runp->nmbs); + /* And align again to 32 bits. 
*/ - if ((1 + namelen) % sizeof (int32_t) != 0) + if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0) obstack_grow (&extrapool, "\0\0", (sizeof (int32_t) - - (1 + namelen) % sizeof (int32_t))); + - ((1 + namelen + 1 + runp->nmbs) + % sizeof (int32_t)))); } } @@ -2492,7 +2498,6 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB)); iov[2 + cnt].iov_len = obstack_object_size (&extrapool); iov[2 + cnt].iov_base = obstack_finish (&extrapool); - idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; ++cnt; diff --git a/posix/PTESTS b/posix/PTESTS index 3580c81a03..b017f5b3f2 100644 --- a/posix/PTESTS +++ b/posix/PTESTS @@ -115,7 +115,7 @@ 3¦3¦[][.-.]-0]¦ab0-]¦ 3¦3¦[A-[.].]c]¦ab]!¦ # GA122 --2¦-2¦[[.ch]]¦abc¦ +-2¦-2¦[[.ch.]]¦abc¦ -2¦-2¦[[.ab.][.CD.][.EF.]]¦yZabCDEFQ9¦ # GA125 2¦2¦[[=a=]b]¦Abc¦ @@ -163,12 +163,12 @@ 2¦6¦bc[d-w]xy¦abchxyz¦ # GA129 1¦1¦[a-cd-f]¦dbccde¦ --1¦-1¦[a-ce-f¦dBCCdE¦ +-1¦-1¦[a-ce-f]¦dBCCdE¦ 2¦4¦b[n-zA-M]Y¦absY9Z¦ 2¦4¦b[n-zA-M]Y¦abGY9Z¦ # GA130 3¦3¦[-xy]¦ac-¦ -2¦4¦[c[-xy]D¦ac-D+¦ +2¦4¦c[-xy]D¦ac-D+¦ 2¦2¦[--/]¦a.b¦ 2¦4¦c[--/]D¦ac.D+b¦ 2¦2¦[^-ac]¦abcde-¦ @@ -189,7 +189,7 @@ 3¦4¦[a-c][e-f]¦acbedf¦ 4¦8¦abc*XYZ¦890abXYZ#*¦ 4¦9¦abc*XYZ¦890abcXYZ#*¦ -4¦15¦abc*XYZ¦890abccccccccXYZ#*¦ +4¦15¦abc*XYZ¦890abcccccccXYZ#*¦ -1¦-1¦abc*XYZ¦890abc*XYZ#*¦ # GA132 2¦4¦\(*bc\)¦a*bc¦ @@ -267,7 +267,7 @@ 1¦1¦^a¦abc¦ -1¦-1¦^b¦abc¦ -1¦-1¦^[a-zA-Z]¦99Nine¦ -1¦4¦^[a-zA-Z]¦Nine99¦ +1¦4¦^[a-zA-Z]*¦Nine99¦ # GA145(1) 1¦2¦\(^a\)\1¦aabc¦ -1¦-1¦\(^a\)\1¦^a^abc¦ @@ -284,7 +284,7 @@ 3¦3¦a$¦cba¦ -1¦-1¦a$¦abc¦ 5¦7¦[a-z]*$¦99ZZxyz¦ --1¦-1¦[a-z]*$¦99ZZxyz99¦ +9¦9¦[a-z]*$¦99ZZxyz99¦ 3¦3¦$$¦ab$¦ -1¦-1¦$$¦$ab¦ 3¦3¦\$$¦ab$¦ diff --git a/posix/ptestcases.h b/posix/ptestcases.h index d6e099c82b..87f584d2e3 100644 --- a/posix/ptestcases.h +++ b/posix/ptestcases.h @@ -110,7 +110,7 @@ { 3, 3, "[][.-.]-0]", "ab0-]", }, { 3, 3, "[A-[.].]c]", "ab]!", }, { 0, 0, "GA122", NULL, }, - { -2, -2, "[[.ch]]", "abc", }, + { -2, -2, 
"[[.ch.]]", "abc", }, { -2, -2, "[[.ab.][.CD.][.EF.]]", "yZabCDEFQ9", }, { 0, 0, "GA125", NULL, }, { 2, 2, "[[=a=]b]", "Abc", }, @@ -158,12 +158,12 @@ { 2, 6, "bc[d-w]xy", "abchxyz", }, { 0, 0, "GA129", NULL, }, { 1, 1, "[a-cd-f]", "dbccde", }, - { -1, -1, "[a-ce-f", "dBCCdE", }, + { -1, -1, "[a-ce-f]", "dBCCdE", }, { 2, 4, "b[n-zA-M]Y", "absY9Z", }, { 2, 4, "b[n-zA-M]Y", "abGY9Z", }, { 0, 0, "GA130", NULL, }, { 3, 3, "[-xy]", "ac-", }, - { 2, 4, "[c[-xy]D", "ac-D+", }, + { 2, 4, "c[-xy]D", "ac-D+", }, { 2, 2, "[--/]", "a.b", }, { 2, 4, "c[--/]D", "ac.D+b", }, { 2, 2, "[^-ac]", "abcde-", }, @@ -184,7 +184,7 @@ { 3, 4, "[a-c][e-f]", "acbedf", }, { 4, 8, "abc*XYZ", "890abXYZ#*", }, { 4, 9, "abc*XYZ", "890abcXYZ#*", }, - { 4, 15, "abc*XYZ", "890abccccccccXYZ#*", }, + { 4, 15, "abc*XYZ", "890abcccccccXYZ#*", }, { -1, -1, "abc*XYZ", "890abc*XYZ#*", }, { 0, 0, "GA132", NULL, }, { 2, 4, "\\(*bc\\)", "a*bc", }, @@ -262,7 +262,7 @@ { 1, 1, "^a", "abc", }, { -1, -1, "^b", "abc", }, { -1, -1, "^[a-zA-Z]", "99Nine", }, - { 1, 4, "^[a-zA-Z]", "Nine99", }, + { 1, 4, "^[a-zA-Z]*", "Nine99", }, { 0, 0, "GA145(1)", NULL, }, { 1, 2, "\\(^a\\)\\1", "aabc", }, { -1, -1, "\\(^a\\)\\1", "^a^abc", }, @@ -274,7 +274,7 @@ { 3, 3, "a$", "cba", }, { -1, -1, "a$", "abc", }, { 5, 7, "[a-z]*$", "99ZZxyz", }, - { -1, -1, "[a-z]*$", "99ZZxyz99", }, + { 9, 9, "[a-z]*$", "99ZZxyz99", }, { 3, 3, "$$", "ab$", }, { -1, -1, "$$", "$ab", }, { 3, 3, "\\$$", "ab$", }, diff --git a/posix/regex.c b/posix/regex.c index a59f5d4a71..d036a7dd3a 100644 --- a/posix/regex.c +++ b/posix/regex.c @@ -1570,7 +1570,8 @@ static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p, reg_syntax_t syntax)); static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend, reg_syntax_t syntax)); -static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, +static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start, + const char **p_ptr, const char *pend, char *translate, 
reg_syntax_t syntax, @@ -2174,6 +2175,7 @@ regex_compile (pattern, size, syntax, bufp) case '[': { boolean had_char_class = false; + unsigned int range_start = 0xffffffff; if (p == pend) FREE_STACK_RETURN (REG_EBRACK); @@ -2217,6 +2219,7 @@ regex_compile (pattern, size, syntax, bufp) PATFETCH (c1); SET_LIST_BIT (c1); + range_start = c1; continue; } @@ -2241,8 +2244,10 @@ regex_compile (pattern, size, syntax, bufp) && *p != ']') { reg_errcode_t ret - = compile_range (&p, pend, translate, syntax, b); + = compile_range (range_start, &p, pend, translate, + syntax, b); if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + range_start = 0xffffffff; } else if (p[0] == '-' && p[1] != ']') @@ -2252,8 +2257,9 @@ regex_compile (pattern, size, syntax, bufp) /* Move past the `-'. */ PATFETCH (c1); - ret = compile_range (&p, pend, translate, syntax, b); + ret = compile_range (c, &p, pend, translate, syntax, b); if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + range_start = 0xffffffff; } /* See if we're at the beginning of a possible character @@ -2376,6 +2382,7 @@ regex_compile (pattern, size, syntax, bufp) PATUNFETCH; SET_LIST_BIT ('['); SET_LIST_BIT (':'); + range_start = ':'; had_char_class = false; } } @@ -2503,6 +2510,16 @@ regex_compile (pattern, size, syntax, bufp) #endif had_char_class = true; } + else + { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT ('['); + SET_LIST_BIT ('='); + range_start = '='; + had_char_class = false; + } } else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.') { @@ -2553,6 +2570,7 @@ regex_compile (pattern, size, syntax, bufp) /* Set the bit for the character. */ SET_LIST_BIT (str[0]); + range_start = ((const unsigned char *) str)[0]; } #ifdef _LIBC else @@ -2561,9 +2579,7 @@ regex_compile (pattern, size, syntax, bufp) those known to the collate implementation. First find out whether the bytes in `str' are actually from exactly one character. 
*/ - const unsigned char *weights; int32_t table_size; - const int32_t *table; const int32_t *symb_table; const unsigned char *extra; int32_t idx; @@ -2574,10 +2590,6 @@ regex_compile (pattern, size, syntax, bufp) int32_t hash; int ch; - table = (const int32_t *) - _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); - weights = (const unsigned char *) - _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); @@ -2598,17 +2610,15 @@ regex_compile (pattern, size, syntax, bufp) { /* First compare the hashing value. */ if (symb_table[2 * elem] == hash - && (c1 == extra[symb_table[2 * elem + 1] - + sizeof (int32_t)]) + && c1 == extra[symb_table[2 * elem + 1]] && memcmp (str, &extra[symb_table[2 * elem + 1] - + sizeof (int32_t) + 1], + + 1], c1) == 0) { /* Yep, this is the entry. */ - idx = *((int32_t *) - (extra - + symb_table[2 * elem + 1])); + idx = symb_table[2 * elem + 1]; + idx += 1 + extra[idx]; break; } @@ -2624,40 +2634,21 @@ regex_compile (pattern, size, syntax, bufp) class. */ PATFETCH (c); - /* Now we have to go throught the whole table - and find all characters which have the same - first level weight. + /* Now add the multibyte character(s) we found + to the accepted list. XXX Note that this is not entirely correct. we would have to match multibyte sequences but this is not possible with the current - implementation. */ - for (ch = 1; ch < 256; ++ch) - /* XXX This test would have to be changed if we - would allow matching multibyte sequences. */ - if (table[ch] > 0) - { - int32_t idx2 = table[ch]; - size_t len = weights[idx2]; - - /* Test whether the lenghts match. */ - if (weights[idx] == len) - { - /* They do. New compare the bytes of - the weight. */ - size_t cnt = 0; - - while (cnt < len - && (weights[idx + 1 + cnt] - == weights[idx2 + 1 + cnt])) - ++len; - - if (cnt == len) - /* They match. Mark the character as - acceptable. */ - SET_LIST_BIT (ch); - } - } + implementation.
Also, we have to match + collating symbols, which expand to more than + one byte, as a whole and not allow the + individual bytes. */ + c1 = extra[idx++]; + if (c1 == 1) + range_start = extra[idx]; + while (c1-- > 0) + SET_LIST_BIT (extra[idx++]); } #endif had_char_class = false; @@ -2668,7 +2659,8 @@ regex_compile (pattern, size, syntax, bufp) while (c1--) PATUNFETCH; SET_LIST_BIT ('['); - SET_LIST_BIT ('='); + SET_LIST_BIT ('.'); + range_start = '.'; had_char_class = false; } } @@ -2676,6 +2668,7 @@ regex_compile (pattern, size, syntax, bufp) { had_char_class = false; SET_LIST_BIT (c); + range_start = c; } } @@ -3425,7 +3418,8 @@ group_in_compile_stack (compile_stack, regnum) `regex_compile' itself. */ static reg_errcode_t -compile_range (p_ptr, pend, translate, syntax, b) +compile_range (range_start, p_ptr, pend, translate, syntax, b) + unsigned int range_start; const char **p_ptr, *pend; RE_TRANSLATE_TYPE translate; reg_syntax_t syntax; @@ -3434,7 +3428,7 @@ compile_range (p_ptr, pend, translate, syntax, b) unsigned this_char; const char *p = *p_ptr; - unsigned int range_start, range_end; + unsigned int range_end; if (p == pend) return REG_ERANGE; @@ -3447,7 +3441,6 @@ compile_range (p_ptr, pend, translate, syntax, b) We also want to fetch the endpoints without translating them; the appropriate translation is done in the bit-setting loop below. */ /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ - range_start = ((const unsigned char *) p)[-2]; range_end = ((const unsigned char *) p)[0]; /* Have to increment the pointer into the pattern string, so the -- 2.43.5