]> sourceware.org Git - newlib-cygwin.git/commitdiff
Cygwin: glob: implement collating symbol support
authorCorinna Vinschen <corinna@vinschen.de>
Mon, 20 Feb 2023 21:50:17 +0000 (22:50 +0100)
committerCorinna Vinschen <corinna@vinschen.de>
Mon, 20 Feb 2023 21:50:17 +0000 (22:50 +0100)
Allow the [.<sym>.] expression

This requires a string comparison rather than a character
comparison.  Introduce and use __wscollate_range_cmp.

Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
winsup/cygwin/glob.cc
winsup/cygwin/local_includes/collate.h
winsup/cygwin/nlsfuncs.cc

index 4ef947929a58a75f2f61e579404ba8fec3729d48..66681786aae85da0f1e1534a89a0513199a02042 100644 (file)
@@ -160,6 +160,9 @@ typedef char Char;
 #define        M_SET           META('[')
 #define        M_NAMED         META(':')
 #define        M_EQUIV         META('=')
+#define        M_COLL(_ccnt)   META('.' | ((_ccnt) << 8))
+#define M_COLL_P(_c)   (((_c) & M_COLL_MASK) == META('.'))
+#define M_COLL_CNT(_c) (((_c) & ~M_COLL_MASK) >> 8)
 #define        ismeta(c)       (((c)&M_QUOTE) != 0)
 
 static int      compare(const void *, const void *);
@@ -528,41 +531,61 @@ glob0(const Char *pattern, glob_t *pglob, size_t *limit)
                        *bufnext++ = M_SET;
                        if (c == NOT)
                                *bufnext++ = M_NOT;
-                       c = *qpatnext;
+                       c = *qpatnext++;
                        do {
                                wint_t wclass[64];
                                Char ctype;
 
-                               ctype = check_classes_expr(qpatnext, wclass,
+                               ctype = check_classes_expr(--qpatnext, wclass,
                                                           64);
-                               if (ctype) {
+                               ++qpatnext;
+                               if (ctype == COLON) {
                                        wctype_t type;
-
-                                       if (ctype == COLON) {
-                                           char cclass[64];
-
-                                           /* No worries, char classes are
-                                              ASCII-only anyway */
-                                           wcitoascii (cclass, wclass);
-                                           if ((type = wctype (cclass))) {
-                                               *bufnext++ = M_NAMED;
-                                               *bufnext++ = CHAR (type);
-                                           }
-                                       } else if (ctype == EQUALS &&
-                                                  wclass[0] && !wclass[1]) {
+                                       char cclass[64];
+
+                                       /* No worries, char classes are
+                                          ASCII-only anyway */
+                                       wcitoascii (cclass, wclass);
+                                       if ((type = wctype (cclass))) {
+                                           *bufnext++ = M_NAMED;
+                                           *bufnext++ = CHAR (type);
+                                       }
+                                       continue;
+                               }
+                               if (ctype == EQUALS) {
+                                       if (wclass[0] && !wclass[1]) {
                                            *bufnext++ = M_EQUIV;
                                            *bufnext++ = CHAR (wclass[0]);
                                        }
-                                       /* TODO: [. is ignored yet */
-                                       qpatnext++;
                                        continue;
                                }
-                               *bufnext++ = CHAR(c);
+                               if (ctype == DOT &&
+                                   is_unicode_coll_elem (wclass)) {
+                                       *bufnext++ =
+                                           M_COLL (wcilen (wclass));
+                                       wint_t *wcp = wclass;
+                                       while ((*bufnext++ = *wcp++))
+                                           ;
+                                       --bufnext; /* drop NUL */
+                               } else
+                                       *bufnext++ = CHAR(c);
                                if (*qpatnext == RANGE &&
                                    (c = qpatnext[1]) != RBRACKET) {
                                        *bufnext++ = M_RNG;
-                                       *bufnext++ = CHAR(c);
-                                       qpatnext += 2;
+
+                                       ctype = check_classes_expr(++qpatnext,
+                                                                  wclass, 64);
+                                       if (ctype == DOT &&
+                                           is_unicode_coll_elem (wclass)) {
+                                               *bufnext++ =
+                                                   M_COLL (wcilen (wclass));
+                                               wint_t *wcp = wclass;
+                                               while ((*bufnext++ = *wcp++))
+                                                   ;
+                                               --bufnext; /* drop NUL */
+                                       } else
+                                               *bufnext++ = CHAR(c);
+                                       ++qpatnext;
                                }
                        } while ((c = *qpatnext++) != RBRACKET);
                        pglob->gl_flags |= GLOB_MAGCHAR;
@@ -849,11 +872,12 @@ static int
 match(Char *name, Char *pat, Char *patend)
 {
        int ok, negate_range;
-       Char c, k;
+       Char *c, *k;
+       size_t k_len;
 
        while (pat < patend) {
-               c = *pat++;
-               switch (c & M_MASK) {
+               c = pat++;
+               switch (*c & M_MASK) {
                case M_ALL:
                        if (pat == patend)
                                return(1);
@@ -868,36 +892,53 @@ match(Char *name, Char *pat, Char *patend)
                        break;
                case M_SET:
                        ok = 0;
-                       if ((k = *name++) == EOS)
+                       if (*(k = name) == EOS)
                                return(0);
+                       k_len = next_unicode_char (k);
+                       name += k_len;
                        if ((negate_range = ((*pat & M_MASK) == M_NOT)) != EOS)
                                ++pat;
-                       while (((c = *pat++) & M_MASK) != M_END)
-                               if ((c & M_MASK) == M_NAMED) {
-                                       if (iswctype (k, *pat++))
+                       while ((*(c = pat++) & M_MASK) != M_END) {
+                               size_t len1 = 1, len2 = 1;
+
+                               if ((*c & M_MASK) == M_NAMED) {
+                                       if (iswctype (*k, *pat++))
                                                ok = 1;
-                               } else if ((c & M_MASK) == M_EQUIV) {
-                                       if (is_unicode_equiv (k, *pat++))
+                                       continue;
+                               }
+                               if ((*c & M_MASK) == M_EQUIV) {
+                                       if (is_unicode_equiv (*k, *pat++))
                                                ok = 1;
-                               } else if ((*pat & M_MASK) == M_RNG) {
+                                       continue;
+                               }
+                               if (M_COLL_P(*c)) {
+                                       len1 = M_COLL_CNT(*c);
+                                       ++c;
+                                       pat += len1;
+                               }
+                               if ((*pat & M_MASK) == M_RNG) {
+                                       if (M_COLL_P(pat[1]))
+                                               len2 = M_COLL_CNT(*++pat);
 #ifdef __CYGWIN__
                                        if ((!__get_current_collate_locale ()->lcid) ?
 #else
                                        if (__collate_load_error ?
 #endif
-                                           CCHAR(c) <= CCHAR(k) && CCHAR(k) <= CCHAR(pat[1]) :
-                                              __wcollate_range_cmp(CCHAR(c), CCHAR(k)) <= 0
-                                           && __wcollate_range_cmp(CCHAR(k), CCHAR(pat[1])) <= 0
+                                           *c <= *k && *k <= pat[1] :
+                                              __wscollate_range_cmp(c, k, len1, k_len) <= 0
+                                           && __wscollate_range_cmp(k, pat + 1, k_len, len2) <= 0
                                           )
                                                ok = 1;
-                                       pat += 2;
-                               } else if (c == k)
+                                       pat += len2 + 1;
+                               } else if (len1 == k_len &&
+                                          wcincmp (c, k, len1) == 0)
                                        ok = 1;
+                       }
                        if (ok == negate_range)
                                return(0);
                        break;
                default:
-                       if (Cchar(*name++) != Cchar(c))
+                       if (Cchar(*name++) != Cchar(*c))
                                return(0);
                        break;
                }
index 7b4c72dd582e6f3cd0082befb0cb2766ecfa3716..498d5e1cd43157ad6f4efb996bbdacac415e0f7d 100644 (file)
@@ -14,6 +14,7 @@ extern "C" {
 extern const int __collate_load_error;
 
 extern int __wcollate_range_cmp (wint_t, wint_t);
+extern int __wscollate_range_cmp (wint_t *, wint_t *, size_t, size_t);
 
 int is_unicode_equiv (wint_t, wint_t);
 
index 20143f19d8d3b69979d8c7bc4e961b411c14dc36..eb9948dd37fc97e5b6c72bff9cc143acd94997a5 100644 (file)
@@ -1195,6 +1195,25 @@ __wcollate_range_cmp (wint_t c1, wint_t c2)
   return wcscoll (s1, s2);
 }
 
+/* Not so much BSD.  Used from glob.cc, fnmatch.c and regcomp.c.
+
+   Converts c1 (c1len elements) and c2 (c2len elements) via wcintowcs and returns
+   their wcscoll_l result; match() passes pattern and tested string in either order. */
+extern "C" int
+__wscollate_range_cmp (wint_t *c1, wint_t *c2,
+                      size_t c1len, size_t c2len)
+{
+  wchar_t s1[c1len * 2 + 1] = { 0 };   /* two wchar_t per element if all are surrogates, plus NUL */
+  wchar_t s2[c2len * 2 + 1] = { 0 };
+
+  wcintowcs (s1, c1, c1len);
+  wcintowcs (s2, c2, c2len);
+  return wcscoll_l (s1, s2, __get_current_locale ());
+}
+
+const size_t ce_size = sizeof collating_element / sizeof *collating_element;
+const size_t ce_e_size = sizeof *collating_element;
+
 /* Check if UTF-32 input character `test' is in the same equivalence class
    as UTF-32 character 'eqv'.
    Note that we only recognize input in Unicode normalization form C, that
This page took 0.038175 seconds and 5 git commands to generate.