This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCHv4a] Expected behaviour for a-z, A-Z, and 0-9 (Bug 23393).


On 07/26/2018 04:34 AM, Carlos O'Donell wrote:
On 07/25/2018 04:57 PM, Carlos O'Donell wrote:
v4
- Fixed ar_SA, km_KH, lo_LA, or_IN, sl_SI, th_TH.
- Added range checking for a-z, A-Z for all supported UTF-8 locales.

All of my testers are clean.

Attaching v4 on top of the current master.

This fixes all the locales.

I wrote another enumeration tester, this time covering all locales. It found these issues:

az_AZ: U+000069 fails to match /[a-z]/
az_AZ: U+000049 fails to match /[A-Z]/
az_AZ.utf8: U+000069 fails to match /[a-z]/
az_AZ.utf8: U+000049 fails to match /[A-Z]/
crh_UA: U+000069 fails to match /[a-z]/
crh_UA: U+000049 fails to match /[A-Z]/
crh_UA.utf8: U+000069 fails to match /[a-z]/
crh_UA.utf8: U+000049 fails to match /[A-Z]/
ku_TR: U+000069 fails to match /[a-z]/
ku_TR: U+000049 fails to match /[A-Z]/
ku_TR.iso88599: U+000069 fails to match /[a-z]/
ku_TR.iso88599: U+000049 fails to match /[A-Z]/
ku_TR.utf8: U+000069 fails to match /[a-z]/
ku_TR.utf8: U+000049 fails to match /[A-Z]/
lv_LV: U+000079 fails to match /[a-z]/
lv_LV: U+000059 fails to match /[A-Z]/
lv_LV.iso885913: U+000079 fails to match /[a-z]/
lv_LV.iso885913: U+000059 fails to match /[A-Z]/
lv_LV.utf8: U+000079 fails to match /[a-z]/
lv_LV.utf8: U+000059 fails to match /[A-Z]/
shs_CA: U+0000E6 matches /[a-z]/ unexpectedly
shs_CA: U+0000C6 matches /[A-Z]/ unexpectedly
shs_CA.utf8: U+0000E6 matches /[a-z]/ unexpectedly
shs_CA.utf8: U+0000C6 matches /[A-Z]/ unexpectedly
slovene: U+00006A fails to match /[a-z]/
slovene: U+00006B fails to match /[a-z]/
slovene: U+00006C fails to match /[a-z]/
slovene: U+00006D fails to match /[a-z]/
slovene: U+00006E fails to match /[a-z]/
slovene: U+00006F fails to match /[a-z]/
slovenian: U+00006A fails to match /[a-z]/
slovenian: U+00006B fails to match /[a-z]/
slovenian: U+00006C fails to match /[a-z]/
slovenian: U+00006D fails to match /[a-z]/
slovenian: U+00006E fails to match /[a-z]/
slovenian: U+00006F fails to match /[a-z]/
sl_SI: U+00006A fails to match /[a-z]/
sl_SI: U+00006B fails to match /[a-z]/
sl_SI: U+00006C fails to match /[a-z]/
sl_SI: U+00006D fails to match /[a-z]/
sl_SI: U+00006E fails to match /[a-z]/
sl_SI: U+00006F fails to match /[a-z]/
sl_SI.iso88592: U+00006A fails to match /[a-z]/
sl_SI.iso88592: U+00006B fails to match /[a-z]/
sl_SI.iso88592: U+00006C fails to match /[a-z]/
sl_SI.iso88592: U+00006D fails to match /[a-z]/
sl_SI.iso88592: U+00006E fails to match /[a-z]/
sl_SI.iso88592: U+00006F fails to match /[a-z]/
sl_SI.utf8: U+00006A fails to match /[a-z]/
sl_SI.utf8: U+00006B fails to match /[a-z]/
sl_SI.utf8: U+00006C fails to match /[a-z]/
sl_SI.utf8: U+00006D fails to match /[a-z]/
sl_SI.utf8: U+00006E fails to match /[a-z]/
sl_SI.utf8: U+00006F fails to match /[a-z]/
sv_FI: U+000077 fails to match /[a-z]/
sv_FI: U+000057 fails to match /[A-Z]/
sv_FI@euro: U+000077 fails to match /[a-z]/
sv_FI@euro: U+000057 fails to match /[A-Z]/
sv_FI.iso88591: U+000077 fails to match /[a-z]/
sv_FI.iso88591: U+000057 fails to match /[A-Z]/
sv_FI.iso885915@euro: U+000077 fails to match /[a-z]/
sv_FI.iso885915@euro: U+000057 fails to match /[A-Z]/
sv_FI.utf8: U+000077 fails to match /[a-z]/
sv_FI.utf8: U+000057 fails to match /[A-Z]/
sv_SE: U+000077 fails to match /[a-z]/
sv_SE: U+000057 fails to match /[A-Z]/
sv_SE.iso88591: U+000077 fails to match /[a-z]/
sv_SE.iso88591: U+000057 fails to match /[A-Z]/
sv_SE.utf8: U+000077 fails to match /[a-z]/
sv_SE.utf8: U+000057 fails to match /[A-Z]/
swedish: U+000077 fails to match /[a-z]/
swedish: U+000057 fails to match /[A-Z]/
tt_RU: U+000069 fails to match /[a-z]/
tt_RU: U+000049 fails to match /[A-Z]/
tt_RU@iqtelif: U+000069 fails to match /[a-z]/
tt_RU@iqtelif: U+000049 fails to match /[A-Z]/
tt_RU.utf8: U+000069 fails to match /[a-z]/
tt_RU.utf8: U+000049 fails to match /[A-Z]/
tt_RU.utf8@iqtelif: U+000069 fails to match /[a-z]/
tt_RU.utf8@iqtelif: U+000049 fails to match /[A-Z]/

Thanks,
Florian
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <locale.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>

#include <algorithm>
#include <string>
#include <vector>

/* Return the locale names printed by the "locale -a" command, sorted
   with std::sort.

   Each whitespace-delimited token of the command's output becomes one
   element.  The process is terminated through err/errx if the command
   cannot be started, if its output cannot be read completely, or if
   it does not finish with status 0.  */
static std::vector<std::string>
get_locales()
{
  FILE *fp = popen("locale -a", "r");
  if (fp == NULL)
    err(1, "running locale -a");

  std::vector<std::string> result;
  while (true)
    {
      /* With %ms, fscanf allocates the buffer for the token itself;
         it has to be released with free once it has been copied.  */
      char *elem = nullptr;
      if (fscanf(fp, "%ms", &elem) != 1)
        break;
      if (elem == nullptr)
        errx(1, "invalid fscanf result");
      result.emplace_back(elem);
      free(elem);
    }

  /* The loop may only end because the output was exhausted.  A read
     error, or a failed conversion that left the stream short of EOF,
     would otherwise yield a silently truncated list.  */
  if (ferror(fp) || !feof(fp))
    err(1, "fscanf failed");

  /* pclose returns -1 (with errno set) when the status could not be
     obtained; any other non-zero value is the status of the command
     itself, for which errno carries no useful information.  */
  int status = pclose(fp);
  if (status == -1)
    err(1, "waiting for locale -a");
  if (status != 0)
    errx(1, "locale -a failed with status %d", status);

  std::sort(result.begin(), result.end());
  return result;
}

/* Check which characters PATTERN matches and report every
   disagreement with RANGE on standard output.

   PATTERN is compiled as a POSIX extended regular expression.  Every
   wide character from 1 up to 0x10FFFF that wcrtomb can convert is
   turned into a NUL-terminated multi-byte string and matched against
   it.  A character is expected to match exactly when it lies in the
   inclusive interval [RANGE.first, RANGE.second]; any difference is
   printed as one line prefixed with LOCALE.

   LOCALE is only used to label the output; this function does not
   call setlocale itself.  Unexpected failures of regcomp, wcrtomb or
   regexec terminate the process through err/errx.  */
static void
test_regexp_range(const char *locale, const char *pattern,
                  std::pair<wchar_t, wchar_t> range)
{

  regex_t reg;
  {
    int ret = regcomp(&reg, pattern, REG_EXTENDED | REG_NOSUB);
    if (ret != 0)
      errx(1, "Cannot compile regular expression /%s/: %d", pattern, ret);
  }

  const wchar_t maximum_character = 0x10FFFF;
  const unsigned maximum_length = 5; /* With NUL.  */
  for (wchar_t ch = 1; ch <= maximum_character; ++ch)
    {
      /* One byte more than the longest possible multi-byte character,
         so that the NUL terminator stored below always fits.  */
      char uch[MB_LEN_MAX + 1];
      /* Start every conversion from the initial shift state.  */
      mbstate_t ps{};
      {
        size_t ret = wcrtomb(uch, ch, &ps);
        if (ret == static_cast<size_t>(-1))
          {
            /* EILSEQ: the character cannot be converted here; skip
               it rather than treating it as a failure.  */
            if (errno == EILSEQ)
              continue;
            err(1, "wcrtomb(0x%x)", static_cast<unsigned>(ch));
          }
        else if (ret == 0)
          continue;             // Some anomaly.
        if (ret >= maximum_length)
          errx(1, "multi-byte length %zu at 0x%x exceeds %u",
               ret, static_cast<unsigned>(ch), maximum_length);
        uch[ret]  = '\0';
      }
      int ret = regexec(&reg, uch, 0, NULL, 0);
      if (ret != 0 && ret != REG_NOMATCH)
        errx(1, "regexec of /%s/ failed with code %d", pattern, ret);
      bool regex_matches = ret == 0;
      bool range_matches = range.first <= ch && ch <= range.second;
      if (regex_matches != range_matches) {
        if (regex_matches)
          printf("%s: U+%06X matches /%s/ unexpectedly\n",
                 locale, static_cast<unsigned>(ch), pattern);
        else
          printf("%s: U+%06X fails to match /%s/\n",
                 locale, static_cast<unsigned>(ch), pattern);
      }
    }

  regfree(&reg);
}

/* Activate every locale returned by get_locales in turn and run the
   digit, lower-case and upper-case range checks under it.  The run
   stops with an error at the first locale that setlocale rejects.  */
int
main()
{
  /* A bracket expression together with the inclusive interval of
     characters it is expected to match.  */
  struct range_check
  {
    const char *pattern;
    wchar_t first;
    wchar_t last;
  };
  static const range_check checks[] =
    {
      { "[0-9]", L'0', L'9' },
      { "[a-z]", L'a', L'z' },
      { "[A-Z]", L'A', L'Z' },
    };

  const std::vector<std::string> names = get_locales();
  for (const std::string &name : names)
    {
      const char *const current = name.c_str();
      if (setlocale(LC_ALL, current) == NULL)
        err(1, "Cannot set locale to %s", current);
      for (const range_check &check : checks)
        test_regexp_range(current, check.pattern,
                          std::pair<wchar_t, wchar_t>(check.first,
                                                      check.last));
    }
}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]