This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCHv4a] Expected behaviour for a-z, A-Z, and 0-9 (Bug 23393).
On 07/26/2018 04:34 AM, Carlos O'Donell wrote:
On 07/25/2018 04:57 PM, Carlos O'Donell wrote:
v4
- Fixed ar_SA, km_KH, lo_LA, or_IN, sl_SI, th_TH.
- Added range checking for a-z, A-Z for all supported UTF-8 locales.
All of my testers are clean.
Attaching v4 on top of the current master.
This fixes all the locales.
I wrote another enumeration tester, this time covering all locales. It
found these issues:
az_AZ: U+000069 fails to match /[a-z]/
az_AZ: U+000049 fails to match /[A-Z]/
az_AZ.utf8: U+000069 fails to match /[a-z]/
az_AZ.utf8: U+000049 fails to match /[A-Z]/
crh_UA: U+000069 fails to match /[a-z]/
crh_UA: U+000049 fails to match /[A-Z]/
crh_UA.utf8: U+000069 fails to match /[a-z]/
crh_UA.utf8: U+000049 fails to match /[A-Z]/
ku_TR: U+000069 fails to match /[a-z]/
ku_TR: U+000049 fails to match /[A-Z]/
ku_TR.iso88599: U+000069 fails to match /[a-z]/
ku_TR.iso88599: U+000049 fails to match /[A-Z]/
ku_TR.utf8: U+000069 fails to match /[a-z]/
ku_TR.utf8: U+000049 fails to match /[A-Z]/
lv_LV: U+000079 fails to match /[a-z]/
lv_LV: U+000059 fails to match /[A-Z]/
lv_LV.iso885913: U+000079 fails to match /[a-z]/
lv_LV.iso885913: U+000059 fails to match /[A-Z]/
lv_LV.utf8: U+000079 fails to match /[a-z]/
lv_LV.utf8: U+000059 fails to match /[A-Z]/
shs_CA: U+0000E6 matches /[a-z]/ unexpectedly
shs_CA: U+0000C6 matches /[A-Z]/ unexpectedly
shs_CA.utf8: U+0000E6 matches /[a-z]/ unexpectedly
shs_CA.utf8: U+0000C6 matches /[A-Z]/ unexpectedly
slovene: U+00006A fails to match /[a-z]/
slovene: U+00006B fails to match /[a-z]/
slovene: U+00006C fails to match /[a-z]/
slovene: U+00006D fails to match /[a-z]/
slovene: U+00006E fails to match /[a-z]/
slovene: U+00006F fails to match /[a-z]/
slovenian: U+00006A fails to match /[a-z]/
slovenian: U+00006B fails to match /[a-z]/
slovenian: U+00006C fails to match /[a-z]/
slovenian: U+00006D fails to match /[a-z]/
slovenian: U+00006E fails to match /[a-z]/
slovenian: U+00006F fails to match /[a-z]/
sl_SI: U+00006A fails to match /[a-z]/
sl_SI: U+00006B fails to match /[a-z]/
sl_SI: U+00006C fails to match /[a-z]/
sl_SI: U+00006D fails to match /[a-z]/
sl_SI: U+00006E fails to match /[a-z]/
sl_SI: U+00006F fails to match /[a-z]/
sl_SI.iso88592: U+00006A fails to match /[a-z]/
sl_SI.iso88592: U+00006B fails to match /[a-z]/
sl_SI.iso88592: U+00006C fails to match /[a-z]/
sl_SI.iso88592: U+00006D fails to match /[a-z]/
sl_SI.iso88592: U+00006E fails to match /[a-z]/
sl_SI.iso88592: U+00006F fails to match /[a-z]/
sl_SI.utf8: U+00006A fails to match /[a-z]/
sl_SI.utf8: U+00006B fails to match /[a-z]/
sl_SI.utf8: U+00006C fails to match /[a-z]/
sl_SI.utf8: U+00006D fails to match /[a-z]/
sl_SI.utf8: U+00006E fails to match /[a-z]/
sl_SI.utf8: U+00006F fails to match /[a-z]/
sv_FI: U+000077 fails to match /[a-z]/
sv_FI: U+000057 fails to match /[A-Z]/
sv_FI@euro: U+000077 fails to match /[a-z]/
sv_FI@euro: U+000057 fails to match /[A-Z]/
sv_FI.iso88591: U+000077 fails to match /[a-z]/
sv_FI.iso88591: U+000057 fails to match /[A-Z]/
sv_FI.iso885915@euro: U+000077 fails to match /[a-z]/
sv_FI.iso885915@euro: U+000057 fails to match /[A-Z]/
sv_FI.utf8: U+000077 fails to match /[a-z]/
sv_FI.utf8: U+000057 fails to match /[A-Z]/
sv_SE: U+000077 fails to match /[a-z]/
sv_SE: U+000057 fails to match /[A-Z]/
sv_SE.iso88591: U+000077 fails to match /[a-z]/
sv_SE.iso88591: U+000057 fails to match /[A-Z]/
sv_SE.utf8: U+000077 fails to match /[a-z]/
sv_SE.utf8: U+000057 fails to match /[A-Z]/
swedish: U+000077 fails to match /[a-z]/
swedish: U+000057 fails to match /[A-Z]/
tt_RU: U+000069 fails to match /[a-z]/
tt_RU: U+000049 fails to match /[A-Z]/
tt_RU@iqtelif: U+000069 fails to match /[a-z]/
tt_RU@iqtelif: U+000049 fails to match /[A-Z]/
tt_RU.utf8: U+000069 fails to match /[a-z]/
tt_RU.utf8: U+000049 fails to match /[A-Z]/
tt_RU.utf8@iqtelif: U+000069 fails to match /[a-z]/
tt_RU.utf8@iqtelif: U+000049 fails to match /[A-Z]/
Thanks,
Florian
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <locale.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <algorithm>
#include <string>
#include <vector>
/* Return the sorted list of locale names printed by "locale -a".

   The command is run through popen and its output is split into
   whitespace-delimited words, one result entry per word.  The process
   is terminated with exit status 1 (via err/errx) if the command cannot
   be started, its output cannot be read, or it does not exit with
   status zero.  */
static std::vector<std::string>
get_locales()
{
  FILE *fp = popen("locale -a", "r");
  if (fp == NULL)
    err(1, "running locale -a");
  std::vector<std::string> result;
  for (;;)
    {
      /* With the "m" modifier fscanf allocates the buffer for the word
         itself; the buffer has to be released with free.  */
      char *elem{};
      int ret = fscanf(fp, "%ms", &elem);
      if (ret != 1)
        {
          /* Only a clean end of input ends the loop normally.  Any
             other failure (stream error, or a failed conversion that
             left the EOF indicator unset) is fatal; looping on feof
             alone could spin forever in the latter case.  */
          if (ferror(fp) || !feof(fp))
            err(1, "fscanf failed");
          break;
        }
      if (elem == nullptr)
        errx(1, "invalid fscanf result");
      result.emplace_back(elem);
      free(elem);
    }
  int ret = pclose(fp);
  /* -1 means pclose itself failed and errno is meaningful.  */
  if (ret == -1)
    err(1, "locale -a failed with status %d", ret);
  /* Any other non-zero value is the wait status of the command; errno
     is unrelated here, so do not append its description.  */
  if (ret != 0)
    errx(1, "locale -a failed with status %d", ret);
  std::sort(result.begin(), result.end());
  return result;
}
/* Check that PATTERN matches exactly the single characters whose code
   lies in [RANGE.first, RANGE.second].

   PATTERN is compiled once as an extended regular expression.  Every
   wide character from 1 to 0x10FFFF is converted to a multi-byte string
   with wcrtomb and matched against it; characters that cannot be
   converted are skipped.  Each disagreement between the regexp result
   and the numeric range check is reported as one line on standard
   output, prefixed with LOCALE.  LOCALE is used for reporting only: this
   function does not call setlocale, so the caller selects the locale
   under test beforehand.

   Compilation or matching errors terminate the process with exit
   status 1.  */
static void
test_regexp_range(const char *locale, const char *pattern,
                  std::pair<wchar_t, wchar_t> range)
{
  regex_t reg;
  {
    int ret = regcomp(&reg, pattern, REG_EXTENDED | REG_NOSUB);
    if (ret != 0)
      errx(1, "Cannot compile regular expression /%s/: %d", pattern, ret);
  }
  const wchar_t maximum_character = 0x10FFFF;
  const unsigned maximum_length = 5; /* With NUL. */
  for (wchar_t ch = 1; ch <= maximum_character; ++ch)
    {
      char uch[MB_LEN_MAX];
      /* Fresh conversion state for every character.  */
      mbstate_t ps{};
      {
        size_t ret = wcrtomb(uch, ch, &ps);
        if (ret == static_cast<size_t>(-1))
          {
            /* Not representable in the current character set.  */
            if (errno == EILSEQ)
              continue;
            err(1, "wcrtomb(0x%x)", static_cast<unsigned>(ch));
          }
        else if (ret == 0)
          continue; // Some anomaly.
        if (ret >= maximum_length)
          errx(1, "multi-byte length %zu at 0x%x exceeds %u",
               ret, static_cast<unsigned>(ch), maximum_length);
        /* wcrtomb does not terminate the sequence; regexec needs a
           NUL-terminated string.  */
        uch[ret] = '\0';
      }
      int ret = regexec(&reg, uch, 0, NULL, 0);
      if (ret != 0 && ret != REG_NOMATCH)
        errx(1, "regexec of /%s/ failed with code %d", pattern, ret);
      bool regex_matches = ret == 0;
      bool range_matches = range.first <= ch && ch <= range.second;
      if (regex_matches != range_matches)
        {
          if (regex_matches)
            printf("%s: U+%06X matches /%s/ unexpectedly\n",
                   locale, static_cast<unsigned>(ch), pattern);
          else
            printf("%s: U+%06X fails to match /%s/\n",
                   locale, static_cast<unsigned>(ch), pattern);
        }
    }
  regfree(&reg);
}
/* Switch to every locale reported by get_locales in turn and run the
   digit, lower-case and upper-case range checks in each of them.  A
   locale that cannot be selected terminates the run.  */
int
main()
{
  /* One entry per bracket expression under test, in reporting order.  */
  struct range_check
  {
    const char *pattern;
    wchar_t first;
    wchar_t last;
  };
  static const range_check checks[] =
    {
      { "[0-9]", L'0', L'9' },
      { "[a-z]", L'a', L'z' },
      { "[A-Z]", L'A', L'Z' },
    };

  const std::vector<std::string> names = get_locales();
  for (const std::string &name : names)
    {
      const char *id = name.c_str();
      if (setlocale(LC_ALL, id) == NULL)
        err(1, "Cannot set locale to %s", id);
      for (const range_check &check : checks)
        test_regexp_range(id, check.pattern,
                          std::pair<wchar_t, wchar_t>(check.first,
                                                      check.last));
    }
  return 0;
}
- References:
- [PATCH] Keep expected behaviour for [a-z] and [A-z] (Bug 23393).
- Re: [PATCH] Keep expected behaviour for [a-z] and [A-z] (Bug 23393).
- Re: [PATCH] Keep expected behaviour for [a-z] and [A-z] (Bug 23393).
- Re: [PATCH] Keep expected behaviour for [a-z] and [A-z] (Bug 23393).
- Re: [PATCH] Keep expected behaviour for [a-z] and [A-z] (Bug 23393).
- Re: [PATCH] Keep expected behaviour for [a-z] and [A-z] (Bug 23393).
- [PATCHv3] Expected behaviour for a-z, A-Z, and 0-9 (Bug 23393).
- Re: [PATCHv3] Expected behaviour for a-z, A-Z, and 0-9 (Bug 23393).
- Re: [PATCHv3] Expected behaviour for a-z, A-Z, and 0-9 (Bug 23393).
- Re: [PATCHv3] Expected behaviour for a-z, A-Z, and 0-9 (Bug 23393).
- [PATCHv4] Expected behaviour for a-z, A-Z, and 0-9 (Bug 23393).
- Re: [PATCHv4a] Expected behaviour for a-z, A-Z, and 0-9 (Bug 23393).