This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
collation bugs involving UNDEFINED in glibc (cvs 2 days ago), with illustrative app and commented output
- From: mjn3 at codepoet dot org (Manuel Novoa III)
- To: libc-alpha at sources dot redhat dot com
- Cc: Ulrich Drepper <drepper at redhat dot com>
- Date: Fri, 12 Sep 2003 10:16:35 -0600
- Subject: collation bugs involving UNDEFINED in glibc (cvs 2 days ago), with illustrative app and commented output
Hello,
The attached app illustrates several problems in glibc's handling
of implicit/explicit UNDEFINED entries in collation. The problem(s)
may lie in the collation code, in localedef, or both. See the
annotations in the comment header for details.
Manuel
/* Illustration of glibc collation bugs involving UNDEFINED.
Annotated output for glibc -- cvs Sep 10 14:37 CDT
Test strings:
0: empty string
1: <UFFFD>
2: <U0061>
3: <U0061><U0062>
4: <U0061><UFFFD>
5: <UFFFD>
6: <U0030><UFFFD>
7: <UFFFD><U0030>
8: <U05C3><UFFFD>
9: <UFFFD><U05C3>
10: <U0001><UFFFD>
11: <UFFFD><U0001>
12: <U009F><UFFFD>
13: <UFFFD><U009F>
14: <U0001>
15: <UFFFD>
locale: LC_CTYPE=th_TH.UTF-8 LC_COLLATE=th_TH.UTF-8
explicit "UNDEFINED IGNORE;IGNORE;IGNORE;IGNORE"
order_start forward;forward;forward;forward
wcscoll((0),(1)) = -1 --- wscxfrm gives
0:
1: 1 1 7
wcscoll((2),(3)) = -1 --- wscxfrm gives
2: 40 1 2 1 2 1 5
3: 40 41 1 2 2 1 2 2 1 5 5
wcscoll((4),(5)) = 1 --- wscxfrm gives
4: 40 1 2 1 2 7 1 5
5: 1 1 7
wcscoll((6),(7)) = -5 --- wscxfrm gives
6: 36 1 2 1 2 7 1 2
7: 36 1 2 1 7 2 1 2
wcscoll((8),(9)) = 0 --- wscxfrm gives
8: 1 1 7 7
9: 1 1 7 7
wcscoll((10),(11)) = 0 --- wscxfrm gives
10: 1 1 7 7
11: 1 1 7 7
wcscoll((12),(13)) = 0 --- wscxfrm gives
12: 1 1 7 7
13: 1 1 7 7
wcscoll((14),(15)) = 0 --- wscxfrm gives
14: 1 1 7
15: 1 1 7
*******************************************************************************
BUG: (th_TH.UTF-8)
Since <UFFFD> is undefined, it should be ignored at all weight levels
in this locale as per the explicit definition of UNDEFINED. Also, position
is not specified for any weight level. So with respect to collation,
lines (0) and (1) should compare as equivalent. The same is true for
lines (6) and (7).
*******************************************************************************
locale: LC_CTYPE=ja_JP.UTF-8 LC_COLLATE=ja_JP.UTF-8
explicit UNDEFINED entry as last entry
order_start forward
wcscoll((0),(1)) = -1 --- wscxfrm gives
0:
1: 2
wcscoll((2),(3)) = -1 --- wscxfrm gives
2: 63
3: 63 64
wcscoll((4),(5)) = 97 --- wscxfrm gives
4: 63 2
5: 2
wcscoll((6),(7)) = 48 --- wscxfrm gives
6: 32 2
7: 2 32
wcscoll((8),(9)) = 0 --- wscxfrm gives
8: 2 2
9: 2 2
wcscoll((10),(11)) = 1 --- wscxfrm gives
10: 3 2
11: 2 3
wcscoll((12),(13)) = 157 --- wscxfrm gives
12: 9f 2
13: 2 9f
wcscoll((14),(15)) = 1 --- wscxfrm gives
14: 3
15: 2
*******************************************************************************
BUG: (ja_JP.UTF-8)
Since <UFFFD> is undefined and there is an explicit UNDEFINED as the last
entry in the collation order, <UFFFD> should have a collation weight
greater than that of any explicitly weighted wchar.
*******************************************************************************
locale: LC_CTYPE=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
no explicit UNDEFINED entry
wcscoll((0),(1)) = -1 --- wscxfrm gives
0:
1: 1 1 1 1 154
wcscoll((2),(3)) = -1 --- wscxfrm gives
2: 2e 1 10 1 2
3: 2e 2f 1 10 10 1 2 2
wcscoll((4),(5)) = 1 --- wscxfrm gives
4: 2e 1 10 1 2 1 2 154
5: 1 1 1 1 154
wcscoll((6),(7)) = 1 --- wscxfrm gives
6: 24 1 10 1 2 1 2 154
7: 24 1 10 1 2 1 1 154
wcscoll((8),(9)) = -1 --- wscxfrm gives
8: 1 1 1 1 153 1 154
9: 1 1 1 1 154 1 153
wcscoll((10),(11)) = 0 --- wscxfrm gives
10: 1 1 1 1 154 1 154
11: 1 1 1 1 154 1 154
wcscoll((12),(13)) = 63 --- wscxfrm gives
12: 1 1 1 1 193 1 154
13: 1 1 1 1 154 1 193
wcscoll((14),(15)) = 0 --- wscxfrm gives
14: 1 1 1 1 154
15: 1 1 1 1 154
*******************************************************************************
BUG: (en_US.UTF-8 and probably any other iso14651_t derived locale)
The implicit UNDEFINED entry is being treated as <U0001>.
BACKGROUND:
DTR 14652 (specifically n972-14652w25.pdf) specifies:
If no "UNDEFINED" symbol is specified, and the current coded character
set contains characters not specified in this clause, the utility
issues a warning message and place such characters at the end of the
character collation order.
However, DTR 14652 does not specify what "order" would be in effect in
this case.
en_US simply copies iso14651_t to obtain its collation information.
There is no definition of UNDEFINED in iso14651_t, and the last keyword
is "order_end".
QUESTIONS:
1) What implicit definition is glibc using for UNDEFINED?
2) What implicit ordering rule is glibc using for UNDEFINED?
ANSWERS:
Through trial and error (lines 8-13), we discover that glibc is apparently
treating the implicit UNDEFINED as <U0001>!
*******************************************************************************
*/
#define _GNU_SOURCE
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
#include <wchar.h>
#include <locale.h>
#include <assert.h>
/*
 * Test strings, compared pairwise: entry 2k is collated against entry 2k+1.
 * Every character is spelled as a hex code point so the table reads the
 * same way as the "<Uxxxx>" listing the program prints.  U+FFFD is the
 * character used to exercise the UNDEFINED collation entry.
 * The list is terminated by a NULL pointer.
 */
static const wchar_t *lines[] = {
    /*  0 */ L"",
    /*  1 */ L"\xFFFD",

    /*  2 */ L"\x0061",
    /*  3 */ L"\x0061\x0062",

    /*  4 */ L"\x0061\xFFFD",
    /*  5 */ L"\xFFFD",

    /*  6 */ L"\x0030\xFFFD",
    /*  7 */ L"\xFFFD\x0030",

    /*  8 */ L"\x05C3\xFFFD",
    /*  9 */ L"\xFFFD\x05C3",

    /* 10 */ L"\x0001\xFFFD",
    /* 11 */ L"\xFFFD\x0001",

    /* 12 */ L"\x009F\xFFFD",
    /* 13 */ L"\xFFFD\x009F",

    /* 14 */ L"\x0001",
    /* 15 */ L"\xFFFD",

    NULL
};
/*
 * Locales to exercise, stored as consecutive triples:
 *   [3k]   locale name handed to setlocale()
 *   [3k+1] first description line printed under the locale header
 *   [3k+2] second description line (may be empty)
 * A NULL pointer terminates the list.
 */
static const char *locales[] = {
    /* Thai */
    "th_TH.UTF-8",
    "explicit \"UNDEFINED IGNORE;IGNORE;IGNORE;IGNORE\"",
    "order_start forward;forward;forward;forward",

    /* Japanese */
    "ja_JP.UTF-8",
    "explicit UNDEFINED entry as last entry",
    "order_start forward",

    /* US English */
    "en_US.UTF-8",
    "no explicit UNDEFINED entry",
    "",

    NULL
};
/*
 * Dump collation results for the test strings in lines[] under each locale
 * listed in locales[].
 *
 * First the test strings are printed as <Uxxxx> code points (with LC_CTYPE
 * set to en_US.UTF-8).  Then, for every locale triple in locales[], both
 * LC_CTYPE and LC_COLLATE are switched to that locale and, for each string,
 * the wcsxfrm() output is printed in hex; before every even-indexed string
 * the wcscoll() result for the pair (n, n+1) is printed as well.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if a locale cannot be selected or a
 * wcsxfrm() result does not fit into the transformation buffer.
 */
int main(int argc, char **argv)
{
    wchar_t buf1[1024];
    const size_t buf_len = sizeof buf1 / sizeof buf1[0];
    char lctype[256];       /* private copy of the LC_CTYPE locale name */
    const char *name;
    const char *lcollate;
    const wchar_t **line;
    const wchar_t *p;
    size_t r, k;
    int i, col;

    (void) argc;
    (void) argv;

    if (!setlocale(LC_CTYPE, "en_US.UTF-8")) {
        fprintf(stderr, "setlocale failed\n");
        return EXIT_FAILURE;
    }

    wprintf(L"\nTest strings:\n");
    for (line = lines; *line; ++line) {
        wprintf(L"%5d: ", (int)(line - lines));
        if (!**line) {
            wprintf(L"empty string");
        } else {
            /* col tracks the output column so long strings get wrapped. */
            for (p = *line, col = 6; *p; p++) {
                if (col >= 68) {
                    wprintf(L"\n ");
                    col = 6;
                }
                /* The argument is unsigned long, so the conversion must
                   carry the 'l' length modifier. */
                if (((unsigned long) *p) <= 0xffffUL) {
                    wprintf(L"<U%04lX>", (unsigned long) *p);
                    col += 7;
                } else {
                    wprintf(L"<U%08lX>", (unsigned long) *p);
                    col += 11;
                }
            }
        }
        wprintf(L"\n");
    }
    wprintf(L"\n");

    /* locales[] holds triples: locale name followed by two description
       lines, hence the two extra ++i steps inside the loop body. */
    for (i = 0; locales[i]; i++) {
        name = setlocale(LC_CTYPE, locales[i]);
        if (!name) {
            fprintf(stderr, "setlocale failed\n");
            return EXIT_FAILURE;
        }
        /* The string returned by setlocale() may be overwritten by the
           next setlocale() call, so keep a copy before setting LC_COLLATE.
           Names longer than the buffer are truncated. */
        snprintf(lctype, sizeof lctype, "%s", name);

        lcollate = setlocale(LC_COLLATE, locales[i]);
        if (!lcollate) {
            fprintf(stderr, "setlocale failed\n");
            return EXIT_FAILURE;
        }

        wprintf(L"locale: LC_CTYPE=%s LC_COLLATE=%s\n", lctype, lcollate);
        wprintf(L" %s\n", locales[++i]);
        wprintf(L" %s\n", locales[++i]);

        for (line = lines; *line; ++line) {
            int n = (int)(line - lines);

            /* Strings are compared in (even, odd) pairs; lines[] has an
               even number of entries, so lines[n+1] is a valid string. */
            if (!(n & 1)) {
                wprintf(L" wcscoll((%d),(%d)) = %d --- wscxfrm gives\n",
                        n, n + 1, wcscoll(lines[n], lines[n + 1]));
            }

            r = wcsxfrm(buf1, *line, buf_len);
            if (r >= buf_len) {
                fprintf(stderr, "wcsxfrm returned %zu >= %zu\n", r, buf_len);
                return EXIT_FAILURE;
            }

            wprintf(L"%5d:", n);
            for (k = 0; k < r; k++) {
                /* Break the dump after every 14 weights. */
                if (k && !(k % 14)) {
                    wprintf(L"\n ");
                }
                wprintf(L"%5lx", (unsigned long) buf1[k]);
            }
            wprintf(L"\n");
        }
        wprintf(L"\n");
    }

    return EXIT_SUCCESS;
}