Bug 29511 - default/C/POSIX locale is 7-bit (128 characters), must be 8-bit (256 characters) since POSIX Issue 7 TC2/Issue 8
Summary: default/C/POSIX locale is 7-bit (128 characters), must be 8-bit (256 characte...
Status: UNCONFIRMED
Alias: None
Product: glibc
Classification: Unclassified
Component: locale (show other bugs)
Version: unspecified
Importance: P2 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2022-08-21 23:27 UTC by наб
Modified: 2023-06-28 20:12 UTC (History)
4 users (show)

See Also:
Host:
Target:
Build:
Last reconfirmed:
fweimer: security-


Attachments
Working diff to fix this issue :) (2.19 KB, patch)
2022-08-21 23:27 UTC, наб
Details | Diff
CHARSET=POSIX for default/C/POSIX locale, [0x80, 0xFF] in [U+DF80, U+DFFF] (5.54 KB, patch)
2022-08-30 16:07 UTC, наб
Details | Diff

Note You need to log in before you can comment on or make changes to this bug.
Description наб 2022-08-21 23:27:33 UTC
Created attachment 14286 [details]
Working diff to fix this issue :)

I'm forwarding https://bugs.debian.org/1017852 as directed. It's reproduced below:

---===---

Consider the following reproducer:
-- >8 --
#include <stdio.h>
#include <wchar.h>
#include <locale.h>

/*
 * Reproducer: feed every byte value 0x00..0xFF to mbrtowc() as a one-byte
 * input and print, per byte, the return value and the wide character stored.
 *
 * When invoked with at least one argument, the locale is first taken from the
 * environment via setlocale(LC_ALL, "") and its name is reported on stderr;
 * otherwise the program keeps the start-up locale.
 */
int main(int argc, char **argv) {
	(void)argv;	/* only the argument count is used */

	if(argc > 1) {
		const char *loc = setlocale(LC_ALL, "");
		/* setlocale() returns NULL on failure; handing NULL to %s is undefined */
		fprintf(stderr, "loc=%s\n", loc ? loc : "(null)");
	}

	for(int i = 0; i <= 0xFF; ++i) {
		char bs[] = {(char)i, 0};
		mbstate_t ctx = {0};	/* fresh initial conversion state for each byte */
		wchar_t wc = -1;	/* sentinel: keeps this value if mbrtowc() stores nothing */

		/* mbrtowc() returns size_t and wc is a wchar_t; convert both
		   explicitly so the arguments really match %d and %ld. */
		int rc = (int)mbrtowc(&wc, bs, 1, &ctx);
		printf("%02x: %d, ", i, rc);
		printf("%ld\n", (long)wc);
	}

	return 0;
}
-- >8 --

Yielding the following output:
-- >8 --
$ ./b | paste - - - - - - - -
00: 0, 0	01: 1, 1	02: 1, 2	03: 1, 3	04: 1, 4	05: 1, 5	06: 1, 6	07: 1, 7
08: 1, 8	09: 1, 9	0a: 1, 10	0b: 1, 11	0c: 1, 12	0d: 1, 13	0e: 1, 14	0f: 1, 15
10: 1, 16	11: 1, 17	12: 1, 18	13: 1, 19	14: 1, 20	15: 1, 21	16: 1, 22	17: 1, 23
18: 1, 24	19: 1, 25	1a: 1, 26	1b: 1, 27	1c: 1, 28	1d: 1, 29	1e: 1, 30	1f: 1, 31
20: 1, 32	21: 1, 33	22: 1, 34	23: 1, 35	24: 1, 36	25: 1, 37	26: 1, 38	27: 1, 39
28: 1, 40	29: 1, 41	2a: 1, 42	2b: 1, 43	2c: 1, 44	2d: 1, 45	2e: 1, 46	2f: 1, 47
30: 1, 48	31: 1, 49	32: 1, 50	33: 1, 51	34: 1, 52	35: 1, 53	36: 1, 54	37: 1, 55
38: 1, 56	39: 1, 57	3a: 1, 58	3b: 1, 59	3c: 1, 60	3d: 1, 61	3e: 1, 62	3f: 1, 63
40: 1, 64	41: 1, 65	42: 1, 66	43: 1, 67	44: 1, 68	45: 1, 69	46: 1, 70	47: 1, 71
48: 1, 72	49: 1, 73	4a: 1, 74	4b: 1, 75	4c: 1, 76	4d: 1, 77	4e: 1, 78	4f: 1, 79
50: 1, 80	51: 1, 81	52: 1, 82	53: 1, 83	54: 1, 84	55: 1, 85	56: 1, 86	57: 1, 87
58: 1, 88	59: 1, 89	5a: 1, 90	5b: 1, 91	5c: 1, 92	5d: 1, 93	5e: 1, 94	5f: 1, 95
60: 1, 96	61: 1, 97	62: 1, 98	63: 1, 99	64: 1, 100	65: 1, 101	66: 1, 102	67: 1, 103
68: 1, 104	69: 1, 105	6a: 1, 106	6b: 1, 107	6c: 1, 108	6d: 1, 109	6e: 1, 110	6f: 1, 111
70: 1, 112	71: 1, 113	72: 1, 114	73: 1, 115	74: 1, 116	75: 1, 117	76: 1, 118	77: 1, 119
78: 1, 120	79: 1, 121	7a: 1, 122	7b: 1, 123	7c: 1, 124	7d: 1, 125	7e: 1, 126	7f: 1, 127
80: -1, -1	81: -1, -1	82: -1, -1	83: -1, -1	84: -1, -1	85: -1, -1	86: -1, -1	87: -1, -1
88: -1, -1	89: -1, -1	8a: -1, -1	8b: -1, -1	8c: -1, -1	8d: -1, -1	8e: -1, -1	8f: -1, -1
90: -1, -1	91: -1, -1	92: -1, -1	93: -1, -1	94: -1, -1	95: -1, -1	96: -1, -1	97: -1, -1
98: -1, -1	99: -1, -1	9a: -1, -1	9b: -1, -1	9c: -1, -1	9d: -1, -1	9e: -1, -1	9f: -1, -1
a0: -1, -1	a1: -1, -1	a2: -1, -1	a3: -1, -1	a4: -1, -1	a5: -1, -1	a6: -1, -1	a7: -1, -1
a8: -1, -1	a9: -1, -1	aa: -1, -1	ab: -1, -1	ac: -1, -1	ad: -1, -1	ae: -1, -1	af: -1, -1
b0: -1, -1	b1: -1, -1	b2: -1, -1	b3: -1, -1	b4: -1, -1	b5: -1, -1	b6: -1, -1	b7: -1, -1
b8: -1, -1	b9: -1, -1	ba: -1, -1	bb: -1, -1	bc: -1, -1	bd: -1, -1	be: -1, -1	bf: -1, -1
c0: -1, -1	c1: -1, -1	c2: -1, -1	c3: -1, -1	c4: -1, -1	c5: -1, -1	c6: -1, -1	c7: -1, -1
c8: -1, -1	c9: -1, -1	ca: -1, -1	cb: -1, -1	cc: -1, -1	cd: -1, -1	ce: -1, -1	cf: -1, -1
d0: -1, -1	d1: -1, -1	d2: -1, -1	d3: -1, -1	d4: -1, -1	d5: -1, -1	d6: -1, -1	d7: -1, -1
d8: -1, -1	d9: -1, -1	da: -1, -1	db: -1, -1	dc: -1, -1	dd: -1, -1	de: -1, -1	df: -1, -1
e0: -1, -1	e1: -1, -1	e2: -1, -1	e3: -1, -1	e4: -1, -1	e5: -1, -1	e6: -1, -1	e7: -1, -1
e8: -1, -1	e9: -1, -1	ea: -1, -1	eb: -1, -1	ec: -1, -1	ed: -1, -1	ee: -1, -1	ef: -1, -1
f0: -1, -1	f1: -1, -1	f2: -1, -1	f3: -1, -1	f4: -1, -1	f5: -1, -1	f6: -1, -1	f7: -1, -1
f8: -1, -1	f9: -1, -1	fa: -1, -1	fb: -1, -1	fc: -1, -1	fd: -1, -1	fe: -1, -1	ff: -1, -1

$ LC_ALL=POSIX ./b _ | paste - - - - - - - -
loc=C
00: 0, 0	01: 1, 1	02: 1, 2	03: 1, 3	04: 1, 4	05: 1, 5	06: 1, 6	07: 1, 7
08: 1, 8	09: 1, 9	0a: 1, 10	0b: 1, 11	0c: 1, 12	0d: 1, 13	0e: 1, 14	0f: 1, 15
10: 1, 16	11: 1, 17	12: 1, 18	13: 1, 19	14: 1, 20	15: 1, 21	16: 1, 22	17: 1, 23
18: 1, 24	19: 1, 25	1a: 1, 26	1b: 1, 27	1c: 1, 28	1d: 1, 29	1e: 1, 30	1f: 1, 31
20: 1, 32	21: 1, 33	22: 1, 34	23: 1, 35	24: 1, 36	25: 1, 37	26: 1, 38	27: 1, 39
28: 1, 40	29: 1, 41	2a: 1, 42	2b: 1, 43	2c: 1, 44	2d: 1, 45	2e: 1, 46	2f: 1, 47
30: 1, 48	31: 1, 49	32: 1, 50	33: 1, 51	34: 1, 52	35: 1, 53	36: 1, 54	37: 1, 55
38: 1, 56	39: 1, 57	3a: 1, 58	3b: 1, 59	3c: 1, 60	3d: 1, 61	3e: 1, 62	3f: 1, 63
40: 1, 64	41: 1, 65	42: 1, 66	43: 1, 67	44: 1, 68	45: 1, 69	46: 1, 70	47: 1, 71
48: 1, 72	49: 1, 73	4a: 1, 74	4b: 1, 75	4c: 1, 76	4d: 1, 77	4e: 1, 78	4f: 1, 79
50: 1, 80	51: 1, 81	52: 1, 82	53: 1, 83	54: 1, 84	55: 1, 85	56: 1, 86	57: 1, 87
58: 1, 88	59: 1, 89	5a: 1, 90	5b: 1, 91	5c: 1, 92	5d: 1, 93	5e: 1, 94	5f: 1, 95
60: 1, 96	61: 1, 97	62: 1, 98	63: 1, 99	64: 1, 100	65: 1, 101	66: 1, 102	67: 1, 103
68: 1, 104	69: 1, 105	6a: 1, 106	6b: 1, 107	6c: 1, 108	6d: 1, 109	6e: 1, 110	6f: 1, 111
70: 1, 112	71: 1, 113	72: 1, 114	73: 1, 115	74: 1, 116	75: 1, 117	76: 1, 118	77: 1, 119
78: 1, 120	79: 1, 121	7a: 1, 122	7b: 1, 123	7c: 1, 124	7d: 1, 125	7e: 1, 126	7f: 1, 127
80: -1, -1	81: -1, -1	82: -1, -1	83: -1, -1	84: -1, -1	85: -1, -1	86: -1, -1	87: -1, -1
88: -1, -1	89: -1, -1	8a: -1, -1	8b: -1, -1	8c: -1, -1	8d: -1, -1	8e: -1, -1	8f: -1, -1
90: -1, -1	91: -1, -1	92: -1, -1	93: -1, -1	94: -1, -1	95: -1, -1	96: -1, -1	97: -1, -1
98: -1, -1	99: -1, -1	9a: -1, -1	9b: -1, -1	9c: -1, -1	9d: -1, -1	9e: -1, -1	9f: -1, -1
a0: -1, -1	a1: -1, -1	a2: -1, -1	a3: -1, -1	a4: -1, -1	a5: -1, -1	a6: -1, -1	a7: -1, -1
a8: -1, -1	a9: -1, -1	aa: -1, -1	ab: -1, -1	ac: -1, -1	ad: -1, -1	ae: -1, -1	af: -1, -1
b0: -1, -1	b1: -1, -1	b2: -1, -1	b3: -1, -1	b4: -1, -1	b5: -1, -1	b6: -1, -1	b7: -1, -1
b8: -1, -1	b9: -1, -1	ba: -1, -1	bb: -1, -1	bc: -1, -1	bd: -1, -1	be: -1, -1	bf: -1, -1
c0: -1, -1	c1: -1, -1	c2: -1, -1	c3: -1, -1	c4: -1, -1	c5: -1, -1	c6: -1, -1	c7: -1, -1
c8: -1, -1	c9: -1, -1	ca: -1, -1	cb: -1, -1	cc: -1, -1	cd: -1, -1	ce: -1, -1	cf: -1, -1
d0: -1, -1	d1: -1, -1	d2: -1, -1	d3: -1, -1	d4: -1, -1	d5: -1, -1	d6: -1, -1	d7: -1, -1
d8: -1, -1	d9: -1, -1	da: -1, -1	db: -1, -1	dc: -1, -1	dd: -1, -1	de: -1, -1	df: -1, -1
e0: -1, -1	e1: -1, -1	e2: -1, -1	e3: -1, -1	e4: -1, -1	e5: -1, -1	e6: -1, -1	e7: -1, -1
e8: -1, -1	e9: -1, -1	ea: -1, -1	eb: -1, -1	ec: -1, -1	ed: -1, -1	ee: -1, -1	ef: -1, -1
f0: -1, -1	f1: -1, -1	f2: -1, -1	f3: -1, -1	f4: -1, -1	f5: -1, -1	f6: -1, -1	f7: -1, -1
f8: -1, -1	f9: -1, -1	fa: -1, -1	fb: -1, -1	fc: -1, -1	fd: -1, -1	fe: -1, -1	ff: -1, -1
-- >8 --

This breaks all programs that expect to process text/data portably,
since in LC_ALL=C half of all bytes collapse to one character
(for sort this means that they all collate equally, &c., &c.)!

Consider a diff of XBD 6.2 ("Character Encoding"), Issue 7 vs Issue 7 TC2:
-- >8 --
@@ -1768,9 +1664,13 @@

 <h3><a name="tag_06_02">   6.2 </a>Character Encoding</h3>

-<p>The POSIX locale contains the characters in <a href="#tagtcjh_3">Portable Character Set</a> , which have the properties listed
-in <a href="../basedefs/V1_chap07.html#tag_07_03_01"><i>LC_CTYPE</i></a> . In other locales, the presence, meaning, and
-representation of any additional characters are locale-specific.</p>
+<p>The POSIX locale shall contain 256 single-byte characters including the characters in <a href="#tagtcjh_3">Portable Character
+Set</a> and <a href="#tagtcjh_4">Non-Portable Control Characters</a>, which have the properties listed in <a href=
+"../basedefs/V1_chap07.html#tag_07_03_01"><i>LC_CTYPE</i></a>. It is unspecified whether characters not listed in those two tables
+are classified as <b>punct</b> or <b>cntrl</b>, or neither. Other locales shall contain the characters in <a href=
+"#tagtcjh_3">Portable Character Set</a> and may contain any or all of the control characters identified in <a href=
+"#tagtcjh_4">Non-Portable Control Characters</a>; the presence, meaning, and representation of any additional characters are
+locale-specific.</p>

 <p>In locales other than the POSIX locale, a character may have a state-dependent encoding. There are two types of these
 encodings:</p>
-- >8 --

This text is widely supported with global changes later originating from bug 674:
  > An invalid character sequence is detected. In the POSIX locale an EILSEQ error cannot occur since all byte values are valid characters.[/CX]
  > In the POSIX locale each byte is a valid single-byte character, and therefore this problem is avoided.
&c.
This text is unchanged in Issue 8 Draft 2.1.

---===---

Additionally, consider the diff of XBD 7.3.2 ("LC_COLLATE"):
-- >8 --
@@ -3006,13 +3097,17 @@

 <h5><a name="tag_07_03_02_06"></a>LC_COLLATE Category in the POSIX Locale</h5>

-<p>The collation sequence definition of the POSIX locale follows; the code listing depicts the <a href=
-"../utilities/localedef.html"><i>localedef</i></a> input.</p>
+<p>The minimum collation sequence definition of the POSIX locale follows; the code listing depicts the <a href=
+"../utilities/localedef.html"><i>localedef</i></a> input. All characters not explicitly listed here shall be inserted in the
+character collation order after the listed characters and shall be assigned unique primary weights. If the listed characters have
+ASCII encoding, the other characters shall be in ascending order according to their coded character set values; otherwise, the
+order of the other characters is unspecified. The collation sequence shall not include any multi-character collating elements.</p>

 <pre>
 <tt>LC_COLLATE
-# This is the POSIX locale definition for the LC_COLLATE category.
-# The order is the same as in the ASCII codeset.
+# This is the minimum input for the POSIX locale definition for the
+# LC_COLLATE category. Characters in this list are in the same order
+# as in the ASCII codeset.
 order_start forward
 &lt;NUL&gt;
 &lt;SOH&gt;
-- >8 --

I've dug in a bit into patching this, but I ran into some issues:
  * wcsmbsload.c to_{wc,mb} do not convert from/to ANSI_X3.4-1968//TRANSLIT, but... what? or is it ANSI_X3.4-1968//TRANSLIT but fudged? or something else?
  * there's an s390 assembly implementation of both conversion functions
Nevertheless, I'm attaching a working diff.
Comment 1 Florian Weimer 2022-08-30 07:23:51 UTC
I don't see how this is a bug, sorry.

There is no well-defined conversion from US-ASCII to the internal UCS-4 encoding, so reporting an error for input bytes that do not have a mapping makes sense.

If another glibc function does not behave as expected (fnmatch?), please file a separate bug for that.
Comment 2 наб 2022-08-30 11:33:13 UTC
I don't see how that's relevant? The POSIX locale is /not/ US-ASCII: it's /some/ encoding that contains the PCS and NPCS as the first 128 characters (this coincides with 7-bit ASCII) /and/ contains mappings from all 256 single bytes to a character (and back).

This is very obviously listed in the first hunk, which I'll reiterate again: POSIX.1, Issue 7 TC 2, XBD, 6.2 "Character Encoding" starts with:
  > The POSIX locale shall contain 256 single-byte characters including the characters in Portable Character Set and Non-Portable Control Characters, which have the properties listed in LC_CTYPE. 

The bug here isn't "glibc doesn't decode ASCII [0xfa, 0] to my favourite character", it's "glibc treats POSIX locale as-if it were ASCII" – which it's not.

From your response it appears you also haven't consulted bug 663 (an oversight on my part, it seems I only linked it directly in the Debian bug, and here indirectly): https://www.austingroupbugs.net/view.php?id=663 – it contains a summary of changes that make the Issue 7 TC 2/Issue 8 POSIX locale as-described.

POSIX.1, Issue 7 TC 2, XSH, mbrtowc(), ERRORS:
-- >8 --
 [EILSEQ]
    An invalid character sequence is detected. [CX]In the POSIX locale an EILSEQ error cannot occur since all byte values are valid characters.[/CX]
-- >8 --
(the same text is seen in mbrlen(), mbstowcs(), &c., &c.).

POSIX.1, Issue 7 TC 2, XRAT, XBD, A.6.2 Character Encoding:
-- >8 --
Earlier versions of this standard did not state the requirement that the POSIX locale contains 256 single-byte characters. This was an oversight; the intention was always that the POSIX locale should have an 8-bit-clean single-byte encoding.
-- >8 --

I hope this convinces you that this is not RESOLVED INVALID, but, indeed, a conformance error.

A different solution I arrived at today, less messy than the one in my patch (or than just making it a KOI-8 variant, which is admittedly the worst one so far), would be to map [0, 0x7F] to [U+0, U+7F] as is done currently and B[0x80, 0xFF] to C[U+FFFFFF80, U+FFFFFFFF] or C[U+FFFFFF00, U+FFFFFF7F] — I don't think Unicode will reach that high any-time soon – or to pick a PUA and map to C[U+100000, U+10007F]/C[U+E080, U+E0FF]/you get the point here – "using glibc" reasonably fits as a "private agreement between collaborating users".

Changing the CODESET is also, presumably, a given: naively, to "POSIX", I guess?
Comment 3 Florian Weimer 2022-08-30 12:30:07 UTC
Sorry, I had assumed that C was specifically worded in such a way that implementations could default to UTF-8 if they wanted, and POSIX kept this possibility.

Not sure what to do about this issue. We had this idea to switch to the UTF-8 character set by default for a long time. It's strange that POSIX no longer allows that.
Comment 4 наб 2022-08-30 16:06:50 UTC
Out of curiosity, I checked what musl does (represented here by current Void, musl-1.1.24_10):
-- >8 --
bash-5.1# ./b | paste - - - - - - - -
CODESET=ASCII
00: 0, 0        01: 1, 1        02: 1, 2        03: 1, 3        04: 1, 4        05: 1, 5        06: 1, 6        07: 1, 7
08: 1, 8        09: 1, 9        0a: 1, a        0b: 1, b        0c: 1, c        0d: 1, d        0e: 1, e        0f: 1, f
10: 1, 10       11: 1, 11       12: 1, 12       13: 1, 13       14: 1, 14       15: 1, 15       16: 1, 16       17: 1, 17
18: 1, 18       19: 1, 19       1a: 1, 1a       1b: 1, 1b       1c: 1, 1c       1d: 1, 1d       1e: 1, 1e       1f: 1, 1f
20: 1, 20       21: 1, 21       22: 1, 22       23: 1, 23       24: 1, 24       25: 1, 25       26: 1, 26       27: 1, 27
28: 1, 28       29: 1, 29       2a: 1, 2a       2b: 1, 2b       2c: 1, 2c       2d: 1, 2d       2e: 1, 2e       2f: 1, 2f
30: 1, 30       31: 1, 31       32: 1, 32       33: 1, 33       34: 1, 34       35: 1, 35       36: 1, 36       37: 1, 37
38: 1, 38       39: 1, 39       3a: 1, 3a       3b: 1, 3b       3c: 1, 3c       3d: 1, 3d       3e: 1, 3e       3f: 1, 3f
40: 1, 40       41: 1, 41       42: 1, 42       43: 1, 43       44: 1, 44       45: 1, 45       46: 1, 46       47: 1, 47
48: 1, 48       49: 1, 49       4a: 1, 4a       4b: 1, 4b       4c: 1, 4c       4d: 1, 4d       4e: 1, 4e       4f: 1, 4f
50: 1, 50       51: 1, 51       52: 1, 52       53: 1, 53       54: 1, 54       55: 1, 55       56: 1, 56       57: 1, 57
58: 1, 58       59: 1, 59       5a: 1, 5a       5b: 1, 5b       5c: 1, 5c       5d: 1, 5d       5e: 1, 5e       5f: 1, 5f
60: 1, 60       61: 1, 61       62: 1, 62       63: 1, 63       64: 1, 64       65: 1, 65       66: 1, 66       67: 1, 67
68: 1, 68       69: 1, 69       6a: 1, 6a       6b: 1, 6b       6c: 1, 6c       6d: 1, 6d       6e: 1, 6e       6f: 1, 6f
70: 1, 70       71: 1, 71       72: 1, 72       73: 1, 73       74: 1, 74       75: 1, 75       76: 1, 76       77: 1, 77
78: 1, 78       79: 1, 79       7a: 1, 7a       7b: 1, 7b       7c: 1, 7c       7d: 1, 7d       7e: 1, 7e       7f: 1, 7f
80: 1, df80     81: 1, df81     82: 1, df82     83: 1, df83     84: 1, df84     85: 1, df85     86: 1, df86     87: 1, df87
88: 1, df88     89: 1, df89     8a: 1, df8a     8b: 1, df8b     8c: 1, df8c     8d: 1, df8d     8e: 1, df8e     8f: 1, df8f
90: 1, df90     91: 1, df91     92: 1, df92     93: 1, df93     94: 1, df94     95: 1, df95     96: 1, df96     97: 1, df97
98: 1, df98     99: 1, df99     9a: 1, df9a     9b: 1, df9b     9c: 1, df9c     9d: 1, df9d     9e: 1, df9e     9f: 1, df9f
a0: 1, dfa0     a1: 1, dfa1     a2: 1, dfa2     a3: 1, dfa3     a4: 1, dfa4     a5: 1, dfa5     a6: 1, dfa6     a7: 1, dfa7
a8: 1, dfa8     a9: 1, dfa9     aa: 1, dfaa     ab: 1, dfab     ac: 1, dfac     ad: 1, dfad     ae: 1, dfae     af: 1, dfaf
b0: 1, dfb0     b1: 1, dfb1     b2: 1, dfb2     b3: 1, dfb3     b4: 1, dfb4     b5: 1, dfb5     b6: 1, dfb6     b7: 1, dfb7
b8: 1, dfb8     b9: 1, dfb9     ba: 1, dfba     bb: 1, dfbb     bc: 1, dfbc     bd: 1, dfbd     be: 1, dfbe     bf: 1, dfbf
c0: 1, dfc0     c1: 1, dfc1     c2: 1, dfc2     c3: 1, dfc3     c4: 1, dfc4     c5: 1, dfc5     c6: 1, dfc6     c7: 1, dfc7
c8: 1, dfc8     c9: 1, dfc9     ca: 1, dfca     cb: 1, dfcb     cc: 1, dfcc     cd: 1, dfcd     ce: 1, dfce     cf: 1, dfcf
d0: 1, dfd0     d1: 1, dfd1     d2: 1, dfd2     d3: 1, dfd3     d4: 1, dfd4     d5: 1, dfd5     d6: 1, dfd6     d7: 1, dfd7
d8: 1, dfd8     d9: 1, dfd9     da: 1, dfda     db: 1, dfdb     dc: 1, dfdc     dd: 1, dfdd     de: 1, dfde     df: 1, dfdf
e0: 1, dfe0     e1: 1, dfe1     e2: 1, dfe2     e3: 1, dfe3     e4: 1, dfe4     e5: 1, dfe5     e6: 1, dfe6     e7: 1, dfe7
e8: 1, dfe8     e9: 1, dfe9     ea: 1, dfea     eb: 1, dfeb     ec: 1, dfec     ed: 1, dfed     ee: 1, dfee     ef: 1, dfef
f0: 1, dff0     f1: 1, dff1     f2: 1, dff2     f3: 1, dff3     f4: 1, dff4     f5: 1, dff5     f6: 1, dff6     f7: 1, dff7
f8: 1, dff8     f9: 1, dff9     fa: 1, dffa     fb: 1, dffb     fc: 1, dffc     fd: 1, dffd     fe: 1, dffe     ff: 1, dfff
-- >8 --

Starting at U+DF80 falls into the Low Surrogate Area at DC00-DFFF, which the chart describes as:
> Isolated surrogate code points have no interpretation; consequently, no character code charts or names lists are provided for this range.

Which is perfect. I'm attaching a diff that adds a POSIX charset that does the same, and uses it for default/C/POSIX locale, alongside some new tests.
Comment 5 наб 2022-08-30 16:07:56 UTC
Created attachment 14305 [details]
CHARSET=POSIX for default/C/POSIX locale, [0x80, 0xFF] in [U+DF80, U+DFFF]
Comment 6 Bruno Haible 2023-06-28 19:18:35 UTC
I think this is a duplicate of bug #19932.