This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Handle surrogate pairs in c16rtomb (bug 23794, DR#488, C2X)
- From: Joseph Myers <joseph at codesourcery dot com>
- To: <libc-alpha at sourceware dot org>
- Date: Thu, 18 Oct 2018 21:39:14 +0000
- Subject: Handle surrogate pairs in c16rtomb (bug 23794, DR#488, C2X)
The c16rtomb implementation has:
// XXX The ISO C 11 spec I have does not say anything about handling
// XXX surrogates in this interface.
The DR#488 resolution, as applied to C2X, requires surrogate pairs to
be handled here (so the first call returns 0 and stores the high
surrogate in the mbstate_t, while the second call combines the
surrogates, produces a multibyte character and returns the number of
bytes written). This patch implements that. (mbrtoc16 already
handled producing surrogates as output.)
Tested for x86_64.
2018-10-18 Joseph Myers <joseph@codesourcery.com>
[BZ #23794]
* wcsmbs/c16rtomb.c (c16rtomb): Save first character of surrogate
pair and return 0 in that case, and use saved character to
interpret following character.
* wcsmbs/tst-c16-surrogate.c: New file.
* wcsmbs/Makefile (tests): Add tst-c16-surrogate.
[$(run-built-tests) = yes] ($(objpfx)tst-c16-surrogate.out):
Depend on $(gen-locales).
diff --git a/wcsmbs/Makefile b/wcsmbs/Makefile
index 2e02b22c3b..a58d36ff8f 100644
--- a/wcsmbs/Makefile
+++ b/wcsmbs/Makefile
@@ -51,7 +51,8 @@ tests := tst-wcstof wcsmbs-tst1 tst-wcsnlen tst-btowc tst-mbrtowc \
tst-wcrtomb tst-wcpncpy tst-mbsrtowcs tst-wchar-h tst-mbrtowc2 \
tst-c16c32-1 wcsatcliff tst-wcstol-locale tst-wcstod-nan-locale \
tst-wcstod-round test-char-types tst-fgetwc-after-eof \
- tst-wcstod-nan-sign $(addprefix test-,$(strop-tests))
+ tst-wcstod-nan-sign tst-c16-surrogate \
+ $(addprefix test-,$(strop-tests))
include ../Rules
@@ -68,6 +69,7 @@ $(objpfx)tst-wcrtomb.out: $(gen-locales)
$(objpfx)wcsmbs-tst1.out: $(gen-locales)
$(objpfx)tst-wcstol-locale.out: $(gen-locales)
$(objpfx)tst-wcstod-nan-locale.out: $(gen-locales)
+$(objpfx)tst-c16-surrogate.out: $(gen-locales)
endif
$(objpfx)tst-wcstod-round: $(libm)
diff --git a/wcsmbs/c16rtomb.c b/wcsmbs/c16rtomb.c
index 48a63d067b..2b45a0f443 100644
--- a/wcsmbs/c16rtomb.c
+++ b/wcsmbs/c16rtomb.c
@@ -26,7 +26,40 @@ static mbstate_t state;
size_t
c16rtomb (char *s, char16_t c16, mbstate_t *ps)
{
- // XXX The ISO C 11 spec I have does not say anything about handling
- // XXX surrogates in this interface.
- return wcrtomb (s, c16, ps ?: &state);
+ wchar_t wc = c16;
+
+ if (ps == NULL)
+ ps = &state;
+
+ if (s == NULL)
+ {
+ /* A null S resets the state: drop any pending high surrogate. */
+ ps->__count &= 0x7fffffff;
+ wc = 0;
+ }
+
+ if (ps->__count & 0x80000000)
+ {
+ /* The previous call saved a high surrogate, flagged by the top
+ bit of __count; clear the flag and try to complete the pair. */
+ ps->__count &= 0x7fffffff;
+ if (wc >= 0xdc00 && wc < 0xe000)
+ wc = (0x10000
+ + ((ps->__value.__wch & 0x3ff) << 10)
+ + (wc & 0x3ff));
+ else
+ /* This is not a low surrogate; ensure an EILSEQ error by
+ passing the saved high surrogate to wcrtomb as a wide
+ character on its own. */
+ wc = ps->__value.__wch;
+ }
+ else if (wc >= 0xd800 && wc < 0xdc00)
+ {
+ /* A high surrogate: save it, set the flag, and write nothing. */
+ ps->__count |= 0x80000000;
+ ps->__value.__wch = wc;
+ return 0;
+ }
+
+ return wcrtomb (s, wc, ps);
}
diff --git a/wcsmbs/tst-c16-surrogate.c b/wcsmbs/tst-c16-surrogate.c
new file mode 100644
index 0000000000..e40426256f
--- /dev/null
+++ b/wcsmbs/tst-c16-surrogate.c
@@ -0,0 +1,89 @@
+/* Test c16rtomb handling of surrogate pairs (DR#488, bug 23794).
+ Copyright (C) 2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <errno.h>
+#include <locale.h>
+#include <stdio.h>
+#include <string.h>
+#include <uchar.h>
+#include <wchar.h>
+#include <array_length.h>
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+ TEST_VERIFY_EXIT (setlocale (LC_ALL, "de_DE.UTF-8") != NULL);
+ /* Round-trip sampled code points above U+FFFF via surrogate pairs. */
+ for (char32_t c = 0x10000; c <= 0x10ffff; c += 0x123)
+ {
+ char32_t c_pos = c - 0x10000;
+ char16_t c_hi = (c_pos >> 10) + 0xd800;
+ char16_t c_lo = (c_pos & 0x3ff) + 0xdc00;
+ printf ("testing U+0x%08x (0x%x 0x%x)\n",
+ (unsigned int) c, (unsigned int) c_hi, (unsigned int) c_lo);
+ char buf[16] = { 0 };
+ size_t ret_hi = c16rtomb (buf, c_hi, NULL);
+ TEST_VERIFY (ret_hi == 0);
+ size_t ret_lo = c16rtomb (buf, c_lo, NULL);
+ TEST_VERIFY (ret_lo == 4);
+ wchar_t wc = 0;
+ size_t ret_wc = mbrtowc (&wc, buf, 4, NULL);
+ TEST_VERIFY (ret_wc == 4);
+ TEST_VERIFY (wc == (wchar_t) c);
+ }
+ /* Each invalid sequence below must fail with EILSEQ. */
+ static const char16_t err_cases[][2] =
+ {
+ /* High surrogate followed by non-surrogate. */
+ { 0xd800, 0x1 },
+ /* High surrogate followed by another high surrogate. */
+ { 0xd800, 0xd800 },
+ /* Lone low surrogate; the 0 marks a one-unit case (see below). */
+ { 0xdc00, 0 }
+ };
+ for (size_t i = 0; i < array_length (err_cases); i++)
+ {
+ char16_t c_hi = err_cases[i][0];
+ char16_t c_lo = err_cases[i][1];
+ printf ("testing error case: 0x%x 0x%x\n", (unsigned int) c_hi,
+ (unsigned int) c_lo);
+ c16rtomb (NULL, 0, NULL); /* Reset the internal state. */
+ char buf[16] = { 0 };
+ errno = 0;
+ size_t ret_hi = c16rtomb (buf, c_hi, NULL);
+ if (c_lo == 0)
+ {
+ /* Unmatched low surrogate: the first call itself must fail. */
+ TEST_VERIFY (ret_hi == (size_t) -1);
+ TEST_VERIFY (errno == EILSEQ);
+ }
+ else
+ {
+ /* High surrogate is accepted; the second unit must fail. */
+ TEST_VERIFY (ret_hi == 0);
+ errno = 0;
+ size_t ret_lo = c16rtomb (buf, c_lo, NULL);
+ TEST_VERIFY (ret_lo == (size_t) -1);
+ TEST_VERIFY (errno == EILSEQ);
+ }
+ }
+ return 0;
+}
+
+#include <support/test-driver.c>
--
Joseph S. Myers
joseph@codesourcery.com