From 9954432e309c8fddaec2fe53e601702a5c981624 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 7 Jan 2012 10:52:53 -0500 Subject: [PATCH] More char16_t and char32_t support It works now for UTF-8 locales --- ChangeLog | 16 ++ iconv/gconv_builtin.h | 25 ++-- iconv/gconv_int.h | 8 +- iconv/gconv_simple.c | 340 ++++++++++++++++++++++++++++++++++++++++-- iconv/iconv_prog.c | 9 +- wcsmbs/Makefile | 3 +- wcsmbs/c16rtomb.c | 19 ++- wcsmbs/mbrtoc16.c | 52 ++++++- wcsmbs/tst-c16c32-1.c | 131 ++++++++++++++++ wcsmbs/wcrtomb.c | 5 +- wcsmbs/wcsmbsload.c | 84 ++++++----- 11 files changed, 607 insertions(+), 85 deletions(-) create mode 100644 wcsmbs/tst-c16c32-1.c diff --git a/ChangeLog b/ChangeLog index f089e19b87..2c0b0f8bef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,21 @@ 2012-01-07 Ulrich Drepper + * iconv/gconv_builtin.h: Use CHAR16 for the char16_t conversions. + * iconv/gconv_simple.c: Rename char16_t routines. Add char16_t<->utf8 + routines. + * iconv/gconv_int.h: Adjust prototypes for char16_t routines. + * iconv/iconv_prog.c: Recognize CHAR16 as internal name. + * wcsmbs/c16rtomb.c: Fix a few problems. Disable all the code and + fall back to using wcrtomb. + * wcsmbs/mbrtoc16.c: Fix implementation to handle real conversions. + * wcsmbs/wcsmbsload.c: Make char16 routines optional. Adjust for + renaming. + * wcsmbs/Makefile (tests): Add tst-c16c32-1. + * wcsmbs/tst-c16c32-1.c: New file. + + * wcsmbs/wcrtomb.c: Use MB_LEN_MAX instead of MB_CUR_MAX for sizing + local variable. + * libio/stdio.h: Do not declare gets at all for _GNU_SOURCE. * elf/tst-unique3.cc: Add explicit declaration of gets. diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h index fd736a480d..6820f828ec 100644 --- a/iconv/gconv_builtin.h +++ b/iconv/gconv_builtin.h @@ -1,5 +1,5 @@ /* Builtin transformations. - Copyright (C) 1997-1999, 2000-2002, 2006, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-1999, 2000-2002, 2006, 2011, 2012 Free Software Foundation, Inc. 
This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1997. @@ -124,22 +124,15 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "UNICODEBIG//", 1, #endif -BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16//", 1, "=ascii->UTF-16", - __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1) +BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "CHAR16", 1, "=ascii->CHAR16", + __gconv_transform_ascii_char16, NULL, 1, 1, 2, 4) -BUILTIN_TRANSFORMATION ("UTF-16//", "ANSI_X3.4-1968//", 1, "=UTF-16->ascii", - __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1) +BUILTIN_TRANSFORMATION ("CHAR16", "ANSI_X3.4-1968//", 1, "=CHAR16->ascii", + __gconv_transform_char16_ascii, NULL, 2, 4, 1, 1) -#if BYTE_ORDER == BIG_ENDIAN -BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16BE//", 1, "=ascii->UTF-16BE", - __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1) -BUILTIN_TRANSFORMATION ("UTF-16BE//", "ANSI_X3.4-1968//", 1, "=UTF-16BE->ascii", - __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1) -#else -BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16LE//", 1, "=ascii->UTF-16LE", - __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1) +BUILTIN_TRANSFORMATION ("ISO-10646/UTF8/", "CHAR16", 1, "=utf8->CHAR16", + __gconv_transform_utf8_char16, NULL, 1, 6, 2, 4) -BUILTIN_TRANSFORMATION ("UTF-16LE//", "ANSI_X3.4-1968//", 1, "=UTF-16LE->ascii", - __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1) -#endif +BUILTIN_TRANSFORMATION ("CHAR16", "ISO-10646/UTF8/", 1, "=CHAR16->utf8", + __gconv_transform_char16_utf8, NULL, 2, 4, 1, 6) diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h index 80253dd5be..79de975410 100644 --- a/iconv/gconv_int.h +++ b/iconv/gconv_int.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2005, 2006, 2007, 2011 Free Software Foundation, Inc. +/* Copyright (C) 1997-2005, 2006, 2007, 2011, 2012 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1997. 
@@ -303,8 +303,10 @@ __BUILTIN_TRANSFORM (__gconv_transform_internal_ucs4le); __BUILTIN_TRANSFORM (__gconv_transform_ucs4le_internal); __BUILTIN_TRANSFORM (__gconv_transform_internal_utf16); __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal); -__BUILTIN_TRANSFORM (__gconv_transform_ascii_utf16); -__BUILTIN_TRANSFORM (__gconv_transform_utf16_ascii); +__BUILTIN_TRANSFORM (__gconv_transform_ascii_char16); +__BUILTIN_TRANSFORM (__gconv_transform_char16_ascii); +__BUILTIN_TRANSFORM (__gconv_transform_utf8_char16); +__BUILTIN_TRANSFORM (__gconv_transform_char16_utf8); # undef __BUITLIN_TRANSFORM /* Specialized conversion function for a single byte to INTERNAL, recognizing diff --git a/iconv/gconv_simple.c b/iconv/gconv_simple.c index b0ef3e67b0..d145a3e5ff 100644 --- a/iconv/gconv_simple.c +++ b/iconv/gconv_simple.c @@ -1,5 +1,5 @@ /* Simple transformations functions. - Copyright (C) 1997-2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-2005, 2007, 2008, 2009, 2011, 2012 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1997. @@ -1065,6 +1065,7 @@ ucs4le_internal_loop_single (struct __gconv_step *step, \ state->__count = inend - *inptrp; \ \ + assert (ch != 0xc0 && ch != 0xc1); \ if (ch >= 0xc2 && ch < 0xe0) \ { \ /* We expect two bytes. The first byte cannot be 0xc0 or \ @@ -1322,15 +1323,15 @@ ucs4le_internal_loop_single (struct __gconv_step *step, #include -/* Convert from ISO 646-IRV to UTF-16. */ +/* Convert from ISO 646-IRV to the char16_t format. */ #define DEFINE_INIT 0 #define DEFINE_FINI 0 #define MIN_NEEDED_FROM 1 #define MIN_NEEDED_TO 2 #define FROM_DIRECTION 1 -#define FROM_LOOP ascii_utf16_loop -#define TO_LOOP ascii_utf16_loop /* This is not used. */ -#define FUNCTION_NAME __gconv_transform_ascii_utf16 +#define FROM_LOOP ascii_char16_loop +#define TO_LOOP ascii_char16_loop /* This is not used. 
*/ +#define FUNCTION_NAME __gconv_transform_ascii_char16 #define ONE_DIRECTION 1 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM @@ -1358,15 +1359,15 @@ ucs4le_internal_loop_single (struct __gconv_step *step, #include -/* Convert from UTF-16 to ISO 646-IRV. */ +/* Convert from the char16_t format to ISO 646-IRV. */ #define DEFINE_INIT 0 #define DEFINE_FINI 0 #define MIN_NEEDED_FROM 2 #define MIN_NEEDED_TO 1 #define FROM_DIRECTION 1 -#define FROM_LOOP utf16_ascii_loop -#define TO_LOOP utf16_ascii_loop /* This is not used. */ -#define FUNCTION_NAME __gconv_transform_utf16_ascii +#define FROM_LOOP char16_ascii_loop +#define TO_LOOP char16_ascii_loop /* This is not used. */ +#define FUNCTION_NAME __gconv_transform_char16_ascii #define ONE_DIRECTION 1 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM @@ -1383,9 +1384,328 @@ ucs4le_internal_loop_single (struct __gconv_step *step, { \ /* It's an one byte sequence. */ \ *outptr++ = *((const uint16_t *) inptr); \ - inptr += sizeof (uint16_t); \ + inptr += 2; \ + } \ + } +#define LOOP_NEED_FLAGS +#include +#include + + +/* Convert from the char16_t format to UTF-8. */ +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 2 +#define MAX_NEEDED_FROM 4 +#define MIN_NEEDED_TO 1 +#define MAX_NEEDED_TO 6 +#define FROM_DIRECTION 1 +#define FROM_LOOP char16_utf8_loop +#define TO_LOOP char16_utf8_loop /* This is not used. */ +#define FUNCTION_NAME __gconv_transform_char16_utf8 +#define ONE_DIRECTION 1 + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + /* Yes, reading a 16-bit number and storing it as 32-bit is correct. */ \ + uint32_t wc = *((const uint16_t *) inptr); \ + inptr += 2; \ + \ + if (__builtin_expect (wc < 0x80, 1)) \ + /* It's an one byte sequence. 
*/ \ + *outptr++ = (unsigned char) wc; \ + else \ + { \ + size_t step; \ + \ + if (__builtin_expect (wc < 0xd800 || wc > 0xdfff, 1)) \ + step = wc < 0x800 ? 2 : 3; \ + else \ + { \ + if (__builtin_expect (inptr + 2 > inend, 0)) \ + { \ + /* We don't have enough input for another complete input \ + character. */ \ + inptr -= 2; \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + uint32_t sec = *((const uint16_t *) inptr); \ + if (__builtin_expect (sec < 0xdc00, 0) \ + || __builtin_expect (sec > 0xdfff, 0)) \ + { \ + /* This is no valid second word for a surrogate. */ \ + STANDARD_FROM_LOOP_ERR_HANDLER (2); \ + } \ + inptr += 2; \ + wc = ((wc - 0xd7c0) << 10) + (sec - 0xdc00); \ + \ + step = wc < 0x200000 ? 4 : 5; \ + } \ + \ + if (__builtin_expect (outptr + step > outend, 0)) \ + { \ + /* Too long. */ \ + result = __GCONV_FULL_OUTPUT; \ + inptr -= step >= 4 ? 4 : 2; \ + break; \ + } \ + \ + unsigned char *start = outptr; \ + *outptr = (unsigned char) (~0xff >> step); \ + outptr += step; \ + do \ + { \ + start[--step] = 0x80 | (wc & 0x3f); \ + wc >>= 6; \ + } \ + while (step > 1); \ + start[0] |= wc; \ } \ } #define LOOP_NEED_FLAGS #include #include + + +/* Convert from UTF-8 to the char16_t format. */ +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 6 +#define MIN_NEEDED_TO 2 +#define MAX_NEEDED_TO 4 +#define FROM_DIRECTION 1 +#define FROM_LOOP utf8_char16_loop +#define TO_LOOP utf8_char16_loop /* This is not used. */ +#define FUNCTION_NAME __gconv_transform_utf8_char16 +#define ONE_DIRECTION 1 + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + /* Next input byte. */ \ + uint32_t ch = *inptr; \ + \ + if (__builtin_expect (ch < 0x80, 1)) \ + { \ + /* One byte sequence. 
*/ \ + *((uint16_t *) outptr) = ch; \ + outptr += 2; \ + ++inptr; \ + } \ + else \ + { \ + uint_fast32_t cnt; \ + uint_fast32_t i; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 or 0xc1, \ + otherwise the wide character could have been represented \ + using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. */ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ + { \ + /* We expect five bytes. */ \ + cnt = 5; \ + ch &= 0x03; \ + } \ + else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1)) \ + { \ + /* We expect six bytes. */ \ + cnt = 6; \ + ch &= 0x01; \ + } \ + else \ + { \ + /* Search the end of this ill-formed UTF-8 character. This \ + is the next byte with (x & 0xc0) != 0x80. */ \ + i = 0; \ + do \ + ++i; \ + while (inptr + i < inend \ + && (*(inptr + i) & 0xc0) == 0x80 \ + && i < 5); \ + \ + errout: \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + \ + if (__builtin_expect (inptr + cnt > inend, 0)) \ + { \ + /* We don't have enough input. But before we report that check \ + that all the bytes are correct. */ \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + goto errout; \ + } \ + \ + /* Read the possible remaining bytes. */ \ + for (i = 1; i < cnt; ++i) \ + { \ + uint32_t byte = inptr[i]; \ + \ + if ((byte & 0xc0) != 0x80) \ + /* This is an illegal encoding. */ \ + break; \ + \ + ch <<= 6; \ + ch |= byte & 0x3f; \ + } \ + \ + /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. 
\ + If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \ + have been represented with fewer than cnt bytes. */ \ + if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \ + /* Do not accept UTF-16 surrogates. */ \ + || (ch >= 0xd800 && ch <= 0xdfff)) \ + { \ + /* This is an illegal encoding. */ \ + goto errout; \ + } \ + \ + /* Now adjust the pointers and store the result. */ \ + if (ch < 0x10000) \ + *((uint16_t *) outptr) = ch; \ + else \ + { \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + *((uint16_t *) outptr) = 0xd7c0 + (ch >> 10); \ + outptr += 2; \ + *((uint16_t *) outptr) = 0xdc00 + (ch & 0x3ff); \ + } \ + \ + outptr += 2; \ + inptr += cnt; \ + } \ + } +#define LOOP_NEED_FLAGS + +#define STORE_REST \ + { \ + /* We store the remaining bytes while converting them into the UCS4 \ + format. We can assume that the first byte in the buffer is \ + correct and that it requires a larger number of bytes than there \ + are in the input buffer. */ \ + wint_t ch = **inptrp; \ + size_t cnt, r; \ + \ + state->__count = inend - *inptrp; \ + \ + assert (ch != 0xc0 && ch != 0xc1); \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 or \ + 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. */ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ + { \ + /* We expect five bytes. */ \ + cnt = 5; \ + ch &= 0x03; \ + } \ + else \ + { \ + /* We expect six bytes. */ \ + cnt = 6; \ + ch &= 0x01; \ + } \ + \ + /* The first byte is already consumed. 
*/ \ + r = cnt - 1; \ + while (++(*inptrp) < inend) \ + { \ + ch <<= 6; \ + ch |= **inptrp & 0x3f; \ + --r; \ + } \ + \ + /* Shift for the so far missing bytes. */ \ + ch <<= r * 6; \ + \ + /* Store the number of bytes expected for the entire sequence. */ \ + state->__count |= cnt << 8; \ + \ + /* Store the value. */ \ + state->__value.__wch = ch; \ + } + +#define UNPACK_BYTES \ + { \ + static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \ + wint_t wch = state->__value.__wch; \ + size_t ntotal = state->__count >> 8; \ + \ + inlen = state->__count & 255; \ + \ + bytebuf[0] = inmask[ntotal - 2]; \ + \ + do \ + { \ + if (--ntotal < inlen) \ + bytebuf[ntotal] = 0x80 | (wch & 0x3f); \ + wch >>= 6; \ + } \ + while (ntotal > 1); \ + \ + bytebuf[0] |= wch; \ + } + +#define CLEAR_STATE \ + state->__count = 0 + + +#include +#include diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c index 59c6402eb9..13facc8235 100644 --- a/iconv/iconv_prog.c +++ b/iconv/iconv_prog.c @@ -719,10 +719,12 @@ add_known_names (struct gconv_module *node) add_known_names (node->right); do { - if (strcmp (node->from_string, "INTERNAL")) + if (strcmp (node->from_string, "INTERNAL") != 0 + && strcmp (node->from_string, "CHAR16") != 0) tsearch (node->from_string, &printlist, (__compar_fn_t) strverscmp); - if (strcmp (node->to_string, "INTERNAL") != 0) + if (strcmp (node->to_string, "INTERNAL") != 0 + && strcmp (node->to_string, "CHAR16") != 0) tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp); node = node->same; @@ -748,7 +750,8 @@ insert_cache (void) { const char *str = strtab + hashtab[cnt].string_offset; - if (strcmp (str, "INTERNAL") != 0) + if (strcmp (str, "INTERNAL") != 0 + && strcmp (str, "CHAR16") != 0) tsearch (str, &printlist, (__compar_fn_t) strverscmp); } } diff --git a/wcsmbs/Makefile b/wcsmbs/Makefile index 8c446e1fd3..010e0c8d03 100644 --- a/wcsmbs/Makefile +++ b/wcsmbs/Makefile @@ -1,4 +1,4 @@ -# Copyright (C) 
1995-2000,2002,2003,2004,2005,2006,2007,2011 +# Copyright (C) 1995-2000,2002,2003,2004,2005,2006,2007,2011,2012 # Free Software Foundation, Inc. # This file is part of the GNU C Library. @@ -46,6 +46,7 @@ routines := wcscat wcschr wcscmp wcscpy wcscspn wcsdup wcslen wcsncat \ strop-tests := wcscmp wmemcmp wcslen wcschr wcsrchr wcscpy tests := tst-wcstof wcsmbs-tst1 tst-wcsnlen tst-btowc tst-mbrtowc \ tst-wcrtomb tst-wcpncpy tst-mbsrtowcs tst-wchar-h tst-mbrtowc2 \ + tst-c16c32-1 \ wcsatcliff $(addprefix test-,$(strop-tests)) include ../Rules diff --git a/wcsmbs/c16rtomb.c b/wcsmbs/c16rtomb.c index c75ca3bf21..3fed0b5d63 100644 --- a/wcsmbs/c16rtomb.c +++ b/wcsmbs/c16rtomb.c @@ -1,6 +1,6 @@ /* Copyright (C) 2011, 2012 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Ulrich Drepper , 2011. + Contributed by Ulrich Drepper , 2011. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -44,7 +44,12 @@ static mbstate_t state; size_t c16rtomb (char *s, char16_t c16, mbstate_t *ps) { - char buf[MB_CUR_MAX]; +#if 1 + // XXX The ISO C 11 spec I have does not say anything about handling + // XXX surrogates in this interface. + return wcrtomb (s, c16, ps ?: &state); +#else + char buf[MB_LEN_MAX]; struct __gconv_step_data data; int status; size_t result; @@ -78,9 +83,9 @@ c16rtomb (char *s, char16_t c16, mbstate_t *ps) PTR_DEMANGLE (fct); #endif - /* If C16 is the NUL character we write into the output buffer the byte - sequence necessary for PS to get into the initial state, followed - by a NUL byte. */ + /* If C16 is the NUL character we write into the output buffer + the byte sequence necessary for PS to get into the initial + state, followed by a NUL byte. 
*/ if (c16 == L'\0') { status = DL_CALL_FCT (fct, (fcts->fromc16, &data, NULL, NULL, @@ -96,7 +101,8 @@ c16rtomb (char *s, char16_t c16, mbstate_t *ps) status = DL_CALL_FCT (fct, (fcts->fromc16, &data, &inbuf, - inbuf + sizeof (char16_t), NULL, &dummy, 0, 1)); + inbuf + sizeof (char16_t), NULL, &dummy, + 0, 1)); } /* There must not be any problems with the conversion but illegal input @@ -118,4 +124,5 @@ c16rtomb (char *s, char16_t c16, mbstate_t *ps) } return result; +#endif } diff --git a/wcsmbs/mbrtoc16.c b/wcsmbs/mbrtoc16.c index 7b5822d690..df970fba4f 100644 --- a/wcsmbs/mbrtoc16.c +++ b/wcsmbs/mbrtoc16.c @@ -1,6 +1,6 @@ /* Copyright (C) 2011, 2012 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Ulrich Drepper , 2011. + Contributed by Ulrich Drepper , 2011. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -43,20 +43,32 @@ static mbstate_t state; size_t mbrtoc16 (char16_t *pc16, const char *s, size_t n, mbstate_t *ps) { - char16_t buf[1]; + if (ps == NULL) + ps = &state; + + if (ps->__count & 0x80000000) + { + /* We have to return the second word for a surrogate. */ + ps->__count &= 0x7fffffff; + *pc16 = ps->__value.__wch; + ps->__value.__wch = L'\0'; + return (size_t) -3; + } + + char16_t buf[2]; struct __gconv_step_data data; int status; size_t result; size_t dummy; const unsigned char *inbuf, *endbuf; - unsigned char *outbuf = (unsigned char *) (pc16 ?: buf); + unsigned char *outbuf = (unsigned char *) buf; const struct gconv_fcts *fcts; /* Set information for this step. */ data.__invocation_counter = 0; data.__internal_use = 1; data.__flags = __GCONV_IS_LAST; - data.__statep = ps ?: &state; + data.__statep = ps; data.__trans = NULL; /* A first special case is if S is NULL. 
This means put PS in the @@ -85,9 +97,22 @@ mbrtoc16 (char16_t *pc16, const char *s, size_t n, mbstate_t *ps) if (fcts->toc16->__shlib_handle != NULL) PTR_DEMANGLE (fct); #endif + + /* We first have to check whether the character can be represented + without a surrogate. If we immediately pass in a buffer large + enough to hold two char16_t values and the first character does + not require a surrogate the routine will try to convert more + input if N is larger than needed for the first character. */ status = DL_CALL_FCT (fct, (fcts->toc16, &data, &inbuf, endbuf, NULL, &dummy, 0, 1)); + if (status == __GCONV_FULL_OUTPUT && data.__outbuf == outbuf) + { + data.__outbufend = outbuf + 2 * sizeof (char16_t); + status = DL_CALL_FCT (fct, (fcts->toc16, &data, &inbuf, endbuf, + NULL, &dummy, 0, 1)); + } + /* There must not be any problems with the conversion but illegal input characters. The output buffer must be large enough, otherwise the definition of MB_CUR_MAX is not correct. All the other possible @@ -100,15 +125,28 @@ mbrtoc16 (char16_t *pc16, const char *s, size_t n, mbstate_t *ps) if (status == __GCONV_OK || status == __GCONV_EMPTY_INPUT || status == __GCONV_FULL_OUTPUT) { - if (data.__outbuf != (unsigned char *) outbuf - && *(char16_t *) outbuf == U('\0')) + if (pc16 != NULL) + *pc16 = buf[0]; + + if (data.__outbuf != outbuf && *(char16_t *) outbuf == U('\0')) { /* The converted character is the NUL character. */ assert (__mbsinit (data.__statep)); result = 0; } else - result = inbuf - (const unsigned char *) s; + { + result = inbuf - (const unsigned char *) s; + + if (data.__outbuf != outbuf + 2) + { + /* This is a surrogate. 
*/ + assert (buf[0] >= 0xd800 && buf[0] <= 0xdfff); + assert (buf[1] >= 0xdc00 && buf[1] <= 0xdfff); + ps->__count |= 0x80000000; + ps->__value.__wch = buf[1]; + } + } } else if (status == __GCONV_INCOMPLETE_INPUT) result = (size_t) -2; diff --git a/wcsmbs/tst-c16c32-1.c b/wcsmbs/tst-c16c32-1.c new file mode 100644 index 0000000000..f4534c5d93 --- /dev/null +++ b/wcsmbs/tst-c16c32-1.c @@ -0,0 +1,131 @@ +#include <inttypes.h> +#include <locale.h> +#include <stdio.h> +#include <uchar.h> + + +static int +do_test (void) +{ + if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL) + { + puts ("cannot set locale"); + return 1; + } + + int result = 0; + + char32_t c32 = 48; + do + { + if (c32 >= 0xd800 && c32 <= 0xe000) + continue; + + char buf[20]; + size_t n1 = c32rtomb (buf, c32, NULL); + if (n1 <= 0) + { + printf ("c32rtomb for U'\\x%" PRIx32 "' failed\n", (uint32_t) c32); + result = 1; + continue; + } + + char32_t c32out; + size_t n2 = mbrtoc32 (&c32out, buf, n1, NULL); + if ((ssize_t) n2 < 0) + { + printf ("mbrtoc32 for U'\\x%" PRIx32 "' failed\n", (uint32_t) c32); + result = 1; + continue; + } + if (n2 != n1) + { + printf ("mbrtoc32 for U'\\x%" PRIx32 "' consumed %zu bytes, not %zu\n", + (uint32_t) c32, n2, n1); + result = 1; + } + else if (c32out != c32) + { + printf ("mbrtoc32 for U'\\x%" PRIx32 "' produced U'\\x%" PRIx32 "\n", + (uint32_t) c32, (uint32_t) c32out); + result = 1; + } + + char16_t c16; + size_t n3 = mbrtoc16 (&c16, buf, n1, NULL); + if (n3 != n1) + { + printf ("mbrtoc16 for U'\\x%" PRIx32 "' did not consume all bytes\n", + (uint32_t) c32); + result = 1; + continue; + } + if (c32 < 0x10000) + { + if (c16 != c32) + { + printf ("mbrtoc16 for U'\\x%" PRIx32 "' produce u'\\x%" PRIx16 "'\n", + (uint32_t) c32, (uint16_t) c16); + result = 1; + continue; + } + } + else + { + buf[0] = '1'; + char16_t c16_2; + size_t n4 = mbrtoc16 (&c16_2, buf, 1, NULL); + if (n4 != (size_t) -3) + { + printf ("second mbrtoc16 for U'\\x%" PRIx32 "' did not return -3\n", + (uint32_t) c32); + result = 1; + continue; + } + + if 
(c32 != (((uint32_t) (c16 - 0xd7c0)) << 10) + (c16_2 - 0xdc00)) + { + printf ("mbrtoc16 for U'\\x%" PRIx32 "' returns U'\\x%" PRIx32 "\n", + (uint32_t) c32, + (((uint32_t) (c16 - 0xd7c0)) << 10) + (c16_2 - 0xdc00)); + result = 1; + continue; + } + } + + buf[0] = '\0'; + char16_t c16_nul; + n3 = mbrtoc16 (&c16_nul, buf, n1, NULL); + if (n3 != 0) + { + printf ("mbrtoc16 for '\\0' returns %zd\n", n3); + result = 1; + continue; + } + + if (c32 < 0x10000) + { + size_t n5 = c16rtomb (buf, c16, NULL); + if ((ssize_t) n5 < 0) + { + printf ("c16rtomb for U'\\x%" PRIx32 "' failed with %zd\n", + (uint32_t) c32, n5); + result = 1; + continue; + } + if (n5 != n1) + { + printf ("c16rtomb for U'\\x%" PRIx32 "' produced %zu bytes instead of %zu bytes\n", + (uint32_t) c32, n5, n1); + result = 1; + continue; + } + } + } + while ((c32 += 0x1111) <= U'\x12000'); + + return result; +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/wcsmbs/wcrtomb.c b/wcsmbs/wcrtomb.c index 547b05aa9c..946fdaf47f 100644 --- a/wcsmbs/wcrtomb.c +++ b/wcsmbs/wcrtomb.c @@ -1,4 +1,5 @@ -/* Copyright (C) 1996-1998,2000,2002,2005,2011 Free Software Foundation, Inc. +/* Copyright (C) 1996-1998,2000,2002,2005,2011,2012 + Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1996. @@ -38,7 +39,7 @@ static mbstate_t state; size_t __wcrtomb (char *s, wchar_t wc, mbstate_t *ps) { - char buf[MB_CUR_MAX]; + char buf[MB_LEN_MAX]; struct __gconv_step_data data; int status; size_t result; diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c index 212a6c8135..9ce26f1dc0 100644 --- a/wcsmbs/wcsmbsload.c +++ b/wcsmbs/wcsmbsload.c @@ -1,4 +1,5 @@ -/* Copyright (C) 1998-2002,2004,2005,2008,2010,2011 Free Software Foundation, Inc. +/* Copyright (C) 1998-2002,2004,2005,2008,2010,2011,2012 + Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1998. 
@@ -74,7 +75,7 @@ static const struct __gconv_step to_c16 = .__counter = INT_MAX, .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT", .__to_name = (char *) "UTF-16//", - .__fct = __gconv_transform_ascii_utf16, + .__fct = __gconv_transform_ascii_char16, .__btowc_fct = NULL, .__init_fct = NULL, .__end_fct = NULL, @@ -93,7 +94,7 @@ static const struct __gconv_step from_c16 = .__counter = INT_MAX, .__from_name = (char *) "UTF-16//", .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT", - .__fct = __gconv_transform_utf16_ascii, + .__fct = __gconv_transform_char16_ascii, .__btowc_fct = NULL, .__init_fct = NULL, .__end_fct = NULL, @@ -209,7 +210,7 @@ __wcsmbs_load_conv (struct __locale_data *new_category) int use_translit; /* Allocate the gconv_fcts structure. */ - new_fcts = malloc (sizeof *new_fcts); + new_fcts = calloc (1, sizeof *new_fcts); if (new_fcts == NULL) goto failed; @@ -229,16 +230,24 @@ __wcsmbs_load_conv (struct __locale_data *new_category) represent all others. */ new_fcts->towc = __wcsmbs_getfct ("INTERNAL", complete_name, &new_fcts->towc_nsteps); - new_fcts->tomb = (new_fcts->towc != NULL - ? 
__wcsmbs_getfct (complete_name, "INTERNAL", - &new_fcts->tomb_nsteps) - : NULL); + if (new_fcts->towc != NULL) + new_fcts->tomb = __wcsmbs_getfct (complete_name, "INTERNAL", + &new_fcts->tomb_nsteps); - // XXX - new_fcts->toc16 = (struct __gconv_step *) &to_c16; - new_fcts->toc16_nsteps = 1; - new_fcts->fromc16 = (struct __gconv_step *) &from_c16; - new_fcts->fromc16_nsteps = 1; + if (new_fcts->tomb != NULL) + { + new_fcts->toc16 = __wcsmbs_getfct ("CHAR16", complete_name, + &new_fcts->toc16_nsteps); + + if (new_fcts->toc16 != NULL) + new_fcts->fromc16 = __wcsmbs_getfct (complete_name, "CHAR16", + &new_fcts->fromc16_nsteps); + else + { + __gconv_close_transform (new_fcts->toc16, new_fcts->toc16_nsteps); + new_fcts->toc16 = NULL; + } + } /* If any of the conversion functions is not available we don't use any since this would mean we cannot convert back and @@ -255,6 +264,12 @@ __wcsmbs_load_conv (struct __locale_data *new_category) } else { + // XXX At least for now we live with the CHAR16 not being available. + if (new_fcts->toc16 == NULL) + new_fcts->toc16 = __wcsmbs_gconv_fcts_c.toc16; + if (new_fcts->fromc16 == NULL) + new_fcts->fromc16 = __wcsmbs_gconv_fcts_c.fromc16; + new_category->private.ctype = new_fcts; new_category->private.cleanup = &_nl_cleanup_ctype; } @@ -277,11 +292,15 @@ __wcsmbs_clone_conv (struct gconv_fcts *copy) *copy = *orig; /* Now increment the usage counters. - Note: This assumes copy->towc_nsteps == 1 and copy->tomb_nsteps == 1. */ + Note: This assumes copy->*_nsteps == 1. 
*/ if (copy->towc->__shlib_handle != NULL) ++copy->towc->__counter; if (copy->tomb->__shlib_handle != NULL) ++copy->tomb->__counter; + if (copy->toc16->__shlib_handle != NULL) + ++copy->toc16->__counter; + if (copy->fromc16->__shlib_handle != NULL) + ++copy->fromc16->__counter; } @@ -296,30 +315,24 @@ __wcsmbs_named_conv (struct gconv_fcts *copy, const char *name) copy->tomb = __wcsmbs_getfct (name, "INTERNAL", &copy->tomb_nsteps); if (copy->tomb == NULL) - goto out_mb; - -#if 0 - copy->fromc16 = __wcsmbs_getfct (name, "UTF-16//", &copy->fromc16_nsteps); - if (copy->fromc16 == NULL) - goto out_fromc16; - - copy->toc16 = __wcsmbs_getfct ("UTF-16//", name, &copy->toc16_nsteps); - if (copy->toc16 == NULL) -#else - if (0) -#endif { -#if 0 - __gconv_close_transform (copy->fromc16, copy->fromc16_nsteps); - out_fromc16: - __gconv_close_transform (copy->tomb, copy->tomb_nsteps); -#endif - out_mb: __gconv_close_transform (copy->towc, copy->towc_nsteps); - out_wc: return 1; } + copy->fromc16 = __wcsmbs_getfct (name, "CHAR16", &copy->fromc16_nsteps); + if (copy->fromc16 == NULL) + copy->toc16 = NULL; + else + { + copy->toc16 = __wcsmbs_getfct ("CHAR16", name, &copy->toc16_nsteps); + if (copy->toc16 == NULL) + { + __gconv_close_transform (copy->fromc16, copy->fromc16_nsteps); + copy->fromc16 = NULL; + } + } + return 0; } @@ -335,11 +348,8 @@ _nl_cleanup_ctype (struct __locale_data *locale) /* Free the old conversions. */ __gconv_close_transform (data->tomb, data->tomb_nsteps); __gconv_close_transform (data->towc, data->towc_nsteps); -#if 0 - // XXX __gconv_close_transform (data->fromc16, data->fromc16_nsteps); - __gconv_close_transform (data->toc16, data->toc16c_nsteps); -#endif + __gconv_close_transform (data->toc16, data->toc16_nsteps); free ((char *) data); } } -- 2.43.5