This is the mail archive of the
newlib@sourceware.org
mailing list for the newlib project.
Re: [PATCH] Remove erroneous Unicode conversions from _wctomb_r and mbtowc_r
- From: Corinna Vinschen <vinschen at redhat dot com>
- To: newlib at sourceware dot org
- Date: Tue, 24 Feb 2009 10:26:30 +0100
- Subject: Re: [PATCH] Remove erroneous Unicode conversions from _wctomb_r and mbtowc_r
- References: <20090217174905.GI9744@calimero.vinschen.de>
- Reply-to: newlib at sourceware dot org
Ping?
On Feb 17 18:49, Corinna Vinschen wrote:
> Hi,
>
> the conversion functions _wctomb_r and _mbtowc_r convert 5 and 6 byte
> UTF-8 sequences into a wchar_t counterpart. Conversely, wchar_t values above
> 0x10ffff are converted to 4, 5 and 6 byte UTF-8 sequences. However, per
> the Unicode standard (http://www.unicode.org/standard/standard.html),
> these values are invalid. Unicode is restricted to the value range
> 0x000000 to 0x10ffff. Any character outside this range has to be
> treated as invalid.
>
> The patch below fixes the two functions to handle only valid Unicode characters.
>
>
> Corinna
>
>
> * mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
> sequences since they are invalid in the Unicode standard.
> * wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t
> values beyond 0x10ffff into UTF-8 chars.
>
>
> Index: libc/stdlib/mbtowc_r.c
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/stdlib/mbtowc_r.c,v
> retrieving revision 1.7
> diff -u -p -r1.7 mbtowc_r.c
> --- libc/stdlib/mbtowc_r.c 23 Apr 2004 21:44:22 -0000 1.7
> +++ libc/stdlib/mbtowc_r.c 17 Feb 2009 17:48:07 -0000
> @@ -193,120 +193,6 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state)
> state->__count = 0;
> return i;
> }
> - else if (ch >= 0xf8 && ch <= 0xfb)
> - {
> - /* five-byte sequence */
> - if (sizeof(wchar_t) < 4)
> - return -1; /* we can't store such a value */
> - state->__value.__wchb[0] = ch;
> - if (state->__count == 0)
> - state->__count = 1;
> - else
> - ++n;
> - if (n < 2)
> - return -2;
> - ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
> - if (state->__value.__wchb[0] == 0xf8 && ch < 0x88)
> - /* overlong UTF-8 sequence */
> - return -1;
> - if (ch < 0x80 || ch > 0xbf)
> - return -1;
> - state->__value.__wchb[1] = ch;
> - if (state->__count == 1)
> - state->__count = 2;
> - else
> - ++n;
> - if (n < 3)
> - return -2;
> - ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
> - if (ch < 0x80 || ch > 0xbf)
> - return -1;
> - state->__value.__wchb[2] = ch;
> - if (state->__count == 2)
> - state->__count = 3;
> - else
> - ++n;
> - if (n < 4)
> - return -2;
> - ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
> - if (ch < 0x80 || ch > 0xbf)
> - return -1;
> - state->__value.__wchb[3] = ch;
> - state->__count = 4;
> - if (n < 5)
> - return -2;
> - ch = t[i++];
> - *pwc = (wchar_t)((state->__value.__wchb[0] & 0x03) << 24)
> - | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 18)
> - | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 12)
> - | (wchar_t)((state->__value.__wchb[3] & 0x3f) << 6)
> - | (wchar_t)(ch & 0x3f);
> -
> - state->__count = 0;
> - return i;
> - }
> - else if (ch >= 0xfc && ch <= 0xfd)
> - {
> - /* six-byte sequence */
> - int ch2;
> - if (sizeof(wchar_t) < 4)
> - return -1; /* we can't store such a value */
> - state->__value.__wchb[0] = ch;
> - if (state->__count == 0)
> - state->__count = 1;
> - else
> - ++n;
> - if (n < 2)
> - return -2;
> - ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
> - if (state->__value.__wchb[0] == 0xfc && ch < 0x84)
> - /* overlong UTF-8 sequence */
> - return -1;
> - if (ch < 0x80 || ch > 0xbf)
> - return -1;
> - state->__value.__wchb[1] = ch;
> - if (state->__count == 1)
> - state->__count = 2;
> - else
> - ++n;
> - if (n < 3)
> - return -2;
> - ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
> - if (ch < 0x80 || ch > 0xbf)
> - return -1;
> - state->__value.__wchb[2] = ch;
> - if (state->__count == 2)
> - state->__count = 3;
> - else
> - ++n;
> - if (n < 4)
> - return -2;
> - ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
> - if (ch < 0x80 || ch > 0xbf)
> - return -1;
> - state->__value.__wchb[3] = ch;
> - if (state->__count == 3)
> - state->__count = 4;
> - else
> - ++n;
> - if (n < 5)
> - return -2;
> - if (n == 5)
> - return -1; /* at this point we can't save enough to restart */
> - ch = t[i++];
> - if (ch < 0x80 || ch > 0xbf)
> - return -1;
> - ch2 = t[i++];
> - *pwc = (wchar_t)((state->__value.__wchb[0] & 0x01) << 30)
> - | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 24)
> - | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 18)
> - | (wchar_t)((state->__value.__wchb[3] & 0x3f) << 12)
> - | (wchar_t)((ch & 0x3f) << 6)
> - | (wchar_t)(ch2 & 0x3f);
> -
> - state->__count = 0;
> - return i;
> - }
> else
> return -1;
> }
> Index: libc/stdlib/wctomb_r.c
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/stdlib/wctomb_r.c,v
> retrieving revision 1.7
> diff -u -p -r1.7 wctomb_r.c
> --- libc/stdlib/wctomb_r.c 16 May 2007 19:31:06 -0000 1.7
> +++ libc/stdlib/wctomb_r.c 17 Feb 2009 17:48:07 -0000
> @@ -50,7 +50,7 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
> *s = 0x80 | (wchar & 0x3f);
> return 3;
> }
> - else if (wchar >= 0x10000 && wchar <= 0x1fffff)
> + else if (wchar >= 0x10000 && wchar <= 0x10ffff)
> {
> *s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
> *s++ = 0x80 | ((wchar & 0x3f000) >> 12);
> @@ -58,25 +58,6 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
> *s = 0x80 | (wchar & 0x3f);
> return 4;
> }
> - else if (wchar >= 0x200000 && wchar <= 0x3ffffff)
> - {
> - *s++ = 0xf8 | ((wchar & 0x3000000) >> 24);
> - *s++ = 0x80 | ((wchar & 0xfc0000) >> 18);
> - *s++ = 0x80 | ((wchar & 0x3f000) >> 12);
> - *s++ = 0x80 | ((wchar & 0xfc0) >> 6);
> - *s = 0x80 | (wchar & 0x3f);
> - return 5;
> - }
> - else if (wchar >= 0x4000000 && wchar <= 0x7fffffff)
> - {
> - *s++ = 0xfc | ((wchar & 0x40000000) >> 30);
> - *s++ = 0x80 | ((wchar & 0x3f000000) >> 24);
> - *s++ = 0x80 | ((wchar & 0xfc0000) >> 18);
> - *s++ = 0x80 | ((wchar & 0x3f000) >> 12);
> - *s++ = 0x80 | ((wchar & 0xfc0) >> 6);
> - *s = 0x80 | (wchar & 0x3f);
> - return 6;
> - }
> else
> return -1;
> }
>
>
> --
> Corinna Vinschen
> Cygwin Project Co-Leader
> Red Hat
--
Corinna Vinschen
Cygwin Project Co-Leader
Red Hat