This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
surrogate handling in UTF-16 and UCS-2 converters
- To: libc-alpha at sources dot redhat dot com
- Subject: surrogate handling in UTF-16 and UCS-2 converters
- From: Bruno Haible <haible at ilog dot fr>
- Date: Mon, 18 Sep 2000 15:09:31 +0200 (CEST)
- Cc: Markus Kuhn <Markus dot Kuhn at cl dot cam dot ac dot uk>
Markus Kuhn pointed out that an UTF-8 to UTF-16 conversion should reject
code points which belong to single surrogates (0xD800..0xDFFF), otherwise
we have a security problem: An attacker could provide an UTF-8 encoded
input, which passes some kind of application dependent validation, and
then gets converted to UTF-16 where neighbour surrogate codes suddenly get
a different semantics. We don't know yet which kinds of characters will
appear in Unicode planes 1 to 16 and which properties this will have.
This patch blocks this attack by rejecting single surrogates in the
INTERNAL to UTF-16 converter. Same for the INTERNAL to UCS-2 converter,
because UCS-2 strings are sometimes interpreted as UTF-16 (some applications
are currently silently switching from UCS-2 as internal representation
to UTF-16).
The patch also rejects single surrogates in the UCS-2 to INTERNAL direction,
for consistency. (If surrogates occur in UCS-2 input, it means the input is
actually UTF-16, and applications should be told about it.)
2000-09-17 Bruno Haible <haible@clisp.cons.org>
* iconvdata/utf-16.c (BODY for TO_LOOP): Reject UCS-4 input in the
range 0xD800..0xDFFF.
* iconvdata/unicode.c (BODY for TO_LOOP): Likewise.
(BODY for FROM_LOOP): Likewise.
* iconv/gconv_simple.c (ucs2_internal_loop): Likewise.
(internal_ucs2_loop): Likewise.
(ucs2reverse_internal_loop): Likewise.
(internal_ucs2reverse_loop): Likewise.
*** glibc-20000914/iconvdata/utf-16.c.bak Mon Jul 3 16:39:27 2000
--- glibc-20000914/iconvdata/utf-16.c Thu Sep 14 11:26:49 2000
***************
*** 195,200 ****
--- 195,216 ----
#define BODY \
{ \
uint32_t c = get32 (inptr); \
+ \
+ if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0)) \
+ { \
+ /* Surrogate characters in UCS-4 input are not valid. \
+ We must catch this. If we let surrogates pass through, \
+ attackers could make a security hole exploit by \
+ synthesizing any desired plane 1-16 character. */ \
+ if (! ignore_errors_p ()) \
+ { \
+ result = __GCONV_ILLEGAL_INPUT; \
+ break; \
+ } \
+ inptr += 4; \
+ ++*irreversible; \
+ continue; \
+ } \
\
if (swap) \
{ \
*** glibc-20000914/iconvdata/unicode.c.bak Fri Aug 25 23:52:57 2000
--- glibc-20000914/iconvdata/unicode.c Thu Sep 14 11:52:45 2000
***************
*** 154,159 ****
--- 154,176 ----
{ \
STANDARD_ERR_HANDLER (4); \
} \
+ else if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0)) \
+ { \
+ /* Surrogate characters in UCS-4 input are not valid. \
+ We must catch this, because the UCS-2 output might be \
+ interpreted as UTF-16 by other programs. If we let \
+ surrogates pass through, attackers could make a security \
+ hole exploit by synthesizing any desired plane 1-16 \
+ character. */ \
+ if (! ignore_errors_p ()) \
+ { \
+ result = __GCONV_ILLEGAL_INPUT; \
+ break; \
+ } \
+ inptr += 4; \
+ ++*irreversible; \
+ continue; \
+ } \
else \
{ \
put16 (outptr, c); \
***************
*** 179,189 ****
--- 196,221 ----
if (swap) \
u1 = bswap_16 (u1); \
\
+ if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0)) \
+ { \
+ /* Surrogate characters in UCS-2 input are not valid. Reject \
+ them. (Catching this here is not security relevant.) */ \
+ if (! ignore_errors_p ()) \
+ { \
+ result = __GCONV_ILLEGAL_INPUT; \
+ break; \
+ } \
+ inptr += 2; \
+ ++*irreversible; \
+ continue; \
+ } \
+ \
put32 (outptr, u1); \
\
inptr += 2; \
outptr += 4; \
}
+ #define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS \
, int swap
#include <iconv/loop.c>
*** glibc-20000914/iconv/gconv_simple.c.bak Tue Sep 5 15:24:48 2000
--- glibc-20000914/iconv/gconv_simple.c Thu Sep 14 11:57:22 2000
***************
*** 773,779 ****
} \
else \
/* It's an one byte sequence. */ \
- /* XXX unaligned. */ \
*((uint32_t *) outptr)++ = *inptr++; \
}
#define LOOP_NEED_FLAGS
--- 773,778 ----
***************
*** 797,803 ****
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
- /* XXX unaligned. */ \
if (__builtin_expect (*((uint32_t *) inptr), 0) > 0x7f) \
{ \
STANDARD_ERR_HANDLER (4); \
--- 796,801 ----
***************
*** 1147,1153 ****
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
! *((uint32_t *) outptr)++ = *((uint16_t *) inptr)++;
#include <iconv/loop.c>
#include <iconv/skeleton.c>
--- 1145,1171 ----
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
! { \
! uint16_t u1 = *((uint16_t *) inptr); \
! \
! if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0)) \
! { \
! /* Surrogate characters in UCS-2 input are not valid. Reject \
! them. (Catching this here is not security relevant.) */ \
! if (! ignore_errors_p ()) \
! { \
! result = __GCONV_ILLEGAL_INPUT; \
! break; \
! } \
! inptr += 2; \
! ++*irreversible; \
! continue; \
! } \
! \
! *((uint32_t *) outptr)++ = u1; \
! inptr += 2; \
! }
! #define LOOP_NEED_FLAGS
#include <iconv/loop.c>
#include <iconv/skeleton.c>
***************
*** 1168,1179 ****
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
! if (__builtin_expect (*((uint32_t *) inptr), 0) >= 0x10000) \
{ \
STANDARD_ERR_HANDLER (4); \
} \
else \
! *((uint16_t *) outptr)++ = *((uint32_t *) inptr)++; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>
--- 1186,1219 ----
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
! uint32_t val = *((uint32_t *) inptr); \
! \
! if (__builtin_expect (val, 0) >= 0x10000) \
{ \
STANDARD_ERR_HANDLER (4); \
} \
+ else if (__builtin_expect (val >= 0xd800 && val < 0xe000, 0)) \
+ { \
+ /* Surrogate characters in UCS-4 input are not valid. \
+ We must catch this, because the UCS-2 output might be \
+ interpreted as UTF-16 by other programs. If we let \
+ surrogates pass through, attackers could make a security \
+ hole exploit by synthesizing any desired plane 1-16 \
+ character. */ \
+ if (! ignore_errors_p ()) \
+ { \
+ result = __GCONV_ILLEGAL_INPUT; \
+ break; \
+ } \
+ inptr += 4; \
+ ++*irreversible; \
+ continue; \
+ } \
else \
! { \
! *((uint16_t *) outptr)++ = val; \
! inptr += 4; \
! } \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>
***************
*** 1195,1202 ****
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
! *((uint32_t *) outptr)++ = bswap_16 (*(uint16_t *) inptr); \
! inptr += 2;
#include <iconv/loop.c>
#include <iconv/skeleton.c>
--- 1235,1261 ----
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
! { \
! uint16_t u1 = bswap_16 (*((uint16_t *) inptr)); \
! \
! if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0)) \
! { \
! /* Surrogate characters in UCS-2 input are not valid. Reject \
! them. (Catching this here is not security relevant.) */ \
! if (! ignore_errors_p ()) \
! { \
! result = __GCONV_ILLEGAL_INPUT; \
! break; \
! } \
! inptr += 2; \
! ++*irreversible; \
! continue; \
! } \
! \
! *((uint32_t *) outptr)++ = u1; \
! inptr += 2; \
! }
! #define LOOP_NEED_FLAGS
#include <iconv/loop.c>
#include <iconv/skeleton.c>
***************
*** 1222,1229 ****
{ \
STANDARD_ERR_HANDLER (4); \
} \
! *((uint16_t *) outptr)++ = bswap_16 (val); \
! inptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>
--- 1281,1308 ----
{ \
STANDARD_ERR_HANDLER (4); \
} \
! else if (__builtin_expect (val >= 0xd800 && val < 0xe000, 0)) \
! { \
! /* Surrogate characters in UCS-4 input are not valid. \
! We must catch this, because the UCS-2 output might be \
! interpreted as UTF-16 by other programs. If we let \
! surrogates pass through, attackers could make a security \
! hole exploit by synthesizing any desired plane 1-16 \
! character. */ \
! if (! ignore_errors_p ()) \
! { \
! result = __GCONV_ILLEGAL_INPUT; \
! break; \
! } \
! inptr += 4; \
! ++*irreversible; \
! continue; \
! } \
! else \
! { \
! *((uint16_t *) outptr)++ = bswap_16 (val); \
! inptr += 4; \
! } \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>