[PATCH] Fix UTF-16 surrogate handling in __utf8_mbtowc

Tue Jul 28 21:32:00 GMT 2009

On Jul 28 14:56, Jeff Johnston wrote:
> Please go ahead.

Thanks, will do.  But here's a question:

>> +	  tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
>> +	    |   (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
>> +	    |   (wint_t)((state->__value.__wchb[2] & 0x3f) << 6);
>> +	  tmp = (tmp - 0x10000) >> 10;
>> +	  /* Check if the sequence can fit into a surrogate pair at all.
>> +	     If tmp is > 0x3ff at this point, the full Unicode value will
>> +	     be > 0x10ffff.  This is an invalid Unicode value and outside
>> +	     of the defintion of UTF-16 surrogate pairs. */
>> +	  if (tmp > 0x3ff)
>> +	    {
>> +	      r->_errno = EILSEQ;
>> +	      return -1;
>> +	    }

This code tests if the wide char value is within the boundaries of the
Unicode specification, because it's impossible to represent bigger
values (> 0x10ffff) as UTF-16 surrogate pair.

However, the surrounding code still allows values > 0x10ffff for UTF-32
systems.  So on these systems the code will happily allow and generate
non-Unicode values.

The question is, shouldn't the code be changed to disallow values beyond
0x10ffff on all systems, rather than just checking it in the UTF-16
case?

New patch attached, which checks for values beyond the Unicode range
generically.  Whichever you prefer.


Corinna


        * libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Rework UTF-16 surrogate
	pair handling to be more bullet-proof even with incomplete UTF-8
	sequences.  Add check for 4 byte sequences resulting in values
	outside the valid Unicode range.


Index: libc/stdlib/mbtowc_r.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/stdlib/mbtowc_r.c,v
retrieving revision 1.14
diff -u -p -r1.14 mbtowc_r.c

--- libc/stdlib/mbtowc_r.c	28 Jul 2009 16:49:19 -0000	1.14
+++ libc/stdlib/mbtowc_r.c	28 Jul 2009 19:56:07 -0000
@@ -205,18 +205,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
   if (n == 0)
     return -2;
 
-  if (state->__count == 4)
-    {
-      /* Create the second half of the surrogate pair.  For a description
-	 see the comment below. */
-      wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
-	|   (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
-	|   (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
-	|   (wchar_t)(state->__value.__wchb[3] & 0x3f);
-      state->__count = 0;
-      *pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff);
-      return 2;
-    }
   if (state->__count == 0)
     ch = t[i++];
   else
@@ -312,7 +300,7 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
       *pwc = tmp;
       return i;
     }
-  if (ch >= 0xf0 && ch <= 0xf7)
+  if (ch >= 0xf0 && ch <= 0xf4)
     {
       /* four-byte sequence */
       wint_t tmp;
@@ -324,9 +312,10 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
       if (n < 2)
 	return -2;
       ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
-      if (state->__value.__wchb[0] == 0xf0 && ch < 0x90)
+      if ((state->__value.__wchb[0] == 0xf0 && ch < 0x90)
+	  || (state->__value.__wchb[0] == 0xf4 && ch >= 0x90))
 	{
-	  /* overlong UTF-8 sequence */
+	  /* overlong UTF-8 sequence or result is > 0x10ffff */
 	  r->_errno = EILSEQ;
 	  return -1;
 	}
@@ -353,6 +342,26 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
 	state->__count = 3;
       else if (n < (size_t)-1)
 	++n;
+      if (state->__count == 3 && sizeof(wchar_t) == 2)
+	{
+	  /* On systems which have wchar_t being UTF-16 values, the value
+	     doesn't fit into a single wchar_t in this case.  So what we
+	     do here is to store the state with a special value of __count
+	     and return the first half of a surrogate pair.  The first
+	     three bytes of a UTF-8 sequence are enough to generate the
+	     first half of a UTF-16 surrogate pair.  As return value we
+	     choose to return the number of bytes actually read up to
+	     here.
+	     The second half of the surrogate pair is returned in case we
+	     recognize the special __count value of four, and the next
+	     byte is actually a valid value.  See below. */
+	  tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
+	    |   (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
+	    |   (wint_t)((state->__value.__wchb[2] & 0x3f) << 6);
+	  state->__count = 4;
+	  *pwc = 0xd800 | ((tmp - 0x10000) >> 10);
+	  return i;
+	}
       if (n < 4)
 	return -2;
       ch = t[i++];
@@ -365,21 +374,12 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
 	|   (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
 	|   (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
 	|   (wint_t)(ch & 0x3f);
-      if (tmp > 0xffff && sizeof(wchar_t) == 2)
-	{
-	  /* On systems which have wchar_t being UTF-16 values, the value
-	     doesn't fit into a single wchar_t in this case.  So what we
-	     do here is to store the state with a special value of __count
-	     and return the first half of a surrogate pair.  As return
-	     value we choose to return the half of the actual UTF-8 char.
-	     The second half is returned in case we recognize the special
-	     __count value above. */
-	  state->__value.__wchb[3] = ch;
-	  state->__count = 4;
-	  *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff);
-	  return 2;
-	}
-      *pwc = tmp;
+      if (state->__count == 4 && sizeof(wchar_t) == 2)
+	/* Create the second half of the surrogate pair for systems with
+	   wchar_t == UTF-16 . */
+	*pwc = 0xdc00 | (tmp & 0x3ff);
+      else
+	*pwc = tmp;
       state->__count = 0;
       return i;
     }


-- 
Corinna Vinschen
Cygwin Project Co-Leader
Red Hat