This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

bug fix for EUC-JISX0213 iconv converter


The EUC-JISX0213 converter triggers a failed assertion in iconv/skeleton.c.
The situation is as follows: iconv is converting from EUC-JISX0213 to UTF-8.
In the EUC-JISX0213 to UCS-4 direction, when a sequence of two Unicode
characters must be generated but the destination buffer has room for only one
UTF-8 character, the converter emits either both characters or neither.  The
backtracking code in skeleton.c, however, relies on the converter being able
to produce exactly one Unicode character - not zero, not two.

Here is a fix. I'm submitting a testcase two mails after this one.


2002-09-22  Bruno Haible  <bruno@clisp.org>

	Revert 2002-04-18 patch.
	* iconvdata/euc-jisx0213.c (EMIT_SHIFT_TO_INIT, BODY for
	FROM_DIRECTION): Make the FROM direction stateful again.
	* iconvdata/shift_jisx0213.c (EMIT_SHIFT_TO_INIT, BODY for
	FROM_DIRECTION): Likewise.

--- glibc-20020828/iconvdata/euc-jisx0213.c.bak	Mon Jul  1 13:29:11 2002
+++ glibc-20020828/iconvdata/euc-jisx0213.c	Sun Sep 22 21:01:39 2002
@@ -67,7 +67,9 @@
     *statep = saved_state
 
 
-/* During UCS-4 to EUC-JISX0213 conversion, the COUNT element of the state
+/* During EUC-JISX0213 to UCS-4 conversion, the COUNT element of the state
+   contains the last UCS-4 character, shifted by 3 bits.
+   During UCS-4 to EUC-JISX0213 conversion, the COUNT element of the state
    contains the last two bytes to be output, shifted by 3 bits.  */
 
 /* Since this is a stateful encoding we have to provide code which resets
@@ -77,8 +79,17 @@
   if (data->__statep->__count != 0)					      \
     {									      \
       if (FROM_DIRECTION)						      \
-	/* We don't use shift states in the FROM_DIRECTION.  */		      \
-	data->__statep->__count = 0;					      \
+	{								      \
+	  if (__builtin_expect (outbuf + 4 <= outend, 1))		      \
+	    {								      \
+	      /* Write out the last character.  */			      \
+	      *((uint32_t *) outbuf)++ = data->__statep->__count >> 3;	      \
+	      data->__statep->__count = 0;				      \
+	    }								      \
+	  else								      \
+	    /* We don't have enough room in the output buffer.  */	      \
+	    status = __GCONV_FULL_OUTPUT;				      \
+	}								      \
       else								      \
 	{								      \
 	  if (__builtin_expect (outbuf + 2 <= outend, 1))		      \
@@ -104,104 +115,114 @@
 #define LOOPFCT			FROM_LOOP
 #define BODY \
   {									      \
-    uint32_t ch = *inptr;						      \
+    uint32_t ch;							      \
 									      \
-    if (ch < 0x80)							      \
-      /* Plain ASCII character.  */					      \
-      ++inptr;								      \
-    else if ((ch >= 0xa1 && ch <= 0xfe) || ch == 0x8e || ch == 0x8f)	      \
+    /* Determine whether there is a buffered character pending.  */	      \
+    ch = *statep >> 3;							      \
+    if (__builtin_expect (ch == 0, 1))					      \
       {									      \
-	/* Two or three byte character.  */				      \
-	uint32_t ch2;							      \
+	/* No - so look at the next input byte.  */			      \
+	ch = *inptr;							      \
 									      \
-	if (__builtin_expect (inptr + 1 >= inend, 0))			      \
+	if (ch < 0x80)							      \
+	  /* Plain ASCII character.  */					      \
+	  ++inptr;							      \
+	else if ((ch >= 0xa1 && ch <= 0xfe) || ch == 0x8e || ch == 0x8f)      \
 	  {								      \
-	    /* The second byte is not available.  */			      \
-	    result = __GCONV_INCOMPLETE_INPUT;				      \
-	    break;							      \
-	  }								      \
+	    /* Two or three byte character.  */				      \
+	    uint32_t ch2;						      \
 									      \
-	ch2 = inptr[1];							      \
+	    if (__builtin_expect (inptr + 1 >= inend, 0))		      \
+	      {								      \
+		/* The second byte is not available.  */		      \
+		result = __GCONV_INCOMPLETE_INPUT;			      \
+		break;							      \
+	      }								      \
 									      \
-	/* The second byte must be >= 0xa1 and <= 0xfe.  */		      \
-	if (__builtin_expect (ch2 < 0xa1 || ch2 > 0xfe, 0))		      \
-	  {								      \
-	    /* This is an illegal character.  */			      \
-	    STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
-	  }								      \
+	    ch2 = inptr[1];						      \
 									      \
-	if (ch == 0x8e)							      \
-	  {								      \
-	    /* Half-width katakana.  */					      \
-	    if (__builtin_expect (ch2 > 0xdf, 0))			      \
-	      STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
+	    /* The second byte must be >= 0xa1 and <= 0xfe.  */		      \
+	    if (__builtin_expect (ch2 < 0xa1 || ch2 > 0xfe, 0))		      \
+	      {								      \
+		/* This is an illegal character.  */			      \
+		STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
+	      }								      \
 									      \
-	    ch = ch2 + 0xfec0;						      \
-	    inptr += 2;							      \
-	  }								      \
-	else								      \
-	  {								      \
-	    const unsigned char *endp;					      \
+	    if (ch == 0x8e)						      \
+	      {								      \
+		/* Half-width katakana.  */				      \
+		if (__builtin_expect (ch2 > 0xdf, 0))			      \
+		  STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
 									      \
-	    if (ch == 0x8f)						      \
+		ch = ch2 + 0xfec0;					      \
+		inptr += 2;						      \
+	      }								      \
+	    else							      \
 	      {								      \
-		/* JISX 0213 plane 2.  */				      \
-		uint32_t ch3;						      \
+		const unsigned char *endp;				      \
 									      \
-		if (__builtin_expect (inptr + 2 >= inend, 0))		      \
+		if (ch == 0x8f)						      \
 		  {							      \
-		    /* The third byte is not available.  */		      \
-		    result = __GCONV_INCOMPLETE_INPUT;			      \
-		    break;						      \
-		  }							      \
+		    /* JISX 0213 plane 2.  */				      \
+		    uint32_t ch3;					      \
 									      \
-		ch3 = inptr[2];						      \
-		endp = inptr + 3;					      \
+		    if (__builtin_expect (inptr + 2 >= inend, 0))	      \
+		      {							      \
+			/* The third byte is not available.  */		      \
+			result = __GCONV_INCOMPLETE_INPUT;		      \
+			break;						      \
+		      }							      \
 									      \
-		ch = jisx0213_to_ucs4 (0x200 - 0x80 + ch2, ch3 ^ 0x80);	      \
-	      }								      \
-	    else							      \
-	      {								      \
-		/* JISX 0213 plane 1.  */				      \
-		endp = inptr + 2;					      \
+		    ch3 = inptr[2];					      \
+		    endp = inptr + 3;					      \
 									      \
-		ch = jisx0213_to_ucs4 (0x100 - 0x80 + ch, ch2 ^ 0x80);	      \
-	      }								      \
+		    ch = jisx0213_to_ucs4 (0x200 - 0x80 + ch2, ch3 ^ 0x80);   \
+		  }							      \
+		else							      \
+		  {							      \
+		    /* JISX 0213 plane 1.  */				      \
+		    endp = inptr + 2;					      \
+									      \
+		    ch = jisx0213_to_ucs4 (0x100 - 0x80 + ch, ch2 ^ 0x80);    \
+		  }							      \
 									      \
-	    if (ch == 0)						      \
-	      /* This is an illegal character.  */			      \
-	      STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
+		if (ch == 0)						      \
+		  /* This is an illegal character.  */			      \
+		  STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
 									      \
-	    if (ch < 0x80)						      \
-	      {								      \
-		/* It's a combining character.  */			      \
-		uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];	      \
-		uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];	      \
+		inptr = endp;						      \
 									      \
-		/* See whether we have room for two characters.  */	      \
-		if (outptr + 8 <= outend)				      \
+		if (ch < 0x80)						      \
 		  {							      \
-		    inptr = endp;					      \
+		    /* It's a combining character.  */			      \
+		    uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];     \
+		    uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];     \
+									      \
 		    put32 (outptr, u1);					      \
 		    outptr += 4;					      \
-		    put32 (outptr, u2);					      \
-		    outptr += 4;					      \
-		    continue;						      \
-		  }							      \
-		else							      \
-		  {							      \
+									      \
+		    /* See whether we have room for the second character.  */ \
+		    if (outptr + 4 <= outend)				      \
+		      {							      \
+			put32 (outptr, u2);				      \
+			outptr += 4;					      \
+			continue;					      \
+		      }							      \
+									      \
+		    /* Otherwise store only the first character now, and      \
+		       put the second one into the queue.  */		      \
+		    *statep = u2 << 3;					      \
+		    /* Tell the caller why we terminate the loop.  */	      \
 		    result = __GCONV_FULL_OUTPUT;			      \
 		    break;						      \
 		  }							      \
 	      }								      \
-									      \
-	    inptr = endp;						      \
 	  }								      \
-      }									      \
-    else								      \
-      {									      \
-	/* This is illegal.  */						      \
-	STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
+	else								      \
+	  {								      \
+	    /* This is illegal.  */					      \
+	    STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
+	  }								      \
       }									      \
 									      \
     put32 (outptr, ch);							      \
--- glibc-20020828/iconvdata/shift_jisx0213.c.bak	Mon Jul  1 13:29:14 2002
+++ glibc-20020828/iconvdata/shift_jisx0213.c	Sun Sep 22 21:00:56 2002
@@ -67,7 +67,9 @@
     *statep = saved_state
 
 
-/* During UCS-4 to Shift_JISX0213 conversion, the COUNT element of the state
+/* During Shift_JISX0213 to UCS-4 conversion, the COUNT element of the state
+   contains the last UCS-4 character, shifted by 3 bits.
+   During UCS-4 to Shift_JISX0213 conversion, the COUNT element of the state
    contains the last two bytes to be output, shifted by 3 bits.  */
 
 /* Since this is a stateful encoding we have to provide code which resets
@@ -77,8 +79,17 @@
   if (data->__statep->__count != 0)					      \
     {									      \
       if (FROM_DIRECTION)						      \
-	/* We don't use shift states in the FROM_DIRECTION.  */		      \
-	data->__statep->__count = 0;					      \
+	{								      \
+	  if (__builtin_expect (outbuf + 4 <= outend, 1))		      \
+	    {								      \
+	      /* Write out the last character.  */			      \
+	      *((uint32_t *) outbuf)++ = data->__statep->__count >> 3;	      \
+	      data->__statep->__count = 0;				      \
+	    }								      \
+	  else								      \
+	    /* We don't have enough room in the output buffer.  */	      \
+	    status = __GCONV_FULL_OUTPUT;				      \
+	}								      \
       else								      \
 	{								      \
 	  if (__builtin_expect (outbuf + 2 <= outend, 1))		      \
@@ -104,106 +115,116 @@
 #define LOOPFCT			FROM_LOOP
 #define BODY \
   {									      \
-    uint32_t ch = *inptr;						      \
+    uint32_t ch;							      \
 									      \
-    if (ch < 0x80)							      \
-      {									      \
-	/* Plain ISO646-JP character.  */				      \
-	if (__builtin_expect (ch == 0x5c, 0))				      \
-	  ch = 0xa5;							      \
-	else if (__builtin_expect (ch == 0x7e, 0))			      \
-	  ch = 0x203e;							      \
-	++inptr;							      \
-      }									      \
-    else if (ch >= 0xa1 && ch <= 0xdf)					      \
+    /* Determine whether there is a buffered character pending.  */	      \
+    ch = *statep >> 3;							      \
+    if (__builtin_expect (ch == 0, 1))					      \
       {									      \
-	/* Half-width katakana.  */					      \
-	ch += 0xfec0;							      \
-	++inptr;							      \
-      }									      \
-    else if ((ch >= 0x81 && ch <= 0x9f) || (ch >= 0xe0 && ch <= 0xfc))	      \
-      {									      \
-	/* Two byte character.  */					      \
-	uint32_t ch2;							      \
+	/* No - so look at the next input byte.  */			      \
+	ch = *inptr;							      \
 									      \
-	if (__builtin_expect (inptr + 1 >= inend, 0))			      \
+	if (ch < 0x80)							      \
 	  {								      \
-	    /* The second byte is not available.  */			      \
-	    result = __GCONV_INCOMPLETE_INPUT;				      \
-	    break;							      \
+	    /* Plain ISO646-JP character.  */				      \
+	    if (__builtin_expect (ch == 0x5c, 0))			      \
+	      ch = 0xa5;						      \
+	    else if (__builtin_expect (ch == 0x7e, 0))			      \
+	      ch = 0x203e;						      \
+	    ++inptr;							      \
 	  }								      \
-									      \
-	ch2 = inptr[1];							      \
-									      \
-	/* The second byte must be in the range 0x{40..7E,80..FC}.  */	      \
-	if (__builtin_expect (ch2 < 0x40 || ch2 == 0x7f || ch2 > 0xfc, 0))    \
+	else if (ch >= 0xa1 && ch <= 0xdf)				      \
 	  {								      \
-	    /* This is an illegal character.  */			      \
-	    STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
+	    /* Half-width katakana.  */					      \
+	    ch += 0xfec0;						      \
+	    ++inptr;							      \
 	  }								      \
-									      \
-	/* Convert to row and column.  */				      \
-	if (ch < 0xe0)							      \
-	  ch -= 0x81;							      \
-	else								      \
-	  ch -= 0xc1;							      \
-	if (ch2 < 0x80)							      \
-	  ch2 -= 0x40;							      \
-	else								      \
-	  ch2 -= 0x41;							      \
-	/* Now 0 <= ch <= 0x3b, 0 <= ch2 <= 0xbb.  */			      \
-	ch = 2 * ch;							      \
-	if (ch2 >= 0x5e)						      \
-	  ch2 -= 0x5e, ch++;						      \
-	ch2 += 0x21;							      \
-	if (ch >= 0x5e)							      \
+	else if ((ch >= 0x81 && ch <= 0x9f) || (ch >= 0xe0 && ch <= 0xfc))    \
 	  {								      \
-	    /* Handling of JISX 0213 plane 2 rows.  */			      \
-	    if (ch >= 0x67)						      \
-	      ch += 230;						      \
-	    else if (ch >= 0x63 || ch == 0x5f)				      \
-	      ch += 168;						      \
-	    else 							      \
-	      ch += 162;						      \
-	  }								      \
+	    /* Two byte character.  */					      \
+	    uint32_t ch2;						      \
 									      \
-	ch = jisx0213_to_ucs4 (0x121 + ch, ch2);			      \
-									      \
-	if (ch == 0)							      \
-	  {								      \
-	    /* This is an illegal character.  */			      \
-	    STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
-	  }								      \
+	    if (__builtin_expect (inptr + 1 >= inend, 0))		      \
+	      {								      \
+		/* The second byte is not available.  */		      \
+		result = __GCONV_INCOMPLETE_INPUT;			      \
+		break;							      \
+	      }								      \
 									      \
-	if (ch < 0x80)							      \
-	  {								      \
-	    /* It's a combining character.  */				      \
-	    uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];	      \
-	    uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];	      \
+	    ch2 = inptr[1];						      \
 									      \
-	    /* See whether we have room for two characters.  */		      \
-	    if (outptr + 8 <= outend)					      \
+	    /* The second byte must be in the range 0x{40..7E,80..FC}.  */    \
+	    if (__builtin_expect (ch2 < 0x40 || ch2 == 0x7f || ch2 > 0xfc, 0))\
 	      {								      \
-		inptr += 2;						      \
-		put32 (outptr, u1);					      \
-		outptr += 4;						      \
-		put32 (outptr, u2);					      \
-		outptr += 4;						      \
-		continue;						      \
+		/* This is an illegal character.  */			      \
+		STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
 	      }								      \
+									      \
+	    /* Convert to row and column.  */				      \
+	    if (ch < 0xe0)						      \
+	      ch -= 0x81;						      \
 	    else							      \
+	      ch -= 0xc1;						      \
+	    if (ch2 < 0x80)						      \
+	      ch2 -= 0x40;						      \
+	    else							      \
+	      ch2 -= 0x41;						      \
+	    /* Now 0 <= ch <= 0x3b, 0 <= ch2 <= 0xbb.  */		      \
+	    ch = 2 * ch;						      \
+	    if (ch2 >= 0x5e)						      \
+	      ch2 -= 0x5e, ch++;					      \
+	    ch2 += 0x21;						      \
+	    if (ch >= 0x5e)						      \
 	      {								      \
+		/* Handling of JISX 0213 plane 2 rows.  */		      \
+		if (ch >= 0x67)						      \
+		  ch += 230;						      \
+		else if (ch >= 0x63 || ch == 0x5f)			      \
+		  ch += 168;						      \
+		else 							      \
+		  ch += 162;						      \
+	      }								      \
+									      \
+	    ch = jisx0213_to_ucs4 (0x121 + ch, ch2);			      \
+									      \
+	    if (ch == 0)						      \
+	      {								      \
+		/* This is an illegal character.  */			      \
+		STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
+	      }								      \
+									      \
+	    inptr += 2;							      \
+									      \
+	    if (ch < 0x80)						      \
+	      {								      \
+		/* It's a combining character.  */			      \
+		uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];	      \
+		uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];	      \
+									      \
+		put32 (outptr, u1);					      \
+		outptr += 4;						      \
+									      \
+		/* See whether we have room for the second character.  */     \
+		if (outptr + 4 <= outend)				      \
+		  {							      \
+		    put32 (outptr, u2);					      \
+		    outptr += 4;					      \
+		    continue;						      \
+		  }							      \
+									      \
+		/* Otherwise store only the first character now, and	      \
+		   put the second one into the queue.  */		      \
+		*statep = u2 << 3;					      \
+		/* Tell the caller why we terminate the loop.  */	      \
 		result = __GCONV_FULL_OUTPUT;				      \
 		break;							      \
 	      }								      \
 	  }								      \
-									      \
-	inptr += 2;							      \
-      }									      \
-    else								      \
-      {									      \
-	/* This is illegal.  */						      \
-	STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
+	else								      \
+	  {								      \
+	    /* This is illegal.  */					      \
+	    STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
+	  }								      \
       }									      \
 									      \
     put32 (outptr, ch);							      \


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]