This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

two UTF-16 decoder bugs



Bug 1:
The UTF-16 decoder does not correctly decode input which starts with a
byte-order mark in the opposite endianness.

Example: On a little-endian machine,

$ echo abc | /usr/bin/iconv -t UTF-16 | \
  /usr/bin/iconv -f UTF-16LE -t UTF-16BE | \
  /usr/bin/iconv -f UTF-16 -t UTF-8

should print abc, but prints a few ideographs instead.

The reason is that the local 'swap' variable is not updated after the
byte-order mark has been examined; swapping is therefore only enabled from
the second call onward (for the second 8 KB buffer).

Bug 2:
Conversion from UTF-16 converts the BOM instead of dropping it.

Example:

$ echo abc | /usr/bin/iconv -t UTF-16 | \
  /usr/bin/iconv -f UTF-16 -t UTF-8 | \
  wc -c

should print 4, but prints 7.

The reason is that 'inptr' is incremented but then ignored, because it
is initialized from '*inptrp' at the beginning of the loop.

This second bug also exists for the UNICODE encoding.

Here is a fix for both.


2001-04-11  Bruno Haible  <haible@clisp.cons.org>

	* iconvdata/utf-16.c (PREPARE_LOOP): Initialize 'swap' after possibly
	changing it in the state. After incrementing 'inptr', store it back.
	* iconvdata/unicode.c (PREPARE_LOOP): After incrementing 'inptr',
	store it back.

*** glibc-20010315/iconvdata/utf-16.c.bak	Mon Dec  4 19:53:44 2000
--- glibc-20010315/iconvdata/utf-16.c	Thu Apr 12 02:25:05 2001
***************
*** 44,50 ****
  #define PREPARE_LOOP \
    enum direction dir = ((struct utf16_data *) step->__data)->dir;	      \
    enum variant var = ((struct utf16_data *) step->__data)->var;		      \
!   int swap = ((struct utf16_data *) step->__data)->swap;		      \
    if (FROM_DIRECTION && var == UTF_16)					      \
      {									      \
        if (data->__invocation_counter == 0)				      \
--- 44,50 ----
  #define PREPARE_LOOP \
    enum direction dir = ((struct utf16_data *) step->__data)->dir;	      \
    enum variant var = ((struct utf16_data *) step->__data)->var;		      \
!   int swap;								      \
    if (FROM_DIRECTION && var == UTF_16)					      \
      {									      \
        if (data->__invocation_counter == 0)				      \
***************
*** 55,65 ****
  									      \
  	  if (get16u (inptr) == BOM)					      \
  	    /* Simply ignore the BOM character.  */			      \
! 	    inptr += 2;							      \
  	  else if (get16u (inptr) == BOM_OE)				      \
  	    {								      \
  	      ((struct utf16_data *) step->__data)->swap = 1;		      \
! 	      inptr += 2;						      \
  	    }								      \
  	}								      \
      }									      \
--- 55,65 ----
  									      \
  	  if (get16u (inptr) == BOM)					      \
  	    /* Simply ignore the BOM character.  */			      \
! 	    *inptrp = inptr += 2;					      \
  	  else if (get16u (inptr) == BOM_OE)				      \
  	    {								      \
  	      ((struct utf16_data *) step->__data)->swap = 1;		      \
! 	      *inptrp = inptr += 2;					      \
  	    }								      \
  	}								      \
      }									      \
***************
*** 72,78 ****
  									      \
        put16u (outbuf, BOM);						      \
        outbuf += 2;							      \
!     }
  #define EXTRA_LOOP_ARGS		, var, swap
  
  
--- 72,79 ----
  									      \
        put16u (outbuf, BOM);						      \
        outbuf += 2;							      \
!     }									      \
!   swap = ((struct utf16_data *) step->__data)->swap;
  #define EXTRA_LOOP_ARGS		, var, swap
  
  
*** glibc-20010315/iconvdata/unicode.c.bak	Mon Dec  4 19:53:44 2000
--- glibc-20010315/iconvdata/unicode.c	Thu Apr 12 01:29:50 2001
***************
*** 53,63 ****
  									      \
  	  if (get16u (inptr) == BOM)					      \
  	    /* Simply ignore the BOM character.  */			      \
! 	    inptr += 2;							      \
  	  else if (get16u (inptr) == BOM_OE)				      \
  	    {								      \
  	      ((struct unicode_data *) step->__data)->swap = 1;		      \
! 	      inptr += 2;						      \
  	    }								      \
  	}								      \
      }									      \
--- 53,63 ----
  									      \
  	  if (get16u (inptr) == BOM)					      \
  	    /* Simply ignore the BOM character.  */			      \
! 	    *inptrp = inptr += 2;					      \
  	  else if (get16u (inptr) == BOM_OE)				      \
  	    {								      \
  	      ((struct unicode_data *) step->__data)->swap = 1;		      \
! 	      *inptrp = inptr += 2;					      \
  	    }								      \
  	}								      \
      }									      \


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]