[PATCH/RFA] Fix EUCJP multibyte/widechar conversion

Jeff Johnston jjohnstn@redhat.com
Thu Apr 9 00:23:00 GMT 2009


Ok.

-- Jeff J.

Corinna Vinschen wrote:
> Ping?
>
> On Apr  6 12:21, Corinna Vinschen wrote:
>   
>> Hi,
>>
>> while looking into a problem in the eucJP conversion under Cygwin it
>> occurred to me that the eucJP character conversion in newlib is incomplete.
>>
>> It only correctly recognizes and converts characters from the lower half
>> of JIS-X-0201 (ASCII) and the characters from JIS-X-0208, which are
>> implemented as doublebyte values {0xa1-0xfe, 0xa1-0xfe}.
>>
>> It recognizes neither characters from the upper half of JIS-X-0201
>> (Halfwidth Katakana), implemented in eucJP as doublebyte sequences
>> {0x8e, 0xa1-0xdf}, nor characters from JIS-X-0212, implemented as
>> triplebyte sequences {0x8f, 0xa1-0xfe, 0xa1-0xfe}.
>>
>> This also points to a bug in locale.c.  __mb_cur_max is set to 2 for
>> eucJP, even though eucJP contains triplebyte sequences.
>>
>> Below is a patch which implements the missing sequences in __eucjp_mbtowc
>> and __eucjp_wctomb.  It also sets __mb_cur_max to 3 in loadlocale.
>>
>> The triplebyte sequences are converted to and from widechar using a
>> trick borrowed from the implementation of the Windows codepage 20932,
>> which is Windows' eucJP implementation.  It has only one minor flaw:  It
>> is incompatible with eucJP.
>>
>> Instead of the aforementioned triplebyte sequences it has a doublebyte
>> substitute representation.  The leading 0x8f byte is skipped, the second
>> byte is taken as is, the third byte is masked with 0x7f.  This leads to
>> a well-defined doublebyte representation.  I'm using this method to de-
>> and encode the widechar value for the eucJP triplebyte sequences.
>>
>> I tested this patch by enabling these functions temporarily for Cygwin.
>>
>>
>> Corinna
>>
>>
>> 	* libc/locale/locale.c (loadlocale): Set mbc_max to 3 for EUCJP.
>> 	* libc/stdlib/mbctype.h (_iseucjp1): Like _iseucjp, but also
>> 	recognizes 0x8e and 0x8f lead bytes.
>> 	(_iseucjp2): Rename from _iseucjp.
>> 	* libc/stdlib/mbtowc_r.c (__eucjp_mbtowc): Convert JIS-X-0212
>> 	triplebyte sequences as well.
>> 	* libc/stdlib/wctomb_r.c (__eucjp_wctomb): Convert to JIS-X-0212
>> 	triplebyte sequences as well.
>>
>>
>> Index: libc/locale/locale.c
>> ===================================================================
>> RCS file: /cvs/src/src/newlib/libc/locale/locale.c,v
>> retrieving revision 1.17
>> diff -u -p -r1.17 locale.c
>> --- libc/locale/locale.c	31 Mar 2009 09:31:38 -0000	1.17
>> +++ libc/locale/locale.c	5 Apr 2009 20:52:23 -0000
>> @@ -468,7 +468,7 @@ loadlocale(struct _reent *p, int categor
>>        if (!strcmp (charset, "EUCJP") || !strcmp (charset, "eucJP"))
>>  	{
>>  	  strcpy (charset, "EUCJP");
>> -	  mbc_max = 2;
>> +	  mbc_max = 3;
>>  #ifdef _MB_CAPABLE
>>  	  l_wctomb = __eucjp_wctomb;
>>  	  l_mbtowc = __eucjp_mbtowc;
>> Index: libc/stdlib/mbctype.h
>> ===================================================================
>> RCS file: /cvs/src/src/newlib/libc/stdlib/mbctype.h,v
>> retrieving revision 1.2
>> diff -u -p -r1.2 mbctype.h
>> --- libc/stdlib/mbctype.h	17 Apr 2000 17:10:17 -0000	1.2
>> +++ libc/stdlib/mbctype.h	5 Apr 2009 20:52:23 -0000
>> @@ -14,7 +14,8 @@ int _EXFUN(_isjis, (int c));
>>  
>>  #define _issjis1(c)    (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xef))
>>  #define _issjis2(c)    (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
>> -#define _iseucjp(c)    ((c) >= 0xa1 && (c) <= 0xfe)
>> +#define _iseucjp1(c)   ((c) == 0x8e || (c) == 0x8f || ((c) >= 0xa1 && (c) <= 0xfe))
>> +#define _iseucjp2(c)   ((c) >= 0xa1 && (c) <= 0xfe)
>>  #define _isjis(c)      ((c) >= 0x21 && (c) <= 0x7e)
>>  
>>  #endif /* _MBCTYPE_H_ */
>> Index: libc/stdlib/mbtowc_r.c
>> ===================================================================
>> RCS file: /cvs/src/src/newlib/libc/stdlib/mbtowc_r.c,v
>> retrieving revision 1.12
>> diff -u -p -r1.12 mbtowc_r.c
>> --- libc/stdlib/mbtowc_r.c	24 Mar 2009 10:13:27 -0000	1.12
>> +++ libc/stdlib/mbtowc_r.c	6 Apr 2009 10:17:25 -0000
>> @@ -470,7 +470,7 @@ _DEFUN (__eucjp_mbtowc, (r, pwc, s, n, c
>>    ch = t[i++];
>>    if (state->__count == 0)
>>      {
>> -      if (_iseucjp (ch))
>> +      if (_iseucjp1 (ch))
>>  	{
>>  	  state->__value.__wchb[0] = ch;
>>  	  state->__count = 1;
>> @@ -481,9 +481,35 @@ _DEFUN (__eucjp_mbtowc, (r, pwc, s, n, c
>>      }
>>    if (state->__count == 1)
>>      {
>> -      if (_iseucjp (ch))
>> +      if (_iseucjp2 (ch))
>>  	{
>> -	  *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch;
>> +	  if (state->__value.__wchb[0] == 0x8f)
>> +	    {
>> +	      state->__value.__wchb[1] = ch;
>> +	      state->__count = 2;
>> +	      if (n <= i)
>> +		return -2;
>> +	      ch = t[i++];
>> +	    }
>> +	  else
>> +	    {
>> +	      *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch;
>> +	      state->__count = 0;
>> +	      return i;
>> +	    }
>> +	}
>> +      else
>> +	{
>> +	  r->_errno = EILSEQ;
>> +	  return -1;
>> +	}
>> +    }
>> +  if (state->__count == 2)
>> +    {
>> +      if (_iseucjp2 (ch))
>> +	{
>> +	  *pwc = (((wchar_t)state->__value.__wchb[1]) << 8)
>> +		 + (wchar_t)(ch & 0x7f);
>>  	  state->__count = 0;
>>  	  return i;
>>  	}
>> Index: libc/stdlib/wctomb_r.c
>> ===================================================================
>> RCS file: /cvs/src/src/newlib/libc/stdlib/wctomb_r.c,v
>> retrieving revision 1.13
>> diff -u -p -r1.13 wctomb_r.c
>> --- libc/stdlib/wctomb_r.c	24 Mar 2009 10:13:27 -0000	1.13
>> +++ libc/stdlib/wctomb_r.c	6 Apr 2009 10:17:26 -0000
>> @@ -195,12 +195,19 @@ _DEFUN (__eucjp_wctomb, (r, s, wchar, ch
>>    if (char1 != 0x00)
>>      {
>>      /* first byte is non-zero..validate multi-byte char */
>> -      if (_iseucjp (char1) && _iseucjp (char2)) 
>> +      if (_iseucjp1 (char1) && _iseucjp2 (char2)) 
>>  	{
>>  	  *s++ = (char)char1;
>>  	  *s = (char)char2;
>>  	  return 2;
>>  	}
>> +      else if (_iseucjp2 (char1) && _iseucjp2 (char2 | 0x80))
>> +	{
>> +	  *s++ = (char)0x8f;
>> +	  *s++ = (char)char1;
>> +	  *s = (char)(char2 | 0x80);
>> +	  return 3;
>> +	}
>>        else
>>  	{
>>  	  r->_errno = EILSEQ;
>>
>>
>> -- 
>> Corinna Vinschen
>> Cygwin Project Co-Leader
>> Red Hat
>>     
>
>   



More information about the Newlib mailing list