This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
UTF-8 decoder simplification
- To: libc-alpha at sources dot redhat dot com
- Subject: UTF-8 decoder simplification
- From: Bruno Haible <haible at ilog dot fr>
- Date: Mon, 4 Sep 2000 14:54:01 +0200 (CEST)
Fewer instructions + fewer memory accesses = more speed.
2000-09-03 Bruno Haible <haible@clisp.cons.org>
* iconv/gconv_simple.c (encoding_mask, encoding_byte): Remove.
(__gconv_transform_internal_utf8) [BODY]: Use simple shifts instead.
*** glibc-20000831/iconv/gconv_simple.c.bak Mon Aug 28 13:34:25 2000
--- glibc-20000831/iconv/gconv_simple.c Sat Sep 2 17:47:00 2000
***************
*** 34,52 ****
#endif
- /* These are definitions used by some of the functions for handling
- UTF-8 encoding below. */
- static const uint32_t encoding_mask[] =
- {
- ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
- };
-
- static const unsigned char encoding_byte[] =
- {
- 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
- };
-
-
/* Transform from the internal, UCS4-like format, to UCS4. The
difference between the internal ucs4 format and the real UCS4
format is, if any, the endianess. The Unicode/ISO 10646 says that
--- 34,39 ----
***************
*** 856,862 ****
char *start; \
\
for (step = 2; step < 6; ++step) \
! if ((wc & encoding_mask[step - 2]) == 0) \
break; \
\
if (__builtin_expect (outptr + step > outend, 0)) \
--- 843,849 ----
char *start; \
\
for (step = 2; step < 6; ++step) \
! if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \
break; \
\
if (__builtin_expect (outptr + step > outend, 0)) \
***************
*** 867,873 ****
} \
\
start = outptr; \
! *outptr = encoding_byte[step - 2]; \
outptr += step; \
--step; \
do \
--- 854,860 ----
} \
\
start = outptr; \
! *outptr = (unsigned char) (~0xff >> step); \
outptr += step; \
--step; \
do \