]>
Commit | Line | Data |
---|---|---|
6973fc01 | 1 | /* Simple transformations functions. |
bd32e4a6 | 2 | Copyright (C) 1997, 1998, 1999, 2000 Free Software Foundation, Inc. |
6973fc01 UD |
3 | This file is part of the GNU C Library. |
4 | Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Library General Public License as | |
8 | published by the Free Software Foundation; either version 2 of the | |
9 | License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Library General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Library General Public | |
17 | License along with the GNU C Library; see the file COPYING.LIB. If not, | |
18 | write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
19 | Boston, MA 02111-1307, USA. */ | |
20 | ||
f1fa8b68 UD |
21 | #include <byteswap.h> |
22 | #include <endian.h> | |
f4017d20 | 23 | #include <errno.h> |
6973fc01 | 24 | #include <gconv.h> |
d2374599 | 25 | #include <stdint.h> |
6973fc01 UD |
26 | #include <stdlib.h> |
27 | #include <string.h> | |
28 | #include <wchar.h> | |
29 | #include <sys/param.h> | |
30 | ||
/* Provide a fallback when <errno.h> did not define EILSEQ.  */
#if !defined EILSEQ
# define EILSEQ EINVAL
#endif


/* Lookup tables used by the UTF-8 conversion code below.

   encoding_mask[N - 2] has a bit set at every position that must be
   clear for a value to be encoded in an N-byte sequence (N = 2 ... 5).  */
static const uint32_t encoding_mask[] =
{
  ~UINT32_C (0x7ff),
  ~UINT32_C (0xffff),
  ~UINT32_C (0x1fffff),
  ~UINT32_C (0x3ffffff)
};

/* encoding_byte[N - 2] is the marker stored in the first byte of an
   N-byte sequence (N = 2 ... 6).  */
static const unsigned char encoding_byte[] =
{
  0xc0,
  0xe0,
  0xf0,
  0xf8,
  0xfc
};
47 | ||
48 | ||
/* Transform from the internal, UCS4-like format, to UCS4.  The
   difference between the internal UCS4 format and the real UCS4
   format is, if any, the endianness.  Unicode/ISO 10646 says that
   unless some higher protocol specifies it differently, the byte
   order is big endian.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               internal_ucs4_loop
#define TO_LOOP                 internal_ucs4_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_internal_ucs4


/* Convert as many complete 4-byte units as both buffers allow.

   INPTRP/INEND delimit the input, OUTPTRP/OUTEND the output; *INPTRP
   and *OUTPTRP are advanced past the bytes consumed and produced.
   STATE and DATA are not used by this function.  If CONVERTED is not
   NULL, the number of units converted is added to *CONVERTED.

   Returns __GCONV_FULL_OUTPUT if the output pointer reached OUTEND,
   otherwise __GCONV_EMPTY_INPUT if the input pointer reached INEND,
   otherwise __GCONV_INCOMPLETE_INPUT.  */
static inline int
internal_ucs4_loop (const unsigned char **inptrp, const unsigned char *inend,
                    unsigned char **outptrp, unsigned char *outend,
                    mbstate_t *state, void *data, size_t *converted)
{
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  int result;

#if __BYTE_ORDER == __LITTLE_ENDIAN
  /* Sigh, we have to do some real work.  */
  size_t cnt;

  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
    {
      /* Go through memcpy so that no alignment of the byte buffers is
         required and no cast result has to be used as an lvalue.  */
      uint32_t unit;

      memcpy (&unit, inptr, sizeof unit);
      unit = bswap_32 (unit);
      memcpy (outptr, &unit, sizeof unit);
    }

  *inptrp = inptr;
  *outptrp = outptr;
#elif __BYTE_ORDER == __BIG_ENDIAN
  /* Simply copy the data.  */
  memcpy (outptr, inptr, n_convert * 4);
  *inptrp = inptr + n_convert * 4;
  *outptrp = outptr + n_convert * 4;
#else
# error "This endianness is not supported."
#endif

  /* Determine the status.  */
  if (*outptrp == outend)
    result = __GCONV_FULL_OUTPUT;
  else if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  /* Update the caller's counter; the pointer itself must not be
     advanced.  */
  if (converted != NULL)
    *converted += n_convert;

  return result;
}
d2374599 | 104 | |
8619129f | 105 | #include <iconv/skeleton.c> |
d2374599 | 106 | |
d2374599 | 107 | |
8619129f UD |
108 | /* Convert from ISO 646-IRV to the internal (UCS4-like) format. */ |
109 | #define DEFINE_INIT 0 | |
110 | #define DEFINE_FINI 0 | |
111 | #define MIN_NEEDED_FROM 1 | |
112 | #define MIN_NEEDED_TO 4 | |
113 | #define FROM_DIRECTION 1 | |
114 | #define FROM_LOOP ascii_internal_loop | |
115 | #define TO_LOOP ascii_internal_loop /* This is not used. */ | |
116 | #define FUNCTION_NAME __gconv_transform_ascii_internal | |
117 | ||
118 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
119 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
120 | #define LOOPFCT FROM_LOOP | |
121 | #define BODY \ | |
122 | { \ | |
123 | if (*inptr > '\x7f') \ | |
124 | { \ | |
125 | /* This is no correct ANSI_X3.4-1968 character. */ \ | |
d64b6ad0 | 126 | result = __GCONV_ILLEGAL_INPUT; \ |
8619129f UD |
127 | break; \ |
128 | } \ | |
129 | \ | |
130 | /* It's an one byte sequence. */ \ | |
131 | *((uint32_t *) outptr)++ = *inptr++; \ | |
132 | } | |
133 | #include <iconv/loop.c> | |
134 | #include <iconv/skeleton.c> | |
135 | ||
136 | ||
137 | /* Convert from the internal (UCS4-like) format to ISO 646-IRV. */ | |
138 | #define DEFINE_INIT 0 | |
139 | #define DEFINE_FINI 0 | |
140 | #define MIN_NEEDED_FROM 4 | |
141 | #define MIN_NEEDED_TO 1 | |
142 | #define FROM_DIRECTION 1 | |
143 | #define FROM_LOOP internal_ascii_loop | |
144 | #define TO_LOOP internal_ascii_loop /* This is not used. */ | |
145 | #define FUNCTION_NAME __gconv_transform_internal_ascii | |
146 | ||
147 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
148 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
149 | #define LOOPFCT FROM_LOOP | |
150 | #define BODY \ | |
151 | { \ | |
5aa8ff62 | 152 | if (*((uint32_t *) inptr) > 0x7f) \ |
8619129f UD |
153 | { \ |
154 | /* This is no correct ANSI_X3.4-1968 character. */ \ | |
d64b6ad0 | 155 | result = __GCONV_ILLEGAL_INPUT; \ |
8619129f UD |
156 | break; \ |
157 | } \ | |
158 | \ | |
159 | /* It's an one byte sequence. */ \ | |
160 | *outptr++ = *((uint32_t *) inptr)++; \ | |
161 | } | |
162 | #include <iconv/loop.c> | |
163 | #include <iconv/skeleton.c> | |
164 | ||
165 | ||
166 | /* Convert from the internal (UCS4-like) format to UTF-8. */ | |
167 | #define DEFINE_INIT 0 | |
168 | #define DEFINE_FINI 0 | |
169 | #define MIN_NEEDED_FROM 4 | |
170 | #define MIN_NEEDED_TO 1 | |
171 | #define MAX_NEEDED_TO 6 | |
172 | #define FROM_DIRECTION 1 | |
173 | #define FROM_LOOP internal_utf8_loop | |
174 | #define TO_LOOP internal_utf8_loop /* This is not used. */ | |
175 | #define FUNCTION_NAME __gconv_transform_internal_utf8 | |
176 | ||
177 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
178 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
5aa8ff62 | 179 | #define MAX_NEEDED_OUTPUT MAX_NEEDED_TO |
8619129f UD |
180 | #define LOOPFCT FROM_LOOP |
181 | #define BODY \ | |
182 | { \ | |
183 | uint32_t wc = *((uint32_t *) inptr); \ | |
184 | \ | |
185 | /* Since we control every character we read this cannot happen. */ \ | |
186 | assert (wc <= 0x7fffffff); \ | |
187 | \ | |
188 | if (wc < 0x80) \ | |
189 | /* It's an one byte sequence. */ \ | |
190 | *outptr++ = (unsigned char) wc; \ | |
191 | else \ | |
192 | { \ | |
193 | size_t step; \ | |
194 | char *start; \ | |
195 | \ | |
196 | for (step = 2; step < 6; ++step) \ | |
197 | if ((wc & encoding_mask[step - 2]) == 0) \ | |
198 | break; \ | |
199 | \ | |
200 | if (outptr + step >= outend) \ | |
201 | { \ | |
202 | /* Too long. */ \ | |
d64b6ad0 | 203 | result = __GCONV_FULL_OUTPUT; \ |
8619129f UD |
204 | break; \ |
205 | } \ | |
206 | \ | |
207 | start = outptr; \ | |
208 | *outptr = encoding_byte[step - 2]; \ | |
209 | outptr += step; \ | |
210 | --step; \ | |
211 | do \ | |
212 | { \ | |
213 | start[step] = 0x80 | (wc & 0x3f); \ | |
214 | wc >>= 6; \ | |
215 | } \ | |
216 | while (--step > 0); \ | |
217 | start[0] |= wc; \ | |
218 | } \ | |
219 | \ | |
220 | inptr += 4; \ | |
221 | } | |
222 | #include <iconv/loop.c> | |
223 | #include <iconv/skeleton.c> | |
224 | ||
225 | ||
226 | /* Convert from UTF-8 to the internal (UCS4-like) format. */ | |
227 | #define DEFINE_INIT 0 | |
228 | #define DEFINE_FINI 0 | |
229 | #define MIN_NEEDED_FROM 1 | |
230 | #define MAX_NEEDED_FROM 6 | |
231 | #define MIN_NEEDED_TO 4 | |
232 | #define FROM_DIRECTION 1 | |
233 | #define FROM_LOOP utf8_internal_loop | |
234 | #define TO_LOOP utf8_internal_loop /* This is not used. */ | |
235 | #define FUNCTION_NAME __gconv_transform_utf8_internal | |
236 | ||
237 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
5aa8ff62 | 238 | #define MAX_NEEDED_INPUT MAX_NEEDED_FROM |
8619129f UD |
239 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO |
240 | #define LOOPFCT FROM_LOOP | |
241 | #define BODY \ | |
242 | { \ | |
243 | uint32_t ch; \ | |
244 | uint_fast32_t cnt; \ | |
245 | uint_fast32_t i; \ | |
246 | \ | |
247 | /* Next input byte. */ \ | |
248 | ch = *inptr; \ | |
249 | \ | |
250 | if (ch < 0x80) \ | |
8619129f | 251 | { \ |
5aa8ff62 UD |
252 | /* One byte sequence. */ \ |
253 | cnt = 1; \ | |
254 | ++inptr; \ | |
8619129f UD |
255 | } \ |
256 | else \ | |
257 | { \ | |
bd32e4a6 | 258 | if (ch >= 0xc2 && ch < 0xe0) \ |
5aa8ff62 | 259 | { \ |
bd32e4a6 UD |
260 | /* We expect two bytes. The first byte cannot be 0xc0 or 0xc1, \ |
261 | otherwise the wide character could have been represented \ | |
262 | using a single byte. */ \ | |
5aa8ff62 UD |
263 | cnt = 2; \ |
264 | ch &= 0x1f; \ | |
265 | } \ | |
266 | else if ((ch & 0xf0) == 0xe0) \ | |
267 | { \ | |
268 | /* We expect three bytes. */ \ | |
269 | cnt = 3; \ | |
270 | ch &= 0x0f; \ | |
271 | } \ | |
272 | else if ((ch & 0xf8) == 0xf0) \ | |
273 | { \ | |
274 | /* We expect four bytes. */ \ | |
275 | cnt = 4; \ | |
276 | ch &= 0x07; \ | |
277 | } \ | |
278 | else if ((ch & 0xfc) == 0xf8) \ | |
279 | { \ | |
280 | /* We expect five bytes. */ \ | |
281 | cnt = 5; \ | |
282 | ch &= 0x03; \ | |
283 | } \ | |
284 | else if ((ch & 0xfe) == 0xfc) \ | |
285 | { \ | |
286 | /* We expect six bytes. */ \ | |
287 | cnt = 6; \ | |
288 | ch &= 0x01; \ | |
289 | } \ | |
290 | else \ | |
8619129f UD |
291 | { \ |
292 | /* This is an illegal encoding. */ \ | |
d64b6ad0 | 293 | result = __GCONV_ILLEGAL_INPUT; \ |
8619129f UD |
294 | break; \ |
295 | } \ | |
296 | \ | |
5aa8ff62 UD |
297 | if (NEED_LENGTH_TEST && inptr + cnt > inend) \ |
298 | { \ | |
299 | /* We don't have enough input. */ \ | |
d64b6ad0 | 300 | result = __GCONV_INCOMPLETE_INPUT; \ |
5aa8ff62 UD |
301 | break; \ |
302 | } \ | |
303 | \ | |
304 | /* Read the possible remaining bytes. */ \ | |
305 | for (i = 1; i < cnt; ++i) \ | |
306 | { \ | |
307 | uint32_t byte = inptr[i]; \ | |
308 | \ | |
309 | if ((byte & 0xc0) != 0x80) \ | |
bd32e4a6 UD |
310 | /* This is an illegal encoding. */ \ |
311 | break; \ | |
5aa8ff62 UD |
312 | \ |
313 | ch <<= 6; \ | |
314 | ch |= byte & 0x3f; \ | |
315 | } \ | |
bd32e4a6 UD |
316 | \ |
317 | /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \ | |
318 | If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \ | |
319 | have been represented with fewer than cnt bytes. */ \ | |
320 | if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)) \ | |
321 | { \ | |
322 | /* This is an illegal encoding. */ \ | |
786731fc | 323 | result = __GCONV_ILLEGAL_INPUT; \ |
bd32e4a6 UD |
324 | break; \ |
325 | } \ | |
326 | \ | |
5aa8ff62 | 327 | inptr += cnt; \ |
8619129f UD |
328 | } \ |
329 | \ | |
330 | /* Now adjust the pointers and store the result. */ \ | |
8619129f UD |
331 | *((uint32_t *) outptr)++ = ch; \ |
332 | } | |
333 | #include <iconv/loop.c> | |
334 | #include <iconv/skeleton.c> | |
335 | ||
336 | ||
337 | /* Convert from UCS2 to the internal (UCS4-like) format. */ | |
338 | #define DEFINE_INIT 0 | |
339 | #define DEFINE_FINI 0 | |
340 | #define MIN_NEEDED_FROM 2 | |
341 | #define MIN_NEEDED_TO 4 | |
342 | #define FROM_DIRECTION 1 | |
343 | #define FROM_LOOP ucs2_internal_loop | |
344 | #define TO_LOOP ucs2_internal_loop /* This is not used. */ | |
345 | #define FUNCTION_NAME __gconv_transform_ucs2_internal | |
346 | ||
347 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
348 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
349 | #define LOOPFCT FROM_LOOP | |
f1fa8b68 | 350 | #if __BYTE_ORDER == __LITTLE_ENDIAN |
8619129f | 351 | # define BODY \ |
fdf19bf7 UD |
352 | *((uint32_t *) outptr)++ = bswap_16 (*(uint16_t *) inptr); \ |
353 | inptr += 2; | |
f1fa8b68 | 354 | #else |
8619129f UD |
355 | # define BODY \ |
356 | *((uint32_t *) outptr)++ = *((uint16_t *) inptr)++; | |
f1fa8b68 | 357 | #endif |
8619129f UD |
358 | #include <iconv/loop.c> |
359 | #include <iconv/skeleton.c> | |
360 | ||
361 | ||
362 | /* Convert from the internal (UCS4-like) format to UCS2. */ | |
363 | #define DEFINE_INIT 0 | |
364 | #define DEFINE_FINI 0 | |
365 | #define MIN_NEEDED_FROM 4 | |
366 | #define MIN_NEEDED_TO 2 | |
367 | #define FROM_DIRECTION 1 | |
368 | #define FROM_LOOP internal_ucs2_loop | |
369 | #define TO_LOOP internal_ucs2_loop /* This is not used. */ | |
370 | #define FUNCTION_NAME __gconv_transform_internal_ucs2 | |
371 | ||
372 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
373 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
374 | #define LOOPFCT FROM_LOOP | |
f1fa8b68 | 375 | #if __BYTE_ORDER == __LITTLE_ENDIAN |
8619129f UD |
376 | # define BODY \ |
377 | { \ | |
378 | if (*((uint32_t *) inptr) >= 0x10000) \ | |
379 | { \ | |
d64b6ad0 | 380 | result = __GCONV_ILLEGAL_INPUT; \ |
8619129f UD |
381 | break; \ |
382 | } \ | |
383 | /* Please note that we use the `uint32_t' from-pointer as an `uint16_t' \ | |
384 | pointer which works since we are on a little endian machine. */ \ | |
385 | *((uint16_t *) outptr)++ = bswap_16 (*((uint16_t *) inptr)); \ | |
386 | inptr += 4; \ | |
387 | } | |
f1fa8b68 | 388 | #else |
8619129f UD |
389 | # define BODY \ |
390 | { \ | |
391 | if (*((uint32_t *) inptr) >= 0x10000) \ | |
392 | { \ | |
d64b6ad0 | 393 | result = __GCONV_ILLEGAL_INPUT; \ |
8619129f UD |
394 | break; \ |
395 | } \ | |
396 | *((uint16_t *) outptr)++ = *((uint32_t *) inptr)++; \ | |
397 | } | |
f1fa8b68 | 398 | #endif |
8619129f UD |
399 | #include <iconv/loop.c> |
400 | #include <iconv/skeleton.c> | |
9b26f5c4 UD |
401 | |
402 | ||
403 | /* Convert from UCS2 in little endian to the internal (UCS4-like) format. */ | |
404 | #define DEFINE_INIT 0 | |
405 | #define DEFINE_FINI 0 | |
406 | #define MIN_NEEDED_FROM 2 | |
407 | #define MIN_NEEDED_TO 4 | |
408 | #define FROM_DIRECTION 1 | |
409 | #define FROM_LOOP ucs2little_internal_loop | |
410 | #define TO_LOOP ucs2little_internal_loop /* This is not used.*/ | |
411 | #define FUNCTION_NAME __gconv_transform_ucs2little_internal | |
412 | ||
413 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
414 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
415 | #define LOOPFCT FROM_LOOP | |
416 | #if __BYTE_ORDER == __LITTLE_ENDIAN | |
417 | # define BODY \ | |
418 | *((uint32_t *) outptr)++ = *((uint16_t *) inptr)++; | |
419 | #else | |
420 | # define BODY \ | |
fdf19bf7 UD |
421 | *((uint32_t *) outptr)++ = bswap_16 (*(uint16_t *) inptr); \ |
422 | inptr += 2; | |
9b26f5c4 UD |
423 | #endif |
424 | #include <iconv/loop.c> | |
425 | #include <iconv/skeleton.c> | |
426 | ||
427 | ||
428 | /* Convert from the internal (UCS4-like) format to UCS2 in little endian. */ | |
429 | #define DEFINE_INIT 0 | |
430 | #define DEFINE_FINI 0 | |
431 | #define MIN_NEEDED_FROM 4 | |
432 | #define MIN_NEEDED_TO 2 | |
433 | #define FROM_DIRECTION 1 | |
434 | #define FROM_LOOP internal_ucs2little_loop | |
435 | #define TO_LOOP internal_ucs2little_loop /* This is not used.*/ | |
436 | #define FUNCTION_NAME __gconv_transform_internal_ucs2little | |
437 | ||
438 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
439 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
440 | #define LOOPFCT FROM_LOOP | |
441 | #if __BYTE_ORDER == __LITTLE_ENDIAN | |
442 | # define BODY \ | |
443 | { \ | |
444 | if (*((uint32_t *) inptr) >= 0x10000) \ | |
445 | { \ | |
d64b6ad0 | 446 | result = __GCONV_ILLEGAL_INPUT; \ |
9b26f5c4 UD |
447 | break; \ |
448 | } \ | |
449 | *((uint16_t *) outptr)++ = *((uint32_t *) inptr)++; \ | |
450 | } | |
451 | #else | |
452 | # define BODY \ | |
453 | { \ | |
454 | if (*((uint32_t *) inptr) >= 0x10000) \ | |
455 | { \ | |
d64b6ad0 | 456 | result = __GCONV_ILLEGAL_INPUT; \ |
9b26f5c4 UD |
457 | break; \ |
458 | } \ | |
fdf19bf7 | 459 | *((uint16_t *) outptr)++ = bswap_16 (((uint16_t *) inptr)[1]); \ |
9b26f5c4 UD |
460 | inptr += 4; \ |
461 | } | |
462 | #endif | |
463 | #include <iconv/loop.c> | |
464 | #include <iconv/skeleton.c> |