]>
Commit | Line | Data |
---|---|---|
1 | /* Simple transformations functions. | |
2 | Copyright (C) 1997-2024 Free Software Foundation, Inc. | |
3 | This file is part of the GNU C Library. | |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <https://www.gnu.org/licenses/>. */ | |
18 | ||
19 | #include <byteswap.h> | |
20 | #include <dlfcn.h> | |
21 | #include <endian.h> | |
22 | #include <errno.h> | |
23 | #include <gconv.h> | |
24 | #include <stdint.h> | |
25 | #include <stdlib.h> | |
26 | #include <string.h> | |
27 | #include <wchar.h> | |
28 | #include <sys/param.h> | |
29 | #include <gconv_int.h> | |
30 | ||
/* Hooks for the "gconv_builtin.h" table that is included directly after
   these definitions: alias entries expand to nothing, and every
   transformation entry expands to an extern prototype for the conversion
   function named by its FCT argument.  */
#define BUILTIN_ALIAS(name1, name2) /* nothing */
#define BUILTIN_TRANSFORMATION(from, to, cost, name, Fct, btowc_fct,       \
                               min_from, max_from, min_to, max_to)         \
  extern int Fct (struct __gconv_step *, struct __gconv_step_data *,       \
                  const unsigned char **, const unsigned char *,           \
                  unsigned char **, size_t *, int, int);
37 | #include "gconv_builtin.h" | |
38 | ||
39 | ||
/* Substitute EINVAL when <errno.h> does not provide EILSEQ.  */
#if !defined EILSEQ
# define EILSEQ EINVAL
#endif
43 | ||
44 | ||
45 | /* Specialized conversion function for a single byte to INTERNAL, recognizing | |
46 | only ASCII characters. */ | |
wint_t
__gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
{
  /* Bytes below 0x80 map to themselves; anything with the high bit set
     is not ASCII and yields WEOF.  STEP is not consulted.  */
  return c < 0x80 ? (wint_t) c : WEOF;
}
55 | ||
56 | ||
57 | /* Transform from the internal, UCS4-like format, to UCS4. The | |
58 | difference between the internal ucs4 format and the real UCS4 | |
59 | format is, if any, the endianness. The Unicode/ISO 10646 says that | |
60 | unless some higher protocol specifies it differently, the byte | |
61 | order is big endian.*/ | |
/* Parameters for the <iconv/skeleton.c> instance that is included once
   the two loop functions below have been defined.  */
#define FUNCTION_NAME   __gconv_transform_internal_ucs4
#define FROM_LOOP       internal_ucs4_loop
#define TO_LOOP         internal_ucs4_loop /* This is not used.  */
#define FROM_DIRECTION  1
#define ONE_DIRECTION   0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO   4
#define DEFINE_INIT     0
#define DEFINE_FINI     0
71 | ||
72 | ||
/* Convert as many complete four-byte characters as fit from the internal
   format (host byte order) to UCS4 (big endian).  *INPTRP and *OUTPTRP
   are advanced past what was converted.

   Returns __GCONV_EMPTY_INPUT when the input was used up,
   __GCONV_FULL_OUTPUT when fewer than four bytes of output space remain,
   and __GCONV_INCOMPLETE_INPUT otherwise.  STEP, STEP_DATA and
   IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
internal_ucs4_loop (struct __gconv_step *step,
                    struct __gconv_step_data *step_data,
                    const unsigned char **inptrp, const unsigned char *inend,
                    unsigned char **outptrp, const unsigned char *outend,
                    size_t *irreversible)
{
  const unsigned char *src = *inptrp;
  unsigned char *dst = *outptrp;
  /* Number of whole characters both buffers can accommodate.  */
  size_t units = MIN (inend - src, outend - dst) / 4;

#if __BYTE_ORDER == __LITTLE_ENDIAN
  /* Host order differs from the output order: swap each character.  */
  size_t left;

  for (left = units; left > 0; --left)
    {
      put32 (dst, __builtin_bswap32 (get32 (src)));
      src += 4;
      dst += 4;
    }

  *inptrp = src;
  *outptrp = dst;
#elif __BYTE_ORDER == __BIG_ENDIAN
  /* Host order already is the output order: a plain copy suffices.  */
  *inptrp = src + units * 4;
  *outptrp = __mempcpy (dst, src, units * 4);
#else
# error "This endianness is not supported."
#endif

  /* Work out why the conversion stopped.  */
  if (*inptrp == inend)
    return __GCONV_EMPTY_INPUT;
  if (*outptrp + 4 > outend)
    return __GCONV_FULL_OUTPUT;
  return __GCONV_INCOMPLETE_INPUT;
}
116 | ||
117 | ||
/* Finish one character whose four bytes are split across calls.  The low
   three bits of the state's __count say how many bytes are already saved
   in the state; further bytes are taken from *INPTRP.

   If four bytes are still not available the new count is recorded and
   __GCONV_INCOMPLETE_INPUT is returned.  Otherwise the character is
   written to *OUTPTRP in big-endian order, *OUTPTRP is advanced by four,
   the saved-byte count is cleared and __GCONV_OK is returned.  STEP,
   OUTEND and IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
internal_ucs4_loop_single (struct __gconv_step *step,
                           struct __gconv_step_data *step_data,
                           const unsigned char **inptrp,
                           const unsigned char *inend,
                           unsigned char **outptrp,
                           const unsigned char *outend,
                           size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  size_t have = state->__count & 7;
  unsigned char *out;
  size_t i;

  /* Top the saved bytes up from the input.  */
  for (; have < 4 && *inptrp < inend; ++have)
    state->__value.__wchb[have] = *(*inptrp)++;

  if (__glibc_unlikely (have < 4))
    {
      /* Still not enough bytes.  Remember how many are saved.  */
      state->__count = (state->__count & ~7) | have;
      return __GCONV_INCOMPLETE_INPUT;
    }

  out = *outptrp;
#if __BYTE_ORDER == __LITTLE_ENDIAN
  /* The saved bytes are in host (little-endian) order: reverse them.  */
  for (i = 0; i < 4; ++i)
    out[i] = state->__value.__wchb[3 - i];
#elif __BYTE_ORDER == __BIG_ENDIAN
  /* The saved bytes already are in output order.  */
  for (i = 0; i < 4; ++i)
    out[i] = state->__value.__wchb[i];
#else
# error "This endianness is not supported."
#endif
  *outptrp = out + 4;

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
165 | ||
166 | #include <iconv/skeleton.c> | |
167 | ||
168 | ||
169 | /* Transform from UCS4 to the internal, UCS4-like format. Unlike | |
170 | for the other direction we have to check for correct values here. */ | |
/* Parameters for the <iconv/skeleton.c> instance that is included once
   the two loop functions below have been defined.  */
#define FUNCTION_NAME   __gconv_transform_ucs4_internal
#define FROM_LOOP       ucs4_internal_loop
#define TO_LOOP         ucs4_internal_loop /* This is not used.  */
#define FROM_DIRECTION  1
#define ONE_DIRECTION   0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO   4
#define DEFINE_INIT     0
#define DEFINE_FINI     0
180 | ||
181 | ||
/* Convert complete big-endian UCS4 characters to the internal format
   (host byte order) for as long as four input bytes and four bytes of
   output space are available.

   A value above 0x7fffffff is invalid.  With IRREVERSIBLE == NULL the
   function returns __GCONV_ILLEGAL_INPUT right away.  Otherwise, if
   __GCONV_IGNORE_ERRORS is set in the step's flags the character is
   skipped and *IRREVERSIBLE incremented; if not, *INPTRP / *OUTPTRP are
   updated to the offending position and __GCONV_ILLEGAL_INPUT is
   returned.

   On normal termination the pointers are updated and the result is
   __GCONV_EMPTY_INPUT, __GCONV_FULL_OUTPUT or __GCONV_INCOMPLETE_INPUT.
   STEP is not used.  */
static inline int
__attribute ((always_inline))
ucs4_internal_loop (struct __gconv_step *step,
                    struct __gconv_step_data *step_data,
                    const unsigned char **inptrp, const unsigned char *inend,
                    unsigned char **outptrp, const unsigned char *outend,
                    size_t *irreversible)
{
  const int flags = step_data->__flags;
  const unsigned char *src = *inptrp;
  unsigned char *dst = *outptrp;

  while (src + 4 <= inend && dst + 4 <= outend)
    {
      uint32_t wc = get32 (src);
#if __BYTE_ORDER == __LITTLE_ENDIAN
      wc = __builtin_bswap32 (wc);
#endif

      if (__glibc_likely (wc <= 0x7fffffff))
        {
          put32 (dst, wc);
          dst += sizeof (uint32_t);
          src += 4;
          continue;
        }

      /* The value is too large.  Transliteration is not attempted: this
         is not a character the target cannot represent but a genuine
         error in the input, since UCS4 does not allow such values.  */
      if (irreversible == NULL)
        /* We are transliterating, don't try to correct anything.  */
        return __GCONV_ILLEGAL_INPUT;

      if (!(flags & __GCONV_IGNORE_ERRORS))
        {
          *inptrp = src;
          *outptrp = dst;
          return __GCONV_ILLEGAL_INPUT;
        }

      /* Just ignore this character.  */
      ++*irreversible;
      src += 4;
    }

  *inptrp = src;
  *outptrp = dst;

  /* Work out why the loop stopped.  */
  if (src == inend)
    return __GCONV_EMPTY_INPUT;
  if (dst + 4 > outend)
    return __GCONV_FULL_OUTPUT;
  return __GCONV_INCOMPLETE_INPUT;
}
241 | ||
242 | ||
/* Finish one big-endian UCS4 character whose four bytes are split across
   calls.  The low three bits of the state's __count say how many bytes
   are already saved in the state; further bytes are taken from *INPTRP.

   Returns __GCONV_INCOMPLETE_INPUT (after recording the new count) when
   four bytes are still not available.  A character above 0x7fffffff is
   invalid: unless __GCONV_IGNORE_ERRORS is set in the step's flags,
   *INPTRP is moved back over the bytes taken during this call and
   __GCONV_ILLEGAL_INPUT is returned; with the flag set the character is
   dropped without producing output.  A valid character is stored at
   *OUTPTRP in host byte order and *OUTPTRP is advanced by four.  In the
   latter two cases the saved-byte count is cleared and __GCONV_OK is
   returned.  STEP, OUTEND and IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
ucs4_internal_loop_single (struct __gconv_step *step,
                           struct __gconv_step_data *step_data,
                           const unsigned char **inptrp,
                           const unsigned char *inend,
                           unsigned char **outptrp,
                           const unsigned char *outend,
                           size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  int flags = step_data->__flags;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* Byte 0 is the most significant one.  Any value of 0x80 or above there
     makes the character exceed 0x7fffffff, which is the limit enforced by
     ucs4_internal_loop; 0x80 itself must therefore be rejected too.  */
  if (__glibc_unlikely (((unsigned char *) state->__value.__wchb)[0] > 0x7f))
    {
      /* The value is too large.  We don't try transliteration here since
         this is not an error because of the lack of possibilities to
         represent the result.  This is a genuine bug in the input since
         UCS4 does not allow such values.  */
      if (!(flags & __GCONV_IGNORE_ERRORS))
        {
          /* Give back the bytes consumed from the input in this call;
             state->__count still holds the count from before the call.  */
          *inptrp -= cnt - (state->__count & 7);
          return __GCONV_ILLEGAL_INPUT;
        }
    }
  else
    {
#if __BYTE_ORDER == __LITTLE_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[3];
      (*outptrp)[1] = state->__value.__wchb[2];
      (*outptrp)[2] = state->__value.__wchb[1];
      (*outptrp)[3] = state->__value.__wchb[0];
#elif __BYTE_ORDER == __BIG_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[0];
      (*outptrp)[1] = state->__value.__wchb[1];
      (*outptrp)[2] = state->__value.__wchb[2];
      (*outptrp)[3] = state->__value.__wchb[3];
#endif

      *outptrp += 4;
    }

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
304 | ||
305 | #include <iconv/skeleton.c> | |
306 | ||
307 | ||
308 | /* Similarly for the little endian form. */ | |
/* Parameters for the <iconv/skeleton.c> instance that is included once
   the two loop functions below have been defined.  */
#define FUNCTION_NAME   __gconv_transform_internal_ucs4le
#define FROM_LOOP       internal_ucs4le_loop
#define TO_LOOP         internal_ucs4le_loop /* This is not used.  */
#define FROM_DIRECTION  1
#define ONE_DIRECTION   0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO   4
#define DEFINE_INIT     0
#define DEFINE_FINI     0
318 | ||
319 | ||
/* Convert as many complete four-byte characters as fit from the internal
   format (host byte order) to little-endian UCS4.  *INPTRP and *OUTPTRP
   are advanced past what was converted.

   Returns __GCONV_EMPTY_INPUT when the input was used up,
   __GCONV_FULL_OUTPUT when fewer than four bytes of output space remain,
   and __GCONV_INCOMPLETE_INPUT otherwise.  STEP, STEP_DATA and
   IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
internal_ucs4le_loop (struct __gconv_step *step,
                      struct __gconv_step_data *step_data,
                      const unsigned char **inptrp, const unsigned char *inend,
                      unsigned char **outptrp, const unsigned char *outend,
                      size_t *irreversible)
{
  const unsigned char *src = *inptrp;
  unsigned char *dst = *outptrp;
  /* Number of whole characters both buffers can accommodate.  */
  size_t units = MIN (inend - src, outend - dst) / 4;

#if __BYTE_ORDER == __BIG_ENDIAN
  /* Host order differs from the output order: swap each character.  */
  size_t left;

  for (left = units; left > 0; --left)
    {
      put32 (dst, __builtin_bswap32 (get32 (src)));
      src += 4;
      dst += 4;
    }

  *inptrp = src;
  *outptrp = dst;
#elif __BYTE_ORDER == __LITTLE_ENDIAN
  /* Host order already is the output order: a plain copy suffices.  */
  *inptrp = src + units * 4;
  *outptrp = __mempcpy (dst, src, units * 4);
#else
# error "This endianness is not supported."
#endif

  /* Work out why the conversion stopped.  */
  if (*inptrp == inend)
    return __GCONV_EMPTY_INPUT;
  if (*outptrp + 4 > outend)
    return __GCONV_FULL_OUTPUT;
  return __GCONV_INCOMPLETE_INPUT;
}
363 | ||
364 | ||
/* Finish one character whose four bytes are split across calls.  The low
   three bits of the state's __count say how many bytes are already saved
   in the state; further bytes are taken from *INPTRP.

   If four bytes are still not available the new count is recorded and
   __GCONV_INCOMPLETE_INPUT is returned.  Otherwise the character is
   written to *OUTPTRP in little-endian order, *OUTPTRP is advanced by
   four, the saved-byte count is cleared and __GCONV_OK is returned.
   STEP, OUTEND and IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
internal_ucs4le_loop_single (struct __gconv_step *step,
                             struct __gconv_step_data *step_data,
                             const unsigned char **inptrp,
                             const unsigned char *inend,
                             unsigned char **outptrp,
                             const unsigned char *outend,
                             size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  size_t have = state->__count & 7;
  unsigned char *out;
  size_t i;

  /* Top the saved bytes up from the input.  */
  for (; have < 4 && *inptrp < inend; ++have)
    state->__value.__wchb[have] = *(*inptrp)++;

  if (__glibc_unlikely (have < 4))
    {
      /* Still not enough bytes.  Remember how many are saved.  */
      state->__count = (state->__count & ~7) | have;
      return __GCONV_INCOMPLETE_INPUT;
    }

  out = *outptrp;
#if __BYTE_ORDER == __BIG_ENDIAN
  /* The saved bytes are in host (big-endian) order: reverse them.  */
  for (i = 0; i < 4; ++i)
    out[i] = state->__value.__wchb[3 - i];
#else
  /* The saved bytes already are in output order.  */
  for (i = 0; i < 4; ++i)
    out[i] = state->__value.__wchb[i];
#endif
  *outptrp = out + 4;

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
412 | ||
413 | #include <iconv/skeleton.c> | |
414 | ||
415 | ||
416 | /* And finally from UCS4-LE to the internal encoding. */ | |
/* Parameters for the <iconv/skeleton.c> instance that is included once
   the two loop functions below have been defined.  */
#define FUNCTION_NAME   __gconv_transform_ucs4le_internal
#define FROM_LOOP       ucs4le_internal_loop
#define TO_LOOP         ucs4le_internal_loop /* This is not used.  */
#define FROM_DIRECTION  1
#define ONE_DIRECTION   0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO   4
#define DEFINE_INIT     0
#define DEFINE_FINI     0
426 | ||
427 | ||
/* Convert complete little-endian UCS4 characters to the internal format
   (host byte order) for as long as four input bytes and four bytes of
   output space are available.

   A value above 0x7fffffff is invalid.  With IRREVERSIBLE == NULL the
   function returns __GCONV_ILLEGAL_INPUT right away.  Otherwise, if
   __GCONV_IGNORE_ERRORS is set in the step's flags the character is
   skipped and *IRREVERSIBLE incremented; if not, *INPTRP / *OUTPTRP are
   updated to the offending position and __GCONV_ILLEGAL_INPUT is
   returned.

   On normal termination the pointers are updated and the result is
   __GCONV_EMPTY_INPUT, __GCONV_INCOMPLETE_INPUT (fewer than four input
   bytes left) or __GCONV_FULL_OUTPUT.  STEP is not used.  */
static inline int
__attribute ((always_inline))
ucs4le_internal_loop (struct __gconv_step *step,
                      struct __gconv_step_data *step_data,
                      const unsigned char **inptrp, const unsigned char *inend,
                      unsigned char **outptrp, const unsigned char *outend,
                      size_t *irreversible)
{
  const int flags = step_data->__flags;
  const unsigned char *src = *inptrp;
  unsigned char *dst = *outptrp;

  while (src + 4 <= inend && dst + 4 <= outend)
    {
      uint32_t wc = get32 (src);
#if __BYTE_ORDER == __BIG_ENDIAN
      wc = __builtin_bswap32 (wc);
#endif

      if (__glibc_likely (wc <= 0x7fffffff))
        {
          put32 (dst, wc);
          dst += sizeof (uint32_t);
          src += 4;
          continue;
        }

      /* The value is too large.  Transliteration is not attempted: this
         is not a character the target cannot represent but a genuine
         error in the input, since UCS4 does not allow such values.  */
      if (irreversible == NULL)
        /* We are transliterating, don't try to correct anything.  */
        return __GCONV_ILLEGAL_INPUT;

      if (!(flags & __GCONV_IGNORE_ERRORS))
        {
          *inptrp = src;
          *outptrp = dst;
          return __GCONV_ILLEGAL_INPUT;
        }

      /* Just ignore this character.  */
      ++*irreversible;
      src += 4;
    }

  *inptrp = src;
  *outptrp = dst;

  /* Work out why the loop stopped.  */
  if (src == inend)
    return __GCONV_EMPTY_INPUT;
  if (src + 4 > inend)
    return __GCONV_INCOMPLETE_INPUT;

  /* A whole character is still pending, so the loop can only have
     stopped for lack of output space.  */
  assert (dst + 4 > outend);
  return __GCONV_FULL_OUTPUT;
}
490 | ||
491 | ||
/* Finish one little-endian UCS4 character whose four bytes are split
   across calls.  The low three bits of the state's __count say how many
   bytes are already saved in the state; further bytes are taken from
   *INPTRP.

   Returns __GCONV_INCOMPLETE_INPUT (after recording the new count) when
   four bytes are still not available.  A character above 0x7fffffff is
   invalid: unless __GCONV_IGNORE_ERRORS is set in the step's flags
   __GCONV_ILLEGAL_INPUT is returned; with the flag set the character is
   dropped without producing output.  A valid character is stored at
   *OUTPTRP in host byte order and *OUTPTRP is advanced by four.  In the
   latter two cases the saved-byte count is cleared and __GCONV_OK is
   returned.  STEP, OUTEND and IRREVERSIBLE are not used.  */
static inline int
__attribute ((always_inline))
ucs4le_internal_loop_single (struct __gconv_step *step,
                             struct __gconv_step_data *step_data,
                             const unsigned char **inptrp,
                             const unsigned char *inend,
                             unsigned char **outptrp,
                             const unsigned char *outend,
                             size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  int flags = step_data->__flags;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* Byte 3 is the most significant one.  Any value of 0x80 or above there
     makes the character exceed 0x7fffffff, which is the limit enforced by
     ucs4le_internal_loop; 0x80 itself must therefore be rejected too.  */
  if (__glibc_unlikely (((unsigned char *) state->__value.__wchb)[3] > 0x7f))
    {
      /* The value is too large.  We don't try transliteration here since
         this is not an error because of the lack of possibilities to
         represent the result.  This is a genuine bug in the input since
         UCS4 does not allow such values.  */
      if (!(flags & __GCONV_IGNORE_ERRORS))
        /* NOTE(review): unlike ucs4_internal_loop_single, *INPTRP is not
           moved back over the bytes consumed in this call -- confirm
           whether that difference is intended.  */
        return __GCONV_ILLEGAL_INPUT;
    }
  else
    {
#if __BYTE_ORDER == __BIG_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[3];
      (*outptrp)[1] = state->__value.__wchb[2];
      (*outptrp)[2] = state->__value.__wchb[1];
      (*outptrp)[3] = state->__value.__wchb[0];
#else
      (*outptrp)[0] = state->__value.__wchb[0];
      (*outptrp)[1] = state->__value.__wchb[1];
      (*outptrp)[2] = state->__value.__wchb[2];
      (*outptrp)[3] = state->__value.__wchb[3];
#endif

      *outptrp += 4;
    }

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
550 | ||
551 | #include <iconv/skeleton.c> | |
552 | ||
553 | ||
554 | /* Convert from ISO 646-IRV to the internal (UCS4-like) format. */ | |
/* Parameters for the <iconv/loop.c> and <iconv/skeleton.c> templates
   that are included directly after these definitions.  */
#define FUNCTION_NAME     __gconv_transform_ascii_internal
#define FROM_LOOP         ascii_internal_loop
#define TO_LOOP           ascii_internal_loop /* This is not used.  */
#define FROM_DIRECTION    1
#define ONE_DIRECTION     1
#define MIN_NEEDED_FROM   1
#define MIN_NEEDED_TO     4
#define DEFINE_INIT       0
#define DEFINE_FINI       0

#define LOOPFCT           FROM_LOOP
#define MIN_NEEDED_INPUT  MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOP_NEED_FLAGS
/* Conversion of one input byte: a seven-bit byte is widened to one
   32-bit unit, anything else goes to the error handler.  */
#define BODY \
  {                                                                         \
    if (__glibc_likely (*inptr <= '\x7f'))                                  \
      {                                                                     \
        /* It is a one byte sequence.  */                                   \
        *((uint32_t *) outptr) = *inptr;                                    \
        ++inptr;                                                            \
        outptr += sizeof (uint32_t);                                        \
      }                                                                     \
    else                                                                    \
      {                                                                     \
        /* The value is too large.  We don't try transliteration here      \
           since this is not an error because of the lack of               \
           possibilities to represent the result.  This is a genuine bug   \
           in the input since ASCII does not allow such values.  */         \
        STANDARD_FROM_LOOP_ERR_HANDLER (1);                                 \
      }                                                                     \
  }
586 | #include <iconv/loop.c> | |
587 | #include <iconv/skeleton.c> | |
588 | ||
589 | ||
590 | /* Convert from the internal (UCS4-like) format to ISO 646-IRV. */ | |
/* Parameters for the <iconv/loop.c> and <iconv/skeleton.c> templates
   that are included directly after these definitions.  */
#define FUNCTION_NAME     __gconv_transform_internal_ascii
#define FROM_LOOP         internal_ascii_loop
#define TO_LOOP           internal_ascii_loop /* This is not used.  */
#define FROM_DIRECTION    1
#define ONE_DIRECTION     1
#define MIN_NEEDED_FROM   4
#define MIN_NEEDED_TO     1
#define DEFINE_INIT       0
#define DEFINE_FINI       0

#define LOOPFCT           FROM_LOOP
#define MIN_NEEDED_INPUT  MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOP_NEED_FLAGS
/* Conversion of one 32-bit input unit: values up to 0x7f are narrowed to
   a single output byte, larger ones are passed to the tag and error
   handlers.  */
#define BODY \
  {                                                                         \
    if (__glibc_likely (*((const uint32_t *) inptr) <= 0x7f))               \
      {                                                                     \
        /* It is a one byte sequence.  */                                   \
        *outptr = *((const uint32_t *) inptr);                              \
        ++outptr;                                                           \
        inptr += sizeof (uint32_t);                                         \
      }                                                                     \
    else                                                                    \
      {                                                                     \
        UNICODE_TAG_HANDLER (*((const uint32_t *) inptr), 4);               \
        STANDARD_TO_LOOP_ERR_HANDLER (4);                                   \
      }                                                                     \
  }
619 | #include <iconv/loop.c> | |
620 | #include <iconv/skeleton.c> | |
621 | ||
622 | ||
623 | /* Convert from the internal (UCS4-like) format to UTF-8. */ | |
/* Parameters for the <iconv/loop.c> and <iconv/skeleton.c> templates
   that are included directly after these definitions.  */
#define FUNCTION_NAME     __gconv_transform_internal_utf8
#define FROM_LOOP         internal_utf8_loop
#define TO_LOOP           internal_utf8_loop /* This is not used.  */
#define FROM_DIRECTION    1
#define ONE_DIRECTION     1
#define MIN_NEEDED_FROM   4
#define MIN_NEEDED_TO     1
#define MAX_NEEDED_TO     6
#define DEFINE_INIT       0
#define DEFINE_FINI       0

#define LOOPFCT           FROM_LOOP
#define MIN_NEEDED_INPUT  MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
#define LOOP_NEED_FLAGS
/* Conversion of one 32-bit input unit to a UTF-8 sequence of one to six
   bytes.  Values above 0x7fffffff and the surrogate range 0xd800..0xdfff
   are handed to the error handler.  */
#define BODY \
  {                                                                         \
    uint32_t wc = *((const uint32_t *) inptr);                              \
                                                                            \
    if (__glibc_likely (wc < 0x80))                                         \
      /* It is a one byte sequence.  */                                     \
      *outptr++ = (unsigned char) wc;                                       \
    else if (__glibc_likely (wc <= 0x7fffffff                               \
                             && !(wc >= 0xd800 && wc <= 0xdfff)))           \
      {                                                                     \
        size_t len = 2;                                                     \
        size_t idx;                                                         \
        unsigned char *start;                                               \
                                                                            \
        /* A sequence of LEN bytes holds 5 * LEN + 1 payload bits; pick    \
           the shortest length (at most six) that fits the value.  */       \
        while (len < 6 && (wc >> (5 * len + 1)) != 0)                       \
          ++len;                                                            \
                                                                            \
        if (__glibc_unlikely (outptr + len > outend))                       \
          {                                                                 \
            /* Too long.  */                                                \
            result = __GCONV_FULL_OUTPUT;                                   \
            break;                                                          \
          }                                                                 \
                                                                            \
        start = outptr;                                                     \
        outptr += len;                                                      \
                                                                            \
        /* Lead byte marker: LEN one bits followed by a zero bit.  */       \
        start[0] = (unsigned char) (0xff00 >> len);                         \
                                                                            \
        /* Fill the continuation bytes from the last to the second.  */     \
        for (idx = len - 1; idx > 0; --idx)                                 \
          {                                                                 \
            start[idx] = 0x80 | (wc & 0x3f);                                \
            wc >>= 6;                                                       \
          }                                                                 \
                                                                            \
        /* Whatever is left of the value belongs in the lead byte.  */      \
        start[0] |= wc;                                                     \
      }                                                                     \
    else                                                                    \
      {                                                                     \
        STANDARD_TO_LOOP_ERR_HANDLER (4);                                   \
      }                                                                     \
                                                                            \
    inptr += 4;                                                             \
  }
682 | #include <iconv/loop.c> | |
683 | #include <iconv/skeleton.c> | |
684 | ||
685 | ||
686 | /* Convert from UTF-8 to the internal (UCS4-like) format. */ | |
/* Parameters for the <iconv/loop.c> and <iconv/skeleton.c> templates
   that are included after the BODY, STORE_REST, UNPACK_BYTES and
   CLEAR_STATE definitions below.  */
#define FUNCTION_NAME     __gconv_transform_utf8_internal
#define FROM_LOOP         utf8_internal_loop
#define TO_LOOP           utf8_internal_loop /* This is not used.  */
#define FROM_DIRECTION    1
#define ONE_DIRECTION     1
#define MIN_NEEDED_FROM   1
#define MAX_NEEDED_FROM   6
#define MIN_NEEDED_TO     4
#define DEFINE_INIT       0
#define DEFINE_FINI       0

#define LOOPFCT           FROM_LOOP
#define MIN_NEEDED_INPUT  MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT  MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
/* Conversion of one UTF-8 sequence (one to six bytes) to one 32-bit
   unit.  Invalid lead bytes, wrong continuation bytes, over-long forms
   and UTF-16 surrogates go to the error handler; a sequence cut off by
   the end of the input yields __GCONV_INCOMPLETE_INPUT.  */
#define BODY \
  {                                                                         \
    /* Lead byte of the next sequence; turned into the decoded value.  */   \
    uint32_t ch = *inptr;                                                   \
                                                                            \
    if (__glibc_likely (ch < 0x80))                                         \
      /* One byte sequence: the byte is the value.  */                      \
      ++inptr;                                                              \
    else                                                                    \
      {                                                                     \
        unsigned int cnt;                                                   \
        unsigned int i;                                                     \
                                                                            \
        /* Derive the sequence length from the lead byte and strip the     \
           length marker from it.  */                                       \
        if (ch >= 0xc2 && ch < 0xe0)                                        \
          {                                                                 \
            /* Two bytes.  Lead bytes 0xc0 and 0xc1 are excluded because   \
               the character could have been represented using a single    \
               byte.  */                                                    \
            cnt = 2;                                                        \
            ch &= 0x1f;                                                     \
          }                                                                 \
        else if (__glibc_likely ((ch & 0xf0) == 0xe0))                      \
          {                                                                 \
            /* Three bytes.  */                                             \
            cnt = 3;                                                        \
            ch &= 0x0f;                                                     \
          }                                                                 \
        else if (__glibc_likely ((ch & 0xf8) == 0xf0))                      \
          {                                                                 \
            /* Four bytes.  */                                              \
            cnt = 4;                                                        \
            ch &= 0x07;                                                     \
          }                                                                 \
        else if (__glibc_likely ((ch & 0xfc) == 0xf8))                      \
          {                                                                 \
            /* Five bytes.  */                                              \
            cnt = 5;                                                        \
            ch &= 0x03;                                                     \
          }                                                                 \
        else if (__glibc_likely ((ch & 0xfe) == 0xfc))                      \
          {                                                                 \
            /* Six bytes.  */                                               \
            cnt = 6;                                                        \
            ch &= 0x01;                                                     \
          }                                                                 \
        else                                                                \
          {                                                                 \
            /* Not a lead byte.  Find the end of this ill-formed           \
               character: the next byte with (x & 0xc0) != 0x80, looking   \
               at no more than five bytes in total.  */                     \
            for (i = 1;                                                     \
                 inptr + i < inend                                          \
                 && (inptr[i] & 0xc0) == 0x80                               \
                 && i < 5;                                                  \
                 ++i)                                                       \
              ;                                                             \
                                                                            \
          errout:                                                           \
            STANDARD_FROM_LOOP_ERR_HANDLER (i);                             \
          }                                                                 \
                                                                            \
        if (__glibc_unlikely (inptr + cnt > inend))                         \
          {                                                                 \
            /* We don't have enough input.  But before we report that      \
               check that all the bytes present are continuation bytes.  */ \
            i = 1;                                                          \
            while (inptr + i < inend && (inptr[i] & 0xc0) == 0x80)          \
              ++i;                                                          \
                                                                            \
            if (__glibc_likely (inptr + i == inend))                        \
              {                                                             \
                result = __GCONV_INCOMPLETE_INPUT;                          \
                break;                                                      \
              }                                                             \
                                                                            \
            goto errout;                                                    \
          }                                                                 \
                                                                            \
        /* Fold in the continuation bytes, stopping at the first byte      \
           that is not one.  */                                             \
        for (i = 1; i < cnt && (inptr[i] & 0xc0) == 0x80; ++i)              \
          ch = (ch << 6) | (inptr[i] & 0x3f);                               \
                                                                            \
        /* If i < cnt, some trail byte was not >= 0x80, < 0xc0.            \
           If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could    \
           have been represented with fewer than cnt bytes.  UTF-16        \
           surrogates are not accepted either.  */                          \
        if (i < cnt                                                         \
            || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)                      \
            || (ch >= 0xd800 && ch <= 0xdfff))                              \
          /* This is an illegal encoding.  */                               \
          goto errout;                                                      \
                                                                            \
        inptr += cnt;                                                       \
      }                                                                     \
                                                                            \
    /* Store the result and advance the output pointer.  */                 \
    *((uint32_t *) outptr) = ch;                                            \
    outptr += sizeof (uint32_t);                                            \
  }
#define LOOP_NEED_FLAGS
813 | ||
/* Save the incomplete sequence between *INPTRP and INEND in the state.
   The low byte of __count receives the number of bytes present, the bits
   from bit 8 up the total length the lead byte announces, and
   __value.__wch the partially decoded value, already shifted as if the
   missing bytes had contributed zero bits.  *INPTRP ends up at INEND.  */
#define STORE_REST \
  {                                                                         \
    /* We can assume that the first byte in the buffer is correct and      \
       that it requires a larger number of bytes than there are in the     \
       input buffer.  */                                                    \
    wint_t ch = **inptrp;                                                   \
    size_t cnt, r;                                                          \
                                                                            \
    state->__count = inend - *inptrp;                                       \
                                                                            \
    /* 0xc0 and 0xc1 never start a sequence: the character could have      \
       been represented using a single byte.  */                            \
    assert (ch != 0xc0 && ch != 0xc1);                                      \
                                                                            \
    /* Total sequence length announced by the lead byte.  */                \
    if (ch >= 0xc2 && ch < 0xe0)                                            \
      cnt = 2;                                                              \
    else if (__glibc_likely ((ch & 0xf0) == 0xe0))                          \
      cnt = 3;                                                              \
    else if (__glibc_likely ((ch & 0xf8) == 0xf0))                          \
      cnt = 4;                                                              \
    else if (__glibc_likely ((ch & 0xfc) == 0xf8))                          \
      cnt = 5;                                                              \
    else                                                                    \
      cnt = 6;                                                              \
                                                                            \
    /* Keep only the payload bits of the lead byte: 0x1f, 0x0f, 0x07,      \
       0x03 and 0x01 for lengths two to six.  */                            \
    ch &= 0x7f >> cnt;                                                      \
                                                                            \
    /* The first byte is already consumed; fold in the continuation        \
       bytes that are present and count the ones still missing in R.  */    \
    for (r = cnt - 1, ++(*inptrp); *inptrp < inend; ++(*inptrp), --r)       \
      ch = (ch << 6) | (**inptrp & 0x3f);                                   \
                                                                            \
    /* Shift for the so far missing bytes.  */                              \
    ch <<= r * 6;                                                           \
                                                                            \
    /* Store the number of bytes expected for the entire sequence.  */      \
    state->__count |= cnt << 8;                                             \
                                                                            \
    /* Store the value.  */                                                 \
    state->__value.__wch = ch;                                              \
  }
877 | ||
/* Rebuild in BYTEBUF the bytes of the incomplete sequence recorded in
   the state: INLEN is set to the low byte of __count, the total sequence
   length is taken from the bits above it, and the bytes are regenerated
   from __value.__wch.  */
#define UNPACK_BYTES \
  {                                                                         \
    /* Lead byte markers for sequences of two to six bytes.  */             \
    static const unsigned char lead_marker[5]                               \
      = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };                                   \
    wint_t rest = state->__value.__wch;                                     \
    size_t pos = state->__count >> 8;                                       \
                                                                            \
    inlen = state->__count & 255;                                           \
                                                                            \
    bytebuf[0] = lead_marker[pos - 2];                                      \
                                                                            \
    /* Walk from the last byte position down to the second one, storing    \
       only the bytes that had actually been read.  */                      \
    do                                                                      \
      {                                                                     \
        --pos;                                                              \
        if (pos < inlen)                                                    \
          bytebuf[pos] = 0x80 | (rest & 0x3f);                              \
        rest >>= 6;                                                         \
      }                                                                     \
    while (pos > 1);                                                        \
                                                                            \
    /* The remaining bits are the payload of the lead byte.  */             \
    bytebuf[0] |= rest;                                                     \
  }

/* Forget any saved partial sequence.  */
#define CLEAR_STATE \
  state->__count = 0
901 | ||
902 | ||
903 | #include <iconv/loop.c> | |
904 | #include <iconv/skeleton.c> | |
905 | ||
906 | ||
907 | /* Convert from UCS2 to the internal (UCS4-like) format. */ | |
/* Parameters for the <iconv/loop.c> and <iconv/skeleton.c> templates
   that are included directly after these definitions.  */
#define FUNCTION_NAME     __gconv_transform_ucs2_internal
#define FROM_LOOP         ucs2_internal_loop
#define TO_LOOP           ucs2_internal_loop /* This is not used.  */
#define FROM_DIRECTION    1
#define ONE_DIRECTION     1
#define MIN_NEEDED_FROM   2
#define MIN_NEEDED_TO     4
#define DEFINE_INIT       0
#define DEFINE_FINI       0

#define LOOPFCT           FROM_LOOP
#define MIN_NEEDED_INPUT  MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOP_NEED_FLAGS
/* Conversion of one 16-bit input unit: surrogates go to the error
   handler, everything else is widened to one 32-bit unit.  */
#define BODY \
  {                                                                         \
    uint16_t unit = get16 (inptr);                                          \
                                                                            \
    /* 0xd800..0xdfff are exactly the values whose top five bits are       \
       11011.  */                                                           \
    if (__glibc_unlikely ((unit & 0xf800) == 0xd800))                       \
      {                                                                     \
        /* Surrogate characters in UCS-2 input are not valid.  Reject      \
           them.  (Catching this here is not security relevant.)  */        \
        STANDARD_FROM_LOOP_ERR_HANDLER (2);                                 \
      }                                                                     \
                                                                            \
    inptr += 2;                                                             \
    *((uint32_t *) outptr) = unit;                                          \
    outptr += sizeof (uint32_t);                                            \
  }
937 | #include <iconv/loop.c> | |
938 | #include <iconv/skeleton.c> | |
939 | ||
940 | ||
941 | /* Convert from the internal (UCS4-like) format to UCS2. */ | |
/* Parameters for the <iconv/loop.c> and <iconv/skeleton.c> templates
   that are included directly after these definitions.  */
#define FUNCTION_NAME     __gconv_transform_internal_ucs2
#define FROM_LOOP         internal_ucs2_loop
#define TO_LOOP           internal_ucs2_loop /* This is not used.  */
#define FROM_DIRECTION    1
#define ONE_DIRECTION     1
#define MIN_NEEDED_FROM   4
#define MIN_NEEDED_TO     2
#define DEFINE_INIT       0
#define DEFINE_FINI       0

#define LOOPFCT           FROM_LOOP
#define MIN_NEEDED_INPUT  MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOP_NEED_FLAGS
/* Conversion of one 32-bit input unit to one 16-bit unit.  Values from
   0x10000 up go to the tag and error handlers; surrogates are illegal
   input.  */
#define BODY \
  {                                                                         \
    uint32_t val = *((const uint32_t *) inptr);                             \
                                                                            \
    if (__glibc_likely (val < 0xd800                                        \
                        || (val >= 0xe000 && val < 0x10000)))               \
      {                                                                     \
        /* Representable in sixteen bits and not a surrogate.  */           \
        put16 (outptr, val);                                                \
        outptr += sizeof (uint16_t);                                        \
        inptr += 4;                                                         \
      }                                                                     \
    else if (val >= 0x10000)                                                \
      {                                                                     \
        UNICODE_TAG_HANDLER (val, 4);                                       \
        STANDARD_TO_LOOP_ERR_HANDLER (4);                                   \
      }                                                                     \
    else                                                                    \
      {                                                                     \
        /* Surrogate characters in UCS-4 input are not valid.              \
           We must catch this, because the UCS-2 output might be           \
           interpreted as UTF-16 by other programs.  If we let             \
           surrogates pass through, attackers could make a security        \
           hole exploit by synthesizing any desired plane 1-16             \
           character.  */                                                   \
        result = __GCONV_ILLEGAL_INPUT;                                     \
        if (! ignore_errors_p ())                                           \
          break;                                                            \
        inptr += 4;                                                         \
        ++*irreversible;                                                    \
        continue;                                                           \
      }                                                                     \
  }
987 | #include <iconv/loop.c> | |
988 | #include <iconv/skeleton.c> | |
989 | ||
990 | ||
991 | /* Convert from UCS2 in other endianness to the internal (UCS4-like) format. Each iteration of BODY reads one 16-bit unit, swaps its two bytes with bswap_16, and stores the result as one 32-bit value. Swapped units in the surrogate range 0xd800..0xdfff are not stored. */ | |
992 | #define DEFINE_INIT 0 | |
993 | #define DEFINE_FINI 0 | |
994 | #define MIN_NEEDED_FROM 2 /* Matches the 2 input bytes BODY consumes per iteration. */ | |
995 | #define MIN_NEEDED_TO 4 /* Matches the sizeof (uint32_t) bytes BODY writes per converted unit. */ | |
996 | #define FROM_DIRECTION 1 | |
997 | #define FROM_LOOP ucs2reverse_internal_loop | |
998 | #define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/ | |
999 | #define FUNCTION_NAME __gconv_transform_ucs2reverse_internal | |
1000 | #define ONE_DIRECTION 1 | |
1001 | ||
1002 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
1003 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
1004 | #define LOOPFCT FROM_LOOP | |
1005 | #define BODY \ | |
1006 | { \ | |
1007 | uint16_t u1 = bswap_16 (get16 (inptr)); /* The surrogate check below is applied to the byte-swapped value. */ \ | |
1008 | \ | |
1009 | if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \ | |
1010 | { \ | |
1011 | /* Surrogate characters in UCS-2 input are not valid. Reject \ | |
1012 | them. (Catching this here is not security relevant.) */ \ | |
1013 | if (! ignore_errors_p ()) \ | |
1014 | { \ | |
1015 | result = __GCONV_ILLEGAL_INPUT; /* NOTE(review): result is set only on this non-ignoring path, whereas the BODY of __gconv_transform_internal_ucs2 sets it before the ignore check -- confirm the difference is intended. */ \ | |
1016 | break; \ | |
1017 | } \ | |
1018 | inptr += 2; /* Skip the offending 16-bit unit without producing output. */ \ | |
1019 | ++*irreversible; \ | |
1020 | continue; /* break/continue act on a loop that is not in this chunk; presumably supplied by iconv/loop.c. */ \ | |
1021 | } \ | |
1022 | \ | |
1023 | *((uint32_t *) outptr) = u1; /* The 16-bit value is zero-extended by the assignment to a 32-bit object. */ \ | |
1024 | outptr += sizeof (uint32_t); \ | |
1025 | inptr += 2; \ | |
1026 | } | |
1027 | #define LOOP_NEED_FLAGS | |
1028 | #include <iconv/loop.c> /* Included with the macros above in effect; how they are consumed is not visible in this chunk. */ | |
1029 | #include <iconv/skeleton.c> | |
1030 | ||
1031 | ||
1032 | /* Convert from the internal (UCS4-like) format to UCS2 in other endianness. Each iteration of BODY reads one 32-bit value. Values of 0x10000 and above and values in the surrogate range 0xd800..0xdfff are not written; every other value is passed through bswap_16 and written as one 16-bit unit. */ | |
1033 | #define DEFINE_INIT 0 | |
1034 | #define DEFINE_FINI 0 | |
1035 | #define MIN_NEEDED_FROM 4 /* Matches the 4 input bytes BODY consumes per converted value. */ | |
1036 | #define MIN_NEEDED_TO 2 /* Matches the sizeof (uint16_t) bytes BODY writes per converted value. */ | |
1037 | #define FROM_DIRECTION 1 | |
1038 | #define FROM_LOOP internal_ucs2reverse_loop | |
1039 | #define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/ | |
1040 | #define FUNCTION_NAME __gconv_transform_internal_ucs2reverse | |
1041 | #define ONE_DIRECTION 1 | |
1042 | ||
1043 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
1044 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
1045 | #define LOOPFCT FROM_LOOP | |
1046 | #define BODY \ | |
1047 | { \ | |
1048 | uint32_t val = *((const uint32_t *) inptr); \ | |
1049 | if (__glibc_unlikely (val >= 0x10000)) /* Value does not fit in 16 bits. */ \ | |
1050 | { \ | |
1051 | UNICODE_TAG_HANDLER (val, 4); /* Macro defined outside this chunk; its effect is not visible here. */ \ | |
1052 | STANDARD_TO_LOOP_ERR_HANDLER (4); /* Macro defined outside this chunk; presumably 4 is the number of input bytes to skip -- TODO confirm in iconv/loop.c. */ \ | |
1053 | } \ | |
1054 | else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \ | |
1055 | { \ | |
1056 | /* Surrogate characters in UCS-4 input are not valid. \ | |
1057 | We must catch this, because the UCS-2 output might be \ | |
1058 | interpreted as UTF-16 by other programs. If we let \ | |
1059 | surrogates pass through, attackers could make a security \ | |
1060 | hole exploit by synthesizing any desired plane 1-16 \ | |
1061 | character. */ \ | |
1062 | if (! ignore_errors_p ()) \ | |
1063 | { \ | |
1064 | result = __GCONV_ILLEGAL_INPUT; /* NOTE(review): result is set only on this non-ignoring path, whereas the BODY of __gconv_transform_internal_ucs2 sets it before the ignore check -- confirm the difference is intended. */ \ | |
1065 | break; \ | |
1066 | } \ | |
1067 | inptr += 4; /* Skip the offending 32-bit value without producing output. */ \ | |
1068 | ++*irreversible; \ | |
1069 | continue; /* break/continue act on a loop that is not in this chunk; presumably supplied by iconv/loop.c. */ \ | |
1070 | } \ | |
1071 | else \ | |
1072 | { \ | |
1073 | put16 (outptr, bswap_16 (val)); /* val is below 0x10000 here, so only its low 16 bits are significant before the byte swap. */ \ | |
1074 | outptr += sizeof (uint16_t); \ | |
1075 | inptr += 4; \ | |
1076 | } \ | |
1077 | } | |
1078 | #define LOOP_NEED_FLAGS | |
1079 | #include <iconv/loop.c> /* Included with the macros above in effect; how they are consumed is not visible in this chunk. */ | |
1080 | #include <iconv/skeleton.c> | |