]>
Commit | Line | Data |
---|---|---|
91927b7c AS |
1 | /* Test iconv's TRANSLIT and IGNORE option handling |
2 | ||
dff8da6b | 3 | Copyright (C) 2020-2024 Free Software Foundation, Inc. |
91927b7c AS |
4 | This file is part of the GNU C Library. |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
17 | License along with the GNU C Library; if not, see | |
18 | <https://www.gnu.org/licenses/>. */ | |
19 | ||
20 | ||
21 | #include <iconv.h> | |
22 | #include <locale.h> | |
23 | #include <errno.h> | |
24 | #include <string.h> | |
25 | #include <support/support.h> | |
26 | #include <support/check.h> | |
27 | ||
28 | ||
/* Run one iconv conversion and check every observable result.  Arguments:
     to: destination character set and options (passed to iconv_open as the
	 target code)
     from: source character set (passed to iconv_open as the source code)
     input: NUL-terminated input string to be converted; strlen (input)
	    bytes are offered to iconv
     exp_in: expected number of input bytes consumed
     exp_ret: expected return value of iconv (error or number of
	      irreversible conversions)
     exp_out: expected output, NUL-terminated; compared byte for byte
	      against what iconv wrote
     exp_err: expected value of `errno' after iconv returns.

   If the conversion descriptor cannot be opened, the failure is recorded
   and the remaining checks are skipped.  */
static void
test_iconv (const char *to, const char *from, char *input, size_t exp_in,
	    size_t exp_ret, const char *exp_out, int exp_err)
{
  iconv_t cd;
  char outbuf[500];
  size_t inlen, outlen;
  char *inptr, *outptr;
  size_t n;
  int saved_errno;

  cd = iconv_open (to, from);
  TEST_VERIFY (cd != (iconv_t) -1);
  /* The failure has been recorded above.  Without a valid descriptor none
     of the checks below are meaningful, and the invalid value must not be
     handed to iconv or iconv_close.  */
  if (cd == (iconv_t) -1)
    return;

  inlen = strlen (input);
  outlen = sizeof (outbuf);
  inptr = input;
  outptr = outbuf;

  /* Clear errno first so that a successful conversion can be checked
     against an expected value of 0.  */
  errno = 0;
  n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
  /* Capture errno right away so that nothing executed by the checks below
     can change the value being verified.  */
  saved_errno = errno;

  TEST_COMPARE (n, exp_ret);
  /* iconv advances inptr past the input bytes it consumed.  */
  TEST_VERIFY (inptr == input + exp_in);
  TEST_COMPARE (saved_errno, exp_err);
  /* outptr was advanced past the bytes written, so outptr - outbuf is the
     output length.  */
  TEST_COMPARE_BLOB (outbuf, outptr - outbuf, exp_out, strlen (exp_out));
  TEST_VERIFY (iconv_close (cd) == 0);
}
64 | ||
65 | ||
/* We test option parsing by converting UTF-8 inputs to ASCII under various
   option combinations.  The UTF-8 inputs fall into three categories:
   - ASCII-only,
   - non-ASCII,
   - non-ASCII with invalid UTF-8 characters.  */

/* 1.  */
char ascii[] = "Just some ASCII text";

/* 2. Valid UTF-8 input and some corresponding expected outputs with various
   options.  The two non-ASCII characters below are accented letters:
   an `a' then an `o'.  */
char utf8[] = "UTF-8 text with \u00E1 couple \u00F3f non-ASCII characters";
/* The part of utf8 that precedes its first non-ASCII character.  */
char u2a[] = "UTF-8 text with ";
char u2a_translit[] = "UTF-8 text with a couple of non-ASCII characters";
/* utf8 with both accented letters dropped.  The first one sits between two
   spaces in the input, so removing it leaves two consecutive spaces after
   "with".  */
char u2a_ignore[] = "UTF-8 text with  couple f non-ASCII characters";

/* 3. Invalid UTF-8 input and some corresponding expected outputs.  \xff is
   invalid UTF-8.  It's followed by some valid but non-ASCII UTF-8.  */
char iutf8[] = "Invalid UTF-8 \xff\u27E6text\u27E7";
char iu2a[] = "Invalid UTF-8 ";
char iu2a_ignore[] = "Invalid UTF-8 text";
char iu2a_both[] = "Invalid UTF-8 [|text|]";

/* 4. Another invalid UTF-8 input and corresponding expected outputs.  This
   time the valid non-ASCII UTF-8 characters appear before the invalid
   \xff.  */
char jutf8[] = "Invalid \u27E6UTF-8\u27E7 \xfftext";
char ju2a[] = "Invalid ";
char ju2a_translit[] = "Invalid [|UTF-8|] ";
char ju2a_ignore[] = "Invalid UTF-8 text";
char ju2a_both[] = "Invalid [|UTF-8|] text";

/* We also test option handling for character set names that have the form
   "A/B".  In this test, we test conversions "ISO-10646/UTF-8", and either
   ISO-8859-1 or ASCII.  */

/* 5. Accented 'A' and 'a' characters in ISO-8859-1 and UTF-8, and an
   equivalent ASCII transliteration.  */
char iso8859_1_a[] = {0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, /* Accented A's.  */
		      0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, /* Accented a's.  */
		      0x00};
char utf8_a[] = "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5"
		"\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5";
char ascii_a[] = "AAAAAAaaaaaa";

/* 6. An invalid ASCII string where [0] is invalid and [1] is '~'.  */
char iascii [] = {0x80, '~', '\0'};
char empty[] = "";
char ia2u_ignore[] = "~";
115 | ||
116 | static int | |
117 | do_test (void) | |
118 | { | |
119 | xsetlocale (LC_ALL, "en_US.UTF-8"); | |
120 | ||
121 | ||
122 | /* 0. iconv_open should gracefully fail for invalid character sets. */ | |
123 | ||
124 | TEST_VERIFY (iconv_open ("INVALID", "UTF-8") == (iconv_t) -1); | |
125 | TEST_VERIFY (iconv_open ("UTF-8", "INVALID") == (iconv_t) -1); | |
126 | TEST_VERIFY (iconv_open ("INVALID", "INVALID") == (iconv_t) -1); | |
127 | ||
128 | ||
129 | /* 1. ASCII-only UTF-8 input should convert to ASCII with no changes: */ | |
130 | ||
131 | test_iconv ("ASCII", "UTF-8", ascii, strlen (ascii), 0, ascii, 0); | |
132 | test_iconv ("ASCII//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0); | |
133 | test_iconv ("ASCII//TRANSLIT", "UTF-8", ascii, strlen (ascii), 0, ascii, 0); | |
134 | test_iconv ("ASCII//TRANSLIT//", "UTF-8", ascii, strlen (ascii), 0, ascii, | |
135 | 0); | |
136 | test_iconv ("ASCII//IGNORE", "UTF-8", ascii, strlen (ascii), 0, ascii, 0); | |
137 | test_iconv ("ASCII//IGNORE//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0); | |
138 | ||
139 | ||
140 | /* 2. Valid UTF-8 input with non-ASCII characters: */ | |
141 | ||
142 | /* EILSEQ when converted to ASCII. */ | |
143 | test_iconv ("ASCII", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, EILSEQ); | |
144 | ||
145 | /* Converted without error with TRANSLIT enabled. */ | |
146 | test_iconv ("ASCII//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, u2a_translit, | |
147 | 0); | |
148 | ||
149 | /* EILSEQ with IGNORE enabled. Non-ASCII chars dropped from output. */ | |
150 | test_iconv ("ASCII//IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1, | |
151 | u2a_ignore, EILSEQ); | |
152 | ||
153 | /* With TRANSLIT and IGNORE enabled, transliterated without error. We test | |
154 | four combinations. */ | |
155 | ||
156 | test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", utf8, strlen (utf8), 2, | |
157 | u2a_translit, 0); | |
158 | test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", utf8, strlen (utf8), 2, | |
159 | u2a_translit, 0); | |
160 | test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, | |
161 | u2a_translit, 0); | |
162 | /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */ | |
163 | test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, | |
164 | u2a_translit, 0); | |
165 | ||
166 | /* Misspellings of TRANSLIT and IGNORE are ignored, but conversion still | |
167 | works while respecting any other correctly spelled options. */ | |
168 | ||
169 | test_iconv ("ASCII//T", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, | |
170 | EILSEQ); | |
171 | test_iconv ("ASCII//TRANSLITERATE", "UTF-8", utf8, strlen (u2a), (size_t) -1, | |
172 | u2a, EILSEQ); | |
173 | test_iconv ("ASCII//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, | |
174 | EILSEQ); | |
175 | test_iconv ("ASCII//IGNORED", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, | |
176 | EILSEQ); | |
177 | test_iconv ("ASCII//TRANSLITERATE//IGNORED", "UTF-8", utf8, strlen (u2a), | |
178 | (size_t) -1, u2a, EILSEQ); | |
179 | test_iconv ("ASCII//IGNORED,TRANSLITERATE", "UTF-8", utf8, strlen (u2a), | |
180 | (size_t) -1, u2a, EILSEQ); | |
181 | test_iconv ("ASCII//T//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, | |
182 | EILSEQ); | |
183 | ||
184 | test_iconv ("ASCII//TRANSLIT//I", "UTF-8", utf8, strlen (utf8), 2, | |
185 | u2a_translit, 0); | |
186 | /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */ | |
187 | test_iconv ("ASCII//I//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, | |
188 | u2a_translit, 0); | |
189 | test_iconv ("ASCII//IGNORED,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, | |
190 | u2a_translit, 0); | |
191 | test_iconv ("ASCII//TRANSLIT,IGNORED", "UTF-8", utf8, strlen (utf8), 2, | |
192 | u2a_translit, 0); | |
193 | ||
194 | test_iconv ("ASCII//IGNORE,T", "UTF-8", utf8, strlen (utf8), (size_t) -1, | |
195 | u2a_ignore, EILSEQ); | |
196 | test_iconv ("ASCII//T,IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1, | |
197 | u2a_ignore, EILSEQ); | |
198 | /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */ | |
199 | test_iconv ("ASCII//TRANSLITERATE//IGNORE", "UTF-8", utf8, strlen (utf8), | |
200 | (size_t) -1, u2a_ignore, EILSEQ); | |
201 | test_iconv ("ASCII//IGNORE//TRANSLITERATE", "UTF-8", utf8, strlen (utf8), | |
202 | (size_t) -1, u2a_ignore, EILSEQ); | |
203 | ||
204 | ||
205 | /* 3. Invalid UTF-8 followed by some valid non-ASCII UTF-8 characters: */ | |
206 | ||
207 | /* EILSEQ; output is truncated at the first invalid UTF-8 character. */ | |
208 | test_iconv ("ASCII", "UTF-8", iutf8, strlen (iu2a), (size_t) -1, iu2a, | |
209 | EILSEQ); | |
210 | ||
211 | /* With TRANSLIT enabled: EILSEQ; output still truncated at the first invalid | |
212 | UTF-8 character. */ | |
213 | test_iconv ("ASCII//TRANSLIT", "UTF-8", iutf8, strlen (iu2a), (size_t) -1, | |
214 | iu2a, EILSEQ); | |
215 | ||
216 | /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and | |
217 | valid UTF-8 non-ASCII characters. */ | |
218 | test_iconv ("ASCII//IGNORE", "UTF-8", iutf8, strlen (iutf8), (size_t) -1, | |
219 | iu2a_ignore, EILSEQ); | |
220 | ||
221 | /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8 | |
222 | characters and transliterates valid non-ASCII UTF-8 characters. We test | |
223 | four combinations. */ | |
224 | ||
225 | test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", iutf8, strlen (iutf8), 2, | |
226 | iu2a_both, 0); | |
227 | /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */ | |
228 | test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", iutf8, strlen (iutf8), 2, | |
229 | iu2a_both, 0); | |
230 | test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2, | |
231 | iu2a_both, 0); | |
232 | /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */ | |
233 | test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2, | |
234 | iu2a_both, 0); | |
235 | ||
236 | ||
237 | /* 4. Invalid UTF-8 with valid non-ASCII UTF-8 chars appearing first: */ | |
238 | ||
239 | /* EILSEQ; output is truncated at the first non-ASCII character. */ | |
240 | test_iconv ("ASCII", "UTF-8", jutf8, strlen (ju2a), (size_t) -1, ju2a, | |
241 | EILSEQ); | |
242 | ||
243 | /* With TRANSLIT enabled: EILSEQ; output now truncated at the first invalid | |
244 | UTF-8 character. */ | |
245 | test_iconv ("ASCII//TRANSLIT", "UTF-8", jutf8, strlen (jutf8) - 5, | |
246 | (size_t) -1, ju2a_translit, EILSEQ); | |
247 | test_iconv ("ASCII//translit", "UTF-8", jutf8, strlen (jutf8) - 5, | |
248 | (size_t) -1, ju2a_translit, EILSEQ); | |
249 | ||
250 | /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and | |
251 | valid UTF-8 non-ASCII characters. */ | |
252 | test_iconv ("ASCII//IGNORE", "UTF-8", jutf8, strlen (jutf8), (size_t) -1, | |
253 | ju2a_ignore, EILSEQ); | |
254 | test_iconv ("ASCII//ignore", "UTF-8", jutf8, strlen (jutf8), (size_t) -1, | |
255 | ju2a_ignore, EILSEQ); | |
256 | ||
257 | /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8 | |
258 | characters and transliterates valid non-ASCII UTF-8 characters. We test | |
259 | several combinations. */ | |
260 | ||
261 | test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", jutf8, strlen (jutf8), 2, | |
262 | ju2a_both, 0); | |
263 | /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */ | |
264 | test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", jutf8, strlen (jutf8), 2, | |
265 | ju2a_both, 0); | |
266 | test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2, | |
267 | ju2a_both, 0); | |
268 | /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */ | |
269 | test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2, | |
270 | ju2a_both, 0); | |
271 | test_iconv ("ASCII//translit,ignore", "UTF-8", jutf8, strlen (jutf8), 2, | |
272 | ju2a_both, 0); | |
273 | /* Trailing whitespace and separators should be ignored. */ | |
274 | test_iconv ("ASCII//IGNORE,TRANSLIT ", "UTF-8", jutf8, strlen (jutf8), 2, | |
275 | ju2a_both, 0); | |
276 | test_iconv ("ASCII//IGNORE,TRANSLIT/", "UTF-8", jutf8, strlen (jutf8), 2, | |
277 | ju2a_both, 0); | |
278 | test_iconv ("ASCII//IGNORE,TRANSLIT//", "UTF-8", jutf8, strlen (jutf8), 2, | |
279 | ju2a_both, 0); | |
280 | test_iconv ("ASCII//IGNORE,TRANSLIT,", "UTF-8", jutf8, strlen (jutf8), 2, | |
281 | ju2a_both, 0); | |
282 | test_iconv ("ASCII//IGNORE,TRANSLIT,,", "UTF-8", jutf8, strlen (jutf8), 2, | |
283 | ju2a_both, 0); | |
284 | test_iconv ("ASCII//IGNORE,TRANSLIT /,", "UTF-8", jutf8, strlen (jutf8), 2, | |
285 | ju2a_both, 0); | |
286 | ||
287 | /* TRANSLIT or IGNORE suffixes in fromcode should be ignored. */ | |
288 | test_iconv ("ASCII", "UTF-8//TRANSLIT", jutf8, strlen (ju2a), (size_t) -1, | |
289 | ju2a, EILSEQ); | |
290 | test_iconv ("ASCII", "UTF-8//IGNORE", jutf8, strlen (ju2a), (size_t) -1, | |
291 | ju2a, EILSEQ); | |
292 | test_iconv ("ASCII", "UTF-8//TRANSLIT,IGNORE", jutf8, strlen (ju2a), | |
293 | (size_t) -1, ju2a, EILSEQ); | |
294 | ||
295 | ||
296 | /* 5. Charset names of the form "A/B/": */ | |
297 | ||
298 | /* ISO-8859-1 is converted to UTF-8 without needing transliteration. */ | |
299 | test_iconv ("ISO-10646/UTF-8", "ISO-8859-1", iso8859_1_a, | |
300 | strlen (iso8859_1_a), 0, utf8_a, 0); | |
301 | test_iconv ("ISO-10646/UTF-8/", "ISO-8859-1", iso8859_1_a, | |
302 | strlen (iso8859_1_a), 0, utf8_a, 0); | |
303 | test_iconv ("ISO-10646/UTF-8/IGNORE", "ISO-8859-1", iso8859_1_a, | |
304 | strlen (iso8859_1_a), 0, utf8_a, 0); | |
305 | test_iconv ("ISO-10646/UTF-8//IGNORE", "ISO-8859-1", iso8859_1_a, | |
306 | strlen (iso8859_1_a), 0, utf8_a, 0); | |
307 | test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ISO-8859-1", iso8859_1_a, | |
308 | strlen (iso8859_1_a), 0, utf8_a, 0); | |
309 | test_iconv ("ISO-10646/UTF-8//TRANSLIT", "ISO-8859-1", iso8859_1_a, | |
310 | strlen (iso8859_1_a), 0, utf8_a, 0); | |
311 | test_iconv ("ISO-10646/UTF-8//TRANSLIT/IGNORE", "ISO-8859-1", iso8859_1_a, | |
312 | strlen (iso8859_1_a), 0, utf8_a, 0); | |
313 | test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ISO-8859-1", iso8859_1_a, | |
314 | strlen (iso8859_1_a), 0, utf8_a, 0); | |
315 | test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ISO-8859-1", iso8859_1_a, | |
316 | strlen (iso8859_1_a), 0, utf8_a, 0); | |
317 | ||
318 | /* UTF-8 with accented A's is converted to ASCII with transliteration. */ | |
319 | test_iconv ("ASCII", "ISO-10646/UTF-8", utf8_a, | |
320 | 0, (size_t) -1, empty, EILSEQ); | |
321 | test_iconv ("ASCII//IGNORE", "ISO-10646/UTF-8", utf8_a, | |
322 | strlen (utf8_a), (size_t) -1, empty, EILSEQ); | |
323 | test_iconv ("ASCII//TRANSLIT", "ISO-10646/UTF-8", utf8_a, | |
324 | strlen (utf8_a), 12, ascii_a, 0); | |
325 | ||
326 | /* Invalid ASCII is converted to UTF-8 only with IGNORE. */ | |
327 | test_iconv ("ISO-10646/UTF-8", "ASCII", iascii, strlen (empty), (size_t) -1, | |
328 | empty, EILSEQ); | |
329 | test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ASCII", iascii, strlen (empty), | |
330 | (size_t) -1, empty, EILSEQ); | |
331 | test_iconv ("ISO-10646/UTF-8/IGNORE", "ASCII", iascii, strlen (iascii), | |
332 | (size_t) -1, ia2u_ignore, EILSEQ); | |
333 | test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ASCII", iascii, | |
334 | strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ); | |
335 | /* Due to bug 19519, iconv was ignoring IGNORE for the following three | |
336 | inputs: */ | |
337 | test_iconv ("ISO-10646/UTF-8/TRANSLIT/IGNORE", "ASCII", iascii, | |
338 | strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ); | |
339 | test_iconv ("ISO-10646/UTF-8//TRANSLIT,IGNORE", "ASCII", iascii, | |
340 | strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ); | |
341 | test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ASCII", iascii, | |
342 | strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ); | |
343 | ||
344 | return 0; | |
345 | } | |
346 | ||
347 | #include <support/test-driver.c> |