1 /* Test iconv's TRANSLIT and IGNORE option handling
2
3 Copyright (C) 2020-2022 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20
21 #include <iconv.h>
22 #include <locale.h>
23 #include <errno.h>
24 #include <string.h>
25 #include <support/support.h>
26 #include <support/check.h>
27
28
/* Run a single iconv conversion and check every observable result.
   Arguments:
     to:      destination character set and options
     from:    source character set
     input:   NUL-terminated input string to be converted
     exp_in:  expected number of input bytes consumed
     exp_ret: expected return value of iconv ((size_t) -1 on error,
              otherwise the number of irreversible conversions)
     exp_out: expected NUL-terminated output string
     exp_err: expected value of `errno' right after iconv returns.  */
static void
test_iconv (const char *to, const char *from, char *input, size_t exp_in,
            size_t exp_ret, const char *exp_out, int exp_err)
{
  iconv_t cd;
  char outbuf[500];
  size_t inlen, outlen;
  char *inptr, *outptr;
  size_t n;
  int err;

  cd = iconv_open (to, from);
  TEST_VERIFY (cd != (iconv_t) -1);
  /* The check above has already flagged the problem; never hand an invalid
     descriptor to iconv or iconv_close.  */
  if (cd == (iconv_t) -1)
    return;

  inlen = strlen (input);
  outlen = sizeof (outbuf);
  inptr = input;
  outptr = outbuf;

  errno = 0;
  n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
  /* Capture errno before anything else runs: the checks below may call
     other library functions when they report a mismatch, and those calls
     could overwrite it.  */
  err = errno;

  TEST_COMPARE (n, exp_ret);
  /* iconv advances inptr past the bytes it consumed.  */
  TEST_VERIFY (inptr == input + exp_in);
  TEST_COMPARE (err, exp_err);
  /* Only the bytes actually written (outptr - outbuf) are compared; the
     output buffer is not NUL-terminated.  */
  TEST_COMPARE_BLOB (outbuf, outptr - outbuf, exp_out, strlen (exp_out));
  TEST_VERIFY (iconv_close (cd) == 0);
}
64
65
66 /* We test option parsing by converting UTF-8 inputs to ASCII under various
67 option combinations. The UTF-8 inputs fall into three categories:
68 - ASCII-only,
69 - non-ASCII,
70 - non-ASCII with invalid UTF-8 characters. */
71
/* 1. ASCII-only input.  */
char ascii[] = "Just some ASCII text";

/* 2. Valid UTF-8 input and some corresponding expected outputs with various
   options.  The two non-ASCII characters below are accented letters:
   an `a' (U+00E1) then an `o' (U+00F3).
     u2a:          the ASCII prefix that precedes the first accented letter
     u2a_translit: both accented letters replaced by their base letters
     u2a_ignore:   both accented letters dropped.  Note the TWO spaces
                   between "with" and "couple": the spaces on either side
                   of the dropped U+00E1 both remain.  */
char utf8[] = "UTF-8 text with \u00E1 couple \u00F3f non-ASCII characters";
char u2a[] = "UTF-8 text with ";
char u2a_translit[] = "UTF-8 text with a couple of non-ASCII characters";
char u2a_ignore[] = "UTF-8 text with  couple f non-ASCII characters";

/* 3. Invalid UTF-8 input and some corresponding expected outputs.  \xff is
   invalid UTF-8.  It is followed by some valid but non-ASCII UTF-8
   (U+27E6 and U+27E7 around the word "text").
     iu2a:        the ASCII prefix that precedes the invalid byte
     iu2a_ignore: the invalid byte and both non-ASCII characters dropped
     iu2a_both:   the invalid byte dropped, the non-ASCII characters
                  replaced by "[|" and "|]".  */
char iutf8[] = "Invalid UTF-8 \xff\u27E6text\u27E7";
char iu2a[] = "Invalid UTF-8 ";
char iu2a_ignore[] = "Invalid UTF-8 text";
char iu2a_both[] = "Invalid UTF-8 [|text|]";

/* 4. Another invalid UTF-8 input and corresponding expected outputs.  This
   time the valid non-ASCII UTF-8 characters appear before the invalid \xff.
     ju2a:          the ASCII prefix that precedes the first non-ASCII
                    character
     ju2a_translit: everything before the invalid byte, with the non-ASCII
                    characters replaced by "[|" and "|]"
     ju2a_ignore:   the invalid byte and both non-ASCII characters dropped
     ju2a_both:     the invalid byte dropped, the non-ASCII characters
                    replaced by "[|" and "|]".  */
char jutf8[] = "Invalid \u27E6UTF-8\u27E7 \xfftext";
char ju2a[] = "Invalid ";
char ju2a_translit[] = "Invalid [|UTF-8|] ";
char ju2a_ignore[] = "Invalid UTF-8 text";
char ju2a_both[] = "Invalid [|UTF-8|] text";
97
/* Option handling is also exercised for character set names of the form
   "A/B".  For these, the tests convert between "ISO-10646/UTF-8" and
   either ISO-8859-1 or ASCII.  */

/* 5. Accented 'A' and 'a' characters in ISO-8859-1 and UTF-8, and an
   equivalent ASCII transliteration.  The ISO-8859-1 bytes are written as
   hex escapes; the array is NUL-terminated by the string literal.  */
char iso8859_1_a[] = "\xc0\xc1\xc2\xc3\xc4\xc5"   /* Accented A's.  */
                     "\xe0\xe1\xe2\xe3\xe4\xe5";  /* Accented a's.  */
char utf8_a[] = "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5"
                "\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5";
char ascii_a[] = "AAAAAAaaaaaa";

/* 6. An invalid ASCII string: byte [0] (0x80) is outside the ASCII range
   and byte [1] is '~'.  */
char iascii[] = "\x80~";
char empty[] = "";
char ia2u_ignore[] = "~";
115
116 static int
do_test(void)117 do_test (void)
118 {
119 xsetlocale (LC_ALL, "en_US.UTF-8");
120
121
122 /* 0. iconv_open should gracefully fail for invalid character sets. */
123
124 TEST_VERIFY (iconv_open ("INVALID", "UTF-8") == (iconv_t) -1);
125 TEST_VERIFY (iconv_open ("UTF-8", "INVALID") == (iconv_t) -1);
126 TEST_VERIFY (iconv_open ("INVALID", "INVALID") == (iconv_t) -1);
127
128
129 /* 1. ASCII-only UTF-8 input should convert to ASCII with no changes: */
130
131 test_iconv ("ASCII", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
132 test_iconv ("ASCII//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
133 test_iconv ("ASCII//TRANSLIT", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
134 test_iconv ("ASCII//TRANSLIT//", "UTF-8", ascii, strlen (ascii), 0, ascii,
135 0);
136 test_iconv ("ASCII//IGNORE", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
137 test_iconv ("ASCII//IGNORE//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
138
139
140 /* 2. Valid UTF-8 input with non-ASCII characters: */
141
142 /* EILSEQ when converted to ASCII. */
143 test_iconv ("ASCII", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, EILSEQ);
144
145 /* Converted without error with TRANSLIT enabled. */
146 test_iconv ("ASCII//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, u2a_translit,
147 0);
148
149 /* EILSEQ with IGNORE enabled. Non-ASCII chars dropped from output. */
150 test_iconv ("ASCII//IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
151 u2a_ignore, EILSEQ);
152
153 /* With TRANSLIT and IGNORE enabled, transliterated without error. We test
154 four combinations. */
155
156 test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", utf8, strlen (utf8), 2,
157 u2a_translit, 0);
158 test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", utf8, strlen (utf8), 2,
159 u2a_translit, 0);
160 test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
161 u2a_translit, 0);
162 /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
163 test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
164 u2a_translit, 0);
165
166 /* Misspellings of TRANSLIT and IGNORE are ignored, but conversion still
167 works while respecting any other correctly spelled options. */
168
169 test_iconv ("ASCII//T", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
170 EILSEQ);
171 test_iconv ("ASCII//TRANSLITERATE", "UTF-8", utf8, strlen (u2a), (size_t) -1,
172 u2a, EILSEQ);
173 test_iconv ("ASCII//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
174 EILSEQ);
175 test_iconv ("ASCII//IGNORED", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
176 EILSEQ);
177 test_iconv ("ASCII//TRANSLITERATE//IGNORED", "UTF-8", utf8, strlen (u2a),
178 (size_t) -1, u2a, EILSEQ);
179 test_iconv ("ASCII//IGNORED,TRANSLITERATE", "UTF-8", utf8, strlen (u2a),
180 (size_t) -1, u2a, EILSEQ);
181 test_iconv ("ASCII//T//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
182 EILSEQ);
183
184 test_iconv ("ASCII//TRANSLIT//I", "UTF-8", utf8, strlen (utf8), 2,
185 u2a_translit, 0);
186 /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
187 test_iconv ("ASCII//I//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
188 u2a_translit, 0);
189 test_iconv ("ASCII//IGNORED,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
190 u2a_translit, 0);
191 test_iconv ("ASCII//TRANSLIT,IGNORED", "UTF-8", utf8, strlen (utf8), 2,
192 u2a_translit, 0);
193
194 test_iconv ("ASCII//IGNORE,T", "UTF-8", utf8, strlen (utf8), (size_t) -1,
195 u2a_ignore, EILSEQ);
196 test_iconv ("ASCII//T,IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
197 u2a_ignore, EILSEQ);
198 /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
199 test_iconv ("ASCII//TRANSLITERATE//IGNORE", "UTF-8", utf8, strlen (utf8),
200 (size_t) -1, u2a_ignore, EILSEQ);
201 test_iconv ("ASCII//IGNORE//TRANSLITERATE", "UTF-8", utf8, strlen (utf8),
202 (size_t) -1, u2a_ignore, EILSEQ);
203
204
205 /* 3. Invalid UTF-8 followed by some valid non-ASCII UTF-8 characters: */
206
207 /* EILSEQ; output is truncated at the first invalid UTF-8 character. */
208 test_iconv ("ASCII", "UTF-8", iutf8, strlen (iu2a), (size_t) -1, iu2a,
209 EILSEQ);
210
211 /* With TRANSLIT enabled: EILSEQ; output still truncated at the first invalid
212 UTF-8 character. */
213 test_iconv ("ASCII//TRANSLIT", "UTF-8", iutf8, strlen (iu2a), (size_t) -1,
214 iu2a, EILSEQ);
215
216 /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
217 valid UTF-8 non-ASCII characters. */
218 test_iconv ("ASCII//IGNORE", "UTF-8", iutf8, strlen (iutf8), (size_t) -1,
219 iu2a_ignore, EILSEQ);
220
221 /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
222 characters and transliterates valid non-ASCII UTF-8 characters. We test
223 four combinations. */
224
225 test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
226 iu2a_both, 0);
227 /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
228 test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
229 iu2a_both, 0);
230 test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
231 iu2a_both, 0);
232 /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
233 test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
234 iu2a_both, 0);
235
236
237 /* 4. Invalid UTF-8 with valid non-ASCII UTF-8 chars appearing first: */
238
239 /* EILSEQ; output is truncated at the first non-ASCII character. */
240 test_iconv ("ASCII", "UTF-8", jutf8, strlen (ju2a), (size_t) -1, ju2a,
241 EILSEQ);
242
243 /* With TRANSLIT enabled: EILSEQ; output now truncated at the first invalid
244 UTF-8 character. */
245 test_iconv ("ASCII//TRANSLIT", "UTF-8", jutf8, strlen (jutf8) - 5,
246 (size_t) -1, ju2a_translit, EILSEQ);
247 test_iconv ("ASCII//translit", "UTF-8", jutf8, strlen (jutf8) - 5,
248 (size_t) -1, ju2a_translit, EILSEQ);
249
250 /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
251 valid UTF-8 non-ASCII characters. */
252 test_iconv ("ASCII//IGNORE", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
253 ju2a_ignore, EILSEQ);
254 test_iconv ("ASCII//ignore", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
255 ju2a_ignore, EILSEQ);
256
257 /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
258 characters and transliterates valid non-ASCII UTF-8 characters. We test
259 several combinations. */
260
261 test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
262 ju2a_both, 0);
263 /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
264 test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
265 ju2a_both, 0);
266 test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
267 ju2a_both, 0);
268 /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
269 test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
270 ju2a_both, 0);
271 test_iconv ("ASCII//translit,ignore", "UTF-8", jutf8, strlen (jutf8), 2,
272 ju2a_both, 0);
273 /* Trailing whitespace and separators should be ignored. */
274 test_iconv ("ASCII//IGNORE,TRANSLIT ", "UTF-8", jutf8, strlen (jutf8), 2,
275 ju2a_both, 0);
276 test_iconv ("ASCII//IGNORE,TRANSLIT/", "UTF-8", jutf8, strlen (jutf8), 2,
277 ju2a_both, 0);
278 test_iconv ("ASCII//IGNORE,TRANSLIT//", "UTF-8", jutf8, strlen (jutf8), 2,
279 ju2a_both, 0);
280 test_iconv ("ASCII//IGNORE,TRANSLIT,", "UTF-8", jutf8, strlen (jutf8), 2,
281 ju2a_both, 0);
282 test_iconv ("ASCII//IGNORE,TRANSLIT,,", "UTF-8", jutf8, strlen (jutf8), 2,
283 ju2a_both, 0);
284 test_iconv ("ASCII//IGNORE,TRANSLIT /,", "UTF-8", jutf8, strlen (jutf8), 2,
285 ju2a_both, 0);
286
287 /* TRANSLIT or IGNORE suffixes in fromcode should be ignored. */
288 test_iconv ("ASCII", "UTF-8//TRANSLIT", jutf8, strlen (ju2a), (size_t) -1,
289 ju2a, EILSEQ);
290 test_iconv ("ASCII", "UTF-8//IGNORE", jutf8, strlen (ju2a), (size_t) -1,
291 ju2a, EILSEQ);
292 test_iconv ("ASCII", "UTF-8//TRANSLIT,IGNORE", jutf8, strlen (ju2a),
293 (size_t) -1, ju2a, EILSEQ);
294
295
296 /* 5. Charset names of the form "A/B/": */
297
298 /* ISO-8859-1 is converted to UTF-8 without needing transliteration. */
299 test_iconv ("ISO-10646/UTF-8", "ISO-8859-1", iso8859_1_a,
300 strlen (iso8859_1_a), 0, utf8_a, 0);
301 test_iconv ("ISO-10646/UTF-8/", "ISO-8859-1", iso8859_1_a,
302 strlen (iso8859_1_a), 0, utf8_a, 0);
303 test_iconv ("ISO-10646/UTF-8/IGNORE", "ISO-8859-1", iso8859_1_a,
304 strlen (iso8859_1_a), 0, utf8_a, 0);
305 test_iconv ("ISO-10646/UTF-8//IGNORE", "ISO-8859-1", iso8859_1_a,
306 strlen (iso8859_1_a), 0, utf8_a, 0);
307 test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ISO-8859-1", iso8859_1_a,
308 strlen (iso8859_1_a), 0, utf8_a, 0);
309 test_iconv ("ISO-10646/UTF-8//TRANSLIT", "ISO-8859-1", iso8859_1_a,
310 strlen (iso8859_1_a), 0, utf8_a, 0);
311 test_iconv ("ISO-10646/UTF-8//TRANSLIT/IGNORE", "ISO-8859-1", iso8859_1_a,
312 strlen (iso8859_1_a), 0, utf8_a, 0);
313 test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ISO-8859-1", iso8859_1_a,
314 strlen (iso8859_1_a), 0, utf8_a, 0);
315 test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ISO-8859-1", iso8859_1_a,
316 strlen (iso8859_1_a), 0, utf8_a, 0);
317
318 /* UTF-8 with accented A's is converted to ASCII with transliteration. */
319 test_iconv ("ASCII", "ISO-10646/UTF-8", utf8_a,
320 0, (size_t) -1, empty, EILSEQ);
321 test_iconv ("ASCII//IGNORE", "ISO-10646/UTF-8", utf8_a,
322 strlen (utf8_a), (size_t) -1, empty, EILSEQ);
323 test_iconv ("ASCII//TRANSLIT", "ISO-10646/UTF-8", utf8_a,
324 strlen (utf8_a), 12, ascii_a, 0);
325
326 /* Invalid ASCII is converted to UTF-8 only with IGNORE. */
327 test_iconv ("ISO-10646/UTF-8", "ASCII", iascii, strlen (empty), (size_t) -1,
328 empty, EILSEQ);
329 test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ASCII", iascii, strlen (empty),
330 (size_t) -1, empty, EILSEQ);
331 test_iconv ("ISO-10646/UTF-8/IGNORE", "ASCII", iascii, strlen (iascii),
332 (size_t) -1, ia2u_ignore, EILSEQ);
333 test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ASCII", iascii,
334 strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
335 /* Due to bug 19519, iconv was ignoring IGNORE for the following three
336 inputs: */
337 test_iconv ("ISO-10646/UTF-8/TRANSLIT/IGNORE", "ASCII", iascii,
338 strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
339 test_iconv ("ISO-10646/UTF-8//TRANSLIT,IGNORE", "ASCII", iascii,
340 strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
341 test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ASCII", iascii,
342 strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
343
344 return 0;
345 }
346
347 #include <support/test-driver.c>
348