1 /* Test iconv's TRANSLIT and IGNORE option handling
2 
3    Copyright (C) 2020-2022 Free Software Foundation, Inc.
4    This file is part of the GNU C Library.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 
21 #include <iconv.h>
22 #include <locale.h>
23 #include <errno.h>
24 #include <string.h>
25 #include <support/support.h>
26 #include <support/check.h>
27 
28 
/* Run one iconv test: open a conversion descriptor, convert INPUT in a single
   iconv call and compare every observable result with the expectations.
   Arguments:
   to: destination character set and options
   from: source character set
   input: NUL-terminated input string to be converted
   exp_in: expected number of bytes consumed
   exp_ret: expected return value (error or number of irreversible conversions)
   exp_out: expected output string
   exp_err: expected value of `errno' after iconv returns.

   If the descriptor cannot be opened, the failure is recorded and the
   remaining checks are skipped, because there is nothing valid to pass to
   iconv or iconv_close.  */
static void
test_iconv (const char *to, const char *from, char *input, size_t exp_in,
            size_t exp_ret, const char *exp_out, int exp_err)
{
  iconv_t cd;
  char outbuf[500];
  size_t inlen, outlen;
  char *inptr, *outptr;
  size_t n;
  int iconv_errno;

  cd = iconv_open (to, from);
  TEST_VERIFY (cd != (iconv_t) -1);
  if (cd == (iconv_t) -1)
    /* Already reported above; do not hand an invalid descriptor to iconv.  */
    return;

  inlen = strlen (input);
  outlen = sizeof (outbuf);
  inptr = input;
  outptr = outbuf;

  /* Clear errno first so that a successful conversion is seen as 0.  */
  errno = 0;
  n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
  /* Capture errno right away, before any later call can touch it.  */
  iconv_errno = errno;

  TEST_COMPARE (n, exp_ret);
  /* iconv advances inptr past the bytes it consumed.  */
  TEST_VERIFY (inptr == input + exp_in);
  TEST_COMPARE (iconv_errno, exp_err);
  /* outptr points just past the last byte written into outbuf.  */
  TEST_COMPARE_BLOB (outbuf, outptr - outbuf, exp_out, strlen (exp_out));
  TEST_VERIFY (iconv_close (cd) == 0);
}
64 
65 
/* Option parsing is exercised by converting UTF-8 inputs to ASCII under
   various option combinations.  The UTF-8 inputs fall into three categories:
   - ASCII-only,
   - non-ASCII,
   - non-ASCII together with invalid UTF-8 bytes.  */

/* 1. Plain ASCII text.  */
char ascii[] = "Just some ASCII text";

/* 2. Valid UTF-8 input, followed by the outputs expected for it under
   different options.  The input holds two accented letters: an `a' and
   then an `o'.  */
char utf8[] = "UTF-8 text with \u00E1 couple \u00F3f non-ASCII characters";
char u2a[] = "UTF-8 text with ";
char u2a_translit[] = "UTF-8 text with a couple of non-ASCII characters";
char u2a_ignore[] = "UTF-8 text with  couple f non-ASCII characters";

/* 3. Invalid UTF-8 input and the outputs expected for it.  The byte \xff is
   never valid in UTF-8; valid but non-ASCII characters come after it.  */
char iutf8[] = "Invalid UTF-8 \xff\u27E6text\u27E7";
char iu2a[] = "Invalid UTF-8 ";
char iu2a_ignore[] = "Invalid UTF-8 text";
char iu2a_both[] = "Invalid UTF-8 [|text|]";

/* 4. A second invalid UTF-8 input and its expected outputs.  Here the valid
   non-ASCII characters come before the invalid \xff byte.  */
char jutf8[] = "Invalid \u27E6UTF-8\u27E7 \xfftext";
char ju2a[] = "Invalid ";
char ju2a_translit[] = "Invalid [|UTF-8|] ";
char ju2a_ignore[] = "Invalid UTF-8 text";
char ju2a_both[] = "Invalid [|UTF-8|] text";

/* Option handling is also tested for character set names of the form
   "A/B".  These tests convert between "ISO-10646/UTF-8" and either
   ISO-8859-1 or ASCII.  */

/* 5. Accented 'A' and 'a' characters in ISO-8859-1 and in UTF-8, plus the
   matching ASCII transliteration.  */
char iso8859_1_a[] = "\xc0\xc1\xc2\xc3\xc4\xc5"   /* Accented A's.  */
                     "\xe0\xe1\xe2\xe3\xe4\xe5";  /* Accented a's.  */
char utf8_a[] = "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5"
                "\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5";
char ascii_a[] = "AAAAAAaaaaaa";

/* 6. An invalid ASCII string: byte [0] is outside ASCII, byte [1] is '~'.  */
char iascii[] = "\x80~";
char empty[] = "";
char ia2u_ignore[] = "~";
115 
116 static int
do_test(void)117 do_test (void)
118 {
119   xsetlocale (LC_ALL, "en_US.UTF-8");
120 
121 
122   /* 0. iconv_open should gracefully fail for invalid character sets.  */
123 
124   TEST_VERIFY (iconv_open ("INVALID", "UTF-8") == (iconv_t) -1);
125   TEST_VERIFY (iconv_open ("UTF-8", "INVALID") == (iconv_t) -1);
126   TEST_VERIFY (iconv_open ("INVALID", "INVALID") == (iconv_t) -1);
127 
128 
129   /* 1. ASCII-only UTF-8 input should convert to ASCII with no changes:  */
130 
131   test_iconv ("ASCII", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
132   test_iconv ("ASCII//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
133   test_iconv ("ASCII//TRANSLIT", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
134   test_iconv ("ASCII//TRANSLIT//", "UTF-8", ascii, strlen (ascii), 0, ascii,
135               0);
136   test_iconv ("ASCII//IGNORE", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
137   test_iconv ("ASCII//IGNORE//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
138 
139 
140   /* 2. Valid UTF-8 input with non-ASCII characters:  */
141 
142   /* EILSEQ when converted to ASCII.  */
143   test_iconv ("ASCII", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, EILSEQ);
144 
145   /* Converted without error with TRANSLIT enabled.  */
146   test_iconv ("ASCII//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, u2a_translit,
147               0);
148 
149   /* EILSEQ with IGNORE enabled.  Non-ASCII chars dropped from output.  */
150   test_iconv ("ASCII//IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
151               u2a_ignore, EILSEQ);
152 
153   /* With TRANSLIT and IGNORE enabled, transliterated without error.  We test
154      four combinations.  */
155 
156   test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", utf8, strlen (utf8), 2,
157               u2a_translit, 0);
158   test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", utf8, strlen (utf8), 2,
159               u2a_translit, 0);
160   test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
161               u2a_translit, 0);
162   /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input.  */
163   test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
164               u2a_translit, 0);
165 
166   /* Misspellings of TRANSLIT and IGNORE are ignored, but conversion still
167      works while respecting any other correctly spelled options.  */
168 
169   test_iconv ("ASCII//T", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
170               EILSEQ);
171   test_iconv ("ASCII//TRANSLITERATE", "UTF-8", utf8, strlen (u2a), (size_t) -1,
172               u2a, EILSEQ);
173   test_iconv ("ASCII//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
174               EILSEQ);
175   test_iconv ("ASCII//IGNORED", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
176               EILSEQ);
177   test_iconv ("ASCII//TRANSLITERATE//IGNORED", "UTF-8", utf8, strlen (u2a),
178               (size_t) -1, u2a, EILSEQ);
179   test_iconv ("ASCII//IGNORED,TRANSLITERATE", "UTF-8", utf8, strlen (u2a),
180               (size_t) -1, u2a, EILSEQ);
181   test_iconv ("ASCII//T//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
182               EILSEQ);
183 
184   test_iconv ("ASCII//TRANSLIT//I", "UTF-8", utf8, strlen (utf8), 2,
185               u2a_translit, 0);
186   /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input.  */
187   test_iconv ("ASCII//I//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
188               u2a_translit, 0);
189   test_iconv ("ASCII//IGNORED,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
190               u2a_translit, 0);
191   test_iconv ("ASCII//TRANSLIT,IGNORED", "UTF-8", utf8, strlen (utf8), 2,
192               u2a_translit, 0);
193 
194   test_iconv ("ASCII//IGNORE,T", "UTF-8", utf8, strlen (utf8), (size_t) -1,
195               u2a_ignore, EILSEQ);
196   test_iconv ("ASCII//T,IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
197               u2a_ignore, EILSEQ);
198   /* Due to bug 19519, iconv was ignoring IGNORE for the following input.  */
199   test_iconv ("ASCII//TRANSLITERATE//IGNORE", "UTF-8", utf8, strlen (utf8),
200               (size_t) -1, u2a_ignore, EILSEQ);
201   test_iconv ("ASCII//IGNORE//TRANSLITERATE", "UTF-8", utf8, strlen (utf8),
202               (size_t) -1, u2a_ignore, EILSEQ);
203 
204 
205   /* 3. Invalid UTF-8 followed by some valid non-ASCII UTF-8 characters:  */
206 
207   /* EILSEQ; output is truncated at the first invalid UTF-8 character.  */
208   test_iconv ("ASCII", "UTF-8", iutf8, strlen (iu2a), (size_t) -1, iu2a,
209               EILSEQ);
210 
211   /* With TRANSLIT enabled: EILSEQ; output still truncated at the first invalid
212      UTF-8 character.  */
213   test_iconv ("ASCII//TRANSLIT", "UTF-8", iutf8, strlen (iu2a), (size_t) -1,
214               iu2a, EILSEQ);
215 
216   /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
217      valid UTF-8 non-ASCII characters.  */
218   test_iconv ("ASCII//IGNORE", "UTF-8", iutf8, strlen (iutf8), (size_t) -1,
219               iu2a_ignore, EILSEQ);
220 
221   /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
222      characters and transliterates valid non-ASCII UTF-8 characters.  We test
223      four combinations.  */
224 
225   test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
226               iu2a_both, 0);
227   /* Due to bug 19519, iconv was ignoring IGNORE for the following input.  */
228   test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
229               iu2a_both, 0);
230   test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
231               iu2a_both, 0);
232   /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input.  */
233   test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
234               iu2a_both, 0);
235 
236 
237   /* 4. Invalid UTF-8 with valid non-ASCII UTF-8 chars appearing first:  */
238 
239   /* EILSEQ; output is truncated at the first non-ASCII character.  */
240   test_iconv ("ASCII", "UTF-8", jutf8, strlen (ju2a), (size_t) -1, ju2a,
241               EILSEQ);
242 
243   /* With TRANSLIT enabled: EILSEQ; output now truncated at the first invalid
244      UTF-8 character.  */
245   test_iconv ("ASCII//TRANSLIT", "UTF-8", jutf8, strlen (jutf8) - 5,
246               (size_t) -1, ju2a_translit, EILSEQ);
247   test_iconv ("ASCII//translit", "UTF-8", jutf8, strlen (jutf8) - 5,
248               (size_t) -1, ju2a_translit, EILSEQ);
249 
250   /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
251      valid UTF-8 non-ASCII characters.  */
252   test_iconv ("ASCII//IGNORE", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
253               ju2a_ignore, EILSEQ);
254   test_iconv ("ASCII//ignore", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
255               ju2a_ignore, EILSEQ);
256 
257   /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
258      characters and transliterates valid non-ASCII UTF-8 characters.  We test
259      several combinations.  */
260 
261   test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
262               ju2a_both, 0);
263   /* Due to bug 19519, iconv was ignoring IGNORE for the following input.  */
264   test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
265               ju2a_both, 0);
266   test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
267               ju2a_both, 0);
268   /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input.  */
269   test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
270               ju2a_both, 0);
271   test_iconv ("ASCII//translit,ignore", "UTF-8", jutf8, strlen (jutf8), 2,
272               ju2a_both, 0);
273   /* Trailing whitespace and separators should be ignored.  */
274   test_iconv ("ASCII//IGNORE,TRANSLIT ", "UTF-8", jutf8, strlen (jutf8), 2,
275               ju2a_both, 0);
276   test_iconv ("ASCII//IGNORE,TRANSLIT/", "UTF-8", jutf8, strlen (jutf8), 2,
277               ju2a_both, 0);
278   test_iconv ("ASCII//IGNORE,TRANSLIT//", "UTF-8", jutf8, strlen (jutf8), 2,
279               ju2a_both, 0);
280   test_iconv ("ASCII//IGNORE,TRANSLIT,", "UTF-8", jutf8, strlen (jutf8), 2,
281               ju2a_both, 0);
282   test_iconv ("ASCII//IGNORE,TRANSLIT,,", "UTF-8", jutf8, strlen (jutf8), 2,
283               ju2a_both, 0);
284   test_iconv ("ASCII//IGNORE,TRANSLIT /,", "UTF-8", jutf8, strlen (jutf8), 2,
285               ju2a_both, 0);
286 
287   /* TRANSLIT or IGNORE suffixes in fromcode should be ignored.  */
288   test_iconv ("ASCII", "UTF-8//TRANSLIT", jutf8, strlen (ju2a), (size_t) -1,
289               ju2a, EILSEQ);
290   test_iconv ("ASCII", "UTF-8//IGNORE", jutf8, strlen (ju2a), (size_t) -1,
291               ju2a, EILSEQ);
292   test_iconv ("ASCII", "UTF-8//TRANSLIT,IGNORE", jutf8, strlen (ju2a),
293               (size_t) -1, ju2a, EILSEQ);
294 
295 
296   /* 5. Charset names of the form "A/B/":  */
297 
298   /* ISO-8859-1 is converted to UTF-8 without needing transliteration.  */
299   test_iconv ("ISO-10646/UTF-8", "ISO-8859-1", iso8859_1_a,
300               strlen (iso8859_1_a), 0, utf8_a, 0);
301   test_iconv ("ISO-10646/UTF-8/", "ISO-8859-1", iso8859_1_a,
302               strlen (iso8859_1_a), 0, utf8_a, 0);
303   test_iconv ("ISO-10646/UTF-8/IGNORE", "ISO-8859-1", iso8859_1_a,
304               strlen (iso8859_1_a), 0, utf8_a, 0);
305   test_iconv ("ISO-10646/UTF-8//IGNORE", "ISO-8859-1", iso8859_1_a,
306               strlen (iso8859_1_a), 0, utf8_a, 0);
307   test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ISO-8859-1", iso8859_1_a,
308               strlen (iso8859_1_a), 0, utf8_a, 0);
309   test_iconv ("ISO-10646/UTF-8//TRANSLIT", "ISO-8859-1", iso8859_1_a,
310               strlen (iso8859_1_a), 0, utf8_a, 0);
311   test_iconv ("ISO-10646/UTF-8//TRANSLIT/IGNORE", "ISO-8859-1", iso8859_1_a,
312               strlen (iso8859_1_a), 0, utf8_a, 0);
313   test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ISO-8859-1", iso8859_1_a,
314               strlen (iso8859_1_a), 0, utf8_a, 0);
315   test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ISO-8859-1", iso8859_1_a,
316               strlen (iso8859_1_a), 0, utf8_a, 0);
317 
318   /* UTF-8 with accented A's is converted to ASCII with transliteration.  */
319   test_iconv ("ASCII", "ISO-10646/UTF-8", utf8_a,
320               0, (size_t) -1, empty, EILSEQ);
321   test_iconv ("ASCII//IGNORE", "ISO-10646/UTF-8", utf8_a,
322               strlen (utf8_a), (size_t) -1, empty, EILSEQ);
323   test_iconv ("ASCII//TRANSLIT", "ISO-10646/UTF-8", utf8_a,
324               strlen (utf8_a), 12, ascii_a, 0);
325 
326   /* Invalid ASCII is converted to UTF-8 only with IGNORE.  */
327   test_iconv ("ISO-10646/UTF-8", "ASCII", iascii, strlen (empty), (size_t) -1,
328               empty, EILSEQ);
329   test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ASCII", iascii, strlen (empty),
330               (size_t) -1, empty, EILSEQ);
331   test_iconv ("ISO-10646/UTF-8/IGNORE", "ASCII", iascii, strlen (iascii),
332               (size_t) -1, ia2u_ignore, EILSEQ);
333   test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ASCII", iascii,
334               strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
335   /* Due to bug 19519, iconv was ignoring IGNORE for the following three
336      inputs: */
337   test_iconv ("ISO-10646/UTF-8/TRANSLIT/IGNORE", "ASCII", iascii,
338               strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
339   test_iconv ("ISO-10646/UTF-8//TRANSLIT,IGNORE", "ASCII", iascii,
340               strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
341   test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ASCII", iascii,
342               strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
343 
344   return 0;
345 }
346 
347 #include <support/test-driver.c>
348