1 /* Copyright (C) 2000-2022 Free Software Foundation, Inc.
2    This file is part of the GNU C Library.
3 
4    The GNU C Library is free software; you can redistribute it and/or
5    modify it under the terms of the GNU Lesser General Public
6    License as published by the Free Software Foundation; either
7    version 2.1 of the License, or (at your option) any later version.
8 
9    The GNU C Library is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12    Lesser General Public License for more details.
13 
14    You should have received a copy of the GNU Lesser General Public
15    License along with the GNU C Library; if not, see
16    <https://www.gnu.org/licenses/>.  */
17 
18 /* Create a table from CHARSET to Unicode.
19    This is a good test for CHARSET's iconv() module, in particular the
20    FROM_LOOP BODY macro.  */
21 
22 #include <stddef.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <iconv.h>
27 #include <errno.h>
28 
/* If nonzero, ignore conversions outside Unicode plane 0: utf8_decode
   then returns NULL as soon as one formatted character is longer than
   the six characters of "0xXXXX".  main sets this flag for the UTF-8
   and GB18030 charsets so that the output file does not get too big.  */
static int bmp_only;
31 
/* Format the first BUFLEN bytes of BUF as "0x" followed by two uppercase
   hexadecimal digits per byte, e.g. { 0x8E, 0xA1 } gives "0x8EA1".
   BUFLEN must be between 1 and 4; any other length aborts the program.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
hexbuf (unsigned char buf[], unsigned int buflen)
{
  static const char digits[] = "0123456789ABCDEF";
  static char msg[50];
  char *cursor = msg;
  unsigned int idx;

  if (buflen < 1 || buflen > 4)
    abort ();

  *cursor++ = '0';
  *cursor++ = 'x';
  for (idx = 0; idx < buflen; idx++)
    {
      /* High nibble first, then low nibble.  */
      *cursor++ = digits[buf[idx] >> 4];
      *cursor++ = digits[buf[idx] & 0x0f];
    }
  *cursor = '\0';

  return msg;
}
57 
/* Run the BUFLEN bytes of BUF through the conversion descriptor CD as one
   stand-alone input and store the converted bytes in OUT, which must have
   room for 12 bytes.

   Returns the number of bytes stored in OUT when the conversion succeeds,
   0 when iconv fails with EINVAL (BUF is ambiguous: it may be the start
   of a longer sequence), or -1 when iconv fails with EILSEQ (BUF is
   invalid).  Any other iconv failure, or a successful conversion that
   leaves input bytes unconsumed, prints a diagnostic on stderr and exits
   with status 1.  */
static int
try (iconv_t cd, unsigned char buf[], unsigned int buflen, unsigned char *out)
{
  enum { OUT_CAPACITY = 12 };
  const char *src = (const char *) buf;
  size_t src_left = buflen;
  char *dst = (char *) out;
  size_t dst_left = OUT_CAPACITY;
  size_t rc;

  /* Put CD back into its initial state before every attempt.  */
  iconv (cd, NULL, NULL, NULL, NULL);

  rc = iconv (cd, (char **) &src, &src_left, &dst, &dst_left);
  if (rc != (size_t) -1)
    /* The input was accepted; let iconv append whatever it still needs
       to emit to return to the initial state.  */
    rc = iconv (cd, NULL, NULL, &dst, &dst_left);

  if (rc == (size_t) -1)
    {
      /* Capture errno before any stdio call can overwrite it.  */
      int err = errno;

      switch (err)
        {
        case EILSEQ:
          return -1;

        case EINVAL:
          return 0;

        default:
          fprintf (stderr, "%s: iconv error: ", hexbuf (buf, buflen));
          errno = err;
          perror ("");
          exit (1);
        }
    }

  if (src_left != 0)
    {
      /* iconv reported success without consuming the whole input.  */
      fprintf (stderr, "%s: inbytes = %ld, outbytes = %ld\n",
               hexbuf (buf, buflen),
               (long) (buflen - src_left),
               (long) (OUT_CAPACITY - dst_left));
      exit (1);
    }

  return OUT_CAPACITY - dst_left;
}
107 
108 /* Returns the out[] buffer as a Unicode value, formatted as 0x%04X.  */
109 static const char *
utf8_decode(const unsigned char * out,unsigned int outlen)110 utf8_decode (const unsigned char *out, unsigned int outlen)
111 {
112   static char hexbuf[84];
113   char *p = hexbuf;
114 
115   while (outlen > 0)
116     {
117       if (p > hexbuf)
118 	*p++ = ' ';
119 
120       if (out[0] < 0x80)
121 	{
122 	  sprintf (p, "0x%04X", out[0]);
123 	  out += 1; outlen -= 1;
124 	}
125       else if (out[0] >= 0xc0 && out[0] < 0xe0 && outlen >= 2)
126 	{
127 	  sprintf (p, "0x%04X", ((out[0] & 0x1f) << 6) + (out[1] & 0x3f));
128 	  out += 2; outlen -= 2;
129 	}
130       else if (out[0] >= 0xe0 && out[0] < 0xf0 && outlen >= 3)
131 	{
132 	  sprintf (p, "0x%04X", ((out[0] & 0x0f) << 12)
133 				+ ((out[1] & 0x3f) << 6) + (out[2] & 0x3f));
134 	  out += 3; outlen -= 3;
135 	}
136       else if (out[0] >= 0xf0 && out[0] < 0xf8 && outlen >= 4)
137 	{
138 	  sprintf (p, "0x%04X", ((out[0] & 0x07) << 18)
139 				+ ((out[1] & 0x3f) << 12)
140 				+ ((out[2] & 0x3f) << 6) + (out[3] & 0x3f));
141 	  out += 4; outlen -= 4;
142 	}
143       else if (out[0] >= 0xf8 && out[0] < 0xfc && outlen >= 5)
144 	{
145 	  sprintf (p, "0x%04X", ((out[0] & 0x03) << 24)
146 				+ ((out[1] & 0x3f) << 18)
147 				+ ((out[2] & 0x3f) << 12)
148 				+ ((out[3] & 0x3f) << 6) + (out[4] & 0x3f));
149 	  out += 5; outlen -= 5;
150 	}
151       else if (out[0] >= 0xfc && out[0] < 0xfe && outlen >= 6)
152 	{
153 	  sprintf (p, "0x%04X", ((out[0] & 0x01) << 30)
154 				+ ((out[1] & 0x3f) << 24)
155 				+ ((out[2] & 0x3f) << 18)
156 				+ ((out[3] & 0x3f) << 12)
157 				+ ((out[4] & 0x3f) << 6) + (out[5] & 0x3f));
158 	  out += 6; outlen -= 6;
159 	}
160       else
161 	{
162 	  sprintf (p, "0x????");
163 	  out += 1; outlen -= 1;
164 	}
165 
166       if (bmp_only && strlen (p) > 6)
167 	/* Ignore conversions outside Unicode plane 0.  */
168 	return NULL;
169 
170       p += strlen (p);
171     }
172 
173   return hexbuf;
174 }
175 
176 int
main(int argc,char * argv[])177 main (int argc, char *argv[])
178 {
179   const char *charset;
180   iconv_t cd;
181   int search_depth;
182 
183   if (argc != 2)
184     {
185       fprintf (stderr, "Usage: tst-table-from charset\n");
186       exit (1);
187     }
188   charset = argv[1];
189 
190   cd = iconv_open ("UTF-8", charset);
191   if (cd == (iconv_t)(-1))
192     {
193       perror ("iconv_open");
194       exit (1);
195     }
196 
197   /* When testing UTF-8 or GB18030, stop at 0x10000, otherwise the output
198      file gets too big.  */
199   bmp_only = (strcmp (charset, "UTF-8") == 0
200 	      || strcmp (charset, "GB18030") == 0);
201   search_depth = (strcmp (charset, "UTF-8") == 0 ? 3 : 4);
202 
203   {
204     unsigned char out[12];
205     unsigned char buf[4];
206     unsigned int i0, i1, i2, i3;
207     int result;
208 
209     for (i0 = 0; i0 < 0x100; i0++)
210       {
211 	buf[0] = i0;
212 	result = try (cd, buf, 1, out);
213 	if (result < 0)
214 	  {
215 	  }
216 	else if (result > 0)
217 	  {
218 	    const char *unicode = utf8_decode (out, result);
219 	    if (unicode != NULL)
220 	      printf ("0x%02X\t%s\n", i0, unicode);
221 	  }
222 	else
223 	  {
224 	    for (i1 = 0; i1 < 0x100; i1++)
225 	      {
226 		buf[1] = i1;
227 		result = try (cd, buf, 2, out);
228 		if (result < 0)
229 		  {
230 		  }
231 		else if (result > 0)
232 		  {
233 		    const char *unicode = utf8_decode (out, result);
234 		    if (unicode != NULL)
235 		      printf ("0x%02X%02X\t%s\n", i0, i1, unicode);
236 		  }
237 		else
238 		  {
239 		    for (i2 = 0; i2 < 0x100; i2++)
240 		      {
241 			buf[2] = i2;
242 			result = try (cd, buf, 3, out);
243 			if (result < 0)
244 			  {
245 			  }
246 			else if (result > 0)
247 			  {
248 			    const char *unicode = utf8_decode (out, result);
249 			    if (unicode != NULL)
250 			      printf ("0x%02X%02X%02X\t%s\n",
251 				      i0, i1, i2, unicode);
252 			  }
253 			else if (search_depth > 3)
254 			  {
255 			    for (i3 = 0; i3 < 0x100; i3++)
256 			      {
257 				buf[3] = i3;
258 				result = try (cd, buf, 4, out);
259 				if (result < 0)
260 				  {
261 				  }
262 				else if (result > 0)
263 				  {
264 				    const char *unicode =
265 				      utf8_decode (out, result);
266 				    if (unicode != NULL)
267 				      printf ("0x%02X%02X%02X%02X\t%s\n",
268 					      i0, i1, i2, i3, unicode);
269 				  }
270 				else
271 				  {
272 				    fprintf (stderr,
273 					     "%s: incomplete byte sequence\n",
274 					     hexbuf (buf, 4));
275 				    exit (1);
276 				  }
277 			      }
278 			  }
279 		      }
280 		  }
281 	      }
282 	  }
283       }
284   }
285 
286   if (iconv_close (cd) < 0)
287     {
288       perror ("iconv_close");
289       exit (1);
290     }
291 
292   if (ferror (stdin) || fflush (stdout) || ferror (stdout))
293     {
294       fprintf (stderr, "I/O error\n");
295       exit (1);
296     }
297 
298   return 0;
299 }
300