1 /* Convert using charmaps and possibly iconv().
2    Copyright (C) 2001-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published
7    by the Free Software Foundation; version 2 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
17 
18 #include <assert.h>
19 #include <errno.h>
20 #include <error.h>
21 #include <fcntl.h>
22 #include <iconv.h>
23 #include <libintl.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <unistd.h>
27 #include <stdint.h>
28 #include <sys/mman.h>
29 #include <sys/stat.h>
30 
31 #include "iconv_prog.h"
32 
33 
34 /* Prototypes for a few program-wide used functions.  */
35 #include <programs/xmalloc.h>
36 
37 
/* One node of the byte-indexed lookup tree used for the conversion.
   A node is indexed by the value of one input byte (0..255).  */
struct convtable
{
  /* Terminal flags, one bit per byte value.  The helpers address bit
     IDX as bit (IDX % 8) of element (IDX / 8), so only the low eight
     bits of every element are used.  */
  int term[256 / 8];
  /* Payload for every byte value.  When the terminal bit for the index
     is set the OUT member holds the output sequence for the input
     sequence ending here; otherwise SUB is either NULL or the node
     used for the next input byte.  */
  union
  {
    struct convtable *sub;
    struct charseq *out;
  } val[256];
};
47 
48 
49 static inline struct convtable *
allocate_table(void)50 allocate_table (void)
51 {
52   return (struct convtable *) xcalloc (1, sizeof (struct convtable));
53 }
54 
55 static inline void
free_table(struct convtable * tbl)56 free_table (struct convtable *tbl)
57 {
58   free (tbl);
59 }
60 
61 
62 static inline int
is_term(struct convtable * tbl,unsigned int idx)63 is_term (struct convtable *tbl, unsigned int idx)
64 {
65   return tbl->term[idx / 8] & (1 << (idx % 8));
66 }
67 
68 
69 static inline void
clear_term(struct convtable * tbl,unsigned int idx)70 clear_term (struct convtable *tbl, unsigned int idx)
71 {
72   tbl->term[idx / 8] &= ~(1 << (idx % 8));
73 }
74 
75 
76 static inline void
set_term(struct convtable * tbl,unsigned int idx)77 set_term (struct convtable *tbl, unsigned int idx)
78 {
79   tbl->term[idx / 8] |= 1 << (idx % 8);
80 }
81 
82 
/* Generate the conversion table.  One function exists for each
   combination of available charmaps.  use_from_charmap and
   use_to_charmap return NULL if the required iconv descriptor cannot
   be opened.  */
static struct convtable *use_from_charmap (struct charmap_t *from_charmap,
					   const char *to_code);
static struct convtable *use_to_charmap (const char *from_code,
					 struct charmap_t *to_charmap);
static struct convtable *use_both_charmaps (struct charmap_t *from_charmap,
					    struct charmap_t *to_charmap);

/* Prototypes for the functions doing the actual work.  process_block
   converts a buffer which is completely in memory; process_fd and
   process_file first read the whole input and then hand it to
   process_block.  */
static int process_block (struct convtable *tbl, char *addr, size_t len,
			  FILE *output);
static int process_fd (struct convtable *tbl, int fd, FILE *output);
static int process_file (struct convtable *tbl, FILE *input, FILE *output);
96 
97 
98 int
charmap_conversion(const char * from_code,struct charmap_t * from_charmap,const char * to_code,struct charmap_t * to_charmap,int argc,int remaining,char * argv[],const char * output_file)99 charmap_conversion (const char *from_code, struct charmap_t *from_charmap,
100 		    const char *to_code, struct charmap_t *to_charmap,
101 		    int argc, int remaining, char *argv[],
102 		    const char *output_file)
103 {
104   struct convtable *cvtbl;
105   int status = EXIT_SUCCESS;
106 
107   /* We have three different cases to handle:
108 
109      - both, from_charmap and to_charmap, are available.  This means we
110        can assume that the symbolic names match and use them to create
111        the mapping.
112 
113      - only from_charmap is available.  In this case we can only hope that
114        the symbolic names used are of the <Uxxxx> form in which case we
115        can use a UCS4->"to_code" iconv() conversion for the second step.
116 
117      - only to_charmap is available.  This is similar, only that we would
118        use iconv() for the "to_code"->UCS4 conversion.
119 
120        We first create a table which maps input bytes into output bytes.
121        Once this is done we can handle all three of the cases above
122        equally.  */
123   if (from_charmap != NULL)
124     {
125       if (to_charmap == NULL)
126 	cvtbl = use_from_charmap (from_charmap, to_code);
127       else
128 	cvtbl = use_both_charmaps (from_charmap, to_charmap);
129     }
130   else
131     {
132       assert (to_charmap != NULL);
133       cvtbl = use_to_charmap (from_code, to_charmap);
134     }
135 
136   /* If we couldn't generate a table stop now.  */
137   if (cvtbl == NULL)
138     return EXIT_FAILURE;
139 
140   /* Determine output file.  */
141   FILE *output;
142   if (output_file != NULL && strcmp (output_file, "-") != 0)
143     {
144       output = fopen (output_file, "w");
145       if (output == NULL)
146 	error (EXIT_FAILURE, errno, _("cannot open output file"));
147     }
148   else
149     output = stdout;
150 
151   /* We can now start the conversion.  */
152   if (remaining == argc)
153     {
154       if (process_file (cvtbl, stdin, output) != 0)
155 	status = EXIT_FAILURE;
156     }
157   else
158     do
159       {
160 	int fd;
161 
162 	if (verbose)
163 	  printf ("%s:\n", argv[remaining]);
164 	if (strcmp (argv[remaining], "-") == 0)
165 	  fd = 0;
166 	else
167 	  {
168 	    fd = open (argv[remaining], O_RDONLY);
169 
170 	    if (fd == -1)
171 	      {
172 		error (0, errno, _("cannot open input file `%s'"),
173 		       argv[remaining]);
174 		status = EXIT_FAILURE;
175 		continue;
176 	      }
177 	  }
178 
179 #ifdef _POSIX_MAPPED_FILES
180 	struct stat64 st;
181 	char *addr;
182 	/* We have possibilities for reading the input file.  First try
183 	   to mmap() it since this will provide the fastest solution.  */
184 	if (fstat64 (fd, &st) == 0
185 	    && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
186 			      fd, 0)) != MAP_FAILED))
187 	  {
188 	    /* Yes, we can use mmap().  The descriptor is not needed
189 	       anymore.  */
190 	    if (close (fd) != 0)
191 	      error (EXIT_FAILURE, errno,
192 		     _("error while closing input `%s'"), argv[remaining]);
193 
194 	    if (process_block (cvtbl, addr, st.st_size, output) < 0)
195 	      {
196 		/* Something went wrong.  */
197 		status = EXIT_FAILURE;
198 
199 		/* We don't need the input data anymore.  */
200 		munmap ((void *) addr, st.st_size);
201 
202 		/* We cannot go on with producing output since it might
203 		   lead to problem because the last output might leave
204 		   the output stream in an undefined state.  */
205 		break;
206 	      }
207 
208 	    /* We don't need the input data anymore.  */
209 	    munmap ((void *) addr, st.st_size);
210 	  }
211 	else
212 #endif	/* _POSIX_MAPPED_FILES */
213 	  {
214 	    /* Read the file in pieces.  */
215 	    if (process_fd (cvtbl, fd, output) != 0)
216 	      {
217 		/* Something went wrong.  */
218 		status = EXIT_FAILURE;
219 
220 		/* We don't need the input file anymore.  */
221 		close (fd);
222 
223 		/* We cannot go on with producing output since it might
224 		   lead to problem because the last output might leave
225 		   the output stream in an undefined state.  */
226 		break;
227 	      }
228 
229 	    /* Now close the file.  */
230 	    close (fd);
231 	  }
232       }
233     while (++remaining < argc);
234 
235   /* All done.  */
236   if (output != stdout)
237     fclose (output);
238   free_table (cvtbl);
239   return status;
240 }
241 
242 
243 /* Add the IN->OUT mapping to TBL.  OUT is potentially stored in the table.
244    IN is used only here, so it need not be kept live afterwards.  */
245 static void
add_bytes(struct convtable * tbl,const struct charseq * in,struct charseq * out)246 add_bytes (struct convtable *tbl, const struct charseq *in, struct charseq *out)
247 {
248   int n = 0;
249   unsigned int byte;
250 
251   assert (in->nbytes > 0);
252 
253   byte = ((unsigned char *) in->bytes)[n];
254   while (n + 1 < in->nbytes)
255     {
256       if (is_term (tbl, byte) || tbl->val[byte].sub == NULL)
257 	{
258 	  /* Note that we simply ignore a definition for a byte sequence
259 	     which is also the prefix for a longer one.  */
260 	  clear_term (tbl, byte);
261 	  tbl->val[byte].sub =
262 	    (struct convtable *) xcalloc (1, sizeof (struct convtable));
263 	}
264 
265       tbl = tbl->val[byte].sub;
266 
267       byte = ((unsigned char *) in->bytes)[++n];
268     }
269 
270   /* Only add the new sequence if there is none yet and the byte sequence
271      is not part of an even longer one.  */
272   if (! is_term (tbl, byte) && tbl->val[byte].sub == NULL)
273     {
274       set_term (tbl, byte);
275       tbl->val[byte].out = out;
276     }
277 }
278 
279 /* Try to convert SEQ from WCHAR_T format using CD.
280    Returns a malloc'd struct or NULL.  */
281 static struct charseq *
convert_charseq(iconv_t cd,const struct charseq * seq)282 convert_charseq (iconv_t cd, const struct charseq *seq)
283 {
284   struct charseq *result = NULL;
285 
286   if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
287     {
288       /* There is a chance.  Try the iconv module.  */
289       wchar_t inbuf[1] = { seq->ucs4 };
290       unsigned char outbuf[64];
291       char *inptr = (char *) inbuf;
292       size_t inlen = sizeof (inbuf);
293       char *outptr = (char *) outbuf;
294       size_t outlen = sizeof (outbuf);
295 
296       (void) iconv (cd, &inptr, &inlen, &outptr, &outlen);
297 
298       if (outptr != (char *) outbuf)
299         {
300           /* We got some output.  Good, use it.  */
301           outlen = sizeof (outbuf) - outlen;
302           assert ((char *) outbuf + outlen == outptr);
303 
304           result = xmalloc (sizeof (struct charseq) + outlen);
305           result->name = seq->name;
306           result->ucs4 = seq->ucs4;
307           result->nbytes = outlen;
308           memcpy (result->bytes, outbuf, outlen);
309         }
310 
311       /* Clear any possible state left behind.  */
312       (void) iconv (cd, NULL, NULL, NULL, NULL);
313     }
314 
315   return result;
316 }
317 
318 
319 static struct convtable *
use_from_charmap(struct charmap_t * from_charmap,const char * to_code)320 use_from_charmap (struct charmap_t *from_charmap, const char *to_code)
321 {
322   /* We iterate over all entries in the from_charmap and for those which
323      have a known UCS4 representation we use an iconv() call to determine
324      the mapping to the to_code charset.  */
325   struct convtable *rettbl;
326   iconv_t cd;
327   void *ptr = NULL;
328   const void *key;
329   size_t keylen;
330   void *data;
331 
332   cd = iconv_open (to_code, "WCHAR_T");
333   if (cd == (iconv_t) -1)
334     /* We cannot do anything.  */
335     return NULL;
336 
337   rettbl = allocate_table ();
338 
339   while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data)
340 	 >= 0)
341     {
342       struct charseq *in = data;
343       struct charseq *newp = convert_charseq (cd, in);
344       if (newp != NULL)
345         add_bytes (rettbl, in, newp);
346     }
347 
348   iconv_close (cd);
349 
350   return rettbl;
351 }
352 
353 
354 static struct convtable *
use_to_charmap(const char * from_code,struct charmap_t * to_charmap)355 use_to_charmap (const char *from_code, struct charmap_t *to_charmap)
356 {
357   /* We iterate over all entries in the to_charmap and for those which
358      have a known UCS4 representation we use an iconv() call to determine
359      the mapping to the from_code charset.  */
360   struct convtable *rettbl;
361   iconv_t cd;
362   void *ptr = NULL;
363   const void *key;
364   size_t keylen;
365   void *data;
366 
367   /* Note that the conversion we use here is the reverse direction.  Without
368      exhaustive search we cannot figure out which input yields the UCS4
369      character we are looking for.  Therefore we determine it the other
370      way round.  */
371   cd = iconv_open (from_code, "WCHAR_T");
372   if (cd == (iconv_t) -1)
373     /* We cannot do anything.  */
374     return NULL;
375 
376   rettbl = allocate_table ();
377 
378   while (iterate_table (&to_charmap->char_table, &ptr, &key, &keylen, &data)
379 	 >= 0)
380     {
381       struct charseq *out = data;
382       struct charseq *newp = convert_charseq (cd, out);
383       if (newp != NULL)
384         {
385           add_bytes (rettbl, newp, out);
386           free (newp);
387         }
388     }
389 
390   iconv_close (cd);
391 
392   return rettbl;
393 }
394 
395 
396 static struct convtable *
use_both_charmaps(struct charmap_t * from_charmap,struct charmap_t * to_charmap)397 use_both_charmaps (struct charmap_t *from_charmap,
398 		   struct charmap_t *to_charmap)
399 {
400   /* In this case we iterate over all the entries in the from_charmap,
401      determine the internal name, and find an appropriate entry in the
402      to_charmap (if it exists).  */
403   struct convtable *rettbl = allocate_table ();
404   void *ptr = NULL;
405   const void *key;
406   size_t keylen;
407   void *data;
408 
409   while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data)
410 	 >= 0)
411     {
412       struct charseq *in = (struct charseq *) data;
413       struct charseq *out = charmap_find_value (to_charmap, key, keylen);
414 
415       if (out != NULL)
416 	add_bytes (rettbl, in, out);
417     }
418 
419   return rettbl;
420 }
421 
422 
423 static int
process_block(struct convtable * tbl,char * addr,size_t len,FILE * output)424 process_block (struct convtable *tbl, char *addr, size_t len, FILE *output)
425 {
426   size_t n = 0;
427 
428   while (n < len)
429     {
430       struct convtable *cur = tbl;
431       unsigned char *curp = (unsigned char *) addr;
432       unsigned int byte = *curp;
433       int cnt;
434       struct charseq *out;
435 
436       while (! is_term (cur, byte))
437 	if (cur->val[byte].sub == NULL)
438 	  {
439 	    /* This is an invalid sequence.  Skip the first byte if we are
440 	       ignoring errors.  Otherwise punt.  */
441 	    if (! omit_invalid)
442 	      {
443 		error (0, 0, _("illegal input sequence at position %Zd"), n);
444 		return -1;
445 	      }
446 
447 	    n -= curp - (unsigned char *) addr;
448 
449 	    byte = *(curp = (unsigned char *) ++addr);
450 	    if (++n >= len)
451 	      /* All converted.  */
452 	      return 0;
453 
454 	    cur = tbl;
455 	  }
456 	else
457 	  {
458 	    cur = cur->val[byte].sub;
459 
460 	    if (++n >= len)
461 	      {
462 		error (0, 0, _("\
463 incomplete character or shift sequence at end of buffer"));
464 		return -1;
465 	      }
466 
467 	    byte = *++curp;
468 	  }
469 
470       /* We found a final byte.  Write the output bytes.  */
471       out = cur->val[byte].out;
472       for (cnt = 0; cnt < out->nbytes; ++cnt)
473 	fputc_unlocked (out->bytes[cnt], output);
474 
475       addr = (char *) curp + 1;
476       ++n;
477     }
478 
479   return 0;
480 }
481 
482 
/* Read everything available from FD and convert it using TBL, writing
   the result to OUTPUT.  Returns the result of process_block, or -1
   if reading fails or the buffer cannot be enlarged.

   The conversion must not be handed an incomplete character or shift
   sequence at the end of a buffer.  Since arbitrary encodings have to
   be handled, the whole input is collected in one buffer and processed
   in a single step.  The buffer is static and therefore kept, with its
   size, for later calls.  */
static int
process_fd (struct convtable *tbl, int fd, FILE *output)
{
  static char *inbuf = NULL;
  static size_t maxlen = 0;
  /* Number of bytes the buffer grows by whenever it is full.  */
  enum { grow_step = 32768 };
  size_t actlen = 0;

  for (;;)
    {
      if (actlen == maxlen)
        {
          /* Increase the buffer.  The result is stored in a temporary
             so that the old buffer is not lost if realloc fails.  */
          char *bigger = (char *) realloc (inbuf, maxlen + grow_step);
          if (bigger == NULL)
            {
              error (0, errno, _("unable to allocate buffer for input"));
              return -1;
            }
          inbuf = bigger;
          maxlen += grow_step;
        }

      ssize_t got = read (fd, inbuf + actlen, maxlen - actlen);

      if (got == 0)
        /* No more text to read.  */
        break;

      if (got == -1)
        {
          /* Error while reading.  */
          error (0, errno, _("error while reading the input"));
          return -1;
        }

      actlen += got;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (tbl, inbuf, actlen, output);
}
560 
561 
/* Convert the contents of the stream INPUT using TBL and write the
   result to OUTPUT.  The descriptor underlying INPUT is read directly,
   bypassing the stream buffer.  */
static int
process_file (struct convtable *tbl, FILE *input, FILE *output)
{
  /* This should be safe since we use this function only for `stdin' and
     we haven't read anything so far.  */
  int fd = fileno (input);

  return process_fd (tbl, fd, output);
}
569