1 /* Convert text in given files from the specified from-set to the to-set.
2    Copyright (C) 1998-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published
7    by the Free Software Foundation; version 2 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
17 
18 #include <argp.h>
19 #include <assert.h>
20 #include <ctype.h>
21 #include <errno.h>
22 #include <error.h>
23 #include <fcntl.h>
24 #include <iconv.h>
25 #include <langinfo.h>
26 #include <locale.h>
27 #include <search.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <unistd.h>
33 #include <libintl.h>
34 #ifdef _POSIX_MAPPED_FILES
35 # include <sys/mman.h>
36 #endif
37 #include <charmap.h>
38 #include <gconv_int.h>
39 #include "iconv_prog.h"
40 #include "iconvconfig.h"
41 #include "gconv_charset.h"
42 
43 /* Get libc version number.  */
44 #include "../version.h"
45 
46 #define PACKAGE _libc_intl_domainname
47 
48 
49 /* Name and version of program.  */
50 static void print_version (FILE *stream, struct argp_state *state);
51 void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;
52 
53 #define OPT_VERBOSE	1000
54 #define OPT_LIST	'l'
55 
56 /* Definitions of arguments for argp functions.  */
57 static const struct argp_option options[] =
58 {
59   { NULL, 0, NULL, 0, N_("Input/Output format specification:") },
60   { "from-code", 'f', N_("NAME"), 0, N_("encoding of original text") },
61   { "to-code", 't', N_("NAME"), 0, N_("encoding for output") },
62   { NULL, 0, NULL, 0, N_("Information:") },
63   { "list", 'l', NULL, 0, N_("list all known coded character sets") },
64   { NULL, 0, NULL, 0, N_("Output control:") },
65   { NULL, 'c', NULL, 0, N_("omit invalid characters from output") },
66   { "output", 'o', N_("FILE"), 0, N_("output file") },
67   { "silent", 's', NULL, 0, N_("suppress warnings") },
68   { "verbose", OPT_VERBOSE, NULL, 0, N_("print progress information") },
69   { NULL, 0, NULL, 0, NULL }
70 };
71 
72 /* Short description of program.  */
73 static const char doc[] = N_("\
74 Convert encoding of given files from one encoding to another.");
75 
76 /* Strings for arguments in help texts.  */
77 static const char args_doc[] = N_("[FILE...]");
78 
79 /* Prototype for option handler.  */
80 static error_t parse_opt (int key, char *arg, struct argp_state *state);
81 
82 /* Function to print some extra text in the help message.  */
83 static char *more_help (int key, const char *text, void *input);
84 
85 /* Data structure to communicate with argp functions.  */
86 static struct argp argp =
87 {
88   options, parse_opt, args_doc, doc, NULL, more_help
89 };
90 
91 /* Code sets to convert from and to respectively.  An empty string as the
92    default causes the 'iconv_open' function to look up the charset of the
93    currently selected locale and use it.  */
94 static const char *from_code = "";
95 static const char *to_code = "";
96 
97 /* File to write output to.  If NULL write to stdout.  */
98 static const char *output_file;
99 
100 /* Nonzero if list of all coded character sets is wanted.  */
101 static int list;
102 
/* If nonzero, omit invalid characters from the output.  */
104 int omit_invalid;
105 
106 /* Prototypes for the functions doing the actual work.  */
107 static int process_block (iconv_t cd, char *addr, size_t len, FILE **output,
108 			  const char *output_file);
109 static int process_fd (iconv_t cd, int fd, FILE **output,
110 		       const char *output_file);
111 static int process_file (iconv_t cd, FILE *input, FILE **output,
112 			 const char *output_file);
113 static void print_known_names (void);
114 
115 
116 int
main(int argc,char * argv[])117 main (int argc, char *argv[])
118 {
119   int status = EXIT_SUCCESS;
120   int remaining;
121   __gconv_t cd;
122   struct charmap_t *from_charmap = NULL;
123   struct charmap_t *to_charmap = NULL;
124 
125   /* Set locale via LC_ALL.  */
126   setlocale (LC_ALL, "");
127 
128   /* Set the text message domain.  */
129   textdomain (_libc_intl_domainname);
130 
131   /* Parse and process arguments.  */
132   argp_parse (&argp, argc, argv, 0, &remaining, NULL);
133 
134   /* List all coded character sets if wanted.  */
135   if (list)
136     {
137       print_known_names ();
138       exit (EXIT_SUCCESS);
139     }
140 
141   /* POSIX 1003.2b introduces a silly thing: the arguments to -t anf -f
142      can be file names of charmaps.  In this case iconv will have to read
143      those charmaps and use them to do the conversion.  But there are
144      holes in the specification.  There is nothing said that if -f is a
145      charmap filename that -t must be, too.  And vice versa.  There is
146      also no word about the symbolic names used.  What if they don't
147      match?  */
148   if (strchr (from_code, '/') != NULL)
149     /* The from-name might be a charmap file name.  Try reading the
150        file.  */
151     from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0);
152 
153   if (strchr (to_code, '/') != NULL)
154     /* The to-name might be a charmap file name.  Try reading the
155        file.  */
156     to_charmap = charmap_read (to_code, /*0, 1,*/1, 0, 0, 0);
157 
158 
159   /* At this point we have to handle two cases.  The first one is
160      where a charmap is used for the from- or to-charset, or both.  We
161      handle this special since it is very different from the sane way of
162      doing things.  The other case allows converting using the iconv()
163      function.  */
164   if (from_charmap != NULL || to_charmap != NULL)
165     /* Construct the conversion table and do the conversion.  */
166     status = charmap_conversion (from_code, from_charmap, to_code, to_charmap,
167 				 argc, remaining, argv, output_file);
168   else
169     {
170       struct gconv_spec conv_spec;
171       int res;
172 
173       if (__gconv_create_spec (&conv_spec, from_code, to_code) == NULL)
174         {
175           error (EXIT_FAILURE, errno,
176                  _("failed to start conversion processing"));
177           exit (1);
178         }
179 
180       if (omit_invalid)
181         conv_spec.ignore = true;
182 
183       /* Let's see whether we have these coded character sets.  */
184       res = __gconv_open (&conv_spec, &cd, 0);
185 
186       __gconv_destroy_spec (&conv_spec);
187 
188       if (res != __GCONV_OK)
189 	{
190 	  if (errno == EINVAL)
191 	    {
192 	      /* Try to be nice with the user and tell her which of the
193 		 two encoding names is wrong.  This is possible because
194 		 all supported encodings can be converted from/to Unicode,
195 		 in other words, because the graph of encodings is
196 		 connected.  */
197 	      bool from_wrong =
198 		(iconv_open ("UTF-8", from_code) == (iconv_t) -1
199 		 && errno == EINVAL);
200 	      bool to_wrong =
201 		(iconv_open (to_code, "UTF-8") == (iconv_t) -1
202 		 && errno == EINVAL);
203 	      const char *from_pretty =
204 		(from_code[0] ? from_code : nl_langinfo (CODESET));
205 	      const char *to_pretty =
206 		(to_code[0] ? to_code : nl_langinfo (CODESET));
207 
208 	      if (from_wrong)
209 		{
210 		  if (to_wrong)
211 		    error (0, 0,
212 			   _("\
213 conversions from `%s' and to `%s' are not supported"),
214 			   from_pretty, to_pretty);
215 		  else
216 		    error (0, 0,
217 			   _("conversion from `%s' is not supported"),
218 			   from_pretty);
219 		}
220 	      else
221 		{
222 		  if (to_wrong)
223 		    error (0, 0,
224 			   _("conversion to `%s' is not supported"),
225 			   to_pretty);
226 		  else
227 		    error (0, 0,
228 			   _("conversion from `%s' to `%s' is not supported"),
229 			   from_pretty, to_pretty);
230 		}
231 
232 	      argp_help (&argp, stderr, ARGP_HELP_SEE,
233 			 program_invocation_short_name);
234 	      exit (1);
235 	    }
236 	  else
237 	    error (EXIT_FAILURE, errno,
238 		   _("failed to start conversion processing"));
239 	}
240 
241       /* The output file.  Will be opened when we are ready to produce
242 	 output.  */
243       FILE *output = NULL;
244 
245       /* Now process the remaining files.  Write them to stdout or the file
246 	 specified with the `-o' parameter.  If we have no file given as
247 	 the parameter process all from stdin.  */
248       if (remaining == argc)
249 	{
250 	  if (process_file (cd, stdin, &output, output_file) != 0)
251 	    status = EXIT_FAILURE;
252 	}
253       else
254 	do
255 	  {
256 #ifdef _POSIX_MAPPED_FILES
257 	    struct stat64 st;
258 	    char *addr;
259 #endif
260 	    int fd, ret;
261 
262 	    if (verbose)
263 	      fprintf (stderr, "%s:\n", argv[remaining]);
264 	    if (strcmp (argv[remaining], "-") == 0)
265 	      fd = 0;
266 	    else
267 	      {
268 		fd = open (argv[remaining], O_RDONLY);
269 
270 		if (fd == -1)
271 		  {
272 		    error (0, errno, _("cannot open input file `%s'"),
273 			   argv[remaining]);
274 		    status = EXIT_FAILURE;
275 		    continue;
276 		  }
277 	      }
278 
279 #ifdef _POSIX_MAPPED_FILES
280 	    /* We have possibilities for reading the input file.  First try
281 	       to mmap() it since this will provide the fastest solution.  */
282 	    if (fstat64 (fd, &st) == 0
283 		&& ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
284 				  fd, 0)) != MAP_FAILED))
285 	      {
286 		/* Yes, we can use mmap().  The descriptor is not needed
287 		   anymore.  */
288 		if (close (fd) != 0)
289 		  error (EXIT_FAILURE, errno,
290 			 _("error while closing input `%s'"),
291 			 argv[remaining]);
292 
293 		ret = process_block (cd, addr, st.st_size, &output,
294 				     output_file);
295 
296 		/* We don't need the input data anymore.  */
297 		munmap ((void *) addr, st.st_size);
298 
299 		if (ret != 0)
300 		  {
301 		    status = EXIT_FAILURE;
302 
303 		    if (ret < 0)
304 		      /* We cannot go on with producing output since it might
305 			 lead to problem because the last output might leave
306 			 the output stream in an undefined state.  */
307 		      break;
308 		  }
309 	      }
310 	    else
311 #endif	/* _POSIX_MAPPED_FILES */
312 	      {
313 		/* Read the file in pieces.  */
314 		ret = process_fd (cd, fd, &output, output_file);
315 
316 		/* Now close the file.  */
317 		close (fd);
318 
319 		if (ret != 0)
320 		  {
321 		    /* Something went wrong.  */
322 		    status = EXIT_FAILURE;
323 
324 		    if (ret < 0)
325 		      /* We cannot go on with producing output since it might
326 			 lead to problem because the last output might leave
327 			 the output stream in an undefined state.  */
328 		      break;
329 		  }
330 	      }
331 	  }
332 	while (++remaining < argc);
333 
334       /* Close the output file now.  */
335       if (output != NULL && fclose (output))
336 	error (EXIT_FAILURE, errno, _("error while closing output file"));
337     }
338 
339   return status;
340 }
341 
342 
343 /* Handle program arguments.  */
344 static error_t
parse_opt(int key,char * arg,struct argp_state * state)345 parse_opt (int key, char *arg, struct argp_state *state)
346 {
347   switch (key)
348     {
349     case 'f':
350       from_code = arg;
351       break;
352     case 't':
353       to_code = arg;
354       break;
355     case 'o':
356       output_file = arg;
357       break;
358     case 's':
359       /* Nothing, for now at least.  We are not giving out any information
360 	 about missing character or so.  */
361       break;
362     case 'c':
363       /* Omit invalid characters from output.  */
364       omit_invalid = 1;
365       break;
366     case OPT_VERBOSE:
367       verbose = 1;
368       break;
369     case OPT_LIST:
370       list = 1;
371       break;
372     default:
373       return ARGP_ERR_UNKNOWN;
374     }
375   return 0;
376 }
377 
378 
379 static char *
more_help(int key,const char * text,void * input)380 more_help (int key, const char *text, void *input)
381 {
382   char *tp = NULL;
383   switch (key)
384     {
385     case ARGP_KEY_HELP_EXTRA:
386       /* We print some extra information.  */
387       if (asprintf (&tp, gettext ("\
388 For bug reporting instructions, please see:\n\
389 %s.\n"), REPORT_BUGS_TO) < 0)
390 	return NULL;
391       return tp;
392     default:
393       break;
394     }
395   return (char *) text;
396 }
397 
398 
399 /* Print the version information.  */
400 static void
print_version(FILE * stream,struct argp_state * state)401 print_version (FILE *stream, struct argp_state *state)
402 {
403   fprintf (stream, "iconv %s%s\n", PKGVERSION, VERSION);
404   fprintf (stream, gettext ("\
405 Copyright (C) %s Free Software Foundation, Inc.\n\
406 This is free software; see the source for copying conditions.  There is NO\n\
407 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
408 "), "2022");
409   fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
410 }
411 
412 
/* Write the bytes between OUTBUF and OUTPTR to *OUTPUT.  If *OUTPUT is
   still NULL the stream is opened first: OUTPUT_FILE when it is given
   and is not "-", stdout otherwise.  Failure to open the file
   terminates the program.  Returns 0 on success, with errno restored
   to its value on entry, and -1 after a write error has been
   reported.  */
static int
write_output (const char *outbuf, const char *outptr, FILE **output,
              const char *output_file)
{
  int saved_errno = errno;
  size_t count = (size_t) (outptr - outbuf);

  if (*output == NULL)
    {
      /* First output: determine where it goes.  */
      if (output_file == NULL || strcmp (output_file, "-") == 0)
        *output = stdout;
      else
        {
          *output = fopen (output_file, "w");
          if (*output == NULL)
            error (EXIT_FAILURE, errno, _("cannot open output file"));
        }
    }

  size_t written = fwrite (outbuf, 1, count, *output);
  if (written < count || ferror (*output))
    {
      /* Error occurred while printing the result.  */
      error (0, 0,
             _("conversion stopped due to problem in writing the output"));
      return -1;
    }

  errno = saved_errno;
  return 0;
}
446 
447 
448 static int
process_block(iconv_t cd,char * addr,size_t len,FILE ** output,const char * output_file)449 process_block (iconv_t cd, char *addr, size_t len, FILE **output,
450 	       const char *output_file)
451 {
452 #define OUTBUF_SIZE	32768
453   const char *start = addr;
454   char outbuf[OUTBUF_SIZE];
455   char *outptr;
456   size_t outlen;
457   size_t n;
458   int ret = 0;
459 
460   while (len > 0)
461     {
462       outptr = outbuf;
463       outlen = OUTBUF_SIZE;
464       n = iconv (cd, &addr, &len, &outptr, &outlen);
465 
466       if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
467 	{
468 	  ret = 1;
469 	  if (len == 0)
470 	    n = 0;
471 	  else
472 	    errno = E2BIG;
473 	}
474 
475       if (outptr != outbuf)
476 	{
477 	  ret = write_output (outbuf, outptr, output, output_file);
478 	  if (ret != 0)
479 	    break;
480 	}
481 
482       if (n != (size_t) -1)
483 	{
484 	  /* All the input test is processed.  For state-dependent
485 	     character sets we have to flush the state now.  */
486 	  outptr = outbuf;
487 	  outlen = OUTBUF_SIZE;
488 	  n = iconv (cd, NULL, NULL, &outptr, &outlen);
489 
490 	  if (outptr != outbuf)
491 	    {
492 	      ret = write_output (outbuf, outptr, output, output_file);
493 	      if (ret != 0)
494 		break;
495 	    }
496 
497 	  if (n != (size_t) -1)
498 	    break;
499 
500 	  if (omit_invalid && errno == EILSEQ)
501 	    {
502 	      ret = 1;
503 	      break;
504 	    }
505 	}
506 
507       if (errno != E2BIG)
508 	{
509 	  /* iconv() ran into a problem.  */
510 	  switch (errno)
511 	    {
512 	    case EILSEQ:
513 	      if (! omit_invalid)
514 		error (0, 0, _("illegal input sequence at position %ld"),
515 		       (long int) (addr - start));
516 	      break;
517 	    case EINVAL:
518 	      error (0, 0, _("\
519 incomplete character or shift sequence at end of buffer"));
520 	      break;
521 	    case EBADF:
522 	      error (0, 0, _("internal error (illegal descriptor)"));
523 	      break;
524 	    default:
525 	      error (0, 0, _("unknown iconv() error %d"), errno);
526 	      break;
527 	    }
528 
529 	  return -1;
530 	}
531     }
532 
533   return ret;
534 }
535 
536 
/* Read everything available from the descriptor FD and convert it with
   process_block, whose result is returned.  Returns -1 after reporting
   a read error or a failure to allocate the input buffer.

   We have a problem with reading from a descriptor since we must not
   provide the iconv() function an incomplete character or shift
   sequence at the end of the buffer.  Since we have to deal with
   arbitrary encodings we must read the whole text in a buffer and
   process it in one step.

   The buffer is static and kept between calls, so a later call starts
   with the capacity an earlier one reached.  Every read therefore
   targets INBUF + ACTLEN; reading must never start from a null pointer
   while MAXLEN already reports available space.  */
static int
process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
{
  /* Number of bytes by which the input buffer grows.  */
  enum { INBUF_STEP = 32768 };

  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  for (;;)
    {
      if (actlen == maxlen)
        {
          /* The buffer is full (or not allocated yet).  Increase it
             without losing the old block if that fails.  */
          char *new_inbuf = realloc (inbuf, maxlen + INBUF_STEP);
          if (new_inbuf == NULL)
            {
              error (0, errno, _("unable to allocate buffer for input"));
              return -1;
            }
          inbuf = new_inbuf;
          maxlen += INBUF_STEP;
        }

      ssize_t n = read (fd, inbuf + actlen, maxlen - actlen);

      if (n == 0)
        /* No more text to read.  */
        break;

      if (n == -1)
        {
          /* Error while reading.  */
          error (0, errno, _("error while reading the input"));
          return -1;
        }

      actlen += n;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (cd, inbuf, actlen, output, output_file);
}
614 
615 
/* Convert the contents of the stream INPUT by handing its descriptor
   to process_fd; the result of that call is returned.  */
static int
process_file (iconv_t cd, FILE *input, FILE **output, const char *output_file)
{
  /* Going through the descriptor should be safe since this function
     is only used for `stdin' and nothing has been read from it so
     far.  */
  int fd = fileno (input);

  return process_fd (cd, fd, output, output_file);
}
623 
624 
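/* State used while collecting and printing all known character
   sets/encodings: the tree of collected names, the current output
   column, and whether a name has been printed already.  */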
626 static void *printlist;
627 static size_t column;
628 static int not_first;
629 
630 static void
insert_print_list(const void * nodep,VISIT value,int level)631 insert_print_list (const void *nodep, VISIT value, int level)
632 {
633   if (value == leaf || value == postorder)
634     {
635       const struct gconv_alias *s = *(const struct gconv_alias **) nodep;
636       tsearch (s->fromname, &printlist, (__compar_fn_t) strverscmp);
637     }
638 }
639 
640 static void
do_print_human(const void * nodep,VISIT value,int level)641 do_print_human  (const void *nodep, VISIT value, int level)
642 {
643   if (value == leaf || value == postorder)
644     {
645       const char *s = *(const char **) nodep;
646       size_t len = strlen (s);
647       size_t cnt;
648 
649       while (len > 0 && s[len - 1] == '/')
650 	--len;
651 
652       for (cnt = 0; cnt < len; ++cnt)
653 	if (isalnum (s[cnt]))
654 	  break;
655       if (cnt == len)
656 	return;
657 
658       if (not_first)
659 	{
660 	  putchar (',');
661 	  ++column;
662 
663 	  if (column > 2 && column + len > 77)
664 	    {
665 	      fputs ("\n  ", stdout);
666 	      column = 2;
667 	    }
668 	  else
669 	    {
670 	      putchar (' ');
671 	      ++column;
672 	    }
673 	}
674       else
675 	not_first = 1;
676 
677       fwrite (s, len, 1, stdout);
678       column += len;
679     }
680 }
681 
/* twalk callback printing the visited name on a line of its own.  Only
   the `leaf' and `postorder' visits produce output, so each name is
   printed once.  LEVEL is not used.  */
static void
do_print (const void *nodep, VISIT value, int level)
{
  if (value != leaf && value != postorder)
    return;

  puts (*(const char **) nodep);
}
692 
693 static void
add_known_names(struct gconv_module * node)694 add_known_names (struct gconv_module *node)
695 {
696   if (node->left != NULL)
697     add_known_names (node->left);
698   if (node->right != NULL)
699     add_known_names (node->right);
700   do
701     {
702       if (strcmp (node->from_string, "INTERNAL") != 0)
703 	tsearch (node->from_string, &printlist, (__compar_fn_t) strverscmp);
704       if (strcmp (node->to_string, "INTERNAL") != 0)
705 	tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp);
706 
707       node = node->same;
708     }
709   while (node != NULL);
710 }
711 
712 
713 static void
insert_cache(void)714 insert_cache (void)
715 {
716   const struct gconvcache_header *header;
717   const char *strtab;
718   const struct hash_entry *hashtab;
719   size_t cnt;
720 
721   header = (const struct gconvcache_header *) __gconv_get_cache ();
722   strtab = (char *) header + header->string_offset;
723   hashtab = (struct hash_entry *) ((char *) header + header->hash_offset);
724 
725   for (cnt = 0; cnt < header->hash_size; ++cnt)
726     if (hashtab[cnt].string_offset != 0)
727       {
728 	const char *str = strtab + hashtab[cnt].string_offset;
729 
730 	if (strcmp (str, "INTERNAL") != 0)
731 	  tsearch (str, &printlist, (__compar_fn_t) strverscmp);
732       }
733 }
734 
735 
736 static void
print_known_names(void)737 print_known_names (void)
738 {
739   iconv_t h;
740   void *cache;
741 
742   /* We must initialize the internal databases first.  */
743   h = iconv_open ("L1", "L1");
744   iconv_close (h);
745 
746   /* See whether we have a cache.  */
747   cache = __gconv_get_cache ();
748   if (cache != NULL)
749     /* Yep, use only this information.  */
750     insert_cache ();
751   else
752     {
753       struct gconv_module *modules;
754 
755       /* No, then use the information read from the gconv-modules file.
756 	 First add the aliases.  */
757       twalk (__gconv_get_alias_db (), insert_print_list);
758 
759       /* Add the from- and to-names from the known modules.  */
760       modules = __gconv_get_modules_db ();
761       if (modules != NULL)
762 	add_known_names (modules);
763     }
764 
765   bool human_readable = isatty (fileno (stdout));
766 
767   if (human_readable)
768     fputs (_("\
769 The following list contains all the coded character sets known.  This does\n\
770 not necessarily mean that all combinations of these names can be used for\n\
771 the FROM and TO command line parameters.  One coded character set can be\n\
772 listed with several different names (aliases).\n\n  "), stdout);
773 
774   /* Now print the collected names.  */
775   column = 2;
776   twalk (printlist, human_readable ? do_print_human : do_print);
777 
778   if (human_readable && column != 0)
779     puts ("");
780 }
781