1 /* Copyright (C) 1996-2022 Free Software Foundation, Inc.
2    This file is part of the GNU C Library.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published
6    by the Free Software Foundation; version 2 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
16 
17 #ifdef HAVE_CONFIG_H
18 # include "config.h"
19 #endif
20 
21 #include <argp.h>
22 #include <assert.h>
23 #include <ctype.h>
24 #include <endian.h>
25 #include <errno.h>
26 #include <error.h>
27 #include <fcntl.h>
28 #include <iconv.h>
29 #include <langinfo.h>
30 #include <locale.h>
31 #include <libintl.h>
32 #include <limits.h>
33 #include <nl_types.h>
34 #include <obstack.h>
35 #include <stdint.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <unistd.h>
40 #include <wchar.h>
41 
42 #include "version.h"
43 
44 #include "catgetsinfo.h"
45 
46 
/* Reverse the byte order of the 32-bit value W.  The argument is
   converted to uint32_t before any shifting so that the result is
   well defined (no signed overflow, no bits above bit 31) whatever
   integer type the caller passes.  W is evaluated several times and
   therefore must not have side effects.  */
#define SWAPU32(w) \
  ((uint32_t) ((((uint32_t) (w)) << 24)					      \
	       | ((((uint32_t) (w)) & 0xff00) << 8)			      \
	       | ((((uint32_t) (w)) >> 8) & 0xff00)			      \
	       | (((uint32_t) (w)) >> 24)))
49 
/* One message of a message set.  The messages of a set form a singly
   linked list which read_input_file keeps sorted by NUMBER.  */
struct message_list
{
  /* Number of the message inside its set.  */
  int number;
  /* Text of the message after escape/quote processing.  */
  const char *message;

  /* File name and line where the message was defined; used for
     diagnostics and for the comments in the generated header.  */
  const char *fname;
  size_t line;
  /* Symbolic name of the message, or NULL if it was given by number.  */
  const char *symbol;

  /* Next message of the same set.  */
  struct message_list *next;
};
61 
62 
/* One message set.  All sets of a catalog are chained through NEXT,
   new sets being put at the head of the list by find_set.  */
struct set_list
{
  /* Set number as stored in the tables.  find_set stores the number
     given by the user plus one so that zero never occurs.  */
  int number;
  /* Set to one when a `$delset' directive names this set.  */
  int deleted;
  /* List of the messages belonging to this set.  */
  struct message_list *messages;
  /* Highest message number used in this set so far; symbolic
     messages are numbered starting from the following value.  */
  int last_message;

  /* File name and line of the `$set' directive and the symbolic
     name used there (NULL if the set was selected by number).  */
  const char *fname;
  size_t line;
  const char *symbol;

  /* Next set of the catalog.  */
  struct set_list *next;
};
76 
77 
/* State accumulated while reading all input files.  */
struct catalog
{
  /* Head of the list of all message sets seen so far.  */
  struct set_list *all_sets;
  /* Set to which newly read messages are added.  */
  struct set_list *current_set;
  /* Counter incremented for every message line read; write_out uses
     it to pick the initial hash table size.  */
  size_t total_messages;
  /* Quote character selected with `$quote'; zero means none.  */
  wint_t quote_char;
  /* Highest set number used so far; symbolic sets are numbered
     starting from the following value.  */
  int last_set;

  /* Obstack holding the text of the input lines that are kept.  */
  struct obstack mem_pool;
};
88 
89 
/* If non-zero force creation of new file, not using existing one.
   Set by the --new option; when it is zero write_out calls read_old
   on the output file before writing.  */
static int force_new;

/* Name of output file.  Set by the -o option or, failing that, by
   main from the first non-option argument ("-" if there is none).  */
static const char *output_name;

/* Name of generated C header file, or NULL if -H was not given.  */
static const char *header_name;

/* Function printing name and version of the program, installed as
   the version hook of argp.  */
static void print_version (FILE *stream, struct argp_state *state);
void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;

/* Key used for the --new option in the option table below.  */
#define OPT_NEW 1

/* Definitions of arguments for argp functions.  */
static const struct argp_option options[] =
{
  { "header", 'H', N_("NAME"), 0,
    N_("Create C header file NAME containing symbol definitions") },
  { "new", OPT_NEW, NULL, 0,
    N_("Do not use existing catalog, force new output file") },
  { "output", 'o', N_("NAME"), 0, N_("Write output to file NAME") },
  { NULL, 0, NULL, 0, NULL }
};

/* Short description of program.  */
static const char doc[] = N_("Generate message catalog.\
\vIf INPUT-FILE is -, input is read from standard input.  If OUTPUT-FILE\n\
is -, output is written to standard output.\n");

/* Strings for arguments in help texts.  */
static const char args_doc[] = N_("\
-o OUTPUT-FILE [INPUT-FILE]...\n[OUTPUT-FILE [INPUT-FILE]...]");

/* Prototype for option handler.  */
static error_t parse_opt (int key, char *arg, struct argp_state *state);

/* Function to print some extra text in the help message.  */
static char *more_help (int key, const char *text, void *input);

/* Data structure to communicate with argp functions.  */
static struct argp argp =
{
  options, parse_opt, args_doc, doc, NULL, more_help
};


/* Wrapper functions with error checking for standard functions.
   Provides the xmalloc, xcalloc and xrealloc used below.  */
#include <programs/xmalloc.h>

/* Prototypes for local functions.  */
static void error_print (void);
static struct catalog *read_input_file (struct catalog *current,
					const char *fname);
static void write_out (struct catalog *result, const char *output_name,
		       const char *header_name);
static struct set_list *find_set (struct catalog *current, int number);
static void normalize_line (const char *fname, size_t line, iconv_t cd,
			    wchar_t *string, wchar_t quote_char,
			    wchar_t escape_char);
static void read_old (struct catalog *catalog, const char *file_name);
static int open_conversion (const char *codesetp, iconv_t *cd_towcp,
			    iconv_t *cd_tombp, wchar_t *escape_charp);
155 
156 int
main(int argc,char * argv[])157 main (int argc, char *argv[])
158 {
159   struct catalog *result;
160   int remaining;
161 
162   /* Set program name for messages.  */
163   error_print_progname = error_print;
164 
165   /* Set locale via LC_ALL.  */
166   setlocale (LC_ALL, "");
167 
168   /* Set the text message domain.  */
169   textdomain (PACKAGE);
170 
171   /* Initialize local variables.  */
172   result = NULL;
173 
174   /* Parse and process arguments.  */
175   argp_parse (&argp, argc, argv, 0, &remaining, NULL);
176 
177   /* Determine output file.  */
178   if (output_name == NULL)
179     output_name = remaining < argc ? argv[remaining++] : "-";
180 
181   /* Process all input files.  */
182   setlocale (LC_CTYPE, "C");
183   if (remaining < argc)
184     do
185       result = read_input_file (result, argv[remaining]);
186     while (++remaining < argc);
187   else
188     result = read_input_file (NULL, "-");
189 
190   /* Write out the result.  */
191   if (result != NULL)
192     write_out (result, output_name, header_name);
193 
194   return error_message_count != 0;
195 }
196 
197 
198 /* Handle program arguments.  */
199 static error_t
parse_opt(int key,char * arg,struct argp_state * state)200 parse_opt (int key, char *arg, struct argp_state *state)
201 {
202   switch (key)
203     {
204     case 'H':
205       header_name = arg;
206       break;
207     case OPT_NEW:
208       force_new = 1;
209       break;
210     case 'o':
211       output_name = arg;
212       break;
213     default:
214       return ARGP_ERR_UNKNOWN;
215     }
216   return 0;
217 }
218 
219 
220 static char *
more_help(int key,const char * text,void * input)221 more_help (int key, const char *text, void *input)
222 {
223   char *tp = NULL;
224   switch (key)
225     {
226     case ARGP_KEY_HELP_EXTRA:
227       /* We print some extra information.  */
228       if (asprintf (&tp, gettext ("\
229 For bug reporting instructions, please see:\n\
230 %s.\n"), REPORT_BUGS_TO) < 0)
231 	return NULL;
232       return tp;
233     default:
234       break;
235     }
236   return (char *) text;
237 }
238 
239 /* Print the version information.  */
240 static void
print_version(FILE * stream,struct argp_state * state)241 print_version (FILE *stream, struct argp_state *state)
242 {
243   fprintf (stream, "gencat %s%s\n", PKGVERSION, VERSION);
244   fprintf (stream, gettext ("\
245 Copyright (C) %s Free Software Foundation, Inc.\n\
246 This is free software; see the source for copying conditions.  There is NO\n\
247 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
248 "), "2022");
249   fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
250 }
251 
252 
/* The address of this function will be assigned to the hook in the
   error functions.  main stores it in error_print_progname; the body
   is intentionally empty.  */
static void
error_print (void)
{
  /* We don't want the program name to be printed in messages.  Emacs'
     compile.el does not like this.  */
}
261 
262 
263 static struct catalog *
read_input_file(struct catalog * current,const char * fname)264 read_input_file (struct catalog *current, const char *fname)
265 {
266   FILE *fp;
267   char *buf;
268   size_t len;
269   size_t line_number;
270   wchar_t *wbuf;
271   size_t wbufsize;
272   iconv_t cd_towc = (iconv_t) -1;
273   iconv_t cd_tomb = (iconv_t) -1;
274   wchar_t escape_char = L'\\';
275   char *codeset = NULL;
276 
277   if (strcmp (fname, "-") == 0 || strcmp (fname, "/dev/stdin") == 0)
278     {
279       fp = stdin;
280       fname = gettext ("*standard input*");
281     }
282   else
283     fp = fopen (fname, "r");
284   if (fp == NULL)
285     {
286       error (0, errno, gettext ("cannot open input file `%s'"), fname);
287       return current;
288     }
289 
290   /* If we haven't seen anything yet, allocate result structure.  */
291   if (current == NULL)
292     {
293       current = (struct catalog *) xcalloc (1, sizeof (*current));
294 
295 #define obstack_chunk_alloc malloc
296 #define obstack_chunk_free free
297       obstack_init (&current->mem_pool);
298 
299       current->current_set = find_set (current, NL_SETD);
300     }
301 
302   buf = NULL;
303   len = 0;
304   line_number = 0;
305 
306   wbufsize = 1024;
307   wbuf = (wchar_t *) xmalloc (wbufsize);
308 
309   while (!feof (fp))
310     {
311       int continued;
312       int used;
313       size_t start_line = line_number + 1;
314       char *this_line;
315 
316       do
317 	{
318 	  int act_len;
319 
320 	  act_len = getline (&buf, &len, fp);
321 	  if (act_len <= 0)
322 	    break;
323 	  ++line_number;
324 
325 	  /* It the line continued?  */
326 	  continued = 0;
327 	  if (buf[act_len - 1] == '\n')
328 	    {
329 	      --act_len;
330 
331 	      /* There might be more than one backslash at the end of
332 		 the line.  Only if there is an odd number of them is
333 		 the line continued.  */
334 	      if (act_len > 0 && buf[act_len - 1] == '\\')
335 		{
336 		  int temp_act_len = act_len;
337 
338 		  do
339 		    {
340 		      --temp_act_len;
341 		      continued = !continued;
342 		    }
343 		  while (temp_act_len > 0 && buf[temp_act_len - 1] == '\\');
344 
345 		  if (continued)
346 		    --act_len;
347 		}
348 	    }
349 
350 	  /* Append to currently selected line.  */
351 	  obstack_grow (&current->mem_pool, buf, act_len);
352 	}
353       while (continued);
354 
355       obstack_1grow (&current->mem_pool, '\0');
356       this_line = (char *) obstack_finish (&current->mem_pool);
357 
358       used = 0;
359       if (this_line[0] == '$')
360 	{
361 	  if (isblank (this_line[1]))
362 	    {
363 	      int cnt = 1;
364 	      while (isblank (this_line[cnt]))
365 		++cnt;
366 	      if (strncmp (&this_line[cnt], "codeset=", 8) != 0)
367 		/* This is a comment line. Do nothing.  */;
368 	      else if (codeset != NULL)
369 		/* Ignore multiple codeset. */;
370 	      else
371 		{
372 		  int start = cnt + 8;
373 		  cnt = start;
374 		  while (this_line[cnt] != '\0' && !isspace (this_line[cnt]))
375 		    ++cnt;
376 		  if (cnt != start)
377 		    {
378 		      int len = cnt - start;
379 		      codeset = xmalloc (len + 1);
380 		      *((char *) mempcpy (codeset, &this_line[start], len))
381 			= '\0';
382 		    }
383 		}
384 	    }
385 	  else if (strncmp (&this_line[1], "set", 3) == 0)
386 	    {
387 	      int cnt = sizeof ("set");
388 	      int set_number;
389 	      const char *symbol = NULL;
390 	      while (isspace (this_line[cnt]))
391 		++cnt;
392 
393 	      if (isdigit (this_line[cnt]))
394 		{
395 		  set_number = atol (&this_line[cnt]);
396 
397 		  /* If the given number for the character set is
398 		     higher than any we used for symbolic set names
399 		     avoid clashing by using only higher numbers for
400 		     the following symbolic definitions.  */
401 		  if (set_number > current->last_set)
402 		    current->last_set = set_number;
403 		}
404 	      else
405 		{
406 		  /* See whether it is a reasonable identifier.  */
407 		  int start = cnt;
408 		  while (isalnum (this_line[cnt]) || this_line[cnt] == '_')
409 		    ++cnt;
410 
411 		  if (cnt == start)
412 		    {
413 		      /* No correct character found.  */
414 		      error_at_line (0, 0, fname, start_line,
415 				     gettext ("illegal set number"));
416 		      set_number = 0;
417 		    }
418 		  else
419 		    {
420 		      /* We have found seomthing that looks like a
421 			 correct identifier.  */
422 		      struct set_list *runp;
423 
424 		      this_line[cnt] = '\0';
425 		      used = 1;
426 		      symbol = &this_line[start];
427 
428 		      /* Test whether the identifier was already used.  */
429 		      runp = current->all_sets;
430 		      while (runp != 0)
431 			if (runp->symbol != NULL
432 			    && strcmp (runp->symbol, symbol) == 0)
433 			  break;
434 			else
435 			  runp = runp->next;
436 
437 		      if (runp != NULL)
438 			{
439 			  /* We cannot allow duplicate identifiers for
440 			     message sets.  */
441 			  error_at_line (0, 0, fname, start_line,
442 					 gettext ("duplicate set definition"));
443 			  error_at_line (0, 0, runp->fname, runp->line,
444 					 gettext ("\
445 this is the first definition"));
446 			  set_number = 0;
447 			}
448 		      else
449 			/* Allocate next free message set for identifier.  */
450 			set_number = ++current->last_set;
451 		    }
452 		}
453 
454 	      if (set_number != 0)
455 		{
456 		  /* We found a legal set number.  */
457 		  current->current_set = find_set (current, set_number);
458 		  if (symbol != NULL)
459 		      used = 1;
460 		  current->current_set->symbol = symbol;
461 		  current->current_set->fname = fname;
462 		  current->current_set->line = start_line;
463 		}
464 	    }
465 	  else if (strncmp (&this_line[1], "delset", 6) == 0)
466 	    {
467 	      int cnt = sizeof ("delset");
468 	      while (isspace (this_line[cnt]))
469 		++cnt;
470 
471 	      if (isdigit (this_line[cnt]))
472 		{
473 		  size_t set_number = atol (&this_line[cnt]);
474 		  struct set_list *set;
475 
476 		  /* Mark the message set with the given number as
477 		     deleted.  */
478 		  set = find_set (current, set_number);
479 		  set->deleted = 1;
480 		}
481 	      else
482 		{
483 		  /* See whether it is a reasonable identifier.  */
484 		  int start = cnt;
485 		  while (isalnum (this_line[cnt]) || this_line[cnt] == '_')
486 		    ++cnt;
487 
488 		  if (cnt == start)
489 		    error_at_line (0, 0, fname, start_line,
490 				   gettext ("illegal set number"));
491 		  else
492 		    {
493 		      const char *symbol;
494 		      struct set_list *runp;
495 
496 		      this_line[cnt] = '\0';
497 		      used = 1;
498 		      symbol = &this_line[start];
499 
500 		      /* We have a symbolic set name.  This name must
501 			 appear somewhere else in the catalogs read so
502 			 far.  */
503 		      for (runp = current->all_sets; runp != NULL;
504 			   runp = runp->next)
505 			{
506 			  if (strcmp (runp->symbol, symbol) == 0)
507 			    {
508 			      runp->deleted = 1;
509 			      break;
510 			    }
511 			}
512 		      if (runp == NULL)
513 			/* Name does not exist before.  */
514 			error_at_line (0, 0, fname, start_line,
515 				       gettext ("unknown set `%s'"), symbol);
516 		    }
517 		}
518 	    }
519 	  else if (strncmp (&this_line[1], "quote", 5) == 0)
520 	    {
521 	      char buf[2];
522 	      char *bufptr;
523 	      size_t buflen;
524 	      char *wbufptr;
525 	      size_t wbuflen;
526 	      int cnt;
527 
528 	      cnt = sizeof ("quote");
529 	      while (isspace (this_line[cnt]))
530 		++cnt;
531 
532 	      /* We need the conversion.  */
533 	      if (cd_towc == (iconv_t) -1
534 		  && open_conversion (codeset, &cd_towc, &cd_tomb,
535 				      &escape_char) != 0)
536 		/* Something is wrong.  */
537 		goto out;
538 
539 	      /* Yes, the quote char can be '\0'; this means no quote
540 		 char.  The function using the information works on
541 		 wide characters so we have to convert it here.  */
542 	      buf[0] = this_line[cnt];
543 	      buf[1] = '\0';
544 	      bufptr = buf;
545 	      buflen = 2;
546 
547 	      wbufptr = (char *) wbuf;
548 	      wbuflen = wbufsize;
549 
550 	      /* Flush the state.  */
551 	      iconv (cd_towc, NULL, NULL, NULL, NULL);
552 
553 	      iconv (cd_towc, &bufptr, &buflen, &wbufptr, &wbuflen);
554 	      if (buflen != 0 || (wchar_t *) wbufptr != &wbuf[2])
555 		error_at_line (0, 0, fname, start_line,
556 			       gettext ("invalid quote character"));
557 	      else
558 		/* Use the converted wide character.  */
559 		current->quote_char = wbuf[0];
560 	    }
561 	  else
562 	    {
563 	      int cnt;
564 	      cnt = 2;
565 	      while (this_line[cnt] != '\0' && !isspace (this_line[cnt]))
566 		++cnt;
567 	      this_line[cnt] = '\0';
568 	      error_at_line (0, 0, fname, start_line,
569 			     gettext ("unknown directive `%s': line ignored"),
570 			     &this_line[1]);
571 	    }
572 	}
573       else if (isalnum (this_line[0]) || this_line[0] == '_')
574 	{
575 	  const char *ident = this_line;
576 	  char *line = this_line;
577 	  int message_number;
578 
579 	  do
580 	    ++line;
581 	  while (line[0] != '\0' && !isspace (line[0]));
582 	  if (line[0] != '\0')
583 	    *line++ = '\0';	/* Terminate the identifier.  */
584 
585 	  /* Now we found the beginning of the message itself.  */
586 
587 	  if (isdigit (ident[0]))
588 	    {
589 	      struct message_list *runp;
590 	      struct message_list *lastp;
591 
592 	      message_number = atoi (ident);
593 
594 	      /* Find location to insert the new message.  */
595 	      runp = current->current_set->messages;
596 	      lastp = NULL;
597 	      while (runp != NULL)
598 		if (runp->number == message_number)
599 		  break;
600 		else
601 		  {
602 		    lastp = runp;
603 		    runp = runp->next;
604 		  }
605 	      if (runp != NULL)
606 		{
607 		  /* Oh, oh.  There is already a message with this
608 		     number in the message set.  */
609 		  if (runp->symbol == NULL)
610 		    {
611 		      /* The existing message had its number specified
612 			 by the user.  Fatal collision type uh, oh.  */
613 		      error_at_line (0, 0, fname, start_line,
614 				     gettext ("duplicated message number"));
615 		      error_at_line (0, 0, runp->fname, runp->line,
616 				     gettext ("this is the first definition"));
617 		      message_number = 0;
618 		    }
619 		  else
620 		    {
621 		      /* Collision was with number auto-assigned to a
622 			 symbolic.  Change existing symbolic number
623 			 and move to end the list (if not already there).  */
624 		      runp->number = ++current->current_set->last_message;
625 
626 		      if (runp->next != NULL)
627 			{
628 			  struct message_list *endp;
629 
630 			  if (lastp == NULL)
631 			    current->current_set->messages=runp->next;
632 			  else
633 			    lastp->next=runp->next;
634 
635 			  endp = runp->next;
636 			  while (endp->next != NULL)
637 			    endp = endp->next;
638 
639 			  endp->next = runp;
640 			  runp->next = NULL;
641 			}
642 		    }
643 		}
644 	      ident = NULL;	/* We don't have a symbol.  */
645 
646 	      if (message_number != 0
647 		  && message_number > current->current_set->last_message)
648 		current->current_set->last_message = message_number;
649 	    }
650 	  else if (ident[0] != '\0')
651 	    {
652 	      struct message_list *runp;
653 
654 	      /* Test whether the symbolic name was not used for
655 		 another message in this message set.  */
656 	      runp = current->current_set->messages;
657 	      while (runp != NULL)
658 		if (runp->symbol != NULL && strcmp (ident, runp->symbol) == 0)
659 		  break;
660 		else
661 		  runp = runp->next;
662 	      if (runp != NULL)
663 		{
664 		  /* The name is already used.  */
665 		  error_at_line (0, 0, fname, start_line, gettext ("\
666 duplicated message identifier"));
667 		  error_at_line (0, 0, runp->fname, runp->line,
668 				 gettext ("this is the first definition"));
669 		  message_number = 0;
670 		}
671 	      else
672 		/* Give the message the next unused number.  */
673 		message_number = ++current->current_set->last_message;
674 	    }
675 	  else
676 	    message_number = 0;
677 
678 	  if (message_number != 0)
679 	    {
680 	      char *inbuf;
681 	      size_t inlen;
682 	      char *outbuf;
683 	      size_t outlen;
684 	      struct message_list *newp;
685 	      size_t line_len = strlen (line) + 1;
686 	      size_t ident_len = 0;
687 
688 	      /* We need the conversion.  */
689 	      if (cd_towc == (iconv_t) -1
690 		  && open_conversion (codeset, &cd_towc, &cd_tomb,
691 				      &escape_char) != 0)
692 		/* Something is wrong.  */
693 		goto out;
694 
695 	      /* Convert to a wide character string.  We have to
696 		 interpret escape sequences which will be impossible
697 		 without doing the conversion if the codeset of the
698 		 message is stateful.  */
699 	      while (1)
700 		{
701 		  inbuf = line;
702 		  inlen = line_len;
703 		  outbuf = (char *) wbuf;
704 		  outlen = wbufsize;
705 
706 		  /* Flush the state.  */
707 		  iconv (cd_towc, NULL, NULL, NULL, NULL);
708 
709 		  iconv (cd_towc, &inbuf, &inlen, &outbuf, &outlen);
710 		  if (inlen == 0)
711 		    {
712 		      /* The string is converted.  */
713 		      assert (outlen < wbufsize);
714 		      assert (wbuf[(wbufsize - outlen) / sizeof (wchar_t) - 1]
715 			      == L'\0');
716 		      break;
717 		    }
718 
719 		  if (outlen != 0)
720 		    {
721 		      /* Something is wrong with this string, we ignore it.  */
722 		      error_at_line (0, 0, fname, start_line, gettext ("\
723 invalid character: message ignored"));
724 		      goto ignore;
725 		    }
726 
727 		  /* The output buffer is too small.  */
728 		  wbufsize *= 2;
729 		  wbuf = (wchar_t *) xrealloc (wbuf, wbufsize);
730 		}
731 
732 	      /* Strip quote characters, change escape sequences into
733 		 correct characters etc.  */
734 	      normalize_line (fname, start_line, cd_towc, wbuf,
735 			      current->quote_char, escape_char);
736 
737 	      if (ident)
738 		ident_len = line - this_line;
739 
740 	      /* Now the string is free of escape sequences.  Convert it
741 		 back into a multibyte character string.  First free the
742 		 memory allocated for the original string.  */
743 	      obstack_free (&current->mem_pool, this_line);
744 
745 	      used = 1;	/* Yes, we use the line.  */
746 
747 	      /* Now fill in the new string.  It should never happen that
748 		 the replaced string is longer than the original.  */
749 	      inbuf = (char *) wbuf;
750 	      inlen = (wcslen (wbuf) + 1) * sizeof (wchar_t);
751 
752 	      outlen = obstack_room (&current->mem_pool);
753 	      obstack_blank (&current->mem_pool, outlen);
754 	      this_line = (char *) obstack_base (&current->mem_pool);
755 	      outbuf = this_line + ident_len;
756 	      outlen -= ident_len;
757 
758 	      /* Flush the state.  */
759 	      iconv (cd_tomb, NULL, NULL, NULL, NULL);
760 
761 	      iconv (cd_tomb, &inbuf, &inlen, &outbuf, &outlen);
762 	      if (inlen != 0)
763 		{
764 		  error_at_line (0, 0, fname, start_line,
765 				 gettext ("invalid line"));
766 		  goto ignore;
767 		}
768 	      assert (outbuf[-1] == '\0');
769 
770 	      /* Free the memory in the obstack we don't use.  */
771 	      obstack_blank (&current->mem_pool, -(int) outlen);
772 	      line = obstack_finish (&current->mem_pool);
773 
774 	      newp = (struct message_list *) xmalloc (sizeof (*newp));
775 	      newp->number = message_number;
776 	      newp->message = line + ident_len;
777 	      /* Remember symbolic name; is NULL if no is given.  */
778 	      newp->symbol = ident ? line : NULL;
779 	      /* Remember where we found the character.  */
780 	      newp->fname = fname;
781 	      newp->line = start_line;
782 
783 	      /* Find place to insert to message.  We keep them in a
784 		 sorted single linked list.  */
785 	      if (current->current_set->messages == NULL
786 		  || current->current_set->messages->number > message_number)
787 		{
788 		  newp->next = current->current_set->messages;
789 		  current->current_set->messages = newp;
790 		}
791 	      else
792 		{
793 		  struct message_list *runp;
794 		  runp = current->current_set->messages;
795 		  while (runp->next != NULL)
796 		    if (runp->next->number > message_number)
797 		      break;
798 		    else
799 		      runp = runp->next;
800 		  newp->next = runp->next;
801 		  runp->next = newp;
802 		}
803 	    }
804 	  ++current->total_messages;
805 	}
806       else
807 	{
808 	  size_t cnt;
809 
810 	  cnt = 0;
811 	  /* See whether we have any non-white space character in this
812 	     line.  */
813 	  while (this_line[cnt] != '\0' && isspace (this_line[cnt]))
814 	    ++cnt;
815 
816 	  if (this_line[cnt] != '\0')
817 	    /* Yes, some unknown characters found.  */
818 	    error_at_line (0, 0, fname, start_line,
819 			   gettext ("malformed line ignored"));
820 	}
821 
822     ignore:
823       /* We can save the memory for the line if it was not used.  */
824       if (!used)
825 	obstack_free (&current->mem_pool, this_line);
826     }
827 
828   /* Close the conversion modules.  */
829   iconv_close (cd_towc);
830   iconv_close (cd_tomb);
831   free (codeset);
832 
833  out:
834   free (wbuf);
835 
836   if (fp != stdin)
837     fclose (fp);
838   return current;
839 }
840 
841 
842 static void
write_out(struct catalog * catalog,const char * output_name,const char * header_name)843 write_out (struct catalog *catalog, const char *output_name,
844 	   const char *header_name)
845 {
846   /* Computing the "optimal" size.  */
847   struct set_list *set_run;
848   size_t best_total, best_size, best_depth;
849   size_t act_size, act_depth;
850   struct catalog_obj obj;
851   struct obstack string_pool;
852   const char *strings;
853   size_t strings_size;
854   uint32_t *array1, *array2;
855   size_t cnt;
856   int fd;
857 
858   /* If not otherwise told try to read file with existing
859      translations.  */
860   if (!force_new)
861     read_old (catalog, output_name);
862 
863   /* Initialize best_size with a very high value.  */
864   best_total = best_size = best_depth = UINT_MAX;
865 
866   /* We need some start size for testing.  Let's start with
867      TOTAL_MESSAGES / 5, which theoretically provides a mean depth of
868      5.  */
869   act_size = 1 + catalog->total_messages / 5;
870 
871   /* We determine the size of a hash table here.  Because the message
872      numbers can be chosen arbitrary by the programmer we cannot use
873      the simple method of accessing the array using the message
874      number.  The algorithm is based on the trivial hash function
875      NUMBER % TABLE_SIZE, where collisions are stored in a second
876      dimension up to TABLE_DEPTH.  We here compute TABLE_SIZE so that
877      the needed space (= TABLE_SIZE * TABLE_DEPTH) is minimal.  */
878   while (act_size <= best_total)
879     {
880       size_t deep[act_size];
881 
882       act_depth = 1;
883       memset (deep, '\0', act_size * sizeof (size_t));
884       set_run = catalog->all_sets;
885       while (set_run != NULL)
886 	{
887 	  struct message_list *message_run;
888 
889 	  message_run = set_run->messages;
890 	  while (message_run != NULL)
891 	    {
892 	      size_t idx = (message_run->number * set_run->number) % act_size;
893 
894 	      ++deep[idx];
895 	      if (deep[idx] > act_depth)
896 		{
897 		  act_depth = deep[idx];
898 		  if (act_depth * act_size > best_total)
899 		    break;
900 		}
901 	      message_run = message_run->next;
902 	    }
903 	  set_run = set_run->next;
904 	}
905 
906       if (act_depth * act_size <= best_total)
907 	{
908 	  /* We have found a better solution.  */
909 	  best_total = act_depth * act_size;
910 	  best_size = act_size;
911 	  best_depth = act_depth;
912 	}
913 
914       ++act_size;
915     }
916 
917   /* let's be prepared for an empty message file.  */
918   if (best_size == UINT_MAX)
919     {
920       best_size = 1;
921       best_depth = 1;
922     }
923 
924   /* OK, now we have the size we will use.  Fill in the header, build
925      the table and the second one with swapped byte order.  */
926   obj.magic = CATGETS_MAGIC;
927   obj.plane_size = best_size;
928   obj.plane_depth = best_depth;
929 
930   /* Allocate room for all needed arrays.  */
931   array1 =
932     (uint32_t *) alloca (best_size * best_depth * sizeof (uint32_t) * 3);
933   memset (array1, '\0', best_size * best_depth * sizeof (uint32_t) * 3);
934   array2
935     = (uint32_t *) alloca (best_size * best_depth * sizeof (uint32_t) * 3);
936   obstack_init (&string_pool);
937 
938   set_run = catalog->all_sets;
939   while (set_run != NULL)
940     {
941       struct message_list *message_run;
942 
943       message_run = set_run->messages;
944       while (message_run != NULL)
945 	{
946 	  size_t idx = (((message_run->number * set_run->number) % best_size)
947 			* 3);
948 	  /* Determine collision depth.  */
949 	  while (array1[idx] != 0)
950 	    idx += best_size * 3;
951 
952 	  /* Store set number, message number and pointer into string
953 	     space, relative to the first string.  */
954 	  array1[idx + 0] = set_run->number;
955 	  array1[idx + 1] = message_run->number;
956 	  array1[idx + 2] = obstack_object_size (&string_pool);
957 
958 	  /* Add current string to the continuous space containing all
959 	     strings.  */
960 	  obstack_grow0 (&string_pool, message_run->message,
961 			 strlen (message_run->message));
962 
963 	  message_run = message_run->next;
964 	}
965 
966       set_run = set_run->next;
967     }
968   strings_size = obstack_object_size (&string_pool);
969   strings = obstack_finish (&string_pool);
970 
971   /* Compute ARRAY2 by changing the byte order.  */
972   for (cnt = 0; cnt < best_size * best_depth * 3; ++cnt)
973     array2[cnt] = SWAPU32 (array1[cnt]);
974 
975   /* Now we can write out the whole data.  */
976   if (strcmp (output_name, "-") == 0
977       || strcmp (output_name, "/dev/stdout") == 0)
978     fd = STDOUT_FILENO;
979   else
980     {
981       fd = creat (output_name, 0666);
982       if (fd < 0)
983 	error (EXIT_FAILURE, errno, gettext ("cannot open output file `%s'"),
984 	       output_name);
985     }
986 
987   /* Write out header.  */
988   write (fd, &obj, sizeof (obj));
989 
990   /* We always write out the little endian version of the index
991      arrays.  */
992 #if __BYTE_ORDER == __LITTLE_ENDIAN
993   write (fd, array1, best_size * best_depth * sizeof (uint32_t) * 3);
994   write (fd, array2, best_size * best_depth * sizeof (uint32_t) * 3);
995 #elif __BYTE_ORDER == __BIG_ENDIAN
996   write (fd, array2, best_size * best_depth * sizeof (uint32_t) * 3);
997   write (fd, array1, best_size * best_depth * sizeof (uint32_t) * 3);
998 #else
999 # error Cannot handle __BYTE_ORDER byte order
1000 #endif
1001 
1002   /* Finally write the strings.  */
1003   write (fd, strings, strings_size);
1004 
1005   if (fd != STDOUT_FILENO)
1006     close (fd);
1007 
1008   /* If requested now write out the header file.  */
1009   if (header_name != NULL)
1010     {
1011       int first = 1;
1012       FILE *fp;
1013 
1014       /* Open output file.  "-" or "/dev/stdout" means write to
1015 	 standard output.  */
1016       if (strcmp (header_name, "-") == 0
1017 	  || strcmp (header_name, "/dev/stdout") == 0)
1018 	fp = stdout;
1019       else
1020 	{
1021 	  fp = fopen (header_name, "w");
1022 	  if (fp == NULL)
1023 	    error (EXIT_FAILURE, errno,
1024 		   gettext ("cannot open output file `%s'"), header_name);
1025 	}
1026 
1027       /* Iterate over all sets and all messages.  */
1028       set_run = catalog->all_sets;
1029       while (set_run != NULL)
1030 	{
1031 	  struct message_list *message_run;
1032 
1033 	  /* If the current message set has a symbolic name write this
1034 	     out first.  */
1035 	  if (set_run->symbol != NULL)
1036 	    fprintf (fp, "%s#define %sSet %#x\t/* %s:%Zu */\n",
1037 		     first ? "" : "\n", set_run->symbol, set_run->number - 1,
1038 		     set_run->fname, set_run->line);
1039 	  first = 0;
1040 
1041 	  message_run = set_run->messages;
1042 	  while (message_run != NULL)
1043 	    {
1044 	      /* If the current message has a symbolic name write
1045 		 #define out.  But we have to take care for the set
1046 		 not having a symbolic name.  */
1047 	      if (message_run->symbol != NULL)
1048 		{
1049 		  if (set_run->symbol == NULL)
1050 		    fprintf (fp, "#define AutomaticSet%d%s %#x\t/* %s:%Zu */\n",
1051 			     set_run->number, message_run->symbol,
1052 			     message_run->number, message_run->fname,
1053 			     message_run->line);
1054 		  else
1055 		    fprintf (fp, "#define %s%s %#x\t/* %s:%Zu */\n",
1056 			     set_run->symbol, message_run->symbol,
1057 			     message_run->number, message_run->fname,
1058 			     message_run->line);
1059 		}
1060 
1061 	      message_run = message_run->next;
1062 	    }
1063 
1064 	  set_run = set_run->next;
1065 	}
1066 
1067       if (fp != stdout)
1068 	fclose (fp);
1069     }
1070 }
1071 
1072 
1073 static struct set_list *
find_set(struct catalog * current,int number)1074 find_set (struct catalog *current, int number)
1075 {
1076   struct set_list *result = current->all_sets;
1077 
1078   /* We must avoid set number 0 because a set of this number signals
1079      in the tables that the entry is not occupied.  */
1080   ++number;
1081 
1082   while (result != NULL)
1083     if (result->number == number)
1084       return result;
1085     else
1086       result = result->next;
1087 
1088   /* Prepare new message set.  */
1089   result = (struct set_list *) xcalloc (1, sizeof (*result));
1090   result->number = number;
1091   result->next = current->all_sets;
1092   current->all_sets = result;
1093 
1094   return result;
1095 }
1096 
1097 
/* Rewrite STRING in place: an optional leading QUOTE_CHAR is dropped,
   the text ends at the first quote character which is not escaped, and
   every sequence introduced by ESCAPE_CHAR is replaced by the character
   it stands for.  Octal escapes are converted to a wide character
   through CD.  Problems are reported for position FNAME:LINE.  */
static void
normalize_line (const char *fname, size_t line, iconv_t cd, wchar_t *string,
                wchar_t quote_char, wchar_t escape_char)
{
  wchar_t *src = string;        /* Next character to examine.  */
  wchar_t *dst = string;        /* Where the next result character goes.  */
  int is_quoted = quote_char != L'\0' && *src == quote_char;

  /* Step over the opening quote.  */
  if (is_quoted)
    ++src;

  while (*src != L'\0')
    {
      wchar_t replacement;

      /* The first quote character which is not escaped ends the
         string.  */
      if (*src == quote_char)
        break;

      /* Everything but the escape character is copied unchanged.  */
      if (*src != escape_char)
        {
          *dst++ = *src++;
          continue;
        }

      /* Skip the escape character and look at what it introduces.  */
      ++src;

      if (quote_char != L'\0' && *src == quote_char)
        {
          /* An escaped quote character.  This is an extension to XPG.  */
          *dst++ = *src++;
          continue;
        }

      if (*src >= L'0' && *src <= L'7')
        {
          /* Octal escape: collect digits as long as the value still
             fits into one byte, then convert that byte (plus the
             terminating NUL) to wide characters.  */
          int value;
          char mb[2];
          wchar_t wc[2];
          char *inptr;
          char *outptr;
          size_t inleft;
          size_t outleft;

          value = *src++ - L'0';
          while (value <= (255 / 8) && *src >= L'0' && *src <= L'7')
            {
              value *= 8;
              value += *src++ - L'0';
            }

          mb[0] = (char) value;
          mb[1] = '\0';
          inptr = mb;
          inleft = 2;

          outptr = (char *) wc;
          outleft = sizeof (wc);

          /* Flush the state.  */
          iconv (cd, NULL, NULL, NULL, NULL);

          iconv (cd, &inptr, &inleft, &outptr, &outleft);

          /* Accept the result only if both input bytes were consumed
             and exactly two wide characters were produced.  */
          if (inptr == &mb[2] && (wchar_t *) outptr == &wc[2])
            *dst++ = wc[0];
          else
            error_at_line (0, 0, fname, line,
                           gettext ("invalid escape sequence"));
          continue;
        }

      /* The remaining escapes all stand for exactly one character.  */
      switch (*src)
        {
        case L'n':
          replacement = L'\n';
          break;
        case L't':
          replacement = L'\t';
          break;
        case L'v':
          replacement = L'\v';
          break;
        case L'b':
          replacement = L'\b';
          break;
        case L'r':
          replacement = L'\r';
          break;
        case L'f':
          replacement = L'\f';
          break;
        default:
          /* For an unknown sequence only the escape character is
             dropped; the character following it is handled by the
             next iteration.  */
          if (*src != escape_char)
            continue;
          /* A doubled escape character stands for itself.  */
          replacement = escape_char;
          break;
        }

      *dst++ = replacement;
      ++src;
    }

  /* A string which started with a quote character must also end with
     one.  */
  if (is_quoted && *src != quote_char)
    error_at_line (0, 0, fname, line, gettext ("unterminated message"));

  /* Terminate string.  */
  *dst = L'\0';
}
1216 
1217 
1218 static void
read_old(struct catalog * catalog,const char * file_name)1219 read_old (struct catalog *catalog, const char *file_name)
1220 {
1221   struct catalog_info old_cat_obj;
1222   struct set_list *set = NULL;
1223   int last_set = -1;
1224   size_t cnt;
1225 
1226   /* Try to open catalog, but don't look through the NLSPATH.  */
1227   if (__open_catalog (file_name, NULL, NULL, &old_cat_obj) != 0)
1228     {
1229       if (errno == ENOENT)
1230 	/* No problem, the catalog simply does not exist.  */
1231 	return;
1232       else
1233 	error (EXIT_FAILURE, errno,
1234 	       gettext ("while opening old catalog file"));
1235     }
1236 
1237   /* OK, we have the catalog loaded.  Now read all messages and merge
1238      them.  When set and message number clash for any message the new
1239      one is used.  If the new one is empty it indicates that the
1240      message should be deleted.  */
1241   for (cnt = 0; cnt < old_cat_obj.plane_size * old_cat_obj.plane_depth; ++cnt)
1242     {
1243       struct message_list *message, *last;
1244 
1245       if (old_cat_obj.name_ptr[cnt * 3 + 0] == 0)
1246 	/* No message in this slot.  */
1247 	continue;
1248 
1249       if (old_cat_obj.name_ptr[cnt * 3 + 0] - 1 != (uint32_t) last_set)
1250 	{
1251 	  last_set = old_cat_obj.name_ptr[cnt * 3 + 0] - 1;
1252 	  set = find_set (catalog, old_cat_obj.name_ptr[cnt * 3 + 0] - 1);
1253 	}
1254 
1255       last = NULL;
1256       message = set->messages;
1257       while (message != NULL)
1258 	{
1259 	  if ((uint32_t) message->number >= old_cat_obj.name_ptr[cnt * 3 + 1])
1260 	    break;
1261 	  last = message;
1262 	  message = message->next;
1263 	}
1264 
1265       if (message == NULL
1266 	  || (uint32_t) message->number > old_cat_obj.name_ptr[cnt * 3 + 1])
1267 	{
1268 	  /* We have found a message which is not yet in the catalog.
1269 	     Insert it at the right position.  */
1270 	  struct message_list *newp;
1271 
1272 	  newp = (struct message_list *) xmalloc (sizeof (*newp));
1273 	  newp->number = old_cat_obj.name_ptr[cnt * 3 + 1];
1274 	  newp->message =
1275 	    &old_cat_obj.strings[old_cat_obj.name_ptr[cnt * 3 + 2]];
1276 	  newp->fname = NULL;
1277 	  newp->line = 0;
1278 	  newp->symbol = NULL;
1279 	  newp->next = message;
1280 
1281 	  if (last == NULL)
1282 	    set->messages = newp;
1283 	  else
1284 	    last->next = newp;
1285 
1286 	  ++catalog->total_messages;
1287 	}
1288       else if (*message->message == '\0')
1289 	{
1290 	  /* The new empty message has overridden the old one thus
1291 	     "deleting" it as required.  Now remove the empty remains. */
1292 	  if (last == NULL)
1293 	    set->messages = message->next;
1294 	  else
1295 	    last->next = message->next;
1296 	}
1297     }
1298 }
1299 
1300 
/* Open the conversion descriptors needed for a catalog source written
   in CODESET.  If CODESET is NULL the codeset of the locale selected
   by the environment is used instead.

   On success 0 is returned, *CD_TOWCP holds the descriptor converting
   from CODESET to wchar_t, *CD_TOMBP the one for the opposite
   direction, and *ESCAPE_CHARP the wide character the byte 0x5c is
   mapped to in CODESET.

   On failure a message is printed and 1 is returned.  Both
   descriptors are then set to (iconv_t) -1 and *ESCAPE_CHARP is left
   untouched.  */
static int
open_conversion (const char *codeset, iconv_t *cd_towcp, iconv_t *cd_tombp,
                 wchar_t *escape_charp)
{
  char buf[2];
  char *bufptr;
  size_t bufsize;
  wchar_t wbuf[2];
  char *wbufptr;
  size_t wbufsize;

  /* If the input file does not specify the codeset use the locale's.  */
  if (codeset == NULL)
    {
      setlocale (LC_ALL, "");
      /* NOTE(review): POSIX permits a later setlocale call to
         invalidate the string returned by nl_langinfo; the code below
         relies on it staying valid -- confirm for the targeted C
         library.  */
      codeset = nl_langinfo (CODESET);
      setlocale (LC_ALL, "C");
    }

  /* Get the conversion modules.  */
  *cd_towcp = iconv_open ("WCHAR_T", codeset);
  *cd_tombp = iconv_open (codeset, "WCHAR_T");
  if (*cd_towcp == (iconv_t) -1 || *cd_tombp == (iconv_t) -1)
    {
      error (0, 0, gettext ("conversion modules not available"));

      /* Release whichever descriptor could be opened and mark both as
         invalid.  This way no descriptor is leaked and the caller is
         never left with a descriptor which is already closed.  */
      if (*cd_towcp != (iconv_t) -1)
        {
          iconv_close (*cd_towcp);
          *cd_towcp = (iconv_t) -1;
        }
      if (*cd_tombp != (iconv_t) -1)
        {
          iconv_close (*cd_tombp);
          *cd_tombp = (iconv_t) -1;
        }

      return 1;
    }

  /* One special case for historical reasons is the backslash
     character.  In some codesets the byte value 0x5c is not mapped to
     U005c in Unicode.  These charsets then don't have a backslash
     character at all.  Therefore we have to live with whatever the
     codeset provides and recognize, instead of the U005c, the character
     the byte value 0x5c is mapped to.  */
  buf[0] = '\\';
  buf[1] = '\0';
  bufptr = buf;
  bufsize = 2;

  wbufptr = (char *) wbuf;
  wbufsize = sizeof (wbuf);

  /* Convert the byte together with the terminating NUL; the
     conversion is only accepted if all input was consumed and the
     output buffer was filled completely.  */
  iconv (*cd_towcp, &bufptr, &bufsize, &wbufptr, &wbufsize);
  if (bufsize != 0 || wbufsize != 0)
    {
      /* Something went wrong, we couldn't convert the byte 0x5c.  Go
         on with using U005c.  */
      error (0, 0, gettext ("cannot determine escape character"));
      *escape_charp = L'\\';
    }
  else
    *escape_charp = wbuf[0];

  return 0;
}
1359