1 /* Copyright (C) 1996-2022 Free Software Foundation, Inc.
2    This file is part of the GNU C Library.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published
6    by the Free Software Foundation; version 2 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
16 
17 #ifdef HAVE_CONFIG_H
18 # include <config.h>
19 #endif
20 
21 #include <ctype.h>
22 #include <errno.h>
23 #include <libintl.h>
24 #include <limits.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <stdint.h>
29 
30 #include "localedef.h"
31 #include "linereader.h"
32 #include "charmap.h"
33 #include "charmap-dir.h"
34 
35 #include <assert.h>
36 
37 
38 /* Define the lookup function.  */
39 #include "charmap-kw.h"
40 
41 
42 /* Prototypes for local functions.  */
/* Runs the charmap state machine over the tokens read from CMFILE and
   returns the newly allocated charmap description; closes CMFILE when
   the end of the file has been reached.  */
43 static struct charmap_t *parse_charmap (struct linereader *cmfile,
44 					int verbose, int be_quiet);
/* Appends one width rule covering the characters named FROM .. TO (TO may
   be NULL for a single character) to RESULT; problems are reported
   through CMFILE.  */
45 static void new_width (struct linereader *cmfile, struct charmap_t *result,
46 		       const char *from, const char *to,
47 		       unsigned long int width);
/* Enters the character FROM, or every character of the range FROM .. TO
   when TO is not NULL, into the name and byte tables of CM.  BYTES holds
   the NBYTES long encoding of the first character.  */
48 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
49 			      size_t nbytes, unsigned char *bytes,
50 			      const char *from, const char *to,
51 			      int decimal_ellipsis, int step);
52 
53 
/* Set to true by charmap_read when the loaded charmap fails the ASCII
   compatibility test for the basic character set.  It is never reset to
   false in this file.  */
54 bool enc_not_ascii_compatible;
55 
56 
/* Only defined when NEED_NULL_POINTER is set.  Having static storage
   duration and no initializer it holds a null pointer.  NOTE(review): no
   use of this object is visible in this part of the file -- presumably
   it is referenced by the generated keyword lookup code; confirm.  */
57 #ifdef NEED_NULL_POINTER
58 static const char *null_pointer;
59 #endif
60 
61 static struct linereader *
cmlr_open(const char * directory,const char * name,kw_hash_fct_t hf)62 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
63 {
64   FILE *fp;
65 
66   fp = charmap_open (directory, name);
67   if (fp == NULL)
68     return NULL;
69   else
70     {
71       size_t dlen = strlen (directory);
72       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
73       size_t nlen = strlen (name);
74       char *pathname;
75       char *p;
76 
77       pathname = alloca (dlen + add_slash + nlen + 1);
78       p = stpcpy (pathname, directory);
79       if (add_slash)
80 	*p++ = '/';
81       stpcpy (p, name);
82 
83       return lr_create (fp, pathname, hf);
84     }
85 }
86 
87 struct charmap_t *
charmap_read(const char * filename,int verbose,int error_not_found,int be_quiet,int use_default)88 charmap_read (const char *filename, int verbose, int error_not_found,
89 	      int be_quiet, int use_default)
90 {
91   struct charmap_t *result = NULL;
92 
93   if (filename != NULL)
94     {
95       struct linereader *cmfile;
96 
97       /* First try the name as found in the parameter.  */
98       cmfile = lr_open (filename, charmap_hash);
99       if (cmfile == NULL)
100 	{
101 	  /* No successful.  So start looking through the directories
102 	     in the I18NPATH if this is a simple name.  */
103 	  if (strchr (filename, '/') == NULL)
104 	    {
105 	      char *i18npath = getenv ("I18NPATH");
106 	      if (i18npath != NULL && *i18npath != '\0')
107 		{
108 		  const size_t pathlen = strlen (i18npath);
109 		  char i18npathbuf[pathlen + 1];
110 		  char path[pathlen + sizeof ("/charmaps")];
111 		  char *next;
112 		  i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
113 
114 		  while (cmfile == NULL
115 			 && (next = strsep (&i18npath, ":")) != NULL)
116 		    {
117 		      stpcpy (stpcpy (path, next), "/charmaps");
118 		      cmfile = cmlr_open (path, filename, charmap_hash);
119 
120 		      if (cmfile == NULL)
121 			/* Try without the "/charmaps" part.  */
122 			cmfile = cmlr_open (next, filename, charmap_hash);
123 		    }
124 		}
125 
126 	      if (cmfile == NULL)
127 		/* Try the default directory.  */
128 		cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
129 	    }
130 	}
131 
132       if (cmfile != NULL)
133 	result = parse_charmap (cmfile, verbose, be_quiet);
134 
135       if (result == NULL && error_not_found)
136 	record_error (0, errno,
137 		      _("character map file `%s' not found"),
138 		      filename);
139     }
140 
141   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
142     {
143       /* OK, one more try.  We also accept the names given to the
144 	 character sets in the files.  Sometimes they differ from the
145 	 file name.  */
146       CHARMAP_DIR *dir;
147 
148       dir = charmap_opendir (CHARMAP_PATH);
149       if (dir != NULL)
150 	{
151 	  const char *dirent;
152 
153 	  while ((dirent = charmap_readdir (dir)) != NULL)
154 	    {
155 	      char **aliases;
156 	      char **p;
157 	      int found;
158 
159 	      aliases = charmap_aliases (CHARMAP_PATH, dirent);
160 	      found = 0;
161 	      for (p = aliases; *p; p++)
162 		if (strcasecmp (*p, filename) == 0)
163 		  {
164 		    found = 1;
165 		    break;
166 		  }
167 	      charmap_free_aliases (aliases);
168 
169 	      if (found)
170 		{
171 		  struct linereader *cmfile;
172 
173 		  cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
174 		  if (cmfile != NULL)
175 		    result = parse_charmap (cmfile, verbose, be_quiet);
176 
177 		  break;
178 		}
179 	    }
180 
181 	  charmap_closedir (dir);
182 	}
183     }
184 
185   if (result == NULL && DEFAULT_CHARMAP != NULL)
186     {
187       struct linereader *cmfile;
188 
189       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
190       if (cmfile != NULL)
191 	result = parse_charmap (cmfile, verbose, be_quiet);
192 
193       if (result == NULL)
194 	record_error (4, errno,
195 		      _("default character map file `%s' not found"),
196 		      DEFAULT_CHARMAP);
197     }
198 
199   if (result != NULL && result->code_set_name == NULL)
200     /* The input file does not specify a code set name.  This
201        shouldn't happen but we should cope with it.  */
202     result->code_set_name = basename (filename);
203 
204   /* Test of ASCII compatibility of locale encoding.
205 
206      Verify that the encoding to be used in a locale is ASCII compatible,
207      at least for the graphic characters, excluding the control characters,
208      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
209 
210      ISO C 99 section 7.17.(2) (about wchar_t):
211        the null character shall have the code value zero and each member of
212        the basic character set shall have a code value equal to its value
213        when used as the lone character in an integer character constant.
214      ISO C 99 section 5.2.1.(3):
215        Both the basic source and basic execution character sets shall have
216        the following members: the 26 uppercase letters of the Latin alphabet
217             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
218        the 26 lowercase letters of the Latin alphabet
219             a b c d e f g h i j k l m n o p q r s t u v w x y z
220        the 10 decimal digits
221             0 1 2 3 4 5 6 7 8 9
222        the following 29 graphic characters
223             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
224        the space character, and control characters representing horizontal
225        tab, vertical tab, and form feed.
226 
227      Therefore, for all members of the "basic character set", the 'char' code
228      must have the same value as the 'wchar_t' code, which in glibc is the
229      same as the Unicode code, which for all of the enumerated characters
230      is identical to the ASCII code. */
231   if (result != NULL && use_default)
232     {
233       static const char basic_charset[] =
234 	{
235 	  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
236 	  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
237 	  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
238 	  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
239 	  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
240 	  '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
241 	  '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
242 	  '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
243 	};
244       int failed = 0;
245       const char *p = basic_charset;
246 
247       do
248 	{
249 	  struct charseq *seq = charmap_find_symbol (result, p, 1);
250 
251 	  if (seq == NULL || seq->ucs4 != (uint32_t) *p)
252 	    failed = 1;
253 	}
254       while (*p++ != '\0');
255 
256       if (failed)
257 	{
258 	  /* A user may disable the ASCII compatibility warning check,
259 	     but we must remember that the encoding is not ASCII
260 	     compatible, since it may have other implications.  Later
261 	     we will set _NL_CTYPE_MAP_TO_NONASCII from this value.  */
262 	  if (warn_ascii)
263 	    record_warning (_(
264 "character map `%s' is not ASCII compatible, locale not ISO C compliant "
265 "[--no-warnings=ascii]"),
266 			    result->code_set_name);
267 	  enc_not_ascii_compatible = true;
268 	}
269     }
270 
271   return result;
272 }
273 
274 
275 static struct charmap_t *
parse_charmap(struct linereader * cmfile,int verbose,int be_quiet)276 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
277 {
278   struct charmap_t *result;
279   int state;
280   enum token_t expected_tok = tok_error;
281   const char *expected_str = NULL;
282   char *from_name = NULL;
283   char *to_name = NULL;
284   enum token_t ellipsis = 0;
285   int step = 1;
286 
287   /* We don't want symbolic names in string to be translated.  */
288   cmfile->translate_strings = 0;
289 
290   /* Allocate room for result.  */
291   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
292   memset (result, '\0', sizeof (struct charmap_t));
293   /* The default DEFAULT_WIDTH is 1.  */
294   result->width_default = 1;
295 
296 #define obstack_chunk_alloc malloc
297 #define obstack_chunk_free free
298   obstack_init (&result->mem_pool);
299 
300   if (init_hash (&result->char_table, 256)
301       || init_hash (&result->byte_table, 256))
302     {
303       free (result);
304       return NULL;
305     }
306 
307   /* We use a state machine to describe the charmap description file
308      format.  */
309   state = 1;
310   while (1)
311     {
312       /* What's on?  */
313       struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
314       enum token_t nowtok = now->tok;
315       struct token *arg;
316 
317       if (nowtok == tok_eof)
318 	break;
319 
320       switch (state)
321 	{
322 	case 1:
323 	  /* The beginning.  We expect the special declarations, EOL or
324 	     `CHARMAP'.  */
325 	  if (nowtok == tok_eol)
326 	    /* Ignore empty lines.  */
327 	    continue;
328 
329 	  if (nowtok == tok_charmap)
330 	    {
331 	      from_name = NULL;
332 	      to_name = NULL;
333 
334 	      /* We have to set up the real work.  Fill in some
335 		 default values.  */
336 	      if (result->mb_cur_max == 0)
337 		result->mb_cur_max = 1;
338 	      if (result->mb_cur_min == 0)
339 		result->mb_cur_min = result->mb_cur_max;
340 	      if (result->mb_cur_min > result->mb_cur_max)
341 		{
342 		  record_error (0, 0, _("\
343 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
344 				cmfile->fname);
345 
346 		  result->mb_cur_min = result->mb_cur_max;
347 		}
348 
349 	      lr_ignore_rest (cmfile, 1);
350 
351 	      state = 2;
352 	      continue;
353 	    }
354 
355 	  if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
356 	      && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
357 	      && nowtok != tok_comment_char && nowtok != tok_g0esc
358 	      && nowtok != tok_g1esc && nowtok != tok_g2esc
359 	      && nowtok != tok_g3esc && nowtok != tok_repertoiremap
360 	      && nowtok != tok_include)
361 	    {
362 	      lr_error (cmfile, _("syntax error in prolog: %s"),
363 			_("invalid definition"));
364 
365 	      lr_ignore_rest (cmfile, 0);
366 	      continue;
367 	    }
368 
369 	  /* We know that we need an argument.  */
370 	  arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
371 
372 	  switch (nowtok)
373 	    {
374 	    case tok_code_set_name:
375 	    case tok_repertoiremap:
376 	      if (arg->tok != tok_ident && arg->tok != tok_string)
377 		{
378 		badarg:
379 		  lr_error (cmfile, _("syntax error in prolog: %s"),
380 			    _("bad argument"));
381 
382 		  lr_ignore_rest (cmfile, 0);
383 		  continue;
384 		}
385 
386 	      if (nowtok == tok_code_set_name)
387 		result->code_set_name = obstack_copy0 (&result->mem_pool,
388 						       arg->val.str.startmb,
389 						       arg->val.str.lenmb);
390 	      else
391 		result->repertoiremap = obstack_copy0 (&result->mem_pool,
392 						       arg->val.str.startmb,
393 						       arg->val.str.lenmb);
394 
395 	      lr_ignore_rest (cmfile, 1);
396 	      continue;
397 
398 	    case tok_mb_cur_max:
399 	    case tok_mb_cur_min:
400 	      if (arg->tok != tok_number)
401 		goto badarg;
402 
403 	      if ((nowtok == tok_mb_cur_max
404 		       && result->mb_cur_max != 0)
405 		      || (nowtok == tok_mb_cur_max
406 			  && result->mb_cur_max != 0))
407 		lr_error (cmfile, _("duplicate definition of <%s>"),
408 			  nowtok == tok_mb_cur_min
409 			  ? "mb_cur_min" : "mb_cur_max");
410 
411 	      if (arg->val.num < 1)
412 		{
413 		  lr_error (cmfile,
414 			    _("value for <%s> must be 1 or greater"),
415 			    nowtok == tok_mb_cur_min
416 			    ? "mb_cur_min" : "mb_cur_max");
417 
418 		  lr_ignore_rest (cmfile, 0);
419 		  continue;
420 		}
421 	      if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
422 		   && (int) arg->val.num < result->mb_cur_min)
423 		  || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
424 		      && (int) arg->val.num > result->mb_cur_max))
425 		{
426 		  lr_error (cmfile, _("\
427 value of <%s> must be greater or equal than the value of <%s>"),
428 			    "mb_cur_max", "mb_cur_min");
429 
430 		  lr_ignore_rest (cmfile, 0);
431 		  continue;
432 		}
433 
434 	      if (nowtok == tok_mb_cur_max)
435 		result->mb_cur_max = arg->val.num;
436 	      else
437 		result->mb_cur_min = arg->val.num;
438 
439 	      lr_ignore_rest (cmfile, 1);
440 	      continue;
441 
442 	    case tok_escape_char:
443 	    case tok_comment_char:
444 	      if (arg->tok != tok_ident)
445 		goto badarg;
446 
447 	      if (arg->val.str.lenmb != 1)
448 		{
449 		  lr_error (cmfile, _("\
450 argument to <%s> must be a single character"),
451 			    nowtok == tok_escape_char ? "escape_char"
452 						      : "comment_char");
453 
454 		  lr_ignore_rest (cmfile, 0);
455 		  continue;
456 		}
457 
458 	      if (nowtok == tok_escape_char)
459 		cmfile->escape_char = *arg->val.str.startmb;
460 	      else
461 		cmfile->comment_char = *arg->val.str.startmb;
462 
463 	      lr_ignore_rest (cmfile, 1);
464 	      continue;
465 
466 	    case tok_g0esc:
467 	    case tok_g1esc:
468 	    case tok_g2esc:
469 	    case tok_g3esc:
470 	    case tok_escseq:
471 	      lr_ignore_rest (cmfile, 0); /* XXX */
472 	      continue;
473 
474 	    case tok_include:
475 	      lr_error (cmfile, _("\
476 character sets with locking states are not supported"));
477 	      exit (4);
478 
479 	    default:
480 	      /* Cannot happen.  */
481 	      assert (! "Should not happen");
482 	    }
483 	  break;
484 
485 	case 2:
486 	  /* We have seen `CHARMAP' and now are in the body.  Each line
487 	     must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
488 	  if (nowtok == tok_eol)
489 	    /* Ignore empty lines.  */
490 	    continue;
491 
492 	  if (nowtok == tok_end)
493 	    {
494 	      expected_tok = tok_charmap;
495 	      expected_str = "CHARMAP";
496 	      state = 90;
497 	      continue;
498 	    }
499 
500 	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
501 	    {
502 	      lr_error (cmfile, _("syntax error in %s definition: %s"),
503 			"CHARMAP", _("no symbolic name given"));
504 
505 	      lr_ignore_rest (cmfile, 0);
506 	      continue;
507 	    }
508 
509 	  /* If the previous line was not completely correct free the
510 	     used memory.  */
511 	  if (from_name != NULL)
512 	    obstack_free (&result->mem_pool, from_name);
513 
514 	  if (nowtok == tok_bsymbol)
515 	    from_name = (char *) obstack_copy0 (&result->mem_pool,
516 						now->val.str.startmb,
517 						now->val.str.lenmb);
518 	  else
519 	    {
520 	      obstack_printf (&result->mem_pool, "U%08X",
521 			      cmfile->token.val.ucs4);
522 	      obstack_1grow (&result->mem_pool, '\0');
523 	      from_name = (char *) obstack_finish (&result->mem_pool);
524 	    }
525 	  to_name = NULL;
526 
527 	  state = 3;
528 	  continue;
529 
530 	case 3:
531 	  /* We have two possibilities: We can see an ellipsis or an
532 	     encoding value.  */
533 	  if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
534 	      || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
535 	      || nowtok == tok_ellipsis2_2)
536 	    {
537 	      ellipsis = nowtok;
538 	      if (nowtok == tok_ellipsis4_2)
539 		{
540 		  step = 2;
541 		  nowtok = tok_ellipsis4;
542 		}
543 	      else if (nowtok == tok_ellipsis2_2)
544 		{
545 		  step = 2;
546 		  nowtok = tok_ellipsis2;
547 		}
548 	      state = 4;
549 	      continue;
550 	    }
551 	  /* FALLTHROUGH */
552 
553 	case 5:
554 	  if (nowtok != tok_charcode)
555 	    {
556 	      lr_error (cmfile, _("syntax error in %s definition: %s"),
557 			"CHARMAP", _("invalid encoding given"));
558 
559 	      lr_ignore_rest (cmfile, 0);
560 
561 	      state = 2;
562 	      continue;
563 	    }
564 
565 	  if (now->val.charcode.nbytes < result->mb_cur_min)
566 	    lr_error (cmfile, _("too few bytes in character encoding"));
567 	  else if (now->val.charcode.nbytes > result->mb_cur_max)
568 	    lr_error (cmfile, _("too many bytes in character encoding"));
569 	  else
570 	    charmap_new_char (cmfile, result, now->val.charcode.nbytes,
571 			      now->val.charcode.bytes, from_name, to_name,
572 			      ellipsis != tok_ellipsis2, step);
573 
574 	  /* Ignore trailing comment silently.  */
575 	  lr_ignore_rest (cmfile, 0);
576 
577 	  from_name = NULL;
578 	  to_name = NULL;
579 	  ellipsis = tok_none;
580 	  step = 1;
581 
582 	  state = 2;
583 	  continue;
584 
585 	case 4:
586 	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
587 	    {
588 	      lr_error (cmfile, _("syntax error in %s definition: %s"),
589 			"CHARMAP",
590 			_("no symbolic name given for end of range"));
591 
592 	      lr_ignore_rest (cmfile, 0);
593 	      continue;
594 	    }
595 
596 	  /* Copy the to-name in a safe place.  */
597 	  if (nowtok == tok_bsymbol)
598 	    to_name = (char *) obstack_copy0 (&result->mem_pool,
599 					      cmfile->token.val.str.startmb,
600 					      cmfile->token.val.str.lenmb);
601 	  else
602 	    {
603 	      obstack_printf (&result->mem_pool, "U%08X",
604 			      cmfile->token.val.ucs4);
605 	      obstack_1grow (&result->mem_pool, '\0');
606 	      to_name = (char *) obstack_finish (&result->mem_pool);
607 	    }
608 
609 	  state = 5;
610 	  continue;
611 
612 	case 90:
613 	  if (nowtok != expected_tok)
614 	    lr_error (cmfile, _("\
615 %1$s: definition does not end with `END %1$s'"), expected_str);
616 
617 	  lr_ignore_rest (cmfile, nowtok == expected_tok);
618 	  state = 91;
619 	  continue;
620 
621 	case 91:
622 	  /* Waiting for WIDTH... */
623 	  if (nowtok == tok_eol)
624 	    /* Ignore empty lines.  */
625 	    continue;
626 
627 	  if (nowtok == tok_width_default)
628 	    {
629 	      state = 92;
630 	      continue;
631 	    }
632 
633 	  if (nowtok == tok_width)
634 	    {
635 	      lr_ignore_rest (cmfile, 1);
636 	      state = 93;
637 	      continue;
638 	    }
639 
640 	  if (nowtok == tok_width_variable)
641 	    {
642 	      lr_ignore_rest (cmfile, 1);
643 	      state = 98;
644 	      continue;
645 	    }
646 
647 	  lr_error (cmfile, _("\
648 only WIDTH definitions are allowed to follow the CHARMAP definition"));
649 
650 	  lr_ignore_rest (cmfile, 0);
651 	  continue;
652 
653 	case 92:
654 	  if (nowtok != tok_number)
655 	    lr_error (cmfile, _("value for %s must be an integer"),
656 		      "WIDTH_DEFAULT");
657 	  else
658 	    result->width_default = now->val.num;
659 
660 	  lr_ignore_rest (cmfile, nowtok == tok_number);
661 
662 	  state = 91;
663 	  continue;
664 
665 	case 93:
666 	  /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
667 	     "%s...%s %d\n".  */
668 	  if (nowtok == tok_eol)
669 	    /* ignore empty lines.  */
670 	    continue;
671 
672 	  if (nowtok == tok_end)
673 	    {
674 	      expected_tok = tok_width;
675 	      expected_str = "WIDTH";
676 	      state = 90;
677 	      continue;
678 	    }
679 
680 	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
681 	    {
682 	      lr_error (cmfile, _("syntax error in %s definition: %s"),
683 			"WIDTH", _("no symbolic name given"));
684 
685 	      lr_ignore_rest (cmfile, 0);
686 	      continue;
687 	    }
688 
689 	  if (from_name != NULL)
690 	    obstack_free (&result->mem_pool, from_name);
691 
692 	  if (nowtok == tok_bsymbol)
693 	    from_name = (char *) obstack_copy0 (&result->mem_pool,
694 						now->val.str.startmb,
695 						now->val.str.lenmb);
696 	  else
697 	    {
698 	      obstack_printf (&result->mem_pool, "U%08X",
699 			      cmfile->token.val.ucs4);
700 	      obstack_1grow (&result->mem_pool, '\0');
701 	      from_name = (char *) obstack_finish (&result->mem_pool);
702 	    }
703 
704 	  to_name = NULL;
705 
706 	  state = 94;
707 	  continue;
708 
709 	case 94:
710 	  if (nowtok == tok_ellipsis3)
711 	    {
712 	      state = 95;
713 	      continue;
714 	    }
715 	  /* Fall through.  */
716 
717 	case 96:
718 	  if (nowtok != tok_number)
719 	    lr_error (cmfile, _("value for %s must be an integer"),
720 		      "WIDTH");
721 	  else
722 	    {
723 	      /* Store width for chars.  */
724 	      new_width (cmfile, result, from_name, to_name, now->val.num);
725 
726 	      from_name = NULL;
727 	      to_name = NULL;
728 	    }
729 
730 	  lr_ignore_rest (cmfile, nowtok == tok_number);
731 
732 	  state = 93;
733 	  continue;
734 
735 	case 95:
736 	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
737 	    {
738 	      lr_error (cmfile, _("syntax error in %s definition: %s"),
739 			"WIDTH", _("no symbolic name given for end of range"));
740 
741 	      lr_ignore_rest (cmfile, 0);
742 
743 	      state = 93;
744 	      continue;
745 	    }
746 
747 	  if (nowtok == tok_bsymbol)
748 	    to_name = (char *) obstack_copy0 (&result->mem_pool,
749 					      now->val.str.startmb,
750 					      now->val.str.lenmb);
751 	  else
752 	    {
753 	      obstack_printf (&result->mem_pool, "U%08X",
754 			      cmfile->token.val.ucs4);
755 	      obstack_1grow (&result->mem_pool, '\0');
756 	      to_name = (char *) obstack_finish (&result->mem_pool);
757 	    }
758 
759 	  state = 96;
760 	  continue;
761 
762 	case 98:
763 	  /* We now expect `END WIDTH_VARIABLE' or lines of the format
764 	     "%s\n" or "%s...%s\n".  */
765 	  if (nowtok == tok_eol)
766 	    /* ignore empty lines.  */
767 	    continue;
768 
769 	  if (nowtok == tok_end)
770 	    {
771 	      expected_tok = tok_width_variable;
772 	      expected_str = "WIDTH_VARIABLE";
773 	      state = 90;
774 	      continue;
775 	    }
776 
777 	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
778 	    {
779 	      lr_error (cmfile, _("syntax error in %s definition: %s"),
780 			"WIDTH_VARIABLE", _("no symbolic name given"));
781 
782 	      lr_ignore_rest (cmfile, 0);
783 
784 	      continue;
785 	    }
786 
787 	  if (from_name != NULL)
788 	    obstack_free (&result->mem_pool, from_name);
789 
790 	  if (nowtok == tok_bsymbol)
791 	    from_name = (char *) obstack_copy0 (&result->mem_pool,
792 						now->val.str.startmb,
793 						now->val.str.lenmb);
794 	  else
795 	    {
796 	      obstack_printf (&result->mem_pool, "U%08X",
797 			      cmfile->token.val.ucs4);
798 	      obstack_1grow (&result->mem_pool, '\0');
799 	      from_name = (char *) obstack_finish (&result->mem_pool);
800 	    }
801 	  to_name = NULL;
802 
803 	  state = 99;
804 	  continue;
805 
806 	case 99:
807 	  if (nowtok == tok_ellipsis3)
808 	    state = 100;
809 
810 	  /* Store info.  */
811 	  from_name = NULL;
812 
813 	  /* Warn */
814 	  state = 98;
815 	  continue;
816 
817 	case 100:
818 	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
819 	    {
820 	      lr_error (cmfile, _("syntax error in %s definition: %s"),
821 			"WIDTH_VARIABLE",
822 			_("no symbolic name given for end of range"));
823 	      lr_ignore_rest (cmfile, 0);
824 	      continue;
825 	    }
826 
827 	  if (nowtok == tok_bsymbol)
828 	    to_name = (char *) obstack_copy0 (&result->mem_pool,
829 					      now->val.str.startmb,
830 					      now->val.str.lenmb);
831 	  else
832 	    {
833 	      obstack_printf (&result->mem_pool, "U%08X",
834 			      cmfile->token.val.ucs4);
835 	      obstack_1grow (&result->mem_pool, '\0');
836 	      to_name = (char *) obstack_finish (&result->mem_pool);
837 	    }
838 
839 	  /* XXX Enter value into table.  */
840 
841 	  lr_ignore_rest (cmfile, 1);
842 
843 	  state = 98;
844 	  continue;
845 
846 	default:
847 	  record_error (5, 0, _("%s: error in state machine"),
848 			__FILE__);
849 	  /* NOTREACHED */
850 	}
851       break;
852     }
853 
854   if (state != 91)
855     record_error (0, 0, _("%s: premature end of file"),
856 		  cmfile->fname);
857 
858   lr_close (cmfile);
859 
860   return result;
861 }
862 
863 
864 static void
new_width(struct linereader * cmfile,struct charmap_t * result,const char * from,const char * to,unsigned long int width)865 new_width (struct linereader *cmfile, struct charmap_t *result,
866 	   const char *from, const char *to, unsigned long int width)
867 {
868   struct charseq *from_val;
869   struct charseq *to_val;
870 
871   from_val = charmap_find_value (result, from, strlen (from));
872   if (from_val == NULL)
873     {
874       lr_error (cmfile, _("unknown character `%s'"), from);
875       return;
876     }
877 
878   if (to == NULL)
879     to_val = from_val;
880   else
881     {
882       to_val = charmap_find_value (result, to, strlen (to));
883       if (to_val == NULL)
884 	{
885 	  lr_error (cmfile, _("unknown character `%s'"), to);
886 	  return;
887 	}
888 
889       /* Make sure the number of bytes for the end points of the range
890 	 is correct.  */
891       if (from_val->nbytes != to_val->nbytes)
892 	{
893 	  lr_error (cmfile, _("\
894 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
895 		    from_val->nbytes, to_val->nbytes);
896 	  return;
897 	}
898     }
899 
900   if (result->nwidth_rules >= result->nwidth_rules_max)
901     {
902       size_t new_size = result->nwidth_rules + 32;
903       struct width_rule *new_rules =
904 	(struct width_rule *) obstack_alloc (&result->mem_pool,
905 					     (new_size
906 					      * sizeof (struct width_rule)));
907 
908       memcpy (new_rules, result->width_rules,
909 	      result->nwidth_rules_max * sizeof (struct width_rule));
910 
911       result->width_rules = new_rules;
912       result->nwidth_rules_max = new_size;
913     }
914 
915   result->width_rules[result->nwidth_rules].from = from_val;
916   result->width_rules[result->nwidth_rules].to = to_val;
917   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
918   ++result->nwidth_rules;
919 }
920 
921 
922 struct charseq *
charmap_find_value(const struct charmap_t * cm,const char * name,size_t len)923 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
924 {
925   void *result;
926 
927   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
928 	  < 0 ? NULL : (struct charseq *) result);
929 }
930 
931 
932 static void
charmap_new_char(struct linereader * lr,struct charmap_t * cm,size_t nbytes,unsigned char * bytes,const char * from,const char * to,int decimal_ellipsis,int step)933 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
934 		  size_t nbytes, unsigned char *bytes,
935 		  const char *from, const char *to,
936 		  int decimal_ellipsis, int step)
937 {
938   hash_table *ht = &cm->char_table;
939   hash_table *bt = &cm->byte_table;
940   struct obstack *ob = &cm->mem_pool;
941   char *from_end;
942   char *to_end;
943   const char *cp;
944   int prefix_len, len1, len2;
945   unsigned int from_nr, to_nr, cnt;
946   struct charseq *newp;
947 
948   len1 = strlen (from);
949 
950   if (to == NULL)
951     {
952       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
953       newp->nbytes = nbytes;
954       memcpy (newp->bytes, bytes, nbytes);
955       newp->name = from;
956 
957       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
958       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
959 	{
960 	  /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
961 	     xxxx and xxxxxxxx are hexadecimal numbers.  In this case
962 	     we use the value of xxxx or xxxxxxxx as the UCS4 value of
963 	     this character and we don't have to consult the repertoire
964 	     map.
965 
966 	     If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
967 	     and xxxxxxxx also give the code point in UCS4 but this must
968 	     be in the private, i.e., unassigned, area.  This should be
969 	     used for characters which do not (yet) have an equivalent
970 	     in ISO 10646 and Unicode.  */
971 	  char *endp;
972 
973 	  errno = 0;
974 	  newp->ucs4 = strtoul (from + 1, &endp, 16);
975 	  if (endp - from != len1
976 	      || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
977 	      || newp->ucs4 >= 0x80000000)
978 	    /* This wasn't successful.  Signal this name cannot be a
979 	       correct UCS value.  */
980 	    newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
981 	}
982 
983       insert_entry (ht, from, len1, newp);
984       insert_entry (bt, newp->bytes, nbytes, newp);
985       /* Please note that it isn't a bug if a symbol is defined more
986 	 than once.  All later definitions are simply discarded.  */
987       return;
988     }
989 
990   /* We have a range: the names must have names with equal prefixes
991      and an equal number of digits, where the second number is greater
992      or equal than the first.  */
993   len2 = strlen (to);
994 
995   if (len1 != len2)
996     {
997     illegal_range:
998       lr_error (lr, _("invalid names for character range"));
999       return;
1000     }
1001 
1002   cp = &from[len1 - 1];
1003   if (decimal_ellipsis)
1004     while (isdigit (*cp) && cp >= from)
1005       --cp;
1006   else
1007     while (isxdigit (*cp) && cp >= from)
1008       {
1009 	if (!isdigit (*cp) && !isupper (*cp))
1010 	  lr_error (lr, _("\
1011 hexadecimal range format should use only capital characters"));
1012 	--cp;
1013       }
1014 
1015   prefix_len = (cp - from) + 1;
1016 
1017   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1018     goto illegal_range;
1019 
1020   errno = 0;
1021   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1022   if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1023       || ((to_nr = strtoul (&to[prefix_len], &to_end,
1024 			    decimal_ellipsis ? 10 : 16)) == UINT_MAX
1025 	  && errno == ERANGE)
1026       || *to_end != '\0')
1027     {
1028       lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1029       return;
1030     }
1031 
1032   if (from_nr > to_nr)
1033     {
1034       lr_error (lr, _("upper limit in range is smaller than lower limit"));
1035       return;
1036     }
1037 
1038   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1039     {
1040       char *name_end;
1041       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1042 		      prefix_len, from, len1 - prefix_len, cnt);
1043       obstack_1grow (ob, '\0');
1044       name_end = obstack_finish (ob);
1045 
1046       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1047       newp->nbytes = nbytes;
1048       memcpy (newp->bytes, bytes, nbytes);
1049       newp->name = name_end;
1050 
1051       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1052       if ((name_end[0] == 'U' || name_end[0] == 'P')
1053 	  && (len1 == 5 || len1 == 9))
1054 	{
1055 	  /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1056 	     xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1057 	     we use the value of xxxx or xxxxxxxx as the UCS4 value of
1058 	     this character and we don't have to consult the repertoire
1059 	     map.
1060 
1061 	     If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1062 	     and xxxxxxxx also give the code point in UCS4 but this must
1063 	     be in the private, i.e., unassigned, area.  This should be
1064 	     used for characters which do not (yet) have an equivalent
1065 	     in ISO 10646 and Unicode.  */
1066 	  char *endp;
1067 
1068 	  errno = 0;
1069 	  newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1070 	  if (endp - name_end != len1
1071 	      || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1072 	      || newp->ucs4 >= 0x80000000)
1073 	    /* This wasn't successful.  Signal this name cannot be a
1074 	       correct UCS value.  */
1075 	    newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1076 	}
1077 
1078       insert_entry (ht, name_end, len1, newp);
1079       insert_entry (bt, newp->bytes, nbytes, newp);
1080       /* Please note we don't examine the return value since it is no error
1081 	 if we have two definitions for a symbol.  */
1082 
1083       /* Increment the value in the byte sequence.  */
1084       if (++bytes[nbytes - 1] == '\0')
1085 	{
1086 	  int b = nbytes - 2;
1087 
1088 	  do
1089 	    if (b < 0)
1090 	      {
1091 		lr_error (lr,
1092 			  _("resulting bytes for range not representable."));
1093 		return;
1094 	      }
1095 	  while (++bytes[b--] == 0);
1096 	}
1097     }
1098 }
1099 
1100 
1101 struct charseq *
charmap_find_symbol(const struct charmap_t * cm,const char * bytes,size_t nbytes)1102 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1103 		     size_t nbytes)
1104 {
1105   void *result;
1106 
1107   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1108 	  < 0 ? NULL : (struct charseq *) result);
1109 }
1110