1 /* Copyright (C) 1996-2022 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published
6 by the Free Software Foundation; version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, see <https://www.gnu.org/licenses/>. */
16
17 #ifdef HAVE_CONFIG_H
18 # include <config.h>
19 #endif
20
21 #include <ctype.h>
22 #include <errno.h>
23 #include <libintl.h>
24 #include <limits.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <stdint.h>
29
30 #include "localedef.h"
31 #include "linereader.h"
32 #include "charmap.h"
33 #include "charmap-dir.h"
34
35 #include <assert.h>
36
37
38 /* Define the lookup function. */
39 #include "charmap-kw.h"
40
41
/* Prototypes for local functions.  */

/* Run the charmap state machine over CMFILE and return the table that
   was built; defined further down in this file.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
					int verbose, int be_quiet);

/* Record a WIDTH rule covering the characters named FROM .. TO (TO may
   be NULL for a single character).  */
static void new_width (struct linereader *cmfile, struct charmap_t *result,
		       const char *from, const char *to,
		       unsigned long int width);

/* Enter one character (TO == NULL) or a whole range of characters into
   the name and byte-sequence tables of CM.  */
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
			      size_t nbytes, unsigned char *bytes,
			      const char *from, const char *to,
			      int decimal_ellipsis, int step);


/* Set to true by charmap_read when the charmap fails the ASCII
   compatibility test for the basic character set; it is never reset
   in this file.  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
/* NOTE(review): not referenced anywhere else in this file; presumably
   consumed by an included header when NEED_NULL_POINTER is defined --
   confirm before removing.  */
static const char *null_pointer;
#endif
60
61 static struct linereader *
cmlr_open(const char * directory,const char * name,kw_hash_fct_t hf)62 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
63 {
64 FILE *fp;
65
66 fp = charmap_open (directory, name);
67 if (fp == NULL)
68 return NULL;
69 else
70 {
71 size_t dlen = strlen (directory);
72 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
73 size_t nlen = strlen (name);
74 char *pathname;
75 char *p;
76
77 pathname = alloca (dlen + add_slash + nlen + 1);
78 p = stpcpy (pathname, directory);
79 if (add_slash)
80 *p++ = '/';
81 stpcpy (p, name);
82
83 return lr_create (fp, pathname, hf);
84 }
85 }
86
87 struct charmap_t *
charmap_read(const char * filename,int verbose,int error_not_found,int be_quiet,int use_default)88 charmap_read (const char *filename, int verbose, int error_not_found,
89 int be_quiet, int use_default)
90 {
91 struct charmap_t *result = NULL;
92
93 if (filename != NULL)
94 {
95 struct linereader *cmfile;
96
97 /* First try the name as found in the parameter. */
98 cmfile = lr_open (filename, charmap_hash);
99 if (cmfile == NULL)
100 {
101 /* No successful. So start looking through the directories
102 in the I18NPATH if this is a simple name. */
103 if (strchr (filename, '/') == NULL)
104 {
105 char *i18npath = getenv ("I18NPATH");
106 if (i18npath != NULL && *i18npath != '\0')
107 {
108 const size_t pathlen = strlen (i18npath);
109 char i18npathbuf[pathlen + 1];
110 char path[pathlen + sizeof ("/charmaps")];
111 char *next;
112 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
113
114 while (cmfile == NULL
115 && (next = strsep (&i18npath, ":")) != NULL)
116 {
117 stpcpy (stpcpy (path, next), "/charmaps");
118 cmfile = cmlr_open (path, filename, charmap_hash);
119
120 if (cmfile == NULL)
121 /* Try without the "/charmaps" part. */
122 cmfile = cmlr_open (next, filename, charmap_hash);
123 }
124 }
125
126 if (cmfile == NULL)
127 /* Try the default directory. */
128 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
129 }
130 }
131
132 if (cmfile != NULL)
133 result = parse_charmap (cmfile, verbose, be_quiet);
134
135 if (result == NULL && error_not_found)
136 record_error (0, errno,
137 _("character map file `%s' not found"),
138 filename);
139 }
140
141 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
142 {
143 /* OK, one more try. We also accept the names given to the
144 character sets in the files. Sometimes they differ from the
145 file name. */
146 CHARMAP_DIR *dir;
147
148 dir = charmap_opendir (CHARMAP_PATH);
149 if (dir != NULL)
150 {
151 const char *dirent;
152
153 while ((dirent = charmap_readdir (dir)) != NULL)
154 {
155 char **aliases;
156 char **p;
157 int found;
158
159 aliases = charmap_aliases (CHARMAP_PATH, dirent);
160 found = 0;
161 for (p = aliases; *p; p++)
162 if (strcasecmp (*p, filename) == 0)
163 {
164 found = 1;
165 break;
166 }
167 charmap_free_aliases (aliases);
168
169 if (found)
170 {
171 struct linereader *cmfile;
172
173 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
174 if (cmfile != NULL)
175 result = parse_charmap (cmfile, verbose, be_quiet);
176
177 break;
178 }
179 }
180
181 charmap_closedir (dir);
182 }
183 }
184
185 if (result == NULL && DEFAULT_CHARMAP != NULL)
186 {
187 struct linereader *cmfile;
188
189 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
190 if (cmfile != NULL)
191 result = parse_charmap (cmfile, verbose, be_quiet);
192
193 if (result == NULL)
194 record_error (4, errno,
195 _("default character map file `%s' not found"),
196 DEFAULT_CHARMAP);
197 }
198
199 if (result != NULL && result->code_set_name == NULL)
200 /* The input file does not specify a code set name. This
201 shouldn't happen but we should cope with it. */
202 result->code_set_name = basename (filename);
203
204 /* Test of ASCII compatibility of locale encoding.
205
206 Verify that the encoding to be used in a locale is ASCII compatible,
207 at least for the graphic characters, excluding the control characters,
208 '$' and '@'. This constraint comes from an ISO C 99 restriction.
209
210 ISO C 99 section 7.17.(2) (about wchar_t):
211 the null character shall have the code value zero and each member of
212 the basic character set shall have a code value equal to its value
213 when used as the lone character in an integer character constant.
214 ISO C 99 section 5.2.1.(3):
215 Both the basic source and basic execution character sets shall have
216 the following members: the 26 uppercase letters of the Latin alphabet
217 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
218 the 26 lowercase letters of the Latin alphabet
219 a b c d e f g h i j k l m n o p q r s t u v w x y z
220 the 10 decimal digits
221 0 1 2 3 4 5 6 7 8 9
222 the following 29 graphic characters
223 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
224 the space character, and control characters representing horizontal
225 tab, vertical tab, and form feed.
226
227 Therefore, for all members of the "basic character set", the 'char' code
228 must have the same value as the 'wchar_t' code, which in glibc is the
229 same as the Unicode code, which for all of the enumerated characters
230 is identical to the ASCII code. */
231 if (result != NULL && use_default)
232 {
233 static const char basic_charset[] =
234 {
235 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
236 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
237 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
238 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
239 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
240 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
241 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
242 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
243 };
244 int failed = 0;
245 const char *p = basic_charset;
246
247 do
248 {
249 struct charseq *seq = charmap_find_symbol (result, p, 1);
250
251 if (seq == NULL || seq->ucs4 != (uint32_t) *p)
252 failed = 1;
253 }
254 while (*p++ != '\0');
255
256 if (failed)
257 {
258 /* A user may disable the ASCII compatibility warning check,
259 but we must remember that the encoding is not ASCII
260 compatible, since it may have other implications. Later
261 we will set _NL_CTYPE_MAP_TO_NONASCII from this value. */
262 if (warn_ascii)
263 record_warning (_(
264 "character map `%s' is not ASCII compatible, locale not ISO C compliant "
265 "[--no-warnings=ascii]"),
266 result->code_set_name);
267 enc_not_ascii_compatible = true;
268 }
269 }
270
271 return result;
272 }
273
274
275 static struct charmap_t *
parse_charmap(struct linereader * cmfile,int verbose,int be_quiet)276 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
277 {
278 struct charmap_t *result;
279 int state;
280 enum token_t expected_tok = tok_error;
281 const char *expected_str = NULL;
282 char *from_name = NULL;
283 char *to_name = NULL;
284 enum token_t ellipsis = 0;
285 int step = 1;
286
287 /* We don't want symbolic names in string to be translated. */
288 cmfile->translate_strings = 0;
289
290 /* Allocate room for result. */
291 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
292 memset (result, '\0', sizeof (struct charmap_t));
293 /* The default DEFAULT_WIDTH is 1. */
294 result->width_default = 1;
295
296 #define obstack_chunk_alloc malloc
297 #define obstack_chunk_free free
298 obstack_init (&result->mem_pool);
299
300 if (init_hash (&result->char_table, 256)
301 || init_hash (&result->byte_table, 256))
302 {
303 free (result);
304 return NULL;
305 }
306
307 /* We use a state machine to describe the charmap description file
308 format. */
309 state = 1;
310 while (1)
311 {
312 /* What's on? */
313 struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
314 enum token_t nowtok = now->tok;
315 struct token *arg;
316
317 if (nowtok == tok_eof)
318 break;
319
320 switch (state)
321 {
322 case 1:
323 /* The beginning. We expect the special declarations, EOL or
324 `CHARMAP'. */
325 if (nowtok == tok_eol)
326 /* Ignore empty lines. */
327 continue;
328
329 if (nowtok == tok_charmap)
330 {
331 from_name = NULL;
332 to_name = NULL;
333
334 /* We have to set up the real work. Fill in some
335 default values. */
336 if (result->mb_cur_max == 0)
337 result->mb_cur_max = 1;
338 if (result->mb_cur_min == 0)
339 result->mb_cur_min = result->mb_cur_max;
340 if (result->mb_cur_min > result->mb_cur_max)
341 {
342 record_error (0, 0, _("\
343 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
344 cmfile->fname);
345
346 result->mb_cur_min = result->mb_cur_max;
347 }
348
349 lr_ignore_rest (cmfile, 1);
350
351 state = 2;
352 continue;
353 }
354
355 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
356 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
357 && nowtok != tok_comment_char && nowtok != tok_g0esc
358 && nowtok != tok_g1esc && nowtok != tok_g2esc
359 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
360 && nowtok != tok_include)
361 {
362 lr_error (cmfile, _("syntax error in prolog: %s"),
363 _("invalid definition"));
364
365 lr_ignore_rest (cmfile, 0);
366 continue;
367 }
368
369 /* We know that we need an argument. */
370 arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
371
372 switch (nowtok)
373 {
374 case tok_code_set_name:
375 case tok_repertoiremap:
376 if (arg->tok != tok_ident && arg->tok != tok_string)
377 {
378 badarg:
379 lr_error (cmfile, _("syntax error in prolog: %s"),
380 _("bad argument"));
381
382 lr_ignore_rest (cmfile, 0);
383 continue;
384 }
385
386 if (nowtok == tok_code_set_name)
387 result->code_set_name = obstack_copy0 (&result->mem_pool,
388 arg->val.str.startmb,
389 arg->val.str.lenmb);
390 else
391 result->repertoiremap = obstack_copy0 (&result->mem_pool,
392 arg->val.str.startmb,
393 arg->val.str.lenmb);
394
395 lr_ignore_rest (cmfile, 1);
396 continue;
397
398 case tok_mb_cur_max:
399 case tok_mb_cur_min:
400 if (arg->tok != tok_number)
401 goto badarg;
402
403 if ((nowtok == tok_mb_cur_max
404 && result->mb_cur_max != 0)
405 || (nowtok == tok_mb_cur_max
406 && result->mb_cur_max != 0))
407 lr_error (cmfile, _("duplicate definition of <%s>"),
408 nowtok == tok_mb_cur_min
409 ? "mb_cur_min" : "mb_cur_max");
410
411 if (arg->val.num < 1)
412 {
413 lr_error (cmfile,
414 _("value for <%s> must be 1 or greater"),
415 nowtok == tok_mb_cur_min
416 ? "mb_cur_min" : "mb_cur_max");
417
418 lr_ignore_rest (cmfile, 0);
419 continue;
420 }
421 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
422 && (int) arg->val.num < result->mb_cur_min)
423 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
424 && (int) arg->val.num > result->mb_cur_max))
425 {
426 lr_error (cmfile, _("\
427 value of <%s> must be greater or equal than the value of <%s>"),
428 "mb_cur_max", "mb_cur_min");
429
430 lr_ignore_rest (cmfile, 0);
431 continue;
432 }
433
434 if (nowtok == tok_mb_cur_max)
435 result->mb_cur_max = arg->val.num;
436 else
437 result->mb_cur_min = arg->val.num;
438
439 lr_ignore_rest (cmfile, 1);
440 continue;
441
442 case tok_escape_char:
443 case tok_comment_char:
444 if (arg->tok != tok_ident)
445 goto badarg;
446
447 if (arg->val.str.lenmb != 1)
448 {
449 lr_error (cmfile, _("\
450 argument to <%s> must be a single character"),
451 nowtok == tok_escape_char ? "escape_char"
452 : "comment_char");
453
454 lr_ignore_rest (cmfile, 0);
455 continue;
456 }
457
458 if (nowtok == tok_escape_char)
459 cmfile->escape_char = *arg->val.str.startmb;
460 else
461 cmfile->comment_char = *arg->val.str.startmb;
462
463 lr_ignore_rest (cmfile, 1);
464 continue;
465
466 case tok_g0esc:
467 case tok_g1esc:
468 case tok_g2esc:
469 case tok_g3esc:
470 case tok_escseq:
471 lr_ignore_rest (cmfile, 0); /* XXX */
472 continue;
473
474 case tok_include:
475 lr_error (cmfile, _("\
476 character sets with locking states are not supported"));
477 exit (4);
478
479 default:
480 /* Cannot happen. */
481 assert (! "Should not happen");
482 }
483 break;
484
485 case 2:
486 /* We have seen `CHARMAP' and now are in the body. Each line
487 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
488 if (nowtok == tok_eol)
489 /* Ignore empty lines. */
490 continue;
491
492 if (nowtok == tok_end)
493 {
494 expected_tok = tok_charmap;
495 expected_str = "CHARMAP";
496 state = 90;
497 continue;
498 }
499
500 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
501 {
502 lr_error (cmfile, _("syntax error in %s definition: %s"),
503 "CHARMAP", _("no symbolic name given"));
504
505 lr_ignore_rest (cmfile, 0);
506 continue;
507 }
508
509 /* If the previous line was not completely correct free the
510 used memory. */
511 if (from_name != NULL)
512 obstack_free (&result->mem_pool, from_name);
513
514 if (nowtok == tok_bsymbol)
515 from_name = (char *) obstack_copy0 (&result->mem_pool,
516 now->val.str.startmb,
517 now->val.str.lenmb);
518 else
519 {
520 obstack_printf (&result->mem_pool, "U%08X",
521 cmfile->token.val.ucs4);
522 obstack_1grow (&result->mem_pool, '\0');
523 from_name = (char *) obstack_finish (&result->mem_pool);
524 }
525 to_name = NULL;
526
527 state = 3;
528 continue;
529
530 case 3:
531 /* We have two possibilities: We can see an ellipsis or an
532 encoding value. */
533 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
534 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
535 || nowtok == tok_ellipsis2_2)
536 {
537 ellipsis = nowtok;
538 if (nowtok == tok_ellipsis4_2)
539 {
540 step = 2;
541 nowtok = tok_ellipsis4;
542 }
543 else if (nowtok == tok_ellipsis2_2)
544 {
545 step = 2;
546 nowtok = tok_ellipsis2;
547 }
548 state = 4;
549 continue;
550 }
551 /* FALLTHROUGH */
552
553 case 5:
554 if (nowtok != tok_charcode)
555 {
556 lr_error (cmfile, _("syntax error in %s definition: %s"),
557 "CHARMAP", _("invalid encoding given"));
558
559 lr_ignore_rest (cmfile, 0);
560
561 state = 2;
562 continue;
563 }
564
565 if (now->val.charcode.nbytes < result->mb_cur_min)
566 lr_error (cmfile, _("too few bytes in character encoding"));
567 else if (now->val.charcode.nbytes > result->mb_cur_max)
568 lr_error (cmfile, _("too many bytes in character encoding"));
569 else
570 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
571 now->val.charcode.bytes, from_name, to_name,
572 ellipsis != tok_ellipsis2, step);
573
574 /* Ignore trailing comment silently. */
575 lr_ignore_rest (cmfile, 0);
576
577 from_name = NULL;
578 to_name = NULL;
579 ellipsis = tok_none;
580 step = 1;
581
582 state = 2;
583 continue;
584
585 case 4:
586 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
587 {
588 lr_error (cmfile, _("syntax error in %s definition: %s"),
589 "CHARMAP",
590 _("no symbolic name given for end of range"));
591
592 lr_ignore_rest (cmfile, 0);
593 continue;
594 }
595
596 /* Copy the to-name in a safe place. */
597 if (nowtok == tok_bsymbol)
598 to_name = (char *) obstack_copy0 (&result->mem_pool,
599 cmfile->token.val.str.startmb,
600 cmfile->token.val.str.lenmb);
601 else
602 {
603 obstack_printf (&result->mem_pool, "U%08X",
604 cmfile->token.val.ucs4);
605 obstack_1grow (&result->mem_pool, '\0');
606 to_name = (char *) obstack_finish (&result->mem_pool);
607 }
608
609 state = 5;
610 continue;
611
612 case 90:
613 if (nowtok != expected_tok)
614 lr_error (cmfile, _("\
615 %1$s: definition does not end with `END %1$s'"), expected_str);
616
617 lr_ignore_rest (cmfile, nowtok == expected_tok);
618 state = 91;
619 continue;
620
621 case 91:
622 /* Waiting for WIDTH... */
623 if (nowtok == tok_eol)
624 /* Ignore empty lines. */
625 continue;
626
627 if (nowtok == tok_width_default)
628 {
629 state = 92;
630 continue;
631 }
632
633 if (nowtok == tok_width)
634 {
635 lr_ignore_rest (cmfile, 1);
636 state = 93;
637 continue;
638 }
639
640 if (nowtok == tok_width_variable)
641 {
642 lr_ignore_rest (cmfile, 1);
643 state = 98;
644 continue;
645 }
646
647 lr_error (cmfile, _("\
648 only WIDTH definitions are allowed to follow the CHARMAP definition"));
649
650 lr_ignore_rest (cmfile, 0);
651 continue;
652
653 case 92:
654 if (nowtok != tok_number)
655 lr_error (cmfile, _("value for %s must be an integer"),
656 "WIDTH_DEFAULT");
657 else
658 result->width_default = now->val.num;
659
660 lr_ignore_rest (cmfile, nowtok == tok_number);
661
662 state = 91;
663 continue;
664
665 case 93:
666 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
667 "%s...%s %d\n". */
668 if (nowtok == tok_eol)
669 /* ignore empty lines. */
670 continue;
671
672 if (nowtok == tok_end)
673 {
674 expected_tok = tok_width;
675 expected_str = "WIDTH";
676 state = 90;
677 continue;
678 }
679
680 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
681 {
682 lr_error (cmfile, _("syntax error in %s definition: %s"),
683 "WIDTH", _("no symbolic name given"));
684
685 lr_ignore_rest (cmfile, 0);
686 continue;
687 }
688
689 if (from_name != NULL)
690 obstack_free (&result->mem_pool, from_name);
691
692 if (nowtok == tok_bsymbol)
693 from_name = (char *) obstack_copy0 (&result->mem_pool,
694 now->val.str.startmb,
695 now->val.str.lenmb);
696 else
697 {
698 obstack_printf (&result->mem_pool, "U%08X",
699 cmfile->token.val.ucs4);
700 obstack_1grow (&result->mem_pool, '\0');
701 from_name = (char *) obstack_finish (&result->mem_pool);
702 }
703
704 to_name = NULL;
705
706 state = 94;
707 continue;
708
709 case 94:
710 if (nowtok == tok_ellipsis3)
711 {
712 state = 95;
713 continue;
714 }
715 /* Fall through. */
716
717 case 96:
718 if (nowtok != tok_number)
719 lr_error (cmfile, _("value for %s must be an integer"),
720 "WIDTH");
721 else
722 {
723 /* Store width for chars. */
724 new_width (cmfile, result, from_name, to_name, now->val.num);
725
726 from_name = NULL;
727 to_name = NULL;
728 }
729
730 lr_ignore_rest (cmfile, nowtok == tok_number);
731
732 state = 93;
733 continue;
734
735 case 95:
736 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
737 {
738 lr_error (cmfile, _("syntax error in %s definition: %s"),
739 "WIDTH", _("no symbolic name given for end of range"));
740
741 lr_ignore_rest (cmfile, 0);
742
743 state = 93;
744 continue;
745 }
746
747 if (nowtok == tok_bsymbol)
748 to_name = (char *) obstack_copy0 (&result->mem_pool,
749 now->val.str.startmb,
750 now->val.str.lenmb);
751 else
752 {
753 obstack_printf (&result->mem_pool, "U%08X",
754 cmfile->token.val.ucs4);
755 obstack_1grow (&result->mem_pool, '\0');
756 to_name = (char *) obstack_finish (&result->mem_pool);
757 }
758
759 state = 96;
760 continue;
761
762 case 98:
763 /* We now expect `END WIDTH_VARIABLE' or lines of the format
764 "%s\n" or "%s...%s\n". */
765 if (nowtok == tok_eol)
766 /* ignore empty lines. */
767 continue;
768
769 if (nowtok == tok_end)
770 {
771 expected_tok = tok_width_variable;
772 expected_str = "WIDTH_VARIABLE";
773 state = 90;
774 continue;
775 }
776
777 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
778 {
779 lr_error (cmfile, _("syntax error in %s definition: %s"),
780 "WIDTH_VARIABLE", _("no symbolic name given"));
781
782 lr_ignore_rest (cmfile, 0);
783
784 continue;
785 }
786
787 if (from_name != NULL)
788 obstack_free (&result->mem_pool, from_name);
789
790 if (nowtok == tok_bsymbol)
791 from_name = (char *) obstack_copy0 (&result->mem_pool,
792 now->val.str.startmb,
793 now->val.str.lenmb);
794 else
795 {
796 obstack_printf (&result->mem_pool, "U%08X",
797 cmfile->token.val.ucs4);
798 obstack_1grow (&result->mem_pool, '\0');
799 from_name = (char *) obstack_finish (&result->mem_pool);
800 }
801 to_name = NULL;
802
803 state = 99;
804 continue;
805
806 case 99:
807 if (nowtok == tok_ellipsis3)
808 state = 100;
809
810 /* Store info. */
811 from_name = NULL;
812
813 /* Warn */
814 state = 98;
815 continue;
816
817 case 100:
818 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
819 {
820 lr_error (cmfile, _("syntax error in %s definition: %s"),
821 "WIDTH_VARIABLE",
822 _("no symbolic name given for end of range"));
823 lr_ignore_rest (cmfile, 0);
824 continue;
825 }
826
827 if (nowtok == tok_bsymbol)
828 to_name = (char *) obstack_copy0 (&result->mem_pool,
829 now->val.str.startmb,
830 now->val.str.lenmb);
831 else
832 {
833 obstack_printf (&result->mem_pool, "U%08X",
834 cmfile->token.val.ucs4);
835 obstack_1grow (&result->mem_pool, '\0');
836 to_name = (char *) obstack_finish (&result->mem_pool);
837 }
838
839 /* XXX Enter value into table. */
840
841 lr_ignore_rest (cmfile, 1);
842
843 state = 98;
844 continue;
845
846 default:
847 record_error (5, 0, _("%s: error in state machine"),
848 __FILE__);
849 /* NOTREACHED */
850 }
851 break;
852 }
853
854 if (state != 91)
855 record_error (0, 0, _("%s: premature end of file"),
856 cmfile->fname);
857
858 lr_close (cmfile);
859
860 return result;
861 }
862
863
864 static void
new_width(struct linereader * cmfile,struct charmap_t * result,const char * from,const char * to,unsigned long int width)865 new_width (struct linereader *cmfile, struct charmap_t *result,
866 const char *from, const char *to, unsigned long int width)
867 {
868 struct charseq *from_val;
869 struct charseq *to_val;
870
871 from_val = charmap_find_value (result, from, strlen (from));
872 if (from_val == NULL)
873 {
874 lr_error (cmfile, _("unknown character `%s'"), from);
875 return;
876 }
877
878 if (to == NULL)
879 to_val = from_val;
880 else
881 {
882 to_val = charmap_find_value (result, to, strlen (to));
883 if (to_val == NULL)
884 {
885 lr_error (cmfile, _("unknown character `%s'"), to);
886 return;
887 }
888
889 /* Make sure the number of bytes for the end points of the range
890 is correct. */
891 if (from_val->nbytes != to_val->nbytes)
892 {
893 lr_error (cmfile, _("\
894 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
895 from_val->nbytes, to_val->nbytes);
896 return;
897 }
898 }
899
900 if (result->nwidth_rules >= result->nwidth_rules_max)
901 {
902 size_t new_size = result->nwidth_rules + 32;
903 struct width_rule *new_rules =
904 (struct width_rule *) obstack_alloc (&result->mem_pool,
905 (new_size
906 * sizeof (struct width_rule)));
907
908 memcpy (new_rules, result->width_rules,
909 result->nwidth_rules_max * sizeof (struct width_rule));
910
911 result->width_rules = new_rules;
912 result->nwidth_rules_max = new_size;
913 }
914
915 result->width_rules[result->nwidth_rules].from = from_val;
916 result->width_rules[result->nwidth_rules].to = to_val;
917 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
918 ++result->nwidth_rules;
919 }
920
921
922 struct charseq *
charmap_find_value(const struct charmap_t * cm,const char * name,size_t len)923 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
924 {
925 void *result;
926
927 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
928 < 0 ? NULL : (struct charseq *) result);
929 }
930
931
932 static void
charmap_new_char(struct linereader * lr,struct charmap_t * cm,size_t nbytes,unsigned char * bytes,const char * from,const char * to,int decimal_ellipsis,int step)933 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
934 size_t nbytes, unsigned char *bytes,
935 const char *from, const char *to,
936 int decimal_ellipsis, int step)
937 {
938 hash_table *ht = &cm->char_table;
939 hash_table *bt = &cm->byte_table;
940 struct obstack *ob = &cm->mem_pool;
941 char *from_end;
942 char *to_end;
943 const char *cp;
944 int prefix_len, len1, len2;
945 unsigned int from_nr, to_nr, cnt;
946 struct charseq *newp;
947
948 len1 = strlen (from);
949
950 if (to == NULL)
951 {
952 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
953 newp->nbytes = nbytes;
954 memcpy (newp->bytes, bytes, nbytes);
955 newp->name = from;
956
957 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
958 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
959 {
960 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
961 xxxx and xxxxxxxx are hexadecimal numbers. In this case
962 we use the value of xxxx or xxxxxxxx as the UCS4 value of
963 this character and we don't have to consult the repertoire
964 map.
965
966 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
967 and xxxxxxxx also give the code point in UCS4 but this must
968 be in the private, i.e., unassigned, area. This should be
969 used for characters which do not (yet) have an equivalent
970 in ISO 10646 and Unicode. */
971 char *endp;
972
973 errno = 0;
974 newp->ucs4 = strtoul (from + 1, &endp, 16);
975 if (endp - from != len1
976 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
977 || newp->ucs4 >= 0x80000000)
978 /* This wasn't successful. Signal this name cannot be a
979 correct UCS value. */
980 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
981 }
982
983 insert_entry (ht, from, len1, newp);
984 insert_entry (bt, newp->bytes, nbytes, newp);
985 /* Please note that it isn't a bug if a symbol is defined more
986 than once. All later definitions are simply discarded. */
987 return;
988 }
989
990 /* We have a range: the names must have names with equal prefixes
991 and an equal number of digits, where the second number is greater
992 or equal than the first. */
993 len2 = strlen (to);
994
995 if (len1 != len2)
996 {
997 illegal_range:
998 lr_error (lr, _("invalid names for character range"));
999 return;
1000 }
1001
1002 cp = &from[len1 - 1];
1003 if (decimal_ellipsis)
1004 while (isdigit (*cp) && cp >= from)
1005 --cp;
1006 else
1007 while (isxdigit (*cp) && cp >= from)
1008 {
1009 if (!isdigit (*cp) && !isupper (*cp))
1010 lr_error (lr, _("\
1011 hexadecimal range format should use only capital characters"));
1012 --cp;
1013 }
1014
1015 prefix_len = (cp - from) + 1;
1016
1017 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1018 goto illegal_range;
1019
1020 errno = 0;
1021 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1022 if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1023 || ((to_nr = strtoul (&to[prefix_len], &to_end,
1024 decimal_ellipsis ? 10 : 16)) == UINT_MAX
1025 && errno == ERANGE)
1026 || *to_end != '\0')
1027 {
1028 lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1029 return;
1030 }
1031
1032 if (from_nr > to_nr)
1033 {
1034 lr_error (lr, _("upper limit in range is smaller than lower limit"));
1035 return;
1036 }
1037
1038 for (cnt = from_nr; cnt <= to_nr; cnt += step)
1039 {
1040 char *name_end;
1041 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1042 prefix_len, from, len1 - prefix_len, cnt);
1043 obstack_1grow (ob, '\0');
1044 name_end = obstack_finish (ob);
1045
1046 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1047 newp->nbytes = nbytes;
1048 memcpy (newp->bytes, bytes, nbytes);
1049 newp->name = name_end;
1050
1051 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1052 if ((name_end[0] == 'U' || name_end[0] == 'P')
1053 && (len1 == 5 || len1 == 9))
1054 {
1055 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1056 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1057 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1058 this character and we don't have to consult the repertoire
1059 map.
1060
1061 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1062 and xxxxxxxx also give the code point in UCS4 but this must
1063 be in the private, i.e., unassigned, area. This should be
1064 used for characters which do not (yet) have an equivalent
1065 in ISO 10646 and Unicode. */
1066 char *endp;
1067
1068 errno = 0;
1069 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1070 if (endp - name_end != len1
1071 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1072 || newp->ucs4 >= 0x80000000)
1073 /* This wasn't successful. Signal this name cannot be a
1074 correct UCS value. */
1075 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1076 }
1077
1078 insert_entry (ht, name_end, len1, newp);
1079 insert_entry (bt, newp->bytes, nbytes, newp);
1080 /* Please note we don't examine the return value since it is no error
1081 if we have two definitions for a symbol. */
1082
1083 /* Increment the value in the byte sequence. */
1084 if (++bytes[nbytes - 1] == '\0')
1085 {
1086 int b = nbytes - 2;
1087
1088 do
1089 if (b < 0)
1090 {
1091 lr_error (lr,
1092 _("resulting bytes for range not representable."));
1093 return;
1094 }
1095 while (++bytes[b--] == 0);
1096 }
1097 }
1098 }
1099
1100
1101 struct charseq *
charmap_find_symbol(const struct charmap_t * cm,const char * bytes,size_t nbytes)1102 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1103 size_t nbytes)
1104 {
1105 void *result;
1106
1107 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1108 < 0 ? NULL : (struct charseq *) result);
1109 }
1110