1 /* Copyright (C) 1996-2022 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published
6 by the Free Software Foundation; version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, see <https://www.gnu.org/licenses/>. */
16
17 #ifdef HAVE_CONFIG_H
18 # include <config.h>
19 #endif
20
21 #include <assert.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <libintl.h>
25 #include <stdarg.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <stdint.h>
29
30 #include "localedef.h"
31 #include "charmap.h"
32 #include "error.h"
33 #include "linereader.h"
34 #include "locfile.h"
35
36 /* Prototypes for local functions. */
37 static struct token *get_toplvl_escape (struct linereader *lr);
38 static struct token *get_symname (struct linereader *lr);
39 static struct token *get_ident (struct linereader *lr);
40 static struct token *get_string (struct linereader *lr,
41 const struct charmap_t *charmap,
42 struct localedef_t *locale,
43 const struct repertoire_t *repertoire,
44 int verbose);
45 static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch);
46
47
48 struct linereader *
lr_open(const char * fname,kw_hash_fct_t hf)49 lr_open (const char *fname, kw_hash_fct_t hf)
50 {
51 FILE *fp;
52
53 if (fname == NULL || strcmp (fname, "-") == 0
54 || strcmp (fname, "/dev/stdin") == 0)
55 return lr_create (stdin, "<stdin>", hf);
56 else
57 {
58 fp = fopen (fname, "rm");
59 if (fp == NULL)
60 return NULL;
61 return lr_create (fp, fname, hf);
62 }
63 }
64
65 struct linereader *
lr_create(FILE * fp,const char * fname,kw_hash_fct_t hf)66 lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
67 {
68 struct linereader *result;
69 int n;
70
71 result = (struct linereader *) xmalloc (sizeof (*result));
72
73 result->fp = fp;
74 result->fname = xstrdup (fname);
75 result->buf = NULL;
76 result->bufsize = 0;
77 result->lineno = 1;
78 result->idx = 0;
79 result->comment_char = '#';
80 result->escape_char = '\\';
81 result->translate_strings = 1;
82 result->return_widestr = 0;
83
84 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
85 if (n < 0)
86 {
87 int save = errno;
88 fclose (result->fp);
89 free ((char *) result->fname);
90 free (result);
91 errno = save;
92 return NULL;
93 }
94
95 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
96 n -= 2;
97
98 result->buf[n] = '\0';
99 result->bufact = n;
100 result->hash_fct = hf;
101
102 return result;
103 }
104
105
106 int
lr_eof(struct linereader * lr)107 lr_eof (struct linereader *lr)
108 {
109 return lr->bufact = 0;
110 }
111
112
113 void
lr_ignore_rest(struct linereader * lr,int verbose)114 lr_ignore_rest (struct linereader *lr, int verbose)
115 {
116 if (verbose)
117 {
118 while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
119 && lr->buf[lr->idx] != lr->comment_char)
120 if (lr->buf[lr->idx] == '\0')
121 {
122 if (lr_next (lr) < 0)
123 return;
124 }
125 else
126 ++lr->idx;
127
128 if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp)
129 && lr->buf[lr->idx] != lr->comment_char)
130 lr_error (lr, _("trailing garbage at end of line"));
131 }
132
133 /* Ignore continued line. */
134 while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
135 if (lr_next (lr) < 0)
136 break;
137
138 lr->idx = lr->bufact;
139 }
140
141
142 void
lr_close(struct linereader * lr)143 lr_close (struct linereader *lr)
144 {
145 fclose (lr->fp);
146 free (lr->buf);
147 free (lr);
148 }
149
150
151 int
lr_next(struct linereader * lr)152 lr_next (struct linereader *lr)
153 {
154 int n;
155
156 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
157 if (n < 0)
158 return -1;
159
160 ++lr->lineno;
161
162 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
163 {
164 #if 0
165 /* XXX Is this correct? */
166 /* An escaped newline character is substituted with a single <SP>. */
167 --n;
168 lr->buf[n - 1] = ' ';
169 #else
170 n -= 2;
171 #endif
172 }
173
174 lr->buf[n] = '\0';
175 lr->bufact = n;
176 lr->idx = 0;
177
178 return 0;
179 }
180
181
182 /* Defined in error.c. */
183 /* This variable is incremented each time `error' is called. */
184 extern unsigned int error_message_count;
185
186 /* The calling program should define program_name and set it to the
187 name of the executing program. */
188 extern char *program_name;
189
190
/* Read the next token from LR and return a pointer to LR->token, which
   is overwritten by every call.  CHARMAP, LOCALE, REPERTOIRE and
   VERBOSE are not used here directly; they are handed on to get_string
   when a quoted string is found.  */
struct token *
lr_token (struct linereader *lr, const struct charmap_t *charmap,
          struct localedef_t *locale, const struct repertoire_t *repertoire,
          int verbose)
{
  int ch;

  /* Skip whitespace and comments until the first character of a token
     has been read into CH.  End of input and end of line are tokens of
     their own and are returned from inside the loop.  */
  while (1)
    {
      do
        {
          ch = lr_getc (lr);

          if (ch == EOF)
            {
              lr->token.tok = tok_eof;
              return &lr->token;
            };

          if (ch == '\n')
            {
              lr->token.tok = tok_eol;
              return &lr->token;
            }
        }
      while (isspace (ch));

      if (ch != lr->comment_char)
        break;

      /* Is there a newline at the end of the buffer?  */
      if (lr->buf[lr->bufact - 1] != '\n')
        {
          /* No.  Some people want this to mean that only the line in
             the file not the logical, concatenated line is ignored.
             Let's try this.  */
          lr->idx = lr->bufact;
          continue;
        }

      /* Ignore rest of line.  */
      lr_ignore_rest (lr, 0);
      lr->token.tok = tok_eol;
      return &lr->token;
    }

  /* Match escape sequences.  */
  if (ch == lr->escape_char)
    return get_toplvl_escape (lr);

  /* Match ellipsis.  The first '.' is already consumed, so the
     comparisons look at what follows it in the buffer.  Longer forms
     are tested before their prefixes.  If nothing follows the single
     '.', control falls through to the switch below.  */
  if (ch == '.')
    {
      if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
        {
          int cnt;
          for (cnt = 0; cnt < 10; ++cnt)
            lr_getc (lr);
          lr->token.tok = tok_ellipsis4_2;
          return &lr->token;
        }
      if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
        {
          lr_getc (lr);
          lr_getc (lr);
          lr_getc (lr);
          lr->token.tok = tok_ellipsis4;
          return &lr->token;
        }
      if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
        {
          lr_getc (lr);
          lr_getc (lr);
          lr->token.tok = tok_ellipsis3;
          return &lr->token;
        }
      if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
        {
          int cnt;
          for (cnt = 0; cnt < 6; ++cnt)
            lr_getc (lr);
          lr->token.tok = tok_ellipsis2_2;
          return &lr->token;
        }
      if (lr->buf[lr->idx] == '.')
        {
          lr_getc (lr);
          lr->token.tok = tok_ellipsis2;
          return &lr->token;
        }
    }

  /* Dispatch on the first character.  Every case either returns a
     token or breaks out; anything not returned here is read as an
     identifier at the end of the function.  */
  switch (ch)
    {
    case '<':
      return get_symname (lr);

    case '0' ... '9':
      /* Unsigned decimal number, accumulated digit by digit.  */
      lr->token.tok = tok_number;
      lr->token.val.num = ch - '0';

      while (isdigit (ch = lr_getc (lr)))
        {
          lr->token.val.num *= 10;
          lr->token.val.num += ch - '0';
        }
      if (isalpha (ch))
        lr_error (lr, _("garbage at end of number"));
      /* Give back the character that ended the number.  */
      lr_ungetn (lr, 1);

      return &lr->token;

    case ';':
      lr->token.tok = tok_semicolon;
      return &lr->token;

    case ',':
      lr->token.tok = tok_comma;
      return &lr->token;

    case '(':
      lr->token.tok = tok_open_brace;
      return &lr->token;

    case ')':
      lr->token.tok = tok_close_brace;
      return &lr->token;

    case '"':
      return get_string (lr, charmap, locale, repertoire, verbose);

    case '-':
      /* "-1" is a token of its own; any other text starting with '-'
         is handed to get_ident.  */
      ch = lr_getc (lr);
      if (ch == '1')
        {
          lr->token.tok = tok_minus1;
          return &lr->token;
        }
      /* NOTE(review): this presumably rewinds past both the character
         just read and the '-', while get_ident starts its name with
         lr->buf[lr->idx - 1] -- confirm that this is the intended
         start position.  */
      lr_ungetn (lr, 2);
      break;

    case 0x80 ... 0xff:         /* UTF-8 sequence.  */
      {
        uint32_t wch;
        if (!utf8_decode (lr, ch, &wch))
          {
            lr->token.tok = tok_error;
            return &lr->token;
          }
        lr->token.tok = tok_ucs4;
        lr->token.val.ucs4 = wch;
        return &lr->token;
      }
    }

  return get_ident (lr);
}
348
349
/* Parse a character code specification outside of a string: one or
   more numeric byte values, each introduced by the escape character
   (the first one has already been consumed).  A byte is written in
   octal by default, in decimal after a 'd' prefix and in hexadecimal
   after an 'x' prefix.  At least two digits are required; for octal
   and decimal an optional third digit is accepted.  On success the
   token is tok_charcode with the bytes and their count stored in
   val.charcode.  On malformed input the token is tok_error and val.str
   describes the offending text inside the line buffer.  */
static struct token *
get_toplvl_escape (struct linereader *lr)
{
  /* This is supposed to be a numeric value.  We return the
     numerical value and the number of bytes.  */
  size_t start_idx = lr->idx - 1;
  unsigned char *bytes = lr->token.val.charcode.bytes;
  size_t nbytes = 0;
  int ch;

  do
    {
      unsigned int byte = 0;
      unsigned int base = 8;

      ch = lr_getc (lr);

      /* An optional prefix selects the base.  */
      if (ch == 'd')
        {
          base = 10;
          ch = lr_getc (lr);
        }
      else if (ch == 'x')
        {
          base = 16;
          ch = lr_getc (lr);
        }

      /* The first digit is mandatory and must be valid for the base.  */
      if ((base == 16 && !isxdigit (ch))
          || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
        {
        esc_error:
          /* Also reached by goto from the second-digit check below.
             Skip to the next whitespace and report the text from the
             escape character up to the current position.  */
          lr->token.val.str.startmb = &lr->buf[start_idx];

          while (ch != EOF && !isspace (ch))
            ch = lr_getc (lr);
          lr->token.val.str.lenmb = lr->idx - start_idx;

          lr->token.tok = tok_error;
          return &lr->token;
        }

      if (isdigit (ch))
        byte = ch - '0';
      else
        byte = tolower (ch) - 'a' + 10;

      /* The second digit is mandatory as well.  */
      ch = lr_getc (lr);
      if ((base == 16 && !isxdigit (ch))
          || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
        goto esc_error;

      byte *= base;
      if (isdigit (ch))
        byte += ch - '0';
      else
        byte += tolower (ch) - 'a' + 10;

      /* Optional third digit for octal and decimal values.
         NOTE(review): only isdigit is tested here, so '8' and '9' are
         accepted as a third octal digit -- confirm that this is
         intended.  */
      ch = lr_getc (lr);
      if (base != 16 && isdigit (ch))
        {
          byte *= base;
          byte += ch - '0';

          ch = lr_getc (lr);
        }

      bytes[nbytes++] = byte;
    }
  /* Another escape character directly after the value starts the next
     byte, as long as the byte array still has room.  */
  while (ch == lr->escape_char
         && nbytes < (int) sizeof (lr->token.val.charcode.bytes));

  if (!isspace (ch))
    lr_error (lr, _("garbage at end of character code specification"));

  /* Give back the character that ended the specification.  */
  lr_ungetn (lr, 1);

  lr->token.tok = tok_charcode;
  lr->token.val.charcode.nbytes = nbytes;

  return &lr->token;
}
432
/* Multibyte string buffer: a growable byte array used while a token is
   being collected.  */
struct lr_buffer
{
  size_t act;   /* Number of bytes currently stored in BUF.  */
  size_t max;   /* Number of bytes allocated for BUF.  */
  char *buf;    /* Heap storage, resized with xrealloc as needed.  */
};
440
441 /* Initialize *LRB with a default-sized buffer. */
442 static void
lr_buffer_init(struct lr_buffer * lrb)443 lr_buffer_init (struct lr_buffer *lrb)
444 {
445 lrb->act = 0;
446 lrb->max = 56;
447 lrb->buf = xmalloc (lrb->max);
448 }
449
450 /* Transfers the buffer string from *LRB to LR->token.mbstr. */
451 static void
lr_buffer_to_token(struct lr_buffer * lrb,struct linereader * lr)452 lr_buffer_to_token (struct lr_buffer *lrb, struct linereader *lr)
453 {
454 lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + 1);
455 lr->token.val.str.startmb[lrb->act] = '\0';
456 lr->token.val.str.lenmb = lrb->act;
457 }
458
459 /* Adds CH to *LRB. */
460 static void
addc(struct lr_buffer * lrb,char ch)461 addc (struct lr_buffer *lrb, char ch)
462 {
463 if (lrb->act == lrb->max)
464 {
465 lrb->max *= 2;
466 lrb->buf = xrealloc (lrb->buf, lrb->max);
467 }
468 lrb->buf[lrb->act++] = ch;
469 }
470
471 /* Adds L bytes at S to *LRB. */
472 static void
adds(struct lr_buffer * lrb,const unsigned char * s,size_t l)473 adds (struct lr_buffer *lrb, const unsigned char *s, size_t l)
474 {
475 if (lrb->max - lrb->act < l)
476 {
477 size_t required_size = lrb->act + l;
478 size_t new_max = 2 * lrb->max;
479 if (new_max < required_size)
480 new_max = required_size;
481 lrb->buf = xrealloc (lrb->buf, new_max);
482 lrb->max = new_max;
483 }
484 memcpy (lrb->buf + lrb->act, s, l);
485 lrb->act += l;
486 }
487
/* Append the wide character CH to the array buf2.  The macro relies on
   the variables buf2, buf2act (number of elements stored) and buf2max
   (capacity) being visible where it is expanded.  When buf2act has
   reached buf2max the capacity is doubled and buf2max * 4 bytes are
   requested, i.e. buf2max is treated as a count of 4-byte elements,
   not as a byte count.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2act == buf2max)                                                 \
        {                                                                     \
          buf2max *= 2;                                                       \
          buf2 = xrealloc (buf2, buf2max * 4);                                \
        }                                                                     \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
499
500
501 static struct token *
get_symname(struct linereader * lr)502 get_symname (struct linereader *lr)
503 {
504 /* Symbol in brackets. We must distinguish three kinds:
505 1. reserved words
506 2. ISO 10646 position values
507 3. all other. */
508 const struct keyword_t *kw;
509 int ch;
510 struct lr_buffer lrb;
511
512 lr_buffer_init (&lrb);
513
514 do
515 {
516 ch = lr_getc (lr);
517 if (ch == lr->escape_char)
518 {
519 int c2 = lr_getc (lr);
520 addc (&lrb, c2);
521
522 if (c2 == '\n')
523 ch = '\n';
524 }
525 else
526 addc (&lrb, ch);
527 }
528 while (ch != '>' && ch != '\n');
529
530 if (ch == '\n')
531 lr_error (lr, _("unterminated symbolic name"));
532
533 /* Test for ISO 10646 position value. */
534 if (lrb.buf[0] == 'U' && (lrb.act == 6 || lrb.act == 10))
535 {
536 char *cp = lrb.buf + 1;
537 while (cp < &lrb.buf[lrb.act - 1] && isxdigit (*cp))
538 ++cp;
539
540 if (cp == &lrb.buf[lrb.act - 1])
541 {
542 /* Yes, it is. */
543 lr->token.tok = tok_ucs4;
544 lr->token.val.ucs4 = strtoul (lrb.buf + 1, NULL, 16);
545
546 return &lr->token;
547 }
548 }
549
550 /* It is a symbolic name. Test for reserved words. */
551 kw = lr->hash_fct (lrb.buf, lrb.act - 1);
552
553 if (kw != NULL && kw->symname_or_ident == 1)
554 {
555 lr->token.tok = kw->token;
556 free (lrb.buf);
557 }
558 else
559 {
560 lr->token.tok = tok_bsymbol;
561 lr_buffer_to_token (&lrb, lr);
562 --lr->token.val.str.lenmb; /* Hide the training '>'. */
563 }
564
565 return &lr->token;
566 }
567
568
569 static struct token *
get_ident(struct linereader * lr)570 get_ident (struct linereader *lr)
571 {
572 const struct keyword_t *kw;
573 int ch;
574 struct lr_buffer lrb;
575
576 lr_buffer_init (&lrb);
577
578 addc (&lrb, lr->buf[lr->idx - 1]);
579
580 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
581 && ch != '<' && ch != ',' && ch != EOF)
582 {
583 if (ch == lr->escape_char)
584 {
585 ch = lr_getc (lr);
586 if (ch == '\n' || ch == EOF)
587 {
588 lr_error (lr, _("invalid escape sequence"));
589 break;
590 }
591 }
592 addc (&lrb, ch);
593 }
594
595 lr_ungetc (lr, ch);
596
597 kw = lr->hash_fct (lrb.buf, lrb.act);
598
599 if (kw != NULL && kw->symname_or_ident == 0)
600 {
601 lr->token.tok = kw->token;
602 free (lrb.buf);
603 }
604 else
605 {
606 lr->token.tok = tok_ident;
607 lr_buffer_to_token (&lrb, lr);
608 }
609
610 return &lr->token;
611 }
612
613 /* Process a decoded Unicode codepoint WCH in a string, placing the
614 multibyte sequence into LRB. Return false if the character is not
615 found in CHARMAP/REPERTOIRE. */
616 static bool
translate_unicode_codepoint(struct localedef_t * locale,const struct charmap_t * charmap,const struct repertoire_t * repertoire,uint32_t wch,struct lr_buffer * lrb)617 translate_unicode_codepoint (struct localedef_t *locale,
618 const struct charmap_t *charmap,
619 const struct repertoire_t *repertoire,
620 uint32_t wch, struct lr_buffer *lrb)
621 {
622 /* See whether the charmap contains the Uxxxxxxxx names. */
623 char utmp[10];
624 snprintf (utmp, sizeof (utmp), "U%08X", wch);
625 struct charseq *seq = charmap_find_value (charmap, utmp, 9);
626
627 if (seq == NULL)
628 {
629 /* No, this isn't the case. Now determine from
630 the repertoire the name of the character and
631 find it in the charmap. */
632 if (repertoire != NULL)
633 {
634 const char *symbol = repertoire_find_symbol (repertoire, wch);
635 if (symbol != NULL)
636 seq = charmap_find_value (charmap, symbol, strlen (symbol));
637 }
638
639 if (seq == NULL)
640 {
641 #ifndef NO_TRANSLITERATION
642 /* Transliterate if possible. */
643 if (locale != NULL)
644 {
645 if ((locale->avail & CTYPE_LOCALE) == 0)
646 {
647 /* Load the CTYPE data now. */
648 int old_needed = locale->needed;
649
650 locale->needed = 0;
651 locale = load_locale (LC_CTYPE, locale->name,
652 locale->repertoire_name,
653 charmap, locale);
654 locale->needed = old_needed;
655 }
656
657 uint32_t *translit;
658 if ((locale->avail & CTYPE_LOCALE) != 0
659 && ((translit = find_translit (locale, charmap, wch))
660 != NULL))
661 /* The CTYPE data contains a matching
662 transliteration. */
663 {
664 for (int i = 0; translit[i] != 0; ++i)
665 {
666 snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
667 seq = charmap_find_value (charmap, utmp, 9);
668 assert (seq != NULL);
669 adds (lrb, seq->bytes, seq->nbytes);
670 }
671 return true;
672 }
673 }
674 #endif /* NO_TRANSLITERATION */
675
676 /* Not a known name. */
677 return false;
678 }
679 }
680
681 if (seq != NULL)
682 {
683 adds (lrb, seq->bytes, seq->nbytes);
684 return true;
685 }
686 else
687 return false;
688 }
689
/* Tell whether CH can be a UTF-8 continuation byte.  Negative values
   (such as EOF) are rejected; otherwise the two bits selected by the
   mask 0xc0 must be exactly 0x80.  */
static bool
utf8_valid_trailing (int ch)
{
  if (ch < 0)
    return false;

  return (ch & 0xc0) == 0x80;
}
697
/* Report an invalid UTF-8 sequence through lr_error, listing the bytes
   read so far in hexadecimal.  CH1 is always printed; CH2, CH3 and CH4
   are printed up to the first one that is negative (for example EOF or
   -1 for "not read").  The return value is always false so that
   callers can return the result directly.  */
static bool
utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
                     int ch4)
{
  char text[30];
  int count;

  /* Number of bytes that take part in the message.  */
  if (ch2 < 0)
    count = 1;
  else if (ch3 < 0)
    count = 2;
  else if (ch4 < 0)
    count = 3;
  else
    count = 4;

  switch (count)
    {
    case 1:
      snprintf (text, sizeof (text), "0x%02x", ch1);
      break;

    case 2:
      snprintf (text, sizeof (text), "0x%02x 0x%02x", ch1, ch2);
      break;

    case 3:
      snprintf (text, sizeof (text), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
      break;

    default:
      snprintf (text, sizeof (text), "0x%02x 0x%02x 0x%02x 0x%02x",
                ch1, ch2, ch3, ch4);
      break;
    }

  lr_error (lr, _("invalid UTF-8 sequence %s"), text);
  return false;
}
719
/* Decode one UTF-8 sequence whose leading byte CH1 has already been
   read; the continuation bytes are read from LR.  On success the
   codepoint is stored in *WCH and true is returned.  Invalid leading
   bytes, invalid continuation bytes and overlong encodings are
   reported via utf8_sequence_error and false is returned; *WCH is not
   written in that case.  */
static bool
utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
{
  uint32_t cp;

  /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.
     Leading bytes below 0xc2 are rejected right away.  */
  if (ch1 < 0xc2)
    return utf8_sequence_error (lr, ch1, -1, -1, -1);

  int b2 = lr_getc (lr);
  if (!utf8_valid_trailing (b2))
    return utf8_sequence_error (lr, ch1, b2, -1, -1);

  /* Two-byte sequence.  */
  if (ch1 <= 0xdf)
    {
      cp = ((ch1 & 0x1f) << 6) | (b2 & 0x3f);
      if (cp < 0x80)
        return utf8_sequence_error (lr, ch1, b2, -1, -1);
      *wch = cp;
      return true;
    }

  /* From here on CH1 is at least 0xe0.  */
  int b3 = lr_getc (lr);
  if (!utf8_valid_trailing (b3))
    return utf8_sequence_error (lr, ch1, b2, b3, -1);

  /* Three-byte sequence.  */
  if (ch1 <= 0xef)
    {
      cp = (((ch1 & 0x0f) << 12)
            | ((b2 & 0x3f) << 6)
            | (b3 & 0x3f));
      if (cp < 0x800)
        return utf8_sequence_error (lr, ch1, b2, b3, -1);
      *wch = cp;
      return true;
    }

  /* Four-byte sequence; CH1 is at least 0xf0 and must not exceed
     0xf4.  */
  int b4 = lr_getc (lr);
  if (!utf8_valid_trailing (b4) || ch1 > 0xf4)
    return utf8_sequence_error (lr, ch1, b2, b3, b4);

  cp = (((ch1 & 0x07) << 18)
        | ((b2 & 0x3f) << 12)
        | ((b3 & 0x3f) << 6)
        | (b4 & 0x3f));
  if (cp < 0x10000)
    return utf8_sequence_error (lr, ch1, b2, b3, b4);

  *wch = cp;
  return true;
}
771
772 static struct token *
get_string(struct linereader * lr,const struct charmap_t * charmap,struct localedef_t * locale,const struct repertoire_t * repertoire,int verbose)773 get_string (struct linereader *lr, const struct charmap_t *charmap,
774 struct localedef_t *locale, const struct repertoire_t *repertoire,
775 int verbose)
776 {
777 int return_widestr = lr->return_widestr;
778 struct lr_buffer lrb;
779 wchar_t *buf2 = NULL;
780
781 lr_buffer_init (&lrb);
782
783 /* We know it'll be a string. */
784 lr->token.tok = tok_string;
785
786 /* If we need not translate the strings (i.e., expand <...> parts)
787 we can run a simple loop. */
788 if (!lr->translate_strings)
789 {
790 int ch;
791
792 buf2 = NULL;
793 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
794 {
795 if (ch >= 0x80)
796 lr_error (lr, _("illegal 8-bit character in untranslated string"));
797 addc (&lrb, ch);
798 }
799
800 /* Catch errors with trailing escape character. */
801 if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
802 && (lrb.act == 1 || lrb.buf[lrb.act - 2] != lr->escape_char))
803 {
804 lr_error (lr, _("illegal escape sequence at end of string"));
805 --lrb.act;
806 }
807 else if (ch == '\n' || ch == EOF)
808 lr_error (lr, _("unterminated string"));
809
810 addc (&lrb, '\0');
811 }
812 else
813 {
814 bool illegal_string = false;
815 size_t buf2act = 0;
816 size_t buf2max = 56 * sizeof (uint32_t);
817 int ch;
818
819 /* We have to provide the wide character result as well. */
820 if (return_widestr)
821 buf2 = xmalloc (buf2max);
822
823 /* Read until the end of the string (or end of the line or file). */
824 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
825 {
826 size_t startidx;
827 uint32_t wch;
828 struct charseq *seq;
829
830 if (ch != '<')
831 {
832 /* The standards leave it up to the implementation to
833 decide what to do with characters which stand for
834 themselves. This implementation treats the input
835 file as encoded in UTF-8. */
836 if (ch == lr->escape_char)
837 {
838 ch = lr_getc (lr);
839 if (ch >= 0x80)
840 {
841 lr_error (lr, _("illegal 8-bit escape sequence"));
842 illegal_string = true;
843 break;
844 }
845 if (ch == '\n' || ch == EOF)
846 break;
847 addc (&lrb, ch);
848 wch = ch;
849 }
850 else if (ch < 0x80)
851 {
852 wch = ch;
853 addc (&lrb, ch);
854 }
855 else /* UTF-8 sequence. */
856 {
857 if (!utf8_decode (lr, ch, &wch))
858 {
859 illegal_string = true;
860 break;
861 }
862 if (!translate_unicode_codepoint (locale, charmap,
863 repertoire, wch, &lrb))
864 {
865 /* Ignore the rest of the string. Callers may
866 skip this string because it cannot be encoded
867 in the output character set. */
868 illegal_string = true;
869 continue;
870 }
871 }
872
873 if (return_widestr)
874 ADDWC (wch);
875
876 continue;
877 }
878
879 /* Now we have to search for the end of the symbolic name, i.e.,
880 the closing '>'. */
881 startidx = lrb.act;
882 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
883 {
884 if (ch == lr->escape_char)
885 {
886 ch = lr_getc (lr);
887 if (ch == '\n' || ch == EOF)
888 break;
889 }
890 addc (&lrb, ch);
891 }
892 if (ch == '\n' || ch == EOF)
893 /* Not a correct string. */
894 break;
895 if (lrb.act == startidx)
896 {
897 /* <> is no correct name. Ignore it and also signal an
898 error. */
899 illegal_string = true;
900 continue;
901 }
902
903 /* It might be a Uxxxx symbol. */
904 if (lrb.buf[startidx] == 'U'
905 && (lrb.act - startidx == 5 || lrb.act - startidx == 9))
906 {
907 char *cp = lrb.buf + startidx + 1;
908 while (cp < &lrb.buf[lrb.act] && isxdigit (*cp))
909 ++cp;
910
911 if (cp == &lrb.buf[lrb.act])
912 {
913 /* Yes, it is. */
914 addc (&lrb, '\0');
915 wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
916
917 /* Now forget about the name we just added. */
918 lrb.act = startidx;
919
920 if (return_widestr)
921 ADDWC (wch);
922
923 if (!translate_unicode_codepoint (locale, charmap,
924 repertoire, wch, &lrb))
925 illegal_string = true;
926 continue;
927 }
928 }
929
930 /* We now have the symbolic name in lrb.buf[startidx] to
931 lrb.buf[lrb.act-1]. Now find out the value for this character
932 in the charmap as well as in the repertoire map (in this
933 order). */
934 seq = charmap_find_value (charmap, &lrb.buf[startidx],
935 lrb.act - startidx);
936
937 if (seq == NULL)
938 {
939 /* This name is not in the charmap. */
940 lr_error (lr, _("symbol `%.*s' not in charmap"),
941 (int) (lrb.act - startidx), &lrb.buf[startidx]);
942 illegal_string = true;
943 }
944
945 if (return_widestr)
946 {
947 /* Now the same for the multibyte representation. */
948 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
949 wch = seq->ucs4;
950 else
951 {
952 wch = repertoire_find_value (repertoire, &lrb.buf[startidx],
953 lrb.act - startidx);
954 if (seq != NULL)
955 seq->ucs4 = wch;
956 }
957
958 if (wch == ILLEGAL_CHAR_VALUE)
959 {
960 /* This name is not in the repertoire map. */
961 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
962 (int) (lrb.act - startidx), &lrb.buf[startidx]);
963 illegal_string = true;
964 }
965 else
966 ADDWC (wch);
967 }
968
969 /* Now forget about the name we just added. */
970 lrb.act = startidx;
971
972 /* And copy the bytes. */
973 if (seq != NULL)
974 adds (&lrb, seq->bytes, seq->nbytes);
975 }
976
977 if (ch == '\n' || ch == EOF)
978 {
979 lr_error (lr, _("unterminated string"));
980 illegal_string = true;
981 }
982
983 if (illegal_string)
984 {
985 free (lrb.buf);
986 free (buf2);
987 lr->token.val.str.startmb = NULL;
988 lr->token.val.str.lenmb = 0;
989 lr->token.val.str.startwc = NULL;
990 lr->token.val.str.lenwc = 0;
991
992 return &lr->token;
993 }
994
995 addc (&lrb, '\0');
996
997 if (return_widestr)
998 {
999 ADDWC (0);
1000 lr->token.val.str.startwc = xrealloc (buf2,
1001 buf2act * sizeof (uint32_t));
1002 lr->token.val.str.lenwc = buf2act;
1003 }
1004 }
1005
1006 lr_buffer_to_token (&lrb, lr);
1007
1008 return &lr->token;
1009 }
1010