1 /* Copyright (C) 1996-2022 Free Software Foundation, Inc.
2    This file is part of the GNU C Library.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published
6    by the Free Software Foundation; version 2 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
16 
17 #ifdef HAVE_CONFIG_H
18 # include <config.h>
19 #endif
20 
21 #include <assert.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <libintl.h>
25 #include <stdarg.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <stdint.h>
29 
30 #include "localedef.h"
31 #include "charmap.h"
32 #include "error.h"
33 #include "linereader.h"
34 #include "locfile.h"
35 
36 /* Prototypes for local functions.  */
37 static struct token *get_toplvl_escape (struct linereader *lr);
38 static struct token *get_symname (struct linereader *lr);
39 static struct token *get_ident (struct linereader *lr);
40 static struct token *get_string (struct linereader *lr,
41 				 const struct charmap_t *charmap,
42 				 struct localedef_t *locale,
43 				 const struct repertoire_t *repertoire,
44 				 int verbose);
45 static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch);
46 
47 
48 struct linereader *
lr_open(const char * fname,kw_hash_fct_t hf)49 lr_open (const char *fname, kw_hash_fct_t hf)
50 {
51   FILE *fp;
52 
53   if (fname == NULL || strcmp (fname, "-") == 0
54       || strcmp (fname, "/dev/stdin") == 0)
55     return lr_create (stdin, "<stdin>", hf);
56   else
57     {
58       fp = fopen (fname, "rm");
59       if (fp == NULL)
60 	return NULL;
61       return lr_create (fp, fname, hf);
62     }
63 }
64 
65 struct linereader *
lr_create(FILE * fp,const char * fname,kw_hash_fct_t hf)66 lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
67 {
68   struct linereader *result;
69   int n;
70 
71   result = (struct linereader *) xmalloc (sizeof (*result));
72 
73   result->fp = fp;
74   result->fname = xstrdup (fname);
75   result->buf = NULL;
76   result->bufsize = 0;
77   result->lineno = 1;
78   result->idx = 0;
79   result->comment_char = '#';
80   result->escape_char = '\\';
81   result->translate_strings = 1;
82   result->return_widestr = 0;
83 
84   n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
85   if (n < 0)
86     {
87       int save = errno;
88       fclose (result->fp);
89       free ((char *) result->fname);
90       free (result);
91       errno = save;
92       return NULL;
93     }
94 
95   if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
96     n -= 2;
97 
98   result->buf[n] = '\0';
99   result->bufact = n;
100   result->hash_fct = hf;
101 
102   return result;
103 }
104 
105 
106 int
lr_eof(struct linereader * lr)107 lr_eof (struct linereader *lr)
108 {
109   return lr->bufact = 0;
110 }
111 
112 
113 void
lr_ignore_rest(struct linereader * lr,int verbose)114 lr_ignore_rest (struct linereader *lr, int verbose)
115 {
116   if (verbose)
117     {
118       while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
119 	     && lr->buf[lr->idx] != lr->comment_char)
120 	if (lr->buf[lr->idx] == '\0')
121 	  {
122 	    if (lr_next (lr) < 0)
123 	      return;
124 	  }
125 	else
126 	  ++lr->idx;
127 
128       if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp)
129 	  && lr->buf[lr->idx] != lr->comment_char)
130 	lr_error (lr, _("trailing garbage at end of line"));
131     }
132 
133   /* Ignore continued line.  */
134   while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
135     if (lr_next (lr) < 0)
136       break;
137 
138   lr->idx = lr->bufact;
139 }
140 
141 
142 void
lr_close(struct linereader * lr)143 lr_close (struct linereader *lr)
144 {
145   fclose (lr->fp);
146   free (lr->buf);
147   free (lr);
148 }
149 
150 
151 int
lr_next(struct linereader * lr)152 lr_next (struct linereader *lr)
153 {
154   int n;
155 
156   n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
157   if (n < 0)
158     return -1;
159 
160   ++lr->lineno;
161 
162   if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
163     {
164 #if 0
165       /* XXX Is this correct?  */
166       /* An escaped newline character is substituted with a single <SP>.  */
167       --n;
168       lr->buf[n - 1] = ' ';
169 #else
170       n -= 2;
171 #endif
172     }
173 
174   lr->buf[n] = '\0';
175   lr->bufact = n;
176   lr->idx = 0;
177 
178   return 0;
179 }
180 
181 
182 /* Defined in error.c.  */
183 /* This variable is incremented each time `error' is called.  */
184 extern unsigned int error_message_count;
185 
186 /* The calling program should define program_name and set it to the
187    name of the executing program.  */
188 extern char *program_name;
189 
190 
191 struct token *
lr_token(struct linereader * lr,const struct charmap_t * charmap,struct localedef_t * locale,const struct repertoire_t * repertoire,int verbose)192 lr_token (struct linereader *lr, const struct charmap_t *charmap,
193 	  struct localedef_t *locale, const struct repertoire_t *repertoire,
194 	  int verbose)
195 {
196   int ch;
197 
198   while (1)
199     {
200       do
201 	{
202 	  ch = lr_getc (lr);
203 
204 	  if (ch == EOF)
205 	    {
206 	      lr->token.tok = tok_eof;
207 	      return &lr->token;
208 	    };
209 
210 	  if (ch == '\n')
211 	    {
212 	      lr->token.tok = tok_eol;
213 	      return &lr->token;
214 	    }
215 	}
216       while (isspace (ch));
217 
218       if (ch != lr->comment_char)
219 	break;
220 
221       /* Is there an newline at the end of the buffer?  */
222       if (lr->buf[lr->bufact - 1] != '\n')
223 	{
224 	  /* No.  Some people want this to mean that only the line in
225 	     the file not the logical, concatenated line is ignored.
226 	     Let's try this.  */
227 	  lr->idx = lr->bufact;
228 	  continue;
229 	}
230 
231       /* Ignore rest of line.  */
232       lr_ignore_rest (lr, 0);
233       lr->token.tok = tok_eol;
234       return &lr->token;
235     }
236 
237   /* Match escape sequences.  */
238   if (ch == lr->escape_char)
239     return get_toplvl_escape (lr);
240 
241   /* Match ellipsis.  */
242   if (ch == '.')
243     {
244       if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
245 	{
246 	  int cnt;
247 	  for (cnt = 0; cnt < 10; ++cnt)
248 	    lr_getc (lr);
249 	  lr->token.tok = tok_ellipsis4_2;
250 	  return &lr->token;
251 	}
252       if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
253 	{
254 	  lr_getc (lr);
255 	  lr_getc (lr);
256 	  lr_getc (lr);
257 	  lr->token.tok = tok_ellipsis4;
258 	  return &lr->token;
259 	}
260       if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
261 	{
262 	  lr_getc (lr);
263 	  lr_getc (lr);
264 	  lr->token.tok = tok_ellipsis3;
265 	  return &lr->token;
266 	}
267       if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
268 	{
269 	  int cnt;
270 	  for (cnt = 0; cnt < 6; ++cnt)
271 	    lr_getc (lr);
272 	  lr->token.tok = tok_ellipsis2_2;
273 	  return &lr->token;
274 	}
275       if (lr->buf[lr->idx] == '.')
276 	{
277 	  lr_getc (lr);
278 	  lr->token.tok = tok_ellipsis2;
279 	  return &lr->token;
280 	}
281     }
282 
283   switch (ch)
284     {
285     case '<':
286       return get_symname (lr);
287 
288     case '0' ... '9':
289       lr->token.tok = tok_number;
290       lr->token.val.num = ch - '0';
291 
292       while (isdigit (ch = lr_getc (lr)))
293 	{
294 	  lr->token.val.num *= 10;
295 	  lr->token.val.num += ch - '0';
296 	}
297       if (isalpha (ch))
298 	lr_error (lr, _("garbage at end of number"));
299       lr_ungetn (lr, 1);
300 
301       return &lr->token;
302 
303     case ';':
304       lr->token.tok = tok_semicolon;
305       return &lr->token;
306 
307     case ',':
308       lr->token.tok = tok_comma;
309       return &lr->token;
310 
311     case '(':
312       lr->token.tok = tok_open_brace;
313       return &lr->token;
314 
315     case ')':
316       lr->token.tok = tok_close_brace;
317       return &lr->token;
318 
319     case '"':
320       return get_string (lr, charmap, locale, repertoire, verbose);
321 
322     case '-':
323       ch = lr_getc (lr);
324       if (ch == '1')
325 	{
326 	  lr->token.tok = tok_minus1;
327 	  return &lr->token;
328 	}
329       lr_ungetn (lr, 2);
330       break;
331 
332     case 0x80 ... 0xff:		/* UTF-8 sequence.  */
333       {
334 	uint32_t wch;
335 	if (!utf8_decode (lr, ch, &wch))
336 	  {
337 	    lr->token.tok = tok_error;
338 	    return &lr->token;
339 	  }
340 	lr->token.tok = tok_ucs4;
341 	lr->token.val.ucs4 = wch;
342 	return &lr->token;
343       }
344     }
345 
346   return get_ident (lr);
347 }
348 
349 
350 static struct token *
get_toplvl_escape(struct linereader * lr)351 get_toplvl_escape (struct linereader *lr)
352 {
353   /* This is supposed to be a numeric value.  We return the
354      numerical value and the number of bytes.  */
355   size_t start_idx = lr->idx - 1;
356   unsigned char *bytes = lr->token.val.charcode.bytes;
357   size_t nbytes = 0;
358   int ch;
359 
360   do
361     {
362       unsigned int byte = 0;
363       unsigned int base = 8;
364 
365       ch = lr_getc (lr);
366 
367       if (ch == 'd')
368 	{
369 	  base = 10;
370 	  ch = lr_getc (lr);
371 	}
372       else if (ch == 'x')
373 	{
374 	  base = 16;
375 	  ch = lr_getc (lr);
376 	}
377 
378       if ((base == 16 && !isxdigit (ch))
379 	  || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
380 	{
381 	esc_error:
382 	  lr->token.val.str.startmb = &lr->buf[start_idx];
383 
384 	  while (ch != EOF && !isspace (ch))
385 	    ch = lr_getc (lr);
386 	  lr->token.val.str.lenmb = lr->idx - start_idx;
387 
388 	  lr->token.tok = tok_error;
389 	  return &lr->token;
390 	}
391 
392       if (isdigit (ch))
393 	byte = ch - '0';
394       else
395 	byte = tolower (ch) - 'a' + 10;
396 
397       ch = lr_getc (lr);
398       if ((base == 16 && !isxdigit (ch))
399 	  || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
400 	goto esc_error;
401 
402       byte *= base;
403       if (isdigit (ch))
404 	byte += ch - '0';
405       else
406 	byte += tolower (ch) - 'a' + 10;
407 
408       ch = lr_getc (lr);
409       if (base != 16 && isdigit (ch))
410 	{
411 	  byte *= base;
412 	  byte += ch - '0';
413 
414 	  ch = lr_getc (lr);
415 	}
416 
417       bytes[nbytes++] = byte;
418     }
419   while (ch == lr->escape_char
420 	 && nbytes < (int) sizeof (lr->token.val.charcode.bytes));
421 
422   if (!isspace (ch))
423     lr_error (lr, _("garbage at end of character code specification"));
424 
425   lr_ungetn (lr, 1);
426 
427   lr->token.tok = tok_charcode;
428   lr->token.val.charcode.nbytes = nbytes;
429 
430   return &lr->token;
431 }
432 
/* Multibyte string buffer.  It grows on demand while the bytes of a
   token are collected (see addc and adds).  */
struct lr_buffer
{
  /* Number of bytes currently stored in BUF.  */
  size_t act;
  /* Number of bytes allocated for BUF.  */
  size_t max;
  /* Heap storage obtained from xmalloc/xrealloc.  */
  char *buf;
};
440 
441 /* Initialize *LRB with a default-sized buffer.  */
442 static void
lr_buffer_init(struct lr_buffer * lrb)443 lr_buffer_init (struct lr_buffer *lrb)
444 {
445  lrb->act = 0;
446  lrb->max = 56;
447  lrb->buf = xmalloc (lrb->max);
448 }
449 
450 /* Transfers the buffer string from *LRB to LR->token.mbstr.  */
451 static void
lr_buffer_to_token(struct lr_buffer * lrb,struct linereader * lr)452 lr_buffer_to_token (struct lr_buffer *lrb, struct linereader *lr)
453 {
454   lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + 1);
455   lr->token.val.str.startmb[lrb->act] = '\0';
456   lr->token.val.str.lenmb = lrb->act;
457 }
458 
459 /* Adds CH to *LRB.  */
460 static void
addc(struct lr_buffer * lrb,char ch)461 addc (struct lr_buffer *lrb, char ch)
462 {
463   if (lrb->act == lrb->max)
464     {
465       lrb->max *= 2;
466       lrb->buf = xrealloc (lrb->buf, lrb->max);
467     }
468   lrb->buf[lrb->act++] = ch;
469 }
470 
471 /* Adds L bytes at S to *LRB.  */
472 static void
adds(struct lr_buffer * lrb,const unsigned char * s,size_t l)473 adds (struct lr_buffer *lrb, const unsigned char *s, size_t l)
474 {
475   if (lrb->max - lrb->act < l)
476     {
477       size_t required_size = lrb->act + l;
478       size_t new_max = 2 * lrb->max;
479       if (new_max < required_size)
480 	new_max = required_size;
481       lrb->buf = xrealloc (lrb->buf, new_max);
482       lrb->max = new_max;
483     }
484   memcpy (lrb->buf + lrb->act, s, l);
485   lrb->act += l;
486 }
487 
/* Append the wide character CH to the wide string under construction.
   The macro relies on three variables of the function using it: BUF2
   (the storage), BUF2ACT (number of elements stored) and BUF2MAX.
   BUF2MAX is compared against the element count and multiplied by 4
   to obtain the allocation size in bytes, so it has to be a number of
   elements, not of bytes.  */
#define ADDWC(ch) \
  do									      \
    {									      \
      if (buf2act == buf2max)						      \
	{								      \
	  buf2max *= 2;							      \
	  buf2 = xrealloc (buf2, buf2max * 4);				      \
	}								      \
      buf2[buf2act++] = (ch);						      \
    }									      \
  while (0)
499 
500 
501 static struct token *
get_symname(struct linereader * lr)502 get_symname (struct linereader *lr)
503 {
504   /* Symbol in brackets.  We must distinguish three kinds:
505      1. reserved words
506      2. ISO 10646 position values
507      3. all other.  */
508   const struct keyword_t *kw;
509   int ch;
510   struct lr_buffer lrb;
511 
512   lr_buffer_init (&lrb);
513 
514   do
515     {
516       ch = lr_getc (lr);
517       if (ch == lr->escape_char)
518 	{
519 	  int c2 = lr_getc (lr);
520 	  addc (&lrb, c2);
521 
522 	  if (c2 == '\n')
523 	    ch = '\n';
524 	}
525       else
526 	addc (&lrb, ch);
527     }
528   while (ch != '>' && ch != '\n');
529 
530   if (ch == '\n')
531     lr_error (lr, _("unterminated symbolic name"));
532 
533   /* Test for ISO 10646 position value.  */
534   if (lrb.buf[0] == 'U' && (lrb.act == 6 || lrb.act == 10))
535     {
536       char *cp = lrb.buf + 1;
537       while (cp < &lrb.buf[lrb.act - 1] && isxdigit (*cp))
538 	++cp;
539 
540       if (cp == &lrb.buf[lrb.act - 1])
541 	{
542 	  /* Yes, it is.  */
543 	  lr->token.tok = tok_ucs4;
544 	  lr->token.val.ucs4 = strtoul (lrb.buf + 1, NULL, 16);
545 
546 	  return &lr->token;
547 	}
548     }
549 
550   /* It is a symbolic name.  Test for reserved words.  */
551   kw = lr->hash_fct (lrb.buf, lrb.act - 1);
552 
553   if (kw != NULL && kw->symname_or_ident == 1)
554     {
555       lr->token.tok = kw->token;
556       free (lrb.buf);
557     }
558   else
559     {
560       lr->token.tok = tok_bsymbol;
561       lr_buffer_to_token (&lrb, lr);
562       --lr->token.val.str.lenmb;  /* Hide the training '>'.  */
563     }
564 
565   return &lr->token;
566 }
567 
568 
569 static struct token *
get_ident(struct linereader * lr)570 get_ident (struct linereader *lr)
571 {
572   const struct keyword_t *kw;
573   int ch;
574   struct lr_buffer lrb;
575 
576   lr_buffer_init (&lrb);
577 
578   addc (&lrb, lr->buf[lr->idx - 1]);
579 
580   while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
581 	 && ch != '<' && ch != ',' && ch != EOF)
582     {
583       if (ch == lr->escape_char)
584 	{
585 	  ch = lr_getc (lr);
586 	  if (ch == '\n' || ch == EOF)
587 	    {
588 	      lr_error (lr, _("invalid escape sequence"));
589 	      break;
590 	    }
591 	}
592       addc (&lrb, ch);
593     }
594 
595   lr_ungetc (lr, ch);
596 
597   kw = lr->hash_fct (lrb.buf, lrb.act);
598 
599   if (kw != NULL && kw->symname_or_ident == 0)
600     {
601       lr->token.tok = kw->token;
602       free (lrb.buf);
603     }
604   else
605     {
606       lr->token.tok = tok_ident;
607       lr_buffer_to_token (&lrb, lr);
608     }
609 
610   return &lr->token;
611 }
612 
613 /* Process a decoded Unicode codepoint WCH in a string, placing the
614    multibyte sequence into LRB.  Return false if the character is not
615    found in CHARMAP/REPERTOIRE.  */
616 static bool
translate_unicode_codepoint(struct localedef_t * locale,const struct charmap_t * charmap,const struct repertoire_t * repertoire,uint32_t wch,struct lr_buffer * lrb)617 translate_unicode_codepoint (struct localedef_t *locale,
618 			     const struct charmap_t *charmap,
619 			     const struct repertoire_t *repertoire,
620 			     uint32_t wch, struct lr_buffer *lrb)
621 {
622   /* See whether the charmap contains the Uxxxxxxxx names.  */
623   char utmp[10];
624   snprintf (utmp, sizeof (utmp), "U%08X", wch);
625   struct charseq *seq = charmap_find_value (charmap, utmp, 9);
626 
627   if (seq == NULL)
628     {
629       /* No, this isn't the case.  Now determine from
630 	 the repertoire the name of the character and
631 	 find it in the charmap.  */
632       if (repertoire != NULL)
633 	{
634 	  const char *symbol = repertoire_find_symbol (repertoire, wch);
635 	  if (symbol != NULL)
636 	    seq = charmap_find_value (charmap, symbol, strlen (symbol));
637 	}
638 
639       if (seq == NULL)
640 	{
641 #ifndef NO_TRANSLITERATION
642 	  /* Transliterate if possible.  */
643 	  if (locale != NULL)
644 	    {
645 	      if ((locale->avail & CTYPE_LOCALE) == 0)
646 		{
647 		  /* Load the CTYPE data now.  */
648 		  int old_needed = locale->needed;
649 
650 		  locale->needed = 0;
651 		  locale = load_locale (LC_CTYPE, locale->name,
652 					locale->repertoire_name,
653 					charmap, locale);
654 		  locale->needed = old_needed;
655 		}
656 
657 	      uint32_t *translit;
658 	      if ((locale->avail & CTYPE_LOCALE) != 0
659 		  && ((translit = find_translit (locale, charmap, wch))
660 		      != NULL))
661 		/* The CTYPE data contains a matching
662 		   transliteration.  */
663 		{
664 		  for (int i = 0; translit[i] != 0; ++i)
665 		    {
666 		      snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
667 		      seq = charmap_find_value (charmap, utmp, 9);
668 		      assert (seq != NULL);
669 		      adds (lrb, seq->bytes, seq->nbytes);
670 		    }
671 		  return true;
672 		}
673 	    }
674 #endif	/* NO_TRANSLITERATION */
675 
676 	  /* Not a known name.  */
677 	  return false;
678 	}
679     }
680 
681   if (seq != NULL)
682     {
683       adds (lrb, seq->bytes, seq->nbytes);
684       return true;
685     }
686   else
687     return false;
688 }
689 
/* Tell whether CH can be a trailing byte of a UTF-8 sequence, that is
   whether it is non-negative (not EOF) and has the bit pattern
   10xxxxxx in its two top bits of the low byte.  */
static bool
utf8_valid_trailing (int ch)
{
  if (ch < 0)
    return false;

  return (ch & 0xc0) == 0x80;
}
697 
/* Report the bytes CH1 ... CH4 as an invalid UTF-8 sequence through
   lr_error.  CH2, CH3 and CH4 may be negative (EOF or not read); the
   message lists the bytes up to, but not including, the first negative
   one.  Always returns false so that callers can return the result
   directly.  */
static bool
utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
		     int ch4)
{
  char text[30];

  if (ch2 >= 0 && ch3 >= 0 && ch4 >= 0)
    snprintf (text, sizeof (text), "0x%02x 0x%02x 0x%02x 0x%02x",
              ch1, ch2, ch3, ch4);
  else if (ch2 >= 0 && ch3 >= 0)
    snprintf (text, sizeof (text), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
  else if (ch2 >= 0)
    snprintf (text, sizeof (text), "0x%02x 0x%02x", ch1, ch2);
  else
    snprintf (text, sizeof (text), "0x%02x", ch1);

  lr_error (lr, _("invalid UTF-8 sequence %s"), text);
  return false;
}
719 
/* Decode one UTF-8 sequence whose leading byte CH1 has already been
   read; the trailing bytes are read from LR.  On success the codepoint
   is stored in *WCH and true is returned.  Invalid leading bytes,
   invalid or missing trailing bytes and overlong encodings are
   reported through utf8_sequence_error and yield false; *WCH is not
   written then.  */
static bool
utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
{
  uint32_t cp;
  int ch2;
  int ch3;
  int ch4;

  /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.
     Bytes below 0xc2 are trailing bytes or would start an overlong
     two-byte sequence.  */
  if (ch1 < 0xc2)
    return utf8_sequence_error (lr, ch1, -1, -1, -1);

  ch2 = lr_getc (lr);
  if (!utf8_valid_trailing (ch2))
    return utf8_sequence_error (lr, ch1, ch2, -1, -1);

  /* Two-byte sequence.  */
  if (ch1 < 0xe0)
    {
      cp = ((ch1 & 0x1f) << 6) | (ch2 & 0x3f);
      if (cp < 0x80)
        return utf8_sequence_error (lr, ch1, ch2, -1, -1);
      *wch = cp;
      return true;
    }

  /* From here on CH1 is at least 0xe0.  */
  ch3 = lr_getc (lr);
  if (!utf8_valid_trailing (ch3))
    return utf8_sequence_error (lr, ch1, ch2, ch3, -1);

  /* Three-byte sequence.  */
  if (ch1 < 0xf0)
    {
      cp = ((ch1 & 0x0f) << 12) | ((ch2 & 0x3f) << 6) | (ch3 & 0x3f);
      if (cp < 0x800)
        return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
      *wch = cp;
      return true;
    }

  /* Four-byte sequence; CH1 is at least 0xf0 and must not exceed
     0xf4.  */
  ch4 = lr_getc (lr);
  if (!utf8_valid_trailing (ch4) || ch1 > 0xf4)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);

  cp = (((ch1 & 0x07) << 18)
        | ((ch2 & 0x3f) << 12)
        | ((ch3 & 0x3f) << 6)
        | (ch4 & 0x3f));
  if (cp < 0x10000)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);

  *wch = cp;
  return true;
}
771 
772 static struct token *
get_string(struct linereader * lr,const struct charmap_t * charmap,struct localedef_t * locale,const struct repertoire_t * repertoire,int verbose)773 get_string (struct linereader *lr, const struct charmap_t *charmap,
774 	    struct localedef_t *locale, const struct repertoire_t *repertoire,
775 	    int verbose)
776 {
777   int return_widestr = lr->return_widestr;
778   struct lr_buffer lrb;
779   wchar_t *buf2 = NULL;
780 
781   lr_buffer_init (&lrb);
782 
783   /* We know it'll be a string.  */
784   lr->token.tok = tok_string;
785 
786   /* If we need not translate the strings (i.e., expand <...> parts)
787      we can run a simple loop.  */
788   if (!lr->translate_strings)
789     {
790       int ch;
791 
792       buf2 = NULL;
793       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
794 	{
795 	  if (ch >= 0x80)
796 	    lr_error (lr, _("illegal 8-bit character in untranslated string"));
797 	  addc (&lrb, ch);
798 	}
799 
800       /* Catch errors with trailing escape character.  */
801       if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
802 	  && (lrb.act == 1 || lrb.buf[lrb.act - 2] != lr->escape_char))
803 	{
804 	  lr_error (lr, _("illegal escape sequence at end of string"));
805 	  --lrb.act;
806 	}
807       else if (ch == '\n' || ch == EOF)
808 	lr_error (lr, _("unterminated string"));
809 
810       addc (&lrb, '\0');
811     }
812   else
813     {
814       bool illegal_string = false;
815       size_t buf2act = 0;
816       size_t buf2max = 56 * sizeof (uint32_t);
817       int ch;
818 
819       /* We have to provide the wide character result as well.  */
820       if (return_widestr)
821 	buf2 = xmalloc (buf2max);
822 
823       /* Read until the end of the string (or end of the line or file).  */
824       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
825 	{
826 	  size_t startidx;
827 	  uint32_t wch;
828 	  struct charseq *seq;
829 
830 	  if (ch != '<')
831 	    {
832 	      /* The standards leave it up to the implementation to
833 		 decide what to do with characters which stand for
834 		 themselves.  This implementation treats the input
835 		 file as encoded in UTF-8.  */
836 	      if (ch == lr->escape_char)
837 		{
838 		  ch = lr_getc (lr);
839 		  if (ch >= 0x80)
840 		    {
841 		      lr_error (lr, _("illegal 8-bit escape sequence"));
842 		      illegal_string = true;
843 		      break;
844 		    }
845 		  if (ch == '\n' || ch == EOF)
846 		    break;
847 		  addc (&lrb, ch);
848 		  wch = ch;
849 		}
850 	      else if (ch < 0x80)
851 		{
852 		  wch = ch;
853 		  addc (&lrb, ch);
854 		}
855 	      else 		/* UTF-8 sequence.  */
856 		{
857 		  if (!utf8_decode (lr, ch, &wch))
858 		    {
859 		      illegal_string = true;
860 		      break;
861 		    }
862 		  if (!translate_unicode_codepoint (locale, charmap,
863 						    repertoire, wch, &lrb))
864 		    {
865 		      /* Ignore the rest of the string.  Callers may
866 			 skip this string because it cannot be encoded
867 			 in the output character set.  */
868 		      illegal_string = true;
869 		      continue;
870 		    }
871 		}
872 
873 	      if (return_widestr)
874 		ADDWC (wch);
875 
876 	      continue;
877 	    }
878 
879 	  /* Now we have to search for the end of the symbolic name, i.e.,
880 	     the closing '>'.  */
881 	  startidx = lrb.act;
882 	  while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
883 	    {
884 	      if (ch == lr->escape_char)
885 		{
886 		  ch = lr_getc (lr);
887 		  if (ch == '\n' || ch == EOF)
888 		    break;
889 		}
890 	      addc (&lrb, ch);
891 	    }
892 	  if (ch == '\n' || ch == EOF)
893 	    /* Not a correct string.  */
894 	    break;
895 	  if (lrb.act == startidx)
896 	    {
897 	      /* <> is no correct name.  Ignore it and also signal an
898 		 error.  */
899 	      illegal_string = true;
900 	      continue;
901 	    }
902 
903 	  /* It might be a Uxxxx symbol.  */
904 	  if (lrb.buf[startidx] == 'U'
905 	      && (lrb.act - startidx == 5 || lrb.act - startidx == 9))
906 	    {
907 	      char *cp = lrb.buf + startidx + 1;
908 	      while (cp < &lrb.buf[lrb.act] && isxdigit (*cp))
909 		++cp;
910 
911 	      if (cp == &lrb.buf[lrb.act])
912 		{
913 		  /* Yes, it is.  */
914 		  addc (&lrb, '\0');
915 		  wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
916 
917 		  /* Now forget about the name we just added.  */
918 		  lrb.act = startidx;
919 
920 		  if (return_widestr)
921 		    ADDWC (wch);
922 
923 		  if (!translate_unicode_codepoint (locale, charmap,
924 						    repertoire, wch, &lrb))
925 		    illegal_string = true;
926 		  continue;
927 		}
928 	    }
929 
930 	  /* We now have the symbolic name in lrb.buf[startidx] to
931 	     lrb.buf[lrb.act-1].  Now find out the value for this character
932 	     in the charmap as well as in the repertoire map (in this
933 	     order).  */
934 	  seq = charmap_find_value (charmap, &lrb.buf[startidx],
935 				    lrb.act - startidx);
936 
937 	  if (seq == NULL)
938 	    {
939 	      /* This name is not in the charmap.  */
940 	      lr_error (lr, _("symbol `%.*s' not in charmap"),
941 			(int) (lrb.act - startidx), &lrb.buf[startidx]);
942 	      illegal_string = true;
943 	    }
944 
945 	  if (return_widestr)
946 	    {
947 	      /* Now the same for the multibyte representation.  */
948 	      if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
949 		wch = seq->ucs4;
950 	      else
951 		{
952 		  wch = repertoire_find_value (repertoire, &lrb.buf[startidx],
953 					       lrb.act - startidx);
954 		  if (seq != NULL)
955 		    seq->ucs4 = wch;
956 		}
957 
958 	      if (wch == ILLEGAL_CHAR_VALUE)
959 		{
960 		  /* This name is not in the repertoire map.  */
961 		  lr_error (lr, _("symbol `%.*s' not in repertoire map"),
962 			    (int) (lrb.act - startidx), &lrb.buf[startidx]);
963 		  illegal_string = true;
964 		}
965 	      else
966 		ADDWC (wch);
967 	    }
968 
969 	  /* Now forget about the name we just added.  */
970 	  lrb.act = startidx;
971 
972 	  /* And copy the bytes.  */
973 	  if (seq != NULL)
974 	    adds (&lrb, seq->bytes, seq->nbytes);
975 	}
976 
977       if (ch == '\n' || ch == EOF)
978 	{
979 	  lr_error (lr, _("unterminated string"));
980 	  illegal_string = true;
981 	}
982 
983       if (illegal_string)
984 	{
985 	  free (lrb.buf);
986 	  free (buf2);
987 	  lr->token.val.str.startmb = NULL;
988 	  lr->token.val.str.lenmb = 0;
989 	  lr->token.val.str.startwc = NULL;
990 	  lr->token.val.str.lenwc = 0;
991 
992 	  return &lr->token;
993 	}
994 
995       addc (&lrb, '\0');
996 
997       if (return_widestr)
998 	{
999 	  ADDWC (0);
1000 	  lr->token.val.str.startwc = xrealloc (buf2,
1001 						buf2act * sizeof (uint32_t));
1002 	  lr->token.val.str.lenwc = buf2act;
1003 	}
1004     }
1005 
1006   lr_buffer_to_token (&lrb, lr);
1007 
1008   return &lr->token;
1009 }
1010