1 /* Copyright (C) 1995-2022 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published
6 by the Free Software Foundation; version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, see <https://www.gnu.org/licenses/>. */
16
17 #ifdef HAVE_CONFIG_H
18 # include <config.h>
19 #endif
20
21 #include <errno.h>
22 #include <stdlib.h>
23 #include <wchar.h>
24 #include <stdint.h>
25 #include <sys/param.h>
26 #include <array_length.h>
27
28 #include "localedef.h"
29 #include "charmap.h"
30 #include "localeinfo.h"
31 #include "linereader.h"
32 #include "locfile.h"
33 #include "elem-hash.h"
34
35 /* Uncomment the following line in the production version. */
36 /* #define NDEBUG 1 */
37 #include <assert.h>
38
/* The obstack API expects these two macros to name the functions used to
   obtain and release chunks (see the GNU obstack documentation); here the
   plain C allocator is used.  */
#define obstack_chunk_alloc malloc
#define obstack_chunk_free free
41
/* Append the 32-bit value DATA to the object currently growing in OBSTACK.
   The value is passed through maybe_swap_uint32 before it is stored.  The
   growing object must have an aligned size on entry (checked by assert).  */
static inline void
__attribute ((always_inline))
obstack_int32_grow (struct obstack *obstack, int32_t data)
{
  assert (LOCFILE_ALIGNED_P (obstack_object_size (obstack)));

  data = maybe_swap_uint32 (data);

  if (sizeof (int32_t) != sizeof (int))
    {
      /* `int' has a different width, so the int-sized primitive cannot be
         used; append the raw bytes instead.  */
      obstack_grow (obstack, &data, sizeof (int32_t));
      return;
    }

  obstack_int_grow (obstack, data);
}
53
/* Variant of obstack_int32_grow which uses obstack_int_grow_fast when
   `int' is 32 bits wide.  The value is passed through maybe_swap_uint32
   first, and the growing object must have an aligned size on entry.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
{
  assert (LOCFILE_ALIGNED_P (obstack_object_size (obstack)));

  data = maybe_swap_uint32 (data);

  if (sizeof (int32_t) != sizeof (int))
    {
      /* No fast primitive of the right width; fall back to the ordinary
         byte-wise append.  */
      obstack_grow (obstack, &data, sizeof (int32_t));
      return;
    }

  obstack_int_grow_fast (obstack, data);
}
65
/* Forward declaration.  */
struct element_t;

/* Description of one section of the collation order.  A node can be
   chained on two independent lists at once: the list of known sections
   (through DEF_NEXT) and the list of sections actually in use (through
   NEXT).  */
struct section_list
{
  /* Successor in the known_sections list.  */
  struct section_list *def_next;
  /* Successor in the sections list.  */
  struct section_list *next;
  /* Name of the section.  */
  const char *name;
  /* First element of this section.  */
  struct element_t *first;
  /* Last element of this section.  */
  struct element_t *last;
  /* These are the rules for this section.  The array is installed by
     read_directions.  */
  enum coll_sort_rule *rules;
  /* Index of the rule set in the appropriate section of the output file.  */
  int ruleidx;
};
87
struct element_t;

/* Counted array of element pointers; used for the weights of one
   level of a collation element.  */
struct element_list_t
{
  /* Number of elements.  */
  int cnt;

  /* The CNT element pointers.  An entry may be NULL (weight is ignored)
     or one of the ELEMENT_ELLIPSIS* placeholder values.  */
  struct element_t **w;
};
97
/* Data type for collating element.  */
struct element_t
{
  /* Symbolic name; a private copy made by new_element, or NULL.  */
  const char *name;

  /* Multibyte sequence (NUL-terminated copy) and its length in bytes;
     NULL and 0 if no byte sequence is known.  */
  const char *mbs;
  size_t nmbs;
  /* Wide character sequence (zero-terminated) and its length in
     characters; NULL and 0 if no wide character value is known.  */
  const uint32_t *wcs;
  size_t nwcs;
  /* Initialized to NULL and 0 by new_element; assigned elsewhere.  */
  int *mborder;
  int wcorder;

  /* The following is a bit mask which bits are set if this element is
     used in the appropriate level.  Interesting for the singlebyte
     weight computation.

     XXX The type here restricts the number of levels to 32.  It could
     be changed if necessary but I doubt this is necessary.  */
  unsigned int used_in_level;

  /* Array with one entry per sorting rule (nrules entries), allocated
     by insert_weights; NULL until then.  */
  struct element_list_t *weights;

  /* Nonzero if this is a real character definition.  */
  int is_character;

  /* Order of the character in the sequence.  This information will
     be used in range expressions.  */
  int mbseqorder;
  int wcseqorder;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;

  /* Which section does this belong to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Next element in multibyte output list.  */
  struct element_t *mbnext;
  struct element_t *mblast;

  /* Next element in wide character output list.  */
  struct element_t *wcnext;
  struct element_t *wclast;
};
147
/* Special element value.  These small integers cast to pointers are
   stored in a weight slot in place of a real element pointer as a
   placeholder for a weight which depends on the element iterated over in
   an ellipsis range; they must never be dereferenced.  */
#define ELEMENT_ELLIPSIS2 ((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3 ((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4 ((struct element_t *) 3)
152
/* Data type for collating symbol.  */
struct symbol_t
{
  /* Symbolic name; a private copy made by new_symbol.  */
  const char *name;

  /* Point to place in the order list.  NULL until an element has been
     created for the symbol (done on demand by find_element and
     insert_value).  */
  struct element_t *order;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;
};
165
/* The template header "3level.h" is included three times, each time
   parameterized through the TABLE, ELEMENT and DEFAULT macros (and
   optional feature macros such as ITERATE and NO_ADD_LOCALE).
   NOTE(review): the macros are redefined before every inclusion without
   an #undef here, so presumably 3level.h undefines them itself --
   confirm against that header.  */

/* Sparse table of struct element_t *.  */
#define TABLE wchead_table
#define ELEMENT struct element_t *
#define DEFAULT NULL
#define ITERATE
#define NO_ADD_LOCALE
#include "3level.h"

/* Sparse table of int32_t.  */
#define TABLE collidx_table
#define ELEMENT int32_t
#define DEFAULT 0
#include "3level.h"

/* Sparse table of uint32_t.  */
#define TABLE collseq_table
#define ELEMENT uint32_t
#define DEFAULT ~((uint32_t) 0)
#include "3level.h"
185
186
/* Simple name list for the preprocessor.  Each node is a singly linked
   list entry which carries its string inline: allocate
   sizeof (struct name_list) plus the string length plus one byte and
   store the NUL-terminated name in STR.  */
struct name_list
{
  /* Next node of the list, or NULL.  */
  struct name_list *next;
  /* The name itself.  A C99 flexible array member is used instead of the
     GNU zero-length array extension; the size and layout of the struct
     are the same.  */
  char str[];
};
193
194
/* The real definition of the struct for the LC_COLLATE locale.  It holds
   everything collected while reading an LC_COLLATE description: the
   sections, the order list of elements, the lookup tables for names, and
   the memory pool that most of these objects are allocated from.  */
struct locale_collate_t
{
  /* Does the locale use code points to compare the encoding?  */
  bool codepoint_collation;

  int col_weight_max;
  int cur_weight_max;

  /* List of known scripts (chained through section_list.def_next).  */
  struct section_list *known_sections;
  /* List of used sections (chained through section_list.next).  */
  struct section_list *sections;
  /* Current section using definition.  New elements are assigned to
     this section.  */
  struct section_list *current_section;
  /* There always can be an unnamed section.  */
  struct section_list unnamed_section;
  /* Flag whether the unnamed section has been defined.  */
  bool unnamed_section_defined;
  /* To make handling of errors easier we have another section.  */
  struct section_list error_section;
  /* Sometimes we are defining the values for collating symbols before
     the first actual section.  */
  struct section_list symbol_section;

  /* Start of the order list.  */
  struct element_t *start;

  /* The undefined element.  */
  struct element_t undefined;

  /* This is the cursor for `reorder_after' insertions.  insert_weights
     links each new element directly after the cursor and then advances
     the cursor to it.  */
  struct element_t *cursor;

  /* This value is used when handling ellipsis.  */
  struct element_t ellipsis_weight;

  /* Known collating elements.  */
  hash_table elem_table;

  /* Known collating symbols.  */
  hash_table sym_table;

  /* Known collation sequences.  */
  hash_table seq_table;

  /* Pool from which elements, symbols, names and weight arrays are
     allocated.  */
  struct obstack mempool;

  /* The LC_COLLATE category is a bit special as it is sometimes possible
     that the definitions from more than one input file contains information.
     Therefore we keep all relevant input in a list.  */
  struct locale_collate_t *next;

  /* Arrays with heads of the list for each of the leading bytes in
     the multibyte sequences.  */
  struct element_t *mbheads[256];

  /* Sparse table with the heads of the lists for the wide character
     sequences (the wide character counterpart of MBHEADS).  */
  struct wchead_table wcheads;

  /* The arrays with the collation sequence order.  */
  unsigned char mbseqorder[256];
  struct collseq_table wcseqorder;

  /* State of the preprocessor.  */
  enum
    {
      else_none = 0,
      else_ignore,
      else_seen
    }
  else_action;
};
269
270
/* We have a few global variables which are used for reading all
   LC_COLLATE category descriptions in all files.  */

/* Number of sorting rules (levels).  Zero until the first direction
   specification has been read by read_directions; every later
   specification must provide the same number.  */
static uint32_t nrules;

/* List of defined preprocessor symbols.  */
static struct name_list *defined;
277
278
/* We need UTF-8 encoding of numbers.

   Store the encoding of VAL at BUF and return the number of bytes
   written (1 to 6).  No terminating NUL is written.  Every VAL below
   0x80, including negative values, is stored as a single byte.  */
static inline int
__attribute ((always_inline))
utf8_encode (char *buf, int val)
{
  int nbytes;
  int idx;

  /* Single byte: stored unchanged.  */
  if (val < 0x80)
    {
      buf[0] = (char) val;
      return 1;
    }

  /* A sequence of N bytes carries 5 * N + 1 payload bits.  Pick the
     smallest N in 2..5 that is large enough; otherwise use six bytes.  */
  nbytes = 2;
  while (nbytes < 6 && (val & (~(uint32_t)0 << (5 * nbytes + 1))) != 0)
    ++nbytes;

  /* Fill in the continuation bytes from the last one back to the second,
     consuming six bits of VAL each time.  */
  for (idx = nbytes - 1; idx > 0; --idx)
    {
      buf[idx] = 0x80 | (val & 0x3f);
      val >>= 6;
    }

  /* The lead byte has NBYTES high bits set, followed by the bits of VAL
     that remain.  */
  buf[0] = (unsigned char) (~0xff >> nbytes);
  buf[0] |= val;

  return nbytes;
}
313
314
315 static struct section_list *
make_seclist_elem(struct locale_collate_t * collate,const char * string,struct section_list * next)316 make_seclist_elem (struct locale_collate_t *collate, const char *string,
317 struct section_list *next)
318 {
319 struct section_list *newp;
320
321 newp = (struct section_list *) obstack_alloc (&collate->mempool,
322 sizeof (*newp));
323 newp->next = next;
324 newp->name = string;
325 newp->first = NULL;
326 newp->last = NULL;
327
328 return newp;
329 }
330
331
332 static struct element_t *
new_element(struct locale_collate_t * collate,const char * mbs,size_t mbslen,const uint32_t * wcs,const char * name,size_t namelen,int is_character)333 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
334 const uint32_t *wcs, const char *name, size_t namelen,
335 int is_character)
336 {
337 struct element_t *newp;
338
339 newp = (struct element_t *) obstack_alloc (&collate->mempool,
340 sizeof (*newp));
341 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
342 name, namelen);
343 if (mbs != NULL)
344 {
345 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
346 newp->nmbs = mbslen;
347 }
348 else
349 {
350 newp->mbs = NULL;
351 newp->nmbs = 0;
352 }
353 if (wcs != NULL)
354 {
355 size_t nwcs = wcslen ((wchar_t *) wcs);
356 uint32_t zero = 0;
357 /* Handle <U0000> as a single character. */
358 if (nwcs == 0)
359 nwcs = 1;
360 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
361 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
362 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
363 newp->nwcs = nwcs;
364 }
365 else
366 {
367 newp->wcs = NULL;
368 newp->nwcs = 0;
369 }
370 newp->mborder = NULL;
371 newp->wcorder = 0;
372 newp->used_in_level = 0;
373 newp->is_character = is_character;
374
375 /* Will be assigned later. XXX */
376 newp->mbseqorder = 0;
377 newp->wcseqorder = 0;
378
379 /* Will be allocated later. */
380 newp->weights = NULL;
381
382 newp->file = NULL;
383 newp->line = 0;
384
385 newp->section = collate->current_section;
386
387 newp->last = NULL;
388 newp->next = NULL;
389
390 newp->mbnext = NULL;
391 newp->mblast = NULL;
392
393 newp->wcnext = NULL;
394 newp->wclast = NULL;
395
396 return newp;
397 }
398
399
400 static struct symbol_t *
new_symbol(struct locale_collate_t * collate,const char * name,size_t len)401 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
402 {
403 struct symbol_t *newp;
404
405 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
406
407 newp->name = obstack_copy0 (&collate->mempool, name, len);
408 newp->order = NULL;
409
410 newp->file = NULL;
411 newp->line = 0;
412
413 return newp;
414 }
415
416
417 /* Test whether this name is already defined somewhere. */
418 static int
check_duplicate(struct linereader * ldfile,struct locale_collate_t * collate,const struct charmap_t * charmap,struct repertoire_t * repertoire,const char * symbol,size_t symbol_len)419 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
420 const struct charmap_t *charmap,
421 struct repertoire_t *repertoire, const char *symbol,
422 size_t symbol_len)
423 {
424 void *ignore = NULL;
425
426 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
427 {
428 lr_error (ldfile, _("`%.*s' already defined in charmap"),
429 (int) symbol_len, symbol);
430 return 1;
431 }
432
433 if (repertoire != NULL
434 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
435 == 0))
436 {
437 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
438 (int) symbol_len, symbol);
439 return 1;
440 }
441
442 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
443 {
444 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
445 (int) symbol_len, symbol);
446 return 1;
447 }
448
449 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
450 {
451 lr_error (ldfile, _("`%.*s' already defined as collating element"),
452 (int) symbol_len, symbol);
453 return 1;
454 }
455
456 return 0;
457 }
458
459
460 /* Read the direction specification. */
461 static void
read_directions(struct linereader * ldfile,struct token * arg,const struct charmap_t * charmap,struct repertoire_t * repertoire,struct localedef_t * result)462 read_directions (struct linereader *ldfile, struct token *arg,
463 const struct charmap_t *charmap,
464 struct repertoire_t *repertoire, struct localedef_t *result)
465 {
466 int cnt = 0;
467 int max = nrules ?: 10;
468 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
469 int warned = 0;
470 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
471
472 while (1)
473 {
474 int valid = 0;
475
476 if (arg->tok == tok_forward)
477 {
478 if (rules[cnt] & sort_backward)
479 {
480 if (! warned)
481 {
482 lr_error (ldfile, _("\
483 %s: `forward' and `backward' are mutually excluding each other"),
484 "LC_COLLATE");
485 warned = 1;
486 }
487 }
488 else if (rules[cnt] & sort_forward)
489 {
490 if (! warned)
491 {
492 lr_error (ldfile, _("\
493 %s: `%s' mentioned more than once in definition of weight %d"),
494 "LC_COLLATE", "forward", cnt + 1);
495 }
496 }
497 else
498 rules[cnt] |= sort_forward;
499
500 valid = 1;
501 }
502 else if (arg->tok == tok_backward)
503 {
504 if (rules[cnt] & sort_forward)
505 {
506 if (! warned)
507 {
508 lr_error (ldfile, _("\
509 %s: `forward' and `backward' are mutually excluding each other"),
510 "LC_COLLATE");
511 warned = 1;
512 }
513 }
514 else if (rules[cnt] & sort_backward)
515 {
516 if (! warned)
517 {
518 lr_error (ldfile, _("\
519 %s: `%s' mentioned more than once in definition of weight %d"),
520 "LC_COLLATE", "backward", cnt + 1);
521 }
522 }
523 else
524 rules[cnt] |= sort_backward;
525
526 valid = 1;
527 }
528 else if (arg->tok == tok_position)
529 {
530 if (rules[cnt] & sort_position)
531 {
532 if (! warned)
533 {
534 lr_error (ldfile, _("\
535 %s: `%s' mentioned more than once in definition of weight %d"),
536 "LC_COLLATE", "position", cnt + 1);
537 }
538 }
539 else
540 rules[cnt] |= sort_position;
541
542 valid = 1;
543 }
544
545 if (valid)
546 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
547
548 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
549 || arg->tok == tok_semicolon)
550 {
551 if (! valid && ! warned)
552 {
553 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
554 warned = 1;
555 }
556
557 /* See whether we have to increment the counter. */
558 if (arg->tok != tok_comma && rules[cnt] != 0)
559 {
560 /* Add the default `forward' if we have seen only `position'. */
561 if (rules[cnt] == sort_position)
562 rules[cnt] = sort_position | sort_forward;
563
564 ++cnt;
565 }
566
567 if (arg->tok == tok_eof || arg->tok == tok_eol)
568 /* End of line or file, so we exit the loop. */
569 break;
570
571 if (nrules == 0)
572 {
573 /* See whether we have enough room in the array. */
574 if (cnt == max)
575 {
576 max += 10;
577 rules = (enum coll_sort_rule *) xrealloc (rules,
578 max
579 * sizeof (*rules));
580 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
581 }
582 }
583 else
584 {
585 if (cnt == nrules)
586 {
587 /* There must not be any more rule. */
588 if (! warned)
589 {
590 lr_error (ldfile, _("\
591 %s: too many rules; first entry only had %d"),
592 "LC_COLLATE", nrules);
593 warned = 1;
594 }
595
596 lr_ignore_rest (ldfile, 0);
597 break;
598 }
599 }
600 }
601 else
602 {
603 if (! warned)
604 {
605 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
606 warned = 1;
607 }
608 }
609
610 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
611 }
612
613 if (nrules == 0)
614 {
615 /* Now we know how many rules we have. */
616 nrules = cnt;
617 rules = (enum coll_sort_rule *) xrealloc (rules,
618 nrules * sizeof (*rules));
619 }
620 else
621 {
622 if (cnt < nrules)
623 {
624 /* Not enough rules in this specification. */
625 if (! warned)
626 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
627
628 do
629 rules[cnt] = sort_forward;
630 while (++cnt < nrules);
631 }
632 }
633
634 collate->current_section->rules = rules;
635 }
636
637
638 static struct element_t *
find_element(struct linereader * ldfile,struct locale_collate_t * collate,const char * str,size_t len)639 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
640 const char *str, size_t len)
641 {
642 void *result = NULL;
643
644 /* Search for the entries among the collation sequences already define. */
645 if (find_entry (&collate->seq_table, str, len, &result) != 0)
646 {
647 /* Nope, not define yet. So we see whether it is a
648 collation symbol. */
649 void *ptr;
650
651 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
652 {
653 /* It's a collation symbol. */
654 struct symbol_t *sym = (struct symbol_t *) ptr;
655 result = sym->order;
656
657 if (result == NULL)
658 result = sym->order = new_element (collate, NULL, 0, NULL,
659 NULL, 0, 0);
660 }
661 else if (find_entry (&collate->elem_table, str, len, &result) != 0)
662 {
663 /* It's also no collation element. So it is a character
664 element defined later. */
665 result = new_element (collate, NULL, 0, NULL, str, len, 1);
666 /* Insert it into the sequence table. */
667 insert_entry (&collate->seq_table, str, len, result);
668 }
669 }
670
671 return (struct element_t *) result;
672 }
673
674
675 static void
unlink_element(struct locale_collate_t * collate)676 unlink_element (struct locale_collate_t *collate)
677 {
678 if (collate->cursor == collate->start)
679 {
680 assert (collate->cursor->next == NULL);
681 assert (collate->cursor->last == NULL);
682 collate->cursor = NULL;
683 }
684 else
685 {
686 if (collate->cursor->next != NULL)
687 collate->cursor->next->last = collate->cursor->last;
688 if (collate->cursor->last != NULL)
689 collate->cursor->last->next = collate->cursor->next;
690 collate->cursor = collate->cursor->last;
691 }
692 }
693
694
695 static void
insert_weights(struct linereader * ldfile,struct element_t * elem,const struct charmap_t * charmap,struct repertoire_t * repertoire,struct localedef_t * result,enum token_t ellipsis)696 insert_weights (struct linereader *ldfile, struct element_t *elem,
697 const struct charmap_t *charmap,
698 struct repertoire_t *repertoire, struct localedef_t *result,
699 enum token_t ellipsis)
700 {
701 int weight_cnt;
702 struct token *arg;
703 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
704
705 /* Initialize all the fields. */
706 elem->file = ldfile->fname;
707 elem->line = ldfile->lineno;
708
709 elem->last = collate->cursor;
710 elem->next = collate->cursor ? collate->cursor->next : NULL;
711 if (collate->cursor != NULL && collate->cursor->next != NULL)
712 collate->cursor->next->last = elem;
713 if (collate->cursor != NULL)
714 collate->cursor->next = elem;
715 if (collate->start == NULL)
716 {
717 assert (collate->cursor == NULL);
718 collate->start = elem;
719 }
720
721 elem->section = collate->current_section;
722
723 if (collate->current_section->first == NULL)
724 collate->current_section->first = elem;
725 if (collate->current_section->last == collate->cursor)
726 collate->current_section->last = elem;
727
728 collate->cursor = elem;
729
730 elem->weights = (struct element_list_t *)
731 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
732 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
733
734 weight_cnt = 0;
735
736 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
737 do
738 {
739 if (arg->tok == tok_eof || arg->tok == tok_eol)
740 break;
741
742 if (arg->tok == tok_ignore)
743 {
744 /* The weight for this level has to be ignored. We use the
745 null pointer to indicate this. */
746 elem->weights[weight_cnt].w = (struct element_t **)
747 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
748 elem->weights[weight_cnt].w[0] = NULL;
749 elem->weights[weight_cnt].cnt = 1;
750 }
751 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
752 {
753 char ucs4str[10];
754 struct element_t *val;
755 char *symstr;
756 size_t symlen;
757
758 if (arg->tok == tok_bsymbol)
759 {
760 symstr = arg->val.str.startmb;
761 symlen = arg->val.str.lenmb;
762 }
763 else
764 {
765 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
766 symstr = ucs4str;
767 symlen = 9;
768 }
769
770 val = find_element (ldfile, collate, symstr, symlen);
771 if (val == NULL)
772 break;
773
774 elem->weights[weight_cnt].w = (struct element_t **)
775 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
776 elem->weights[weight_cnt].w[0] = val;
777 elem->weights[weight_cnt].cnt = 1;
778 }
779 else if (arg->tok == tok_string)
780 {
781 /* Split the string up in the individual characters and put
782 the element definitions in the list. */
783 const char *cp = arg->val.str.startmb;
784 int cnt = 0;
785 struct element_t *charelem;
786 struct element_t **weights = NULL;
787 int max = 0;
788
789 if (*cp == '\0')
790 {
791 lr_error (ldfile, _("%s: empty weight string not allowed"),
792 "LC_COLLATE");
793 lr_ignore_rest (ldfile, 0);
794 break;
795 }
796
797 do
798 {
799 if (*cp == '<')
800 {
801 /* Ahh, it's a bsymbol or an UCS4 value. If it's
802 the latter we have to unify the name. */
803 const char *startp = ++cp;
804 size_t len;
805
806 while (*cp != '>')
807 {
808 if (*cp == ldfile->escape_char)
809 ++cp;
810 if (*cp == '\0')
811 /* It's a syntax error. */
812 goto syntax;
813
814 ++cp;
815 }
816
817 if (cp - startp == 5 && startp[0] == 'U'
818 && isxdigit (startp[1]) && isxdigit (startp[2])
819 && isxdigit (startp[3]) && isxdigit (startp[4]))
820 {
821 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
822 char *newstr;
823
824 newstr = (char *) xmalloc (10);
825 snprintf (newstr, 10, "U%08X", ucs4);
826 startp = newstr;
827
828 len = 9;
829 }
830 else
831 len = cp - startp;
832
833 charelem = find_element (ldfile, collate, startp, len);
834 ++cp;
835 }
836 else
837 {
838 /* People really shouldn't use characters directly in
839 the string. Especially since it's not really clear
840 what this means. We interpret all characters in the
841 string as if that would be bsymbols. Otherwise we
842 would have to match back to bsymbols somehow and this
843 is normally not what people normally expect. */
844 charelem = find_element (ldfile, collate, cp++, 1);
845 }
846
847 if (charelem == NULL)
848 {
849 /* We ignore the rest of the line. */
850 lr_ignore_rest (ldfile, 0);
851 break;
852 }
853
854 /* Add the pointer. */
855 if (cnt >= max)
856 {
857 struct element_t **newp;
858 max += 10;
859 newp = (struct element_t **)
860 alloca (max * sizeof (struct element_t *));
861 memcpy (newp, weights, cnt * sizeof (struct element_t *));
862 weights = newp;
863 }
864 weights[cnt++] = charelem;
865 }
866 while (*cp != '\0');
867
868 /* Now store the information. */
869 elem->weights[weight_cnt].w = (struct element_t **)
870 obstack_alloc (&collate->mempool,
871 cnt * sizeof (struct element_t *));
872 memcpy (elem->weights[weight_cnt].w, weights,
873 cnt * sizeof (struct element_t *));
874 elem->weights[weight_cnt].cnt = cnt;
875
876 /* We don't need the string anymore. */
877 free (arg->val.str.startmb);
878 }
879 else if (ellipsis != tok_none
880 && (arg->tok == tok_ellipsis2
881 || arg->tok == tok_ellipsis3
882 || arg->tok == tok_ellipsis4))
883 {
884 /* It must be the same ellipsis as used in the initial column. */
885 if (arg->tok != ellipsis)
886 lr_error (ldfile, _("\
887 %s: weights must use the same ellipsis symbol as the name"),
888 "LC_COLLATE");
889
890 /* The weight for this level will depend on the element
891 iterating over the range. Put a placeholder. */
892 elem->weights[weight_cnt].w = (struct element_t **)
893 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
894 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
895 elem->weights[weight_cnt].cnt = 1;
896 }
897 else
898 {
899 syntax:
900 /* It's a syntax error. */
901 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
902 lr_ignore_rest (ldfile, 0);
903 break;
904 }
905
906 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
907 /* This better should be the end of the line or a semicolon. */
908 if (arg->tok == tok_semicolon)
909 /* OK, ignore this and read the next token. */
910 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
911 else if (arg->tok != tok_eof && arg->tok != tok_eol)
912 {
913 /* It's a syntax error. */
914 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
915 lr_ignore_rest (ldfile, 0);
916 break;
917 }
918 }
919 while (++weight_cnt < nrules);
920
921 if (weight_cnt < nrules)
922 {
923 /* This means the rest of the line uses the current element as
924 the weight. */
925 do
926 {
927 elem->weights[weight_cnt].w = (struct element_t **)
928 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
929 if (ellipsis == tok_none)
930 elem->weights[weight_cnt].w[0] = elem;
931 else
932 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
933 elem->weights[weight_cnt].cnt = 1;
934 }
935 while (++weight_cnt < nrules);
936 }
937 else
938 {
939 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
940 {
941 /* Too many rule values. */
942 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
943 lr_ignore_rest (ldfile, 0);
944 }
945 else
946 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
947 }
948 }
949
950
951 static int
insert_value(struct linereader * ldfile,const char * symstr,size_t symlen,const struct charmap_t * charmap,struct repertoire_t * repertoire,struct localedef_t * result)952 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
953 const struct charmap_t *charmap, struct repertoire_t *repertoire,
954 struct localedef_t *result)
955 {
956 /* First find out what kind of symbol this is. */
957 struct charseq *seq;
958 uint32_t wc;
959 struct element_t *elem = NULL;
960 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
961
962 /* Try to find the character in the charmap. */
963 seq = charmap_find_value (charmap, symstr, symlen);
964
965 /* Determine the wide character. */
966 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
967 {
968 wc = repertoire_find_value (repertoire, symstr, symlen);
969 if (seq != NULL)
970 seq->ucs4 = wc;
971 }
972 else
973 wc = seq->ucs4;
974
975 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
976 {
977 /* It's no character, so look through the collation elements and
978 symbol list. */
979 void *ptr = elem;
980 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) != 0)
981 {
982 void *result;
983 struct symbol_t *sym = NULL;
984
985 /* It's also collation element. Therefore it's either a
986 collating symbol or it's a character which is not
987 supported by the character set. In the later case we
988 simply create a dummy entry. */
989 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
990 {
991 /* It's a collation symbol. */
992 sym = (struct symbol_t *) result;
993
994 elem = sym->order;
995 }
996
997 if (elem == NULL)
998 {
999 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
1000
1001 if (sym != NULL)
1002 sym->order = elem;
1003 else
1004 /* Enter a fake element in the sequence table. This
1005 won't cause anything in the output since there is
1006 no multibyte or wide character associated with
1007 it. */
1008 insert_entry (&collate->seq_table, symstr, symlen, elem);
1009 }
1010 }
1011 else
1012 /* Copy the result back. */
1013 elem = ptr;
1014 }
1015 else
1016 {
1017 /* Otherwise the symbols stands for a character. */
1018 void *ptr = elem;
1019 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) != 0)
1020 {
1021 uint32_t wcs[2] = { wc, 0 };
1022
1023 /* We have to allocate an entry. */
1024 elem = new_element (collate,
1025 seq != NULL ? (char *) seq->bytes : NULL,
1026 seq != NULL ? seq->nbytes : 0,
1027 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
1028 symstr, symlen, 1);
1029
1030 /* And add it to the table. */
1031 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
1032 /* This cannot happen. */
1033 assert (! "Internal error");
1034 }
1035 else
1036 {
1037 /* Copy the result back. */
1038 elem = ptr;
1039
1040 /* Maybe the character was used before the definition. In this case
1041 we have to insert the byte sequences now. */
1042 if (elem->mbs == NULL && seq != NULL)
1043 {
1044 elem->mbs = obstack_copy0 (&collate->mempool,
1045 seq->bytes, seq->nbytes);
1046 elem->nmbs = seq->nbytes;
1047 }
1048
1049 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1050 {
1051 uint32_t wcs[2] = { wc, 0 };
1052
1053 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1054 elem->nwcs = 1;
1055 }
1056 }
1057 }
1058
1059 /* Test whether this element is not already in the list. */
1060 if (elem->next != NULL || elem == collate->cursor)
1061 {
1062 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1063 (int) symlen, symstr, elem->file, elem->line);
1064 lr_ignore_rest (ldfile, 0);
1065 return 1;
1066 }
1067
1068 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1069
1070 return 0;
1071 }
1072
1073
1074 static void
handle_ellipsis(struct linereader * ldfile,const char * symstr,size_t symlen,enum token_t ellipsis,const struct charmap_t * charmap,struct repertoire_t * repertoire,struct localedef_t * result)1075 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1076 enum token_t ellipsis, const struct charmap_t *charmap,
1077 struct repertoire_t *repertoire,
1078 struct localedef_t *result)
1079 {
1080 struct element_t *startp;
1081 struct element_t *endp;
1082 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1083
1084 /* Unlink the entry added for the ellipsis. */
1085 unlink_element (collate);
1086 startp = collate->cursor;
1087
1088 /* Process and add the end-entry. */
1089 if (symstr != NULL
1090 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1091 /* Something went wrong with inserting the to-value. This means
1092 we cannot process the ellipsis. */
1093 return;
1094
1095 /* Reset the cursor. */
1096 collate->cursor = startp;
1097
1098 /* Now we have to handle many different situations:
1099 - we have to distinguish between the three different ellipsis forms
1100 - the is the ellipsis at the beginning, in the middle, or at the end.
1101 */
1102 endp = collate->cursor->next;
1103 assert (symstr == NULL || endp != NULL);
1104
1105 /* XXX The following is probably very wrong since also collating symbols
1106 can appear in ranges. But do we want/can refine the test for that? */
1107 #if 0
1108 /* Both, the start and the end symbol, must stand for characters. */
1109 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1110 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1111 {
1112 lr_error (ldfile, _("\
1113 %s: the start and the end symbol of a range must stand for characters"),
1114 "LC_COLLATE");
1115 return;
1116 }
1117 #endif
1118
1119 if (ellipsis == tok_ellipsis3)
1120 {
1121 /* One requirement we make here: the length of the byte
1122 sequences for the first and end character must be the same.
1123 This is mainly to prevent unwanted effects and this is often
1124 not what is wanted. */
1125 size_t len = (startp->mbs != NULL ? startp->nmbs
1126 : (endp->mbs != NULL ? endp->nmbs : 0));
1127 char mbcnt[len + 1];
1128 char mbend[len + 1];
1129
1130 /* Well, this should be caught somewhere else already. Just to
1131 make sure. */
1132 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1133 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1134
1135 if (startp != NULL && endp != NULL
1136 && startp->mbs != NULL && endp->mbs != NULL
1137 && startp->nmbs != endp->nmbs)
1138 {
1139 lr_error (ldfile, _("\
1140 %s: byte sequences of first and last character must have the same length"),
1141 "LC_COLLATE");
1142 return;
1143 }
1144
1145 /* Determine whether we have to generate multibyte sequences. */
1146 if ((startp == NULL || startp->mbs != NULL)
1147 && (endp == NULL || endp->mbs != NULL))
1148 {
1149 int cnt;
1150 int ret;
1151
1152 /* Prepare the beginning byte sequence. This is either from the
1153 beginning byte sequence or it is all nulls if it was an
1154 initial ellipsis. */
1155 if (startp == NULL || startp->mbs == NULL)
1156 memset (mbcnt, '\0', len);
1157 else
1158 {
1159 memcpy (mbcnt, startp->mbs, len);
1160
1161 /* And increment it so that the value is the first one we will
1162 try to insert. */
1163 for (cnt = len - 1; cnt >= 0; --cnt)
1164 if (++mbcnt[cnt] != '\0')
1165 break;
1166 }
1167 mbcnt[len] = '\0';
1168
1169 /* And the end sequence. */
1170 if (endp == NULL || endp->mbs == NULL)
1171 memset (mbend, '\0', len);
1172 else
1173 memcpy (mbend, endp->mbs, len);
1174 mbend[len] = '\0';
1175
1176 /* Test whether we have a correct range. */
1177 ret = memcmp (mbcnt, mbend, len);
1178 if (ret >= 0)
1179 {
1180 if (ret > 0)
1181 lr_error (ldfile, _("%s: byte sequence of first character of \
1182 range is not lower than that of the last character"), "LC_COLLATE");
1183 return;
1184 }
1185
1186 /* Generate the byte sequences data. */
1187 while (1)
1188 {
1189 struct charseq *seq;
1190
1191 /* Quite a bit of work ahead. We have to find the character
1192 definition for the byte sequence and then determine the
1193 wide character belonging to it. */
1194 seq = charmap_find_symbol (charmap, mbcnt, len);
1195 if (seq != NULL)
1196 {
1197 struct element_t *elem;
1198 size_t namelen;
1199
1200 /* I don't think this can ever happen. */
1201 assert (seq->name != NULL);
1202 namelen = strlen (seq->name);
1203
1204 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1205 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1206 namelen);
1207
1208 /* Now we are ready to insert the new value in the
1209 sequence. Find out whether the element is
1210 already known. */
1211 void *ptr;
1212 if (find_entry (&collate->seq_table, seq->name, namelen,
1213 &ptr) != 0)
1214 {
1215 uint32_t wcs[2] = { seq->ucs4, 0 };
1216
1217 /* We have to allocate an entry. */
1218 elem = new_element (collate, mbcnt, len,
1219 seq->ucs4 == ILLEGAL_CHAR_VALUE
1220 ? NULL : wcs, seq->name,
1221 namelen, 1);
1222
1223 /* And add it to the table. */
1224 if (insert_entry (&collate->seq_table, seq->name,
1225 namelen, elem) != 0)
1226 /* This cannot happen. */
1227 assert (! "Internal error");
1228 }
1229 else
1230 /* Copy the result. */
1231 elem = ptr;
1232
1233 /* Test whether this element is not already in the list. */
1234 if (elem->next != NULL || (collate->cursor != NULL
1235 && elem->next == collate->cursor))
1236 {
1237 lr_error (ldfile, _("\
1238 order for `%.*s' already defined at %s:%Zu"),
1239 (int) namelen, seq->name,
1240 elem->file, elem->line);
1241 goto increment;
1242 }
1243
1244 /* Enqueue the new element. */
1245 elem->last = collate->cursor;
1246 if (collate->cursor == NULL)
1247 elem->next = NULL;
1248 else
1249 {
1250 elem->next = collate->cursor->next;
1251 elem->last->next = elem;
1252 if (elem->next != NULL)
1253 elem->next->last = elem;
1254 }
1255 if (collate->start == NULL)
1256 {
1257 assert (collate->cursor == NULL);
1258 collate->start = elem;
1259 }
1260 collate->cursor = elem;
1261
1262 /* Add the weight value. We take them from the
1263 `ellipsis_weights' member of `collate'. */
1264 elem->weights = (struct element_list_t *)
1265 obstack_alloc (&collate->mempool,
1266 nrules * sizeof (struct element_list_t));
1267 for (cnt = 0; cnt < nrules; ++cnt)
1268 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1269 && (collate->ellipsis_weight.weights[cnt].w[0]
1270 == ELEMENT_ELLIPSIS2))
1271 {
1272 elem->weights[cnt].w = (struct element_t **)
1273 obstack_alloc (&collate->mempool,
1274 sizeof (struct element_t *));
1275 elem->weights[cnt].w[0] = elem;
1276 elem->weights[cnt].cnt = 1;
1277 }
1278 else
1279 {
1280 /* Simply use the weight from `ellipsis_weight'. */
1281 elem->weights[cnt].w =
1282 collate->ellipsis_weight.weights[cnt].w;
1283 elem->weights[cnt].cnt =
1284 collate->ellipsis_weight.weights[cnt].cnt;
1285 }
1286 }
1287
1288 /* Increment for the next round. */
1289 increment:
1290 for (cnt = len - 1; cnt >= 0; --cnt)
1291 if (++mbcnt[cnt] != '\0')
1292 break;
1293
1294 /* Find out whether this was all. */
1295 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1296 /* Yep, that's all. */
1297 break;
1298 }
1299 }
1300 }
1301 else
1302 {
1303 /* For symbolic range we naturally must have a beginning and an
1304 end specified by the user. */
1305 if (startp == NULL)
1306 lr_error (ldfile, _("\
1307 %s: symbolic range ellipsis must not directly follow `order_start'"),
1308 "LC_COLLATE");
1309 else if (endp == NULL)
1310 lr_error (ldfile, _("\
1311 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1312 "LC_COLLATE");
1313 else
1314 {
1315 /* Determine the range. To do so we have to determine the
1316 common prefix of the both names and then the numeric
1317 values of both ends. */
1318 size_t lenfrom = strlen (startp->name);
1319 size_t lento = strlen (endp->name);
1320 char buf[lento + 1];
1321 int preflen = 0;
1322 long int from;
1323 long int to;
1324 char *cp;
1325 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1326
1327 if (lenfrom != lento)
1328 {
1329 invalid_range:
1330 lr_error (ldfile, _("\
1331 `%s' and `%.*s' are not valid names for symbolic range"),
1332 startp->name, (int) lento, endp->name);
1333 return;
1334 }
1335
1336 while (startp->name[preflen] == endp->name[preflen])
1337 if (startp->name[preflen] == '\0')
1338 /* Nothing to be done. The start and end point are identical
1339 and while inserting the end point we have already given
1340 the user an error message. */
1341 return;
1342 else
1343 ++preflen;
1344
1345 errno = 0;
1346 from = strtol (startp->name + preflen, &cp, base);
1347 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1348 goto invalid_range;
1349
1350 errno = 0;
1351 to = strtol (endp->name + preflen, &cp, base);
1352 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1353 goto invalid_range;
1354
1355 /* Copy the prefix. */
1356 memcpy (buf, startp->name, preflen);
1357
1358 /* Loop over all values. */
1359 for (++from; from < to; ++from)
1360 {
1361 struct element_t *elem = NULL;
1362 struct charseq *seq;
1363 uint32_t wc;
1364 int cnt;
1365
1366 /* Generate the name. */
1367 sprintf (buf + preflen, base == 10 ? "%0*ld" : "%0*lX",
1368 (int) (lenfrom - preflen), from);
1369
1370 /* Look whether this name is already defined. */
1371 void *ptr;
1372 if (find_entry (&collate->seq_table, buf, symlen, &ptr) == 0)
1373 {
1374 /* Copy back the result. */
1375 elem = ptr;
1376
1377 if (elem->next != NULL || (collate->cursor != NULL
1378 && elem->next == collate->cursor))
1379 {
1380 lr_error (ldfile, _("\
1381 %s: order for `%.*s' already defined at %s:%Zu"),
1382 "LC_COLLATE", (int) lenfrom, buf,
1383 elem->file, elem->line);
1384 continue;
1385 }
1386
1387 if (elem->name == NULL)
1388 {
1389 lr_error (ldfile, _("%s: `%s' must be a character"),
1390 "LC_COLLATE", buf);
1391 continue;
1392 }
1393 }
1394
1395 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1396 {
1397 /* Search for a character of this name. */
1398 seq = charmap_find_value (charmap, buf, lenfrom);
1399 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1400 {
1401 wc = repertoire_find_value (repertoire, buf, lenfrom);
1402
1403 if (seq != NULL)
1404 seq->ucs4 = wc;
1405 }
1406 else
1407 wc = seq->ucs4;
1408
1409 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1410 /* We don't know anything about a character with this
1411 name. XXX Should we warn? */
1412 continue;
1413
1414 if (elem == NULL)
1415 {
1416 uint32_t wcs[2] = { wc, 0 };
1417
1418 /* We have to allocate an entry. */
1419 elem = new_element (collate,
1420 seq != NULL
1421 ? (char *) seq->bytes : NULL,
1422 seq != NULL ? seq->nbytes : 0,
1423 wc == ILLEGAL_CHAR_VALUE
1424 ? NULL : wcs, buf, lenfrom, 1);
1425 }
1426 else
1427 {
1428 /* Update the element. */
1429 if (seq != NULL)
1430 {
1431 elem->mbs = obstack_copy0 (&collate->mempool,
1432 seq->bytes, seq->nbytes);
1433 elem->nmbs = seq->nbytes;
1434 }
1435
1436 if (wc != ILLEGAL_CHAR_VALUE)
1437 {
1438 uint32_t zero = 0;
1439
1440 obstack_grow (&collate->mempool,
1441 &wc, sizeof (uint32_t));
1442 obstack_grow (&collate->mempool,
1443 &zero, sizeof (uint32_t));
1444 elem->wcs = obstack_finish (&collate->mempool);
1445 elem->nwcs = 1;
1446 }
1447 }
1448
1449 elem->file = ldfile->fname;
1450 elem->line = ldfile->lineno;
1451 elem->section = collate->current_section;
1452 }
1453
1454 /* Enqueue the new element. */
1455 elem->last = collate->cursor;
1456 elem->next = collate->cursor->next;
1457 elem->last->next = elem;
1458 if (elem->next != NULL)
1459 elem->next->last = elem;
1460 collate->cursor = elem;
1461
1462 /* Now add the weights. They come from the `ellipsis_weights'
1463 member of `collate'. */
1464 elem->weights = (struct element_list_t *)
1465 obstack_alloc (&collate->mempool,
1466 nrules * sizeof (struct element_list_t));
1467 for (cnt = 0; cnt < nrules; ++cnt)
1468 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1469 && (collate->ellipsis_weight.weights[cnt].w[0]
1470 == ELEMENT_ELLIPSIS2))
1471 {
1472 elem->weights[cnt].w = (struct element_t **)
1473 obstack_alloc (&collate->mempool,
1474 sizeof (struct element_t *));
1475 elem->weights[cnt].w[0] = elem;
1476 elem->weights[cnt].cnt = 1;
1477 }
1478 else
1479 {
1480 /* Simly use the weight from `ellipsis_weight'. */
1481 elem->weights[cnt].w =
1482 collate->ellipsis_weight.weights[cnt].w;
1483 elem->weights[cnt].cnt =
1484 collate->ellipsis_weight.weights[cnt].cnt;
1485 }
1486 }
1487 }
1488 }
1489 /* Move the cursor to the last entry in the ellipsis.
1490 Subsequent operations need to start from the last entry. */
1491 collate->cursor = endp;
1492 }
1493
1494
1495 static void
collate_startup(struct linereader * ldfile,struct localedef_t * locale,struct localedef_t * copy_locale,int ignore_content)1496 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1497 struct localedef_t *copy_locale, int ignore_content)
1498 {
1499 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1500 {
1501 struct locale_collate_t *collate;
1502
1503 if (copy_locale == NULL)
1504 {
1505 collate = locale->categories[LC_COLLATE].collate =
1506 (struct locale_collate_t *)
1507 xcalloc (1, sizeof (struct locale_collate_t));
1508
1509 /* Init the various data structures. */
1510 init_hash (&collate->elem_table, 100);
1511 init_hash (&collate->sym_table, 100);
1512 init_hash (&collate->seq_table, 500);
1513 obstack_init (&collate->mempool);
1514
1515 collate->col_weight_max = -1;
1516 collate->codepoint_collation = false;
1517 }
1518 else
1519 /* Reuse the copy_locale's data structures. */
1520 collate = locale->categories[LC_COLLATE].collate =
1521 copy_locale->categories[LC_COLLATE].collate;
1522 }
1523
1524 ldfile->translate_strings = 0;
1525 ldfile->return_widestr = 0;
1526 }
1527
1528
1529 void
collate_finish(struct localedef_t * locale,const struct charmap_t * charmap)1530 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1531 {
1532 /* Now is the time when we can assign the individual collation
1533 values for all the symbols. We have possibly different values
1534 for the wide- and the multibyte-character symbols. This is done
1535 since it might make a difference in the encoding if there is in
1536 some cases no multibyte-character but there are wide-characters.
1537 (The other way around it is not important since theencoded
1538 collation value in the wide-character case is 32 bits wide and
1539 therefore requires no encoding).
1540
1541 The lowest collation value assigned is 2. Zero is reserved for
1542 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1543 functions and 1 is used to separate the individual passes for the
1544 different rules.
1545
1546 We also have to construct is list with all the bytes/words which
1547 can come first in a sequence, followed by all the elements which
1548 also start with this byte/word. The order is reverse which has
1549 among others the important effect that longer strings are located
1550 first in the list. This is required for the output data since
1551 the algorithm used in `strcoll' etc depends on this.
1552
1553 The multibyte case is easy. We simply sort into an array with
1554 256 elements. */
1555 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1556 int mbact[nrules];
1557 int wcact;
1558 int mbseqact;
1559 int wcseqact;
1560 struct element_t *runp;
1561 int i;
1562 int need_undefined = 0;
1563 struct section_list *sect;
1564 int ruleidx;
1565
1566 if (collate == NULL)
1567 {
1568 /* No data, no check. Issue a warning. */
1569 record_warning (_("No definition for %s category found"),
1570 "LC_COLLATE");
1571 return;
1572 }
1573
1574 /* No data required. */
1575 if (collate->codepoint_collation)
1576 return;
1577
1578 /* If this assertion is hit change the type in `element_t'. */
1579 assert (nrules <= sizeof (runp->used_in_level) * 8);
1580
1581 /* Make sure that the `position' rule is used either in all sections
1582 or in none. */
1583 for (i = 0; i < nrules; ++i)
1584 for (sect = collate->sections; sect != NULL; sect = sect->next)
1585 if (sect != collate->current_section
1586 && sect->rules != NULL
1587 && ((sect->rules[i] & sort_position)
1588 != (collate->current_section->rules[i] & sort_position)))
1589 {
1590 record_error (0, 0, _("\
1591 %s: `position' must be used for a specific level in all sections or none"),
1592 "LC_COLLATE");
1593 break;
1594 }
1595
1596 /* Find out which elements are used at which level. At the same
1597 time we find out whether we have any undefined symbols. */
1598 runp = collate->start;
1599 while (runp != NULL)
1600 {
1601 if (runp->mbs != NULL)
1602 {
1603 for (i = 0; i < nrules; ++i)
1604 {
1605 int j;
1606
1607 for (j = 0; j < runp->weights[i].cnt; ++j)
1608 /* A NULL pointer as the weight means IGNORE. */
1609 if (runp->weights[i].w[j] != NULL)
1610 {
1611 if (runp->weights[i].w[j]->weights == NULL)
1612 {
1613 record_error_at_line (0, 0, runp->file, runp->line,
1614 _("symbol `%s' not defined"),
1615 runp->weights[i].w[j]->name);
1616
1617 need_undefined = 1;
1618 runp->weights[i].w[j] = &collate->undefined;
1619 }
1620 else
1621 /* Set the bit for the level. */
1622 runp->weights[i].w[j]->used_in_level |= 1 << i;
1623 }
1624 }
1625 }
1626
1627 /* Up to the next entry. */
1628 runp = runp->next;
1629 }
1630
1631 /* Walk through the list of defined sequences and assign weights. Also
1632 create the data structure which will allow generating the single byte
1633 character based tables.
1634
1635 Since at each time only the weights for each of the rules are
1636 only compared to other weights for this rule it is possible to
1637 assign more compact weight values than simply counting all
1638 weights in sequence. We can assign weights from 3, one for each
1639 rule individually and only for those elements, which are actually
1640 used for this rule.
1641
1642 Why is this important? It is not for the wide char table. But
1643 it is for the singlebyte output since here larger numbers have to
1644 be encoded to make it possible to emit the value as a byte
1645 string. */
1646 for (i = 0; i < nrules; ++i)
1647 mbact[i] = 2;
1648 wcact = 2;
1649 mbseqact = 0;
1650 wcseqact = 0;
1651 runp = collate->start;
1652 while (runp != NULL)
1653 {
1654 /* Determine the order. */
1655 if (runp->used_in_level != 0)
1656 {
1657 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1658 nrules * sizeof (int));
1659
1660 for (i = 0; i < nrules; ++i)
1661 if ((runp->used_in_level & (1 << i)) != 0)
1662 runp->mborder[i] = mbact[i]++;
1663 else
1664 runp->mborder[i] = 0;
1665 }
1666
1667 if (runp->mbs != NULL)
1668 {
1669 struct element_t **eptr;
1670 struct element_t *lastp = NULL;
1671
1672 /* Find the point where to insert in the list. */
1673 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1674 while (*eptr != NULL)
1675 {
1676 if ((*eptr)->nmbs < runp->nmbs)
1677 break;
1678
1679 if ((*eptr)->nmbs == runp->nmbs)
1680 {
1681 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1682
1683 if (c == 0)
1684 {
1685 /* This should not happen. It means that we have
1686 to symbols with the same byte sequence. It is
1687 of course an error. */
1688 record_error_at_line (0, 0, (*eptr)->file,
1689 (*eptr)->line,
1690 _("\
1691 symbol `%s' has the same encoding as"), (*eptr)->name);
1692
1693 record_error_at_line (0, 0, runp->file, runp->line,
1694 _("symbol `%s'"), runp->name);
1695 goto dont_insert;
1696 }
1697 else if (c < 0)
1698 /* Insert it here. */
1699 break;
1700 }
1701
1702 /* To the next entry. */
1703 lastp = *eptr;
1704 eptr = &(*eptr)->mbnext;
1705 }
1706
1707 /* Set the pointers. */
1708 runp->mbnext = *eptr;
1709 runp->mblast = lastp;
1710 if (*eptr != NULL)
1711 (*eptr)->mblast = runp;
1712 *eptr = runp;
1713 dont_insert:
1714 ;
1715 }
1716
1717 if (runp->used_in_level)
1718 runp->wcorder = wcact++;
1719
1720 if (runp->is_character)
1721 {
1722 if (runp->nmbs == 1)
1723 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1724
1725 runp->wcseqorder = wcseqact++;
1726 }
1727 else if (runp->mbs != NULL && runp->weights != NULL)
1728 /* This is for collation elements. */
1729 runp->wcseqorder = wcseqact++;
1730
1731 /* Up to the next entry. */
1732 runp = runp->next;
1733 }
1734
1735 /* Find out whether any of the `mbheads' entries is unset. In this
1736 case we use the UNDEFINED entry. */
1737 for (i = 1; i < 256; ++i)
1738 if (collate->mbheads[i] == NULL)
1739 {
1740 need_undefined = 1;
1741 collate->mbheads[i] = &collate->undefined;
1742 }
1743
1744 /* Now to the wide character case. */
1745 collate->wcheads.p = 6;
1746 collate->wcheads.q = 10;
1747 wchead_table_init (&collate->wcheads);
1748
1749 collate->wcseqorder.p = 6;
1750 collate->wcseqorder.q = 10;
1751 collseq_table_init (&collate->wcseqorder);
1752
1753 /* Start adding. */
1754 runp = collate->start;
1755 while (runp != NULL)
1756 {
1757 if (runp->wcs != NULL)
1758 {
1759 struct element_t *e;
1760 struct element_t **eptr;
1761 struct element_t *lastp;
1762
1763 /* Insert the collation sequence value. */
1764 if (runp->is_character)
1765 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1766 runp->wcseqorder);
1767
1768 /* Find the point where to insert in the list. */
1769 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1770 eptr = &e;
1771 lastp = NULL;
1772 while (*eptr != NULL)
1773 {
1774 if ((*eptr)->nwcs < runp->nwcs)
1775 break;
1776
1777 if ((*eptr)->nwcs == runp->nwcs)
1778 {
1779 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1780 (wchar_t *) runp->wcs, runp->nwcs);
1781
1782 if (c == 0)
1783 {
1784 /* This should not happen. It means that we have
1785 two symbols with the same byte sequence. It is
1786 of course an error. */
1787 record_error_at_line (0, 0, (*eptr)->file,
1788 (*eptr)->line,
1789 _("\
1790 symbol `%s' has the same encoding as"), (*eptr)->name);
1791
1792 record_error_at_line (0, 0, runp->file, runp->line,
1793 _("symbol `%s'"), runp->name);
1794 goto dont_insertwc;
1795 }
1796 else if (c < 0)
1797 /* Insert it here. */
1798 break;
1799 }
1800
1801 /* To the next entry. */
1802 lastp = *eptr;
1803 eptr = &(*eptr)->wcnext;
1804 }
1805
1806 /* Set the pointers. */
1807 runp->wcnext = *eptr;
1808 runp->wclast = lastp;
1809 if (*eptr != NULL)
1810 (*eptr)->wclast = runp;
1811 *eptr = runp;
1812 if (eptr == &e)
1813 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1814 dont_insertwc:
1815 ;
1816 }
1817
1818 /* Up to the next entry. */
1819 runp = runp->next;
1820 }
1821
1822 /* Now determine whether the UNDEFINED entry is needed and if yes,
1823 whether it was defined. */
1824 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1825 if (collate->undefined.file == NULL)
1826 {
1827 if (need_undefined)
1828 {
1829 /* This seems not to be enforced by recent standards. Don't
1830 emit an error, simply append UNDEFINED at the end. */
1831 collate->undefined.mborder =
1832 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1833
1834 for (i = 0; i < nrules; ++i)
1835 collate->undefined.mborder[i] = mbact[i]++;
1836 }
1837
1838 /* In any case we will need the definition for the wide character
1839 case. But we will not complain that it is missing since the
1840 specification strangely enough does not seem to account for
1841 this. */
1842 collate->undefined.wcorder = wcact++;
1843 }
1844
1845 /* Finally, try to unify the rules for the sections. Whenever the rules
1846 for a section are the same as those for another section give the
1847 ruleset the same index. Since there are never many section we can
1848 use an O(n^2) algorithm here. */
1849 sect = collate->sections;
1850 while (sect != NULL && sect->rules == NULL)
1851 sect = sect->next;
1852
1853 /* Bail out if we have no sections because of earlier errors. */
1854 if (sect == NULL)
1855 {
1856 record_error (EXIT_FAILURE, 0, _("too many errors; giving up"));
1857 return;
1858 }
1859
1860 ruleidx = 0;
1861 do
1862 {
1863 struct section_list *osect = collate->sections;
1864
1865 while (osect != sect)
1866 if (osect->rules != NULL
1867 && memcmp (osect->rules, sect->rules,
1868 nrules * sizeof (osect->rules[0])) == 0)
1869 break;
1870 else
1871 osect = osect->next;
1872
1873 if (osect == sect)
1874 sect->ruleidx = ruleidx++;
1875 else
1876 sect->ruleidx = osect->ruleidx;
1877
1878 /* Next section. */
1879 do
1880 sect = sect->next;
1881 while (sect != NULL && sect->rules == NULL);
1882 }
1883 while (sect != NULL);
1884 /* We are currently not prepared for more than 128 rulesets. But this
1885 should never really be a problem. */
1886 assert (ruleidx <= 128);
1887 }
1888
1889
1890 static int32_t
output_weight(struct obstack * pool,struct locale_collate_t * collate,struct element_t * elem)1891 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1892 struct element_t *elem)
1893 {
1894 size_t cnt;
1895 int32_t retval;
1896
1897 /* Optimize the use of UNDEFINED. */
1898 if (elem == &collate->undefined)
1899 /* The weights are already inserted. */
1900 return 0;
1901
1902 /* This byte can start exactly one collation element and this is
1903 a single byte. We can directly give the index to the weights. */
1904 retval = obstack_object_size (pool);
1905
1906 /* Construct the weight. */
1907 for (cnt = 0; cnt < nrules; ++cnt)
1908 {
1909 char buf[elem->weights[cnt].cnt * 7];
1910 int len = 0;
1911 int i;
1912
1913 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1914 /* Encode the weight value. We do nothing for IGNORE entries. */
1915 if (elem->weights[cnt].w[i] != NULL)
1916 len += utf8_encode (&buf[len],
1917 elem->weights[cnt].w[i]->mborder[cnt]);
1918
1919 /* And add the buffer content. */
1920 obstack_1grow (pool, len);
1921 obstack_grow (pool, buf, len);
1922 }
1923
1924 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1925 }
1926
1927
1928 static int32_t
output_weightwc(struct obstack * pool,struct locale_collate_t * collate,struct element_t * elem)1929 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1930 struct element_t *elem)
1931 {
1932 size_t cnt;
1933 int32_t retval;
1934
1935 /* Optimize the use of UNDEFINED. */
1936 if (elem == &collate->undefined)
1937 /* The weights are already inserted. */
1938 return 0;
1939
1940 /* This byte can start exactly one collation element and this is
1941 a single byte. We can directly give the index to the weights. */
1942 retval = obstack_object_size (pool) / sizeof (int32_t);
1943
1944 /* Construct the weight. */
1945 for (cnt = 0; cnt < nrules; ++cnt)
1946 {
1947 int32_t buf[elem->weights[cnt].cnt];
1948 int i;
1949 int32_t j;
1950
1951 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1952 if (elem->weights[cnt].w[i] != NULL)
1953 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1954
1955 /* And add the buffer content. */
1956 obstack_int32_grow (pool, j);
1957
1958 obstack_grow (pool, buf, j * sizeof (int32_t));
1959 maybe_swap_uint32_obstack (pool, j);
1960 }
1961
1962 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1963 }
1964
/* Context read by add_to_tablewc, which only receives the character and
   the element list as arguments.
   If localedef is ever threaded, this would need to be a __thread var.  */
static struct
{
  /* Pool handed to output_weightwc to receive the weight data.  */
  struct obstack *weightpool;
  /* Pool receiving the entries for characters which start more than one
     element or a multi-character element.  */
  struct obstack *extrapool;
  /* Pool receiving the weight indices of runs of consecutive entries.  */
  struct obstack *indpool;
  /* Collation data handed to output_weightwc.  */
  struct locale_collate_t *collate;
  /* Table into which the index for each character is entered.  */
  struct collidx_table *tablewc;
} atwc;

/* Enter the index for character CH, whose element list starts at RUNP,
   into the table named by `atwc'.  */
static void add_to_tablewc (uint32_t ch, struct element_t *runp);
1976
1977 static void
add_to_tablewc(uint32_t ch,struct element_t * runp)1978 add_to_tablewc (uint32_t ch, struct element_t *runp)
1979 {
1980 if (runp->wcnext == NULL && runp->nwcs == 1)
1981 {
1982 int32_t weigthidx = output_weightwc (atwc.weightpool, atwc.collate,
1983 runp);
1984 collidx_table_add (atwc.tablewc, ch, weigthidx);
1985 }
1986 else
1987 {
1988 /* As for the singlebyte table, we recognize sequences and
1989 compress them. */
1990
1991 collidx_table_add (atwc.tablewc, ch,
1992 -(obstack_object_size (atwc.extrapool)
1993 / sizeof (uint32_t)));
1994
1995 do
1996 {
1997 /* Store the current index in the weight table. We know that
1998 the current position in the `extrapool' is aligned on a
1999 32-bit address. */
2000 int32_t weightidx;
2001 int added;
2002
2003 /* Find out wether this is a single entry or we have more than
2004 one consecutive entry. */
2005 if (runp->wcnext != NULL
2006 && runp->nwcs == runp->wcnext->nwcs
2007 && wmemcmp ((wchar_t *) runp->wcs,
2008 (wchar_t *)runp->wcnext->wcs,
2009 runp->nwcs - 1) == 0
2010 && (runp->wcs[runp->nwcs - 1]
2011 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
2012 {
2013 int i;
2014 struct element_t *series_startp = runp;
2015 struct element_t *curp;
2016
2017 /* Now add first the initial byte sequence. */
2018 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
2019 if (sizeof (int32_t) == sizeof (int))
2020 obstack_make_room (atwc.extrapool, added);
2021
2022 /* More than one consecutive entry. We mark this by having
2023 a negative index into the indirect table. */
2024 obstack_int32_grow_fast (atwc.extrapool,
2025 -(obstack_object_size (atwc.indpool)
2026 / sizeof (int32_t)));
2027 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2028
2029 do
2030 runp = runp->wcnext;
2031 while (runp->wcnext != NULL
2032 && runp->nwcs == runp->wcnext->nwcs
2033 && wmemcmp ((wchar_t *) runp->wcs,
2034 (wchar_t *)runp->wcnext->wcs,
2035 runp->nwcs - 1) == 0
2036 && (runp->wcs[runp->nwcs - 1]
2037 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2038
2039 /* Now walk backward from here to the beginning. */
2040 curp = runp;
2041
2042 for (i = 1; i < runp->nwcs; ++i)
2043 obstack_int32_grow_fast (atwc.extrapool, curp->wcs[i]);
2044
2045 /* Now find the end of the consecutive sequence and
2046 add all the indices in the indirect pool. */
2047 do
2048 {
2049 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2050 curp);
2051 obstack_int32_grow (atwc.indpool, weightidx);
2052
2053 curp = curp->wclast;
2054 }
2055 while (curp != series_startp);
2056
2057 /* Add the final weight. */
2058 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2059 curp);
2060 obstack_int32_grow (atwc.indpool, weightidx);
2061
2062 /* And add the end byte sequence. Without length this
2063 time. */
2064 for (i = 1; i < curp->nwcs; ++i)
2065 obstack_int32_grow (atwc.extrapool, curp->wcs[i]);
2066 }
2067 else
2068 {
2069 /* A single entry. Simply add the index and the length and
2070 string (except for the first character which is already
2071 tested for). */
2072 int i;
2073
2074 /* Output the weight info. */
2075 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2076 runp);
2077
2078 assert (runp->nwcs > 0);
2079 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2080 if (sizeof (int) == sizeof (int32_t))
2081 obstack_make_room (atwc.extrapool, added);
2082
2083 obstack_int32_grow_fast (atwc.extrapool, weightidx);
2084 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2085 for (i = 1; i < runp->nwcs; ++i)
2086 obstack_int32_grow_fast (atwc.extrapool, runp->wcs[i]);
2087 }
2088
2089 /* Next entry. */
2090 runp = runp->wcnext;
2091 }
2092 while (runp != NULL);
2093 }
2094 }
2095
2096 /* Include the C locale identity tables for _NL_COLLATE_COLLSEQMB and
2097 _NL_COLLATE_COLLSEQWC. */
2098 #include "C-collate-seq.c"
2099
2100 void
collate_output(struct localedef_t * locale,const struct charmap_t * charmap,const char * output_path)2101 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
2102 const char *output_path)
2103 {
2104 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
2105 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
2106 struct locale_file file;
2107 size_t ch;
2108 int32_t tablemb[256];
2109 struct obstack weightpool;
2110 struct obstack extrapool;
2111 struct obstack indirectpool;
2112 struct section_list *sect;
2113 struct collidx_table tablewc;
2114 uint32_t elem_size;
2115 uint32_t *elem_table;
2116 int i;
2117 struct element_t *runp;
2118
2119 init_locale_data (&file, nelems);
2120 add_locale_uint32 (&file, nrules);
2121
2122 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
2123 if (collate == NULL || collate->codepoint_collation)
2124 {
2125 size_t idx;
2126 for (idx = 1; idx < nelems; idx++)
2127 {
2128 /* The words have to be handled specially. */
2129 if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
2130 add_locale_uint32 (&file, 0);
2131 else if (idx == _NL_ITEM_INDEX (_NL_COLLATE_CODESET)
2132 && collate != NULL)
2133 /* A valid LC_COLLATE must have a code set name. */
2134 add_locale_string (&file, charmap->code_set_name);
2135 else if (idx == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQMB)
2136 && collate != NULL)
2137 add_locale_raw_data (&file, collseqmb, sizeof (collseqmb));
2138 else if (idx == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQWC)
2139 && collate != NULL)
2140 add_locale_uint32_array (&file, collseqwc,
2141 array_length (collseqwc));
2142 else
2143 add_locale_empty (&file);
2144 }
2145 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
2146 return;
2147 }
2148
2149 obstack_init (&weightpool);
2150 obstack_init (&extrapool);
2151 obstack_init (&indirectpool);
2152
2153 /* Since we are using the sign of an integer to mark indirection the
2154 offsets in the arrays we are indirectly referring to must not be
2155 zero since -0 == 0. Therefore we add a bit of dummy content. */
2156 obstack_int32_grow (&extrapool, 0);
2157 obstack_int32_grow (&indirectpool, 0);
2158
2159 /* Prepare the ruleset table. */
2160 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2161 if (sect->rules != NULL && sect->ruleidx == i)
2162 {
2163 int j;
2164
2165 obstack_make_room (&weightpool, nrules);
2166
2167 for (j = 0; j < nrules; ++j)
2168 obstack_1grow_fast (&weightpool, sect->rules[j]);
2169 ++i;
2170 }
2171 /* And align the output. */
2172 i = (nrules * i) % LOCFILE_ALIGN;
2173 if (i > 0)
2174 do
2175 obstack_1grow (&weightpool, '\0');
2176 while (++i < LOCFILE_ALIGN);
2177
2178 add_locale_raw_obstack (&file, &weightpool);
2179
2180 /* Generate the 8-bit table. Walk through the lists of sequences
2181 starting with the same byte and add them one after the other to
2182 the table. In case we have more than one sequence starting with
2183 the same byte we have to use extra indirection.
2184
2185 First add a record for the NUL byte. This entry will never be used
2186 so it does not matter. */
2187 tablemb[0] = 0;
2188
2189 /* Now insert the `UNDEFINED' value if it is used. Since this value
2190 will probably be used more than once it is good to store the
2191 weights only once. */
2192 if (collate->undefined.used_in_level != 0)
2193 output_weight (&weightpool, collate, &collate->undefined);
2194
2195 for (ch = 1; ch < 256; ++ch)
2196 if (collate->mbheads[ch]->mbnext == NULL
2197 && collate->mbheads[ch]->nmbs <= 1)
2198 {
2199 tablemb[ch] = output_weight (&weightpool, collate,
2200 collate->mbheads[ch]);
2201 }
2202 else
2203 {
2204 /* The entries in the list are sorted by length and then
2205 alphabetically. This is the order in which we will add the
2206 elements to the collation table. This allows simply walking
2207 the table in sequence and stopping at the first matching
2208 entry. Since the longer sequences are coming first in the
2209 list they have the possibility to match first, just as it
2210 has to be. In the worst case we are walking to the end of
2211 the list where we put, if no singlebyte sequence is defined
2212 in the locale definition, the weights for UNDEFINED.
2213
2214 To reduce the length of the search list we compress them a bit.
2215 This happens by collecting sequences of consecutive byte
2216 sequences in one entry (having a begin and end byte sequence)
2217 and add only one index into the weight table. We can find the
2218 consecutive entries since they are also consecutive in the list. */
2219 struct element_t *runp = collate->mbheads[ch];
2220 struct element_t *lastp;
2221
2222 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2223
2224 tablemb[ch] = -obstack_object_size (&extrapool);
2225
2226 do
2227 {
2228 /* Store the current index in the weight table. We know that
2229 the current position in the `extrapool' is aligned on a
2230 32-bit address. */
2231 int32_t weightidx;
2232 int added;
2233
2234 /* Find out whether this is a single entry or we have more than
2235 one consecutive entry. */
2236 if (runp->mbnext != NULL
2237 && runp->nmbs == runp->mbnext->nmbs
2238 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2239 && (runp->mbs[runp->nmbs - 1]
2240 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2241 {
2242 int i;
2243 struct element_t *series_startp = runp;
2244 struct element_t *curp;
2245
2246 /* Compute how much space we will need. */
2247 added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
2248 + 2 * (runp->nmbs - 1));
2249 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2250 obstack_make_room (&extrapool, added);
2251
2252 /* More than one consecutive entry. We mark this by having
2253 a negative index into the indirect table. */
2254 obstack_int32_grow_fast (&extrapool,
2255 -(obstack_object_size (&indirectpool)
2256 / sizeof (int32_t)));
2257
2258 /* Now search first the end of the series. */
2259 do
2260 runp = runp->mbnext;
2261 while (runp->mbnext != NULL
2262 && runp->nmbs == runp->mbnext->nmbs
2263 && memcmp (runp->mbs, runp->mbnext->mbs,
2264 runp->nmbs - 1) == 0
2265 && (runp->mbs[runp->nmbs - 1]
2266 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2267
2268 /* Now walk backward from here to the beginning. */
2269 curp = runp;
2270
2271 assert (runp->nmbs <= 256);
2272 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2273 for (i = 1; i < curp->nmbs; ++i)
2274 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2275
2276 /* Now find the end of the consecutive sequence and
2277 add all the indices in the indirect pool. */
2278 do
2279 {
2280 weightidx = output_weight (&weightpool, collate, curp);
2281 obstack_int32_grow (&indirectpool, weightidx);
2282
2283 curp = curp->mblast;
2284 }
2285 while (curp != series_startp);
2286
2287 /* Add the final weight. */
2288 weightidx = output_weight (&weightpool, collate, curp);
2289 obstack_int32_grow (&indirectpool, weightidx);
2290
2291 /* And add the end byte sequence. Without length this
2292 time. */
2293 for (i = 1; i < curp->nmbs; ++i)
2294 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2295 }
2296 else
2297 {
2298 /* A single entry. Simply add the index and the length and
2299 string (except for the first character which is already
2300 tested for). */
2301 int i;
2302
2303 /* Output the weight info. */
2304 weightidx = output_weight (&weightpool, collate, runp);
2305
2306 added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
2307 + runp->nmbs - 1);
2308 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2309 obstack_make_room (&extrapool, added);
2310
2311 obstack_int32_grow_fast (&extrapool, weightidx);
2312 assert (runp->nmbs <= 256);
2313 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2314
2315 for (i = 1; i < runp->nmbs; ++i)
2316 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2317 }
2318
2319 /* Add alignment bytes if necessary. */
2320 while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
2321 obstack_1grow_fast (&extrapool, '\0');
2322
2323 /* Next entry. */
2324 lastp = runp;
2325 runp = runp->mbnext;
2326 }
2327 while (runp != NULL);
2328
2329 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2330
2331 /* If the final entry in the list is not a single character we
2332 add an UNDEFINED entry here. */
2333 if (lastp->nmbs != 1)
2334 {
2335 int added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 + 1);
2336 obstack_make_room (&extrapool, added);
2337
2338 obstack_int32_grow_fast (&extrapool, 0);
2339 /* XXX What rule? We just pick the first. */
2340 obstack_1grow_fast (&extrapool, 0);
2341 /* Length is zero. */
2342 obstack_1grow_fast (&extrapool, 0);
2343
2344 /* Add alignment bytes if necessary. */
2345 while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
2346 obstack_1grow_fast (&extrapool, '\0');
2347 }
2348 }
2349
2350 /* Add padding to the tables if necessary. */
2351 while (!LOCFILE_ALIGNED_P (obstack_object_size (&weightpool)))
2352 obstack_1grow (&weightpool, 0);
2353
2354 /* Now add the four tables. */
2355 add_locale_uint32_array (&file, (const uint32_t *) tablemb, 256);
2356 add_locale_raw_obstack (&file, &weightpool);
2357 add_locale_raw_obstack (&file, &extrapool);
2358 add_locale_raw_obstack (&file, &indirectpool);
2359
2360 /* Now the same for the wide character table. We need to store some
2361 more information here. */
2362 add_locale_empty (&file);
2363 add_locale_empty (&file);
2364 add_locale_empty (&file);
2365
2366 /* Since we are using the sign of an integer to mark indirection the
2367 offsets in the arrays we are indirectly referring to must not be
2368 zero since -0 == 0. Therefore we add a bit of dummy content. */
2369 obstack_int32_grow (&extrapool, 0);
2370 obstack_int32_grow (&indirectpool, 0);
2371
2372 /* Now insert the `UNDEFINED' value if it is used. Since this value
2373 will probably be used more than once it is good to store the
2374 weights only once. */
2375 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2376 abort ();
2377
2378 /* Generate the table. Walk through the lists of sequences starting
2379 with the same wide character and add them one after the other to
2380 the table. In case we have more than one sequence starting with
2381 the same byte we have to use extra indirection. */
2382 tablewc.p = 6;
2383 tablewc.q = 10;
2384 collidx_table_init (&tablewc);
2385
2386 atwc.weightpool = &weightpool;
2387 atwc.extrapool = &extrapool;
2388 atwc.indpool = &indirectpool;
2389 atwc.collate = collate;
2390 atwc.tablewc = &tablewc;
2391
2392 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2393
2394 memset (&atwc, 0, sizeof (atwc));
2395
2396 /* Now add the four tables. */
2397 add_locale_collidx_table (&file, &tablewc);
2398 add_locale_raw_obstack (&file, &weightpool);
2399 add_locale_raw_obstack (&file, &extrapool);
2400 add_locale_raw_obstack (&file, &indirectpool);
2401
2402 /* Finally write the table with collation element names out. It is
2403 a hash table with a simple function which gets the name of the
2404 character as the input. One character might have many names. The
2405 value associated with the name is an index into the weight table
2406 where we are then interested in the first-level weight value.
2407
2408 To determine how large the table should be we are counting the
2409 elements have to put in. Since we are using internal chaining
2410 using a secondary hash function we have to make the table a bit
2411 larger to avoid extremely long search times. We can achieve
2412 good results with a 40% larger table than there are entries. */
2413 elem_size = 0;
2414 runp = collate->start;
2415 while (runp != NULL)
2416 {
2417 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2418 /* Yep, the element really counts. */
2419 ++elem_size;
2420
2421 runp = runp->next;
2422 }
2423 /* Add 50% and find the next prime number. */
2424 elem_size = next_prime (elem_size + (elem_size >> 1));
2425
2426 /* Allocate the table. Each entry consists of two words: the hash
2427 value and an index in a secondary table which provides the index
2428 into the weight table and the string itself (so that a match can
2429 be determined). */
2430 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2431 elem_size * 2 * sizeof (uint32_t));
2432 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2433
2434 /* Now add the elements. */
2435 runp = collate->start;
2436 while (runp != NULL)
2437 {
2438 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2439 {
2440 /* Compute the hash value of the name. */
2441 uint32_t namelen = strlen (runp->name);
2442 uint32_t hash = elem_hash (runp->name, namelen);
2443 size_t idx = hash % elem_size;
2444 #ifndef NDEBUG
2445 size_t start_idx = idx;
2446 #endif
2447
2448 if (elem_table[idx * 2] != 0)
2449 {
2450 /* The spot is already taken. Try iterating using the value
2451 from the secondary hashing function. */
2452 size_t iter = hash % (elem_size - 2) + 1;
2453
2454 do
2455 {
2456 idx += iter;
2457 if (idx >= elem_size)
2458 idx -= elem_size;
2459 assert (idx != start_idx);
2460 }
2461 while (elem_table[idx * 2] != 0);
2462 }
2463 /* This is the spot where we will insert the value. */
2464 elem_table[idx * 2] = hash;
2465 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2466
2467 /* The string itself including length. */
2468 obstack_1grow (&extrapool, namelen);
2469 obstack_grow (&extrapool, runp->name, namelen);
2470
2471 /* And the multibyte representation. */
2472 obstack_1grow (&extrapool, runp->nmbs);
2473 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2474
2475 /* And align again to 32 bits. */
2476 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2477 obstack_grow (&extrapool, "\0\0",
2478 (sizeof (int32_t)
2479 - ((1 + namelen + 1 + runp->nmbs)
2480 % sizeof (int32_t))));
2481
2482 /* Now some 32-bit values: multibyte collation sequence,
2483 wide char string (including length), and wide char
2484 collation sequence. */
2485 obstack_int32_grow (&extrapool, runp->mbseqorder);
2486
2487 obstack_int32_grow (&extrapool, runp->nwcs);
2488 obstack_grow (&extrapool, runp->wcs,
2489 runp->nwcs * sizeof (uint32_t));
2490 maybe_swap_uint32_obstack (&extrapool, runp->nwcs);
2491
2492 obstack_int32_grow (&extrapool, runp->wcseqorder);
2493 }
2494
2495 runp = runp->next;
2496 }
2497
2498 /* Prepare to write out this data. */
2499 add_locale_uint32 (&file, elem_size);
2500 add_locale_uint32_array (&file, elem_table, 2 * elem_size);
2501 add_locale_raw_obstack (&file, &extrapool);
2502 add_locale_raw_data (&file, collate->mbseqorder, 256);
2503 add_locale_collseq_table (&file, &collate->wcseqorder);
2504 add_locale_string (&file, charmap->code_set_name);
2505 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
2506
2507 obstack_free (&weightpool, NULL);
2508 obstack_free (&extrapool, NULL);
2509 obstack_free (&indirectpool, NULL);
2510 }
2511
2512
2513 static enum token_t
skip_to(struct linereader * ldfile,struct locale_collate_t * collate,const struct charmap_t * charmap,int to_endif)2514 skip_to (struct linereader *ldfile, struct locale_collate_t *collate,
2515 const struct charmap_t *charmap, int to_endif)
2516 {
2517 while (1)
2518 {
2519 struct token *now = lr_token (ldfile, charmap, NULL, NULL, 0);
2520 enum token_t nowtok = now->tok;
2521
2522 if (nowtok == tok_eof || nowtok == tok_end)
2523 return nowtok;
2524
2525 if (nowtok == tok_ifdef || nowtok == tok_ifndef)
2526 {
2527 lr_error (ldfile, _("%s: nested conditionals not supported"),
2528 "LC_COLLATE");
2529 nowtok = skip_to (ldfile, collate, charmap, tok_endif);
2530 if (nowtok == tok_eof || nowtok == tok_end)
2531 return nowtok;
2532 }
2533 else if (nowtok == tok_endif || (!to_endif && nowtok == tok_else))
2534 {
2535 lr_ignore_rest (ldfile, 1);
2536 return nowtok;
2537 }
2538 else if (!to_endif && (nowtok == tok_elifdef || nowtok == tok_elifndef))
2539 {
2540 /* Do not read the rest of the line. */
2541 return nowtok;
2542 }
2543 else if (nowtok == tok_else)
2544 {
2545 lr_error (ldfile, _("%s: more than one 'else'"), "LC_COLLATE");
2546 }
2547
2548 lr_ignore_rest (ldfile, 0);
2549 }
2550 }
2551
2552
2553 void
collate_read(struct linereader * ldfile,struct localedef_t * result,const struct charmap_t * charmap,const char * repertoire_name,int ignore_content)2554 collate_read (struct linereader *ldfile, struct localedef_t *result,
2555 const struct charmap_t *charmap, const char *repertoire_name,
2556 int ignore_content)
2557 {
2558 struct repertoire_t *repertoire = NULL;
2559 struct locale_collate_t *collate;
2560 struct token *now;
2561 struct token *arg = NULL;
2562 enum token_t nowtok;
2563 enum token_t was_ellipsis = tok_none;
2564 struct localedef_t *copy_locale = NULL;
2565 /* Parsing state:
2566 0 - start
2567 1 - between `order-start' and `order-end'
2568 2 - after `order-end'
2569 3 - after `reorder-after', waiting for `reorder-end'
2570 4 - after `reorder-end'
2571 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2572 6 - after `reorder-sections-end'
2573 */
2574 int state = 0;
2575
2576 /* Get the repertoire we have to use. */
2577 if (repertoire_name != NULL)
2578 repertoire = repertoire_read (repertoire_name);
2579
2580 /* The rest of the line containing `LC_COLLATE' must be free. */
2581 lr_ignore_rest (ldfile, 1);
2582
2583 while (1)
2584 {
2585 do
2586 {
2587 now = lr_token (ldfile, charmap, result, NULL, verbose);
2588 nowtok = now->tok;
2589 }
2590 while (nowtok == tok_eol);
2591
2592 if (nowtok != tok_define)
2593 break;
2594
2595 if (ignore_content)
2596 lr_ignore_rest (ldfile, 0);
2597 else
2598 {
2599 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2600 if (arg->tok != tok_ident)
2601 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2602 else
2603 {
2604 /* Simply add the new symbol. */
2605 struct name_list *newsym = xmalloc (sizeof (*newsym)
2606 + arg->val.str.lenmb + 1);
2607 memcpy (newsym->str, arg->val.str.startmb, arg->val.str.lenmb);
2608 newsym->str[arg->val.str.lenmb] = '\0';
2609 newsym->next = defined;
2610 defined = newsym;
2611
2612 lr_ignore_rest (ldfile, 1);
2613 }
2614 }
2615 }
2616
2617 if (nowtok == tok_copy)
2618 {
2619 now = lr_token (ldfile, charmap, result, NULL, verbose);
2620 if (now->tok != tok_string)
2621 {
2622 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2623
2624 skip_category:
2625 do
2626 now = lr_token (ldfile, charmap, result, NULL, verbose);
2627 while (now->tok != tok_eof && now->tok != tok_end);
2628
2629 if (now->tok != tok_eof
2630 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2631 now->tok == tok_eof))
2632 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2633 else if (now->tok != tok_lc_collate)
2634 {
2635 lr_error (ldfile, _("\
2636 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2637 lr_ignore_rest (ldfile, 0);
2638 }
2639 else
2640 lr_ignore_rest (ldfile, 1);
2641
2642 return;
2643 }
2644
2645 if (! ignore_content)
2646 {
2647 /* Get the locale definition. */
2648 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2649 repertoire_name, charmap, NULL);
2650 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2651 {
2652 /* Not yet loaded. So do it now. */
2653 if (locfile_read (copy_locale, charmap) != 0)
2654 goto skip_category;
2655 }
2656
2657 if (copy_locale->categories[LC_COLLATE].collate == NULL)
2658 return;
2659 }
2660
2661 lr_ignore_rest (ldfile, 1);
2662
2663 now = lr_token (ldfile, charmap, result, NULL, verbose);
2664 nowtok = now->tok;
2665 }
2666
2667 /* Prepare the data structures. */
2668 collate_startup (ldfile, result, copy_locale, ignore_content);
2669 collate = result->categories[LC_COLLATE].collate;
2670
2671 while (1)
2672 {
2673 char ucs4buf[10];
2674 char *symstr;
2675 size_t symlen;
2676
2677 /* Of course we don't proceed beyond the end of file. */
2678 if (nowtok == tok_eof)
2679 break;
2680
2681 /* Ignore empty lines. */
2682 if (nowtok == tok_eol)
2683 {
2684 now = lr_token (ldfile, charmap, result, NULL, verbose);
2685 nowtok = now->tok;
2686 continue;
2687 }
2688
2689 switch (nowtok)
2690 {
2691 case tok_codepoint_collation:
2692 collate->codepoint_collation = true;
2693 break;
2694
2695 case tok_copy:
2696 /* Allow copying other locales. */
2697 now = lr_token (ldfile, charmap, result, NULL, verbose);
2698 if (now->tok != tok_string)
2699 goto err_label;
2700
2701 if (! ignore_content)
2702 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2703 charmap, result);
2704
2705 lr_ignore_rest (ldfile, 1);
2706 break;
2707
2708 case tok_coll_weight_max:
2709 /* Ignore the rest of the line if we don't need the input of
2710 this line. */
2711 if (ignore_content)
2712 {
2713 lr_ignore_rest (ldfile, 0);
2714 break;
2715 }
2716
2717 if (state != 0)
2718 goto err_label;
2719
2720 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2721 if (arg->tok != tok_number)
2722 goto err_label;
2723 if (collate->col_weight_max != -1)
2724 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2725 "LC_COLLATE", "col_weight_max");
2726 else
2727 collate->col_weight_max = arg->val.num;
2728 lr_ignore_rest (ldfile, 1);
2729 break;
2730
2731 case tok_section_symbol:
2732 /* Ignore the rest of the line if we don't need the input of
2733 this line. */
2734 if (ignore_content)
2735 {
2736 lr_ignore_rest (ldfile, 0);
2737 break;
2738 }
2739
2740 if (state != 0)
2741 goto err_label;
2742
2743 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2744 if (arg->tok != tok_bsymbol)
2745 goto err_label;
2746 else if (!ignore_content)
2747 {
2748 /* Check whether this section is already known. */
2749 struct section_list *known = collate->sections;
2750 while (known != NULL)
2751 {
2752 if (strcmp (known->name, arg->val.str.startmb) == 0)
2753 break;
2754 known = known->next;
2755 }
2756
2757 if (known != NULL)
2758 {
2759 lr_error (ldfile,
2760 _("%s: duplicate declaration of section `%s'"),
2761 "LC_COLLATE", arg->val.str.startmb);
2762 free (arg->val.str.startmb);
2763 }
2764 else
2765 collate->sections = make_seclist_elem (collate,
2766 arg->val.str.startmb,
2767 collate->sections);
2768
2769 lr_ignore_rest (ldfile, known == NULL);
2770 }
2771 else
2772 {
2773 free (arg->val.str.startmb);
2774 lr_ignore_rest (ldfile, 0);
2775 }
2776 break;
2777
2778 case tok_collating_element:
2779 /* Ignore the rest of the line if we don't need the input of
2780 this line. */
2781 if (ignore_content)
2782 {
2783 lr_ignore_rest (ldfile, 0);
2784 break;
2785 }
2786
2787 if (state != 0 && state != 2)
2788 goto err_label;
2789
2790 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2791 if (arg->tok != tok_bsymbol)
2792 goto err_label;
2793 else
2794 {
2795 const char *symbol = arg->val.str.startmb;
2796 size_t symbol_len = arg->val.str.lenmb;
2797
2798 /* Next the `from' keyword. */
2799 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2800 if (arg->tok != tok_from)
2801 {
2802 free ((char *) symbol);
2803 goto err_label;
2804 }
2805
2806 ldfile->return_widestr = 1;
2807 ldfile->translate_strings = 1;
2808
2809 /* Finally the string with the replacement. */
2810 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2811
2812 ldfile->return_widestr = 0;
2813 ldfile->translate_strings = 0;
2814
2815 if (arg->tok != tok_string)
2816 goto err_label;
2817
2818 if (!ignore_content && symbol != NULL)
2819 {
2820 /* The name is already defined. */
2821 if (check_duplicate (ldfile, collate, charmap,
2822 repertoire, symbol, symbol_len))
2823 goto col_elem_free;
2824
2825 if (arg->val.str.startmb != NULL)
2826 insert_entry (&collate->elem_table, symbol, symbol_len,
2827 new_element (collate,
2828 arg->val.str.startmb,
2829 arg->val.str.lenmb - 1,
2830 arg->val.str.startwc,
2831 symbol, symbol_len, 0));
2832 }
2833 else
2834 {
2835 col_elem_free:
2836 free ((char *) symbol);
2837 free (arg->val.str.startmb);
2838 free (arg->val.str.startwc);
2839 }
2840 lr_ignore_rest (ldfile, 1);
2841 }
2842 break;
2843
2844 case tok_collating_symbol:
2845 /* Ignore the rest of the line if we don't need the input of
2846 this line. */
2847 if (ignore_content)
2848 {
2849 lr_ignore_rest (ldfile, 0);
2850 break;
2851 }
2852
2853 if (state != 0 && state != 2)
2854 goto err_label;
2855
2856 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2857 if (arg->tok != tok_bsymbol)
2858 goto err_label;
2859 else
2860 {
2861 char *symbol = arg->val.str.startmb;
2862 size_t symbol_len = arg->val.str.lenmb;
2863 char *endsymbol = NULL;
2864 size_t endsymbol_len = 0;
2865 enum token_t ellipsis = tok_none;
2866
2867 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2868 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2869 {
2870 ellipsis = arg->tok;
2871
2872 arg = lr_token (ldfile, charmap, result, repertoire,
2873 verbose);
2874 if (arg->tok != tok_bsymbol)
2875 {
2876 free (symbol);
2877 goto err_label;
2878 }
2879
2880 endsymbol = arg->val.str.startmb;
2881 endsymbol_len = arg->val.str.lenmb;
2882
2883 lr_ignore_rest (ldfile, 1);
2884 }
2885 else if (arg->tok != tok_eol)
2886 {
2887 free (symbol);
2888 goto err_label;
2889 }
2890
2891 if (!ignore_content)
2892 {
2893 if (symbol == NULL
2894 || (ellipsis != tok_none && endsymbol == NULL))
2895 {
2896 lr_error (ldfile, _("\
2897 %s: unknown character in collating symbol name"),
2898 "LC_COLLATE");
2899 goto col_sym_free;
2900 }
2901 else if (ellipsis == tok_none)
2902 {
2903 /* A single symbol, no ellipsis. */
2904 if (check_duplicate (ldfile, collate, charmap,
2905 repertoire, symbol, symbol_len))
2906 /* The name is already defined. */
2907 goto col_sym_free;
2908
2909 insert_entry (&collate->sym_table, symbol, symbol_len,
2910 new_symbol (collate, symbol, symbol_len));
2911 }
2912 else if (symbol_len != endsymbol_len)
2913 {
2914 col_sym_inv_range:
2915 lr_error (ldfile,
2916 _("invalid names for character range"));
2917 goto col_sym_free;
2918 }
2919 else
2920 {
2921 /* Oh my, we have to handle an ellipsis. First, as
2922 usual, determine the common prefix and then
2923 convert the rest into a range. */
2924 size_t prefixlen;
2925 unsigned long int from;
2926 unsigned long int to;
2927 char *endp;
2928
2929 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2930 if (symbol[prefixlen] != endsymbol[prefixlen])
2931 break;
2932
2933 /* Convert the rest into numbers. */
2934 symbol[symbol_len] = '\0';
2935 from = strtoul (&symbol[prefixlen], &endp,
2936 ellipsis == tok_ellipsis2 ? 16 : 10);
2937 if (*endp != '\0')
2938 goto col_sym_inv_range;
2939
2940 endsymbol[symbol_len] = '\0';
2941 to = strtoul (&endsymbol[prefixlen], &endp,
2942 ellipsis == tok_ellipsis2 ? 16 : 10);
2943 if (*endp != '\0')
2944 goto col_sym_inv_range;
2945
2946 if (from > to)
2947 goto col_sym_inv_range;
2948
2949 /* Now loop over all entries. */
2950 while (from <= to)
2951 {
2952 char *symbuf;
2953
2954 symbuf = (char *) obstack_alloc (&collate->mempool,
2955 symbol_len + 1);
2956
2957 /* Create the name. */
2958 sprintf (symbuf,
2959 ellipsis == tok_ellipsis2
2960 ? "%.*s%.*lX" : "%.*s%.*lu",
2961 (int) prefixlen, symbol,
2962 (int) (symbol_len - prefixlen), from);
2963
2964 if (check_duplicate (ldfile, collate, charmap,
2965 repertoire, symbuf, symbol_len))
2966 /* The name is already defined. */
2967 goto col_sym_free;
2968
2969 insert_entry (&collate->sym_table, symbuf,
2970 symbol_len,
2971 new_symbol (collate, symbuf,
2972 symbol_len));
2973
2974 /* Increment the counter. */
2975 ++from;
2976 }
2977
2978 goto col_sym_free;
2979 }
2980 }
2981 else
2982 {
2983 col_sym_free:
2984 free (symbol);
2985 free (endsymbol);
2986 }
2987 }
2988 break;
2989
2990 case tok_symbol_equivalence:
2991 /* Ignore the rest of the line if we don't need the input of
2992 this line. */
2993 if (ignore_content)
2994 {
2995 lr_ignore_rest (ldfile, 0);
2996 break;
2997 }
2998
2999 if (state != 0)
3000 goto err_label;
3001
3002 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3003 if (arg->tok != tok_bsymbol)
3004 goto err_label;
3005 else
3006 {
3007 const char *newname = arg->val.str.startmb;
3008 size_t newname_len = arg->val.str.lenmb;
3009 const char *symname;
3010 size_t symname_len;
3011 void *symval; /* Actually struct symbol_t* */
3012
3013 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3014 if (arg->tok != tok_bsymbol)
3015 {
3016 free ((char *) newname);
3017 goto err_label;
3018 }
3019
3020 symname = arg->val.str.startmb;
3021 symname_len = arg->val.str.lenmb;
3022
3023 if (newname == NULL)
3024 {
3025 lr_error (ldfile, _("\
3026 %s: unknown character in equivalent definition name"),
3027 "LC_COLLATE");
3028
3029 sym_equiv_free:
3030 free ((char *) newname);
3031 free ((char *) symname);
3032 break;
3033 }
3034 if (symname == NULL)
3035 {
3036 lr_error (ldfile, _("\
3037 %s: unknown character in equivalent definition value"),
3038 "LC_COLLATE");
3039 goto sym_equiv_free;
3040 }
3041
3042 /* See whether the symbol name is already defined. */
3043 if (find_entry (&collate->sym_table, symname, symname_len,
3044 &symval) != 0)
3045 {
3046 lr_error (ldfile, _("\
3047 %s: unknown symbol `%s' in equivalent definition"),
3048 "LC_COLLATE", symname);
3049 goto sym_equiv_free;
3050 }
3051
3052 if (insert_entry (&collate->sym_table,
3053 newname, newname_len, symval) < 0)
3054 {
3055 lr_error (ldfile, _("\
3056 error while adding equivalent collating symbol"));
3057 goto sym_equiv_free;
3058 }
3059
3060 free ((char *) symname);
3061 }
3062 lr_ignore_rest (ldfile, 1);
3063 break;
3064
3065 case tok_script:
3066 /* Ignore the rest of the line if we don't need the input of
3067 this line. */
3068 if (ignore_content)
3069 {
3070 lr_ignore_rest (ldfile, 0);
3071 break;
3072 }
3073
3074 /* We get told about the scripts we know. */
3075 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3076 if (arg->tok != tok_bsymbol)
3077 goto err_label;
3078 else
3079 {
3080 struct section_list *runp = collate->known_sections;
3081 char *name;
3082
3083 while (runp != NULL)
3084 if (strncmp (runp->name, arg->val.str.startmb,
3085 arg->val.str.lenmb) == 0
3086 && runp->name[arg->val.str.lenmb] == '\0')
3087 break;
3088 else
3089 runp = runp->def_next;
3090
3091 if (runp != NULL)
3092 {
3093 lr_error (ldfile, _("duplicate definition of script `%s'"),
3094 runp->name);
3095 lr_ignore_rest (ldfile, 0);
3096 break;
3097 }
3098
3099 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3100 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3101 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3102 name[arg->val.str.lenmb] = '\0';
3103 runp->name = name;
3104
3105 runp->def_next = collate->known_sections;
3106 collate->known_sections = runp;
3107 }
3108 lr_ignore_rest (ldfile, 1);
3109 break;
3110
3111 case tok_order_start:
3112 /* Ignore the rest of the line if we don't need the input of
3113 this line. */
3114 if (ignore_content)
3115 {
3116 lr_ignore_rest (ldfile, 0);
3117 break;
3118 }
3119
3120 if (state != 0 && state != 1 && state != 2)
3121 goto err_label;
3122 state = 1;
3123
3124 /* The 14652 draft does not specify whether all `order_start' lines
3125 must contain the same number of sort-rules, but 14651 does. So
3126 we require this here as well. */
3127 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3128 if (arg->tok == tok_bsymbol)
3129 {
3130 /* This better should be a section name. */
3131 struct section_list *sp = collate->known_sections;
3132 while (sp != NULL
3133 && (sp->name == NULL
3134 || strncmp (sp->name, arg->val.str.startmb,
3135 arg->val.str.lenmb) != 0
3136 || sp->name[arg->val.str.lenmb] != '\0'))
3137 sp = sp->def_next;
3138
3139 if (sp == NULL)
3140 {
3141 lr_error (ldfile, _("\
3142 %s: unknown section name `%.*s'"),
3143 "LC_COLLATE", (int) arg->val.str.lenmb,
3144 arg->val.str.startmb);
3145 /* We use the error section. */
3146 collate->current_section = &collate->error_section;
3147
3148 if (collate->error_section.first == NULL)
3149 {
3150 /* Insert &collate->error_section at the end of
3151 the collate->sections list. */
3152 if (collate->sections == NULL)
3153 collate->sections = &collate->error_section;
3154 else
3155 {
3156 sp = collate->sections;
3157 while (sp->next != NULL)
3158 sp = sp->next;
3159
3160 sp->next = &collate->error_section;
3161 }
3162 collate->error_section.next = NULL;
3163 }
3164 }
3165 else
3166 {
3167 /* One should not be allowed to open the same
3168 section twice. */
3169 if (sp->first != NULL)
3170 lr_error (ldfile, _("\
3171 %s: multiple order definitions for section `%s'"),
3172 "LC_COLLATE", sp->name);
3173 else
3174 {
3175 /* Insert sp in the collate->sections list,
3176 right after collate->current_section. */
3177 if (collate->current_section != NULL)
3178 {
3179 sp->next = collate->current_section->next;
3180 collate->current_section->next = sp;
3181 }
3182 else if (collate->sections == NULL)
3183 /* This is the first section to be defined. */
3184 collate->sections = sp;
3185
3186 collate->current_section = sp;
3187 }
3188
3189 /* Next should come the end of the line or a semicolon. */
3190 arg = lr_token (ldfile, charmap, result, repertoire,
3191 verbose);
3192 if (arg->tok == tok_eol)
3193 {
3194 uint32_t cnt;
3195
3196 /* This means we have exactly one rule: `forward'. */
3197 if (nrules > 1)
3198 lr_error (ldfile, _("\
3199 %s: invalid number of sorting rules"),
3200 "LC_COLLATE");
3201 else
3202 nrules = 1;
3203 sp->rules = obstack_alloc (&collate->mempool,
3204 (sizeof (enum coll_sort_rule)
3205 * nrules));
3206 for (cnt = 0; cnt < nrules; ++cnt)
3207 sp->rules[cnt] = sort_forward;
3208
3209 /* Next line. */
3210 break;
3211 }
3212
3213 /* Get the next token. */
3214 arg = lr_token (ldfile, charmap, result, repertoire,
3215 verbose);
3216 }
3217 }
3218 else
3219 {
3220 /* There is no section symbol. Therefore we use the unnamed
3221 section. */
3222 collate->current_section = &collate->unnamed_section;
3223
3224 if (collate->unnamed_section_defined)
3225 lr_error (ldfile, _("\
3226 %s: multiple order definitions for unnamed section"),
3227 "LC_COLLATE");
3228 else
3229 {
3230 /* Insert &collate->unnamed_section at the beginning of
3231 the collate->sections list. */
3232 collate->unnamed_section.next = collate->sections;
3233 collate->sections = &collate->unnamed_section;
3234 collate->unnamed_section_defined = true;
3235 }
3236 }
3237
3238 /* Now read the direction names. */
3239 read_directions (ldfile, arg, charmap, repertoire, result);
3240
3241 /* From now we need the strings untranslated. */
3242 ldfile->translate_strings = 0;
3243 break;
3244
3245 case tok_order_end:
3246 /* Ignore the rest of the line if we don't need the input of
3247 this line. */
3248 if (ignore_content)
3249 {
3250 lr_ignore_rest (ldfile, 0);
3251 break;
3252 }
3253
3254 if (state != 1)
3255 goto err_label;
3256
3257 /* Handle ellipsis at end of list. */
3258 if (was_ellipsis != tok_none)
3259 {
3260 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3261 repertoire, result);
3262 was_ellipsis = tok_none;
3263 }
3264
3265 state = 2;
3266 lr_ignore_rest (ldfile, 1);
3267 break;
3268
3269 case tok_reorder_after:
3270 /* Ignore the rest of the line if we don't need the input of
3271 this line. */
3272 if (ignore_content)
3273 {
3274 lr_ignore_rest (ldfile, 0);
3275 break;
3276 }
3277
3278 if (state == 1)
3279 {
3280 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3281 "LC_COLLATE");
3282 state = 2;
3283
3284 /* Handle ellipsis at end of list. */
3285 if (was_ellipsis != tok_none)
3286 {
3287 handle_ellipsis (ldfile, arg->val.str.startmb,
3288 arg->val.str.lenmb, was_ellipsis, charmap,
3289 repertoire, result);
3290 was_ellipsis = tok_none;
3291 }
3292 }
3293 else if (state == 0 && copy_locale == NULL)
3294 goto err_label;
3295 else if (state != 0 && state != 2 && state != 3)
3296 goto err_label;
3297 state = 3;
3298
3299 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3300 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3301 {
3302 /* Find this symbol in the sequence table. */
3303 char ucsbuf[10];
3304 char *startmb;
3305 size_t lenmb;
3306 struct element_t *insp;
3307 int no_error = 1;
3308 void *ptr;
3309
3310 if (arg->tok == tok_bsymbol)
3311 {
3312 startmb = arg->val.str.startmb;
3313 lenmb = arg->val.str.lenmb;
3314 }
3315 else
3316 {
3317 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3318 startmb = ucsbuf;
3319 lenmb = 9;
3320 }
3321
3322 if (find_entry (&collate->seq_table, startmb, lenmb, &ptr) == 0)
3323 /* Yes, the symbol exists. Simply point the cursor
3324 to it. */
3325 collate->cursor = (struct element_t *) ptr;
3326 else
3327 {
3328 struct symbol_t *symbp;
3329 void *ptr;
3330
3331 if (find_entry (&collate->sym_table, startmb, lenmb,
3332 &ptr) == 0)
3333 {
3334 symbp = ptr;
3335
3336 if (symbp->order->last != NULL
3337 || symbp->order->next != NULL)
3338 collate->cursor = symbp->order;
3339 else
3340 {
3341 /* This is a collating symbol but its position
3342 is not yet defined. */
3343 lr_error (ldfile, _("\
3344 %s: order for collating symbol %.*s not yet defined"),
3345 "LC_COLLATE", (int) lenmb, startmb);
3346 collate->cursor = NULL;
3347 no_error = 0;
3348 }
3349 }
3350 else if (find_entry (&collate->elem_table, startmb, lenmb,
3351 &ptr) == 0)
3352 {
3353 insp = (struct element_t *) ptr;
3354
3355 if (insp->last != NULL || insp->next != NULL)
3356 collate->cursor = insp;
3357 else
3358 {
3359 /* This is a collating element but its position
3360 is not yet defined. */
3361 lr_error (ldfile, _("\
3362 %s: order for collating element %.*s not yet defined"),
3363 "LC_COLLATE", (int) lenmb, startmb);
3364 collate->cursor = NULL;
3365 no_error = 0;
3366 }
3367 }
3368 else
3369 {
3370 /* This is bad. The symbol after which we have to
3371 insert does not exist. */
3372 lr_error (ldfile, _("\
3373 %s: cannot reorder after %.*s: symbol not known"),
3374 "LC_COLLATE", (int) lenmb, startmb);
3375 collate->cursor = NULL;
3376 no_error = 0;
3377 }
3378 }
3379
3380 lr_ignore_rest (ldfile, no_error);
3381 }
3382 else
3383 /* This must not happen. */
3384 goto err_label;
3385 break;
3386
3387 case tok_reorder_end:
3388 /* Ignore the rest of the line if we don't need the input of
3389 this line. */
3390 if (ignore_content)
3391 break;
3392
3393 if (state != 3)
3394 goto err_label;
3395 state = 4;
3396 lr_ignore_rest (ldfile, 1);
3397 break;
3398
3399 case tok_reorder_sections_after:
3400 /* Ignore the rest of the line if we don't need the input of
3401 this line. */
3402 if (ignore_content)
3403 {
3404 lr_ignore_rest (ldfile, 0);
3405 break;
3406 }
3407
3408 if (state == 1)
3409 {
3410 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3411 "LC_COLLATE");
3412 state = 2;
3413
3414 /* Handle ellipsis at end of list. */
3415 if (was_ellipsis != tok_none)
3416 {
3417 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3418 repertoire, result);
3419 was_ellipsis = tok_none;
3420 }
3421 }
3422 else if (state == 3)
3423 {
3424 record_error (0, 0, _("\
3425 %s: missing `reorder-end' keyword"), "LC_COLLATE");
3426 state = 4;
3427 }
3428 else if (state != 2 && state != 4)
3429 goto err_label;
3430 state = 5;
3431
3432 /* Get the name of the sections we are adding after. */
3433 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3434 if (arg->tok == tok_bsymbol)
3435 {
3436 /* Now find a section with this name. */
3437 struct section_list *runp = collate->sections;
3438
3439 while (runp != NULL)
3440 {
3441 if (runp->name != NULL
3442 && strlen (runp->name) == arg->val.str.lenmb
3443 && memcmp (runp->name, arg->val.str.startmb,
3444 arg->val.str.lenmb) == 0)
3445 break;
3446
3447 runp = runp->next;
3448 }
3449
3450 if (runp != NULL)
3451 collate->current_section = runp;
3452 else
3453 {
3454 /* This is bad. The section after which we have to
3455 reorder does not exist. Therefore we cannot
3456 process the whole rest of this reorder
3457 specification. */
3458 lr_error (ldfile, _("%s: section `%.*s' not known"),
3459 "LC_COLLATE", (int) arg->val.str.lenmb,
3460 arg->val.str.startmb);
3461
3462 do
3463 {
3464 lr_ignore_rest (ldfile, 0);
3465
3466 now = lr_token (ldfile, charmap, result, NULL, verbose);
3467 }
3468 while (now->tok == tok_reorder_sections_after
3469 || now->tok == tok_reorder_sections_end
3470 || now->tok == tok_end);
3471
3472 /* Process the token we just saw. */
3473 nowtok = now->tok;
3474 continue;
3475 }
3476 }
3477 else
3478 /* This must not happen. */
3479 goto err_label;
3480 break;
3481
3482 case tok_reorder_sections_end:
3483 /* Ignore the rest of the line if we don't need the input of
3484 this line. */
3485 if (ignore_content)
3486 break;
3487
3488 if (state != 5)
3489 goto err_label;
3490 state = 6;
3491 lr_ignore_rest (ldfile, 1);
3492 break;
3493
3494 case tok_bsymbol:
3495 case tok_ucs4:
3496 /* Ignore the rest of the line if we don't need the input of
3497 this line. */
3498 if (ignore_content)
3499 {
3500 lr_ignore_rest (ldfile, 0);
3501 break;
3502 }
3503
3504 if (state != 0 && state != 1 && state != 3 && state != 5)
3505 goto err_label;
3506
3507 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3508 goto err_label;
3509
3510 if (nowtok == tok_ucs4)
3511 {
3512 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3513 symstr = ucs4buf;
3514 symlen = 9;
3515 }
3516 else if (arg != NULL)
3517 {
3518 symstr = arg->val.str.startmb;
3519 symlen = arg->val.str.lenmb;
3520 }
3521 else
3522 {
3523 lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3524 (int) ldfile->token.val.str.lenmb,
3525 ldfile->token.val.str.startmb);
3526 break;
3527 }
3528
3529 struct element_t *seqp;
3530 if (state == 0)
3531 {
3532 /* We are outside an `order_start' region. This means
3533 we must only accept definitions of values for
3534 collation symbols since these are purely abstract
3535 values and don't need directions associated. */
3536 void *ptr;
3537
3538 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3539 {
3540 seqp = ptr;
3541
3542 /* It's already defined. First check whether this
3543 is really a collating symbol. */
3544 if (seqp->is_character)
3545 goto err_label;
3546
3547 goto move_entry;
3548 }
3549 else
3550 {
3551 void *result;
3552
3553 if (find_entry (&collate->sym_table, symstr, symlen,
3554 &result) != 0)
3555 /* No collating symbol, it's an error. */
3556 goto err_label;
3557
3558 /* Maybe this is the first time we define a symbol
3559 value and it is before the first actual section. */
3560 if (collate->sections == NULL)
3561 collate->sections = collate->current_section =
3562 &collate->symbol_section;
3563 }
3564
3565 if (was_ellipsis != tok_none)
3566 {
3567 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3568 charmap, repertoire, result);
3569
3570 /* Remember that we processed the ellipsis. */
3571 was_ellipsis = tok_none;
3572
3573 /* And don't add the value a second time. */
3574 break;
3575 }
3576 }
3577 else if (state == 3)
3578 {
3579 /* It is possible that we already have this collation sequence.
3580 In this case we move the entry. */
3581 void *sym;
3582 void *ptr;
3583
3584 /* If the symbol after which we have to insert was not found
3585 ignore all entries. */
3586 if (collate->cursor == NULL)
3587 {
3588 lr_ignore_rest (ldfile, 0);
3589 break;
3590 }
3591
3592 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3593 {
3594 seqp = (struct element_t *) ptr;
3595 goto move_entry;
3596 }
3597
3598 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3599 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3600 goto move_entry;
3601
3602 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) == 0
3603 && (seqp = (struct element_t *) ptr,
3604 seqp->last != NULL || seqp->next != NULL
3605 || (collate->start != NULL && seqp == collate->start)))
3606 {
3607 move_entry:
3608 /* Remove the entry from the old position. */
3609 if (seqp->last == NULL)
3610 collate->start = seqp->next;
3611 else
3612 seqp->last->next = seqp->next;
3613 if (seqp->next != NULL)
3614 seqp->next->last = seqp->last;
3615
3616 /* We also have to check whether this entry is the
3617 first or last of a section. */
3618 if (seqp->section->first == seqp)
3619 {
3620 if (seqp->section->first == seqp->section->last)
3621 /* This section has no content anymore. */
3622 seqp->section->first = seqp->section->last = NULL;
3623 else
3624 seqp->section->first = seqp->next;
3625 }
3626 else if (seqp->section->last == seqp)
3627 seqp->section->last = seqp->last;
3628
3629 /* Now insert it in the new place. */
3630 insert_weights (ldfile, seqp, charmap, repertoire, result,
3631 tok_none);
3632 break;
3633 }
3634
3635 /* Otherwise we just add a new entry. */
3636 }
3637 else if (state == 5)
3638 {
3639 /* We are reordering sections. Find the named section. */
3640 struct section_list *runp = collate->sections;
3641 struct section_list *prevp = NULL;
3642
3643 while (runp != NULL)
3644 {
3645 if (runp->name != NULL
3646 && strlen (runp->name) == symlen
3647 && memcmp (runp->name, symstr, symlen) == 0)
3648 break;
3649
3650 prevp = runp;
3651 runp = runp->next;
3652 }
3653
3654 if (runp == NULL)
3655 {
3656 lr_error (ldfile, _("%s: section `%.*s' not known"),
3657 "LC_COLLATE", (int) symlen, symstr);
3658 lr_ignore_rest (ldfile, 0);
3659 }
3660 else
3661 {
3662 if (runp != collate->current_section)
3663 {
3664 /* Remove the named section from the old place and
3665 insert it in the new one. */
3666 prevp->next = runp->next;
3667
3668 runp->next = collate->current_section->next;
3669 collate->current_section->next = runp;
3670 collate->current_section = runp;
3671 }
3672
3673 /* Process the rest of the line which might change
3674 the collation rules. */
3675 arg = lr_token (ldfile, charmap, result, repertoire,
3676 verbose);
3677 if (arg->tok != tok_eof && arg->tok != tok_eol)
3678 read_directions (ldfile, arg, charmap, repertoire,
3679 result);
3680 }
3681 break;
3682 }
3683 else if (was_ellipsis != tok_none)
3684 {
3685 /* Using the information in the `ellipsis_weight'
3686 element and this and the last value we have to handle
3687 the ellipsis now. */
3688 assert (state == 1);
3689
3690 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3691 repertoire, result);
3692
3693 /* Remember that we processed the ellipsis. */
3694 was_ellipsis = tok_none;
3695
3696 /* And don't add the value a second time. */
3697 break;
3698 }
3699
3700 /* Now insert in the new place. */
3701 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3702 break;
3703
3704 case tok_undefined:
3705 /* Ignore the rest of the line if we don't need the input of
3706 this line. */
3707 if (ignore_content)
3708 {
3709 lr_ignore_rest (ldfile, 0);
3710 break;
3711 }
3712
3713 if (state != 1)
3714 goto err_label;
3715
3716 if (was_ellipsis != tok_none)
3717 {
3718 lr_error (ldfile,
3719 _("%s: cannot have `%s' as end of ellipsis range"),
3720 "LC_COLLATE", "UNDEFINED");
3721
3722 unlink_element (collate);
3723 was_ellipsis = tok_none;
3724 }
3725
3726 /* See whether UNDEFINED already appeared somewhere. */
3727 if (collate->undefined.next != NULL
3728 || &collate->undefined == collate->cursor)
3729 {
3730 lr_error (ldfile,
3731 _("%s: order for `%.*s' already defined at %s:%Zu"),
3732 "LC_COLLATE", 9, "UNDEFINED",
3733 collate->undefined.file,
3734 collate->undefined.line);
3735 lr_ignore_rest (ldfile, 0);
3736 }
3737 else
3738 /* Parse the weights. */
3739 insert_weights (ldfile, &collate->undefined, charmap,
3740 repertoire, result, tok_none);
3741 break;
3742
3743 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3744 case tok_ellipsis3: /* absolute ellipsis */
3745 case tok_ellipsis4: /* symbolic decimal ellipsis */
3746 /* This is the symbolic (decimal or hexadecimal) or absolute
3747 ellipsis. */
3748 if (was_ellipsis != tok_none)
3749 goto err_label;
3750
3751 if (state != 0 && state != 1 && state != 3)
3752 goto err_label;
3753
3754 was_ellipsis = nowtok;
3755
3756 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3757 repertoire, result, nowtok);
3758 break;
3759
3760 case tok_end:
3761 seen_end:
3762 /* Next we assume `LC_COLLATE'. */
3763 if (!ignore_content)
3764 {
3765 if (state == 0
3766 && copy_locale == NULL
3767 && !collate->codepoint_collation)
3768 /* We must either see a copy statement or have
3769 ordering values, or codepoint_collation. */
3770 lr_error (ldfile,
3771 _("%s: empty category description not allowed"),
3772 "LC_COLLATE");
3773 else if (state == 1)
3774 {
3775 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3776 "LC_COLLATE");
3777
3778 /* Handle ellipsis at end of list. */
3779 if (was_ellipsis != tok_none)
3780 {
3781 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3782 repertoire, result);
3783 was_ellipsis = tok_none;
3784 }
3785 }
3786 else if (state == 3)
3787 record_error (0, 0, _("\
3788 %s: missing `reorder-end' keyword"), "LC_COLLATE");
3789 else if (state == 5)
3790 record_error (0, 0, _("\
3791 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE");
3792 }
3793 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3794 if (arg->tok == tok_eof)
3795 break;
3796 if (arg->tok == tok_eol)
3797 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3798 else if (arg->tok != tok_lc_collate)
3799 lr_error (ldfile, _("\
3800 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3801 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3802 return;
3803
3804 case tok_define:
3805 if (ignore_content)
3806 {
3807 lr_ignore_rest (ldfile, 0);
3808 break;
3809 }
3810
3811 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3812 if (arg->tok != tok_ident)
3813 goto err_label;
3814
3815 /* Simply add the new symbol. */
3816 struct name_list *newsym = xmalloc (sizeof (*newsym)
3817 + arg->val.str.lenmb + 1);
3818 memcpy (newsym->str, arg->val.str.startmb, arg->val.str.lenmb);
3819 newsym->str[arg->val.str.lenmb] = '\0';
3820 newsym->next = defined;
3821 defined = newsym;
3822
3823 lr_ignore_rest (ldfile, 1);
3824 break;
3825
3826 case tok_undef:
3827 if (ignore_content)
3828 {
3829 lr_ignore_rest (ldfile, 0);
3830 break;
3831 }
3832
3833 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3834 if (arg->tok != tok_ident)
3835 goto err_label;
3836
3837 /* Remove _all_ occurrences of the symbol from the list. */
3838 struct name_list *prevdef = NULL;
3839 struct name_list *curdef = defined;
3840 while (curdef != NULL)
3841 if (strncmp (arg->val.str.startmb, curdef->str,
3842 arg->val.str.lenmb) == 0
3843 && curdef->str[arg->val.str.lenmb] == '\0')
3844 {
3845 if (prevdef == NULL)
3846 defined = curdef->next;
3847 else
3848 prevdef->next = curdef->next;
3849
3850 struct name_list *olddef = curdef;
3851 curdef = curdef->next;
3852
3853 free (olddef);
3854 }
3855 else
3856 {
3857 prevdef = curdef;
3858 curdef = curdef->next;
3859 }
3860
3861 lr_ignore_rest (ldfile, 1);
3862 break;
3863
3864 case tok_ifdef:
3865 case tok_ifndef:
3866 if (ignore_content)
3867 {
3868 lr_ignore_rest (ldfile, 0);
3869 break;
3870 }
3871
3872 found_ifdef:
3873 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3874 if (arg->tok != tok_ident)
3875 goto err_label;
3876 lr_ignore_rest (ldfile, 1);
3877
3878 if (collate->else_action == else_none)
3879 {
3880 curdef = defined;
3881 while (curdef != NULL)
3882 if (strncmp (arg->val.str.startmb, curdef->str,
3883 arg->val.str.lenmb) == 0
3884 && curdef->str[arg->val.str.lenmb] == '\0')
3885 break;
3886 else
3887 curdef = curdef->next;
3888
3889 if ((nowtok == tok_ifdef && curdef != NULL)
3890 || (nowtok == tok_ifndef && curdef == NULL))
3891 {
3892 /* We have to use the if-branch. */
3893 collate->else_action = else_ignore;
3894 }
3895 else
3896 {
3897 /* We have to use the else-branch, if there is one. */
3898 nowtok = skip_to (ldfile, collate, charmap, 0);
3899 if (nowtok == tok_else)
3900 collate->else_action = else_seen;
3901 else if (nowtok == tok_elifdef)
3902 {
3903 nowtok = tok_ifdef;
3904 goto found_ifdef;
3905 }
3906 else if (nowtok == tok_elifndef)
3907 {
3908 nowtok = tok_ifndef;
3909 goto found_ifdef;
3910 }
3911 else if (nowtok == tok_eof)
3912 goto seen_eof;
3913 else if (nowtok == tok_end)
3914 goto seen_end;
3915 }
3916 }
3917 else
3918 {
3919 /* XXX Should it really become necessary to support nested
3920 preprocessor handling we will push the state here. */
3921 lr_error (ldfile, _("%s: nested conditionals not supported"),
3922 "LC_COLLATE");
3923 nowtok = skip_to (ldfile, collate, charmap, 1);
3924 if (nowtok == tok_eof)
3925 goto seen_eof;
3926 else if (nowtok == tok_end)
3927 goto seen_end;
3928 }
3929 break;
3930
3931 case tok_elifdef:
3932 case tok_elifndef:
3933 case tok_else:
3934 if (ignore_content)
3935 {
3936 lr_ignore_rest (ldfile, 0);
3937 break;
3938 }
3939
3940 lr_ignore_rest (ldfile, 1);
3941
3942 if (collate->else_action == else_ignore)
3943 {
3944 /* Ignore everything until the endif. */
3945 nowtok = skip_to (ldfile, collate, charmap, 1);
3946 if (nowtok == tok_eof)
3947 goto seen_eof;
3948 else if (nowtok == tok_end)
3949 goto seen_end;
3950 }
3951 else
3952 {
3953 assert (collate->else_action == else_none);
3954 lr_error (ldfile, _("\
3955 %s: '%s' without matching 'ifdef' or 'ifndef'"), "LC_COLLATE",
3956 nowtok == tok_else ? "else"
3957 : nowtok == tok_elifdef ? "elifdef" : "elifndef");
3958 }
3959 break;
3960
3961 case tok_endif:
3962 if (ignore_content)
3963 {
3964 lr_ignore_rest (ldfile, 0);
3965 break;
3966 }
3967
3968 lr_ignore_rest (ldfile, 1);
3969
3970 if (collate->else_action != else_ignore
3971 && collate->else_action != else_seen)
3972 lr_error (ldfile, _("\
3973 %s: 'endif' without matching 'ifdef' or 'ifndef'"), "LC_COLLATE");
3974
3975 /* XXX If we support nested preprocessor directives we pop
3976 the state here. */
3977 collate->else_action = else_none;
3978 break;
3979
3980 default:
3981 err_label:
3982 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3983 }
3984
3985 /* Prepare for the next round. */
3986 now = lr_token (ldfile, charmap, result, NULL, verbose);
3987 nowtok = now->tok;
3988 }
3989
3990 seen_eof:
3991 /* When we come here we reached the end of the file. */
3992 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
3993 }
3994