1 /* Copyright (C) 1995-2022 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3
4 The GNU C Library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 2.1 of the License, or (at your option) any later version.
8
9 The GNU C Library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with the GNU C Library; if not, see
16 <https://www.gnu.org/licenses/>. */
17
18 #include <assert.h>
19 #include <langinfo.h>
20 #include <locale.h>
21 #include <stddef.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <sys/param.h>
26
27 #ifndef STRING_TYPE
28 # define STRING_TYPE char
29 # define USTRING_TYPE unsigned char
30 # define STRXFRM __strxfrm_l
31 # define STRLEN strlen
32 # define STPNCPY __stpncpy
33 # define WEIGHT_H "../locale/weight.h"
34 # define SUFFIX MB
35 # define L(arg) arg
36 #endif
37
38 #define CONCAT(a,b) CONCAT1(a,b)
39 #define CONCAT1(a,b) a##b
40
41 /* Maximum string size that is calculated with cached indices. Right now this
42 is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be
43 lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */
44 #define SMALL_STR_SIZE 4095
45
46 #include "../locale/localeinfo.h"
47 #include WEIGHT_H
48
49 /* Group locale data for shorter parameter lists. */
50 typedef struct
51 {
52 uint32_t nrules;
53 unsigned char *rulesets;
54 USTRING_TYPE *weights;
55 int32_t *table;
56 USTRING_TYPE *extra;
57 int32_t *indirect;
58 } locale_data_t;
59
60 #ifndef WIDE_CHAR_VERSION
61
62 /* We need UTF-8 encoding of numbers. */
63 static int
utf8_encode(char * buf,int val)64 utf8_encode (char *buf, int val)
65 {
66 int retval;
67
68 if (val < 0x80)
69 {
70 *buf++ = (char) val;
71 retval = 1;
72 }
73 else
74 {
75 int step;
76
77 for (step = 2; step < 6; ++step)
78 if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
79 break;
80 retval = step;
81
82 *buf = (unsigned char) (~0xff >> step);
83 --step;
84 do
85 {
86 buf[step] = 0x80 | (val & 0x3f);
87 val >>= 6;
88 }
89 while (--step > 0);
90 *buf |= val;
91 }
92
93 return retval;
94 }
95 #endif
96
97 /* Find next weight and rule index. Inlined since called for every char. */
98 static __always_inline size_t
find_idx(const USTRING_TYPE ** us,int32_t * weight_idx,unsigned char * rule_idx,const locale_data_t * l_data,const int pass)99 find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
100 unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
101 {
102 int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
103 -1);
104 *rule_idx = tmp >> 24;
105 int32_t idx = tmp & 0xffffff;
106 size_t len = l_data->weights[idx++];
107
108 /* Skip over indices of previous levels. */
109 for (int i = 0; i < pass; i++)
110 {
111 idx += len;
112 len = l_data->weights[idx++];
113 }
114
115 *weight_idx = idx;
116 return len;
117 }
118
119 static int
find_position(const USTRING_TYPE * us,const locale_data_t * l_data,const int pass)120 find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
121 const int pass)
122 {
123 int32_t weight_idx;
124 unsigned char rule_idx;
125 const USTRING_TYPE *usrc = us;
126
127 find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
128 return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
129 }
130
131 /* Do the transformation. */
132 static size_t
do_xfrm(const USTRING_TYPE * usrc,STRING_TYPE * dest,size_t n,const locale_data_t * l_data)133 do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
134 const locale_data_t *l_data)
135 {
136 int32_t weight_idx;
137 unsigned char rule_idx;
138 uint32_t pass;
139 size_t needed = 0;
140 size_t last_needed;
141
142 /* Now the passes over the weights. */
143 for (pass = 0; pass < l_data->nrules; ++pass)
144 {
145 size_t backw_len = 0;
146 last_needed = needed;
147 const USTRING_TYPE *cur = usrc;
148 const USTRING_TYPE *backw_start = NULL;
149
150 /* We assume that if a rule has defined `position' in one section
151 this is true for all of them. */
152 int position = find_position (cur, l_data, pass);
153
154 if (position == 0)
155 {
156 while (*cur != L('\0'))
157 {
158 const USTRING_TYPE *pos = cur;
159 size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
160 pass);
161 int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
162
163 if ((rule & sort_forward) != 0)
164 {
165 /* Handle the pushed backward sequence. */
166 if (backw_start != NULL)
167 {
168 for (size_t i = backw_len; i > 0; )
169 {
170 int32_t weight_idx;
171 unsigned char rule_idx;
172 size_t len = find_idx (&backw_start, &weight_idx,
173 &rule_idx, l_data, pass);
174 if (needed + i < n)
175 for (size_t j = len; j > 0; j--)
176 dest[needed + i - j] =
177 l_data->weights[weight_idx++];
178
179 i -= len;
180 }
181
182 needed += backw_len;
183 backw_start = NULL;
184 backw_len = 0;
185 }
186
187 /* Now handle the forward element. */
188 if (needed + len < n)
189 while (len-- > 0)
190 dest[needed++] = l_data->weights[weight_idx++];
191 else
192 /* No more characters fit into the buffer. */
193 needed += len;
194 }
195 else
196 {
197 /* Remember start of the backward sequence & track length. */
198 if (backw_start == NULL)
199 backw_start = pos;
200 backw_len += len;
201 }
202 }
203
204
205 /* Handle the pushed backward sequence. */
206 if (backw_start != NULL)
207 {
208 for (size_t i = backw_len; i > 0; )
209 {
210 size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
211 l_data, pass);
212 if (needed + i < n)
213 for (size_t j = len; j > 0; j--)
214 dest[needed + i - j] =
215 l_data->weights[weight_idx++];
216
217 i -= len;
218 }
219
220 needed += backw_len;
221 }
222 }
223 else
224 {
225 int val = 1;
226 #ifndef WIDE_CHAR_VERSION
227 char buf[7];
228 size_t buflen;
229 #endif
230 size_t i;
231
232 while (*cur != L('\0'))
233 {
234 const USTRING_TYPE *pos = cur;
235 size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
236 pass);
237 int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
238
239 if ((rule & sort_forward) != 0)
240 {
241 /* Handle the pushed backward sequence. */
242 if (backw_start != NULL)
243 {
244 for (size_t p = backw_len; p > 0; p--)
245 {
246 size_t len;
247 int32_t weight_idx;
248 unsigned char rule_idx;
249 const USTRING_TYPE *backw_cur = backw_start;
250
251 /* To prevent a warning init the used vars. */
252 len = find_idx (&backw_cur, &weight_idx,
253 &rule_idx, l_data, pass);
254
255 for (i = 1; i < p; i++)
256 len = find_idx (&backw_cur, &weight_idx,
257 &rule_idx, l_data, pass);
258
259 if (len != 0)
260 {
261 #ifdef WIDE_CHAR_VERSION
262 if (needed + 1 + len < n)
263 {
264 dest[needed] = val;
265 for (i = 0; i < len; ++i)
266 dest[needed + 1 + i] =
267 l_data->weights[weight_idx + i];
268 }
269 needed += 1 + len;
270 #else
271 buflen = utf8_encode (buf, val);
272 if (needed + buflen + len < n)
273 {
274 for (i = 0; i < buflen; ++i)
275 dest[needed + i] = buf[i];
276 for (i = 0; i < len; ++i)
277 dest[needed + buflen + i] =
278 l_data->weights[weight_idx + i];
279 }
280 needed += buflen + len;
281 #endif
282 val = 1;
283 }
284 else
285 ++val;
286 }
287
288 backw_start = NULL;
289 backw_len = 0;
290 }
291
292 /* Now handle the forward element. */
293 if (len != 0)
294 {
295 #ifdef WIDE_CHAR_VERSION
296 if (needed + 1 + len < n)
297 {
298 dest[needed] = val;
299 for (i = 0; i < len; ++i)
300 dest[needed + 1 + i] =
301 l_data->weights[weight_idx + i];
302 }
303 needed += 1 + len;
304 #else
305 buflen = utf8_encode (buf, val);
306 if (needed + buflen + len < n)
307 {
308 for (i = 0; i < buflen; ++i)
309 dest[needed + i] = buf[i];
310 for (i = 0; i < len; ++i)
311 dest[needed + buflen + i] =
312 l_data->weights[weight_idx + i];
313 }
314 needed += buflen + len;
315 #endif
316 val = 1;
317 }
318 else
319 ++val;
320 }
321 else
322 {
323 /* Remember start of the backward sequence & track length. */
324 if (backw_start == NULL)
325 backw_start = pos;
326 backw_len++;
327 }
328 }
329
330 /* Handle the pushed backward sequence. */
331 if (backw_start != NULL)
332 {
333 for (size_t p = backw_len; p > 0; p--)
334 {
335 size_t len;
336 int32_t weight_idx;
337 unsigned char rule_idx;
338 const USTRING_TYPE *backw_cur = backw_start;
339
340 /* To prevent a warning init the used vars. */
341 len = find_idx (&backw_cur, &weight_idx,
342 &rule_idx, l_data, pass);
343
344 for (i = 1; i < p; i++)
345 len = find_idx (&backw_cur, &weight_idx,
346 &rule_idx, l_data, pass);
347
348 if (len != 0)
349 {
350 #ifdef WIDE_CHAR_VERSION
351 if (needed + 1 + len < n)
352 {
353 dest[needed] = val;
354 for (i = 0; i < len; ++i)
355 dest[needed + 1 + i] =
356 l_data->weights[weight_idx + i];
357 }
358 needed += 1 + len;
359 #else
360 buflen = utf8_encode (buf, val);
361 if (needed + buflen + len < n)
362 {
363 for (i = 0; i < buflen; ++i)
364 dest[needed + i] = buf[i];
365 for (i = 0; i < len; ++i)
366 dest[needed + buflen + i] =
367 l_data->weights[weight_idx + i];
368 }
369 needed += buflen + len;
370 #endif
371 val = 1;
372 }
373 else
374 ++val;
375 }
376 }
377 }
378
379 /* Finally store the byte to separate the passes or terminate
380 the string. */
381 if (needed < n)
382 dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
383 ++needed;
384 }
385
386 /* This is a little optimization: many collation specifications have
387 a `position' rule at the end and if no non-ignored character
388 is found the last \1 byte is immediately followed by a \0 byte
389 signalling this. We can avoid the \1 byte(s). */
390 if (needed > 2 && needed == last_needed + 1)
391 {
392 /* Remove the \1 byte. */
393 if (--needed <= n)
394 dest[needed - 1] = L('\0');
395 }
396
397 /* Return the number of bytes/words we need, but don't count the NUL
398 byte/word at the end. */
399 return needed - 1;
400 }
401
402 /* Do the transformation using weight-index and rule cache. */
403 static size_t
do_xfrm_cached(STRING_TYPE * dest,size_t n,const locale_data_t * l_data,size_t idxmax,int32_t * idxarr,const unsigned char * rulearr)404 do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
405 size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
406 {
407 uint32_t nrules = l_data->nrules;
408 unsigned char *rulesets = l_data->rulesets;
409 USTRING_TYPE *weights = l_data->weights;
410 uint32_t pass;
411 size_t needed = 0;
412 size_t last_needed;
413 size_t idxcnt;
414
415 /* Now the passes over the weights. */
416 for (pass = 0; pass < nrules; ++pass)
417 {
418 size_t backw_stop = ~0ul;
419 int rule = rulesets[rulearr[0] * nrules + pass];
420 /* We assume that if a rule has defined `position' in one section
421 this is true for all of them. */
422 int position = rule & sort_position;
423
424 last_needed = needed;
425 if (position == 0)
426 {
427 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
428 {
429 if ((rule & sort_forward) != 0)
430 {
431 size_t len;
432
433 if (backw_stop != ~0ul)
434 {
435 /* Handle the pushed elements now. */
436 size_t backw;
437
438 for (backw = idxcnt; backw > backw_stop; )
439 {
440 --backw;
441 len = weights[idxarr[backw]++];
442
443 if (needed + len < n)
444 while (len-- > 0)
445 dest[needed++] = weights[idxarr[backw]++];
446 else
447 {
448 /* No more characters fit into the buffer. */
449 needed += len;
450 idxarr[backw] += len;
451 }
452 }
453
454 backw_stop = ~0ul;
455 }
456
457 /* Now handle the forward element. */
458 len = weights[idxarr[idxcnt]++];
459 if (needed + len < n)
460 while (len-- > 0)
461 dest[needed++] = weights[idxarr[idxcnt]++];
462 else
463 {
464 /* No more characters fit into the buffer. */
465 needed += len;
466 idxarr[idxcnt] += len;
467 }
468 }
469 else
470 {
471 /* Remember where the backwards series started. */
472 if (backw_stop == ~0ul)
473 backw_stop = idxcnt;
474 }
475
476 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
477 }
478
479
480 if (backw_stop != ~0ul)
481 {
482 /* Handle the pushed elements now. */
483 size_t backw;
484
485 backw = idxcnt;
486 while (backw > backw_stop)
487 {
488 size_t len = weights[idxarr[--backw]++];
489
490 if (needed + len < n)
491 while (len-- > 0)
492 dest[needed++] = weights[idxarr[backw]++];
493 else
494 {
495 /* No more characters fit into the buffer. */
496 needed += len;
497 idxarr[backw] += len;
498 }
499 }
500 }
501 }
502 else
503 {
504 int val = 1;
505 #ifndef WIDE_CHAR_VERSION
506 char buf[7];
507 size_t buflen;
508 #endif
509 size_t i;
510
511 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
512 {
513 if ((rule & sort_forward) != 0)
514 {
515 size_t len;
516
517 if (backw_stop != ~0ul)
518 {
519 /* Handle the pushed elements now. */
520 size_t backw;
521
522 for (backw = idxcnt; backw > backw_stop; )
523 {
524 --backw;
525 len = weights[idxarr[backw]++];
526 if (len != 0)
527 {
528 #ifdef WIDE_CHAR_VERSION
529 if (needed + 1 + len < n)
530 {
531 dest[needed] = val;
532 for (i = 0; i < len; ++i)
533 dest[needed + 1 + i] =
534 weights[idxarr[backw] + i];
535 }
536 needed += 1 + len;
537 #else
538 buflen = utf8_encode (buf, val);
539 if (needed + buflen + len < n)
540 {
541 for (i = 0; i < buflen; ++i)
542 dest[needed + i] = buf[i];
543 for (i = 0; i < len; ++i)
544 dest[needed + buflen + i] =
545 weights[idxarr[backw] + i];
546 }
547 needed += buflen + len;
548 #endif
549 idxarr[backw] += len;
550 val = 1;
551 }
552 else
553 ++val;
554 }
555
556 backw_stop = ~0ul;
557 }
558
559 /* Now handle the forward element. */
560 len = weights[idxarr[idxcnt]++];
561 if (len != 0)
562 {
563 #ifdef WIDE_CHAR_VERSION
564 if (needed + 1+ len < n)
565 {
566 dest[needed] = val;
567 for (i = 0; i < len; ++i)
568 dest[needed + 1 + i] =
569 weights[idxarr[idxcnt] + i];
570 }
571 needed += 1 + len;
572 #else
573 buflen = utf8_encode (buf, val);
574 if (needed + buflen + len < n)
575 {
576 for (i = 0; i < buflen; ++i)
577 dest[needed + i] = buf[i];
578 for (i = 0; i < len; ++i)
579 dest[needed + buflen + i] =
580 weights[idxarr[idxcnt] + i];
581 }
582 needed += buflen + len;
583 #endif
584 idxarr[idxcnt] += len;
585 val = 1;
586 }
587 else
588 /* Note that we don't have to increment `idxarr[idxcnt]'
589 since the length is zero. */
590 ++val;
591 }
592 else
593 {
594 /* Remember where the backwards series started. */
595 if (backw_stop == ~0ul)
596 backw_stop = idxcnt;
597 }
598
599 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
600 }
601
602 if (backw_stop != ~0ul)
603 {
604 /* Handle the pushed elements now. */
605 size_t backw;
606
607 backw = idxmax - 1;
608 while (backw > backw_stop)
609 {
610 size_t len = weights[idxarr[--backw]++];
611 if (len != 0)
612 {
613 #ifdef WIDE_CHAR_VERSION
614 if (needed + 1 + len < n)
615 {
616 dest[needed] = val;
617 for (i = 0; i < len; ++i)
618 dest[needed + 1 + i] =
619 weights[idxarr[backw] + i];
620 }
621 needed += 1 + len;
622 #else
623 buflen = utf8_encode (buf, val);
624 if (needed + buflen + len < n)
625 {
626 for (i = 0; i < buflen; ++i)
627 dest[needed + i] = buf[i];
628 for (i = 0; i < len; ++i)
629 dest[needed + buflen + i] =
630 weights[idxarr[backw] + i];
631 }
632 needed += buflen + len;
633 #endif
634 idxarr[backw] += len;
635 val = 1;
636 }
637 else
638 ++val;
639 }
640 }
641 }
642
643 /* Finally store the byte to separate the passes or terminate
644 the string. */
645 if (needed < n)
646 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
647 ++needed;
648 }
649
650 /* This is a little optimization: many collation specifications have
651 a `position' rule at the end and if no non-ignored character
652 is found the last \1 byte is immediately followed by a \0 byte
653 signalling this. We can avoid the \1 byte(s). */
654 if (needed > 2 && needed == last_needed + 1)
655 {
656 /* Remove the \1 byte. */
657 if (--needed <= n)
658 dest[needed - 1] = L('\0');
659 }
660
661 /* Return the number of bytes/words we need, but don't count the NUL
662 byte/word at the end. */
663 return needed - 1;
664 }
665
666 size_t
STRXFRM(STRING_TYPE * dest,const STRING_TYPE * src,size_t n,locale_t l)667 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, locale_t l)
668 {
669 locale_data_t l_data;
670 struct __locale_data *current = l->__locales[LC_COLLATE];
671 l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
672
673 /* Handle byte comparison case. */
674 if (l_data.nrules == 0)
675 {
676 size_t srclen = STRLEN (src);
677
678 if (n != 0)
679 STPNCPY (dest, src, MIN (srclen + 1, n));
680
681 return srclen;
682 }
683
684 /* Handle an empty string, code hereafter relies on strlen (src) > 0. */
685 if (*src == L('\0'))
686 {
687 if (n != 0)
688 *dest = L('\0');
689 return 0;
690 }
691
692 /* Get the locale data. */
693 l_data.rulesets = (unsigned char *)
694 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
695 l_data.table = (int32_t *)
696 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
697 l_data.weights = (USTRING_TYPE *)
698 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
699 l_data.extra = (USTRING_TYPE *)
700 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
701 l_data.indirect = (int32_t *)
702 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
703
704 assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
705 assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
706 assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
707 assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
708
709 /* We need the elements of the string as unsigned values since they
710 are used as indices. */
711 const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;
712
713 /* Allocate cache for small strings on the stack and fill it with weight and
714 rule indices. If the cache size is not sufficient, continue with the
715 uncached xfrm version. */
716 size_t idxmax = 0;
717 const USTRING_TYPE *cur = usrc;
718 int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t));
719 unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
720
721 do
722 {
723 int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
724 -1);
725 rulearr[idxmax] = tmp >> 24;
726 idxarr[idxmax] = tmp & 0xffffff;
727
728 ++idxmax;
729 }
730 while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
731
732 /* This element is only read, the value never used but to determine
733 another value which then is ignored. */
734 rulearr[idxmax] = '\0';
735
736 /* Do the transformation. */
737 if (*cur == L('\0'))
738 return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
739 else
740 return do_xfrm (usrc, dest, n, &l_data);
741 }
742 libc_hidden_def (STRXFRM)
743
744 #ifndef WIDE_CHAR_VERSION
745 weak_alias (__strxfrm_l, strxfrm_l)
746 #endif
747