1 /* Transliteration using the locale's data.
2 Copyright (C) 2000-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <assert.h>
20 #include <dlfcn.h>
21 #include <search.h>
22 #include <stdint.h>
23 #include <string.h>
24 #include <stdlib.h>
25
26 #include <libc-lock.h>
27 #include "gconv_int.h"
28 #include "../locale/localeinfo.h"
29
30
31 int
__gconv_transliterate(struct __gconv_step * step,struct __gconv_step_data * step_data,const unsigned char * inbufstart,const unsigned char ** inbufp,const unsigned char * inbufend,unsigned char ** outbufstart,size_t * irreversible)32 __gconv_transliterate (struct __gconv_step *step,
33 struct __gconv_step_data *step_data,
34 const unsigned char *inbufstart,
35 const unsigned char **inbufp,
36 const unsigned char *inbufend,
37 unsigned char **outbufstart, size_t *irreversible)
38 {
39 /* Find out about the locale's transliteration. */
40 uint32_t size;
41 const uint32_t *from_idx;
42 const uint32_t *from_tbl;
43 const uint32_t *to_idx;
44 const uint32_t *to_tbl;
45 const uint32_t *winbuf;
46 const uint32_t *winbufend;
47 uint32_t low;
48 uint32_t high;
49
50 /* The input buffer. There are actually 4-byte values. */
51 winbuf = (const uint32_t *) *inbufp;
52 winbufend = (const uint32_t *) inbufend;
53
54 __gconv_fct fct = step->__fct;
55 #ifdef PTR_DEMANGLE
56 if (step->__shlib_handle != NULL)
57 PTR_DEMANGLE (fct);
58 #endif
59
60 /* If there is no transliteration information in the locale don't do
61 anything and return the error. */
62 size = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_TAB_SIZE);
63 if (size == 0)
64 goto no_rules;
65
66 /* Get the rest of the values. */
67 from_idx =
68 (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_IDX);
69 from_tbl =
70 (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_TBL);
71 to_idx =
72 (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_IDX);
73 to_tbl =
74 (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_TBL);
75
76 /* Test whether there is enough input. */
77 if (winbuf + 1 > winbufend)
78 return (winbuf == winbufend
79 ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
80
81 /* The array starting at FROM_IDX contains indices to the string table
82 in FROM_TBL. The indices are sorted wrt to the strings. I.e., we
83 are doing binary search. */
84 low = 0;
85 high = size;
86 while (low < high)
87 {
88 uint32_t med = (low + high) / 2;
89 uint32_t idx;
90 int cnt;
91
92 /* Compare the string at this index with the string at the current
93 position in the input buffer. */
94 idx = from_idx[med];
95 cnt = 0;
96 do
97 {
98 if (from_tbl[idx + cnt] != winbuf[cnt])
99 /* Does not match. */
100 break;
101 ++cnt;
102 }
103 while (from_tbl[idx + cnt] != L'\0' && winbuf + cnt < winbufend);
104
105 if (cnt > 0 && from_tbl[idx + cnt] == L'\0')
106 {
107 /* Found a matching input sequence. Now try to convert the
108 possible replacements. */
109 uint32_t idx2 = to_idx[med];
110
111 do
112 {
113 /* Determine length of replacement. */
114 unsigned int len = 0;
115 int res;
116 const unsigned char *toinptr;
117 unsigned char *outptr;
118
119 while (to_tbl[idx2 + len] != L'\0')
120 ++len;
121
122 /* Try this input text. */
123 toinptr = (const unsigned char *) &to_tbl[idx2];
124 outptr = *outbufstart;
125 res = DL_CALL_FCT (fct,
126 (step, step_data, &toinptr,
127 (const unsigned char *) &to_tbl[idx2 + len],
128 &outptr, NULL, 0, 0));
129 if (res != __GCONV_ILLEGAL_INPUT)
130 {
131 /* If the conversion succeeds we have to increment the
132 input buffer. */
133 if (res == __GCONV_EMPTY_INPUT)
134 {
135 *inbufp += cnt * sizeof (uint32_t);
136 ++*irreversible;
137 res = __GCONV_OK;
138 }
139 /* Do not increment the output pointer if we could not
140 store the entire output. */
141 if (res != __GCONV_FULL_OUTPUT)
142 *outbufstart = outptr;
143
144 return res;
145 }
146
147 /* Next replacement. */
148 idx2 += len + 1;
149 }
150 while (to_tbl[idx2] != L'\0');
151
152 /* Nothing found, continue searching. */
153 }
154 else if (cnt > 0)
155 /* This means that the input buffer contents matches a prefix of
156 an entry. Since we cannot match it unless we get more input,
157 we will tell the caller about it. */
158 return __GCONV_INCOMPLETE_INPUT;
159
160 if (winbuf + cnt >= winbufend || from_tbl[idx + cnt] < winbuf[cnt])
161 low = med + 1;
162 else
163 high = med;
164 }
165
166 no_rules:
167 /* Maybe the character is supposed to be ignored. */
168 if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN) != 0)
169 {
170 int n = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN);
171 const uint32_t *ranges =
172 (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE);
173 const uint32_t wc = *(const uint32_t *) (*inbufp);
174 int i;
175
176 /* Test whether there is enough input. */
177 if (winbuf + 1 > winbufend)
178 return (winbuf == winbufend
179 ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
180
181 for (i = 0; i < n; ranges += 3, ++i)
182 if (ranges[0] <= wc && wc <= ranges[1]
183 && (wc - ranges[0]) % ranges[2] == 0)
184 {
185 /* Matches the range. Ignore it. */
186 *inbufp += 4;
187 ++*irreversible;
188 return __GCONV_OK;
189 }
190 else if (wc < ranges[0])
191 /* There cannot be any other matching range since they are
192 sorted. */
193 break;
194 }
195
196 /* One last chance: use the default replacement. */
197 if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN) != 0)
198 {
199 const uint32_t *default_missing = (const uint32_t *)
200 _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING);
201 const unsigned char *toinptr = (const unsigned char *) default_missing;
202 uint32_t len = _NL_CURRENT_WORD (LC_CTYPE,
203 _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN);
204 unsigned char *outptr;
205 int res;
206
207 /* Test whether there is enough input. */
208 if (winbuf + 1 > winbufend)
209 return (winbuf == winbufend
210 ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
211
212 outptr = *outbufstart;
213 res = DL_CALL_FCT (fct,
214 (step, step_data, &toinptr,
215 (const unsigned char *) (default_missing + len),
216 &outptr, NULL, 0, 0));
217
218 if (res != __GCONV_ILLEGAL_INPUT)
219 {
220 /* If the conversion succeeds we have to increment the
221 input buffer. */
222 if (res == __GCONV_EMPTY_INPUT)
223 {
224 /* This worked but is not reversible. */
225 ++*irreversible;
226 *inbufp += 4;
227 res = __GCONV_OK;
228 }
229 *outbufstart = outptr;
230
231 return res;
232 }
233 }
234
235 /* Haven't found a match. */
236 return __GCONV_ILLEGAL_INPUT;
237 }
238 libc_hidden_def (__gconv_transliterate)
239