1 /* Transliteration using the locale's data.
2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #include <assert.h>
20 #include <dlfcn.h>
21 #include <search.h>
22 #include <stdint.h>
23 #include <string.h>
24 #include <stdlib.h>
25 
26 #include <libc-lock.h>
27 #include "gconv_int.h"
28 #include "../locale/localeinfo.h"
29 
30 
31 int
__gconv_transliterate(struct __gconv_step * step,struct __gconv_step_data * step_data,const unsigned char * inbufstart,const unsigned char ** inbufp,const unsigned char * inbufend,unsigned char ** outbufstart,size_t * irreversible)32 __gconv_transliterate (struct __gconv_step *step,
33 		       struct __gconv_step_data *step_data,
34 		       const unsigned char *inbufstart,
35 		       const unsigned char **inbufp,
36 		       const unsigned char *inbufend,
37 		       unsigned char **outbufstart, size_t *irreversible)
38 {
39   /* Find out about the locale's transliteration.  */
40   uint32_t size;
41   const uint32_t *from_idx;
42   const uint32_t *from_tbl;
43   const uint32_t *to_idx;
44   const uint32_t *to_tbl;
45   const uint32_t *winbuf;
46   const uint32_t *winbufend;
47   uint32_t low;
48   uint32_t high;
49 
50   /* The input buffer.  There are actually 4-byte values.  */
51   winbuf = (const uint32_t *) *inbufp;
52   winbufend = (const uint32_t *) inbufend;
53 
54   __gconv_fct fct = step->__fct;
55 #ifdef PTR_DEMANGLE
56   if (step->__shlib_handle != NULL)
57     PTR_DEMANGLE (fct);
58 #endif
59 
60   /* If there is no transliteration information in the locale don't do
61      anything and return the error.  */
62   size = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_TAB_SIZE);
63   if (size == 0)
64     goto no_rules;
65 
66   /* Get the rest of the values.  */
67   from_idx =
68     (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_IDX);
69   from_tbl =
70     (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_TBL);
71   to_idx =
72     (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_IDX);
73   to_tbl =
74     (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_TBL);
75 
76   /* Test whether there is enough input.  */
77   if (winbuf + 1 > winbufend)
78     return (winbuf == winbufend
79 	    ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
80 
81   /* The array starting at FROM_IDX contains indices to the string table
82      in FROM_TBL.  The indices are sorted wrt to the strings.  I.e., we
83      are doing binary search.  */
84   low = 0;
85   high = size;
86   while (low < high)
87     {
88       uint32_t med = (low + high) / 2;
89       uint32_t idx;
90       int cnt;
91 
92       /* Compare the string at this index with the string at the current
93 	 position in the input buffer.  */
94       idx = from_idx[med];
95       cnt = 0;
96       do
97 	{
98 	  if (from_tbl[idx + cnt] != winbuf[cnt])
99 	    /* Does not match.  */
100 	    break;
101 	  ++cnt;
102 	}
103       while (from_tbl[idx + cnt] != L'\0' && winbuf + cnt < winbufend);
104 
105       if (cnt > 0 && from_tbl[idx + cnt] == L'\0')
106 	{
107 	  /* Found a matching input sequence.  Now try to convert the
108 	     possible replacements.  */
109 	  uint32_t idx2 = to_idx[med];
110 
111 	  do
112 	    {
113 	      /* Determine length of replacement.  */
114 	      unsigned int len = 0;
115 	      int res;
116 	      const unsigned char *toinptr;
117 	      unsigned char *outptr;
118 
119 	      while (to_tbl[idx2 + len] != L'\0')
120 		++len;
121 
122 	      /* Try this input text.  */
123 	      toinptr = (const unsigned char *) &to_tbl[idx2];
124 	      outptr = *outbufstart;
125 	      res = DL_CALL_FCT (fct,
126 				 (step, step_data, &toinptr,
127 				  (const unsigned char *) &to_tbl[idx2 + len],
128 				  &outptr, NULL, 0, 0));
129 	      if (res != __GCONV_ILLEGAL_INPUT)
130 		{
131 		  /* If the conversion succeeds we have to increment the
132 		     input buffer.  */
133 		  if (res == __GCONV_EMPTY_INPUT)
134 		    {
135 		      *inbufp += cnt * sizeof (uint32_t);
136 		      ++*irreversible;
137 		      res = __GCONV_OK;
138 		    }
139 		  /* Do not increment the output pointer if we could not
140 		     store the entire output. */
141 		  if (res != __GCONV_FULL_OUTPUT)
142 		    *outbufstart = outptr;
143 
144 		  return res;
145 		}
146 
147 	      /* Next replacement.  */
148 	      idx2 += len + 1;
149 	    }
150 	  while (to_tbl[idx2] != L'\0');
151 
152 	  /* Nothing found, continue searching.  */
153 	}
154       else if (cnt > 0)
155 	/* This means that the input buffer contents matches a prefix of
156 	   an entry.  Since we cannot match it unless we get more input,
157 	   we will tell the caller about it.  */
158 	return __GCONV_INCOMPLETE_INPUT;
159 
160       if (winbuf + cnt >= winbufend || from_tbl[idx + cnt] < winbuf[cnt])
161 	low = med + 1;
162       else
163 	high = med;
164     }
165 
166  no_rules:
167   /* Maybe the character is supposed to be ignored.  */
168   if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN) != 0)
169     {
170       int n = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN);
171       const uint32_t *ranges =
172 	(const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE);
173       const uint32_t wc = *(const uint32_t *) (*inbufp);
174       int i;
175 
176       /* Test whether there is enough input.  */
177       if (winbuf + 1 > winbufend)
178 	return (winbuf == winbufend
179 		? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
180 
181       for (i = 0; i < n; ranges += 3, ++i)
182 	if (ranges[0] <= wc && wc <= ranges[1]
183 	    && (wc - ranges[0]) % ranges[2] == 0)
184 	  {
185 	    /* Matches the range.  Ignore it.  */
186 	    *inbufp += 4;
187 	    ++*irreversible;
188 	    return __GCONV_OK;
189 	  }
190 	else if (wc < ranges[0])
191 	  /* There cannot be any other matching range since they are
192              sorted.  */
193 	  break;
194     }
195 
196   /* One last chance: use the default replacement.  */
197   if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN) != 0)
198     {
199       const uint32_t *default_missing = (const uint32_t *)
200 	_NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING);
201       const unsigned char *toinptr = (const unsigned char *) default_missing;
202       uint32_t len = _NL_CURRENT_WORD (LC_CTYPE,
203 				       _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN);
204       unsigned char *outptr;
205       int res;
206 
207       /* Test whether there is enough input.  */
208       if (winbuf + 1 > winbufend)
209 	return (winbuf == winbufend
210 		? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
211 
212       outptr = *outbufstart;
213       res = DL_CALL_FCT (fct,
214 			 (step, step_data, &toinptr,
215 			  (const unsigned char *) (default_missing + len),
216 			  &outptr, NULL, 0, 0));
217 
218       if (res != __GCONV_ILLEGAL_INPUT)
219 	{
220 	  /* If the conversion succeeds we have to increment the
221 	     input buffer.  */
222 	  if (res == __GCONV_EMPTY_INPUT)
223 	    {
224 	      /* This worked but is not reversible.  */
225 	      ++*irreversible;
226 	      *inbufp += 4;
227 	      res = __GCONV_OK;
228 	    }
229 	  *outbufstart = outptr;
230 
231 	  return res;
232 	}
233     }
234 
235   /* Haven't found a match.  */
236   return __GCONV_ILLEGAL_INPUT;
237 }
238 libc_hidden_def (__gconv_transliterate)
239