1 /* Charset name normalization.
2    Copyright (C) 2020-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <http://www.gnu.org/licenses/>.  */
18 
19 
20 #include <stdlib.h>
21 #include <ctype.h>
22 #include <locale.h>
23 #include <stdbool.h>
24 #include <string.h>
25 #include <sys/stat.h>
26 #include "gconv_int.h"
27 #include "gconv_charset.h"
28 
29 
/* Locate the last suffix of the conversion code string S.  A suffix starts
   at a '/' or ',' separator and runs up to, but not including, the next
   separator or the end of the string.  The return value points at the
   separator that begins the last suffix, or is NULL when S contains fewer
   than two '/' characters.  S is never modified here; the caller is expected
   to consume the suffix and truncate the string at the returned position
   before calling again.  */
static char *
find_suffix (char *s)
{
  /* A conversion code is a triplet whose components are separated by '/'.
     Suffixes live in the third component, so a string with fewer than two
     slashes cannot carry one.  */
  char *last_sep = NULL;
  int n_slashes = 0;

  for (char *p = s; *p != '\0'; p++)
    {
      /* Only '/' contributes to the triplet count ...  */
      if (*p == '/')
        n_slashes++;

      /* ... but both '/' and ',' may start a suffix.  */
      if (*p == '/' || *p == ',')
        last_sep = p;
    }

  return n_slashes >= 2 ? last_sep : NULL;
}
60 
61 
/* Working state used while parsing one conversion code string.  */
struct gconv_parsed_code
{
  /* Writable, NUL-terminated conversion code.  gconv_parse_code truncates
     it in place as suffixes are stripped.  */
  char *code;
  /* Set to true by gconv_parse_code when a TRANSLIT suffix was found.  */
  bool translit;
  /* Set to true by gconv_parse_code when an IGNORE suffix was found.  */
  bool ignore;
};
68 
69 
70 /* This function parses an iconv_open encoding PC.CODE, strips any suffixes
71    (such as TRANSLIT or IGNORE) from it and sets corresponding flags in it.  */
72 static void
gconv_parse_code(struct gconv_parsed_code * pc)73 gconv_parse_code (struct gconv_parsed_code *pc)
74 {
75   pc->translit = false;
76   pc->ignore = false;
77 
78   while (1)
79     {
80       /* First drop any trailing whitespaces and separators.  */
81       size_t len = strlen (pc->code);
82       while ((len > 0)
83              && (isspace (pc->code[len - 1])
84                  || pc->code[len - 1] == ','
85                  || pc->code[len - 1] == '/'))
86         len--;
87 
88       pc->code[len] = '\0';
89 
90       if (len == 0)
91         return;
92 
93       char * suffix = find_suffix (pc->code);
94       if (suffix == NULL)
95         {
96           /* At this point, we have processed and removed all suffixes from the
97              code and what remains of the code is suffix free.  */
98           return;
99         }
100       else
101         {
102           /* A suffix is processed from the end of the code array going
103              backwards, one suffix at a time.  The suffix is an index into the
104              code character array and points to: one past the end of the code
105              and any unprocessed suffixes, and to the beginning of the suffix
106              currently being processed during this iteration.  We must process
107              this suffix and then drop it from the code by terminating the
108              preceding text with NULL.
109 
110              We want to allow and recognize suffixes such as:
111 
112              "/TRANSLIT"         i.e. single suffix
113              "//TRANSLIT"        i.e. single suffix and multiple separators
114              "//TRANSLIT/IGNORE" i.e. suffixes separated by "/"
115              "/TRANSLIT//IGNORE" i.e. suffixes separated by "//"
116              "//IGNORE,TRANSLIT" i.e. suffixes separated by ","
117              "//IGNORE,"         i.e. trailing ","
118              "//TRANSLIT/"       i.e. trailing "/"
119              "//TRANSLIT//"      i.e. trailing "//"
120              "/"                 i.e. empty suffix.
121 
122              Unknown suffixes are silently discarded and ignored.  */
123 
124           if ((__strcasecmp_l (suffix,
125                                GCONV_TRIPLE_SEPARATOR
126                                GCONV_TRANSLIT_SUFFIX,
127                                _nl_C_locobj_ptr) == 0)
128               || (__strcasecmp_l (suffix,
129                                   GCONV_SUFFIX_SEPARATOR
130                                   GCONV_TRANSLIT_SUFFIX,
131                                   _nl_C_locobj_ptr) == 0))
132             pc->translit = true;
133 
134           if ((__strcasecmp_l (suffix,
135                                GCONV_TRIPLE_SEPARATOR
136                                GCONV_IGNORE_ERRORS_SUFFIX,
137                                _nl_C_locobj_ptr) == 0)
138               || (__strcasecmp_l (suffix,
139                                   GCONV_SUFFIX_SEPARATOR
140                                   GCONV_IGNORE_ERRORS_SUFFIX,
141                                   _nl_C_locobj_ptr) == 0))
142             pc->ignore = true;
143 
144           /* We just processed this suffix.  We can now drop it from the
145              code string by truncating it at the suffix's position.  */
146           suffix[0] = '\0';
147         }
148     }
149 }
150 
151 
152 /* This function accepts the charset names of the source and destination of the
153    conversion and populates *conv_spec with an equivalent conversion
154    specification that may later be used by __gconv_open.  The charset names
155    might contain options in the form of suffixes that alter the conversion,
156    e.g. "ISO-10646/UTF-8/TRANSLIT".  It processes the charset names, ignoring
157    and truncating any suffix options in fromcode, and processing and truncating
158    any suffix options in tocode.  Supported suffix options ("TRANSLIT" or
159    "IGNORE") when found in tocode lead to the corresponding flag in *conv_spec
160    to be set to true.  Unrecognized suffix options are silently discarded.  If
161    the function succeeds, it returns conv_spec back to the caller.  It returns
162    NULL upon failure.  conv_spec must be allocated and freed by the caller.  */
163 struct gconv_spec *
__gconv_create_spec(struct gconv_spec * conv_spec,const char * fromcode,const char * tocode)164 __gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode,
165                    const char *tocode)
166 {
167   struct gconv_parsed_code pfc, ptc;
168   struct gconv_spec *ret = NULL;
169 
170   pfc.code = __strdup (fromcode);
171   ptc.code = __strdup (tocode);
172 
173   if ((pfc.code == NULL)
174       || (ptc.code == NULL))
175     goto out;
176 
177   gconv_parse_code (&pfc);
178   gconv_parse_code (&ptc);
179 
180   /* We ignore suffixes in the fromcode because that is how the current
181      implementation has always handled them.  Only suffixes in the tocode are
182      processed and handled.  The reality is that invalid input in the input
183      character set should only be ignored if the fromcode specifies IGNORE.
184      The current implementation ignores invalid intput in the input character
185      set if the tocode contains IGNORE.  We preserve this behavior for
186      backwards compatibility.  In the future we may split the handling of
187      IGNORE to allow a finer grained specification of ignorning invalid input
188      and/or ignoring invalid output.  */
189   conv_spec->translit = ptc.translit;
190   conv_spec->ignore = ptc.ignore;
191 
192   /* 3 extra bytes because 1 extra for '\0', and 2 extra so strip might
193      be able to add one or two trailing '/' characters if necessary.  */
194   conv_spec->fromcode = malloc (strlen (fromcode) + 3);
195   if (conv_spec->fromcode == NULL)
196     goto out;
197 
198   conv_spec->tocode = malloc (strlen (tocode) + 3);
199   if (conv_spec->tocode == NULL)
200     {
201       free (conv_spec->fromcode);
202       conv_spec->fromcode = NULL;
203       goto out;
204     }
205 
206   /* Strip unrecognized characters and ensure that the code has two '/'
207      characters as per conversion code triplet specification.  */
208   strip (conv_spec->fromcode, pfc.code);
209   strip (conv_spec->tocode, ptc.code);
210   ret = conv_spec;
211 
212 out:
213   free (pfc.code);
214   free (ptc.code);
215 
216   return ret;
217 }
libc_hidden_def(__gconv_create_spec)218 libc_hidden_def (__gconv_create_spec)
219 
220 
221 void
222 __gconv_destroy_spec (struct gconv_spec *conv_spec)
223 {
224   free (conv_spec->fromcode);
225   free (conv_spec->tocode);
226   return;
227 }
228 libc_hidden_def (__gconv_destroy_spec)
229