1 /* Charset name normalization.
2 Copyright (C) 2020-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19
20 #include <stdlib.h>
21 #include <ctype.h>
22 #include <locale.h>
23 #include <stdbool.h>
24 #include <string.h>
25 #include <sys/stat.h>
26 #include "gconv_int.h"
27 #include "gconv_charset.h"
28
29
/* Return a pointer to the last suffix in the conversion code string S, or
   NULL if S has no suffix.

   A conversion code is a triplet whose components are separated by '/',
   and suffixes live in the third component.  S is therefore considered to
   carry a suffix only when it contains at least two '/' characters; in
   that case the result points at the last '/' or ',' found in S, i.e. at
   the separator that introduces the final suffix.

   S is not modified.  The caller is expected to parse the suffix and
   remove it (e.g. by truncating S at the returned position) before
   calling this function again.  */
static char *
find_suffix (char *s)
{
  /* A size_t counter and a pointer walk are used instead of int indices
     so that arbitrarily long strings cannot overflow a signed integer.  */
  size_t slash_count = 0;
  char *last_separator = NULL;

  for (char *p = s; *p != '\0'; p++)
    {
      if (*p == '/')
        slash_count++;

      /* Both '/' and ',' start a new suffix.  */
      if (*p == '/' || *p == ',')
        last_separator = p;
    }

  /* Fewer than two slashes means there is no third component, hence no
     suffix at all.  */
  return slash_count >= 2 ? last_separator : NULL;
}
60
61
/* Working state used by gconv_parse_code while it parses one charset
   name.  */
struct gconv_parsed_code
{
  /* Writable, NUL-terminated copy of the charset name; gconv_parse_code
     truncates it in place as suffixes are removed.  */
  char *code;

  /* Set to true when a TRANSLIT suffix was found in CODE.  */
  bool translit;

  /* Set to true when an IGNORE suffix was found in CODE.  */
  bool ignore;
};
68
69
70 /* This function parses an iconv_open encoding PC.CODE, strips any suffixes
71 (such as TRANSLIT or IGNORE) from it and sets corresponding flags in it. */
72 static void
gconv_parse_code(struct gconv_parsed_code * pc)73 gconv_parse_code (struct gconv_parsed_code *pc)
74 {
75 pc->translit = false;
76 pc->ignore = false;
77
78 while (1)
79 {
80 /* First drop any trailing whitespaces and separators. */
81 size_t len = strlen (pc->code);
82 while ((len > 0)
83 && (isspace (pc->code[len - 1])
84 || pc->code[len - 1] == ','
85 || pc->code[len - 1] == '/'))
86 len--;
87
88 pc->code[len] = '\0';
89
90 if (len == 0)
91 return;
92
93 char * suffix = find_suffix (pc->code);
94 if (suffix == NULL)
95 {
96 /* At this point, we have processed and removed all suffixes from the
97 code and what remains of the code is suffix free. */
98 return;
99 }
100 else
101 {
102 /* A suffix is processed from the end of the code array going
103 backwards, one suffix at a time. The suffix is an index into the
104 code character array and points to: one past the end of the code
105 and any unprocessed suffixes, and to the beginning of the suffix
106 currently being processed during this iteration. We must process
107 this suffix and then drop it from the code by terminating the
108 preceding text with NULL.
109
110 We want to allow and recognize suffixes such as:
111
112 "/TRANSLIT" i.e. single suffix
113 "//TRANSLIT" i.e. single suffix and multiple separators
114 "//TRANSLIT/IGNORE" i.e. suffixes separated by "/"
115 "/TRANSLIT//IGNORE" i.e. suffixes separated by "//"
116 "//IGNORE,TRANSLIT" i.e. suffixes separated by ","
117 "//IGNORE," i.e. trailing ","
118 "//TRANSLIT/" i.e. trailing "/"
119 "//TRANSLIT//" i.e. trailing "//"
120 "/" i.e. empty suffix.
121
122 Unknown suffixes are silently discarded and ignored. */
123
124 if ((__strcasecmp_l (suffix,
125 GCONV_TRIPLE_SEPARATOR
126 GCONV_TRANSLIT_SUFFIX,
127 _nl_C_locobj_ptr) == 0)
128 || (__strcasecmp_l (suffix,
129 GCONV_SUFFIX_SEPARATOR
130 GCONV_TRANSLIT_SUFFIX,
131 _nl_C_locobj_ptr) == 0))
132 pc->translit = true;
133
134 if ((__strcasecmp_l (suffix,
135 GCONV_TRIPLE_SEPARATOR
136 GCONV_IGNORE_ERRORS_SUFFIX,
137 _nl_C_locobj_ptr) == 0)
138 || (__strcasecmp_l (suffix,
139 GCONV_SUFFIX_SEPARATOR
140 GCONV_IGNORE_ERRORS_SUFFIX,
141 _nl_C_locobj_ptr) == 0))
142 pc->ignore = true;
143
144 /* We just processed this suffix. We can now drop it from the
145 code string by truncating it at the suffix's position. */
146 suffix[0] = '\0';
147 }
148 }
149 }
150
151
152 /* This function accepts the charset names of the source and destination of the
153 conversion and populates *conv_spec with an equivalent conversion
154 specification that may later be used by __gconv_open. The charset names
155 might contain options in the form of suffixes that alter the conversion,
156 e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring
157 and truncating any suffix options in fromcode, and processing and truncating
158 any suffix options in tocode. Supported suffix options ("TRANSLIT" or
159 "IGNORE") when found in tocode lead to the corresponding flag in *conv_spec
160 to be set to true. Unrecognized suffix options are silently discarded. If
161 the function succeeds, it returns conv_spec back to the caller. It returns
162 NULL upon failure. conv_spec must be allocated and freed by the caller. */
163 struct gconv_spec *
__gconv_create_spec(struct gconv_spec * conv_spec,const char * fromcode,const char * tocode)164 __gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode,
165 const char *tocode)
166 {
167 struct gconv_parsed_code pfc, ptc;
168 struct gconv_spec *ret = NULL;
169
170 pfc.code = __strdup (fromcode);
171 ptc.code = __strdup (tocode);
172
173 if ((pfc.code == NULL)
174 || (ptc.code == NULL))
175 goto out;
176
177 gconv_parse_code (&pfc);
178 gconv_parse_code (&ptc);
179
180 /* We ignore suffixes in the fromcode because that is how the current
181 implementation has always handled them. Only suffixes in the tocode are
182 processed and handled. The reality is that invalid input in the input
183 character set should only be ignored if the fromcode specifies IGNORE.
184 The current implementation ignores invalid intput in the input character
185 set if the tocode contains IGNORE. We preserve this behavior for
186 backwards compatibility. In the future we may split the handling of
187 IGNORE to allow a finer grained specification of ignorning invalid input
188 and/or ignoring invalid output. */
189 conv_spec->translit = ptc.translit;
190 conv_spec->ignore = ptc.ignore;
191
192 /* 3 extra bytes because 1 extra for '\0', and 2 extra so strip might
193 be able to add one or two trailing '/' characters if necessary. */
194 conv_spec->fromcode = malloc (strlen (fromcode) + 3);
195 if (conv_spec->fromcode == NULL)
196 goto out;
197
198 conv_spec->tocode = malloc (strlen (tocode) + 3);
199 if (conv_spec->tocode == NULL)
200 {
201 free (conv_spec->fromcode);
202 conv_spec->fromcode = NULL;
203 goto out;
204 }
205
206 /* Strip unrecognized characters and ensure that the code has two '/'
207 characters as per conversion code triplet specification. */
208 strip (conv_spec->fromcode, pfc.code);
209 strip (conv_spec->tocode, ptc.code);
210 ret = conv_spec;
211
212 out:
213 free (pfc.code);
214 free (ptc.code);
215
216 return ret;
217 }
libc_hidden_def(__gconv_create_spec)218 libc_hidden_def (__gconv_create_spec)
219
220
221 void
222 __gconv_destroy_spec (struct gconv_spec *conv_spec)
223 {
224 free (conv_spec->fromcode);
225 free (conv_spec->tocode);
226 return;
227 }
228 libc_hidden_def (__gconv_destroy_spec)
229