1 /* Conversion module for UTF-32.
2    Copyright (C) 1999-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #include <byteswap.h>
20 #include <dlfcn.h>
21 #include <gconv.h>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <string.h>
26 
/* The Byte Order Mark character (BOM), U+FEFF, as a 32-bit value read
   in the byte order the conversion is currently assuming.  */
#define BOM	0x0000feffu
/* The same mark as it appears when its four bytes are read in the
   other byte order.  PREPARE_LOOP treats this value at the start of
   generic UTF-32 input as the signal to byte-swap every unit.  */
#define BOM_OE	0xfffe0000u
31 
32 
/* Definitions used in the body of the `gconv' function.

   These macros parameterize <iconv/skeleton.c>, which is included at
   the end of this file.  Both sides of the conversion use fixed
   4-byte units.  DEFINE_INIT and DEFINE_FINI are 0; presumably this
   tells the skeleton not to generate its own init/fini functions,
   since this file supplies gconv_init and gconv_end itself (confirm
   against skeleton.c).  */
#define FROM_LOOP		from_utf32_loop
#define TO_LOOP			to_utf32_loop
#define DEFINE_INIT		0
#define DEFINE_FINI		0
#define MIN_NEEDED_FROM		4
#define MIN_NEEDED_TO		4
#define ONE_DIRECTION		0
#define FROM_DIRECTION		(dir == from_utf32)

/* Per-call setup placed in front of the conversion loops.  It refers
   to `step', `data', `inptr', `inend', `inptrp', `outbuf' and
   `outend', none of which are declared in this file; they are
   presumably locals of the function generated by skeleton.c.

   It loads the direction and variant stored by gconv_init, then does
   one of the following:

   - Decoding generic UTF-32 on the first invocation
     (__invocation_counter == 0): at least 4 input bytes are required,
     otherwise it returns __GCONV_EMPTY_INPUT (no input at all) or
     __GCONV_INCOMPLETE_INPUT.  A leading BOM is skipped.  A leading
     byte-swapped BOM is skipped as well and sets __GCONV_SWAP in
     data->__flags.  Any other first unit is left in place.

   - Encoding generic UTF-32 on the first invocation when not in
     internal use: a BOM is written to the output, or
     __GCONV_FULL_OUTPUT is returned if fewer than 4 bytes are free.

   - Otherwise, on the first invocation, __GCONV_SWAP is set when the
     variant is UTF_32LE on a big-endian host or UTF_32BE on a
     little-endian host.

   Finally `swap' is set from the __GCONV_SWAP bit and, together with
   `var', handed to the loops through EXTRA_LOOP_ARGS.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf32_data *) step->__data)->dir;	      \
  enum variant var = ((struct utf32_data *) step->__data)->var;		      \
  int swap;								      \
  if (FROM_DIRECTION && var == UTF_32)					      \
    {									      \
      if (__glibc_unlikely (data->__invocation_counter == 0))		      \
	{								      \
	  /* We have to find out which byte order the file is encoded in.  */ \
	  if (inptr + 4 > inend)					      \
	    return (inptr == inend					      \
		    ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);	      \
									      \
	  if (get32u (inptr) == BOM)					      \
	    /* Simply ignore the BOM character.  */			      \
	    *inptrp = inptr += 4;					      \
	  else if (get32u (inptr) == BOM_OE)				      \
	    {								      \
	      data->__flags |= __GCONV_SWAP;				      \
	      *inptrp = inptr += 4;					      \
	    }								      \
	}								      \
    }									      \
  else if (!FROM_DIRECTION && var == UTF_32 && !data->__internal_use	      \
	   && data->__invocation_counter == 0)				      \
    {									      \
      /* Emit the Byte Order Mark.  */					      \
      if (__glibc_unlikely (outbuf + 4 > outend))			      \
	return __GCONV_FULL_OUTPUT;					      \
									      \
      put32u (outbuf, BOM);						      \
      outbuf += 4;							      \
    }									      \
  else if (__builtin_expect (data->__invocation_counter == 0, 0)	      \
	   && ((var == UTF_32LE && BYTE_ORDER == BIG_ENDIAN)		      \
	       || (var == UTF_32BE && BYTE_ORDER == LITTLE_ENDIAN)))	      \
    data->__flags |= __GCONV_SWAP;					      \
  swap = data->__flags & __GCONV_SWAP;
#define EXTRA_LOOP_ARGS		, var, swap
81 
82 
/* Direction of the transformation, as selected by gconv_init from the
   step's charset names.  */
enum direction
{
  illegal_dir,	/* No recognized UTF-32 name on either side.  */
  to_utf32,	/* The step's target charset is a UTF-32 variant.  */
  from_utf32	/* The step's source charset is a UTF-32 variant.  */
};
90 
/* Which UTF-32 charset name the conversion step was set up for.  */
enum variant
{
  illegal_var,	/* Not one of the names below.  */
  UTF_32,	/* "UTF-32": byte order taken from / announced by a BOM.  */
  UTF_32LE,	/* "UTF-32LE": fixed little-endian byte order.  */
  UTF_32BE	/* "UTF-32BE": fixed big-endian byte order.  */
};
98 
99 struct utf32_data
100 {
101   enum direction dir;
102   enum variant var;
103 };
104 
105 
106 extern int gconv_init (struct __gconv_step *step);
107 int
gconv_init(struct __gconv_step * step)108 gconv_init (struct __gconv_step *step)
109 {
110   /* Determine which direction.  */
111   struct utf32_data *new_data;
112   enum direction dir = illegal_dir;
113   enum variant var = illegal_var;
114   int result;
115 
116   if (__strcasecmp (step->__from_name, "UTF-32//") == 0)
117     {
118       dir = from_utf32;
119       var = UTF_32;
120     }
121   else if (__strcasecmp (step->__to_name, "UTF-32//") == 0)
122     {
123       dir = to_utf32;
124       var = UTF_32;
125     }
126   else if (__strcasecmp (step->__from_name, "UTF-32BE//") == 0)
127     {
128       dir = from_utf32;
129       var = UTF_32BE;
130     }
131   else if (__strcasecmp (step->__to_name, "UTF-32BE//") == 0)
132     {
133       dir = to_utf32;
134       var = UTF_32BE;
135     }
136   else if (__strcasecmp (step->__from_name, "UTF-32LE//") == 0)
137     {
138       dir = from_utf32;
139       var = UTF_32LE;
140     }
141   else if (__strcasecmp (step->__to_name, "UTF-32LE//") == 0)
142     {
143       dir = to_utf32;
144       var = UTF_32LE;
145     }
146 
147   result = __GCONV_NOCONV;
148   if (__builtin_expect (dir, to_utf32) != illegal_dir)
149     {
150       new_data = (struct utf32_data *) malloc (sizeof (struct utf32_data));
151 
152       result = __GCONV_NOMEM;
153       if (new_data != NULL)
154 	{
155 	  new_data->dir = dir;
156 	  new_data->var = var;
157 	  step->__data = new_data;
158 
159 	  if (dir == from_utf32)
160 	    {
161 	      step->__min_needed_from = MIN_NEEDED_FROM;
162 	      step->__max_needed_from = MIN_NEEDED_FROM;
163 	      step->__min_needed_to = MIN_NEEDED_TO;
164 	      step->__max_needed_to = MIN_NEEDED_TO;
165 	    }
166 	  else
167 	    {
168 	      step->__min_needed_from = MIN_NEEDED_TO;
169 	      step->__max_needed_from = MIN_NEEDED_TO;
170 	      step->__min_needed_to = MIN_NEEDED_FROM;
171 	      step->__max_needed_to = MIN_NEEDED_FROM;
172 	    }
173 
174 	  step->__stateful = 0;
175 
176 	  result = __GCONV_OK;
177 	}
178     }
179 
180   return result;
181 }
182 
183 
184 extern void gconv_end (struct __gconv_step *data);
185 void
gconv_end(struct __gconv_step * data)186 gconv_end (struct __gconv_step *data)
187 {
188   free (data->__data);
189 }
190 
191 
/* Convert from the internal (UCS4-like) format to UTF-32.

   These macros parameterize the <iconv/loop.c> include that follows,
   which produces the TO_LOOP function.  BODY handles one 4-byte unit.
   It refers to `inptr', `outptr', `result' and `irreversible', which
   are not declared in this file and are presumably supplied by loop.c.

   - A value of 0x110000 or above goes through
     STANDARD_TO_LOOP_ERR_HANDLER (4).
   - A surrogate (0xd800..0xdfff) sets RESULT to __GCONV_ILLEGAL_INPUT
     and leaves the loop, unless ignore_errors_p () is true, in which
     case the unit is skipped and *IRREVERSIBLE is incremented.
   - Anything else is byte-swapped when SWAP is nonzero and written to
     the output; both pointers then advance by 4.

   VAR is passed in through EXTRA_LOOP_DECLS but is not used by BODY.  */
#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
#define LOOPFCT			TO_LOOP
#define BODY \
  {									      \
    uint32_t c = get32 (inptr);						      \
									      \
    if (__glibc_unlikely (c >= 0x110000))				      \
      {									      \
	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
      }									      \
    else if (__glibc_unlikely (c >= 0xd800 && c < 0xe000))		      \
      {									      \
	/* Surrogate characters in UCS-4 input are not valid.		      \
	   We must catch this.  If we let surrogates pass through,	      \
	   attackers could make a security hole exploit by		      \
	   generating "irregular UTF-32" sequences.  */			      \
	result = __GCONV_ILLEGAL_INPUT;					      \
	if (! ignore_errors_p ())					      \
	  break;							      \
	inptr += 4;							      \
	++*irreversible;						      \
	continue;							      \
      }									      \
									      \
    if (swap)								      \
      c = bswap_32 (c);							      \
    put32 (outptr, c);							      \
									      \
    outptr += 4;							      \
    inptr += 4;								      \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS \
	, enum variant var, int swap
228 #include <iconv/loop.c>
229 
230 
/* Convert from UTF-32 to the internal (UCS4-like) format.

   These macros parameterize the <iconv/loop.c> include that follows,
   which produces the FROM_LOOP function.  BODY handles one 4-byte
   unit.  It refers to `inptr' and `outptr', which are not declared in
   this file and are presumably supplied by loop.c.

   The unit is read, byte-swapped when SWAP is nonzero, and then
   validated.  A value of 0x110000 or above, or a surrogate
   (0xd800..0xdfff), goes through STANDARD_FROM_LOOP_ERR_HANDLER (4).
   A valid value is written to the output and both pointers advance
   by 4.

   VAR is passed in through EXTRA_LOOP_DECLS but is not used by BODY.  */
#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
#define LOOPFCT			FROM_LOOP
#define BODY \
  {									      \
    uint32_t u1 = get32 (inptr);					      \
									      \
    if (swap)								      \
      u1 = bswap_32 (u1);						      \
									      \
    if (__glibc_unlikely (u1 >= 0x110000 || (u1 >= 0xd800 && u1 < 0xe000)))   \
      {									      \
	/* This is illegal.  */						      \
	STANDARD_FROM_LOOP_ERR_HANDLER (4);				      \
      }									      \
									      \
    put32 (outptr, u1);							      \
    inptr += 4;								      \
    outptr += 4;							      \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS \
	, enum variant var, int swap
255 #include <iconv/loop.c>
256 
257 
258 /* Now define the toplevel functions.  */
259 #include <iconv/skeleton.c>
260