1 /* Conversion module for UTF-7.
2 Copyright (C) 2000-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 /* UTF-7 is a legacy encoding used for transmitting Unicode within the
20 ASCII character set, used primarily by mail agents. New programs
21 are encouraged to use UTF-8 instead.
22
23 UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642). The
24 original Base64 encoding is defined in RFC 2045. */
25
26 #include <dlfcn.h>
27 #include <gconv.h>
28 #include <stdint.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32
/* The UTF-7 flavours handled by this module.  The numeric values are used
   as indices into the `names' table below (gconv_init counts entries while
   it walks that table), so the two must stay in the same order.  */
enum variant
{
  /* Shift character '+', base64 digit 63 is '/'.  */
  UTF7,
  /* Shift character '&', base64 digit 63 is ','.  */
  UTF_7_IMAP
};
38
/* Charset names, one per variant.  Must be in the same order as enum
   variant above.  The table is a sequence of NUL-terminated strings; an
   empty string (the trailing "\0") marks the end of the list.  */
static const char names[] =
  "UTF-7//\0"
  "UTF-7-IMAP//\0"
  "\0";
44
45 static uint32_t
shift_character(enum variant const var)46 shift_character (enum variant const var)
47 {
48 if (var == UTF7)
49 return '+';
50 else if (var == UTF_7_IMAP)
51 return '&';
52 else
53 abort ();
54 }
55
/* Return true iff CH lies in the closed interval
   [LOWER_BOUND, UPPER_BOUND].  */
static bool
between (uint32_t const ch,
         uint32_t const lower_bound, uint32_t const upper_bound)
{
  if (ch < lower_bound)
    return false;

  return ch <= upper_bound;
}
62
63 /* The set of "direct characters":
64 A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
65 FOR UTF-7-IMAP
66 A-Z a-z 0-9 ' ( ) , - . / : ? space
67 ! " # $ % + * ; < = > @ [ \ ] ^ _ ` { | } ~
68 */
69
70 static bool
isdirect(uint32_t ch,enum variant var)71 isdirect (uint32_t ch, enum variant var)
72 {
73 if (var == UTF7)
74 return (between (ch, 'A', 'Z')
75 || between (ch, 'a', 'z')
76 || between (ch, '0', '9')
77 || ch == '\'' || ch == '(' || ch == ')'
78 || between (ch, ',', '/')
79 || ch == ':' || ch == '?'
80 || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
81 else if (var == UTF_7_IMAP)
82 return (ch != '&' && between (ch, ' ', '~'));
83 abort ();
84 }
85
86
87 /* The set of "direct and optional direct characters":
88 A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
89 (UTF-7 only)
90 ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
91 */
92
93 static bool
isxdirect(uint32_t ch,enum variant var)94 isxdirect (uint32_t ch, enum variant var)
95 {
96 if (isdirect (ch, var))
97 return true;
98 if (var != UTF7)
99 return false;
100 return between (ch, '!', '&')
101 || ch == '*'
102 || between (ch, ';', '@')
103 || (between (ch, '[', '`') && ch != '\\')
104 || between (ch, '{', '}');
105 }
106
107
/* Characters which need to trigger an explicit shift back to US-ASCII
   (UTF-7 only): the base64 alphabet plus '-' (the shift back character):
     A-Z a-z 0-9 + / -
*/

static bool
needs_explicit_shift (uint32_t ch)
{
  if (ch == '+' || ch == '-')
    return true;

  /* '/' immediately precedes '0' in ASCII, so one range covers both the
     slash and the digits.  */
  if (between (ch, '/', '9'))
    return true;

  return between (ch, 'A', 'Z') || between (ch, 'a', 'z');
}
120
121
122 /* Converts a value in the range 0..63 to a base64 encoded char. */
123 static unsigned char
base64(unsigned int i,enum variant var)124 base64 (unsigned int i, enum variant var)
125 {
126 if (i < 26)
127 return i + 'A';
128 else if (i < 52)
129 return i - 26 + 'a';
130 else if (i < 62)
131 return i - 52 + '0';
132 else if (i == 62)
133 return '+';
134 else if (i == 63 && var == UTF7)
135 return '/';
136 else if (i == 63 && var == UTF_7_IMAP)
137 return ',';
138 else
139 abort ();
140 }
141
142
/* Definitions used in the body of the `gconv' function.  These macros
   parameterize <iconv/skeleton.c>, which is included at the end of this
   file.  */
/* This file provides its own gconv_init and gconv_end below; presumably
   these two settings tell the skeleton not to generate defaults -- confirm
   against skeleton.c.  */
#define DEFINE_INIT		0
#define DEFINE_FINI		0
#define FROM_LOOP		from_utf7_loop
#define TO_LOOP			to_utf7_loop
/* "FROM" sizes are in bytes of UTF-7 text, "TO" sizes in bytes of UCS4
   (always one 4-byte unit).  */
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		6
#define MIN_NEEDED_TO		4
#define MAX_NEEDED_TO		4
#define ONE_DIRECTION		0
#define FROM_DIRECTION		(dir == from_utf7)
/* Declares the locals used by the loop bodies, SAVE_RESET_STATE and
   EMIT_SHIFT_TO_INIT: the state pointer plus the direction and variant
   recorded by gconv_init in the step's private data.  */
#define PREPARE_LOOP \
  mbstate_t saved_state;						      \
  mbstate_t *statep = data->__statep;					      \
  enum direction dir = ((struct utf7_data *) step->__data)->dir;	      \
  enum variant var = ((struct utf7_data *) step->__data)->var;
/* Extra arguments handed to the loop functions; must match
   EXTRA_LOOP_DECLS.  */
#define EXTRA_LOOP_ARGS		, statep, var
160
161
/* Conversion direction of a step, determined in gconv_init.  */
enum direction
{
  /* Neither the source nor the target charset is one of ours.  */
  illegal_dir,
  /* The source charset is a UTF-7 variant.  */
  from_utf7,
  /* The target charset is a UTF-7 variant.  */
  to_utf7
};
168
/* Per-step private data.  Allocated by gconv_init, stored in
   step->__data, and freed by gconv_end.  */
struct utf7_data
{
  /* Which way this step converts.  */
  enum direction dir;
  /* Which UTF-7 flavour this step handles.  */
  enum variant var;
};
174
/* Since we might have to reset input pointer we must be able to save
   and restore the state.  With a nonzero SAVE the current *statep is
   copied into saved_state; otherwise *statep is restored from saved_state.
   Both variables are declared by PREPARE_LOOP.  */
#define SAVE_RESET_STATE(Save) \
  if (Save)								      \
    saved_state = *statep;						      \
  else									      \
    *statep = saved_state
182
183 int
gconv_init(struct __gconv_step * step)184 gconv_init (struct __gconv_step *step)
185 {
186 /* Determine which direction. */
187 struct utf7_data *new_data;
188 enum direction dir = illegal_dir;
189
190 enum variant var = 0;
191 for (const char *name = names; *name != '\0';
192 name = __rawmemchr (name, '\0') + 1)
193 {
194 if (__strcasecmp (step->__from_name, name) == 0)
195 {
196 dir = from_utf7;
197 break;
198 }
199 else if (__strcasecmp (step->__to_name, name) == 0)
200 {
201 dir = to_utf7;
202 break;
203 }
204 ++var;
205 }
206
207 if (__glibc_likely (dir != illegal_dir))
208 {
209 new_data = malloc (sizeof (*new_data));
210 if (new_data == NULL)
211 return __GCONV_NOMEM;
212
213 new_data->dir = dir;
214 new_data->var = var;
215 step->__data = new_data;
216
217 if (dir == from_utf7)
218 {
219 step->__min_needed_from = MIN_NEEDED_FROM;
220 step->__max_needed_from = MAX_NEEDED_FROM;
221 step->__min_needed_to = MIN_NEEDED_TO;
222 step->__max_needed_to = MAX_NEEDED_TO;
223 }
224 else
225 {
226 step->__min_needed_from = MIN_NEEDED_TO;
227 step->__max_needed_from = MAX_NEEDED_TO;
228 step->__min_needed_to = MIN_NEEDED_FROM;
229 step->__max_needed_to = MAX_NEEDED_FROM;
230 }
231 }
232 else
233 return __GCONV_NOCONV;
234
235 step->__stateful = 1;
236
237 return __GCONV_OK;
238 }
239
240 void
gconv_end(struct __gconv_step * data)241 gconv_end (struct __gconv_step *data)
242 {
243 free (data->__data);
244 }
245
246
247
/* First define the conversion function from UTF-7 to UCS4.
   The state is structured as follows:
     __count bit 2..0: zero
     __count bit 8..3: shift
     __wch: data
   Precise meaning:
     shift      data
       0         --          not inside base64 encoding
     1..32  XX..XX00..00     inside base64, (32 - shift) bits pending
   This state layout is simpler than relying on STORE_REST/UNPACK_BYTES.

   When shift = 0, __wch needs to store at most one lookahead byte (see
   __GCONV_INCOMPLETE_INPUT below).

   Each pass through BODY looks at one input byte.  It consumes at most
   one byte (two for the escaped shift character, i.e. the shift character
   followed by '-') and writes at most one 4-byte value with put32.
*/
#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT	MAX_NEEDED_TO
#define LOOPFCT			FROM_LOOP
#define BODY \
  {									      \
    uint_fast8_t ch = *inptr;						      \
									      \
    if ((statep->__count >> 3) == 0)					      \
      {									      \
	/* base64 encoding inactive.  */				      \
	if (isxdirect (ch, var))					      \
	  {								      \
	    /* A character that stands for itself.  */			      \
	    inptr++;							      \
	    put32 (outptr, ch);						      \
	    outptr += 4;						      \
	  }								      \
	else if (__glibc_likely (ch == shift_character (var)))		      \
	  {								      \
	    /* The byte after the shift character decides what it means,    \
	       so we need one byte of lookahead.  */			      \
	    if (__glibc_unlikely (inptr + 2 > inend))			      \
	      {								      \
		/* Not enough input available.  */			      \
		result = __GCONV_INCOMPLETE_INPUT;			      \
		break;							      \
	      }								      \
	    if (inptr[1] == '-')					      \
	      {								      \
		/* Shift character followed by '-': a literal shift	      \
		   character.  */					      \
		inptr += 2;						      \
		put32 (outptr, ch);					      \
		outptr += 4;						      \
	      }								      \
	    else							      \
	      {								      \
		/* Switch into base64 mode.  shift = 32 means the	      \
		   accumulator is empty.  */				      \
		inptr++;						      \
		statep->__count = (32 << 3);				      \
		statep->__value.__wch = 0;				      \
	      }								      \
	  }								      \
	else								      \
	  {								      \
	    /* The input is invalid.  */				      \
	    STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
	  }								      \
      }									      \
    else								      \
      {									      \
	/* base64 encoding active.  */					      \
	uint32_t i;							      \
	int shift;							      \
									      \
	/* Map the byte to its 6-bit base64 value; digit 63 differs	      \
	   between the two variants.  */				      \
	if (ch >= 'A' && ch <= 'Z')					      \
	  i = ch - 'A';							      \
	else if (ch >= 'a' && ch <= 'z')				      \
	  i = ch - 'a' + 26;						      \
	else if (ch >= '0' && ch <= '9')				      \
	  i = ch - '0' + 52;						      \
	else if (ch == '+')						      \
	  i = 62;							      \
	else if ((var == UTF7 && ch == '/')				      \
		 || (var == UTF_7_IMAP && ch == ','))			      \
	  i = 63;							      \
	else								      \
	  {								      \
	    /* Terminate base64 encoding.  */				      \
									      \
	    /* If accumulated data is nonzero, the input is invalid.  */    \
	    /* Also, partial UTF-16 characters are invalid.  */		      \
	    /* (shift <= 26 means at least 6 bits are still pending.)  */   \
	    /* In IMAP variant, must be terminated by '-'.  */		      \
	    if (__glibc_unlikely (statep->__value.__wch != 0)		      \
		|| __glibc_unlikely ((statep->__count >> 3) <= 26)	      \
		|| __glibc_unlikely (var == UTF_7_IMAP && ch != '-'))	      \
	      {								      \
		STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));    \
	      }								      \
									      \
	    /* A terminating '-' is swallowed.  Any other byte is left in   \
	       the input and handled by the next iteration, now outside	      \
	       base64 mode.  */					      \
	    if (ch == '-')						      \
	      inptr++;							      \
									      \
	    statep->__count = 0;					      \
	    continue;							      \
	  }								      \
									      \
	/* Concatenate the base64 integer i to the accumulator.  */	      \
	shift = (statep->__count >> 3);					      \
	if (shift > 6)							      \
	  {								      \
	    /* All six bits of i fit into the accumulator.  */		      \
	    uint32_t wch;						      \
									      \
	    shift -= 6;							      \
	    wch = statep->__value.__wch | (i << shift);			      \
									      \
	    if (shift <= 16 && shift > 10)				      \
	      {								      \
		/* A UTF-16 character has just been completed.  */	      \
		uint32_t wc1 = wch >> 16;				      \
									      \
		/* UTF-16: When we see a High Surrogate, we must also decode \
		   the following Low Surrogate.  */			      \
		if (!(wc1 >= 0xd800 && wc1 < 0xdc00))			      \
		  {							      \
		    /* Not a High Surrogate: emit it and move the remaining  \
		       pending bits to the top of the accumulator.  */	      \
		    wch = wch << 16;					      \
		    shift += 16;					      \
		    put32 (outptr, wc1);				      \
		    outptr += 4;					      \
		  }							      \
	      }								      \
	    else if (shift <= 10 && shift > 4)				      \
	      {								      \
		/* After a High Surrogate, verify that the next 16 bit	      \
		   indeed form a Low Surrogate.  */			      \
		uint32_t wc2 = wch & 0xffff;				      \
									      \
		if (! __glibc_likely (wc2 >= 0xdc00 && wc2 < 0xe000))	      \
		  {							      \
		    STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
		  }							      \
	      }								      \
									      \
	    statep->__value.__wch = wch;				      \
	  }								      \
	else								      \
	  {								      \
	    /* A UTF-16 surrogate pair has just been completed.  */	      \
	    uint32_t wc1 = (uint32_t) statep->__value.__wch >> 16;	      \
	    uint32_t wc2 = ((uint32_t) statep->__value.__wch & 0xffff)	      \
			   | (i >> (6 - shift));			      \
									      \
	    /* The low (6 - shift) bits of i start the next unit; they go   \
	       to the top of the fresh accumulator.  */			      \
	    statep->__value.__wch = (i << shift) << 26;			      \
	    shift += 26;						      \
									      \
	    assert (wc1 >= 0xd800 && wc1 < 0xdc00);			      \
	    assert (wc2 >= 0xdc00 && wc2 < 0xe000);			      \
	    put32 (outptr,						      \
		   0x10000 + ((wc1 - 0xd800) << 10) + (wc2 - 0xdc00));	      \
	    outptr += 4;						      \
	  }								      \
									      \
	statep->__count = shift << 3;					      \
									      \
	/* Now that we digested the input increment the input pointer.  */   \
	inptr++;							      \
      }									      \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS	, mbstate_t *statep, enum variant var
#include <iconv/loop.c>
410
411
/* Next, define the conversion from UCS4 to UTF-7.
   The state is structured as follows:
     __count bit 2..0: zero
     __count bit 4..3: shift
     __count bit 8..5: data
   Precise meaning:
     shift      data
       0         0           not inside base64 encoding
       1         0           inside base64, no pending bits
       2       XX00          inside base64, 2 bits known for next byte
       3       XXXX          inside base64, 4 bits known for next byte

   __count bit 2..0 and __wch are always zero, because this direction
   never returns __GCONV_INCOMPLETE_INPUT.

   Throughout BODY, (__count >> 3) & ~3 yields the pending bits already
   placed in the top positions of a 6-bit base64 digit, and
   (__count >> 3) & 3 yields the shift value.  Each pass through BODY
   converts exactly one 4-byte input character.
*/
#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
#define MAX_NEEDED_INPUT	MAX_NEEDED_TO
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
#define LOOPFCT			TO_LOOP
#define BODY \
  {									      \
    uint32_t ch = get32 (inptr);					      \
									      \
    if ((statep->__count & 0x18) == 0)					      \
      {									      \
	/* base64 encoding inactive */					      \
	if (isdirect (ch, var))						      \
	  {								      \
	    *outptr++ = (unsigned char) ch;				      \
	  }								      \
	else								      \
	  {								      \
	    /* Number of output bytes needed: shift character plus '-',     \
	       or shift character plus the complete base64 digits of one    \
	       or two UTF-16 units.  */					      \
	    size_t count;						      \
									      \
	    if (ch == shift_character (var))				      \
	      count = 2;						      \
	    else if (ch < 0x10000)					      \
	      count = 3;						      \
	    else if (ch < 0x110000)					      \
	      count = 6;						      \
	    else							      \
	      STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
									      \
	    if (__glibc_unlikely (outptr + count > outend))		      \
	      {								      \
		result = __GCONV_FULL_OUTPUT;				      \
		break;							      \
	      }								      \
									      \
	    *outptr++ = shift_character (var);				      \
	    if (ch == shift_character (var))				      \
	      *outptr++ = '-';						      \
	    else if (ch < 0x10000)					      \
	      {								      \
		/* 16 bits: two full digits, 4 bits left pending.  */	      \
		*outptr++ = base64 (ch >> 10, var);			      \
		*outptr++ = base64 ((ch >> 4) & 0x3f, var);		      \
		statep->__count = ((ch & 15) << 5) | (3 << 3);		      \
	      }								      \
	    else if (ch < 0x110000)					      \
	      {								      \
		/* Split into a UTF-16 surrogate pair.  32 bits: five	      \
		   full digits, 2 bits left pending.  */		      \
		uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10);		      \
		uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff);	      \
									      \
		ch = (ch1 << 16) | ch2;					      \
		*outptr++ = base64 (ch >> 26, var);			      \
		*outptr++ = base64 ((ch >> 20) & 0x3f, var);		      \
		*outptr++ = base64 ((ch >> 14) & 0x3f, var);		      \
		*outptr++ = base64 ((ch >> 8) & 0x3f, var);		      \
		*outptr++ = base64 ((ch >> 2) & 0x3f, var);		      \
		statep->__count = ((ch & 3) << 7) | (2 << 3);		      \
	      }								      \
	    else							      \
	      abort ();							      \
	  }								      \
      }									      \
    else								      \
      {									      \
	/* base64 encoding active */					      \
	if ((var == UTF_7_IMAP && ch == '&') || isdirect (ch, var))	      \
	  {								      \
	    /* deactivate base64 encoding */				      \
	    size_t count;						      \
									      \
	    /* One digit to flush pending bits (if any), an optional '-'    \
	       to leave base64 mode, an extra '-' after a literal '&' in    \
	       the IMAP variant, and the character itself.  */		      \
	    count = ((statep->__count & 0x18) >= 0x10)			      \
	      + (var == UTF_7_IMAP || needs_explicit_shift (ch))	      \
	      + (var == UTF_7_IMAP && ch == '&')			      \
	      + 1;							      \
	    if (__glibc_unlikely (outptr + count > outend))		      \
	      {								      \
		result = __GCONV_FULL_OUTPUT;				      \
		break;							      \
	      }								      \
									      \
	    if ((statep->__count & 0x18) >= 0x10)			      \
	      *outptr++ = base64 ((statep->__count >> 3) & ~3, var);	      \
	    if (var == UTF_7_IMAP || needs_explicit_shift (ch))		      \
	      *outptr++ = '-';						      \
	    *outptr++ = (unsigned char) ch;				      \
	    if (var == UTF_7_IMAP && ch == '&')				      \
	      *outptr++ = '-';						      \
	    statep->__count = 0;					      \
	  }								      \
	else								      \
	  {								      \
	    /* Stay in base64 mode.  The number of digits that can be	      \
	       completed depends on how many bits are already pending.  */  \
	    size_t count;						      \
									      \
	    if (ch < 0x10000)						      \
	      count = ((statep->__count & 0x18) >= 0x10 ? 3 : 2);	      \
	    else if (ch < 0x110000)					      \
	      count = ((statep->__count & 0x18) >= 0x18 ? 6 : 5);	      \
	    else							      \
	      STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
									      \
	    if (__glibc_unlikely (outptr + count > outend))		      \
	      {								      \
		result = __GCONV_FULL_OUTPUT;				      \
		break;							      \
	      }								      \
									      \
	    if (ch < 0x10000)						      \
	      {								      \
		switch ((statep->__count >> 3) & 3)			      \
		  {							      \
		  case 1:						      \
		    /* 0 + 16 bits: two digits, 4 bits pending.  */	      \
		    *outptr++ = base64 (ch >> 10, var);			      \
		    *outptr++ = base64 ((ch >> 4) & 0x3f, var);		      \
		    statep->__count = ((ch & 15) << 5) | (3 << 3);	      \
		    break;						      \
		  case 2:						      \
		    /* 2 + 16 bits: three digits, nothing pending.  */	      \
		    *outptr++ =						      \
		      base64 (((statep->__count >> 3) & ~3) | (ch >> 12),     \
			      var);					      \
		    *outptr++ = base64 ((ch >> 6) & 0x3f, var);		      \
		    *outptr++ = base64 (ch & 0x3f, var);		      \
		    statep->__count = (1 << 3);				      \
		    break;						      \
		  case 3:						      \
		    /* 4 + 16 bits: three digits, 2 bits pending.  */	      \
		    *outptr++ =						      \
		      base64 (((statep->__count >> 3) & ~3) | (ch >> 14),     \
			      var);					      \
		    *outptr++ = base64 ((ch >> 8) & 0x3f, var);		      \
		    *outptr++ = base64 ((ch >> 2) & 0x3f, var);		      \
		    statep->__count = ((ch & 3) << 7) | (2 << 3);	      \
		    break;						      \
		  default:						      \
		    abort ();						      \
		  }							      \
	      }								      \
	    else if (ch < 0x110000)					      \
	      {								      \
		/* Split into a UTF-16 surrogate pair and encode the	      \
		   resulting 32 bits.  */				      \
		uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10);		      \
		uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff);	      \
									      \
		ch = (ch1 << 16) | ch2;					      \
		switch ((statep->__count >> 3) & 3)			      \
		  {							      \
		  case 1:						      \
		    /* 0 + 32 bits: five digits, 2 bits pending.  */	      \
		    *outptr++ = base64 (ch >> 26, var);			      \
		    *outptr++ = base64 ((ch >> 20) & 0x3f, var);	      \
		    *outptr++ = base64 ((ch >> 14) & 0x3f, var);	      \
		    *outptr++ = base64 ((ch >> 8) & 0x3f, var);		      \
		    *outptr++ = base64 ((ch >> 2) & 0x3f, var);		      \
		    statep->__count = ((ch & 3) << 7) | (2 << 3);	      \
		    break;						      \
		  case 2:						      \
		    /* 2 + 32 bits: five digits, 4 bits pending.  */	      \
		    *outptr++ =						      \
		      base64 (((statep->__count >> 3) & ~3) | (ch >> 28),     \
			      var);					      \
		    *outptr++ = base64 ((ch >> 22) & 0x3f, var);	      \
		    *outptr++ = base64 ((ch >> 16) & 0x3f, var);	      \
		    *outptr++ = base64 ((ch >> 10) & 0x3f, var);	      \
		    *outptr++ = base64 ((ch >> 4) & 0x3f, var);		      \
		    statep->__count = ((ch & 15) << 5) | (3 << 3);	      \
		    break;						      \
		  case 3:						      \
		    /* 4 + 32 bits: six digits, nothing pending.  */	      \
		    *outptr++ =						      \
		      base64 (((statep->__count >> 3) & ~3) | (ch >> 30),     \
			      var);					      \
		    *outptr++ = base64 ((ch >> 24) & 0x3f, var);	      \
		    *outptr++ = base64 ((ch >> 18) & 0x3f, var);	      \
		    *outptr++ = base64 ((ch >> 12) & 0x3f, var);	      \
		    *outptr++ = base64 ((ch >> 6) & 0x3f, var);		      \
		    *outptr++ = base64 (ch & 0x3f, var);		      \
		    statep->__count = (1 << 3);				      \
		    break;						      \
		  default:						      \
		    abort ();						      \
		  }							      \
	      }								      \
	    else							      \
	      abort ();							      \
	  }								      \
      }									      \
									      \
    /* Now that we wrote the output increment the input pointer.  */	      \
    inptr += 4;								      \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS	, mbstate_t *statep, enum variant var
#include <iconv/loop.c>
613
614
/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   The macro reads `data', `outbuf', `outend', `status' and `var' from the
   scope in which it is expanded.  If the output buffer is too small for
   the terminating sequence, status is set to __GCONV_FULL_OUTPUT and the
   state is left untouched.  */
#define EMIT_SHIFT_TO_INIT \
  if (FROM_DIRECTION)							      \
    /* Nothing to emit.  Just clear the decoder state.  */		      \
    memset (data->__statep, '\0', sizeof (mbstate_t));			      \
  else									      \
    {									      \
      /* The "to UTF-7" direction.  Flush the remaining bits and terminate  \
	 with a '-' byte.  This will guarantee correct decoding if more	      \
	 UTF-7 encoded text is added afterwards.  */			      \
      int state = data->__statep->__count;				      \
									      \
      if (state & 0x18)							      \
	{								      \
	  /* Deactivate base64 encoding.  One extra byte is needed when     \
	     bits are still pending (shift >= 2).  */			      \
	  size_t count = ((state & 0x18) >= 0x10) + 1;			      \
									      \
	  if (__glibc_unlikely (outbuf + count > outend))		      \
	    /* We don't have enough room in the output buffer.  */	      \
	    status = __GCONV_FULL_OUTPUT;				      \
	  else								      \
	    {								      \
	      /* Write out the shift sequence.  */			      \
	      if ((state & 0x18) >= 0x10)				      \
		*outbuf++ = base64 ((state >> 3) & ~3, var);		      \
	      *outbuf++ = '-';						      \
									      \
	      data->__statep->__count = 0;				      \
	    }								      \
	}								      \
      else								      \
	data->__statep->__count = 0;					      \
    }
650
651
652 /* Now define the toplevel functions. */
653 #include <iconv/skeleton.c>
654