1 /* Conversion module for ISO-2022-CN. 2 Copyright (C) 1999-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19 #include <dlfcn.h> 20 #include <gconv.h> 21 #include <stdint.h> 22 #include <string.h> 23 #include "gb2312.h" 24 #include "cns11643l1.h" 25 #include "cns11643l2.h" 26 27 #include <assert.h> 28 29 /* This makes obvious what everybody knows: 0x1b is the Esc character. */ 30 #define ESC 0x1b 31 32 /* We have the single-byte shift-in (SI) and shift-out (SO) codes, and the 33 two-byte single-shift sequence SS2 (ESC 0x4e), which invokes the SS2-designated 34 character set for the next two bytes only. */ 35 #define SI 0x0f 36 #define SO 0x0e 37 #define SS2_0 ESC 38 #define SS2_1 0x4e 39 40 /* Definitions used in the body of the `gconv' function. 
*/ 41 #define CHARSET_NAME "ISO-2022-CN//" 42 #define DEFINE_INIT 1 43 #define DEFINE_FINI 1 44 #define FROM_LOOP from_iso2022cn_loop 45 #define TO_LOOP to_iso2022cn_loop 46 #define ONE_DIRECTION 0 47 #define FROM_LOOP_MIN_NEEDED_FROM 1 48 #define FROM_LOOP_MAX_NEEDED_FROM 4 49 #define FROM_LOOP_MIN_NEEDED_TO 4 50 #define FROM_LOOP_MAX_NEEDED_TO 4 51 #define TO_LOOP_MIN_NEEDED_FROM 4 52 #define TO_LOOP_MAX_NEEDED_FROM 4 53 #define TO_LOOP_MIN_NEEDED_TO 1 54 #define TO_LOOP_MAX_NEEDED_TO 6 55 #define PREPARE_LOOP \ 56 int save_set; \ 57 int *setp = &data->__statep->__count; 58 #define EXTRA_LOOP_ARGS , setp 59 60 61 /* The COUNT element of the state keeps track of the currently selected 62 character set. The possible values are: */ 63 enum 64 { 65 ASCII_set = 0, 66 GB2312_set = 8, 67 CNS11643_1_set = 16, 68 CNS11643_2_set = 24, 69 CURRENT_SEL_MASK = 24, 70 GB2312_ann = 32, 71 CNS11643_1_ann = 64, 72 CNS11643_2_ann = 128, 73 CURRENT_ANN_MASK = 224 74 }; 75 76 77 /* Since this is a stateful encoding we have to provide code which resets 78 the output state to the initial state. This has to be done during the 79 flushing. */ 80 #define EMIT_SHIFT_TO_INIT \ 81 if (data->__statep->__count != ASCII_set) \ 82 { \ 83 if (FROM_DIRECTION) \ 84 /* It's easy, we don't have to emit anything, we just reset the \ 85 state for the input. */ \ 86 data->__statep->__count = ASCII_set; \ 87 else \ 88 { \ 89 /* We are not in the initial state. To switch back we have \ 90 to emit `SI'. */ \ 91 if (__glibc_unlikely (outbuf == outend)) \ 92 /* We don't have enough room in the output buffer. */ \ 93 status = __GCONV_FULL_OUTPUT; \ 94 else \ 95 { \ 96 /* Write out the shift sequence. */ \ 97 *outbuf++ = SI; \ 98 data->__statep->__count = ASCII_set; \ 99 } \ 100 } \ 101 } 102 103 104 /* Since we might have to reset input pointer we must be able to save 105 and retore the state. 
*/ 106 #define SAVE_RESET_STATE(Save) \ 107 if (Save) \ 108 save_set = *setp; \ 109 else \ 110 *setp = save_set 111 112 113 /* First define the conversion function from ISO-2022-CN to UCS4. */ 114 #define MIN_NEEDED_INPUT FROM_LOOP_MIN_NEEDED_FROM 115 #define MAX_NEEDED_INPUT FROM_LOOP_MAX_NEEDED_FROM 116 #define MIN_NEEDED_OUTPUT FROM_LOOP_MIN_NEEDED_TO 117 #define MAX_NEEDED_OUTPUT FROM_LOOP_MAX_NEEDED_TO 118 #define LOOPFCT FROM_LOOP 119 #define BODY \ 120 { \ 121 uint32_t ch = *inptr; \ 122 \ 123 /* This is a 7bit character set, disallow all 8bit characters. */ \ 124 if (__glibc_unlikely (ch >= 0x7f)) \ 125 STANDARD_FROM_LOOP_ERR_HANDLER (1); \ 126 \ 127 /* Recognize escape sequences. */ \ 128 if (__builtin_expect (ch, 0) == ESC) \ 129 { \ 130 /* There are two kinds of escape sequences we have to handle: \ 131 - those announcing the use of GB and CNS characters on the \ 132 line; we can simply ignore them \ 133 - the initial byte of the SS2 sequence. \ 134 */ \ 135 if (__builtin_expect (inptr + 2 > inend, 0) \ 136 || (inptr[1] == '$' \ 137 && (__builtin_expect (inptr + 3 > inend, 0) \ 138 || (inptr[2] == ')' \ 139 && __builtin_expect (inptr + 4 > inend, 0)) \ 140 || (inptr[2] == '*' \ 141 && __builtin_expect (inptr + 4 > inend, 0)))) \ 142 || (inptr[1] == SS2_1 \ 143 && __builtin_expect (inptr + 4 > inend, 0))) \ 144 { \ 145 result = __GCONV_INCOMPLETE_INPUT; \ 146 break; \ 147 } \ 148 if (inptr[1] == '$' \ 149 && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \ 150 || (inptr[2] == '*' && inptr[3] == 'H'))) \ 151 { \ 152 /* OK, we accept those character sets. */ \ 153 if (inptr[3] == 'A') \ 154 ann = GB2312_ann; \ 155 else if (inptr[3] == 'G') \ 156 ann = CNS11643_1_ann; \ 157 inptr += 4; \ 158 continue; \ 159 } \ 160 } \ 161 else if (__builtin_expect (ch, 0) == SO) \ 162 { \ 163 /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \ 164 S0 designation came last. 
The only problem is what to do with \ 165 faulty input files where no designator came. \ 166 XXX For now I'll default to use GB2312. If this is not the \ 167 best behaviour (e.g., we should flag an error) let me know. */ \ 168 ++inptr; \ 169 set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \ 170 continue; \ 171 } \ 172 else if (__builtin_expect (ch, 0) == SI) \ 173 { \ 174 /* Switch to use ASCII. */ \ 175 ++inptr; \ 176 set = ASCII_set; \ 177 continue; \ 178 } \ 179 \ 180 if (__builtin_expect (ch, 0) == ESC && inptr[1] == SS2_1) \ 181 { \ 182 /* This is a character from CNS 11643 plane 2. \ 183 XXX We could test here whether the use of this character \ 184 set was announced. */ \ 185 inptr += 2; \ 186 ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \ 187 if (__builtin_expect (ch, 0) == __UNKNOWN_10646_CHAR) \ 188 { \ 189 inptr -= 2; \ 190 STANDARD_FROM_LOOP_ERR_HANDLER (2); \ 191 } \ 192 } \ 193 else if (set == ASCII_set) \ 194 { \ 195 /* Almost done, just advance the input pointer. */ \ 196 ++inptr; \ 197 } \ 198 else \ 199 { \ 200 /* That's pretty easy, we have a dedicated functions for this. */ \ 201 if (set == GB2312_set) \ 202 ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0); \ 203 else \ 204 { \ 205 assert (set == CNS11643_1_set); \ 206 ch = cns11643l1_to_ucs4 (&inptr, inend - inptr, 0); \ 207 } \ 208 \ 209 if (__builtin_expect (ch, 1) == 0) \ 210 { \ 211 result = __GCONV_INCOMPLETE_INPUT; \ 212 break; \ 213 } \ 214 else if (__builtin_expect (ch, 1) == __UNKNOWN_10646_CHAR) \ 215 { \ 216 STANDARD_FROM_LOOP_ERR_HANDLER (1); \ 217 } \ 218 } \ 219 \ 220 put32 (outptr, ch); \ 221 outptr += 4; \ 222 } 223 #define LOOP_NEED_FLAGS 224 #define EXTRA_LOOP_DECLS , int *setp 225 #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \ 226 int ann = *setp & CURRENT_ANN_MASK 227 #define UPDATE_PARAMS *setp = set | ann 228 #include <iconv/loop.c> 229 230 231 /* Next, define the other direction. 
*/ 232 #define MIN_NEEDED_INPUT TO_LOOP_MIN_NEEDED_FROM 233 #define MAX_NEEDED_INPUT TO_LOOP_MAX_NEEDED_FROM 234 #define MIN_NEEDED_OUTPUT TO_LOOP_MIN_NEEDED_TO 235 #define MAX_NEEDED_OUTPUT TO_LOOP_MAX_NEEDED_TO 236 #define LOOPFCT TO_LOOP 237 #define BODY \ 238 { \ 239 uint32_t ch = get32 (inptr); \ 240 \ 241 /* First see whether we can write the character using the currently \ 242 selected character set. */ \ 243 if (ch < 0x80) \ 244 { \ 245 if (set != ASCII_set) \ 246 { \ 247 *outptr++ = SI; \ 248 set = ASCII_set; \ 249 if (__glibc_unlikely (outptr == outend)) \ 250 { \ 251 result = __GCONV_FULL_OUTPUT; \ 252 break; \ 253 } \ 254 } \ 255 \ 256 *outptr++ = ch; \ 257 \ 258 /* At the end of the line we have to clear the `ann' flags since \ 259 every line must contain this information again. */ \ 260 if (ch == L'\n') \ 261 ann = 0; \ 262 } \ 263 else \ 264 { \ 265 unsigned char buf[2]; \ 266 /* Fake initialization to keep gcc quiet. */ \ 267 asm ("" : "=m" (buf)); \ 268 \ 269 int used; \ 270 size_t written = 0; \ 271 \ 272 if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \ 273 { \ 274 written = ucs4_to_gb2312 (ch, buf, 2); \ 275 used = GB2312_set; \ 276 } \ 277 else \ 278 { \ 279 written = ucs4_to_cns11643l1 (ch, buf, 2); \ 280 used = CNS11643_1_set; \ 281 } \ 282 \ 283 if (written == __UNKNOWN_10646_CHAR) \ 284 { \ 285 /* Cannot convert it using the currently selected SO set. \ 286 Next try the SS2 set. */ \ 287 written = ucs4_to_cns11643l2 (ch, buf, 2); \ 288 if (written != __UNKNOWN_10646_CHAR) \ 289 /* Yep, that worked. */ \ 290 used = CNS11643_2_set; \ 291 else \ 292 { \ 293 /* Well, see whether we have to change the SO set. */ \ 294 if (used == GB2312_set) \ 295 written = ucs4_to_cns11643l1 (ch, buf, 2); \ 296 else \ 297 written = ucs4_to_gb2312 (ch, buf, 2); \ 298 \ 299 if (__builtin_expect (written, 0) != __UNKNOWN_10646_CHAR) \ 300 /* Oh well, then switch SO. 
*/ \ 301 used = GB2312_set + CNS11643_1_set - used; \ 302 else \ 303 { \ 304 UNICODE_TAG_HANDLER (ch, 4); \ 305 \ 306 /* Even this does not work. Error. */ \ 307 STANDARD_TO_LOOP_ERR_HANDLER (4); \ 308 } \ 309 } \ 310 } \ 311 assert (written == 2); \ 312 \ 313 /* See whether we have to emit an escape sequence. */ \ 314 if (set != used) \ 315 { \ 316 /* First see whether we announced that we use this \ 317 character set. */ \ 318 if ((ann & (16 << (used >> 3))) == 0) \ 319 { \ 320 const char *escseq; \ 321 \ 322 if (__glibc_unlikely (outptr + 4 > outend)) \ 323 { \ 324 result = __GCONV_FULL_OUTPUT; \ 325 break; \ 326 } \ 327 \ 328 assert ((used >> 3) >= 1 && (used >> 3) <= 3); \ 329 escseq = ")A)G*H" + ((used >> 3) - 1) * 2; \ 330 *outptr++ = ESC; \ 331 *outptr++ = '$'; \ 332 *outptr++ = *escseq++; \ 333 *outptr++ = *escseq++; \ 334 \ 335 if (used == GB2312_set) \ 336 ann = (ann & CNS11643_2_ann) | GB2312_ann; \ 337 else if (used == CNS11643_1_set) \ 338 ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \ 339 else \ 340 ann |= CNS11643_2_ann; \ 341 } \ 342 \ 343 if (used == CNS11643_2_set) \ 344 { \ 345 if (__glibc_unlikely (outptr + 2 > outend)) \ 346 { \ 347 result = __GCONV_FULL_OUTPUT; \ 348 break; \ 349 } \ 350 *outptr++ = SS2_0; \ 351 *outptr++ = SS2_1; \ 352 } \ 353 else \ 354 { \ 355 /* We only have to emit something is currently ASCII is \ 356 selected. Otherwise we are switching within the \ 357 SO charset. */ \ 358 if (set == ASCII_set) \ 359 { \ 360 if (__glibc_unlikely (outptr + 1 > outend)) \ 361 { \ 362 result = __GCONV_FULL_OUTPUT; \ 363 break; \ 364 } \ 365 *outptr++ = SO; \ 366 } \ 367 } \ 368 \ 369 /* Always test the length here since we have used up all the \ 370 guaranteed output buffer slots. 
*/ \ 371 if (__glibc_unlikely (outptr + 2 > outend)) \ 372 { \ 373 result = __GCONV_FULL_OUTPUT; \ 374 break; \ 375 } \ 376 } \ 377 else if (__glibc_unlikely (outptr + 2 > outend)) \ 378 { \ 379 result = __GCONV_FULL_OUTPUT; \ 380 break; \ 381 } \ 382 \ 383 *outptr++ = buf[0]; \ 384 *outptr++ = buf[1]; \ 385 set = used; \ 386 } \ 387 \ 388 /* Now that we wrote the output increment the input pointer. */ \ 389 inptr += 4; \ 390 } 391 #define LOOP_NEED_FLAGS 392 #define EXTRA_LOOP_DECLS , int *setp 393 #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \ 394 int ann = *setp & CURRENT_ANN_MASK 395 #define REINIT_PARAMS do \ 396 { \ 397 set = *setp & CURRENT_SEL_MASK; \ 398 ann = *setp & CURRENT_ANN_MASK; \ 399 } \ 400 while (0) 401 #define UPDATE_PARAMS *setp = set | ann 402 #include <iconv/loop.c> 403 404 405 /* Now define the toplevel functions. */ 406 #include <iconv/skeleton.c> 407