/* Conversion to and from IBM939.
   Copyright (C) 2000-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* IBM939 is designed for the representation of Japanese Latin/Kanji
   using a stateful EBCDIC encoding scheme.  It is also known as
   CCSID 939 or CP939.  See:
   https://www-01.ibm.com/software/globalization/ccsid/ccsid939.html  */

#include <dlfcn.h>
#include <stdint.h>
#include <wchar.h>
#include <byteswap.h>
#include "ibm939.h"

/* The shift sequences for this charset (it does not use ESC).  */
#define SI 0x0F  /* Shift In, host code to turn DBCS off.  */
#define SO 0x0E  /* Shift Out, host code to turn DBCS on.  */

/* Definitions used in the body of the `gconv' function.

   The byte counts mirror what the two BODY macros below do: the FROM
   direction reads one or two input bytes and stores one 4-byte value
   with put32; the TO direction reads 4 bytes with get32 and writes
   between one byte (a single-byte code) and three bytes (SO followed
   by a two-byte code).  */
#define CHARSET_NAME              "IBM939//"
#define FROM_LOOP                 from_ibm939
#define TO_LOOP                   to_ibm939
#define ONE_DIRECTION             0
#define FROM_LOOP_MIN_NEEDED_FROM 1
#define FROM_LOOP_MAX_NEEDED_FROM 2
#define FROM_LOOP_MIN_NEEDED_TO   4
#define FROM_LOOP_MAX_NEEDED_TO   4
#define TO_LOOP_MIN_NEEDED_FROM   4
#define TO_LOOP_MAX_NEEDED_FROM   4
#define TO_LOOP_MIN_NEEDED_TO     1
#define TO_LOOP_MAX_NEEDED_TO     3
/* PREPARE_LOOP declares two locals: curcsp, which points at the __count
   member of the conversion state (the BODY macros keep the current
   codeset there), and save_curcs, the scratch copy that
   SAVE_RESET_STATE reads and writes.  */
#define PREPARE_LOOP \
  int save_curcs; \
  int *curcsp = &data->__statep->__count;
/* Extra argument handed to the loop functions; it matches the
   `int *curcsp' parameter added through EXTRA_LOOP_DECLS below.  */
#define EXTRA_LOOP_ARGS           , curcsp

/* Definitions of initialization and destructor function.  */
#define DEFINE_INIT 1
#define DEFINE_FINI 1


/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   The current codeset lives in the bits of __count above the low three;
   those low three bits are left untouched by every statement here.
   NOTE(review): what the low three bits hold is not visible in this
   file -- confirm against the code that expands this macro.  */
#define EMIT_SHIFT_TO_INIT \
  if ((data->__statep->__count & ~7) != sb) \
    { \
      if (FROM_DIRECTION) \
        /* Nothing is written in this direction; only the codeset \
           bits are cleared.  */ \
        data->__statep->__count &= 7; \
      else \
        { \
          /* We are not in the initial state.  To switch back we have \
             to emit `SI'.  */ \
          if (__glibc_unlikely (outbuf >= outend)) \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
          else \
            { \
              /* Write out the shift sequence.  */ \
              *outbuf++ = SI; \
              data->__statep->__count &= 7; \
            } \
        } \
    }


/* Since we might have to reset input pointer we must be able to save
   and restore the state.  A nonzero Save copies the state word into
   save_curcs; a zero Save copies save_curcs back into the state word.  */
#define SAVE_RESET_STATE(Save) \
  if (Save) \
    save_curcs = *curcsp; \
  else \
    *curcsp = save_curcs


/* Current codeset type.  The values are compared against the state word
   with its low three bits masked off (`& ~7'), so neither may use those
   bits.  `sb' is the value EMIT_SHIFT_TO_INIT treats as the initial
   state.  */
enum
{
  sb = 0,
  db = 64
};

/* First, define the conversion function from IBM-939 to UCS4.

   Each pass through BODY looks at the byte at inptr.  SO and SI only
   switch the local `curcs' and consume one byte.  Any other byte is
   decoded through the single-byte table when curcs is sb, or combined
   with the following byte and decoded through the double-byte tables
   otherwise.  A decoded character is stored as 4 bytes at outptr.  */
#define MIN_NEEDED_INPUT        FROM_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT        FROM_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       FROM_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT       FROM_LOOP_MAX_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
#define BODY \
  { \
    uint32_t ch = *inptr; \
    uint32_t res; \
 \
    if (__builtin_expect (ch, 0) == SO) \
      { \
        /* Shift OUT, change to DBCS converter (redundant escape okay).  */ \
        curcs = db; \
        ++inptr; \
        continue; \
      } \
    else if (__builtin_expect (ch, 0) == SI) \
      { \
        /* Shift IN, change to SBCS converter (redundant escape okay).  */ \
        curcs = sb; \
        ++inptr; \
        continue; \
      } \
 \
    if (curcs == sb) \
      { \
        /* Use the IBM939 table for single byte.  */ \
        res = __ibm939sb_to_ucs4[ch]; \
        /* A zero table entry means "no mapping", except for the input \
           byte 0 itself.  */ \
        if (__builtin_expect (res == L'\0', 0) && ch != '\0') \
          { \
            /* This is an illegal character.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (1); \
          } \
        else \
          { \
            put32 (outptr, res); \
            outptr += 4; \
          } \
        ++inptr; \
      } \
    else \
      { \
        /* Use the IBM939 table for double byte.  */ \
        const struct gap *rp2 = __ibm939db_to_ucs4_idx; \
 \
        /* sb and db are the only values ever assigned to curcs here.  */ \
        assert (curcs == db); \
 \
        if (__glibc_unlikely (inptr + 1 >= inend)) \
          { \
            /* The second character is not available.  Store the \
               intermediate result.  */ \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
 \
        /* Combine both bytes into one code, first byte in the high \
           position.  */ \
        ch = (ch * 0x100) + inptr[1]; \
        /* Skip index entries that end below ch.  \
           NOTE(review): the scan has no explicit bound; it presumably \
           relies on a final entry in ibm939.h (see the 0xffff start \
           test below) -- confirm.  */ \
        while (ch > rp2->end) \
          ++rp2; \
 \
        /* Reject the code when the scan stopped on an entry whose \
           start is 0xffff, when ch lies below the entry's start, or \
           when the table yields 0 for a nonzero ch.  */ \
        if (__builtin_expect (rp2->start == 0xffff, 0) \
            || __builtin_expect (ch < rp2->start, 0) \
            || (res = __ibm939db_to_ucs4[ch + rp2->idx], \
                __builtin_expect (res, L'\1') == L'\0' && ch != '\0')) \
          { \
            /* This is an illegal character.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (2); \
          } \
        else \
          { \
            put32 (outptr, res); \
            outptr += 4; \
          } \
        inptr += 2; \
      } \
  }
/* The loop function receives curcsp as an extra parameter.  INIT_PARAMS
   declares the loop-local `curcs' from the state word with the low
   three bits masked off; UPDATE_PARAMS stores `curcs' back.
   NOTE(review): the value stored back no longer carries the low three
   bits that INIT_PARAMS masked off -- confirm this is intended.  */
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS        , int *curcsp
#define INIT_PARAMS             int curcs = *curcsp & ~7
#define UPDATE_PARAMS           *curcsp = curcs
#include <iconv/loop.c>

/* Next, define the other direction.

   Each pass through BODY reads one 4-byte value with get32 and tries
   the single-byte table first, then the double-byte table.  Before a
   code is written, SO or SI is emitted if `curcs' does not match the
   kind of code about to be written.  The input pointer only advances
   after the output bytes were stored.  */
#define MIN_NEEDED_INPUT        TO_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT        TO_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       TO_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT       TO_LOOP_MAX_NEEDED_TO
#define LOOPFCT                 TO_LOOP
#define BODY \
  { \
    uint32_t ch = get32 (inptr); \
    const struct gap *rp1 = __ucs4_to_ibm939sb_idx; \
    const struct gap *rp2 = __ucs4_to_ibm939db_idx; \
 \
    /* Values of 0xffff and above are never looked up in the tables.  \
       UNICODE_TAG_HANDLER is defined outside this file; if control \
       comes back from it, the value is reported through the shared \
       error label further down.  */ \
    if (__glibc_unlikely (ch >= 0xffff)) \
      { \
        UNICODE_TAG_HANDLER (ch, 4); \
        goto ibm939_invalid_char; \
      } \
 \
    /* Skip single-byte index entries that end below ch.  */ \
    while (ch > rp1->end) \
      ++rp1; \
 \
    /* Use the UCS4 table for single byte.  */ \
    unsigned char sbconv; \
    /* Fall through to the double-byte lookup when ch lies below the \
       entry's start or the single-byte table yields 0 for a nonzero \
       ch.  */ \
    if (__builtin_expect (ch < rp1->start, 0) \
        || (sbconv = __ucs4_to_ibm939sb[ch + rp1->idx], \
            __builtin_expect (sbconv, L'\1') == L'\0' && ch != '\0')) \
      { \
        /* Use the UCS4 table for double byte.  */ \
        while (ch > rp2->end) \
          ++rp2; \
 \
        const char *cp; \
        if (__builtin_expect (ch < rp2->start, 0) \
            || (cp = __ucs4_to_ibm939db[ch + rp2->idx], \
                __builtin_expect (cp[0], L'\1')==L'\0' && ch != '\0')) \
          { \
            /* This is an illegal character.  The label is also the \
               target of the goto in the range check at the top of \
               BODY.  */ \
          ibm939_invalid_char: \
            STANDARD_TO_LOOP_ERR_HANDLER (4); \
          } \
        else \
          { \
            if (curcs == sb) \
              { \
                /* Switch the output to double-byte mode first; this \
                   needs one byte of room.  */ \
                if (__glibc_unlikely (outptr + 1 > outend)) \
                  { \
                    result = __GCONV_FULL_OUTPUT; \
                    break; \
                  } \
                *outptr++ = SO; \
                curcs = db; \
              } \
 \
            /* Room for the two bytes of the code itself.  */ \
            if (__glibc_unlikely (outptr + 2 > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            *outptr++ = cp[0]; \
            *outptr++ = cp[1]; \
          } \
      } \
    else \
      { \
        if (curcs == db) \
          { \
            /* Switch the output back to single-byte mode first; this \
               needs one byte of room.  */ \
            if (__glibc_unlikely (outptr + 1 > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            *outptr++ = SI; \
            curcs = sb; \
          } \
 \
        if (__glibc_unlikely (outptr + 1 > outend)) \
          { \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        /* 0x7e and 0x5c are written as fixed bytes instead of the \
           value found in the table.  */ \
        if (ch == 0x7e) \
          *outptr++ = 0xa0; \
        else if (ch == 0x5c) \
          *outptr++ = 0xb2; \
        else \
          *outptr++ = sbconv; \
      } \
 \
    /* Now that we wrote the output increment the input pointer.  */ \
    inptr += 4; \
  }
/* Same extra parameter and state handling as for the other direction;
   REINIT_PARAMS, defined only for this direction, reloads `curcs' from
   the state word without redeclaring it.  */
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS        , int *curcsp
#define INIT_PARAMS             int curcs = *curcsp & ~7
#define REINIT_PARAMS           curcs = *curcsp & ~7
#define UPDATE_PARAMS           *curcsp = curcs
#include <iconv/loop.c>

/* Now define the toplevel functions.  */
#include <iconv/skeleton.c>