1 /* Mapping tables for JOHAB handling.
2 Copyright (C) 1998-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <dlfcn.h>
20 #include <stdint.h>
21 #include <ksc5601.h>
22
23 /* Tables mapping Johab bit patterns to Hangul Jamo.
24 Five bits each are used to encode the
25 leading consonants (19 + 1 filler), medial vowels (21 + 1 filler)
26 and trailing consonants (27 + 1 filler).
27
28 KS C 5601-1992 Annex 3 Table 2
29 0 : Filler, -1: invalid, >= 1 : valid
30
31 */
/* Leading-consonant field of a Johab Hangul code, i.e. the 5-bit value
   (idx & 0x7c00) >> 10, mapped to a jamo ordinal.
   -1: invalid bit pattern, 0: filler, 1..19: leading consonant.
   All 32 slots are written out explicitly so that no slot silently
   defaults to 0 (which would read as "filler" instead of "invalid").  */
static const int init[32] =
{
  -1,  0,  1,  2,  3,  4,  5,  6,
   7,  8,  9, 10, 11, 12, 13, 14,
  15, 16, 17, 18, 19, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1
};
/* Medial-vowel field of a Johab Hangul code, i.e. the 5-bit value
   (idx & 0x03e0) >> 5, mapped to a jamo ordinal.
   -1: invalid bit pattern, 0: filler, 1..21: medial vowel.  */
static const int mid[32] =
{
  [0x00] = -1, [0x01] = -1, [0x02] =  0, [0x03] =  1,
  [0x04] =  2, [0x05] =  3, [0x06] =  4, [0x07] =  5,
  [0x08] = -1, [0x09] = -1, [0x0a] =  6, [0x0b] =  7,
  [0x0c] =  8, [0x0d] =  9, [0x0e] = 10, [0x0f] = 11,
  [0x10] = -1, [0x11] = -1, [0x12] = 12, [0x13] = 13,
  [0x14] = 14, [0x15] = 15, [0x16] = 16, [0x17] = 17,
  [0x18] = -1, [0x19] = -1, [0x1a] = 18, [0x1b] = 19,
  [0x1c] = 20, [0x1d] = 21, [0x1e] = -1, [0x1f] = -1
};
/* Trailing-consonant field of a Johab Hangul code, i.e. the 5-bit value
   idx & 0x001f, mapped to a jamo ordinal.
   -1: invalid bit pattern, 0: filler, 1..27: trailing consonant.
   Note the hole at bit pattern 0x12.  */
static const int final[32] =
{
  -1,  0,  1,  2,  3,  4,  5,  6,
   7,  8,  9, 10, 11, 12, 13, 14,
  15, 16, -1, 17, 18, 19, 20, 21,
  22, 23, 24, 25, 26, 27, -1, -1
};
49
50 /*
51 Hangul Jamo in Johab to Unicode 2.0 : Unicode 2.0
52 defines 51 Hangul Compatibility Jamos in the block [0x3131,0x3163]
53
54 It's to be considered later which Jamo block to use, Compatibility
55 block [0x3131,0x3163] or Hangul Conjoining Jamo block, [0x1100,0x11ff]
56
57 */
/* UCS4 code of a lone leading consonant, indexed by (leading-consonant
   ordinal - 1); see the use init_to_ucs[i - 1] in the JOHAB -> UCS4 loop.  */
static const uint32_t init_to_ucs[19] =
{
  /*  0.. 3 */ 0x3131, 0x3132, 0x3134, 0x3137,
  /*  4.. 7 */ 0x3138, 0x3139, 0x3141, 0x3142,
  /*  8..11 */ 0x3143, 0x3145, 0x3146, 0x3147,
  /* 12..15 */ 0x3148, 0x3149, 0x314a, 0x314b,
  /* 16..18 */ 0x314c, 0x314d, 0x314e
};
64
/* UCS4 code of a lone trailing consonant, indexed by (trailing-consonant
   ordinal - 1).  Only the listed slots carry a code point; every other
   slot is 0, which the JOHAB -> UCS4 loop treats as an illegal character.  */
static const uint32_t final_to_ucs[31] =
{
  [2]  = 0x3133,
  [4]  = 0x3135, [5]  = 0x3136,
  [8]  = 0x313a, [9]  = 0x313b, [10] = 0x313c, [11] = 0x313d,
  [12] = 0x313e, [13] = 0x313f, [14] = 0x3140,
  [17] = 0x3144
};
72
73 /* The following three arrays are used to convert
74 precomposed Hangul syllables in [0xac00,0xd7a3]
75 to Jamo bit patterns for Johab encoding
76
77 cf. : KS C 5601-1992, Annex3 Table 2
78
79 Arrays are used to speed up things although it's possible
80 to get the same result arithmetically.
81
82 */
/* Johab bit pattern of each leading consonant, indexed by its position
   within a precomposed syllable (syllable offset / 588).  Consecutive
   entries differ by 0x0400.  */
static const int init_to_bit[19] =
{
  0x8800, 0x8c00, 0x9000, 0x9400, 0x9800,
  0x9c00, 0xa000, 0xa400, 0xa800, 0xac00,
  0xb000, 0xb400, 0xb800, 0xbc00, 0xc000,
  0xc400, 0xc800, 0xcc00, 0xd000
};
90
/* Johab bit pattern of each medial vowel, indexed by its position within
   a precomposed syllable ((syllable offset / 28) % 21).  Each entry is the
   5-bit medial field value shifted into bits 9..5.  */
static const int mid_to_bit[21] =
{
   3 << 5,  4 << 5,  5 << 5,  6 << 5,  7 << 5,
  10 << 5, 11 << 5, 12 << 5, 13 << 5, 14 << 5, 15 << 5,
  18 << 5, 19 << 5, 20 << 5, 21 << 5, 22 << 5, 23 << 5,
  26 << 5, 27 << 5, 28 << 5, 29 << 5
};
98
/* Johab bit pattern of each trailing consonant, indexed by its position
   within a precomposed syllable (syllable offset % 28); entry 0 is the
   "no trailing consonant" case.  The value 0x12 is skipped.  */
static const int final_to_bit[28] =
{
  0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  0x0f, 0x10, 0x11, 0x13, 0x14, 0x15, 0x16,
  0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d
};
104
105 /* The conversion table from
106 UCS4 Hangul Compatibility Jamo in [0x3131,0x3163]
107 to Johab
108
109 cf. 1. KS C 5601-1992 Annex 3 Table 2
110 2. Unicode 2.0 manual
111
112 */
/* Johab two-byte code for each UCS4 value in [0x3131,0x3163], indexed by
   (ucs - 0x3131).  Rows are grouped eight code points at a time.  */
static const uint16_t jamo_from_ucs_table[51] =
{
  /* U+3131.. */ 0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441, 0x9841,
  /* U+3139.. */ 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450,
  /* U+3141.. */ 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441, 0xb841,
  /* U+3149.. */ 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461, 0x8481,
  /* U+3151.. */ 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1,
  /* U+3159.. */ 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1, 0x8741,
  /* U+3161.. */ 0x8761, 0x8781, 0x87a1
};
130
131
132 static uint32_t
johab_sym_hanja_to_ucs(uint32_t idx,uint32_t c1,uint32_t c2)133 johab_sym_hanja_to_ucs (uint32_t idx, uint32_t c1, uint32_t c2)
134 {
135 if (idx <= 0xdefe)
136 return (uint32_t) __ksc5601_sym_to_ucs[(c1 - 0xd9) * 188 + c2
137 - (c2 > 0x90 ? 0x43 : 0x31)];
138 else
139 return (uint32_t) __ksc5601_hanja_to_ucs[(c1 - 0xe0) * 188 + c2
140 - (c2 > 0x90 ? 0x43 : 0x31)];
141 }
142 /* Definitions used in the body of the `gconv' function. */
/* Character set name of this converter.  */
143 #define CHARSET_NAME "JOHAB//"
/* Names of the two conversion loops.  Each one is selected further down
   through LOOPFCT before the BODY for that direction is defined.  */
144 #define FROM_LOOP from_johab
145 #define TO_LOOP to_johab
/* NOTE(review): DEFINE_INIT, DEFINE_FINI and ONE_DIRECTION are not
   referenced anywhere else in this file; presumably they are consumed by
   <iconv/skeleton.c> -- confirm there.  */
146 #define DEFINE_INIT 1
147 #define DEFINE_FINI 1
/* A JOHAB character occupies one or two bytes.  */
148 #define MIN_NEEDED_FROM 1
149 #define MAX_NEEDED_FROM 2
/* The other side is processed in units of four bytes (the loop bodies
   below read it with get32 and write it with put32).  */
150 #define MIN_NEEDED_TO 4
151 #define ONE_DIRECTION 0
152
153
154 /* First define the conversion function from JOHAB to UCS4. */
155 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
156 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
157 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
158 #define LOOPFCT FROM_LOOP
/* BODY converts one JOHAB character at INPTR into one UCS4 value that is
   stored at OUTPTR with put32:
   - bytes <= 0x7f are passed through, except 0x5c which becomes U+20A9;
   - lead bytes 0x84..0xd3 are Hangul: the two bytes are split into three
     5-bit fields that are decoded through init[], mid[] and final[];
   - lead bytes 0xd9..0xde and 0xe0..0xf9 are symbols and Hanja, looked up
     by johab_sym_hanja_to_ucs;
   - every other lead byte is rejected.
   INPTR advances by 1 or 2 and OUTPTR by 4 per converted character.
   inptr, inend, outptr and result are not declared in this file;
   presumably they are supplied by <iconv/loop.c>, which is included
   after the macro definitions -- confirm there.  */
159 #define BODY \
160 { \
161 uint32_t ch = *inptr; \
162 \
163 if (ch <= 0x7f) \
164 { \
165 /* Plain ISO646-KR. */ \
166 if (ch == 0x5c) \
167 ch = 0x20a9; /* half-width Korean Currency WON sign */ \
168 ++inptr; \
169 } \
170 /* Johab : 1. Hangul \
171 1st byte : 0x84-0xd3 \
172 2nd byte : 0x41-0x7e, 0x81-0xfe \
173 2. Hanja & Symbol : \
174 1st byte : 0xd8-0xde, 0xe0-0xf9 \
175 2nd byte : 0x31-0x7e, 0x91-0xfe \
176 0xd831-0xd87e and 0xd891-0xd8fe are user-defined area */ \
177 else \
178 { \
/* Reject lead bytes above 0xf9, 0xdf, 0x7f..0x83 and 0xd4..0xd8.  \
   The user-defined row 0xd8 is therefore rejected as well.  */ \
179 if (__builtin_expect (ch > 0xf9, 0) \
180 || __builtin_expect (ch == 0xdf, 0) \
181 || (__builtin_expect (ch > 0x7e, 0) && ch < 0x84) \
182 || (__builtin_expect (ch > 0xd3, 0) && ch < 0xd9)) \
183 { \
184 /* These are illegal. */ \
185 STANDARD_FROM_LOOP_ERR_HANDLER (1); \
186 } \
187 else \
188 { \
189 /* Two-byte character. First test whether the next \
190 character is also available. */ \
191 uint32_t ch2; \
192 uint32_t idx; \
193 \
194 if (__glibc_unlikely (inptr + 1 >= inend)) \
195 { \
196 /* The second character is not available. Store the \
197 intermediate result. */ \
198 result = __GCONV_INCOMPLETE_INPUT; \
199 break; \
200 } \
201 \
202 ch2 = inptr[1]; \
/* idx is the 16-bit code: lead byte in the high 8 bits.  */ \
203 idx = ch * 256 + ch2; \
204 if (__glibc_likely (ch <= 0xd3)) \
205 { \
206 /* Hangul */ \
207 int i, m, f; \
208 \
/* Decode the three 5-bit fields (bits 14..10, 9..5, 4..0).  \
   Each table yields -1 for an invalid pattern, 0 for the \
   filler and a positive jamo ordinal otherwise.  */ \
209 i = init[(idx & 0x7c00) >> 10]; \
210 m = mid[(idx & 0x03e0) >> 5]; \
211 f = final[idx & 0x001f]; \
212 \
213 if (__builtin_expect (i == -1, 0) \
214 || __builtin_expect (m == -1, 0) \
215 || __builtin_expect (f == -1, 0)) \
216 { \
217 /* This is illegal. */ \
218 STANDARD_FROM_LOOP_ERR_HANDLER (1); \
219 } \
/* Leading consonant and vowel present: precomposed syllable, \
   21 vowels x 28 finals per leading consonant, from U+AC00.  */ \
220 else if (i > 0 && m > 0) \
221 ch = ((i - 1) * 21 + (m - 1)) * 28 + f + 0xac00; \
/* Leading consonant only.  */ \
222 else if (i > 0 && m == 0 && f == 0) \
223 ch = init_to_ucs[i - 1]; \
/* Vowel only.  */ \
224 else if (i == 0 && m > 0 && f == 0) \
225 ch = 0x314e + m; /* 0x314f + m - 1 */ \
/* Trailing consonant only; final_to_ucs holds 0 for entries \
   without a mapping, which is caught by the ch == 0 test below.  */ \
226 else if (__builtin_expect ((i | m) == 0, 1) \
227 && __builtin_expect (f > 0, 1)) \
228 ch = final_to_ucs[f - 1]; /* round trip?? */ \
229 else \
230 { \
231 /* This is illegal. */ \
232 STANDARD_FROM_LOOP_ERR_HANDLER (1); \
233 } \
234 } \
235 else \
236 { \
/* Symbol or Hanja: validate the trail byte (0x31..0x7e or \
   0x91..0xfe) and exclude the sub-ranges of lead bytes 0xd9, \
   0xda and 0xde listed in the condition.  */ \
237 if (__builtin_expect (ch2 < 0x31, 0) \
238 || (__builtin_expect (ch2 > 0x7e, 0) && ch2 < 0x91) \
239 || __builtin_expect (ch2, 0) == 0xff \
240 || (__builtin_expect (ch, 0) == 0xd9 && ch2 > 0xe8) \
241 || (__builtin_expect (ch, 0) == 0xda \
242 && ch2 > 0xa0 && ch2 < 0xd4) \
243 || (__builtin_expect (ch, 0) == 0xde && ch2 > 0xf1)) \
244 { \
245 /* This is illegal. */ \
246 STANDARD_FROM_LOOP_ERR_HANDLER (1); \
247 } \
248 else \
249 { \
250 ch = johab_sym_hanja_to_ucs (idx, ch, ch2); \
/* NOTE(review): the disabled code below uses a row stride of \
   192 whereas johab_sym_hanja_to_ucs uses 188; it does not \
   describe the lookup that is actually performed.  */ \
251 /* if (idx <= 0xdefe) \
252 ch = __ksc5601_sym_to_ucs[(ch - 0xd9) * 192 \
253 + ch2 - (ch2 > 0x90 \
254 ? 0x43 : 0x31)]; \
255 else \
256 ch = __ksc5601_hanja_to_ucs[(ch - 0xe0) *192 \
257 + ch2 - (ch2 > 0x90 \
258 ?0x43 : 0x31)];\
259 */ \
260 } \
261 } \
262 } \
263 \
/* A result of 0 means no mapping was found for the pair.  */ \
264 if (__glibc_unlikely (ch == 0)) \
265 { \
266 /* This is an illegal character. */ \
267 STANDARD_FROM_LOOP_ERR_HANDLER (2); \
268 } \
269 \
270 inptr += 2; \
271 } \
272 \
273 put32 (outptr, ch); \
274 outptr += 4; \
275 }
276 #define LOOP_NEED_FLAGS
/* Single-byte conversion: bytes above 0x7f cannot be converted on their
   own and yield WEOF; 0x5c maps to U+20A9 and every other byte maps to
   itself.  */
#define ONEBYTE_BODY \
  {									      \
    if (c > 0x7f)							      \
      return WEOF;							      \
    return (c == 0x5c ? 0x20a9 : c);					      \
  }
284 #include <iconv/loop.c>
285
286
287 /* Next, define the other direction. */
288 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
289 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
290 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
291 #define LOOPFCT TO_LOOP
/* BODY converts one UCS4 value read with get32 from INPTR into one or two
   JOHAB bytes at OUTPTR:
   - values <= 0x7f other than 0x5c are stored as a single byte;
   - U+AC00..U+D7A3 are composed from init_to_bit[], mid_to_bit[] and
     final_to_bit[];
   - U+3131..U+3163 are looked up in jamo_from_ucs_table[];
   - U+4E00..U+9FA5 and U+F900..U+FA0B go through ucs4_to_ksc5601_hanja
     and are then re-packed into JOHAB rows;
   - U+20A9 is stored as the single byte 0x5c;
   - everything else, including U+005C, goes through ucs4_to_ksc5601_sym.
   INPTR advances by 4 per converted character.
   inptr, outptr, outend and result are not declared in this file;
   presumably they are supplied by <iconv/loop.c> -- confirm there.  */
292 #define BODY \
293 { \
294 uint32_t ch = get32 (inptr); \
295 /* \
296 if (ch >= (sizeof (from_ucs4_lat1) / sizeof (from_ucs4_lat1[0]))) \
297 { \
298 if (ch >= 0x0391 && ch <= 0x0451) \
299 cp = from_ucs4_greek[ch - 0x391]; \
300 else if (ch >= 0x2010 && ch <= 0x9fa0) \
301 cp = from_ucs4_cjk[ch - 0x02010]; \
302 else \
303 break; \
304 } \
305 else \
306 cp = from_ucs4_lat1[ch]; \
307 */ \
308 \
309 if (ch <= 0x7f && ch != 0x5c) \
310 *outptr++ = ch; \
311 else \
312 { \
/* Precomposed Hangul syllable.  */ \
313 if (ch >= 0xac00 && ch <= 0xd7a3) \
314 { \
315 if (__glibc_unlikely (outptr + 2 > outend)) \
316 { \
317 result = __GCONV_FULL_OUTPUT; \
318 break; \
319 } \
320 \
321 ch -= 0xac00; \
322 \
/* Split the syllable offset into leading consonant, vowel and \
   final, and add up their bit patterns.  */ \
323 ch = (init_to_bit[ch / 588] /* 21 * 28 = 588 */ \
324 + mid_to_bit[(ch / 28) % 21]/* (ch % (21 * 28)) / 28 */ \
325 + final_to_bit[ch % 28]); /* (ch % (21 * 28)) % 28 */ \
326 \
/* Store the 16-bit code with the high byte first.  */ \
327 *outptr++ = ch / 256; \
328 *outptr++ = ch % 256; \
329 } \
330 /* KS C 5601-1992 Annex 3 regards 0xA4DA(Hangul Filler : U3164) \
331 as symbol */ \
332 else if (ch >= 0x3131 && ch <= 0x3163) \
333 { \
334 ch = jamo_from_ucs_table[ch - 0x3131]; \
335 \
336 if (__glibc_unlikely (outptr + 2 > outend)) \
337 { \
338 result = __GCONV_FULL_OUTPUT; \
339 break; \
340 } \
341 \
342 *outptr++ = ch / 256; \
343 *outptr++ = ch % 256; \
344 } \
345 else if ((ch >= 0x4e00 && ch <= 0x9fa5) \
346 || (ch >= 0xf900 && ch <= 0xfa0b)) \
347 { \
348 size_t written; \
349 uint32_t temp; \
350 \
/* The helper writes its two-byte result directly to outptr.  \
   A return value of 0 is reported as a full output buffer.  */ \
351 written = ucs4_to_ksc5601_hanja (ch, outptr, outend - outptr); \
352 if (__builtin_expect (written, 1) == 0) \
353 { \
354 result = __GCONV_FULL_OUTPUT; \
355 break; \
356 } \
357 if (__glibc_unlikely (written == __UNKNOWN_10646_CHAR)) \
358 { \
359 STANDARD_TO_LOOP_ERR_HANDLER (4); \
360 } \
361 \
/* Rebase both bytes (0x4a and 0x21), form a linear index over \
   rows of 94, then split it into JOHAB rows of 188 starting at \
   lead byte 0xe0.  Trail offsets 0..77 map to 0x31.., the \
   remaining ones to 0x91...  */ \
362 outptr[0] -= 0x4a; \
363 outptr[1] -= 0x21; \
364 \
365 temp = outptr[0] * 94 + outptr[1]; \
366 \
367 outptr[0] = 0xe0 + temp / 188; \
368 outptr[1] = temp % 188; \
369 outptr[1] += outptr[1] >= 78 ? 0x43 : 0x31; \
370 \
371 outptr += 2; \
372 } \
373 else if (ch == 0x20a9) \
374 *outptr++ = 0x5c; \
375 else \
376 { \
377 size_t written; \
378 uint32_t temp; \
379 \
380 written = ucs4_to_ksc5601_sym (ch, outptr, outend - outptr); \
381 if (__builtin_expect (written, 1) == 0) \
382 { \
383 result = __GCONV_FULL_OUTPUT; \
384 break; \
385 } \
/* Also reject first byte 0x22 with second byte above 0x68.  With \
   the arithmetic below that pair would become lead byte 0xd9 \
   with a trail byte above 0xe8, which the JOHAB -> UCS4 \
   direction rejects.  */ \
386 if (__builtin_expect (written == __UNKNOWN_10646_CHAR, 0) \
387 || (outptr[0] == 0x22 && outptr[1] > 0x68)) \
388 { \
389 UNICODE_TAG_HANDLER (ch, 4); \
390 STANDARD_TO_LOOP_ERR_HANDLER (4); \
391 } \
392 \
/* Two source rows share one JOHAB lead byte (temp / 2); the \
   parity of temp selects the lower or upper half of the trail \
   byte range.  */ \
393 temp = (outptr[0] < 0x4a ? outptr[0] + 0x191 : outptr[0] + 0x176);\
394 outptr[1] += (temp % 2 ? 0x5e : 0); \
395 outptr[1] += (outptr[1] < 0x6f ? 0x10 : 0x22); \
396 outptr[0] = temp / 2; \
397 \
398 outptr += 2; \
399 } \
400 } \
401 \
402 inptr += 4; \
403 }
404 #define LOOP_NEED_FLAGS
405 #include <iconv/loop.c>
406
407
408 /* Now define the toplevel functions. */
409 #include <iconv/skeleton.c>
410