1 /* Mapping tables for JOHAB handling.
2    Copyright (C) 1998-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #include <dlfcn.h>
20 #include <stdint.h>
21 #include <ksc5601.h>
22 
/* Tables mapping Johab bit patterns to Hangul Jamo.
   Five bits each are used to encode
   leading consonants (19 + 1 filler), medial vowels (21 + 1 filler)
   and trailing consonants (27 + 1 filler).

   KS C 5601-1992 Annex 3 Table 2
   0: filler, -1: invalid, >= 1: valid

 */
/* Leading-consonant lookup, indexed by the 5-bit field taken from bits
   14..10 of the two-byte Johab code.  0 is the filler, 1..19 are the
   consonants and -1 marks an invalid bit pattern.  All 32 slots are
   written out so that no slot is implicitly zero-initialised, which
   would make it read as "filler" instead of "invalid".  */
static const int init[32] =
{
  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
  19, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/* Medial-vowel lookup, indexed by the 5-bit field taken from bits 9..5
   of the two-byte Johab code.  0 is the filler, 1..21 are the vowels
   and -1 marks an invalid bit pattern.  */
static const int mid[32] =
{
  /* 0x00 */ -1, -1,  0,  1,  2,  3,  4,  5,
  /* 0x08 */ -1, -1,  6,  7,  8,  9, 10, 11,
  /* 0x10 */ -1, -1, 12, 13, 14, 15, 16, 17,
  /* 0x18 */ -1, -1, 18, 19, 20, 21, -1, -1
};
/* Trailing-consonant lookup, indexed by the low 5 bits of the two-byte
   Johab code.  0 is the filler, 1..27 are the consonants and -1 marks
   an invalid bit pattern.  */
static const int final[32] =
{
  /* 0x00 */ -1,  0,  1,  2,  3,  4,  5,  6,
  /* 0x08 */  7,  8,  9, 10, 11, 12, 13, 14,
  /* 0x10 */ 15, 16, -1, 17, 18, 19, 20, 21,
  /* 0x18 */ 22, 23, 24, 25, 26, 27, -1, -1
};
49 
/*
   Hangul Jamo in Johab to Unicode 2.0: Unicode 2.0
   defines 51 Hangul Compatibility Jamos in the block [0x3131,0x3163]

   It's to be considered later which Jamo block to use, Compatibility
   block [0x3131,0x3163] or Hangul Conjoining Jamo block, [0x1100,0x11ff]

 */
/* Hangul Compatibility Jamo code point for each of the 19 leading
   consonants, indexed by the consonant number minus one.  */
static const uint32_t init_to_ucs[19] =
{
  0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139, 0x3141, 0x3142,
  0x3143, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a, 0x314b,
  0x314c, 0x314d, 0x314e
};
64 
/* Hangul Compatibility Jamo code point for a lone trailing consonant,
   indexed by the trailing-consonant number minus one.  An entry of 0
   means no code point is produced for that consonant; the JOHAB
   decoder treats a resulting 0 as an illegal character.  */
static const uint32_t final_to_ucs[31] =
{
  0, 0, 0x3133, 0, 0x3135, 0x3136, 0, 0,
  0x313a, 0x313b, 0x313c, 0x313d, 0x313e, 0x313f,
  0x3140, 0, 0, 0x3144, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0
};
72 
/* The following three arrays are used to convert
   precomposed Hangul syllables in [0xac00,0xd7a3]
   to Jamo bit patterns for Johab encoding.

   cf.: KS C 5601-1992, Annex 3 Table 2

   Arrays are used to speed things up although it's possible
   to get the same result arithmetically.

 */
/* Johab bit pattern contributed by each of the 19 leading consonants,
   indexed by the zero-based consonant number.  Each entry already
   includes the top bit of the two-byte code; consecutive entries are
   0x0400 apart.  */
static const int init_to_bit[19] =
{
  0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00,
  0xa000, 0xa400, 0xa800, 0xac00, 0xb000, 0xb400,
  0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, 0xcc00,
  0xd000
};
90 
/* Johab bit pattern contributed by each of the 21 medial vowels,
   indexed by the zero-based vowel number.  The values occupy bits 9..5
   of the two-byte code.  */
static const int mid_to_bit[21] =
{
  0x0060, 0x0080, 0x00a0, 0x00c0, 0x00e0,
  0x0140, 0x0160, 0x0180, 0x01a0, 0x01c0, 0x01e0,
  0x0240, 0x0260, 0x0280, 0x02a0, 0x02c0, 0x02e0,
  0x0340, 0x0360, 0x0380, 0x03a0
};
98 
/* Johab bit pattern (low 5 bits) for the trailing-consonant slot,
   indexed by the syllable's trailing index 0..27, where index 0 stands
   for "no trailing consonant" and maps to the filler pattern 1.  The
   pattern 0x12 is skipped.  */
static const int final_to_bit[28] =
{
  1, 2, 3, 4, 5, 6, 7, 8, 9, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d
};
104 
/* The conversion table from
   UCS4 Hangul Compatibility Jamo in [0x3131,0x3163]
   to Johab.

   cf. 1. KS C 5601-1992 Annex 3 Table 2
       2. Unicode 2.0 manual

 */
/* Two-byte Johab code for each Hangul Compatibility Jamo in
   U+3131..U+3163, indexed by the code point minus 0x3131.  */
static const uint16_t jamo_from_ucs_table[51] =
{
  0x8841, 0x8c41,
  0x8444,
  0x9041,
  0x8446, 0x8447,
  0x9441, 0x9841, 0x9c41,
  0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450,
  0xa041, 0xa441, 0xa841,
  0x8454,
  0xac41, 0xb041, 0xb441, 0xb841, 0xbc41,
  0xc041, 0xc441, 0xc841, 0xcc41, 0xd041,
  0x8461, 0x8481, 0x84a1, 0x84c1, 0x84e1,
  0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1, 0x85e1,
  0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
  0x8741, 0x8761, 0x8781, 0x87a1
};
130 
131 
132 static uint32_t
johab_sym_hanja_to_ucs(uint32_t idx,uint32_t c1,uint32_t c2)133 johab_sym_hanja_to_ucs (uint32_t idx, uint32_t c1, uint32_t c2)
134 {
135   if (idx <= 0xdefe)
136     return (uint32_t) __ksc5601_sym_to_ucs[(c1 - 0xd9) * 188 + c2
137 					   - (c2 > 0x90 ? 0x43 : 0x31)];
138   else
139     return (uint32_t) __ksc5601_hanja_to_ucs[(c1 - 0xe0) * 188 + c2
140 					     - (c2 > 0x90 ? 0x43 : 0x31)];
141 }
/* Definitions used in the body of the `gconv' function.  */
#define CHARSET_NAME		"JOHAB//"
#define FROM_LOOP		from_johab
#define TO_LOOP			to_johab
#define DEFINE_INIT		1
#define DEFINE_FINI		1
/* A JOHAB character is one or two bytes long.  */
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		2
/* The other side is UCS4, four bytes per character.  */
#define MIN_NEEDED_TO		4
#define ONE_DIRECTION		0
152 
153 
/* First define the conversion function from JOHAB to UCS4.  */
#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
#define LOOPFCT			FROM_LOOP
/* Loop body: decode one JOHAB character starting at INPTR and store
   the resulting UCS4 value at OUTPTR.

   - Bytes up to 0x7f are single-byte ISO646-KR; 0x5c becomes U+20A9.
   - Lead bytes 0x84-0xd3 start a two-byte Hangul code that is split
     into three 5-bit fields looked up in init[], mid[] and final[].
   - Lead bytes 0xd9-0xde and 0xe0-0xf9 start a two-byte symbol or
     Hanja code which is range-checked and then handed to
     johab_sym_hanja_to_ucs ().
   - Every other lead byte is passed to STANDARD_FROM_LOOP_ERR_HANDLER.

   If the second byte of a two-byte code is not available, RESULT is
   set to __GCONV_INCOMPLETE_INPUT and the body breaks out.  A decoded
   value of 0 means the tables have no mapping and is treated as an
   error as well.  */
#define BODY \
  { \
    uint32_t ch = *inptr; \
 \
    if (ch <= 0x7f) \
      { \
        /* Plain ISO646-KR.  */ \
        if (ch == 0x5c) \
          ch = 0x20a9; /* half-width Korean Currency WON sign */ \
        ++inptr; \
      } \
    /* Johab : 1. Hangul \
       1st byte : 0x84-0xd3 \
       2nd byte : 0x41-0x7e, 0x81-0xfe \
       2. Hanja & Symbol  : \
       1st byte : 0xd8-0xde, 0xe0-0xf9 \
       2nd byte : 0x31-0x7e, 0x91-0xfe \
       0xd831-0xd87e and 0xd891-0xd8fe are user-defined area */ \
    else \
      { \
        if (__builtin_expect (ch > 0xf9, 0) \
            || __builtin_expect (ch == 0xdf, 0) \
            || (__builtin_expect (ch > 0x7e, 0) && ch < 0x84) \
            || (__builtin_expect (ch > 0xd3, 0) && ch < 0xd9)) \
          { \
            /* These are illegal.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (1); \
          } \
        else \
          { \
            /* Two-byte character.  First test whether the next \
               character is also available.  */ \
            uint32_t ch2; \
            uint32_t idx; \
 \
            if (__glibc_unlikely (inptr + 1 >= inend)) \
              { \
                /* The second character is not available.  Store the \
                   intermediate result.  */ \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
 \
            ch2 = inptr[1]; \
            idx = ch * 256 + ch2; \
            if (__glibc_likely (ch <= 0xd3)) \
              { \
                /* Hangul: bits 14..10 select the leading consonant, \
                   bits 9..5 the vowel, bits 4..0 the trailing \
                   consonant.  */ \
                int i, m, f; \
 \
                i = init[(idx & 0x7c00) >> 10]; \
                m = mid[(idx & 0x03e0) >> 5]; \
                f = final[idx & 0x001f]; \
 \
                if (__builtin_expect (i == -1, 0) \
                    || __builtin_expect (m == -1, 0) \
                    || __builtin_expect (f == -1, 0)) \
                  { \
                    /* This is illegal.  */ \
                    STANDARD_FROM_LOOP_ERR_HANDLER (1); \
                  } \
                else if (i > 0 && m > 0) \
                  /* Full syllable, with or without trailing consonant.  */ \
                  ch = ((i - 1) * 21 + (m - 1)) * 28 + f + 0xac00; \
                else if (i > 0 && m == 0 && f == 0) \
                  /* Lone leading consonant.  */ \
                  ch = init_to_ucs[i - 1]; \
                else if (i == 0 && m > 0 && f == 0) \
                  /* Lone vowel.  */ \
                  ch = 0x314e + m;	/* 0x314f + m - 1 */ \
                else if (__builtin_expect ((i | m) == 0, 1) \
                         && __builtin_expect (f > 0, 1)) \
                  /* Lone trailing consonant; may yield 0, see below.  */ \
                  ch = final_to_ucs[f - 1];	/* round trip?? */ \
                else \
                  { \
                    /* This is illegal.  */ \
                    STANDARD_FROM_LOOP_ERR_HANDLER (1); \
                  } \
              } \
            else \
              { \
                if (__builtin_expect (ch2 < 0x31, 0) \
                    || (__builtin_expect (ch2 > 0x7e, 0) && ch2 < 0x91) \
                    || __builtin_expect (ch2, 0) == 0xff \
                    || (__builtin_expect (ch, 0) == 0xd9 && ch2 > 0xe8) \
                    || (__builtin_expect (ch, 0) == 0xda \
                        && ch2 > 0xa0 && ch2 < 0xd4) \
                    || (__builtin_expect (ch, 0) == 0xde && ch2 > 0xf1)) \
                  { \
                    /* This is illegal.  */ \
                    STANDARD_FROM_LOOP_ERR_HANDLER (1); \
                  } \
                else \
                  { \
                    ch = johab_sym_hanja_to_ucs (idx, ch, ch2); \
                  } \
              } \
          } \
 \
        if (__glibc_unlikely (ch == 0)) \
          { \
            /* This is an illegal character.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (2); \
          } \
 \
        inptr += 2; \
      } \
 \
    put32 (outptr, ch); \
    outptr += 4; \
  }
#define LOOP_NEED_FLAGS
/* Single-byte conversion of the byte C: values up to 0x7f map to
   themselves, except 0x5c which maps to U+20A9 (WON SIGN).  Any other
   byte cannot be converted on its own and yields WEOF.  */
#define ONEBYTE_BODY \
  { \
    if (c <= 0x7f) \
      return (c == 0x5c ? 0x20a9 : c); \
    else \
      return WEOF; \
  }
284 #include <iconv/loop.c>
285 
286 
/* Next, define the other direction.  */
#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
#define LOOPFCT			TO_LOOP
/* Loop body: read one UCS4 character from INPTR and write its JOHAB
   encoding (one or two bytes) at OUTPTR.

   - Values up to 0x7f other than 0x5c are written as a single byte.
   - U+AC00..U+D7A3 (precomposed Hangul) are assembled from
     init_to_bit[], mid_to_bit[] and final_to_bit[].
   - U+3131..U+3163 (compatibility Jamo) use jamo_from_ucs_table[].
   - U+4E00..U+9FA5 and U+F900..U+FA0B go through
     ucs4_to_ksc5601_hanja () and are then repacked.
   - U+20A9 (WON SIGN) is written as the single byte 0x5c.
   - Everything else goes through ucs4_to_ksc5601_sym ().

   When a two-byte result does not fit, RESULT is set to
   __GCONV_FULL_OUTPUT and the body breaks out.  */
#define BODY \
  { \
    uint32_t ch = get32 (inptr); \
 \
    if (ch <= 0x7f && ch != 0x5c) \
      *outptr++ = ch; \
    else \
      { \
        if (ch >= 0xac00 && ch <= 0xd7a3) \
          { \
            if (__glibc_unlikely (outptr + 2 > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
 \
            ch -= 0xac00; \
 \
            ch = (init_to_bit[ch / 588]	  /* 21 * 28 = 588 */ \
                  + mid_to_bit[(ch / 28) % 21]/* (ch % (21 * 28)) / 28 */ \
                  + final_to_bit[ch %  28]);  /* (ch % (21 * 28)) % 28 */ \
 \
            *outptr++ = ch / 256; \
            *outptr++ = ch % 256; \
          } \
        /* KS C 5601-1992 Annex 3 regards  0xA4DA(Hangul Filler : U3164) \
           as symbol */ \
        else if (ch >= 0x3131 && ch <= 0x3163) \
          { \
            ch = jamo_from_ucs_table[ch - 0x3131]; \
 \
            if (__glibc_unlikely (outptr + 2 > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
 \
            *outptr++ = ch / 256; \
            *outptr++ = ch % 256; \
          } \
        else if ((ch >= 0x4e00 && ch <= 0x9fa5) \
                 || (ch >= 0xf900 && ch <= 0xfa0b)) \
          { \
            size_t written; \
            uint32_t temp; \
 \
            written = ucs4_to_ksc5601_hanja (ch, outptr, outend - outptr); \
            if (__builtin_expect (written, 1) == 0) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            if (__glibc_unlikely (written == __UNKNOWN_10646_CHAR)) \
              { \
                STANDARD_TO_LOOP_ERR_HANDLER (4); \
              } \
 \
            /* Rebase the two bytes left in OUTPTR by the helper \
               (presumably a KS C 5601 row/column pair -- confirm \
               against ksc5601.h), then repack the 94-column rows \
               into Johab's 188 trail bytes per lead byte.  */ \
            outptr[0] -= 0x4a; \
            outptr[1] -= 0x21; \
 \
            temp = outptr[0] * 94 + outptr[1]; \
 \
            outptr[0] = 0xe0 + temp / 188; \
            outptr[1] = temp % 188; \
            outptr[1] += outptr[1] >= 78 ? 0x43 : 0x31; \
 \
            outptr += 2; \
          } \
        else if (ch == 0x20a9) \
          *outptr++ = 0x5c; \
        else \
          { \
            size_t written; \
            uint32_t temp; \
 \
            written = ucs4_to_ksc5601_sym (ch, outptr, outend - outptr); \
            if (__builtin_expect (written, 1) == 0) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            if (__builtin_expect (written == __UNKNOWN_10646_CHAR, 0) \
                || (outptr[0] == 0x22 && outptr[1] > 0x68)) \
              { \
                UNICODE_TAG_HANDLER (ch, 4); \
                STANDARD_TO_LOOP_ERR_HANDLER (4); \
              } \
 \
            /* Fold two source rows into one Johab lead byte; the \
               parity of TEMP selects the upper or lower half of the \
               trail-byte range.  */ \
            temp = (outptr[0] < 0x4a ? outptr[0] + 0x191 : outptr[0] + 0x176); \
            outptr[1] += (temp % 2 ? 0x5e : 0); \
            outptr[1] += (outptr[1] < 0x6f ? 0x10 : 0x22); \
            outptr[0] = temp / 2; \
 \
            outptr += 2; \
          } \
      } \
 \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
405 #include <iconv/loop.c>
406 
407 
408 /* Now define the toplevel functions.  */
409 #include <iconv/skeleton.c>
410