1 /* Conversion from and to Shift_JISX0213.
2    Copyright (C) 2002-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #include <dlfcn.h>
20 #include <stdint.h>
21 #include <gconv.h>
22 
23 /* The structure of Shift_JISX0213 is as follows:
24 
25    0x00..0x7F: ISO646-JP, an ASCII variant
26 
27    0x{A1..DF}: JISX0201 Katakana.
28 
29    0x{81..9F,E0..EF}{40..7E,80..FC}: JISX0213 plane 1.
30 
31    0x{F0..FC}{40..7E,80..FC}: JISX0213 plane 2, with irregular row mapping.
32 
33    Note that some JISX0213 characters are not contained in Unicode 3.2
34    and are therefore best represented as sequences of Unicode characters.
35 */
36 
37 #include "jisx0213.h"
38 
39 /* Definitions used in the body of the `gconv' function.  */
/* Name of the character set and of the two conversion loop functions
   generated further down in this file.  */
#define CHARSET_NAME		"SHIFT_JISX0213//"
#define FROM_LOOP		from_shift_jisx0213
#define TO_LOOP			to_shift_jisx0213

/* Switches for the generic skeleton.  NOTE(review): their precise meaning
   is defined by iconv/skeleton.c, which is not visible from this file.  */
#define DEFINE_INIT		1
#define DEFINE_FINI		1
#define ONE_DIRECTION		0

/* Shift_JISX0213 -> UCS-4: a character occupies 1 or 2 input bytes and
   produces 4 or 8 output bytes (one or two UCS-4 code points).  */
#define FROM_LOOP_MIN_NEEDED_FROM	1
#define FROM_LOOP_MAX_NEEDED_FROM	2
#define FROM_LOOP_MIN_NEEDED_TO		4
#define FROM_LOOP_MAX_NEEDED_TO		8

/* UCS-4 -> Shift_JISX0213: input is consumed in 4-byte units and an
   output character takes 1 or 2 bytes.  */
#define TO_LOOP_MIN_NEEDED_FROM		4
#define TO_LOOP_MAX_NEEDED_FROM		4
#define TO_LOOP_MIN_NEEDED_TO		1
#define TO_LOOP_MAX_NEEDED_TO		2

/* Locals shared by the state handling macros below: STATEP points at the
   COUNT element of the conversion state, SAVED_STATE is scratch space for
   SAVE_RESET_STATE.  */
#define PREPARE_LOOP \
  int *statep = &data->__statep->__count;				      \
  int saved_state;

/* STATEP is handed to both loop functions as an extra argument.  */
#define EXTRA_LOOP_ARGS		, statep
58 
59 
/* Since we might have to reset the input pointer, we must be able to save
   and restore the state.  */
/* A non-zero SAVE copies *STATEP into SAVED_STATE; a zero SAVE copies it
   back.  Both variables are the ones declared by PREPARE_LOOP.  */
#define SAVE_RESET_STATE(Save) \
  ((void) ((Save) ? (saved_state = *statep) : (*statep = saved_state)))
67 
68 
69 /* During Shift_JISX0213 to UCS-4 conversion, the COUNT element of the state
70    contains the last UCS-4 character, shifted by 3 bits.
71    During UCS-4 to Shift_JISX0213 conversion, the COUNT element of the state
72    contains the last two bytes to be output, shifted by 3 bits.  */
73 
74 /* Since this is a stateful encoding we have to provide code which resets
75    the output state to the initial state.  This has to be done during the
76    flushing.  */
/* Flush whatever is still queued in the COUNT element of the state.
   Decoding (FROM_DIRECTION) queues one UCS-4 code point, which needs four
   output bytes; encoding queues the two bytes of one character.  If the
   output buffer is too small, STATUS is set to __GCONV_FULL_OUTPUT and the
   state is left untouched.  */
#define EMIT_SHIFT_TO_INIT \
  if (data->__statep->__count != 0)					      \
    {									      \
      if (__glibc_unlikely (FROM_DIRECTION				      \
			    ? outbuf + 4 > outend			      \
			    : outbuf + 2 > outend))			      \
	/* We don't have enough room in the output buffer.  */		      \
	status = __GCONV_FULL_OUTPUT;					      \
      else								      \
	{								      \
	  /* The payload sits above the low three bits.  */		      \
	  uint32_t queued = data->__statep->__count >> 3;		      \
									      \
	  if (FROM_DIRECTION)						      \
	    {								      \
	      /* One UCS-4 code point.  */				      \
	      *((uint32_t *) outbuf) = queued;				      \
	      outbuf += sizeof (uint32_t);				      \
	    }								      \
	  else								      \
	    {								      \
	      /* Lead byte followed by trail byte.  */			      \
	      *outbuf++ = (queued >> 8) & 0xff;				      \
	      *outbuf++ = queued & 0xff;				      \
	    }								      \
	  data->__statep->__count = 0;					      \
	}								      \
    }
108 
109 
110 /* First define the conversion function from Shift_JISX0213 to UCS-4.  */
#define MIN_NEEDED_INPUT	FROM_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	FROM_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	FROM_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT	FROM_LOOP_MAX_NEEDED_TO
#define LOOPFCT			FROM_LOOP
/* Convert one Shift_JISX0213 character at INPTR into UCS-4 at OUTPTR.

   Some JISX0213 characters map to a sequence of two UCS-4 code points.
   The first is always written; if there is no room for the second, it is
   queued in *STATEP (shifted left by 3 bits) and the loop stops with
   __GCONV_FULL_OUTPUT.  The next invocation emits the queued code point
   and empties the queue before it looks at any further input.

   On exit RESULT is left alone on success, or set to
   __GCONV_INCOMPLETE_INPUT (second byte missing) or __GCONV_FULL_OUTPUT;
   invalid input is routed through STANDARD_FROM_LOOP_ERR_HANDLER.  */
#define BODY \
  {									      \
    uint32_t ch;							      \
									      \
    /* Determine whether there is a buffered character pending.  */	      \
    ch = *statep >> 3;							      \
    if (__glibc_likely (ch == 0))					      \
      {									      \
	/* No - so look at the next input byte.  */			      \
	ch = *inptr;							      \
									      \
	if (ch < 0x80)							      \
	  {								      \
	    /* Plain ISO646-JP character: identical to ASCII except for     \
	       the yen sign and the overline.  */			      \
	    if (__glibc_unlikely (ch == 0x5c))				      \
	      ch = 0xa5;						      \
	    else if (__glibc_unlikely (ch == 0x7e))			      \
	      ch = 0x203e;						      \
	    ++inptr;							      \
	  }								      \
	else if (ch >= 0xa1 && ch <= 0xdf)				      \
	  {								      \
	    /* Half-width katakana: 0xA1..0xDF -> U+FF61..U+FF9F.  */	      \
	    ch += 0xfec0;						      \
	    ++inptr;							      \
	  }								      \
	else if ((ch >= 0x81 && ch <= 0x9f) || (ch >= 0xe0 && ch <= 0xfc))    \
	  {								      \
	    /* Two byte character.  */					      \
	    uint32_t ch2;						      \
									      \
	    if (__glibc_unlikely (inptr + 1 >= inend))			      \
	      {								      \
		/* The second byte is not available.  */		      \
		result = __GCONV_INCOMPLETE_INPUT;			      \
		break;							      \
	      }								      \
									      \
	    ch2 = inptr[1];						      \
									      \
	    /* The second byte must be in the range 0x{40..7E,80..FC}.  */    \
	    if (__glibc_unlikely (ch2 < 0x40 || ch2 == 0x7f || ch2 > 0xfc))   \
	      {								      \
		/* This is an illegal character.  */			      \
		STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
	      }								      \
									      \
	    /* Convert to row and column: close the gaps in the lead and    \
	       trail byte ranges first.  */				      \
	    if (ch < 0xe0)						      \
	      ch -= 0x81;						      \
	    else							      \
	      ch -= 0xc1;						      \
	    if (ch2 < 0x80)						      \
	      ch2 -= 0x40;						      \
	    else							      \
	      ch2 -= 0x41;						      \
	    /* Now 0 <= ch <= 0x3b, 0 <= ch2 <= 0xbb.  */		      \
	    /* Every lead byte covers two rows of 0x5e columns each.  */     \
	    ch = 2 * ch;						      \
	    if (ch2 >= 0x5e)						      \
	      ch2 -= 0x5e, ch++;					      \
	    ch2 += 0x21;						      \
	    if (ch >= 0x5e)						      \
	      {								      \
		/* Handling of JISX 0213 plane 2 rows, which are not	      \
		   laid out contiguously.  */				      \
		if (ch >= 0x67)						      \
		  ch += 230;						      \
		else if (ch >= 0x63 || ch == 0x5f)			      \
		  ch += 168;						      \
		else							      \
		  ch += 162;						      \
	      }								      \
									      \
	    ch = jisx0213_to_ucs4 (0x121 + ch, ch2);			      \
									      \
	    if (ch == 0)						      \
	      {								      \
		/* This is an illegal character.  */			      \
		STANDARD_FROM_LOOP_ERR_HANDLER (1);			      \
	      }								      \
									      \
	    inptr += 2;							      \
									      \
	    if (ch < 0x80)						      \
	      {								      \
		/* It's a combining character: CH is a 1-based index into   \
		   the table of two-element sequences.  */		      \
		uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];	      \
		uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];	      \
									      \
		put32 (outptr, u1);					      \
		outptr += 4;						      \
									      \
		/* See whether we have room for two characters.  */	      \
		if (outptr + 4 <= outend)				      \
		  {							      \
		    put32 (outptr, u2);					      \
		    outptr += 4;					      \
		    continue;						      \
		  }							      \
									      \
		/* Otherwise store only the first character now, and	      \
		   put the second one into the queue.  */		      \
		*statep = u2 << 3;					      \
		/* Tell the caller why we terminate the loop.  */	      \
		result = __GCONV_FULL_OUTPUT;				      \
		break;							      \
	      }								      \
	  }								      \
	else								      \
	  {								      \
	    /* This is illegal.  */					      \
	    STANDARD_FROM_LOOP_ERR_HANDLER (1);				      \
	  }								      \
      }									      \
    else								      \
      {									      \
	/* Clear the queue and proceed to output the saved character.	      \
	   Without this the same code point would be written again on	      \
	   every following iteration and no input would ever be	      \
	   consumed.  Only the payload above the low three bits is	      \
	   dropped.  */							      \
	*statep &= 7;							      \
      }									      \
									      \
    put32 (outptr, ch);							      \
    outptr += 4;							      \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS	, int *statep
/* Map the single byte C to the wide character it encodes on its own, or
   to WEOF if C can only appear as part of a two-byte character (or is
   not valid at all).  */
#define ONEBYTE_BODY \
  {									      \
    /* Half-width katakana: 0xA1..0xDF -> U+FF61..U+FF9F.  */		      \
    if (c >= 0xa1 && c <= 0xdf)						      \
      return c + 0xfec0;						      \
    /* Every other byte with the high bit set is not a character.  */	      \
    if (c >= 0x80)							      \
      return WEOF;							      \
    /* ISO646-JP: ASCII except for the yen sign and the overline.  */	      \
    switch (c)								      \
      {									      \
      case 0x5c:							      \
	return 0xa5;							      \
      case 0x7e:							      \
	return 0x203e;							      \
      default:								      \
	return c;							      \
      }									      \
  }
249 #include <iconv/loop.c>
250 
251 
252 /* Next, define the other direction, from UCS-4 to Shift_JISX0213.  */
253 
254 /* Composition tables for each of the relevant combining characters.  */
/* Position and length of the slice of comp_table_data that belongs to
   each combining character.  The slices follow one another in the order
   U+02E5, U+02E9, U+0300, U+0301, U+309A.  */
#define COMP_TABLE_IDX_02E5 0
#define COMP_TABLE_LEN_02E5 1
#define COMP_TABLE_IDX_02E9 (COMP_TABLE_IDX_02E5 + COMP_TABLE_LEN_02E5)
#define COMP_TABLE_LEN_02E9 1
#define COMP_TABLE_IDX_0300 (COMP_TABLE_IDX_02E9 + COMP_TABLE_LEN_02E9)
#define COMP_TABLE_LEN_0300 5
#define COMP_TABLE_IDX_0301 (COMP_TABLE_IDX_0300 + COMP_TABLE_LEN_0300)
#define COMP_TABLE_LEN_0301 4
#define COMP_TABLE_IDX_309A (COMP_TABLE_IDX_0301 + COMP_TABLE_LEN_0301)
#define COMP_TABLE_LEN_309A 14

/* Each entry pairs the two-byte Shift_JISX0213 code of a base character
   (BASE) with the code of the precomposed character (COMPOSED) that
   results when the slice's combining character follows it.  The trailing
   comments give the same relation in JISX0213 plane/row/column form.  */
static const struct
{
  uint16_t base;
  uint16_t composed;
} comp_table_data[] =
{
  /* U+02E5 */
  { .base = 0x8684, .composed = 0x8685 }, /* 0x12B65 = 0x12B64 U+02E5 */

  /* U+02E9 */
  { .base = 0x8680, .composed = 0x8686 }, /* 0x12B66 = 0x12B60 U+02E9 */

  /* U+0300 */
  { .base = 0x857b, .composed = 0x8663 }, /* 0x12B44 = 0x1295C U+0300 */
  { .base = 0x8657, .composed = 0x8667 }, /* 0x12B48 = 0x12B38 U+0300 */
  { .base = 0x8656, .composed = 0x8669 }, /* 0x12B4A = 0x12B37 U+0300 */
  { .base = 0x864f, .composed = 0x866b }, /* 0x12B4C = 0x12B30 U+0300 */
  { .base = 0x8662, .composed = 0x866d }, /* 0x12B4E = 0x12B43 U+0300 */

  /* U+0301 */
  { .base = 0x8657, .composed = 0x8668 }, /* 0x12B49 = 0x12B38 U+0301 */
  { .base = 0x8656, .composed = 0x866a }, /* 0x12B4B = 0x12B37 U+0301 */
  { .base = 0x864f, .composed = 0x866c }, /* 0x12B4D = 0x12B30 U+0301 */
  { .base = 0x8662, .composed = 0x866e }, /* 0x12B4F = 0x12B43 U+0301 */

  /* U+309A */
  { .base = 0x82a9, .composed = 0x82f5 }, /* 0x12477 = 0x1242B U+309A */
  { .base = 0x82ab, .composed = 0x82f6 }, /* 0x12478 = 0x1242D U+309A */
  { .base = 0x82ad, .composed = 0x82f7 }, /* 0x12479 = 0x1242F U+309A */
  { .base = 0x82af, .composed = 0x82f8 }, /* 0x1247A = 0x12431 U+309A */
  { .base = 0x82b1, .composed = 0x82f9 }, /* 0x1247B = 0x12433 U+309A */
  { .base = 0x834a, .composed = 0x8397 }, /* 0x12577 = 0x1252B U+309A */
  { .base = 0x834c, .composed = 0x8398 }, /* 0x12578 = 0x1252D U+309A */
  { .base = 0x834e, .composed = 0x8399 }, /* 0x12579 = 0x1252F U+309A */
  { .base = 0x8350, .composed = 0x839a }, /* 0x1257A = 0x12531 U+309A */
  { .base = 0x8352, .composed = 0x839b }, /* 0x1257B = 0x12533 U+309A */
  { .base = 0x835a, .composed = 0x839c }, /* 0x1257C = 0x1253B U+309A */
  { .base = 0x8363, .composed = 0x839d }, /* 0x1257D = 0x12544 U+309A */
  { .base = 0x8367, .composed = 0x839e }, /* 0x1257E = 0x12548 U+309A */
  { .base = 0x83f3, .composed = 0x83f6 }, /* 0x12678 = 0x12675 U+309A */
};
297 
#define MIN_NEEDED_INPUT	TO_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	TO_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	TO_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT	TO_LOOP_MAX_NEEDED_TO
#define LOOPFCT			TO_LOOP
/* Convert one UCS-4 character at INPTR into Shift_JISX0213 at OUTPTR.

   A character that may be the base of one of the sequences listed in
   comp_table_data is not written immediately: its two bytes are queued in
   *STATEP (shifted left by 3 bits).  On the next invocation the queued
   character is either merged with the incoming combining character into
   the precomposed code, or written as is, in which case the incoming
   character is left for the following iteration.  */
#define BODY \
  {									      \
    uint32_t ch = get32 (inptr);					      \
    int queued = *statep >> 3;						      \
    unsigned int onebyte;						      \
									      \
    if (queued != 0)							      \
      {									      \
	uint16_t lasttwo = queued;					      \
	unsigned int idx = 0;						      \
	unsigned int len = 0;						      \
									      \
	/* Pick the table slice for CH; LEN stays 0 if CH is not one of    \
	   the combining characters.  */				      \
	switch (ch)							      \
	  {								      \
	  case 0x02e5:							      \
	    idx = COMP_TABLE_IDX_02E5;					      \
	    len = COMP_TABLE_LEN_02E5;					      \
	    break;							      \
	  case 0x02e9:							      \
	    idx = COMP_TABLE_IDX_02E9;					      \
	    len = COMP_TABLE_LEN_02E9;					      \
	    break;							      \
	  case 0x0300:							      \
	    idx = COMP_TABLE_IDX_0300;					      \
	    len = COMP_TABLE_LEN_0300;					      \
	    break;							      \
	  case 0x0301:							      \
	    idx = COMP_TABLE_IDX_0301;					      \
	    len = COMP_TABLE_LEN_0301;					      \
	    break;							      \
	  case 0x309a:							      \
	    idx = COMP_TABLE_IDX_309A;					      \
	    len = COMP_TABLE_LEN_309A;					      \
	    break;							      \
	  default:							      \
	    break;							      \
	  }								      \
									      \
	/* Search the slice for the queued base character.  LEN is	      \
	   non-zero afterwards exactly if IDX designates a match.  */	      \
	for (; len > 0; ++idx, --len)					      \
	  if (comp_table_data[idx].base == lasttwo)			      \
	    break;							      \
									      \
	/* Combined or not, two bytes are about to be written.  */	      \
	if (__glibc_unlikely (outptr + 1 >= outend))			      \
	  {								      \
	    result = __GCONV_FULL_OUTPUT;				      \
	    break;							      \
	  }								      \
									      \
	if (len > 0)							      \
	  {								      \
	    /* Replace the base by the precomposed character and	      \
	       consume the combining character.  */			      \
	    lasttwo = comp_table_data[idx].composed;			      \
	    inptr += 4;							      \
	  }								      \
									      \
	*outptr++ = (lasttwo >> 8) & 0xff;				      \
	*outptr++ = lasttwo & 0xff;					      \
	*statep = 0;							      \
	continue;							      \
      }									      \
									      \
    /* Characters with a one-byte encoding; 0x100 means "none".  */	      \
    if (ch >= 0xff61 && ch <= 0xff9f)					      \
      /* Half-width katakana.  */					      \
      onebyte = ch - 0xfec0;						      \
    else if (ch == 0x203e)						      \
      onebyte = 0x7e;							      \
    else if (ch == 0xa5)						      \
      onebyte = 0x5c;							      \
    else if (ch < 0x80)							      \
      /* Plain ISO646-JP character.  */					      \
      onebyte = ch;							      \
    else								      \
      onebyte = 0x100;							      \
									      \
    if (onebyte < 0x100)						      \
      *outptr++ = onebyte;						      \
    else								      \
      {									      \
	uint32_t jch = ucs4_to_jisx0213 (ch);				      \
	unsigned int s1;						      \
	unsigned int s2;						      \
									      \
	if (jch == 0)							      \
	  {								      \
	    UNICODE_TAG_HANDLER (ch, 4);				      \
									      \
	    /* Illegal character.  */					      \
	    STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
	  }								      \
									      \
	/* Convert it to shifted representation: zero-based row in S1,    \
	   zero-based column in S2.  */					      \
	s1 = (jch >> 8) - 0x21;						      \
	s2 = (jch & 0x7f) - 0x21;					      \
	if (s1 >= 0x5e)							      \
	  /* JISX 0213 plane 2: rows 0x26E..0x27E move down by 102,	      \
	     rows 0x228 and 0x22C..0x22F by 40, rows 0x221 and	      \
	     0x223..0x225 by 34.  Afterwards 0x5e <= s1 <= 0x77.  */	      \
	  s1 -= (s1 >= 0xcd ? 102					      \
		 : (s1 >= 0x8b || s1 == 0x87) ? 40 : 34);		      \
									      \
	/* Two rows share a lead byte; odd rows use the upper half of     \
	   the trail byte range.  */					      \
	s2 += (s1 & 1) ? 0x5e : 0;					      \
	s1 >>= 1;							      \
	s1 += (s1 < 0x1f) ? 0x81 : 0xc1;				      \
	s2 += (s2 < 0x3f) ? 0x40 : 0x41;				      \
									      \
	if (jch & 0x0080)						      \
	  {								      \
	    /* A possible match in comp_table_data.  We have to buffer it.  */\
									      \
	    /* We know it's a JISX 0213 plane 1 character.  */		      \
	    assert ((jch & 0x8000) == 0);				      \
									      \
	    *statep = ((s1 << 8) | s2) << 3;				      \
	    inptr += 4;							      \
	    continue;							      \
	  }								      \
									      \
	/* Output the shifted representation.  */			      \
	if (__glibc_unlikely (outptr + 1 >= outend))			      \
	  {								      \
	    result = __GCONV_FULL_OUTPUT;				      \
	    break;							      \
	  }								      \
	*outptr++ = s1;							      \
	*outptr++ = s2;							      \
      }									      \
									      \
    inptr += 4;								      \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS	, int *statep
437 #include <iconv/loop.c>
438 
439 
440 /* Now define the toplevel functions.  */
441 #include <iconv/skeleton.c>
442