1 /* Conversion from and to EUC-JISX0213.
2    Copyright (C) 2002-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #include <dlfcn.h>
20 #include <stdint.h>
21 #include <gconv.h>
22 
23 /* The structure of EUC-JISX0213 is as follows:
24 
25    0x00..0x7F: ASCII
26 
27    0x8E{A1..FE}: JISX0201 Katakana, with prefix 0x8E, offset by +0x80.
28 
29    0x8F{A1..FE}{A1..FE}: JISX0213 plane 2, with prefix 0x8F, offset by +0x8080.
30 
31    0x{A1..FE}{A1..FE}: JISX0213 plane 1, offset by +0x8080.
32 
33    Note that some JISX0213 characters are not contained in Unicode 3.2
34    and are therefore best represented as sequences of Unicode characters.
35 */
36 
37 #include "jisx0213.h"
38 
/* Settings read by the generic `gconv' code that is included below.  */
#define CHARSET_NAME "EUC-JISX0213//"
#define FROM_LOOP from_euc_jisx0213
#define TO_LOOP to_euc_jisx0213
#define DEFINE_INIT 1
#define DEFINE_FINI 1
#define ONE_DIRECTION 0

/* EUC-JISX0213 -> UCS-4: a character occupies 1 to 3 input bytes and
   produces 4 or 8 output bytes.  */
#define FROM_LOOP_MIN_NEEDED_FROM 1
#define FROM_LOOP_MAX_NEEDED_FROM 3
#define FROM_LOOP_MIN_NEEDED_TO 4
#define FROM_LOOP_MAX_NEEDED_TO 8

/* UCS-4 -> EUC-JISX0213: a character occupies 4 input bytes and produces
   1 to 3 output bytes.  */
#define TO_LOOP_MIN_NEEDED_FROM 4
#define TO_LOOP_MAX_NEEDED_FROM 4
#define TO_LOOP_MIN_NEEDED_TO 1
#define TO_LOOP_MAX_NEEDED_TO 3

/* Local variables set up before the conversion loops run: STATEP points
   at the COUNT element of the conversion state, SAVED_STATE is scratch
   space for SAVE_RESET_STATE.  STATEP is handed to the loop functions as
   an additional argument.  */
#define PREPARE_LOOP \
  int *statep = &data->__statep->__count; \
  int saved_state;
#define EXTRA_LOOP_ARGS , statep
58 
59 
/* The input pointer may have to be reset, so the state must be savable
   and restorable: a nonzero SAVE copies *STATEP into SAVED_STATE, a zero
   SAVE copies SAVED_STATE back into *STATEP.  */
#define SAVE_RESET_STATE(Save) \
  if (!(Save)) \
    *statep = saved_state; \
  else \
    saved_state = *statep
67 
68 
/* During EUC-JISX0213 to UCS-4 conversion, the COUNT element of the state
   contains the last UCS-4 character, shifted by 3 bits.
   During UCS-4 to EUC-JISX0213 conversion, the COUNT element of the state
   contains the last two bytes to be output, shifted by 3 bits.  */

/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   If the state still holds a queued character, it is written to OUTBUF
   and the state is cleared.  If OUTBUF does not have enough room (4 bytes
   in the FROM direction, 2 bytes otherwise), STATUS is set to
   __GCONV_FULL_OUTPUT and the state is left unchanged.

   In the FROM direction the UCS-4 value is copied byte by byte, in the
   host's byte order, because OUTBUF is a byte pointer that is not known to
   be suitably aligned for a direct uint32_t store.  */
#define EMIT_SHIFT_TO_INIT \
  if (data->__statep->__count != 0) \
    { \
      if (FROM_DIRECTION) \
        { \
          if (__glibc_likely (outbuf + 4 <= outend)) \
            { \
              /* Write out the last character.  */ \
              const uint32_t queued_ucs4 = data->__statep->__count >> 3; \
              const unsigned char *queued_bytes \
                = (const unsigned char *) &queued_ucs4; \
              unsigned int byte_idx; \
              \
              for (byte_idx = 0; byte_idx < sizeof (uint32_t); ++byte_idx) \
                *outbuf++ = queued_bytes[byte_idx]; \
              data->__statep->__count = 0; \
            } \
          else \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
        } \
      else \
        { \
          if (__glibc_likely (outbuf + 2 <= outend)) \
            { \
              /* Write out the last character.  */ \
              uint32_t lasttwo = data->__statep->__count >> 3; \
              *outbuf++ = (lasttwo >> 8) & 0xff; \
              *outbuf++ = lasttwo & 0xff; \
              data->__statep->__count = 0; \
            } \
          else \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
        } \
    }
108 
109 
/* First define the conversion function from EUC-JISX0213 to UCS-4.  */
#define MIN_NEEDED_INPUT	FROM_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	FROM_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	FROM_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT	FROM_LOOP_MAX_NEEDED_TO
#define LOOPFCT			FROM_LOOP
/* Convert one EUC-JISX0213 character at INPTR to UCS-4 at OUTPTR.

   A JISX0213 code that jisx0213_to_ucs4 reports as a value below 0x80 is
   an index (plus one) into __jisx0213_to_ucs_combining and stands for two
   UCS-4 characters.  The first is always written; if there is no room for
   the second, it is queued in *STATEP (shifted left by 3) and
   __GCONV_FULL_OUTPUT is reported.

   When the body is entered with a character queued in *STATEP, that
   character is written without looking at the input, and the queue is
   emptied so that it is written exactly once.  */
#define BODY \
  { \
    uint32_t ch; \
    \
    /* Determine whether there is a buffered character pending.  */ \
    ch = *statep >> 3; \
    if (__glibc_likely (ch == 0)) \
      { \
        /* No - so look at the next input byte.  */ \
        ch = *inptr; \
        \
        if (ch < 0x80) \
          /* Plain ASCII character.  */ \
          ++inptr; \
        else if ((ch >= 0xa1 && ch <= 0xfe) || ch == 0x8e || ch == 0x8f) \
          { \
            /* Two or three byte character.  */ \
            uint32_t ch2; \
            \
            if (__glibc_unlikely (inptr + 1 >= inend)) \
              { \
                /* The second byte is not available.  */ \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            \
            ch2 = inptr[1]; \
            \
            /* The second byte must be >= 0xa1 and <= 0xfe.  */ \
            if (__glibc_unlikely (ch2 < 0xa1 || ch2 > 0xfe)) \
              { \
                /* This is an illegal character.  */ \
                STANDARD_FROM_LOOP_ERR_HANDLER (1); \
              } \
            \
            if (ch == 0x8e) \
              { \
                /* Half-width katakana.  Only 0xa1..0xdf are assigned; \
                   they map to U+FF61..U+FF9F.  */ \
                if (__glibc_unlikely (ch2 > 0xdf)) \
                  STANDARD_FROM_LOOP_ERR_HANDLER (1); \
                \
                ch = ch2 + 0xfec0; \
                inptr += 2; \
              } \
            else \
              { \
                /* First byte after the character, once it is known.  */ \
                const unsigned char *endp; \
                \
                if (ch == 0x8f) \
                  { \
                    /* JISX 0213 plane 2.  */ \
                    uint32_t ch3; \
                    \
                    if (__glibc_unlikely (inptr + 2 >= inend)) \
                      { \
                        /* The third byte is not available.  */ \
                        result = __GCONV_INCOMPLETE_INPUT; \
                        break; \
                      } \
                    \
                    ch3 = inptr[2]; \
                    endp = inptr + 3; \
                    \
                    ch = jisx0213_to_ucs4 (0x200 - 0x80 + ch2, ch3 ^ 0x80); \
                  } \
                else \
                  { \
                    /* JISX 0213 plane 1.  */ \
                    endp = inptr + 2; \
                    \
                    ch = jisx0213_to_ucs4 (0x100 - 0x80 + ch, ch2 ^ 0x80); \
                  } \
                \
                if (ch == 0) \
                  /* This is an illegal character.  */ \
                  STANDARD_FROM_LOOP_ERR_HANDLER (1); \
                \
                inptr = endp; \
                \
                if (ch < 0x80) \
                  { \
                    /* It's a combining character.  */ \
                    uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0]; \
                    uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1]; \
                    \
                    put32 (outptr, u1); \
                    outptr += 4; \
                    \
                    /* See whether we have room for two characters.  */ \
                    if (outptr + 4 <= outend) \
                      { \
                        put32 (outptr, u2); \
                        outptr += 4; \
                        continue; \
                      } \
                    \
                    /* Otherwise store only the first character now, and \
                       put the second one into the queue.  */ \
                    *statep = u2 << 3; \
                    /* Tell the caller why we terminate the loop.  */ \
                    result = __GCONV_FULL_OUTPUT; \
                    break; \
                  } \
              } \
          } \
        else \
          { \
            /* This is illegal.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (1); \
          } \
      } \
    else \
      /* Clear the queue and proceed to output the saved character. \
         Without this the same character would be written again on every \
         following iteration and once more when flushing.  */ \
      *statep = 0; \
    \
    put32 (outptr, ch); \
    outptr += 4; \
  }
231 #define LOOP_NEED_FLAGS
232 #define EXTRA_LOOP_DECLS	, int *statep
233 #define ONEBYTE_BODY \
234   {									      \
235     if (c < 0x80)							      \
236       return c;								      \
237     else								      \
238       return WEOF;							      \
239   }
240 #include <iconv/loop.c>
241 
242 
243 /* Next, define the other direction, from UCS-4 to EUC-JISX0213.  */
244 
/* Composition tables for each of the relevant combining characters.

   comp_table_data is one array split into consecutive slices, one per
   combining character U+XXXX.  A slice starts at COMP_TABLE_IDX_XXXX and
   has COMP_TABLE_LEN_XXXX entries.  Each entry gives the two output bytes
   of a base character and the two output bytes of the character composed
   from that base and the combining character.  */
#define COMP_TABLE_IDX_02E5 0
#define COMP_TABLE_LEN_02E5 1
#define COMP_TABLE_IDX_02E9 (COMP_TABLE_IDX_02E5 + COMP_TABLE_LEN_02E5)
#define COMP_TABLE_LEN_02E9 1
#define COMP_TABLE_IDX_0300 (COMP_TABLE_IDX_02E9 + COMP_TABLE_LEN_02E9)
#define COMP_TABLE_LEN_0300 5
#define COMP_TABLE_IDX_0301 (COMP_TABLE_IDX_0300 + COMP_TABLE_LEN_0300)
#define COMP_TABLE_LEN_0301 4
#define COMP_TABLE_IDX_309A (COMP_TABLE_IDX_0301 + COMP_TABLE_LEN_0301)
#define COMP_TABLE_LEN_309A 14

static const struct
{
  uint16_t base;
  uint16_t composed;
} comp_table_data[] =
{
  /* Slice for U+02E5.  */
  { .base = 0xabe4, .composed = 0xabe5 }, /* 0x12B65 = 0x12B64 U+02E5 */

  /* Slice for U+02E9.  */
  { .base = 0xabe0, .composed = 0xabe6 }, /* 0x12B66 = 0x12B60 U+02E9 */

  /* Slice for U+0300.  */
  { .base = 0xa9dc, .composed = 0xabc4 }, /* 0x12B44 = 0x1295C U+0300 */
  { .base = 0xabb8, .composed = 0xabc8 }, /* 0x12B48 = 0x12B38 U+0300 */
  { .base = 0xabb7, .composed = 0xabca }, /* 0x12B4A = 0x12B37 U+0300 */
  { .base = 0xabb0, .composed = 0xabcc }, /* 0x12B4C = 0x12B30 U+0300 */
  { .base = 0xabc3, .composed = 0xabce }, /* 0x12B4E = 0x12B43 U+0300 */

  /* Slice for U+0301.  */
  { .base = 0xabb8, .composed = 0xabc9 }, /* 0x12B49 = 0x12B38 U+0301 */
  { .base = 0xabb7, .composed = 0xabcb }, /* 0x12B4B = 0x12B37 U+0301 */
  { .base = 0xabb0, .composed = 0xabcd }, /* 0x12B4D = 0x12B30 U+0301 */
  { .base = 0xabc3, .composed = 0xabcf }, /* 0x12B4F = 0x12B43 U+0301 */

  /* Slice for U+309A.  */
  { .base = 0xa4ab, .composed = 0xa4f7 }, /* 0x12477 = 0x1242B U+309A */
  { .base = 0xa4ad, .composed = 0xa4f8 }, /* 0x12478 = 0x1242D U+309A */
  { .base = 0xa4af, .composed = 0xa4f9 }, /* 0x12479 = 0x1242F U+309A */
  { .base = 0xa4b1, .composed = 0xa4fa }, /* 0x1247A = 0x12431 U+309A */
  { .base = 0xa4b3, .composed = 0xa4fb }, /* 0x1247B = 0x12433 U+309A */
  { .base = 0xa5ab, .composed = 0xa5f7 }, /* 0x12577 = 0x1252B U+309A */
  { .base = 0xa5ad, .composed = 0xa5f8 }, /* 0x12578 = 0x1252D U+309A */
  { .base = 0xa5af, .composed = 0xa5f9 }, /* 0x12579 = 0x1252F U+309A */
  { .base = 0xa5b1, .composed = 0xa5fa }, /* 0x1257A = 0x12531 U+309A */
  { .base = 0xa5b3, .composed = 0xa5fb }, /* 0x1257B = 0x12533 U+309A */
  { .base = 0xa5bb, .composed = 0xa5fc }, /* 0x1257C = 0x1253B U+309A */
  { .base = 0xa5c4, .composed = 0xa5fd }, /* 0x1257D = 0x12544 U+309A */
  { .base = 0xa5c8, .composed = 0xa5fe }, /* 0x1257E = 0x12548 U+309A */
  { .base = 0xa6f5, .composed = 0xa6f8 }, /* 0x12678 = 0x12675 U+309A */
};
288 
289 #define MIN_NEEDED_INPUT	TO_LOOP_MIN_NEEDED_FROM
290 #define MAX_NEEDED_INPUT	TO_LOOP_MAX_NEEDED_FROM
291 #define MIN_NEEDED_OUTPUT	TO_LOOP_MIN_NEEDED_TO
292 #define MAX_NEEDED_OUTPUT	TO_LOOP_MAX_NEEDED_TO
293 #define LOOPFCT			TO_LOOP
294 #define BODY \
295   {									      \
296     uint32_t ch = get32 (inptr);					      \
297 									      \
298     if ((*statep >> 3) != 0)						      \
299       {									      \
300 	/* Attempt to combine the last character with this one.  */	      \
301 	uint16_t lasttwo = *statep >> 3;				      \
302 	unsigned int idx;						      \
303 	unsigned int len;						      \
304 									      \
305 	if (ch == 0x02e5)						      \
306 	  idx = COMP_TABLE_IDX_02E5, len = COMP_TABLE_LEN_02E5;		      \
307 	else if (ch == 0x02e9)						      \
308 	  idx = COMP_TABLE_IDX_02E9, len = COMP_TABLE_LEN_02E9;		      \
309 	else if (ch == 0x0300)						      \
310 	  idx = COMP_TABLE_IDX_0300, len = COMP_TABLE_LEN_0300;		      \
311 	else if (ch == 0x0301)						      \
312 	  idx = COMP_TABLE_IDX_0301, len = COMP_TABLE_LEN_0301;		      \
313 	else if (ch == 0x309a)						      \
314 	  idx = COMP_TABLE_IDX_309A, len = COMP_TABLE_LEN_309A;		      \
315 	else								      \
316 	  goto not_combining;						      \
317 									      \
318 	do								      \
319 	  if (comp_table_data[idx].base == lasttwo)			      \
320 	    break;							      \
321 	while (++idx, --len > 0);					      \
322 									      \
323 	if (len > 0)							      \
324 	  {								      \
325 	    /* Output the combined character.  */			      \
326 	    if (__glibc_unlikely (outptr + 1 >= outend))		      \
327 	      {								      \
328 		result = __GCONV_FULL_OUTPUT;				      \
329 		break;							      \
330 	      }								      \
331 	    lasttwo = comp_table_data[idx].composed;			      \
332 	    *outptr++ = (lasttwo >> 8) & 0xff;				      \
333 	    *outptr++ = lasttwo & 0xff;					      \
334 	    *statep = 0;						      \
335 	    inptr += 4;							      \
336 	    continue;							      \
337 	  }								      \
338 									      \
339       not_combining:							      \
340 	/* Output the buffered character.  */				      \
341 	if (__glibc_unlikely (outptr + 1 >= outend))			      \
342 	  {								      \
343 	    result = __GCONV_FULL_OUTPUT;				      \
344 	    break;							      \
345 	  }								      \
346 	*outptr++ = (lasttwo >> 8) & 0xff;				      \
347 	*outptr++ = lasttwo & 0xff;					      \
348 	*statep = 0;							      \
349 	continue;							      \
350       }									      \
351 									      \
352     if (ch < 0x80)							      \
353       /* Plain ASCII character.  */					      \
354       *outptr++ = ch;							      \
355     else if (ch >= 0xff61 && ch <= 0xff9f)				      \
356       {									      \
357 	/* Half-width katakana.  */					      \
358 	if (__glibc_unlikely (outptr + 1 >= outend))			      \
359 	  {								      \
360 	    result = __GCONV_FULL_OUTPUT;				      \
361 	    break;							      \
362 	  }								      \
363 	*outptr++ = 0x8e;						      \
364 	*outptr++ = ch - 0xfec0;					      \
365       }									      \
366     else								      \
367       {									      \
368 	uint32_t jch = ucs4_to_jisx0213 (ch);				      \
369 	if (jch == 0)							      \
370 	  {								      \
371 	    UNICODE_TAG_HANDLER (ch, 4);				      \
372 									      \
373 	    /* Illegal character.  */					      \
374 	    STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
375 	  }								      \
376 									      \
377 	if (jch & 0x0080)						      \
378 	  {								      \
379 	    /* A possible match in comp_table_data.  We have to buffer it.  */\
380 									      \
381 	    /* We know it's a JISX 0213 plane 1 character.  */		      \
382 	    assert ((jch & 0x8000) == 0);				      \
383 									      \
384 	    *statep = (jch | 0x8080) << 3;				      \
385 	    inptr += 4;							      \
386 	    continue;							      \
387 	  }								      \
388 									      \
389 	if (jch & 0x8000)						      \
390 	  {								      \
391 	    /* JISX 0213 plane 2.  */					      \
392 	    if (__glibc_unlikely (outptr + 2 >= outend))		      \
393 	      {								      \
394 		result = __GCONV_FULL_OUTPUT;				      \
395 		break;							      \
396 	      }								      \
397 	    *outptr++ = 0x8f;						      \
398 	  }								      \
399 	else								      \
400 	  {								      \
401 	    /* JISX 0213 plane 1.  */					      \
402 	    if (__glibc_unlikely (outptr + 1 >= outend))		      \
403 	      {								      \
404 		result = __GCONV_FULL_OUTPUT;				      \
405 		break;							      \
406 	      }								      \
407 	  }								      \
408 	*outptr++ = (jch >> 8) | 0x80;					      \
409 	*outptr++ = (jch & 0xff) | 0x80;				      \
410       }									      \
411 									      \
412     inptr += 4;								      \
413   }
414 #define LOOP_NEED_FLAGS
415 #define EXTRA_LOOP_DECLS	, int *statep
416 #include <iconv/loop.c>
417 
418 
419 /* Now define the toplevel functions.  */
420 #include <iconv/skeleton.c>
421