1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 static void re_string_construct_common (const char *str, Idx len,
21 					re_string_t *pstr,
22 					RE_TRANSLATE_TYPE trans, bool icase,
23 					const re_dfa_t *dfa);
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 					  const re_node_set *nodes,
26 					  re_hashval_t hash);
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 					  const re_node_set *nodes,
29 					  unsigned int context,
30 					  re_hashval_t hash);
31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 						Idx new_buf_len);
33 #ifdef RE_ENABLE_I18N
34 static void build_wcs_buffer (re_string_t *pstr);
35 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36 #endif /* RE_ENABLE_I18N */
37 static void build_upper_buffer (re_string_t *pstr);
38 static void re_string_translate_buffer (re_string_t *pstr);
39 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 					  int eflags) __attribute__ ((pure));
41 
42 /* Functions for string operation.  */
43 
44 /* This function allocate the buffers.  It is necessary to call
45    re_string_reconstruct before using the object.  */
46 
47 static reg_errcode_t
48 __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)49 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
51 {
52   reg_errcode_t ret;
53   Idx init_buf_len;
54 
55   /* Ensure at least one character fits into the buffers.  */
56   if (init_len < dfa->mb_cur_max)
57     init_len = dfa->mb_cur_max;
58   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
59   re_string_construct_common (str, len, pstr, trans, icase, dfa);
60 
61   ret = re_string_realloc_buffers (pstr, init_buf_len);
62   if (__glibc_unlikely (ret != REG_NOERROR))
63     return ret;
64 
65   pstr->word_char = dfa->word_char;
66   pstr->word_ops_used = dfa->word_ops_used;
67   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69   pstr->valid_raw_len = pstr->valid_len;
70   return REG_NOERROR;
71 }
72 
73 /* This function allocate the buffers, and initialize them.  */
74 
75 static reg_errcode_t
76 __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)77 re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
79 {
80   reg_errcode_t ret;
81   memset (pstr, '\0', sizeof (re_string_t));
82   re_string_construct_common (str, len, pstr, trans, icase, dfa);
83 
84   if (len > 0)
85     {
86       ret = re_string_realloc_buffers (pstr, len + 1);
87       if (__glibc_unlikely (ret != REG_NOERROR))
88 	return ret;
89     }
90   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91 
92   if (icase)
93     {
94 #ifdef RE_ENABLE_I18N
95       if (dfa->mb_cur_max > 1)
96 	{
97 	  while (1)
98 	    {
99 	      ret = build_wcs_upper_buffer (pstr);
100 	      if (__glibc_unlikely (ret != REG_NOERROR))
101 		return ret;
102 	      if (pstr->valid_raw_len >= len)
103 		break;
104 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 		break;
106 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 	      if (__glibc_unlikely (ret != REG_NOERROR))
108 		return ret;
109 	    }
110 	}
111       else
112 #endif /* RE_ENABLE_I18N  */
113 	build_upper_buffer (pstr);
114     }
115   else
116     {
117 #ifdef RE_ENABLE_I18N
118       if (dfa->mb_cur_max > 1)
119 	build_wcs_buffer (pstr);
120       else
121 #endif /* RE_ENABLE_I18N  */
122 	{
123 	  if (trans != NULL)
124 	    re_string_translate_buffer (pstr);
125 	  else
126 	    {
127 	      pstr->valid_len = pstr->bufs_len;
128 	      pstr->valid_raw_len = pstr->bufs_len;
129 	    }
130 	}
131     }
132 
133   return REG_NOERROR;
134 }
135 
136 /* Helper functions for re_string_allocate, and re_string_construct.  */
137 
138 static reg_errcode_t
139 __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)140 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
141 {
142 #ifdef RE_ENABLE_I18N
143   if (pstr->mb_cur_max > 1)
144     {
145       wint_t *new_wcs;
146 
147       /* Avoid overflow in realloc.  */
148       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150 			    < new_buf_len))
151 	return REG_ESPACE;
152 
153       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
154       if (__glibc_unlikely (new_wcs == NULL))
155 	return REG_ESPACE;
156       pstr->wcs = new_wcs;
157       if (pstr->offsets != NULL)
158 	{
159 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
160 	  if (__glibc_unlikely (new_offsets == NULL))
161 	    return REG_ESPACE;
162 	  pstr->offsets = new_offsets;
163 	}
164     }
165 #endif /* RE_ENABLE_I18N  */
166   if (pstr->mbs_allocated)
167     {
168       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
169 					   new_buf_len);
170       if (__glibc_unlikely (new_mbs == NULL))
171 	return REG_ESPACE;
172       pstr->mbs = new_mbs;
173     }
174   pstr->bufs_len = new_buf_len;
175   return REG_NOERROR;
176 }
177 
178 
179 static void
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)180 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
181 			    RE_TRANSLATE_TYPE trans, bool icase,
182 			    const re_dfa_t *dfa)
183 {
184   pstr->raw_mbs = (const unsigned char *) str;
185   pstr->len = len;
186   pstr->raw_len = len;
187   pstr->trans = trans;
188   pstr->icase = icase;
189   pstr->mbs_allocated = (trans != NULL || icase);
190   pstr->mb_cur_max = dfa->mb_cur_max;
191   pstr->is_utf8 = dfa->is_utf8;
192   pstr->map_notascii = dfa->map_notascii;
193   pstr->stop = pstr->len;
194   pstr->raw_stop = pstr->stop;
195 }
196 
197 #ifdef RE_ENABLE_I18N
198 
199 /* Build wide character buffer PSTR->WCS.
200    If the byte sequence of the string are:
201      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202    Then wide character buffer will be:
203      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
204    We use WEOF for padding, they indicate that the position isn't
205    a first byte of a multibyte character.
206 
207    Note that this function assumes PSTR->VALID_LEN elements are already
208    built and starts from PSTR->VALID_LEN.  */
209 
210 static void
build_wcs_buffer(re_string_t * pstr)211 build_wcs_buffer (re_string_t *pstr)
212 {
213 #ifdef _LIBC
214   unsigned char buf[MB_LEN_MAX];
215   DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
216 #else
217   unsigned char buf[64];
218 #endif
219   mbstate_t prev_st;
220   Idx byte_idx, end_idx, remain_len;
221   size_t mbclen;
222 
223   /* Build the buffers from pstr->valid_len to either pstr->len or
224      pstr->bufs_len.  */
225   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
226   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227     {
228       wchar_t wc;
229       const char *p;
230 
231       remain_len = end_idx - byte_idx;
232       prev_st = pstr->cur_state;
233       /* Apply the translation if we need.  */
234       if (__glibc_unlikely (pstr->trans != NULL))
235 	{
236 	  int i, ch;
237 
238 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
239 	    {
240 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
241 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
242 	    }
243 	  p = (const char *) buf;
244 	}
245       else
246 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
247       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
249 			    || (mbclen == (size_t) -2
250 				&& pstr->bufs_len >= pstr->len)))
251 	{
252 	  /* We treat these cases as a singlebyte character.  */
253 	  mbclen = 1;
254 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255 	  if (__glibc_unlikely (pstr->trans != NULL))
256 	    wc = pstr->trans[wc];
257 	  pstr->cur_state = prev_st;
258 	}
259       else if (__glibc_unlikely (mbclen == (size_t) -2))
260 	{
261 	  /* The buffer doesn't have enough space, finish to build.  */
262 	  pstr->cur_state = prev_st;
263 	  break;
264 	}
265 
266       /* Write wide character and padding.  */
267       pstr->wcs[byte_idx++] = wc;
268       /* Write paddings.  */
269       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
270 	pstr->wcs[byte_idx++] = WEOF;
271     }
272   pstr->valid_len = byte_idx;
273   pstr->valid_raw_len = byte_idx;
274 }
275 
276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277    but for REG_ICASE.  */
278 
279 static reg_errcode_t
280 __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)281 build_wcs_upper_buffer (re_string_t *pstr)
282 {
283   mbstate_t prev_st;
284   Idx src_idx, byte_idx, end_idx, remain_len;
285   size_t mbclen;
286 #ifdef _LIBC
287   char buf[MB_LEN_MAX];
288   DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
289 #else
290   char buf[64];
291 #endif
292 
293   byte_idx = pstr->valid_len;
294   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
295 
296   /* The following optimization assumes that ASCII characters can be
297      mapped to wide characters with a simple cast.  */
298   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299     {
300       while (byte_idx < end_idx)
301 	{
302 	  wchar_t wc;
303 	  unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
304 
305 	  if (isascii (ch) && mbsinit (&pstr->cur_state))
306 	    {
307 	      /* The next step uses the assumption that wchar_t is encoded
308 		 ASCII-safe: all ASCII values can be converted like this.  */
309 	      wchar_t wcu = __towupper (ch);
310 	      if (isascii (wcu))
311 		{
312 		  pstr->mbs[byte_idx] = wcu;
313 		  pstr->wcs[byte_idx] = wcu;
314 		  byte_idx++;
315 		  continue;
316 		}
317 	    }
318 
319 	  remain_len = end_idx - byte_idx;
320 	  prev_st = pstr->cur_state;
321 	  mbclen = __mbrtowc (&wc,
322 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323 			       + byte_idx), remain_len, &pstr->cur_state);
324 	  if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
325 	    {
326 	      wchar_t wcu = __towupper (wc);
327 	      if (wcu != wc)
328 		{
329 		  size_t mbcdlen;
330 
331 		  mbcdlen = __wcrtomb (buf, wcu, &prev_st);
332 		  if (__glibc_likely (mbclen == mbcdlen))
333 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
334 		  else
335 		    {
336 		      src_idx = byte_idx;
337 		      goto offsets_needed;
338 		    }
339 		}
340 	      else
341 		memcpy (pstr->mbs + byte_idx,
342 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
343 	      pstr->wcs[byte_idx++] = wcu;
344 	      /* Write paddings.  */
345 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
346 		pstr->wcs[byte_idx++] = WEOF;
347 	    }
348 	  else if (mbclen == (size_t) -1 || mbclen == 0
349 		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
350 	    {
351 	      /* It is an invalid character, an incomplete character
352 		 at the end of the string, or '\0'.  Just use the byte.  */
353 	      pstr->mbs[byte_idx] = ch;
354 	      /* And also cast it to wide char.  */
355 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
356 	      if (__glibc_unlikely (mbclen == (size_t) -1))
357 		pstr->cur_state = prev_st;
358 	    }
359 	  else
360 	    {
361 	      /* The buffer doesn't have enough space, finish to build.  */
362 	      pstr->cur_state = prev_st;
363 	      break;
364 	    }
365 	}
366       pstr->valid_len = byte_idx;
367       pstr->valid_raw_len = byte_idx;
368       return REG_NOERROR;
369     }
370   else
371     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
372       {
373 	wchar_t wc;
374 	const char *p;
375       offsets_needed:
376 	remain_len = end_idx - byte_idx;
377 	prev_st = pstr->cur_state;
378 	if (__glibc_unlikely (pstr->trans != NULL))
379 	  {
380 	    int i, ch;
381 
382 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
383 	      {
384 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
385 		buf[i] = pstr->trans[ch];
386 	      }
387 	    p = (const char *) buf;
388 	  }
389 	else
390 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
391 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
392 	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
393 	  {
394 	    wchar_t wcu = __towupper (wc);
395 	    if (wcu != wc)
396 	      {
397 		size_t mbcdlen;
398 
399 		mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
400 		if (__glibc_likely (mbclen == mbcdlen))
401 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
402 		else if (mbcdlen != (size_t) -1)
403 		  {
404 		    size_t i;
405 
406 		    if (byte_idx + mbcdlen > pstr->bufs_len)
407 		      {
408 			pstr->cur_state = prev_st;
409 			break;
410 		      }
411 
412 		    if (pstr->offsets == NULL)
413 		      {
414 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
415 
416 			if (pstr->offsets == NULL)
417 			  return REG_ESPACE;
418 		      }
419 		    if (!pstr->offsets_needed)
420 		      {
421 			for (i = 0; i < (size_t) byte_idx; ++i)
422 			  pstr->offsets[i] = i;
423 			pstr->offsets_needed = 1;
424 		      }
425 
426 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
427 		    pstr->wcs[byte_idx] = wcu;
428 		    pstr->offsets[byte_idx] = src_idx;
429 		    for (i = 1; i < mbcdlen; ++i)
430 		      {
431 			pstr->offsets[byte_idx + i]
432 			  = src_idx + (i < mbclen ? i : mbclen - 1);
433 			pstr->wcs[byte_idx + i] = WEOF;
434 		      }
435 		    pstr->len += mbcdlen - mbclen;
436 		    if (pstr->raw_stop > src_idx)
437 		      pstr->stop += mbcdlen - mbclen;
438 		    end_idx = (pstr->bufs_len > pstr->len)
439 			      ? pstr->len : pstr->bufs_len;
440 		    byte_idx += mbcdlen;
441 		    src_idx += mbclen;
442 		    continue;
443 		  }
444 		else
445 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
446 	      }
447 	    else
448 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
449 
450 	    if (__glibc_unlikely (pstr->offsets_needed != 0))
451 	      {
452 		size_t i;
453 		for (i = 0; i < mbclen; ++i)
454 		  pstr->offsets[byte_idx + i] = src_idx + i;
455 	      }
456 	    src_idx += mbclen;
457 
458 	    pstr->wcs[byte_idx++] = wcu;
459 	    /* Write paddings.  */
460 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
461 	      pstr->wcs[byte_idx++] = WEOF;
462 	  }
463 	else if (mbclen == (size_t) -1 || mbclen == 0
464 		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
465 	  {
466 	    /* It is an invalid character or '\0'.  Just use the byte.  */
467 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
468 
469 	    if (__glibc_unlikely (pstr->trans != NULL))
470 	      ch = pstr->trans [ch];
471 	    pstr->mbs[byte_idx] = ch;
472 
473 	    if (__glibc_unlikely (pstr->offsets_needed != 0))
474 	      pstr->offsets[byte_idx] = src_idx;
475 	    ++src_idx;
476 
477 	    /* And also cast it to wide char.  */
478 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
479 	    if (__glibc_unlikely (mbclen == (size_t) -1))
480 	      pstr->cur_state = prev_st;
481 	  }
482 	else
483 	  {
484 	    /* The buffer doesn't have enough space, finish to build.  */
485 	    pstr->cur_state = prev_st;
486 	    break;
487 	  }
488       }
489   pstr->valid_len = byte_idx;
490   pstr->valid_raw_len = src_idx;
491   return REG_NOERROR;
492 }
493 
494 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
495    Return the index.  */
496 
497 static Idx
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)498 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
499 {
500   mbstate_t prev_st;
501   Idx rawbuf_idx;
502   size_t mbclen;
503   wint_t wc = WEOF;
504 
505   /* Skip the characters which are not necessary to check.  */
506   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
507        rawbuf_idx < new_raw_idx;)
508     {
509       wchar_t wc2;
510       Idx remain_len = pstr->raw_len - rawbuf_idx;
511       prev_st = pstr->cur_state;
512       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
513 			  remain_len, &pstr->cur_state);
514       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
515 			    || mbclen == 0))
516 	{
517 	  /* We treat these cases as a single byte character.  */
518 	  if (mbclen == 0 || remain_len == 0)
519 	    wc = L'\0';
520 	  else
521 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
522 	  mbclen = 1;
523 	  pstr->cur_state = prev_st;
524 	}
525       else
526 	wc = wc2;
527       /* Then proceed the next character.  */
528       rawbuf_idx += mbclen;
529     }
530   *last_wc = wc;
531   return rawbuf_idx;
532 }
533 #endif /* RE_ENABLE_I18N  */
534 
535 /* Build the buffer PSTR->MBS, and apply the translation if we need.
536    This function is used in case of REG_ICASE.  */
537 
538 static void
build_upper_buffer(re_string_t * pstr)539 build_upper_buffer (re_string_t *pstr)
540 {
541   Idx char_idx, end_idx;
542   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
543 
544   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
545     {
546       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
547       if (__glibc_unlikely (pstr->trans != NULL))
548 	ch = pstr->trans[ch];
549       pstr->mbs[char_idx] = toupper (ch);
550     }
551   pstr->valid_len = char_idx;
552   pstr->valid_raw_len = char_idx;
553 }
554 
555 /* Apply TRANS to the buffer in PSTR.  */
556 
557 static void
re_string_translate_buffer(re_string_t * pstr)558 re_string_translate_buffer (re_string_t *pstr)
559 {
560   Idx buf_idx, end_idx;
561   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
562 
563   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
564     {
565       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
566       pstr->mbs[buf_idx] = pstr->trans[ch];
567     }
568 
569   pstr->valid_len = buf_idx;
570   pstr->valid_raw_len = buf_idx;
571 }
572 
573 /* This function re-construct the buffers.
574    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
575    convert to upper case in case of REG_ICASE, apply translation.  */
576 
577 static reg_errcode_t
578 __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)579 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
580 {
581   Idx offset;
582 
583   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
584     offset = idx - pstr->raw_mbs_idx;
585   else
586     {
587       /* Reset buffer.  */
588 #ifdef RE_ENABLE_I18N
589       if (pstr->mb_cur_max > 1)
590 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
591 #endif /* RE_ENABLE_I18N */
592       pstr->len = pstr->raw_len;
593       pstr->stop = pstr->raw_stop;
594       pstr->valid_len = 0;
595       pstr->raw_mbs_idx = 0;
596       pstr->valid_raw_len = 0;
597       pstr->offsets_needed = 0;
598       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
599 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
600       if (!pstr->mbs_allocated)
601 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
602       offset = idx;
603     }
604 
605   if (__glibc_likely (offset != 0))
606     {
607       /* Should the already checked characters be kept?  */
608       if (__glibc_likely (offset < pstr->valid_raw_len))
609 	{
610 	  /* Yes, move them to the front of the buffer.  */
611 #ifdef RE_ENABLE_I18N
612 	  if (__glibc_unlikely (pstr->offsets_needed))
613 	    {
614 	      Idx low = 0, high = pstr->valid_len, mid;
615 	      do
616 		{
617 		  mid = (high + low) / 2;
618 		  if (pstr->offsets[mid] > offset)
619 		    high = mid;
620 		  else if (pstr->offsets[mid] < offset)
621 		    low = mid + 1;
622 		  else
623 		    break;
624 		}
625 	      while (low < high);
626 	      if (pstr->offsets[mid] < offset)
627 		++mid;
628 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
629 							eflags);
630 	      /* This can be quite complicated, so handle specially
631 		 only the common and easy case where the character with
632 		 different length representation of lower and upper
633 		 case is present at or after offset.  */
634 	      if (pstr->valid_len > offset
635 		  && mid == offset && pstr->offsets[mid] == offset)
636 		{
637 		  memmove (pstr->wcs, pstr->wcs + offset,
638 			   (pstr->valid_len - offset) * sizeof (wint_t));
639 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
640 		  pstr->valid_len -= offset;
641 		  pstr->valid_raw_len -= offset;
642 		  for (low = 0; low < pstr->valid_len; low++)
643 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
644 		}
645 	      else
646 		{
647 		  /* Otherwise, just find out how long the partial multibyte
648 		     character at offset is and fill it with WEOF/255.  */
649 		  pstr->len = pstr->raw_len - idx + offset;
650 		  pstr->stop = pstr->raw_stop - idx + offset;
651 		  pstr->offsets_needed = 0;
652 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
653 		    --mid;
654 		  while (mid < pstr->valid_len)
655 		    if (pstr->wcs[mid] != WEOF)
656 		      break;
657 		    else
658 		      ++mid;
659 		  if (mid == pstr->valid_len)
660 		    pstr->valid_len = 0;
661 		  else
662 		    {
663 		      pstr->valid_len = pstr->offsets[mid] - offset;
664 		      if (pstr->valid_len)
665 			{
666 			  for (low = 0; low < pstr->valid_len; ++low)
667 			    pstr->wcs[low] = WEOF;
668 			  memset (pstr->mbs, 255, pstr->valid_len);
669 			}
670 		    }
671 		  pstr->valid_raw_len = pstr->valid_len;
672 		}
673 	    }
674 	  else
675 #endif
676 	    {
677 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
678 							eflags);
679 #ifdef RE_ENABLE_I18N
680 	      if (pstr->mb_cur_max > 1)
681 		memmove (pstr->wcs, pstr->wcs + offset,
682 			 (pstr->valid_len - offset) * sizeof (wint_t));
683 #endif /* RE_ENABLE_I18N */
684 	      if (__glibc_unlikely (pstr->mbs_allocated))
685 		memmove (pstr->mbs, pstr->mbs + offset,
686 			 pstr->valid_len - offset);
687 	      pstr->valid_len -= offset;
688 	      pstr->valid_raw_len -= offset;
689 	      DEBUG_ASSERT (pstr->valid_len > 0);
690 	    }
691 	}
692       else
693 	{
694 #ifdef RE_ENABLE_I18N
695 	  /* No, skip all characters until IDX.  */
696 	  Idx prev_valid_len = pstr->valid_len;
697 
698 	  if (__glibc_unlikely (pstr->offsets_needed))
699 	    {
700 	      pstr->len = pstr->raw_len - idx + offset;
701 	      pstr->stop = pstr->raw_stop - idx + offset;
702 	      pstr->offsets_needed = 0;
703 	    }
704 #endif
705 	  pstr->valid_len = 0;
706 #ifdef RE_ENABLE_I18N
707 	  if (pstr->mb_cur_max > 1)
708 	    {
709 	      Idx wcs_idx;
710 	      wint_t wc = WEOF;
711 
712 	      if (pstr->is_utf8)
713 		{
714 		  const unsigned char *raw, *p, *end;
715 
716 		  /* Special case UTF-8.  Multi-byte chars start with any
717 		     byte other than 0x80 - 0xbf.  */
718 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719 		  end = raw + (offset - pstr->mb_cur_max);
720 		  if (end < pstr->raw_mbs)
721 		    end = pstr->raw_mbs;
722 		  p = raw + offset - 1;
723 #ifdef _LIBC
724 		  /* We know the wchar_t encoding is UCS4, so for the simple
725 		     case, ASCII characters, skip the conversion step.  */
726 		  if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
727 		    {
728 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729 		      /* pstr->valid_len = 0; */
730 		      wc = (wchar_t) *p;
731 		    }
732 		  else
733 #endif
734 		    for (; p >= end; --p)
735 		      if ((*p & 0xc0) != 0x80)
736 			{
737 			  mbstate_t cur_state;
738 			  wchar_t wc2;
739 			  Idx mlen = raw + pstr->len - p;
740 			  unsigned char buf[6];
741 			  size_t mbclen;
742 
743 			  const unsigned char *pp = p;
744 			  if (__glibc_unlikely (pstr->trans != NULL))
745 			    {
746 			      int i = mlen < 6 ? mlen : 6;
747 			      while (--i >= 0)
748 				buf[i] = pstr->trans[p[i]];
749 			      pp = buf;
750 			    }
751 			  /* XXX Don't use mbrtowc, we know which conversion
752 			     to use (UTF-8 -> UCS4).  */
753 			  memset (&cur_state, 0, sizeof (cur_state));
754 			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
755 					      &cur_state);
756 			  if (raw + offset - p <= mbclen
757 			      && mbclen < (size_t) -2)
758 			    {
759 			      memset (&pstr->cur_state, '\0',
760 				      sizeof (mbstate_t));
761 			      pstr->valid_len = mbclen - (raw + offset - p);
762 			      wc = wc2;
763 			    }
764 			  break;
765 			}
766 		}
767 
768 	      if (wc == WEOF)
769 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
770 	      if (wc == WEOF)
771 		pstr->tip_context
772 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
773 	      else
774 		pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
775 				      && IS_WIDE_WORD_CHAR (wc))
776 				     ? CONTEXT_WORD
777 				     : ((IS_WIDE_NEWLINE (wc)
778 					 && pstr->newline_anchor)
779 					? CONTEXT_NEWLINE : 0));
780 	      if (__glibc_unlikely (pstr->valid_len))
781 		{
782 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
783 		    pstr->wcs[wcs_idx] = WEOF;
784 		  if (pstr->mbs_allocated)
785 		    memset (pstr->mbs, 255, pstr->valid_len);
786 		}
787 	      pstr->valid_raw_len = pstr->valid_len;
788 	    }
789 	  else
790 #endif /* RE_ENABLE_I18N */
791 	    {
792 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
793 	      pstr->valid_raw_len = 0;
794 	      if (pstr->trans)
795 		c = pstr->trans[c];
796 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
797 				   ? CONTEXT_WORD
798 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
799 				      ? CONTEXT_NEWLINE : 0));
800 	    }
801 	}
802       if (!__glibc_unlikely (pstr->mbs_allocated))
803 	pstr->mbs += offset;
804     }
805   pstr->raw_mbs_idx = idx;
806   pstr->len -= offset;
807   pstr->stop -= offset;
808 
809   /* Then build the buffers.  */
810 #ifdef RE_ENABLE_I18N
811   if (pstr->mb_cur_max > 1)
812     {
813       if (pstr->icase)
814 	{
815 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
816 	  if (__glibc_unlikely (ret != REG_NOERROR))
817 	    return ret;
818 	}
819       else
820 	build_wcs_buffer (pstr);
821     }
822   else
823 #endif /* RE_ENABLE_I18N */
824     if (__glibc_unlikely (pstr->mbs_allocated))
825       {
826 	if (pstr->icase)
827 	  build_upper_buffer (pstr);
828 	else if (pstr->trans != NULL)
829 	  re_string_translate_buffer (pstr);
830       }
831     else
832       pstr->valid_len = pstr->len;
833 
834   pstr->cur_idx = 0;
835   return REG_NOERROR;
836 }
837 
838 static unsigned char
839 __attribute__ ((pure))
re_string_peek_byte_case(const re_string_t * pstr,Idx idx)840 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
841 {
842   int ch;
843   Idx off;
844 
845   /* Handle the common (easiest) cases first.  */
846   if (__glibc_likely (!pstr->mbs_allocated))
847     return re_string_peek_byte (pstr, idx);
848 
849 #ifdef RE_ENABLE_I18N
850   if (pstr->mb_cur_max > 1
851       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852     return re_string_peek_byte (pstr, idx);
853 #endif
854 
855   off = pstr->cur_idx + idx;
856 #ifdef RE_ENABLE_I18N
857   if (pstr->offsets_needed)
858     off = pstr->offsets[off];
859 #endif
860 
861   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
862 
863 #ifdef RE_ENABLE_I18N
864   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865      this function returns CAPITAL LETTER I instead of first byte of
866      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
867      since peek_byte_case doesn't advance cur_idx in any way.  */
868   if (pstr->offsets_needed && !isascii (ch))
869     return re_string_peek_byte (pstr, idx);
870 #endif
871 
872   return ch;
873 }
874 
875 static unsigned char
re_string_fetch_byte_case(re_string_t * pstr)876 re_string_fetch_byte_case (re_string_t *pstr)
877 {
878   if (__glibc_likely (!pstr->mbs_allocated))
879     return re_string_fetch_byte (pstr);
880 
881 #ifdef RE_ENABLE_I18N
882   if (pstr->offsets_needed)
883     {
884       Idx off;
885       int ch;
886 
887       /* For tr_TR.UTF-8 [[:islower:]] there is
888 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
889 	 in that case the whole multi-byte character and return
890 	 the original letter.  On the other side, with
891 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
892 	 anything else would complicate things too much.  */
893 
894       if (!re_string_first_byte (pstr, pstr->cur_idx))
895 	return re_string_fetch_byte (pstr);
896 
897       off = pstr->offsets[pstr->cur_idx];
898       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
899 
900       if (! isascii (ch))
901 	return re_string_fetch_byte (pstr);
902 
903       re_string_skip_bytes (pstr,
904 			    re_string_char_size_at (pstr, pstr->cur_idx));
905       return ch;
906     }
907 #endif
908 
909   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
910 }
911 
912 static void
re_string_destruct(re_string_t * pstr)913 re_string_destruct (re_string_t *pstr)
914 {
915 #ifdef RE_ENABLE_I18N
916   re_free (pstr->wcs);
917   re_free (pstr->offsets);
918 #endif /* RE_ENABLE_I18N  */
919   if (pstr->mbs_allocated)
920     re_free (pstr->mbs);
921 }
922 
923 /* Return the context at IDX in INPUT.  */
924 
925 static unsigned int
re_string_context_at(const re_string_t * input,Idx idx,int eflags)926 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
927 {
928   int c;
929   if (__glibc_unlikely (idx < 0))
930     /* In this case, we use the value stored in input->tip_context,
931        since we can't know the character in input->mbs[-1] here.  */
932     return input->tip_context;
933   if (__glibc_unlikely (idx == input->len))
934     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
935 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
936 #ifdef RE_ENABLE_I18N
937   if (input->mb_cur_max > 1)
938     {
939       wint_t wc;
940       Idx wc_idx = idx;
941       while(input->wcs[wc_idx] == WEOF)
942 	{
943 	  DEBUG_ASSERT (wc_idx >= 0);
944 	  --wc_idx;
945 	  if (wc_idx < 0)
946 	    return input->tip_context;
947 	}
948       wc = input->wcs[wc_idx];
949       if (__glibc_unlikely (input->word_ops_used != 0)
950 	  && IS_WIDE_WORD_CHAR (wc))
951 	return CONTEXT_WORD;
952       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953 	      ? CONTEXT_NEWLINE : 0);
954     }
955   else
956 #endif
957     {
958       c = re_string_byte_at (input, idx);
959       if (bitset_contain (input->word_char, c))
960 	return CONTEXT_WORD;
961       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
962     }
963 }
964 
965 /* Functions for set operation.  */
966 
967 static reg_errcode_t
968 __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,Idx size)969 re_node_set_alloc (re_node_set *set, Idx size)
970 {
971   set->alloc = size;
972   set->nelem = 0;
973   set->elems = re_malloc (Idx, size);
974   if (__glibc_unlikely (set->elems == NULL)
975       && (MALLOC_0_IS_NONNULL || size != 0))
976     return REG_ESPACE;
977   return REG_NOERROR;
978 }
979 
980 static reg_errcode_t
981 __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,Idx elem)982 re_node_set_init_1 (re_node_set *set, Idx elem)
983 {
984   set->alloc = 1;
985   set->nelem = 1;
986   set->elems = re_malloc (Idx, 1);
987   if (__glibc_unlikely (set->elems == NULL))
988     {
989       set->alloc = set->nelem = 0;
990       return REG_ESPACE;
991     }
992   set->elems[0] = elem;
993   return REG_NOERROR;
994 }
995 
996 static reg_errcode_t
997 __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)998 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
999 {
1000   set->alloc = 2;
1001   set->elems = re_malloc (Idx, 2);
1002   if (__glibc_unlikely (set->elems == NULL))
1003     return REG_ESPACE;
1004   if (elem1 == elem2)
1005     {
1006       set->nelem = 1;
1007       set->elems[0] = elem1;
1008     }
1009   else
1010     {
1011       set->nelem = 2;
1012       if (elem1 < elem2)
1013 	{
1014 	  set->elems[0] = elem1;
1015 	  set->elems[1] = elem2;
1016 	}
1017       else
1018 	{
1019 	  set->elems[0] = elem2;
1020 	  set->elems[1] = elem1;
1021 	}
1022     }
1023   return REG_NOERROR;
1024 }
1025 
1026 static reg_errcode_t
1027 __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1028 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1029 {
1030   dest->nelem = src->nelem;
1031   if (src->nelem > 0)
1032     {
1033       dest->alloc = dest->nelem;
1034       dest->elems = re_malloc (Idx, dest->alloc);
1035       if (__glibc_unlikely (dest->elems == NULL))
1036 	{
1037 	  dest->alloc = dest->nelem = 0;
1038 	  return REG_ESPACE;
1039 	}
1040       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1041     }
1042   else
1043     re_node_set_init_empty (dest);
1044   return REG_NOERROR;
1045 }
1046 
1047 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1048    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1050 
1051 static reg_errcode_t
1052 __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1053 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1054 			   const re_node_set *src2)
1055 {
1056   Idx i1, i2, is, id, delta, sbase;
1057   if (src1->nelem == 0 || src2->nelem == 0)
1058     return REG_NOERROR;
1059 
1060   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1061      conservative estimate.  */
1062   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1063     {
1064       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1065       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1066       if (__glibc_unlikely (new_elems == NULL))
1067 	return REG_ESPACE;
1068       dest->elems = new_elems;
1069       dest->alloc = new_alloc;
1070     }
1071 
1072   /* Find the items in the intersection of SRC1 and SRC2, and copy
1073      into the top of DEST those that are not already in DEST itself.  */
1074   sbase = dest->nelem + src1->nelem + src2->nelem;
1075   i1 = src1->nelem - 1;
1076   i2 = src2->nelem - 1;
1077   id = dest->nelem - 1;
1078   for (;;)
1079     {
1080       if (src1->elems[i1] == src2->elems[i2])
1081 	{
1082 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1083 	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1084 	    --id;
1085 
1086 	  if (id < 0 || dest->elems[id] != src1->elems[i1])
1087             dest->elems[--sbase] = src1->elems[i1];
1088 
1089 	  if (--i1 < 0 || --i2 < 0)
1090 	    break;
1091 	}
1092 
1093       /* Lower the highest of the two items.  */
1094       else if (src1->elems[i1] < src2->elems[i2])
1095 	{
1096 	  if (--i2 < 0)
1097 	    break;
1098 	}
1099       else
1100 	{
1101 	  if (--i1 < 0)
1102 	    break;
1103 	}
1104     }
1105 
1106   id = dest->nelem - 1;
1107   is = dest->nelem + src1->nelem + src2->nelem - 1;
1108   delta = is - sbase + 1;
1109 
1110   /* Now copy.  When DELTA becomes zero, the remaining
1111      DEST elements are already in place; this is more or
1112      less the same loop that is in re_node_set_merge.  */
1113   dest->nelem += delta;
1114   if (delta > 0 && id >= 0)
1115     for (;;)
1116       {
1117 	if (dest->elems[is] > dest->elems[id])
1118 	  {
1119 	    /* Copy from the top.  */
1120 	    dest->elems[id + delta--] = dest->elems[is--];
1121 	    if (delta == 0)
1122 	      break;
1123 	  }
1124 	else
1125 	  {
1126 	    /* Slide from the bottom.  */
1127 	    dest->elems[id + delta] = dest->elems[id];
1128 	    if (--id < 0)
1129 	      break;
1130 	  }
1131       }
1132 
1133   /* Copy remaining SRC elements.  */
1134   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1135 
1136   return REG_NOERROR;
1137 }
1138 
1139 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1140    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1141 
1142 static reg_errcode_t
1143 __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1144 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1145 			const re_node_set *src2)
1146 {
1147   Idx i1, i2, id;
1148   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1149     {
1150       dest->alloc = src1->nelem + src2->nelem;
1151       dest->elems = re_malloc (Idx, dest->alloc);
1152       if (__glibc_unlikely (dest->elems == NULL))
1153 	return REG_ESPACE;
1154     }
1155   else
1156     {
1157       if (src1 != NULL && src1->nelem > 0)
1158 	return re_node_set_init_copy (dest, src1);
1159       else if (src2 != NULL && src2->nelem > 0)
1160 	return re_node_set_init_copy (dest, src2);
1161       else
1162 	re_node_set_init_empty (dest);
1163       return REG_NOERROR;
1164     }
1165   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1166     {
1167       if (src1->elems[i1] > src2->elems[i2])
1168 	{
1169 	  dest->elems[id++] = src2->elems[i2++];
1170 	  continue;
1171 	}
1172       if (src1->elems[i1] == src2->elems[i2])
1173 	++i2;
1174       dest->elems[id++] = src1->elems[i1++];
1175     }
1176   if (i1 < src1->nelem)
1177     {
1178       memcpy (dest->elems + id, src1->elems + i1,
1179 	     (src1->nelem - i1) * sizeof (Idx));
1180       id += src1->nelem - i1;
1181     }
1182   else if (i2 < src2->nelem)
1183     {
1184       memcpy (dest->elems + id, src2->elems + i2,
1185 	     (src2->nelem - i2) * sizeof (Idx));
1186       id += src2->nelem - i2;
1187     }
1188   dest->nelem = id;
1189   return REG_NOERROR;
1190 }
1191 
1192 /* Calculate the union set of the sets DEST and SRC. And store it to
1193    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1194 
1195 static reg_errcode_t
1196 __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)1197 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1198 {
1199   Idx is, id, sbase, delta;
1200   if (src == NULL || src->nelem == 0)
1201     return REG_NOERROR;
1202   if (dest->alloc < 2 * src->nelem + dest->nelem)
1203     {
1204       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1205       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1206       if (__glibc_unlikely (new_buffer == NULL))
1207 	return REG_ESPACE;
1208       dest->elems = new_buffer;
1209       dest->alloc = new_alloc;
1210     }
1211 
1212   if (__glibc_unlikely (dest->nelem == 0))
1213     {
1214       /* Although we already guaranteed above that dest->alloc != 0 and
1215          therefore dest->elems != NULL, add a debug assertion to pacify
1216          GCC 11.2.1's -fanalyzer.  */
1217       DEBUG_ASSERT (dest->elems);
1218       dest->nelem = src->nelem;
1219       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1220       return REG_NOERROR;
1221     }
1222 
1223   /* Copy into the top of DEST the items of SRC that are not
1224      found in DEST.  Maybe we could binary search in DEST?  */
1225   for (sbase = dest->nelem + 2 * src->nelem,
1226        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1227     {
1228       if (dest->elems[id] == src->elems[is])
1229 	is--, id--;
1230       else if (dest->elems[id] < src->elems[is])
1231 	dest->elems[--sbase] = src->elems[is--];
1232       else /* if (dest->elems[id] > src->elems[is]) */
1233 	--id;
1234     }
1235 
1236   if (is >= 0)
1237     {
1238       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1239       sbase -= is + 1;
1240       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1241     }
1242 
1243   id = dest->nelem - 1;
1244   is = dest->nelem + 2 * src->nelem - 1;
1245   delta = is - sbase + 1;
1246   if (delta == 0)
1247     return REG_NOERROR;
1248 
1249   /* Now copy.  When DELTA becomes zero, the remaining
1250      DEST elements are already in place.  */
1251   dest->nelem += delta;
1252   for (;;)
1253     {
1254       if (dest->elems[is] > dest->elems[id])
1255 	{
1256 	  /* Copy from the top.  */
1257 	  dest->elems[id + delta--] = dest->elems[is--];
1258 	  if (delta == 0)
1259 	    break;
1260 	}
1261       else
1262 	{
1263 	  /* Slide from the bottom.  */
1264 	  dest->elems[id + delta] = dest->elems[id];
1265 	  if (--id < 0)
1266 	    {
1267 	      /* Copy remaining SRC elements.  */
1268 	      memcpy (dest->elems, dest->elems + sbase,
1269 		      delta * sizeof (Idx));
1270 	      break;
1271 	    }
1272 	}
1273     }
1274 
1275   return REG_NOERROR;
1276 }
1277 
1278 /* Insert the new element ELEM to the re_node_set* SET.
1279    SET should not already have ELEM.
1280    Return true if successful.  */
1281 
1282 static bool
1283 __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,Idx elem)1284 re_node_set_insert (re_node_set *set, Idx elem)
1285 {
1286   Idx idx;
1287   /* In case the set is empty.  */
1288   if (set->alloc == 0)
1289     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1290 
1291   if (__glibc_unlikely (set->nelem) == 0)
1292     {
1293       /* Although we already guaranteed above that set->alloc != 0 and
1294          therefore set->elems != NULL, add a debug assertion to pacify
1295          GCC 11.2 -fanalyzer.  */
1296       DEBUG_ASSERT (set->elems);
1297       set->elems[0] = elem;
1298       ++set->nelem;
1299       return true;
1300     }
1301 
1302   /* Realloc if we need.  */
1303   if (set->alloc == set->nelem)
1304     {
1305       Idx *new_elems;
1306       set->alloc = set->alloc * 2;
1307       new_elems = re_realloc (set->elems, Idx, set->alloc);
1308       if (__glibc_unlikely (new_elems == NULL))
1309 	return false;
1310       set->elems = new_elems;
1311     }
1312 
1313   /* Move the elements which follows the new element.  Test the
1314      first element separately to skip a check in the inner loop.  */
1315   if (elem < set->elems[0])
1316     {
1317       for (idx = set->nelem; idx > 0; idx--)
1318 	set->elems[idx] = set->elems[idx - 1];
1319     }
1320   else
1321     {
1322       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1323 	set->elems[idx] = set->elems[idx - 1];
1324       DEBUG_ASSERT (set->elems[idx - 1] < elem);
1325     }
1326 
1327   /* Insert the new element.  */
1328   set->elems[idx] = elem;
1329   ++set->nelem;
1330   return true;
1331 }
1332 
1333 /* Insert the new element ELEM to the re_node_set* SET.
1334    SET should not already have any element greater than or equal to ELEM.
1335    Return true if successful.  */
1336 
1337 static bool
1338 __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,Idx elem)1339 re_node_set_insert_last (re_node_set *set, Idx elem)
1340 {
1341   /* Realloc if we need.  */
1342   if (set->alloc == set->nelem)
1343     {
1344       Idx *new_elems;
1345       set->alloc = (set->alloc + 1) * 2;
1346       new_elems = re_realloc (set->elems, Idx, set->alloc);
1347       if (__glibc_unlikely (new_elems == NULL))
1348 	return false;
1349       set->elems = new_elems;
1350     }
1351 
1352   /* Insert the new element.  */
1353   set->elems[set->nelem++] = elem;
1354   return true;
1355 }
1356 
1357 /* Compare two node sets SET1 and SET2.
1358    Return true if SET1 and SET2 are equivalent.  */
1359 
1360 static bool
1361 __attribute__ ((pure))
re_node_set_compare(const re_node_set * set1,const re_node_set * set2)1362 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1363 {
1364   Idx i;
1365   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1366     return false;
1367   for (i = set1->nelem ; --i >= 0 ; )
1368     if (set1->elems[i] != set2->elems[i])
1369       return false;
1370   return true;
1371 }
1372 
1373 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1374 
1375 static Idx
1376 __attribute__ ((pure))
re_node_set_contains(const re_node_set * set,Idx elem)1377 re_node_set_contains (const re_node_set *set, Idx elem)
1378 {
1379   __re_size_t idx, right, mid;
1380   if (set->nelem <= 0)
1381     return 0;
1382 
1383   /* Binary search the element.  */
1384   idx = 0;
1385   right = set->nelem - 1;
1386   while (idx < right)
1387     {
1388       mid = (idx + right) / 2;
1389       if (set->elems[mid] < elem)
1390 	idx = mid + 1;
1391       else
1392 	right = mid;
1393     }
1394   return set->elems[idx] == elem ? idx + 1 : 0;
1395 }
1396 
1397 static void
re_node_set_remove_at(re_node_set * set,Idx idx)1398 re_node_set_remove_at (re_node_set *set, Idx idx)
1399 {
1400   if (idx < 0 || idx >= set->nelem)
1401     return;
1402   --set->nelem;
1403   for (; idx < set->nelem; idx++)
1404     set->elems[idx] = set->elems[idx + 1];
1405 }
1406 
1407 
1408 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1409    Or return -1 if an error occurred.  */
1410 
1411 static Idx
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1412 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1413 {
1414   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1415     {
1416       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1417       Idx *new_nexts, *new_indices;
1418       re_node_set *new_edests, *new_eclosures;
1419       re_token_t *new_nodes;
1420 
1421       /* Avoid overflows in realloc.  */
1422       const size_t max_object_size = MAX (sizeof (re_token_t),
1423 					  MAX (sizeof (re_node_set),
1424 					       sizeof (Idx)));
1425       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1426 			    < new_nodes_alloc))
1427 	return -1;
1428 
1429       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1430       if (__glibc_unlikely (new_nodes == NULL))
1431 	return -1;
1432       dfa->nodes = new_nodes;
1433       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1434       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1435       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1436       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1437       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1438 			    || new_edests == NULL || new_eclosures == NULL))
1439 	{
1440 	   re_free (new_nexts);
1441 	   re_free (new_indices);
1442 	   re_free (new_edests);
1443 	   re_free (new_eclosures);
1444 	   return -1;
1445 	}
1446       dfa->nexts = new_nexts;
1447       dfa->org_indices = new_indices;
1448       dfa->edests = new_edests;
1449       dfa->eclosures = new_eclosures;
1450       dfa->nodes_alloc = new_nodes_alloc;
1451     }
1452   dfa->nodes[dfa->nodes_len] = token;
1453   dfa->nodes[dfa->nodes_len].constraint = 0;
1454 #ifdef RE_ENABLE_I18N
1455   dfa->nodes[dfa->nodes_len].accept_mb =
1456     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1457      || token.type == COMPLEX_BRACKET);
1458 #endif
1459   dfa->nexts[dfa->nodes_len] = -1;
1460   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1461   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1462   return dfa->nodes_len++;
1463 }
1464 
1465 static re_hashval_t
calc_state_hash(const re_node_set * nodes,unsigned int context)1466 calc_state_hash (const re_node_set *nodes, unsigned int context)
1467 {
1468   re_hashval_t hash = nodes->nelem + context;
1469   Idx i;
1470   for (i = 0 ; i < nodes->nelem ; i++)
1471     hash += nodes->elems[i];
1472   return hash;
1473 }
1474 
1475 /* Search for the state whose node_set is equivalent to NODES.
1476    Return the pointer to the state, if we found it in the DFA.
1477    Otherwise create the new one and return it.  In case of an error
1478    return NULL and set the error code in ERR.
1479    Note: - We assume NULL as the invalid state, then it is possible that
1480 	   return value is NULL and ERR is REG_NOERROR.
1481 	 - We never return non-NULL value in case of any errors, it is for
1482 	   optimization.  */
1483 
1484 static re_dfastate_t *
1485 __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1486 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1487 		  const re_node_set *nodes)
1488 {
1489   re_hashval_t hash;
1490   re_dfastate_t *new_state;
1491   struct re_state_table_entry *spot;
1492   Idx i;
1493 #if defined GCC_LINT || defined lint
1494   /* Suppress bogus uninitialized-variable warnings.  */
1495   *err = REG_NOERROR;
1496 #endif
1497   if (__glibc_unlikely (nodes->nelem == 0))
1498     {
1499       *err = REG_NOERROR;
1500       return NULL;
1501     }
1502   hash = calc_state_hash (nodes, 0);
1503   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1504 
1505   for (i = 0 ; i < spot->num ; i++)
1506     {
1507       re_dfastate_t *state = spot->array[i];
1508       if (hash != state->hash)
1509 	continue;
1510       if (re_node_set_compare (&state->nodes, nodes))
1511 	return state;
1512     }
1513 
1514   /* There are no appropriate state in the dfa, create the new one.  */
1515   new_state = create_ci_newstate (dfa, nodes, hash);
1516   if (__glibc_unlikely (new_state == NULL))
1517     *err = REG_ESPACE;
1518 
1519   return new_state;
1520 }
1521 
1522 /* Search for the state whose node_set is equivalent to NODES and
1523    whose context is equivalent to CONTEXT.
1524    Return the pointer to the state, if we found it in the DFA.
1525    Otherwise create the new one and return it.  In case of an error
1526    return NULL and set the error code in ERR.
1527    Note: - We assume NULL as the invalid state, then it is possible that
1528 	   return value is NULL and ERR is REG_NOERROR.
1529 	 - We never return non-NULL value in case of any errors, it is for
1530 	   optimization.  */
1531 
1532 static re_dfastate_t *
1533 __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1534 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1535 			  const re_node_set *nodes, unsigned int context)
1536 {
1537   re_hashval_t hash;
1538   re_dfastate_t *new_state;
1539   struct re_state_table_entry *spot;
1540   Idx i;
1541 #if defined GCC_LINT || defined lint
1542   /* Suppress bogus uninitialized-variable warnings.  */
1543   *err = REG_NOERROR;
1544 #endif
1545   if (nodes->nelem == 0)
1546     {
1547       *err = REG_NOERROR;
1548       return NULL;
1549     }
1550   hash = calc_state_hash (nodes, context);
1551   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1552 
1553   for (i = 0 ; i < spot->num ; i++)
1554     {
1555       re_dfastate_t *state = spot->array[i];
1556       if (state->hash == hash
1557 	  && state->context == context
1558 	  && re_node_set_compare (state->entrance_nodes, nodes))
1559 	return state;
1560     }
1561   /* There are no appropriate state in 'dfa', create the new one.  */
1562   new_state = create_cd_newstate (dfa, nodes, context, hash);
1563   if (__glibc_unlikely (new_state == NULL))
1564     *err = REG_ESPACE;
1565 
1566   return new_state;
1567 }
1568 
1569 /* Finish initialization of the new state NEWSTATE, and using its hash value
1570    HASH put in the appropriate bucket of DFA's state table.  Return value
1571    indicates the error code if failed.  */
1572 
1573 static reg_errcode_t
1574 __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1575 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1576 		re_hashval_t hash)
1577 {
1578   struct re_state_table_entry *spot;
1579   reg_errcode_t err;
1580   Idx i;
1581 
1582   newstate->hash = hash;
1583   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1584   if (__glibc_unlikely (err != REG_NOERROR))
1585     return REG_ESPACE;
1586   for (i = 0; i < newstate->nodes.nelem; i++)
1587     {
1588       Idx elem = newstate->nodes.elems[i];
1589       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1590 	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1591 	  return REG_ESPACE;
1592     }
1593 
1594   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1595   if (__glibc_unlikely (spot->alloc <= spot->num))
1596     {
1597       Idx new_alloc = 2 * spot->num + 2;
1598       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1599 					      new_alloc);
1600       if (__glibc_unlikely (new_array == NULL))
1601 	return REG_ESPACE;
1602       spot->array = new_array;
1603       spot->alloc = new_alloc;
1604     }
1605   spot->array[spot->num++] = newstate;
1606   return REG_NOERROR;
1607 }
1608 
1609 static void
free_state(re_dfastate_t * state)1610 free_state (re_dfastate_t *state)
1611 {
1612   re_node_set_free (&state->non_eps_nodes);
1613   re_node_set_free (&state->inveclosure);
1614   if (state->entrance_nodes != &state->nodes)
1615     {
1616       re_node_set_free (state->entrance_nodes);
1617       re_free (state->entrance_nodes);
1618     }
1619   re_node_set_free (&state->nodes);
1620   re_free (state->word_trtable);
1621   re_free (state->trtable);
1622   re_free (state);
1623 }
1624 
1625 /* Create the new state which is independent of contexts.
1626    Return the new state if succeeded, otherwise return NULL.  */
1627 
1628 static re_dfastate_t *
1629 __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1630 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1631 		    re_hashval_t hash)
1632 {
1633   Idx i;
1634   reg_errcode_t err;
1635   re_dfastate_t *newstate;
1636 
1637   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1638   if (__glibc_unlikely (newstate == NULL))
1639     return NULL;
1640   err = re_node_set_init_copy (&newstate->nodes, nodes);
1641   if (__glibc_unlikely (err != REG_NOERROR))
1642     {
1643       re_free (newstate);
1644       return NULL;
1645     }
1646 
1647   newstate->entrance_nodes = &newstate->nodes;
1648   for (i = 0 ; i < nodes->nelem ; i++)
1649     {
1650       re_token_t *node = dfa->nodes + nodes->elems[i];
1651       re_token_type_t type = node->type;
1652       if (type == CHARACTER && !node->constraint)
1653 	continue;
1654 #ifdef RE_ENABLE_I18N
1655       newstate->accept_mb |= node->accept_mb;
1656 #endif /* RE_ENABLE_I18N */
1657 
1658       /* If the state has the halt node, the state is a halt state.  */
1659       if (type == END_OF_RE)
1660 	newstate->halt = 1;
1661       else if (type == OP_BACK_REF)
1662 	newstate->has_backref = 1;
1663       else if (type == ANCHOR || node->constraint)
1664 	newstate->has_constraint = 1;
1665     }
1666   err = register_state (dfa, newstate, hash);
1667   if (__glibc_unlikely (err != REG_NOERROR))
1668     {
1669       free_state (newstate);
1670       newstate = NULL;
1671     }
1672   return newstate;
1673 }
1674 
1675 /* Create the new state which is depend on the context CONTEXT.
1676    Return the new state if succeeded, otherwise return NULL.  */
1677 
1678 static re_dfastate_t *
1679 __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1680 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1681 		    unsigned int context, re_hashval_t hash)
1682 {
1683   Idx i, nctx_nodes = 0;
1684   reg_errcode_t err;
1685   re_dfastate_t *newstate;
1686 
1687   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1688   if (__glibc_unlikely (newstate == NULL))
1689     return NULL;
1690   err = re_node_set_init_copy (&newstate->nodes, nodes);
1691   if (__glibc_unlikely (err != REG_NOERROR))
1692     {
1693       re_free (newstate);
1694       return NULL;
1695     }
1696 
1697   newstate->context = context;
1698   newstate->entrance_nodes = &newstate->nodes;
1699 
1700   for (i = 0 ; i < nodes->nelem ; i++)
1701     {
1702       re_token_t *node = dfa->nodes + nodes->elems[i];
1703       re_token_type_t type = node->type;
1704       unsigned int constraint = node->constraint;
1705 
1706       if (type == CHARACTER && !constraint)
1707 	continue;
1708 #ifdef RE_ENABLE_I18N
1709       newstate->accept_mb |= node->accept_mb;
1710 #endif /* RE_ENABLE_I18N */
1711 
1712       /* If the state has the halt node, the state is a halt state.  */
1713       if (type == END_OF_RE)
1714 	newstate->halt = 1;
1715       else if (type == OP_BACK_REF)
1716 	newstate->has_backref = 1;
1717 
1718       if (constraint)
1719 	{
1720 	  if (newstate->entrance_nodes == &newstate->nodes)
1721 	    {
1722 	      re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1723 	      if (__glibc_unlikely (entrance_nodes == NULL))
1724 		{
1725 		  free_state (newstate);
1726 		  return NULL;
1727 		}
1728 	      newstate->entrance_nodes = entrance_nodes;
1729 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1730 		  != REG_NOERROR)
1731 		{
1732 		  free_state (newstate);
1733 		  return NULL;
1734 		}
1735 	      nctx_nodes = 0;
1736 	      newstate->has_constraint = 1;
1737 	    }
1738 
1739 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1740 	    {
1741 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1742 	      ++nctx_nodes;
1743 	    }
1744 	}
1745     }
1746   err = register_state (dfa, newstate, hash);
1747   if (__glibc_unlikely (err != REG_NOERROR))
1748     {
1749       free_state (newstate);
1750       newstate = NULL;
1751     }
1752   return  newstate;
1753 }
1754