1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 				     Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 					  Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 					   Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 						    Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 			   re_dfastate_t **limited_sts, Idx last_node,
33 			   Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 					 const char *string, Idx length,
36 					 Idx start, Idx last_start, Idx stop,
37 					 size_t nmatch, regmatch_t pmatch[],
38 					 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 				  const char *string1, Idx length1,
41 				  const char *string2, Idx length2,
42 				  Idx start, regoff_t range,
43 				  struct re_registers *regs,
44 				  Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 				const char *string, Idx length, Idx start,
47 				regoff_t range, Idx stop,
48 				struct re_registers *regs,
49 				bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51                               Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 			   Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 				     const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 			 regmatch_t *prev_idx_match, Idx cur_node,
59 			 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 				      Idx str_idx, Idx dest_node, Idx nregs,
62 				      regmatch_t *regs, regmatch_t *prevregs,
63 				      re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 			       const re_match_context_t *mctx,
66 			       size_t nmatch, regmatch_t *pmatch,
67 			       bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69 
70 #ifdef RE_ENABLE_I18N
71 static int sift_states_iter_mb (const re_match_context_t *mctx,
72 				re_sift_context_t *sctx,
73 				Idx node_idx, Idx str_idx, Idx max_str_idx);
74 #endif /* RE_ENABLE_I18N */
75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 					   re_sift_context_t *sctx);
77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 					  re_sift_context_t *sctx, Idx str_idx,
79 					  re_node_set *cur_dest);
80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 					      re_sift_context_t *sctx,
82 					      Idx str_idx,
83 					      re_node_set *dest_nodes);
84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 					    re_node_set *dest_nodes,
86 					    const re_node_set *candidates);
87 static bool check_dst_limits (const re_match_context_t *mctx,
88 			      const re_node_set *limits,
89 			      Idx dst_node, Idx dst_idx, Idx src_node,
90 			      Idx src_idx);
91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 					int boundaries, Idx subexp_idx,
93 					Idx from_node, Idx bkref_idx);
94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 				      Idx limit, Idx subexp_idx,
96 				      Idx node, Idx str_idx,
97 				      Idx bkref_idx);
98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 					  re_node_set *dest_nodes,
100 					  const re_node_set *candidates,
101 					  re_node_set *limits,
102 					  struct re_backref_cache_entry *bkref_ents,
103 					  Idx str_idx);
104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 					re_sift_context_t *sctx,
106 					Idx str_idx, const re_node_set *candidates);
107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 					re_dfastate_t **dst,
109 					re_dfastate_t **src, Idx num);
110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 					 re_match_context_t *mctx);
112 static re_dfastate_t *transit_state (reg_errcode_t *err,
113 				     re_match_context_t *mctx,
114 				     re_dfastate_t *state);
115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 					    re_match_context_t *mctx,
117 					    re_dfastate_t *next_state);
118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 						re_node_set *cur_nodes,
120 						Idx str_idx);
121 #if 0
122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 					re_match_context_t *mctx,
124 					re_dfastate_t *pstate);
125 #endif
126 #ifdef RE_ENABLE_I18N
127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 				       re_dfastate_t *pstate);
129 #endif /* RE_ENABLE_I18N */
130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 					  const re_node_set *nodes);
132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 				 Idx bkref_node, Idx bkref_str_idx);
134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 				     const re_sub_match_top_t *sub_top,
136 				     re_sub_match_last_t *sub_last,
137 				     Idx bkref_node, Idx bkref_str);
138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 			     Idx subexp_idx, int type);
140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 				    state_array_t *path, Idx top_node,
142 				    Idx top_str, Idx last_node, Idx last_str,
143 				    int type);
144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 						   Idx str_idx,
146 						   re_node_set *cur_nodes,
147 						   re_node_set *next_nodes);
148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 					       re_node_set *cur_nodes,
150 					       Idx ex_subexp, int type);
151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 						   re_node_set *dst_nodes,
153 						   Idx target, Idx ex_subexp,
154 						   int type);
155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 					 re_node_set *cur_nodes, Idx cur_str,
157 					 Idx subexp_num, int type);
158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159 #ifdef RE_ENABLE_I18N
160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 				    const re_string_t *input, Idx idx);
162 # ifdef _LIBC
163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 						   size_t name_len);
165 # endif /* _LIBC */
166 #endif /* RE_ENABLE_I18N */
167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 				       const re_dfastate_t *state,
169 				       re_node_set *states_node,
170 				       bitset_t *states_ch);
171 static bool check_node_accept (const re_match_context_t *mctx,
172 			       const re_token_t *node, Idx idx);
173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
174 
175 /* Entry point for POSIX code.  */
176 
177 /* regexec searches for a given pattern, specified by PREG, in the
178    string STRING.
179 
180    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
182    least NMATCH elements, and we set them to the offsets of the
183    corresponding matched substrings.
184 
185    EFLAGS specifies "execution flags" which affect matching: if
186    REG_NOTBOL is set, then ^ does not match at the beginning of the
187    string; if REG_NOTEOL is set, then $ does not match at the end.
188 
189    Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
190    EFLAGS is invalid.  */
191 
192 int
regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[_REGEX_NELTS (nmatch)],int eflags)193 regexec (const regex_t *__restrict preg, const char *__restrict string,
194 	 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
195 {
196   reg_errcode_t err;
197   Idx start, length;
198   re_dfa_t *dfa = preg->buffer;
199 
200   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
201     return REG_BADPAT;
202 
203   if (eflags & REG_STARTEND)
204     {
205       start = pmatch[0].rm_so;
206       length = pmatch[0].rm_eo;
207     }
208   else
209     {
210       start = 0;
211       length = strlen (string);
212     }
213 
214   lock_lock (dfa->lock);
215   if (preg->no_sub)
216     err = re_search_internal (preg, string, length, start, length,
217 			      length, 0, NULL, eflags);
218   else
219     err = re_search_internal (preg, string, length, start, length,
220 			      length, nmatch, pmatch, eflags);
221   lock_unlock (dfa->lock);
222   return err != REG_NOERROR;
223 }
224 
225 #ifdef _LIBC
226 libc_hidden_def (__regexec)
227 
228 # include <shlib-compat.h>
229 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
230 
231 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
232 __typeof__ (__regexec) __compat_regexec;
233 
234 int
235 attribute_compat_text_section
__compat_regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[_REGEX_NELTS (nmatch)],int eflags)236 __compat_regexec (const regex_t *__restrict preg,
237 		  const char *__restrict string, size_t nmatch,
238 		  regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
239 {
240   return regexec (preg, string, nmatch, pmatch,
241 		  eflags & (REG_NOTBOL | REG_NOTEOL));
242 }
243 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
244 # endif
245 #endif
246 
247 /* Entry points for GNU code.  */
248 
249 /* re_match, re_search, re_match_2, re_search_2
250 
251    The former two functions operate on STRING with length LENGTH,
252    while the later two operate on concatenation of STRING1 and STRING2
253    with lengths LENGTH1 and LENGTH2, respectively.
254 
255    re_match() matches the compiled pattern in BUFP against the string,
256    starting at index START.
257 
258    re_search() first tries matching at index START, then it tries to match
259    starting from index START + 1, and so on.  The last start position tried
260    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
261    way as re_match().)
262 
263    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
264    the first STOP characters of the concatenation of the strings should be
265    concerned.
266 
267    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
268    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
269    computed relative to the concatenation, not relative to the individual
270    strings.)
271 
272    On success, re_match* functions return the length of the match, re_search*
273    return the position of the start of the match.  They return -1 on
274    match failure, -2 on error.  */
275 
276 regoff_t
re_match(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,struct re_registers * regs)277 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
278 	  Idx start, struct re_registers *regs)
279 {
280   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
281 }
282 #ifdef _LIBC
weak_alias(__re_match,re_match)283 weak_alias (__re_match, re_match)
284 #endif
285 
286 regoff_t
287 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
288 	   Idx start, regoff_t range, struct re_registers *regs)
289 {
290   return re_search_stub (bufp, string, length, start, range, length, regs,
291 			 false);
292 }
293 #ifdef _LIBC
weak_alias(__re_search,re_search)294 weak_alias (__re_search, re_search)
295 #endif
296 
297 regoff_t
298 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
299 	    const char *string2, Idx length2, Idx start,
300 	    struct re_registers *regs, Idx stop)
301 {
302   return re_search_2_stub (bufp, string1, length1, string2, length2,
303 			   start, 0, regs, stop, true);
304 }
305 #ifdef _LIBC
weak_alias(__re_match_2,re_match_2)306 weak_alias (__re_match_2, re_match_2)
307 #endif
308 
309 regoff_t
310 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
311 	     const char *string2, Idx length2, Idx start, regoff_t range,
312 	     struct re_registers *regs, Idx stop)
313 {
314   return re_search_2_stub (bufp, string1, length1, string2, length2,
315 			   start, range, regs, stop, false);
316 }
317 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)318 weak_alias (__re_search_2, re_search_2)
319 #endif
320 
321 static regoff_t
322 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
323 		  Idx length1, const char *string2, Idx length2, Idx start,
324 		  regoff_t range, struct re_registers *regs,
325 		  Idx stop, bool ret_len)
326 {
327   const char *str;
328   regoff_t rval;
329   Idx len;
330   char *s = NULL;
331 
332   if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
333 			 || INT_ADD_WRAPV (length1, length2, &len))))
334     return -2;
335 
336   /* Concatenate the strings.  */
337   if (length2 > 0)
338     if (length1 > 0)
339       {
340 	s = re_malloc (char, len);
341 
342 	if (__glibc_unlikely (s == NULL))
343 	  return -2;
344 #ifdef _LIBC
345 	memcpy (__mempcpy (s, string1, length1), string2, length2);
346 #else
347 	memcpy (s, string1, length1);
348 	memcpy (s + length1, string2, length2);
349 #endif
350 	str = s;
351       }
352     else
353       str = string2;
354   else
355     str = string1;
356 
357   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
358 			 ret_len);
359   re_free (s);
360   return rval;
361 }
362 
363 /* The parameters have the same meaning as those of re_search.
364    Additional parameters:
365    If RET_LEN is true the length of the match is returned (re_match style);
366    otherwise the position of the match is returned.  */
367 
368 static regoff_t
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)369 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
370 		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
371 		bool ret_len)
372 {
373   reg_errcode_t result;
374   regmatch_t *pmatch;
375   Idx nregs;
376   regoff_t rval;
377   int eflags = 0;
378   re_dfa_t *dfa = bufp->buffer;
379   Idx last_start = start + range;
380 
381   /* Check for out-of-range.  */
382   if (__glibc_unlikely (start < 0 || start > length))
383     return -1;
384   if (__glibc_unlikely (length < last_start
385 			|| (0 <= range && last_start < start)))
386     last_start = length;
387   else if (__glibc_unlikely (last_start < 0
388 			     || (range < 0 && start <= last_start)))
389     last_start = 0;
390 
391   lock_lock (dfa->lock);
392 
393   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
394   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
395 
396   /* Compile fastmap if we haven't yet.  */
397   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
398     re_compile_fastmap (bufp);
399 
400   if (__glibc_unlikely (bufp->no_sub))
401     regs = NULL;
402 
403   /* We need at least 1 register.  */
404   if (regs == NULL)
405     nregs = 1;
406   else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
407 			     && regs->num_regs <= bufp->re_nsub))
408     {
409       nregs = regs->num_regs;
410       if (__glibc_unlikely (nregs < 1))
411 	{
412 	  /* Nothing can be copied to regs.  */
413 	  regs = NULL;
414 	  nregs = 1;
415 	}
416     }
417   else
418     nregs = bufp->re_nsub + 1;
419   pmatch = re_malloc (regmatch_t, nregs);
420   if (__glibc_unlikely (pmatch == NULL))
421     {
422       rval = -2;
423       goto out;
424     }
425 
426   result = re_search_internal (bufp, string, length, start, last_start, stop,
427 			       nregs, pmatch, eflags);
428 
429   rval = 0;
430 
431   /* I hope we needn't fill their regs with -1's when no match was found.  */
432   if (result != REG_NOERROR)
433     rval = result == REG_NOMATCH ? -1 : -2;
434   else if (regs != NULL)
435     {
436       /* If caller wants register contents data back, copy them.  */
437       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
438 					   bufp->regs_allocated);
439       if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
440 	rval = -2;
441     }
442 
443   if (__glibc_likely (rval == 0))
444     {
445       if (ret_len)
446 	{
447 	  DEBUG_ASSERT (pmatch[0].rm_so == start);
448 	  rval = pmatch[0].rm_eo - start;
449 	}
450       else
451 	rval = pmatch[0].rm_so;
452     }
453   re_free (pmatch);
454  out:
455   lock_unlock (dfa->lock);
456   return rval;
457 }
458 
459 static unsigned
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)460 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
461 	      int regs_allocated)
462 {
463   int rval = REGS_REALLOCATE;
464   Idx i;
465   Idx need_regs = nregs + 1;
466   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
467      uses.  */
468 
469   /* Have the register data arrays been allocated?  */
470   if (regs_allocated == REGS_UNALLOCATED)
471     { /* No.  So allocate them with malloc.  */
472       regs->start = re_malloc (regoff_t, need_regs);
473       if (__glibc_unlikely (regs->start == NULL))
474 	return REGS_UNALLOCATED;
475       regs->end = re_malloc (regoff_t, need_regs);
476       if (__glibc_unlikely (regs->end == NULL))
477 	{
478 	  re_free (regs->start);
479 	  return REGS_UNALLOCATED;
480 	}
481       regs->num_regs = need_regs;
482     }
483   else if (regs_allocated == REGS_REALLOCATE)
484     { /* Yes.  If we need more elements than were already
485 	 allocated, reallocate them.  If we need fewer, just
486 	 leave it alone.  */
487       if (__glibc_unlikely (need_regs > regs->num_regs))
488 	{
489 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
490 	  regoff_t *new_end;
491 	  if (__glibc_unlikely (new_start == NULL))
492 	    return REGS_UNALLOCATED;
493 	  new_end = re_realloc (regs->end, regoff_t, need_regs);
494 	  if (__glibc_unlikely (new_end == NULL))
495 	    {
496 	      re_free (new_start);
497 	      return REGS_UNALLOCATED;
498 	    }
499 	  regs->start = new_start;
500 	  regs->end = new_end;
501 	  regs->num_regs = need_regs;
502 	}
503     }
504   else
505     {
506       DEBUG_ASSERT (regs_allocated == REGS_FIXED);
507       /* This function may not be called with REGS_FIXED and nregs too big.  */
508       DEBUG_ASSERT (nregs <= regs->num_regs);
509       rval = REGS_FIXED;
510     }
511 
512   /* Copy the regs.  */
513   for (i = 0; i < nregs; ++i)
514     {
515       regs->start[i] = pmatch[i].rm_so;
516       regs->end[i] = pmatch[i].rm_eo;
517     }
518   for ( ; i < regs->num_regs; ++i)
519     regs->start[i] = regs->end[i] = -1;
520 
521   return rval;
522 }
523 
524 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
525    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
526    this memory for recording register information.  STARTS and ENDS
527    must be allocated using the malloc library routine, and must each
528    be at least NUM_REGS * sizeof (regoff_t) bytes long.
529 
530    If NUM_REGS == 0, then subsequent matches should allocate their own
531    register data.
532 
533    Unless this function is called, the first search or match using
534    PATTERN_BUFFER will allocate its own register data, without
535    freeing the old data.  */
536 
537 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,__re_size_t num_regs,regoff_t * starts,regoff_t * ends)538 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
539 		  __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
540 {
541   if (num_regs)
542     {
543       bufp->regs_allocated = REGS_REALLOCATE;
544       regs->num_regs = num_regs;
545       regs->start = starts;
546       regs->end = ends;
547     }
548   else
549     {
550       bufp->regs_allocated = REGS_UNALLOCATED;
551       regs->num_regs = 0;
552       regs->start = regs->end = NULL;
553     }
554 }
555 #ifdef _LIBC
weak_alias(__re_set_registers,re_set_registers)556 weak_alias (__re_set_registers, re_set_registers)
557 #endif
558 
559 /* Entry points compatible with 4.2 BSD regex library.  We don't define
560    them unless specifically requested.  */
561 
562 #if defined _REGEX_RE_COMP || defined _LIBC
563 int
564 # ifdef _LIBC
565 weak_function
566 # endif
567 re_exec (const char *s)
568 {
569   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
570 }
571 #endif /* _REGEX_RE_COMP */
572 
573 /* Internal entry point.  */
574 
575 /* Searches for a compiled pattern PREG in the string STRING, whose
576    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
577    meaning as with regexec.  LAST_START is START + RANGE, where
578    START and RANGE have the same meaning as with re_search.
579    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
580    otherwise return the error code.
581    Note: We assume front end functions already check ranges.
582    (0 <= LAST_START && LAST_START <= LENGTH)  */
583 
584 static reg_errcode_t
585 __attribute_warn_unused_result__
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)586 re_search_internal (const regex_t *preg, const char *string, Idx length,
587 		    Idx start, Idx last_start, Idx stop, size_t nmatch,
588 		    regmatch_t pmatch[], int eflags)
589 {
590   reg_errcode_t err;
591   const re_dfa_t *dfa = preg->buffer;
592   Idx left_lim, right_lim;
593   int incr;
594   bool fl_longest_match;
595   int match_kind;
596   Idx match_first;
597   Idx match_last = -1;
598   Idx extra_nmatch;
599   bool sb;
600   int ch;
601   re_match_context_t mctx = { .dfa = dfa };
602   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
603 		    && start != last_start && !preg->can_be_null)
604 		   ? preg->fastmap : NULL);
605   RE_TRANSLATE_TYPE t = preg->translate;
606 
607   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
608   nmatch -= extra_nmatch;
609 
610   /* Check if the DFA haven't been compiled.  */
611   if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
612 			|| dfa->init_state_word == NULL
613 			|| dfa->init_state_nl == NULL
614 			|| dfa->init_state_begbuf == NULL))
615     return REG_NOMATCH;
616 
617   /* We assume front-end functions already check them.  */
618   DEBUG_ASSERT (0 <= last_start && last_start <= length);
619 
620   /* If initial states with non-begbuf contexts have no elements,
621      the regex must be anchored.  If preg->newline_anchor is set,
622      we'll never use init_state_nl, so do not check it.  */
623   if (dfa->init_state->nodes.nelem == 0
624       && dfa->init_state_word->nodes.nelem == 0
625       && (dfa->init_state_nl->nodes.nelem == 0
626 	  || !preg->newline_anchor))
627     {
628       if (start != 0 && last_start != 0)
629         return REG_NOMATCH;
630       start = last_start = 0;
631     }
632 
633   /* We must check the longest matching, if nmatch > 0.  */
634   fl_longest_match = (nmatch != 0 || dfa->nbackref);
635 
636   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
637 			    preg->translate, (preg->syntax & RE_ICASE) != 0,
638 			    dfa);
639   if (__glibc_unlikely (err != REG_NOERROR))
640     goto free_return;
641   mctx.input.stop = stop;
642   mctx.input.raw_stop = stop;
643   mctx.input.newline_anchor = preg->newline_anchor;
644 
645   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
646   if (__glibc_unlikely (err != REG_NOERROR))
647     goto free_return;
648 
649   /* We will log all the DFA states through which the dfa pass,
650      if nmatch > 1, or this dfa has "multibyte node", which is a
651      back-reference or a node which can accept multibyte character or
652      multi character collating element.  */
653   if (nmatch > 1 || dfa->has_mb_node)
654     {
655       /* Avoid overflow.  */
656       if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
657 			     <= mctx.input.bufs_len)))
658 	{
659 	  err = REG_ESPACE;
660 	  goto free_return;
661 	}
662 
663       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
664       if (__glibc_unlikely (mctx.state_log == NULL))
665 	{
666 	  err = REG_ESPACE;
667 	  goto free_return;
668 	}
669     }
670 
671   match_first = start;
672   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
673 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
674 
675   /* Check incrementally whether the input string matches.  */
676   incr = (last_start < start) ? -1 : 1;
677   left_lim = (last_start < start) ? last_start : start;
678   right_lim = (last_start < start) ? start : last_start;
679   sb = dfa->mb_cur_max == 1;
680   match_kind =
681     (fastmap
682      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
683 	| (start <= last_start ? 2 : 0)
684 	| (t != NULL ? 1 : 0))
685      : 8);
686 
687   for (;; match_first += incr)
688     {
689       err = REG_NOMATCH;
690       if (match_first < left_lim || right_lim < match_first)
691 	goto free_return;
692 
693       /* Advance as rapidly as possible through the string, until we
694 	 find a plausible place to start matching.  This may be done
695 	 with varying efficiency, so there are various possibilities:
696 	 only the most common of them are specialized, in order to
697 	 save on code size.  We use a switch statement for speed.  */
698       switch (match_kind)
699 	{
700 	case 8:
701 	  /* No fastmap.  */
702 	  break;
703 
704 	case 7:
705 	  /* Fastmap with single-byte translation, match forward.  */
706 	  while (__glibc_likely (match_first < right_lim)
707 		 && !fastmap[t[(unsigned char) string[match_first]]])
708 	    ++match_first;
709 	  goto forward_match_found_start_or_reached_end;
710 
711 	case 6:
712 	  /* Fastmap without translation, match forward.  */
713 	  while (__glibc_likely (match_first < right_lim)
714 		 && !fastmap[(unsigned char) string[match_first]])
715 	    ++match_first;
716 
717 	forward_match_found_start_or_reached_end:
718 	  if (__glibc_unlikely (match_first == right_lim))
719 	    {
720 	      ch = match_first >= length
721 		       ? 0 : (unsigned char) string[match_first];
722 	      if (!fastmap[t ? t[ch] : ch])
723 		goto free_return;
724 	    }
725 	  break;
726 
727 	case 4:
728 	case 5:
729 	  /* Fastmap without multi-byte translation, match backwards.  */
730 	  while (match_first >= left_lim)
731 	    {
732 	      ch = match_first >= length
733 		       ? 0 : (unsigned char) string[match_first];
734 	      if (fastmap[t ? t[ch] : ch])
735 		break;
736 	      --match_first;
737 	    }
738 	  if (match_first < left_lim)
739 	    goto free_return;
740 	  break;
741 
742 	default:
743 	  /* In this case, we can't determine easily the current byte,
744 	     since it might be a component byte of a multibyte
745 	     character.  Then we use the constructed buffer instead.  */
746 	  for (;;)
747 	    {
748 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
749 		 buffers.  */
750 	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
751 	      if (__glibc_unlikely (offset
752 				    >= (__re_size_t) mctx.input.valid_raw_len))
753 		{
754 		  err = re_string_reconstruct (&mctx.input, match_first,
755 					       eflags);
756 		  if (__glibc_unlikely (err != REG_NOERROR))
757 		    goto free_return;
758 
759 		  offset = match_first - mctx.input.raw_mbs_idx;
760 		}
761 	      /* Use buffer byte if OFFSET is in buffer, otherwise '\0'.  */
762 	      ch = (offset < mctx.input.valid_len
763 		    ? re_string_byte_at (&mctx.input, offset) : 0);
764 	      if (fastmap[ch])
765 		break;
766 	      match_first += incr;
767 	      if (match_first < left_lim || match_first > right_lim)
768 		{
769 		  err = REG_NOMATCH;
770 		  goto free_return;
771 		}
772 	    }
773 	  break;
774 	}
775 
776       /* Reconstruct the buffers so that the matcher can assume that
777 	 the matching starts from the beginning of the buffer.  */
778       err = re_string_reconstruct (&mctx.input, match_first, eflags);
779       if (__glibc_unlikely (err != REG_NOERROR))
780 	goto free_return;
781 
782 #ifdef RE_ENABLE_I18N
783      /* Don't consider this char as a possible match start if it part,
784 	yet isn't the head, of a multibyte character.  */
785       if (!sb && !re_string_first_byte (&mctx.input, 0))
786 	continue;
787 #endif
788 
789       /* It seems to be appropriate one, then use the matcher.  */
790       /* We assume that the matching starts from 0.  */
791       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
792       match_last = check_matching (&mctx, fl_longest_match,
793 				   start <= last_start ? &match_first : NULL);
794       if (match_last != -1)
795 	{
796 	  if (__glibc_unlikely (match_last == -2))
797 	    {
798 	      err = REG_ESPACE;
799 	      goto free_return;
800 	    }
801 	  else
802 	    {
803 	      mctx.match_last = match_last;
804 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
805 		{
806 		  re_dfastate_t *pstate = mctx.state_log[match_last];
807 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
808 							     match_last);
809 		}
810 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
811 		  || dfa->nbackref)
812 		{
813 		  err = prune_impossible_nodes (&mctx);
814 		  if (err == REG_NOERROR)
815 		    break;
816 		  if (__glibc_unlikely (err != REG_NOMATCH))
817 		    goto free_return;
818 		  match_last = -1;
819 		}
820 	      else
821 		break; /* We found a match.  */
822 	    }
823 	}
824 
825       match_ctx_clean (&mctx);
826     }
827 
828   DEBUG_ASSERT (match_last != -1);
829   DEBUG_ASSERT (err == REG_NOERROR);
830 
831   /* Set pmatch[] if we need.  */
832   if (nmatch > 0)
833     {
834       Idx reg_idx;
835 
836       /* Initialize registers.  */
837       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
838 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
839 
840       /* Set the points where matching start/end.  */
841       pmatch[0].rm_so = 0;
842       pmatch[0].rm_eo = mctx.match_last;
843       /* FIXME: This function should fail if mctx.match_last exceeds
844 	 the maximum possible regoff_t value.  We need a new error
845 	 code REG_OVERFLOW.  */
846 
847       if (!preg->no_sub && nmatch > 1)
848 	{
849 	  err = set_regs (preg, &mctx, nmatch, pmatch,
850 			  dfa->has_plural_match && dfa->nbackref > 0);
851 	  if (__glibc_unlikely (err != REG_NOERROR))
852 	    goto free_return;
853 	}
854 
855       /* At last, add the offset to each register, since we slid
856 	 the buffers so that we could assume that the matching starts
857 	 from 0.  */
858       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
859 	if (pmatch[reg_idx].rm_so != -1)
860 	  {
861 #ifdef RE_ENABLE_I18N
862 	    if (__glibc_unlikely (mctx.input.offsets_needed != 0))
863 	      {
864 		pmatch[reg_idx].rm_so =
865 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
866 		   ? mctx.input.valid_raw_len
867 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
868 		pmatch[reg_idx].rm_eo =
869 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
870 		   ? mctx.input.valid_raw_len
871 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
872 	      }
873 #else
874 	    DEBUG_ASSERT (mctx.input.offsets_needed == 0);
875 #endif
876 	    pmatch[reg_idx].rm_so += match_first;
877 	    pmatch[reg_idx].rm_eo += match_first;
878 	  }
879       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
880 	{
881 	  pmatch[nmatch + reg_idx].rm_so = -1;
882 	  pmatch[nmatch + reg_idx].rm_eo = -1;
883 	}
884 
885       if (dfa->subexp_map)
886 	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
887 	  if (dfa->subexp_map[reg_idx] != reg_idx)
888 	    {
889 	      pmatch[reg_idx + 1].rm_so
890 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
891 	      pmatch[reg_idx + 1].rm_eo
892 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
893 	    }
894     }
895 
896  free_return:
897   re_free (mctx.state_log);
898   if (dfa->nbackref)
899     match_ctx_free (&mctx);
900   re_string_destruct (&mctx.input);
901   return err;
902 }
903 
904 static reg_errcode_t
905 __attribute_warn_unused_result__
prune_impossible_nodes(re_match_context_t * mctx)906 prune_impossible_nodes (re_match_context_t *mctx)
907 {
908   const re_dfa_t *const dfa = mctx->dfa;
909   Idx halt_node, match_last;
910   reg_errcode_t ret;
911   re_dfastate_t **sifted_states;
912   re_dfastate_t **lim_states = NULL;
913   re_sift_context_t sctx;
914   DEBUG_ASSERT (mctx->state_log != NULL);
915   match_last = mctx->match_last;
916   halt_node = mctx->last_node;
917 
918   /* Avoid overflow.  */
919   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
920 			<= match_last))
921     return REG_ESPACE;
922 
923   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
924   if (__glibc_unlikely (sifted_states == NULL))
925     {
926       ret = REG_ESPACE;
927       goto free_return;
928     }
929   if (dfa->nbackref)
930     {
931       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
932       if (__glibc_unlikely (lim_states == NULL))
933 	{
934 	  ret = REG_ESPACE;
935 	  goto free_return;
936 	}
937       while (1)
938 	{
939 	  memset (lim_states, '\0',
940 		  sizeof (re_dfastate_t *) * (match_last + 1));
941 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
942 			 match_last);
943 	  ret = sift_states_backward (mctx, &sctx);
944 	  re_node_set_free (&sctx.limits);
945 	  if (__glibc_unlikely (ret != REG_NOERROR))
946 	      goto free_return;
947 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
948 	    break;
949 	  do
950 	    {
951 	      --match_last;
952 	      if (match_last < 0)
953 		{
954 		  ret = REG_NOMATCH;
955 		  goto free_return;
956 		}
957 	    } while (mctx->state_log[match_last] == NULL
958 		     || !mctx->state_log[match_last]->halt);
959 	  halt_node = check_halt_state_context (mctx,
960 						mctx->state_log[match_last],
961 						match_last);
962 	}
963       ret = merge_state_array (dfa, sifted_states, lim_states,
964 			       match_last + 1);
965       re_free (lim_states);
966       lim_states = NULL;
967       if (__glibc_unlikely (ret != REG_NOERROR))
968 	goto free_return;
969     }
970   else
971     {
972       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
973       ret = sift_states_backward (mctx, &sctx);
974       re_node_set_free (&sctx.limits);
975       if (__glibc_unlikely (ret != REG_NOERROR))
976 	goto free_return;
977       if (sifted_states[0] == NULL)
978 	{
979 	  ret = REG_NOMATCH;
980 	  goto free_return;
981 	}
982     }
983   re_free (mctx->state_log);
984   mctx->state_log = sifted_states;
985   sifted_states = NULL;
986   mctx->last_node = halt_node;
987   mctx->match_last = match_last;
988   ret = REG_NOERROR;
989  free_return:
990   re_free (sifted_states);
991   re_free (lim_states);
992   return ret;
993 }
994 
995 /* Acquire an initial state and return it.
996    We must select appropriate initial state depending on the context,
997    since initial states may have constraints like "\<", "^", etc..  */
998 
999 static inline re_dfastate_t *
1000 __attribute__ ((always_inline))
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)1001 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1002 			    Idx idx)
1003 {
1004   const re_dfa_t *const dfa = mctx->dfa;
1005   if (dfa->init_state->has_constraint)
1006     {
1007       unsigned int context;
1008       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1009       if (IS_WORD_CONTEXT (context))
1010 	return dfa->init_state_word;
1011       else if (IS_ORDINARY_CONTEXT (context))
1012 	return dfa->init_state;
1013       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1014 	return dfa->init_state_begbuf;
1015       else if (IS_NEWLINE_CONTEXT (context))
1016 	return dfa->init_state_nl;
1017       else if (IS_BEGBUF_CONTEXT (context))
1018 	{
1019 	  /* It is relatively rare case, then calculate on demand.  */
1020 	  return re_acquire_state_context (err, dfa,
1021 					   dfa->init_state->entrance_nodes,
1022 					   context);
1023 	}
1024       else
1025 	/* Must not happen?  */
1026 	return dfa->init_state;
1027     }
1028   else
1029     return dfa->init_state;
1030 }
1031 
1032 /* Check whether the regular expression match input string INPUT or not,
1033    and return the index where the matching end.  Return -1 if
1034    there is no match, and return -2 in case of an error.
1035    FL_LONGEST_MATCH means we want the POSIX longest matching.
1036    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1037    next place where we may want to try matching.
1038    Note that the matcher assumes that the matching starts from the current
1039    index of the buffer.  */
1040 
1041 static Idx
1042 __attribute_warn_unused_result__
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)1043 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1044 		Idx *p_match_first)
1045 {
1046   const re_dfa_t *const dfa = mctx->dfa;
1047   reg_errcode_t err;
1048   Idx match = 0;
1049   Idx match_last = -1;
1050   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1051   re_dfastate_t *cur_state;
1052   bool at_init_state = p_match_first != NULL;
1053   Idx next_start_idx = cur_str_idx;
1054 
1055   err = REG_NOERROR;
1056   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1057   /* An initial state must not be NULL (invalid).  */
1058   if (__glibc_unlikely (cur_state == NULL))
1059     {
1060       DEBUG_ASSERT (err == REG_ESPACE);
1061       return -2;
1062     }
1063 
1064   if (mctx->state_log != NULL)
1065     {
1066       mctx->state_log[cur_str_idx] = cur_state;
1067 
1068       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1069 	 later.  E.g. Processing back references.  */
1070       if (__glibc_unlikely (dfa->nbackref))
1071 	{
1072 	  at_init_state = false;
1073 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1074 	  if (__glibc_unlikely (err != REG_NOERROR))
1075 	    return err;
1076 
1077 	  if (cur_state->has_backref)
1078 	    {
1079 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1080 	      if (__glibc_unlikely (err != REG_NOERROR))
1081 		return err;
1082 	    }
1083 	}
1084     }
1085 
1086   /* If the RE accepts NULL string.  */
1087   if (__glibc_unlikely (cur_state->halt))
1088     {
1089       if (!cur_state->has_constraint
1090 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1091 	{
1092 	  if (!fl_longest_match)
1093 	    return cur_str_idx;
1094 	  else
1095 	    {
1096 	      match_last = cur_str_idx;
1097 	      match = 1;
1098 	    }
1099 	}
1100     }
1101 
1102   while (!re_string_eoi (&mctx->input))
1103     {
1104       re_dfastate_t *old_state = cur_state;
1105       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1106 
1107       if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1108 	   && mctx->input.bufs_len < mctx->input.len)
1109 	  || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1110 	      && mctx->input.valid_len < mctx->input.len))
1111 	{
1112 	  err = extend_buffers (mctx, next_char_idx + 1);
1113 	  if (__glibc_unlikely (err != REG_NOERROR))
1114 	    {
1115 	      DEBUG_ASSERT (err == REG_ESPACE);
1116 	      return -2;
1117 	    }
1118 	}
1119 
1120       cur_state = transit_state (&err, mctx, cur_state);
1121       if (mctx->state_log != NULL)
1122 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1123 
1124       if (cur_state == NULL)
1125 	{
1126 	  /* Reached the invalid state or an error.  Try to recover a valid
1127 	     state using the state log, if available and if we have not
1128 	     already found a valid (even if not the longest) match.  */
1129 	  if (__glibc_unlikely (err != REG_NOERROR))
1130 	    return -2;
1131 
1132 	  if (mctx->state_log == NULL
1133 	      || (match && !fl_longest_match)
1134 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1135 	    break;
1136 	}
1137 
1138       if (__glibc_unlikely (at_init_state))
1139 	{
1140 	  if (old_state == cur_state)
1141 	    next_start_idx = next_char_idx;
1142 	  else
1143 	    at_init_state = false;
1144 	}
1145 
1146       if (cur_state->halt)
1147 	{
1148 	  /* Reached a halt state.
1149 	     Check the halt state can satisfy the current context.  */
1150 	  if (!cur_state->has_constraint
1151 	      || check_halt_state_context (mctx, cur_state,
1152 					   re_string_cur_idx (&mctx->input)))
1153 	    {
1154 	      /* We found an appropriate halt state.  */
1155 	      match_last = re_string_cur_idx (&mctx->input);
1156 	      match = 1;
1157 
1158 	      /* We found a match, do not modify match_first below.  */
1159 	      p_match_first = NULL;
1160 	      if (!fl_longest_match)
1161 		break;
1162 	    }
1163 	}
1164     }
1165 
1166   if (p_match_first)
1167     *p_match_first += next_start_idx;
1168 
1169   return match_last;
1170 }
1171 
1172 /* Check NODE match the current context.  */
1173 
1174 static bool
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1175 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1176 {
1177   re_token_type_t type = dfa->nodes[node].type;
1178   unsigned int constraint = dfa->nodes[node].constraint;
1179   if (type != END_OF_RE)
1180     return false;
1181   if (!constraint)
1182     return true;
1183   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1184     return false;
1185   return true;
1186 }
1187 
1188 /* Check the halt state STATE match the current context.
1189    Return 0 if not match, if the node, STATE has, is a halt node and
1190    match the context, return the node.  */
1191 
1192 static Idx
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)1193 check_halt_state_context (const re_match_context_t *mctx,
1194 			  const re_dfastate_t *state, Idx idx)
1195 {
1196   Idx i;
1197   unsigned int context;
1198   DEBUG_ASSERT (state->halt);
1199   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1200   for (i = 0; i < state->nodes.nelem; ++i)
1201     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1202       return state->nodes.elems[i];
1203   return 0;
1204 }
1205 
1206 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1207    corresponding to the DFA).
1208    Return the destination node, and update EPS_VIA_NODES;
1209    return -1 on match failure, -2 on error.  */
1210 
1211 static Idx
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,regmatch_t * prevregs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1212 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1213 		   regmatch_t *prevregs,
1214 		   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1215 		   struct re_fail_stack_t *fs)
1216 {
1217   const re_dfa_t *const dfa = mctx->dfa;
1218   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1219     {
1220       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1221       re_node_set *edests = &dfa->edests[node];
1222 
1223       if (! re_node_set_contains (eps_via_nodes, node))
1224         {
1225           bool ok = re_node_set_insert (eps_via_nodes, node);
1226           if (__glibc_unlikely (! ok))
1227             return -2;
1228         }
1229 
1230       /* Pick a valid destination, or return -1 if none is found.  */
1231       Idx dest_node = -1;
1232       for (Idx i = 0; i < edests->nelem; i++)
1233 	{
1234 	  Idx candidate = edests->elems[i];
1235 	  if (!re_node_set_contains (cur_nodes, candidate))
1236 	    continue;
1237           if (dest_node == -1)
1238 	    dest_node = candidate;
1239 
1240 	  else
1241 	    {
1242 	      /* In order to avoid infinite loop like "(a*)*", return the second
1243 		 epsilon-transition if the first was already considered.  */
1244 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1245 		return candidate;
1246 
1247 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1248 	      else if (fs != NULL
1249 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1250 					   prevregs, eps_via_nodes))
1251 		return -2;
1252 
1253 	      /* We know we are going to exit.  */
1254 	      break;
1255 	    }
1256 	}
1257       return dest_node;
1258     }
1259   else
1260     {
1261       Idx naccepted = 0;
1262       re_token_type_t type = dfa->nodes[node].type;
1263 
1264 #ifdef RE_ENABLE_I18N
1265       if (dfa->nodes[node].accept_mb)
1266 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1267       else
1268 #endif /* RE_ENABLE_I18N */
1269       if (type == OP_BACK_REF)
1270 	{
1271 	  Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1272 	  if (subexp_idx < nregs)
1273 	    naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1274 	  if (fs != NULL)
1275 	    {
1276 	      if (subexp_idx >= nregs
1277 		  || regs[subexp_idx].rm_so == -1
1278 		  || regs[subexp_idx].rm_eo == -1)
1279 		return -1;
1280 	      else if (naccepted)
1281 		{
1282 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1283 		  if (mctx->input.valid_len - *pidx < naccepted
1284 		      || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1285 				  naccepted)
1286 			  != 0))
1287 		    return -1;
1288 		}
1289 	    }
1290 
1291 	  if (naccepted == 0)
1292 	    {
1293 	      Idx dest_node;
1294 	      bool ok = re_node_set_insert (eps_via_nodes, node);
1295 	      if (__glibc_unlikely (! ok))
1296 		return -2;
1297 	      dest_node = dfa->edests[node].elems[0];
1298 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1299 					dest_node))
1300 		return dest_node;
1301 	    }
1302 	}
1303 
1304       if (naccepted != 0
1305 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1306 	{
1307 	  Idx dest_node = dfa->nexts[node];
1308 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1309 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1310 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1311 					       dest_node)))
1312 	    return -1;
1313 	  re_node_set_empty (eps_via_nodes);
1314 	  return dest_node;
1315 	}
1316     }
1317   return -1;
1318 }
1319 
1320 static reg_errcode_t
1321 __attribute_warn_unused_result__
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,regmatch_t * prevregs,re_node_set * eps_via_nodes)1322 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1323 		 Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1324 		 re_node_set *eps_via_nodes)
1325 {
1326   reg_errcode_t err;
1327   Idx num = fs->num++;
1328   if (fs->num == fs->alloc)
1329     {
1330       struct re_fail_stack_ent_t *new_array;
1331       new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1332                               fs->alloc * 2);
1333       if (new_array == NULL)
1334 	return REG_ESPACE;
1335       fs->alloc *= 2;
1336       fs->stack = new_array;
1337     }
1338   fs->stack[num].idx = str_idx;
1339   fs->stack[num].node = dest_node;
1340   fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1341   if (fs->stack[num].regs == NULL)
1342     return REG_ESPACE;
1343   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1344   memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1345   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1346   return err;
1347 }
1348 
1349 static Idx
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,regmatch_t * prevregs,re_node_set * eps_via_nodes)1350 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1351 		regmatch_t *regs, regmatch_t *prevregs,
1352 		re_node_set *eps_via_nodes)
1353 {
1354   if (fs == NULL || fs->num == 0)
1355     return -1;
1356   Idx num = --fs->num;
1357   *pidx = fs->stack[num].idx;
1358   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1359   memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1360   re_node_set_free (eps_via_nodes);
1361   re_free (fs->stack[num].regs);
1362   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1363   DEBUG_ASSERT (0 <= fs->stack[num].node);
1364   return fs->stack[num].node;
1365 }
1366 
1367 
1368 #define DYNARRAY_STRUCT  regmatch_list
1369 #define DYNARRAY_ELEMENT regmatch_t
1370 #define DYNARRAY_PREFIX  regmatch_list_
1371 #include <malloc/dynarray-skeleton.c>
1372 
1373 /* Set the positions where the subexpressions are starts/ends to registers
1374    PMATCH.
1375    Note: We assume that pmatch[0] is already set, and
1376    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1377 
1378 static reg_errcode_t
1379 __attribute_warn_unused_result__
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)1380 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1381 	  regmatch_t *pmatch, bool fl_backtrack)
1382 {
1383   const re_dfa_t *dfa = preg->buffer;
1384   Idx idx, cur_node;
1385   re_node_set eps_via_nodes;
1386   struct re_fail_stack_t *fs;
1387   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1388   struct regmatch_list prev_match;
1389   regmatch_list_init (&prev_match);
1390 
1391   DEBUG_ASSERT (nmatch > 1);
1392   DEBUG_ASSERT (mctx->state_log != NULL);
1393   if (fl_backtrack)
1394     {
1395       fs = &fs_body;
1396       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1397       if (fs->stack == NULL)
1398 	return REG_ESPACE;
1399     }
1400   else
1401     fs = NULL;
1402 
1403   cur_node = dfa->init_node;
1404   re_node_set_init_empty (&eps_via_nodes);
1405 
1406   if (!regmatch_list_resize (&prev_match, nmatch))
1407     {
1408       regmatch_list_free (&prev_match);
1409       free_fail_stack_return (fs);
1410       return REG_ESPACE;
1411     }
1412   regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1413   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1414 
1415   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1416     {
1417       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1418 
1419       if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1420 	  || (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
1421 	{
1422 	  Idx reg_idx;
1423 	  cur_node = -1;
1424 	  if (fs)
1425 	    {
1426 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1427 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1428 		  {
1429 		    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1430 					       prev_idx_match, &eps_via_nodes);
1431 		    break;
1432 		  }
1433 	    }
1434 	  if (cur_node < 0)
1435 	    {
1436 	      re_node_set_free (&eps_via_nodes);
1437 	      regmatch_list_free (&prev_match);
1438 	      return free_fail_stack_return (fs);
1439 	    }
1440 	}
1441 
1442       /* Proceed to next node.  */
1443       cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1444 				    &idx, cur_node,
1445 				    &eps_via_nodes, fs);
1446 
1447       if (__glibc_unlikely (cur_node < 0))
1448 	{
1449 	  if (__glibc_unlikely (cur_node == -2))
1450 	    {
1451 	      re_node_set_free (&eps_via_nodes);
1452 	      regmatch_list_free (&prev_match);
1453 	      free_fail_stack_return (fs);
1454 	      return REG_ESPACE;
1455 	    }
1456 	  cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1457 				     prev_idx_match, &eps_via_nodes);
1458 	  if (cur_node < 0)
1459 	    {
1460 	      re_node_set_free (&eps_via_nodes);
1461 	      regmatch_list_free (&prev_match);
1462 	      free_fail_stack_return (fs);
1463 	      return REG_NOMATCH;
1464 	    }
1465 	}
1466     }
1467   re_node_set_free (&eps_via_nodes);
1468   regmatch_list_free (&prev_match);
1469   return free_fail_stack_return (fs);
1470 }
1471 
1472 static reg_errcode_t
free_fail_stack_return(struct re_fail_stack_t * fs)1473 free_fail_stack_return (struct re_fail_stack_t *fs)
1474 {
1475   if (fs)
1476     {
1477       Idx fs_idx;
1478       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1479 	{
1480 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1481 	  re_free (fs->stack[fs_idx].regs);
1482 	}
1483       re_free (fs->stack);
1484     }
1485   return REG_NOERROR;
1486 }
1487 
1488 static void
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)1489 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1490 	     regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1491 {
1492   int type = dfa->nodes[cur_node].type;
1493   if (type == OP_OPEN_SUBEXP)
1494     {
1495       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1496 
1497       /* We are at the first node of this sub expression.  */
1498       if (reg_num < nmatch)
1499 	{
1500 	  pmatch[reg_num].rm_so = cur_idx;
1501 	  pmatch[reg_num].rm_eo = -1;
1502 	}
1503     }
1504   else if (type == OP_CLOSE_SUBEXP)
1505     {
1506       /* We are at the last node of this sub expression.  */
1507       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1508       if (reg_num < nmatch)
1509 	{
1510 	  if (pmatch[reg_num].rm_so < cur_idx)
1511 	    {
1512 	      pmatch[reg_num].rm_eo = cur_idx;
1513 	      /* This is a non-empty match or we are not inside an optional
1514 		 subexpression.  Accept this right away.  */
1515 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1516 	    }
1517 	  else
1518 	    {
1519 	      if (dfa->nodes[cur_node].opt_subexp
1520 		  && prev_idx_match[reg_num].rm_so != -1)
1521 		/* We transited through an empty match for an optional
1522 		   subexpression, like (a?)*, and this is not the subexp's
1523 		   first match.  Copy back the old content of the registers
1524 		   so that matches of an inner subexpression are undone as
1525 		   well, like in ((a?))*.  */
1526 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1527 	      else
1528 		/* We completed a subexpression, but it may be part of
1529 		   an optional one, so do not update PREV_IDX_MATCH.  */
1530 		pmatch[reg_num].rm_eo = cur_idx;
1531 	    }
1532 	}
1533     }
1534 }
1535 
1536 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1537    and sift the nodes in each states according to the following rules.
1538    Updated state_log will be wrote to STATE_LOG.
1539 
1540    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1541      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1542 	If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1543 	the LAST_NODE, we throw away the node 'a'.
1544      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1545 	string 's' and transit to 'b':
1546 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1547 	   away the node 'a'.
1548 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1549 	    thrown away, we throw away the node 'a'.
1550      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1551 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1552 	   node 'a'.
1553 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1554 	    we throw away the node 'a'.  */
1555 
1556 #define STATE_NODE_CONTAINS(state,node) \
1557   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1558 
1559 static reg_errcode_t
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1560 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1561 {
1562   reg_errcode_t err;
1563   int null_cnt = 0;
1564   Idx str_idx = sctx->last_str_idx;
1565   re_node_set cur_dest;
1566 
1567   DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1568 
1569   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1570      transit to the last_node and the last_node itself.  */
1571   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1572   if (__glibc_unlikely (err != REG_NOERROR))
1573     return err;
1574   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1575   if (__glibc_unlikely (err != REG_NOERROR))
1576     goto free_return;
1577 
1578   /* Then check each states in the state_log.  */
1579   while (str_idx > 0)
1580     {
1581       /* Update counters.  */
1582       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1583       if (null_cnt > mctx->max_mb_elem_len)
1584 	{
1585 	  memset (sctx->sifted_states, '\0',
1586 		  sizeof (re_dfastate_t *) * str_idx);
1587 	  re_node_set_free (&cur_dest);
1588 	  return REG_NOERROR;
1589 	}
1590       re_node_set_empty (&cur_dest);
1591       --str_idx;
1592 
1593       if (mctx->state_log[str_idx])
1594 	{
1595 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1596 	  if (__glibc_unlikely (err != REG_NOERROR))
1597 	    goto free_return;
1598 	}
1599 
1600       /* Add all the nodes which satisfy the following conditions:
1601 	 - It can epsilon transit to a node in CUR_DEST.
1602 	 - It is in CUR_SRC.
1603 	 And update state_log.  */
1604       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1605       if (__glibc_unlikely (err != REG_NOERROR))
1606 	goto free_return;
1607     }
1608   err = REG_NOERROR;
1609  free_return:
1610   re_node_set_free (&cur_dest);
1611   return err;
1612 }
1613 
1614 static reg_errcode_t
1615 __attribute_warn_unused_result__
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)1616 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1617 		     Idx str_idx, re_node_set *cur_dest)
1618 {
1619   const re_dfa_t *const dfa = mctx->dfa;
1620   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1621   Idx i;
1622 
1623   /* Then build the next sifted state.
1624      We build the next sifted state on 'cur_dest', and update
1625      'sifted_states[str_idx]' with 'cur_dest'.
1626      Note:
1627      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1628      'cur_src' points the node_set of the old 'state_log[str_idx]'
1629      (with the epsilon nodes pre-filtered out).  */
1630   for (i = 0; i < cur_src->nelem; i++)
1631     {
1632       Idx prev_node = cur_src->elems[i];
1633       int naccepted = 0;
1634       bool ok;
1635       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1636 
1637 #ifdef RE_ENABLE_I18N
1638       /* If the node may accept "multi byte".  */
1639       if (dfa->nodes[prev_node].accept_mb)
1640 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1641 					 str_idx, sctx->last_str_idx);
1642 #endif /* RE_ENABLE_I18N */
1643 
1644       /* We don't check backreferences here.
1645 	 See update_cur_sifted_state().  */
1646       if (!naccepted
1647 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1648 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1649 				  dfa->nexts[prev_node]))
1650 	naccepted = 1;
1651 
1652       if (naccepted == 0)
1653 	continue;
1654 
1655       if (sctx->limits.nelem)
1656 	{
1657 	  Idx to_idx = str_idx + naccepted;
1658 	  if (check_dst_limits (mctx, &sctx->limits,
1659 				dfa->nexts[prev_node], to_idx,
1660 				prev_node, str_idx))
1661 	    continue;
1662 	}
1663       ok = re_node_set_insert (cur_dest, prev_node);
1664       if (__glibc_unlikely (! ok))
1665 	return REG_ESPACE;
1666     }
1667 
1668   return REG_NOERROR;
1669 }
1670 
1671 /* Helper functions.  */
1672 
1673 static reg_errcode_t
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)1674 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1675 {
1676   Idx top = mctx->state_log_top;
1677 
1678   if ((next_state_log_idx >= mctx->input.bufs_len
1679        && mctx->input.bufs_len < mctx->input.len)
1680       || (next_state_log_idx >= mctx->input.valid_len
1681 	  && mctx->input.valid_len < mctx->input.len))
1682     {
1683       reg_errcode_t err;
1684       err = extend_buffers (mctx, next_state_log_idx + 1);
1685       if (__glibc_unlikely (err != REG_NOERROR))
1686 	return err;
1687     }
1688 
1689   if (top < next_state_log_idx)
1690     {
1691       memset (mctx->state_log + top + 1, '\0',
1692 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1693       mctx->state_log_top = next_state_log_idx;
1694     }
1695   return REG_NOERROR;
1696 }
1697 
1698 static reg_errcode_t
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)1699 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1700 		   re_dfastate_t **src, Idx num)
1701 {
1702   Idx st_idx;
1703   reg_errcode_t err;
1704   for (st_idx = 0; st_idx < num; ++st_idx)
1705     {
1706       if (dst[st_idx] == NULL)
1707 	dst[st_idx] = src[st_idx];
1708       else if (src[st_idx] != NULL)
1709 	{
1710 	  re_node_set merged_set;
1711 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1712 					&src[st_idx]->nodes);
1713 	  if (__glibc_unlikely (err != REG_NOERROR))
1714 	    return err;
1715 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1716 	  re_node_set_free (&merged_set);
1717 	  if (__glibc_unlikely (err != REG_NOERROR))
1718 	    return err;
1719 	}
1720     }
1721   return REG_NOERROR;
1722 }
1723 
1724 static reg_errcode_t
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)1725 update_cur_sifted_state (const re_match_context_t *mctx,
1726 			 re_sift_context_t *sctx, Idx str_idx,
1727 			 re_node_set *dest_nodes)
1728 {
1729   const re_dfa_t *const dfa = mctx->dfa;
1730   reg_errcode_t err = REG_NOERROR;
1731   const re_node_set *candidates;
1732   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1733 		: &mctx->state_log[str_idx]->nodes);
1734 
1735   if (dest_nodes->nelem == 0)
1736     sctx->sifted_states[str_idx] = NULL;
1737   else
1738     {
1739       if (candidates)
1740 	{
1741 	  /* At first, add the nodes which can epsilon transit to a node in
1742 	     DEST_NODE.  */
1743 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1744 	  if (__glibc_unlikely (err != REG_NOERROR))
1745 	    return err;
1746 
1747 	  /* Then, check the limitations in the current sift_context.  */
1748 	  if (sctx->limits.nelem)
1749 	    {
1750 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1751 					 mctx->bkref_ents, str_idx);
1752 	      if (__glibc_unlikely (err != REG_NOERROR))
1753 		return err;
1754 	    }
1755 	}
1756 
1757       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1758       if (__glibc_unlikely (err != REG_NOERROR))
1759 	return err;
1760     }
1761 
1762   if (candidates && mctx->state_log[str_idx]->has_backref)
1763     {
1764       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1765       if (__glibc_unlikely (err != REG_NOERROR))
1766 	return err;
1767     }
1768   return REG_NOERROR;
1769 }
1770 
1771 static reg_errcode_t
1772 __attribute_warn_unused_result__
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1773 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1774 		       const re_node_set *candidates)
1775 {
1776   reg_errcode_t err = REG_NOERROR;
1777   Idx i;
1778 
1779   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1780   if (__glibc_unlikely (err != REG_NOERROR))
1781     return err;
1782 
1783   if (!state->inveclosure.alloc)
1784     {
1785       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1786       if (__glibc_unlikely (err != REG_NOERROR))
1787 	return REG_ESPACE;
1788       for (i = 0; i < dest_nodes->nelem; i++)
1789 	{
1790 	  err = re_node_set_merge (&state->inveclosure,
1791 				   dfa->inveclosures + dest_nodes->elems[i]);
1792 	  if (__glibc_unlikely (err != REG_NOERROR))
1793 	    return REG_ESPACE;
1794 	}
1795     }
1796   return re_node_set_add_intersect (dest_nodes, candidates,
1797 				    &state->inveclosure);
1798 }
1799 
1800 static reg_errcode_t
sub_epsilon_src_nodes(const re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)1801 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1802 		       const re_node_set *candidates)
1803 {
1804     Idx ecl_idx;
1805     reg_errcode_t err;
1806     re_node_set *inv_eclosure = dfa->inveclosures + node;
1807     re_node_set except_nodes;
1808     re_node_set_init_empty (&except_nodes);
1809     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1810       {
1811 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1812 	if (cur_node == node)
1813 	  continue;
1814 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1815 	  {
1816 	    Idx edst1 = dfa->edests[cur_node].elems[0];
1817 	    Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1818 			 ? dfa->edests[cur_node].elems[1] : -1);
1819 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1820 		 && re_node_set_contains (dest_nodes, edst1))
1821 		|| (edst2 > 0
1822 		    && !re_node_set_contains (inv_eclosure, edst2)
1823 		    && re_node_set_contains (dest_nodes, edst2)))
1824 	      {
1825 		err = re_node_set_add_intersect (&except_nodes, candidates,
1826 						 dfa->inveclosures + cur_node);
1827 		if (__glibc_unlikely (err != REG_NOERROR))
1828 		  {
1829 		    re_node_set_free (&except_nodes);
1830 		    return err;
1831 		  }
1832 	      }
1833 	  }
1834       }
1835     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1836       {
1837 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1838 	if (!re_node_set_contains (&except_nodes, cur_node))
1839 	  {
1840 	    Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1841 	    re_node_set_remove_at (dest_nodes, idx);
1842 	  }
1843       }
1844     re_node_set_free (&except_nodes);
1845     return REG_NOERROR;
1846 }
1847 
1848 static bool
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)1849 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1850 		  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1851 {
1852   const re_dfa_t *const dfa = mctx->dfa;
1853   Idx lim_idx, src_pos, dst_pos;
1854 
1855   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1856   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1857   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1858     {
1859       Idx subexp_idx;
1860       struct re_backref_cache_entry *ent;
1861       ent = mctx->bkref_ents + limits->elems[lim_idx];
1862       subexp_idx = dfa->nodes[ent->node].opr.idx;
1863 
1864       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1865 					   subexp_idx, dst_node, dst_idx,
1866 					   dst_bkref_idx);
1867       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1868 					   subexp_idx, src_node, src_idx,
1869 					   src_bkref_idx);
1870 
1871       /* In case of:
1872 	 <src> <dst> ( <subexp> )
1873 	 ( <subexp> ) <src> <dst>
1874 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1875       if (src_pos == dst_pos)
1876 	continue; /* This is unrelated limitation.  */
1877       else
1878 	return true;
1879     }
1880   return false;
1881 }
1882 
1883 static int
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)1884 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1885 			     Idx subexp_idx, Idx from_node, Idx bkref_idx)
1886 {
1887   const re_dfa_t *const dfa = mctx->dfa;
1888   const re_node_set *eclosures = dfa->eclosures + from_node;
1889   Idx node_idx;
1890 
1891   /* Else, we are on the boundary: examine the nodes on the epsilon
1892      closure.  */
1893   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1894     {
1895       Idx node = eclosures->elems[node_idx];
1896       switch (dfa->nodes[node].type)
1897 	{
1898 	case OP_BACK_REF:
1899 	  if (bkref_idx != -1)
1900 	    {
1901 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1902 	      do
1903 		{
1904 		  Idx dst;
1905 		  int cpos;
1906 
1907 		  if (ent->node != node)
1908 		    continue;
1909 
1910 		  if (subexp_idx < BITSET_WORD_BITS
1911 		      && !(ent->eps_reachable_subexps_map
1912 			   & ((bitset_word_t) 1 << subexp_idx)))
1913 		    continue;
1914 
1915 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1916 		     OP_CLOSE_SUBEXP cases below.  But, if the
1917 		     destination node is the same node as the source
1918 		     node, don't recurse because it would cause an
1919 		     infinite loop: a regex that exhibits this behavior
1920 		     is ()\1*\1*  */
1921 		  dst = dfa->edests[node].elems[0];
1922 		  if (dst == from_node)
1923 		    {
1924 		      if (boundaries & 1)
1925 			return -1;
1926 		      else /* if (boundaries & 2) */
1927 			return 0;
1928 		    }
1929 
1930 		  cpos =
1931 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1932 						 dst, bkref_idx);
1933 		  if (cpos == -1 /* && (boundaries & 1) */)
1934 		    return -1;
1935 		  if (cpos == 0 && (boundaries & 2))
1936 		    return 0;
1937 
1938 		  if (subexp_idx < BITSET_WORD_BITS)
1939 		    ent->eps_reachable_subexps_map
1940 		      &= ~((bitset_word_t) 1 << subexp_idx);
1941 		}
1942 	      while (ent++->more);
1943 	    }
1944 	  break;
1945 
1946 	case OP_OPEN_SUBEXP:
1947 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1948 	    return -1;
1949 	  break;
1950 
1951 	case OP_CLOSE_SUBEXP:
1952 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1953 	    return 0;
1954 	  break;
1955 
1956 	default:
1957 	    break;
1958 	}
1959     }
1960 
1961   return (boundaries & 2) ? 1 : 0;
1962 }
1963 
1964 static int
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)1965 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1966 			   Idx subexp_idx, Idx from_node, Idx str_idx,
1967 			   Idx bkref_idx)
1968 {
1969   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1970   int boundaries;
1971 
1972   /* If we are outside the range of the subexpression, return -1 or 1.  */
1973   if (str_idx < lim->subexp_from)
1974     return -1;
1975 
1976   if (lim->subexp_to < str_idx)
1977     return 1;
1978 
1979   /* If we are within the subexpression, return 0.  */
1980   boundaries = (str_idx == lim->subexp_from);
1981   boundaries |= (str_idx == lim->subexp_to) << 1;
1982   if (boundaries == 0)
1983     return 0;
1984 
1985   /* Else, examine epsilon closure.  */
1986   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1987 				      from_node, bkref_idx);
1988 }
1989 
1990 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1991    which are against limitations from DEST_NODES. */
1992 
1993 static reg_errcode_t
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)1994 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1995 		     const re_node_set *candidates, re_node_set *limits,
1996 		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1997 {
1998   reg_errcode_t err;
1999   Idx node_idx, lim_idx;
2000 
2001   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2002     {
2003       Idx subexp_idx;
2004       struct re_backref_cache_entry *ent;
2005       ent = bkref_ents + limits->elems[lim_idx];
2006 
2007       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2008 	continue; /* This is unrelated limitation.  */
2009 
2010       subexp_idx = dfa->nodes[ent->node].opr.idx;
2011       if (ent->subexp_to == str_idx)
2012 	{
2013 	  Idx ops_node = -1;
2014 	  Idx cls_node = -1;
2015 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2016 	    {
2017 	      Idx node = dest_nodes->elems[node_idx];
2018 	      re_token_type_t type = dfa->nodes[node].type;
2019 	      if (type == OP_OPEN_SUBEXP
2020 		  && subexp_idx == dfa->nodes[node].opr.idx)
2021 		ops_node = node;
2022 	      else if (type == OP_CLOSE_SUBEXP
2023 		       && subexp_idx == dfa->nodes[node].opr.idx)
2024 		cls_node = node;
2025 	    }
2026 
2027 	  /* Check the limitation of the open subexpression.  */
2028 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2029 	  if (ops_node >= 0)
2030 	    {
2031 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2032 					   candidates);
2033 	      if (__glibc_unlikely (err != REG_NOERROR))
2034 		return err;
2035 	    }
2036 
2037 	  /* Check the limitation of the close subexpression.  */
2038 	  if (cls_node >= 0)
2039 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2040 	      {
2041 		Idx node = dest_nodes->elems[node_idx];
2042 		if (!re_node_set_contains (dfa->inveclosures + node,
2043 					   cls_node)
2044 		    && !re_node_set_contains (dfa->eclosures + node,
2045 					      cls_node))
2046 		  {
2047 		    /* It is against this limitation.
2048 		       Remove it form the current sifted state.  */
2049 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2050 						 candidates);
2051 		    if (__glibc_unlikely (err != REG_NOERROR))
2052 		      return err;
2053 		    --node_idx;
2054 		  }
2055 	      }
2056 	}
2057       else /* (ent->subexp_to != str_idx)  */
2058 	{
2059 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2060 	    {
2061 	      Idx node = dest_nodes->elems[node_idx];
2062 	      re_token_type_t type = dfa->nodes[node].type;
2063 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2064 		{
2065 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2066 		    continue;
2067 		  /* It is against this limitation.
2068 		     Remove it form the current sifted state.  */
2069 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2070 					       candidates);
2071 		  if (__glibc_unlikely (err != REG_NOERROR))
2072 		    return err;
2073 		}
2074 	    }
2075 	}
2076     }
2077   return REG_NOERROR;
2078 }
2079 
2080 static reg_errcode_t
2081 __attribute_warn_unused_result__
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)2082 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2083 		   Idx str_idx, const re_node_set *candidates)
2084 {
2085   const re_dfa_t *const dfa = mctx->dfa;
2086   reg_errcode_t err;
2087   Idx node_idx, node;
2088   re_sift_context_t local_sctx;
2089   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2090 
2091   if (first_idx == -1)
2092     return REG_NOERROR;
2093 
2094   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2095 
2096   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2097     {
2098       Idx enabled_idx;
2099       re_token_type_t type;
2100       struct re_backref_cache_entry *entry;
2101       node = candidates->elems[node_idx];
2102       type = dfa->nodes[node].type;
2103       /* Avoid infinite loop for the REs like "()\1+".  */
2104       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2105 	continue;
2106       if (type != OP_BACK_REF)
2107 	continue;
2108 
2109       entry = mctx->bkref_ents + first_idx;
2110       enabled_idx = first_idx;
2111       do
2112 	{
2113 	  Idx subexp_len;
2114 	  Idx to_idx;
2115 	  Idx dst_node;
2116 	  bool ok;
2117 	  re_dfastate_t *cur_state;
2118 
2119 	  if (entry->node != node)
2120 	    continue;
2121 	  subexp_len = entry->subexp_to - entry->subexp_from;
2122 	  to_idx = str_idx + subexp_len;
2123 	  dst_node = (subexp_len ? dfa->nexts[node]
2124 		      : dfa->edests[node].elems[0]);
2125 
2126 	  if (to_idx > sctx->last_str_idx
2127 	      || sctx->sifted_states[to_idx] == NULL
2128 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2129 	      || check_dst_limits (mctx, &sctx->limits, node,
2130 				   str_idx, dst_node, to_idx))
2131 	    continue;
2132 
2133 	  if (local_sctx.sifted_states == NULL)
2134 	    {
2135 	      local_sctx = *sctx;
2136 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2137 	      if (__glibc_unlikely (err != REG_NOERROR))
2138 		goto free_return;
2139 	    }
2140 	  local_sctx.last_node = node;
2141 	  local_sctx.last_str_idx = str_idx;
2142 	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2143 	  if (__glibc_unlikely (! ok))
2144 	    {
2145 	      err = REG_ESPACE;
2146 	      goto free_return;
2147 	    }
2148 	  cur_state = local_sctx.sifted_states[str_idx];
2149 	  err = sift_states_backward (mctx, &local_sctx);
2150 	  if (__glibc_unlikely (err != REG_NOERROR))
2151 	    goto free_return;
2152 	  if (sctx->limited_states != NULL)
2153 	    {
2154 	      err = merge_state_array (dfa, sctx->limited_states,
2155 				       local_sctx.sifted_states,
2156 				       str_idx + 1);
2157 	      if (__glibc_unlikely (err != REG_NOERROR))
2158 		goto free_return;
2159 	    }
2160 	  local_sctx.sifted_states[str_idx] = cur_state;
2161 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2162 
2163 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2164 	  entry = mctx->bkref_ents + enabled_idx;
2165 	}
2166       while (enabled_idx++, entry++->more);
2167     }
2168   err = REG_NOERROR;
2169  free_return:
2170   if (local_sctx.sifted_states != NULL)
2171     {
2172       re_node_set_free (&local_sctx.limits);
2173     }
2174 
2175   return err;
2176 }
2177 
2178 
2179 #ifdef RE_ENABLE_I18N
2180 static int
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2181 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2182 		     Idx node_idx, Idx str_idx, Idx max_str_idx)
2183 {
2184   const re_dfa_t *const dfa = mctx->dfa;
2185   int naccepted;
2186   /* Check the node can accept "multi byte".  */
2187   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2188   if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2189       && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2190 			       dfa->nexts[node_idx]))
2191     /* The node can't accept the "multi byte", or the
2192        destination was already thrown away, then the node
2193        couldn't accept the current input "multi byte".   */
2194     naccepted = 0;
2195   /* Otherwise, it is sure that the node could accept
2196      'naccepted' bytes input.  */
2197   return naccepted;
2198 }
2199 #endif /* RE_ENABLE_I18N */
2200 
2201 
2202 /* Functions for state transition.  */
2203 
2204 /* Return the next state to which the current state STATE will transit by
2205    accepting the current input byte, and update STATE_LOG if necessary.
2206    Return NULL on failure.
2207    If STATE can accept a multibyte char/collating element/back reference
2208    update the destination of STATE_LOG.  */
2209 
2210 static re_dfastate_t *
2211 __attribute_warn_unused_result__
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2212 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2213 	       re_dfastate_t *state)
2214 {
2215   re_dfastate_t **trtable;
2216   unsigned char ch;
2217 
2218 #ifdef RE_ENABLE_I18N
2219   /* If the current state can accept multibyte.  */
2220   if (__glibc_unlikely (state->accept_mb))
2221     {
2222       *err = transit_state_mb (mctx, state);
2223       if (__glibc_unlikely (*err != REG_NOERROR))
2224 	return NULL;
2225     }
2226 #endif /* RE_ENABLE_I18N */
2227 
2228   /* Then decide the next state with the single byte.  */
2229 #if 0
2230   if (0)
2231     /* don't use transition table  */
2232     return transit_state_sb (err, mctx, state);
2233 #endif
2234 
2235   /* Use transition table  */
2236   ch = re_string_fetch_byte (&mctx->input);
2237   for (;;)
2238     {
2239       trtable = state->trtable;
2240       if (__glibc_likely (trtable != NULL))
2241 	return trtable[ch];
2242 
2243       trtable = state->word_trtable;
2244       if (__glibc_likely (trtable != NULL))
2245 	{
2246 	  unsigned int context;
2247 	  context
2248 	    = re_string_context_at (&mctx->input,
2249 				    re_string_cur_idx (&mctx->input) - 1,
2250 				    mctx->eflags);
2251 	  if (IS_WORD_CONTEXT (context))
2252 	    return trtable[ch + SBC_MAX];
2253 	  else
2254 	    return trtable[ch];
2255 	}
2256 
2257       if (!build_trtable (mctx->dfa, state))
2258 	{
2259 	  *err = REG_ESPACE;
2260 	  return NULL;
2261 	}
2262 
2263       /* Retry, we now have a transition table.  */
2264     }
2265 }
2266 
2267 /* Update the state_log if we need */
2268 static re_dfastate_t *
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2269 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2270 		      re_dfastate_t *next_state)
2271 {
2272   const re_dfa_t *const dfa = mctx->dfa;
2273   Idx cur_idx = re_string_cur_idx (&mctx->input);
2274 
2275   if (cur_idx > mctx->state_log_top)
2276     {
2277       mctx->state_log[cur_idx] = next_state;
2278       mctx->state_log_top = cur_idx;
2279     }
2280   else if (mctx->state_log[cur_idx] == 0)
2281     {
2282       mctx->state_log[cur_idx] = next_state;
2283     }
2284   else
2285     {
2286       re_dfastate_t *pstate;
2287       unsigned int context;
2288       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2289       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2290 	 the destination of a multibyte char/collating element/
2291 	 back reference.  Then the next state is the union set of
2292 	 these destinations and the results of the transition table.  */
2293       pstate = mctx->state_log[cur_idx];
2294       log_nodes = pstate->entrance_nodes;
2295       if (next_state != NULL)
2296 	{
2297 	  table_nodes = next_state->entrance_nodes;
2298 	  *err = re_node_set_init_union (&next_nodes, table_nodes,
2299 					     log_nodes);
2300 	  if (__glibc_unlikely (*err != REG_NOERROR))
2301 	    return NULL;
2302 	}
2303       else
2304 	next_nodes = *log_nodes;
2305       /* Note: We already add the nodes of the initial state,
2306 	 then we don't need to add them here.  */
2307 
2308       context = re_string_context_at (&mctx->input,
2309 				      re_string_cur_idx (&mctx->input) - 1,
2310 				      mctx->eflags);
2311       next_state = mctx->state_log[cur_idx]
2312 	= re_acquire_state_context (err, dfa, &next_nodes, context);
2313       /* We don't need to check errors here, since the return value of
2314 	 this function is next_state and ERR is already set.  */
2315 
2316       if (table_nodes != NULL)
2317 	re_node_set_free (&next_nodes);
2318     }
2319 
2320   if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2321     {
2322       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2323 	 later.  We must check them here, since the back references in the
2324 	 next state might use them.  */
2325       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2326 					cur_idx);
2327       if (__glibc_unlikely (*err != REG_NOERROR))
2328 	return NULL;
2329 
2330       /* If the next state has back references.  */
2331       if (next_state->has_backref)
2332 	{
2333 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2334 	  if (__glibc_unlikely (*err != REG_NOERROR))
2335 	    return NULL;
2336 	  next_state = mctx->state_log[cur_idx];
2337 	}
2338     }
2339 
2340   return next_state;
2341 }
2342 
2343 /* Skip bytes in the input that correspond to part of a
2344    multi-byte match, then look in the log for a state
2345    from which to restart matching.  */
2346 static re_dfastate_t *
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2347 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2348 {
2349   re_dfastate_t *cur_state;
2350   do
2351     {
2352       Idx max = mctx->state_log_top;
2353       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2354 
2355       do
2356 	{
2357 	  if (++cur_str_idx > max)
2358 	    return NULL;
2359 	  re_string_skip_bytes (&mctx->input, 1);
2360 	}
2361       while (mctx->state_log[cur_str_idx] == NULL);
2362 
2363       cur_state = merge_state_with_log (err, mctx, NULL);
2364     }
2365   while (*err == REG_NOERROR && cur_state == NULL);
2366   return cur_state;
2367 }
2368 
2369 /* Helper functions for transit_state.  */
2370 
2371 /* From the node set CUR_NODES, pick up the nodes whose types are
2372    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2373    expression. And register them to use them later for evaluating the
2374    corresponding back references.  */
2375 
2376 static reg_errcode_t
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2377 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2378 			   Idx str_idx)
2379 {
2380   const re_dfa_t *const dfa = mctx->dfa;
2381   Idx node_idx;
2382   reg_errcode_t err;
2383 
2384   /* TODO: This isn't efficient.
2385 	   Because there might be more than one nodes whose types are
2386 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2387 	   nodes.
2388 	   E.g. RE: (a){2}  */
2389   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2390     {
2391       Idx node = cur_nodes->elems[node_idx];
2392       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2393 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2394 	  && (dfa->used_bkref_map
2395 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2396 	{
2397 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2398 	  if (__glibc_unlikely (err != REG_NOERROR))
2399 	    return err;
2400 	}
2401     }
2402   return REG_NOERROR;
2403 }
2404 
2405 #if 0
2406 /* Return the next state to which the current state STATE will transit by
2407    accepting the current input byte.  Return NULL on failure.  */
2408 
2409 static re_dfastate_t *
2410 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2411 		  re_dfastate_t *state)
2412 {
2413   const re_dfa_t *const dfa = mctx->dfa;
2414   re_node_set next_nodes;
2415   re_dfastate_t *next_state;
2416   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2417   unsigned int context;
2418 
2419   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2420   if (__glibc_unlikely (*err != REG_NOERROR))
2421     return NULL;
2422   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2423     {
2424       Idx cur_node = state->nodes.elems[node_cnt];
2425       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2426 	{
2427 	  *err = re_node_set_merge (&next_nodes,
2428 				    dfa->eclosures + dfa->nexts[cur_node]);
2429 	  if (__glibc_unlikely (*err != REG_NOERROR))
2430 	    {
2431 	      re_node_set_free (&next_nodes);
2432 	      return NULL;
2433 	    }
2434 	}
2435     }
2436   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2437   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2438   /* We don't need to check errors here, since the return value of
2439      this function is next_state and ERR is already set.  */
2440 
2441   re_node_set_free (&next_nodes);
2442   re_string_skip_bytes (&mctx->input, 1);
2443   return next_state;
2444 }
2445 #endif
2446 
2447 #ifdef RE_ENABLE_I18N
2448 static reg_errcode_t
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2449 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2450 {
2451   const re_dfa_t *const dfa = mctx->dfa;
2452   reg_errcode_t err;
2453   Idx i;
2454 
2455   for (i = 0; i < pstate->nodes.nelem; ++i)
2456     {
2457       re_node_set dest_nodes, *new_nodes;
2458       Idx cur_node_idx = pstate->nodes.elems[i];
2459       int naccepted;
2460       Idx dest_idx;
2461       unsigned int context;
2462       re_dfastate_t *dest_state;
2463 
2464       if (!dfa->nodes[cur_node_idx].accept_mb)
2465 	continue;
2466 
2467       if (dfa->nodes[cur_node_idx].constraint)
2468 	{
2469 	  context = re_string_context_at (&mctx->input,
2470 					  re_string_cur_idx (&mctx->input),
2471 					  mctx->eflags);
2472 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2473 					   context))
2474 	    continue;
2475 	}
2476 
2477       /* How many bytes the node can accept?  */
2478       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2479 					   re_string_cur_idx (&mctx->input));
2480       if (naccepted == 0)
2481 	continue;
2482 
2483       /* The node can accepts 'naccepted' bytes.  */
2484       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2485       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2486 			       : mctx->max_mb_elem_len);
2487       err = clean_state_log_if_needed (mctx, dest_idx);
2488       if (__glibc_unlikely (err != REG_NOERROR))
2489 	return err;
2490       DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2491       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2492 
2493       dest_state = mctx->state_log[dest_idx];
2494       if (dest_state == NULL)
2495 	dest_nodes = *new_nodes;
2496       else
2497 	{
2498 	  err = re_node_set_init_union (&dest_nodes,
2499 					dest_state->entrance_nodes, new_nodes);
2500 	  if (__glibc_unlikely (err != REG_NOERROR))
2501 	    return err;
2502 	}
2503       context = re_string_context_at (&mctx->input, dest_idx - 1,
2504 				      mctx->eflags);
2505       mctx->state_log[dest_idx]
2506 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2507       if (dest_state != NULL)
2508 	re_node_set_free (&dest_nodes);
2509       if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2510 			    && err != REG_NOERROR))
2511 	return err;
2512     }
2513   return REG_NOERROR;
2514 }
2515 #endif /* RE_ENABLE_I18N */
2516 
2517 static reg_errcode_t
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2518 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2519 {
2520   const re_dfa_t *const dfa = mctx->dfa;
2521   reg_errcode_t err;
2522   Idx i;
2523   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2524 
2525   for (i = 0; i < nodes->nelem; ++i)
2526     {
2527       Idx dest_str_idx, prev_nelem, bkc_idx;
2528       Idx node_idx = nodes->elems[i];
2529       unsigned int context;
2530       const re_token_t *node = dfa->nodes + node_idx;
2531       re_node_set *new_dest_nodes;
2532 
2533       /* Check whether 'node' is a backreference or not.  */
2534       if (node->type != OP_BACK_REF)
2535 	continue;
2536 
2537       if (node->constraint)
2538 	{
2539 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2540 					  mctx->eflags);
2541 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2542 	    continue;
2543 	}
2544 
2545       /* 'node' is a backreference.
2546 	 Check the substring which the substring matched.  */
2547       bkc_idx = mctx->nbkref_ents;
2548       err = get_subexp (mctx, node_idx, cur_str_idx);
2549       if (__glibc_unlikely (err != REG_NOERROR))
2550 	goto free_return;
2551 
2552       /* And add the epsilon closures (which is 'new_dest_nodes') of
2553 	 the backreference to appropriate state_log.  */
2554       DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2555       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2556 	{
2557 	  Idx subexp_len;
2558 	  re_dfastate_t *dest_state;
2559 	  struct re_backref_cache_entry *bkref_ent;
2560 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2561 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2562 	    continue;
2563 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2564 	  new_dest_nodes = (subexp_len == 0
2565 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2566 			    : dfa->eclosures + dfa->nexts[node_idx]);
2567 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2568 			  - bkref_ent->subexp_from);
2569 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2570 					  mctx->eflags);
2571 	  dest_state = mctx->state_log[dest_str_idx];
2572 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2573 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2574 	  /* Add 'new_dest_node' to state_log.  */
2575 	  if (dest_state == NULL)
2576 	    {
2577 	      mctx->state_log[dest_str_idx]
2578 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2579 					    context);
2580 	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2581 				    && err != REG_NOERROR))
2582 		goto free_return;
2583 	    }
2584 	  else
2585 	    {
2586 	      re_node_set dest_nodes;
2587 	      err = re_node_set_init_union (&dest_nodes,
2588 					    dest_state->entrance_nodes,
2589 					    new_dest_nodes);
2590 	      if (__glibc_unlikely (err != REG_NOERROR))
2591 		{
2592 		  re_node_set_free (&dest_nodes);
2593 		  goto free_return;
2594 		}
2595 	      mctx->state_log[dest_str_idx]
2596 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2597 	      re_node_set_free (&dest_nodes);
2598 	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2599 				    && err != REG_NOERROR))
2600 		goto free_return;
2601 	    }
2602 	  /* We need to check recursively if the backreference can epsilon
2603 	     transit.  */
2604 	  if (subexp_len == 0
2605 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2606 	    {
2607 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2608 					       cur_str_idx);
2609 	      if (__glibc_unlikely (err != REG_NOERROR))
2610 		goto free_return;
2611 	      err = transit_state_bkref (mctx, new_dest_nodes);
2612 	      if (__glibc_unlikely (err != REG_NOERROR))
2613 		goto free_return;
2614 	    }
2615 	}
2616     }
2617   err = REG_NOERROR;
2618  free_return:
2619   return err;
2620 }
2621 
2622 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2623    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2624    Note that we might collect inappropriate candidates here.
2625    However, the cost of checking them strictly here is too high, then we
2626    delay these checking for prune_impossible_nodes().  */
2627 
2628 static reg_errcode_t
2629 __attribute_warn_unused_result__
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)2630 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2631 {
2632   const re_dfa_t *const dfa = mctx->dfa;
2633   Idx subexp_num, sub_top_idx;
2634   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2635   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2636   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2637   if (cache_idx != -1)
2638     {
2639       const struct re_backref_cache_entry *entry
2640 	= mctx->bkref_ents + cache_idx;
2641       do
2642 	if (entry->node == bkref_node)
2643 	  return REG_NOERROR; /* We already checked it.  */
2644       while (entry++->more);
2645     }
2646 
2647   subexp_num = dfa->nodes[bkref_node].opr.idx;
2648 
2649   /* For each sub expression  */
2650   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2651     {
2652       reg_errcode_t err;
2653       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2654       re_sub_match_last_t *sub_last;
2655       Idx sub_last_idx, sl_str, bkref_str_off;
2656 
2657       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2658 	continue; /* It isn't related.  */
2659 
2660       sl_str = sub_top->str_idx;
2661       bkref_str_off = bkref_str_idx;
2662       /* At first, check the last node of sub expressions we already
2663 	 evaluated.  */
2664       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2665 	{
2666 	  regoff_t sl_str_diff;
2667 	  sub_last = sub_top->lasts[sub_last_idx];
2668 	  sl_str_diff = sub_last->str_idx - sl_str;
2669 	  /* The matched string by the sub expression match with the substring
2670 	     at the back reference?  */
2671 	  if (sl_str_diff > 0)
2672 	    {
2673 	      if (__glibc_unlikely (bkref_str_off + sl_str_diff
2674 				    > mctx->input.valid_len))
2675 		{
2676 		  /* Not enough chars for a successful match.  */
2677 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2678 		    break;
2679 
2680 		  err = clean_state_log_if_needed (mctx,
2681 						   bkref_str_off
2682 						   + sl_str_diff);
2683 		  if (__glibc_unlikely (err != REG_NOERROR))
2684 		    return err;
2685 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2686 		}
2687 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2688 		/* We don't need to search this sub expression any more.  */
2689 		break;
2690 	    }
2691 	  bkref_str_off += sl_str_diff;
2692 	  sl_str += sl_str_diff;
2693 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2694 				bkref_str_idx);
2695 
2696 	  /* Reload buf, since the preceding call might have reallocated
2697 	     the buffer.  */
2698 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2699 
2700 	  if (err == REG_NOMATCH)
2701 	    continue;
2702 	  if (__glibc_unlikely (err != REG_NOERROR))
2703 	    return err;
2704 	}
2705 
2706       if (sub_last_idx < sub_top->nlasts)
2707 	continue;
2708       if (sub_last_idx > 0)
2709 	++sl_str;
2710       /* Then, search for the other last nodes of the sub expression.  */
2711       for (; sl_str <= bkref_str_idx; ++sl_str)
2712 	{
2713 	  Idx cls_node;
2714 	  regoff_t sl_str_off;
2715 	  const re_node_set *nodes;
2716 	  sl_str_off = sl_str - sub_top->str_idx;
2717 	  /* The matched string by the sub expression match with the substring
2718 	     at the back reference?  */
2719 	  if (sl_str_off > 0)
2720 	    {
2721 	      if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2722 		{
2723 		  /* If we are at the end of the input, we cannot match.  */
2724 		  if (bkref_str_off >= mctx->input.len)
2725 		    break;
2726 
2727 		  err = extend_buffers (mctx, bkref_str_off + 1);
2728 		  if (__glibc_unlikely (err != REG_NOERROR))
2729 		    return err;
2730 
2731 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2732 		}
2733 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2734 		break; /* We don't need to search this sub expression
2735 			  any more.  */
2736 	    }
2737 	  if (mctx->state_log[sl_str] == NULL)
2738 	    continue;
2739 	  /* Does this state have a ')' of the sub expression?  */
2740 	  nodes = &mctx->state_log[sl_str]->nodes;
2741 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2742 				       OP_CLOSE_SUBEXP);
2743 	  if (cls_node == -1)
2744 	    continue; /* No.  */
2745 	  if (sub_top->path == NULL)
2746 	    {
2747 	      sub_top->path = calloc (sizeof (state_array_t),
2748 				      sl_str - sub_top->str_idx + 1);
2749 	      if (sub_top->path == NULL)
2750 		return REG_ESPACE;
2751 	    }
2752 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2753 	     in the current context?  */
2754 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2755 			       sub_top->str_idx, cls_node, sl_str,
2756 			       OP_CLOSE_SUBEXP);
2757 	  if (err == REG_NOMATCH)
2758 	      continue;
2759 	  if (__glibc_unlikely (err != REG_NOERROR))
2760 	      return err;
2761 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2762 	  if (__glibc_unlikely (sub_last == NULL))
2763 	    return REG_ESPACE;
2764 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2765 				bkref_str_idx);
2766 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2767 	  if (err == REG_NOMATCH)
2768 	    continue;
2769 	  if (__glibc_unlikely (err != REG_NOERROR))
2770 	    return err;
2771 	}
2772     }
2773   return REG_NOERROR;
2774 }
2775 
2776 /* Helper functions for get_subexp().  */
2777 
2778 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2779    If it can arrive, register the sub expression expressed with SUB_TOP
2780    and SUB_LAST.  */
2781 
2782 static reg_errcode_t
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2783 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2784 		re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2785 {
2786   reg_errcode_t err;
2787   Idx to_idx;
2788   /* Can the subexpression arrive the back reference?  */
2789   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2790 		       sub_last->str_idx, bkref_node, bkref_str,
2791 		       OP_OPEN_SUBEXP);
2792   if (err != REG_NOERROR)
2793     return err;
2794   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2795 			     sub_last->str_idx);
2796   if (__glibc_unlikely (err != REG_NOERROR))
2797     return err;
2798   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2799   return clean_state_log_if_needed (mctx, to_idx);
2800 }
2801 
2802 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2803    Search '(' if FL_OPEN, or search ')' otherwise.
2804    TODO: This function isn't efficient...
2805 	 Because there might be more than one nodes whose types are
2806 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2807 	 nodes.
2808 	 E.g. RE: (a){2}  */
2809 
2810 static Idx
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2811 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2812 		  Idx subexp_idx, int type)
2813 {
2814   Idx cls_idx;
2815   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2816     {
2817       Idx cls_node = nodes->elems[cls_idx];
2818       const re_token_t *node = dfa->nodes + cls_node;
2819       if (node->type == type
2820 	  && node->opr.idx == subexp_idx)
2821 	return cls_node;
2822     }
2823   return -1;
2824 }
2825 
2826 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2827    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2828    heavily reused.
2829    Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2830    REG_ESPACE if memory is exhausted.  */
2831 
2832 static reg_errcode_t
2833 __attribute_warn_unused_result__
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)2834 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2835 	       Idx top_str, Idx last_node, Idx last_str, int type)
2836 {
2837   const re_dfa_t *const dfa = mctx->dfa;
2838   reg_errcode_t err = REG_NOERROR;
2839   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2840   re_dfastate_t *cur_state = NULL;
2841   re_node_set *cur_nodes, next_nodes;
2842   re_dfastate_t **backup_state_log;
2843   unsigned int context;
2844 
2845   subexp_num = dfa->nodes[top_node].opr.idx;
2846   /* Extend the buffer if we need.  */
2847   if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2848     {
2849       re_dfastate_t **new_array;
2850       Idx old_alloc = path->alloc;
2851       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2852       Idx new_alloc;
2853       if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2854 	return REG_ESPACE;
2855       new_alloc = old_alloc + incr_alloc;
2856       if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2857 	return REG_ESPACE;
2858       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2859       if (__glibc_unlikely (new_array == NULL))
2860 	return REG_ESPACE;
2861       path->array = new_array;
2862       path->alloc = new_alloc;
2863       memset (new_array + old_alloc, '\0',
2864 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2865     }
2866 
2867   str_idx = path->next_idx ? path->next_idx : top_str;
2868 
2869   /* Temporary modify MCTX.  */
2870   backup_state_log = mctx->state_log;
2871   backup_cur_idx = mctx->input.cur_idx;
2872   mctx->state_log = path->array;
2873   mctx->input.cur_idx = str_idx;
2874 
2875   /* Setup initial node set.  */
2876   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2877   if (str_idx == top_str)
2878     {
2879       err = re_node_set_init_1 (&next_nodes, top_node);
2880       if (__glibc_unlikely (err != REG_NOERROR))
2881 	return err;
2882       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2883       if (__glibc_unlikely (err != REG_NOERROR))
2884 	{
2885 	  re_node_set_free (&next_nodes);
2886 	  return err;
2887 	}
2888     }
2889   else
2890     {
2891       cur_state = mctx->state_log[str_idx];
2892       if (cur_state && cur_state->has_backref)
2893 	{
2894 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2895 	  if (__glibc_unlikely (err != REG_NOERROR))
2896 	    return err;
2897 	}
2898       else
2899 	re_node_set_init_empty (&next_nodes);
2900     }
2901   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2902     {
2903       if (next_nodes.nelem)
2904 	{
2905 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2906 				    subexp_num, type);
2907 	  if (__glibc_unlikely (err != REG_NOERROR))
2908 	    {
2909 	      re_node_set_free (&next_nodes);
2910 	      return err;
2911 	    }
2912 	}
2913       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2914       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2915 	{
2916 	  re_node_set_free (&next_nodes);
2917 	  return err;
2918 	}
2919       mctx->state_log[str_idx] = cur_state;
2920     }
2921 
2922   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2923     {
2924       re_node_set_empty (&next_nodes);
2925       if (mctx->state_log[str_idx + 1])
2926 	{
2927 	  err = re_node_set_merge (&next_nodes,
2928 				   &mctx->state_log[str_idx + 1]->nodes);
2929 	  if (__glibc_unlikely (err != REG_NOERROR))
2930 	    {
2931 	      re_node_set_free (&next_nodes);
2932 	      return err;
2933 	    }
2934 	}
2935       if (cur_state)
2936 	{
2937 	  err = check_arrival_add_next_nodes (mctx, str_idx,
2938 					      &cur_state->non_eps_nodes,
2939 					      &next_nodes);
2940 	  if (__glibc_unlikely (err != REG_NOERROR))
2941 	    {
2942 	      re_node_set_free (&next_nodes);
2943 	      return err;
2944 	    }
2945 	}
2946       ++str_idx;
2947       if (next_nodes.nelem)
2948 	{
2949 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2950 	  if (__glibc_unlikely (err != REG_NOERROR))
2951 	    {
2952 	      re_node_set_free (&next_nodes);
2953 	      return err;
2954 	    }
2955 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2956 				    subexp_num, type);
2957 	  if (__glibc_unlikely (err != REG_NOERROR))
2958 	    {
2959 	      re_node_set_free (&next_nodes);
2960 	      return err;
2961 	    }
2962 	}
2963       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2964       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2965       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2966 	{
2967 	  re_node_set_free (&next_nodes);
2968 	  return err;
2969 	}
2970       mctx->state_log[str_idx] = cur_state;
2971       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2972     }
2973   re_node_set_free (&next_nodes);
2974   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2975 	       : &mctx->state_log[last_str]->nodes);
2976   path->next_idx = str_idx;
2977 
2978   /* Fix MCTX.  */
2979   mctx->state_log = backup_state_log;
2980   mctx->input.cur_idx = backup_cur_idx;
2981 
2982   /* Then check the current node set has the node LAST_NODE.  */
2983   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2984     return REG_NOERROR;
2985 
2986   return REG_NOMATCH;
2987 }
2988 
2989 /* Helper functions for check_arrival.  */
2990 
2991 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2992    to NEXT_NODES.
2993    TODO: This function is similar to the functions transit_state*(),
2994 	 however this function has many additional works.
2995 	 Can't we unify them?  */
2996 
2997 static reg_errcode_t
2998 __attribute_warn_unused_result__
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)2999 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3000 			      re_node_set *cur_nodes, re_node_set *next_nodes)
3001 {
3002   const re_dfa_t *const dfa = mctx->dfa;
3003   bool ok;
3004   Idx cur_idx;
3005 #ifdef RE_ENABLE_I18N
3006   reg_errcode_t err = REG_NOERROR;
3007 #endif
3008   re_node_set union_set;
3009   re_node_set_init_empty (&union_set);
3010   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3011     {
3012       int naccepted = 0;
3013       Idx cur_node = cur_nodes->elems[cur_idx];
3014       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
3015 
3016 #ifdef RE_ENABLE_I18N
3017       /* If the node may accept "multi byte".  */
3018       if (dfa->nodes[cur_node].accept_mb)
3019 	{
3020 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3021 					       str_idx);
3022 	  if (naccepted > 1)
3023 	    {
3024 	      re_dfastate_t *dest_state;
3025 	      Idx next_node = dfa->nexts[cur_node];
3026 	      Idx next_idx = str_idx + naccepted;
3027 	      dest_state = mctx->state_log[next_idx];
3028 	      re_node_set_empty (&union_set);
3029 	      if (dest_state)
3030 		{
3031 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3032 		  if (__glibc_unlikely (err != REG_NOERROR))
3033 		    {
3034 		      re_node_set_free (&union_set);
3035 		      return err;
3036 		    }
3037 		}
3038 	      ok = re_node_set_insert (&union_set, next_node);
3039 	      if (__glibc_unlikely (! ok))
3040 		{
3041 		  re_node_set_free (&union_set);
3042 		  return REG_ESPACE;
3043 		}
3044 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3045 							    &union_set);
3046 	      if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3047 				    && err != REG_NOERROR))
3048 		{
3049 		  re_node_set_free (&union_set);
3050 		  return err;
3051 		}
3052 	    }
3053 	}
3054 #endif /* RE_ENABLE_I18N */
3055       if (naccepted
3056 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3057 	{
3058 	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3059 	  if (__glibc_unlikely (! ok))
3060 	    {
3061 	      re_node_set_free (&union_set);
3062 	      return REG_ESPACE;
3063 	    }
3064 	}
3065     }
3066   re_node_set_free (&union_set);
3067   return REG_NOERROR;
3068 }
3069 
3070 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3071    CUR_NODES, however exclude the nodes which are:
3072     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3073     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3074 */
3075 
3076 static reg_errcode_t
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3077 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3078 			  Idx ex_subexp, int type)
3079 {
3080   reg_errcode_t err;
3081   Idx idx, outside_node;
3082   re_node_set new_nodes;
3083   DEBUG_ASSERT (cur_nodes->nelem);
3084   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3085   if (__glibc_unlikely (err != REG_NOERROR))
3086     return err;
3087   /* Create a new node set NEW_NODES with the nodes which are epsilon
3088      closures of the node in CUR_NODES.  */
3089 
3090   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3091     {
3092       Idx cur_node = cur_nodes->elems[idx];
3093       const re_node_set *eclosure = dfa->eclosures + cur_node;
3094       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3095       if (outside_node == -1)
3096 	{
3097 	  /* There are no problematic nodes, just merge them.  */
3098 	  err = re_node_set_merge (&new_nodes, eclosure);
3099 	  if (__glibc_unlikely (err != REG_NOERROR))
3100 	    {
3101 	      re_node_set_free (&new_nodes);
3102 	      return err;
3103 	    }
3104 	}
3105       else
3106 	{
3107 	  /* There are problematic nodes, re-calculate incrementally.  */
3108 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3109 					      ex_subexp, type);
3110 	  if (__glibc_unlikely (err != REG_NOERROR))
3111 	    {
3112 	      re_node_set_free (&new_nodes);
3113 	      return err;
3114 	    }
3115 	}
3116     }
3117   re_node_set_free (cur_nodes);
3118   *cur_nodes = new_nodes;
3119   return REG_NOERROR;
3120 }
3121 
3122 /* Helper function for check_arrival_expand_ecl.
3123    Check incrementally the epsilon closure of TARGET, and if it isn't
3124    problematic append it to DST_NODES.  */
3125 
3126 static reg_errcode_t
3127 __attribute_warn_unused_result__
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)3128 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3129 			      Idx target, Idx ex_subexp, int type)
3130 {
3131   Idx cur_node;
3132   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3133     {
3134       bool ok;
3135 
3136       if (dfa->nodes[cur_node].type == type
3137 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3138 	{
3139 	  if (type == OP_CLOSE_SUBEXP)
3140 	    {
3141 	      ok = re_node_set_insert (dst_nodes, cur_node);
3142 	      if (__glibc_unlikely (! ok))
3143 		return REG_ESPACE;
3144 	    }
3145 	  break;
3146 	}
3147       ok = re_node_set_insert (dst_nodes, cur_node);
3148       if (__glibc_unlikely (! ok))
3149 	return REG_ESPACE;
3150       if (dfa->edests[cur_node].nelem == 0)
3151 	break;
3152       if (dfa->edests[cur_node].nelem == 2)
3153 	{
3154 	  reg_errcode_t err;
3155 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3156 					      dfa->edests[cur_node].elems[1],
3157 					      ex_subexp, type);
3158 	  if (__glibc_unlikely (err != REG_NOERROR))
3159 	    return err;
3160 	}
3161       cur_node = dfa->edests[cur_node].elems[0];
3162     }
3163   return REG_NOERROR;
3164 }
3165 
3166 
3167 /* For all the back references in the current state, calculate the
3168    destination of the back references by the appropriate entry
3169    in MCTX->BKREF_ENTS.  */
3170 
3171 static reg_errcode_t
3172 __attribute_warn_unused_result__
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)3173 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3174 		    Idx cur_str, Idx subexp_num, int type)
3175 {
3176   const re_dfa_t *const dfa = mctx->dfa;
3177   reg_errcode_t err;
3178   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3179   struct re_backref_cache_entry *ent;
3180 
3181   if (cache_idx_start == -1)
3182     return REG_NOERROR;
3183 
3184  restart:
3185   ent = mctx->bkref_ents + cache_idx_start;
3186   do
3187     {
3188       Idx to_idx, next_node;
3189 
3190       /* Is this entry ENT is appropriate?  */
3191       if (!re_node_set_contains (cur_nodes, ent->node))
3192 	continue; /* No.  */
3193 
3194       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3195       /* Calculate the destination of the back reference, and append it
3196 	 to MCTX->STATE_LOG.  */
3197       if (to_idx == cur_str)
3198 	{
3199 	  /* The backreference did epsilon transit, we must re-check all the
3200 	     node in the current state.  */
3201 	  re_node_set new_dests;
3202 	  reg_errcode_t err2, err3;
3203 	  next_node = dfa->edests[ent->node].elems[0];
3204 	  if (re_node_set_contains (cur_nodes, next_node))
3205 	    continue;
3206 	  err = re_node_set_init_1 (&new_dests, next_node);
3207 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3208 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3209 	  re_node_set_free (&new_dests);
3210 	  if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3211 				|| err3 != REG_NOERROR))
3212 	    {
3213 	      err = (err != REG_NOERROR ? err
3214 		     : (err2 != REG_NOERROR ? err2 : err3));
3215 	      return err;
3216 	    }
3217 	  /* TODO: It is still inefficient...  */
3218 	  goto restart;
3219 	}
3220       else
3221 	{
3222 	  re_node_set union_set;
3223 	  next_node = dfa->nexts[ent->node];
3224 	  if (mctx->state_log[to_idx])
3225 	    {
3226 	      bool ok;
3227 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3228 					next_node))
3229 		continue;
3230 	      err = re_node_set_init_copy (&union_set,
3231 					   &mctx->state_log[to_idx]->nodes);
3232 	      ok = re_node_set_insert (&union_set, next_node);
3233 	      if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3234 		{
3235 		  re_node_set_free (&union_set);
3236 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3237 		  return err;
3238 		}
3239 	    }
3240 	  else
3241 	    {
3242 	      err = re_node_set_init_1 (&union_set, next_node);
3243 	      if (__glibc_unlikely (err != REG_NOERROR))
3244 		return err;
3245 	    }
3246 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3247 	  re_node_set_free (&union_set);
3248 	  if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3249 				&& err != REG_NOERROR))
3250 	    return err;
3251 	}
3252     }
3253   while (ent++->more);
3254   return REG_NOERROR;
3255 }
3256 
3257 /* Build transition table for the state.
3258    Return true if successful.  */
3259 
3260 static bool __attribute_noinline__
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3261 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3262 {
3263   reg_errcode_t err;
3264   Idx i, j;
3265   int ch;
3266   bool need_word_trtable = false;
3267   bitset_word_t elem, mask;
3268   Idx ndests; /* Number of the destination states from 'state'.  */
3269   re_dfastate_t **trtable;
3270   re_dfastate_t *dest_states[SBC_MAX];
3271   re_dfastate_t *dest_states_word[SBC_MAX];
3272   re_dfastate_t *dest_states_nl[SBC_MAX];
3273   re_node_set follows;
3274   bitset_t acceptable;
3275 
3276   /* We build DFA states which corresponds to the destination nodes
3277      from 'state'.  'dests_node[i]' represents the nodes which i-th
3278      destination state contains, and 'dests_ch[i]' represents the
3279      characters which i-th destination state accepts.  */
3280   re_node_set dests_node[SBC_MAX];
3281   bitset_t dests_ch[SBC_MAX];
3282 
3283   /* Initialize transition table.  */
3284   state->word_trtable = state->trtable = NULL;
3285 
3286   /* At first, group all nodes belonging to 'state' into several
3287      destinations.  */
3288   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3289   if (__glibc_unlikely (ndests <= 0))
3290     {
3291       /* Return false in case of an error, true otherwise.  */
3292       if (ndests == 0)
3293 	{
3294 	  state->trtable = (re_dfastate_t **)
3295 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3296           if (__glibc_unlikely (state->trtable == NULL))
3297             return false;
3298 	  return true;
3299 	}
3300       return false;
3301     }
3302 
3303   err = re_node_set_alloc (&follows, ndests + 1);
3304   if (__glibc_unlikely (err != REG_NOERROR))
3305     {
3306     out_free:
3307       re_node_set_free (&follows);
3308       for (i = 0; i < ndests; ++i)
3309 	re_node_set_free (dests_node + i);
3310       return false;
3311     }
3312 
3313   bitset_empty (acceptable);
3314 
3315   /* Then build the states for all destinations.  */
3316   for (i = 0; i < ndests; ++i)
3317     {
3318       Idx next_node;
3319       re_node_set_empty (&follows);
3320       /* Merge the follows of this destination states.  */
3321       for (j = 0; j < dests_node[i].nelem; ++j)
3322 	{
3323 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3324 	  if (next_node != -1)
3325 	    {
3326 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3327 	      if (__glibc_unlikely (err != REG_NOERROR))
3328 		goto out_free;
3329 	    }
3330 	}
3331       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3332       if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3333 	goto out_free;
3334       /* If the new state has context constraint,
3335 	 build appropriate states for these contexts.  */
3336       if (dest_states[i]->has_constraint)
3337 	{
3338 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3339 							  CONTEXT_WORD);
3340 	  if (__glibc_unlikely (dest_states_word[i] == NULL
3341 				&& err != REG_NOERROR))
3342 	    goto out_free;
3343 
3344 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3345 	    need_word_trtable = true;
3346 
3347 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3348 							CONTEXT_NEWLINE);
3349 	  if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3350 	    goto out_free;
3351 	}
3352       else
3353 	{
3354 	  dest_states_word[i] = dest_states[i];
3355 	  dest_states_nl[i] = dest_states[i];
3356 	}
3357       bitset_merge (acceptable, dests_ch[i]);
3358     }
3359 
3360   if (!__glibc_unlikely (need_word_trtable))
3361     {
3362       /* We don't care about whether the following character is a word
3363 	 character, or we are in a single-byte character set so we can
3364 	 discern by looking at the character code: allocate a
3365 	 256-entry transition table.  */
3366       trtable = state->trtable =
3367 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3368       if (__glibc_unlikely (trtable == NULL))
3369 	goto out_free;
3370 
3371       /* For all characters ch...:  */
3372       for (i = 0; i < BITSET_WORDS; ++i)
3373 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3374 	     elem;
3375 	     mask <<= 1, elem >>= 1, ++ch)
3376 	  if (__glibc_unlikely (elem & 1))
3377 	    {
3378 	      /* There must be exactly one destination which accepts
3379 		 character ch.  See group_nodes_into_DFAstates.  */
3380 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3381 		;
3382 
3383 	      /* j-th destination accepts the word character ch.  */
3384 	      if (dfa->word_char[i] & mask)
3385 		trtable[ch] = dest_states_word[j];
3386 	      else
3387 		trtable[ch] = dest_states[j];
3388 	    }
3389     }
3390   else
3391     {
3392       /* We care about whether the following character is a word
3393 	 character, and we are in a multi-byte character set: discern
3394 	 by looking at the character code: build two 256-entry
3395 	 transition tables, one starting at trtable[0] and one
3396 	 starting at trtable[SBC_MAX].  */
3397       trtable = state->word_trtable =
3398 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3399       if (__glibc_unlikely (trtable == NULL))
3400 	goto out_free;
3401 
3402       /* For all characters ch...:  */
3403       for (i = 0; i < BITSET_WORDS; ++i)
3404 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3405 	     elem;
3406 	     mask <<= 1, elem >>= 1, ++ch)
3407 	  if (__glibc_unlikely (elem & 1))
3408 	    {
3409 	      /* There must be exactly one destination which accepts
3410 		 character ch.  See group_nodes_into_DFAstates.  */
3411 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3412 		;
3413 
3414 	      /* j-th destination accepts the word character ch.  */
3415 	      trtable[ch] = dest_states[j];
3416 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3417 	    }
3418     }
3419 
3420   /* new line */
3421   if (bitset_contain (acceptable, NEWLINE_CHAR))
3422     {
3423       /* The current state accepts newline character.  */
3424       for (j = 0; j < ndests; ++j)
3425 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3426 	  {
3427 	    /* k-th destination accepts newline character.  */
3428 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3429 	    if (need_word_trtable)
3430 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3431 	    /* There must be only one destination which accepts
3432 	       newline.  See group_nodes_into_DFAstates.  */
3433 	    break;
3434 	  }
3435     }
3436 
3437   re_node_set_free (&follows);
3438   for (i = 0; i < ndests; ++i)
3439     re_node_set_free (dests_node + i);
3440   return true;
3441 }
3442 
3443 /* Group all nodes belonging to STATE into several destinations.
3444    Then for all destinations, set the nodes belonging to the destination
3445    to DESTS_NODE[i] and set the characters accepted by the destination
3446    to DEST_CH[i].  Return the number of destinations if successful,
3447    -1 on internal error.  */
3448 
3449 static Idx
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3450 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3451 			    re_node_set *dests_node, bitset_t *dests_ch)
3452 {
3453   reg_errcode_t err;
3454   bool ok;
3455   Idx i, j, k;
3456   Idx ndests; /* Number of the destinations from 'state'.  */
3457   bitset_t accepts; /* Characters a node can accept.  */
3458   const re_node_set *cur_nodes = &state->nodes;
3459   bitset_empty (accepts);
3460   ndests = 0;
3461 
3462   /* For all the nodes belonging to 'state',  */
3463   for (i = 0; i < cur_nodes->nelem; ++i)
3464     {
3465       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3466       re_token_type_t type = node->type;
3467       unsigned int constraint = node->constraint;
3468 
3469       /* Enumerate all single byte character this node can accept.  */
3470       if (type == CHARACTER)
3471 	bitset_set (accepts, node->opr.c);
3472       else if (type == SIMPLE_BRACKET)
3473 	{
3474 	  bitset_merge (accepts, node->opr.sbcset);
3475 	}
3476       else if (type == OP_PERIOD)
3477 	{
3478 #ifdef RE_ENABLE_I18N
3479 	  if (dfa->mb_cur_max > 1)
3480 	    bitset_merge (accepts, dfa->sb_char);
3481 	  else
3482 #endif
3483 	    bitset_set_all (accepts);
3484 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3485 	    bitset_clear (accepts, '\n');
3486 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3487 	    bitset_clear (accepts, '\0');
3488 	}
3489 #ifdef RE_ENABLE_I18N
3490       else if (type == OP_UTF8_PERIOD)
3491 	{
3492 	  if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3493 	    memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3494 	  else
3495 	    bitset_merge (accepts, utf8_sb_map);
3496 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3497 	    bitset_clear (accepts, '\n');
3498 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3499 	    bitset_clear (accepts, '\0');
3500 	}
3501 #endif
3502       else
3503 	continue;
3504 
3505       /* Check the 'accepts' and sift the characters which are not
3506 	 match it the context.  */
3507       if (constraint)
3508 	{
3509 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3510 	    {
3511 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3512 	      bitset_empty (accepts);
3513 	      if (accepts_newline)
3514 		bitset_set (accepts, NEWLINE_CHAR);
3515 	      else
3516 		continue;
3517 	    }
3518 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3519 	    {
3520 	      bitset_empty (accepts);
3521 	      continue;
3522 	    }
3523 
3524 	  if (constraint & NEXT_WORD_CONSTRAINT)
3525 	    {
3526 	      bitset_word_t any_set = 0;
3527 	      if (type == CHARACTER && !node->word_char)
3528 		{
3529 		  bitset_empty (accepts);
3530 		  continue;
3531 		}
3532 #ifdef RE_ENABLE_I18N
3533 	      if (dfa->mb_cur_max > 1)
3534 		for (j = 0; j < BITSET_WORDS; ++j)
3535 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3536 	      else
3537 #endif
3538 		for (j = 0; j < BITSET_WORDS; ++j)
3539 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3540 	      if (!any_set)
3541 		continue;
3542 	    }
3543 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3544 	    {
3545 	      bitset_word_t any_set = 0;
3546 	      if (type == CHARACTER && node->word_char)
3547 		{
3548 		  bitset_empty (accepts);
3549 		  continue;
3550 		}
3551 #ifdef RE_ENABLE_I18N
3552 	      if (dfa->mb_cur_max > 1)
3553 		for (j = 0; j < BITSET_WORDS; ++j)
3554 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3555 	      else
3556 #endif
3557 		for (j = 0; j < BITSET_WORDS; ++j)
3558 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3559 	      if (!any_set)
3560 		continue;
3561 	    }
3562 	}
3563 
3564       /* Then divide 'accepts' into DFA states, or create a new
3565 	 state.  Above, we make sure that accepts is not empty.  */
3566       for (j = 0; j < ndests; ++j)
3567 	{
3568 	  bitset_t intersec; /* Intersection sets, see below.  */
3569 	  bitset_t remains;
3570 	  /* Flags, see below.  */
3571 	  bitset_word_t has_intersec, not_subset, not_consumed;
3572 
3573 	  /* Optimization, skip if this state doesn't accept the character.  */
3574 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3575 	    continue;
3576 
3577 	  /* Enumerate the intersection set of this state and 'accepts'.  */
3578 	  has_intersec = 0;
3579 	  for (k = 0; k < BITSET_WORDS; ++k)
3580 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3581 	  /* And skip if the intersection set is empty.  */
3582 	  if (!has_intersec)
3583 	    continue;
3584 
3585 	  /* Then check if this state is a subset of 'accepts'.  */
3586 	  not_subset = not_consumed = 0;
3587 	  for (k = 0; k < BITSET_WORDS; ++k)
3588 	    {
3589 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3590 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3591 	    }
3592 
3593 	  /* If this state isn't a subset of 'accepts', create a
3594 	     new group state, which has the 'remains'. */
3595 	  if (not_subset)
3596 	    {
3597 	      bitset_copy (dests_ch[ndests], remains);
3598 	      bitset_copy (dests_ch[j], intersec);
3599 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3600 	      if (__glibc_unlikely (err != REG_NOERROR))
3601 		goto error_return;
3602 	      ++ndests;
3603 	    }
3604 
3605 	  /* Put the position in the current group. */
3606 	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3607 	  if (__glibc_unlikely (! ok))
3608 	    goto error_return;
3609 
3610 	  /* If all characters are consumed, go to next node. */
3611 	  if (!not_consumed)
3612 	    break;
3613 	}
3614       /* Some characters remain, create a new group. */
3615       if (j == ndests)
3616 	{
3617 	  bitset_copy (dests_ch[ndests], accepts);
3618 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3619 	  if (__glibc_unlikely (err != REG_NOERROR))
3620 	    goto error_return;
3621 	  ++ndests;
3622 	  bitset_empty (accepts);
3623 	}
3624     }
3625   assume (ndests <= SBC_MAX);
3626   return ndests;
3627  error_return:
3628   for (j = 0; j < ndests; ++j)
3629     re_node_set_free (dests_node + j);
3630   return -1;
3631 }
3632 
3633 #ifdef RE_ENABLE_I18N
3634 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3635    Return the number of the bytes the node accepts.
3636    STR_IDX is the current index of the input string.
3637 
3638    This function handles the nodes which can accept one character, or
3639    one collating element like '.', '[a-z]', opposite to the other nodes
3640    can only accept one byte.  */
3641 
3642 # ifdef _LIBC
3643 #  include <locale/weight.h>
3644 # endif
3645 
3646 static int
check_node_accept_bytes(const re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)3647 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3648 			 const re_string_t *input, Idx str_idx)
3649 {
3650   const re_token_t *node = dfa->nodes + node_idx;
3651   int char_len, elem_len;
3652   Idx i;
3653 
3654   if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3655     {
3656       unsigned char c = re_string_byte_at (input, str_idx), d;
3657       if (__glibc_likely (c < 0xc2))
3658 	return 0;
3659 
3660       if (str_idx + 2 > input->len)
3661 	return 0;
3662 
3663       d = re_string_byte_at (input, str_idx + 1);
3664       if (c < 0xe0)
3665 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3666       else if (c < 0xf0)
3667 	{
3668 	  char_len = 3;
3669 	  if (c == 0xe0 && d < 0xa0)
3670 	    return 0;
3671 	}
3672       else if (c < 0xf8)
3673 	{
3674 	  char_len = 4;
3675 	  if (c == 0xf0 && d < 0x90)
3676 	    return 0;
3677 	}
3678       else if (c < 0xfc)
3679 	{
3680 	  char_len = 5;
3681 	  if (c == 0xf8 && d < 0x88)
3682 	    return 0;
3683 	}
3684       else if (c < 0xfe)
3685 	{
3686 	  char_len = 6;
3687 	  if (c == 0xfc && d < 0x84)
3688 	    return 0;
3689 	}
3690       else
3691 	return 0;
3692 
3693       if (str_idx + char_len > input->len)
3694 	return 0;
3695 
3696       for (i = 1; i < char_len; ++i)
3697 	{
3698 	  d = re_string_byte_at (input, str_idx + i);
3699 	  if (d < 0x80 || d > 0xbf)
3700 	    return 0;
3701 	}
3702       return char_len;
3703     }
3704 
3705   char_len = re_string_char_size_at (input, str_idx);
3706   if (node->type == OP_PERIOD)
3707     {
3708       if (char_len <= 1)
3709 	return 0;
3710       /* FIXME: I don't think this if is needed, as both '\n'
3711 	 and '\0' are char_len == 1.  */
3712       /* '.' accepts any one character except the following two cases.  */
3713       if ((!(dfa->syntax & RE_DOT_NEWLINE)
3714 	   && re_string_byte_at (input, str_idx) == '\n')
3715 	  || ((dfa->syntax & RE_DOT_NOT_NULL)
3716 	      && re_string_byte_at (input, str_idx) == '\0'))
3717 	return 0;
3718       return char_len;
3719     }
3720 
3721   elem_len = re_string_elem_size_at (input, str_idx);
3722   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3723     return 0;
3724 
3725   if (node->type == COMPLEX_BRACKET)
3726     {
3727       const re_charset_t *cset = node->opr.mbcset;
3728 # ifdef _LIBC
3729       const unsigned char *pin
3730 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3731       Idx j;
3732       uint32_t nrules;
3733 # endif /* _LIBC */
3734       int match_len = 0;
3735       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3736 		    ? re_string_wchar_at (input, str_idx) : 0);
3737 
3738       /* match with multibyte character?  */
3739       for (i = 0; i < cset->nmbchars; ++i)
3740 	if (wc == cset->mbchars[i])
3741 	  {
3742 	    match_len = char_len;
3743 	    goto check_node_accept_bytes_match;
3744 	  }
3745       /* match with character_class?  */
3746       for (i = 0; i < cset->nchar_classes; ++i)
3747 	{
3748 	  wctype_t wt = cset->char_classes[i];
3749 	  if (__iswctype (wc, wt))
3750 	    {
3751 	      match_len = char_len;
3752 	      goto check_node_accept_bytes_match;
3753 	    }
3754 	}
3755 
3756 # ifdef _LIBC
3757       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3758       if (nrules != 0)
3759 	{
3760 	  unsigned int in_collseq = 0;
3761 	  const int32_t *table, *indirect;
3762 	  const unsigned char *weights, *extra;
3763 	  const char *collseqwc;
3764 
3765 	  /* match with collating_symbol?  */
3766 	  if (cset->ncoll_syms)
3767 	    extra = (const unsigned char *)
3768 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3769 	  for (i = 0; i < cset->ncoll_syms; ++i)
3770 	    {
3771 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3772 	      /* Compare the length of input collating element and
3773 		 the length of current collating element.  */
3774 	      if (*coll_sym != elem_len)
3775 		continue;
3776 	      /* Compare each bytes.  */
3777 	      for (j = 0; j < *coll_sym; j++)
3778 		if (pin[j] != coll_sym[1 + j])
3779 		  break;
3780 	      if (j == *coll_sym)
3781 		{
3782 		  /* Match if every bytes is equal.  */
3783 		  match_len = j;
3784 		  goto check_node_accept_bytes_match;
3785 		}
3786 	    }
3787 
3788 	  if (cset->nranges)
3789 	    {
3790 	      if (elem_len <= char_len)
3791 		{
3792 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3793 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3794 		}
3795 	      else
3796 		in_collseq = find_collation_sequence_value (pin, elem_len);
3797 	    }
3798 	  /* match with range expression?  */
3799 	  /* FIXME: Implement rational ranges here, too.  */
3800 	  for (i = 0; i < cset->nranges; ++i)
3801 	    if (cset->range_starts[i] <= in_collseq
3802 		&& in_collseq <= cset->range_ends[i])
3803 	      {
3804 		match_len = elem_len;
3805 		goto check_node_accept_bytes_match;
3806 	      }
3807 
3808 	  /* match with equivalence_class?  */
3809 	  if (cset->nequiv_classes)
3810 	    {
3811 	      const unsigned char *cp = pin;
3812 	      table = (const int32_t *)
3813 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3814 	      weights = (const unsigned char *)
3815 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3816 	      extra = (const unsigned char *)
3817 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3818 	      indirect = (const int32_t *)
3819 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3820 	      int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3821 	      int32_t rule = idx >> 24;
3822 	      idx &= 0xffffff;
3823 	      if (idx > 0)
3824 		{
3825 		  size_t weight_len = weights[idx];
3826 		  for (i = 0; i < cset->nequiv_classes; ++i)
3827 		    {
3828 		      int32_t equiv_class_idx = cset->equiv_classes[i];
3829 		      int32_t equiv_class_rule = equiv_class_idx >> 24;
3830 		      equiv_class_idx &= 0xffffff;
3831 		      if (weights[equiv_class_idx] == weight_len
3832 			  && equiv_class_rule == rule
3833 			  && memcmp (weights + idx + 1,
3834 				     weights + equiv_class_idx + 1,
3835 				     weight_len) == 0)
3836 			{
3837 			  match_len = elem_len;
3838 			  goto check_node_accept_bytes_match;
3839 			}
3840 		    }
3841 		}
3842 	    }
3843 	}
3844       else
3845 # endif /* _LIBC */
3846 	{
3847 	  /* match with range expression?  */
3848 	  for (i = 0; i < cset->nranges; ++i)
3849 	    {
3850 	      if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3851 		{
3852 		  match_len = char_len;
3853 		  goto check_node_accept_bytes_match;
3854 		}
3855 	    }
3856 	}
3857     check_node_accept_bytes_match:
3858       if (!cset->non_match)
3859 	return match_len;
3860       else
3861 	{
3862 	  if (match_len > 0)
3863 	    return 0;
3864 	  else
3865 	    return (elem_len > char_len) ? elem_len : char_len;
3866 	}
3867     }
3868   return 0;
3869 }
3870 
3871 # ifdef _LIBC
3872 static unsigned int
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)3873 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3874 {
3875   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3876   if (nrules == 0)
3877     {
3878       if (mbs_len == 1)
3879 	{
3880 	  /* No valid character.  Match it as a single byte character.  */
3881 	  const unsigned char *collseq = (const unsigned char *)
3882 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3883 	  return collseq[mbs[0]];
3884 	}
3885       return UINT_MAX;
3886     }
3887   else
3888     {
3889       int32_t idx;
3890       const unsigned char *extra = (const unsigned char *)
3891 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3892       int32_t extrasize = (const unsigned char *)
3893 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3894 
3895       for (idx = 0; idx < extrasize;)
3896 	{
3897 	  int mbs_cnt;
3898 	  bool found = false;
3899 	  int32_t elem_mbs_len;
3900 	  /* Skip the name of collating element name.  */
3901 	  idx = idx + extra[idx] + 1;
3902 	  elem_mbs_len = extra[idx++];
3903 	  if (mbs_len == elem_mbs_len)
3904 	    {
3905 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3906 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3907 		  break;
3908 	      if (mbs_cnt == elem_mbs_len)
3909 		/* Found the entry.  */
3910 		found = true;
3911 	    }
3912 	  /* Skip the byte sequence of the collating element.  */
3913 	  idx += elem_mbs_len;
3914 	  /* Adjust for the alignment.  */
3915 	  idx = (idx + 3) & ~3;
3916 	  /* Skip the collation sequence value.  */
3917 	  idx += sizeof (uint32_t);
3918 	  /* Skip the wide char sequence of the collating element.  */
3919 	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3920 	  /* If we found the entry, return the sequence value.  */
3921 	  if (found)
3922 	    return *(uint32_t *) (extra + idx);
3923 	  /* Skip the collation sequence value.  */
3924 	  idx += sizeof (uint32_t);
3925 	}
3926       return UINT_MAX;
3927     }
3928 }
3929 # endif /* _LIBC */
3930 #endif /* RE_ENABLE_I18N */
3931 
3932 /* Check whether the node accepts the byte which is IDX-th
3933    byte of the INPUT.  */
3934 
3935 static bool
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)3936 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3937 		   Idx idx)
3938 {
3939   unsigned char ch;
3940   ch = re_string_byte_at (&mctx->input, idx);
3941   switch (node->type)
3942     {
3943     case CHARACTER:
3944       if (node->opr.c != ch)
3945         return false;
3946       break;
3947 
3948     case SIMPLE_BRACKET:
3949       if (!bitset_contain (node->opr.sbcset, ch))
3950         return false;
3951       break;
3952 
3953 #ifdef RE_ENABLE_I18N
3954     case OP_UTF8_PERIOD:
3955       if (ch >= ASCII_CHARS)
3956         return false;
3957       FALLTHROUGH;
3958 #endif
3959     case OP_PERIOD:
3960       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3961 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3962 	return false;
3963       break;
3964 
3965     default:
3966       return false;
3967     }
3968 
3969   if (node->constraint)
3970     {
3971       /* The node has constraints.  Check whether the current context
3972 	 satisfies the constraints.  */
3973       unsigned int context = re_string_context_at (&mctx->input, idx,
3974 						   mctx->eflags);
3975       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3976 	return false;
3977     }
3978 
3979   return true;
3980 }
3981 
3982 /* Extend the buffers, if the buffers have run out.  */
3983 
3984 static reg_errcode_t
3985 __attribute_warn_unused_result__
extend_buffers(re_match_context_t * mctx,int min_len)3986 extend_buffers (re_match_context_t *mctx, int min_len)
3987 {
3988   reg_errcode_t ret;
3989   re_string_t *pstr = &mctx->input;
3990 
3991   /* Avoid overflow.  */
3992   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3993 			<= pstr->bufs_len))
3994     return REG_ESPACE;
3995 
3996   /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
3997   ret = re_string_realloc_buffers (pstr,
3998 				   MAX (min_len,
3999 					MIN (pstr->len, pstr->bufs_len * 2)));
4000   if (__glibc_unlikely (ret != REG_NOERROR))
4001     return ret;
4002 
4003   if (mctx->state_log != NULL)
4004     {
4005       /* And double the length of state_log.  */
4006       /* XXX We have no indication of the size of this buffer.  If this
4007 	 allocation fail we have no indication that the state_log array
4008 	 does not have the right size.  */
4009       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4010 					      pstr->bufs_len + 1);
4011       if (__glibc_unlikely (new_array == NULL))
4012 	return REG_ESPACE;
4013       mctx->state_log = new_array;
4014     }
4015 
4016   /* Then reconstruct the buffers.  */
4017   if (pstr->icase)
4018     {
4019 #ifdef RE_ENABLE_I18N
4020       if (pstr->mb_cur_max > 1)
4021 	{
4022 	  ret = build_wcs_upper_buffer (pstr);
4023 	  if (__glibc_unlikely (ret != REG_NOERROR))
4024 	    return ret;
4025 	}
4026       else
4027 #endif /* RE_ENABLE_I18N  */
4028 	build_upper_buffer (pstr);
4029     }
4030   else
4031     {
4032 #ifdef RE_ENABLE_I18N
4033       if (pstr->mb_cur_max > 1)
4034 	build_wcs_buffer (pstr);
4035       else
4036 #endif /* RE_ENABLE_I18N  */
4037 	{
4038 	  if (pstr->trans != NULL)
4039 	    re_string_translate_buffer (pstr);
4040 	}
4041     }
4042   return REG_NOERROR;
4043 }
4044 
4045 
4046 /* Functions for matching context.  */
4047 
4048 /* Initialize MCTX.  */
4049 
4050 static reg_errcode_t
4051 __attribute_warn_unused_result__
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4052 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4053 {
4054   mctx->eflags = eflags;
4055   mctx->match_last = -1;
4056   if (n > 0)
4057     {
4058       /* Avoid overflow.  */
4059       size_t max_object_size =
4060 	MAX (sizeof (struct re_backref_cache_entry),
4061 	     sizeof (re_sub_match_top_t *));
4062       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4063 	return REG_ESPACE;
4064 
4065       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4066       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4067       if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4068 	return REG_ESPACE;
4069     }
4070   /* Already zero-ed by the caller.
4071      else
4072        mctx->bkref_ents = NULL;
4073      mctx->nbkref_ents = 0;
4074      mctx->nsub_tops = 0;  */
4075   mctx->abkref_ents = n;
4076   mctx->max_mb_elem_len = 1;
4077   mctx->asub_tops = n;
4078   return REG_NOERROR;
4079 }
4080 
4081 /* Clean the entries which depend on the current input in MCTX.
4082    This function must be invoked when the matcher changes the start index
4083    of the input, or changes the input string.  */
4084 
4085 static void
match_ctx_clean(re_match_context_t * mctx)4086 match_ctx_clean (re_match_context_t *mctx)
4087 {
4088   Idx st_idx;
4089   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4090     {
4091       Idx sl_idx;
4092       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4093       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4094 	{
4095 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4096 	  re_free (last->path.array);
4097 	  re_free (last);
4098 	}
4099       re_free (top->lasts);
4100       if (top->path)
4101 	{
4102 	  re_free (top->path->array);
4103 	  re_free (top->path);
4104 	}
4105       re_free (top);
4106     }
4107 
4108   mctx->nsub_tops = 0;
4109   mctx->nbkref_ents = 0;
4110 }
4111 
4112 /* Free all the memory associated with MCTX.  */
4113 
4114 static void
match_ctx_free(re_match_context_t * mctx)4115 match_ctx_free (re_match_context_t *mctx)
4116 {
4117   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4118   match_ctx_clean (mctx);
4119   re_free (mctx->sub_tops);
4120   re_free (mctx->bkref_ents);
4121 }
4122 
4123 /* Add a new backreference entry to MCTX.
4124    Note that we assume that caller never call this function with duplicate
4125    entry, and call with STR_IDX which isn't smaller than any existing entry.
4126 */
4127 
4128 static reg_errcode_t
4129 __attribute_warn_unused_result__
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4130 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4131 		     Idx to)
4132 {
4133   if (mctx->nbkref_ents >= mctx->abkref_ents)
4134     {
4135       struct re_backref_cache_entry* new_entry;
4136       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4137 			      mctx->abkref_ents * 2);
4138       if (__glibc_unlikely (new_entry == NULL))
4139 	{
4140 	  re_free (mctx->bkref_ents);
4141 	  return REG_ESPACE;
4142 	}
4143       mctx->bkref_ents = new_entry;
4144       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4145 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4146       mctx->abkref_ents *= 2;
4147     }
4148   if (mctx->nbkref_ents > 0
4149       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4150     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4151 
4152   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4153   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4154   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4155   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4156 
4157   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4158      If bit N is clear, means that this entry won't epsilon-transition to
4159      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4160      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4161      such node.
4162 
4163      A backreference does not epsilon-transition unless it is empty, so set
4164      to all zeros if FROM != TO.  */
4165   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4166     = (from == to ? -1 : 0);
4167 
4168   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4169   if (mctx->max_mb_elem_len < to - from)
4170     mctx->max_mb_elem_len = to - from;
4171   return REG_NOERROR;
4172 }
4173 
4174 /* Return the first entry with the same str_idx, or -1 if none is
4175    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4176 
4177 static Idx
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4178 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4179 {
4180   Idx left, right, mid, last;
4181   last = right = mctx->nbkref_ents;
4182   for (left = 0; left < right;)
4183     {
4184       mid = (left + right) / 2;
4185       if (mctx->bkref_ents[mid].str_idx < str_idx)
4186 	left = mid + 1;
4187       else
4188 	right = mid;
4189     }
4190   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4191     return left;
4192   else
4193     return -1;
4194 }
4195 
4196 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4197    at STR_IDX.  */
4198 
4199 static reg_errcode_t
4200 __attribute_warn_unused_result__
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4201 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4202 {
4203   DEBUG_ASSERT (mctx->sub_tops != NULL);
4204   DEBUG_ASSERT (mctx->asub_tops > 0);
4205   if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4206     {
4207       Idx new_asub_tops = mctx->asub_tops * 2;
4208       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4209 						   re_sub_match_top_t *,
4210 						   new_asub_tops);
4211       if (__glibc_unlikely (new_array == NULL))
4212 	return REG_ESPACE;
4213       mctx->sub_tops = new_array;
4214       mctx->asub_tops = new_asub_tops;
4215     }
4216   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4217   if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4218     return REG_ESPACE;
4219   mctx->sub_tops[mctx->nsub_tops]->node = node;
4220   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4221   return REG_NOERROR;
4222 }
4223 
4224 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4225    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4226    Return the new entry if successful, NULL if memory is exhausted.  */
4227 
4228 static re_sub_match_last_t *
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4229 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4230 {
4231   re_sub_match_last_t *new_entry;
4232   if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4233     {
4234       Idx new_alasts = 2 * subtop->alasts + 1;
4235       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4236 						    re_sub_match_last_t *,
4237 						    new_alasts);
4238       if (__glibc_unlikely (new_array == NULL))
4239 	return NULL;
4240       subtop->lasts = new_array;
4241       subtop->alasts = new_alasts;
4242     }
4243   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4244   if (__glibc_likely (new_entry != NULL))
4245     {
4246       subtop->lasts[subtop->nlasts] = new_entry;
4247       new_entry->node = node;
4248       new_entry->str_idx = str_idx;
4249       ++subtop->nlasts;
4250     }
4251   return new_entry;
4252 }
4253 
4254 static void
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4255 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4256 	       re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4257 {
4258   sctx->sifted_states = sifted_sts;
4259   sctx->limited_states = limited_sts;
4260   sctx->last_node = last_node;
4261   sctx->last_str_idx = last_str_idx;
4262   re_node_set_init_empty (&sctx->limits);
4263 }
4264