1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23 
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 					  size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 				     const re_dfastate_t *init_state,
28 				     char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 			       reg_errcode_t (fn (void *, bin_tree_t *)),
41 			       void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 				reg_errcode_t (fn (void *, bin_tree_t *)),
44 				void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 				 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 				   unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 					 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 			 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 			reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 			  reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 				  re_token_t *token, reg_syntax_t syntax,
67 				  Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 				 re_token_t *token, reg_syntax_t syntax,
70 				 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 				     re_token_t *token, reg_syntax_t syntax,
73 				     Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 				  re_token_t *token, reg_syntax_t syntax,
76 				  Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 				 re_dfa_t *dfa, re_token_t *token,
79 				 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 				      re_token_t *token, reg_syntax_t syntax,
82 				      reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 					    re_string_t *regexp,
85 					    re_token_t *token, int token_len,
86 					    re_dfa_t *dfa,
87 					    reg_syntax_t syntax,
88 					    bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 					  re_string_t *regexp,
91 					  re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 					re_charset_t *mbcset,
95 					Idx *equiv_class_alloc,
96 					const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 				      bitset_t sbcset,
99 				      re_charset_t *mbcset,
100 				      Idx *char_class_alloc,
101 				      const char *class_name,
102 				      reg_syntax_t syntax);
103 #else  /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 					const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 				      bitset_t sbcset,
108 				      const char *class_name,
109 				      reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 				       RE_TRANSLATE_TYPE trans,
113 				       const char *class_name,
114 				       const char *extra,
115 				       bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 				bin_tree_t *left, bin_tree_t *right,
118 				re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 				      bin_tree_t *left, bin_tree_t *right,
121 				      const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126 
127 /* This table gives an error message for each of the error codes listed
128    in regex.h.  Obviously the order here has to be same as there.
129    POSIX doesn't require that we do anything for REG_NOERROR,
130    but why not be nice?  */
131 
132 static const char __re_error_msgid[] =
133   {
134 #define REG_NOERROR_IDX	0
135     gettext_noop ("Success")	/* REG_NOERROR */
136     "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138     gettext_noop ("No match")	/* REG_NOMATCH */
139     "\0"
140 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142     "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145     "\0"
146 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148     "\0"
149 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151     "\0"
152 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154     "\0"
155 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
156     gettext_noop ("Unmatched [, [^, [:, [., or [=")	/* REG_EBRACK */
157     "\0"
158 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160     "\0"
161 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163     "\0"
164 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166     "\0"
167 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168     gettext_noop ("Invalid range end")	/* REG_ERANGE */
169     "\0"
170 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
172     "\0"
173 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175     "\0"
176 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
178     "\0"
179 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
181     "\0"
182 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184   };
185 
186 static const size_t __re_error_msgid_idx[] =
187   {
188     REG_NOERROR_IDX,
189     REG_NOMATCH_IDX,
190     REG_BADPAT_IDX,
191     REG_ECOLLATE_IDX,
192     REG_ECTYPE_IDX,
193     REG_EESCAPE_IDX,
194     REG_ESUBREG_IDX,
195     REG_EBRACK_IDX,
196     REG_EPAREN_IDX,
197     REG_EBRACE_IDX,
198     REG_BADBR_IDX,
199     REG_ERANGE_IDX,
200     REG_ESPACE_IDX,
201     REG_BADRPT_IDX,
202     REG_EEND_IDX,
203     REG_ESIZE_IDX,
204     REG_ERPAREN_IDX
205   };
206 
207 /* Entry points for GNU code.  */
208 
209 /* re_compile_pattern is the GNU regular expression compiler: it
210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211    Returns 0 if the pattern was valid, otherwise an error string.
212 
213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214    are set in BUFP on entry.  */
215 
216 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)217 re_compile_pattern (const char *pattern, size_t length,
218 		    struct re_pattern_buffer *bufp)
219 {
220   reg_errcode_t ret;
221 
222   /* And GNU code determines whether or not to get register information
223      by passing null for the REGS argument to re_match, etc., not by
224      setting no_sub, unless RE_NO_SUB is set.  */
225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226 
227   /* Match anchors at newline.  */
228   bufp->newline_anchor = 1;
229 
230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231 
232   if (!ret)
233     return NULL;
234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 }
weak_alias(__re_compile_pattern,re_compile_pattern)236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 
238 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
239    also be assigned to arbitrarily: each pattern buffer stores its own
240    syntax, so it can be changed between regex compilations.  */
241 /* This has no initializer because initialized variables in Emacs
242    become read-only after dumping.  */
243 reg_syntax_t re_syntax_options;
244 
245 
246 /* Specify the precise syntax of regexps for compilation.  This provides
247    for compatibility for various utilities which historically have
248    different, incompatible syntaxes.
249 
250    The argument SYNTAX is a bit mask comprised of the various bits
251    defined in regex.h.  We return the old syntax.  */
252 
253 reg_syntax_t
254 re_set_syntax (reg_syntax_t syntax)
255 {
256   reg_syntax_t ret = re_syntax_options;
257 
258   re_syntax_options = syntax;
259   return ret;
260 }
weak_alias(__re_set_syntax,re_set_syntax)261 weak_alias (__re_set_syntax, re_set_syntax)
262 
263 int
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
265 {
266   re_dfa_t *dfa = bufp->buffer;
267   char *fastmap = bufp->fastmap;
268 
269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271   if (dfa->init_state != dfa->init_state_word)
272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273   if (dfa->init_state != dfa->init_state_nl)
274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275   if (dfa->init_state != dfa->init_state_begbuf)
276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277   bufp->fastmap_accurate = 1;
278   return 0;
279 }
weak_alias(__re_compile_fastmap,re_compile_fastmap)280 weak_alias (__re_compile_fastmap, re_compile_fastmap)
281 
282 static inline void
283 __attribute__ ((always_inline))
284 re_set_fastmap (char *fastmap, bool icase, int ch)
285 {
286   fastmap[ch] = 1;
287   if (icase)
288     fastmap[tolower (ch)] = 1;
289 }
290 
291 /* Helper function for re_compile_fastmap.
292    Compile fastmap for the initial_state INIT_STATE.  */
293 
294 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)295 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
296 			 char *fastmap)
297 {
298   re_dfa_t *dfa = bufp->buffer;
299   Idx node_cnt;
300   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
301   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
302     {
303       Idx node = init_state->nodes.elems[node_cnt];
304       re_token_type_t type = dfa->nodes[node].type;
305 
306       if (type == CHARACTER)
307 	{
308 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309 #ifdef RE_ENABLE_I18N
310 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
311 	    {
312 	      unsigned char buf[MB_LEN_MAX];
313 	      unsigned char *p;
314 	      wchar_t wc;
315 	      mbstate_t state;
316 
317 	      p = buf;
318 	      *p++ = dfa->nodes[node].opr.c;
319 	      while (++node < dfa->nodes_len
320 		     &&	dfa->nodes[node].type == CHARACTER
321 		     && dfa->nodes[node].mb_partial)
322 		*p++ = dfa->nodes[node].opr.c;
323 	      memset (&state, '\0', sizeof (state));
324 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
325 			     &state) == p - buf
326 		  && (__wcrtomb ((char *) buf, __towlower (wc), &state)
327 		      != (size_t) -1))
328 		re_set_fastmap (fastmap, false, buf[0]);
329 	    }
330 #endif
331 	}
332       else if (type == SIMPLE_BRACKET)
333 	{
334 	  int i, ch;
335 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
336 	    {
337 	      int j;
338 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
340 		if (w & ((bitset_word_t) 1 << j))
341 		  re_set_fastmap (fastmap, icase, ch);
342 	    }
343 	}
344 #ifdef RE_ENABLE_I18N
345       else if (type == COMPLEX_BRACKET)
346 	{
347 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
348 	  Idx i;
349 
350 # ifdef _LIBC
351 	  /* See if we have to try all bytes which start multiple collation
352 	     elements.
353 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354 		  collation element, and don't catch 'b' since 'b' is
355 		  the only collation element which starts from 'b' (and
356 		  it is caught by SIMPLE_BRACKET).  */
357 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
358 		  && (cset->ncoll_syms || cset->nranges))
359 		{
360 		  const int32_t *table = (const int32_t *)
361 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 		  for (i = 0; i < SBC_MAX; ++i)
363 		    if (table[i] < 0)
364 		      re_set_fastmap (fastmap, icase, i);
365 		}
366 # endif /* _LIBC */
367 
368 	  /* See if we have to start the match at all multibyte characters,
369 	     i.e. where we would not find an invalid sequence.  This only
370 	     applies to multibyte character sets; for single byte character
371 	     sets, the SIMPLE_BRACKET again suffices.  */
372 	  if (dfa->mb_cur_max > 1
373 	      && (cset->nchar_classes || cset->non_match || cset->nranges
374 # ifdef _LIBC
375 		  || cset->nequiv_classes
376 # endif /* _LIBC */
377 		 ))
378 	    {
379 	      unsigned char c = 0;
380 	      do
381 		{
382 		  mbstate_t mbs;
383 		  memset (&mbs, 0, sizeof (mbs));
384 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
385 		    re_set_fastmap (fastmap, false, (int) c);
386 		}
387 	      while (++c != 0);
388 	    }
389 
390 	  else
391 	    {
392 	      /* ... Else catch all bytes which can start the mbchars.  */
393 	      for (i = 0; i < cset->nmbchars; ++i)
394 		{
395 		  char buf[256];
396 		  mbstate_t state;
397 		  memset (&state, '\0', sizeof (state));
398 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
399 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
400 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
401 		    {
402 		      if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
403 			  != (size_t) -1)
404 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
405 		    }
406 		}
407 	    }
408 	}
409 #endif /* RE_ENABLE_I18N */
410       else if (type == OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 	       || type == OP_UTF8_PERIOD
413 #endif /* RE_ENABLE_I18N */
414 	       || type == END_OF_RE)
415 	{
416 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 	  if (type == END_OF_RE)
418 	    bufp->can_be_null = 1;
419 	  return;
420 	}
421     }
422 }
423 
424 /* Entry point for POSIX code.  */
425 /* regcomp takes a regular expression as a string and compiles it.
426 
427    PREG is a regex_t *.  We do not expect any fields to be initialized,
428    since POSIX says we shouldn't.  Thus, we set
429 
430      'buffer' to the compiled pattern;
431      'used' to the length of the compiled pattern;
432      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433        REG_EXTENDED bit in CFLAGS is set; otherwise, to
434        RE_SYNTAX_POSIX_BASIC;
435      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436      'fastmap' to an allocated space for the fastmap;
437      'fastmap_accurate' to zero;
438      're_nsub' to the number of subexpressions in PATTERN.
439 
440    PATTERN is the address of the pattern string.
441 
442    CFLAGS is a series of bits which affect compilation.
443 
444      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445      use POSIX basic syntax.
446 
447      If REG_NEWLINE is set, then . and [^...] don't match newline.
448      Also, regexec will try a match beginning after every newline.
449 
450      If REG_ICASE is set, then we considers upper- and lowercase
451      versions of letters to be equivalent when matching.
452 
453      If REG_NOSUB is set, then when PREG is passed to regexec, that
454      routine will report only success or failure, and nothing about the
455      registers.
456 
457    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
458    the return codes and their meanings.)  */
459 
460 int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)461 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
462 {
463   reg_errcode_t ret;
464   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465 			 : RE_SYNTAX_POSIX_BASIC);
466 
467   preg->buffer = NULL;
468   preg->allocated = 0;
469   preg->used = 0;
470 
471   /* Try to allocate space for the fastmap.  */
472   preg->fastmap = re_malloc (char, SBC_MAX);
473   if (__glibc_unlikely (preg->fastmap == NULL))
474     return REG_ESPACE;
475 
476   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
477 
478   /* If REG_NEWLINE is set, newlines are treated differently.  */
479   if (cflags & REG_NEWLINE)
480     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
481       syntax &= ~RE_DOT_NEWLINE;
482       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
483       /* It also changes the matching behavior.  */
484       preg->newline_anchor = 1;
485     }
486   else
487     preg->newline_anchor = 0;
488   preg->no_sub = !!(cflags & REG_NOSUB);
489   preg->translate = NULL;
490 
491   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
492 
493   /* POSIX doesn't distinguish between an unmatched open-group and an
494      unmatched close-group: both are REG_EPAREN.  */
495   if (ret == REG_ERPAREN)
496     ret = REG_EPAREN;
497 
498   /* We have already checked preg->fastmap != NULL.  */
499   if (__glibc_likely (ret == REG_NOERROR))
500     /* Compute the fastmap now, since regexec cannot modify the pattern
501        buffer.  This function never fails in this implementation.  */
502     (void) re_compile_fastmap (preg);
503   else
504     {
505       /* Some error occurred while compiling the expression.  */
506       re_free (preg->fastmap);
507       preg->fastmap = NULL;
508     }
509 
510   return (int) ret;
511 }
512 libc_hidden_def (__regcomp)
weak_alias(__regcomp,regcomp)513 weak_alias (__regcomp, regcomp)
514 
515 /* Returns a message corresponding to an error code, ERRCODE, returned
516    from either regcomp or regexec.   We don't use PREG here.  */
517 
518 size_t
519 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520 	  size_t errbuf_size)
521 {
522   const char *msg;
523   size_t msg_size;
524   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
525 
526   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
527     /* Only error codes returned by the rest of the code should be passed
528        to this routine.  If we are given anything else, or if other regex
529        code generates an invalid error code, then the program has a bug.
530        Dump core so we can fix it.  */
531     abort ();
532 
533   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
534 
535   msg_size = strlen (msg) + 1; /* Includes the null.  */
536 
537   if (__glibc_likely (errbuf_size != 0))
538     {
539       size_t cpy_size = msg_size;
540       if (__glibc_unlikely (msg_size > errbuf_size))
541 	{
542 	  cpy_size = errbuf_size - 1;
543 	  errbuf[cpy_size] = '\0';
544 	}
545       memcpy (errbuf, msg, cpy_size);
546     }
547 
548   return msg_size;
549 }
550 weak_alias (__regerror, regerror)
551 
552 
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555    UTF-8 is used.  Otherwise we would allocate memory just to initialize
556    it the same all the time.  UTF-8 is the preferred encoding so this is
557    a worthwhile optimization.  */
558 static const bitset_t utf8_sb_map =
559 {
560   /* Set the first 128 bits.  */
561 # if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
562   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
563 # else
564 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
565 #   error "bitset_word_t is narrower than 32 bits"
566 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569   BITSET_WORD_MAX, BITSET_WORD_MAX,
570 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571   BITSET_WORD_MAX,
572 #  endif
573   (BITSET_WORD_MAX
574    >> (SBC_MAX % BITSET_WORD_BITS == 0
575        ? 0
576        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577 # endif
578 };
579 #endif
580 
581 
582 static void
free_dfa_content(re_dfa_t * dfa)583 free_dfa_content (re_dfa_t *dfa)
584 {
585   Idx i, j;
586 
587   if (dfa->nodes)
588     for (i = 0; i < dfa->nodes_len; ++i)
589       free_token (dfa->nodes + i);
590   re_free (dfa->nexts);
591   for (i = 0; i < dfa->nodes_len; ++i)
592     {
593       if (dfa->eclosures != NULL)
594 	re_node_set_free (dfa->eclosures + i);
595       if (dfa->inveclosures != NULL)
596 	re_node_set_free (dfa->inveclosures + i);
597       if (dfa->edests != NULL)
598 	re_node_set_free (dfa->edests + i);
599     }
600   re_free (dfa->edests);
601   re_free (dfa->eclosures);
602   re_free (dfa->inveclosures);
603   re_free (dfa->nodes);
604 
605   if (dfa->state_table)
606     for (i = 0; i <= dfa->state_hash_mask; ++i)
607       {
608 	struct re_state_table_entry *entry = dfa->state_table + i;
609 	for (j = 0; j < entry->num; ++j)
610 	  {
611 	    re_dfastate_t *state = entry->array[j];
612 	    free_state (state);
613 	  }
614 	re_free (entry->array);
615       }
616   re_free (dfa->state_table);
617 #ifdef RE_ENABLE_I18N
618   if (dfa->sb_char != utf8_sb_map)
619     re_free (dfa->sb_char);
620 #endif
621   re_free (dfa->subexp_map);
622 #ifdef DEBUG
623   re_free (dfa->re_str);
624 #endif
625 
626   re_free (dfa);
627 }
628 
629 
630 /* Free dynamically allocated space used by PREG.  */
631 
632 void
regfree(regex_t * preg)633 regfree (regex_t *preg)
634 {
635   re_dfa_t *dfa = preg->buffer;
636   if (__glibc_likely (dfa != NULL))
637     {
638       lock_fini (dfa->lock);
639       free_dfa_content (dfa);
640     }
641   preg->buffer = NULL;
642   preg->allocated = 0;
643 
644   re_free (preg->fastmap);
645   preg->fastmap = NULL;
646 
647   re_free (preg->translate);
648   preg->translate = NULL;
649 }
650 libc_hidden_def (__regfree)
651 weak_alias (__regfree, regfree)
652 
653 /* Entry points compatible with 4.2 BSD regex library.  We don't define
654    them unless specifically requested.  */
655 
656 #if defined _REGEX_RE_COMP || defined _LIBC
657 
658 /* BSD has one and only one pattern buffer.  */
659 static struct re_pattern_buffer re_comp_buf;
660 
661 char *
662 # ifdef _LIBC
663 /* Make these definitions weak in libc, so POSIX programs can redefine
664    these names if they don't use our functions, and still use
665    regcomp/regexec above without link errors.  */
666 weak_function
667 # endif
re_comp(const char * s)668 re_comp (const char *s)
669 {
670   reg_errcode_t ret;
671   char *fastmap;
672 
673   if (!s)
674     {
675       if (!re_comp_buf.buffer)
676 	return gettext ("No previous regular expression");
677       return 0;
678     }
679 
680   if (re_comp_buf.buffer)
681     {
682       fastmap = re_comp_buf.fastmap;
683       re_comp_buf.fastmap = NULL;
684       __regfree (&re_comp_buf);
685       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
686       re_comp_buf.fastmap = fastmap;
687     }
688 
689   if (re_comp_buf.fastmap == NULL)
690     {
691       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
692       if (re_comp_buf.fastmap == NULL)
693 	return (char *) gettext (__re_error_msgid
694 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
695     }
696 
697   /* Since 're_exec' always passes NULL for the 'regs' argument, we
698      don't need to initialize the pattern buffer fields which affect it.  */
699 
700   /* Match anchors at newlines.  */
701   re_comp_buf.newline_anchor = 1;
702 
703   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704 
705   if (!ret)
706     return NULL;
707 
708   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
709   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
710 }
711 
712 #ifdef _LIBC
libc_freeres_fn(free_mem)713 libc_freeres_fn (free_mem)
714 {
715   __regfree (&re_comp_buf);
716 }
717 #endif
718 
719 #endif /* _REGEX_RE_COMP */
720 
721 /* Internal entry point.
722    Compile the regular expression PATTERN, whose length is LENGTH.
723    SYNTAX indicate regular expression's syntax.  */
724 
725 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)726 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
727 		     reg_syntax_t syntax)
728 {
729   reg_errcode_t err = REG_NOERROR;
730   re_dfa_t *dfa;
731   re_string_t regexp;
732 
733   /* Initialize the pattern buffer.  */
734   preg->fastmap_accurate = 0;
735   preg->syntax = syntax;
736   preg->not_bol = preg->not_eol = 0;
737   preg->used = 0;
738   preg->re_nsub = 0;
739   preg->can_be_null = 0;
740   preg->regs_allocated = REGS_UNALLOCATED;
741 
742   /* Initialize the dfa.  */
743   dfa = preg->buffer;
744   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
745     {
746       /* If zero allocated, but buffer is non-null, try to realloc
747 	 enough space.  This loses if buffer's address is bogus, but
748 	 that is the user's responsibility.  If ->buffer is NULL this
749 	 is a simple allocation.  */
750       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
751       if (dfa == NULL)
752 	return REG_ESPACE;
753       preg->allocated = sizeof (re_dfa_t);
754       preg->buffer = dfa;
755     }
756   preg->used = sizeof (re_dfa_t);
757 
758   err = init_dfa (dfa, length);
759   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
760     err = REG_ESPACE;
761   if (__glibc_unlikely (err != REG_NOERROR))
762     {
763       free_dfa_content (dfa);
764       preg->buffer = NULL;
765       preg->allocated = 0;
766       return err;
767     }
768 #ifdef DEBUG
769   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
770   dfa->re_str = re_malloc (char, length + 1);
771   strncpy (dfa->re_str, pattern, length + 1);
772 #endif
773 
774   err = re_string_construct (&regexp, pattern, length, preg->translate,
775 			     (syntax & RE_ICASE) != 0, dfa);
776   if (__glibc_unlikely (err != REG_NOERROR))
777     {
778     re_compile_internal_free_return:
779       free_workarea_compile (preg);
780       re_string_destruct (&regexp);
781       lock_fini (dfa->lock);
782       free_dfa_content (dfa);
783       preg->buffer = NULL;
784       preg->allocated = 0;
785       return err;
786     }
787 
788   /* Parse the regular expression, and build a structure tree.  */
789   preg->re_nsub = 0;
790   dfa->str_tree = parse (&regexp, preg, syntax, &err);
791   if (__glibc_unlikely (dfa->str_tree == NULL))
792     goto re_compile_internal_free_return;
793 
794   /* Analyze the tree and create the nfa.  */
795   err = analyze (preg);
796   if (__glibc_unlikely (err != REG_NOERROR))
797     goto re_compile_internal_free_return;
798 
799 #ifdef RE_ENABLE_I18N
800   /* If possible, do searching in single byte encoding to speed things up.  */
801   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
802     optimize_utf8 (dfa);
803 #endif
804 
805   /* Then create the initial state of the dfa.  */
806   err = create_initial_state (dfa);
807 
808   /* Release work areas.  */
809   free_workarea_compile (preg);
810   re_string_destruct (&regexp);
811 
812   if (__glibc_unlikely (err != REG_NOERROR))
813     {
814       lock_fini (dfa->lock);
815       free_dfa_content (dfa);
816       preg->buffer = NULL;
817       preg->allocated = 0;
818     }
819 
820   return err;
821 }
822 
823 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
824    as the initial length of some arrays.  */
825 
826 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)827 init_dfa (re_dfa_t *dfa, size_t pat_len)
828 {
829   __re_size_t table_size;
830 #ifndef _LIBC
831   const char *codeset_name;
832 #endif
833 #ifdef RE_ENABLE_I18N
834   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
835 #else
836   size_t max_i18n_object_size = 0;
837 #endif
838   size_t max_object_size =
839     MAX (sizeof (struct re_state_table_entry),
840 	 MAX (sizeof (re_token_t),
841 	      MAX (sizeof (re_node_set),
842 		   MAX (sizeof (regmatch_t),
843 			max_i18n_object_size))));
844 
845   memset (dfa, '\0', sizeof (re_dfa_t));
846 
847   /* Force allocation of str_tree_storage the first time.  */
848   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
849 
850   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
851      calculation below, and for similar doubling calculations
852      elsewhere.  And it's <= rather than <, because some of the
853      doubling calculations add 1 afterwards.  */
854   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
855 			<= pat_len))
856     return REG_ESPACE;
857 
858   dfa->nodes_alloc = pat_len + 1;
859   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860 
861   /*  table_size = 2 ^ ceil(log pat_len) */
862   for (table_size = 1; ; table_size <<= 1)
863     if (table_size > pat_len)
864       break;
865 
866   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867   dfa->state_hash_mask = table_size - 1;
868 
869   dfa->mb_cur_max = MB_CUR_MAX;
870 #ifdef _LIBC
871   if (dfa->mb_cur_max == 6
872       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
873     dfa->is_utf8 = 1;
874   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 		       != 0);
876 #else
877   codeset_name = nl_langinfo (CODESET);
878   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
879       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
880       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
881       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
882     dfa->is_utf8 = 1;
883 
884   /* We check exhaustively in the loop below if this charset is a
885      superset of ASCII.  */
886   dfa->map_notascii = 0;
887 #endif
888 
889 #ifdef RE_ENABLE_I18N
890   if (dfa->mb_cur_max > 1)
891     {
892       if (dfa->is_utf8)
893 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
894       else
895 	{
896 	  int i, j, ch;
897 
898 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
899 	  if (__glibc_unlikely (dfa->sb_char == NULL))
900 	    return REG_ESPACE;
901 
902 	  /* Set the bits corresponding to single byte chars.  */
903 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
904 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
905 	      {
906 		wint_t wch = __btowc (ch);
907 		if (wch != WEOF)
908 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
909 # ifndef _LIBC
910 		if (isascii (ch) && wch != ch)
911 		  dfa->map_notascii = 1;
912 # endif
913 	      }
914 	}
915     }
916 #endif
917 
918   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
919     return REG_ESPACE;
920   return REG_NOERROR;
921 }
922 
923 /* Initialize WORD_CHAR table, which indicate which character is
924    "word".  In this case "word" means that it is the word construction
925    character used by some operators like "\<", "\>", etc.  */
926 
927 static void
init_word_char(re_dfa_t * dfa)928 init_word_char (re_dfa_t *dfa)
929 {
930   int i = 0;
931   int j;
932   int ch = 0;
933   dfa->word_ops_used = 1;
934   if (__glibc_likely (dfa->map_notascii == 0))
935     {
936       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
937 	 them, an issue when this code is used in Gnulib.  */
938       bitset_word_t bits0 = 0x00000000;
939       bitset_word_t bits1 = 0x03ff0000;
940       bitset_word_t bits2 = 0x87fffffe;
941       bitset_word_t bits3 = 0x07fffffe;
942       if (BITSET_WORD_BITS == 64)
943 	{
944 	  /* Pacify gcc -Woverflow on 32-bit platformns.  */
945 	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
946 	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
947 	  i = 2;
948 	}
949       else if (BITSET_WORD_BITS == 32)
950 	{
951 	  dfa->word_char[0] = bits0;
952 	  dfa->word_char[1] = bits1;
953 	  dfa->word_char[2] = bits2;
954 	  dfa->word_char[3] = bits3;
955 	  i = 4;
956 	}
957       else
958         goto general_case;
959       ch = 128;
960 
961       if (__glibc_likely (dfa->is_utf8))
962 	{
963 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
964 	  return;
965 	}
966     }
967 
968  general_case:
969   for (; i < BITSET_WORDS; ++i)
970     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
971       if (isalnum (ch) || ch == '_')
972 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
973 }
974 
975 /* Free the work area which are only used while compiling.  */
976 
977 static void
free_workarea_compile(regex_t * preg)978 free_workarea_compile (regex_t *preg)
979 {
980   re_dfa_t *dfa = preg->buffer;
981   bin_tree_storage_t *storage, *next;
982   for (storage = dfa->str_tree_storage; storage; storage = next)
983     {
984       next = storage->next;
985       re_free (storage);
986     }
987   dfa->str_tree_storage = NULL;
988   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
989   dfa->str_tree = NULL;
990   re_free (dfa->org_indices);
991   dfa->org_indices = NULL;
992 }
993 
994 /* Create initial states for all contexts.  */
995 
996 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)997 create_initial_state (re_dfa_t *dfa)
998 {
999   Idx first, i;
1000   reg_errcode_t err;
1001   re_node_set init_nodes;
1002 
1003   /* Initial states have the epsilon closure of the node which is
1004      the first node of the regular expression.  */
1005   first = dfa->str_tree->first->node_idx;
1006   dfa->init_node = first;
1007   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008   if (__glibc_unlikely (err != REG_NOERROR))
1009     return err;
1010 
1011   /* The back-references which are in initial states can epsilon transit,
1012      since in this case all of the subexpressions can be null.
1013      Then we add epsilon closures of the nodes which are the next nodes of
1014      the back-references.  */
1015   if (dfa->nbackref > 0)
1016     for (i = 0; i < init_nodes.nelem; ++i)
1017       {
1018 	Idx node_idx = init_nodes.elems[i];
1019 	re_token_type_t type = dfa->nodes[node_idx].type;
1020 
1021 	Idx clexp_idx;
1022 	if (type != OP_BACK_REF)
1023 	  continue;
1024 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025 	  {
1026 	    re_token_t *clexp_node;
1027 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1029 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030 	      break;
1031 	  }
1032 	if (clexp_idx == init_nodes.nelem)
1033 	  continue;
1034 
1035 	if (type == OP_BACK_REF)
1036 	  {
1037 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1038 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1039 	      {
1040 		reg_errcode_t merge_err
1041                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1042 		if (merge_err != REG_NOERROR)
1043 		  return merge_err;
1044 		i = 0;
1045 	      }
1046 	  }
1047       }
1048 
1049   /* It must be the first time to invoke acquire_state.  */
1050   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051   /* We don't check ERR here, since the initial state must not be NULL.  */
1052   if (__glibc_unlikely (dfa->init_state == NULL))
1053     return err;
1054   if (dfa->init_state->has_constraint)
1055     {
1056       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057 						       CONTEXT_WORD);
1058       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059 						     CONTEXT_NEWLINE);
1060       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061 							 &init_nodes,
1062 							 CONTEXT_NEWLINE
1063 							 | CONTEXT_BEGBUF);
1064       if (__glibc_unlikely (dfa->init_state_word == NULL
1065 			    || dfa->init_state_nl == NULL
1066 			    || dfa->init_state_begbuf == NULL))
1067 	return err;
1068     }
1069   else
1070     dfa->init_state_word = dfa->init_state_nl
1071       = dfa->init_state_begbuf = dfa->init_state;
1072 
1073   re_node_set_free (&init_nodes);
1074   return REG_NOERROR;
1075 }
1076 
1077 #ifdef RE_ENABLE_I18N
1078 /* If it is possible to do searching in single byte encoding instead of UTF-8
1079    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080    DFA nodes where needed.  */
1081 
1082 static void
optimize_utf8(re_dfa_t * dfa)1083 optimize_utf8 (re_dfa_t *dfa)
1084 {
1085   Idx node;
1086   int i;
1087   bool mb_chars = false;
1088   bool has_period = false;
1089 
1090   for (node = 0; node < dfa->nodes_len; ++node)
1091     switch (dfa->nodes[node].type)
1092       {
1093       case CHARACTER:
1094 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1095 	  mb_chars = true;
1096 	break;
1097       case ANCHOR:
1098 	switch (dfa->nodes[node].opr.ctx_type)
1099 	  {
1100 	  case LINE_FIRST:
1101 	  case LINE_LAST:
1102 	  case BUF_FIRST:
1103 	  case BUF_LAST:
1104 	    break;
1105 	  default:
1106 	    /* Word anchors etc. cannot be handled.  It's okay to test
1107 	       opr.ctx_type since constraints (for all DFA nodes) are
1108 	       created by ORing one or more opr.ctx_type values.  */
1109 	    return;
1110 	  }
1111 	break;
1112       case OP_PERIOD:
1113 	has_period = true;
1114 	break;
1115       case OP_BACK_REF:
1116       case OP_ALT:
1117       case END_OF_RE:
1118       case OP_DUP_ASTERISK:
1119       case OP_OPEN_SUBEXP:
1120       case OP_CLOSE_SUBEXP:
1121 	break;
1122       case COMPLEX_BRACKET:
1123 	return;
1124       case SIMPLE_BRACKET:
1125 	/* Just double check.  */
1126 	{
1127 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1128 			? 0
1129 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1130 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131 	    {
1132 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1133 		return;
1134 	      rshift = 0;
1135 	    }
1136 	}
1137 	break;
1138       default:
1139 	abort ();
1140       }
1141 
1142   if (mb_chars || has_period)
1143     for (node = 0; node < dfa->nodes_len; ++node)
1144       {
1145 	if (dfa->nodes[node].type == CHARACTER
1146 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1147 	  dfa->nodes[node].mb_partial = 0;
1148 	else if (dfa->nodes[node].type == OP_PERIOD)
1149 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1150       }
1151 
1152   /* The search can be in single byte locale.  */
1153   dfa->mb_cur_max = 1;
1154   dfa->is_utf8 = 0;
1155   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1156 }
1157 #endif
1158 
1159 /* Analyze the structure tree, and calculate "first", "next", "edest",
1160    "eclosure", and "inveclosure".  */
1161 
1162 static reg_errcode_t
analyze(regex_t * preg)1163 analyze (regex_t *preg)
1164 {
1165   re_dfa_t *dfa = preg->buffer;
1166   reg_errcode_t ret;
1167 
1168   /* Allocate arrays.  */
1169   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1170   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1171   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1172   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1174 			|| dfa->edests == NULL || dfa->eclosures == NULL))
1175     return REG_ESPACE;
1176 
1177   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1178   if (dfa->subexp_map != NULL)
1179     {
1180       Idx i;
1181       for (i = 0; i < preg->re_nsub; i++)
1182 	dfa->subexp_map[i] = i;
1183       preorder (dfa->str_tree, optimize_subexps, dfa);
1184       for (i = 0; i < preg->re_nsub; i++)
1185 	if (dfa->subexp_map[i] != i)
1186 	  break;
1187       if (i == preg->re_nsub)
1188 	{
1189 	  re_free (dfa->subexp_map);
1190 	  dfa->subexp_map = NULL;
1191 	}
1192     }
1193 
1194   ret = postorder (dfa->str_tree, lower_subexps, preg);
1195   if (__glibc_unlikely (ret != REG_NOERROR))
1196     return ret;
1197   ret = postorder (dfa->str_tree, calc_first, dfa);
1198   if (__glibc_unlikely (ret != REG_NOERROR))
1199     return ret;
1200   preorder (dfa->str_tree, calc_next, dfa);
1201   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202   if (__glibc_unlikely (ret != REG_NOERROR))
1203     return ret;
1204   ret = calc_eclosure (dfa);
1205   if (__glibc_unlikely (ret != REG_NOERROR))
1206     return ret;
1207 
1208   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1209      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1210   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1211       || dfa->nbackref)
1212     {
1213       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214       if (__glibc_unlikely (dfa->inveclosures == NULL))
1215 	return REG_ESPACE;
1216       ret = calc_inveclosure (dfa);
1217     }
1218 
1219   return ret;
1220 }
1221 
1222 /* Our parse trees are very unbalanced, so we cannot use a stack to
1223    implement parse tree visits.  Instead, we use parent pointers and
1224    some hairy code in these two functions.  */
1225 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1226 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1227 	   void *extra)
1228 {
1229   bin_tree_t *node, *prev;
1230 
1231   for (node = root; ; )
1232     {
1233       /* Descend down the tree, preferably to the left (or to the right
1234 	 if that's the only child).  */
1235       while (node->left || node->right)
1236 	if (node->left)
1237 	  node = node->left;
1238 	else
1239 	  node = node->right;
1240 
1241       do
1242 	{
1243 	  reg_errcode_t err = fn (extra, node);
1244 	  if (__glibc_unlikely (err != REG_NOERROR))
1245 	    return err;
1246 	  if (node->parent == NULL)
1247 	    return REG_NOERROR;
1248 	  prev = node;
1249 	  node = node->parent;
1250 	}
1251       /* Go up while we have a node that is reached from the right.  */
1252       while (node->right == prev || node->right == NULL);
1253       node = node->right;
1254     }
1255 }
1256 
1257 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1258 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1259 	  void *extra)
1260 {
1261   bin_tree_t *node;
1262 
1263   for (node = root; ; )
1264     {
1265       reg_errcode_t err = fn (extra, node);
1266       if (__glibc_unlikely (err != REG_NOERROR))
1267 	return err;
1268 
1269       /* Go to the left node, or up and to the right.  */
1270       if (node->left)
1271 	node = node->left;
1272       else
1273 	{
1274 	  bin_tree_t *prev = NULL;
1275 	  while (node->right == prev || node->right == NULL)
1276 	    {
1277 	      prev = node;
1278 	      node = node->parent;
1279 	      if (!node)
1280 		return REG_NOERROR;
1281 	    }
1282 	  node = node->right;
1283 	}
1284     }
1285 }
1286 
1287 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1288    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1289    backreferences as well.  Requires a preorder visit.  */
1290 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1291 optimize_subexps (void *extra, bin_tree_t *node)
1292 {
1293   re_dfa_t *dfa = (re_dfa_t *) extra;
1294 
1295   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1296     {
1297       int idx = node->token.opr.idx;
1298       node->token.opr.idx = dfa->subexp_map[idx];
1299       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1300     }
1301 
1302   else if (node->token.type == SUBEXP
1303 	   && node->left && node->left->token.type == SUBEXP)
1304     {
1305       Idx other_idx = node->left->token.opr.idx;
1306 
1307       node->left = node->left->left;
1308       if (node->left)
1309 	node->left->parent = node;
1310 
1311       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1312       if (other_idx < BITSET_WORD_BITS)
1313 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1314     }
1315 
1316   return REG_NOERROR;
1317 }
1318 
1319 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1320    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1321 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1322 lower_subexps (void *extra, bin_tree_t *node)
1323 {
1324   regex_t *preg = (regex_t *) extra;
1325   reg_errcode_t err = REG_NOERROR;
1326 
1327   if (node->left && node->left->token.type == SUBEXP)
1328     {
1329       node->left = lower_subexp (&err, preg, node->left);
1330       if (node->left)
1331 	node->left->parent = node;
1332     }
1333   if (node->right && node->right->token.type == SUBEXP)
1334     {
1335       node->right = lower_subexp (&err, preg, node->right);
1336       if (node->right)
1337 	node->right->parent = node;
1338     }
1339 
1340   return err;
1341 }
1342 
1343 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1344 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1345 {
1346   re_dfa_t *dfa = preg->buffer;
1347   bin_tree_t *body = node->left;
1348   bin_tree_t *op, *cls, *tree1, *tree;
1349 
1350   if (preg->no_sub
1351       /* We do not optimize empty subexpressions, because otherwise we may
1352 	 have bad CONCAT nodes with NULL children.  This is obviously not
1353 	 very common, so we do not lose much.  An example that triggers
1354 	 this case is the sed "script" /\(\)/x.  */
1355       && node->left != NULL
1356       && (node->token.opr.idx >= BITSET_WORD_BITS
1357 	  || !(dfa->used_bkref_map
1358 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1359     return node->left;
1360 
1361   /* Convert the SUBEXP node to the concatenation of an
1362      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1363   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1364   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1365   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1366   tree = create_tree (dfa, op, tree1, CONCAT);
1367   if (__glibc_unlikely (tree == NULL || tree1 == NULL
1368 			|| op == NULL || cls == NULL))
1369     {
1370       *err = REG_ESPACE;
1371       return NULL;
1372     }
1373 
1374   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1375   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1376   return tree;
1377 }
1378 
1379 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1380    nodes.  Requires a postorder visit.  */
1381 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1382 calc_first (void *extra, bin_tree_t *node)
1383 {
1384   re_dfa_t *dfa = (re_dfa_t *) extra;
1385   if (node->token.type == CONCAT)
1386     {
1387       node->first = node->left->first;
1388       node->node_idx = node->left->node_idx;
1389     }
1390   else
1391     {
1392       node->first = node;
1393       node->node_idx = re_dfa_add_node (dfa, node->token);
1394       if (__glibc_unlikely (node->node_idx == -1))
1395 	return REG_ESPACE;
1396       if (node->token.type == ANCHOR)
1397 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1398     }
1399   return REG_NOERROR;
1400 }
1401 
1402 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1403 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1404 calc_next (void *extra, bin_tree_t *node)
1405 {
1406   switch (node->token.type)
1407     {
1408     case OP_DUP_ASTERISK:
1409       node->left->next = node;
1410       break;
1411     case CONCAT:
1412       node->left->next = node->right->first;
1413       node->right->next = node->next;
1414       break;
1415     default:
1416       if (node->left)
1417 	node->left->next = node->next;
1418       if (node->right)
1419 	node->right->next = node->next;
1420       break;
1421     }
1422   return REG_NOERROR;
1423 }
1424 
1425 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1426 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1427 link_nfa_nodes (void *extra, bin_tree_t *node)
1428 {
1429   re_dfa_t *dfa = (re_dfa_t *) extra;
1430   Idx idx = node->node_idx;
1431   reg_errcode_t err = REG_NOERROR;
1432 
1433   switch (node->token.type)
1434     {
1435     case CONCAT:
1436       break;
1437 
1438     case END_OF_RE:
1439       DEBUG_ASSERT (node->next == NULL);
1440       break;
1441 
1442     case OP_DUP_ASTERISK:
1443     case OP_ALT:
1444       {
1445 	Idx left, right;
1446 	dfa->has_plural_match = 1;
1447 	if (node->left != NULL)
1448 	  left = node->left->first->node_idx;
1449 	else
1450 	  left = node->next->node_idx;
1451 	if (node->right != NULL)
1452 	  right = node->right->first->node_idx;
1453 	else
1454 	  right = node->next->node_idx;
1455 	DEBUG_ASSERT (left > -1);
1456 	DEBUG_ASSERT (right > -1);
1457 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1458       }
1459       break;
1460 
1461     case ANCHOR:
1462     case OP_OPEN_SUBEXP:
1463     case OP_CLOSE_SUBEXP:
1464       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1465       break;
1466 
1467     case OP_BACK_REF:
1468       dfa->nexts[idx] = node->next->node_idx;
1469       if (node->token.type == OP_BACK_REF)
1470 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1471       break;
1472 
1473     default:
1474       DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1475       dfa->nexts[idx] = node->next->node_idx;
1476       break;
1477     }
1478 
1479   return err;
1480 }
1481 
1482 /* Duplicate the epsilon closure of the node ROOT_NODE.
1483    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1484    to their own constraint.  */
1485 
1486 static reg_errcode_t
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1487 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1488 			Idx root_node, unsigned int init_constraint)
1489 {
1490   Idx org_node, clone_node;
1491   bool ok;
1492   unsigned int constraint = init_constraint;
1493   for (org_node = top_org_node, clone_node = top_clone_node;;)
1494     {
1495       Idx org_dest, clone_dest;
1496       if (dfa->nodes[org_node].type == OP_BACK_REF)
1497 	{
1498 	  /* If the back reference epsilon-transit, its destination must
1499 	     also have the constraint.  Then duplicate the epsilon closure
1500 	     of the destination of the back reference, and store it in
1501 	     edests of the back reference.  */
1502 	  org_dest = dfa->nexts[org_node];
1503 	  re_node_set_empty (dfa->edests + clone_node);
1504 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1505 	  if (__glibc_unlikely (clone_dest == -1))
1506 	    return REG_ESPACE;
1507 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1508 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509 	  if (__glibc_unlikely (! ok))
1510 	    return REG_ESPACE;
1511 	}
1512       else if (dfa->edests[org_node].nelem == 0)
1513 	{
1514 	  /* In case of the node can't epsilon-transit, don't duplicate the
1515 	     destination and store the original destination as the
1516 	     destination of the node.  */
1517 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1518 	  break;
1519 	}
1520       else if (dfa->edests[org_node].nelem == 1)
1521 	{
1522 	  /* In case of the node can epsilon-transit, and it has only one
1523 	     destination.  */
1524 	  org_dest = dfa->edests[org_node].elems[0];
1525 	  re_node_set_empty (dfa->edests + clone_node);
1526 	  /* If the node is root_node itself, it means the epsilon closure
1527 	     has a loop.  Then tie it to the destination of the root_node.  */
1528 	  if (org_node == root_node && clone_node != org_node)
1529 	    {
1530 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531 	      if (__glibc_unlikely (! ok))
1532 	        return REG_ESPACE;
1533 	      break;
1534 	    }
1535 	  /* In case the node has another constraint, append it.  */
1536 	  constraint |= dfa->nodes[org_node].constraint;
1537 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1538 	  if (__glibc_unlikely (clone_dest == -1))
1539 	    return REG_ESPACE;
1540 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541 	  if (__glibc_unlikely (! ok))
1542 	    return REG_ESPACE;
1543 	}
1544       else /* dfa->edests[org_node].nelem == 2 */
1545 	{
1546 	  /* In case of the node can epsilon-transit, and it has two
1547 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1548 	  org_dest = dfa->edests[org_node].elems[0];
1549 	  re_node_set_empty (dfa->edests + clone_node);
1550 	  /* Search for a duplicated node which satisfies the constraint.  */
1551 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552 	  if (clone_dest == -1)
1553 	    {
1554 	      /* There is no such duplicated node, create a new one.  */
1555 	      reg_errcode_t err;
1556 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1557 	      if (__glibc_unlikely (clone_dest == -1))
1558 		return REG_ESPACE;
1559 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 	      if (__glibc_unlikely (! ok))
1561 		return REG_ESPACE;
1562 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1563 					    root_node, constraint);
1564 	      if (__glibc_unlikely (err != REG_NOERROR))
1565 		return err;
1566 	    }
1567 	  else
1568 	    {
1569 	      /* There is a duplicated node which satisfies the constraint,
1570 		 use it to avoid infinite loop.  */
1571 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 	      if (__glibc_unlikely (! ok))
1573 		return REG_ESPACE;
1574 	    }
1575 
1576 	  org_dest = dfa->edests[org_node].elems[1];
1577 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1578 	  if (__glibc_unlikely (clone_dest == -1))
1579 	    return REG_ESPACE;
1580 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581 	  if (__glibc_unlikely (! ok))
1582 	    return REG_ESPACE;
1583 	}
1584       org_node = org_dest;
1585       clone_node = clone_dest;
1586     }
1587   return REG_NOERROR;
1588 }
1589 
1590 /* Search for a node which is duplicated from the node ORG_NODE, and
1591    satisfies the constraint CONSTRAINT.  */
1592 
1593 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1594 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1595 			unsigned int constraint)
1596 {
1597   Idx idx;
1598   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1599     {
1600       if (org_node == dfa->org_indices[idx]
1601 	  && constraint == dfa->nodes[idx].constraint)
1602 	return idx; /* Found.  */
1603     }
1604   return -1; /* Not found.  */
1605 }
1606 
1607 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1608    Return the index of the new node, or -1 if insufficient storage is
1609    available.  */
1610 
1611 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1612 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1613 {
1614   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615   if (__glibc_likely (dup_idx != -1))
1616     {
1617       dfa->nodes[dup_idx].constraint = constraint;
1618       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1619       dfa->nodes[dup_idx].duplicated = 1;
1620 
1621       /* Store the index of the original node.  */
1622       dfa->org_indices[dup_idx] = org_idx;
1623     }
1624   return dup_idx;
1625 }
1626 
1627 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1628 calc_inveclosure (re_dfa_t *dfa)
1629 {
1630   Idx src, idx;
1631   bool ok;
1632   for (idx = 0; idx < dfa->nodes_len; ++idx)
1633     re_node_set_init_empty (dfa->inveclosures + idx);
1634 
1635   for (src = 0; src < dfa->nodes_len; ++src)
1636     {
1637       Idx *elems = dfa->eclosures[src].elems;
1638       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1639 	{
1640 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641 	  if (__glibc_unlikely (! ok))
1642 	    return REG_ESPACE;
1643 	}
1644     }
1645 
1646   return REG_NOERROR;
1647 }
1648 
1649 /* Calculate "eclosure" for all the node in DFA.  */
1650 
1651 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1652 calc_eclosure (re_dfa_t *dfa)
1653 {
1654   Idx node_idx;
1655   bool incomplete;
1656   DEBUG_ASSERT (dfa->nodes_len > 0);
1657   incomplete = false;
1658   /* For each nodes, calculate epsilon closure.  */
1659   for (node_idx = 0; ; ++node_idx)
1660     {
1661       reg_errcode_t err;
1662       re_node_set eclosure_elem;
1663       if (node_idx == dfa->nodes_len)
1664 	{
1665 	  if (!incomplete)
1666 	    break;
1667 	  incomplete = false;
1668 	  node_idx = 0;
1669 	}
1670 
1671       DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1672 
1673       /* If we have already calculated, skip it.  */
1674       if (dfa->eclosures[node_idx].nelem != 0)
1675 	continue;
1676       /* Calculate epsilon closure of 'node_idx'.  */
1677       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1678       if (__glibc_unlikely (err != REG_NOERROR))
1679 	return err;
1680 
1681       if (dfa->eclosures[node_idx].nelem == 0)
1682 	{
1683 	  incomplete = true;
1684 	  re_node_set_free (&eclosure_elem);
1685 	}
1686     }
1687   return REG_NOERROR;
1688 }
1689 
1690 /* Calculate epsilon closure of NODE.  */
1691 
1692 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1693 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1694 {
1695   reg_errcode_t err;
1696   Idx i;
1697   re_node_set eclosure;
1698   bool incomplete = false;
1699   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1700   if (__glibc_unlikely (err != REG_NOERROR))
1701     return err;
1702 
1703   /* An epsilon closure includes itself.  */
1704   eclosure.elems[eclosure.nelem++] = node;
1705 
1706   /* This indicates that we are calculating this node now.
1707      We reference this value to avoid infinite loop.  */
1708   dfa->eclosures[node].nelem = -1;
1709 
1710   /* If the current node has constraints, duplicate all nodes
1711      since they must inherit the constraints.  */
1712   if (dfa->nodes[node].constraint
1713       && dfa->edests[node].nelem
1714       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1715     {
1716       err = duplicate_node_closure (dfa, node, node, node,
1717 				    dfa->nodes[node].constraint);
1718       if (__glibc_unlikely (err != REG_NOERROR))
1719 	return err;
1720     }
1721 
1722   /* Expand each epsilon destination nodes.  */
1723   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1724     for (i = 0; i < dfa->edests[node].nelem; ++i)
1725       {
1726 	re_node_set eclosure_elem;
1727 	Idx edest = dfa->edests[node].elems[i];
1728 	/* If calculating the epsilon closure of 'edest' is in progress,
1729 	   return intermediate result.  */
1730 	if (dfa->eclosures[edest].nelem == -1)
1731 	  {
1732 	    incomplete = true;
1733 	    continue;
1734 	  }
1735 	/* If we haven't calculated the epsilon closure of 'edest' yet,
1736 	   calculate now. Otherwise use calculated epsilon closure.  */
1737 	if (dfa->eclosures[edest].nelem == 0)
1738 	  {
1739 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1740 	    if (__glibc_unlikely (err != REG_NOERROR))
1741 	      return err;
1742 	  }
1743 	else
1744 	  eclosure_elem = dfa->eclosures[edest];
1745 	/* Merge the epsilon closure of 'edest'.  */
1746 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1747 	if (__glibc_unlikely (err != REG_NOERROR))
1748 	  return err;
1749 	/* If the epsilon closure of 'edest' is incomplete,
1750 	   the epsilon closure of this node is also incomplete.  */
1751 	if (dfa->eclosures[edest].nelem == 0)
1752 	  {
1753 	    incomplete = true;
1754 	    re_node_set_free (&eclosure_elem);
1755 	  }
1756       }
1757 
1758   if (incomplete && !root)
1759     dfa->eclosures[node].nelem = 0;
1760   else
1761     dfa->eclosures[node] = eclosure;
1762   *new_set = eclosure;
1763   return REG_NOERROR;
1764 }
1765 
1766 /* Functions for token which are used in the parser.  */
1767 
1768 /* Fetch a token from INPUT.
1769    We must not use this function inside bracket expressions.  */
1770 
1771 static void
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1772 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1773 {
1774   re_string_skip_bytes (input, peek_token (result, input, syntax));
1775 }
1776 
1777 /* Peek a token from INPUT, and return the length of the token.
1778    We must not use this function inside bracket expressions.  */
1779 
1780 static int
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1781 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1782 {
1783   unsigned char c;
1784 
1785   if (re_string_eoi (input))
1786     {
1787       token->type = END_OF_RE;
1788       return 0;
1789     }
1790 
1791   c = re_string_peek_byte (input, 0);
1792   token->opr.c = c;
1793 
1794   token->word_char = 0;
1795 #ifdef RE_ENABLE_I18N
1796   token->mb_partial = 0;
1797   if (input->mb_cur_max > 1
1798       && !re_string_first_byte (input, re_string_cur_idx (input)))
1799     {
1800       token->type = CHARACTER;
1801       token->mb_partial = 1;
1802       return 1;
1803     }
1804 #endif
1805   if (c == '\\')
1806     {
1807       unsigned char c2;
1808       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1809 	{
1810 	  token->type = BACK_SLASH;
1811 	  return 1;
1812 	}
1813 
1814       c2 = re_string_peek_byte_case (input, 1);
1815       token->opr.c = c2;
1816       token->type = CHARACTER;
1817 #ifdef RE_ENABLE_I18N
1818       if (input->mb_cur_max > 1)
1819 	{
1820 	  wint_t wc = re_string_wchar_at (input,
1821 					  re_string_cur_idx (input) + 1);
1822 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1823 	}
1824       else
1825 #endif
1826 	token->word_char = IS_WORD_CHAR (c2) != 0;
1827 
1828       switch (c2)
1829 	{
1830 	case '|':
1831 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1832 	    token->type = OP_ALT;
1833 	  break;
1834 	case '1': case '2': case '3': case '4': case '5':
1835 	case '6': case '7': case '8': case '9':
1836 	  if (!(syntax & RE_NO_BK_REFS))
1837 	    {
1838 	      token->type = OP_BACK_REF;
1839 	      token->opr.idx = c2 - '1';
1840 	    }
1841 	  break;
1842 	case '<':
1843 	  if (!(syntax & RE_NO_GNU_OPS))
1844 	    {
1845 	      token->type = ANCHOR;
1846 	      token->opr.ctx_type = WORD_FIRST;
1847 	    }
1848 	  break;
1849 	case '>':
1850 	  if (!(syntax & RE_NO_GNU_OPS))
1851 	    {
1852 	      token->type = ANCHOR;
1853 	      token->opr.ctx_type = WORD_LAST;
1854 	    }
1855 	  break;
1856 	case 'b':
1857 	  if (!(syntax & RE_NO_GNU_OPS))
1858 	    {
1859 	      token->type = ANCHOR;
1860 	      token->opr.ctx_type = WORD_DELIM;
1861 	    }
1862 	  break;
1863 	case 'B':
1864 	  if (!(syntax & RE_NO_GNU_OPS))
1865 	    {
1866 	      token->type = ANCHOR;
1867 	      token->opr.ctx_type = NOT_WORD_DELIM;
1868 	    }
1869 	  break;
1870 	case 'w':
1871 	  if (!(syntax & RE_NO_GNU_OPS))
1872 	    token->type = OP_WORD;
1873 	  break;
1874 	case 'W':
1875 	  if (!(syntax & RE_NO_GNU_OPS))
1876 	    token->type = OP_NOTWORD;
1877 	  break;
1878 	case 's':
1879 	  if (!(syntax & RE_NO_GNU_OPS))
1880 	    token->type = OP_SPACE;
1881 	  break;
1882 	case 'S':
1883 	  if (!(syntax & RE_NO_GNU_OPS))
1884 	    token->type = OP_NOTSPACE;
1885 	  break;
1886 	case '`':
1887 	  if (!(syntax & RE_NO_GNU_OPS))
1888 	    {
1889 	      token->type = ANCHOR;
1890 	      token->opr.ctx_type = BUF_FIRST;
1891 	    }
1892 	  break;
1893 	case '\'':
1894 	  if (!(syntax & RE_NO_GNU_OPS))
1895 	    {
1896 	      token->type = ANCHOR;
1897 	      token->opr.ctx_type = BUF_LAST;
1898 	    }
1899 	  break;
1900 	case '(':
1901 	  if (!(syntax & RE_NO_BK_PARENS))
1902 	    token->type = OP_OPEN_SUBEXP;
1903 	  break;
1904 	case ')':
1905 	  if (!(syntax & RE_NO_BK_PARENS))
1906 	    token->type = OP_CLOSE_SUBEXP;
1907 	  break;
1908 	case '+':
1909 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910 	    token->type = OP_DUP_PLUS;
1911 	  break;
1912 	case '?':
1913 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914 	    token->type = OP_DUP_QUESTION;
1915 	  break;
1916 	case '{':
1917 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918 	    token->type = OP_OPEN_DUP_NUM;
1919 	  break;
1920 	case '}':
1921 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922 	    token->type = OP_CLOSE_DUP_NUM;
1923 	  break;
1924 	default:
1925 	  break;
1926 	}
1927       return 2;
1928     }
1929 
1930   token->type = CHARACTER;
1931 #ifdef RE_ENABLE_I18N
1932   if (input->mb_cur_max > 1)
1933     {
1934       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1935       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1936     }
1937   else
1938 #endif
1939     token->word_char = IS_WORD_CHAR (token->opr.c);
1940 
1941   switch (c)
1942     {
1943     case '\n':
1944       if (syntax & RE_NEWLINE_ALT)
1945 	token->type = OP_ALT;
1946       break;
1947     case '|':
1948       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1949 	token->type = OP_ALT;
1950       break;
1951     case '*':
1952       token->type = OP_DUP_ASTERISK;
1953       break;
1954     case '+':
1955       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956 	token->type = OP_DUP_PLUS;
1957       break;
1958     case '?':
1959       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960 	token->type = OP_DUP_QUESTION;
1961       break;
1962     case '{':
1963       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964 	token->type = OP_OPEN_DUP_NUM;
1965       break;
1966     case '}':
1967       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968 	token->type = OP_CLOSE_DUP_NUM;
1969       break;
1970     case '(':
1971       if (syntax & RE_NO_BK_PARENS)
1972 	token->type = OP_OPEN_SUBEXP;
1973       break;
1974     case ')':
1975       if (syntax & RE_NO_BK_PARENS)
1976 	token->type = OP_CLOSE_SUBEXP;
1977       break;
1978     case '[':
1979       token->type = OP_OPEN_BRACKET;
1980       break;
1981     case '.':
1982       token->type = OP_PERIOD;
1983       break;
1984     case '^':
1985       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1986 	  && re_string_cur_idx (input) != 0)
1987 	{
1988 	  char prev = re_string_peek_byte (input, -1);
1989 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1990 	    break;
1991 	}
1992       token->type = ANCHOR;
1993       token->opr.ctx_type = LINE_FIRST;
1994       break;
1995     case '$':
1996       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1997 	  && re_string_cur_idx (input) + 1 != re_string_length (input))
1998 	{
1999 	  re_token_t next;
2000 	  re_string_skip_bytes (input, 1);
2001 	  peek_token (&next, input, syntax);
2002 	  re_string_skip_bytes (input, -1);
2003 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2004 	    break;
2005 	}
2006       token->type = ANCHOR;
2007       token->opr.ctx_type = LINE_LAST;
2008       break;
2009     default:
2010       break;
2011     }
2012   return 1;
2013 }
2014 
2015 /* Peek a token from INPUT, and return the length of the token.
2016    We must not use this function out of bracket expressions.  */
2017 
2018 static int
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)2019 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2020 {
2021   unsigned char c;
2022   if (re_string_eoi (input))
2023     {
2024       token->type = END_OF_RE;
2025       return 0;
2026     }
2027   c = re_string_peek_byte (input, 0);
2028   token->opr.c = c;
2029 
2030 #ifdef RE_ENABLE_I18N
2031   if (input->mb_cur_max > 1
2032       && !re_string_first_byte (input, re_string_cur_idx (input)))
2033     {
2034       token->type = CHARACTER;
2035       return 1;
2036     }
2037 #endif /* RE_ENABLE_I18N */
2038 
2039   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2040       && re_string_cur_idx (input) + 1 < re_string_length (input))
2041     {
2042       /* In this case, '\' escape a character.  */
2043       unsigned char c2;
2044       re_string_skip_bytes (input, 1);
2045       c2 = re_string_peek_byte (input, 0);
2046       token->opr.c = c2;
2047       token->type = CHARACTER;
2048       return 1;
2049     }
2050   if (c == '[') /* '[' is a special char in a bracket exps.  */
2051     {
2052       unsigned char c2;
2053       int token_len;
2054       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2055 	c2 = re_string_peek_byte (input, 1);
2056       else
2057 	c2 = 0;
2058       token->opr.c = c2;
2059       token_len = 2;
2060       switch (c2)
2061 	{
2062 	case '.':
2063 	  token->type = OP_OPEN_COLL_ELEM;
2064 	  break;
2065 
2066 	case '=':
2067 	  token->type = OP_OPEN_EQUIV_CLASS;
2068 	  break;
2069 
2070 	case ':':
2071 	  if (syntax & RE_CHAR_CLASSES)
2072 	    {
2073 	      token->type = OP_OPEN_CHAR_CLASS;
2074 	      break;
2075 	    }
2076 	  FALLTHROUGH;
2077 	default:
2078 	  token->type = CHARACTER;
2079 	  token->opr.c = c;
2080 	  token_len = 1;
2081 	  break;
2082 	}
2083       return token_len;
2084     }
2085   switch (c)
2086     {
2087     case '-':
2088       token->type = OP_CHARSET_RANGE;
2089       break;
2090     case ']':
2091       token->type = OP_CLOSE_BRACKET;
2092       break;
2093     case '^':
2094       token->type = OP_NON_MATCH_LIST;
2095       break;
2096     default:
2097       token->type = CHARACTER;
2098     }
2099   return 1;
2100 }
2101 
2102 /* Functions for parser.  */
2103 
2104 /* Entry point of the parser.
2105    Parse the regular expression REGEXP and return the structure tree.
2106    If an error occurs, ERR is set by error code, and return NULL.
2107    This function build the following tree, from regular expression <reg_exp>:
2108 	   CAT
2109 	   / \
2110 	  /   \
2111    <reg_exp>  EOR
2112 
2113    CAT means concatenation.
2114    EOR means end of regular expression.  */
2115 
2116 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2117 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2118        reg_errcode_t *err)
2119 {
2120   re_dfa_t *dfa = preg->buffer;
2121   bin_tree_t *tree, *eor, *root;
2122   re_token_t current_token;
2123   dfa->syntax = syntax;
2124   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2125   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2126   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2127     return NULL;
2128   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2129   if (tree != NULL)
2130     root = create_tree (dfa, tree, eor, CONCAT);
2131   else
2132     root = eor;
2133   if (__glibc_unlikely (eor == NULL || root == NULL))
2134     {
2135       *err = REG_ESPACE;
2136       return NULL;
2137     }
2138   return root;
2139 }
2140 
2141 /* This function build the following tree, from regular expression
2142    <branch1>|<branch2>:
2143 	   ALT
2144 	   / \
2145 	  /   \
2146    <branch1> <branch2>
2147 
2148    ALT means alternative, which represents the operator '|'.  */
2149 
2150 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2151 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2152 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2153 {
2154   re_dfa_t *dfa = preg->buffer;
2155   bin_tree_t *tree, *branch = NULL;
2156   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2157   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2158   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2159     return NULL;
2160 
2161   while (token->type == OP_ALT)
2162     {
2163       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2164       if (token->type != OP_ALT && token->type != END_OF_RE
2165 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166 	{
2167 	  bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2168 	  dfa->completed_bkref_map = initial_bkref_map;
2169 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2170 	  if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2171 	    {
2172 	      if (tree != NULL)
2173 		postorder (tree, free_tree, NULL);
2174 	      return NULL;
2175 	    }
2176 	  dfa->completed_bkref_map |= accumulated_bkref_map;
2177 	}
2178       else
2179 	branch = NULL;
2180       tree = create_tree (dfa, tree, branch, OP_ALT);
2181       if (__glibc_unlikely (tree == NULL))
2182 	{
2183 	  *err = REG_ESPACE;
2184 	  return NULL;
2185 	}
2186     }
2187   return tree;
2188 }
2189 
2190 /* This function build the following tree, from regular expression
2191    <exp1><exp2>:
2192 	CAT
2193 	/ \
2194        /   \
2195    <exp1> <exp2>
2196 
2197    CAT means concatenation.  */
2198 
2199 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2200 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2201 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2202 {
2203   bin_tree_t *tree, *expr;
2204   re_dfa_t *dfa = preg->buffer;
2205   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2206   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2207     return NULL;
2208 
2209   while (token->type != OP_ALT && token->type != END_OF_RE
2210 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2211     {
2212       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2213       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2214 	{
2215 	  if (tree != NULL)
2216 	    postorder (tree, free_tree, NULL);
2217 	  return NULL;
2218 	}
2219       if (tree != NULL && expr != NULL)
2220 	{
2221 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2222 	  if (newtree == NULL)
2223 	    {
2224 	      postorder (expr, free_tree, NULL);
2225 	      postorder (tree, free_tree, NULL);
2226 	      *err = REG_ESPACE;
2227 	      return NULL;
2228 	    }
2229 	  tree = newtree;
2230 	}
2231       else if (tree == NULL)
2232 	tree = expr;
2233       /* Otherwise expr == NULL, we don't need to create new tree.  */
2234     }
2235   return tree;
2236 }
2237 
2238 /* This function build the following tree, from regular expression a*:
2239 	 *
2240 	 |
2241 	 a
2242 */
2243 
2244 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2245 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2246 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2247 {
2248   re_dfa_t *dfa = preg->buffer;
2249   bin_tree_t *tree;
2250   switch (token->type)
2251     {
2252     case CHARACTER:
2253       tree = create_token_tree (dfa, NULL, NULL, token);
2254       if (__glibc_unlikely (tree == NULL))
2255 	{
2256 	  *err = REG_ESPACE;
2257 	  return NULL;
2258 	}
2259 #ifdef RE_ENABLE_I18N
2260       if (dfa->mb_cur_max > 1)
2261 	{
2262 	  while (!re_string_eoi (regexp)
2263 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2264 	    {
2265 	      bin_tree_t *mbc_remain;
2266 	      fetch_token (token, regexp, syntax);
2267 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2268 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2269 	      if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2270 		{
2271 		  *err = REG_ESPACE;
2272 		  return NULL;
2273 		}
2274 	    }
2275 	}
2276 #endif
2277       break;
2278 
2279     case OP_OPEN_SUBEXP:
2280       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2281       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2282 	return NULL;
2283       break;
2284 
2285     case OP_OPEN_BRACKET:
2286       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2287       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2288 	return NULL;
2289       break;
2290 
2291     case OP_BACK_REF:
2292       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2293 	{
2294 	  *err = REG_ESUBREG;
2295 	  return NULL;
2296 	}
2297       dfa->used_bkref_map |= 1 << token->opr.idx;
2298       tree = create_token_tree (dfa, NULL, NULL, token);
2299       if (__glibc_unlikely (tree == NULL))
2300 	{
2301 	  *err = REG_ESPACE;
2302 	  return NULL;
2303 	}
2304       ++dfa->nbackref;
2305       dfa->has_mb_node = 1;
2306       break;
2307 
2308     case OP_OPEN_DUP_NUM:
2309       if (syntax & RE_CONTEXT_INVALID_DUP)
2310 	{
2311 	  *err = REG_BADRPT;
2312 	  return NULL;
2313 	}
2314       FALLTHROUGH;
2315     case OP_DUP_ASTERISK:
2316     case OP_DUP_PLUS:
2317     case OP_DUP_QUESTION:
2318       if (syntax & RE_CONTEXT_INVALID_OPS)
2319 	{
2320 	  *err = REG_BADRPT;
2321 	  return NULL;
2322 	}
2323       else if (syntax & RE_CONTEXT_INDEP_OPS)
2324 	{
2325 	  fetch_token (token, regexp, syntax);
2326 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2327 	}
2328       FALLTHROUGH;
2329     case OP_CLOSE_SUBEXP:
2330       if ((token->type == OP_CLOSE_SUBEXP)
2331 	  && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2332 	{
2333 	  *err = REG_ERPAREN;
2334 	  return NULL;
2335 	}
2336       FALLTHROUGH;
2337     case OP_CLOSE_DUP_NUM:
2338       /* We treat it as a normal character.  */
2339 
2340       /* Then we can these characters as normal characters.  */
2341       token->type = CHARACTER;
2342       /* mb_partial and word_char bits should be initialized already
2343 	 by peek_token.  */
2344       tree = create_token_tree (dfa, NULL, NULL, token);
2345       if (__glibc_unlikely (tree == NULL))
2346 	{
2347 	  *err = REG_ESPACE;
2348 	  return NULL;
2349 	}
2350       break;
2351 
2352     case ANCHOR:
2353       if ((token->opr.ctx_type
2354 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2355 	  && dfa->word_ops_used == 0)
2356 	init_word_char (dfa);
2357       if (token->opr.ctx_type == WORD_DELIM
2358 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2359 	{
2360 	  bin_tree_t *tree_first, *tree_last;
2361 	  if (token->opr.ctx_type == WORD_DELIM)
2362 	    {
2363 	      token->opr.ctx_type = WORD_FIRST;
2364 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2365 	      token->opr.ctx_type = WORD_LAST;
2366 	    }
2367 	  else
2368 	    {
2369 	      token->opr.ctx_type = INSIDE_WORD;
2370 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2371 	      token->opr.ctx_type = INSIDE_NOTWORD;
2372 	    }
2373 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2374 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2375 	  if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2376 				|| tree == NULL))
2377 	    {
2378 	      *err = REG_ESPACE;
2379 	      return NULL;
2380 	    }
2381 	}
2382       else
2383 	{
2384 	  tree = create_token_tree (dfa, NULL, NULL, token);
2385 	  if (__glibc_unlikely (tree == NULL))
2386 	    {
2387 	      *err = REG_ESPACE;
2388 	      return NULL;
2389 	    }
2390 	}
2391       /* We must return here, since ANCHORs can't be followed
2392 	 by repetition operators.
2393 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2394 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2395       fetch_token (token, regexp, syntax);
2396       return tree;
2397 
2398     case OP_PERIOD:
2399       tree = create_token_tree (dfa, NULL, NULL, token);
2400       if (__glibc_unlikely (tree == NULL))
2401 	{
2402 	  *err = REG_ESPACE;
2403 	  return NULL;
2404 	}
2405       if (dfa->mb_cur_max > 1)
2406 	dfa->has_mb_node = 1;
2407       break;
2408 
2409     case OP_WORD:
2410     case OP_NOTWORD:
2411       tree = build_charclass_op (dfa, regexp->trans,
2412 				 "alnum",
2413 				 "_",
2414 				 token->type == OP_NOTWORD, err);
2415       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2416 	return NULL;
2417       break;
2418 
2419     case OP_SPACE:
2420     case OP_NOTSPACE:
2421       tree = build_charclass_op (dfa, regexp->trans,
2422 				 "space",
2423 				 "",
2424 				 token->type == OP_NOTSPACE, err);
2425       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2426 	return NULL;
2427       break;
2428 
2429     case OP_ALT:
2430     case END_OF_RE:
2431       return NULL;
2432 
2433     case BACK_SLASH:
2434       *err = REG_EESCAPE;
2435       return NULL;
2436 
2437     default:
2438       /* Must not happen?  */
2439       DEBUG_ASSERT (false);
2440       return NULL;
2441     }
2442   fetch_token (token, regexp, syntax);
2443 
2444   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2445 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2446     {
2447       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2448 					   syntax, err);
2449       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2450 	{
2451 	  if (tree != NULL)
2452 	    postorder (tree, free_tree, NULL);
2453 	  return NULL;
2454 	}
2455       tree = dup_tree;
2456       /* In BRE consecutive duplications are not allowed.  */
2457       if ((syntax & RE_CONTEXT_INVALID_DUP)
2458 	  && (token->type == OP_DUP_ASTERISK
2459 	      || token->type == OP_OPEN_DUP_NUM))
2460 	{
2461 	  if (tree != NULL)
2462 	    postorder (tree, free_tree, NULL);
2463 	  *err = REG_BADRPT;
2464 	  return NULL;
2465 	}
2466     }
2467 
2468   return tree;
2469 }
2470 
2471 /* This function build the following tree, from regular expression
2472    (<reg_exp>):
2473 	 SUBEXP
2474 	    |
2475 	<reg_exp>
2476 */
2477 
2478 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2479 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2480 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2481 {
2482   re_dfa_t *dfa = preg->buffer;
2483   bin_tree_t *tree;
2484   size_t cur_nsub;
2485   cur_nsub = preg->re_nsub++;
2486 
2487   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2488 
2489   /* The subexpression may be a null string.  */
2490   if (token->type == OP_CLOSE_SUBEXP)
2491     tree = NULL;
2492   else
2493     {
2494       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2495       if (__glibc_unlikely (*err == REG_NOERROR
2496 			    && token->type != OP_CLOSE_SUBEXP))
2497 	{
2498 	  if (tree != NULL)
2499 	    postorder (tree, free_tree, NULL);
2500 	  *err = REG_EPAREN;
2501 	}
2502       if (__glibc_unlikely (*err != REG_NOERROR))
2503 	return NULL;
2504     }
2505 
2506   if (cur_nsub <= '9' - '1')
2507     dfa->completed_bkref_map |= 1 << cur_nsub;
2508 
2509   tree = create_tree (dfa, tree, NULL, SUBEXP);
2510   if (__glibc_unlikely (tree == NULL))
2511     {
2512       *err = REG_ESPACE;
2513       return NULL;
2514     }
2515   tree->token.opr.idx = cur_nsub;
2516   return tree;
2517 }
2518 
2519 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2520 
2521 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2522 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2523 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2524 {
2525   bin_tree_t *tree = NULL, *old_tree = NULL;
2526   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2527   re_token_t start_token = *token;
2528 
2529   if (token->type == OP_OPEN_DUP_NUM)
2530     {
2531       end = 0;
2532       start = fetch_number (regexp, token, syntax);
2533       if (start == -1)
2534 	{
2535 	  if (token->type == CHARACTER && token->opr.c == ',')
2536 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2537 	  else
2538 	    {
2539 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2540 	      return NULL;
2541 	    }
2542 	}
2543       if (__glibc_likely (start != -2))
2544 	{
2545 	  /* We treat "{n}" as "{n,n}".  */
2546 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2547 		 : ((token->type == CHARACTER && token->opr.c == ',')
2548 		    ? fetch_number (regexp, token, syntax) : -2));
2549 	}
2550       if (__glibc_unlikely (start == -2 || end == -2))
2551 	{
2552 	  /* Invalid sequence.  */
2553 	  if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2554 	    {
2555 	      if (token->type == END_OF_RE)
2556 		*err = REG_EBRACE;
2557 	      else
2558 		*err = REG_BADBR;
2559 
2560 	      return NULL;
2561 	    }
2562 
2563 	  /* If the syntax bit is set, rollback.  */
2564 	  re_string_set_index (regexp, start_idx);
2565 	  *token = start_token;
2566 	  token->type = CHARACTER;
2567 	  /* mb_partial and word_char bits should be already initialized by
2568 	     peek_token.  */
2569 	  return elem;
2570 	}
2571 
2572       if (__glibc_unlikely ((end != -1 && start > end)
2573 			    || token->type != OP_CLOSE_DUP_NUM))
2574 	{
2575 	  /* First number greater than second.  */
2576 	  *err = REG_BADBR;
2577 	  return NULL;
2578 	}
2579 
2580       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2581 	{
2582 	  *err = REG_ESIZE;
2583 	  return NULL;
2584 	}
2585     }
2586   else
2587     {
2588       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2589       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2590     }
2591 
2592   fetch_token (token, regexp, syntax);
2593 
2594   if (__glibc_unlikely (elem == NULL))
2595     return NULL;
2596   if (__glibc_unlikely (start == 0 && end == 0))
2597     {
2598       postorder (elem, free_tree, NULL);
2599       return NULL;
2600     }
2601 
2602   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2603   if (__glibc_unlikely (start > 0))
2604     {
2605       tree = elem;
2606       for (i = 2; i <= start; ++i)
2607 	{
2608 	  elem = duplicate_tree (elem, dfa);
2609 	  tree = create_tree (dfa, tree, elem, CONCAT);
2610 	  if (__glibc_unlikely (elem == NULL || tree == NULL))
2611 	    goto parse_dup_op_espace;
2612 	}
2613 
2614       if (start == end)
2615 	return tree;
2616 
2617       /* Duplicate ELEM before it is marked optional.  */
2618       elem = duplicate_tree (elem, dfa);
2619       if (__glibc_unlikely (elem == NULL))
2620         goto parse_dup_op_espace;
2621       old_tree = tree;
2622     }
2623   else
2624     old_tree = NULL;
2625 
2626   if (elem->token.type == SUBEXP)
2627     {
2628       uintptr_t subidx = elem->token.opr.idx;
2629       postorder (elem, mark_opt_subexp, (void *) subidx);
2630     }
2631 
2632   tree = create_tree (dfa, elem, NULL,
2633 		      (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2634   if (__glibc_unlikely (tree == NULL))
2635     goto parse_dup_op_espace;
2636 
2637   /* This loop is actually executed only when end != -1,
2638      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2639      already created the start+1-th copy.  */
2640   if (TYPE_SIGNED (Idx) || end != -1)
2641     for (i = start + 2; i <= end; ++i)
2642       {
2643 	elem = duplicate_tree (elem, dfa);
2644 	tree = create_tree (dfa, tree, elem, CONCAT);
2645 	if (__glibc_unlikely (elem == NULL || tree == NULL))
2646 	  goto parse_dup_op_espace;
2647 
2648 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2649 	if (__glibc_unlikely (tree == NULL))
2650 	  goto parse_dup_op_espace;
2651       }
2652 
2653   if (old_tree)
2654     tree = create_tree (dfa, old_tree, tree, CONCAT);
2655 
2656   return tree;
2657 
2658  parse_dup_op_espace:
2659   *err = REG_ESPACE;
2660   return NULL;
2661 }
2662 
2663 /* Size of the names for collating symbol/equivalence_class/character_class.
2664    I'm not sure, but maybe enough.  */
2665 #define BRACKET_NAME_BUF_SIZE 32
2666 
2667 #ifndef _LIBC
2668 
2669 # ifdef RE_ENABLE_I18N
2670 /* Convert the byte B to the corresponding wide character.  In a
2671    unibyte locale, treat B as itself.  In a multibyte locale, return
2672    WEOF if B is an encoding error.  */
2673 static wint_t
parse_byte(unsigned char b,re_charset_t * mbcset)2674 parse_byte (unsigned char b, re_charset_t *mbcset)
2675 {
2676   return mbcset == NULL ? b : __btowc (b);
2677 }
2678 # endif
2679 
2680   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2681      Build the range expression which starts from START_ELEM, and ends
2682      at END_ELEM.  The result are written to MBCSET and SBCSET.
2683      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2684      mbcset->range_ends, is a pointer argument since we may
2685      update it.  */
2686 
2687 static reg_errcode_t
2688 # ifdef RE_ENABLE_I18N
build_range_exp(const reg_syntax_t syntax,bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,const bracket_elem_t * start_elem,const bracket_elem_t * end_elem)2689 build_range_exp (const reg_syntax_t syntax,
2690                  bitset_t sbcset,
2691                  re_charset_t *mbcset,
2692                  Idx *range_alloc,
2693                  const bracket_elem_t *start_elem,
2694                  const bracket_elem_t *end_elem)
2695 # else /* not RE_ENABLE_I18N */
2696 build_range_exp (const reg_syntax_t syntax,
2697                  bitset_t sbcset,
2698                  const bracket_elem_t *start_elem,
2699                  const bracket_elem_t *end_elem)
2700 # endif /* not RE_ENABLE_I18N */
2701 {
2702   unsigned int start_ch, end_ch;
2703   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2704   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2705 			|| start_elem->type == CHAR_CLASS
2706 			|| end_elem->type == EQUIV_CLASS
2707 			|| end_elem->type == CHAR_CLASS))
2708     return REG_ERANGE;
2709 
2710   /* We can handle no multi character collating elements without libc
2711      support.  */
2712   if (__glibc_unlikely ((start_elem->type == COLL_SYM
2713 			 && strlen ((char *) start_elem->opr.name) > 1)
2714 			|| (end_elem->type == COLL_SYM
2715 			    && strlen ((char *) end_elem->opr.name) > 1)))
2716     return REG_ECOLLATE;
2717 
2718 # ifdef RE_ENABLE_I18N
2719   {
2720     wchar_t wc;
2721     wint_t start_wc;
2722     wint_t end_wc;
2723 
2724     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2725 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2726 		   : 0));
2727     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2728 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2729 		 : 0));
2730     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2731 		? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2732     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2733 	      ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2734     if (start_wc == WEOF || end_wc == WEOF)
2735       return REG_ECOLLATE;
2736     else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2737 			       && start_wc > end_wc))
2738       return REG_ERANGE;
2739 
2740     /* Got valid collation sequence values, add them as a new entry.
2741        However, for !_LIBC we have no collation elements: if the
2742        character set is single byte, the single byte character set
2743        that we build below suffices.  parse_bracket_exp passes
2744        no MBCSET if dfa->mb_cur_max == 1.  */
2745     if (mbcset)
2746       {
2747 	/* Check the space of the arrays.  */
2748 	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2749 	  {
2750 	    /* There is not enough space, need realloc.  */
2751 	    wchar_t *new_array_start, *new_array_end;
2752 	    Idx new_nranges;
2753 
2754 	    /* +1 in case of mbcset->nranges is 0.  */
2755 	    new_nranges = 2 * mbcset->nranges + 1;
2756 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2757 	       are NULL if *range_alloc == 0.  */
2758 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2759 					  new_nranges);
2760 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2761 					new_nranges);
2762 
2763 	    if (__glibc_unlikely (new_array_start == NULL
2764 				  || new_array_end == NULL))
2765 	      {
2766 		re_free (new_array_start);
2767 		re_free (new_array_end);
2768 		return REG_ESPACE;
2769 	      }
2770 
2771 	    mbcset->range_starts = new_array_start;
2772 	    mbcset->range_ends = new_array_end;
2773 	    *range_alloc = new_nranges;
2774 	  }
2775 
2776 	mbcset->range_starts[mbcset->nranges] = start_wc;
2777 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2778       }
2779 
2780     /* Build the table for single byte characters.  */
2781     for (wc = 0; wc < SBC_MAX; ++wc)
2782       {
2783 	if (start_wc <= wc && wc <= end_wc)
2784 	  bitset_set (sbcset, wc);
2785       }
2786   }
2787 # else /* not RE_ENABLE_I18N */
2788   {
2789     unsigned int ch;
2790     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2791 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2792 		   : 0));
2793     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2794 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2795 		 : 0));
2796     if (start_ch > end_ch)
2797       return REG_ERANGE;
2798     /* Build the table for single byte characters.  */
2799     for (ch = 0; ch < SBC_MAX; ++ch)
2800       if (start_ch <= ch  && ch <= end_ch)
2801 	bitset_set (sbcset, ch);
2802   }
2803 # endif /* not RE_ENABLE_I18N */
2804   return REG_NOERROR;
2805 }
2806 #endif /* not _LIBC */
2807 
2808 #ifndef _LIBC
2809 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2810    Build the collating element which is represented by NAME.
2811    The result are written to MBCSET and SBCSET.
2812    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2813    pointer argument since we may update it.  */
2814 
2815 static reg_errcode_t
2816 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2817 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2818 			Idx *coll_sym_alloc, const unsigned char *name)
2819 # else /* not RE_ENABLE_I18N */
2820 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2821 # endif /* not RE_ENABLE_I18N */
2822 {
2823   size_t name_len = strlen ((const char *) name);
2824   if (__glibc_unlikely (name_len != 1))
2825     return REG_ECOLLATE;
2826   else
2827     {
2828       bitset_set (sbcset, name[0]);
2829       return REG_NOERROR;
2830     }
2831 }
2832 #endif /* not _LIBC */
2833 
2834 #ifdef _LIBC
2835 /* Local function for parse_bracket_exp used in _LIBC environment.
2836    Seek the collating symbol entry corresponding to NAME.
2837    Return the index of the symbol in the SYMB_TABLE,
2838    or -1 if not found.  */
2839 
2840 static inline int32_t
2841 __attribute__ ((always_inline))
seek_collating_symbol_entry(const unsigned char * name,size_t name_len,const int32_t * symb_table,int32_t table_size,const unsigned char * extra)2842 seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2843 			     const int32_t *symb_table, int32_t table_size,
2844 			     const unsigned char *extra)
2845 {
2846   int32_t elem;
2847 
2848   for (elem = 0; elem < table_size; elem++)
2849     if (symb_table[2 * elem] != 0)
2850       {
2851 	int32_t idx = symb_table[2 * elem + 1];
2852 	/* Skip the name of collating element name.  */
2853 	idx += 1 + extra[idx];
2854 	if (/* Compare the length of the name.  */
2855 	    name_len == extra[idx]
2856 	    /* Compare the name.  */
2857 	    && memcmp (name, &extra[idx + 1], name_len) == 0)
2858 	  /* Yep, this is the entry.  */
2859 	  return elem;
2860       }
2861   return -1;
2862 }
2863 
2864 /* Local function for parse_bracket_exp used in _LIBC environment.
2865    Look up the collation sequence value of BR_ELEM.
2866    Return the value if succeeded, UINT_MAX otherwise.  */
2867 
2868 static inline unsigned int
2869 __attribute__ ((always_inline))
lookup_collation_sequence_value(bracket_elem_t * br_elem,uint32_t nrules,const unsigned char * collseqmb,const char * collseqwc,int32_t table_size,const int32_t * symb_table,const unsigned char * extra)2870 lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2871 				 const unsigned char *collseqmb,
2872 				 const char *collseqwc, int32_t table_size,
2873 				 const int32_t *symb_table,
2874 				 const unsigned char *extra)
2875 {
2876   if (br_elem->type == SB_CHAR)
2877     {
2878       /* if (MB_CUR_MAX == 1) */
2879       if (nrules == 0)
2880 	return collseqmb[br_elem->opr.ch];
2881       else
2882 	{
2883 	  wint_t wc = __btowc (br_elem->opr.ch);
2884 	  return __collseq_table_lookup (collseqwc, wc);
2885 	}
2886     }
2887   else if (br_elem->type == MB_CHAR)
2888     {
2889       if (nrules != 0)
2890 	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2891     }
2892   else if (br_elem->type == COLL_SYM)
2893     {
2894       size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2895       if (nrules != 0)
2896 	{
2897 	  int32_t elem, idx;
2898 	  elem = seek_collating_symbol_entry (br_elem->opr.name,
2899 					      sym_name_len,
2900 					      symb_table, table_size,
2901 					      extra);
2902 	  if (elem != -1)
2903 	    {
2904 	      /* We found the entry.  */
2905 	      idx = symb_table[2 * elem + 1];
2906 	      /* Skip the name of collating element name.  */
2907 	      idx += 1 + extra[idx];
2908 	      /* Skip the byte sequence of the collating element.  */
2909 	      idx += 1 + extra[idx];
2910 	      /* Adjust for the alignment.  */
2911 	      idx = (idx + 3) & ~3;
2912 	      /* Skip the multibyte collation sequence value.  */
2913 	      idx += sizeof (unsigned int);
2914 	      /* Skip the wide char sequence of the collating element.  */
2915 	      idx += sizeof (unsigned int) *
2916 		(1 + *(unsigned int *) (extra + idx));
2917 	      /* Return the collation sequence value.  */
2918 	      return *(unsigned int *) (extra + idx);
2919 	    }
2920 	  else if (sym_name_len == 1)
2921 	    {
2922 	      /* No valid character.  Match it as a single byte
2923 		 character.  */
2924 	      return collseqmb[br_elem->opr.name[0]];
2925 	    }
2926 	}
2927       else if (sym_name_len == 1)
2928 	return collseqmb[br_elem->opr.name[0]];
2929     }
2930   return UINT_MAX;
2931 }
2932 
2933 /* Local function for parse_bracket_exp used in _LIBC environment.
2934    Build the range expression which starts from START_ELEM, and ends
2935    at END_ELEM.  The result are written to MBCSET and SBCSET.
2936    RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2937    mbcset->range_ends, is a pointer argument since we may
2938    update it.  */
2939 
2940 static inline reg_errcode_t
2941 __attribute__ ((always_inline))
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,int * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem,re_dfa_t * dfa,reg_syntax_t syntax,uint32_t nrules,const unsigned char * collseqmb,const char * collseqwc,int32_t table_size,const int32_t * symb_table,const unsigned char * extra)2942 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2943 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2944 		 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2945 		 const unsigned char *collseqmb, const char *collseqwc,
2946 		 int32_t table_size, const int32_t *symb_table,
2947 		 const unsigned char *extra)
2948 {
2949   unsigned int ch;
2950   uint32_t start_collseq;
2951   uint32_t end_collseq;
2952 
2953   /* Equivalence Classes and Character Classes can't be a range
2954      start/end.  */
2955   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2956                         || start_elem->type == CHAR_CLASS
2957                         || end_elem->type == EQUIV_CLASS
2958                         || end_elem->type == CHAR_CLASS))
2959     return REG_ERANGE;
2960 
2961   /* FIXME: Implement rational ranges here, too.  */
2962   start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2963 						   table_size, symb_table, extra);
2964   end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2965 						 table_size, symb_table, extra);
2966   /* Check start/end collation sequence values.  */
2967   if (__glibc_unlikely (start_collseq == UINT_MAX
2968                         || end_collseq == UINT_MAX))
2969     return REG_ECOLLATE;
2970   if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2971                         && start_collseq > end_collseq))
2972     return REG_ERANGE;
2973 
2974   /* Got valid collation sequence values, add them as a new entry.
2975      However, if we have no collation elements, and the character set
2976      is single byte, the single byte character set that we
2977      build below suffices. */
2978   if (nrules > 0 || dfa->mb_cur_max > 1)
2979     {
2980       /* Check the space of the arrays.  */
2981       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2982 	{
2983 	  /* There is not enough space, need realloc.  */
2984 	  uint32_t *new_array_start;
2985 	  uint32_t *new_array_end;
2986 	  int new_nranges;
2987 
2988 	  /* +1 in case of mbcset->nranges is 0.  */
2989 	  new_nranges = 2 * mbcset->nranges + 1;
2990 	  new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2991 					new_nranges);
2992 	  new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2993 				      new_nranges);
2994 
2995           if (__glibc_unlikely (new_array_start == NULL
2996                                 || new_array_end == NULL))
2997 	    return REG_ESPACE;
2998 
2999 	  mbcset->range_starts = new_array_start;
3000 	  mbcset->range_ends = new_array_end;
3001 	  *range_alloc = new_nranges;
3002 	}
3003 
3004       mbcset->range_starts[mbcset->nranges] = start_collseq;
3005       mbcset->range_ends[mbcset->nranges++] = end_collseq;
3006     }
3007 
3008   /* Build the table for single byte characters.  */
3009   for (ch = 0; ch < SBC_MAX; ch++)
3010     {
3011       uint32_t ch_collseq;
3012       /* if (MB_CUR_MAX == 1) */
3013       if (nrules == 0)
3014 	ch_collseq = collseqmb[ch];
3015       else
3016 	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3017       if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3018 	bitset_set (sbcset, ch);
3019     }
3020   return REG_NOERROR;
3021 }
3022 
3023 /* Local function for parse_bracket_exp used in _LIBC environment.
3024    Build the collating element which is represented by NAME.
3025    The result are written to MBCSET and SBCSET.
3026    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3027    pointer argument since we may update it.  */
3028 
3029 static inline reg_errcode_t
3030 __attribute__ ((always_inline))
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,int * coll_sym_alloc,const unsigned char * name,uint32_t nrules,int32_t table_size,const int32_t * symb_table,const unsigned char * extra)3031 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3032 			int *coll_sym_alloc, const unsigned char *name,
3033 			uint32_t nrules, int32_t table_size,
3034 			const int32_t *symb_table, const unsigned char *extra)
3035 {
3036   int32_t elem, idx;
3037   size_t name_len = strlen ((const char *) name);
3038   if (nrules != 0)
3039     {
3040       elem = seek_collating_symbol_entry (name, name_len, symb_table,
3041 					  table_size, extra);
3042       if (elem != -1)
3043 	{
3044 	  /* We found the entry.  */
3045 	  idx = symb_table[2 * elem + 1];
3046 	  /* Skip the name of collating element name.  */
3047 	  idx += 1 + extra[idx];
3048 	}
3049       else if (name_len == 1)
3050 	{
3051 	  /* No valid character, treat it as a normal
3052 	     character.  */
3053 	  bitset_set (sbcset, name[0]);
3054 	  return REG_NOERROR;
3055 	}
3056       else
3057 	return REG_ECOLLATE;
3058 
3059       /* Got valid collation sequence, add it as a new entry.  */
3060       /* Check the space of the arrays.  */
3061       if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3062 	{
3063 	  /* Not enough, realloc it.  */
3064 	  /* +1 in case of mbcset->ncoll_syms is 0.  */
3065 	  int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3066 	  /* Use realloc since mbcset->coll_syms is NULL
3067 	     if *alloc == 0.  */
3068 	  int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3069 					       new_coll_sym_alloc);
3070           if (__glibc_unlikely (new_coll_syms == NULL))
3071 	    return REG_ESPACE;
3072 	  mbcset->coll_syms = new_coll_syms;
3073 	  *coll_sym_alloc = new_coll_sym_alloc;
3074 	}
3075       mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3076       return REG_NOERROR;
3077     }
3078   else
3079     {
3080       if (__glibc_unlikely (name_len != 1))
3081 	return REG_ECOLLATE;
3082       else
3083 	{
3084 	  bitset_set (sbcset, name[0]);
3085 	  return REG_NOERROR;
3086 	}
3087     }
3088 }
3089 #endif /* _LIBC */
3090 
3091 /* This function parse bracket expression like "[abc]", "[a-c]",
3092    "[[.a-a.]]" etc.  */
3093 
3094 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)3095 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3096 		   reg_syntax_t syntax, reg_errcode_t *err)
3097 {
3098 #ifdef _LIBC
3099   const unsigned char *collseqmb;
3100   const char *collseqwc = NULL;
3101   uint32_t nrules;
3102   int32_t table_size = 0;
3103   const int32_t *symb_table = NULL;
3104   const unsigned char *extra = NULL;
3105 #endif
3106 
3107   re_token_t br_token;
3108   re_bitset_ptr_t sbcset;
3109 #ifdef RE_ENABLE_I18N
3110   re_charset_t *mbcset;
3111   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3112   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3113 #endif /* not RE_ENABLE_I18N */
3114   bool non_match = false;
3115   bin_tree_t *work_tree;
3116   int token_len;
3117   bool first_round = true;
3118 #ifdef _LIBC
3119   collseqmb = (const unsigned char *)
3120     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3121   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3122   if (nrules)
3123     {
3124       /*
3125       if (MB_CUR_MAX > 1)
3126       */
3127       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3128       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3129       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3130 						  _NL_COLLATE_SYMB_TABLEMB);
3131       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3132 						   _NL_COLLATE_SYMB_EXTRAMB);
3133     }
3134 #endif
3135   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3136 #ifdef RE_ENABLE_I18N
3137   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3138 #endif /* RE_ENABLE_I18N */
3139 #ifdef RE_ENABLE_I18N
3140   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3141 #else
3142   if (__glibc_unlikely (sbcset == NULL))
3143 #endif /* RE_ENABLE_I18N */
3144     {
3145       re_free (sbcset);
3146 #ifdef RE_ENABLE_I18N
3147       re_free (mbcset);
3148 #endif
3149       *err = REG_ESPACE;
3150       return NULL;
3151     }
3152 
3153   token_len = peek_token_bracket (token, regexp, syntax);
3154   if (__glibc_unlikely (token->type == END_OF_RE))
3155     {
3156       *err = REG_BADPAT;
3157       goto parse_bracket_exp_free_return;
3158     }
3159   if (token->type == OP_NON_MATCH_LIST)
3160     {
3161 #ifdef RE_ENABLE_I18N
3162       mbcset->non_match = 1;
3163 #endif /* not RE_ENABLE_I18N */
3164       non_match = true;
3165       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3166 	bitset_set (sbcset, '\n');
3167       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3168       token_len = peek_token_bracket (token, regexp, syntax);
3169       if (__glibc_unlikely (token->type == END_OF_RE))
3170 	{
3171 	  *err = REG_BADPAT;
3172 	  goto parse_bracket_exp_free_return;
3173 	}
3174     }
3175 
3176   /* We treat the first ']' as a normal character.  */
3177   if (token->type == OP_CLOSE_BRACKET)
3178     token->type = CHARACTER;
3179 
3180   while (1)
3181     {
3182       bracket_elem_t start_elem, end_elem;
3183       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3184       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3185       reg_errcode_t ret;
3186       int token_len2 = 0;
3187       bool is_range_exp = false;
3188       re_token_t token2;
3189 
3190       start_elem.opr.name = start_name_buf;
3191       start_elem.type = COLL_SYM;
3192       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3193 				   syntax, first_round);
3194       if (__glibc_unlikely (ret != REG_NOERROR))
3195 	{
3196 	  *err = ret;
3197 	  goto parse_bracket_exp_free_return;
3198 	}
3199       first_round = false;
3200 
3201       /* Get information about the next token.  We need it in any case.  */
3202       token_len = peek_token_bracket (token, regexp, syntax);
3203 
3204       /* Do not check for ranges if we know they are not allowed.  */
3205       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3206 	{
3207 	  if (__glibc_unlikely (token->type == END_OF_RE))
3208 	    {
3209 	      *err = REG_EBRACK;
3210 	      goto parse_bracket_exp_free_return;
3211 	    }
3212 	  if (token->type == OP_CHARSET_RANGE)
3213 	    {
3214 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3215 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3216 	      if (__glibc_unlikely (token2.type == END_OF_RE))
3217 		{
3218 		  *err = REG_EBRACK;
3219 		  goto parse_bracket_exp_free_return;
3220 		}
3221 	      if (token2.type == OP_CLOSE_BRACKET)
3222 		{
3223 		  /* We treat the last '-' as a normal character.  */
3224 		  re_string_skip_bytes (regexp, -token_len);
3225 		  token->type = CHARACTER;
3226 		}
3227 	      else
3228 		is_range_exp = true;
3229 	    }
3230 	}
3231 
3232       if (is_range_exp == true)
3233 	{
3234 	  end_elem.opr.name = end_name_buf;
3235 	  end_elem.type = COLL_SYM;
3236 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3237 				       dfa, syntax, true);
3238 	  if (__glibc_unlikely (ret != REG_NOERROR))
3239 	    {
3240 	      *err = ret;
3241 	      goto parse_bracket_exp_free_return;
3242 	    }
3243 
3244 	  token_len = peek_token_bracket (token, regexp, syntax);
3245 
3246 #ifdef _LIBC
3247 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3248 				  &start_elem, &end_elem,
3249 				  dfa, syntax, nrules, collseqmb, collseqwc,
3250 				  table_size, symb_table, extra);
3251 #else
3252 # ifdef RE_ENABLE_I18N
3253 	  *err = build_range_exp (syntax, sbcset,
3254 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3255 				  &range_alloc, &start_elem, &end_elem);
3256 # else
3257 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3258 # endif
3259 #endif /* RE_ENABLE_I18N */
3260 	  if (__glibc_unlikely (*err != REG_NOERROR))
3261 	    goto parse_bracket_exp_free_return;
3262 	}
3263       else
3264 	{
3265 	  switch (start_elem.type)
3266 	    {
3267 	    case SB_CHAR:
3268 	      bitset_set (sbcset, start_elem.opr.ch);
3269 	      break;
3270 #ifdef RE_ENABLE_I18N
3271 	    case MB_CHAR:
3272 	      /* Check whether the array has enough space.  */
3273 	      if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3274 		{
3275 		  wchar_t *new_mbchars;
3276 		  /* Not enough, realloc it.  */
3277 		  /* +1 in case of mbcset->nmbchars is 0.  */
3278 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3279 		  /* Use realloc since array is NULL if *alloc == 0.  */
3280 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3281 					    mbchar_alloc);
3282 		  if (__glibc_unlikely (new_mbchars == NULL))
3283 		    goto parse_bracket_exp_espace;
3284 		  mbcset->mbchars = new_mbchars;
3285 		}
3286 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3287 	      break;
3288 #endif /* RE_ENABLE_I18N */
3289 	    case EQUIV_CLASS:
3290 	      *err = build_equiv_class (sbcset,
3291 #ifdef RE_ENABLE_I18N
3292 					mbcset, &equiv_class_alloc,
3293 #endif /* RE_ENABLE_I18N */
3294 					start_elem.opr.name);
3295 	      if (__glibc_unlikely (*err != REG_NOERROR))
3296 		goto parse_bracket_exp_free_return;
3297 	      break;
3298 	    case COLL_SYM:
3299 	      *err = build_collating_symbol (sbcset,
3300 #ifdef RE_ENABLE_I18N
3301 					     mbcset, &coll_sym_alloc,
3302 #endif /* RE_ENABLE_I18N */
3303 					     start_elem.opr.name,
3304 					     nrules, table_size, symb_table, extra);
3305 	      if (__glibc_unlikely (*err != REG_NOERROR))
3306 		goto parse_bracket_exp_free_return;
3307 	      break;
3308 	    case CHAR_CLASS:
3309 	      *err = build_charclass (regexp->trans, sbcset,
3310 #ifdef RE_ENABLE_I18N
3311 				      mbcset, &char_class_alloc,
3312 #endif /* RE_ENABLE_I18N */
3313 				      (const char *) start_elem.opr.name,
3314 				      syntax);
3315 	      if (__glibc_unlikely (*err != REG_NOERROR))
3316 	       goto parse_bracket_exp_free_return;
3317 	      break;
3318 	    default:
3319 	      DEBUG_ASSERT (false);
3320 	      break;
3321 	    }
3322 	}
3323       if (__glibc_unlikely (token->type == END_OF_RE))
3324 	{
3325 	  *err = REG_EBRACK;
3326 	  goto parse_bracket_exp_free_return;
3327 	}
3328       if (token->type == OP_CLOSE_BRACKET)
3329 	break;
3330     }
3331 
3332   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3333 
3334   /* If it is non-matching list.  */
3335   if (non_match)
3336     bitset_not (sbcset);
3337 
3338 #ifdef RE_ENABLE_I18N
3339   /* Ensure only single byte characters are set.  */
3340   if (dfa->mb_cur_max > 1)
3341     bitset_mask (sbcset, dfa->sb_char);
3342 
3343   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3344       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3345 						     || mbcset->non_match)))
3346     {
3347       bin_tree_t *mbc_tree;
3348       int sbc_idx;
3349       /* Build a tree for complex bracket.  */
3350       dfa->has_mb_node = 1;
3351       br_token.type = COMPLEX_BRACKET;
3352       br_token.opr.mbcset = mbcset;
3353       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3354       if (__glibc_unlikely (mbc_tree == NULL))
3355 	goto parse_bracket_exp_espace;
3356       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3357 	if (sbcset[sbc_idx])
3358 	  break;
3359       /* If there are no bits set in sbcset, there is no point
3360 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3361       if (sbc_idx < BITSET_WORDS)
3362 	{
3363 	  /* Build a tree for simple bracket.  */
3364 	  br_token.type = SIMPLE_BRACKET;
3365 	  br_token.opr.sbcset = sbcset;
3366 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3367 	  if (__glibc_unlikely (work_tree == NULL))
3368 	    goto parse_bracket_exp_espace;
3369 
3370 	  /* Then join them by ALT node.  */
3371 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3372 	  if (__glibc_unlikely (work_tree == NULL))
3373 	    goto parse_bracket_exp_espace;
3374 	}
3375       else
3376 	{
3377 	  re_free (sbcset);
3378 	  work_tree = mbc_tree;
3379 	}
3380     }
3381   else
3382 #endif /* not RE_ENABLE_I18N */
3383     {
3384 #ifdef RE_ENABLE_I18N
3385       free_charset (mbcset);
3386 #endif
3387       /* Build a tree for simple bracket.  */
3388       br_token.type = SIMPLE_BRACKET;
3389       br_token.opr.sbcset = sbcset;
3390       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3391       if (__glibc_unlikely (work_tree == NULL))
3392 	goto parse_bracket_exp_espace;
3393     }
3394   return work_tree;
3395 
3396  parse_bracket_exp_espace:
3397   *err = REG_ESPACE;
3398  parse_bracket_exp_free_return:
3399   re_free (sbcset);
3400 #ifdef RE_ENABLE_I18N
3401   free_charset (mbcset);
3402 #endif /* RE_ENABLE_I18N */
3403   return NULL;
3404 }
3405 
3406 /* Parse an element in the bracket expression.  */
3407 
3408 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3409 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3410 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3411 		       reg_syntax_t syntax, bool accept_hyphen)
3412 {
3413 #ifdef RE_ENABLE_I18N
3414   int cur_char_size;
3415   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3416   if (cur_char_size > 1)
3417     {
3418       elem->type = MB_CHAR;
3419       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3420       re_string_skip_bytes (regexp, cur_char_size);
3421       return REG_NOERROR;
3422     }
3423 #endif /* RE_ENABLE_I18N */
3424   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3425   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3426       || token->type == OP_OPEN_EQUIV_CLASS)
3427     return parse_bracket_symbol (elem, regexp, token);
3428   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3429     {
3430       /* A '-' must only appear as anything but a range indicator before
3431 	 the closing bracket.  Everything else is an error.  */
3432       re_token_t token2;
3433       (void) peek_token_bracket (&token2, regexp, syntax);
3434       if (token2.type != OP_CLOSE_BRACKET)
3435 	/* The actual error value is not standardized since this whole
3436 	   case is undefined.  But ERANGE makes good sense.  */
3437 	return REG_ERANGE;
3438     }
3439   elem->type = SB_CHAR;
3440   elem->opr.ch = token->opr.c;
3441   return REG_NOERROR;
3442 }
3443 
3444 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3445    such as [:<character_class>:], [.<collating_element>.], and
3446    [=<equivalent_class>=].  */
3447 
3448 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3449 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3450 		      re_token_t *token)
3451 {
3452   unsigned char ch, delim = token->opr.c;
3453   int i = 0;
3454   if (re_string_eoi(regexp))
3455     return REG_EBRACK;
3456   for (;; ++i)
3457     {
3458       if (i >= BRACKET_NAME_BUF_SIZE)
3459 	return REG_EBRACK;
3460       if (token->type == OP_OPEN_CHAR_CLASS)
3461 	ch = re_string_fetch_byte_case (regexp);
3462       else
3463 	ch = re_string_fetch_byte (regexp);
3464       if (re_string_eoi(regexp))
3465 	return REG_EBRACK;
3466       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3467 	break;
3468       elem->opr.name[i] = ch;
3469     }
3470   re_string_skip_bytes (regexp, 1);
3471   elem->opr.name[i] = '\0';
3472   switch (token->type)
3473     {
3474     case OP_OPEN_COLL_ELEM:
3475       elem->type = COLL_SYM;
3476       break;
3477     case OP_OPEN_EQUIV_CLASS:
3478       elem->type = EQUIV_CLASS;
3479       break;
3480     case OP_OPEN_CHAR_CLASS:
3481       elem->type = CHAR_CLASS;
3482       break;
3483     default:
3484       break;
3485     }
3486   return REG_NOERROR;
3487 }
3488 
3489   /* Helper function for parse_bracket_exp.
3490      Build the equivalence class which is represented by NAME.
3491      The result are written to MBCSET and SBCSET.
3492      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3493      is a pointer argument since we may update it.  */
3494 
3495 static reg_errcode_t
3496 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3497 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3498 		   Idx *equiv_class_alloc, const unsigned char *name)
3499 #else /* not RE_ENABLE_I18N */
3500 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3501 #endif /* not RE_ENABLE_I18N */
3502 {
3503 #ifdef _LIBC
3504   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3505   if (nrules != 0)
3506     {
3507       const int32_t *table, *indirect;
3508       const unsigned char *weights, *extra, *cp;
3509       unsigned char char_buf[2];
3510       int32_t idx1, idx2;
3511       unsigned int ch;
3512       size_t len;
3513       /* Calculate the index for equivalence class.  */
3514       cp = name;
3515       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3516       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3517 					       _NL_COLLATE_WEIGHTMB);
3518       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3519 						   _NL_COLLATE_EXTRAMB);
3520       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3521 						_NL_COLLATE_INDIRECTMB);
3522       idx1 = findidx (table, indirect, extra, &cp, -1);
3523       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3524 	/* This isn't a valid character.  */
3525 	return REG_ECOLLATE;
3526 
3527       /* Build single byte matching table for this equivalence class.  */
3528       len = weights[idx1 & 0xffffff];
3529       for (ch = 0; ch < SBC_MAX; ++ch)
3530 	{
3531 	  char_buf[0] = ch;
3532 	  cp = char_buf;
3533 	  idx2 = findidx (table, indirect, extra, &cp, 1);
3534 /*
3535 	  idx2 = table[ch];
3536 */
3537 	  if (idx2 == 0)
3538 	    /* This isn't a valid character.  */
3539 	    continue;
3540 	  /* Compare only if the length matches and the collation rule
3541 	     index is the same.  */
3542 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3543 	      && memcmp (weights + (idx1 & 0xffffff) + 1,
3544 			 weights + (idx2 & 0xffffff) + 1, len) == 0)
3545 	    bitset_set (sbcset, ch);
3546 	}
3547       /* Check whether the array has enough space.  */
3548       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3549 	{
3550 	  /* Not enough, realloc it.  */
3551 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3552 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3553 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3554 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3555 						   int32_t,
3556 						   new_equiv_class_alloc);
3557 	  if (__glibc_unlikely (new_equiv_classes == NULL))
3558 	    return REG_ESPACE;
3559 	  mbcset->equiv_classes = new_equiv_classes;
3560 	  *equiv_class_alloc = new_equiv_class_alloc;
3561 	}
3562       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3563     }
3564   else
3565 #endif /* _LIBC */
3566     {
3567       if (__glibc_unlikely (strlen ((const char *) name) != 1))
3568 	return REG_ECOLLATE;
3569       bitset_set (sbcset, *name);
3570     }
3571   return REG_NOERROR;
3572 }
3573 
3574   /* Helper function for parse_bracket_exp.
3575      Build the character class which is represented by NAME.
3576      The result are written to MBCSET and SBCSET.
3577      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3578      is a pointer argument since we may update it.  */
3579 
3580 static reg_errcode_t
3581 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const char * class_name,reg_syntax_t syntax)3582 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583 		 re_charset_t *mbcset, Idx *char_class_alloc,
3584 		 const char *class_name, reg_syntax_t syntax)
3585 #else /* not RE_ENABLE_I18N */
3586 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3587 		 const char *class_name, reg_syntax_t syntax)
3588 #endif /* not RE_ENABLE_I18N */
3589 {
3590   int i;
3591   const char *name = class_name;
3592 
3593   /* In case of REG_ICASE "upper" and "lower" match the both of
3594      upper and lower cases.  */
3595   if ((syntax & RE_ICASE)
3596       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3597     name = "alpha";
3598 
3599 #ifdef RE_ENABLE_I18N
3600   /* Check the space of the arrays.  */
3601   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3602     {
3603       /* Not enough, realloc it.  */
3604       /* +1 in case of mbcset->nchar_classes is 0.  */
3605       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3606       /* Use realloc since array is NULL if *alloc == 0.  */
3607       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3608 					       new_char_class_alloc);
3609       if (__glibc_unlikely (new_char_classes == NULL))
3610 	return REG_ESPACE;
3611       mbcset->char_classes = new_char_classes;
3612       *char_class_alloc = new_char_class_alloc;
3613     }
3614   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3615 #endif /* RE_ENABLE_I18N */
3616 
3617 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3618   do {						\
3619     if (__glibc_unlikely (trans != NULL))			\
3620       {						\
3621 	for (i = 0; i < SBC_MAX; ++i)		\
3622 	  if (ctype_func (i))			\
3623 	    bitset_set (sbcset, trans[i]);	\
3624       }						\
3625     else					\
3626       {						\
3627 	for (i = 0; i < SBC_MAX; ++i)		\
3628 	  if (ctype_func (i))			\
3629 	    bitset_set (sbcset, i);		\
3630       }						\
3631   } while (0)
3632 
3633   if (strcmp (name, "alnum") == 0)
3634     BUILD_CHARCLASS_LOOP (isalnum);
3635   else if (strcmp (name, "cntrl") == 0)
3636     BUILD_CHARCLASS_LOOP (iscntrl);
3637   else if (strcmp (name, "lower") == 0)
3638     BUILD_CHARCLASS_LOOP (islower);
3639   else if (strcmp (name, "space") == 0)
3640     BUILD_CHARCLASS_LOOP (isspace);
3641   else if (strcmp (name, "alpha") == 0)
3642     BUILD_CHARCLASS_LOOP (isalpha);
3643   else if (strcmp (name, "digit") == 0)
3644     BUILD_CHARCLASS_LOOP (isdigit);
3645   else if (strcmp (name, "print") == 0)
3646     BUILD_CHARCLASS_LOOP (isprint);
3647   else if (strcmp (name, "upper") == 0)
3648     BUILD_CHARCLASS_LOOP (isupper);
3649   else if (strcmp (name, "blank") == 0)
3650     BUILD_CHARCLASS_LOOP (isblank);
3651   else if (strcmp (name, "graph") == 0)
3652     BUILD_CHARCLASS_LOOP (isgraph);
3653   else if (strcmp (name, "punct") == 0)
3654     BUILD_CHARCLASS_LOOP (ispunct);
3655   else if (strcmp (name, "xdigit") == 0)
3656     BUILD_CHARCLASS_LOOP (isxdigit);
3657   else
3658     return REG_ECTYPE;
3659 
3660   return REG_NOERROR;
3661 }
3662 
3663 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const char * class_name,const char * extra,bool non_match,reg_errcode_t * err)3664 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3665 		    const char *class_name,
3666 		    const char *extra, bool non_match,
3667 		    reg_errcode_t *err)
3668 {
3669   re_bitset_ptr_t sbcset;
3670 #ifdef RE_ENABLE_I18N
3671   re_charset_t *mbcset;
3672   Idx alloc = 0;
3673 #endif /* not RE_ENABLE_I18N */
3674   reg_errcode_t ret;
3675   bin_tree_t *tree;
3676 
3677   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3678   if (__glibc_unlikely (sbcset == NULL))
3679     {
3680       *err = REG_ESPACE;
3681       return NULL;
3682     }
3683 #ifdef RE_ENABLE_I18N
3684   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3685   if (__glibc_unlikely (mbcset == NULL))
3686     {
3687       re_free (sbcset);
3688       *err = REG_ESPACE;
3689       return NULL;
3690     }
3691   mbcset->non_match = non_match;
3692 #endif /* RE_ENABLE_I18N */
3693 
3694   /* We don't care the syntax in this case.  */
3695   ret = build_charclass (trans, sbcset,
3696 #ifdef RE_ENABLE_I18N
3697 			 mbcset, &alloc,
3698 #endif /* RE_ENABLE_I18N */
3699 			 class_name, 0);
3700 
3701   if (__glibc_unlikely (ret != REG_NOERROR))
3702     {
3703       re_free (sbcset);
3704 #ifdef RE_ENABLE_I18N
3705       free_charset (mbcset);
3706 #endif /* RE_ENABLE_I18N */
3707       *err = ret;
3708       return NULL;
3709     }
3710   /* \w match '_' also.  */
3711   for (; *extra; extra++)
3712     bitset_set (sbcset, *extra);
3713 
3714   /* If it is non-matching list.  */
3715   if (non_match)
3716     bitset_not (sbcset);
3717 
3718 #ifdef RE_ENABLE_I18N
3719   /* Ensure only single byte characters are set.  */
3720   if (dfa->mb_cur_max > 1)
3721     bitset_mask (sbcset, dfa->sb_char);
3722 #endif
3723 
3724   /* Build a tree for simple bracket.  */
3725   re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3726   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3727   if (__glibc_unlikely (tree == NULL))
3728     goto build_word_op_espace;
3729 
3730 #ifdef RE_ENABLE_I18N
3731   if (dfa->mb_cur_max > 1)
3732     {
3733       bin_tree_t *mbc_tree;
3734       /* Build a tree for complex bracket.  */
3735       br_token.type = COMPLEX_BRACKET;
3736       br_token.opr.mbcset = mbcset;
3737       dfa->has_mb_node = 1;
3738       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3739       if (__glibc_unlikely (mbc_tree == NULL))
3740 	goto build_word_op_espace;
3741       /* Then join them by ALT node.  */
3742       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3743       if (__glibc_likely (mbc_tree != NULL))
3744 	return tree;
3745     }
3746   else
3747     {
3748       free_charset (mbcset);
3749       return tree;
3750     }
3751 #else /* not RE_ENABLE_I18N */
3752   return tree;
3753 #endif /* not RE_ENABLE_I18N */
3754 
3755  build_word_op_espace:
3756   re_free (sbcset);
3757 #ifdef RE_ENABLE_I18N
3758   free_charset (mbcset);
3759 #endif /* RE_ENABLE_I18N */
3760   *err = REG_ESPACE;
3761   return NULL;
3762 }
3763 
3764 /* This is intended for the expressions like "a{1,3}".
3765    Fetch a number from 'input', and return the number.
3766    Return -1 if the number field is empty like "{,1}".
3767    Return RE_DUP_MAX + 1 if the number field is too large.
3768    Return -2 if an error occurred.  */
3769 
3770 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3771 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3772 {
3773   Idx num = -1;
3774   unsigned char c;
3775   while (1)
3776     {
3777       fetch_token (token, input, syntax);
3778       c = token->opr.c;
3779       if (__glibc_unlikely (token->type == END_OF_RE))
3780 	return -2;
3781       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3782 	break;
3783       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3784 	     ? -2
3785 	     : num == -1
3786 	     ? c - '0'
3787 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3788     }
3789   return num;
3790 }
3791 
3792 #ifdef RE_ENABLE_I18N
3793 static void
free_charset(re_charset_t * cset)3794 free_charset (re_charset_t *cset)
3795 {
3796   re_free (cset->mbchars);
3797 # ifdef _LIBC
3798   re_free (cset->coll_syms);
3799   re_free (cset->equiv_classes);
3800 # endif
3801   re_free (cset->range_starts);
3802   re_free (cset->range_ends);
3803   re_free (cset->char_classes);
3804   re_free (cset);
3805 }
3806 #endif /* RE_ENABLE_I18N */
3807 
3808 /* Functions for binary tree operation.  */
3809 
3810 /* Create a tree node.  */
3811 
3812 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3813 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3814 	     re_token_type_t type)
3815 {
3816   re_token_t t = { .type = type };
3817   return create_token_tree (dfa, left, right, &t);
3818 }
3819 
3820 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3821 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3822 		   const re_token_t *token)
3823 {
3824   bin_tree_t *tree;
3825   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3826     {
3827       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3828 
3829       if (storage == NULL)
3830 	return NULL;
3831       storage->next = dfa->str_tree_storage;
3832       dfa->str_tree_storage = storage;
3833       dfa->str_tree_storage_idx = 0;
3834     }
3835   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3836 
3837   tree->parent = NULL;
3838   tree->left = left;
3839   tree->right = right;
3840   tree->token = *token;
3841   tree->token.duplicated = 0;
3842   tree->token.opt_subexp = 0;
3843   tree->first = NULL;
3844   tree->next = NULL;
3845   tree->node_idx = -1;
3846 
3847   if (left != NULL)
3848     left->parent = tree;
3849   if (right != NULL)
3850     right->parent = tree;
3851   return tree;
3852 }
3853 
3854 /* Mark the tree SRC as an optional subexpression.
3855    To be called from preorder or postorder.  */
3856 
3857 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3858 mark_opt_subexp (void *extra, bin_tree_t *node)
3859 {
3860   Idx idx = (uintptr_t) extra;
3861   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3862     node->token.opt_subexp = 1;
3863 
3864   return REG_NOERROR;
3865 }
3866 
3867 /* Free the allocated memory inside NODE. */
3868 
3869 static void
free_token(re_token_t * node)3870 free_token (re_token_t *node)
3871 {
3872 #ifdef RE_ENABLE_I18N
3873   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3874     free_charset (node->opr.mbcset);
3875   else
3876 #endif /* RE_ENABLE_I18N */
3877     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3878       re_free (node->opr.sbcset);
3879 }
3880 
3881 /* Worker function for tree walking.  Free the allocated memory inside NODE
3882    and its children. */
3883 
3884 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3885 free_tree (void *extra, bin_tree_t *node)
3886 {
3887   free_token (&node->token);
3888   return REG_NOERROR;
3889 }
3890 
3891 
3892 /* Duplicate the node SRC, and return new node.  This is a preorder
3893    visit similar to the one implemented by the generic visitor, but
3894    we need more infrastructure to maintain two parallel trees --- so,
3895    it's easier to duplicate.  */
3896 
3897 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3898 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3899 {
3900   const bin_tree_t *node;
3901   bin_tree_t *dup_root;
3902   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3903 
3904   for (node = root; ; )
3905     {
3906       /* Create a new tree and link it back to the current parent.  */
3907       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3908       if (*p_new == NULL)
3909 	return NULL;
3910       (*p_new)->parent = dup_node;
3911       (*p_new)->token.duplicated = 1;
3912       dup_node = *p_new;
3913 
3914       /* Go to the left node, or up and to the right.  */
3915       if (node->left)
3916 	{
3917 	  node = node->left;
3918 	  p_new = &dup_node->left;
3919 	}
3920       else
3921 	{
3922 	  const bin_tree_t *prev = NULL;
3923 	  while (node->right == prev || node->right == NULL)
3924 	    {
3925 	      prev = node;
3926 	      node = node->parent;
3927 	      dup_node = dup_node->parent;
3928 	      if (!node)
3929 		return dup_root;
3930 	    }
3931 	  node = node->right;
3932 	  p_new = &dup_node->right;
3933 	}
3934     }
3935 }
3936