1 /* Regular expression tests.
2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #include <sys/types.h>
20 #include <mcheck.h>
21 #include <regex.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <locale.h>
26 #include <getopt.h>
27 
/* Expand the test-file placeholder letters in STR in place:
   'N' becomes a newline, 'T' a tab, 'S' a space and 'Z' a NUL byte.
   Scanning resumes right after each replaced byte, so text following
   an embedded NUL produced by 'Z' is still processed.  */
static void
replace_special_chars (char *str)
{
  char *hit = strpbrk (str, "NTSZ");

  while (hit != NULL)
    {
      if (*hit == 'N')
        *hit = '\n';
      else if (*hit == 'T')
        *hit = '\t';
      else if (*hit == 'S')
        *hit = ' ';
      else
        /* Only 'Z' is left in the accept set.  */
        *hit = '\0';

      hit = strpbrk (hit + 1, "NTSZ");
    }
}
40 
/* Rewrite STR in place so that the word-boundary spellings [[:<:]]
   and [[:>:]] become \< and \> respectively.  Any other text that
   starts with "[[:" is left untouched.  */
static void
glibc_re_syntax (char *str)
{
  /* One past the terminating NUL, so the tail moves below carry the
     terminator along with the rest of the string.  */
  char *limit = str + strlen (str) + 1;
  char *cur = str;

  while ((cur = strstr (cur, "[[:")) != NULL)
    {
      int is_word_edge = ((cur[3] == '<' || cur[3] == '>')
                          && strncmp (cur + 4, ":]]", 3) == 0);

      if (!is_word_edge)
        {
          /* Some other bracket expression; step over the "[[:".  */
          cur += 3;
          continue;
        }

      /* Seven bytes shrink to two: write the escape, then pull the
         remainder of the string forward by five.  */
      cur[0] = '\\';
      cur[1] = cur[3];
      memmove (cur + 2, cur + 7, (size_t) (limit - (cur + 7)));
      limit -= 5;
      cur += 2;
    }
}
59 
/* Store at DST the two-byte sequence that stands in for the ASCII
   letter C and return the position just past it.  The mapping is:
     a/A -> \'a / \'A      b/B -> \v{c} / \v{C}
     c/C -> \v{d} / \v{D}  d/D -> \'e / \'E
   For any other C nothing is written and DST is returned as is.  */
static char *
mb_replace (char *dst, const char c)
{
  static const struct
  {
    char ascii;
    char bytes[2];
  } map[] =
    {
      { 'a', { '\xc3', '\xa1' } },
      { 'A', { '\xc3', '\x81' } },
      { 'b', { '\xc4', '\x8d' } },
      { 'B', { '\xc4', '\x8c' } },
      { 'c', { '\xc4', '\x8f' } },
      { 'C', { '\xc4', '\x8e' } },
      { 'd', { '\xc3', '\xa9' } },
      { 'D', { '\xc3', '\x89' } }
    };

  for (size_t i = 0; i < sizeof (map) / sizeof (map[0]); ++i)
    if (map[i].ascii == c)
      {
        dst[0] = map[i].bytes[0];
        dst[1] = map[i].bytes[1];
        return dst + 2;
      }

  return dst;
}
104 
/* Return a newly allocated copy of STR in which every character that
   occurs in LETTERS has been passed through mb_replace.  Returns NULL
   when STR is NULL or when the allocation fails; otherwise the caller
   owns the result and must free it.  */
static char *
mb_frob_string (const char *str, const char *letters)
{
  if (str == NULL)
    return NULL;

  /* Each input byte expands to at most two output bytes.  */
  char *out = malloc (2 * strlen (str) + 1);
  if (out == NULL)
    return NULL;

  char *wr = out;
  while (*str != '\0')
    {
      const char ch = *str++;

      if (strchr (letters, ch) != NULL)
        wr = mb_replace (wr, ch);
      else
        *wr++ = ch;
    }

  *wr = '\0';
  return out;
}
126 
/* Pattern-aware variant of mb_frob_string.  Characters from LETTERS
   are passed through mb_replace except when they sit between
   [: and :], [. and .] or [= and =], or when they directly follow an
   unescaped backslash.  Returns a newly allocated string the caller
   must free, or NULL when STR is NULL or the allocation fails.  */

static char *
mb_frob_pattern (const char *str, const char *letters)
{
  if (str == NULL)
    return NULL;

  /* Each input byte expands to at most two output bytes.  */
  char *ret = malloc (2 * strlen (str) + 1);
  if (ret == NULL)
    return NULL;

  int in_class = 0;
  int escaped = 0;
  char *out = ret;

  for (const char *in = str; *in != '\0'; ++in)
    {
      const char ch = *in;

      if (ch == '\\')
        {
          /* A backslash toggles the state, so "\\\\" cancels out.  */
          escaped = !escaped;
          *out++ = ch;
          continue;
        }

      if (escaped)
        {
          /* The character after a backslash is copied verbatim.  */
          escaped = 0;
          *out++ = ch;
          continue;
        }

      if (!in_class)
        {
          if (strchr (letters, ch) != NULL)
            {
              out = mb_replace (out, ch);
              continue;
            }
          /* "[:", "[." or "[=" opens a protected region.  */
          if (ch == '[' && strchr (":.=", in[1]) != NULL)
            in_class = 1;
        }
      else if (ch == ']' && strchr (":.=", in[-1]) != NULL)
        /* ":]", ".]" or "=]" closes it again.  */
        in_class = 0;

      *out++ = ch;
    }

  *out = '\0';
  return ret;
}
170 
/* Compare the match slot RM[IDX] obtained for STRING against the
   expectation MATCH and return 0 on agreement, 1 otherwise.  On
   disagreement a diagnostic prefixed with FAIL is printed.

   MATCH is interpreted as follows:
     "-"      the slot must be unset (both offsets -1);
     "@text"  the slot must be an empty match located where TEXT
              begins in STRING (a bare "@" means the end of STRING);
     other    the matched substring must equal MATCH exactly.  */
static int
check_match (regmatch_t *rm, int idx, const char *string,
             const char *match, const char *fail)
{
  const regmatch_t *slot = &rm[idx];

  if (strcmp (match, "-") == 0)
    {
      if (slot->rm_so != -1 || slot->rm_eo != -1)
        {
          printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
          return 1;
        }
      return 0;
    }

  if (slot->rm_so == -1 || slot->rm_eo == -1)
    {
      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
      return 1;
    }

  const char *found = string + slot->rm_so;

  if (match[0] == '@')
    {
      if (slot->rm_so != slot->rm_eo)
        {
          printf ("%s rm[%d] not empty\n", fail, idx);
          return 1;
        }

      /* With nothing after the '@', compare one byte so that the
         terminating NUL of STRING is what has to be found.  */
      const char *follow = match + 1;
      size_t cmp_len = strlen (follow);
      if (cmp_len == 0)
        cmp_len = 1;

      if (strncmp (found, follow, cmp_len) != 0)
        {
          printf ("%s rm[%d] not matching %s\n", fail, idx, match);
          return 1;
        }
      return 0;
    }

  size_t span = (size_t) (slot->rm_eo - slot->rm_so);
  if (span != strlen (match) || strncmp (found, match, span) != 0)
    {
      printf ("%s rm[%d] not matching %s\n", fail, idx, match);
      return 1;
    }

  return 0;
}
215 
/* Run a single test case and return 0 on success, 1 on failure.
   Diagnostics are printed with the prefix FAIL.

   PATTERN is compiled with CFLAGS.  When EFLAGS is -1 the case is a
   compile-error test: STRING then holds the expected error name
   without the REG_ prefix, and a successful compile is only accepted
   for the name "EMPTY".  Otherwise STRING is matched with EFLAGS;
   EXPECT is the expectation for the whole match (NULL means no match
   is expected) and MATCHES, if not NULL, is a comma-separated list of
   expectations for subexpressions 1..9 in check_match syntax.
   MATCHES is modified temporarily while it is being split but is
   restored before returning.  */
static int
test (const char *pattern, int cflags, const char *string, int eflags,
      char *expect, char *matches, const char *fail)
{
#define ERRCODE_ENTRY(x) { REG_##x, #x }
  static const struct
  {
    reg_errcode_t code;
    const char *name;
  } codes[] =
    {
      ERRCODE_ENTRY (NOERROR), ERRCODE_ENTRY (NOMATCH),
      ERRCODE_ENTRY (BADPAT), ERRCODE_ENTRY (ECOLLATE),
      ERRCODE_ENTRY (ECTYPE), ERRCODE_ENTRY (EESCAPE),
      ERRCODE_ENTRY (ESUBREG), ERRCODE_ENTRY (EBRACK),
      ERRCODE_ENTRY (EPAREN), ERRCODE_ENTRY (EBRACE),
      ERRCODE_ENTRY (BADBR), ERRCODE_ENTRY (ERANGE),
      ERRCODE_ENTRY (ESPACE), ERRCODE_ENTRY (BADRPT)
    };
#undef ERRCODE_ENTRY

  regex_t re;
  regmatch_t rm[10];
  int status = regcomp (&re, pattern, cflags);

  if (status != 0)
    {
      if (eflags != -1)
        {
          /* Compilation was supposed to work.  */
          char buf[500];

          regerror (status, &re, buf, sizeof (buf));
          printf ("%s regcomp failed: %s\n", fail, buf);
          return 1;
        }

      /* A compile error was expected; verify it is the right one.  */
      for (size_t i = 0; i < sizeof (codes) / sizeof (codes[0]); ++i)
        {
          if (status != (int) codes[i].code)
            continue;

          if (strcmp (string, codes[i].name) == 0)
            return 0;

          printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
                  fail, codes[i].name, string);
          return 1;
        }

      printf ("%s regcomp return value REG_%d\n", fail, status);
      return 1;
    }

  if (eflags == -1)
    {
      regfree (&re);

      /* The test case file relies on regcomp() reporting REG_EMPTY
         for empty expressions, which only the rxspencer
         implementation guarantees.  We do not behave that way, so
         this particular expectation is ignored.  */
      if (strcmp (string, "EMPTY") == 0)
        return 0;

      printf ("%s regcomp unexpectedly succeeded\n", fail);
      return 1;
    }

  int exec_failed = regexec (&re, string, 10, rm, eflags) != 0;
  regfree (&re);

  if (exec_failed)
    {
      if (expect == NULL)
        return 0;
      printf ("%s regexec failed\n", fail);
      return 1;
    }

  if (expect == NULL)
    {
      printf ("%s regexec unexpectedly succeeded\n", fail);
      return 1;
    }

  /* Without subexpression reporting there is nothing to compare.  */
  if (cflags & REG_NOSUB)
    return 0;

  int ret = check_match (rm, 0, string, expect, fail);
  if (matches == NULL)
    return ret;

  for (int n = 1; ret == 0 && n < 10; ++n)
    {
      /* Once the list is exhausted the remaining slots must be
         unset.  */
      const char *want = "-";
      char *comma = NULL;

      if (matches != NULL)
        {
          want = matches;
          comma = strchr (matches, ',');
          if (comma != NULL)
            *comma = '\0';
        }

      ret = check_match (rm, n, string, want, fail);

      if (comma != NULL)
        {
          /* Put the separator back and advance to the next item.  */
          *comma = ',';
          matches = comma + 1;
        }
      else
        matches = NULL;
    }

  return ret;
}
319 
/* Run one test case after substituting the characters in LETTERS by
   their two-byte replacements in PATTERN, STRING, EXPECT and MATCHES.
   STRING is passed through unchanged when EFLAGS is -1.  Returns
   what test returns, or 1 if one of the converted copies could not
   be produced.  */
static int
mb_test (const char *pattern, int cflags, const char *string, int eflags,
         char *expect, const char *matches, const char *letters,
         const char *fail)
{
  char *pattern_mb = mb_frob_pattern (pattern, letters);

  const char *string_mb = string;
  if (eflags != -1)
    string_mb = mb_frob_string (string, letters);

  char *expect_mb = mb_frob_string (expect, letters);
  char *matches_mb = mb_frob_string (matches, letters);

  /* A NULL copy is only an error when its input was not NULL.  */
  int have_all = (pattern_mb != NULL
                  && string_mb != NULL
                  && (expect == NULL || expect_mb != NULL)
                  && (matches == NULL || matches_mb != NULL));
  int ret;

  if (have_all)
    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
                matches_mb, fail);
  else
    {
      printf ("%s %m", fail);
      ret = 1;
    }

  free (matches_mb);
  free (expect_mb);
  /* Only release the subject string if it is our own copy.  */
  if (string_mb != string)
    free ((char *) string_mb);
  free (pattern_mb);
  return ret;
}
349 
/* Run mb_test for every non-empty subset of the letter pairs a/A,
   b/B, c/C and d/D in which each selected pair occurs in PATTERN or
   STRING.  Returns nonzero if any of the runs failed.  */
static int
mb_tests (const char *pattern, int cflags, const char *string, int eflags,
          char *expect, const char *matches)
{
  static const char pairs[4][2] =
    {
      { 'a', 'A' }, { 'b', 'B' }, { 'c', 'C' }, { 'd', 'D' }
    };
  char letters[9], fail[20];
  int ret = 0;

  /* These tests cannot work with xdigit: a-dA-D are hex digits while
     \'a \'A \v{c}\v{C}\v{d}\v{D}\'e \'E are not.  */
  if (strstr (pattern, "[:xdigit:]") != NULL)
    return 0;

  /* XXX: regex ATM handles only single byte equivalence classes.  */
  if (strstr (pattern, "[[=b=]]") != NULL)
    return 0;

  /* Bit N is set when pair N occurs in the pattern or the string.  */
  unsigned int present = 0;
  for (int bit = 0; bit < 4; ++bit)
    {
      const char lo = pairs[bit][0];
      const char up = pairs[bit][1];

      if (strchr (pattern, lo) != NULL || strchr (string, lo) != NULL
          || strchr (pattern, up) != NULL || strchr (string, up) != NULL)
        present |= 1u << bit;
    }

  for (unsigned int mask = 1; mask < 16; ++mask)
    {
      /* Skip subsets that select a pair which is used nowhere.  */
      if ((mask & ~present) != 0)
        continue;

      char *p = letters;
      for (int bit = 0; bit < 4; ++bit)
        if (mask & (1u << bit))
          {
            *p++ = pairs[bit][0];
            *p++ = pairs[bit][1];
          }
      *p = '\0';

      sprintf (fail, "UTF-8 %s FAIL", letters);
      ret |= mb_test (pattern, cflags, string, eflags, expect, matches,
                      letters, fail);
    }

  return ret;
}
405 
/* Translate the flag column FLAGSTR of a test line into regcomp
   flags (*CFLAGS), regexec flags (*EFLAGS, set to -1 for a
   compile-error test) and the request to run the case as both BRE
   and ERE (*TRY_BRE_ERE).  Returns 0 if the line uses a flag this
   driver does not support, 1 otherwise.  */
static int
parse_flags (const char *flagstr, int *cflags, int *eflags,
             int *try_bre_ere)
{
  int supported = 1;

  for (const char *p = flagstr; *p != '\0'; ++p)
    switch (*p)
      {
      case '-':
        break;
      case 'b':
        *cflags &= ~REG_EXTENDED;
        break;
      case '&':
        *try_bre_ere = 1;
        break;
      case 'C':
        *eflags = -1;
        break;
      case 'i':
        *cflags |= REG_ICASE;
        break;
      case 's':
        *cflags |= REG_NOSUB;
        break;
      case 'n':
        *cflags |= REG_NEWLINE;
        break;
      case '^':
        *eflags |= REG_NOTBOL;
        break;
      case '$':
        *eflags |= REG_NOTEOL;
        break;
      case 'm':
      case 'p':
      case '#':
        /* Not supported.  */
        supported = 0;
        break;
      }

  return supported;
}

/* Read the test file named on the command line and run every test
   case in it.  Each non-comment line holds tab-separated fields:
   pattern, flags, string and, optionally, the expected match and the
   expected subexpression matches.  With --utf8 every case that passes
   in the C locale is repeated in the cs_CZ.UTF-8 locale, both as is
   and with multibyte substitutions.  Returns 0 if all cases pass and
   1 otherwise.  */
int
main (int argc, char **argv)
{
  static int test_utf8 = 0;
  static const struct option options[] =
    {
      {"utf8",	no_argument,	&test_utf8,	1},
      {NULL,	0,		NULL,		0 }
    };
  int ret = 0;
  char *line = NULL;
  size_t line_len = 0;
  ssize_t len;
  FILE *f;

  mtrace ();

  /* The only option just sets a flag, so there is nothing to do per
     option.  */
  while (getopt_long (argc, argv, "", options, NULL) >= 0)
    continue;

  if (optind + 1 != argc)
    {
      fprintf (stderr, "Missing test filename\n");
      return 1;
    }

  f = fopen (argv[optind], "r");
  if (f == NULL)
    {
      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
      return 1;
    }

  while ((len = getline (&line, &line_len, f)) > 0)
    {
      int cflags = REG_EXTENDED, eflags = 0, try_bre_ere = 0;
      char *pattern, *flagstr, *string, *expect;
      char *matches = NULL;

      if (line[len - 1] == '\n')
        line[len - 1] = '\0';

      /* Skip comments and empty lines.  */
      if (line[0] == '#' || line[0] == '\0')
        continue;

      puts (line);
      fflush (stdout);

      pattern = strtok (line, "\t");
      if (pattern == NULL)
        continue;
      /* A field consisting of "" denotes the empty string.  */
      if (strcmp (pattern, "\"\"") == 0)
        pattern += 2;

      flagstr = strtok (NULL, "\t");
      if (flagstr == NULL)
        continue;

      string = strtok (NULL, "\t");
      if (string == NULL)
        continue;
      if (strcmp (string, "\"\"") == 0)
        string += 2;

      if (!parse_flags (flagstr, &cflags, &eflags, &try_bre_ere))
        continue;

      replace_special_chars (pattern);
      glibc_re_syntax (pattern);
      /* In a compile-error test the string field is an error name
         and must stay as written.  */
      if (eflags != -1)
        replace_special_chars (string);

      expect = strtok (NULL, "\t");
      if (expect != NULL)
        {
          replace_special_chars (expect);
          matches = strtok (NULL, "\t");
          if (matches != NULL)
            replace_special_chars (matches);
        }

      const int bre_cflags = cflags & ~REG_EXTENDED;

      if (setlocale (LC_ALL, "C") == NULL)
        {
          puts ("setlocale C failed");
          ret = 1;
        }

      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags, expect,
                       matches, "FAIL")))
        {
          ret = 1;
          continue;
        }

      if (!test_utf8)
        continue;

      if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
        {
          puts ("setlocale cs_CZ.UTF-8 failed");
          ret = 1;
          continue;
        }

      if (test (pattern, cflags, string, eflags, expect, matches,
                "UTF-8 FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags, expect,
                       matches, "UTF-8 FAIL")))
        {
          ret = 1;
          continue;
        }

      if (mb_tests (pattern, cflags, string, eflags, expect, matches)
          || (try_bre_ere
              && mb_tests (pattern, bre_cflags, string, eflags, expect,
                           matches)))
        ret = 1;
    }

  free (line);
  fclose (f);
  return ret;
}
561