/* Regular expression tests.
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19 #include <sys/types.h>
20 #include <mcheck.h>
21 #include <regex.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <locale.h>
26 #include <getopt.h>
27
/* Expand the test-file escape letters in STR in place: 'N' becomes a
   newline, 'T' a tab, 'S' a space and 'Z' a NUL byte.

   The search resumes one byte past each replacement, so a NUL written
   for 'Z' does not stop the scan: the bytes that followed the 'Z' (up
   to the original terminator) are still processed.  */
static void
replace_special_chars (char *str)
{
  char *hit = strpbrk (str, "NTSZ");

  while (hit != NULL)
    {
      switch (*hit)
        {
        case 'N':
          *hit = '\n';
          break;
        case 'T':
          *hit = '\t';
          break;
        case 'S':
          *hit = ' ';
          break;
        case 'Z':
          *hit = '\0';
          break;
        }
      hit = strpbrk (hit + 1, "NTSZ");
    }
}
40
/* Rewrite the word-boundary spellings used by the test file into the
   ones glibc understands, editing STR in place: "[[:<:]]" becomes "\<"
   and "[[:>:]]" becomes "\>".  Any other "[[:" sequence is left
   untouched.  Each rewrite shortens the string by five bytes.  */
static void
glibc_re_syntax (char *str)
{
  /* One past the terminating NUL, so that moving the tail also moves
     the terminator.  */
  char *limit = str + strlen (str) + 1;
  char *p = str;

  while ((p = strstr (p, "[[:")) != NULL)
    {
      int is_boundary = ((p[3] == '<' || p[3] == '>')
                         && strncmp (p + 4, ":]]", 3) == 0);

      if (!is_boundary)
        {
          /* Some other bracket expression: step over the "[[:".  */
          p += 3;
          continue;
        }

      p[0] = '\\';
      p[1] = p[3];
      /* Close the gap left by the seven-byte form.  */
      memmove (p + 2, p + 7, (size_t) (limit - (p + 7)));
      limit -= 5;
      p += 2;
    }
}
59
/* Store at DST the two-byte UTF-8 sequence that stands in for the ASCII
   letter C and return the position just past it:

     a -> C3 A1 (a acute)    A -> C3 81 (A acute)
     b -> C4 8D (c caron)    B -> C4 8C (C caron)
     c -> C4 8F (d caron)    C -> C4 8E (D caron)
     d -> C3 A9 (e acute)    D -> C3 89 (E acute)

   For any other value of C nothing is written and DST is returned
   unchanged.  */
static char *
mb_replace (char *dst, const char c)
{
  static const struct
  {
    char ascii;
    char utf8[2];
  } map[] =
    {
      { 'a', { '\xc3', '\xa1' } },
      { 'A', { '\xc3', '\x81' } },
      { 'b', { '\xc4', '\x8d' } },
      { 'B', { '\xc4', '\x8c' } },
      { 'c', { '\xc4', '\x8f' } },
      { 'C', { '\xc4', '\x8e' } },
      { 'd', { '\xc3', '\xa9' } },
      { 'D', { '\xc3', '\x89' } },
    };

  for (size_t i = 0; i < sizeof (map) / sizeof (map[0]); ++i)
    if (map[i].ascii == c)
      {
        *dst++ = map[i].utf8[0];
        *dst++ = map[i].utf8[1];
        break;
      }

  return dst;
}
104
/* Return a newly allocated copy of STR in which every byte that occurs
   in LETTERS has been passed through mb_replace; all other bytes are
   copied unchanged.

   Returns NULL when STR is NULL or when the allocation fails.  The
   caller owns the result and must free it.  */
static char *
mb_frob_string (const char *str, const char *letters)
{
  if (str == NULL)
    return NULL;

  /* Each input byte turns into at most two output bytes.  */
  char *ret = malloc (2 * strlen (str) + 1);
  if (ret == NULL)
    return NULL;

  char *dst = ret;
  for (const char *src = str; *src != '\0'; ++src)
    {
      if (strchr (letters, *src) != NULL)
        dst = mb_replace (dst, *src);
      else
        *dst++ = *src;
    }
  *dst = '\0';

  return ret;
}
126
/* Like mb_frob_string, but don't replace anything between
   [: and :], [. and .] or [= and =] or characters escaped
   with a backslash.

   Returns a newly allocated string the caller must free, or NULL when
   STR is NULL or the allocation fails.  */
static char *
mb_frob_pattern (const char *str, const char *letters)
{
  if (str == NULL)
    return NULL;

  /* Each input byte turns into at most two output bytes.  */
  char *ret = malloc (2 * strlen (str) + 1);
  if (ret == NULL)
    return NULL;

  char *dst = ret;
  int in_class = 0;   /* Inside [: :], [. .] or [= =].  */
  int escaped = 0;    /* Previous byte was an unescaped backslash.  */

  for (const char *src = str; *src != '\0'; ++src)
    {
      const char ch = *src;

      if (ch == '\\')
        {
          /* A backslash toggles the state, so "\\\\" escapes itself.  */
          escaped = !escaped;
          *dst++ = ch;
          continue;
        }

      if (escaped)
        {
          /* The escaped byte is copied verbatim and ends the escape.  */
          escaped = 0;
          *dst++ = ch;
          continue;
        }

      if (!in_class)
        {
          if (strchr (letters, ch) != NULL)
            {
              dst = mb_replace (dst, ch);
              continue;
            }
          /* "[:", "[." or "[=" opens a region that must stay as is.  */
          if (ch == '[' && strchr (":.=", src[1]) != NULL)
            in_class = 1;
        }
      else if (ch == ']' && strchr (":.=", src[-1]) != NULL)
        /* ":]", ".]" or "=]" closes the region.  */
        in_class = 0;

      *dst++ = ch;
    }
  *dst = '\0';

  return ret;
}
170
/* Compare the match recorded in RM[IDX] against the expectation MATCH
   for the subject STRING.  MATCH is one of:

     "-"      the slot must be unset (both offsets are -1);
     "@TEXT"  the slot must be an empty match located where TEXT begins
              in STRING; a bare "@" requires the empty match to sit at
              the terminating NUL of STRING;
     TEXT     the matched substring must equal TEXT exactly.

   Returns 0 when the expectation holds.  Otherwise prints a diagnostic
   prefixed with FAIL to stdout and returns 1.  */
static int
check_match (regmatch_t *rm, int idx, const char *string,
             const char *match, const char *fail)
{
  const regmatch_t *slot = &rm[idx];
  const int so_unset = slot->rm_so == -1;
  const int eo_unset = slot->rm_eo == -1;

  if (strcmp (match, "-") == 0)
    {
      if (so_unset && eo_unset)
        return 0;
      printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
      return 1;
    }

  if (so_unset || eo_unset)
    {
      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
      return 1;
    }

  const char *start = string + slot->rm_so;

  if (match[0] == '@')
    {
      if (slot->rm_so != slot->rm_eo)
        {
          printf ("%s rm[%d] not empty\n", fail, idx);
          return 1;
        }

      /* Compare at least one byte, so that an empty context only
         matches the terminating NUL of STRING.  */
      const char *context = match + 1;
      size_t n = strlen (context);
      if (n == 0)
        n = 1;

      if (strncmp (start, context, n) != 0)
        {
          printf ("%s rm[%d] not matching %s\n", fail, idx, match);
          return 1;
        }
      return 0;
    }

  const size_t span = (size_t) (slot->rm_eo - slot->rm_so);
  if (span != strlen (match) || strncmp (start, match, span) != 0)
    {
      printf ("%s rm[%d] not matching %s\n", fail, idx, match);
      return 1;
    }

  return 0;
}
215
/* Run one test case.

   PATTERN is compiled with CFLAGS.  When EFLAGS is -1 the case expects
   regcomp to fail and STRING holds the expected error name without its
   "REG_" prefix (for example "EBRACK"); the name "EMPTY" is tolerated
   even if compilation succeeds.  Otherwise STRING is matched with
   EFLAGS: a NULL EXPECT means no match is expected, else EXPECT
   describes the whole match and MATCHES, if non-NULL, is a
   comma-separated list describing sub-matches 1..9 (see check_match
   for the notation).  Sub-matches beyond the end of the list are
   expected to be unset.  MATCHES is split in place while it is
   examined and restored afterwards.

   Returns 0 on success.  On failure prints a diagnostic prefixed with
   FAIL to stdout and returns 1.  */
static int
test (const char *pattern, int cflags, const char *string, int eflags,
      char *expect, char *matches, const char *fail)
{
  enum { NMATCH = 10 };
  regex_t re;
  regmatch_t rm[NMATCH];

  const int rc = regcomp (&re, pattern, cflags);
  if (rc != 0)
    {
      if (eflags != -1)
        {
          /* Compilation was supposed to work.  */
          char buf[500];
          regerror (rc, &re, buf, sizeof (buf));
          printf ("%s regcomp failed: %s\n", fail, buf);
          return 1;
        }

      /* A compile error was expected: check that it is the right one.  */
      static const struct
      {
        reg_errcode_t code;
        const char *name;
      } codes[] =
        {
          { REG_NOERROR, "NOERROR" },   { REG_NOMATCH, "NOMATCH" },
          { REG_BADPAT, "BADPAT" },     { REG_ECOLLATE, "ECOLLATE" },
          { REG_ECTYPE, "ECTYPE" },     { REG_EESCAPE, "EESCAPE" },
          { REG_ESUBREG, "ESUBREG" },   { REG_EBRACK, "EBRACK" },
          { REG_EPAREN, "EPAREN" },     { REG_EBRACE, "EBRACE" },
          { REG_BADBR, "BADBR" },       { REG_ERANGE, "ERANGE" },
          { REG_ESPACE, "ESPACE" },     { REG_BADRPT, "BADRPT" },
        };

      for (size_t i = 0; i < sizeof (codes) / sizeof (codes[0]); ++i)
        {
          if (rc != (int) codes[i].code)
            continue;
          if (strcmp (string, codes[i].name) == 0)
            return 0;
          printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
                  fail, codes[i].name, string);
          return 1;
        }

      printf ("%s regcomp return value REG_%d\n", fail, rc);
      return 1;
    }

  if (eflags == -1)
    {
      regfree (&re);

      /* The test case file assumes something only guaranteed by the
         rxspencer regex implementation.  Namely that for empty
         expressions regcomp() return REG_EMPTY.  This is not the case
         for us and so we ignore this error.  */
      if (strcmp (string, "EMPTY") == 0)
        return 0;

      printf ("%s regcomp unexpectedly succeeded\n", fail);
      return 1;
    }

  /* The compiled pattern is not needed once regexec has filled RM.  */
  const int exec_rc = regexec (&re, string, NMATCH, rm, eflags);
  regfree (&re);

  if (exec_rc != 0)
    {
      if (expect == NULL)
        return 0;
      printf ("%s regexec failed\n", fail);
      return 1;
    }

  if (expect == NULL)
    {
      printf ("%s regexec unexpectedly succeeded\n", fail);
      return 1;
    }

  /* With REG_NOSUB there are no offsets to verify.  */
  if (cflags & REG_NOSUB)
    return 0;

  int ret = check_match (rm, 0, string, expect, fail);
  if (matches == NULL)
    return ret;

  for (int idx = 1; ret == 0 && idx < NMATCH; ++idx)
    {
      if (matches == NULL)
        {
          /* The list is exhausted: remaining slots must be unset.  */
          ret = check_match (rm, idx, string, "-", fail);
          continue;
        }

      /* Temporarily terminate the current list element.  */
      char *comma = strchr (matches, ',');
      if (comma != NULL)
        *comma = '\0';

      ret = check_match (rm, idx, string, matches, fail);

      if (comma != NULL)
        {
          *comma = ',';
          matches = comma + 1;
        }
      else
        matches = NULL;
    }

  return ret;
}
319
/* Run test () on a multibyte variant of one test case: every byte of
   LETTERS is substituted via mb_frob_pattern in PATTERN and via
   mb_frob_string in EXPECT, MATCHES and (unless EFLAGS is -1, in which
   case STRING is passed through untouched) STRING.

   Returns the result of test (), or 1 after printing FAIL if one of the
   converted strings could not be produced.  All temporary copies are
   freed before returning.  */
static int
mb_test (const char *pattern, int cflags, const char *string, int eflags,
         char *expect, const char *matches, const char *letters,
         const char *fail)
{
  char *pattern_mb = mb_frob_pattern (pattern, letters);
  /* Owned copy of the subject; stays NULL when STRING is used as is.  */
  char *string_copy
    = eflags == -1 ? NULL : mb_frob_string (string, letters);
  const char *string_mb = eflags == -1 ? string : string_copy;
  char *expect_mb = mb_frob_string (expect, letters);
  char *matches_mb = mb_frob_string (matches, letters);
  int ret;

  /* EXPECT and MATCHES may legitimately be NULL; only a NULL result for
     a non-NULL input is a failure.  */
  const int conversion_failed
    = (pattern_mb == NULL
       || string_mb == NULL
       || (expect != NULL && expect_mb == NULL)
       || (matches != NULL && matches_mb == NULL));

  if (conversion_failed)
    {
      printf ("%s %m", fail);
      ret = 1;
    }
  else
    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
                matches_mb, fail);

  free (matches_mb);
  free (expect_mb);
  free (string_copy);
  free (pattern_mb);
  return ret;
}
349
/* Run mb_test for every non-empty combination of the letter pairs a/A,
   b/B, c/C and d/D.  A combination is skipped when one of its selected
   pairs occurs in neither PATTERN nor STRING.  Returns nonzero if any
   of the runs failed.  */
static int
mb_tests (const char *pattern, int cflags, const char *string, int eflags,
          char *expect, const char *matches)
{
  static const char pairs[4][2] =
    {
      { 'a', 'A' }, { 'b', 'B' }, { 'c', 'C' }, { 'd', 'D' }
    };
  int ret = 0;
  char letters[9], fail[20];

  /* The tests aren't supposed to work with xdigit, since a-dA-D are
     hex digits while \'a \'A \v{c}\v{C}\v{d}\v{D}\'e \'E are not.  */
  if (strstr (pattern, "[:xdigit:]") != NULL)
    return 0;

  /* XXX: regex ATM handles only single byte equivalence classes.  */
  if (strstr (pattern, "[[=b=]]") != NULL)
    return 0;

  /* Bit N of MASK selects pairs[N].  */
  for (int mask = 1; mask < 16; ++mask)
    {
      char *p = letters;
      int skip = 0;

      for (int bit = 0; bit < 4 && !skip; ++bit)
        {
          if ((mask & (1 << bit)) == 0)
            continue;

          const char lower = pairs[bit][0];
          const char upper = pairs[bit][1];
          const int used = (strchr (pattern, lower) != NULL
                            || strchr (string, lower) != NULL
                            || strchr (pattern, upper) != NULL
                            || strchr (string, upper) != NULL);
          if (!used)
            skip = 1;
          else
            {
              *p++ = lower;
              *p++ = upper;
            }
        }

      if (skip)
        continue;

      *p = '\0';
      /* Longest label: "UTF-8 " + 8 letters + " FAIL" = 19 characters.  */
      snprintf (fail, sizeof (fail), "UTF-8 %s FAIL", letters);
      ret |= mb_test (pattern, cflags, string, eflags, expect, matches,
                      letters, fail);
    }

  return ret;
}
405
/* Decode the flag column FLAGSTR of a test line into *CFLAGS (regcomp
   flags), *EFLAGS (regexec flags, or -1 when a compile error is
   expected) and *TRY_BRE_ERE (also run the case without REG_EXTENDED).
   Returns 0 if the line uses a flag this driver does not support and
   must be skipped, 1 otherwise.  */
static int
parse_flags (const char *flagstr, int *cflags, int *eflags,
             int *try_bre_ere)
{
  for (const char *p = flagstr; *p != '\0'; ++p)
    switch (*p)
      {
      case 'b':
        *cflags &= ~REG_EXTENDED;
        break;
      case '&':
        *try_bre_ere = 1;
        break;
      case 'C':
        *eflags = -1;
        break;
      case 'i':
        *cflags |= REG_ICASE;
        break;
      case 's':
        *cflags |= REG_NOSUB;
        break;
      case 'n':
        *cflags |= REG_NEWLINE;
        break;
      case '^':
        *eflags |= REG_NOTBOL;
        break;
      case '$':
        *eflags |= REG_NOTEOL;
        break;
      case 'm':
      case 'p':
      case '#':
        /* Not supported.  */
        return 0;
      default:
        /* '-' and any other character carry no meaning here.  */
        break;
      }

  return 1;
}

/* Usage: PROGRAM [--utf8] TESTFILE

   Reads TESTFILE line by line.  Lines that are empty or start with '#'
   are skipped.  Every other line is echoed to stdout and split at tabs
   into: pattern, flags, string, and optionally the expected match and a
   list of expected sub-matches.  A pattern or string written as "" (two
   double quotes) denotes the empty string.  Each case is run in the C
   locale; with --utf8, cases that passed are run again in the
   cs_CZ.UTF-8 locale, both as is and with multibyte substitutions.

   Returns 0 if every case passed, 1 otherwise.  */
int
main (int argc, char **argv)
{
  static int test_utf8 = 0;
  static const struct option options[] =
    {
      {"utf8", no_argument, &test_utf8, 1},
      {NULL, 0, NULL, 0 }
    };
  int ret = 0;
  char *line = NULL;
  size_t line_len = 0;
  ssize_t len;
  FILE *f;

  mtrace ();

  /* The only option just sets a flag, so there is nothing to do per
     option.  */
  while (getopt_long (argc, argv, "", options, NULL) >= 0)
    continue;

  if (optind + 1 != argc)
    {
      fprintf (stderr, "Missing test filename\n");
      return 1;
    }

  f = fopen (argv[optind], "r");
  if (f == NULL)
    {
      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
      return 1;
    }

  while ((len = getline (&line, &line_len, f)) > 0)
    {
      int cflags = REG_EXTENDED, eflags = 0, try_bre_ere = 0;

      if (line[len - 1] == '\n')
        line[len - 1] = '\0';

      /* Skip comments and empty lines.  */
      if (*line == '#' || *line == '\0')
        continue;

      puts (line);
      fflush (stdout);

      char *pattern = strtok (line, "\t");
      if (pattern == NULL)
        continue;
      if (strcmp (pattern, "\"\"") == 0)
        pattern += 2;

      char *flagstr = strtok (NULL, "\t");
      if (flagstr == NULL)
        continue;

      char *string = strtok (NULL, "\t");
      if (string == NULL)
        continue;
      if (strcmp (string, "\"\"") == 0)
        string += 2;

      if (!parse_flags (flagstr, &cflags, &eflags, &try_bre_ere))
        continue;

      replace_special_chars (pattern);
      glibc_re_syntax (pattern);
      /* When a compile error is expected STRING is an error name and
         must stay untouched.  */
      if (eflags != -1)
        replace_special_chars (string);

      char *expect = strtok (NULL, "\t");
      char *matches = NULL;
      if (expect != NULL)
        {
          replace_special_chars (expect);
          matches = strtok (NULL, "\t");
          if (matches != NULL)
            replace_special_chars (matches);
        }

      const int bre_cflags = cflags & ~REG_EXTENDED;

      if (setlocale (LC_ALL, "C") == NULL)
        {
          puts ("setlocale C failed");
          ret = 1;
        }

      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags,
                       expect, matches, "FAIL")))
        {
          ret = 1;
          continue;
        }

      if (!test_utf8)
        continue;

      if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
        {
          puts ("setlocale cs_CZ.UTF-8 failed");
          ret = 1;
        }
      else if (test (pattern, cflags, string, eflags, expect, matches,
                     "UTF-8 FAIL")
               || (try_bre_ere
                   && test (pattern, bre_cflags, string, eflags,
                            expect, matches, "UTF-8 FAIL")))
        ret = 1;
      else if (mb_tests (pattern, cflags, string, eflags, expect, matches)
               || (try_bre_ere
                   && mb_tests (pattern, bre_cflags, string, eflags,
                                expect, matches)))
        ret = 1;
    }

  free (line);
  fclose (f);
  return ret;
}
561