1 /* vi: set sw=4 ts=4: */
2 /*
3 * awk implementation for busybox
4 *
5 * Copyright (C) 2002 by Dmitry Zakharov <dmit@crp.bank.gov.ua>
6 *
7 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
8 */
9 //config:config AWK
10 //config: bool "awk (23 kb)"
11 //config: default y
12 //config: help
13 //config: Awk is used as a pattern scanning and processing language.
14 //config:
15 //config:config FEATURE_AWK_LIBM
16 //config: bool "Enable math functions (requires libm)"
17 //config: default y
18 //config: depends on AWK
19 //config: help
20 //config: Enable math functions of the Awk programming language.
21 //config: NOTE: This requires libm to be present for linking.
22 //config:
23 //config:config FEATURE_AWK_GNU_EXTENSIONS
24 //config: bool "Enable a few GNU extensions"
25 //config: default y
26 //config: depends on AWK
27 //config: help
28 //config: Enable a few features from gawk:
29 //config: * command line option -e AWK_PROGRAM
30 //config: * simultaneous use of -f and -e on the command line.
31 //config: This enables the use of awk library files.
32 //config: Example: awk -f mylib.awk -e '{print myfunction($1);}' ...
33
34 //applet:IF_AWK(APPLET_NOEXEC(awk, awk, BB_DIR_USR_BIN, BB_SUID_DROP, awk))
35
36 //kbuild:lib-$(CONFIG_AWK) += awk.o
37
38 //usage:#define awk_trivial_usage
39 //usage: "[OPTIONS] [AWK_PROGRAM] [FILE]..."
40 //usage:#define awk_full_usage "\n\n"
41 //usage: " -v VAR=VAL Set variable"
42 //usage: "\n -F SEP Use SEP as field separator"
43 //usage: "\n -f FILE Read program from FILE"
44 //usage: IF_FEATURE_AWK_GNU_EXTENSIONS(
45 //usage: "\n -e AWK_PROGRAM"
46 //usage: )
47
48 #include "libbb.h"
49 #include "xregex.h"
50 #include <math.h>
51
52 /* This is a NOEXEC applet. Be very careful! */
53
54
55 /* If you comment out one of these below, it will be #defined later
56 * to perform debug printfs to stderr: */
57 #define debug_printf_walker(...) do {} while (0)
58 #define debug_printf_eval(...) do {} while (0)
59 #define debug_printf_parse(...) do {} while (0)
60
61 #ifndef debug_printf_walker
62 # define debug_printf_walker(...) (fprintf(stderr, __VA_ARGS__))
63 #endif
64 #ifndef debug_printf_eval
65 # define debug_printf_eval(...) (fprintf(stderr, __VA_ARGS__))
66 #endif
67 #ifndef debug_printf_parse
68 # define debug_printf_parse(...) (fprintf(stderr, __VA_ARGS__))
69 #else
70 # define debug_parse_print_tc(...) ((void)0)
71 #endif
72
73
74 /* "+": stop on first non-option:
75 * $ awk 'BEGIN { for(i=1; i<ARGC; ++i) { print i ": " ARGV[i] }}' -argz
76 * 1: -argz
77 */
78 #define OPTSTR_AWK "+" \
79 "F:v:*f:*" \
80 IF_FEATURE_AWK_GNU_EXTENSIONS("e:*") \
81 "W:"
82 enum {
83 OPTBIT_F, /* define field separator */
84 OPTBIT_v, /* define variable */
85 OPTBIT_f, /* pull in awk program from file */
86 IF_FEATURE_AWK_GNU_EXTENSIONS(OPTBIT_e,) /* -e AWK_PROGRAM */
87 OPTBIT_W, /* -W ignored */
88 OPT_F = 1 << OPTBIT_F,
89 OPT_v = 1 << OPTBIT_v,
90 OPT_f = 1 << OPTBIT_f,
91 OPT_e = IF_FEATURE_AWK_GNU_EXTENSIONS((1 << OPTBIT_e)) + 0,
92 OPT_W = 1 << OPTBIT_W
93 };
94
95 #define MAXVARFMT 240
96
97 /* variable flags */
98 #define VF_NUMBER 0x0001 /* 1 = primary type is number */
99 #define VF_ARRAY 0x0002 /* 1 = it's an array */
100
101 #define VF_CACHED 0x0100 /* 1 = num/str value has cached str/num eq */
102 #define VF_USER 0x0200 /* 1 = user input (may be numeric string) */
103 #define VF_SPECIAL 0x0400 /* 1 = requires extra handling when changed */
104 #define VF_WALK 0x0800 /* 1 = variable has alloc'd x.walker list */
105 #define VF_FSTR 0x1000 /* 1 = don't free() var::string (not malloced, or is owned by something else) */
106 #define VF_CHILD 0x2000 /* 1 = function arg; x.parent points to source */
107 #define VF_DIRTY 0x4000 /* 1 = variable was set explicitly */
108
109 /* these flags are static, don't change them when value is changed */
110 #define VF_DONTTOUCH (VF_ARRAY | VF_SPECIAL | VF_WALK | VF_CHILD | VF_DIRTY)
111
/* One node of the list hung off var.x.walker (flag VF_WALK), which holds
 * the array elements being iterated by for..in.
 * NOTE(review): the code that fills and consumes this list is not in view
 * here; the per-field notes below are assumptions to be confirmed there.
 */
typedef struct walker_list {
	char *end;                  /* presumably end of the data stored in wbuf - confirm */
	char *cur;                  /* presumably current read position in wbuf - confirm */
	struct walker_list *prev;   /* link to another walker_list node */
	char wbuf[1];               /* declared [1]; presumably over-allocated like hash_item.name - confirm */
} walker_list;
118
/* Variable: carries both a string and a numeric value; the VF_* bits in
 * 'type' say which one is primary and whether the other one is a cached
 * conversion (see getvar_s() / getvar_i()).
 */
typedef struct var_s {
	unsigned type;            /* VF_* flags */
	char *string;             /* string value or NULL; clrvar() frees it unless VF_FSTR is set */
	double number;            /* numeric value, or cached result of converting 'string' */
	union {
		int aidx;               /* func arg idx (for compilation stage) */
		struct xhash_s *array;  /* array ptr (valid when VF_ARRAY is set, see iamarray()) */
		struct var_s *parent;   /* for func args, ptr to actual parameter (VF_CHILD) */
		walker_list *walker;    /* list of array elements (for..in) */
	} x;
} var;
131
/* Node chain (pattern-action chain, BEGIN, END, function bodies) */
typedef struct chain_s {
	struct node_s *first;       /* first node of the chain */
	struct node_s *last;        /* last node of the chain */
	const char *programname;    /* NOTE(review): presumably the name of the program source this chain was parsed from - confirm */
} chain;
138
/* Function: payload of an item in the function-name hash (fnhash) */
typedef struct func_s {
	unsigned nargs;         /* number of arguments */
	smallint defined;       /* NOTE(review): presumably nonzero once a body has been parsed - confirm in parser */
	struct chain_s body;    /* node chain of the function body */
} func;
145
/* I/O stream: payload of an item in the file hash (fdhash).
 * NOTE(review): the reader code that uses buffer/adv/size/pos is outside
 * this part of the file; the field notes below are assumptions.
 */
typedef struct rstream_s {
	FILE *F;            /* underlying stdio stream */
	char *buffer;       /* presumably the input read buffer - confirm */
	int adv;            /* presumably how far to advance in buffer for the next record - confirm */
	int size;           /* presumably allocated size of buffer - confirm */
	int pos;            /* presumably current position in buffer - confirm */
	smallint is_pipe;   /* presumably nonzero if F was opened as a pipe - confirm */
} rstream;
155
/* One entry of an xhash bucket chain.
 * 'data' must remain the FIRST member: hash_find() stores the pointer
 * returned by hash_search3() (which is &item->data) in a hash_item pointer
 * and dereferences it as such, which only works while both addresses
 * coincide.
 */
typedef struct hash_item_s {
	union {
		struct var_s v;          /* variable/array hash */
		struct rstream_s rs;     /* redirect streams hash */
		struct func_s f;         /* functions hash */
	} data;
	struct hash_item_s *next;    /* next in chain */
	char name[1];                /* really it's longer: hash_find() allocates room for the whole key */
} hash_item;
165
/* Chained hash table keyed by NUL-terminated strings */
typedef struct xhash_s {
	unsigned nel;                /* num of elements */
	unsigned csize;              /* current hash size (number of buckets in items[]) */
	unsigned nprime;             /* index into PRIMES[] of the next hash size */
	unsigned glen;               /* total length of all item names, counting each terminating NUL */
	struct hash_item_s **items;  /* bucket array: csize chain heads */
} xhash;
173
/* Tree node.
 * NOTE(review): which union member is meaningful depends on the node's
 * operation; the code deciding that is outside this part of the file.
 */
typedef struct node_s {
	uint32_t info;      /* presumably operation class, flags and priority bits, in the layout used by tokeninfo[] - confirm */
	unsigned lineno;    /* presumably source line number of the node - confirm */
	union {
		struct node_s *n;
		var *v;
		int aidx;
		const char *new_progname;
		regex_t *re;
	} l;                /* "left" payload */
	union {
		struct node_s *n;
		regex_t *ire;
		func *f;
	} r;                /* "right" payload */
	union {
		struct node_s *n;
	} a;                /* additional node link */
} node;
194
/* Splitter: a node bundled with storage for two compiled regexes.
 * Used for the field and record splitters (fsplitter, rsplitter).
 * NOTE(review): presumably re[0]/re[1] back the node's l.re and r.ire
 * pointers - confirm in the code that builds splitters.
 */
typedef struct tsplitter_s {
	node n;
	regex_t re[2];
} tsplitter;
199
200 /* simple token classes */
201 /* order and hex values are very important!!! See next_token() */
202 #define TC_LPAREN (1 << 0) /* ( */
203 #define TC_RPAREN (1 << 1) /* ) */
204 #define TC_REGEXP (1 << 2) /* /.../ */
205 #define TC_OUTRDR (1 << 3) /* | > >> */
206 #define TC_UOPPOST (1 << 4) /* unary postfix operator ++ -- */
207 #define TC_UOPPRE1 (1 << 5) /* unary prefix operator ++ -- $ */
208 #define TC_BINOPX (1 << 6) /* two-opnd operator */
209 #define TC_IN (1 << 7) /* 'in' */
210 #define TC_COMMA (1 << 8) /* , */
211 #define TC_PIPE (1 << 9) /* input redirection pipe | */
212 #define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */
213 #define TC_ARRTERM (1 << 11) /* ] */
214 #define TC_LBRACE (1 << 12) /* { */
215 #define TC_RBRACE (1 << 13) /* } */
216 #define TC_SEMICOL (1 << 14) /* ; */
217 #define TC_NEWLINE (1 << 15)
218 #define TC_STATX (1 << 16) /* ctl statement (for, next...) */
219 #define TC_WHILE (1 << 17) /* 'while' */
220 #define TC_ELSE (1 << 18) /* 'else' */
221 #define TC_BUILTIN (1 << 19)
222 /* This costs ~50 bytes of code.
223 * A separate class to support deprecated "length" form. If we don't need that
224 * (i.e. if we demand that only "length()" with () is valid), then TC_LENGTH
225 * can be merged with TC_BUILTIN:
226 */
227 #define TC_LENGTH (1 << 20) /* 'length' */
228 #define TC_GETLINE (1 << 21) /* 'getline' */
229 #define TC_FUNCDECL (1 << 22) /* 'function' 'func' */
230 #define TC_BEGIN (1 << 23) /* 'BEGIN' */
231 #define TC_END (1 << 24) /* 'END' */
232 #define TC_EOF (1 << 25)
233 #define TC_VARIABLE (1 << 26) /* name */
234 #define TC_ARRAY (1 << 27) /* name[ */
235 #define TC_FUNCTION (1 << 28) /* name( */
236 #define TC_STRING (1 << 29) /* "..." */
237 #define TC_NUMBER (1 << 30)
238
239 #ifndef debug_parse_print_tc
debug_parse_print_tc(uint32_t n)240 static void debug_parse_print_tc(uint32_t n)
241 {
242 if (n & TC_LPAREN ) debug_printf_parse(" LPAREN" );
243 if (n & TC_RPAREN ) debug_printf_parse(" RPAREN" );
244 if (n & TC_REGEXP ) debug_printf_parse(" REGEXP" );
245 if (n & TC_OUTRDR ) debug_printf_parse(" OUTRDR" );
246 if (n & TC_UOPPOST ) debug_printf_parse(" UOPPOST" );
247 if (n & TC_UOPPRE1 ) debug_printf_parse(" UOPPRE1" );
248 if (n & TC_BINOPX ) debug_printf_parse(" BINOPX" );
249 if (n & TC_IN ) debug_printf_parse(" IN" );
250 if (n & TC_COMMA ) debug_printf_parse(" COMMA" );
251 if (n & TC_PIPE ) debug_printf_parse(" PIPE" );
252 if (n & TC_UOPPRE2 ) debug_printf_parse(" UOPPRE2" );
253 if (n & TC_ARRTERM ) debug_printf_parse(" ARRTERM" );
254 if (n & TC_LBRACE ) debug_printf_parse(" LBRACE" );
255 if (n & TC_RBRACE ) debug_printf_parse(" RBRACE" );
256 if (n & TC_SEMICOL ) debug_printf_parse(" SEMICOL" );
257 if (n & TC_NEWLINE ) debug_printf_parse(" NEWLINE" );
258 if (n & TC_STATX ) debug_printf_parse(" STATX" );
259 if (n & TC_WHILE ) debug_printf_parse(" WHILE" );
260 if (n & TC_ELSE ) debug_printf_parse(" ELSE" );
261 if (n & TC_BUILTIN ) debug_printf_parse(" BUILTIN" );
262 if (n & TC_LENGTH ) debug_printf_parse(" LENGTH" );
263 if (n & TC_GETLINE ) debug_printf_parse(" GETLINE" );
264 if (n & TC_FUNCDECL) debug_printf_parse(" FUNCDECL");
265 if (n & TC_BEGIN ) debug_printf_parse(" BEGIN" );
266 if (n & TC_END ) debug_printf_parse(" END" );
267 if (n & TC_EOF ) debug_printf_parse(" EOF" );
268 if (n & TC_VARIABLE) debug_printf_parse(" VARIABLE");
269 if (n & TC_ARRAY ) debug_printf_parse(" ARRAY" );
270 if (n & TC_FUNCTION) debug_printf_parse(" FUNCTION");
271 if (n & TC_STRING ) debug_printf_parse(" STRING" );
272 if (n & TC_NUMBER ) debug_printf_parse(" NUMBER" );
273 }
274 #endif
275
276 /* combined token classes ("token [class] sets") */
277 #define TS_UOPPRE (TC_UOPPRE1 | TC_UOPPRE2)
278
279 #define TS_BINOP (TC_BINOPX | TC_COMMA | TC_PIPE | TC_IN)
280 //#define TS_UNARYOP (TS_UOPPRE | TC_UOPPOST)
281 #define TS_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \
282 | TC_BUILTIN | TC_LENGTH | TC_GETLINE \
283 | TC_LPAREN | TC_STRING | TC_NUMBER)
284
285 #define TS_LVALUE (TC_VARIABLE | TC_ARRAY)
286 #define TS_STATEMNT (TC_STATX | TC_WHILE)
287
288 /* word tokens, cannot mean something else if not expected */
289 #define TS_WORD (TC_IN | TS_STATEMNT | TC_ELSE \
290 | TC_BUILTIN | TC_LENGTH | TC_GETLINE \
291 | TC_FUNCDECL | TC_BEGIN | TC_END)
292
293 /* discard newlines after these */
294 #define TS_NOTERM (TS_BINOP | TC_COMMA | TC_LBRACE | TC_RBRACE \
295 | TC_SEMICOL | TC_NEWLINE)
296
297 /* what can expression begin with */
298 #define TS_OPSEQ (TS_OPERAND | TS_UOPPRE | TC_REGEXP)
299 /* what can group begin with */
300 #define TS_GRPSEQ (TS_OPSEQ | TS_STATEMNT \
301 | TC_SEMICOL | TC_NEWLINE | TC_LBRACE)
302
303 /* if previous token class is CONCAT_L and next is CONCAT_R, concatenation */
304 /* operator is inserted between them */
305 #define TS_CONCAT_L (TC_VARIABLE | TC_ARRTERM | TC_RPAREN \
306 | TC_STRING | TC_NUMBER | TC_UOPPOST \
307 | TC_LENGTH)
308 #define TS_CONCAT_R (TS_OPERAND | TS_UOPPRE)
309
310 #define OF_RES1 0x010000
311 #define OF_RES2 0x020000
312 #define OF_STR1 0x040000
313 #define OF_STR2 0x080000
314 #define OF_NUM1 0x100000
315 #define OF_CHECKED 0x200000
316 #define OF_REQUIRED 0x400000
317
318 /* combined operator flags */
319 #define xx 0
320 #define xV OF_RES2
321 #define xS (OF_RES2 | OF_STR2)
322 #define Vx OF_RES1
323 #define Rx OF_REQUIRED
324 #define VV (OF_RES1 | OF_RES2)
325 #define Nx (OF_RES1 | OF_NUM1)
326 #define NV (OF_RES1 | OF_NUM1 | OF_RES2)
327 #define Sx (OF_RES1 | OF_STR1)
328 #define SV (OF_RES1 | OF_STR1 | OF_RES2)
329 #define SS (OF_RES1 | OF_STR1 | OF_RES2 | OF_STR2)
330
331 #define OPCLSMASK 0xFF00
332 #define OPNMASK 0x007F
333
/* Operator priority is stored in the highest byte of the info word
 * (even value: right-to-left grouping, odd value: left-to-right grouping).
 * For builtins that byte has a different meaning.
 */
337 #undef P
338 #undef PRIMASK
339 #undef PRIMASK2
340 #define P(x) (x << 24)
341 #define PRIMASK 0x7F000000
342 #define PRIMASK2 0x7E000000
343
344 /* Operation classes */
345 #define SHIFT_TIL_THIS 0x0600
346 #define RECUR_FROM_THIS 0x1000
347 enum {
348 OC_DELETE = 0x0100, OC_EXEC = 0x0200, OC_NEWSOURCE = 0x0300,
349 OC_PRINT = 0x0400, OC_PRINTF = 0x0500, OC_WALKINIT = 0x0600,
350
351 OC_BR = 0x0700, OC_BREAK = 0x0800, OC_CONTINUE = 0x0900,
352 OC_EXIT = 0x0a00, OC_NEXT = 0x0b00, OC_NEXTFILE = 0x0c00,
353 OC_TEST = 0x0d00, OC_WALKNEXT = 0x0e00,
354
355 OC_BINARY = 0x1000, OC_BUILTIN = 0x1100, OC_COLON = 0x1200,
356 OC_COMMA = 0x1300, OC_COMPARE = 0x1400, OC_CONCAT = 0x1500,
357 OC_FBLTIN = 0x1600, OC_FIELD = 0x1700, OC_FNARG = 0x1800,
358 OC_FUNC = 0x1900, OC_GETLINE = 0x1a00, OC_IN = 0x1b00,
359 OC_LAND = 0x1c00, OC_LOR = 0x1d00, OC_MATCH = 0x1e00,
360 OC_MOVE = 0x1f00, OC_PGETLINE = 0x2000, OC_REGEXP = 0x2100,
361 OC_REPLACE = 0x2200, OC_RETURN = 0x2300, OC_SPRINTF = 0x2400,
362 OC_TERNARY = 0x2500, OC_UNARY = 0x2600, OC_VAR = 0x2700,
363 OC_DONE = 0x2800,
364
365 ST_IF = 0x3000, ST_DO = 0x3100, ST_FOR = 0x3200,
366 ST_WHILE = 0x3300
367 };
368
369 /* simple builtins */
370 enum {
371 F_in, F_rn, F_co, F_ex, F_lg, F_si, F_sq, F_sr,
372 F_ti, F_le, F_sy, F_ff, F_cl
373 };
374
375 /* builtins */
376 enum {
377 B_a2, B_ix, B_ma, B_sp, B_ss, B_ti, B_mt, B_lo, B_up,
378 B_ge, B_gs, B_su,
379 B_an, B_co, B_ls, B_or, B_rs, B_xo,
380 };
381
382 /* tokens and their corresponding info values */
383
384 #define NTC "\377" /* switch to next token class (tc<<1) */
385 #define NTCC '\377'
386
387 static const char tokenlist[] ALIGN1 =
388 "\1(" NTC /* TC_LPAREN */
389 "\1)" NTC /* TC_RPAREN */
390 "\1/" NTC /* TC_REGEXP */
391 "\2>>" "\1>" "\1|" NTC /* TC_OUTRDR */
392 "\2++" "\2--" NTC /* TC_UOPPOST */
393 "\2++" "\2--" "\1$" NTC /* TC_UOPPRE1 */
394 "\2==" "\1=" "\2+=" "\2-=" /* TC_BINOPX */
395 "\2*=" "\2/=" "\2%=" "\2^="
396 "\1+" "\1-" "\3**=" "\2**"
397 "\1/" "\1%" "\1^" "\1*"
398 "\2!=" "\2>=" "\2<=" "\1>"
399 "\1<" "\2!~" "\1~" "\2&&"
400 "\2||" "\1?" "\1:" NTC
401 "\2in" NTC /* TC_IN */
402 "\1," NTC /* TC_COMMA */
403 "\1|" NTC /* TC_PIPE */
404 "\1+" "\1-" "\1!" NTC /* TC_UOPPRE2 */
405 "\1]" NTC /* TC_ARRTERM */
406 "\1{" NTC /* TC_LBRACE */
407 "\1}" NTC /* TC_RBRACE */
408 "\1;" NTC /* TC_SEMICOL */
409 "\1\n" NTC /* TC_NEWLINE */
410 "\2if" "\2do" "\3for" "\5break" /* TC_STATX */
411 "\10continue" "\6delete" "\5print"
412 "\6printf" "\4next" "\10nextfile"
413 "\6return" "\4exit" NTC
414 "\5while" NTC /* TC_WHILE */
415 "\4else" NTC /* TC_ELSE */
416 "\3and" "\5compl" "\6lshift" "\2or" /* TC_BUILTIN */
417 "\6rshift" "\3xor"
418 "\5close" "\6system" "\6fflush" "\5atan2"
419 "\3cos" "\3exp" "\3int" "\3log"
420 "\4rand" "\3sin" "\4sqrt" "\5srand"
421 "\6gensub" "\4gsub" "\5index" /* "\6length" was here */
422 "\5match" "\5split" "\7sprintf" "\3sub"
423 "\6substr" "\7systime" "\10strftime" "\6mktime"
424 "\7tolower" "\7toupper" NTC
425 "\6length" NTC /* TC_LENGTH */
426 "\7getline" NTC /* TC_GETLINE */
427 "\4func" "\10function" NTC /* TC_FUNCDECL */
428 "\5BEGIN" NTC /* TC_BEGIN */
429 "\3END" /* TC_END */
430 /* compiler adds trailing "\0" */
431 ;
432
433 static const uint32_t tokeninfo[] ALIGN4 = {
434 0,
435 0,
436 #define TI_REGEXP OC_REGEXP
437 TI_REGEXP,
438 xS|'a', xS|'w', xS|'|',
439 OC_UNARY|xV|P(9)|'p', OC_UNARY|xV|P(9)|'m',
440 #define TI_PREINC (OC_UNARY|xV|P(9)|'P')
441 #define TI_PREDEC (OC_UNARY|xV|P(9)|'M')
442 TI_PREINC, TI_PREDEC, OC_FIELD|xV|P(5),
443 OC_COMPARE|VV|P(39)|5, OC_MOVE|VV|P(74), OC_REPLACE|NV|P(74)|'+', OC_REPLACE|NV|P(74)|'-',
444 OC_REPLACE|NV|P(74)|'*', OC_REPLACE|NV|P(74)|'/', OC_REPLACE|NV|P(74)|'%', OC_REPLACE|NV|P(74)|'&',
445 OC_BINARY|NV|P(29)|'+', OC_BINARY|NV|P(29)|'-', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(15)|'&',
446 OC_BINARY|NV|P(25)|'/', OC_BINARY|NV|P(25)|'%', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'*',
447 OC_COMPARE|VV|P(39)|4, OC_COMPARE|VV|P(39)|3, OC_COMPARE|VV|P(39)|0, OC_COMPARE|VV|P(39)|1,
448 #define TI_LESS (OC_COMPARE|VV|P(39)|2)
449 TI_LESS, OC_MATCH|Sx|P(45)|'!', OC_MATCH|Sx|P(45)|'~', OC_LAND|Vx|P(55),
450 #define TI_TERNARY (OC_TERNARY|Vx|P(64)|'?')
451 #define TI_COLON (OC_COLON|xx|P(67)|':')
452 OC_LOR|Vx|P(59), TI_TERNARY, TI_COLON,
453 #define TI_IN (OC_IN|SV|P(49))
454 TI_IN,
455 #define TI_COMMA (OC_COMMA|SS|P(80))
456 TI_COMMA,
457 #define TI_PGETLINE (OC_PGETLINE|SV|P(37))
458 TI_PGETLINE,
459 OC_UNARY|xV|P(19)|'+', OC_UNARY|xV|P(19)|'-', OC_UNARY|xV|P(19)|'!',
460 0, /* ] */
461 0,
462 0,
463 0,
464 0, /* \n */
465 ST_IF, ST_DO, ST_FOR, OC_BREAK,
466 OC_CONTINUE, OC_DELETE|Rx, OC_PRINT,
467 OC_PRINTF, OC_NEXT, OC_NEXTFILE,
468 OC_RETURN|Vx, OC_EXIT|Nx,
469 ST_WHILE,
470 0, /* else */
471 // OC_B's are builtins with enforced minimum number of arguments (two upper bits).
472 // Highest byte bit pattern: nn s3s2s1 v3v2v1
473 // nn - min. number of args, sN - resolve Nth arg to string, vN - resolve to var
474 // OC_F's are builtins with zero or one argument.
475 // |Rx| enforces that arg is present for: system, close, cos, sin, exp, int, log, sqrt
476 // Check for no args is present in builtins' code (not in this table): rand, systime
477 // Have one _optional_ arg: fflush, srand, length
478 #define OC_B OC_BUILTIN
479 #define OC_F OC_FBLTIN
480 #define A1 P(0x40) /*one arg*/
481 #define A2 P(0x80) /*two args*/
482 #define A3 P(0xc0) /*three args*/
483 #define __v P(1)
484 #define _vv P(3)
485 #define __s__v P(9)
486 #define __s_vv P(0x0b)
487 #define __svvv P(0x0f)
488 #define _ss_vv P(0x1b)
489 #define _s_vv_ P(0x16)
490 #define ss_vv_ P(0x36)
491 OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or
492 OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor
493 OC_F|F_cl|Sx|Rx, OC_F|F_sy|Sx|Rx, OC_F|F_ff|Sx, OC_B|B_a2|_vv|A2, // close system fflush atan2
494 OC_F|F_co|Nx|Rx, OC_F|F_ex|Nx|Rx, OC_F|F_in|Nx|Rx, OC_F|F_lg|Nx|Rx, // cos exp int log
495 OC_F|F_rn, OC_F|F_si|Nx|Rx, OC_F|F_sq|Nx|Rx, OC_F|F_sr|Nx, // rand sin sqrt srand
496 OC_B|B_ge|_s_vv_|A3,OC_B|B_gs|ss_vv_|A2,OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/
497 OC_B|B_ma|__s__v|A2,OC_B|B_sp|__s_vv|A2,OC_SPRINTF, OC_B|B_su|ss_vv_|A2,// match split sprintf sub
498 OC_B|B_ss|__svvv|A2,OC_F|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime
499 OC_B|B_lo|__s__v|A1,OC_B|B_up|__s__v|A1, // tolower toupper
500 OC_F|F_le|Sx, // length
501 OC_GETLINE|SV, // getline
502 0, 0, // func function
503 0, // BEGIN
504 0 // END
505 #undef A1
506 #undef A2
507 #undef A3
508 #undef OC_B
509 #undef OC_F
510 };
511
512 /* internal variable names and their initial values */
513 /* asterisk marks SPECIAL vars; $ is just no-named Field0 */
514 enum {
515 CONVFMT, OFMT, FS, OFS,
516 ORS, RS, RT, FILENAME,
517 SUBSEP, F0, ARGIND, ARGC,
518 ARGV, ERRNO, FNR, NR,
519 NF, IGNORECASE, ENVIRON, NUM_INTERNAL_VARS
520 };
521
522 static const char vNames[] ALIGN1 =
523 "CONVFMT\0" "OFMT\0" "FS\0*" "OFS\0"
524 "ORS\0" "RS\0*" "RT\0" "FILENAME\0"
525 "SUBSEP\0" "$\0*" "ARGIND\0" "ARGC\0"
526 "ARGV\0" "ERRNO\0" "FNR\0" "NR\0"
527 "NF\0*" "IGNORECASE\0*" "ENVIRON\0" "\0";
528
529 static const char vValues[] ALIGN1 =
530 "%.6g\0" "%.6g\0" " \0" " \0"
531 "\n\0" "\n\0" "\0" "\0"
532 "\034\0" "\0" "\377";
533
534 /* hash size may grow to these values */
535 #define FIRST_PRIME 61
536 static const uint16_t PRIMES[] ALIGN2 = { 251, 1021, 4093, 16381, 65521 };
537
538
539 /* Globals. Split in two parts so that first one is addressed
540 * with (mostly short) negative offsets.
541 * NB: it's unsafe to put members of type "double"
542 * into globals2 (gcc may fail to align them).
543 */
544 struct globals {
545 double t_double;
546 chain beginseq, mainseq, endseq;
547 chain *seq;
548 node *break_ptr, *continue_ptr;
549 rstream *iF;
550 xhash *ahash; /* argument names, used only while parsing function bodies */
551 xhash *fnhash; /* function names, used only in parsing stage */
552 xhash *vhash; /* variables and arrays */
553 //xhash *fdhash; /* file objects, used only in execution stage */
554 //we are reusing ahash as fdhash, via define (see later)
555 const char *g_progname;
556 int g_lineno;
557 int nfields;
558 int maxfields; /* used in fsrealloc() only */
559 var *Fields;
560 char *g_pos;
561 char g_saved_ch;
562 smallint icase;
563 smallint exiting;
564 smallint nextrec;
565 smallint nextfile;
566 smallint is_f0_split;
567 smallint t_rollback;
568
569 /* former statics from various functions */
570 smallint next_token__concat_inserted;
571 uint32_t next_token__save_tclass;
572 uint32_t next_token__save_info;
573 };
574 struct globals2 {
575 uint32_t t_info; /* often used */
576 uint32_t t_tclass;
577 char *t_string;
578 int t_lineno;
579
580 var *intvar[NUM_INTERNAL_VARS]; /* often used */
581
582 /* former statics from various functions */
583 char *split_f0__fstrings;
584
585 rstream next_input_file__rsm;
586 smallint next_input_file__files_happen;
587
588 smalluint exitcode;
589
590 unsigned evaluate__seed;
591 var *evaluate__fnargs;
592 regex_t evaluate__sreg;
593
594 var ptest__tmpvar;
595 var awk_printf__tmpvar;
596 var as_regex__tmpvar;
597 var exit__tmpvar;
598 var main__tmpvar;
599
600 tsplitter exec_builtin__tspl;
601
602 /* biggest and least used members go last */
603 tsplitter fsplitter, rsplitter;
604
605 char g_buf[MAXVARFMT + 1];
606 };
607 #define G1 (ptr_to_globals[-1])
608 #define G (*(struct globals2 *)ptr_to_globals)
609 /* For debug. nm --size-sort awk.o | grep -vi ' [tr] ' */
610 //char G1size[sizeof(G1)]; // 0x70
611 //char Gsize[sizeof(G)]; // 0x2f8
612 /* Trying to keep most of members accessible with short offsets: */
613 //char Gofs_seed[offsetof(struct globals2, evaluate__seed)]; // 0x7c
614 #define t_double (G1.t_double )
615 #define beginseq (G1.beginseq )
616 #define mainseq (G1.mainseq )
617 #define endseq (G1.endseq )
618 #define seq (G1.seq )
619 #define break_ptr (G1.break_ptr )
620 #define continue_ptr (G1.continue_ptr)
621 #define iF (G1.iF )
622 #define ahash (G1.ahash )
623 #define fnhash (G1.fnhash )
624 #define vhash (G1.vhash )
625 #define fdhash ahash
626 //^^^^^^^^^^^^^^^^^^ ahash is cleared after every function parsing,
627 // and ends up empty after parsing phase. Thus, we can simply reuse it
628 // for fdhash in execution stage.
629 #define g_progname (G1.g_progname )
630 #define g_lineno (G1.g_lineno )
631 #define nfields (G1.nfields )
632 #define maxfields (G1.maxfields )
633 #define Fields (G1.Fields )
634 #define g_pos (G1.g_pos )
635 #define g_saved_ch (G1.g_saved_ch )
636 #define icase (G1.icase )
637 #define exiting (G1.exiting )
638 #define nextrec (G1.nextrec )
639 #define nextfile (G1.nextfile )
640 #define is_f0_split (G1.is_f0_split )
641 #define t_rollback (G1.t_rollback )
642 #define t_info (G.t_info )
643 #define t_tclass (G.t_tclass )
644 #define t_string (G.t_string )
645 #define t_lineno (G.t_lineno )
646 #define intvar (G.intvar )
647 #define fsplitter (G.fsplitter )
648 #define rsplitter (G.rsplitter )
649 #define g_buf (G.g_buf )
650 #define INIT_G() do { \
651 SET_PTR_TO_GLOBALS((char*)xzalloc(sizeof(G1)+sizeof(G)) + sizeof(G1)); \
652 t_tclass = TC_NEWLINE; \
653 G.evaluate__seed = 1; \
654 } while (0)
655
656 static const char EMSG_UNEXP_EOS[] ALIGN1 = "Unexpected end of string";
657 static const char EMSG_UNEXP_TOKEN[] ALIGN1 = "Unexpected token";
658 static const char EMSG_DIV_BY_ZERO[] ALIGN1 = "Division by zero";
659 static const char EMSG_INV_FMT[] ALIGN1 = "Invalid format specifier";
660 static const char EMSG_TOO_FEW_ARGS[] ALIGN1 = "Too few arguments";
661 static const char EMSG_NOT_ARRAY[] ALIGN1 = "Not an array";
662 static const char EMSG_POSSIBLE_ERROR[] ALIGN1 = "Possible syntax error";
663 static const char EMSG_UNDEF_FUNC[] ALIGN1 = "Call to undefined function";
664 static const char EMSG_NO_MATH[] ALIGN1 = "Math support is not compiled in";
665 static const char EMSG_NEGATIVE_FIELD[] ALIGN1 = "Access to negative field";
666
667 static int awk_exit(void) NORETURN;
668
669 static void syntax_error(const char *message) NORETURN;
/* Report a fatal error in the form "<g_progname>:<g_lineno>: <message>"
 * by handing it to bb_error_msg_and_die(). The prototype above marks this
 * function NORETURN.
 */
static void syntax_error(const char *message)
{
	bb_error_msg_and_die("%s:%i: %s", g_progname, g_lineno, message);
}
674
675 /* ---- hash stuff ---- */
676
/* Compute the raw (not yet reduced modulo table size) hash of a
 * NUL-terminated key: for every character, hash = hash * 63 + char,
 * in wrapping unsigned arithmetic. An empty key hashes to 0.
 */
static unsigned hashidx(const char *name)
{
	unsigned acc = 0;
	const char *p;

	for (p = name; *p != '\0'; p++)
		acc = (acc << 6) - acc + *p;    /* (acc << 6) - acc == acc * 63 */
	return acc;
}
685
686 /* create new hash */
hash_init(void)687 static xhash *hash_init(void)
688 {
689 xhash *newhash;
690
691 newhash = xzalloc(sizeof(*newhash));
692 newhash->csize = FIRST_PRIME;
693 newhash->items = xzalloc(FIRST_PRIME * sizeof(newhash->items[0]));
694
695 return newhash;
696 }
697
/* Remove and free every item of the hash, leaving an empty table with the
 * same number of buckets. Element and name-length counters are reset.
 */
static void hash_clear(xhash *hash)
{
	unsigned bucket;

	for (bucket = 0; bucket < hash->csize; bucket++) {
		hash_item *cur = hash->items[bucket];

		hash->items[bucket] = NULL;
		while (cur != NULL) {
			hash_item *following = cur->next;

			//FIXME: this assumes that it's a hash of *variables*:
			free(cur->data.v.string);
			free(cur);
			cur = following;
		}
	}
	hash->nel = 0;
	hash->glen = 0;
}
716
717 #if 0 //UNUSED
718 static void hash_free(xhash *hash)
719 {
720 hash_clear(hash);
721 free(hash->items);
722 free(hash);
723 }
724 #endif
725
726 /* find item in hash, return ptr to data, NULL if not found */
hash_search3(xhash * hash,const char * name,unsigned idx)727 static NOINLINE void *hash_search3(xhash *hash, const char *name, unsigned idx)
728 {
729 hash_item *hi;
730
731 hi = hash->items[idx % hash->csize];
732 while (hi) {
733 if (strcmp(hi->name, name) == 0)
734 return &hi->data;
735 hi = hi->next;
736 }
737 return NULL;
738 }
739
/* Look up name in hash; returns a pointer to the item's data or NULL */
static void *hash_search(xhash *hash, const char *name)
{
	unsigned raw_hash = hashidx(name);

	return hash_search3(hash, name, raw_hash);
}
744
745 /* grow hash if it becomes too big */
hash_rebuild(xhash * hash)746 static void hash_rebuild(xhash *hash)
747 {
748 unsigned newsize, i, idx;
749 hash_item **newitems, *hi, *thi;
750
751 if (hash->nprime == ARRAY_SIZE(PRIMES))
752 return;
753
754 newsize = PRIMES[hash->nprime++];
755 newitems = xzalloc(newsize * sizeof(newitems[0]));
756
757 for (i = 0; i < hash->csize; i++) {
758 hi = hash->items[i];
759 while (hi) {
760 thi = hi;
761 hi = thi->next;
762 idx = hashidx(thi->name) % newsize;
763 thi->next = newitems[idx];
764 newitems[idx] = thi;
765 }
766 }
767
768 free(hash->items);
769 hash->csize = newsize;
770 hash->items = newitems;
771 }
772
773 /* find item in hash, add it if necessary. Return ptr to data */
hash_find(xhash * hash,const char * name)774 static void *hash_find(xhash *hash, const char *name)
775 {
776 hash_item *hi;
777 unsigned idx;
778 int l;
779
780 idx = hashidx(name);
781 hi = hash_search3(hash, name, idx);
782 if (!hi) {
783 if (++hash->nel > hash->csize * 8)
784 hash_rebuild(hash);
785
786 l = strlen(name) + 1;
787 hi = xzalloc(sizeof(*hi) + l);
788 strcpy(hi->name, name);
789
790 idx = idx % hash->csize;
791 hi->next = hash->items[idx];
792 hash->items[idx] = hi;
793 hash->glen += l;
794 }
795 return &hi->data;
796 }
797
798 #define findvar(hash, name) ((var*) hash_find((hash), (name)))
799 #define newvar(name) ((var*) hash_find(vhash, (name)))
800 #define newfile(name) ((rstream*)hash_find(fdhash, (name)))
801 #define newfunc(name) ((func*) hash_find(fnhash, (name)))
802
/* Unlink and free the item called name, if present, and update the
 * element and name-length counters. Only the item itself is freed here.
 */
static void hash_remove(xhash *hash, const char *name)
{
	hash_item **link = &hash->items[hashidx(name) % hash->csize];
	hash_item *cur;

	for (; (cur = *link) != NULL; link = &cur->next) {
		if (strcmp(cur->name, name) != 0)
			continue;

		*link = cur->next;
		hash->nel--;
		hash->glen -= (strlen(name) + 1);
		free(cur);
		return;
	}
}
820
821 /* ------ some useful functions ------ */
822
skip_spaces(char * p)823 static char *skip_spaces(char *p)
824 {
825 for (;;) {
826 if (*p == '\\' && p[1] == '\n') {
827 p++;
828 t_lineno++;
829 } else if (*p != ' ' && *p != '\t') {
830 break;
831 }
832 p++;
833 }
834 return p;
835 }
836
/* Return the word *s currently points to, and move *s just past that
 * word's terminating NUL (i.e. to the start of the following word).
 */
static char *nextword(char **s)
{
	char *word = *s;
	char *end = word;

	while (*end != '\0')
		end++;
	*s = end + 1;
	return word;
}
847
/* Fetch the next, possibly backslash-escaped, character from *s and
 * advance *s past everything consumed.
 * A backslash is first handed to bb_process_escape_sequence(). If that
 * returns a backslash without having advanced *s (unrecognized escape
 * such as \z or \"), the character after the backslash is returned
 * literally; *s is not moved past a NUL in that position.
 */
static char nextchar(char **s)
{
	char *after_backslash;
	char ch;

	ch = **s;
	(*s)++;
	if (ch != '\\')
		return ch;

	after_backslash = *s;
	ch = bb_process_escape_sequence((const char**)s);
	/* Example awk statement:
	 *   s = "abc\"def"
	 * we must treat \" as "
	 */
	if (ch == '\\' && *s == after_backslash) { /* unrecognized \z? */
		ch = **s; /* yes, fetch z */
		if (ch != '\0')
			(*s)++; /* advance unless z = NUL */
	}
	return ch;
}
867
/* Rewrite the string s1 in place with its escape sequences resolved by
 * nextchar(). Stops after storing the first NUL that nextchar() yields.
 * TODO: merge with strcpy_and_process_escape_sequences()?
 */
static void unescape_string_in_place(char *s1)
{
	char *src = s1;
	char *dst = s1;

	for (;;) {
		char ch = nextchar(&src);

		*dst = ch;
		if (ch == '\0')
			break;
		dst++;
	}
}
876
isalnum_(int c)877 static ALWAYS_INLINE int isalnum_(int c)
878 {
879 return (isalnum(c) || c == '_');
880 }
881
my_strtod(char ** pp)882 static double my_strtod(char **pp)
883 {
884 char *cp = *pp;
885 if (ENABLE_DESKTOP && cp[0] == '0') {
886 /* Might be hex or octal integer: 0x123abc or 07777 */
887 char c = (cp[1] | 0x20);
888 if (c == 'x' || isdigit(cp[1])) {
889 unsigned long long ull = strtoull(cp, pp, 0);
890 if (c == 'x')
891 return ull;
892 c = **pp;
893 if (!isdigit(c) && c != '.')
894 return ull;
895 /* else: it may be a floating number. Examples:
896 * 009.123 (*pp points to '9')
897 * 000.123 (*pp points to '.')
898 * fall through to strtod.
899 */
900 }
901 }
902 return strtod(cp, pp);
903 }
904
905 /* -------- working with variables (set/get/copy/etc) -------- */
906
fmt_num(const char * format,double n)907 static void fmt_num(const char *format, double n)
908 {
909 if (n == (long long)n) {
910 snprintf(g_buf, MAXVARFMT, "%lld", (long long)n);
911 } else {
912 const char *s = format;
913 char c;
914
915 do { c = *s; } while (c && *++s);
916 if (strchr("diouxX", c)) {
917 snprintf(g_buf, MAXVARFMT, format, (int)n);
918 } else if (strchr("eEfFgGaA", c)) {
919 snprintf(g_buf, MAXVARFMT, format, n);
920 } else {
921 syntax_error(EMSG_INV_FMT);
922 }
923 }
924 }
925
iamarray(var * a)926 static xhash *iamarray(var *a)
927 {
928 while (a->type & VF_CHILD)
929 a = a->x.parent;
930
931 if (!(a->type & VF_ARRAY)) {
932 a->type |= VF_ARRAY;
933 a->x.array = hash_init();
934 }
935 return a->x.array;
936 }
937
938 #define clear_array(array) hash_clear(array)
939
940 /* clear a variable */
clrvar(var * v)941 static var *clrvar(var *v)
942 {
943 if (!(v->type & VF_FSTR))
944 free(v->string);
945
946 v->type &= VF_DONTTOUCH;
947 v->type |= VF_DIRTY;
948 v->string = NULL;
949 return v;
950 }
951
952 static void handle_special(var *);
953
954 /* assign string value to variable */
setvar_p(var * v,char * value)955 static var *setvar_p(var *v, char *value)
956 {
957 clrvar(v);
958 v->string = value;
959 handle_special(v);
960 return v;
961 }
962
963 /* same as setvar_p but make a copy of string */
setvar_s(var * v,const char * value)964 static var *setvar_s(var *v, const char *value)
965 {
966 return setvar_p(v, (value && *value) ? xstrdup(value) : NULL);
967 }
968
969 /* same as setvar_s but sets USER flag */
setvar_u(var * v,const char * value)970 static var *setvar_u(var *v, const char *value)
971 {
972 v = setvar_s(v, value);
973 v->type |= VF_USER;
974 return v;
975 }
976
977 /* set array element to user string */
setari_u(var * a,int idx,const char * s)978 static void setari_u(var *a, int idx, const char *s)
979 {
980 var *v;
981
982 v = findvar(iamarray(a), itoa(idx));
983 setvar_u(v, s);
984 }
985
986 /* assign numeric value to variable */
setvar_i(var * v,double value)987 static var *setvar_i(var *v, double value)
988 {
989 clrvar(v);
990 v->type |= VF_NUMBER;
991 v->number = value;
992 handle_special(v);
993 return v;
994 }
995
/* Return the string value of v (never NULL; "" if v has no string).
 *
 * If v is a number with no cached string form, the number is converted
 * with the CONVFMT format, the result is stored in v->string and the
 * variable is marked VF_CACHED, so the conversion happens only once.
 * The returned pointer belongs to v; the caller must not free it.
 */
static const char *getvar_s(var *v)
{
	/* if v is numeric and has no cached string, convert it to string */
	if ((v->type & (VF_NUMBER | VF_CACHED)) == VF_NUMBER) {
		/* Built-in default, same as the initial CONVFMT in vValues[] */
		const char *convfmt = "%.6g";

		/* CONVFMT itself may hold an uncached number (CONVFMT=9).
		 * Fetching its string form through getvar_s() would then
		 * re-enter this branch for the very same variable, forever.
		 * Convert CONVFMT with the default format instead.
		 */
		if (v != intvar[CONVFMT])
			convfmt = getvar_s(intvar[CONVFMT]);

		fmt_num(convfmt, v->number);
		v->string = xstrdup(g_buf);
		v->type |= VF_CACHED;
	}
	return (v->string == NULL) ? "" : v->string;
}
1006
/*
 * Return the numeric value of 'v'.
 * If neither VF_NUMBER nor VF_CACHED is set, the string is converted with
 * my_strtod() (NULL/empty string gives 0), the result is stored in
 * v->number and VF_CACHED is set. VF_USER is dropped when the string is
 * empty, or when it was set and something other than what skip_spaces()
 * skips follows the parsed number.
 */
static double getvar_i(var *v)
{
	char *s;

	if (v->type & (VF_NUMBER | VF_CACHED)) {
		/* number is already known */
		debug_printf_eval("getvar_i: %f\n", v->number);
		return v->number;
	}

	v->number = 0;
	s = v->string;
	if (s == NULL || *s == '\0') {
		debug_printf_eval("getvar_i: '%s'->zero\n", s);
		v->type &= ~VF_USER;
	} else {
		debug_printf_eval("getvar_i: '%s'->", s);
		v->number = my_strtod(&s);
		debug_printf_eval("%f (s:'%s')\n", v->number, s);
		if (v->type & VF_USER) {
//TODO: skip_spaces() also skips backslash+newline, is it intended here?
			s = skip_spaces(s);
			if (*s != '\0')
				v->type &= ~VF_USER;
		}
	}
	v->type |= VF_CACHED;

	debug_printf_eval("getvar_i: %f\n", v->number);
	return v->number;
}
1033
1034 /* Used for operands of bitwise ops */
getvar_i_int(var * v)1035 static unsigned long getvar_i_int(var *v)
1036 {
1037 double d = getvar_i(v);
1038
1039 /* Casting doubles to longs is undefined for values outside
1040 * of target type range. Try to widen it as much as possible */
1041 if (d >= 0)
1042 return (unsigned long)d;
1043 /* Why? Think about d == -4294967295.0 (assuming 32bit longs) */
1044 return - (long) (unsigned long) (-d);
1045 }
1046
/*
 * Copy the value of 'src' into 'dest' and return 'dest'.
 * Type flags are copied except VF_DONTTOUCH and VF_FSTR ones; the string,
 * if any, is duplicated. handle_special(dest) is run even when
 * dest == src (in which case nothing is copied).
 */
static var *copyvar(var *dest, const var *src)
{
	if (dest == src) {
		handle_special(dest);
		return dest;
	}

	clrvar(dest);
	dest->type |= (src->type & ~(VF_DONTTOUCH | VF_FSTR));
	debug_printf_eval("copyvar: number:%f string:'%s'\n", src->number, src->string);
	dest->number = src->number;
	if (src->string)
		dest->string = xstrdup(src->string);

	handle_special(dest);
	return dest;
}
1060
/* Add one to the numeric value of 'v', store it back, return 'v' */
static var *incvar(var *v)
{
	double next = getvar_i(v) + 1.0;

	return setvar_i(v, next);
}
1065
1066 /* return true if v is number or numeric string */
is_numeric(var * v)1067 static int is_numeric(var *v)
1068 {
1069 getvar_i(v);
1070 return ((v->type ^ VF_DIRTY) & (VF_NUMBER | VF_USER | VF_DIRTY));
1071 }
1072
1073 /* return 1 when value of v corresponds to true, 0 otherwise */
istrue(var * v)1074 static int istrue(var *v)
1075 {
1076 if (is_numeric(v))
1077 return (v->number != 0);
1078 return (v->string && v->string[0]);
1079 }
1080
1081 /* ------- awk program text parsing ------- */
1082
/* Parse next token pointed by global pos, place results into global t_XYZ variables.
 * If token isn't expected, print error message and die.
 * Return token class (also store it in t_tclass).
 *
 * "expected" is a mask of token classes acceptable at this point. It is
 * used not only for the final check but also while scanning: '/' starts
 * a regexp only if TC_REGEXP is expected, and entries of tokenlist are
 * considered only if their class is in (expected | TS_WORD | TC_NEWLINE).
 *
 * State which survives between calls:
 * - t_rollback (set by rollback_token()): the call re-delivers the current
 *   token, t_tclass and friends are left as they are.
 * - concat_inserted/save_tclass/save_info: when an implicit concatenation
 *   operator is returned, the real token is stashed here and is delivered
 *   by the next call.
 * - g_saved_ch: the char which was overwritten with NUL in order to
 *   terminate a variable name; it is stored back before scanning goes on.
 *
 * Strings and regexps are unescaped in place: the result is written over
 * the program text, starting at t_string.
 */
static uint32_t next_token(uint32_t expected)
{
#define concat_inserted (G1.next_token__concat_inserted)
#define save_tclass (G1.next_token__save_tclass)
#define save_info (G1.next_token__save_info)

	char *p;
	const char *tl;
	const uint32_t *ti;
	uint32_t tc, last_token_class;

	last_token_class = t_tclass; /* t_tclass is initialized to TC_NEWLINE */

	debug_printf_parse("%s() expected(%x):", __func__, expected);
	debug_parse_print_tc(expected);
	debug_printf_parse("\n");

	if (t_rollback) {
		debug_printf_parse("%s: using rolled-back token\n", __func__);
		t_rollback = FALSE;
	} else if (concat_inserted) {
		debug_printf_parse("%s: using concat-inserted token\n", __func__);
		concat_inserted = FALSE;
		t_tclass = save_tclass;
		t_info = save_info;
	} else {
		p = g_pos;
		/* undo the NUL we planted after previous variable name */
		if (g_saved_ch != '\0') {
			*p = g_saved_ch;
			g_saved_ch = '\0';
		}
 readnext:
		p = skip_spaces(p);
		g_lineno = t_lineno;
		/* comment: skip up to (not including) end of line */
		if (*p == '#')
			while (*p != '\n' && *p != '\0')
				p++;

		if (*p == '\0') {
			tc = TC_EOF;
			debug_printf_parse("%s: token found: TC_EOF\n", __func__);
		} else if (*p == '\"') {
			/* it's a string */
			char *s = t_string = ++p;
			while (*p != '\"') {
				char *pp;
				if (*p == '\0' || *p == '\n')
					syntax_error(EMSG_UNEXP_EOS);
				pp = p;
				/* nextchar() advances pp and yields the char to store */
				*s++ = nextchar(&pp);
				p = pp;
			}
			p++;
			*s = '\0';
			tc = TC_STRING;
			debug_printf_parse("%s: token found:'%s' TC_STRING\n", __func__, t_string);
		} else if ((expected & TC_REGEXP) && *p == '/') {
			/* it's regexp */
			char *s = t_string = ++p;
			while (*p != '/') {
				if (*p == '\0' || *p == '\n')
					syntax_error(EMSG_UNEXP_EOS);
				*s = *p++;
				if (*s++ == '\\') {
					char *pp = p;
					/* replace the stored backslash with the decoded escape */
					s[-1] = bb_process_escape_sequence((const char **)&pp);
					/* backslash followed by backslash: keep both */
					if (*p == '\\')
						*s++ = '\\';
					/* pp did not move: nothing was consumed, copy next char as is */
					if (pp == p)
						*s++ = *p++;
					else
						p = pp;
				}
			}
			p++;
			*s = '\0';
			tc = TC_REGEXP;
			debug_printf_parse("%s: token found:'%s' TC_REGEXP\n", __func__, t_string);

		} else if (*p == '.' || isdigit(*p)) {
			/* it's a number */
			char *pp = p;
			t_double = my_strtod(&pp);
			p = pp;
			/* a '.' right after a complete number is an error */
			if (*p == '.')
				syntax_error(EMSG_UNEXP_TOKEN);
			tc = TC_NUMBER;
			debug_printf_parse("%s: token found:%f TC_NUMBER\n", __func__, t_double);
		} else {
			char *end_of_name;

			if (*p == '\n')
				t_lineno++;

			/* search for something known */
			/* tokenlist is walked as a sequence of length-prefixed strings;
			 * a NTCC byte in place of a length switches to the next token
			 * class bit. tokeninfo has one entry per string.
			 */
			tl = tokenlist;
			tc = 0x00000001;
			ti = tokeninfo;
			while (*tl) {
				int l = (unsigned char) *tl++;
				if (l == (unsigned char) NTCC) {
					tc <<= 1;
					continue;
				}
				/* if token class is expected,
				 * token matches,
				 * and it's not a longer word,
				 */
				if ((tc & (expected | TS_WORD | TC_NEWLINE))
				 && strncmp(p, tl, l) == 0
				 && !((tc & TS_WORD) && isalnum_(p[l]))
				) {
					/* then this is what we are looking for */
					t_info = *ti;
					debug_printf_parse("%s: token found:'%.*s' t_info:%x\n", __func__, l, p, t_info);
					p += l;
					goto token_found;
				}
				ti++;
				tl += l;
			}
			/* not a known token */

			/* is it a name? (var/array/function) */
			if (!isalnum_(*p))
				syntax_error(EMSG_UNEXP_TOKEN); /* no */
			/* yes */
			t_string = p;
			while (isalnum_(*p))
				p++;
			end_of_name = p;

			if (last_token_class == TC_FUNCDECL)
				/* eat space in "function FUNC (...) {...}" declaration */
				p = skip_spaces(p);
			else if (expected & TC_ARRAY) {
				/* eat space between array name and [ */
				char *s = skip_spaces(p);
				if (*s == '[') /* array ref, not just a name? */
					p = s;
			}
			/* else: do NOT consume whitespace after variable name!
			 * gawk allows definition "function FUNC (p) {...}" - note space,
			 * but disallows the call "FUNC (p)" because it isn't one -
			 * expression "v (a)" should NOT be parsed as TC_FUNCTION:
			 * it is a valid concatenation if "v" is a variable,
			 * not a function name (and type of name is not known at parse time).
			 */

			if (*p == '(') {
				p++;
				tc = TC_FUNCTION;
				debug_printf_parse("%s: token found:'%s' TC_FUNCTION\n", __func__, t_string);
			} else if (*p == '[') {
				p++;
				tc = TC_ARRAY;
				debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string);
			} else {
				tc = TC_VARIABLE;
				debug_printf_parse("%s: token found:'%s' TC_VARIABLE\n", __func__, t_string);
				if (end_of_name == p) {
					/* there is no space for trailing NUL in t_string!
					 * We need to save the char we are going to NUL.
					 * (we'll use it in future call to next_token())
					 */
					g_saved_ch = *end_of_name;
// especially pathological example is V="abc"; V.2 - it's V concatenated to .2
// (it evaluates to "abc0.2"). Because of this case, we can't simply cache
// '.' and analyze it later: we also have to *store it back* in next
// next_token(), in order to give my_strtod() the undamaged ".2" string.
				}
			}
			*end_of_name = '\0'; /* terminate t_string */
		}
 token_found:
		g_pos = p;

		/* skipping newlines in some cases */
		if ((last_token_class & TS_NOTERM) && (tc & TC_NEWLINE))
			goto readnext;

		/* insert concatenation operator when needed */
		debug_printf_parse("%s: concat_inserted if all nonzero: %x %x %x %x\n", __func__,
			(last_token_class & TS_CONCAT_L), (tc & TS_CONCAT_R), (expected & TS_BINOP),
			!(last_token_class == TC_LENGTH && tc == TC_LPAREN));
		if ((last_token_class & TS_CONCAT_L) && (tc & TS_CONCAT_R) && (expected & TS_BINOP)
		 && !(last_token_class == TC_LENGTH && tc == TC_LPAREN) /* but not for "length(..." */
		) {
			/* return a synthetic concatenation operator now,
			 * the token we just scanned goes out on the next call */
			concat_inserted = TRUE;
			save_tclass = tc;
			save_info = t_info;
			tc = TC_BINOPX;
			t_info = OC_CONCAT | SS | P(35);
		}

		t_tclass = tc;
		debug_printf_parse("%s: t_tclass=tc=%x\n", __func__, tc);
	}
	/* Are we ready for this? */
	if (!(t_tclass & expected)) {
		syntax_error((last_token_class & (TC_NEWLINE | TC_EOF)) ?
				EMSG_UNEXP_EOS : EMSG_UNEXP_TOKEN);
	}

	debug_printf_parse("%s: returning, t_double:%f t_tclass:", __func__, t_double);
	debug_parse_print_tc(t_tclass);
	debug_printf_parse("\n");

	return t_tclass;
#undef concat_inserted
#undef save_tclass
#undef save_info
}
1300
/* Push the current token back: this only sets t_rollback, which makes
 * the next next_token() call deliver the same token again instead of
 * scanning a new one.
 */
static ALWAYS_INLINE void rollback_token(void)
{
	t_rollback = TRUE;
}
1305
new_node(uint32_t info)1306 static node *new_node(uint32_t info)
1307 {
1308 node *n;
1309
1310 n = xzalloc(sizeof(node));
1311 n->info = info;
1312 n->lineno = g_lineno;
1313 return n;
1314 }
1315
/*
 * Turn node 'n' into a TI_REGEXP node for pattern 's'.
 * 're' must point to storage for two regex_t's: re[0] is compiled
 * case-sensitively (n->l.re), re[1] with REG_ICASE (n->r.ire).
 */
static void mk_re_node(const char *s, node *n, regex_t *re)
{
	regex_t *icase_re = re + 1;

	n->info = TI_REGEXP;
	n->l.re = re;
	n->r.ire = icase_re;
	xregcomp(re, s, REG_EXTENDED);
	xregcomp(icase_re, s, REG_EXTENDED | REG_ICASE);
}
1324
/* Forward declaration: parse_lrparen_list() below calls it */
static node *parse_expr(uint32_t);
1326
parse_lrparen_list(void)1327 static node *parse_lrparen_list(void)
1328 {
1329 next_token(TC_LPAREN);
1330 return parse_expr(TC_RPAREN);
1331 }
1332
/* parse expression terminated by given argument, return ptr
 * to built subtree. Terminator is eaten by parse_expr
 *
 * term_tc is a mask of token classes which end the expression.
 * NULL is returned if the terminator is the very first token.
 *
 * Local state:
 * - sn: dummy root node living on the stack; whatever ends up in sn.r.n
 *   is the result.
 * - cn: the node added last. While parsing, a.n of every node points
 *   to the node it was attached to (this is what the climbing loop
 *   for binary/postfix operators follows).
 * - glptr: non-NULL right after a getline operand was seen, so that
 *   a following '<' is taken as input redirection, not as comparison.
 * - expected_tc: classes of tokens acceptable next; recomputed after
 *   every token.
 */
static node *parse_expr(uint32_t term_tc)
{
	node sn;
	node *cn = &sn;
	node *vn, *glptr;
	uint32_t tc, expected_tc;
	var *v;

	debug_printf_parse("%s() term_tc(%x):", __func__, term_tc);
	debug_parse_print_tc(term_tc);
	debug_printf_parse("\n");

	sn.info = PRIMASK;
	sn.r.n = sn.a.n = glptr = NULL;
	expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP | term_tc;

	while (!((tc = next_token(expected_tc)) & term_tc)) {

		if (glptr && (t_info == TI_LESS)) {
			/* input redirection (<) attached to glptr node */
			debug_printf_parse("%s: input redir\n", __func__);
			cn = glptr->l.n = new_node(OC_CONCAT | SS | P(37));
			cn->a.n = glptr;
			expected_tc = TS_OPERAND | TS_UOPPRE;
			glptr = NULL;
			continue;
		}
		if (tc & (TS_BINOP | TC_UOPPOST)) {
			debug_printf_parse("%s: TS_BINOP | TC_UOPPOST tc:%x\n", __func__, tc);
			/* for binary and postfix-unary operators, jump back over
			 * previous operators with higher priority */
			vn = cn;
			while (((t_info & PRIMASK) > (vn->a.n->info & PRIMASK2))
			    || ((t_info == vn->info) && t_info == TI_COLON)
			) {
				vn = vn->a.n;
				/* climbed up to the dummy root (its a.n is NULL) */
				if (!vn->a.n) syntax_error(EMSG_UNEXP_TOKEN);
			}
			if (t_info == TI_TERNARY)
//TODO: why?
				t_info += P(6);
			/* new operator node takes the place of vn under vn's parent... */
			cn = vn->a.n->r.n = new_node(t_info);
			cn->a.n = vn->a.n;
			if (tc & TS_BINOP) {
				/* ...and vn becomes its left operand */
				cn->l.n = vn;
//FIXME: this is the place to detect and reject assignments to non-lvalues.
//Currently we allow "assignments" to consts and temporaries, nonsense like this:
// awk 'BEGIN { "qwe" = 1 }'
// awk 'BEGIN { 7 *= 7 }'
// awk 'BEGIN { length("qwe") = 1 }'
// awk 'BEGIN { (1+1) += 3 }'
				expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP;
				if (t_info == TI_PGETLINE) {
					/* it's a pipe */
					next_token(TC_GETLINE);
					/* give maximum priority to this pipe */
					cn->info &= ~PRIMASK;
					expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc;
				}
			} else {
				/* postfix operator: vn is its (only) operand */
				cn->r.n = vn;
				expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc;
			}
			vn->a.n = cn;
			continue;
		}

		debug_printf_parse("%s: other, t_info:%x\n", __func__, t_info);
		/* for operands and prefix-unary operators, attach them
		 * to last node */
		vn = cn;
		cn = vn->r.n = new_node(t_info);
		cn->a.n = vn;

		expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP;
		if (t_info == TI_PREINC || t_info == TI_PREDEC)
			expected_tc = TS_LVALUE | TC_UOPPRE1;

		/* prefix operator: nothing more to do, go get its operand */
		if (!(tc & (TS_OPERAND | TC_REGEXP)))
			continue;

		debug_printf_parse("%s: TS_OPERAND | TC_REGEXP\n", __func__);
		expected_tc = TS_UOPPRE | TC_UOPPOST | TS_BINOP | TS_OPERAND | term_tc;
		/* one should be very careful with switch on tclass -
		 * only simple tclasses should be used (TC_xyz, not TS_xyz) */
		switch (tc) {
		case TC_VARIABLE:
		case TC_ARRAY:
			debug_printf_parse("%s: TC_VARIABLE | TC_ARRAY\n", __func__);
			cn->info = OC_VAR;
			/* a name found in ahash is referenced by its index (OC_FNARG),
			 * any other name goes through newvar() */
			v = hash_search(ahash, t_string);
			if (v != NULL) {
				cn->info = OC_FNARG;
				cn->l.aidx = v->x.aidx;
			} else {
				cn->l.v = newvar(t_string);
			}
			if (tc & TC_ARRAY) {
				cn->info |= xS;
				/* subscript expression */
				cn->r.n = parse_expr(TC_ARRTERM);
			}
			break;

		case TC_NUMBER:
		case TC_STRING:
			debug_printf_parse("%s: TC_NUMBER | TC_STRING\n", __func__);
			/* a constant is stored as an anonymous variable */
			cn->info = OC_VAR;
			v = cn->l.v = xzalloc(sizeof(var));
			if (tc & TC_NUMBER)
				setvar_i(v, t_double);
			else {
				setvar_s(v, t_string);
				expected_tc &= ~TC_UOPPOST; /* "str"++ is not allowed */
			}
			break;

		case TC_REGEXP:
			debug_printf_parse("%s: TC_REGEXP\n", __func__);
			/* two regex_t's: mk_re_node() compiles a case-insensitive twin too */
			mk_re_node(t_string, cn, xzalloc(sizeof(regex_t)*2));
			break;

		case TC_FUNCTION:
			debug_printf_parse("%s: TC_FUNCTION\n", __func__);
			cn->info = OC_FUNC;
			cn->r.f = newfunc(t_string);
			/* next_token() already ate the '(' */
			cn->l.n = parse_expr(TC_RPAREN);
			break;

		case TC_LPAREN:
			debug_printf_parse("%s: TC_LPAREN\n", __func__);
			/* the parenthesized subtree replaces the node created above */
			cn = vn->r.n = parse_expr(TC_RPAREN);
			if (!cn)
				syntax_error("Empty sequence");
			cn->a.n = vn;
			break;

		case TC_GETLINE:
			debug_printf_parse("%s: TC_GETLINE\n", __func__);
			glptr = cn;
			expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc;
			break;

		case TC_BUILTIN:
			debug_printf_parse("%s: TC_BUILTIN\n", __func__);
			cn->l.n = parse_lrparen_list();
			break;

		case TC_LENGTH:
			debug_printf_parse("%s: TC_LENGTH\n", __func__);
			tc = next_token(TC_LPAREN /* length(...) */
				| TC_SEMICOL   /* length; */
				| TC_NEWLINE   /* length<newline> */
				| TC_RBRACE    /* length } */
				| TC_BINOPX    /* length <op> NUM */
				| TC_COMMA     /* print length, 1 */
			);
			if (tc != TC_LPAREN)
				rollback_token();
			else {
				/* It was a "(" token. Handle just like TC_BUILTIN */
				cn->l.n = parse_expr(TC_RPAREN);
			}
			break;
		}
	} /* while() */

	debug_printf_parse("%s() returns %p\n", __func__, sn.r.n);
	return sn.r.n;
}
1504
1505 /* add node to chain. Return ptr to alloc'd node */
chain_node(uint32_t info)1506 static node *chain_node(uint32_t info)
1507 {
1508 node *n;
1509
1510 if (!seq->first)
1511 seq->first = seq->last = new_node(0);
1512
1513 if (seq->programname != g_progname) {
1514 seq->programname = g_progname;
1515 n = chain_node(OC_NEWSOURCE);
1516 n->l.new_progname = g_progname;
1517 }
1518
1519 n = seq->last;
1520 n->info = info;
1521 seq->last = n->a.n = new_node(OC_DONE);
1522
1523 return n;
1524 }
1525
chain_expr(uint32_t info)1526 static void chain_expr(uint32_t info)
1527 {
1528 node *n;
1529
1530 n = chain_node(info);
1531
1532 n->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_RBRACE);
1533 if ((info & OF_REQUIRED) && !n->l.n)
1534 syntax_error(EMSG_TOO_FEW_ARGS);
1535
1536 if (t_tclass & TC_RBRACE)
1537 rollback_token();
1538 }
1539
/* Forward declaration: chain_loop() and chain_until_rbrace() below call it */
static void chain_group(void);
1541
/*
 * Chain a loop: an (OC_BR | Vx) head node, the loop body (one group),
 * and an (OC_EXEC | Vx) tail node which carries 'nn' in l.n and links
 * back to the head. The head's r.n is set to the node following the loop.
 * While the body is parsed, break_ptr and continue_ptr are fresh OC_EXEC
 * nodes which lead past the loop and to the tail node respectively;
 * the outer values are restored before returning.
 * Returns the head node (the caller fills in its l.n).
 */
static node *chain_loop(node *nn)
{
	node *outer_brk = break_ptr;
	node *outer_cont = continue_ptr;
	node *head, *tail;

	head = chain_node(OC_BR | Vx);
	continue_ptr = new_node(OC_EXEC);
	break_ptr = new_node(OC_EXEC);
	chain_group();

	tail = chain_node(OC_EXEC | Vx);
	tail->l.n = nn;
	tail->a.n = head;
	continue_ptr->a.n = tail;
	head->r.n = seq->last;
	break_ptr->a.n = seq->last;

	break_ptr = outer_brk;
	continue_ptr = outer_cont;

	return head;
}
1564
chain_until_rbrace(void)1565 static void chain_until_rbrace(void)
1566 {
1567 uint32_t tc;
1568 while ((tc = next_token(TS_GRPSEQ | TC_RBRACE)) != TC_RBRACE) {
1569 debug_printf_parse("%s: !TC_RBRACE\n", __func__);
1570 if (tc == TC_NEWLINE)
1571 continue;
1572 rollback_token();
1573 chain_group();
1574 }
1575 debug_printf_parse("%s: TC_RBRACE\n", __func__);
1576 }
1577
/* parse group and attach it to chain
 *
 * A group is one of:
 * - "{ ... }": everything up to the matching brace is chained;
 * - an expression statement (or a lone semicolon): chained as OC_EXEC | Vx;
 * - a statement (if/while/do/for/print/printf/break/continue/...),
 *   dispatched on (t_info & OPCLSMASK).
 * Leading newlines are skipped. Nodes are appended to the current 'seq'.
 */
static void chain_group(void)
{
	uint32_t tc;
	node *n, *n2, *n3;

	do {
		tc = next_token(TS_GRPSEQ);
	} while (tc == TC_NEWLINE);

	if (tc == TC_LBRACE) {
		debug_printf_parse("%s: TC_LBRACE\n", __func__);
		chain_until_rbrace();
		return;
	}
	if (tc & (TS_OPSEQ | TC_SEMICOL)) {
		debug_printf_parse("%s: TS_OPSEQ | TC_SEMICOL\n", __func__);
		/* the token starts an expression: let parse_expr() see it */
		rollback_token();
		chain_expr(OC_EXEC | Vx);
		return;
	}

	/* TS_STATEMNT */
	debug_printf_parse("%s: TS_STATEMNT(?)\n", __func__);
	switch (t_info & OPCLSMASK) {
	case ST_IF:
		debug_printf_parse("%s: ST_IF\n", __func__);
		/* n: branch node with condition in l.n; then the "then" group;
		 * n2: node chained right after the "then" group */
		n = chain_node(OC_BR | Vx);
		n->l.n = parse_lrparen_list();
		chain_group();
		n2 = chain_node(OC_EXEC);
		/* r.n of the branch: first node after the "then" part */
		n->r.n = seq->last;
		if (next_token(TS_GRPSEQ | TC_RBRACE | TC_ELSE) == TC_ELSE) {
			chain_group();
			/* n2 (end of "then" part) continues after the "else" part */
			n2->a.n = seq->last;
		} else {
			rollback_token();
		}
		break;

	case ST_WHILE:
		debug_printf_parse("%s: ST_WHILE\n", __func__);
		/* condition is parsed before the loop nodes are chained */
		n2 = parse_lrparen_list();
		n = chain_loop(NULL);
		n->l.n = n2;
		break;

	case ST_DO:
		debug_printf_parse("%s: ST_DO\n", __func__);
		/* n2 is chained in front of the loop head and is made to continue
		 * at the node following the head (presumably so that the first
		 * pass enters the body without testing the condition) */
		n2 = chain_node(OC_EXEC);
		n = chain_loop(NULL);
		n2->a.n = n->a.n;
		next_token(TC_WHILE);
		n->l.n = parse_lrparen_list();
		break;

	case ST_FOR:
		debug_printf_parse("%s: ST_FOR\n", __func__);
		next_token(TC_LPAREN);
		n2 = parse_expr(TC_SEMICOL | TC_RPAREN);
		if (t_tclass & TC_RPAREN) {	/* for (I in ARRAY) */
			if (!n2 || n2->info != TI_IN)
				syntax_error(EMSG_UNEXP_TOKEN);
			/* both sides of "I in ARRAY" go to the OC_WALKINIT node */
			n = chain_node(OC_WALKINIT | VV);
			n->l.n = n2->l.n;
			n->r.n = n2->r.n;
			/* loop head is turned into OC_WALKNEXT with I as its l.n */
			n = chain_loop(NULL);
			n->info = OC_WALKNEXT | Vx;
			n->l.n = n2->l.n;
		} else {			/* for (;;) */
			/* init expression */
			n = chain_node(OC_EXEC | Vx);
			n->l.n = n2;
			/* condition and step expressions */
			n2 = parse_expr(TC_SEMICOL);
			n3 = parse_expr(TC_RPAREN);
			n = chain_loop(n3);
			n->l.n = n2;
			/* no condition: the loop head does not branch */
			if (!n2)
				n->info = OC_EXEC;
		}
		break;

	case OC_PRINT:
	case OC_PRINTF:
		debug_printf_parse("%s: OC_PRINT[F]\n", __func__);
		n = chain_node(t_info);
		n->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_OUTRDR | TC_RBRACE);
		if (t_tclass & TC_OUTRDR) {
			/* output redirection: its t_info bits are merged into the node,
			 * the destination expression goes to r.n */
			n->info |= t_info;
			n->r.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_RBRACE);
		}
		if (t_tclass & TC_RBRACE)
			rollback_token();
		break;

	case OC_BREAK:
		debug_printf_parse("%s: OC_BREAK\n", __func__);
		n = chain_node(OC_EXEC);
		if (!break_ptr)
			syntax_error("'break' not in a loop");
		n->a.n = break_ptr;
		chain_expr(t_info);
		break;

	case OC_CONTINUE:
		debug_printf_parse("%s: OC_CONTINUE\n", __func__);
		n = chain_node(OC_EXEC);
		if (!continue_ptr)
			syntax_error("'continue' not in a loop");
		n->a.n = continue_ptr;
		chain_expr(t_info);
		break;

	/* delete, next, nextfile, return, exit */
	default:
		debug_printf_parse("%s: default\n", __func__);
		chain_expr(t_info);
	}
}
1696
/* Parse awk program text 'p' (it is modified in place by next_token()).
 *
 * Top-level items are handled one by one:
 * - BEGIN {...}          - chained into beginseq
 * - END {...}            - chained into endseq
 * - function F(args) {...} - chained into the body of F
 * - PATTERN [{...}]      - OC_TEST node + action chained into mainseq
 *                          (missing action means "print")
 * - {...}                - chained into mainseq
 * Newlines between items are skipped; after an item, one semicolon
 * is additionally tolerated. Parsing stops at TC_EOF.
 */
static void parse_program(char *p)
{
	debug_printf_parse("%s()\n", __func__);

	g_pos = p;
	t_lineno = 1;
	for (;;) {
		uint32_t tclass;

		tclass = next_token(TS_OPSEQ | TC_LBRACE | TC_BEGIN | TC_END | TC_FUNCDECL
			| TC_EOF | TC_NEWLINE /* but not TC_SEMICOL */);
 got_tok:
		if (tclass == TC_EOF) {
			debug_printf_parse("%s: TC_EOF\n", __func__);
			break;
		}
		if (tclass == TC_NEWLINE) {
			debug_printf_parse("%s: TC_NEWLINE\n", __func__);
			continue;
		}
		if (tclass == TC_BEGIN) {
			debug_printf_parse("%s: TC_BEGIN\n", __func__);
			seq = &beginseq;
			/* ensure there is no newline between BEGIN and { */
			next_token(TC_LBRACE);
			chain_until_rbrace();
			goto next_tok;
		}
		if (tclass == TC_END) {
			debug_printf_parse("%s: TC_END\n", __func__);
			seq = &endseq;
			/* ensure there is no newline between END and { */
			next_token(TC_LBRACE);
			chain_until_rbrace();
			goto next_tok;
		}
		if (tclass == TC_FUNCDECL) {
			func *f;

			debug_printf_parse("%s: TC_FUNCDECL\n", __func__);
			/* function name; next_token() eats the '(' too */
			next_token(TC_FUNCTION);
			f = newfunc(t_string);
			if (f->defined)
				syntax_error("Duplicate function");
			f->defined = 1;
			//f->body.first = NULL; - already is
			//f->nargs = 0; - already is
			/* func arg list: comma sep list of args, and a close paren */
			for (;;) {
				var *v;
				if (next_token(TC_VARIABLE | TC_RPAREN) == TC_RPAREN) {
					if (f->nargs == 0)
						break; /* func() is ok */
					/* func(a,) is not ok */
					syntax_error(EMSG_UNEXP_TOKEN);
				}
				/* remember the arg's position under its name in ahash
				 * (parse_expr() looks names up there) */
				v = findvar(ahash, t_string);
				v->x.aidx = f->nargs++;
				/* Arg followed either by end of arg list or 1 comma */
				if (next_token(TC_COMMA | TC_RPAREN) == TC_RPAREN)
					break;
				/* it was a comma, we ate it */
			}
			seq = &f->body;
			/* ensure there is { after "func F(...)" - but newlines are allowed */
			while (next_token(TC_LBRACE | TC_NEWLINE) == TC_NEWLINE)
				continue;
			chain_until_rbrace();
			/* arg names are valid only inside the function body */
			hash_clear(ahash);
			goto next_tok;
		}
		seq = &mainseq;
		if (tclass & TS_OPSEQ) {
			node *cn;

			debug_printf_parse("%s: TS_OPSEQ\n", __func__);
			/* the token starts a pattern expression: reparse it there */
			rollback_token();
			cn = chain_node(OC_TEST);
			cn->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_EOF | TC_LBRACE);
			if (t_tclass == TC_LBRACE) {
				debug_printf_parse("%s: TC_LBRACE\n", __func__);
				chain_until_rbrace();
			} else {
				/* no action, assume default "{ print }" */
				debug_printf_parse("%s: !TC_LBRACE\n", __func__);
				chain_node(OC_PRINT);
			}
			/* r.n of the test node: first node after the action */
			cn->r.n = mainseq.last;
			goto next_tok;
		}
		/* tclass == TC_LBRACE */
		debug_printf_parse("%s: TC_LBRACE(?)\n", __func__);
		chain_until_rbrace();
 next_tok:
		/* Same as next_token() at the top of the loop, + TC_SEMICOL */
		tclass = next_token(TS_OPSEQ | TC_LBRACE | TC_BEGIN | TC_END | TC_FUNCDECL
			| TC_EOF | TC_NEWLINE | TC_SEMICOL);
		/* gawk allows many newlines, but does not allow more than one semicolon:
		 *  BEGIN {...}<newline>;<newline>;
		 * would complain "each rule must have a pattern or an action part".
		 * Same message for
		 *  ; BEGIN {...}
		 */
		if (tclass != TC_SEMICOL)
			goto got_tok; /* use this token */
		/* else: loop back - ate the semicolon, get and use _next_ token */
	} /* for (;;) */
}
1805
1806 /* -------- program execution part -------- */
1807
1808 /* temporary variables allocator */
nvalloc(int sz)1809 static var *nvalloc(int sz)
1810 {
1811 return xzalloc(sz * sizeof(var));
1812 }
1813
/*
 * Release a block of 'sz' variables starting at 'v'.
 * For each variable: an array which is owned by it (VF_ARRAY without
 * VF_CHILD) is emptied and its table freed; a chain of walkers
 * (VF_WALK) is freed; finally the value itself is cleared.
 * Then the block is freed.
 */
static void nvfree(var *v, int sz)
{
	int i;

	for (i = 0; i < sz; i++) {
		var *p = &v[i];

		if ((p->type & (VF_ARRAY | VF_CHILD)) == VF_ARRAY) {
			clear_array(iamarray(p));
			free(p->x.array->items);
			free(p->x.array);
		}
		if (p->type & VF_WALK) {
			walker_list *w = p->x.walker;

			debug_printf_walker("nvfree: freeing walker @%p\n", &p->x.walker);
			p->x.walker = NULL;
			while (w) {
				walker_list *older = w->prev;

				debug_printf_walker(" free(%p)\n", w);
				free(w);
				w = older;
			}
		}
		clrvar(p);
	}

	free(v);
}
1842
mk_splitter(const char * s,tsplitter * spl)1843 static node *mk_splitter(const char *s, tsplitter *spl)
1844 {
1845 regex_t *re, *ire;
1846 node *n;
1847
1848 re = &spl->re[0];
1849 ire = &spl->re[1];
1850 n = &spl->n;
1851 if (n->info == TI_REGEXP) {
1852 regfree(re);
1853 regfree(ire); // TODO: nuke ire, use re+1?
1854 }
1855 if (s[0] && s[1]) { /* strlen(s) > 1 */
1856 mk_re_node(s, n, re);
1857 } else {
1858 n->info = (uint32_t) s[0];
1859 }
1860
1861 return n;
1862 }
1863
/* Forward declaration: as_regex() below calls it */
static var *evaluate(node *, var *);
1865
1866 /* Use node as a regular expression. Supplied with node ptr and regex_t
1867 * storage space. Return ptr to regex (if result points to preg, it should
1868 * be later regfree'd manually).
1869 */
as_regex(node * op,regex_t * preg)1870 static regex_t *as_regex(node *op, regex_t *preg)
1871 {
1872 int cflags;
1873 const char *s;
1874
1875 if (op->info == TI_REGEXP) {
1876 return icase ? op->r.ire : op->l.re;
1877 }
1878
1879 //tmpvar = nvalloc(1);
1880 #define TMPVAR (&G.as_regex__tmpvar)
1881 // We use a single "static" tmpvar (instead of on-stack or malloced one)
1882 // to decrease memory consumption in deeply-recursive awk programs.
1883 // The rule to work safely is to never call evaluate() while our static
1884 // TMPVAR's value is still needed.
1885 s = getvar_s(evaluate(op, TMPVAR));
1886
1887 cflags = icase ? REG_EXTENDED | REG_ICASE : REG_EXTENDED;
1888 /* Testcase where REG_EXTENDED fails (unpaired '{'):
1889 * echo Hi | awk 'gsub("@(samp|code|file)\{","");'
1890 * gawk 3.1.5 eats this. We revert to ~REG_EXTENDED
1891 * (maybe gsub is not supposed to use REG_EXTENDED?).
1892 */
1893 if (regcomp(preg, s, cflags)) {
1894 cflags &= ~REG_EXTENDED;
1895 xregcomp(preg, s, cflags);
1896 }
1897 //nvfree(tmpvar, 1);
1898 #undef TMPVAR
1899 return preg;
1900 }
1901
/* Gradually increasing buffer.
 * Make sure buffer 'b' (whose allocated size is *size) has room for
 * more than 'n' bytes. Note that we reallocate even if n == old size,
 * and thus there is at least one extra allocated byte.
 * When b is NULL, *size is not read, only written.
 * Returns the (possibly moved) buffer.
 */
static char* qrealloc(char *b, int n, int *size)
{
	if (b != NULL && n < *size)
		return b; /* still fits */

	*size = n + (n>>1) + 80;
	return xrealloc(b, *size);
}
1914
1915 /* resize field storage space */
fsrealloc(int size)1916 static void fsrealloc(int size)
1917 {
1918 int i, newsize;
1919
1920 if (size >= maxfields) {
1921 /* Sanity cap, easier than catering for overflows */
1922 if (size > 0xffffff)
1923 bb_die_memory_exhausted();
1924
1925 i = maxfields;
1926 maxfields = size + 16;
1927
1928 newsize = maxfields * sizeof(Fields[0]);
1929 debug_printf_eval("fsrealloc: xrealloc(%p, %u)\n", Fields, newsize);
1930 Fields = xrealloc(Fields, newsize);
1931 debug_printf_eval("fsrealloc: Fields=%p..%p\n", Fields, (char*)Fields + newsize - 1);
1932 /* ^^^ did Fields[] move? debug aid for L.v getting "upstaged" by R.v in evaluate() */
1933
1934 for (; i < maxfields; i++) {
1935 Fields[i].type = VF_SPECIAL;
1936 Fields[i].string = NULL;
1937 }
1938 }
1939 /* if size < nfields, clear extra field variables */
1940 for (i = size; i < nfields; i++) {
1941 clrvar(Fields + i);
1942 }
1943 nfields = size;
1944 }
1945
/*
 * regexec() wrapper which searches for the first match that does not
 * end at offset 0 of the string it was searched in.
 *
 * preg:   compiled regex
 * s:      NUL-terminated subject string
 * pmatch: storage for one regmatch_t; on success it holds the match
 *         offsets relative to 's'
 *
 * Returns 0 if a match was found, REG_NOMATCH if the only matches are
 * empty ones, otherwise whatever the initial regexec() returned.
 */
static int regexec1_nonempty(const regex_t *preg, const char *s, regmatch_t pmatch[])
{
	int r = regexec(preg, s, 1, pmatch, 0);
	if (r == 0 && pmatch[0].rm_eo == 0) {
		/* For example, happens when FS can match
		 * an empty string (awk -F ' *'). Logically,
		 * this should split into one-char fields.
		 * However, gawk 5.0.1 searches for first
		 * _non-empty_ separator string match:
		 */
		size_t ofs = 0;

		/* Empty subject: nothing to retry on, and s[1] below
		 * would be beyond the terminating NUL */
		if (s[0] == '\0')
			return REG_NOMATCH;

		for (;;) {
			ofs++;
			if (!s[ofs])
				return REG_NOMATCH;
			/* pmatch[] is meaningful only if this regexec() matched:
			 * on failure just try the next offset */
			if (regexec(preg, s + ofs, 1, pmatch, 0) != 0)
				continue;
			if (pmatch[0].rm_eo != 0)
				break;
		}
		/* offsets are relative to s + ofs: rebase them to s */
		pmatch[0].rm_so += ofs;
		pmatch[0].rm_eo += ofs;
	}
	return r;
}
1968
/*
 * Split string 's' into fields according to splitter node 'spl'.
 * A freshly allocated buffer holding the fields as consecutive
 * NUL-terminated strings is stored into *slist (the caller frees it).
 * Returns the number of fields.
 *
 * Kind of split is selected by spl->info:
 * - TI_REGEXP: regex split;
 * - 0: every char is a field;
 * - ' ': fields are separated by runs of whitespace;
 * - any other char: that char is the separator (in both cases
 *   when icase is set).
 * When RS is the empty string, a newline also ends a field in regex
 * and single-char modes.
 */
static int awk_split(const char *s, node *spl, char **slist)
{
	int count = 0;
	char seps[4];
	char *dst;

	/* in worst case, each char would be a separate field */
	dst = xzalloc(strlen(s) * 2 + 3);
	*slist = dst;
	strcpy(dst, s);

	/* seps[0..1]: separator char (twice, for two cases); seps[2]: "" or "\n" */
	seps[0] = seps[1] = (char)spl->info;
	seps[2] = seps[3] = '\0';
	if (*getvar_s(intvar[RS]) == '\0')
		seps[2] = '\n';

	if (spl->info == TI_REGEXP) { /* regex split */
		if (*s == '\0')
			return count; /* "": zero fields */
		count++; /* at least one field will be there */
		do {
			regmatch_t m[1];
			int len = strcspn(s, seps+2); /* len till next NUL or \n */

			if (regexec1_nonempty(icase ? spl->r.ire : spl->l.re, s, m) == 0
			 && m[0].rm_so <= len
			) {
				/* if (m[0].rm_eo == 0) ... - impossible */
				len = m[0].rm_so;
				count++; /* we saw yet another delimiter */
			} else {
				/* no separator before end of line: the rest is the
				 * field, step over the newline if there is one */
				m[0].rm_eo = len;
				if (s[len])
					m[0].rm_eo++;
			}
			dst = mempcpy(dst, s, len);
			*dst++ = '\0';
			s += m[0].rm_eo;
		} while (*s);

		/* echo a-- | awk -F-- '{ print NF, length($NF), $NF }'
		 * should print "2 0 ":
		 */
		*dst = '\0';

		return count;
	}

	if (seps[0] == '\0') { /* null split */
		for (; *s; s++) {
			*dst++ = *s;
			*dst++ = '\0';
			count++;
		}
		return count;
	}

	if (seps[0] != ' ') { /* single-character split */
		if (icase) {
			seps[0] = toupper(seps[0]);
			seps[1] = tolower(seps[1]);
		}
		if (*dst)
			count++;
		/* works on the copy of s made above: separators become NULs */
		for (dst = strpbrk(dst, seps); dst != NULL; dst = strpbrk(dst, seps)) {
			*dst++ = '\0';
			count++;
		}
		return count;
	}

	/* space split */
	while (*s) {
		s = skip_whitespace(s);
		if (*s == '\0')
			break;
		count++;
		while (*s && !isspace(*s))
			*dst++ = *s++;
		*dst++ = '\0';
	}
	return count;
}
2050
split_f0(void)2051 static void split_f0(void)
2052 {
2053 /* static char *fstrings; */
2054 #define fstrings (G.split_f0__fstrings)
2055
2056 int i, n;
2057 char *s;
2058
2059 if (is_f0_split)
2060 return;
2061
2062 is_f0_split = TRUE;
2063 free(fstrings);
2064 fsrealloc(0);
2065 n = awk_split(getvar_s(intvar[F0]), &fsplitter.n, &fstrings);
2066 fsrealloc(n);
2067 s = fstrings;
2068 for (i = 0; i < n; i++) {
2069 Fields[i].string = nextword(&s);
2070 Fields[i].type |= (VF_FSTR | VF_USER | VF_DIRTY);
2071 }
2072
2073 /* set NF manually to avoid side effects */
2074 clrvar(intvar[NF]);
2075 intvar[NF]->type = VF_NUMBER | VF_SPECIAL;
2076 intvar[NF]->number = nfields;
2077 #undef fstrings
2078 }
2079
2080 /* perform additional actions when some internal variables changed */
handle_special(var * v)2081 static void handle_special(var *v)
2082 {
2083 int n;
2084 char *b;
2085 const char *sep, *s;
2086 int sl, l, len, i, bsize;
2087
2088 if (!(v->type & VF_SPECIAL))
2089 return;
2090
2091 if (v == intvar[NF]) {
2092 n = (int)getvar_i(v);
2093 if (n < 0)
2094 syntax_error("NF set to negative value");
2095 fsrealloc(n);
2096
2097 /* recalculate $0 */
2098 sep = getvar_s(intvar[OFS]);
2099 sl = strlen(sep);
2100 b = NULL;
2101 len = 0;
2102 for (i = 0; i < n; i++) {
2103 s = getvar_s(&Fields[i]);
2104 l = strlen(s);
2105 if (b) {
2106 memcpy(b+len, sep, sl);
2107 len += sl;
2108 }
2109 b = qrealloc(b, len+l+sl, &bsize);
2110 memcpy(b+len, s, l);
2111 len += l;
2112 }
2113 if (b)
2114 b[len] = '\0';
2115 setvar_p(intvar[F0], b);
2116 is_f0_split = TRUE;
2117
2118 } else if (v == intvar[F0]) {
2119 is_f0_split = FALSE;
2120
2121 } else if (v == intvar[FS]) {
2122 /*
2123 * The POSIX-2008 standard says that changing FS should have no effect on the
2124 * current input line, but only on the next one. The language is:
2125 *
2126 * > Before the first reference to a field in the record is evaluated, the record
2127 * > shall be split into fields, according to the rules in Regular Expressions,
2128 * > using the value of FS that was current at the time the record was read.
2129 *
2130 * So, split up current line before assignment to FS:
2131 */
2132 split_f0();
2133
2134 mk_splitter(getvar_s(v), &fsplitter);
2135 } else if (v == intvar[RS]) {
2136 mk_splitter(getvar_s(v), &rsplitter);
2137 } else if (v == intvar[IGNORECASE]) {
2138 icase = istrue(v);
2139 } else { /* $n */
2140 n = getvar_i(intvar[NF]);
2141 setvar_i(intvar[NF], n > v-Fields ? n : v-Fields+1);
2142 /* right here v is invalid. Just to note... */
2143 }
2144 }
2145
2146 /* step through func/builtin/etc arguments */
nextarg(node ** pn)2147 static node *nextarg(node **pn)
2148 {
2149 node *n;
2150
2151 n = *pn;
2152 if (n && n->info == TI_COMMA) {
2153 *pn = n->r.n;
2154 n = n->l.n;
2155 } else {
2156 *pn = NULL;
2157 }
2158 return n;
2159 }
2160
/* Start a walk over the keys of "array" using "v" as the loop variable.
 * A new walker holding a copy of every key name (NUL-separated) is
 * allocated and linked in front of any walker already attached to v.
 */
static void hashwalk_init(var *v, xhash *array)
{
	walker_list *outer = NULL;
	walker_list *wl;
	unsigned bucket;

	if (v->type & VF_WALK)
		outer = v->x.walker;
	else
		v->type |= VF_WALK;
	debug_printf_walker("hashwalk_init: prev_walker:%p\n", outer);

	wl = xzalloc(sizeof(*wl) + array->glen + 1); /* why + 1? */
	v->x.walker = wl;
	debug_printf_walker(" walker@%p=%p\n", &v->x.walker, wl);
	wl->prev = outer;
	wl->end = wl->wbuf;
	wl->cur = wl->wbuf;

	/* append each key, terminating NUL included, to the walker buffer */
	for (bucket = 0; bucket < array->csize; bucket++) {
		hash_item *item;

		for (item = array->items[bucket]; item; item = item->next)
			wl->end = stpcpy(wl->end, item->name) + 1;
	}
}
2188
/* Advance the walk attached to "v".
 * Returns TRUE after storing the next key name in v. Returns FALSE when
 * the walker is exhausted; the walker is then freed and the previous
 * one (if any) is re-attached to v.
 */
static int hashwalk_next(var *v)
{
	walker_list *wl = v->x.walker;

	if (wl->cur < wl->end) {
		setvar_s(v, nextword(&wl->cur));
		return TRUE;
	}

	{
		walker_list *outer = wl->prev;

		debug_printf_walker("end of iteration, free(walker@%p:%p), prev_walker:%p\n", &v->x.walker, wl, outer);
		free(wl);
		v->x.walker = outer;
	}
	return FALSE;
}
2205
2206 /* evaluate node, return 1 when result is true, 0 otherwise */
ptest(node * pattern)2207 static int ptest(node *pattern)
2208 {
2209 // We use a single "static" tmpvar (instead of on-stack or malloced one)
2210 // to decrease memory consumption in deeply-recursive awk programs.
2211 // The rule to work safely is to never call evaluate() while our static
2212 // TMPVAR's value is still needed.
2213 return istrue(evaluate(pattern, &G.ptest__tmpvar));
2214 }
2215
/* Read next record from stream rsm into a variable v.
 *
 * Returns 1 when a record was stored, 0 when no more data is available
 * and -1 after a failed read (ERRNO is set in that case).
 * On success RT is set to the separator text that ended the record.
 * The read buffer and its offsets live in *rsm between calls.
 */
static int awk_getline(rstream *rsm, var *v)
{
	char *b;               /* start of unconsumed data: m + a */
	regmatch_t pmatch[1];
	int size, a, p, pp = 0; /* p: bytes available at b; pp: p before the last read */
	int fd, so, eo, r, rp;  /* so/eo: start/end of separator; rp: start of record */
	char c, *m, *s;

	debug_printf_eval("entered %s()\n", __func__);

	/* we're using our own buffer since we need access to accumulating
	 * characters
	 */
	fd = fileno(rsm->F);
	m = rsm->buffer;
	a = rsm->adv;
	p = rsm->pos;
	size = rsm->size;
	/* NOTE(review): presumably the separator character when RS is not a regexp - confirm in mk_splitter() */
	c = (char) rsplitter.n.info;
	rp = 0;

	if (!m)
		m = qrealloc(m, 256, &size);

	do {
		b = m + a;
		/* default: no separator found, record spans all buffered data */
		so = eo = p;
		r = 1;
		if (p > 0) {
			if (rsplitter.n.info == TI_REGEXP) {
				if (regexec(icase ? rsplitter.n.r.ire : rsplitter.n.l.re,
							b, 1, pmatch, 0) == 0) {
					so = pmatch[0].rm_so;
					eo = pmatch[0].rm_eo;
					/* a match reaching the end of the buffered data
					 * is not accepted yet: more input is read first */
					if (b[eo] != '\0')
						break;
				}
			} else if (c != '\0') {
				/* single-character separator; only data added by
				 * the last read (from b+pp) is searched.
				 * A NUL byte in the data also ends the record. */
				s = strchr(b+pp, c);
				if (!s)
					s = memchr(b+pp, '\0', p - pp);
				if (s) {
					so = eo = s-b;
					eo++;
					break;
				}
			} else {
				/* c == '\0': records are separated by runs of
				 * newlines; leading newlines are skipped */
				while (b[rp] == '\n')
					rp++;
				s = strstr(b+rp, "\n\n");
				if (s) {
					so = eo = s-b;
					while (b[eo] == '\n')
						eo++;
					if (b[eo] != '\0')
						break;
				}
			}
		}

		/* no complete record yet: move pending data (and its NUL)
		 * to the start of the buffer before reading more */
		if (a > 0) {
			memmove(m, m+a, p+1);
			b = m;
			a = 0;
		}

		m = qrealloc(m, a+p+128, &size);
		b = m + a;
		pp = p;
		p += safe_read(fd, b+p, size-p-1);
		if (p < pp) {
			/* safe_read() returned a negative value */
			p = 0;
			r = 0;
			setvar_i(intvar[ERRNO], errno);
		}
		b[p] = '\0';

	} while (p > pp); /* loop for as long as the read added data */

	if (p == 0) {
		/* nothing buffered: r becomes 0 (no data) or -1 (read failed) */
		r--;
	} else {
		/* temporarily NUL-terminate the record, then the separator,
		 * restoring the overwritten bytes afterwards */
		c = b[so]; b[so] = '\0';
		setvar_s(v, b+rp);
		v->type |= VF_USER;
		b[so] = c;
		c = b[eo]; b[eo] = '\0';
		setvar_s(intvar[RT], b+so);
		b[eo] = c;
	}

	/* save buffer state; data up to eo is consumed */
	rsm->buffer = m;
	rsm->adv = a + eo;
	rsm->pos = p - eo;
	rsm->size = size;

	debug_printf_eval("returning from %s(): %d\n", __func__, r);

	return r;
}
2317
2318 /* formatted output into an allocated buffer, return ptr to buffer */
2319 #if !ENABLE_FEATURE_AWK_GNU_EXTENSIONS
2320 # define awk_printf(a, b) awk_printf(a)
2321 #endif
awk_printf(node * n,size_t * len)2322 static char *awk_printf(node *n, size_t *len)
2323 {
2324 char *b;
2325 char *fmt, *f;
2326 size_t i;
2327
2328 //tmpvar = nvalloc(1);
2329 #define TMPVAR (&G.awk_printf__tmpvar)
2330 // We use a single "static" tmpvar (instead of on-stack or malloced one)
2331 // to decrease memory consumption in deeply-recursive awk programs.
2332 // The rule to work safely is to never call evaluate() while our static
2333 // TMPVAR's value is still needed.
2334 fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), TMPVAR)));
2335 // ^^^^^^^^^ here we immediately strdup() the value, so the later call
2336 // to evaluate() potentially recursing into another awk_printf() can't
2337 // mangle the value.
2338
2339 b = NULL;
2340 i = 0;
2341 while (1) { /* "print one format spec" loop */
2342 char *s;
2343 char c;
2344 char sv;
2345 var *arg;
2346 size_t slen;
2347
2348 /* Find end of the next format spec, or end of line */
2349 s = f;
2350 while (1) {
2351 c = *f;
2352 if (!c) /* no percent chars found at all */
2353 goto nul;
2354 f++;
2355 if (c == '%')
2356 break;
2357 }
2358 /* we are past % in "....%..." */
2359 c = *f;
2360 if (!c) /* "....%" */
2361 goto nul;
2362 if (c == '%') { /* "....%%...." */
2363 slen = f - s;
2364 s = xstrndup(s, slen);
2365 f++;
2366 goto append; /* print "....%" part verbatim */
2367 }
2368 while (1) {
2369 if (isalpha(c))
2370 break;
2371 if (c == '*')
2372 syntax_error("%*x formats are not supported");
2373 c = *++f;
2374 if (!c) { /* "....%...." and no letter found after % */
2375 /* Example: awk 'BEGIN { printf "^^^%^^^\n"; }' */
2376 nul:
2377 slen = f - s;
2378 goto tail; /* print remaining string, exit loop */
2379 }
2380 }
2381 /* we are at A in "....%...A..." */
2382
2383 arg = evaluate(nextarg(&n), TMPVAR);
2384
2385 /* Result can be arbitrarily long. Example:
2386 * printf "%99999s", "BOOM"
2387 */
2388 sv = *++f;
2389 *f = '\0';
2390 if (c == 'c') {
2391 char cc = is_numeric(arg) ? getvar_i(arg) : *getvar_s(arg);
2392 char *r = xasprintf(s, cc ? cc : '^' /* else strlen will be wrong */);
2393 slen = strlen(r);
2394 if (cc == '\0') /* if cc is NUL, re-format the string with it */
2395 sprintf(r, s, cc);
2396 s = r;
2397 } else {
2398 if (c == 's') {
2399 s = xasprintf(s, getvar_s(arg));
2400 } else {
2401 double d = getvar_i(arg);
2402 if (strchr("diouxX", c)) {
2403 //TODO: make it wider here (%x -> %llx etc)?
2404 s = xasprintf(s, (int)d);
2405 } else if (strchr("eEfFgGaA", c)) {
2406 s = xasprintf(s, d);
2407 } else {
2408 //TODO: GNU Awk 5.0.1: printf "%W" prints "%W", does not error out
2409 syntax_error(EMSG_INV_FMT);
2410 }
2411 }
2412 slen = strlen(s);
2413 }
2414 *f = sv;
2415 append:
2416 if (i == 0) {
2417 b = s;
2418 i = slen;
2419 continue;
2420 }
2421 tail:
2422 b = xrealloc(b, i + slen + 1);
2423 strcpy(b + i, s);
2424 i += slen;
2425 if (!c) /* s is NOT allocated and this is the last part of string? */
2426 break;
2427 free(s);
2428 }
2429
2430 free(fmt);
2431 //nvfree(tmpvar, 1);
2432 #undef TMPVAR
2433
2434 #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS
2435 if (len)
2436 *len = i;
2437 #endif
2438 return b;
2439 }
2440
/* Common substitution routine.
 * Replace (nm)'th substring of (src) that matches (rn) with (repl),
 * store result into (dest), return number of substitutions.
 * If nm = 0, replace all matches.
 * If src or dst is NULL, use $0.
 * If subexp != 0, enable subexpression matching (\1-\9).
 *
 * In (repl), an unescaped '&' stands for the whole matched text. Each
 * pair of backslashes before '&' (or before a digit when subexp is set)
 * yields one literal backslash. With subexp set, a digit is taken as a
 * group reference only when it follows an odd number of backslashes.
 *
 * NOTE(review): the value returned is match_no, i.e. the number of
 * matches visited, which for nm > 1 also counts matches that were
 * skipped rather than replaced.
 */
static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int subexp)
{
	char *resbuf;          /* result being built */
	const char *sp;        /* not yet processed part of the source string */
	int match_no, residx, replen, resbufsize; /* residx: bytes used in resbuf */
	int regexec_flags;
	regmatch_t pmatch[10]; /* [0] is the whole match, [1]..[9] the groups */
	regex_t sreg, *regex;

	resbuf = NULL;
	residx = 0;
	match_no = 0;
	regexec_flags = 0;
	regex = as_regex(rn, &sreg);
	sp = getvar_s(src ? src : intvar[F0]);
	replen = strlen(repl);
	while (regexec(regex, sp, 10, pmatch, regexec_flags) == 0) {
		int so = pmatch[0].rm_so;
		int eo = pmatch[0].rm_eo;

		//bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp);
		/* copy everything up to the end of the match, match included */
		resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize);
		memcpy(resbuf + residx, sp, eo);
		residx += eo;
		if (++match_no >= nm) {
			const char *s;
			int nbs; /* number of consecutive backslashes just seen */

			/* replace */
			/* step back over the matched text copied above */
			residx -= (eo - so);
			nbs = 0;
			for (s = repl; *s; s++) {
				/* each char is first copied verbatim; special
				 * sequences are fixed up afterwards */
				char c = resbuf[residx++] = *s;
				if (c == '\\') {
					nbs++;
					continue;
				}
				if (c == '&' || (subexp && c >= '0' && c <= '9')) {
					int j;
					/* drop the special char and the backslashes
					 * that do not survive escaping */
					residx -= ((nbs + 3) >> 1);
					j = 0;
					if (c != '&') {
						j = c - '0';
						/* shifts parity: a digit needs an odd
						 * number of backslashes to be a reference */
						nbs++;
					}
					if (nbs % 2) {
						/* escaped: keep the character itself */
						resbuf[residx++] = c;
					} else {
						/* insert matched text of group j (0: whole match) */
						int n = pmatch[j].rm_eo - pmatch[j].rm_so;
						resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize);
						memcpy(resbuf + residx, sp + pmatch[j].rm_so, n);
						residx += n;
					}
				}
				nbs = 0;
			}
		}

		/* further searches do not start at the beginning of the line */
		regexec_flags = REG_NOTBOL;
		sp += eo;
		if (match_no == nm)
			break;
		if (eo == so) {
			/* Empty match (e.g. "b*" will match anywhere).
			 * Advance by one char. */
//BUG (bug 1333):
//gsub(/\<b*/,"") on "abc" will reach this point, advance to "bc"
//... and will erroneously match "b" even though it is NOT at the word start.
//we need REG_NOTBOW but it does not exist...
//TODO: if EXTRA_COMPAT=y, use GNU matching and re_search,
//it should be able to do it correctly.
			/* Subtle: this is safe only because
			 * qrealloc allocated at least one extra byte */
			resbuf[residx] = *sp;
			if (*sp == '\0')
				goto ret; /* the NUL just stored terminates resbuf */
			sp++;
			residx++;
		}
	}

	/* append the unprocessed rest of the source */
	resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize);
	strcpy(resbuf + residx, sp);
 ret:
	//bb_error_msg("end sp:'%s'%p", sp,sp);
	setvar_p(dest ? dest : intvar[F0], resbuf);
	/* free the regex only if as_regex() compiled it into our local sreg */
	if (regex == &sreg)
		regfree(regex);
	return match_no;
}
2538
do_mktime(const char * ds)2539 static NOINLINE int do_mktime(const char *ds)
2540 {
2541 struct tm then;
2542 int count;
2543
2544 /*memset(&then, 0, sizeof(then)); - not needed */
2545 then.tm_isdst = -1; /* default is unknown */
2546
2547 /* manpage of mktime says these fields are ints,
2548 * so we can sscanf stuff directly into them */
2549 count = sscanf(ds, "%u %u %u %u %u %u %d",
2550 &then.tm_year, &then.tm_mon, &then.tm_mday,
2551 &then.tm_hour, &then.tm_min, &then.tm_sec,
2552 &then.tm_isdst);
2553
2554 if (count < 6
2555 || (unsigned)then.tm_mon < 1
2556 || (unsigned)then.tm_year < 1900
2557 ) {
2558 return -1;
2559 }
2560
2561 then.tm_mon -= 1;
2562 then.tm_year -= 1900;
2563
2564 return mktime(&then);
2565 }
2566
2567 /* Reduce stack usage in exec_builtin() by keeping match() code separate */
do_match(node * an1,const char * as0)2568 static NOINLINE var *do_match(node *an1, const char *as0)
2569 {
2570 regmatch_t pmatch[1];
2571 regex_t sreg, *re;
2572 int n, start, len;
2573
2574 re = as_regex(an1, &sreg);
2575 n = regexec(re, as0, 1, pmatch, 0);
2576 if (re == &sreg)
2577 regfree(re);
2578 start = 0;
2579 len = -1;
2580 if (n == 0) {
2581 start = pmatch[0].rm_so + 1;
2582 len = pmatch[0].rm_eo - pmatch[0].rm_so;
2583 }
2584 setvar_i(newvar("RLENGTH"), len);
2585 return setvar_i(newvar("RSTART"), start);
2586 }
2587
/* Reduce stack usage in evaluate() by keeping builtins' code separate.
 *
 * op  - builtin call node: op->info selects the builtin and carries the
 *       argument-handling flags, op->l.n is the argument list.
 * res - variable that receives the result.
 *
 * Returns the result variable. This is res, except for B_ma where the
 * variable returned by do_match() is used instead.
 */
static NOINLINE var *exec_builtin(node *op, var *res)
{
#define tspl (G.exec_builtin__tspl)

	var *tmpvars;
	node *an[4];        /* argument nodes */
	var *av[4];         /* evaluated arguments (where requested by flags) */
	const char *as[4];  /* string values of arguments (where requested) */
	node *spl;
	uint32_t isr, info;
	int nargs;
	time_t tt;
	int i, l, ll, n;

	tmpvars = nvalloc(4);
#define TMPVAR0 (tmpvars)
#define TMPVAR1 (tmpvars + 1)
#define TMPVAR2 (tmpvars + 2)
#define TMPVAR3 (tmpvars + 3)
#define TMPVAR(i) (tmpvars + (i))
	isr = info = op->info;
	op = op->l.n;

	av[2] = av[3] = NULL;
	/* Collect up to 4 arguments. isr is shifted right once per
	 * argument, so the two mask bits tested below act as per-argument
	 * flags: either bit requests evaluation, 0x08000000 additionally
	 * requests the string value. */
	for (i = 0; i < 4 && op; i++) {
		an[i] = nextarg(&op);
		if (isr & 0x09000000) {
			av[i] = evaluate(an[i], TMPVAR(i));
			if (isr & 0x08000000)
				as[i] = getvar_s(av[i]);
		}
		isr >>= 1;
	}

	nargs = i;
	/* the top two bits of info hold the minimum argument count */
	if ((uint32_t)nargs < (info >> 30))
		syntax_error(EMSG_TOO_FEW_ARGS);

	info &= OPNMASK;
	switch (info) {

	/* atan2() of the first two arguments */
	case B_a2:
		if (ENABLE_FEATURE_AWK_LIBM)
			setvar_i(res, atan2(getvar_i(av[0]), getvar_i(av[1])));
		else
			syntax_error(EMSG_NO_MATH);
		break;

	/* split string as[0] into array av[1]; result is the element count */
	case B_sp: {
		char *s, *s1;

		if (nargs > 2) {
			/* third argument: a regexp node is used directly,
			 * anything else is turned into a splitter */
			spl = (an[2]->info == TI_REGEXP) ? an[2]
				: mk_splitter(getvar_s(evaluate(an[2], TMPVAR2)), &tspl);
		} else {
			spl = &fsplitter.n;
		}

		n = awk_split(as[0], spl, &s);
		s1 = s; /* nextword() advances s; keep the start for free() */
		clear_array(iamarray(av[1]));
		for (i = 1; i <= n; i++)
			setari_u(av[1], i, nextword(&s));
		free(s1);
		setvar_i(res, n);
		break;
	}

	/* substring of as[0]: 1-based start av[1], optional length av[2] */
	case B_ss: {
		char *s;

		l = strlen(as[0]);
		i = getvar_i(av[1]) - 1;
		/* clamp the start offset to [0, l] */
		if (i > l)
			i = l;
		if (i < 0)
			i = 0;
		n = (nargs > 2) ? getvar_i(av[2]) : l-i;
		if (n < 0)
			n = 0;
		s = xstrndup(as[0]+i, n);
		setvar_p(res, s);
		break;
	}

	/* Bitwise ops must assume that operands are unsigned. GNU Awk 3.1.5:
	 * awk '{ print or(-1,1) }' gives "4.29497e+09", not "-2.xxxe+09" */
	case B_an:
		setvar_i(res, getvar_i_int(av[0]) & getvar_i_int(av[1]));
		break;

	case B_co:
		setvar_i(res, ~getvar_i_int(av[0]));
		break;

	case B_ls:
		setvar_i(res, getvar_i_int(av[0]) << getvar_i_int(av[1]));
		break;

	case B_or:
		setvar_i(res, getvar_i_int(av[0]) | getvar_i_int(av[1]));
		break;

	case B_rs:
		setvar_i(res, getvar_i_int(av[0]) >> getvar_i_int(av[1]));
		break;

	case B_xo:
		setvar_i(res, getvar_i_int(av[0]) ^ getvar_i_int(av[1]));
		break;

	/* lower / upper case conversion of as[0], ASCII letters only */
	case B_lo:
	case B_up: {
		char *s, *s1;
		s1 = s = xstrdup(as[0]);
		while (*s1) {
			//*s1 = (info == B_up) ? toupper(*s1) : tolower(*s1);
			/* true for 'a'..'z' and 'A'..'Z' only */
			if ((unsigned char)((*s1 | 0x20) - 'a') <= ('z' - 'a'))
				*s1 = (info == B_up) ? (*s1 & 0xdf) : (*s1 | 0x20);
			s1++;
		}
		setvar_p(res, s);
		break;
	}

	/* 1-based position of as[1] within as[0], 0 if absent or as[1] is empty */
	case B_ix:
		n = 0;
		ll = strlen(as[1]);
		l = strlen(as[0]) - ll;
		if (ll > 0 && l >= 0) {
			if (!icase) {
				char *s = strstr(as[0], as[1]);
				if (s)
					n = (s - as[0]) + 1;
			} else {
				/* this piece of code is terribly slow and
				 * really should be rewritten
				 */
				for (i = 0; i <= l; i++) {
					if (strncasecmp(as[0]+i, as[1], ll) == 0) {
						n = i+1;
						break;
					}
				}
			}
		}
		setvar_i(res, n);
		break;

	/* strftime() of local time: optional format as[0], optional time av[1] */
	case B_ti:
		if (nargs > 1)
			tt = getvar_i(av[1]);
		else
			time(&tt);
		//s = (nargs > 0) ? as[0] : "%a %b %d %H:%M:%S %Z %Y";
		i = strftime(g_buf, MAXVARFMT,
			((nargs > 0) ? as[0] : "%a %b %d %H:%M:%S %Z %Y"),
			localtime(&tt));
		g_buf[i] = '\0';
		setvar_s(res, g_buf);
		break;

	case B_mt:
		setvar_i(res, do_mktime(as[0]));
		break;

	/* regex match of as[0] against an[1]; the result is taken from do_match() */
	case B_ma:
		res = do_match(an[1], as[0]);
		break;

	/* substitution with subexpression references enabled: av[2] selects
	 * the match, av[3] is the source (NULL: $0), result goes to res */
	case B_ge:
		awk_sub(an[0], as[1], getvar_i(av[2]), av[3], res, TRUE);
		break;

	/* replace all matches in av[2] (NULL: $0); result is the count */
	case B_gs:
		setvar_i(res, awk_sub(an[0], as[1], 0, av[2], av[2], FALSE));
		break;

	/* replace the first match in av[2] (NULL: $0); result is the count */
	case B_su:
		setvar_i(res, awk_sub(an[0], as[1], 1, av[2], av[2], FALSE));
		break;
	}

	nvfree(tmpvars, 4);
#undef TMPVAR0
#undef TMPVAR1
#undef TMPVAR2
#undef TMPVAR3
#undef TMPVAR

	return res;
#undef tspl
}
2782
2783 /* if expr looks like "var=value", perform assignment and return 1,
2784 * otherwise return 0 */
is_assignment(const char * expr)2785 static int is_assignment(const char *expr)
2786 {
2787 char *exprc, *val;
2788
2789 val = (char*)endofname(expr);
2790 if (val == (char*)expr || *val != '=') {
2791 return FALSE;
2792 }
2793
2794 exprc = xstrdup(expr);
2795 val = exprc + (val - expr);
2796 *val++ = '\0';
2797
2798 unescape_string_in_place(val);
2799 setvar_u(newvar(exprc), val);
2800 free(exprc);
2801 return TRUE;
2802 }
2803
2804 /* switch to next input file */
next_input_file(void)2805 static rstream *next_input_file(void)
2806 {
2807 #define rsm (G.next_input_file__rsm)
2808 #define files_happen (G.next_input_file__files_happen)
2809
2810 const char *fname, *ind;
2811
2812 if (rsm.F)
2813 fclose(rsm.F);
2814 rsm.F = NULL;
2815 rsm.pos = rsm.adv = 0;
2816
2817 for (;;) {
2818 if (getvar_i(intvar[ARGIND])+1 >= getvar_i(intvar[ARGC])) {
2819 if (files_happen)
2820 return NULL;
2821 fname = "-";
2822 rsm.F = stdin;
2823 break;
2824 }
2825 ind = getvar_s(incvar(intvar[ARGIND]));
2826 fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind));
2827 if (fname && *fname && !is_assignment(fname)) {
2828 rsm.F = xfopen_stdin(fname);
2829 break;
2830 }
2831 }
2832
2833 files_happen = TRUE;
2834 setvar_s(intvar[FILENAME], fname);
2835 return &rsm;
2836 #undef rsm
2837 #undef files_happen
2838 }
2839
2840 /*
2841 * Evaluate node - the heart of the program. Supplied with subtree
2842 * and "res" variable to assign the result to if we evaluate an expression.
2843 * If node refers to e.g. a variable or a field, no assignment happens.
2844 * Return ptr to the result (which may or may not be the "res" variable!)
2845 */
2846 #define XC(n) ((n) >> 8)
2847
evaluate(node * op,var * res)2848 static var *evaluate(node *op, var *res)
2849 {
2850 /* This procedure is recursive so we should count every byte */
2851 #define fnargs (G.evaluate__fnargs)
2852 /* seed is initialized to 1 */
2853 #define seed (G.evaluate__seed)
2854 #define sreg (G.evaluate__sreg)
2855
2856 var *tmpvars;
2857
2858 if (!op)
2859 return setvar_s(res, NULL);
2860
2861 debug_printf_eval("entered %s()\n", __func__);
2862
2863 tmpvars = nvalloc(2);
2864 #define TMPVAR0 (tmpvars)
2865 #define TMPVAR1 (tmpvars + 1)
2866
2867 while (op) {
2868 struct {
2869 var *v;
2870 const char *s;
2871 } L = L; /* for compiler */
2872 struct {
2873 var *v;
2874 const char *s;
2875 } R = R;
2876 double L_d = L_d;
2877 uint32_t opinfo;
2878 int opn;
2879 node *op1;
2880
2881 opinfo = op->info;
2882 opn = (opinfo & OPNMASK);
2883 g_lineno = op->lineno;
2884 op1 = op->l.n;
2885 debug_printf_eval("opinfo:%08x opn:%08x\n", opinfo, opn);
2886
2887 /* execute inevitable things */
2888 if (opinfo & OF_RES1) {
2889 if ((opinfo & OF_REQUIRED) && !op1)
2890 syntax_error(EMSG_TOO_FEW_ARGS);
2891 L.v = evaluate(op1, TMPVAR0);
2892 if (opinfo & OF_STR1) {
2893 L.s = getvar_s(L.v);
2894 debug_printf_eval("L.s:'%s'\n", L.s);
2895 }
2896 if (opinfo & OF_NUM1) {
2897 L_d = getvar_i(L.v);
2898 debug_printf_eval("L_d:%f\n", L_d);
2899 }
2900 }
2901 /* NB: Must get string/numeric values of L (done above)
2902 * _before_ evaluate()'ing R.v: if both L and R are $NNNs,
2903 * and right one is large, then L.v points to Fields[NNN1],
2904 * second evaluate() reallocates and moves (!) Fields[],
2905 * R.v points to Fields[NNN2] but L.v now points to freed mem!
2906 * (Seen trying to evaluate "$444 $44444")
2907 */
2908 if (opinfo & OF_RES2) {
2909 R.v = evaluate(op->r.n, TMPVAR1);
2910 //TODO: L.v may be invalid now, set L.v to NULL to catch bugs?
2911 //L.v = NULL;
2912 if (opinfo & OF_STR2) {
2913 R.s = getvar_s(R.v);
2914 debug_printf_eval("R.s:'%s'\n", R.s);
2915 }
2916 }
2917
2918 debug_printf_eval("switch(0x%x)\n", XC(opinfo & OPCLSMASK));
2919 switch (XC(opinfo & OPCLSMASK)) {
2920
2921 /* -- iterative node type -- */
2922
2923 /* test pattern */
2924 case XC( OC_TEST ):
2925 debug_printf_eval("TEST\n");
2926 if (op1->info == TI_COMMA) {
2927 /* it's range pattern */
2928 if ((opinfo & OF_CHECKED) || ptest(op1->l.n)) {
2929 op->info |= OF_CHECKED;
2930 if (ptest(op1->r.n))
2931 op->info &= ~OF_CHECKED;
2932 op = op->a.n;
2933 } else {
2934 op = op->r.n;
2935 }
2936 } else {
2937 op = ptest(op1) ? op->a.n : op->r.n;
2938 }
2939 break;
2940
2941 /* just evaluate an expression, also used as unconditional jump */
2942 case XC( OC_EXEC ):
2943 debug_printf_eval("EXEC\n");
2944 break;
2945
2946 /* branch, used in if-else and various loops */
2947 case XC( OC_BR ):
2948 debug_printf_eval("BR\n");
2949 op = istrue(L.v) ? op->a.n : op->r.n;
2950 break;
2951
2952 /* initialize for-in loop */
2953 case XC( OC_WALKINIT ):
2954 debug_printf_eval("WALKINIT\n");
2955 hashwalk_init(L.v, iamarray(R.v));
2956 break;
2957
2958 /* get next array item */
2959 case XC( OC_WALKNEXT ):
2960 debug_printf_eval("WALKNEXT\n");
2961 op = hashwalk_next(L.v) ? op->a.n : op->r.n;
2962 break;
2963
2964 case XC( OC_PRINT ):
2965 debug_printf_eval("PRINT /\n");
2966 case XC( OC_PRINTF ):
2967 debug_printf_eval("PRINTF\n");
2968 {
2969 FILE *F = stdout;
2970
2971 if (op->r.n) {
2972 rstream *rsm = newfile(R.s);
2973 if (!rsm->F) {
2974 if (opn == '|') {
2975 rsm->F = popen(R.s, "w");
2976 if (rsm->F == NULL)
2977 bb_simple_perror_msg_and_die("popen");
2978 rsm->is_pipe = 1;
2979 } else {
2980 rsm->F = xfopen(R.s, opn=='w' ? "w" : "a");
2981 }
2982 }
2983 F = rsm->F;
2984 }
2985
2986 /* Can't just check 'opinfo == OC_PRINT' here, parser ORs
2987 * additional bits to opinfos of print/printf with redirects
2988 */
2989 if ((opinfo & OPCLSMASK) == OC_PRINT) {
2990 if (!op1) {
2991 fputs(getvar_s(intvar[F0]), F);
2992 } else {
2993 for (;;) {
2994 var *v = evaluate(nextarg(&op1), TMPVAR0);
2995 if (v->type & VF_NUMBER) {
2996 fmt_num(getvar_s(intvar[OFMT]),
2997 getvar_i(v));
2998 fputs(g_buf, F);
2999 } else {
3000 fputs(getvar_s(v), F);
3001 }
3002 if (!op1)
3003 break;
3004 fputs(getvar_s(intvar[OFS]), F);
3005 }
3006 }
3007 fputs(getvar_s(intvar[ORS]), F);
3008 } else { /* PRINTF */
3009 IF_FEATURE_AWK_GNU_EXTENSIONS(size_t len;)
3010 char *s = awk_printf(op1, &len);
3011 #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS
3012 fwrite(s, len, 1, F);
3013 #else
3014 fputs(s, F);
3015 #endif
3016 free(s);
3017 }
3018 fflush(F);
3019 break;
3020 }
3021
3022 case XC( OC_DELETE ):
3023 debug_printf_eval("DELETE\n");
3024 {
3025 /* "delete" is special:
3026 * "delete array[var--]" must evaluate index expr only once.
3027 */
3028 uint32_t info = op1->info & OPCLSMASK;
3029 var *v;
3030
3031 if (info == OC_VAR) {
3032 v = op1->l.v;
3033 } else if (info == OC_FNARG) {
3034 v = &fnargs[op1->l.aidx];
3035 } else {
3036 syntax_error(EMSG_NOT_ARRAY);
3037 }
3038 if (op1->r.n) { /* array ref? */
3039 const char *s;
3040 s = getvar_s(evaluate(op1->r.n, TMPVAR0));
3041 hash_remove(iamarray(v), s);
3042 } else {
3043 clear_array(iamarray(v));
3044 }
3045 break;
3046 }
3047
3048 case XC( OC_NEWSOURCE ):
3049 debug_printf_eval("NEWSOURCE\n");
3050 g_progname = op->l.new_progname;
3051 break;
3052
3053 case XC( OC_RETURN ):
3054 debug_printf_eval("RETURN\n");
3055 copyvar(res, L.v);
3056 break;
3057
3058 case XC( OC_NEXTFILE ):
3059 debug_printf_eval("NEXTFILE\n");
3060 nextfile = TRUE;
3061 case XC( OC_NEXT ):
3062 debug_printf_eval("NEXT\n");
3063 nextrec = TRUE;
3064 case XC( OC_DONE ):
3065 debug_printf_eval("DONE\n");
3066 clrvar(res);
3067 break;
3068
3069 case XC( OC_EXIT ):
3070 debug_printf_eval("EXIT\n");
3071 if (op1)
3072 G.exitcode = (int)L_d;
3073 awk_exit();
3074
3075 /* -- recursive node type -- */
3076
3077 case XC( OC_VAR ):
3078 debug_printf_eval("VAR\n");
3079 L.v = op->l.v;
3080 if (L.v == intvar[NF])
3081 split_f0();
3082 goto v_cont;
3083
3084 case XC( OC_FNARG ):
3085 debug_printf_eval("FNARG[%d]\n", op->l.aidx);
3086 L.v = &fnargs[op->l.aidx];
3087 v_cont:
3088 res = op->r.n ? findvar(iamarray(L.v), R.s) : L.v;
3089 break;
3090
3091 case XC( OC_IN ):
3092 debug_printf_eval("IN\n");
3093 setvar_i(res, hash_search(iamarray(R.v), L.s) ? 1 : 0);
3094 break;
3095
3096 case XC( OC_REGEXP ):
3097 debug_printf_eval("REGEXP\n");
3098 op1 = op;
3099 L.s = getvar_s(intvar[F0]);
3100 goto re_cont;
3101
3102 case XC( OC_MATCH ):
3103 debug_printf_eval("MATCH\n");
3104 op1 = op->r.n;
3105 re_cont:
3106 {
3107 regex_t *re = as_regex(op1, &sreg);
3108 int i = regexec(re, L.s, 0, NULL, 0);
3109 if (re == &sreg)
3110 regfree(re);
3111 setvar_i(res, (i == 0) ^ (opn == '!'));
3112 }
3113 break;
3114
3115 case XC( OC_MOVE ):
3116 debug_printf_eval("MOVE\n");
3117 /* if source is a temporary string, jusk relink it to dest */
3118 if (R.v == TMPVAR1
3119 && !(R.v->type & VF_NUMBER)
3120 /* Why check !NUMBER? if R.v is a number but has cached R.v->string,
3121 * L.v ends up a string, which is wrong */
3122 /*&& R.v->string - always not NULL (right?) */
3123 ) {
3124 res = setvar_p(L.v, R.v->string); /* avoids strdup */
3125 R.v->string = NULL;
3126 } else {
3127 res = copyvar(L.v, R.v);
3128 }
3129 break;
3130
3131 case XC( OC_TERNARY ):
3132 debug_printf_eval("TERNARY\n");
3133 if (op->r.n->info != TI_COLON)
3134 syntax_error(EMSG_POSSIBLE_ERROR);
3135 res = evaluate(istrue(L.v) ? op->r.n->l.n : op->r.n->r.n, res);
3136 break;
3137
3138 case XC( OC_FUNC ): {
3139 var *argvars, *sv_fnargs;
3140 const char *sv_progname;
3141 int nargs, i;
3142
3143 debug_printf_eval("FUNC\n");
3144
3145 if (!op->r.f->defined)
3146 syntax_error(EMSG_UNDEF_FUNC);
3147
3148 /* The body might be empty, still has to eval the args */
3149 nargs = op->r.f->nargs;
3150 argvars = nvalloc(nargs);
3151 i = 0;
3152 while (op1) {
3153 var *arg = evaluate(nextarg(&op1), TMPVAR0);
3154 if (i == nargs) {
3155 /* call with more arguments than function takes.
3156 * (gawk warns: "warning: function 'f' called with more arguments than declared").
3157 * They are still evaluated, but discarded: */
3158 clrvar(arg);
3159 continue;
3160 }
3161 copyvar(&argvars[i], arg);
3162 argvars[i].type |= VF_CHILD;
3163 argvars[i].x.parent = arg;
3164 i++;
3165 }
3166
3167 sv_fnargs = fnargs;
3168 sv_progname = g_progname;
3169
3170 fnargs = argvars;
3171 res = evaluate(op->r.f->body.first, res);
3172 nvfree(argvars, nargs);
3173
3174 g_progname = sv_progname;
3175 fnargs = sv_fnargs;
3176
3177 break;
3178 }
3179
3180 case XC( OC_GETLINE ):
3181 debug_printf_eval("GETLINE /\n");
3182 case XC( OC_PGETLINE ):
3183 debug_printf_eval("PGETLINE\n");
3184 {
3185 rstream *rsm;
3186 int i;
3187
3188 if (op1) {
3189 rsm = newfile(L.s);
3190 if (!rsm->F) {
3191 /* NB: can't use "opinfo == TI_PGETLINE", would break "cmd" | getline */
3192 if ((opinfo & OPCLSMASK) == OC_PGETLINE) {
3193 rsm->F = popen(L.s, "r");
3194 rsm->is_pipe = TRUE;
3195 } else {
3196 rsm->F = fopen_for_read(L.s); /* not xfopen! */
3197 }
3198 }
3199 } else {
3200 if (!iF)
3201 iF = next_input_file();
3202 rsm = iF;
3203 }
3204
3205 if (!rsm || !rsm->F) {
3206 setvar_i(intvar[ERRNO], errno);
3207 setvar_i(res, -1);
3208 break;
3209 }
3210
3211 if (!op->r.n)
3212 R.v = intvar[F0];
3213
3214 i = awk_getline(rsm, R.v);
3215 if (i > 0 && !op1) {
3216 incvar(intvar[FNR]);
3217 incvar(intvar[NR]);
3218 }
3219 setvar_i(res, i);
3220 break;
3221 }
3222
3223 /* simple builtins */
3224 case XC( OC_FBLTIN ): {
3225 double R_d = R_d; /* for compiler */
3226 debug_printf_eval("FBLTIN\n");
3227
3228 if (op1 && op1->info == TI_COMMA)
3229 /* Simple builtins take one arg maximum */
3230 syntax_error("Too many arguments");
3231
3232 switch (opn) {
3233 case F_in:
3234 R_d = (long long)L_d;
3235 break;
3236
3237 case F_rn: /*rand*/
3238 if (op1)
3239 syntax_error("Too many arguments");
3240 {
3241 #if RAND_MAX >= 0x7fffffff
3242 uint32_t u = ((uint32_t)rand() << 16) ^ rand();
3243 uint64_t v = ((uint64_t)rand() << 32) | u;
3244 /* the above shift+or is optimized out on 32-bit arches */
3245 # if RAND_MAX > 0x7fffffff
3246 v &= 0x7fffffffffffffffULL;
3247 # endif
3248 R_d = (double)v / 0x8000000000000000ULL;
3249 #else
3250 # error Not implemented for this value of RAND_MAX
3251 #endif
3252 break;
3253 }
3254 case F_co:
3255 if (ENABLE_FEATURE_AWK_LIBM) {
3256 R_d = cos(L_d);
3257 break;
3258 }
3259
3260 case F_ex:
3261 if (ENABLE_FEATURE_AWK_LIBM) {
3262 R_d = exp(L_d);
3263 break;
3264 }
3265
3266 case F_lg:
3267 if (ENABLE_FEATURE_AWK_LIBM) {
3268 R_d = log(L_d);
3269 break;
3270 }
3271
3272 case F_si:
3273 if (ENABLE_FEATURE_AWK_LIBM) {
3274 R_d = sin(L_d);
3275 break;
3276 }
3277
3278 case F_sq:
3279 if (ENABLE_FEATURE_AWK_LIBM) {
3280 R_d = sqrt(L_d);
3281 break;
3282 }
3283
3284 syntax_error(EMSG_NO_MATH);
3285 break;
3286
3287 case F_sr:
3288 R_d = (double)seed;
3289 seed = op1 ? (unsigned)L_d : (unsigned)time(NULL);
3290 srand(seed);
3291 break;
3292
3293 case F_ti: /*systime*/
3294 if (op1)
3295 syntax_error("Too many arguments");
3296 R_d = time(NULL);
3297 break;
3298
3299 case F_le:
3300 debug_printf_eval("length: L.s:'%s'\n", L.s);
3301 if (!op1) {
3302 L.s = getvar_s(intvar[F0]);
3303 debug_printf_eval("length: L.s='%s'\n", L.s);
3304 }
3305 else if (L.v->type & VF_ARRAY) {
3306 R_d = L.v->x.array->nel;
3307 debug_printf_eval("length: array_len:%d\n", L.v->x.array->nel);
3308 break;
3309 }
3310 R_d = strlen(L.s);
3311 break;
3312
3313 case F_sy:
3314 fflush_all();
3315 R_d = (ENABLE_FEATURE_ALLOW_EXEC && L.s && *L.s)
3316 ? (system(L.s) >> 8) : 0;
3317 break;
3318
3319 case F_ff:
3320 if (!op1) {
3321 fflush(stdout);
3322 } else if (L.s && *L.s) {
3323 rstream *rsm = newfile(L.s);
3324 fflush(rsm->F);
3325 } else {
3326 fflush_all();
3327 }
3328 break;
3329
3330 case F_cl: {
3331 rstream *rsm;
3332 int err = 0;
3333 rsm = (rstream *)hash_search(fdhash, L.s);
3334 debug_printf_eval("OC_FBLTIN close: op1:%p s:'%s' rsm:%p\n", op1, L.s, rsm);
3335 if (rsm) {
3336 debug_printf_eval("OC_FBLTIN F_cl "
3337 "rsm->is_pipe:%d, ->F:%p\n",
3338 rsm->is_pipe, rsm->F);
3339 /* Can be NULL if open failed. Example:
3340 * getline line <"doesnt_exist";
3341 * close("doesnt_exist"); <--- here rsm->F is NULL
3342 */
3343 if (rsm->F)
3344 err = rsm->is_pipe ? pclose(rsm->F) : fclose(rsm->F);
3345 //TODO: fix this case:
3346 // $ awk 'BEGIN { print close(""); print ERRNO }'
3347 // -1
3348 // close of redirection that was never opened
3349 // (we print 0, 0)
3350 free(rsm->buffer);
3351 hash_remove(fdhash, L.s);
3352 }
3353 if (err)
3354 setvar_i(intvar[ERRNO], errno);
3355 R_d = (double)err;
3356 break;
3357 }
3358 } /* switch */
3359 setvar_i(res, R_d);
3360 break;
3361 }
3362
3363 case XC( OC_BUILTIN ):
3364 debug_printf_eval("BUILTIN\n");
3365 res = exec_builtin(op, res);
3366 break;
3367
3368 case XC( OC_SPRINTF ):
3369 debug_printf_eval("SPRINTF\n");
3370 setvar_p(res, awk_printf(op1, NULL));
3371 break;
3372
3373 case XC( OC_UNARY ):
3374 debug_printf_eval("UNARY\n");
3375 {
3376 double Ld, R_d;
3377
3378 Ld = R_d = getvar_i(R.v);
3379 switch (opn) {
3380 case 'P':
3381 Ld = ++R_d;
3382 goto r_op_change;
3383 case 'p':
3384 R_d++;
3385 goto r_op_change;
3386 case 'M':
3387 Ld = --R_d;
3388 goto r_op_change;
3389 case 'm':
3390 R_d--;
3391 r_op_change:
3392 setvar_i(R.v, R_d);
3393 break;
3394 case '!':
3395 Ld = !istrue(R.v);
3396 break;
3397 case '-':
3398 Ld = -R_d;
3399 break;
3400 }
3401 setvar_i(res, Ld);
3402 break;
3403 }
3404
3405 case XC( OC_FIELD ):
3406 debug_printf_eval("FIELD\n");
3407 {
3408 int i = (int)getvar_i(R.v);
3409 if (i < 0)
3410 syntax_error(EMSG_NEGATIVE_FIELD);
3411 if (i == 0) {
3412 res = intvar[F0];
3413 } else {
3414 split_f0();
3415 if (i > nfields)
3416 fsrealloc(i);
3417 res = &Fields[i - 1];
3418 }
3419 break;
3420 }
3421
3422 /* concatenation (" ") and index joining (",") */
3423 case XC( OC_CONCAT ):
3424 debug_printf_eval("CONCAT /\n");
3425 case XC( OC_COMMA ): {
3426 const char *sep = "";
3427 debug_printf_eval("COMMA\n");
3428 if (opinfo == TI_COMMA)
3429 sep = getvar_s(intvar[SUBSEP]);
3430 setvar_p(res, xasprintf("%s%s%s", L.s, sep, R.s));
3431 break;
3432 }
3433
3434 case XC( OC_LAND ):
3435 debug_printf_eval("LAND\n");
3436 setvar_i(res, istrue(L.v) ? ptest(op->r.n) : 0);
3437 break;
3438
3439 case XC( OC_LOR ):
3440 debug_printf_eval("LOR\n");
3441 setvar_i(res, istrue(L.v) ? 1 : ptest(op->r.n));
3442 break;
3443
3444 case XC( OC_BINARY ):
3445 debug_printf_eval("BINARY /\n");
3446 case XC( OC_REPLACE ):
3447 debug_printf_eval("REPLACE\n");
3448 {
3449 double R_d = getvar_i(R.v);
3450 debug_printf_eval("R_d:%f opn:%c\n", R_d, opn);
3451 switch (opn) {
3452 case '+':
3453 L_d += R_d;
3454 break;
3455 case '-':
3456 L_d -= R_d;
3457 break;
3458 case '*':
3459 L_d *= R_d;
3460 break;
3461 case '/':
3462 if (R_d == 0)
3463 syntax_error(EMSG_DIV_BY_ZERO);
3464 L_d /= R_d;
3465 break;
3466 case '&':
3467 if (ENABLE_FEATURE_AWK_LIBM)
3468 L_d = pow(L_d, R_d);
3469 else
3470 syntax_error(EMSG_NO_MATH);
3471 break;
3472 case '%':
3473 if (R_d == 0)
3474 syntax_error(EMSG_DIV_BY_ZERO);
3475 L_d -= (long long)(L_d / R_d) * R_d;
3476 break;
3477 }
3478 debug_printf_eval("BINARY/REPLACE result:%f\n", L_d);
3479 res = setvar_i(((opinfo & OPCLSMASK) == OC_BINARY) ? res : L.v, L_d);
3480 break;
3481 }
3482
3483 case XC( OC_COMPARE ): {
3484 int i = i; /* for compiler */
3485 double Ld;
3486 debug_printf_eval("COMPARE\n");
3487
3488 if (is_numeric(L.v) && is_numeric(R.v)) {
3489 Ld = getvar_i(L.v) - getvar_i(R.v);
3490 } else {
3491 const char *l = getvar_s(L.v);
3492 const char *r = getvar_s(R.v);
3493 Ld = icase ? strcasecmp(l, r) : strcmp(l, r);
3494 }
3495 switch (opn & 0xfe) {
3496 case 0:
3497 i = (Ld > 0);
3498 break;
3499 case 2:
3500 i = (Ld >= 0);
3501 break;
3502 case 4:
3503 i = (Ld == 0);
3504 break;
3505 }
3506 setvar_i(res, (i == 0) ^ (opn & 1));
3507 break;
3508 }
3509
3510 default:
3511 syntax_error(EMSG_POSSIBLE_ERROR);
3512 } /* switch */
3513
3514 if ((opinfo & OPCLSMASK) <= SHIFT_TIL_THIS)
3515 op = op->a.n;
3516 if ((opinfo & OPCLSMASK) >= RECUR_FROM_THIS)
3517 break;
3518 if (nextrec)
3519 break;
3520 } /* while (op) */
3521
3522 nvfree(tmpvars, 2);
3523 #undef TMPVAR0
3524 #undef TMPVAR1
3525
3526 debug_printf_eval("returning from %s(): %p\n", __func__, res);
3527 return res;
3528 #undef fnargs
3529 #undef seed
3530 #undef sreg
3531 }
3532
3533 /* -------- main & co. -------- */
3534
awk_exit(void)3535 static int awk_exit(void)
3536 {
3537 unsigned i;
3538
3539 if (!exiting) {
3540 exiting = TRUE;
3541 nextrec = FALSE;
3542 evaluate(endseq.first, &G.exit__tmpvar);
3543 }
3544
3545 /* waiting for children */
3546 for (i = 0; i < fdhash->csize; i++) {
3547 hash_item *hi;
3548 hi = fdhash->items[i];
3549 while (hi) {
3550 if (hi->data.rs.F && hi->data.rs.is_pipe)
3551 pclose(hi->data.rs.F);
3552 hi = hi->next;
3553 }
3554 }
3555
3556 exit(G.exitcode);
3557 }
3558
3559 int awk_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
awk_main(int argc UNUSED_PARAM,char ** argv)3560 int awk_main(int argc UNUSED_PARAM, char **argv)
3561 {
3562 unsigned opt;
3563 char *opt_F;
3564 llist_t *list_v = NULL;
3565 llist_t *list_f = NULL;
3566 #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS
3567 llist_t *list_e = NULL;
3568 #endif
3569 int i;
3570
3571 INIT_G();
3572
3573 /* Undo busybox.c, or else strtod may eat ','! This breaks parsing:
3574 * $1,$2 == '$1,' '$2', NOT '$1' ',' '$2' */
3575 if (ENABLE_LOCALE_SUPPORT)
3576 setlocale(LC_NUMERIC, "C");
3577
3578 /* initialize variables */
3579 vhash = hash_init();
3580 {
3581 char *vnames = (char *)vNames; /* cheat */
3582 char *vvalues = (char *)vValues;
3583 for (i = 0; *vnames; i++) {
3584 var *v;
3585 intvar[i] = v = newvar(nextword(&vnames));
3586 if (*vvalues != '\377')
3587 setvar_s(v, nextword(&vvalues));
3588 else
3589 setvar_i(v, 0);
3590
3591 if (*vnames == '*') {
3592 v->type |= VF_SPECIAL;
3593 vnames++;
3594 }
3595 }
3596 }
3597
3598 handle_special(intvar[FS]);
3599 handle_special(intvar[RS]);
3600
3601 /* Huh, people report that sometimes environ is NULL. Oh well. */
3602 if (environ) {
3603 char **envp;
3604 for (envp = environ; *envp; envp++) {
3605 /* environ is writable, thus we don't strdup it needlessly */
3606 char *s = *envp;
3607 char *s1 = strchr(s, '=');
3608 if (s1) {
3609 *s1 = '\0';
3610 /* Both findvar and setvar_u take const char*
3611 * as 2nd arg -> environment is not trashed */
3612 setvar_u(findvar(iamarray(intvar[ENVIRON]), s), s1 + 1);
3613 *s1 = '=';
3614 }
3615 }
3616 }
3617 opt = getopt32(argv, OPTSTR_AWK, &opt_F, &list_v, &list_f, IF_FEATURE_AWK_GNU_EXTENSIONS(&list_e,) NULL);
3618 argv += optind;
3619 //argc -= optind;
3620 if (opt & OPT_W)
3621 bb_simple_error_msg("warning: option -W is ignored");
3622 if (opt & OPT_F) {
3623 unescape_string_in_place(opt_F);
3624 setvar_s(intvar[FS], opt_F);
3625 }
3626 while (list_v) {
3627 if (!is_assignment(llist_pop(&list_v)))
3628 bb_show_usage();
3629 }
3630
3631 /* Parse all supplied programs */
3632 fnhash = hash_init();
3633 ahash = hash_init();
3634 while (list_f) {
3635 int fd;
3636 char *s;
3637
3638 g_progname = llist_pop(&list_f);
3639 fd = xopen_stdin(g_progname);
3640 s = xmalloc_read(fd, NULL); /* it's NUL-terminated */
3641 close(fd);
3642 parse_program(s);
3643 free(s);
3644 }
3645 g_progname = "cmd. line";
3646 #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS
3647 while (list_e) {
3648 parse_program(llist_pop(&list_e));
3649 }
3650 #endif
3651 //FIXME: preserve order of -e and -f
3652 //TODO: implement -i LIBRARY and -E FILE too, they are easy-ish
3653 if (!(opt & (OPT_f | OPT_e))) {
3654 if (!*argv)
3655 bb_show_usage();
3656 parse_program(*argv++);
3657 }
3658 /* Free unused parse structures */
3659 //hash_free(fnhash); // ~250 bytes when empty, used only for function names
3660 //^^^^^^^^^^^^^^^^^ does not work, hash_clear() inside SEGVs
3661 // (IOW: hash_clear() assumes it's a hash of variables. fnhash is not).
3662 free(fnhash->items);
3663 free(fnhash);
3664 fnhash = NULL; // debug
3665 //hash_free(ahash); // empty after parsing, will reuse as fdhash instead of freeing
3666
3667 /* Parsing done, on to executing */
3668
3669 /* fill in ARGV array */
3670 setari_u(intvar[ARGV], 0, "awk");
3671 i = 0;
3672 while (*argv)
3673 setari_u(intvar[ARGV], ++i, *argv++);
3674 setvar_i(intvar[ARGC], i + 1);
3675
3676 //fdhash = ahash; // done via define
3677 newfile("/dev/stdin")->F = stdin;
3678 newfile("/dev/stdout")->F = stdout;
3679 newfile("/dev/stderr")->F = stderr;
3680
3681 evaluate(beginseq.first, &G.main__tmpvar);
3682 if (!mainseq.first && !endseq.first)
3683 awk_exit();
3684
3685 /* input file could already be opened in BEGIN block */
3686 if (!iF)
3687 iF = next_input_file();
3688
3689 /* passing through input files */
3690 while (iF) {
3691 nextfile = FALSE;
3692 setvar_i(intvar[FNR], 0);
3693
3694 while ((i = awk_getline(iF, intvar[F0])) > 0) {
3695 nextrec = FALSE;
3696 incvar(intvar[NR]);
3697 incvar(intvar[FNR]);
3698 evaluate(mainseq.first, &G.main__tmpvar);
3699
3700 if (nextfile)
3701 break;
3702 }
3703
3704 if (i < 0)
3705 syntax_error(strerror(errno));
3706
3707 iF = next_input_file();
3708 }
3709
3710 awk_exit();
3711 /*return 0;*/
3712 }
3713