1 /* vi: set sw=4 ts=4: */
2 /*
3  * shuf: Write a random permutation of the input lines to standard output.
4  *
5  * Copyright (C) 2014 by Bartosz Golaszewski <bartekgola@gmail.com>
6  *
7  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
8  */
9 //config:config SHUF
10 //config:	bool "shuf (5.4 kb)"
11 //config:	default y
12 //config:	help
13 //config:	Generate random permutations
14 
15 //applet:IF_SHUF(APPLET_NOEXEC(shuf, shuf, BB_DIR_USR_BIN, BB_SUID_DROP, shuf))
16 
17 //kbuild:lib-$(CONFIG_SHUF) += shuf.o
18 
19 //usage:#define shuf_trivial_usage
20 //usage:       "[-n NUM] [-o FILE] [-z] [FILE | -e [ARG...] | -i L-H]"
21 //usage:#define shuf_full_usage "\n\n"
22 //usage:       "Randomly permute lines\n"
23 //usage:     "\n	-n NUM	Output at most NUM lines"
24 //usage:     "\n	-o FILE	Write to FILE, not standard output"
25 //usage:     "\n	-z	NUL terminated output"
26 //usage:     "\n	-e	Treat ARGs as lines"
27 //usage:     "\n	-i L-H	Treat numbers L-H as lines"
28 
29 #include "libbb.h"
30 
31 /* This is a NOEXEC applet. Be very careful! */
32 
33 #define OPT_e		(1 << 0)
34 #define OPT_i		(1 << 1)
35 #define OPT_n		(1 << 2)
36 #define OPT_o		(1 << 3)
37 #define OPT_z		(1 << 4)
38 #define OPT_STR		"ei:n:o:z"
39 
40 /*
41  * Use the Fisher-Yates shuffle algorithm on an array of lines.
42  * If the required number of output lines is less than the total
43  * we can stop shuffling early.
44  */
shuffle_lines(char ** lines,unsigned numlines,unsigned outlines)45 static void shuffle_lines(char **lines, unsigned numlines, unsigned outlines)
46 {
47 	srand(monotonic_us());
48 
49 	while (outlines != 0) {
50 		char *tmp;
51 		unsigned r = rand();
52 		/* RAND_MAX can be as small as 32767 */
53 		if (numlines > RAND_MAX)
54 			r ^= rand() << 15;
55 		r %= numlines;
56 //TODO: the above method is seriously non-uniform when numlines is very large.
57 //For example, with numlines of   0xf0000000,
58 //values of (r % numlines) in [0, 0x0fffffff] range
59 //are more likely: e.g. r=1 and r=0xf0000001 both map to 1,
60 //whereas only one value, r=0xefffffff, maps to 0xefffffff.
61 		numlines--;
62 		tmp = lines[numlines];
63 		lines[numlines] = lines[r];
64 		lines[r] = tmp;
65 		outlines--;
66 	}
67 }
68 
69 int shuf_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
shuf_main(int argc,char ** argv)70 int shuf_main(int argc, char **argv)
71 {
72 	unsigned opts;
73 	char *opt_i_str, *opt_n_str, *opt_o_str;
74 	char **lines;
75 	unsigned long long lo = lo;
76 	unsigned numlines, outlines;
77 	unsigned i;
78 	char eol;
79 
80 	opts = getopt32(argv, "^"
81 			OPT_STR
82 			"\0" "e--i:i--e"/* mutually exclusive */,
83 			&opt_i_str, &opt_n_str, &opt_o_str
84 	);
85 
86 	argc -= optind;
87 	argv += optind;
88 
89 	/* Prepare lines for shuffling - either: */
90 	if (opts & OPT_e) {
91 		/* make lines from command-line arguments */
92 		numlines = argc;
93 		lines = argv;
94 	} else
95 	if (opts & OPT_i) {
96 		/* create a range of numbers */
97 		unsigned long long hi;
98 		char *dash;
99 
100 		if (argv[0])
101 			bb_show_usage();
102 
103 		dash = strchr(opt_i_str, '-');
104 		if (!dash) {
105 			bb_error_msg_and_die("bad range '%s'", opt_i_str);
106 		}
107 		*dash = '\0';
108 		lo = xatoull(opt_i_str);
109 		hi = xatoull(dash + 1);
110 		*dash = '-';
111 		if (hi < lo)
112 			bb_error_msg_and_die("bad range '%s'", opt_i_str);
113 		hi -= lo;
114 		if (sizeof(size_t) > sizeof(numlines)) {
115 			if (hi >= UINT_MAX)
116 				bb_error_msg_and_die("bad range '%s'", opt_i_str);
117 		} else {
118 			if (hi >= UINT_MAX / sizeof(lines[0]))
119 				bb_error_msg_and_die("bad range '%s'", opt_i_str);
120 		}
121 
122 		numlines = hi + 1;
123 		lines = xmalloc((size_t)numlines * sizeof(lines[0]));
124 		for (i = 0; i < numlines; i++) {
125 			lines[i] = (char*)(uintptr_t)i;
126 		}
127 	} else {
128 		/* default - read lines from stdin or the input file */
129 		FILE *fp;
130 		const char *fname = "-";
131 
132 		if (argv[0]) {
133 			if (argv[1])
134 				bb_show_usage();
135 			fname = argv[0];
136 		}
137 
138 		fp = xfopen_stdin(fname);
139 		lines = NULL;
140 		numlines = 0;
141 		for (;;) {
142 			char *line = xmalloc_fgetline(fp);
143 			if (!line)
144 				break;
145 			lines = xrealloc_vector(lines, 6, numlines);
146 			lines[numlines++] = line;
147 		}
148 		fclose_if_not_stdin(fp);
149 	}
150 
151 	outlines = numlines;
152 	if (opts & OPT_n) {
153 		outlines = xatou(opt_n_str);
154 		if (outlines > numlines)
155 			outlines = numlines;
156 	}
157 
158 	shuffle_lines(lines, numlines, outlines);
159 
160 	if (opts & OPT_o)
161 		xmove_fd(xopen(opt_o_str, O_WRONLY|O_CREAT|O_TRUNC), STDOUT_FILENO);
162 
163 	eol = '\n';
164 	if (opts & OPT_z)
165 		eol = '\0';
166 
167 	for (i = numlines - outlines; i < numlines; i++) {
168 		if (opts & OPT_i)
169 			printf("%llu%c", lo + (uintptr_t)lines[i], eol);
170 		else
171 			printf("%s%c", lines[i], eol);
172 	}
173 
174 	fflush_stdout_and_exit(EXIT_SUCCESS);
175 }
176