1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Simple benchmark program that uses the various features of io_uring
4  * to provide fast random access to a device/file. It has various
 * options that control how we use io_uring, see the OPTIONS section
6  * below. This uses the raw io_uring interface.
7  *
8  * Copyright (C) 2018-2019 Jens Axboe
9  */
10 #include <stdio.h>
11 #include <errno.h>
12 #include <assert.h>
13 #include <stdlib.h>
14 #include <stddef.h>
15 #include <signal.h>
16 #include <inttypes.h>
17 
18 #include <sys/types.h>
19 #include <sys/stat.h>
20 #include <sys/ioctl.h>
21 #include <sys/syscall.h>
22 #include <sys/resource.h>
23 #include <sys/mman.h>
24 #include <sys/uio.h>
25 #include <linux/fs.h>
26 #include <fcntl.h>
27 #include <unistd.h>
28 #include <string.h>
29 #include <pthread.h>
30 #include <sched.h>
31 
32 #include "liburing.h"
33 #include "barrier.h"
34 
/*
 * Smaller of two values. Both arguments are parenthesized inside the
 * comparison so that operands containing low-precedence operators
 * (|, &, ?:, ...) are compared as a whole. Each argument may be evaluated
 * more than once, so do not pass expressions with side effects.
 */
#define min(a, b)		(((a) < (b)) ? (a) : (b))
36 
/*
 * Pointers into the mmap'ed submission queue ring. setup_ring() points each
 * one at its offset inside the SQ ring mapping.
 */
struct io_sq_ring {
	unsigned int *head;		/* compared against in prep_more_ios() to stop filling */
	unsigned int *tail;		/* advanced by prep_more_ios() after queueing entries */
	unsigned int *ring_mask;	/* cached into sq_ring_mask by setup_ring() */
	unsigned int *ring_entries;	/* printed by main() as the SQ ring size */
	unsigned int *flags;		/* checked for IORING_SQ_NEED_WAKEUP */
	unsigned int *array;		/* per-slot index of the SQE to submit */
};
45 
/*
 * Pointers into the mmap'ed completion queue ring. setup_ring() points each
 * one at its offset inside the CQ ring mapping.
 */
struct io_cq_ring {
	unsigned int *head;		/* advanced by reap_events() once entries are consumed */
	unsigned int *tail;		/* compared against in reap_events() to detect new entries */
	unsigned int *ring_mask;	/* cached into cq_ring_mask by setup_ring() */
	unsigned int *ring_entries;	/* printed by main() as the CQ ring size */
	struct io_uring_cqe *cqes;	/* array of completion entries */
};
53 
/* Compile-time tunables for the benchmark. */
enum {
	/* entries requested from io_uring_setup() and number of IO buffers */
	DEPTH		= 128,

	/* most SQEs prepared in one prep_more_ios() pass */
	BATCH_SUBMIT	= 32,
	/* upper bound on completions waited for per io_uring_enter() */
	BATCH_COMPLETE	= 32,

	/* size in bytes of every read and of every IO buffer */
	BS		= 4096,

	/* capacity of submitter.files[] */
	MAX_FDS		= 16,
};
62 
/* Copies of the SQ and CQ ring index masks, cached by setup_ring(). */
static unsigned int sq_ring_mask;
static unsigned int cq_ring_mask;
64 
/* Per-target state for one file or block device being read. */
struct file {
	unsigned long max_blocks;	/* size in BS-sized blocks, as set by get_file_size()/main() */
	unsigned int pending_ios;	/* bumped in init_io(), dropped in reap_events() */
	int real_fd;			/* descriptor returned by open() */
	int fixed_fd;			/* index into the registered file table */
};
71 
72 struct submitter {
73 	pthread_t thread;
74 	int ring_fd;
75 	struct drand48_data rand;
76 	struct io_sq_ring sq_ring;
77 	struct io_uring_sqe *sqes;
78 	struct iovec iovecs[DEPTH];
79 	struct io_cq_ring cq_ring;
80 	int inflight;
81 	unsigned long reaps;
82 	unsigned long done;
83 	unsigned long calls;
84 	volatile int finish;
85 
86 	__s32 *fds;
87 
88 	struct file files[MAX_FDS];
89 	unsigned nr_files;
90 	unsigned cur_file;
91 };
92 
93 static struct submitter submitters[1];
94 static volatile int finish;
95 
/*
 * OPTIONS: Set these to test the various features of io_uring.
 */

/* use IO polling */
static int polled = 1;

/* use fixed user buffers */
static int fixedbufs = 1;

/* use fixed files */
static int register_files = 1;

/* use buffered IO, not O_DIRECT */
static int buffered = 0;

/* use kernel submission/poller thread */
static int sq_thread_poll = 0;

/* pin above thread to this CPU */
static int sq_thread_cpu = -1;

/* no-op SQ ring commands */
static int do_nop = 0;
106 
io_uring_register_buffers(struct submitter * s)107 static int io_uring_register_buffers(struct submitter *s)
108 {
109 	if (do_nop)
110 		return 0;
111 
112 	return io_uring_register(s->ring_fd, IORING_REGISTER_BUFFERS, s->iovecs,
113 					DEPTH);
114 }
115 
io_uring_register_files(struct submitter * s)116 static int io_uring_register_files(struct submitter *s)
117 {
118 	unsigned i;
119 
120 	if (do_nop)
121 		return 0;
122 
123 	s->fds = calloc(s->nr_files, sizeof(__s32));
124 	for (i = 0; i < s->nr_files; i++) {
125 		s->fds[i] = s->files[i].real_fd;
126 		s->files[i].fixed_fd = i;
127 	}
128 
129 	return io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fds,
130 					s->nr_files);
131 }
132 
/* Return the calling thread's id, obtained through the raw gettid syscall. */
static int lk_gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (int) tid;
}
137 
file_depth(struct submitter * s)138 static unsigned file_depth(struct submitter *s)
139 {
140 	return (DEPTH + s->nr_files - 1) / s->nr_files;
141 }
142 
init_io(struct submitter * s,unsigned index)143 static void init_io(struct submitter *s, unsigned index)
144 {
145 	struct io_uring_sqe *sqe = &s->sqes[index];
146 	unsigned long offset;
147 	struct file *f;
148 	long r;
149 
150 	if (do_nop) {
151 		sqe->opcode = IORING_OP_NOP;
152 		return;
153 	}
154 
155 	if (s->nr_files == 1) {
156 		f = &s->files[0];
157 	} else {
158 		f = &s->files[s->cur_file];
159 		if (f->pending_ios >= file_depth(s)) {
160 			s->cur_file++;
161 			if (s->cur_file == s->nr_files)
162 				s->cur_file = 0;
163 			f = &s->files[s->cur_file];
164 		}
165 	}
166 	f->pending_ios++;
167 
168 	lrand48_r(&s->rand, &r);
169 	offset = (r % (f->max_blocks - 1)) * BS;
170 
171 	if (register_files) {
172 		sqe->flags = IOSQE_FIXED_FILE;
173 		sqe->fd = f->fixed_fd;
174 	} else {
175 		sqe->flags = 0;
176 		sqe->fd = f->real_fd;
177 	}
178 	if (fixedbufs) {
179 		sqe->opcode = IORING_OP_READ_FIXED;
180 		sqe->addr = (unsigned long) s->iovecs[index].iov_base;
181 		sqe->len = BS;
182 		sqe->buf_index = index;
183 	} else {
184 		sqe->opcode = IORING_OP_READV;
185 		sqe->addr = (unsigned long) &s->iovecs[index];
186 		sqe->len = 1;
187 		sqe->buf_index = 0;
188 	}
189 	sqe->ioprio = 0;
190 	sqe->off = offset;
191 	sqe->user_data = (unsigned long) f;
192 }
193 
prep_more_ios(struct submitter * s,unsigned max_ios)194 static int prep_more_ios(struct submitter *s, unsigned max_ios)
195 {
196 	struct io_sq_ring *ring = &s->sq_ring;
197 	unsigned index, tail, next_tail, prepped = 0;
198 
199 	next_tail = tail = *ring->tail;
200 	do {
201 		next_tail++;
202 		read_barrier();
203 		if (next_tail == *ring->head)
204 			break;
205 
206 		index = tail & sq_ring_mask;
207 		init_io(s, index);
208 		ring->array[index] = index;
209 		prepped++;
210 		tail = next_tail;
211 	} while (prepped < max_ios);
212 
213 	if (*ring->tail != tail) {
214 		/* order tail store with writes to sqes above */
215 		write_barrier();
216 		*ring->tail = tail;
217 		write_barrier();
218 	}
219 	return prepped;
220 }
221 
get_file_size(struct file * f)222 static int get_file_size(struct file *f)
223 {
224 	struct stat st;
225 
226 	if (fstat(f->real_fd, &st) < 0)
227 		return -1;
228 	if (S_ISBLK(st.st_mode)) {
229 		unsigned long long bytes;
230 
231 		if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
232 			return -1;
233 
234 		f->max_blocks = bytes / BS;
235 		return 0;
236 	} else if (S_ISREG(st.st_mode)) {
237 		f->max_blocks = st.st_size / BS;
238 		return 0;
239 	}
240 
241 	return -1;
242 }
243 
reap_events(struct submitter * s)244 static int reap_events(struct submitter *s)
245 {
246 	struct io_cq_ring *ring = &s->cq_ring;
247 	struct io_uring_cqe *cqe;
248 	unsigned head, reaped = 0;
249 
250 	head = *ring->head;
251 	do {
252 		struct file *f;
253 
254 		read_barrier();
255 		if (head == *ring->tail)
256 			break;
257 		cqe = &ring->cqes[head & cq_ring_mask];
258 		if (!do_nop) {
259 			f = (struct file *) (uintptr_t) cqe->user_data;
260 			f->pending_ios--;
261 			if (cqe->res != BS) {
262 				printf("io: unexpected ret=%d\n", cqe->res);
263 				if (polled && cqe->res == -EOPNOTSUPP)
264 					printf("Your filesystem doesn't support poll\n");
265 				return -1;
266 			}
267 		}
268 		reaped++;
269 		head++;
270 	} while (1);
271 
272 	s->inflight -= reaped;
273 	*ring->head = head;
274 	write_barrier();
275 	return reaped;
276 }
277 
submitter_fn(void * data)278 static void *submitter_fn(void *data)
279 {
280 	struct submitter *s = data;
281 	struct io_sq_ring *ring = &s->sq_ring;
282 	int ret, prepped;
283 
284 	printf("submitter=%d\n", lk_gettid());
285 
286 	srand48_r(pthread_self(), &s->rand);
287 
288 	prepped = 0;
289 	do {
290 		int to_wait, to_submit, this_reap, to_prep;
291 
292 		if (!prepped && s->inflight < DEPTH) {
293 			to_prep = min(DEPTH - s->inflight, BATCH_SUBMIT);
294 			prepped = prep_more_ios(s, to_prep);
295 		}
296 		s->inflight += prepped;
297 submit_more:
298 		to_submit = prepped;
299 submit:
300 		if (to_submit && (s->inflight + to_submit <= DEPTH))
301 			to_wait = 0;
302 		else
303 			to_wait = min(s->inflight + to_submit, BATCH_COMPLETE);
304 
305 		/*
306 		 * Only need to call io_uring_enter if we're not using SQ thread
307 		 * poll, or if IORING_SQ_NEED_WAKEUP is set.
308 		 */
309 		if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) {
310 			unsigned flags = 0;
311 
312 			if (to_wait)
313 				flags = IORING_ENTER_GETEVENTS;
314 			if ((*ring->flags & IORING_SQ_NEED_WAKEUP))
315 				flags |= IORING_ENTER_SQ_WAKEUP;
316 			ret = io_uring_enter(s->ring_fd, to_submit, to_wait,
317 						flags, NULL);
318 			s->calls++;
319 		}
320 
321 		/*
322 		 * For non SQ thread poll, we already got the events we needed
323 		 * through the io_uring_enter() above. For SQ thread poll, we
324 		 * need to loop here until we find enough events.
325 		 */
326 		this_reap = 0;
327 		do {
328 			int r;
329 			r = reap_events(s);
330 			if (r == -1) {
331 				s->finish = 1;
332 				break;
333 			} else if (r > 0)
334 				this_reap += r;
335 		} while (sq_thread_poll && this_reap < to_wait);
336 		s->reaps += this_reap;
337 
338 		if (ret >= 0) {
339 			if (!ret) {
340 				to_submit = 0;
341 				if (s->inflight)
342 					goto submit;
343 				continue;
344 			} else if (ret < to_submit) {
345 				int diff = to_submit - ret;
346 
347 				s->done += ret;
348 				prepped -= diff;
349 				goto submit_more;
350 			}
351 			s->done += ret;
352 			prepped = 0;
353 			continue;
354 		} else if (ret < 0) {
355 			if (errno == EAGAIN) {
356 				if (s->finish)
357 					break;
358 				if (this_reap)
359 					goto submit;
360 				to_submit = 0;
361 				goto submit;
362 			}
363 			printf("io_submit: %s\n", strerror(errno));
364 			break;
365 		}
366 	} while (!s->finish);
367 
368 	finish = 1;
369 	return NULL;
370 }
371 
sig_int(int sig)372 static void sig_int(int sig)
373 {
374 	printf("Exiting on signal %d\n", sig);
375 	submitters[0].finish = 1;
376 	finish = 1;
377 }
378 
arm_sig_int(void)379 static void arm_sig_int(void)
380 {
381 	struct sigaction act;
382 
383 	memset(&act, 0, sizeof(act));
384 	act.sa_handler = sig_int;
385 	act.sa_flags = SA_RESTART;
386 	sigaction(SIGINT, &act, NULL);
387 }
388 
setup_ring(struct submitter * s)389 static int setup_ring(struct submitter *s)
390 {
391 	struct io_sq_ring *sring = &s->sq_ring;
392 	struct io_cq_ring *cring = &s->cq_ring;
393 	struct io_uring_params p;
394 	int ret, fd;
395 	void *ptr;
396 
397 	memset(&p, 0, sizeof(p));
398 
399 	if (polled && !do_nop)
400 		p.flags |= IORING_SETUP_IOPOLL;
401 	if (sq_thread_poll) {
402 		p.flags |= IORING_SETUP_SQPOLL;
403 		if (sq_thread_cpu != -1) {
404 			p.flags |= IORING_SETUP_SQ_AFF;
405 			p.sq_thread_cpu = sq_thread_cpu;
406 		}
407 	}
408 
409 	fd = io_uring_setup(DEPTH, &p);
410 	if (fd < 0) {
411 		perror("io_uring_setup");
412 		return 1;
413 	}
414 	s->ring_fd = fd;
415 
416 	if (fixedbufs) {
417 		ret = io_uring_register_buffers(s);
418 		if (ret < 0) {
419 			perror("io_uring_register_buffers");
420 			return 1;
421 		}
422 	}
423 
424 	if (register_files) {
425 		ret = io_uring_register_files(s);
426 		if (ret < 0) {
427 			perror("io_uring_register_files");
428 			return 1;
429 		}
430 	}
431 
432 	ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
433 			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
434 			IORING_OFF_SQ_RING);
435 	printf("sq_ring ptr = 0x%p\n", ptr);
436 	sring->head = ptr + p.sq_off.head;
437 	sring->tail = ptr + p.sq_off.tail;
438 	sring->ring_mask = ptr + p.sq_off.ring_mask;
439 	sring->ring_entries = ptr + p.sq_off.ring_entries;
440 	sring->flags = ptr + p.sq_off.flags;
441 	sring->array = ptr + p.sq_off.array;
442 	sq_ring_mask = *sring->ring_mask;
443 
444 	s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
445 			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
446 			IORING_OFF_SQES);
447 	printf("sqes ptr    = 0x%p\n", s->sqes);
448 
449 	ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
450 			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
451 			IORING_OFF_CQ_RING);
452 	printf("cq_ring ptr = 0x%p\n", ptr);
453 	cring->head = ptr + p.cq_off.head;
454 	cring->tail = ptr + p.cq_off.tail;
455 	cring->ring_mask = ptr + p.cq_off.ring_mask;
456 	cring->ring_entries = ptr + p.cq_off.ring_entries;
457 	cring->cqes = ptr + p.cq_off.cqes;
458 	cq_ring_mask = *cring->ring_mask;
459 	return 0;
460 }
461 
file_depths(char * buf)462 static void file_depths(char *buf)
463 {
464 	struct submitter *s = &submitters[0];
465 	unsigned i;
466 	char *p;
467 
468 	buf[0] = '\0';
469 	p = buf;
470 	for (i = 0; i < s->nr_files; i++) {
471 		struct file *f = &s->files[i];
472 
473 		if (i + 1 == s->nr_files)
474 			p += sprintf(p, "%d", f->pending_ios);
475 		else
476 			p += sprintf(p, "%d, ", f->pending_ios);
477 	}
478 }
479 
main(int argc,char * argv[])480 int main(int argc, char *argv[])
481 {
482 	struct submitter *s = &submitters[0];
483 	unsigned long done, calls, reap;
484 	int err, i, flags, fd;
485 	char *fdepths;
486 	void *ret;
487 
488 	if (!do_nop && argc < 2) {
489 		printf("%s: filename\n", argv[0]);
490 		return 1;
491 	}
492 
493 	flags = O_RDONLY | O_NOATIME;
494 	if (!buffered)
495 		flags |= O_DIRECT;
496 
497 	i = 1;
498 	while (!do_nop && i < argc) {
499 		struct file *f;
500 
501 		if (s->nr_files == MAX_FDS) {
502 			printf("Max number of files (%d) reached\n", MAX_FDS);
503 			break;
504 		}
505 		fd = open(argv[i], flags);
506 		if (fd < 0) {
507 			perror("open");
508 			return 1;
509 		}
510 
511 		f = &s->files[s->nr_files];
512 		f->real_fd = fd;
513 		if (get_file_size(f)) {
514 			printf("failed getting size of device/file\n");
515 			return 1;
516 		}
517 		if (f->max_blocks <= 1) {
518 			printf("Zero file/device size?\n");
519 			return 1;
520 		}
521 		f->max_blocks--;
522 
523 		printf("Added file %s\n", argv[i]);
524 		s->nr_files++;
525 		i++;
526 	}
527 
528 	if (fixedbufs) {
529 		struct rlimit rlim;
530 
531 		rlim.rlim_cur = RLIM_INFINITY;
532 		rlim.rlim_max = RLIM_INFINITY;
533 		if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
534 			perror("setrlimit");
535 			return 1;
536 		}
537 	}
538 
539 	arm_sig_int();
540 
541 	for (i = 0; i < DEPTH; i++) {
542 		void *buf;
543 
544 		if (posix_memalign(&buf, BS, BS)) {
545 			printf("failed alloc\n");
546 			return 1;
547 		}
548 		s->iovecs[i].iov_base = buf;
549 		s->iovecs[i].iov_len = BS;
550 	}
551 
552 	err = setup_ring(s);
553 	if (err) {
554 		printf("ring setup failed: %s, %d\n", strerror(errno), err);
555 		return 1;
556 	}
557 	printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered);
558 	printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", DEPTH, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
559 
560 	pthread_create(&s->thread, NULL, submitter_fn, s);
561 
562 	fdepths = malloc(8 * s->nr_files);
563 	reap = calls = done = 0;
564 	do {
565 		unsigned long this_done = 0;
566 		unsigned long this_reap = 0;
567 		unsigned long this_call = 0;
568 		unsigned long rpc = 0, ipc = 0;
569 
570 		sleep(1);
571 		this_done += s->done;
572 		this_call += s->calls;
573 		this_reap += s->reaps;
574 		if (this_call - calls) {
575 			rpc = (this_done - done) / (this_call - calls);
576 			ipc = (this_reap - reap) / (this_call - calls);
577 		} else
578 			rpc = ipc = -1;
579 		file_depths(fdepths);
580 		printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n",
581 				this_done - done, rpc, ipc, s->inflight,
582 				fdepths);
583 		done = this_done;
584 		calls = this_call;
585 		reap = this_reap;
586 	} while (!finish);
587 
588 	pthread_join(s->thread, &ret);
589 	close(s->ring_fd);
590 	free(fdepths);
591 	return 0;
592 }
593