/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>

#include <net/pkt_sched.h>

#define qdisc_priv(q)	((void *)((q)->data))

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full-blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can also be loaded from a table to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines on top of it. Nor does it need to do
	 bandwidth control, since that can be handled by using token
	 bucket or other rate control.

	 The simulator is limited by the Linux timer resolution
	 and will create packet bursts on the HZ boundary (1ms).
*/

struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct sk_buff_head delayed;
	struct timer_list timer;

	u32 latency;
	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 jitter;
	u32 duplicate;

	struct crndstate {
		unsigned long last;
		unsigned long rho;
	} delay_cor, loss_cor, dup_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static unsigned long get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
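	/* Fixed-point blend of the fresh value with the previous one:
	 *   answer = ((2^32 - rho)*value + rho*last) >> 32
	 * i.e. approximately (1 - p)*value + p*last with p = rho/2^32.
	 */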
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static long tabledist(unsigned long mu, long sigma,
		      struct crndstate *state, const struct disttable *dist)
{
	long t, x;
	unsigned long rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

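	/* Table entries are scaled by NETEM_DIST_SCALE.  Compute
	 * sigma*t/NETEM_DIST_SCALE in two pieces to avoid overflowing
	 * the multiply, rounding the remainder term to nearest.
	 */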
	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

/* Put skb in the private delayed queue. */
static int delay_skb(struct Qdisc *sch, struct sk_buff *skb)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
	psched_tdiff_t td;
	psched_time_t now;

	PSCHED_GET_TIME(now);
	td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
	PSCHED_TADD2(now, td, cb->time_to_send);

	/* Always queue at tail to keep packets in order */
	if (likely(q->delayed.qlen < q->limit)) {
		__skb_queue_tail(&q->delayed, skb);
		sch->stats.bytes += skb->len;
		sch->stats.packets++;

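		/* Arm the watchdog only when idle; a pending timer will
		 * rescan the delayed queue when it fires.
		 */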
		if (!timer_pending(&q->timer)) {
			q->timer.expires = jiffies + PSCHED_US2JIFFIE(td);
			add_timer(&q->timer);
		}
		return NET_XMIT_SUCCESS;
	}

	sch->stats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies);

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
		pr_debug("netem_enqueue: random loss\n");
		sch->stats.drops++;
		kfree_skb(skb);
		return 0;	/* lie about loss so TCP doesn't know */
	}

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

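		/* The clone bypasses the gap logic and goes straight to
		 * the delayed queue; it is not counted in sch->q.qlen
		 * until the watchdog moves it to the inner qdisc.
		 */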
		pr_debug("netem_enqueue: dup %p\n", skb2);
		if (skb2)
			delay_skb(sch, skb2);
	}

	/* If doing simple delay then gap == 0, so all packets
	 * go into the delayed holding queue.  Otherwise, when
	 * emulating out-of-order delivery, only "1 out of gap"
	 * packets will be delayed.
	 */
	if (q->counter < q->gap) {
		int ret;

		++q->counter;
		ret = q->qdisc->enqueue(skb, q->qdisc);
		if (likely(ret == NET_XMIT_SUCCESS)) {
			sch->q.qlen++;
			sch->stats.bytes += skb->len;
			sch->stats.packets++;
		} else
			sch->stats.drops++;
		return ret;
	}

	q->counter = 0;

	return delay_skb(sch, skb);
}

/* Requeue packets but don't change time stamp */
static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0)
		sch->q.qlen++;

	return ret;
}

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->stats.drops++;
	}
	return len;
}

/* Dequeue packet.
 * Packets that have become ready to send are moved from the delay
 * holding list to the underlying qdisc by the watchdog timer, so
 * here we only need to dequeue from that qdisc.
 */
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);
	if (skb)
		sch->q.qlen--;
	return skb;
}

static void netem_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct netem_sched_data *q = qdisc_priv(sch);
	struct net_device *dev = sch->dev;
	struct sk_buff *skb;
	psched_time_t now;

	pr_debug("netem_watchdog: fired @%lu\n", jiffies);

	spin_lock_bh(&dev->queue_lock);
	PSCHED_GET_TIME(now);

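	/* The delayed queue is kept in arrival order, so stop at the
	 * first packet that is not yet due; with jitter this may hold
	 * back packets that are already due, which preserves ordering.
	 */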
	while ((skb = skb_peek(&q->delayed)) != NULL) {
		const struct netem_skb_cb *cb
			= (const struct netem_skb_cb *)skb->cb;
		long delay
			= PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
		pr_debug("netem_watchdog: skb %p@%lu %ld\n",
			 skb, jiffies, delay);

		/* more time remaining? */
		if (delay > 0) {
			mod_timer(&q->timer, jiffies + delay);
			break;
		}
		__skb_unlink(skb, &q->delayed);

		if (q->qdisc->enqueue(skb, q->qdisc))
			sch->stats.drops++;
		else
			sch->q.qlen++;
	}
	qdisc_run(dev);
	spin_unlock_bh(&dev->queue_lock);
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	skb_queue_purge(&q->delayed);

	sch->q.qlen = 0;
	del_timer_sync(&q->timer);
}

static int set_fifo_limit(struct Qdisc *q, int limit)
{
	struct rtattr *rta;
	int ret = -ENOMEM;

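	/* Hand-build a netlink attribute so the inner fifo can be
	 * resized through its normal ->change() interface.
	 */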
	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
	if (rta) {
		rta->rta_type = RTM_NEWQDISC;
		rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
		((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

		ret = q->ops->change(q, rta);
		kfree(rta);
	}
	return ret;
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
	const __s16 *data = RTA_DATA(attr);
	struct disttable *d;
	int i;

	if (n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

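	/* Swap the new table in under the queue lock so concurrent
	 * enqueues never see a partially filled table, then free the
	 * old one (kfree(NULL) is safe on first use).
	 */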
	spin_lock_bh(&sch->dev->queue_lock);
	d = xchg(&q->delay_dist, d);
	spin_unlock_bh(&sch->dev->queue_lock);

	kfree(d);
	return 0;
}

static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*c))
		return -EINVAL;

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
	return 0;
}

static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = RTA_DATA(opt);
	ret = set_fifo_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* Handle nested options after initial queue options.
	 * Should have put all options in nested format but too late now.
	 */
	if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
		struct rtattr *tb[TCA_NETEM_MAX];
		if (rtattr_parse(tb, TCA_NETEM_MAX,
				 RTA_DATA(opt) + sizeof(*qopt),
				 RTA_PAYLOAD(opt) - sizeof(*qopt)))
			return -EINVAL;

		if (tb[TCA_NETEM_CORR-1]) {
			ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_DELAY_DIST-1]) {
			ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
			if (ret)
				return ret;
		}
	}

	return 0;
}

static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	MOD_INC_USE_COUNT;
	skb_queue_head_init(&q->delayed);
	init_timer(&q->timer);
	q->timer.function = netem_watchdog;
	q->timer.data = (unsigned long) sch;
	q->counter = 0;

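	/* The inner qdisc holds packets that are ready to send: those
	 * whose delay has expired plus any sent immediately by the gap
	 * logic.
	 */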
	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		MOD_DEC_USE_COUNT;
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
		MOD_DEC_USE_COUNT;
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	del_timer_sync(&q->timer);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
	MOD_DEC_USE_COUNT;
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb->tail;
	struct rtattr *rta = (struct rtattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
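	/* RTA_PUT jumps to rtattr_failure if the skb runs out of room */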
	RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_reset(*old);
	sch->q.qlen = 0;
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			      struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int netem_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.change		=	netem_change_class,
	.delete		=	netem_delete,
	.walk		=	netem_walk,
	.tcf_chain	=	netem_find_tcf,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.requeue	=	netem_requeue,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
};


static int __init netem_module_init(void)
{
	return register_qdisc(&netem_qdisc_ops);
}

static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}

module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");