/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/kmod.h>
#include <linux/list.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something), in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks, to do the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by policing.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a once-dequeued packet. It is used for non-standard or
   just buggy devices, which can defer output even when dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
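
/* Illustrative sketch (an addition, not part of the original text): how a
 * caller is expected to honour the contract described above. "q" and
 * "skb" are assumed to exist in the caller's context:
 *
 *	int ret = q->enqueue(skb, q);
 *
 *	ret == 0		- accepted; nothing more to do.
 *	ret == NET_XMIT_DROP	- this skb was dropped; do not back off.
 *	ret == NET_XMIT_CN	- probably queued, another skb dropped.
 *	ret == NET_XMIT_POLICED	- dropped by policing.
 *
 *	skb = q->dequeue(q);	a NULL return does not imply an empty
 *				queue; only q->q.qlen == 0 does.
 */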

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base = NULL;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) {
		if (strcmp(qops->id, q->id) == 0) {
			write_unlock(&qdisc_mod_lock);
			return -EEXIST;
		}
	}

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	write_unlock(&qdisc_mod_lock);
	return 0;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
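
/* A minimal sketch (an addition; "example_qdisc_ops" is hypothetical) of
 * how a scheduler module would use the registration API above:
 *
 *	static struct Qdisc_ops example_qdisc_ops = { ... };
 *
 *	int init_module(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	void cleanup_module(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 */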

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (root qdisc, all its children, children of children, etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0)
				break;
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
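
/* A minimal sketch (an addition; "q", "rate" and "rate_attr" are
 * hypothetical) of the intended get/put pairing of the rate-table
 * helpers above, as a rate-limiting qdisc might use them in its init
 * and destroy paths:
 *
 *	q->rtab = qdisc_get_rtab(&rate, rate_attr);	reuse or allocate
 *	if (q->rtab == NULL)
 *		return -EINVAL;
 *	...
 *	qdisc_put_rtab(q->rtab);			drop the reference
 */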


/* Allocate a unique handle from the space managed by the kernel */

u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while	(qdisc_lookup(dev, autohandle) && --i > 0);

	return i>0 ? autohandle : 0;
}
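
/* Illustration (an added note): a handle is a 32-bit value carrying the
 * major number in its upper 16 bits and the minor number in its lower
 * 16, composed via TC_H_MAKE(), e.g.
 *
 *	u32 h = TC_H_MAKE(0x00010000U, 0);	the handle "1:0"
 *
 * hence the automatic handles above start at major 0x8000.
 */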

/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	spin_unlock_bh(&dev->queue_lock);
	write_unlock(&qdisc_tree_lock);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}


/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
		struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		if (q && q->flags&TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}

/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch = NULL;
	struct Qdisc_ops *ops;
	int size;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops==NULL && tca[TCA_KIND-1] != NULL) {
		char module_name[4 + IFNAMSIZ + 1];

		if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
			sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind));
			request_module(module_name);
			ops = qdisc_lookup_ops(kind);
		}
	}
#endif

	err = -EINVAL;
	if (ops == NULL)
		goto err_out;

	size = sizeof(*sch) + ops->priv_size;

	sch = kmalloc(size, GFP_KERNEL);
	err = -ENOBUFS;
	if (!sch)
		goto err_out;

	/* Grrr... Resolve race condition with module unload */

	err = -EINVAL;
	if (ops != qdisc_lookup_ops(kind))
		goto err_out;

	memset(sch, 0, size);

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);

	if (handle == TC_H_INGRESS)
		sch->flags |= TCQ_F_INGRESS;

	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	atomic_set(&sch->refcnt, 1);
	sch->stats.lock = &dev->queue_lock;
	if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out;
	}

	if (handle == TC_H_INGRESS)
		sch->handle = TC_H_MAKE(TC_H_INGRESS, 0);
	else
		sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
		write_lock(&qdisc_tree_lock);
		list_add_tail(&sch->list, &dev->qdisc_list);
		write_unlock(&qdisc_tree_lock);
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1])
			qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
#endif
		return sch;
	}

err_out:
	*errp = err;
	if (sch)
		kfree(sch);
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1]) {
		qdisc_kill_estimator(&sch->stats);
		qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
	}
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker 	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted to say
				 *   that the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is a sort of hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL)
		return err;

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}

int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st)
{
	spin_lock_bh(st->lock);
	RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st);
	spin_unlock_bh(st->lock);
	return 0;

rtattr_failure:
	spin_unlock_bh(st->lock);
	return -1;
}


static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, unsigned flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
	nlh->nlmsg_flags = flags;
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->stats.qlen = q->q.qlen;
	if (qdisc_copy_stats(skb, &q->stats))
		goto rtattr_failure;
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
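
	/*
	 * Worked example (an added illustration): with parent == 1:0 and
	 * handle == 0:10, Step 1 below resolves qid to 1:0, and the class
	 * finally addressed is 1:10.
	 */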

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;
	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, unsigned flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
	nlh->nlmsg_flags = flags;
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop  = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

int psched_us_per_tick = 1;
int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
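/*
 * Added note: the four hex words written below are the scheduler clock
 * parameters; userspace (e.g. the tc utility) reads /proc/net/psched
 * to convert between scheduler ticks and microseconds.
 */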
static int psched_read_proc(char *buffer, char **start, off_t offset,
			     int length, int *eof, void *data)
{
	int len;

	len = sprintf(buffer, "%08x %08x %08x %08x\n",
		      psched_tick_per_us, psched_us_per_tick,
		      1000000, HZ);

	len -= offset;

	if (len > length)
		len = length;
	if (len < 0)
		len = 0;

	*start = buffer + offset;
	*eof = 1;

	return len;
}
#endif

#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
int psched_tod_diff(int delta_sec, int bound)
{
	int delta;

	if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1)
		return bound;
	delta = delta_sec * 1000000;
	if (delta > bound)
		delta = bound;
	return delta;
}
#endif

psched_time_t psched_time_base;

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
#endif

#ifdef PSCHED_WATCHER
PSCHED_WATCHER psched_time_mark;

static void psched_tick(unsigned long);

static struct timer_list psched_timer =
	{ function: psched_tick };

static void psched_tick(unsigned long dummy)
{
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
	psched_time_t dummy_stamp;
	PSCHED_GET_TIME(dummy_stamp);
	/* It is OK up to 4GHz cpu */
	psched_timer.expires = jiffies + 1*HZ;
#else
	unsigned long now = jiffies;
	psched_time_base += ((u64)(now-psched_time_mark))<<PSCHED_JSCALE;
	psched_time_mark = now;
	psched_timer.expires = now + 60*60*HZ;
#endif
	add_timer(&psched_timer);
}
#endif

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

#ifdef PSCHED_WATCHER
	psched_tick(0);
#endif
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay>>=1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
#endif

int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#ifdef PSCHED_WATCHER
	psched_tick(0);
#endif
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Setup rtnetlink links. It is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

#define INIT_QDISC(name) { \
          extern struct Qdisc_ops name##_qdisc_ops; \
          register_qdisc(& name##_qdisc_ops);       \
	}

	INIT_QDISC(pfifo);
	INIT_QDISC(bfifo);

#ifdef CONFIG_NET_SCH_CBQ
	INIT_QDISC(cbq);
#endif
#ifdef CONFIG_NET_SCH_HTB
	INIT_QDISC(htb);
#endif
#ifdef CONFIG_NET_SCH_CSZ
	INIT_QDISC(csz);
#endif
#ifdef CONFIG_NET_SCH_HPFQ
	INIT_QDISC(hpfq);
#endif
#ifdef CONFIG_NET_SCH_HFSC
	INIT_QDISC(hfsc);
#endif
#ifdef CONFIG_NET_SCH_RED
	INIT_QDISC(red);
#endif
#ifdef CONFIG_NET_SCH_GRED
	INIT_QDISC(gred);
#endif
#ifdef CONFIG_NET_SCH_INGRESS
	INIT_QDISC(ingress);
#endif
#ifdef CONFIG_NET_SCH_DSMARK
	INIT_QDISC(dsmark);
#endif
#ifdef CONFIG_NET_SCH_SFQ
	INIT_QDISC(sfq);
#endif
#ifdef CONFIG_NET_SCH_TBF
	INIT_QDISC(tbf);
#endif
#ifdef CONFIG_NET_SCH_TEQL
	teql_init();
#endif
#ifdef CONFIG_NET_SCH_PRIO
	INIT_QDISC(prio);
#endif
#ifdef CONFIG_NET_SCH_ATM
	INIT_QDISC(atm);
#endif
#ifdef CONFIG_NET_CLS
	tc_filter_init();
#endif

#ifdef CONFIG_PROC_FS
	create_proc_read_entry("net/psched", 0, 0, psched_read_proc, NULL);
#endif

	return 0;
}