/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/kmod.h>
#include <linux/list.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work that is common to all
   qdiscs, and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but it does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a once-dequeued packet. It is used by non-standard or
   just buggy devices, which can defer output even if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
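
/* A minimal sketch (not part of the original file, hence guarded out) of
 * how a transmit path might act on the enqueue return codes described
 * above. The qdisc pointer and locking setup are assumed to match what
 * dev_queue_xmit() does; example_xmit() itself is hypothetical.
 */
#if 0
static int example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	int ret;

	spin_lock_bh(&dev->queue_lock);
	ret = q->enqueue(skb, q);	/* 0 on success, NET_XMIT_* on drop */
	qdisc_run(dev);			/* give the qdisc a chance to dequeue */
	spin_unlock_bh(&dev->queue_lock);

	if (ret == 0)
		return 0;		/* enqueued; nothing else to do */
	if (ret == NET_XMIT_CN)
		return 0;		/* probably enqueued; a different packet
					   was dropped: back off or ignore */
	return ret;			/* NET_XMIT_DROP / NET_XMIT_POLICED:
					   this packet was dropped */
}
#endif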

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base = NULL;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) {
		if (strcmp(qops->id, q->id) == 0) {
			write_unlock(&qdisc_mod_lock);
			return -EEXIST;
		}
	}

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	write_unlock(&qdisc_mod_lock);
	return 0;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}

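/* A minimal sketch (guarded out; not part of the original file) of how a
 * scheduler module is expected to use the pair above: register its
 * Qdisc_ops on load and unregister on unload. "example_qdisc_ops" is a
 * hypothetical ops table (real ones look like pfifo_qdisc_ops registered
 * below); linux/module.h is assumed to be included.
 */
#if 0
static struct Qdisc_ops example_qdisc_ops;	/* .id, .enqueue, ... filled in */

static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);	/* -EEXIST if id clashes */
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);		/* -ENOENT if not found */
}

module_init(example_module_init);
module_exit(example_module_exit);
#endif
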
/* We know the handle. Find the qdisc among all qdiscs attached to the device
   (root qdisc, all its children, children of children, etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0)
				break;
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}


/* Allocate a unique handle from the space managed by the kernel */

u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
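
/* For reference (an explanatory note, not from the original file): a qdisc
 * or class handle is a 32-bit value with the major number in the upper 16
 * bits and the minor number in the lower 16 bits, composed with
 * TC_H_MAKE(maj, min), where maj is already shifted. For example, the "tc"
 * handle 1:10 is TC_H_MAKE(0x00010000, 0x10) == 0x00010010, and a qdisc
 * handle always has minor 0. The allocator above therefore steps the major
 * number by one (adding 0x10000) until it finds a free handle in the
 * kernel-reserved 0x8000xxxx range.
 */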

/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {	/* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	spin_unlock_bh(&dev->queue_lock);
	write_unlock(&qdisc_tree_lock);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}


/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   Old qdisc is not destroyed but returned in *old.
 */

int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
		struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		if (q && q->flags&TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}
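
/* Note (explanatory, not from the original file): in the ingress branch
 * above, dev_graft_qdisc() is deliberately handed the *existing* qdisc q
 * rather than new; dev_graft_qdisc() then either prunes it (refcnt <= 1)
 * or re-installs it, so ingress replacement and deletion are funneled
 * through the same helper.
 */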

/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch = NULL;
	struct Qdisc_ops *ops;
	int size;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && tca[TCA_KIND-1] != NULL) {
		char module_name[4 + IFNAMSIZ + 1];

		if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
			sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind));
			request_module(module_name);
			ops = qdisc_lookup_ops(kind);
		}
	}
#endif

	err = -EINVAL;
	if (ops == NULL)
		goto err_out;

	size = sizeof(*sch) + ops->priv_size;

	sch = kmalloc(size, GFP_KERNEL);
	err = -ENOBUFS;
	if (!sch)
		goto err_out;

	/* Grrr... Resolve race condition with module unload */

	err = -EINVAL;
	if (ops != qdisc_lookup_ops(kind))
		goto err_out;

	memset(sch, 0, size);

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);

	if (handle == TC_H_INGRESS)
		sch->flags |= TCQ_F_INGRESS;

	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	atomic_set(&sch->refcnt, 1);
	sch->stats.lock = &dev->queue_lock;
	if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out;
	}

	if (handle == TC_H_INGRESS)
		sch->handle = TC_H_MAKE(TC_H_INGRESS, 0);
	else
		sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
		write_lock(&qdisc_tree_lock);
		list_add_tail(&sch->list, &dev->qdisc_list);
		write_unlock(&qdisc_tree_lock);
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1])
			qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
#endif
		return sch;
	}

err_out:
	*errp = err;
	if (sch)
		kfree(sch);
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1]) {
		qdisc_kill_estimator(&sch->stats);
		qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
	}
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker w;
	struct Qdisc *p;
	int depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child qdisc is already
				 * attached to this parent and we have a choice:
				 * either to change it or to create/graft a new one.
				 *
				 * 1. We are allowed to create/graft only
				 *    if both the CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, the requestor wanted to say
				 *    that the qdisc tcm_handle is not expected
				 *    to exist, so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 *    Alas, it is a sort of hole in the API; we
				 *    cannot decide what to do unambiguously.
				 *    For now we select create/graft if the
				 *    user gave a KIND that does not match the
				 *    existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL)
		return err;

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}

int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st)
{
	spin_lock_bh(st->lock);
	/* Dump everything in struct tc_stats up to (but excluding) the
	   kernel-internal lock pointer at its tail. */
	RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st);
	spin_unlock_bh(st->lock);
	return 0;

rtattr_failure:
	spin_unlock_bh(st->lock);
	return -1;
}


static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, unsigned flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
	nlh->nlmsg_flags = flags;
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->stats.qlen = q->q.qlen;
	if (qdisc_copy_stats(skb, &q->stats))
		goto rtattr_failure;
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
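
/* The resulting netlink message (an explanatory note, not from the
 * original file) is laid out as:
 *
 *	struct nlmsghdr		(type RTM_NEWQDISC or RTM_DELQDISC)
 *	struct tcmsg		(ifindex, parent, handle, refcnt in tcm_info)
 *	rtattr TCA_KIND		("pfifo", "htb", ...)
 *	rtattr TCA_OPTIONS	(optional; written by q->ops->dump)
 *	rtattr TCA_STATS	(struct tc_stats minus the lock pointer)
 */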

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - fully specified class X:Y.
	   handle == X:0	 - root class.
	 */
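
	/* A worked example (explanatory, not from the original file):
	 * "tc class add dev eth0 parent 1: classid 1:10" arrives here with
	 * tcm_parent = 0x00010000 (parent 1:0) and tcm_handle = 0x00010010
	 * (class 1:10; tc parses the minor as hex), so qid becomes
	 * 0x00010000 and the class is looked up inside qdisc 1:0.
	 */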

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle, consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate the qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, unsigned flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
	nlh->nlmsg_flags = flags;
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

int psched_us_per_tick = 1;
int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_read_proc(char *buffer, char **start, off_t offset,
			    int length, int *eof, void *data)
{
	int len;

	len = sprintf(buffer, "%08x %08x %08x %08x\n",
		      psched_tick_per_us, psched_us_per_tick,
		      1000000, HZ);

	len -= offset;

	if (len > length)
		len = length;
	if (len < 0)
		len = 0;

	*start = buffer + offset;
	*eof = 1;

	return len;
}
#endif

#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
int psched_tod_diff(int delta_sec, int bound)
{
	int delta;

	if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1)
		return bound;
	delta = delta_sec * 1000000;
	if (delta > bound)
		delta = bound;
	return delta;
}
#endif

psched_time_t psched_time_base;

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
#endif

#ifdef PSCHED_WATCHER
PSCHED_WATCHER psched_time_mark;

static void psched_tick(unsigned long);

static struct timer_list psched_timer =
	{ function: psched_tick };

static void psched_tick(unsigned long dummy)
{
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
	psched_time_t dummy_stamp;
	PSCHED_GET_TIME(dummy_stamp);
	/* It is OK up to 4GHz cpu */
	psched_timer.expires = jiffies + 1*HZ;
#else
	unsigned long now = jiffies;
	psched_time_base += ((u64)(now - psched_time_mark))<<PSCHED_JSCALE;
	psched_time_mark = now;
	psched_timer.expires = now + 60*60*HZ;
#endif
	add_timer(&psched_timer);
}
#endif

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

#ifdef PSCHED_WATCHER
	psched_tick(0);
#endif
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
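
/* Calibration arithmetic (an explanatory note, not from the original file):
 * the loop above busy-waits roughly HZ/10 jiffies while sampling both the
 * CPU timestamp clock and gettimeofday(), so delay/rdelay yields CPU ticks
 * per microsecond. For example, on a 500 MHz CPU a ~100 ms wait gives
 * delay ~= 50,000,000 ticks and rdelay ~= 100,000 us, so
 * psched_tick_per_us becomes ~500, and the shift loop picks
 * psched_clock_scale = 8, i.e. psched_us_per_tick = 256, the largest
 * power of two not exceeding it.
 */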
#endif

int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#ifdef PSCHED_WATCHER
	psched_tick(0);
#endif
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Set up the rtnetlink links. It is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

#define INIT_QDISC(name) { \
	extern struct Qdisc_ops name##_qdisc_ops; \
	register_qdisc(& name##_qdisc_ops); \
}
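
/* For illustration (an explanatory note, not from the original file),
 * INIT_QDISC(pfifo) expands to:
 *
 *	{
 *		extern struct Qdisc_ops pfifo_qdisc_ops;
 *		register_qdisc(&pfifo_qdisc_ops);
 *	}
 *
 * i.e. each built-in scheduler's ops table is declared locally and
 * registered without needing a global declaration in a header.
 */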

	INIT_QDISC(pfifo);
	INIT_QDISC(bfifo);

#ifdef CONFIG_NET_SCH_CBQ
	INIT_QDISC(cbq);
#endif
#ifdef CONFIG_NET_SCH_HTB
	INIT_QDISC(htb);
#endif
#ifdef CONFIG_NET_SCH_CSZ
	INIT_QDISC(csz);
#endif
#ifdef CONFIG_NET_SCH_HPFQ
	INIT_QDISC(hpfq);
#endif
#ifdef CONFIG_NET_SCH_HFSC
	INIT_QDISC(hfsc);
#endif
#ifdef CONFIG_NET_SCH_RED
	INIT_QDISC(red);
#endif
#ifdef CONFIG_NET_SCH_GRED
	INIT_QDISC(gred);
#endif
#ifdef CONFIG_NET_SCH_INGRESS
	INIT_QDISC(ingress);
#endif
#ifdef CONFIG_NET_SCH_DSMARK
	INIT_QDISC(dsmark);
#endif
#ifdef CONFIG_NET_SCH_SFQ
	INIT_QDISC(sfq);
#endif
#ifdef CONFIG_NET_SCH_TBF
	INIT_QDISC(tbf);
#endif
#ifdef CONFIG_NET_SCH_TEQL
	teql_init();
#endif
#ifdef CONFIG_NET_SCH_PRIO
	INIT_QDISC(prio);
#endif
#ifdef CONFIG_NET_SCH_ATM
	INIT_QDISC(atm);
#endif
#ifdef CONFIG_NET_CLS
	tc_filter_init();
#endif

#ifdef CONFIG_PROC_FS
	create_proc_read_entry("net/psched", 0, 0, psched_read_proc, NULL);
#endif

	return 0;
}
