/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the
   times determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform some sanity
   checks and the parts of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All the really intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears
   all timers, counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime
   of the qdisc.

   ---change

   changes qdisc parameters.
 */
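
/*
   To make the enqueue/dequeue contract above concrete, here is a
   minimal sketch of a FIFO that obeys it, written against the stock
   helpers from <net/sch_generic.h> (the real in-tree examples are
   pfifo/bfifo in sch_fifo.c; the "minimal_*" names are hypothetical):

	static int minimal_enqueue(struct sk_buff *skb, struct Qdisc *sch)
	{
		return qdisc_enqueue_tail(skb, sch);	appends skb,
			updates q.qlen, returns NET_XMIT_SUCCESS (0)
	}

	static struct sk_buff *minimal_dequeue(struct Qdisc *sch)
	{
		return qdisc_dequeue_head(sch);		pops the head,
			decrements q.qlen, NULL when the queue is empty
	}
 */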

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
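
/*
 * Typical usage (a sketch; "foo" and the foo_* callbacks are
 * hypothetical, see sch_fifo.c or sch_prio.c for real registrations):
 *
 *	static struct Qdisc_ops foo_qdisc_ops __read_mostly = {
 *		.id		= "foo",
 *		.priv_size	= sizeof(struct foo_sched_data),
 *		.enqueue	= foo_enqueue,
 *		.dequeue	= foo_dequeue,
 *		.peek		= qdisc_peek_dequeued,
 *		.init		= foo_init,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init foo_module_init(void)
 *	{
 *		return register_qdisc(&foo_qdisc_ops);
 *	}
 *
 * Note the rule enforced above: an ops with a private ->dequeue must
 * also provide ->peek (qdisc_peek_dequeued() is the stock helper).
 */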

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
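
/*
 * A rate table maps packet size to transmission time: rtab->data[]
 * holds 256 precomputed u32 slots (TC_RTAB_SIZE == 1024 bytes) filled
 * in by userspace tc, and a qdisc converts a length to psched ticks
 * roughly as
 *
 *	time = rtab->data[(len + overhead) >> rtab->rate.cell_log];
 *
 * (a simplified sketch of what qdisc_l2t() from <net/sch_generic.h>
 * does). Identical tc_ratespec requests share one refcounted table.
 */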

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
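
/*
 * Worked example with hypothetical numbers: for a size table with
 * overhead = 4, cell_align = -1, cell_log = 6, size_log = 6 and a
 * 253 byte skb, pkt_len starts as 253 + 4 = 257, the slot index is
 * (257 - 1) >> 6 = 4, and stab->data[4] << 6 becomes the length
 * actually charged to the qdisc. Slots at or beyond tsize are
 * extrapolated linearly from the last table entries, as coded above.
 * This is how link-layer framing such as ATM cell padding is modelled.
 */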

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	qdisc_unthrottled(wd->qdisc);
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	qdisc_throttled(wd->qdisc);
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
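
/*
 * Usage sketch: a shaping qdisc whose ->dequeue decides that the next
 * packet may not be sent before time "expires" arms the watchdog
 * instead of returning an skb:
 *
 *	qdisc_watchdog_schedule(&q->watchdog, expires);
 *	return NULL;
 *
 * When the hrtimer fires, qdisc_watchdog() above clears the throttled
 * state and reschedules the root qdisc so ->dequeue runs again (see
 * sch_tbf.c for a real user).
 */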

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
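
/*
 * Usage sketch for the class hash helpers: a classful qdisc calls
 * qdisc_class_hash_init() from ->init, qdisc_class_hash_insert()
 * followed by qdisc_class_hash_grow() when a class is created, and
 * looks classes up with qdisc_class_find() from <net/sch_generic.h>;
 * sch_htb.c and sch_hfsc.c are real users.
 */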

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while	(qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
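
/*
 * Automatically allocated handles live in the major space 8000:0 and
 * up: the first qdisc created without an explicit handle typically
 * becomes 8001:0, the next 8002:0, and so on; the search gives up
 * after 0x10000 occupied slots.
 */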

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
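
/*
 * Example: when a child qdisc drops n packets outside the normal
 * dequeue path (a queue-limit change, a RED-style early drop, etc.),
 * it calls qdisc_tree_decrease_qlen(sch, n) so that q.qlen stays
 * consistent in every ancestor; qlen_notify gives classful parents a
 * chance to deactivate a class that has just become empty.
 */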

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via the tca netlink attribute array.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again with qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			rcu_assign_pointer(sch->stab, stab);
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require an ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because the change cannot be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a
				 *   new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requester wanted
				 *   to say that the qdisc tcm_handle is not
				 *   expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
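	/* For example, "tc class add dev eth0 parent 1:1 classid 1:10 ..."
	 * arrives here with tcm_parent == 1:1 and tcm_handle == 1:10,
	 * both sharing the major number of qdisc 1:0.
	 */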

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with
		 * both parent and child.
		 *
		 * TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
 * to this qdisc, (optionally) tests for the protocol, and asks
 * the specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err;

	for (; tp; tp = tp->next) {
		if (tp->protocol != protocol &&
		    tp->protocol != htons(ETH_P_ALL))
			continue;
		err = tp->classify(skb, tp, res);

		if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	__be16 protocol;
	struct tcf_proto *otp = tp;
reclassify:
	protocol = skb->protocol;
#endif

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			if (net_ratelimit())
				pr_notice("%s: packet reclassify loop"
					  " rule prio %u protocol %02x\n",
					  tp->q->ops->id,
					  tp->prio & 0xffff,
					  ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
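
/*
 * The four fields of /proc/net/psched are, in order: nanoseconds per
 * microsecond, nanoseconds per psched tick, a hardcoded 1000000
 * (historically the resolution of the gettimeofday-based clock), and
 * the hrtimer resolution expressed in Hz. Userspace tc parses these
 * to calibrate its tick <-> time conversions.
 */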

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_net_fops_create(net, "psched", 0, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	proc_net_remove(net, "psched");
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);