1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_cbq.c	Class-Based Queueing discipline.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/slab.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/string.h>
13 #include <linux/errno.h>
14 #include <linux/skbuff.h>
15 #include <net/netlink.h>
16 #include <net/pkt_sched.h>
17 #include <net/pkt_cls.h>
18 
19 
20 /*	Class-Based Queueing (CBQ) algorithm.
21 	=======================================
22 
23 	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
24 		 Management Models for Packet Networks",
25 		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
26 
27 		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
28 
29 		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
30 		 Parameters", 1996
31 
32 		 [4] Sally Floyd and Michael Speer, "Experimental Results
33 		 for Class-Based Queueing", 1998, not published.
34 
35 	-----------------------------------------------------------------------
36 
37 	Algorithm skeleton was taken from NS simulator cbq.cc.
38 	If someone wants to check this code against the LBL version,
39 	they should take into account that ONLY the skeleton was borrowed;
40 	the implementation is different. In particular:
41 
42 	--- The WRR algorithm is different. Our version looks more
43 	reasonable (I hope) and works when quanta are allowed to be
44 	less than MTU, which is always the case when real time classes
45 	have small rates. Note that the statement in [3] is
46 	incomplete: the delay may actually be estimated even if the class
47 	per-round allotment is less than the MTU. Namely, if the per-round
48 	allotment is W*r_i, and r_1+...+r_k = r < 1, then
49 
50 	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
51 
52 	In the worst case we have IntServ estimate with D = W*r+k*MTU
53 	and C = MTU*r. The proof (if correct at all) is trivial.
54 
55 
56 	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
57 	interpret some places, which look like incorrect translations
58 	from NS. Anyone is welcome to find these differences
59 	and explain to me why I am wrong 8).
60 
61 	--- Linux has no EOI event, so we cannot estimate true class
62 	idle time. The workaround is to treat the next dequeue event
63 	as a sign that the previous packet has finished. This is wrong
64 	because of internal device queueing, but it holds on a permanently
65 	loaded link. Moreover, combined with the clock integrator, this
66 	scheme looks very close to an ideal solution.  */
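/* For illustration (the numbers are made up): take MTU = 1500 bytes, link
 * bandwidth B = 12.5 Mbyte/s (100 Mbit/s), k = 4 classes, per-round scale
 * W = 6000 bytes, r_i = 0.05 and r = r_1+...+r_k = 0.5.  The per-round
 * allotment W*r_i = 300 bytes is well below the MTU, [MTU/(W*r_i)] = 5, and
 * the bound above gives
 *
 *	delay_i <= (5*6000*0.5 + 6000*0.5 + 4*1500)/12.5e6 ~= 1.9 ms
 *
 * so the delay stays bounded even with a sub-MTU per-round allotment.
 */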
67 
68 struct cbq_sched_data;
69 
70 
71 struct cbq_class {
72 	struct Qdisc_class_common common;
73 	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
74 
75 /* Parameters */
76 	unsigned char		priority;	/* class priority */
77 	unsigned char		priority2;	/* priority to be used after overlimit */
78 	unsigned char		ewma_log;	/* time constant for idle time calculation */
79 
80 	u32			defmap;
81 
82 	/* Link-sharing scheduler parameters */
83 	long			maxidle;	/* Class parameters: see below. */
84 	long			offtime;
85 	long			minidle;
86 	u32			avpkt;
87 	struct qdisc_rate_table	*R_tab;
88 
89 	/* General scheduler (WRR) parameters */
90 	long			allot;
91 	long			quantum;	/* Allotment per WRR round */
92 	long			weight;		/* Relative allotment: see below */
93 
94 	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
95 	struct cbq_class	*split;		/* Ptr to split node */
96 	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
97 	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
98 	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
99 						   parent otherwise */
100 	struct cbq_class	*sibling;	/* Sibling chain */
101 	struct cbq_class	*children;	/* Pointer to children chain */
102 
103 	struct Qdisc		*q;		/* Elementary queueing discipline */
104 
105 
106 /* Variables */
107 	unsigned char		cpriority;	/* Effective priority */
108 	unsigned char		delayed;
109 	unsigned char		level;		/* level of the class in hierarchy:
110 						   0 for leaf classes, and maximal
111 						   level of children + 1 for nodes.
112 						 */
113 
114 	psched_time_t		last;		/* Last end of service */
115 	psched_time_t		undertime;
116 	long			avgidle;
117 	long			deficit;	/* Saved deficit for WRR */
118 	psched_time_t		penalized;
119 	struct gnet_stats_basic_sync bstats;
120 	struct gnet_stats_queue qstats;
121 	struct net_rate_estimator __rcu *rate_est;
122 	struct tc_cbq_xstats	xstats;
123 
124 	struct tcf_proto __rcu	*filter_list;
125 	struct tcf_block	*block;
126 
127 	int			filters;
128 
129 	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
130 };
131 
132 struct cbq_sched_data {
133 	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
134 	int			nclasses[TC_CBQ_MAXPRIO + 1];
135 	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
136 
137 	struct cbq_class	link;
138 
139 	unsigned int		activemask;
140 	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
141 								   with backlog */
142 
143 #ifdef CONFIG_NET_CLS_ACT
144 	struct cbq_class	*rx_class;
145 #endif
146 	struct cbq_class	*tx_class;
147 	struct cbq_class	*tx_borrowed;
148 	int			tx_len;
149 	psched_time_t		now;		/* Cached timestamp */
150 	unsigned int		pmask;
151 
152 	struct hrtimer		delay_timer;
153 	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
154 						   started when CBQ has
155 						   backlog, but cannot
156 						   transmit just now */
157 	psched_tdiff_t		wd_expires;
158 	int			toplevel;
159 	u32			hgenerator;
160 };
161 
162 
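/* L2T ("length to time"): the time, in psched ticks, needed to transmit a
 * len-byte packet at the class's configured rate, taken from its precomputed
 * rate table.
 */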
163 #define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
164 
165 static inline struct cbq_class *
166 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
167 {
168 	struct Qdisc_class_common *clc;
169 
170 	clc = qdisc_class_find(&q->clhash, classid);
171 	if (clc == NULL)
172 		return NULL;
173 	return container_of(clc, struct cbq_class, common);
174 }
175 
176 #ifdef CONFIG_NET_CLS_ACT
177 
178 static struct cbq_class *
179 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
180 {
181 	struct cbq_class *cl;
182 
183 	for (cl = this->tparent; cl; cl = cl->tparent) {
184 		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
185 
186 		if (new != NULL && new != this)
187 			return new;
188 	}
189 	return NULL;
190 }
191 
192 #endif
193 
194 /* Classify a packet. The procedure is fairly involved, but
195  * it allows us to combine link sharing and priority scheduling
196  * transparently.
197  *
198  * Namely, you can put link-sharing rules (e.g. route based) at the root of
199  * CBQ, so that classification resolves to split nodes. Then packets are
200  * classified by logical priority, or a more specific classifier may be
201  * attached to the split node.
202  */
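/* For example (a hypothetical setup): a filter attached to the root may return
 * a split node as its class; cbq_classify() then restarts at that node and
 * runs the classifiers attached to it.  If a classifier there returns a bare
 * priority rather than a full classid, the node's defaults[] table is used
 * (index = value & TC_PRIO_MAX, falling back to TC_PRIO_BESTEFFORT); if
 * classification fails completely, skb->priority is mapped the same way.
 * defaults[] itself is filled from the per-class "split"/"defmap" parameters
 * by cbq_sync_defmap().
 */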
203 
204 static struct cbq_class *
205 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
206 {
207 	struct cbq_sched_data *q = qdisc_priv(sch);
208 	struct cbq_class *head = &q->link;
209 	struct cbq_class **defmap;
210 	struct cbq_class *cl = NULL;
211 	u32 prio = skb->priority;
212 	struct tcf_proto *fl;
213 	struct tcf_result res;
214 
215 	/*
216 	 *  Step 1. If skb->priority points to one of our classes, use it.
217 	 */
218 	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
219 	    (cl = cbq_class_lookup(q, prio)) != NULL)
220 		return cl;
221 
222 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
223 	for (;;) {
224 		int result = 0;
225 		defmap = head->defaults;
226 
227 		fl = rcu_dereference_bh(head->filter_list);
228 		/*
229 		 * Step 2+n. Apply classifier.
230 		 */
231 		result = tcf_classify(skb, NULL, fl, &res, true);
232 		if (!fl || result < 0)
233 			goto fallback;
234 
235 		cl = (void *)res.class;
236 		if (!cl) {
237 			if (TC_H_MAJ(res.classid))
238 				cl = cbq_class_lookup(q, res.classid);
239 			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
240 				cl = defmap[TC_PRIO_BESTEFFORT];
241 
242 			if (cl == NULL)
243 				goto fallback;
244 		}
245 		if (cl->level >= head->level)
246 			goto fallback;
247 #ifdef CONFIG_NET_CLS_ACT
248 		switch (result) {
249 		case TC_ACT_QUEUED:
250 		case TC_ACT_STOLEN:
251 		case TC_ACT_TRAP:
252 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
253 			fallthrough;
254 		case TC_ACT_SHOT:
255 			return NULL;
256 		case TC_ACT_RECLASSIFY:
257 			return cbq_reclassify(skb, cl);
258 		}
259 #endif
260 		if (cl->level == 0)
261 			return cl;
262 
263 		/*
264 		 * Step 3+n. If classifier selected a link sharing class,
265 		 *	   apply agency specific classifier.
266 		 *	   Repeat this procedure until we hit a leaf node.
267 		 */
268 		head = cl;
269 	}
270 
271 fallback:
272 	cl = head;
273 
274 	/*
275 	 * Step 4. No success...
276 	 */
277 	if (TC_H_MAJ(prio) == 0 &&
278 	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
279 	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
280 		return head;
281 
282 	return cl;
283 }
284 
285 /*
286  * A packet has just been enqueued on a previously empty class.
287  * cbq_activate_class adds the class to the tail of the active class list
288  * of its priority band.
289  */
290 
291 static inline void cbq_activate_class(struct cbq_class *cl)
292 {
293 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
294 	int prio = cl->cpriority;
295 	struct cbq_class *cl_tail;
296 
297 	cl_tail = q->active[prio];
298 	q->active[prio] = cl;
299 
300 	if (cl_tail != NULL) {
301 		cl->next_alive = cl_tail->next_alive;
302 		cl_tail->next_alive = cl;
303 	} else {
304 		cl->next_alive = cl;
305 		q->activemask |= (1<<prio);
306 	}
307 }
308 
309 /*
310  * Unlink class from active chain.
311  * Note that the same unlinking is also done directly in cbq_dequeue*
312  * during the round-robin procedure.
313  */
314 
315 static void cbq_deactivate_class(struct cbq_class *this)
316 {
317 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
318 	int prio = this->cpriority;
319 	struct cbq_class *cl;
320 	struct cbq_class *cl_prev = q->active[prio];
321 
322 	do {
323 		cl = cl_prev->next_alive;
324 		if (cl == this) {
325 			cl_prev->next_alive = cl->next_alive;
326 			cl->next_alive = NULL;
327 
328 			if (cl == q->active[prio]) {
329 				q->active[prio] = cl_prev;
330 				if (cl == q->active[prio]) {
331 					q->active[prio] = NULL;
332 					q->activemask &= ~(1<<prio);
333 					return;
334 				}
335 			}
336 			return;
337 		}
338 	} while ((cl_prev = cl) != q->active[prio]);
339 }
340 
341 static void
342 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
343 {
344 	int toplevel = q->toplevel;
345 
346 	if (toplevel > cl->level) {
347 		psched_time_t now = psched_get_time();
348 
349 		do {
350 			if (cl->undertime < now) {
351 				q->toplevel = cl->level;
352 				return;
353 			}
354 		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
355 	}
356 }
357 
358 static int
359 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
360 	    struct sk_buff **to_free)
361 {
362 	struct cbq_sched_data *q = qdisc_priv(sch);
363 	int ret;
364 	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
365 
366 #ifdef CONFIG_NET_CLS_ACT
367 	q->rx_class = cl;
368 #endif
369 	if (cl == NULL) {
370 		if (ret & __NET_XMIT_BYPASS)
371 			qdisc_qstats_drop(sch);
372 		__qdisc_drop(skb, to_free);
373 		return ret;
374 	}
375 
376 	ret = qdisc_enqueue(skb, cl->q, to_free);
377 	if (ret == NET_XMIT_SUCCESS) {
378 		sch->q.qlen++;
379 		cbq_mark_toplevel(q, cl);
380 		if (!cl->next_alive)
381 			cbq_activate_class(cl);
382 		return ret;
383 	}
384 
385 	if (net_xmit_drop_count(ret)) {
386 		qdisc_qstats_drop(sch);
387 		cbq_mark_toplevel(q, cl);
388 		cl->qstats.drops++;
389 	}
390 	return ret;
391 }
392 
393 /* Overlimit action: penalize leaf class by adding offtime */
394 static void cbq_overlimit(struct cbq_class *cl)
395 {
396 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
397 	psched_tdiff_t delay = cl->undertime - q->now;
398 
399 	if (!cl->delayed) {
400 		delay += cl->offtime;
401 
402 		/*
403 		 * The class goes to sleep, so it will have no
404 		 * chance to work off its avgidle. Let's forgive it 8)
405 		 *
406 		 * BTW cbq-2.0 has a bug in this
407 		 * place: apparently they forgot to shift by cl->ewma_log.
408 		 */
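		/* For illustration (made-up numbers): with undertime - now =
		 * 500 ticks, offtime = 1000, avgidle = -640 and ewma_log = 5,
		 * the penalty below is 500 + 1000 - (640 - (640 >> 5)) = 880
		 * ticks, so the class sleeps until q->now + 880.
		 */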
409 		if (cl->avgidle < 0)
410 			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
411 		if (cl->avgidle < cl->minidle)
412 			cl->avgidle = cl->minidle;
413 		if (delay <= 0)
414 			delay = 1;
415 		cl->undertime = q->now + delay;
416 
417 		cl->xstats.overactions++;
418 		cl->delayed = 1;
419 	}
420 	if (q->wd_expires == 0 || q->wd_expires > delay)
421 		q->wd_expires = delay;
422 
423 	/* Dirty work! We must schedule wakeups based on
424 	 * real available rate, rather than leaf rate,
425 	 * which may be tiny (even zero).
426 	 */
427 	if (q->toplevel == TC_CBQ_MAXLEVEL) {
428 		struct cbq_class *b;
429 		psched_tdiff_t base_delay = q->wd_expires;
430 
431 		for (b = cl->borrow; b; b = b->borrow) {
432 			delay = b->undertime - q->now;
433 			if (delay < base_delay) {
434 				if (delay <= 0)
435 					delay = 1;
436 				base_delay = delay;
437 			}
438 		}
439 
440 		q->wd_expires = base_delay;
441 	}
442 }
443 
444 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
445 				       psched_time_t now)
446 {
447 	struct cbq_class *cl;
448 	struct cbq_class *cl_prev = q->active[prio];
449 	psched_time_t sched = now;
450 
451 	if (cl_prev == NULL)
452 		return 0;
453 
454 	do {
455 		cl = cl_prev->next_alive;
456 		if (now - cl->penalized > 0) {
457 			cl_prev->next_alive = cl->next_alive;
458 			cl->next_alive = NULL;
459 			cl->cpriority = cl->priority;
460 			cl->delayed = 0;
461 			cbq_activate_class(cl);
462 
463 			if (cl == q->active[prio]) {
464 				q->active[prio] = cl_prev;
465 				if (cl == q->active[prio]) {
466 					q->active[prio] = NULL;
467 					return 0;
468 				}
469 			}
470 
471 			cl = cl_prev->next_alive;
472 		} else if (sched - cl->penalized > 0)
473 			sched = cl->penalized;
474 	} while ((cl_prev = cl) != q->active[prio]);
475 
476 	return sched - now;
477 }
478 
479 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
480 {
481 	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
482 						delay_timer);
483 	struct Qdisc *sch = q->watchdog.qdisc;
484 	psched_time_t now;
485 	psched_tdiff_t delay = 0;
486 	unsigned int pmask;
487 
488 	now = psched_get_time();
489 
490 	pmask = q->pmask;
491 	q->pmask = 0;
492 
493 	while (pmask) {
494 		int prio = ffz(~pmask);
495 		psched_tdiff_t tmp;
496 
497 		pmask &= ~(1<<prio);
498 
499 		tmp = cbq_undelay_prio(q, prio, now);
500 		if (tmp > 0) {
501 			q->pmask |= 1<<prio;
502 			if (tmp < delay || delay == 0)
503 				delay = tmp;
504 		}
505 	}
506 
507 	if (delay) {
508 		ktime_t time;
509 
510 		time = 0;
511 		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
512 		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
513 	}
514 
515 	__netif_schedule(qdisc_root(sch));
516 	return HRTIMER_NORESTART;
517 }
518 
519 /*
520  * This is a mission-critical procedure.
521  *
522  * We "regenerate" the toplevel cutoff if the transmitting class
523  * has backlog and is not regulated. This is not part of the
524  * original CBQ description, but it looks more reasonable.
525  * It may well be wrong; the question needs further investigation.
526  */
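/*
 * (For context: q->toplevel caps how far up the tree borrowing may reach;
 * cbq_under_limit() refuses ancestors whose level exceeds it.  The cutoff is
 * lowered by cbq_mark_toplevel() at enqueue time and by cbq_update_toplevel()
 * below, and is reset to TC_CBQ_MAXLEVEL in cbq_dequeue() once every class is
 * found to be overlimit.)
 */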
527 
528 static inline void
529 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
530 		    struct cbq_class *borrowed)
531 {
532 	if (cl && q->toplevel >= borrowed->level) {
533 		if (cl->q->q.qlen > 1) {
534 			do {
535 				if (borrowed->undertime == PSCHED_PASTPERFECT) {
536 					q->toplevel = borrowed->level;
537 					return;
538 				}
539 			} while ((borrowed = borrowed->borrow) != NULL);
540 		}
541 #if 0
542 	/* It is not necessary now. Uncommenting it
543 	   will save CPU cycles, but decrease fairness.
544 	 */
545 		q->toplevel = TC_CBQ_MAXLEVEL;
546 #endif
547 	}
548 }
549 
550 static void
551 cbq_update(struct cbq_sched_data *q)
552 {
553 	struct cbq_class *this = q->tx_class;
554 	struct cbq_class *cl = this;
555 	int len = q->tx_len;
556 	psched_time_t now;
557 
558 	q->tx_class = NULL;
559 	/* Time integrator. We calculate EOS time
560 	 * by adding expected packet transmission time.
561 	 */
562 	now = q->now + L2T(&q->link, len);
563 
564 	for ( ; cl; cl = cl->share) {
565 		long avgidle = cl->avgidle;
566 		long idle;
567 
568 		_bstats_update(&cl->bstats, len, 1);
569 
570 		/*
571 		 * (now - last) is total time between packet right edges.
572 		 * (last_pktlen/rate) is "virtual" busy time, so that
573 		 *
574 		 *	idle = (now - last) - last_pktlen/rate
575 		 */
576 
577 		idle = now - cl->last;
578 		if ((unsigned long)idle > 128*1024*1024) {
579 			avgidle = cl->maxidle;
580 		} else {
581 			idle -= L2T(cl, len);
582 
583 		/* true_avgidle := (1-W)*true_avgidle + W*idle,
584 		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
585 		 * cl->avgidle == true_avgidle/W,
586 		 * hence:
587 		 */
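		/* For illustration (made-up numbers): with ewma_log = 5
		 * (W = 1/32), a stored avgidle of 0 and a measured idle of
		 * 64 ticks, the update below gives 0 + 64 - (0 >> 5) = 64,
		 * which is the scaled form of true_avgidle = W*64 = 2 ticks.
		 */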
588 			avgidle += idle - (avgidle>>cl->ewma_log);
589 		}
590 
591 		if (avgidle <= 0) {
592 			/* Overlimit or at-limit */
593 
594 			if (avgidle < cl->minidle)
595 				avgidle = cl->minidle;
596 
597 			cl->avgidle = avgidle;
598 
599 			/* Calculate the expected time when this class
600 			 * will be allowed to send.
601 			 * That happens when:
602 			 * (1-W)*true_avgidle + W*delay = 0, i.e.
603 			 * idle = (1/W - 1)*(-true_avgidle)
604 			 * or
605 			 * idle = (1 - W)*(-cl->avgidle);
606 			 */
607 			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
608 
609 			/*
610 			 * That is not all.
611 			 * To maintain the rate allocated to the class,
612 			 * we add to undertime the virtual clock time
613 			 * needed to complete the transmitted packet.
614 			 * (len/phys_bandwidth has already elapsed
615 			 * by the moment cbq_update runs.)
616 			 */
617 
618 			idle -= L2T(&q->link, len);
619 			idle += L2T(cl, len);
620 
621 			cl->undertime = now + idle;
622 		} else {
623 			/* Underlimit */
624 
625 			cl->undertime = PSCHED_PASTPERFECT;
626 			if (avgidle > cl->maxidle)
627 				cl->avgidle = cl->maxidle;
628 			else
629 				cl->avgidle = avgidle;
630 		}
631 		if ((s64)(now - cl->last) > 0)
632 			cl->last = now;
633 	}
634 
635 	cbq_update_toplevel(q, this, q->tx_borrowed);
636 }
637 
638 static inline struct cbq_class *
639 cbq_under_limit(struct cbq_class *cl)
640 {
641 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
642 	struct cbq_class *this_cl = cl;
643 
644 	if (cl->tparent == NULL)
645 		return cl;
646 
647 	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
648 		cl->delayed = 0;
649 		return cl;
650 	}
651 
652 	do {
653 		/* This is a very suspicious place. Currently the overlimit
654 		 * action is generated for non-bounded classes
655 		 * only if the link is completely congested.
656 		 * Though this agrees with the ancestor-only paradigm,
657 		 * it looks very unwise. In particular,
658 		 * it means that this chunk of code will either
659 		 * never be called or strongly amplify
660 		 * burstiness. Dangerous and silly, and yet
661 		 * no other solution exists.
662 		 */
663 		cl = cl->borrow;
664 		if (!cl) {
665 			this_cl->qstats.overlimits++;
666 			cbq_overlimit(this_cl);
667 			return NULL;
668 		}
669 		if (cl->level > q->toplevel)
670 			return NULL;
671 	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
672 
673 	cl->delayed = 0;
674 	return cl;
675 }
676 
677 static inline struct sk_buff *
678 cbq_dequeue_prio(struct Qdisc *sch, int prio)
679 {
680 	struct cbq_sched_data *q = qdisc_priv(sch);
681 	struct cbq_class *cl_tail, *cl_prev, *cl;
682 	struct sk_buff *skb;
683 	int deficit;
684 
685 	cl_tail = cl_prev = q->active[prio];
686 	cl = cl_prev->next_alive;
687 
688 	do {
689 		deficit = 0;
690 
691 		/* Start round */
692 		do {
693 			struct cbq_class *borrow = cl;
694 
695 			if (cl->q->q.qlen &&
696 			    (borrow = cbq_under_limit(cl)) == NULL)
697 				goto skip_class;
698 
699 			if (cl->deficit <= 0) {
700 				/* Class exhausted its allotment per
701 				 * this round. Switch to the next one.
702 				 */
703 				deficit = 1;
704 				cl->deficit += cl->quantum;
705 				goto next_class;
706 			}
707 
708 			skb = cl->q->dequeue(cl->q);
709 
710 			/* Class did not give us any skb :-(
711 			 * This can occur even if cl->q->q.qlen != 0,
712 			 * e.g. if cl->q == "tbf"
713 			 */
714 			if (skb == NULL)
715 				goto skip_class;
716 
717 			cl->deficit -= qdisc_pkt_len(skb);
718 			q->tx_class = cl;
719 			q->tx_borrowed = borrow;
720 			if (borrow != cl) {
721 #ifndef CBQ_XSTATS_BORROWS_BYTES
722 				borrow->xstats.borrows++;
723 				cl->xstats.borrows++;
724 #else
725 				borrow->xstats.borrows += qdisc_pkt_len(skb);
726 				cl->xstats.borrows += qdisc_pkt_len(skb);
727 #endif
728 			}
729 			q->tx_len = qdisc_pkt_len(skb);
730 
731 			if (cl->deficit <= 0) {
732 				q->active[prio] = cl;
733 				cl = cl->next_alive;
734 				cl->deficit += cl->quantum;
735 			}
736 			return skb;
737 
738 skip_class:
739 			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
740 				/* Class is empty or penalized.
741 				 * Unlink it from active chain.
742 				 */
743 				cl_prev->next_alive = cl->next_alive;
744 				cl->next_alive = NULL;
745 
746 				/* Did cl_tail point to it? */
747 				if (cl == cl_tail) {
748 					/* Repair it! */
749 					cl_tail = cl_prev;
750 
751 					/* Was it the last class in this band? */
752 					if (cl == cl_tail) {
753 						/* Kill the band! */
754 						q->active[prio] = NULL;
755 						q->activemask &= ~(1<<prio);
756 						if (cl->q->q.qlen)
757 							cbq_activate_class(cl);
758 						return NULL;
759 					}
760 
761 					q->active[prio] = cl_tail;
762 				}
763 				if (cl->q->q.qlen)
764 					cbq_activate_class(cl);
765 
766 				cl = cl_prev;
767 			}
768 
769 next_class:
770 			cl_prev = cl;
771 			cl = cl->next_alive;
772 		} while (cl_prev != cl_tail);
773 	} while (deficit);
774 
775 	q->active[prio] = cl_prev;
776 
777 	return NULL;
778 }
779 
780 static inline struct sk_buff *
781 cbq_dequeue_1(struct Qdisc *sch)
782 {
783 	struct cbq_sched_data *q = qdisc_priv(sch);
784 	struct sk_buff *skb;
785 	unsigned int activemask;
786 
787 	activemask = q->activemask & 0xFF;
788 	while (activemask) {
789 		int prio = ffz(~activemask);
790 		activemask &= ~(1<<prio);
791 		skb = cbq_dequeue_prio(sch, prio);
792 		if (skb)
793 			return skb;
794 	}
795 	return NULL;
796 }
797 
798 static struct sk_buff *
799 cbq_dequeue(struct Qdisc *sch)
800 {
801 	struct sk_buff *skb;
802 	struct cbq_sched_data *q = qdisc_priv(sch);
803 	psched_time_t now;
804 
805 	now = psched_get_time();
806 
807 	if (q->tx_class)
808 		cbq_update(q);
809 
810 	q->now = now;
811 
812 	for (;;) {
813 		q->wd_expires = 0;
814 
815 		skb = cbq_dequeue_1(sch);
816 		if (skb) {
817 			qdisc_bstats_update(sch, skb);
818 			sch->q.qlen--;
819 			return skb;
820 		}
821 
822 		/* All the classes are overlimit.
823 		 *
824 		 * This is possible if:
825 		 *
826 		 * 1. The scheduler is empty.
827 		 * 2. The toplevel cutoff inhibited borrowing.
828 		 * 3. The root class is overlimit.
829 		 *
830 		 * Reset the 2nd and 3rd conditions and retry.
831 		 *
832 		 * Note that NS and cbq-2.0 are buggy here: picking
833 		 * an arbitrary class is appropriate for ancestor-only
834 		 * sharing, but not for the toplevel algorithm.
835 		 *
836 		 * Our version is better but slower, because it requires
837 		 * two passes; that is unavoidable with top-level sharing.
838 		 */
839 
840 		if (q->toplevel == TC_CBQ_MAXLEVEL &&
841 		    q->link.undertime == PSCHED_PASTPERFECT)
842 			break;
843 
844 		q->toplevel = TC_CBQ_MAXLEVEL;
845 		q->link.undertime = PSCHED_PASTPERFECT;
846 	}
847 
848 	/* No packets in scheduler or nobody wants to give them to us :-(
849 	 * Sigh... start the watchdog timer in the latter case.
850 	 */
851 
852 	if (sch->q.qlen) {
853 		qdisc_qstats_overlimit(sch);
854 		if (q->wd_expires)
855 			qdisc_watchdog_schedule(&q->watchdog,
856 						now + q->wd_expires);
857 	}
858 	return NULL;
859 }
860 
861 /* CBQ class maintenance routines */
862 
863 static void cbq_adjust_levels(struct cbq_class *this)
864 {
865 	if (this == NULL)
866 		return;
867 
868 	do {
869 		int level = 0;
870 		struct cbq_class *cl;
871 
872 		cl = this->children;
873 		if (cl) {
874 			do {
875 				if (cl->level > level)
876 					level = cl->level;
877 			} while ((cl = cl->sibling) != this->children);
878 		}
879 		this->level = level + 1;
880 	} while ((this = this->tparent) != NULL);
881 }
882 
883 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
884 {
885 	struct cbq_class *cl;
886 	unsigned int h;
887 
888 	if (q->quanta[prio] == 0)
889 		return;
890 
891 	for (h = 0; h < q->clhash.hashsize; h++) {
892 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
893 			/* BUGGGG... Beware! This expression suffers from
894 			 * arithmetic overflow!
895 			 */
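			/* For illustration (made-up numbers): two classes at
			 * this prio, both with allot 1514, weights 2 and 6,
			 * so nclasses = 2 and quanta = 8, get quanta of
			 * (2*1514*2)/8 = 757 and (6*1514*2)/8 = 2271: the sum
			 * is nclasses*allot, split in proportion to weight.
			 */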
896 			if (cl->priority == prio) {
897 				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
898 					q->quanta[prio];
899 			}
900 			if (cl->quantum <= 0 ||
901 			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
902 				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
903 					cl->common.classid, cl->quantum);
904 				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
905 			}
906 		}
907 	}
908 }
909 
910 static void cbq_sync_defmap(struct cbq_class *cl)
911 {
912 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
913 	struct cbq_class *split = cl->split;
914 	unsigned int h;
915 	int i;
916 
917 	if (split == NULL)
918 		return;
919 
920 	for (i = 0; i <= TC_PRIO_MAX; i++) {
921 		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
922 			split->defaults[i] = NULL;
923 	}
924 
925 	for (i = 0; i <= TC_PRIO_MAX; i++) {
926 		int level = split->level;
927 
928 		if (split->defaults[i])
929 			continue;
930 
931 		for (h = 0; h < q->clhash.hashsize; h++) {
932 			struct cbq_class *c;
933 
934 			hlist_for_each_entry(c, &q->clhash.hash[h],
935 					     common.hnode) {
936 				if (c->split == split && c->level < level &&
937 				    c->defmap & (1<<i)) {
938 					split->defaults[i] = c;
939 					level = c->level;
940 				}
941 			}
942 		}
943 	}
944 }
945 
946 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
947 {
948 	struct cbq_class *split = NULL;
949 
950 	if (splitid == 0) {
951 		split = cl->split;
952 		if (!split)
953 			return;
954 		splitid = split->common.classid;
955 	}
956 
957 	if (split == NULL || split->common.classid != splitid) {
958 		for (split = cl->tparent; split; split = split->tparent)
959 			if (split->common.classid == splitid)
960 				break;
961 	}
962 
963 	if (split == NULL)
964 		return;
965 
966 	if (cl->split != split) {
967 		cl->defmap = 0;
968 		cbq_sync_defmap(cl);
969 		cl->split = split;
970 		cl->defmap = def & mask;
971 	} else
972 		cl->defmap = (cl->defmap & ~mask) | (def & mask);
973 
974 	cbq_sync_defmap(cl);
975 }
976 
977 static void cbq_unlink_class(struct cbq_class *this)
978 {
979 	struct cbq_class *cl, **clp;
980 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
981 
982 	qdisc_class_hash_remove(&q->clhash, &this->common);
983 
984 	if (this->tparent) {
985 		clp = &this->sibling;
986 		cl = *clp;
987 		do {
988 			if (cl == this) {
989 				*clp = cl->sibling;
990 				break;
991 			}
992 			clp = &cl->sibling;
993 		} while ((cl = *clp) != this->sibling);
994 
995 		if (this->tparent->children == this) {
996 			this->tparent->children = this->sibling;
997 			if (this->sibling == this)
998 				this->tparent->children = NULL;
999 		}
1000 	} else {
1001 		WARN_ON(this->sibling != this);
1002 	}
1003 }
1004 
1005 static void cbq_link_class(struct cbq_class *this)
1006 {
1007 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1008 	struct cbq_class *parent = this->tparent;
1009 
1010 	this->sibling = this;
1011 	qdisc_class_hash_insert(&q->clhash, &this->common);
1012 
1013 	if (parent == NULL)
1014 		return;
1015 
1016 	if (parent->children == NULL) {
1017 		parent->children = this;
1018 	} else {
1019 		this->sibling = parent->children->sibling;
1020 		parent->children->sibling = this;
1021 	}
1022 }
1023 
1024 static void
1025 cbq_reset(struct Qdisc *sch)
1026 {
1027 	struct cbq_sched_data *q = qdisc_priv(sch);
1028 	struct cbq_class *cl;
1029 	int prio;
1030 	unsigned int h;
1031 
1032 	q->activemask = 0;
1033 	q->pmask = 0;
1034 	q->tx_class = NULL;
1035 	q->tx_borrowed = NULL;
1036 	qdisc_watchdog_cancel(&q->watchdog);
1037 	hrtimer_cancel(&q->delay_timer);
1038 	q->toplevel = TC_CBQ_MAXLEVEL;
1039 	q->now = psched_get_time();
1040 
1041 	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1042 		q->active[prio] = NULL;
1043 
1044 	for (h = 0; h < q->clhash.hashsize; h++) {
1045 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1046 			qdisc_reset(cl->q);
1047 
1048 			cl->next_alive = NULL;
1049 			cl->undertime = PSCHED_PASTPERFECT;
1050 			cl->avgidle = cl->maxidle;
1051 			cl->deficit = cl->quantum;
1052 			cl->cpriority = cl->priority;
1053 		}
1054 	}
1055 	sch->q.qlen = 0;
1056 }
1057 
1058 
1059 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1060 {
1061 	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1062 		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1063 		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1064 	}
1065 	if (lss->change & TCF_CBQ_LSS_EWMA)
1066 		cl->ewma_log = lss->ewma_log;
1067 	if (lss->change & TCF_CBQ_LSS_AVPKT)
1068 		cl->avpkt = lss->avpkt;
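	/* Note: minidle is supplied by userspace as a positive value and is
	 * stored negated; cbq_dump_lss() negates it back when dumping.
	 */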
1069 	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1070 		cl->minidle = -(long)lss->minidle;
1071 	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1072 		cl->maxidle = lss->maxidle;
1073 		cl->avgidle = lss->maxidle;
1074 	}
1075 	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1076 		cl->offtime = lss->offtime;
1077 	return 0;
1078 }
1079 
1080 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1081 {
1082 	q->nclasses[cl->priority]--;
1083 	q->quanta[cl->priority] -= cl->weight;
1084 	cbq_normalize_quanta(q, cl->priority);
1085 }
1086 
1087 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1088 {
1089 	q->nclasses[cl->priority]++;
1090 	q->quanta[cl->priority] += cl->weight;
1091 	cbq_normalize_quanta(q, cl->priority);
1092 }
1093 
1094 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1095 {
1096 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1097 
1098 	if (wrr->allot)
1099 		cl->allot = wrr->allot;
1100 	if (wrr->weight)
1101 		cl->weight = wrr->weight;
1102 	if (wrr->priority) {
1103 		cl->priority = wrr->priority - 1;
1104 		cl->cpriority = cl->priority;
1105 		if (cl->priority >= cl->priority2)
1106 			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1107 	}
1108 
1109 	cbq_addprio(q, cl);
1110 	return 0;
1111 }
1112 
1113 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1114 {
1115 	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1116 	return 0;
1117 }
1118 
1119 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1120 	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1121 	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1122 	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1123 	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1124 	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1125 	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1126 	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1127 };
1128 
1129 static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1130 			 struct nlattr *opt,
1131 			 struct netlink_ext_ack *extack)
1132 {
1133 	int err;
1134 
1135 	if (!opt) {
1136 		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1137 		return -EINVAL;
1138 	}
1139 
1140 	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1141 					  cbq_policy, extack);
1142 	if (err < 0)
1143 		return err;
1144 
1145 	if (tb[TCA_CBQ_WRROPT]) {
1146 		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1147 
1148 		if (wrr->priority > TC_CBQ_MAXPRIO) {
1149 			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1150 			err = -EINVAL;
1151 		}
1152 	}
1153 	return err;
1154 }
1155 
1156 static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1157 		    struct netlink_ext_ack *extack)
1158 {
1159 	struct cbq_sched_data *q = qdisc_priv(sch);
1160 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1161 	struct tc_ratespec *r;
1162 	int err;
1163 
1164 	qdisc_watchdog_init(&q->watchdog, sch);
1165 	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1166 	q->delay_timer.function = cbq_undelay;
1167 
1168 	err = cbq_opt_parse(tb, opt, extack);
1169 	if (err < 0)
1170 		return err;
1171 
1172 	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1173 		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1174 		return -EINVAL;
1175 	}
1176 
1177 	r = nla_data(tb[TCA_CBQ_RATE]);
1178 
1179 	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1180 	if (!q->link.R_tab)
1181 		return -EINVAL;
1182 
1183 	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1184 	if (err)
1185 		goto put_rtab;
1186 
1187 	err = qdisc_class_hash_init(&q->clhash);
1188 	if (err < 0)
1189 		goto put_block;
1190 
1191 	q->link.sibling = &q->link;
1192 	q->link.common.classid = sch->handle;
1193 	q->link.qdisc = sch;
1194 	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1195 				      sch->handle, NULL);
1196 	if (!q->link.q)
1197 		q->link.q = &noop_qdisc;
1198 	else
1199 		qdisc_hash_add(q->link.q, true);
1200 
1201 	q->link.priority = TC_CBQ_MAXPRIO - 1;
1202 	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1203 	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1204 	q->link.allot = psched_mtu(qdisc_dev(sch));
1205 	q->link.quantum = q->link.allot;
1206 	q->link.weight = q->link.R_tab->rate.rate;
1207 
1208 	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1209 	q->link.avpkt = q->link.allot/2;
1210 	q->link.minidle = -0x7FFFFFFF;
1211 
1212 	q->toplevel = TC_CBQ_MAXLEVEL;
1213 	q->now = psched_get_time();
1214 
1215 	cbq_link_class(&q->link);
1216 
1217 	if (tb[TCA_CBQ_LSSOPT])
1218 		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1219 
1220 	cbq_addprio(q, &q->link);
1221 	return 0;
1222 
1223 put_block:
1224 	tcf_block_put(q->link.block);
1225 
1226 put_rtab:
1227 	qdisc_put_rtab(q->link.R_tab);
1228 	return err;
1229 }
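/* A typical configuration that exercises cbq_init() and cbq_change_class()
 * looks roughly like this (illustrative only; exact iproute2 syntax may vary):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:10 cbq bandwidth 100Mbit \
 *		rate 10Mbit allot 1514 prio 2 avpkt 1000 bounded
 *
 * tc turns "rate" into the TCA_CBQ_RATE/TCA_CBQ_RTAB pair, "allot"/"prio"/
 * "weight" into TCA_CBQ_WRROPT, and "bounded"/"isolated" into the
 * TCF_CBQ_LSS_* flags carried in TCA_CBQ_LSSOPT.
 */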
1230 
1231 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1232 {
1233 	unsigned char *b = skb_tail_pointer(skb);
1234 
1235 	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1236 		goto nla_put_failure;
1237 	return skb->len;
1238 
1239 nla_put_failure:
1240 	nlmsg_trim(skb, b);
1241 	return -1;
1242 }
1243 
1244 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1245 {
1246 	unsigned char *b = skb_tail_pointer(skb);
1247 	struct tc_cbq_lssopt opt;
1248 
1249 	opt.flags = 0;
1250 	if (cl->borrow == NULL)
1251 		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1252 	if (cl->share == NULL)
1253 		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1254 	opt.ewma_log = cl->ewma_log;
1255 	opt.level = cl->level;
1256 	opt.avpkt = cl->avpkt;
1257 	opt.maxidle = cl->maxidle;
1258 	opt.minidle = (u32)(-cl->minidle);
1259 	opt.offtime = cl->offtime;
1260 	opt.change = ~0;
1261 	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1262 		goto nla_put_failure;
1263 	return skb->len;
1264 
1265 nla_put_failure:
1266 	nlmsg_trim(skb, b);
1267 	return -1;
1268 }
1269 
1270 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1271 {
1272 	unsigned char *b = skb_tail_pointer(skb);
1273 	struct tc_cbq_wrropt opt;
1274 
1275 	memset(&opt, 0, sizeof(opt));
1276 	opt.flags = 0;
1277 	opt.allot = cl->allot;
1278 	opt.priority = cl->priority + 1;
1279 	opt.cpriority = cl->cpriority + 1;
1280 	opt.weight = cl->weight;
1281 	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1282 		goto nla_put_failure;
1283 	return skb->len;
1284 
1285 nla_put_failure:
1286 	nlmsg_trim(skb, b);
1287 	return -1;
1288 }
1289 
1290 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1291 {
1292 	unsigned char *b = skb_tail_pointer(skb);
1293 	struct tc_cbq_fopt opt;
1294 
1295 	if (cl->split || cl->defmap) {
1296 		opt.split = cl->split ? cl->split->common.classid : 0;
1297 		opt.defmap = cl->defmap;
1298 		opt.defchange = ~0;
1299 		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1300 			goto nla_put_failure;
1301 	}
1302 	return skb->len;
1303 
1304 nla_put_failure:
1305 	nlmsg_trim(skb, b);
1306 	return -1;
1307 }
1308 
1309 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1310 {
1311 	if (cbq_dump_lss(skb, cl) < 0 ||
1312 	    cbq_dump_rate(skb, cl) < 0 ||
1313 	    cbq_dump_wrr(skb, cl) < 0 ||
1314 	    cbq_dump_fopt(skb, cl) < 0)
1315 		return -1;
1316 	return 0;
1317 }
1318 
1319 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1320 {
1321 	struct cbq_sched_data *q = qdisc_priv(sch);
1322 	struct nlattr *nest;
1323 
1324 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1325 	if (nest == NULL)
1326 		goto nla_put_failure;
1327 	if (cbq_dump_attr(skb, &q->link) < 0)
1328 		goto nla_put_failure;
1329 	return nla_nest_end(skb, nest);
1330 
1331 nla_put_failure:
1332 	nla_nest_cancel(skb, nest);
1333 	return -1;
1334 }
1335 
1336 static int
1337 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1338 {
1339 	struct cbq_sched_data *q = qdisc_priv(sch);
1340 
1341 	q->link.xstats.avgidle = q->link.avgidle;
1342 	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1343 }
1344 
1345 static int
1346 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1347 	       struct sk_buff *skb, struct tcmsg *tcm)
1348 {
1349 	struct cbq_class *cl = (struct cbq_class *)arg;
1350 	struct nlattr *nest;
1351 
1352 	if (cl->tparent)
1353 		tcm->tcm_parent = cl->tparent->common.classid;
1354 	else
1355 		tcm->tcm_parent = TC_H_ROOT;
1356 	tcm->tcm_handle = cl->common.classid;
1357 	tcm->tcm_info = cl->q->handle;
1358 
1359 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1360 	if (nest == NULL)
1361 		goto nla_put_failure;
1362 	if (cbq_dump_attr(skb, cl) < 0)
1363 		goto nla_put_failure;
1364 	return nla_nest_end(skb, nest);
1365 
1366 nla_put_failure:
1367 	nla_nest_cancel(skb, nest);
1368 	return -1;
1369 }
1370 
1371 static int
1372 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1373 	struct gnet_dump *d)
1374 {
1375 	struct cbq_sched_data *q = qdisc_priv(sch);
1376 	struct cbq_class *cl = (struct cbq_class *)arg;
1377 	__u32 qlen;
1378 
1379 	cl->xstats.avgidle = cl->avgidle;
1380 	cl->xstats.undertime = 0;
1381 	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1382 
1383 	if (cl->undertime != PSCHED_PASTPERFECT)
1384 		cl->xstats.undertime = cl->undertime - q->now;
1385 
1386 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1387 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1388 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1389 		return -1;
1390 
1391 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1392 }
1393 
1394 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1395 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1396 {
1397 	struct cbq_class *cl = (struct cbq_class *)arg;
1398 
1399 	if (new == NULL) {
1400 		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1401 					cl->common.classid, extack);
1402 		if (new == NULL)
1403 			return -ENOBUFS;
1404 	}
1405 
1406 	*old = qdisc_replace(sch, new, &cl->q);
1407 	return 0;
1408 }
1409 
1410 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1411 {
1412 	struct cbq_class *cl = (struct cbq_class *)arg;
1413 
1414 	return cl->q;
1415 }
1416 
1417 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1418 {
1419 	struct cbq_class *cl = (struct cbq_class *)arg;
1420 
1421 	cbq_deactivate_class(cl);
1422 }
1423 
1424 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1425 {
1426 	struct cbq_sched_data *q = qdisc_priv(sch);
1427 
1428 	return (unsigned long)cbq_class_lookup(q, classid);
1429 }
1430 
1431 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1432 {
1433 	struct cbq_sched_data *q = qdisc_priv(sch);
1434 
1435 	WARN_ON(cl->filters);
1436 
1437 	tcf_block_put(cl->block);
1438 	qdisc_put(cl->q);
1439 	qdisc_put_rtab(cl->R_tab);
1440 	gen_kill_estimator(&cl->rate_est);
1441 	if (cl != &q->link)
1442 		kfree(cl);
1443 }
1444 
1445 static void cbq_destroy(struct Qdisc *sch)
1446 {
1447 	struct cbq_sched_data *q = qdisc_priv(sch);
1448 	struct hlist_node *next;
1449 	struct cbq_class *cl;
1450 	unsigned int h;
1451 
1452 #ifdef CONFIG_NET_CLS_ACT
1453 	q->rx_class = NULL;
1454 #endif
1455 	/*
1456 	 * Filters must be destroyed first because we don't destroy the
1457 	 * classes from root to leaves, which means that filters can still
1458 	 * be bound to classes which have been destroyed already. --TGR '04
1459 	 */
1460 	for (h = 0; h < q->clhash.hashsize; h++) {
1461 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1462 			tcf_block_put(cl->block);
1463 			cl->block = NULL;
1464 		}
1465 	}
1466 	for (h = 0; h < q->clhash.hashsize; h++) {
1467 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1468 					  common.hnode)
1469 			cbq_destroy_class(sch, cl);
1470 	}
1471 	qdisc_class_hash_destroy(&q->clhash);
1472 }
1473 
1474 static int
1475 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1476 		 unsigned long *arg, struct netlink_ext_ack *extack)
1477 {
1478 	int err;
1479 	struct cbq_sched_data *q = qdisc_priv(sch);
1480 	struct cbq_class *cl = (struct cbq_class *)*arg;
1481 	struct nlattr *opt = tca[TCA_OPTIONS];
1482 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1483 	struct cbq_class *parent;
1484 	struct qdisc_rate_table *rtab = NULL;
1485 
1486 	err = cbq_opt_parse(tb, opt, extack);
1487 	if (err < 0)
1488 		return err;
1489 
1490 	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1491 		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1492 		return -EOPNOTSUPP;
1493 	}
1494 
1495 	if (cl) {
1496 		/* Check parent */
1497 		if (parentid) {
1498 			if (cl->tparent &&
1499 			    cl->tparent->common.classid != parentid) {
1500 				NL_SET_ERR_MSG(extack, "Invalid parent id");
1501 				return -EINVAL;
1502 			}
1503 			if (!cl->tparent && parentid != TC_H_ROOT) {
1504 				NL_SET_ERR_MSG(extack, "Parent must be root");
1505 				return -EINVAL;
1506 			}
1507 		}
1508 
1509 		if (tb[TCA_CBQ_RATE]) {
1510 			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1511 					      tb[TCA_CBQ_RTAB], extack);
1512 			if (rtab == NULL)
1513 				return -EINVAL;
1514 		}
1515 
1516 		if (tca[TCA_RATE]) {
1517 			err = gen_replace_estimator(&cl->bstats, NULL,
1518 						    &cl->rate_est,
1519 						    NULL,
1520 						    true,
1521 						    tca[TCA_RATE]);
1522 			if (err) {
1523 				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1524 				qdisc_put_rtab(rtab);
1525 				return err;
1526 			}
1527 		}
1528 
1529 		/* Change class parameters */
1530 		sch_tree_lock(sch);
1531 
1532 		if (cl->next_alive != NULL)
1533 			cbq_deactivate_class(cl);
1534 
1535 		if (rtab) {
1536 			qdisc_put_rtab(cl->R_tab);
1537 			cl->R_tab = rtab;
1538 		}
1539 
1540 		if (tb[TCA_CBQ_LSSOPT])
1541 			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1542 
1543 		if (tb[TCA_CBQ_WRROPT]) {
1544 			cbq_rmprio(q, cl);
1545 			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1546 		}
1547 
1548 		if (tb[TCA_CBQ_FOPT])
1549 			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1550 
1551 		if (cl->q->q.qlen)
1552 			cbq_activate_class(cl);
1553 
1554 		sch_tree_unlock(sch);
1555 
1556 		return 0;
1557 	}
1558 
1559 	if (parentid == TC_H_ROOT)
1560 		return -EINVAL;
1561 
1562 	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1563 		NL_SET_ERR_MSG(extack, "The WRR, rate and link sharing (LSS) attributes must all be specified");
1564 		return -EINVAL;
1565 	}
1566 
1567 	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1568 			      extack);
1569 	if (rtab == NULL)
1570 		return -EINVAL;
1571 
1572 	if (classid) {
1573 		err = -EINVAL;
1574 		if (TC_H_MAJ(classid ^ sch->handle) ||
1575 		    cbq_class_lookup(q, classid)) {
1576 			NL_SET_ERR_MSG(extack, "Specified class not found");
1577 			goto failure;
1578 		}
1579 	} else {
1580 		int i;
1581 		classid = TC_H_MAKE(sch->handle, 0x8000);
1582 
1583 		for (i = 0; i < 0x8000; i++) {
1584 			if (++q->hgenerator >= 0x8000)
1585 				q->hgenerator = 1;
1586 			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1587 				break;
1588 		}
1589 		err = -ENOSR;
1590 		if (i >= 0x8000) {
1591 			NL_SET_ERR_MSG(extack, "Unable to generate classid");
1592 			goto failure;
1593 		}
1594 		classid = classid|q->hgenerator;
1595 	}
1596 
1597 	parent = &q->link;
1598 	if (parentid) {
1599 		parent = cbq_class_lookup(q, parentid);
1600 		err = -EINVAL;
1601 		if (!parent) {
1602 			NL_SET_ERR_MSG(extack, "Failed to find parentid");
1603 			goto failure;
1604 		}
1605 	}
1606 
1607 	err = -ENOBUFS;
1608 	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1609 	if (cl == NULL)
1610 		goto failure;
1611 
1612 	gnet_stats_basic_sync_init(&cl->bstats);
1613 	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1614 	if (err) {
1615 		kfree(cl);
1616 		goto failure;
1617 	}
1618 
1619 	if (tca[TCA_RATE]) {
1620 		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1621 					NULL, true, tca[TCA_RATE]);
1622 		if (err) {
1623 			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1624 			tcf_block_put(cl->block);
1625 			kfree(cl);
1626 			goto failure;
1627 		}
1628 	}
1629 
1630 	cl->R_tab = rtab;
1631 	rtab = NULL;
1632 	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1633 				  NULL);
1634 	if (!cl->q)
1635 		cl->q = &noop_qdisc;
1636 	else
1637 		qdisc_hash_add(cl->q, true);
1638 
1639 	cl->common.classid = classid;
1640 	cl->tparent = parent;
1641 	cl->qdisc = sch;
1642 	cl->allot = parent->allot;
1643 	cl->quantum = cl->allot;
1644 	cl->weight = cl->R_tab->rate.rate;
1645 
1646 	sch_tree_lock(sch);
1647 	cbq_link_class(cl);
1648 	cl->borrow = cl->tparent;
1649 	if (cl->tparent != &q->link)
1650 		cl->share = cl->tparent;
1651 	cbq_adjust_levels(parent);
1652 	cl->minidle = -0x7FFFFFFF;
1653 	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1654 	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1655 	if (cl->ewma_log == 0)
1656 		cl->ewma_log = q->link.ewma_log;
1657 	if (cl->maxidle == 0)
1658 		cl->maxidle = q->link.maxidle;
1659 	if (cl->avpkt == 0)
1660 		cl->avpkt = q->link.avpkt;
1661 	if (tb[TCA_CBQ_FOPT])
1662 		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1663 	sch_tree_unlock(sch);
1664 
1665 	qdisc_class_hash_grow(sch, &q->clhash);
1666 
1667 	*arg = (unsigned long)cl;
1668 	return 0;
1669 
1670 failure:
1671 	qdisc_put_rtab(rtab);
1672 	return err;
1673 }
1674 
1675 static int cbq_delete(struct Qdisc *sch, unsigned long arg,
1676 		      struct netlink_ext_ack *extack)
1677 {
1678 	struct cbq_sched_data *q = qdisc_priv(sch);
1679 	struct cbq_class *cl = (struct cbq_class *)arg;
1680 
1681 	if (cl->filters || cl->children || cl == &q->link)
1682 		return -EBUSY;
1683 
1684 	sch_tree_lock(sch);
1685 
1686 	qdisc_purge_queue(cl->q);
1687 
1688 	if (cl->next_alive)
1689 		cbq_deactivate_class(cl);
1690 
1691 	if (q->tx_borrowed == cl)
1692 		q->tx_borrowed = q->tx_class;
1693 	if (q->tx_class == cl) {
1694 		q->tx_class = NULL;
1695 		q->tx_borrowed = NULL;
1696 	}
1697 #ifdef CONFIG_NET_CLS_ACT
1698 	if (q->rx_class == cl)
1699 		q->rx_class = NULL;
1700 #endif
1701 
1702 	cbq_unlink_class(cl);
1703 	cbq_adjust_levels(cl->tparent);
1704 	cl->defmap = 0;
1705 	cbq_sync_defmap(cl);
1706 
1707 	cbq_rmprio(q, cl);
1708 	sch_tree_unlock(sch);
1709 
1710 	cbq_destroy_class(sch, cl);
1711 	return 0;
1712 }
1713 
1714 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1715 				       struct netlink_ext_ack *extack)
1716 {
1717 	struct cbq_sched_data *q = qdisc_priv(sch);
1718 	struct cbq_class *cl = (struct cbq_class *)arg;
1719 
1720 	if (cl == NULL)
1721 		cl = &q->link;
1722 
1723 	return cl->block;
1724 }
1725 
1726 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1727 				     u32 classid)
1728 {
1729 	struct cbq_sched_data *q = qdisc_priv(sch);
1730 	struct cbq_class *p = (struct cbq_class *)parent;
1731 	struct cbq_class *cl = cbq_class_lookup(q, classid);
1732 
1733 	if (cl) {
1734 		if (p && p->level <= cl->level)
1735 			return 0;
1736 		cl->filters++;
1737 		return (unsigned long)cl;
1738 	}
1739 	return 0;
1740 }
1741 
1742 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1743 {
1744 	struct cbq_class *cl = (struct cbq_class *)arg;
1745 
1746 	cl->filters--;
1747 }
1748 
1749 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1750 {
1751 	struct cbq_sched_data *q = qdisc_priv(sch);
1752 	struct cbq_class *cl;
1753 	unsigned int h;
1754 
1755 	if (arg->stop)
1756 		return;
1757 
1758 	for (h = 0; h < q->clhash.hashsize; h++) {
1759 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1760 			if (arg->count < arg->skip) {
1761 				arg->count++;
1762 				continue;
1763 			}
1764 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1765 				arg->stop = 1;
1766 				return;
1767 			}
1768 			arg->count++;
1769 		}
1770 	}
1771 }
1772 
1773 static const struct Qdisc_class_ops cbq_class_ops = {
1774 	.graft		=	cbq_graft,
1775 	.leaf		=	cbq_leaf,
1776 	.qlen_notify	=	cbq_qlen_notify,
1777 	.find		=	cbq_find,
1778 	.change		=	cbq_change_class,
1779 	.delete		=	cbq_delete,
1780 	.walk		=	cbq_walk,
1781 	.tcf_block	=	cbq_tcf_block,
1782 	.bind_tcf	=	cbq_bind_filter,
1783 	.unbind_tcf	=	cbq_unbind_filter,
1784 	.dump		=	cbq_dump_class,
1785 	.dump_stats	=	cbq_dump_class_stats,
1786 };
1787 
1788 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1789 	.next		=	NULL,
1790 	.cl_ops		=	&cbq_class_ops,
1791 	.id		=	"cbq",
1792 	.priv_size	=	sizeof(struct cbq_sched_data),
1793 	.enqueue	=	cbq_enqueue,
1794 	.dequeue	=	cbq_dequeue,
1795 	.peek		=	qdisc_peek_dequeued,
1796 	.init		=	cbq_init,
1797 	.reset		=	cbq_reset,
1798 	.destroy	=	cbq_destroy,
1799 	.change		=	NULL,
1800 	.dump		=	cbq_dump,
1801 	.dump_stats	=	cbq_dump_stats,
1802 	.owner		=	THIS_MODULE,
1803 };
1804 
1805 static int __init cbq_module_init(void)
1806 {
1807 	return register_qdisc(&cbq_qdisc_ops);
1808 }
1809 static void __exit cbq_module_exit(void)
1810 {
1811 	unregister_qdisc(&cbq_qdisc_ops);
1812 }
1813 module_init(cbq_module_init)
1814 module_exit(cbq_module_exit)
1815 MODULE_LICENSE("GPL");
1816