/*
 * net/sched/sch_csz.c	Clark-Shenker-Zhang scheduler.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>


/*	Clark-Shenker-Zhang algorithm.
	=======================================

	SOURCE.

	David D. Clark, Scott Shenker and Lixia Zhang
	"Supporting Real-Time Applications in an Integrated Services Packet
	Network: Architecture and Mechanism".

	CBQ presents a flexible universal algorithm for packet scheduling,
	but it has pretty poor delay characteristics.
	Round-robin scheduling and link-sharing goals
	apparently contradict minimization of network delay and jitter.
	Moreover, correct handling of predictive flows seems to be
	impossible in CBQ.

	CSZ presents a more precise but less flexible and less efficient
	approach. As I understand it, the main idea is to create
	WFQ flows for each guaranteed service and to allocate
	the rest of the bandwidth to dummy flow-0. Flow-0 comprises
	the predictive services and the best effort traffic;
	it is handled by a priority scheduler with the highest
	priority band allocated	for predictive services, and the rest
	to the best-effort packets.

	Note that in CSZ flows are NOT limited to their bandwidth.  It
	is assumed that the flow has passed admission control at the edge
	of the QoS network and does not need further shaping. Any
	attempt to improve the flow or to shape it to a token bucket
	at intermediate hops will introduce undesired delays and raise
	jitter.

	At the moment CSZ is the only scheduler that provides
	true guaranteed service. Other schemes (including CBQ)
	do not provide guaranteed delay and they randomize jitter.
	There is a proof (Sally Floyd) that the delay
	can be estimated by an IntServ-compliant formula.
	This result is formally true, but it is wrong in principle.
	It takes into account only round-robin delays,
	ignoring delays introduced by link sharing, i.e. overlimiting.
	Note that temporary overlimits are inevitable because
	real links are not ideal, and the real algorithm must take this
	into account.

	ALGORITHM.

	--- Notations.

	$B$ is the link bandwidth (bits/sec).

	$I$ is the set of all flows, including flow $0$.
	Every flow $a \in I$ has an associated bandwidth slice $r_a < 1$ and
	$\sum_{a \in I} r_a = 1$.

	--- Flow model.

	Let $m_a$ be the number of backlogged bits in flow $a$.
	The flow is {\em active}, if $m_a > 0$.
	This number is a discontinuous function of time;
	when a packet $i$ arrives:
	\[
	m_a(t_i+0) - m_a(t_i-0) = L^i,
	\]
	where $L^i$ is the length of the arrived packet.
	The flow queue is drained continuously until $m_a == 0$:
	\[
	{d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}.
	\]
	I.e. flow rates are their allocated rates proportionally
	scaled to take all the available link bandwidth. Apparently,
	this is not the only possible policy. E.g. CBQ classes
	without borrowing would be modelled by:
	\[
	{d m_a \over dt} = - B r_a .
	\]
	More complicated hierarchical bandwidth allocation
	policies are possible, but unfortunately the basic
	flow equations have a simple solution only for proportional
	scaling.

	--- Departure times.

	We calculate the time until the last bit of a packet is sent:
	\[
	E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a },
	\]
	where $\delta_a(t)$ is the number of bits drained since $t_i$.
	We have to evaluate $E_a^i$ for all queued packets,
	then find the packet with minimal $E_a^i$ and send it.

	This sounds good, but direct implementation of the algorithm
	is absolutely infeasible. Luckily, if flow rates
	are scaled proportionally, the equations have a simple solution.

	The differential equation for $E_a^i$ is
	\[
	{d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} =
	- { B \over \sum_{b \in A} r_b}
	\]
	with initial condition
	\[
	E_a^i (t_i) = { m_a(t_i) \over r_a } .
	\]

	Let's introduce an auxiliary function $R(t)$:

	--- Round number.

	Consider the following model: we rotate over active flows,
	sending $r_a B$ bits from every flow, so that we send
	$B \sum_{a \in A} r_a$ bits per round, which takes
	$\sum_{a \in A} r_a$ seconds.

	Hence, $R(t)$ (the round number) is a monotonically increasing
	linear function	of time when $A$ does not change
	\[
	{ d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a }
	\]
	and it is continuous when $A$ changes.
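
	For instance (the numbers are arbitrary), if the only active
	flows have $r_a = 0.5$ and $r_b = 0.25$, then one round sends
	$0.75 B$ bits, takes $0.75$ seconds, and $R(t)$ grows at the
	rate of $1/0.75$ rounds per second.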

	The central observation is that the quantity
	$F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all!
	$R(t)$ does not depend on the flow, so that $F_a^i$ can be
	calculated only once on packet arrival, and we need not
	recalculate the $E$ numbers or re-sort the queues.
	The number $F_a^i$ is called the finish number of the packet.
	It is just the value of $R(t)$ when the last bit of the packet
	is sent out.

	The maximal finish number in a flow is called the finish number
	of the flow, and the minimal one the start number of the flow.
	Apparently, a flow is active if and only if $R \leq F_a$.

	When a packet of length $L_i$ bits arrives on flow $a$ at time $t_i$,
	we calculate $F_a^i$ as:

	If the flow was inactive ($F_a < R$):
	$F_a^i = R(t) + {L_i \over B r_a}$
	otherwise
	$F_a^i = F_a + {L_i \over B r_a}$

	These equations complete the algorithm specification.
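
	A small worked example (the numbers are arbitrary): let
	$B = 10^6$ bit/sec and $r_a = 0.1$. If the flow is inactive and
	$R(t) = 5.0$ when a packet of $L_i = 1000$ bits arrives, then
	$F_a^i = 5.0 + 1000/(10^6 \cdot 0.1) = 5.01$.
	A second 1000 bit packet arriving while the flow is still
	backlogged gets $F_a^i = 5.01 + 0.01 = 5.02$, independently of
	how $R(t)$ evolves in between.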

	It looks pretty hairy, but there is a simple
	procedure for solving these equations.
	See the procedure csz_update(), which is a generalization of
	the algorithm from Chapter 3 of S. Keshav's thesis,
	"Efficient Implementation of Fair Queueing".

	NOTES.

	* We implement only the simplest variant of CSZ,
	where flow-0 is an explicit 4-band priority FIFO.
	This is bad, but we would need a "peek" operation in addition
	to "dequeue" to implement complete CSZ.
	I do not want to do that, unless it is absolutely
	necessary.

	* Primitive support for token bucket filtering
	is present as well. It directly contradicts CSZ, but
	even though the Internet spans the globe ... :-)
	"the edges of the network" really do exist.

	BUGS.

	* Fixed point arithmetic is overcomplicated, suboptimal and even
	wrong. Check it later.  */

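/* Purely illustrative sketch (not compiled): the finish number update
   from the comment above, written in floating point and without the
   fixed point scaling used by the real code below.  All names here are
   local to this example and are not part of the scheduler.
 */
#if 0
struct toy_flow {
	double	rate;		/* r_a, fraction of the link bandwidth	*/
	double	finish;		/* F_a, finish number of the flow	*/
};

/* A packet of "len" bits arrives at round number R on a link of
   bandwidth B bits/sec. */
static void toy_arrival(struct toy_flow *a, double R, double len, double B)
{
	/* An active flow (F_a >= R) continues from its finish number,
	   an inactive one restarts from the current round number. */
	double base = (a->finish >= R) ? a->finish : R;

	a->finish = base + len / (B * a->rate);
}
#endif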

/* This number is arbitrary */

#define CSZ_GUARANTEED		16
#define CSZ_FLOWS		(CSZ_GUARANTEED+4)
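
/* A note on numbering, as far as one can tell from csz_classify(),
   csz_enqueue() and csz_get() below: flow ids below CSZ_GUARANTEED index
   the flow[] array, with flow[0] playing the role of flow-0; ids
   CSZ_GUARANTEED..CSZ_FLOWS-1 also map to flow-0, selecting one of its
   four priority bands in other[].  Class handles visible to user space
   are these ids plus one. */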

struct csz_head
{
	struct csz_head		*snext;
	struct csz_head		*sprev;
	struct csz_head		*fnext;
	struct csz_head		*fprev;
};

struct csz_flow
{
	struct csz_head		*snext;
	struct csz_head		*sprev;
	struct csz_head		*fnext;
	struct csz_head		*fprev;

/* Parameters */
	struct tc_ratespec	rate;
	struct tc_ratespec	slice;
	u32			*L_tab;	/* Lookup table for L/(B*r_a) values */
	unsigned long		limit;	/* Maximal length of queue */
#ifdef CSZ_PLUS_TBF
	struct tc_ratespec	peakrate;
	__u32			buffer;	/* Depth of token bucket, normalized
					   as L/(B*r_a) */
	__u32			mtu;
#endif

/* Variables */
#ifdef CSZ_PLUS_TBF
	unsigned long		tokens; /* Tokens number: usecs */
	psched_time_t		t_tbf;
	unsigned long		R_tbf;
	int			throttled;
#endif
	unsigned		peeked;
	unsigned long		start;	/* Finish number of the first skb */
	unsigned long		finish;	/* Finish number of the flow */

	struct sk_buff_head	q;	/* FIFO queue */
};

#define L2R(f,L) ((f)->L_tab[(L)>>(f)->slice.cell_log])
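
/* L2R() translates a packet length into its cost L/(B*r_a) by indexing a
   256-entry table supplied from user space (csz_change() below copies
   1024 bytes, i.e. 256 u32 values, into L_tab).  One table cell covers
   2^cell_log bytes of packet length.  A hypothetical example: with
   slice.cell_log == 3, a 1000-byte packet falls into cell 1000>>3 == 125,
   which is expected to hold the precomputed cost for lengths 1000..1007. */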

struct csz_sched_data
{
/* Parameters */
	unsigned char	rate_log;	/* fixed point position for rate;
					 * we do not really need it */
	unsigned char	R_log;		/* fixed point position for round number */
	unsigned char	delta_log;	/* 1<<delta_log is maximal timeout in usecs;
					 * 21 <-> 2.1sec is the MAXIMAL value */

/* Variables */
	struct tcf_proto *filter_list;
	u8	prio2band[TC_PRIO_MAX+1];
#ifdef CSZ_PLUS_TBF
	struct timer_list wd_timer;
	long		wd_expires;
#endif
	psched_time_t	t_c;		/* Time check-point */
	unsigned long	R_c;		/* R-number check-point	*/
	unsigned long	rate;		/* Current sum of rates of active flows */
	struct csz_head	s;		/* Flows sorted by "start" */
	struct csz_head	f;		/* Flows sorted by "finish"	*/

	struct sk_buff_head	other[4];/* Predictive (0) and best-effort
					    classes (1,2,3) */
	struct csz_flow	flow[CSZ_GUARANTEED]; /* Array of flows */
};

/* These routines (csz_insert_finish and csz_insert_start) are
   the most time consuming part of the whole algorithm.

   We insert into a sorted list, so that the time
   is linear with respect to the number of active flows in the worst case.
   Note that the number of guaranteed flows is not very large,
   so that logarithmic algorithms (heap etc.) are useless:
   they are slower than the linear one when the length of the list is <= 32.

   A heap would make sense if we used WFQ for best-effort
   flows, but SFQ is a better choice in that case.
 */


/* Insert flow "this" into list "b" before the
   flow with a greater finish number.
 */

#if 0
/* Scan forward */
extern __inline__ void csz_insert_finish(struct csz_head *b,
					 struct csz_flow *this)
{
	struct csz_head *f = b->fnext;
	unsigned long finish = this->finish;

	while (f != b) {
		if ((long)(((struct csz_flow*)f)->finish - finish) > 0)
			break;
		f = f->fnext;
	}
	this->fnext = f;
	this->fprev = f->fprev;
	this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
}
#else
/* Scan backward */
extern __inline__ void csz_insert_finish(struct csz_head *b,
					 struct csz_flow *this)
{
	struct csz_head *f = b->fprev;
	unsigned long finish = this->finish;

	while (f != b) {
		if ((long)(((struct csz_flow*)f)->finish - finish) <= 0)
			break;
		f = f->fprev;
	}
	this->fnext = f->fnext;
	this->fprev = f;
	this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
}
#endif

/* Insert flow "this" into list "b" before the
   flow with a greater start number.
 */

extern __inline__ void csz_insert_start(struct csz_head *b,
					struct csz_flow *this)
{
	struct csz_head *f = b->snext;
	unsigned long start = this->start;

	while (f != b) {
		if ((long)(((struct csz_flow*)f)->start - start) > 0)
			break;
		f = f->snext;
	}
	this->snext = f;
	this->sprev = f->sprev;
	this->snext->sprev = this->sprev->snext = (struct csz_head*)this;
}


/* Calculate and return the current round number.
   It is another time-consuming part, but
   it is impossible to avoid it.

   It costs O(N), which makes the whole algorithm useful only
   for playing with the closest-to-ideal fluid model.

   There exist less academic, but more practical modifications,
   which might have even better characteristics (WF2Q+, HPFQ, HFSC).
 */

static unsigned long csz_update(struct Qdisc *sch)
{
	struct csz_sched_data	*q = (struct csz_sched_data*)sch->data;
	struct csz_flow 	*a;
	unsigned long		F;
	unsigned long		tmp;
	psched_time_t		now;
	unsigned long		delay;
	unsigned long		R_c;

	PSCHED_GET_TIME(now);
	delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset);

	if (delay>>q->delta_log) {
do_reset:
		/* Delta is too large.
		   It is possible if MTU/BW > 1<<q->delta_log
		   (i.e. a configuration error) or because of a hardware
		   fault. We have no choice...
		 */
		qdisc_reset(sch);
		return 0;
	}

	q->t_c = now;

	for (;;) {
		a = (struct csz_flow*)q->f.fnext;

		/* No more active flows. Reset R and exit. */
		if (a == (struct csz_flow*)&q->f) {
#ifdef CSZ_DEBUG
			if (q->rate) {
				printk("csz_update: rate!=0 on inactive csz\n");
				q->rate = 0;
			}
#endif
			q->R_c = 0;
			return 0;
		}

		F = a->finish;

#ifdef CSZ_DEBUG
		if (q->rate == 0) {
			printk("csz_update: rate=0 on active csz\n");
			goto do_reset;
		}
#endif

		/*
		 *           tmp = (t - q->t_c)/q->rate;
		 */

		tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log);

		tmp += q->R_c;

		/* OK, this flow (and all flows with greater
		   finish numbers) is still active */
		if ((long)(F - tmp) > 0)
			break;

		/* It is no longer active */

		a->fprev->fnext = a->fnext;
		a->fnext->fprev = a->fprev;

		/*
		 * q->t_c += (F - q->R_c)*q->rate
		 */

		tmp = ((F-q->R_c)*q->rate)<<q->R_log;
		R_c = F;
		q->rate -= a->slice.rate;

		if ((long)(delay - tmp) >= 0) {
			delay -= tmp;
			continue;
		}
		delay = 0;
	}

	q->R_c = tmp;
	return tmp;
}

unsigned csz_classify(struct sk_buff *skb, struct csz_sched_data *q)
{
	return CSZ_GUARANTEED;
}

static int
csz_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	unsigned flow_id = csz_classify(skb, q);
	unsigned long R;
	int prio = 0;
	struct csz_flow *this;

	if (flow_id >= CSZ_GUARANTEED) {
		prio = flow_id - CSZ_GUARANTEED;
		flow_id = 0;
	}

	this = &q->flow[flow_id];
	if (this->q.qlen >= this->limit || this->L_tab == NULL) {
		sch->stats.drops++;
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	R = csz_update(sch);

	if ((long)(this->finish - R) >= 0) {
		/* It was active */
		this->finish += L2R(this,skb->len);
	} else {
		/* It is inactive; activate it */
		this->finish = R + L2R(this,skb->len);
		q->rate += this->slice.rate;
		csz_insert_finish(&q->f, this);
	}

	/* If this flow was empty, remember its start number
	   and insert it into the start queue */
	if (this->q.qlen == 0) {
		this->start = this->finish;
		csz_insert_start(&q->s, this);
	}
	if (flow_id)
		skb_queue_tail(&this->q, skb);
	else
		skb_queue_tail(&q->other[prio], skb);
	sch->q.qlen++;
	sch->stats.bytes += skb->len;
	sch->stats.packets++;
	return 0;
}

static __inline__ struct sk_buff *
skb_dequeue_best(struct csz_sched_data * q)
{
	int i;
	struct sk_buff *skb;

	for (i=0; i<4; i++) {
		skb = skb_dequeue(&q->other[i]);
		if (skb) {
			q->flow[0].q.qlen--;
			return skb;
		}
	}
	return NULL;
}

static __inline__ struct sk_buff *
skb_peek_best(struct csz_sched_data * q)
{
	int i;
	struct sk_buff *skb;

	for (i=0; i<4; i++) {
		skb = skb_peek(&q->other[i]);
		if (skb)
			return skb;
	}
	return NULL;
}

#ifdef CSZ_PLUS_TBF

static void csz_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc*)arg;

	qdisc_wakeup(sch->dev);
}

static __inline__ void
csz_move_queue(struct csz_flow *this, long delta)
{
	this->fprev->fnext = this->fnext;
	this->fnext->fprev = this->fprev;

	this->start += delta;
	this->finish += delta;

	csz_insert_finish(this);
}

static __inline__ int csz_enough_tokens(struct csz_sched_data *q,
					struct csz_flow *this,
					struct sk_buff *skb)
{
	long toks;
	long shift;
	psched_time_t now;

	PSCHED_GET_TIME(now);

	toks = PSCHED_TDIFF(now, this->t_tbf) + this->tokens - L2R(this,skb->len);

	shift = 0;
	if (this->throttled) {
		/* Remember the a posteriori delay */

		unsigned long R = csz_update(q);
		shift = R - this->R_tbf;
		this->R_tbf = R;
	}

	if (toks >= 0) {
		/* Now we have enough tokens to proceed */

		this->tokens = toks <= this->depth ? toks : this->depth;
		this->t_tbf = now;

		if (!this->throttled)
			return 1;

		/* Flow was throttled. Update its start&finish numbers
		   with the delay calculated a posteriori.
		 */

		this->throttled = 0;
		if (shift > 0)
			csz_move_queue(this, shift);
		return 1;
	}

	if (!this->throttled) {
		/* Flow has just been throttled; remember
		   the current round number to calculate the a posteriori delay
		 */
		this->throttled = 1;
		this->R_tbf = csz_update(q);
	}

	/* Move the whole queue to the time when it will be allowed to send.
	   We should translate time to a round number, but that is impossible,
	   so we make the most conservative estimate, i.e. we suppose
	   that only this flow is active and, hence, R = t.
	   Really toks <= R <= toks/r_a.

	   This a priori shift in R will be adjusted later to reflect
	   the real delay. We cannot avoid it because:
	   - a throttled flow continues to be active from the viewpoint
	     of CSZ, so that it would acquire the highest priority
	     if the start numbers were not adjusted.
	   - Eventually, the finish number would become less than the round
	     number and the flow would be declared inactive.
	 */

	toks = -toks;

	/* Remember that we should start the watchdog */
	if (toks < q->wd_expires)
		q->wd_expires = toks;

	toks >>= q->R_log;
	shift += toks;
	if (shift > 0) {
		this->R_tbf += toks;
		csz_move_queue(this, shift);
	}
	csz_insert_start(this);
	return 0;
}
#endif


static struct sk_buff *
csz_dequeue(struct Qdisc* sch)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	struct sk_buff *skb;
	struct csz_flow *this;

#ifdef CSZ_PLUS_TBF
	q->wd_expires = 0;
#endif
	this = (struct csz_flow*)q->s.snext;

	while (this != (struct csz_flow*)&q->s) {

		/* First of all: unlink from the start list */
		this->sprev->snext = this->snext;
		this->snext->sprev = this->sprev;

		if (this != &q->flow[0]) {	/* Guaranteed flow */
			skb = __skb_dequeue(&this->q);
			if (skb) {
#ifdef CSZ_PLUS_TBF
				if (this->depth) {
					if (!csz_enough_tokens(q, this, skb))
						continue;
				}
#endif
				if (this->q.qlen) {
					struct sk_buff *nskb = skb_peek(&this->q);
					this->start += L2R(this,nskb->len);
					csz_insert_start(&q->s, this);
				}
				sch->q.qlen--;
				return skb;
			}
		} else {	/* Predictive or best-effort flow */
			skb = skb_dequeue_best(q);
			if (skb) {
				unsigned peeked = this->peeked;
				this->peeked = 0;

				if (--this->q.qlen) {
					struct sk_buff *nskb;
					unsigned dequeued = L2R(this,skb->len);

					/* We dequeued something different from
					   what we peeked earlier; adjust the
					   start number
					   */
					if (peeked != dequeued && peeked)
						this->start += dequeued - peeked;

					nskb = skb_peek_best(q);
					peeked = L2R(this,nskb->len);
					this->start += peeked;
					this->peeked = peeked;
					csz_insert_start(&q->s, this);
				}
				sch->q.qlen--;
				return skb;
			}
		}
	}
#ifdef CSZ_PLUS_TBF
	/* We are about to return no skb.
	   Schedule the watchdog timer if this happened because of shaping.
	 */
	if (q->wd_expires) {
		unsigned long delay = PSCHED_US2JIFFIE(q->wd_expires);
		if (delay == 0)
			delay = 1;
		mod_timer(&q->wd_timer, jiffies + delay);
		sch->stats.overlimits++;
	}
#endif
	return NULL;
}

static void
csz_reset(struct Qdisc* sch)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	int    i;

	for (i=0; i<4; i++)
		skb_queue_purge(&q->other[i]);

	for (i=0; i<CSZ_GUARANTEED; i++) {
		struct csz_flow *this = q->flow + i;
		skb_queue_purge(&this->q);
		this->snext = this->sprev =
		this->fnext = this->fprev = (struct csz_head*)this;
		this->start = this->finish = 0;
	}
	q->s.snext = q->s.sprev = &q->s;
	q->f.fnext = q->f.fprev = &q->f;
	q->R_c = 0;
#ifdef CSZ_PLUS_TBF
	PSCHED_GET_TIME(&q->t_tbf);
	q->tokens = q->depth;
	del_timer(&q->wd_timer);
#endif
	sch->q.qlen = 0;
}

static void
csz_destroy(struct Qdisc* sch)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	struct tcf_proto *tp;

	while ((tp = q->filter_list) != NULL) {
		q->filter_list = tp->next;
		tcf_destroy(tp);
	}

	MOD_DEC_USE_COUNT;
}

static int csz_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	struct rtattr *tb[TCA_CSZ_PTAB];
	struct tc_csz_qopt *qopt;
	int    i;

	rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
	if (tb[TCA_CSZ_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*qopt))
		return -EINVAL;
	qopt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);

	q->R_log = qopt->R_log;
	q->delta_log = qopt->delta_log;
	for (i=0; i<=TC_PRIO_MAX; i++) {
		if (qopt->priomap[i] >= CSZ_FLOWS)
			return -EINVAL;
		q->prio2band[i] = qopt->priomap[i];
	}

	for (i=0; i<4; i++)
		skb_queue_head_init(&q->other[i]);

	for (i=0; i<CSZ_GUARANTEED; i++) {
		struct csz_flow *this = q->flow + i;
		skb_queue_head_init(&this->q);
		this->snext = this->sprev =
		this->fnext = this->fprev = (struct csz_head*)this;
		this->start = this->finish = 0;
	}
	q->s.snext = q->s.sprev = &q->s;
	q->f.fnext = q->f.fprev = &q->f;
	q->R_c = 0;
#ifdef CSZ_PLUS_TBF
	init_timer(&q->wd_timer);
	q->wd_timer.data = (unsigned long)sch;
	q->wd_timer.function = csz_watchdog;
#endif
	MOD_INC_USE_COUNT;
	return 0;
}

static int csz_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	unsigned char	 *b = skb->tail;
	struct rtattr *rta;
	struct tc_csz_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.flows = CSZ_FLOWS;
	memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
		     struct Qdisc **old)
{
	return -EINVAL;
}

static struct Qdisc * csz_leaf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}


static unsigned long csz_get(struct Qdisc *sch, u32 classid)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	unsigned long band = TC_H_MIN(classid) - 1;

	if (band >= CSZ_FLOWS)
		return 0;

	if (band < CSZ_GUARANTEED && q->flow[band].L_tab == NULL)
		return 0;

	return band+1;
}

static unsigned long csz_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
{
	return csz_get(sch, classid);
}


static void csz_put(struct Qdisc *sch, unsigned long cl)
{
	return;
}

static int csz_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg)
{
	unsigned long cl = *arg;
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	struct rtattr *opt = tca[TCA_OPTIONS-1];
	struct rtattr *tb[TCA_CSZ_PTAB];
	struct tc_csz_copt *copt;

	rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
	if (tb[TCA_CSZ_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*copt))
		return -EINVAL;
	copt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);

	if (tb[TCA_CSZ_RTAB-1] &&
	    RTA_PAYLOAD(tb[TCA_CSZ_RTAB-1]) < 1024)
		return -EINVAL;

	if (cl) {
		struct csz_flow *a;
		cl--;
		if (cl >= CSZ_FLOWS)
			return -ENOENT;
		if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
			return -EINVAL;

		a = &q->flow[cl];

		spin_lock_bh(&sch->dev->queue_lock);
#if 0
		a->rate_log = copt->rate_log;
#endif
#ifdef CSZ_PLUS_TBF
		a->limit = copt->limit;
		a->rate = copt->rate;
		a->buffer = copt->buffer;
		a->mtu = copt->mtu;
#endif

		if (tb[TCA_CSZ_RTAB-1])
			memcpy(a->L_tab, RTA_DATA(tb[TCA_CSZ_RTAB-1]), 1024);

		spin_unlock_bh(&sch->dev->queue_lock);
		return 0;
	}
	/* NI */
	return 0;
}

static int csz_delete(struct Qdisc *sch, unsigned long cl)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	struct csz_flow *a;

	cl--;

	if (cl >= CSZ_FLOWS)
		return -ENOENT;
	if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
		return -EINVAL;

	a = &q->flow[cl];

	spin_lock_bh(&sch->dev->queue_lock);
	a->fprev->fnext = a->fnext;
	a->fnext->fprev = a->fprev;
	a->sprev->snext = a->snext;
	a->snext->sprev = a->sprev;
	a->start = a->finish = 0;
	kfree(xchg(&q->flow[cl].L_tab, NULL));
	spin_unlock_bh(&sch->dev->queue_lock);

	return 0;
}

static int csz_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	unsigned char	 *b = skb->tail;
	struct rtattr *rta;
	struct tc_csz_copt opt;

	tcm->tcm_handle = sch->handle|cl;

	cl--;

	if (cl > CSZ_FLOWS)
		goto rtattr_failure;

	if (cl < CSZ_GUARANTEED) {
		struct csz_flow *f = &q->flow[cl];

		if (f->L_tab == NULL)
			goto rtattr_failure;

		rta = (struct rtattr*)b;
		RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

		opt.limit = f->limit;
		opt.rate = f->rate;
		opt.slice = f->slice;
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
#ifdef CSZ_PLUS_TBF
		opt.buffer = f->buffer;
		opt.mtu = f->mtu;
#else
		opt.buffer = 0;
		opt.mtu = 0;
#endif

		RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
		rta->rta_len = skb->tail - b;
	}

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static void csz_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	int prio = 0;

	if (arg->stop)
		return;

	for (prio = 0; prio < CSZ_FLOWS; prio++) {
		if (arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (prio < CSZ_GUARANTEED && q->flow[prio].L_tab == NULL) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, prio+1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;

	if (cl)
		return NULL;

	return &q->filter_list;
}

struct Qdisc_class_ops csz_class_ops =
{
	csz_graft,
	csz_leaf,

	csz_get,
	csz_put,
	csz_change,
	csz_delete,
	csz_walk,

	csz_find_tcf,
	csz_bind,
	csz_put,

	csz_dump_class,
};

struct Qdisc_ops csz_qdisc_ops =
{
	NULL,
	&csz_class_ops,
	"csz",
	sizeof(struct csz_sched_data),

	csz_enqueue,
	csz_dequeue,
	NULL,
	NULL,

	csz_init,
	csz_reset,
	csz_destroy,
	NULL /* csz_change */,

	csz_dump,
};


#ifdef MODULE
int init_module(void)
{
	return register_qdisc(&csz_qdisc_ops);
}

void cleanup_module(void)
{
	unregister_qdisc(&csz_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");