1 /*
2 * net/sched/sch_csz.c Clark-Shenker-Zhang scheduler.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13 #include <linux/config.h>
14 #include <linux/module.h>
15 #include <asm/uaccess.h>
16 #include <asm/system.h>
17 #include <asm/bitops.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/if_ether.h>
29 #include <linux/inet.h>
30 #include <linux/netdevice.h>
31 #include <linux/etherdevice.h>
32 #include <linux/notifier.h>
33 #include <net/ip.h>
34 #include <net/route.h>
35 #include <linux/skbuff.h>
36 #include <net/sock.h>
37 #include <net/pkt_sched.h>
38
39
40 /* Clark-Shenker-Zhang algorithm.
41 =======================================
42
43 SOURCE.
44
45 David D. Clark, Scott Shenker and Lixia Zhang
46 "Supporting Real-Time Applications in an Integrated Services Packet
47 Network: Architecture and Mechanism".
48
49 CBQ presents a flexible universal algorithm for packet scheduling,
50 but it has pretty poor delay characteristics.
51 Round-robin scheduling and link-sharing goals
52 apparently contradict minimization of network delay and jitter.
53 Moreover, correct handling of predictive flows seems to be
54 impossible in CBQ.
55
56 CSZ presents a more precise but less flexible and less efficient
57 approach. As I understand it, the main idea is to create
58  a WFQ flow for each guaranteed service and to allocate
59  the rest of the bandwidth to the dummy flow-0. Flow-0 comprises
60 the predictive services and the best effort traffic;
61 it is handled by a priority scheduler with the highest
62 priority band allocated for predictive services, and the rest ---
63 to the best effort packets.
64
65 Note that in CSZ flows are NOT limited to their bandwidth. It
66 is supposed that the flow passed admission control at the edge
67 of the QoS network and it doesn't need further shaping. Any
68 attempt to improve the flow or to shape it to a token bucket
69 at intermediate hops will introduce undesired delays and raise
70 jitter.
71
72 At the moment CSZ is the only scheduler that provides
73  true guaranteed service. Other schemes (including CBQ)
74  do not provide guaranteed delay and they randomize jitter.
75  There is a proof (by Sally Floyd) that delay
76  can be estimated by an IntServ-compliant formula.
77  This result is formally true, but wrong in principle.
78  It takes into account only round-robin delays,
79  ignoring delays introduced by link sharing, i.e. overlimiting.
80 Note that temporary overlimits are inevitable because
81 real links are not ideal, and the real algorithm must take this
82 into account.
83
84 ALGORITHM.
85
86 --- Notations.
87
88  $B$ is the link bandwidth (bits/sec).
89
90  $I$ is the set of all flows, including flow $0$.
91  Every flow $a \in I$ has an associated bandwidth slice $r_a < 1$ and
92  $\sum_{a \in I} r_a = 1$.
93
94 --- Flow model.
95
96  Let $m_a$ be the number of backlogged bits in flow $a$.
97  The flow is {\em active} if $m_a > 0$.
98  This number is a discontinuous function of time;
99  when packet $i$ arrives:
100 \[
101 m_a(t_i+0) - m_a(t_i-0) = L^i,
102 \]
103 where $L^i$ is the length of the arrived packet.
104 The flow queue is drained continuously until $m_a = 0$:
105 \[
106 {d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}.
107 \]
108 I.e. flow rates are their allocated rates, proportionally
109 scaled to take all of the available link bandwidth. Apparently,
110 this is not the only possible policy. E.g. CBQ classes
111 without borrowing would be modelled by:
112 \[
113 {d m_a \over dt} = - B r_a .
114 \]
115 More complicated hierarchical bandwidth allocation
116 policies are possible, but unfortunately, the basic
117 flow equations have a simple solution only for proportional
118 scaling.
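
	Note, for reference: on any interval where the active set $A$ does
	not change, the proportional-scaling equation above integrates to a
	plain linear drain,
	\[
	m_a(t) = m_a(t_1) - { B r_a \over \sum_{b \in A} r_b } (t - t_1).
	\]
	The real difficulty is that $A$ changes whenever a flow empties or
	a packet arrives; that is what the round number introduced below
	takes care of.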
119
120 --- Departure times.
121
122 We calculate the time until the last bit of the packet is sent:
123 \[
124 E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a },
125 \]
126 where $\delta_a(t)$ is the number of bits drained since $t_i$.
127 We have to evaluate $E_a^i$ for all queued packets,
128 then find the packet with minimal $E_a^i$ and send it.
129
130 This sounds good, but direct implementation of the algorithm
131 is absolutely infeasible. Luckily, if flow rates
132 are scaled proportionally, the equations have a simple solution.
133
134 The differential equation for $E_a^i$ is
135 \[
136 {d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} =
137 - { B \over \sum_{b \in A} r_b}
138 \]
139 with initial condition
140 \[
141 E_a^i (t_i) = { m_a(t_i) \over r_a } .
142 \]
143
144 Let's introduce an auxiliary function $R(t)$:
145
146 --- Round number.
147
148 Consider the following model: we rotate over active flows,
149 sending $r_a B$ bits from every flow, so that we send
150 $B \sum_{a \in A} r_a$ bits per round, which takes
151 $\sum_{a \in A} r_a$ seconds.
152
153 Hence, $R(t)$ (the round number) is a monotonically increasing
154 linear function of time while $A$ does not change
155 \[
156 { d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a }
157 \]
158 and it is continuous when $A$ changes.
159
160 The central observation is that the quantity
161 $F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all!
162 $R(t)$ does not depend on the flow, so $F_a^i$ can be
163 calculated once, on packet arrival, and we need not
164 recalculate the $E$ numbers or re-sort the queues.
165 The number $F_a^i$ is called the finish number of the packet.
166 It is just the value of $R(t)$ when the last bit of the packet
167 is sent out.
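
	Indeed, a one-line check of this claim:
	\[
	{ d \over dt } \Big( R(t) + { E_a^i(t) \over B } \Big) =
	{ 1 \over \sum_{b \in A} r_b } - { 1 \over B } { B \over \sum_{b \in A} r_b } = 0 .
	\]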
168
169 The maximal finish number over a flow's packets is called the finish
170 number of the flow, and the minimal one is the start number of the flow.
171 Apparently, a flow is active if and only if $F_a \geq R$.
172
173 When a packet of length $L_i$ bits arrives on flow $a$ at time $t_i$,
174 we calculate $F_a^i$ as follows:
175
176 If the flow was inactive ($F_a < R$):
177 $F_a^i = R(t) + {L_i \over B r_a}$
178 otherwise
179 $F_a^i = F_a + {L_i \over B r_a}$
180
181 These equations complete the algorithm specification.
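
	A purely illustrative arithmetic check (the numbers are invented for
	this example): let $B = 10^6$ bit/sec, $r_a = 0.25$, and let a
	1000-bit packet arrive while the flow is inactive at round $R(t) = 5$.
	Then
	\[
	F_a^i = R(t) + { L_i \over B r_a } = 5 + { 1000 \over 10^6 \cdot 0.25 } = 5.004 ,
	\]
	and a second 1000-bit packet arriving before the flow drains gets
	$F_a = 5.004 + 0.004 = 5.008$.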
182
183 It looks pretty hairy, but there is a simple
184 procedure for solving these equations.
185 See csz_update(), which is a generalization of
186 the algorithm from Chapter 3 of S. Keshav's thesis,
187 "Efficient Implementation of Fair Queueing".
188
189 NOTES.
190
191 * We implement only the simplest variant of CSZ,
192 where flow-0 is an explicit 4-band priority FIFO.
193 This is bad, but we would need a "peek" operation in addition
194 to "dequeue" to implement complete CSZ.
195 I do not want to do that unless it is absolutely
196 necessary.
197
198 * Primitive support for token bucket filtering is
199 present as well. It directly contradicts CSZ, but
200 even though the Internet spans the globe ... :-)
201 "the edges of the network" really exist.
202
203 BUGS.
204
205 * Fixed point arithmetic is overcomplicated, suboptimal and even
206 wrong. Check it later. */
207
208
209 /* This number is arbitrary */
210
211 #define CSZ_GUARANTEED 16
212 #define CSZ_FLOWS (CSZ_GUARANTEED+4)
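
/* Note on numbering: TC class minors 1..CSZ_GUARANTEED address the
 * guaranteed flows, while minors CSZ_GUARANTEED+1..CSZ_FLOWS address the
 * four priority bands of flow-0 (predictive + best effort); see csz_get()
 * and the flow_id handling in csz_enqueue().
 */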
213
214 struct csz_head
215 {
216 struct csz_head *snext;
217 struct csz_head *sprev;
218 struct csz_head *fnext;
219 struct csz_head *fprev;
220 };
221
222 struct csz_flow
223 {
224 struct csz_head *snext;
225 struct csz_head *sprev;
226 struct csz_head *fnext;
227 struct csz_head *fprev;
228
229 /* Parameters */
230 struct tc_ratespec rate;
231 struct tc_ratespec slice;
232 u32 *L_tab; /* Lookup table for L/(B*r_a) values */
233 unsigned long limit; /* Maximal length of queue */
234 #ifdef CSZ_PLUS_TBF
235 struct tc_ratespec peakrate;
236 __u32 buffer; /* Depth of token bucket, normalized
237 as L/(B*r_a) */
238 __u32 mtu;
239 #endif
240
241 /* Variables */
242 #ifdef CSZ_PLUS_TBF
243 unsigned long tokens; /* Tokens number: usecs */
244 psched_time_t t_tbf;
245 unsigned long R_tbf;
246 int throttled;
247 #endif
248 unsigned peeked;
249 unsigned long start; /* Finish number of the first skb */
250 unsigned long finish; /* Finish number of the flow */
251
252 struct sk_buff_head q; /* FIFO queue */
253 };
254
255 #define L2R(f,L) ((f)->L_tab[(L)>>(f)->slice.cell_log])
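
/* A minimal sketch (an illustration only, not part of the original code)
 * of what the 256-entry table passed in via TCA_CSZ_RTAB is expected to
 * hold: entry i is L/(B*r_a) for a packet of length i<<cell_log, expressed
 * in the fixed-point units used for the round number R. The helper name
 * and the "units_per_sec" scale parameter are assumptions made purely for
 * this illustration.
 */
#if 0
static void csz_fill_L_tab_example(u32 *L_tab, int cell_log,
				   u64 flow_rate_bps, u64 units_per_sec)
{
	int i;

	/* flow_rate_bps is B*r_a in bits/sec; units_per_sec converts
	 * seconds into the R fixed-point scale. */
	for (i = 0; i < 256; i++)
		L_tab[i] = (u32)((((u64)i << cell_log) * units_per_sec) /
				 flow_rate_bps);
}
#endif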
256
257 struct csz_sched_data
258 {
259 /* Parameters */
260 unsigned char rate_log; /* fixed point position for rate;
261 * we do not really need it */
262 unsigned char R_log; /* fixed point position for round number */
263 unsigned char delta_log; /* 1<<delta_log is maximal timeout in usecs;
264 * 21 <-> 2.1sec is MAXIMAL value */
265
266 /* Variables */
267 struct tcf_proto *filter_list;
268 u8 prio2band[TC_PRIO_MAX+1];
269 #ifdef CSZ_PLUS_TBF
270 struct timer_list wd_timer;
271 long wd_expires;
272 #endif
273 psched_time_t t_c; /* Time check-point */
274 unsigned long R_c; /* R-number check-point */
275 unsigned long rate; /* Current sum of rates of active flows */
276 struct csz_head s; /* Flows sorted by "start" */
277 struct csz_head f; /* Flows sorted by "finish" */
278
279 struct sk_buff_head other[4];/* Predictive (0) and best-effort
280 classes (1,2,3) */
281 struct csz_flow flow[CSZ_GUARANTEED]; /* Array of flows */
282 };
283
284 /* These routines (csz_insert_finish and csz_insert_start) are
285 the most time consuming part of the whole algorithm.
286
287 We insert into a sorted list, so the time
288 is linear in the number of active flows in the worst case.
289 Note that the number of guaranteed flows is not very large,
290 so logarithmic structures (heaps etc.) are useless:
291 they are slower than a linear scan when the list length is <= 32.
292
293 A heap would make sense if we used WFQ for best-effort
294 flows, but SFQ is a better choice in that case.
295 */
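
/* Implementation note: both lists are circular and doubly linked; the
   csz_head members "s" and "f" embedded in csz_sched_data act as
   sentinels, so an empty list is a sentinel pointing to itself (see
   csz_reset() and csz_init()) and the scans below need no NULL checks.
 */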
296
297
298 /* Insert flow "this" into the list "b" before the first
299 flow with a greater finish number.
300 */
301
302 #if 0
303 /* Scan forward */
304 extern __inline__ void csz_insert_finish(struct csz_head *b,
305 struct csz_flow *this)
306 {
307 struct csz_head *f = b->fnext;
308 unsigned long finish = this->finish;
309
310 while (f != b) {
311 if (((struct csz_flow*)f)->finish - finish > 0)
312 break;
313 f = f->fnext;
314 }
315 this->fnext = f;
316 this->fprev = f->fprev;
317 this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
318 }
319 #else
320 /* Scan backward */
321 extern __inline__ void csz_insert_finish(struct csz_head *b,
322 struct csz_flow *this)
323 {
324 struct csz_head *f = b->fprev;
325 unsigned long finish = this->finish;
326
327 while (f != b) {
328 if (((struct csz_flow*)f)->finish - finish <= 0)
329 break;
330 f = f->fprev;
331 }
332 this->fnext = f->fnext;
333 this->fprev = f;
334 this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
335 }
336 #endif
337
338 /* Insert flow "this" into the list "b" before the first
339 flow with a greater start number.
340 */
341
342 extern __inline__ void csz_insert_start(struct csz_head *b,
343 struct csz_flow *this)
344 {
345 struct csz_head *f = b->snext;
346 unsigned long start = this->start;
347
348 while (f != b) {
349 if (((struct csz_flow*)f)->start - start > 0)
350 break;
351 f = f->snext;
352 }
353 this->snext = f;
354 this->sprev = f->sprev;
355 this->snext->sprev = this->sprev->snext = (struct csz_head*)this;
356 }
357
358
359 /* Calculate and return the current round number.
360 This is another time-consuming part, but
361 it is impossible to avoid.
362
363 It costs O(N), which makes the whole algorithm useful only
364 for playing with a model close to the ideal fluid one.
365
366 Less academic but more practical modifications exist,
367 which may have even better characteristics (WF2Q+, HPFQ, HFSC).
368 */
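
/* In round-number terms the loop below computes
 *
 *	R = R_c + (now - t_c)/rate,	rate = sum of active slice rates,
 *
 * peeling off head flows whose finish number F has already been reached:
 * for each such flow the elapsed time is charged up to the moment it
 * drained, the check-point R_c advances to F and its slice leaves "rate".
 */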
369
370 static unsigned long csz_update(struct Qdisc *sch)
371 {
372 struct csz_sched_data *q = (struct csz_sched_data*)sch->data;
373 struct csz_flow *a;
374 unsigned long F;
375 unsigned long tmp;
376 psched_time_t now;
377 unsigned long delay;
379
380 PSCHED_GET_TIME(now);
381 delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset);
382
383 if (delay>>q->delta_log) {
384 do_reset:
385 /* Delta is too large.
386 This is possible if MTU/BW > 1<<q->delta_log
387 (i.e. a configuration error) or because of a hardware
388 fault. We have no choice...
389 */
390 qdisc_reset(sch);
391 return 0;
392 }
393
394 q->t_c = now;
395
396 for (;;) {
397 a = (struct csz_flow*)q->f.fnext;
398
399 /* No more active flows. Reset R and exit. */
400 if (a == (struct csz_flow*)&q->f) {
401 #ifdef CSZ_DEBUG
402 if (q->rate) {
403 printk("csz_update: rate!=0 on inactive csz\n");
404 q->rate = 0;
405 }
406 #endif
407 q->R_c = 0;
408 return 0;
409 }
410
411 F = a->finish;
412
413 #ifdef CSZ_DEBUG
414 if (q->rate == 0) {
415 printk("csz_update: rate=0 on active csz\n");
416 goto do_reset;
417 }
418 #endif
419
420 /*
421 * tmp = (t - q->t_c)/q->rate;
422 */
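
	/* Fixed point details, as I read them: "delay" fits in delta_log
	   bits, so shifting it left by (31-delta_log) uses the full 32-bit
	   range for precision before dividing by the rate sum; the final
	   right shift removes that scale again and leaves the result in
	   the same R_log-scaled units as q->R_c. */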
423
424 tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log);
425
426 tmp += q->R_c;
427
428 /* OK, this flow (and all flows with greater
429 finish numbers) is still active */
430 if (F - tmp > 0)
431 break;
432
433 /* It is no longer active */
434
435 a->fprev->fnext = a->fnext;
436 a->fnext->fprev = a->fprev;
437
438 /*
439 * q->t_c += (F - q->R_c)*q->rate
440 */
441
442 tmp = ((F-q->R_c)*q->rate)<<q->R_log;
443 q->R_c = F; /* advance the check-point to the drain round */
444 q->rate -= a->slice.rate;
445
446 if ((long)(delay - tmp) >= 0) {
447 delay -= tmp;
448 continue;
449 }
450 delay = 0;
451 }
452
453 q->R_c = tmp;
454 return tmp;
455 }
456
457 unsigned csz_classify(struct sk_buff *skb, struct csz_sched_data *q)
458 {
459 return CSZ_GUARANTEED;
460 }
461
462 static int
463 csz_enqueue(struct sk_buff *skb, struct Qdisc* sch)
464 {
465 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
466 unsigned flow_id = csz_classify(skb, q);
467 unsigned long R;
468 int prio = 0;
469 struct csz_flow *this;
470
471 if (flow_id >= CSZ_GUARANTEED) {
472 prio = flow_id - CSZ_GUARANTEED;
473 flow_id = 0;
474 }
475
476 this = &q->flow[flow_id];
477 if (this->q.qlen >= this->limit || this->L_tab == NULL) {
478 sch->stats.drops++;
479 kfree_skb(skb);
480 return NET_XMIT_DROP;
481 }
482
483 R = csz_update(sch);
484
485 if ((long)(this->finish - R) >= 0) {
486 /* It was active */
487 this->finish += L2R(this,skb->len);
488 } else {
489 /* It is inactive; activate it */
490 this->finish = R + L2R(this,skb->len);
491 q->rate += this->slice.rate;
492 csz_insert_finish(&q->f, this);
493 }
494
495 /* If this flow was empty, remember the start number
496 and insert the flow into the start queue */
497 if (this->q.qlen == 0) {
498 this->start = this->finish;
499 csz_insert_start(&q->s, this);
500 }
501 if (flow_id)
502 skb_queue_tail(&this->q, skb);
503 else
504 skb_queue_tail(&q->other[prio], skb);
505 sch->q.qlen++;
506 sch->stats.bytes += skb->len;
507 sch->stats.packets++;
508 return 0;
509 }
510
511 static __inline__ struct sk_buff *
512 skb_dequeue_best(struct csz_sched_data * q)
513 {
514 int i;
515 struct sk_buff *skb;
516
517 for (i=0; i<4; i++) {
518 skb = skb_dequeue(&q->other[i]);
519 if (skb) {
520 q->flow[0].q.qlen--;
521 return skb;
522 }
523 }
524 return NULL;
525 }
526
527 static __inline__ struct sk_buff *
528 skb_peek_best(struct csz_sched_data * q)
529 {
530 int i;
531 struct sk_buff *skb;
532
533 for (i=0; i<4; i++) {
534 skb = skb_peek(&q->other[i]);
535 if (skb)
536 return skb;
537 }
538 return NULL;
539 }
540
541 #ifdef CSZ_PLUS_TBF
542
543 static void csz_watchdog(unsigned long arg)
544 {
545 struct Qdisc *sch = (struct Qdisc*)arg;
546
547 qdisc_wakeup(sch->dev);
548 }
549
550 static __inline__ void
551 csz_move_queue(struct csz_flow *this, long delta)
552 {
553 this->fprev->fnext = this->fnext;
554 this->fnext->fprev = this->fprev;
555
556 this->start += delta;
557 this->finish += delta;
558
559 csz_insert_finish(this);
560 }
561
562 static __inline__ int csz_enough_tokens(struct csz_sched_data *q,
563 struct csz_flow *this,
564 struct sk_buff *skb)
565 {
566 long toks;
567 long shift;
568 psched_time_t now;
569
570 PSCHED_GET_TIME(now);
571
572 toks = PSCHED_TDIFF(now, t_tbf) + this->tokens - L2R(q,this,skb->len);
573
574 shift = 0;
575 if (this->throttled) {
576 /* Remember the a posteriori delay */
577
578 unsigned long R = csz_update(q);
579 shift = R - this->R_tbf;
580 this->R_tbf = R;
581 }
582
583 if (toks >= 0) {
584 /* Now we have enough tokens to proceed */
585
586 this->tokens = toks <= this->depth ? toks : this->depth;
587 this->t_tbf = now;
588
589 if (!this->throttled)
590 return 1;
591
592 /* Flow was throttled. Update its start & finish numbers
593 with the delay calculated a posteriori.
594 */
595
596 this->throttled = 0;
597 if (shift > 0)
598 csz_move_queue(this, shift);
599 return 1;
600 }
601
602 if (!this->throttled) {
603 /* Flow has just been throttled; remember the
604 current round number to calculate the a posteriori delay
605 */
606 this->throttled = 1;
607 this->R_tbf = csz_update(q);
608 }
609
610 /* Move the whole queue to the time when it will be allowed to send.
611 We should translate time to a round number, but that is impossible,
612 so we make the most conservative estimate, i.e. we suppose
613 that only this flow is active and, hence, R = t.
614 Really toks <= R <= toks/r_a.
615
616 This a priori shift in R will be adjusted later to reflect
617 the real delay. We cannot avoid it because:
618 - a throttled flow continues to be active from the viewpoint
619 of CSZ, so it would acquire the highest priority
620 if the start numbers were not adjusted;
621 - eventually, the finish number would become less than the round
622 number and the flow would be declared inactive.
623 */
624
625 toks = -toks;
626
627 /* Remember that we should start the watchdog */
628 if (toks < q->wd_expires)
629 q->wd_expires = toks;
630
631 toks >>= q->R_log;
632 shift += toks;
633 if (shift > 0) {
634 this->R_tbf += toks;
635 csz_move_queue(this, shift);
636 }
637 csz_insert_start(this);
638 return 0;
639 }
640 #endif
641
642
643 static struct sk_buff *
644 csz_dequeue(struct Qdisc* sch)
645 {
646 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
647 struct sk_buff *skb;
648 struct csz_flow *this;
649
650 #ifdef CSZ_PLUS_TBF
651 q->wd_expires = 0;
652 #endif
653 this = (struct csz_flow*)q->s.snext;
654
655 while (this != (struct csz_flow*)&q->s) {
656
657 /* First of all: unlink from start list */
658 this->sprev->snext = this->snext;
659 this->snext->sprev = this->sprev;
660
661 if (this != &q->flow[0]) { /* Guaranteed flow */
662 skb = __skb_dequeue(&this->q);
663 if (skb) {
664 #ifdef CSZ_PLUS_TBF
665 if (this->depth) {
666 if (!csz_enough_tokens(q, this, skb))
667 continue;
668 }
669 #endif
670 if (this->q.qlen) {
671 struct sk_buff *nskb = skb_peek(&this->q);
672 this->start += L2R(this,nskb->len);
673 csz_insert_start(&q->s, this);
674 }
675 sch->q.qlen--;
676 return skb;
677 }
678 } else { /* Predicted or best effort flow */
679 skb = skb_dequeue_best(q);
680 if (skb) {
681 unsigned peeked = this->peeked;
682 this->peeked = 0;
683
684 if (--this->q.qlen) {
685 struct sk_buff *nskb;
686 unsigned dequeued = L2R(this,skb->len);
687
688 /* We dequeued something different from what we
689 peeked earlier; adjust the start number
690 */
691 if (peeked != dequeued && peeked)
692 this->start += dequeued - peeked;
693
694 nskb = skb_peek_best(q);
695 peeked = L2R(this,nskb->len);
696 this->start += peeked;
697 this->peeked = peeked;
698 csz_insert_start(&q->s, this);
699 }
700 sch->q.qlen--;
701 return skb;
702 }
703 }
704 }
705 #ifdef CSZ_PLUS_TBF
706 /* We are about to return no skb.
707 Schedule the watchdog timer if this happened because of shaping.
708 */
709 if (q->wd_expires) {
710 unsigned long delay = PSCHED_US2JIFFIE(q->wd_expires);
711 if (delay == 0)
712 delay = 1;
713 mod_timer(&q->wd_timer, jiffies + delay);
714 sch->stats.overlimits++;
715 }
716 #endif
717 return NULL;
718 }
719
720 static void
721 csz_reset(struct Qdisc* sch)
722 {
723 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
724 int i;
725
726 for (i=0; i<4; i++)
727 skb_queue_purge(&q->other[i]);
728
729 for (i=0; i<CSZ_GUARANTEED; i++) {
730 struct csz_flow *this = q->flow + i;
731 skb_queue_purge(&this->q);
732 this->snext = this->sprev =
733 this->fnext = this->fprev = (struct csz_head*)this;
734 this->start = this->finish = 0;
735 }
736 q->s.snext = q->s.sprev = &q->s;
737 q->f.fnext = q->f.fprev = &q->f;
738 q->R_c = 0;
739 #ifdef CSZ_PLUS_TBF
740 PSCHED_GET_TIME(&q->t_tbf);
741 q->tokens = q->depth;
742 del_timer(&q->wd_timer);
743 #endif
744 sch->q.qlen = 0;
745 }
746
747 static void
748 csz_destroy(struct Qdisc* sch)
749 {
750 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
751 struct tcf_proto *tp;
752
753 while ((tp = q->filter_list) != NULL) {
754 q->filter_list = tp->next;
755 tcf_destroy(tp);
756 }
757
758 MOD_DEC_USE_COUNT;
759 }
760
761 static int csz_init(struct Qdisc *sch, struct rtattr *opt)
762 {
763 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
764 struct rtattr *tb[TCA_CSZ_PTAB];
765 struct tc_csz_qopt *qopt;
766 int i;
767
768 rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
769 if (tb[TCA_CSZ_PARMS-1] == NULL ||
770 RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*qopt))
771 return -EINVAL;
772 qopt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);
773
774 q->R_log = qopt->R_log;
775 q->delta_log = qopt->delta_log;
776 for (i=0; i<=TC_PRIO_MAX; i++) {
777 if (qopt->priomap[i] >= CSZ_FLOWS)
778 return -EINVAL;
779 q->prio2band[i] = qopt->priomap[i];
780 }
781
782 for (i=0; i<4; i++)
783 skb_queue_head_init(&q->other[i]);
784
785 for (i=0; i<CSZ_GUARANTEED; i++) {
786 struct csz_flow *this = q->flow + i;
787 skb_queue_head_init(&this->q);
788 this->snext = this->sprev =
789 this->fnext = this->fprev = (struct csz_head*)this;
790 this->start = this->finish = 0;
791 }
792 q->s.snext = q->s.sprev = &q->s;
793 q->f.fnext = q->f.fprev = &q->f;
794 q->R_c = 0;
795 #ifdef CSZ_PLUS_TBF
796 init_timer(&q->wd_timer);
797 q->wd_timer.data = (unsigned long)sch;
798 q->wd_timer.function = csz_watchdog;
799 #endif
800 MOD_INC_USE_COUNT;
801 return 0;
802 }
803
804 static int csz_dump(struct Qdisc *sch, struct sk_buff *skb)
805 {
806 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
807 unsigned char *b = skb->tail;
808 struct rtattr *rta;
809 struct tc_csz_qopt opt;
810
811 rta = (struct rtattr*)b;
812 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
813
814 opt.flows = CSZ_FLOWS;
815 memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
816 RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
817 rta->rta_len = skb->tail - b;
818
819 return skb->len;
820
821 rtattr_failure:
822 skb_trim(skb, b - skb->data);
823 return -1;
824 }
825
826 static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
827 struct Qdisc **old)
828 {
829 return -EINVAL;
830 }
831
832 static struct Qdisc * csz_leaf(struct Qdisc *sch, unsigned long cl)
833 {
834 return NULL;
835 }
836
837
838 static unsigned long csz_get(struct Qdisc *sch, u32 classid)
839 {
840 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
841 unsigned long band = TC_H_MIN(classid) - 1;
842
843 if (band >= CSZ_FLOWS)
844 return 0;
845
846 if (band < CSZ_GUARANTEED && q->flow[band].L_tab == NULL)
847 return 0;
848
849 return band+1;
850 }
851
852 static unsigned long csz_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
853 {
854 return csz_get(sch, classid);
855 }
856
857
858 static void csz_put(struct Qdisc *sch, unsigned long cl)
859 {
860 return;
861 }
862
863 static int csz_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg)
864 {
865 unsigned long cl = *arg;
866 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
867 struct rtattr *opt = tca[TCA_OPTIONS-1];
868 struct rtattr *tb[TCA_CSZ_PTAB];
869 struct tc_csz_copt *copt;
870
871 rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
872 if (tb[TCA_CSZ_PARMS-1] == NULL ||
873 RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*copt))
874 return -EINVAL;
875 copt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);
876
877 if (tb[TCA_CSZ_RTAB-1] &&
878 RTA_PAYLOAD(tb[TCA_CSZ_RTAB-1]) < 1024)
879 return -EINVAL;
880
881 if (cl) {
882 struct csz_flow *a;
883 cl--;
884 if (cl >= CSZ_FLOWS)
885 return -ENOENT;
886 if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
887 return -EINVAL;
888
889 a = &q->flow[cl];
890
891 spin_lock_bh(&sch->dev->queue_lock);
892 #if 0
893 a->rate_log = copt->rate_log;
894 #endif
895 a->limit = copt->limit; /* needed even without CSZ_PLUS_TBF */
896 a->rate = copt->rate;
897 a->slice = copt->slice; /* L2R() and q->rate rely on slice */
898 #ifdef CSZ_PLUS_TBF
899 a->buffer = copt->buffer; a->mtu = copt->mtu;
900 #endif
901
902 if (tb[TCA_CSZ_RTAB-1])
903 memcpy(a->L_tab, RTA_DATA(tb[TCA_CSZ_RTAB-1]), 1024);
904
905 spin_unlock_bh(&sch->dev->queue_lock);
906 return 0;
907 }
908 /* NI */
909 return 0;
910 }
911
912 static int csz_delete(struct Qdisc *sch, unsigned long cl)
913 {
914 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
915 struct csz_flow *a;
916
917 cl--;
918
919 if (cl >= CSZ_FLOWS)
920 return -ENOENT;
921 if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
922 return -EINVAL;
923
924 a = &q->flow[cl];
925
926 spin_lock_bh(&sch->dev->queue_lock);
927 a->fprev->fnext = a->fnext;
928 a->fnext->fprev = a->fprev;
929 a->sprev->snext = a->snext;
930 a->snext->sprev = a->sprev;
931 a->start = a->finish = 0;
932 kfree(xchg(&q->flow[cl].L_tab, NULL));
933 spin_unlock_bh(&sch->dev->queue_lock);
934
935 return 0;
936 }
937
938 static int csz_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm)
939 {
940 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
941 unsigned char *b = skb->tail;
942 struct rtattr *rta;
943 struct tc_csz_copt opt;
944
945 tcm->tcm_handle = sch->handle|cl;
946
947 cl--;
948
949 if (cl > CSZ_FLOWS)
950 goto rtattr_failure;
951
952 if (cl < CSZ_GUARANTEED) {
953 struct csz_flow *f = &q->flow[cl];
954
955 if (f->L_tab == NULL)
956 goto rtattr_failure;
957
958 rta = (struct rtattr*)b;
959 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
960
961 opt.limit = f->limit;
962 opt.rate = f->rate;
963 opt.slice = f->slice;
964 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
965 #ifdef CSZ_PLUS_TBF
966 opt.buffer = f->buffer;
967 opt.mtu = f->mtu;
968 #else
969 opt.buffer = 0;
970 opt.mtu = 0;
971 #endif
972
973 RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
974 rta->rta_len = skb->tail - b;
975 }
976
977 return skb->len;
978
979 rtattr_failure:
980 skb_trim(skb, b - skb->data);
981 return -1;
982 }
983
984 static void csz_walk(struct Qdisc *sch, struct qdisc_walker *arg)
985 {
986 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
987 int prio = 0;
988
989 if (arg->stop)
990 return;
991
992 for (prio = 0; prio < CSZ_FLOWS; prio++) {
993 if (arg->count < arg->skip) {
994 arg->count++;
995 continue;
996 }
997 if (prio < CSZ_GUARANTEED && q->flow[prio].L_tab == NULL) {
998 arg->count++;
999 continue;
1000 }
1001 if (arg->fn(sch, prio+1, arg) < 0) {
1002 arg->stop = 1;
1003 break;
1004 }
1005 arg->count++;
1006 }
1007 }
1008
1009 static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl)
1010 {
1011 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
1012
1013 if (cl)
1014 return NULL;
1015
1016 return &q->filter_list;
1017 }
1018
1019 struct Qdisc_class_ops csz_class_ops =
1020 {
1021 csz_graft,
1022 csz_leaf,
1023
1024 csz_get,
1025 csz_put,
1026 csz_change,
1027 csz_delete,
1028 csz_walk,
1029
1030 csz_find_tcf,
1031 csz_bind,
1032 csz_put,
1033
1034 csz_dump_class,
1035 };
1036
1037 struct Qdisc_ops csz_qdisc_ops =
1038 {
1039 NULL,
1040 &csz_class_ops,
1041 "csz",
1042 sizeof(struct csz_sched_data),
1043
1044 csz_enqueue,
1045 csz_dequeue,
1046 NULL,
1047 NULL,
1048
1049 csz_init,
1050 csz_reset,
1051 csz_destroy,
1052 NULL /* csz_change */,
1053
1054 csz_dump,
1055 };
1056
1057
1058 #ifdef MODULE
1059 int init_module(void)
1060 {
1061 return register_qdisc(&csz_qdisc_ops);
1062 }
1063
1064 void cleanup_module(void)
1065 {
1066 unregister_qdisc(&csz_qdisc_ops);
1067 }
1068 #endif
1069 MODULE_LICENSE("GPL");
1070