/*
 * net/sched/sch_red.c	Random Early Detection queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
 * J Hadi Salim <hadi@nortel.com> 980914:	computation fixes
 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
 * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>


/*	Random Early Detection (RED) algorithm.
	=======================================

	Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
	for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.

	This file codes a "divisionless" version of the RED algorithm
	as written down in Fig. 17 of the paper.

	Short description.
	------------------

	When a new packet arrives we calculate the average queue length:

	avg = (1-W)*avg + W*current_queue_len,

	where W is the filter time constant (chosen as 2^(-Wlog)); it controls
	the inertia of the algorithm. To allow larger bursts, W should be
	decreased.
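
	As an illustration (a sketch, not the code itself): if qave stores
	avg as a fixed point number with the binary point at Wlog bits,
	the update above costs one shift, one subtract and one add:

		qave += backlog - (qave >> Wlog);

	which is exactly the form used in red_enqueue() below.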

	if (avg > th_max) -> packet marked (dropped).
	if (avg < th_min) -> packet passes.
	if (th_min < avg < th_max) we calculate probability:

	Pb = max_P * (avg - th_min)/(th_max-th_min)

	and mark (drop) the packet with this probability.
	Pb changes from 0 (at avg==th_min) to max_P (at avg==th_max).
	max_P should be small (not 1); usually 0.01..0.02 is a good value.

	max_P is chosen as a number such that max_P/(th_max-th_min)
	is a negative power of two, so that the arithmetic contains
	only shifts.


	Parameters, settable by user:
	-----------------------------

	limit		- bytes (must be > qth_max + burst)

		Hard limit on queue length, should be chosen > qth_max
		to allow packet bursts. This parameter does not
		affect the algorithm's behaviour and can be chosen
		arbitrarily high (well, less than RAM size).
		Really, this limit will never be reached
		if RED works correctly.

	qth_min		- bytes (should be < qth_max/2)
	qth_max		- bytes (should be at least 2*qth_min and less than limit)
	Wlog		- bits (<32) log(1/W).
	Plog		- bits (<32)

		Plog is related to max_P by the formula:

		max_P = (qth_max-qth_min)/2^Plog;

		E.g. if qth_max=128K and qth_min=32K, then Plog=22
		corresponds to max_P=0.02.
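
		(Worked out: 128K - 32K = 98304 bytes, and
		98304/2^22 = 98304/4194304 ~= 0.023, i.e. roughly
		the 0.02 quoted above.)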

	Scell_log
	Stab

		Lookup table for log((1-W)^(t/t_ave)).

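		The kernel only consumes Stab; userspace fills it in via
		the TCA_RED_STAB attribute. A sketch (an illustration under
		the formula above, with t_ave denoting the average packet
		transmission time; build_stab() is a hypothetical userspace
		helper, not part of this file):

			#include <math.h>

			/* stab[i] is the right shift approximating
			 * multiplication by (1-W)^(t/t_ave) for an idle
			 * time t = i << Scell_log, using
			 * qave >> shift ~= qave * 2^(-shift).
			 */
			void build_stab(unsigned char stab[256], int Wlog,
					int Scell_log, double t_ave)
			{
				double W = pow(2.0, -Wlog);
				int i;

				for (i = 0; i < 256; i++) {
					double t = ldexp((double)i, Scell_log);
					double s = -(t/t_ave) * log2(1.0 - W);
					stab[i] = s > 255.0 ? 255 : (unsigned char)s;
				}
			}

		Note that stab[0] == 0: red_enqueue() treats a zero shift
		as "idle time too short for the table" and falls back to a
		linear approximation of the exponent.
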

	NOTES:

	Upper bound on W.
	-----------------

	If you want to allow bursts of L packets of size S,
	you should choose W:

	L + 1 - th_min/S < (1-(1-W)^L)/W

	th_min/S = 32		th_min/S = 4

	log(W)	L
	-1	33
	-2	35
	-3	39
	-4	46
	-5	57
	-6	75
	-7	101
	-8	135
	-9	190
	etc.
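
	As a sketch (illustration only, evaluating the inequality above
	directly; red_max_burst() is a hypothetical userspace helper and
	may differ by one from the table depending on rounding):

		#include <math.h>

		/* Largest L with L + 1 - th_min/S < (1-(1-W)^L)/W,
		 * where W = 2^(-Wlog) and th_min_S = th_min/S.
		 */
		int red_max_burst(int Wlog, double th_min_S)
		{
			double W = pow(2.0, -Wlog);
			int L;

			for (L = 1; ; L++)
				if (L + 1 - th_min_S >=
				    (1.0 - pow(1.0 - W, L))/W)
					return L - 1;
		}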
*/

struct red_sched_data
{
/* Parameters */
	u32		limit;		/* HARD maximal queue length */
	u32		qth_min;	/* Min average length threshold: A scaled */
	u32		qth_max;	/* Max average length threshold: A scaled */
	u32		Rmask;
	u32		Scell_max;
	unsigned char	flags;
	char		Wlog;		/* log(W) */
	char		Plog;		/* random number bits */
	char		Scell_log;
	u8		Stab[256];

/* Variables */
	unsigned long	qave;		/* Average queue length: A scaled */
	int		qcount;		/* Packets since last random number generation */
	u32		qR;		/* Cached random number */

	psched_time_t	qidlestart;	/* Start of idle period */
	struct tc_red_xstats st;
};
static int red_ecn_mark(struct sk_buff *skb)
{
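	/* Need at least 20 bytes of linear network header (a minimal
	 * IPv4 header; also covers the IPv6 fields read below) before
	 * touching the packet.
	 */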
	if (skb->nh.raw + 20 > skb->tail)
		return 0;

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
		if (!INET_ECN_is_capable(skb->nh.iph->tos))
			return 0;
		if (INET_ECN_is_not_ce(skb->nh.iph->tos))
			IP_ECN_set_ce(skb->nh.iph);
		return 1;
	case __constant_htons(ETH_P_IPV6):
		if (!INET_ECN_is_capable(ip6_get_dsfield(skb->nh.ipv6h)))
			return 0;
		IP6_ECN_set_ce(skb->nh.ipv6h);
		return 1;
	default:
		return 0;
	}
}

static int
red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;

	psched_time_t now;

	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
		long us_idle;
		int shift;

		PSCHED_GET_TIME(now);
		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0);
		PSCHED_SET_PASTPERFECT(q->qidlestart);

/*
   The problem: ideally, the average queue length recalculation should
   be done over constant clock intervals. That is too expensive, so
   the calculation is driven by outgoing packets.
   When the queue is idle we have to model this clock by hand.

   SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
   dummy packets as a burst after the idle time, i.e.

          q->qave *= (1-W)^m

   This is an apparently overcomplicated solution (e.g. we have to
   precompute a table to make this calculation in reasonable time).
   I believe that a simpler model may be used here,
   but it is a field for experiments.
*/
		shift = q->Stab[us_idle>>q->Scell_log];

		if (shift) {
			q->qave >>= shift;
		} else {
			/* Approximate the initial part of the exponent
			   with a linear function:
			   (1-W)^m ~= 1-mW + ...

			   This seems to be the best solution to the
			   problem of too coarse exponent tabulation.
			 */

			us_idle = (q->qave * us_idle)>>q->Scell_log;
			if (us_idle < q->qave/2)
				q->qave -= us_idle;
			else
				q->qave >>= 1;
		}
	} else {
		q->qave += sch->stats.backlog - (q->qave >> q->Wlog);
		/* NOTE:
		   q->qave is a fixed point number with the point at Wlog.
		   The formula above is equivalent to the floating point
		   version:

		   qave = qave*(1-W) + sch->stats.backlog*W;
						--ANK (980924)
		 */
	}

	if (q->qave < q->qth_min) {
		q->qcount = -1;
enqueue:
		if (sch->stats.backlog + skb->len <= q->limit) {
			__skb_queue_tail(&sch->q, skb);
			sch->stats.backlog += skb->len;
			sch->stats.bytes += skb->len;
			sch->stats.packets++;
			return NET_XMIT_SUCCESS;
		} else {
			q->st.pdrop++;
		}
		kfree_skb(skb);
		sch->stats.drops++;
		return NET_XMIT_DROP;
	}
	if (q->qave >= q->qth_max) {
		q->qcount = -1;
		sch->stats.overlimits++;
mark:
		if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
			q->st.early++;
			goto drop;
		}
		q->st.marked++;
		goto enqueue;
	}

	if (++q->qcount) {
		/* The formula used below causes questions.

		   OK. qR is a random number in the interval 0..Rmask,
		   i.e. 0..(2^Plog - 1). If we used floating point
		   arithmetic, it would be (2^Plog)*rnd_num,
		   where rnd_num is less than 1.

		   Taking into account that qave has the fixed point at
		   Wlog, and that Plog is related to max_P by
		   max_P = (qth_max-qth_min)/2^Plog, the two lines
		   below have the following floating point equivalent:

		   max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount

		   Any questions? --ANK (980924)
		 */
		if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
			goto enqueue;
		q->qcount = 0;
		q->qR = net_random()&q->Rmask;
		sch->stats.overlimits++;
		goto mark;
	}
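	/* qcount was -1 (reset above), so ++ made it 0: this is the first
	 * packet of a new sampling interval; just draw a fresh random
	 * number and enqueue.
	 */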
	q->qR = net_random()&q->Rmask;
	goto enqueue;

drop:
	kfree_skb(skb);
	sch->stats.drops++;
	return NET_XMIT_CN;
}

static int
red_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;

	PSCHED_SET_PASTPERFECT(q->qidlestart);

	__skb_queue_head(&sch->q, skb);
	sch->stats.backlog += skb->len;
	return 0;
}

static struct sk_buff *
red_dequeue(struct Qdisc* sch)
{
	struct sk_buff *skb;
	struct red_sched_data *q = (struct red_sched_data *)sch->data;

	skb = __skb_dequeue(&sch->q);
	if (skb) {
		sch->stats.backlog -= skb->len;
		return skb;
	}
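	/* Queue just went empty: record when the idle period started so
	 * that red_enqueue() can age qave accordingly.
	 */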
	PSCHED_GET_TIME(q->qidlestart);
	return NULL;
}

static unsigned int red_drop(struct Qdisc* sch)
{
	struct sk_buff *skb;
	struct red_sched_data *q = (struct red_sched_data *)sch->data;

	skb = __skb_dequeue_tail(&sch->q);
	if (skb) {
		unsigned int len = skb->len;
		sch->stats.backlog -= len;
		sch->stats.drops++;
		q->st.other++;
		kfree_skb(skb);
		return len;
	}
	PSCHED_GET_TIME(q->qidlestart);
	return 0;
}

static void red_reset(struct Qdisc* sch)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;

	__skb_queue_purge(&sch->q);
	sch->stats.backlog = 0;
	PSCHED_SET_PASTPERFECT(q->qidlestart);
	q->qave = 0;
	q->qcount = -1;
}

static int red_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;
	struct rtattr *tb[TCA_RED_STAB];
	struct tc_red_qopt *ctl;

	if (opt == NULL ||
	    rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
	    tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
	    RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
	    RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
		return -EINVAL;

	ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);

	sch_tree_lock(sch);
	q->flags = ctl->flags;
	q->Wlog = ctl->Wlog;
	q->Plog = ctl->Plog;
	q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
	q->Scell_log = ctl->Scell_log;
	q->Scell_max = (255<<q->Scell_log);
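	/* User-supplied thresholds are plain byte counts; shift them into
	 * the fixed point domain of qave (point at Wlog) so the enqueue
	 * path can compare them directly.
	 */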
	q->qth_min = ctl->qth_min<<ctl->Wlog;
	q->qth_max = ctl->qth_max<<ctl->Wlog;
	q->limit = ctl->limit;
	memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);

	q->qcount = -1;
	if (skb_queue_len(&sch->q) == 0)
		PSCHED_SET_PASTPERFECT(q->qidlestart);
	sch_tree_unlock(sch);
	return 0;
}

static int red_init(struct Qdisc* sch, struct rtattr *opt)
{
	int err;

	MOD_INC_USE_COUNT;

	if ((err = red_change(sch, opt)) != 0) {
		MOD_DEC_USE_COUNT;
	}
	return err;
}


int red_copy_xstats(struct sk_buff *skb, struct tc_red_xstats *st)
{
	RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
	return 0;

rtattr_failure:
	return 1;
}

static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;
	unsigned char *b = skb->tail;
	struct rtattr *rta;
	struct tc_red_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
	opt.limit = q->limit;
	opt.qth_min = q->qth_min>>q->Wlog;
	opt.qth_max = q->qth_max>>q->Wlog;
	opt.Wlog = q->Wlog;
	opt.Plog = q->Plog;
	opt.Scell_log = q->Scell_log;
	opt.flags = q->flags;
	RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;

	if (red_copy_xstats(skb, &q->st))
		goto rtattr_failure;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static void red_destroy(struct Qdisc *sch)
{
	MOD_DEC_USE_COUNT;
}

struct Qdisc_ops red_qdisc_ops =
{
	NULL,
	NULL,
	"red",
	sizeof(struct red_sched_data),

	red_enqueue,
	red_dequeue,
	red_requeue,
	red_drop,

	red_init,
	red_reset,
	red_destroy,
	red_change,

	red_dump,
};


#ifdef MODULE
int init_module(void)
{
	return register_qdisc(&red_qdisc_ops);
}

void cleanup_module(void)
{
	unregister_qdisc(&red_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");