/*
 * net/sched/sch_red.c	Random Early Detection queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
 * J Hadi Salim <hadi@nortel.com> 980914:	computation fixes
 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
 * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>


/*	Random Early Detection (RED) algorithm.
	=======================================

	Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
	for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.

	This file codes a "divisionless" version of the RED algorithm
	as written down in Fig.17 of the paper.

Short description.
------------------

	When a new packet arrives we calculate the average queue length:

	avg = (1-W)*avg + W*current_queue_len,

	W is the filter time constant (chosen as 2^(-Wlog)); it controls
	the inertia of the algorithm. To allow larger bursts, W should be
	decreased.

	if (avg > th_max) -> packet marked (dropped).
	if (avg < th_min) -> packet passes.
	if (th_min < avg < th_max) we calculate probability:

	Pb = max_P * (avg - th_min)/(th_max-th_min)

	and mark (drop) packet with this probability.
	Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
	max_P should be small (not 1); usually 0.01..0.02 is a good value.

	max_P is chosen so that max_P/(th_max-th_min)
	is a negative power of two, in order that the arithmetic
	contains only shifts.


	Parameters, settable by user:
	-----------------------------

	limit		- bytes (must be > qth_max + burst)

	Hard limit on queue length, should be chosen > qth_max
	to allow packet bursts. This parameter does not
	affect the algorithm's behaviour and can be chosen
	arbitrarily high (well, less than ram size).
	Really, this limit will never be reached
	if RED works correctly.

	qth_min		- bytes (should be < qth_max/2)
	qth_max		- bytes (should be at least 2*qth_min and less than limit)
	Wlog	       	- bits (<32) log(1/W).
	Plog	       	- bits (<32)

	Plog is related to max_P by the formula:

	max_P = (qth_max-qth_min)/2^Plog;

	E.g. if qth_max=128K and qth_min=32K, then Plog=22
	corresponds to max_P=0.02

	Scell_log
	Stab

	Lookup table for log((1-W)^(t/t_ave)).


NOTES:

Upper bound on W.
-----------------

	If you want to allow bursts of L packets of size S,
	you should choose W:

	L + 1 - th_min/S < (1-(1-W)^L)/W

	th_min/S = 32         th_min/S = 4

	log(W)	L
	-1	33
	-2	35
	-3	39
	-4	46
	-5	57
	-6	75
	-7	101
	-8	135
	-9	190
	etc.

	(A sketch of these formulas in code follows this comment.)
 */
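
/* A minimal sketch of the fixed-point arithmetic described above.
 * It is excluded from the build and the helper names are hypothetical;
 * the real logic lives in red_enqueue() below, where qave carries its
 * binary point at Wlog and qth_min/qth_max are pre-scaled by 2^Wlog.
 */
#if 0
/* One EWMA step: avg = avg*(1-W) + backlog*W, with W = 2^(-Wlog).
 * Because qave is scaled by 2^Wlog, backlog enters unscaled and the
 * whole update reduces to a shift, an add and a subtract.
 */
static unsigned long red_ewma_step(unsigned long qave, u32 backlog, char Wlog)
{
	return qave + backlog - (qave >> Wlog);
}

/* The divisionless marking test: with qR uniform on 0..2^Plog-1,
 *
 *	((qave - qth_min) >> Wlog) * qcount < qR
 *
 * is the shift-only equivalent of
 *
 *	max_P*(avg - th_min)/(th_max - th_min) < rnd/qcount,
 *
 * using max_P = (qth_max - qth_min)/2^Plog from the comment above.
 */
static int red_mark_test_passes(unsigned long qave, u32 qth_min, char Wlog,
				int qcount, u32 qR)
{
	return ((qave - qth_min) >> Wlog) * qcount < qR;
}
#endif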

struct red_sched_data
{
/* Parameters */
	u32		limit;		/* HARD maximal queue length	*/
	u32		qth_min;	/* Min average length threshold: scaled by 2^Wlog */
	u32		qth_max;	/* Max average length threshold: scaled by 2^Wlog */
	u32		Rmask;		/* 2^Plog - 1: mask for random numbers */
	u32		Scell_max;	/* Largest idle time covered by Stab lookup */
	unsigned char	flags;
	char		Wlog;		/* log(1/W)		*/
	char		Plog;		/* random number bits	*/
	char		Scell_log;	/* cell size for idle damping lookup */
	u8		Stab[256];	/* idle damping lookup table */

/* Variables */
	unsigned long	qave;		/* Average queue length: scaled by 2^Wlog */
	int		qcount;		/* Packets since last random number generation */
	u32		qR;		/* Cached random number */

	psched_time_t	qidlestart;	/* Start of idle period		*/
	struct tc_red_xstats st;
};

static int red_ecn_mark(struct sk_buff *skb)
{
	if (skb->nh.raw + 20 > skb->tail)
		return 0;

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
		if (!INET_ECN_is_capable(skb->nh.iph->tos))
			return 0;
		if (INET_ECN_is_not_ce(skb->nh.iph->tos))
			IP_ECN_set_ce(skb->nh.iph);
		return 1;
	case __constant_htons(ETH_P_IPV6):
		if (!INET_ECN_is_capable(ip6_get_dsfield(skb->nh.ipv6h)))
			return 0;
		IP6_ECN_set_ce(skb->nh.ipv6h);
		return 1;
	default:
		return 0;
	}
}

static int
red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;

	psched_time_t now;

	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
		long us_idle;
		int  shift;

		PSCHED_GET_TIME(now);
		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0);
		PSCHED_SET_PASTPERFECT(q->qidlestart);

/*
   The problem: ideally, average queue length recalculation should
   be done over constant clock intervals. This is too expensive, so
   the calculation is driven by outgoing packets.
   When the queue is idle we have to model this clock by hand.

   SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
   dummy packets as a burst after idle time, i.e.

          q->qave *= (1-W)^m

   This is an apparently overcomplicated solution (e.g. we have to
   precompute a table to make this calculation in reasonable time).
   I believe that a simpler model may be used here,
   but it is a field for experiments.
*/
		shift = q->Stab[us_idle>>q->Scell_log];

		if (shift) {
			q->qave >>= shift;
		} else {
			/* Approximate the initial part of the exponent
			   with a linear function:
			   (1-W)^m ~= 1-mW + ...

			   It seems to be the best solution to the
			   problem of too coarse exponent tabulation.
			 */

			us_idle = (q->qave * us_idle)>>q->Scell_log;
			if (us_idle < q->qave/2)
				q->qave -= us_idle;
			else
				q->qave >>= 1;
		}
	} else {
		q->qave += sch->stats.backlog - (q->qave >> q->Wlog);
		/* NOTE:
		   q->qave is a fixed point number with the point at Wlog.
		   The formula above is equivalent to the floating point
		   version:

		   qave = qave*(1-W) + sch->stats.backlog*W;
		                                           --ANK (980924)
		 */
	}

	if (q->qave < q->qth_min) {
		q->qcount = -1;
enqueue:
		if (sch->stats.backlog + skb->len <= q->limit) {
			__skb_queue_tail(&sch->q, skb);
			sch->stats.backlog += skb->len;
			sch->stats.bytes += skb->len;
			sch->stats.packets++;
			return NET_XMIT_SUCCESS;
		} else {
			q->st.pdrop++;
		}
		kfree_skb(skb);
		sch->stats.drops++;
		return NET_XMIT_DROP;
	}
	if (q->qave >= q->qth_max) {
		q->qcount = -1;
		sch->stats.overlimits++;
mark:
		if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
			q->st.early++;
			goto drop;
		}
		q->st.marked++;
		goto enqueue;
	}

	if (++q->qcount) {
		/* The formula used below causes questions.

		   OK. qR is a random number in the interval 0..Rmask,
		   i.e. 0..(2^Plog). If we used floating point
		   arithmetic, it would be: (2^Plog)*rnd_num,
		   where rnd_num is less than 1.

		   Taking into account that qave has its fixed
		   point at Wlog, and that Plog is related to max_P by
		   max_P = (qth_max-qth_min)/2^Plog, the two lines
		   below have the following floating point equivalent:

		   max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount

		   Any questions? --ANK (980924)
		 */
		if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
			goto enqueue;
		q->qcount = 0;
		q->qR = net_random()&q->Rmask;
		sch->stats.overlimits++;
		goto mark;
	}
	q->qR = net_random()&q->Rmask;
	goto enqueue;

drop:
	kfree_skb(skb);
	sch->stats.drops++;
	return NET_XMIT_CN;
}

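/* The Stab[] idle-damping table used above is supplied by userspace via
 * TCA_RED_STAB; the kernel only indexes it. As a hedged illustration
 * (not the actual tc code; the helper name and the microsecond time
 * unit are assumptions), a table satisfying
 *
 *	qave >> Stab[t >> Scell_log]  ~=  qave * (1-W)^m,  m = t/xmit_time
 *
 * could be filled like this:
 */
#if 0
#include <math.h>

static void red_fill_stab_sketch(unsigned char *stab, int Wlog,
				 int Scell_log, double xmit_time_us)
{
	/* Each "dummy packet" multiplies qave by (1-W) = 1 - 2^(-Wlog),
	 * i.e. costs -log2(1 - 2^(-Wlog)) bits of right shift; turn that
	 * per-packet rate into a per-microsecond rate.
	 */
	double shift_per_us = -log2(1.0 - 1.0/(1 << Wlog)) / xmit_time_us;
	int i;

	for (i = 0; i < 256; i++) {
		double t = (double)i * (double)(1u << Scell_log);
		double shift = shift_per_us * t;

		/* Entry 0 stays 0; short idle periods then fall back to
		 * the linear approximation branch in red_enqueue().
		 */
		stab[i] = shift > 255.0 ? 255 : (unsigned char)shift;
	}
}
#endif
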
static int
red_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;

	PSCHED_SET_PASTPERFECT(q->qidlestart);

	__skb_queue_head(&sch->q, skb);
	sch->stats.backlog += skb->len;
	return 0;
}

static struct sk_buff *
red_dequeue(struct Qdisc* sch)
{
	struct sk_buff *skb;
	struct red_sched_data *q = (struct red_sched_data *)sch->data;

	skb = __skb_dequeue(&sch->q);
	if (skb) {
		sch->stats.backlog -= skb->len;
		return skb;
	}
	PSCHED_GET_TIME(q->qidlestart);
	return NULL;
}

static unsigned int red_drop(struct Qdisc* sch)
{
	struct sk_buff *skb;
	struct red_sched_data *q = (struct red_sched_data *)sch->data;

	skb = __skb_dequeue_tail(&sch->q);
	if (skb) {
		unsigned int len = skb->len;
		sch->stats.backlog -= len;
		sch->stats.drops++;
		q->st.other++;
		kfree_skb(skb);
		return len;
	}
	PSCHED_GET_TIME(q->qidlestart);
	return 0;
}

static void red_reset(struct Qdisc* sch)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;

	__skb_queue_purge(&sch->q);
	sch->stats.backlog = 0;
	PSCHED_SET_PASTPERFECT(q->qidlestart);
	q->qave = 0;
	q->qcount = -1;
}

static int red_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;
	struct rtattr *tb[TCA_RED_STAB];
	struct tc_red_qopt *ctl;

	if (opt == NULL ||
	    rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
	    tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
	    RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
	    RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
		return -EINVAL;

	ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);

	sch_tree_lock(sch);
	q->flags = ctl->flags;
	q->Wlog = ctl->Wlog;
	q->Plog = ctl->Plog;
	q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
	q->Scell_log = ctl->Scell_log;
	q->Scell_max = (255<<q->Scell_log);
	q->qth_min = ctl->qth_min<<ctl->Wlog;
	q->qth_max = ctl->qth_max<<ctl->Wlog;
	q->limit = ctl->limit;
	memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);

	q->qcount = -1;
	if (skb_queue_len(&sch->q) == 0)
		PSCHED_SET_PASTPERFECT(q->qidlestart);
	sch_tree_unlock(sch);
	return 0;
}

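/* red_change() expects Wlog, Plog and Scell_log to be precomputed by
 * userspace (tc). As a hedged sketch (hypothetical helper, not the tc
 * implementation), Plog can be derived from a desired max_P through
 * max_P = (qth_max - qth_min)/2^Plog from the comment at the top:
 */
#if 0
#include <math.h>

static int red_eval_plog_sketch(unsigned qth_min, unsigned qth_max,
				double max_P)
{
	int Plog;

	if (qth_max <= qth_min || max_P <= 0.0)
		return -1;

	/* Round log2((qth_max - qth_min)/max_P) to the nearest integer. */
	Plog = (int)(log2((double)(qth_max - qth_min) / max_P) + 0.5);

	return (Plog > 0 && Plog < 32) ? Plog : -1;
}

/* E.g. qth_min=32K, qth_max=128K, max_P=0.02 gives Plog=22
 * (actual max_P = 98304/2^22 ~= 0.023), matching the example in the
 * header comment.
 */
#endif
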
static int red_init(struct Qdisc* sch, struct rtattr *opt)
{
	int err;

	MOD_INC_USE_COUNT;

	if ((err = red_change(sch, opt)) != 0) {
		MOD_DEC_USE_COUNT;
	}
	return err;
}


int red_copy_xstats(struct sk_buff *skb, struct tc_red_xstats *st)
{
	RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
	return 0;

rtattr_failure:
	return 1;
}

static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;
	unsigned char *b = skb->tail;
	struct rtattr *rta;
	struct tc_red_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
	opt.limit = q->limit;
	opt.qth_min = q->qth_min>>q->Wlog;
	opt.qth_max = q->qth_max>>q->Wlog;
	opt.Wlog = q->Wlog;
	opt.Plog = q->Plog;
	opt.Scell_log = q->Scell_log;
	opt.flags = q->flags;
	RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;

	if (red_copy_xstats(skb, &q->st))
		goto rtattr_failure;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static void red_destroy(struct Qdisc *sch)
{
	MOD_DEC_USE_COUNT;
}

struct Qdisc_ops red_qdisc_ops =
{
	NULL,			/* next */
	NULL,			/* cl_ops */
	"red",			/* id */
	sizeof(struct red_sched_data),

	red_enqueue,
	red_dequeue,
	red_requeue,
	red_drop,

	red_init,
	red_reset,
	red_destroy,
	red_change,

	red_dump,
};


#ifdef MODULE
int init_module(void)
{
	return register_qdisc(&red_qdisc_ops);
}

void cleanup_module(void)
{
	unregister_qdisc(&red_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");