/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to data participating in scheduling
   must additionally be protected by the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock(qdisc_tree_lock)
     and this lock is used only in process context.
   - updates to the tree are made only under the rtnl semaphore,
     hence the write lock may be taken without disabling local BH.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED;

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   dev->xmit_lock serializes accesses to the device driver.

   dev->queue_lock and dev->xmit_lock are mutually exclusive:
   if one is held, the other must be free.
 */
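
/* An illustrative sketch of the ordering rules above (an assumption for
 * documentation only, not code used in this file): a process-context
 * walker takes the tree lock before the per-device queue lock, and never
 * holds dev->xmit_lock together with dev->queue_lock.  dev_init_scheduler()
 * below follows the same pattern with the write lock.
 *
 *	read_lock(&qdisc_tree_lock);
 *	spin_lock_bh(&dev->queue_lock);
 *	... inspect dev->qdisc and its children ...
 *	spin_unlock_bh(&dev->queue_lock);
 *	read_unlock(&qdisc_tree_lock);
 */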


/* Kick the device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. Device is throttled if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/
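
/* A minimal caller sketch (an illustration, not code used here) of how the
 * return convention above is consumed: keep restarting while the device is
 * not stopped and qdisc_restart() keeps returning a negative value.  The
 * in-tree caller, qdisc_run() in <net/pkt_sched.h>, loops in essentially
 * this way; kick_queue() is a hypothetical name used only for this sketch.
 *
 *	static inline void kick_queue(struct net_device *dev)
 *	{
 *		while (!netif_queue_stopped(dev) &&
 *		       qdisc_restart(dev) < 0)
 *			;
 *	}
 */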

int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if ((skb = q->dequeue(q)) != NULL) {
		if (spin_trylock(&dev->xmit_lock)) {
			/* Remember that the driver is grabbed by us. */
			dev->xmit_lock_owner = smp_processor_id();

			/* And release the queue lock */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				if (netdev_nit)
					dev_queue_xmit_nit(skb, dev);

				if (dev->hard_start_xmit(skb, dev) == 0) {
					dev->xmit_lock_owner = -1;
					spin_unlock(&dev->xmit_lock);

					spin_lock(&dev->queue_lock);
					return -1;
				}
			}

			/* Release the driver */
			dev->xmit_lock_owner = -1;
			spin_unlock(&dev->xmit_lock);
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		} else {
			/* So, someone grabbed the driver. */

			/* It may be a transient configuration error,
			   when hard_start_xmit() recurses. We detect
			   it by checking the xmit lock owner and drop
			   the packet when a dead loop is detected.
			 */
			if (dev->xmit_lock_owner == smp_processor_id()) {
				kfree_skb(skb);
				if (net_ratelimit())
					printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
				return -1;
			}
			netdev_rx_stat[smp_processor_id()].cpu_collision++;
		}

		/* Device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (e.g. dialout)
		   3. device is buggy (ppp)
		 */

		q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	return q->q.qlen;
}

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	spin_lock(&dev->xmit_lock);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    (jiffies - dev->trans_start) > dev->watchdog_timeo) {
				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
				dev_hold(dev);
		}
	}
	spin_unlock(&dev->xmit_lock);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}
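
/* A hedged usage sketch: the watchdog only runs for drivers that provide a
 * tx_timeout handler.  A hypothetical driver would set the handler, and
 * optionally its own timeout, before registering the device; otherwise the
 * 5*HZ default above is applied when the device is activated:
 *
 *	dev->tx_timeout = mydrv_tx_timeout;	(hypothetical handler)
 *	dev->watchdog_timeo = 2*HZ;		(optional; defaults to 5*HZ)
 */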

static void dev_watchdog_up(struct net_device *dev)
{
	spin_lock_bh(&dev->xmit_lock);
	__netdev_watchdog_up(dev);
	spin_unlock_bh(&dev->xmit_lock);
}

static void dev_watchdog_down(struct net_device *dev)
{
	spin_lock_bh(&dev->xmit_lock);
	if (del_timer(&dev->watchdog_timer))
		__dev_put(dev);
	spin_unlock_bh(&dev->xmit_lock);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int
noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *
noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

static int
noop_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops =
{
	NULL,
	NULL,
	"noop",
	0,

	noop_enqueue,
	noop_dequeue,
	noop_requeue,
};

struct Qdisc noop_qdisc =
{
	noop_enqueue,
	noop_dequeue,
	TCQ_F_BUILTIN,
	&noop_qdisc_ops,
};


struct Qdisc_ops noqueue_qdisc_ops =
{
	NULL,
	NULL,
	"noqueue",
	0,

	noop_enqueue,
	noop_dequeue,
	noop_requeue,

};

struct Qdisc noqueue_qdisc =
{
	NULL,
	noop_dequeue,
	TCQ_F_BUILTIN,
	&noqueue_qdisc_ops,
};


static const u8 prio2band[TC_PRIO_MAX+1] =
{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */
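
/* Illustrative note (an assumption about the TC_PRIO_* values, not code
 * used below): skb->priority indexes prio2band[] to pick one of the three
 * bands, and the dequeue loop below drains band 0, then 1, then 2.  With
 * the table above that gives, roughly:
 *
 *	band = prio2band[skb->priority & TC_PRIO_MAX];
 *	band 0: TC_PRIO_INTERACTIVE, TC_PRIO_CONTROL   (served first)
 *	band 1: TC_PRIO_BESTEFFORT and most others
 *	band 2: TC_PRIO_BULK and similar
 */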

static int
pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	struct sk_buff_head *list;

	list = ((struct sk_buff_head*)qdisc->data) +
		prio2band[skb->priority&TC_PRIO_MAX];

	if (list->qlen < qdisc->dev->tx_queue_len) {
		__skb_queue_tail(list, skb);
		qdisc->q.qlen++;
		qdisc->stats.bytes += skb->len;
		qdisc->stats.packets++;
		return 0;
	}
	qdisc->stats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct sk_buff *
pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
	struct sk_buff *skb;

	for (prio = 0; prio < 3; prio++, list++) {
		skb = __skb_dequeue(list);
		if (skb) {
			qdisc->q.qlen--;
			return skb;
		}
	}
	return NULL;
}

static int
pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	struct sk_buff_head *list;

	list = ((struct sk_buff_head*)qdisc->data) +
		prio2band[skb->priority&TC_PRIO_MAX];

	__skb_queue_head(list, skb);
	qdisc->q.qlen++;
	return 0;
}

static void
pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);

	for (prio=0; prio < 3; prio++)
		skb_queue_purge(list+prio);
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	unsigned char *b = skb->tail;
	struct tc_prio_qopt opt;

	opt.bands = 3;
	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int i;
	struct sk_buff_head *list;

	list = ((struct sk_buff_head*)qdisc->data);

	for (i=0; i<3; i++)
		skb_queue_head_init(list+i);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops =
{
	NULL,
	NULL,
	"pfifo_fast",
	3 * sizeof(struct sk_buff_head),

	pfifo_fast_enqueue,
	pfifo_fast_dequeue,
	pfifo_fast_requeue,
	NULL,

	pfifo_fast_init,
	pfifo_fast_reset,
	NULL,
	NULL,
	pfifo_fast_dump,

};

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
	struct Qdisc *sch;
	int size = sizeof(*sch) + ops->priv_size;

	sch = kmalloc(size, GFP_KERNEL);
	if (!sch)
		return NULL;
	memset(sch, 0, size);

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	sch->stats.lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);
	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	kfree(sch);
	return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags&TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;
	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	qdisc_kill_estimator(&qdisc->stats);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);
	kfree(qdisc);
}
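
/* A minimal caller sketch per the locking note above (for illustration
 * only; "old" is a hypothetical qdisc being replaced): the reference is
 * dropped with dev->queue_lock held and BH disabled, exactly as
 * dev_shutdown() below does.  The builtin qdiscs (noop_qdisc,
 * noqueue_qdisc) are never freed thanks to the TCQ_F_BUILTIN check.
 *
 *	spin_lock_bh(&dev->queue_lock);
 *	qdisc_destroy(old);
 *	spin_unlock_bh(&dev->queue_lock);
 */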


void dev_activate(struct net_device *dev)
{
	/* If no queueing discipline is attached to the device,
	   create a default one: pfifo_fast for devices which
	   need queueing, and noqueue_qdisc for virtual
	   interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			write_lock(&qdisc_tree_lock);
			list_add_tail(&qdisc->list, &dev->qdisc_list);
			write_unlock(&qdisc_tree_lock);

		} else {
			qdisc = &noqueue_qdisc;
		}
		write_lock(&qdisc_tree_lock);
		dev->qdisc_sleeping = qdisc;
		write_unlock(&qdisc_tree_lock);
	}

	spin_lock_bh(&dev->queue_lock);
	if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	while (test_bit(__LINK_STATE_SCHED, &dev->state))
		yield();

	spin_unlock_wait(&dev->xmit_lock);
}

void dev_init_scheduler(struct net_device *dev)
{
	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	dev->qdisc = &noop_qdisc;
	spin_unlock_bh(&dev->queue_lock);
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	write_unlock(&qdisc_tree_lock);

	dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(list_empty(&dev->qdisc_list));
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	spin_unlock_bh(&dev->queue_lock);
	write_unlock(&qdisc_tree_lock);
}
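
/* A hedged sketch of the lifecycle these helpers implement (the callers
 * named here live in net/core/dev.c and are an assumption, not part of
 * this file): a device starts out with &noop_qdisc attached and only gets
 * a working queue while it is up.
 *
 *	register_netdev()   -> dev_init_scheduler()  (qdisc = &noop_qdisc)
 *	dev_open()          -> dev_activate()        (pfifo_fast or noqueue)
 *	dev_close()         -> dev_deactivate()      (back to &noop_qdisc)
 *	unregister_netdev() -> dev_shutdown()        (destroy sleeping qdisc)
 */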