/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *		- Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to data participating in scheduling
   must additionally be protected with the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top-level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock(qdisc_tree_lock)
     and this lock is used only in process context.
   - updates to the tree are made only under the rtnl semaphore,
     hence this lock may be taken without disabling local BHs.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED;

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   dev->xmit_lock serializes accesses to the device driver.

   dev->queue_lock and dev->xmit_lock are mutually exclusive:
   if one is held, the other must be free.
 */


/* Kick device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0 - queue is empty.
            >0 - queue is not empty, but throttled.
            <0 - queue is not empty. Device is throttled, if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
 */

int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if ((skb = q->dequeue(q)) != NULL) {
		if (spin_trylock(&dev->xmit_lock)) {
			/* Remember that the driver is grabbed by us. */
			dev->xmit_lock_owner = smp_processor_id();

			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				if (netdev_nit)
					dev_queue_xmit_nit(skb, dev);

				if (dev->hard_start_xmit(skb, dev) == 0) {
					dev->xmit_lock_owner = -1;
					spin_unlock(&dev->xmit_lock);

					spin_lock(&dev->queue_lock);
					return -1;
				}
			}

			/* Release the driver */
			dev->xmit_lock_owner = -1;
			spin_unlock(&dev->xmit_lock);
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		} else {
			/* So, someone grabbed the driver. */

			/* It may be a transient configuration error,
			   when hard_start_xmit() recurses. We detect
			   it by checking the xmit owner and drop the
			   packet when a dead loop is detected.
			 */
			if (dev->xmit_lock_owner == smp_processor_id()) {
				kfree_skb(skb);
				if (net_ratelimit())
					printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
				return -1;
			}
			netdev_rx_stat[smp_processor_id()].cpu_collision++;
		}

		/* Device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (f.e. dialout)
		   3. device is buggy (ppp)
		 */

		q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	return q->q.qlen;
}

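/* Transmit watchdog: runs watchdog_timeo jiffies after being armed.
 * If the device is present, running and has carrier, but its queue has
 * been stopped for longer than watchdog_timeo since the last transmit,
 * the driver's tx_timeout() handler is invoked.  The timer then re-arms
 * itself for as long as the device stays active.
 */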
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	spin_lock(&dev->xmit_lock);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    (jiffies - dev->trans_start) > dev->watchdog_timeo) {
				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
				dev_hold(dev);
		}
	}
	spin_unlock(&dev->xmit_lock);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

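/* Arm the transmit watchdog if the driver provides a tx_timeout handler,
 * defaulting the timeout to 5 seconds when the driver did not set one.
 * A reference on the device is taken whenever the timer was not already
 * pending.
 */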
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	spin_lock_bh(&dev->xmit_lock);
	__netdev_watchdog_up(dev);
	spin_unlock_bh(&dev->xmit_lock);
}

static void dev_watchdog_down(struct net_device *dev)
{
	spin_lock_bh(&dev->xmit_lock);
	if (del_timer(&dev->watchdog_timer))
		__dev_put(dev);
	spin_unlock_bh(&dev->xmit_lock);
}

199 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
200 under all circumstances. It is difficult to invent anything faster or
201 cheaper.
202 */
203
204 static int
noop_enqueue(struct sk_buff * skb,struct Qdisc * qdisc)205 noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
206 {
207 kfree_skb(skb);
208 return NET_XMIT_CN;
209 }
210
211 static struct sk_buff *
noop_dequeue(struct Qdisc * qdisc)212 noop_dequeue(struct Qdisc * qdisc)
213 {
214 return NULL;
215 }
216
217 static int
noop_requeue(struct sk_buff * skb,struct Qdisc * qdisc)218 noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
219 {
220 if (net_ratelimit())
221 printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
222 kfree_skb(skb);
223 return NET_XMIT_CN;
224 }
225
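/* Old-style positional initializer; the fields follow the order declared
 * by struct Qdisc_ops: the next/cl_ops pointers, the id string, priv_size,
 * then the enqueue/dequeue/requeue callbacks.  The remaining callbacks
 * are left NULL.
 */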
struct Qdisc_ops noop_qdisc_ops =
{
	NULL,
	NULL,
	"noop",
	0,

	noop_enqueue,
	noop_dequeue,
	noop_requeue,
};

struct Qdisc noop_qdisc =
{
	noop_enqueue,
	noop_dequeue,
	TCQ_F_BUILTIN,
	&noop_qdisc_ops,
};

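/* noqueue: attached by dev_activate() to devices with tx_queue_len == 0
 * (loopback and other virtual interfaces).  noqueue_qdisc deliberately has
 * a NULL enqueue hook, which dev_queue_xmit() treats as "no queue here"
 * and hands packets straight to the driver.
 */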
struct Qdisc_ops noqueue_qdisc_ops =
{
	NULL,
	NULL,
	"noqueue",
	0,

	noop_enqueue,
	noop_dequeue,
	noop_requeue,
};

struct Qdisc noqueue_qdisc =
{
	NULL,
	noop_dequeue,
	TCQ_F_BUILTIN,
	&noqueue_qdisc_ops,
};

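/* Maps the skb->priority field (masked with TC_PRIO_MAX) onto one of the
 * three bands below; band 0 is dequeued first, band 2 last.
 */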
static const u8 prio2band[TC_PRIO_MAX+1] =
{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */

static int
pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list;

	list = ((struct sk_buff_head*)qdisc->data) +
		prio2band[skb->priority&TC_PRIO_MAX];

	if (list->qlen < qdisc->dev->tx_queue_len) {
		__skb_queue_tail(list, skb);
		qdisc->q.qlen++;
		qdisc->stats.bytes += skb->len;
		qdisc->stats.packets++;
		return 0;
	}
	qdisc->stats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct sk_buff *
pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
	struct sk_buff *skb;

	for (prio = 0; prio < 3; prio++, list++) {
		skb = __skb_dequeue(list);
		if (skb) {
			qdisc->q.qlen--;
			return skb;
		}
	}
	return NULL;
}

static int
pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list;

	list = ((struct sk_buff_head*)qdisc->data) +
		prio2band[skb->priority&TC_PRIO_MAX];

	__skb_queue_head(list, skb);
	qdisc->q.qlen++;
	return 0;
}

static void
pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);

	for (prio=0; prio < 3; prio++)
		skb_queue_purge(list+prio);
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	unsigned char *b = skb->tail;
	struct tc_prio_qopt opt;

	opt.bands = 3;
	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int i;
	struct sk_buff_head *list;

	list = ((struct sk_buff_head*)qdisc->data);

	for (i=0; i<3; i++)
		skb_queue_head_init(list+i);

	return 0;
}

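/* Positional initializer again: next, cl_ops, id, priv_size (room for the
 * three sk_buff_head bands), the enqueue/dequeue/requeue callbacks, a NULL
 * drop hook, init, reset, NULL destroy/change hooks, and dump.
 */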
static struct Qdisc_ops pfifo_fast_ops =
{
	NULL,
	NULL,
	"pfifo_fast",
	3 * sizeof(struct sk_buff_head),

	pfifo_fast_enqueue,
	pfifo_fast_dequeue,
	pfifo_fast_requeue,
	NULL,

	pfifo_fast_init,
	pfifo_fast_reset,
	NULL,
	NULL,
	pfifo_fast_dump,
};

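/* Allocate and initialize a qdisc of the given type for a device.  The
 * allocation covers struct Qdisc plus ops->priv_size bytes of private
 * data.  Returns NULL if the allocation or the ops->init() call fails.
 */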
struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
	struct Qdisc *sch;
	int size = sizeof(*sch) + ops->priv_size;

	sch = kmalloc(size, GFP_KERNEL);
	if (!sch)
		return NULL;
	memset(sch, 0, size);

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	sch->stats.lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);
	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	kfree(sch);
	return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags&TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;
	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	qdisc_kill_estimator(&qdisc->stats);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);
	kfree(qdisc);
}


void dev_activate(struct net_device *dev)
{
	/* No queueing discipline is attached to the device;
	   create a default one, i.e. pfifo_fast for devices
	   which need queueing and noqueue_qdisc for
	   virtual interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			write_lock(&qdisc_tree_lock);
			list_add_tail(&qdisc->list, &dev->qdisc_list);
			write_unlock(&qdisc_tree_lock);

		} else {
			qdisc = &noqueue_qdisc;
		}
		write_lock(&qdisc_tree_lock);
		dev->qdisc_sleeping = qdisc;
		write_unlock(&qdisc_tree_lock);
	}

	spin_lock_bh(&dev->queue_lock);
	if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

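/* Take the device out of service for packet scheduling: switch it to the
 * noop qdisc, reset the old one, stop the watchdog, and wait until any
 * transmit softirq or transmission in flight on another CPU has finished.
 */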
void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	while (test_bit(__LINK_STATE_SCHED, &dev->state))
		yield();

	spin_unlock_wait(&dev->xmit_lock);
}

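/* Give a newly created device a safe initial scheduling state: noop_qdisc
 * as both the active and the sleeping qdisc, an empty qdisc list, and an
 * initialized (but not yet armed) watchdog timer.
 */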
void dev_init_scheduler(struct net_device *dev)
{
	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	dev->qdisc = &noop_qdisc;
	spin_unlock_bh(&dev->queue_lock);
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	write_unlock(&qdisc_tree_lock);

	dev_watchdog_init(dev);
}

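/* Final teardown: destroy the sleeping qdisc (and any ingress qdisc) and
 * leave both qdisc pointers at noop_qdisc.  By this point the qdisc list
 * should be empty and the watchdog timer no longer pending.
 */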
void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(list_empty(&dev->qdisc_list));
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	spin_unlock_bh(&dev->queue_lock);
	write_unlock(&qdisc_tree_lock);
}