1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.102.2.1 2002/01/12 07:43:57 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD;
37  *					our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split into fib.c and
48  *					route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 *
57 * This program is free software; you can redistribute it and/or
58 * modify it under the terms of the GNU General Public License
59 * as published by the Free Software Foundation; either version
60 * 2 of the License, or (at your option) any later version.
61 */
62
63 #include <linux/config.h>
64 #include <asm/uaccess.h>
65 #include <asm/system.h>
66 #include <asm/bitops.h>
67 #include <linux/types.h>
68 #include <linux/kernel.h>
69 #include <linux/sched.h>
70 #include <linux/mm.h>
71 #include <linux/string.h>
72 #include <linux/socket.h>
73 #include <linux/sockios.h>
74 #include <linux/errno.h>
75 #include <linux/in.h>
76 #include <linux/inet.h>
77 #include <linux/netdevice.h>
78 #include <linux/proc_fs.h>
79 #include <linux/init.h>
80 #include <linux/skbuff.h>
81 #include <linux/rtnetlink.h>
82 #include <linux/inetdevice.h>
83 #include <linux/igmp.h>
84 #include <linux/pkt_sched.h>
85 #include <linux/mroute.h>
86 #include <linux/netfilter_ipv4.h>
87 #include <linux/random.h>
88 #include <linux/jhash.h>
89 #include <net/protocol.h>
90 #include <net/ip.h>
91 #include <net/route.h>
92 #include <net/inetpeer.h>
93 #include <net/sock.h>
94 #include <net/ip_fib.h>
95 #include <net/arp.h>
96 #include <net/tcp.h>
97 #include <net/icmp.h>
98 #ifdef CONFIG_SYSCTL
99 #include <linux/sysctl.h>
100 #endif
101
102 #define IP_MAX_MTU 0xFFF0
103
104 #define RT_GC_TIMEOUT (300*HZ)
105
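/*
 * Compiled-in defaults for the routing-cache tunables below.  Most of them
 * are runtime-adjustable: under CONFIG_SYSCTL they are exported as sysctls,
 * typically under /proc/sys/net/ipv4/route, so the values here only matter
 * until someone changes them.
 */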
106 int ip_rt_min_delay = 2 * HZ;
107 int ip_rt_max_delay = 10 * HZ;
108 int ip_rt_max_size;
109 int ip_rt_gc_timeout = RT_GC_TIMEOUT;
110 int ip_rt_gc_interval = 60 * HZ;
111 int ip_rt_gc_min_interval = HZ / 2;
112 int ip_rt_redirect_number = 9;
113 int ip_rt_redirect_load = HZ / 50;
114 int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
115 int ip_rt_error_cost = HZ;
116 int ip_rt_error_burst = 5 * HZ;
117 int ip_rt_gc_elasticity = 8;
118 int ip_rt_mtu_expires = 10 * 60 * HZ;
119 int ip_rt_min_pmtu = 512 + 20 + 20;
120 int ip_rt_min_advmss = 256;
121 int ip_rt_secret_interval = 10 * 60 * HZ;
122 static unsigned long rt_deadline;
123
124 #define RTprint(a...) printk(KERN_DEBUG a)
125
126 static struct timer_list rt_flush_timer;
127 static struct timer_list rt_periodic_timer;
128 static struct timer_list rt_secret_timer;
129
130 /*
131 * Interface to generic destination cache.
132 */
133
134 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
135 static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
136 struct sk_buff *skb);
137 static void ipv4_dst_destroy(struct dst_entry *dst);
138 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
139 static void ipv4_link_failure(struct sk_buff *skb);
140 static int rt_garbage_collect(void);
141
142
143 struct dst_ops ipv4_dst_ops = {
144 family: AF_INET,
145 protocol: __constant_htons(ETH_P_IP),
146 gc: rt_garbage_collect,
147 check: ipv4_dst_check,
148 reroute: ipv4_dst_reroute,
149 destroy: ipv4_dst_destroy,
150 negative_advice: ipv4_negative_advice,
151 link_failure: ipv4_link_failure,
152 entry_size: sizeof(struct rtable),
153 };
154
155 #define ECN_OR_COST(class) TC_PRIO_##class
156
157 __u8 ip_tos2prio[16] = {
158 TC_PRIO_BESTEFFORT,
159 ECN_OR_COST(FILLER),
160 TC_PRIO_BESTEFFORT,
161 ECN_OR_COST(BESTEFFORT),
162 TC_PRIO_BULK,
163 ECN_OR_COST(BULK),
164 TC_PRIO_BULK,
165 ECN_OR_COST(BULK),
166 TC_PRIO_INTERACTIVE,
167 ECN_OR_COST(INTERACTIVE),
168 TC_PRIO_INTERACTIVE,
169 ECN_OR_COST(INTERACTIVE),
170 TC_PRIO_INTERACTIVE_BULK,
171 ECN_OR_COST(INTERACTIVE_BULK),
172 TC_PRIO_INTERACTIVE_BULK,
173 ECN_OR_COST(INTERACTIVE_BULK)
174 };
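/*
 * Callers index this table with the TOS bits of the IP header with the
 * low (ECN) bit stripped; see rt_tos2priority() in <net/route.h>, which is
 * roughly ip_tos2prio[IPTOS_TOS(tos) >> 1].
 */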
175
176
177 /*
178 * Route cache.
179 */
180
181 /* The locking scheme is rather straightforward:
182 *
183 * 1) BH-protected rwlocks protect the buckets of the central route hash.
184 * 2) Only writers remove entries, and they hold the lock
185 * as they look at rtable reference counts.
186 * 3) Only readers acquire references to rtable entries;
187 * they do so with atomic increments and with the
188 * lock held.
189 */
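/*
 * A minimal sketch of the reader side, as used by the lookup paths further
 * down (ip_route_input() / ip_route_output_key()):
 *
 *	read_lock_bh(&rt_hash_table[hash].lock);
 *	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next)
 *		if (keys match) {
 *			dst_hold(&rth->u.dst);	(reference taken under the lock)
 *			...
 *		}
 *	read_unlock_bh(&rt_hash_table[hash].lock);
 */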
190
191 struct rt_hash_bucket {
192 struct rtable *chain;
193 rwlock_t lock;
194 } __attribute__((__aligned__(8)));
195
196 static struct rt_hash_bucket *rt_hash_table;
197 static unsigned rt_hash_mask;
198 static int rt_hash_log;
199 static unsigned int rt_hash_rnd;
200
201 struct rt_cache_stat rt_cache_stat[NR_CPUS];
202
203 static int rt_intern_hash(unsigned hash, struct rtable *rth,
204 struct rtable **res);
205
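/*
 * Hash the (daddr, saddr, tos) triple into a bucket index.  rt_hash_rnd is
 * re-seeded on every cache flush (see rt_run_flush() below), so the mapping
 * of flows to buckets changes over time.
 */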
206 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
207 {
208 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
209 & rt_hash_mask);
210 }
211
212 static int rt_cache_get_info(char *buffer, char **start, off_t offset,
213 int length)
214 {
215 int len = 0;
216 off_t pos = 128;
217 char temp[256];
218 struct rtable *r;
219 int i;
220
221 if (offset < 128) {
222 sprintf(buffer, "%-127s\n",
223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 "HHUptod\tSpecDst");
226 len = 128;
227 }
228
229 for (i = rt_hash_mask; i >= 0; i--) {
230 read_lock_bh(&rt_hash_table[i].lock);
231 for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
232 /*
233 * Spin through entries until we are ready
234 */
235 pos += 128;
236
237 if (pos <= offset) {
238 len = 0;
239 continue;
240 }
241 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
242 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
243 r->u.dst.dev ? r->u.dst.dev->name : "*",
244 (unsigned long)r->rt_dst,
245 (unsigned long)r->rt_gateway,
246 r->rt_flags,
247 atomic_read(&r->u.dst.__refcnt),
248 r->u.dst.__use,
249 0,
250 (unsigned long)r->rt_src,
251 (r->u.dst.advmss ?
252 (int) r->u.dst.advmss + 40 : 0),
253 r->u.dst.window,
254 (int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
255 r->key.tos,
256 r->u.dst.hh ?
257 atomic_read(&r->u.dst.hh->hh_refcnt) :
258 -1,
259 r->u.dst.hh ?
260 (r->u.dst.hh->hh_output ==
261 dev_queue_xmit) : 0,
262 r->rt_spec_dst);
263 sprintf(buffer + len, "%-127s\n", temp);
264 len += 128;
265 if (pos >= offset+length) {
266 read_unlock_bh(&rt_hash_table[i].lock);
267 goto done;
268 }
269 }
270 read_unlock_bh(&rt_hash_table[i].lock);
271 }
272
273 done:
274 *start = buffer + len - (pos - offset);
275 len = pos - offset;
276 if (len > length)
277 len = length;
278 return len;
279 }
280
281 static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
282 {
283 unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
284 int i, lcpu;
285 int len = 0;
286
287 len += sprintf(buffer+len, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
288 for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) {
289 i = cpu_logical_map(lcpu);
290
291 len += sprintf(buffer+len, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
292 dst_entries,
293 rt_cache_stat[i].in_hit,
294 rt_cache_stat[i].in_slow_tot,
295 rt_cache_stat[i].in_slow_mc,
296 rt_cache_stat[i].in_no_route,
297 rt_cache_stat[i].in_brd,
298 rt_cache_stat[i].in_martian_dst,
299 rt_cache_stat[i].in_martian_src,
300
301 rt_cache_stat[i].out_hit,
302 rt_cache_stat[i].out_slow_tot,
303 rt_cache_stat[i].out_slow_mc,
304
305 rt_cache_stat[i].gc_total,
306 rt_cache_stat[i].gc_ignored,
307 rt_cache_stat[i].gc_goal_miss,
308 rt_cache_stat[i].gc_dst_overflow,
309 rt_cache_stat[i].in_hlist_search,
310 rt_cache_stat[i].out_hlist_search
311
312 );
313 }
314 len -= offset;
315
316 if (len > length)
317 len = length;
318 if (len < 0)
319 len = 0;
320
321 *start = buffer + offset;
322 return len;
323 }
324
325 static __inline__ void rt_free(struct rtable *rt)
326 {
327 dst_free(&rt->u.dst);
328 }
329
330 static __inline__ void rt_drop(struct rtable *rt)
331 {
332 ip_rt_put(rt);
333 dst_free(&rt->u.dst);
334 }
335
336 static __inline__ int rt_fast_clean(struct rtable *rth)
337 {
338 /* Kill broadcast/multicast entries very aggressively, if they
339 collide in the hash table with more useful entries */
340 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
341 rth->key.iif && rth->u.rt_next;
342 }
343
344 static __inline__ int rt_valuable(struct rtable *rth)
345 {
346 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
347 rth->u.dst.expires;
348 }
349
350 static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
351 {
352 unsigned long age;
353 int ret = 0;
354
355 if (atomic_read(&rth->u.dst.__refcnt))
356 goto out;
357
358 ret = 1;
359 if (rth->u.dst.expires &&
360 time_after_eq(jiffies, rth->u.dst.expires))
361 goto out;
362
363 age = jiffies - rth->u.dst.lastuse;
364 ret = 0;
365 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
366 (age <= tmo2 && rt_valuable(rth)))
367 goto out;
368 ret = 1;
369 out: return ret;
370 }
371
372 /* Bits of score are:
373 * 31: very valuable
374 * 30: not quite useless
375 * 29..0: usage counter
376 */
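/*
 * Rough example of how this plays out: an unreferenced output route that was
 * used a moment ago gets a large usage component plus bit 30 set, so
 * rt_intern_hash() (which evicts the lowest-scored unreferenced entry when a
 * chain grows too long) will prefer to drop a stale input broadcast or
 * multicast entry instead.
 */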
377 static inline u32 rt_score(struct rtable *rt)
378 {
379 u32 score = jiffies - rt->u.dst.lastuse;
380
381 score = ~score & ~(3<<30);
382
383 if (rt_valuable(rt))
384 score |= (1<<31);
385
386 if (!rt->key.iif ||
387 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
388 score |= (1<<30);
389
390 return score;
391 }
392
393 /* This runs via a timer and thus is always in BH context. */
394 static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
395 {
396 static int rover;
397 int i = rover, t;
398 struct rtable *rth, **rthp;
399 unsigned long now = jiffies;
400
401 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
402 t -= ip_rt_gc_timeout) {
403 unsigned long tmo = ip_rt_gc_timeout;
404
405 i = (i + 1) & rt_hash_mask;
406 rthp = &rt_hash_table[i].chain;
407
408 write_lock(&rt_hash_table[i].lock);
409 while ((rth = *rthp) != NULL) {
410 if (rth->u.dst.expires) {
411 /* Entry is expired even if it is in use */
412 if (time_before_eq(now, rth->u.dst.expires)) {
413 tmo >>= 1;
414 rthp = &rth->u.rt_next;
415 continue;
416 }
417 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
418 tmo >>= 1;
419 rthp = &rth->u.rt_next;
420 continue;
421 }
422
423 /* Cleanup aged off entries. */
424 *rthp = rth->u.rt_next;
425 rt_free(rth);
426 }
427 write_unlock(&rt_hash_table[i].lock);
428
429 /* Fallback loop breaker. */
430 if (time_after(jiffies, now))
431 break;
432 }
433 rover = i;
434 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
435 }
436
437 SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);
438
439 /* This can run from both BH and non-BH contexts, the latter
440 * in the case of a forced flush event.
441 */
442 static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
443 {
444 int i;
445 struct rtable *rth, *next;
446
447 rt_deadline = 0;
448
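/*
 * Re-keying the hash here means a full flush also redistributes future
 * entries across the buckets; presumably this makes it harder for an
 * outside party to construct worst-case chain lengths.
 */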
449 get_random_bytes(&rt_hash_rnd, 4);
450
451 for (i = rt_hash_mask; i >= 0; i--) {
452 write_lock_bh(&rt_hash_table[i].lock);
453 rth = rt_hash_table[i].chain;
454 if (rth)
455 rt_hash_table[i].chain = NULL;
456 write_unlock_bh(&rt_hash_table[i].lock);
457
458 for (; rth; rth = next) {
459 next = rth->u.rt_next;
460 rt_free(rth);
461 }
462 }
463 }
464
465 SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);
466
467 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
468
469 void rt_cache_flush(int delay)
470 {
471 unsigned long now = jiffies;
472 int user_mode = !in_softirq();
473
474 if (delay < 0)
475 delay = ip_rt_min_delay;
476
477 spin_lock_bh(&rt_flush_lock);
478
479 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
480 long tmo = (long)(rt_deadline - now);
481
482 /* If the flush timer is already running
483 and the flush request is not immediate (delay > 0):
484
485 if the deadline has not been reached, prolong the timer to "delay",
486 otherwise fire it at the deadline time.
487 */
488
489 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
490 tmo = 0;
491
492 if (delay > tmo)
493 delay = tmo;
494 }
495
496 if (delay <= 0) {
497 spin_unlock_bh(&rt_flush_lock);
498 SMP_TIMER_NAME(rt_run_flush)(0);
499 return;
500 }
501
502 if (rt_deadline == 0)
503 rt_deadline = now + ip_rt_max_delay;
504
505 mod_timer(&rt_flush_timer, now+delay);
506 spin_unlock_bh(&rt_flush_lock);
507 }
508
509 static void rt_secret_rebuild(unsigned long dummy)
510 {
511 unsigned long now = jiffies;
512
513 rt_cache_flush(0);
514 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
515 }
516
517 /*
518 Short description of GC goals.
519
520 We want an algorithm that keeps the routing cache
521 at an equilibrium point, where the number of aged-off entries
522 is approximately equal to the number of newly generated ones.
523
524 The current expiration strength is the variable "expire".
525 We try to adjust it dynamically, so that when the network
526 is idle "expire" is large enough to keep plenty of warm entries,
527 and when load increases it shrinks to limit the cache size.
528 */
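/*
 * Concretely, in rt_garbage_collect() below "expire" is halved each time a
 * pass fails to reach its goal, and is bumped by ip_rt_gc_min_interval
 * (clamped back to ip_rt_gc_timeout) whenever the goal is met, so it tracks
 * the current load on the cache.
 */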
529
530 static int rt_garbage_collect(void)
531 {
532 static unsigned long expire = RT_GC_TIMEOUT;
533 static unsigned long last_gc;
534 static int rover;
535 static int equilibrium;
536 struct rtable *rth, **rthp;
537 unsigned long now = jiffies;
538 int goal;
539
540 /*
541 * Garbage collection is pretty expensive,
542 * so do not run it too frequently.
543 */
544
545 rt_cache_stat[smp_processor_id()].gc_total++;
546
547 if (now - last_gc < ip_rt_gc_min_interval &&
548 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
549 rt_cache_stat[smp_processor_id()].gc_ignored++;
550 goto out;
551 }
552
553 /* Calculate the number of entries we want to expire now. */
554 goal = atomic_read(&ipv4_dst_ops.entries) -
555 (ip_rt_gc_elasticity << rt_hash_log);
556 if (goal <= 0) {
557 if (equilibrium < ipv4_dst_ops.gc_thresh)
558 equilibrium = ipv4_dst_ops.gc_thresh;
559 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
560 if (goal > 0) {
561 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
562 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
563 }
564 } else {
565 /* We are in a dangerous area. Try to reduce the cache really
566 * aggressively.
567 */
568 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
569 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
570 }
571
572 if (now - last_gc >= ip_rt_gc_min_interval)
573 last_gc = now;
574
575 if (goal <= 0) {
576 equilibrium += goal;
577 goto work_done;
578 }
579
580 do {
581 int i, k;
582
583 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
584 unsigned long tmo = expire;
585
586 k = (k + 1) & rt_hash_mask;
587 rthp = &rt_hash_table[k].chain;
588 write_lock_bh(&rt_hash_table[k].lock);
589 while ((rth = *rthp) != NULL) {
590 if (!rt_may_expire(rth, tmo, expire)) {
591 tmo >>= 1;
592 rthp = &rth->u.rt_next;
593 continue;
594 }
595 *rthp = rth->u.rt_next;
596 rt_free(rth);
597 goal--;
598 }
599 write_unlock_bh(&rt_hash_table[k].lock);
600 if (goal <= 0)
601 break;
602 }
603 rover = k;
604
605 if (goal <= 0)
606 goto work_done;
607
608 /* Goal is not achieved. We stop the process if:
609
610 - expire has been reduced to zero; otherwise, expire is halved.
611 - the table is not full.
612 - we are called from interrupt context.
613 - the jiffies check is just a fallback/debug loop breaker.
614 We will not spin here for a long time in any case.
615 */
616
617 rt_cache_stat[smp_processor_id()].gc_goal_miss++;
618
619 if (expire == 0)
620 break;
621
622 expire >>= 1;
623 #if RT_CACHE_DEBUG >= 2
624 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
625 atomic_read(&ipv4_dst_ops.entries), goal, i);
626 #endif
627
628 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
629 goto out;
630 } while (!in_softirq() && time_before_eq(jiffies, now));
631
632 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
633 goto out;
634 if (net_ratelimit())
635 printk(KERN_WARNING "dst cache overflow\n");
636 rt_cache_stat[smp_processor_id()].gc_dst_overflow++;
637 return 1;
638
639 work_done:
640 expire += ip_rt_gc_min_interval;
641 if (expire > ip_rt_gc_timeout ||
642 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
643 expire = ip_rt_gc_timeout;
644 #if RT_CACHE_DEBUG >= 2
645 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
646 atomic_read(&ipv4_dst_ops.entries), goal, rover);
647 #endif
648 out: return 0;
649 }
650
651 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
652 {
653 struct rtable *rth, **rthp;
654 unsigned long now;
655 struct rtable *cand, **candp;
656 u32 min_score;
657 int chain_length;
658 int attempts = !in_softirq();
659
660 restart:
661 chain_length = 0;
662 min_score = ~(u32)0;
663 cand = NULL;
664 candp = NULL;
665 now = jiffies;
666
667 rthp = &rt_hash_table[hash].chain;
668
669 write_lock_bh(&rt_hash_table[hash].lock);
670 while ((rth = *rthp) != NULL) {
671 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
672 /* Put it first */
673 *rthp = rth->u.rt_next;
674 rth->u.rt_next = rt_hash_table[hash].chain;
675 rt_hash_table[hash].chain = rth;
676
677 rth->u.dst.__use++;
678 dst_hold(&rth->u.dst);
679 rth->u.dst.lastuse = now;
680 write_unlock_bh(&rt_hash_table[hash].lock);
681
682 rt_drop(rt);
683 *rp = rth;
684 return 0;
685 }
686
687 if (!atomic_read(&rth->u.dst.__refcnt)) {
688 u32 score = rt_score(rth);
689
690 if (score <= min_score) {
691 cand = rth;
692 candp = rthp;
693 min_score = score;
694 }
695 }
696
697 chain_length++;
698
699 rthp = &rth->u.rt_next;
700 }
701
702 if (cand) {
703 /* ip_rt_gc_elasticity used to be the average chain length;
704 * when it is exceeded, gc becomes really aggressive.
705 *
706 * The second limit is less certain. At the moment it allows
707 * only 2 entries per bucket. We will see.
708 */
709 if (chain_length > ip_rt_gc_elasticity) {
710 *candp = cand->u.rt_next;
711 rt_free(cand);
712 }
713 }
714
715 /* Try to bind route to arp only if it is output
716 route or unicast forwarding path.
717 */
718 if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
719 int err = arp_bind_neighbour(&rt->u.dst);
720 if (err) {
721 write_unlock_bh(&rt_hash_table[hash].lock);
722
723 if (err != -ENOBUFS) {
724 rt_drop(rt);
725 return err;
726 }
727
728 /* Neighbour tables are full and nothing
729 can be released. Try to shrink the route cache;
730 most likely it holds some neighbour records.
731 */
732 if (attempts-- > 0) {
733 int saved_elasticity = ip_rt_gc_elasticity;
734 int saved_int = ip_rt_gc_min_interval;
735 ip_rt_gc_elasticity = 1;
736 ip_rt_gc_min_interval = 0;
737 rt_garbage_collect();
738 ip_rt_gc_min_interval = saved_int;
739 ip_rt_gc_elasticity = saved_elasticity;
740 goto restart;
741 }
742
743 if (net_ratelimit())
744 printk(KERN_WARNING "Neighbour table overflow.\n");
745 rt_drop(rt);
746 return -ENOBUFS;
747 }
748 }
749
750 rt->u.rt_next = rt_hash_table[hash].chain;
751 #if RT_CACHE_DEBUG >= 2
752 if (rt->u.rt_next) {
753 struct rtable *trt;
754 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
755 NIPQUAD(rt->rt_dst));
756 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
757 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
758 printk("\n");
759 }
760 #endif
761 rt_hash_table[hash].chain = rt;
762 write_unlock_bh(&rt_hash_table[hash].lock);
763 *rp = rt;
764 return 0;
765 }
766
767 void rt_bind_peer(struct rtable *rt, int create)
768 {
769 static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
770 struct inet_peer *peer;
771
772 peer = inet_getpeer(rt->rt_dst, create);
773
774 spin_lock_bh(&rt_peer_lock);
775 if (rt->peer == NULL) {
776 rt->peer = peer;
777 peer = NULL;
778 }
779 spin_unlock_bh(&rt_peer_lock);
780 if (peer)
781 inet_putpeer(peer);
782 }
783
784 /*
785 * Peer allocation may fail only in serious out-of-memory conditions. However
786 * we can still generate some output.
787 * Random ID selection looks a bit dangerous because we have no chance of
788 * selecting an ID that stays unique over a reasonable period of time.
789 * But a broken packet identifier may be better than no packet at all.
790 */
791 static void ip_select_fb_ident(struct iphdr *iph)
792 {
793 static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
794 static u32 ip_fallback_id;
795 u32 salt;
796
797 spin_lock_bh(&ip_fb_id_lock);
798 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
799 iph->id = htons(salt & 0xFFFF);
800 ip_fallback_id = salt;
801 spin_unlock_bh(&ip_fb_id_lock);
802 }
803
804 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
805 {
806 struct rtable *rt = (struct rtable *) dst;
807
808 if (rt) {
809 if (rt->peer == NULL)
810 rt_bind_peer(rt, 1);
811
812 /* If a peer is attached to the destination, it is never detached,
813 so we need not grab a lock to dereference it.
814 */
815 if (rt->peer) {
816 iph->id = htons(inet_getid(rt->peer));
817 return;
818 }
819 } else
820 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
821
822 ip_select_fb_ident(iph);
823 }
824
825 static void rt_del(unsigned hash, struct rtable *rt)
826 {
827 struct rtable **rthp;
828
829 write_lock_bh(&rt_hash_table[hash].lock);
830 ip_rt_put(rt);
831 for (rthp = &rt_hash_table[hash].chain; *rthp;
832 rthp = &(*rthp)->u.rt_next)
833 if (*rthp == rt) {
834 *rthp = rt->u.rt_next;
835 rt_free(rt);
836 break;
837 }
838 write_unlock_bh(&rt_hash_table[hash].lock);
839 }
840
841 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
842 u32 saddr, u8 tos, struct net_device *dev)
843 {
844 int i, k;
845 struct in_device *in_dev = in_dev_get(dev);
846 struct rtable *rth, **rthp;
847 u32 skeys[2] = { saddr, 0 };
848 int ikeys[2] = { dev->ifindex, 0 };
849
850 tos &= IPTOS_RT_MASK;
851
852 if (!in_dev)
853 return;
854
855 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
856 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
857 goto reject_redirect;
858
859 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
860 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
861 goto reject_redirect;
862 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
863 goto reject_redirect;
864 } else {
865 if (inet_addr_type(new_gw) != RTN_UNICAST)
866 goto reject_redirect;
867 }
868
869 for (i = 0; i < 2; i++) {
870 for (k = 0; k < 2; k++) {
871 unsigned hash = rt_hash_code(daddr,
872 skeys[i] ^ (ikeys[k] << 5),
873 tos);
874
875 rthp=&rt_hash_table[hash].chain;
876
877 read_lock(&rt_hash_table[hash].lock);
878 while ((rth = *rthp) != NULL) {
879 struct rtable *rt;
880
881 if (rth->key.dst != daddr ||
882 rth->key.src != skeys[i] ||
883 rth->key.tos != tos ||
884 rth->key.oif != ikeys[k] ||
885 rth->key.iif != 0) {
886 rthp = &rth->u.rt_next;
887 continue;
888 }
889
890 if (rth->rt_dst != daddr ||
891 rth->rt_src != saddr ||
892 rth->u.dst.error ||
893 rth->rt_gateway != old_gw ||
894 rth->u.dst.dev != dev)
895 break;
896
897 dst_hold(&rth->u.dst);
898 read_unlock(&rt_hash_table[hash].lock);
899
900 rt = dst_alloc(&ipv4_dst_ops);
901 if (rt == NULL) {
902 ip_rt_put(rth);
903 in_dev_put(in_dev);
904 return;
905 }
906
907 /* Copy all the information. */
908 *rt = *rth;
909 rt->u.dst.__use = 1;
910 atomic_set(&rt->u.dst.__refcnt, 1);
911 if (rt->u.dst.dev)
912 dev_hold(rt->u.dst.dev);
913 rt->u.dst.lastuse = jiffies;
914 rt->u.dst.neighbour = NULL;
915 rt->u.dst.hh = NULL;
916 rt->u.dst.obsolete = 0;
917
918 rt->rt_flags |= RTCF_REDIRECTED;
919
920 /* Gateway is different ... */
921 rt->rt_gateway = new_gw;
922
923 /* Redirect received -> path was valid */
924 dst_confirm(&rth->u.dst);
925
926 if (rt->peer)
927 atomic_inc(&rt->peer->refcnt);
928
929 if (arp_bind_neighbour(&rt->u.dst) ||
930 !(rt->u.dst.neighbour->nud_state &
931 NUD_VALID)) {
932 if (rt->u.dst.neighbour)
933 neigh_event_send(rt->u.dst.neighbour, NULL);
934 ip_rt_put(rth);
935 rt_drop(rt);
936 goto do_next;
937 }
938
939 rt_del(hash, rth);
940 if (!rt_intern_hash(hash, rt, &rt))
941 ip_rt_put(rt);
942 goto do_next;
943 }
944 read_unlock(&rt_hash_table[hash].lock);
945 do_next:
946 ;
947 }
948 }
949 in_dev_put(in_dev);
950 return;
951
952 reject_redirect:
953 #ifdef CONFIG_IP_ROUTE_VERBOSE
954 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
955 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
956 "%u.%u.%u.%u ignored.\n"
957 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
958 "tos %02x\n",
959 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
960 NIPQUAD(saddr), NIPQUAD(daddr), tos);
961 #endif
962 in_dev_put(in_dev);
963 }
964
965 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
966 {
967 struct rtable *rt = (struct rtable*)dst;
968 struct dst_entry *ret = dst;
969
970 if (rt) {
971 if (dst->obsolete) {
972 ip_rt_put(rt);
973 ret = NULL;
974 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
975 rt->u.dst.expires) {
976 unsigned hash = rt_hash_code(rt->key.dst,
977 rt->key.src ^
978 (rt->key.oif << 5),
979 rt->key.tos);
980 #if RT_CACHE_DEBUG >= 1
981 printk(KERN_DEBUG "ip_rt_advice: redirect to "
982 "%u.%u.%u.%u/%02x dropped\n",
983 NIPQUAD(rt->rt_dst), rt->key.tos);
984 #endif
985 rt_del(hash, rt);
986 ret = NULL;
987 }
988 }
989 return ret;
990 }
991
992 /*
993 * Algorithm:
994 * 1. The first ip_rt_redirect_number redirects are sent
995 * with exponential backoff, then we stop sending them at all,
996 * assuming that the host ignores our redirects.
997 * 2. If we did not see packets requiring redirects
998 * during ip_rt_redirect_silence, we assume that the host
999 * has forgotten the redirected route, and start sending redirects again.
1000 *
1001 * This algorithm is much cheaper and more intelligent than dumb load limiting
1002 * in icmp.c.
1003 *
1004 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1005 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1006 */
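/*
 * With the default tunables above this works out roughly as follows: the
 * gap required between successive redirects doubles, starting from
 * ip_rt_redirect_load (HZ/50) jiffies; after ip_rt_redirect_number (9)
 * apparently ignored redirects we go quiet, and we only resume once
 * ip_rt_redirect_silence (about 20 seconds) passes without redirect-worthy
 * traffic.  (Illustrative reading of the defaults, not a hard spec.)
 */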
1007
1008 void ip_rt_send_redirect(struct sk_buff *skb)
1009 {
1010 struct rtable *rt = (struct rtable*)skb->dst;
1011 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1012
1013 if (!in_dev)
1014 return;
1015
1016 if (!IN_DEV_TX_REDIRECTS(in_dev))
1017 goto out;
1018
1019 /* No redirected packets during ip_rt_redirect_silence;
1020 * reset the algorithm.
1021 */
1022 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1023 rt->u.dst.rate_tokens = 0;
1024
1025 /* Too many ignored redirects; do not send anything and
1026 * set u.dst.rate_last to the last seen redirected packet.
1027 */
1028 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1029 rt->u.dst.rate_last = jiffies;
1030 goto out;
1031 }
1032
1033 /* Check for load limit; set rate_last to the latest sent
1034 * redirect.
1035 */
1036 if (time_after(jiffies,
1037 (rt->u.dst.rate_last +
1038 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1039 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1040 rt->u.dst.rate_last = jiffies;
1041 ++rt->u.dst.rate_tokens;
1042 #ifdef CONFIG_IP_ROUTE_VERBOSE
1043 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1044 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1045 net_ratelimit())
1046 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1047 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1048 NIPQUAD(rt->rt_src), rt->rt_iif,
1049 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1050 #endif
1051 }
1052 out:
1053 in_dev_put(in_dev);
1054 }
1055
1056 static int ip_error(struct sk_buff *skb)
1057 {
1058 struct rtable *rt = (struct rtable*)skb->dst;
1059 unsigned long now;
1060 int code;
1061
1062 switch (rt->u.dst.error) {
1063 case EINVAL:
1064 default:
1065 goto out;
1066 case EHOSTUNREACH:
1067 code = ICMP_HOST_UNREACH;
1068 break;
1069 case ENETUNREACH:
1070 code = ICMP_NET_UNREACH;
1071 break;
1072 case EACCES:
1073 code = ICMP_PKT_FILTERED;
1074 break;
1075 }
1076
1077 now = jiffies;
1078 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1079 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1080 rt->u.dst.rate_tokens = ip_rt_error_burst;
1081 rt->u.dst.rate_last = now;
1082 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1083 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1084 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1085 }
1086
1087 out: kfree_skb(skb);
1088 return 0;
1089 }
1090
1091 /*
1092 * The last two values are not from the RFC but
1093 * are needed for AMPRnet AX.25 paths.
1094 */
1095
1096 static unsigned short mtu_plateau[] =
1097 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1098
1099 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1100 {
1101 int i;
1102
1103 for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
1104 if (old_mtu > mtu_plateau[i])
1105 return mtu_plateau[i];
1106 return 68;
1107 }
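/* For instance, guess_mtu(1500) yields 1492 and guess_mtu(600) yields 576;
 * anything at or below the smallest plateau (128) falls through to the
 * minimum of 68.
 */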
1108
1109 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1110 {
1111 int i;
1112 unsigned short old_mtu = ntohs(iph->tot_len);
1113 struct rtable *rth;
1114 u32 skeys[2] = { iph->saddr, 0, };
1115 u32 daddr = iph->daddr;
1116 u8 tos = iph->tos & IPTOS_RT_MASK;
1117 unsigned short est_mtu = 0;
1118
1119 if (ipv4_config.no_pmtu_disc)
1120 return 0;
1121
1122 for (i = 0; i < 2; i++) {
1123 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1124
1125 read_lock(&rt_hash_table[hash].lock);
1126 for (rth = rt_hash_table[hash].chain; rth;
1127 rth = rth->u.rt_next) {
1128 if (rth->key.dst == daddr &&
1129 rth->key.src == skeys[i] &&
1130 rth->rt_dst == daddr &&
1131 rth->rt_src == iph->saddr &&
1132 rth->key.tos == tos &&
1133 rth->key.iif == 0 &&
1134 !(rth->u.dst.mxlock & (1 << RTAX_MTU))) {
1135 unsigned short mtu = new_mtu;
1136
1137 if (new_mtu < 68 || new_mtu >= old_mtu) {
1138
1139 /* BSD 4.2 compatibility hack :-( */
1140 if (mtu == 0 &&
1141 old_mtu >= rth->u.dst.pmtu &&
1142 old_mtu >= 68 + (iph->ihl << 2))
1143 old_mtu -= iph->ihl << 2;
1144
1145 mtu = guess_mtu(old_mtu);
1146 }
1147 if (mtu <= rth->u.dst.pmtu) {
1148 if (mtu < rth->u.dst.pmtu) {
1149 dst_confirm(&rth->u.dst);
1150 if (mtu < ip_rt_min_pmtu) {
1151 mtu = ip_rt_min_pmtu;
1152 rth->u.dst.mxlock |=
1153 (1 << RTAX_MTU);
1154 }
1155 rth->u.dst.pmtu = mtu;
1156 dst_set_expires(&rth->u.dst,
1157 ip_rt_mtu_expires);
1158 }
1159 est_mtu = mtu;
1160 }
1161 }
1162 }
1163 read_unlock(&rt_hash_table[hash].lock);
1164 }
1165 return est_mtu ? : new_mtu;
1166 }
1167
1168 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
1169 {
1170 if (dst->pmtu > mtu && mtu >= 68 &&
1171 !(dst->mxlock & (1 << RTAX_MTU))) {
1172 if (mtu < ip_rt_min_pmtu) {
1173 mtu = ip_rt_min_pmtu;
1174 dst->mxlock |= (1 << RTAX_MTU);
1175 }
1176 dst->pmtu = mtu;
1177 dst_set_expires(dst, ip_rt_mtu_expires);
1178 }
1179 }
1180
1181 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1182 {
1183 dst_release(dst);
1184 return NULL;
1185 }
1186
1187 static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
1188 struct sk_buff *skb)
1189 {
1190 return NULL;
1191 }
1192
1193 static void ipv4_dst_destroy(struct dst_entry *dst)
1194 {
1195 struct rtable *rt = (struct rtable *) dst;
1196 struct inet_peer *peer = rt->peer;
1197
1198 if (peer) {
1199 rt->peer = NULL;
1200 inet_putpeer(peer);
1201 }
1202 }
1203
1204 static void ipv4_link_failure(struct sk_buff *skb)
1205 {
1206 struct rtable *rt;
1207
1208 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1209
1210 rt = (struct rtable *) skb->dst;
1211 if (rt)
1212 dst_set_expires(&rt->u.dst, 0);
1213 }
1214
1215 static int ip_rt_bug(struct sk_buff *skb)
1216 {
1217 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1218 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1219 skb->dev ? skb->dev->name : "?");
1220 kfree_skb(skb);
1221 return 0;
1222 }
1223
1224 /*
1225 We do not cache the source address of the outgoing interface,
1226 because it is used only by the IP RR, TS and SRR options,
1227 so it is out of the fast path.
1228
1229 BTW remember: "addr" is allowed to be unaligned
1230 in IP options!
1231 */
1232
1233 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1234 {
1235 u32 src;
1236 struct fib_result res;
1237
1238 if (rt->key.iif == 0)
1239 src = rt->rt_src;
1240 else if (fib_lookup(&rt->key, &res) == 0) {
1241 #ifdef CONFIG_IP_ROUTE_NAT
1242 if (res.type == RTN_NAT)
1243 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1244 RT_SCOPE_UNIVERSE);
1245 else
1246 #endif
1247 src = FIB_RES_PREFSRC(res);
1248 fib_res_put(&res);
1249 } else
1250 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1251 RT_SCOPE_UNIVERSE);
1252 memcpy(addr, &src, 4);
1253 }
1254
1255 #ifdef CONFIG_NET_CLS_ROUTE
1256 static void set_class_tag(struct rtable *rt, u32 tag)
1257 {
1258 if (!(rt->u.dst.tclassid & 0xFFFF))
1259 rt->u.dst.tclassid |= tag & 0xFFFF;
1260 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1261 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1262 }
1263 #endif
1264
1265 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1266 {
1267 struct fib_info *fi = res->fi;
1268
1269 if (fi) {
1270 if (FIB_RES_GW(*res) &&
1271 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1272 rt->rt_gateway = FIB_RES_GW(*res);
1273 memcpy(&rt->u.dst.mxlock, fi->fib_metrics,
1274 sizeof(fi->fib_metrics));
1275 if (fi->fib_mtu == 0) {
1276 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1277 if (rt->u.dst.mxlock & (1 << RTAX_MTU) &&
1278 rt->rt_gateway != rt->rt_dst &&
1279 rt->u.dst.pmtu > 576)
1280 rt->u.dst.pmtu = 576;
1281 }
1282 #ifdef CONFIG_NET_CLS_ROUTE
1283 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1284 #endif
1285 } else
1286 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1287
1288 if (rt->u.dst.pmtu > IP_MAX_MTU)
1289 rt->u.dst.pmtu = IP_MAX_MTU;
1290 if (rt->u.dst.advmss == 0)
1291 rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1292 ip_rt_min_advmss);
1293 if (rt->u.dst.advmss > 65535 - 40)
1294 rt->u.dst.advmss = 65535 - 40;
1295
1296 #ifdef CONFIG_NET_CLS_ROUTE
1297 #ifdef CONFIG_IP_MULTIPLE_TABLES
1298 set_class_tag(rt, fib_rules_tclass(res));
1299 #endif
1300 set_class_tag(rt, itag);
1301 #endif
1302 rt->rt_type = res->type;
1303 }
1304
1305 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1306 u8 tos, struct net_device *dev, int our)
1307 {
1308 unsigned hash;
1309 struct rtable *rth;
1310 u32 spec_dst;
1311 struct in_device *in_dev = in_dev_get(dev);
1312 u32 itag = 0;
1313
1314 /* Primary sanity checks. */
1315
1316 if (in_dev == NULL)
1317 return -EINVAL;
1318
1319 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1320 skb->protocol != htons(ETH_P_IP))
1321 goto e_inval;
1322
1323 if (ZERONET(saddr)) {
1324 if (!LOCAL_MCAST(daddr))
1325 goto e_inval;
1326 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1327 } else if (fib_validate_source(saddr, 0, tos, 0,
1328 dev, &spec_dst, &itag) < 0)
1329 goto e_inval;
1330
1331 rth = dst_alloc(&ipv4_dst_ops);
1332 if (!rth)
1333 goto e_nobufs;
1334
1335 rth->u.dst.output= ip_rt_bug;
1336
1337 atomic_set(&rth->u.dst.__refcnt, 1);
1338 rth->u.dst.flags= DST_HOST;
1339 rth->key.dst = daddr;
1340 rth->rt_dst = daddr;
1341 rth->key.tos = tos;
1342 #ifdef CONFIG_IP_ROUTE_FWMARK
1343 rth->key.fwmark = skb->nfmark;
1344 #endif
1345 rth->key.src = saddr;
1346 rth->rt_src = saddr;
1347 #ifdef CONFIG_IP_ROUTE_NAT
1348 rth->rt_dst_map = daddr;
1349 rth->rt_src_map = saddr;
1350 #endif
1351 #ifdef CONFIG_NET_CLS_ROUTE
1352 rth->u.dst.tclassid = itag;
1353 #endif
1354 rth->rt_iif =
1355 rth->key.iif = dev->ifindex;
1356 rth->u.dst.dev = &loopback_dev;
1357 dev_hold(rth->u.dst.dev);
1358 rth->key.oif = 0;
1359 rth->rt_gateway = daddr;
1360 rth->rt_spec_dst= spec_dst;
1361 rth->rt_type = RTN_MULTICAST;
1362 rth->rt_flags = RTCF_MULTICAST;
1363 if (our) {
1364 rth->u.dst.input= ip_local_deliver;
1365 rth->rt_flags |= RTCF_LOCAL;
1366 }
1367
1368 #ifdef CONFIG_IP_MROUTE
1369 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1370 rth->u.dst.input = ip_mr_input;
1371 #endif
1372 rt_cache_stat[smp_processor_id()].in_slow_mc++;
1373
1374 in_dev_put(in_dev);
1375 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1376 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1377
1378 e_nobufs:
1379 in_dev_put(in_dev);
1380 return -ENOBUFS;
1381
1382 e_inval:
1383 in_dev_put(in_dev);
1384 return -EINVAL;
1385 }
1386
1387 /*
1388 * NOTE. We drop all packets that have local source
1389 * addresses, because every properly looped-back packet
1390 * must already have the correct destination attached by the output routine.
1391 *
1392 * This approach solves two big problems:
1393 * 1. Non-simplex devices are handled properly.
1394 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1395 */
1396
1397 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1398 u8 tos, struct net_device *dev)
1399 {
1400 struct rt_key key;
1401 struct fib_result res;
1402 struct in_device *in_dev = in_dev_get(dev);
1403 struct in_device *out_dev = NULL;
1404 unsigned flags = 0;
1405 u32 itag = 0;
1406 struct rtable * rth;
1407 unsigned hash;
1408 u32 spec_dst;
1409 int err = -EINVAL;
1410 int free_res = 0;
1411
1412 /* IP on this device is disabled. */
1413
1414 if (!in_dev)
1415 goto out;
1416
1417 key.dst = daddr;
1418 key.src = saddr;
1419 key.tos = tos;
1420 #ifdef CONFIG_IP_ROUTE_FWMARK
1421 key.fwmark = skb->nfmark;
1422 #endif
1423 key.iif = dev->ifindex;
1424 key.oif = 0;
1425 key.scope = RT_SCOPE_UNIVERSE;
1426
1427 hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos);
1428
1429 /* Check for the most weird martians, which cannot be detected
1430 by fib_lookup.
1431 */
1432
1433 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1434 goto martian_source;
1435
1436 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1437 goto brd_input;
1438
1439 /* Accept zero addresses only for limited broadcast;
1440 * I do not even know whether to fix it or not. Waiting for complaints :-)
1441 */
1442 if (ZERONET(saddr))
1443 goto martian_source;
1444
1445 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1446 goto martian_destination;
1447
1448 /*
1449 * Now we are ready to route packet.
1450 */
1451 if ((err = fib_lookup(&key, &res)) != 0) {
1452 if (!IN_DEV_FORWARD(in_dev))
1453 goto e_inval;
1454 goto no_route;
1455 }
1456 free_res = 1;
1457
1458 rt_cache_stat[smp_processor_id()].in_slow_tot++;
1459
1460 #ifdef CONFIG_IP_ROUTE_NAT
1461 /* Policy is applied before mapping the destination,
1462 but rerouting after the map should be done with the old source.
1463 */
1464
1465 if (1) {
1466 u32 src_map = saddr;
1467 if (res.r)
1468 src_map = fib_rules_policy(saddr, &res, &flags);
1469
1470 if (res.type == RTN_NAT) {
1471 key.dst = fib_rules_map_destination(daddr, &res);
1472 fib_res_put(&res);
1473 free_res = 0;
1474 if (fib_lookup(&key, &res))
1475 goto e_inval;
1476 free_res = 1;
1477 if (res.type != RTN_UNICAST)
1478 goto e_inval;
1479 flags |= RTCF_DNAT;
1480 }
1481 key.src = src_map;
1482 }
1483 #endif
1484
1485 if (res.type == RTN_BROADCAST)
1486 goto brd_input;
1487
1488 if (res.type == RTN_LOCAL) {
1489 int result;
1490 result = fib_validate_source(saddr, daddr, tos,
1491 loopback_dev.ifindex,
1492 dev, &spec_dst, &itag);
1493 if (result < 0)
1494 goto martian_source;
1495 if (result)
1496 flags |= RTCF_DIRECTSRC;
1497 spec_dst = daddr;
1498 goto local_input;
1499 }
1500
1501 if (!IN_DEV_FORWARD(in_dev))
1502 goto e_inval;
1503 if (res.type != RTN_UNICAST)
1504 goto martian_destination;
1505
1506 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1507 if (res.fi->fib_nhs > 1 && key.oif == 0)
1508 fib_select_multipath(&key, &res);
1509 #endif
1510 out_dev = in_dev_get(FIB_RES_DEV(res));
1511 if (out_dev == NULL) {
1512 if (net_ratelimit())
1513 printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1514 "Please, report\n");
1515 goto e_inval;
1516 }
1517
1518 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1519 &spec_dst, &itag);
1520 if (err < 0)
1521 goto martian_source;
1522
1523 if (err)
1524 flags |= RTCF_DIRECTSRC;
1525
1526 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1527 (IN_DEV_SHARED_MEDIA(out_dev) ||
1528 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1529 flags |= RTCF_DOREDIRECT;
1530
1531 if (skb->protocol != htons(ETH_P_IP)) {
1532 /* Not IP (i.e. ARP). Do not create route, if it is
1533 * invalid for proxy arp. DNAT routes are always valid.
1534 */
1535 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1536 goto e_inval;
1537 }
1538
1539 rth = dst_alloc(&ipv4_dst_ops);
1540 if (!rth)
1541 goto e_nobufs;
1542
1543 atomic_set(&rth->u.dst.__refcnt, 1);
1544 rth->u.dst.flags= DST_HOST;
1545 rth->key.dst = daddr;
1546 rth->rt_dst = daddr;
1547 rth->key.tos = tos;
1548 #ifdef CONFIG_IP_ROUTE_FWMARK
1549 rth->key.fwmark = skb->nfmark;
1550 #endif
1551 rth->key.src = saddr;
1552 rth->rt_src = saddr;
1553 rth->rt_gateway = daddr;
1554 #ifdef CONFIG_IP_ROUTE_NAT
1555 rth->rt_src_map = key.src;
1556 rth->rt_dst_map = key.dst;
1557 if (flags&RTCF_DNAT)
1558 rth->rt_gateway = key.dst;
1559 #endif
1560 rth->rt_iif =
1561 rth->key.iif = dev->ifindex;
1562 rth->u.dst.dev = out_dev->dev;
1563 dev_hold(rth->u.dst.dev);
1564 rth->key.oif = 0;
1565 rth->rt_spec_dst= spec_dst;
1566
1567 rth->u.dst.input = ip_forward;
1568 rth->u.dst.output = ip_output;
1569
1570 rt_set_nexthop(rth, &res, itag);
1571
1572 rth->rt_flags = flags;
1573
1574 #ifdef CONFIG_NET_FASTROUTE
1575 if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1576 struct net_device *odev = rth->u.dst.dev;
1577 if (odev != dev &&
1578 dev->accept_fastpath &&
1579 odev->mtu >= dev->mtu &&
1580 dev->accept_fastpath(dev, &rth->u.dst) == 0)
1581 rth->rt_flags |= RTCF_FAST;
1582 }
1583 #endif
1584
1585 intern:
1586 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1587 done:
1588 in_dev_put(in_dev);
1589 if (out_dev)
1590 in_dev_put(out_dev);
1591 if (free_res)
1592 fib_res_put(&res);
1593 out: return err;
1594
1595 brd_input:
1596 if (skb->protocol != htons(ETH_P_IP))
1597 goto e_inval;
1598
1599 if (ZERONET(saddr))
1600 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1601 else {
1602 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1603 &itag);
1604 if (err < 0)
1605 goto martian_source;
1606 if (err)
1607 flags |= RTCF_DIRECTSRC;
1608 }
1609 flags |= RTCF_BROADCAST;
1610 res.type = RTN_BROADCAST;
1611 rt_cache_stat[smp_processor_id()].in_brd++;
1612
1613 local_input:
1614 rth = dst_alloc(&ipv4_dst_ops);
1615 if (!rth)
1616 goto e_nobufs;
1617
1618 rth->u.dst.output= ip_rt_bug;
1619
1620 atomic_set(&rth->u.dst.__refcnt, 1);
1621 rth->u.dst.flags= DST_HOST;
1622 rth->key.dst = daddr;
1623 rth->rt_dst = daddr;
1624 rth->key.tos = tos;
1625 #ifdef CONFIG_IP_ROUTE_FWMARK
1626 rth->key.fwmark = skb->nfmark;
1627 #endif
1628 rth->key.src = saddr;
1629 rth->rt_src = saddr;
1630 #ifdef CONFIG_IP_ROUTE_NAT
1631 rth->rt_dst_map = key.dst;
1632 rth->rt_src_map = key.src;
1633 #endif
1634 #ifdef CONFIG_NET_CLS_ROUTE
1635 rth->u.dst.tclassid = itag;
1636 #endif
1637 rth->rt_iif =
1638 rth->key.iif = dev->ifindex;
1639 rth->u.dst.dev = &loopback_dev;
1640 dev_hold(rth->u.dst.dev);
1641 rth->key.oif = 0;
1642 rth->rt_gateway = daddr;
1643 rth->rt_spec_dst= spec_dst;
1644 rth->u.dst.input= ip_local_deliver;
1645 rth->rt_flags = flags|RTCF_LOCAL;
1646 if (res.type == RTN_UNREACHABLE) {
1647 rth->u.dst.input= ip_error;
1648 rth->u.dst.error= -err;
1649 rth->rt_flags &= ~RTCF_LOCAL;
1650 }
1651 rth->rt_type = res.type;
1652 goto intern;
1653
1654 no_route:
1655 rt_cache_stat[smp_processor_id()].in_no_route++;
1656 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1657 res.type = RTN_UNREACHABLE;
1658 goto local_input;
1659
1660 /*
1661 * Do not cache martian addresses: they should be logged (RFC1812)
1662 */
1663 martian_destination:
1664 rt_cache_stat[smp_processor_id()].in_martian_dst++;
1665 #ifdef CONFIG_IP_ROUTE_VERBOSE
1666 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1667 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1668 "%u.%u.%u.%u, dev %s\n",
1669 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1670 #endif
1671 e_inval:
1672 err = -EINVAL;
1673 goto done;
1674
1675 e_nobufs:
1676 err = -ENOBUFS;
1677 goto done;
1678
1679 martian_source:
1680
1681 rt_cache_stat[smp_processor_id()].in_martian_src++;
1682 #ifdef CONFIG_IP_ROUTE_VERBOSE
1683 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1684 /*
1685 * RFC1812 recommendation: if the source is martian,
1686 * the only hint is the MAC header.
1687 */
1688 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1689 "%u.%u.%u.%u, on dev %s\n",
1690 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1691 if (dev->hard_header_len) {
1692 int i;
1693 unsigned char *p = skb->mac.raw;
1694 printk(KERN_WARNING "ll header: ");
1695 for (i = 0; i < dev->hard_header_len; i++, p++) {
1696 printk("%02x", *p);
1697 if (i < (dev->hard_header_len - 1))
1698 printk(":");
1699 }
1700 printk("\n");
1701 }
1702 }
1703 #endif
1704 goto e_inval;
1705 }
1706
1707 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1708 u8 tos, struct net_device *dev)
1709 {
1710 struct rtable * rth;
1711 unsigned hash;
1712 int iif = dev->ifindex;
1713
1714 tos &= IPTOS_RT_MASK;
1715 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1716
1717 read_lock(&rt_hash_table[hash].lock);
1718 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1719 if (rth->key.dst == daddr &&
1720 rth->key.src == saddr &&
1721 rth->key.iif == iif &&
1722 rth->key.oif == 0 &&
1723 #ifdef CONFIG_IP_ROUTE_FWMARK
1724 rth->key.fwmark == skb->nfmark &&
1725 #endif
1726 rth->key.tos == tos) {
1727 rth->u.dst.lastuse = jiffies;
1728 dst_hold(&rth->u.dst);
1729 rth->u.dst.__use++;
1730 rt_cache_stat[smp_processor_id()].in_hit++;
1731 read_unlock(&rt_hash_table[hash].lock);
1732 skb->dst = (struct dst_entry*)rth;
1733 return 0;
1734 }
1735 rt_cache_stat[smp_processor_id()].in_hlist_search++;
1736 }
1737 read_unlock(&rt_hash_table[hash].lock);
1738
1739 /* Multicast recognition logic was moved from the route cache to here.
1740 The problem was that too many Ethernet cards have broken/missing
1741 hardware multicast filters :-( As a result, a host on a multicast
1742 network acquires a lot of useless route cache entries, e.g. for
1743 SDR messages from all over the world. Now we try to get rid of them.
1744 Really, provided the software IP multicast filter is organized
1745 reasonably (at least, hashed), it does not result in a slowdown
1746 compared with route cache reject entries.
1747 Note that multicast routers are not affected, because
1748 a route cache entry is created eventually.
1749 */
1750 if (MULTICAST(daddr)) {
1751 struct in_device *in_dev;
1752
1753 read_lock(&inetdev_lock);
1754 if ((in_dev = __in_dev_get(dev)) != NULL) {
1755 int our = ip_check_mc(in_dev, daddr, saddr);
1756 if (our
1757 #ifdef CONFIG_IP_MROUTE
1758 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1759 #endif
1760 ) {
1761 read_unlock(&inetdev_lock);
1762 return ip_route_input_mc(skb, daddr, saddr,
1763 tos, dev, our);
1764 }
1765 }
1766 read_unlock(&inetdev_lock);
1767 return -EINVAL;
1768 }
1769 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1770 }
1771
1772 /*
1773 * Major route resolver routine.
1774 */
1775
1776 int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
1777 {
1778 struct rt_key key;
1779 struct fib_result res;
1780 unsigned flags = 0;
1781 struct rtable *rth;
1782 struct net_device *dev_out = NULL;
1783 unsigned hash;
1784 int free_res = 0;
1785 int err;
1786 u32 tos;
1787
1788 tos = oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK);
1789 key.dst = oldkey->dst;
1790 key.src = oldkey->src;
1791 key.tos = tos & IPTOS_RT_MASK;
1792 key.iif = loopback_dev.ifindex;
1793 key.oif = oldkey->oif;
1794 #ifdef CONFIG_IP_ROUTE_FWMARK
1795 key.fwmark = oldkey->fwmark;
1796 #endif
1797 key.scope = (tos & RTO_ONLINK) ? RT_SCOPE_LINK :
1798 RT_SCOPE_UNIVERSE;
1799 res.fi = NULL;
1800 #ifdef CONFIG_IP_MULTIPLE_TABLES
1801 res.r = NULL;
1802 #endif
1803
1804 if (oldkey->src) {
1805 err = -EINVAL;
1806 if (MULTICAST(oldkey->src) ||
1807 BADCLASS(oldkey->src) ||
1808 ZERONET(oldkey->src))
1809 goto out;
1810
1811 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1812 dev_out = ip_dev_find(oldkey->src);
1813 if (dev_out == NULL)
1814 goto out;
1815
1816 /* I removed check for oif == dev_out->oif here.
1817 It was wrong for two reasons:
1818 1. ip_dev_find(saddr) can return the wrong iface, if saddr is
1819 assigned to multiple interfaces.
1820 2. Moreover, we are allowed to send packets with saddr
1821 of another iface. --ANK
1822 */
1823
1824 if (oldkey->oif == 0
1825 && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
1826 /* Special hack: user can direct multicasts
1827 and limited broadcast via necessary interface
1828 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1829 This hack is not just for fun, it allows
1830 vic, vat and friends to work.
1831 They bind a socket to loopback, set ttl to zero
1832 and expect that it will work.
1833 From the viewpoint of the routing cache they are broken,
1834 because we are not allowed to build a multicast path
1835 with a loopback source addr (look, the routing cache
1836 cannot know that ttl is zero, so the packet
1837 will not leave this host and the route is valid).
1838 Luckily, this hack is a good workaround.
1839 */
1840
1841 key.oif = dev_out->ifindex;
1842 goto make_route;
1843 }
1844 if (dev_out)
1845 dev_put(dev_out);
1846 dev_out = NULL;
1847 }
1848 if (oldkey->oif) {
1849 dev_out = dev_get_by_index(oldkey->oif);
1850 err = -ENODEV;
1851 if (dev_out == NULL)
1852 goto out;
1853 if (__in_dev_get(dev_out) == NULL) {
1854 dev_put(dev_out);
1855 goto out; /* Wrong error code */
1856 }
1857
1858 if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
1859 if (!key.src)
1860 key.src = inet_select_addr(dev_out, 0,
1861 RT_SCOPE_LINK);
1862 goto make_route;
1863 }
1864 if (!key.src) {
1865 if (MULTICAST(oldkey->dst))
1866 key.src = inet_select_addr(dev_out, 0,
1867 key.scope);
1868 else if (!oldkey->dst)
1869 key.src = inet_select_addr(dev_out, 0,
1870 RT_SCOPE_HOST);
1871 }
1872 }
1873
1874 if (!key.dst) {
1875 key.dst = key.src;
1876 if (!key.dst)
1877 key.dst = key.src = htonl(INADDR_LOOPBACK);
1878 if (dev_out)
1879 dev_put(dev_out);
1880 dev_out = &loopback_dev;
1881 dev_hold(dev_out);
1882 key.oif = loopback_dev.ifindex;
1883 res.type = RTN_LOCAL;
1884 flags |= RTCF_LOCAL;
1885 goto make_route;
1886 }
1887
1888 if (fib_lookup(&key, &res)) {
1889 res.fi = NULL;
1890 if (oldkey->oif) {
1891 /* Apparently, the routing tables are wrong. Assume
1892 that the destination is on-link.
1893
1894 WHY? DW.
1895 Because we are allowed to send to iface
1896 even if it has NO routes and NO assigned
1897 addresses. When oif is specified, routing
1898 tables are looked up with only one purpose:
1899 to catch if destination is gatewayed, rather than
1900 direct. Moreover, if MSG_DONTROUTE is set,
1901 we send packet, ignoring both routing tables
1902 and ifaddr state. --ANK
1903
1904
1905 We could make it even if oif is unknown,
1906 likely IPv6, but we do not.
1907 */
1908
1909 if (key.src == 0)
1910 key.src = inet_select_addr(dev_out, 0,
1911 RT_SCOPE_LINK);
1912 res.type = RTN_UNICAST;
1913 goto make_route;
1914 }
1915 if (dev_out)
1916 dev_put(dev_out);
1917 err = -ENETUNREACH;
1918 goto out;
1919 }
1920 free_res = 1;
1921
1922 if (res.type == RTN_NAT)
1923 goto e_inval;
1924
1925 if (res.type == RTN_LOCAL) {
1926 if (!key.src)
1927 key.src = key.dst;
1928 if (dev_out)
1929 dev_put(dev_out);
1930 dev_out = &loopback_dev;
1931 dev_hold(dev_out);
1932 key.oif = dev_out->ifindex;
1933 if (res.fi)
1934 fib_info_put(res.fi);
1935 res.fi = NULL;
1936 flags |= RTCF_LOCAL;
1937 goto make_route;
1938 }
1939
1940 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1941 if (res.fi->fib_nhs > 1 && key.oif == 0)
1942 fib_select_multipath(&key, &res);
1943 else
1944 #endif
1945 if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif)
1946 fib_select_default(&key, &res);
1947
1948 if (!key.src)
1949 key.src = FIB_RES_PREFSRC(res);
1950
1951 if (dev_out)
1952 dev_put(dev_out);
1953 dev_out = FIB_RES_DEV(res);
1954 dev_hold(dev_out);
1955 key.oif = dev_out->ifindex;
1956
1957 make_route:
1958 if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1959 goto e_inval;
1960
1961 if (key.dst == 0xFFFFFFFF)
1962 res.type = RTN_BROADCAST;
1963 else if (MULTICAST(key.dst))
1964 res.type = RTN_MULTICAST;
1965 else if (BADCLASS(key.dst) || ZERONET(key.dst))
1966 goto e_inval;
1967
1968 if (dev_out->flags & IFF_LOOPBACK)
1969 flags |= RTCF_LOCAL;
1970
1971 if (res.type == RTN_BROADCAST) {
1972 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1973 if (res.fi) {
1974 fib_info_put(res.fi);
1975 res.fi = NULL;
1976 }
1977 } else if (res.type == RTN_MULTICAST) {
1978 flags |= RTCF_MULTICAST|RTCF_LOCAL;
1979 read_lock(&inetdev_lock);
1980 if (!__in_dev_get(dev_out) ||
1981 !ip_check_mc(__in_dev_get(dev_out),oldkey->dst,oldkey->src))
1982 flags &= ~RTCF_LOCAL;
1983 read_unlock(&inetdev_lock);
1984 /* If a multicast route does not exist, use
1985 the default one, but do not gateway in this case.
1986 Yes, it is a hack.
1987 */
1988 if (res.fi && res.prefixlen < 4) {
1989 fib_info_put(res.fi);
1990 res.fi = NULL;
1991 }
1992 }
1993
1994 rth = dst_alloc(&ipv4_dst_ops);
1995 if (!rth)
1996 goto e_nobufs;
1997
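	/* The cache entry is keyed on the caller's original key; rt_dst,
	   rt_src and the output device reflect the resolved route. */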
1998 atomic_set(&rth->u.dst.__refcnt, 1);
1999 rth->u.dst.flags= DST_HOST;
2000 rth->key.dst = oldkey->dst;
2001 rth->key.tos = tos;
2002 rth->key.src = oldkey->src;
2003 rth->key.iif = 0;
2004 rth->key.oif = oldkey->oif;
2005 #ifdef CONFIG_IP_ROUTE_FWMARK
2006 rth->key.fwmark = oldkey->fwmark;
2007 #endif
2008 rth->rt_dst = key.dst;
2009 rth->rt_src = key.src;
2010 #ifdef CONFIG_IP_ROUTE_NAT
2011 rth->rt_dst_map = key.dst;
2012 rth->rt_src_map = key.src;
2013 #endif
2014 rth->rt_iif = oldkey->oif ? : dev_out->ifindex;
2015 rth->u.dst.dev = dev_out;
2016 dev_hold(dev_out);
2017 rth->rt_gateway = key.dst;
2018 rth->rt_spec_dst= key.src;
2019
2020 rth->u.dst.output=ip_output;
2021
2022 rt_cache_stat[smp_processor_id()].out_slow_tot++;
2023
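	/* Select the input/output handlers: locally destined traffic is
	   fed to ip_local_deliver(); broadcast/multicast that also needs
	   local delivery on a real device goes out via ip_mc_output(),
	   and non-local multicast is handed to ip_mr_input() when
	   multicast forwarding is configured. */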
2024 if (flags & RTCF_LOCAL) {
2025 rth->u.dst.input = ip_local_deliver;
2026 rth->rt_spec_dst = key.dst;
2027 }
2028 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2029 rth->rt_spec_dst = key.src;
2030 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2031 rth->u.dst.output = ip_mc_output;
2032 rt_cache_stat[smp_processor_id()].out_slow_mc++;
2033 }
2034 #ifdef CONFIG_IP_MROUTE
2035 if (res.type == RTN_MULTICAST) {
2036 struct in_device *in_dev = in_dev_get(dev_out);
2037 if (in_dev) {
2038 if (IN_DEV_MFORWARD(in_dev) &&
2039 !LOCAL_MCAST(oldkey->dst)) {
2040 rth->u.dst.input = ip_mr_input;
2041 rth->u.dst.output = ip_mc_output;
2042 }
2043 in_dev_put(in_dev);
2044 }
2045 }
2046 #endif
2047 }
2048
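	/* Copy the nexthop data and metrics from the FIB result, then
	   hash the entry into the cache under the original key. */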
2049 rt_set_nexthop(rth, &res, 0);
2050
2051 rth->rt_flags = flags;
2052
2053 hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos);
2054 err = rt_intern_hash(hash, rth, rp);
2055 done:
2056 if (free_res)
2057 fib_res_put(&res);
2058 if (dev_out)
2059 dev_put(dev_out);
2060 out: return err;
2061
2062 e_inval:
2063 err = -EINVAL;
2064 goto done;
2065 e_nobufs:
2066 err = -ENOBUFS;
2067 goto done;
2068 }
2069
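/* Output routing, fast path: look the key up in the routing cache and
   fall back to ip_route_output_slow() on a miss. */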
2070 int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
2071 {
2072 unsigned hash;
2073 struct rtable *rth;
2074
2075 hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);
2076
2077 read_lock_bh(&rt_hash_table[hash].lock);
2078 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2079 if (rth->key.dst == key->dst &&
2080 rth->key.src == key->src &&
2081 rth->key.iif == 0 &&
2082 rth->key.oif == key->oif &&
2083 #ifdef CONFIG_IP_ROUTE_FWMARK
2084 rth->key.fwmark == key->fwmark &&
2085 #endif
2086 !((rth->key.tos ^ key->tos) &
2087 (IPTOS_RT_MASK | RTO_ONLINK))) {
2088 rth->u.dst.lastuse = jiffies;
2089 dst_hold(&rth->u.dst);
2090 rth->u.dst.__use++;
2091 rt_cache_stat[smp_processor_id()].out_hit++;
2092 read_unlock_bh(&rt_hash_table[hash].lock);
2093 *rp = rth;
2094 return 0;
2095 }
2096 rt_cache_stat[smp_processor_id()].out_hlist_search++;
2097 }
2098 read_unlock_bh(&rt_hash_table[hash].lock);
2099
2100 return ip_route_output_slow(rp, key);
2101 }
2102
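/* Build an RTM_NEWROUTE netlink message describing one cached route,
   including RTA_CACHEINFO statistics and, for forwarded multicast,
   the answer obtained from ipmr_get_route(). */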
2103 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2104 int nowait)
2105 {
2106 struct rtable *rt = (struct rtable*)skb->dst;
2107 struct rtmsg *r;
2108 struct nlmsghdr *nlh;
2109 unsigned char *b = skb->tail;
2110 struct rta_cacheinfo ci;
2111 #ifdef CONFIG_IP_MROUTE
2112 struct rtattr *eptr;
2113 #endif
2114 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2115 r = NLMSG_DATA(nlh);
2116 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2117 r->rtm_family = AF_INET;
2118 r->rtm_dst_len = 32;
2119 r->rtm_src_len = 0;
2120 r->rtm_tos = rt->key.tos;
2121 r->rtm_table = RT_TABLE_MAIN;
2122 r->rtm_type = rt->rt_type;
2123 r->rtm_scope = RT_SCOPE_UNIVERSE;
2124 r->rtm_protocol = RTPROT_UNSPEC;
2125 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2126 if (rt->rt_flags & RTCF_NOTIFY)
2127 r->rtm_flags |= RTM_F_NOTIFY;
2128 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2129 if (rt->key.src) {
2130 r->rtm_src_len = 32;
2131 RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
2132 }
2133 if (rt->u.dst.dev)
2134 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2135 #ifdef CONFIG_NET_CLS_ROUTE
2136 if (rt->u.dst.tclassid)
2137 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2138 #endif
2139 if (rt->key.iif)
2140 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2141 else if (rt->rt_src != rt->key.src)
2142 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2143 if (rt->rt_dst != rt->rt_gateway)
2144 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2145 if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
2146 goto rtattr_failure;
2147 ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
2148 ci.rta_used = rt->u.dst.__use;
2149 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2150 if (rt->u.dst.expires)
2151 ci.rta_expires = rt->u.dst.expires - jiffies;
2152 else
2153 ci.rta_expires = 0;
2154 ci.rta_error = rt->u.dst.error;
2155 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2156 if (rt->peer) {
2157 ci.rta_id = rt->peer->ip_id_count;
2158 if (rt->peer->tcp_ts_stamp) {
2159 ci.rta_ts = rt->peer->tcp_ts;
2160 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2161 }
2162 }
2163 #ifdef CONFIG_IP_MROUTE
2164 eptr = (struct rtattr*)skb->tail;
2165 #endif
2166 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2167 if (rt->key.iif) {
2168 #ifdef CONFIG_IP_MROUTE
2169 u32 dst = rt->rt_dst;
2170
2171 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2172 ipv4_devconf.mc_forwarding) {
2173 int err = ipmr_get_route(skb, r, nowait);
2174 if (err <= 0) {
2175 if (!nowait) {
2176 if (err == 0)
2177 return 0;
2178 goto nlmsg_failure;
2179 } else {
2180 if (err == -EMSGSIZE)
2181 goto nlmsg_failure;
2182 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2183 }
2184 }
2185 } else
2186 #endif
2187 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
2188 }
2189
2190 nlh->nlmsg_len = skb->tail - b;
2191 return skb->len;
2192
2193 nlmsg_failure:
2194 rtattr_failure:
2195 skb_trim(skb, b - skb->data);
2196 return -1;
2197 }
2198
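/* RTM_GETROUTE handler: resolve the requested destination through
   ip_route_input() (when an input interface is given) or through
   ip_route_output(), then return the result with rt_fill_info(). */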
2199 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2200 {
2201 struct rtattr **rta = arg;
2202 struct rtmsg *rtm = NLMSG_DATA(nlh);
2203 struct rtable *rt = NULL;
2204 u32 dst = 0;
2205 u32 src = 0;
2206 int iif = 0;
2207 int err = -ENOBUFS;
2208 struct sk_buff *skb;
2209
2210 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2211 if (!skb)
2212 goto out;
2213
2214 /* Reserve room for dummy headers; this skb can pass
2215 through a good chunk of the routing engine.
2216 */
2217 skb->mac.raw = skb->nh.raw = skb->data;
2218
2219 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2220 skb->nh.iph->protocol = IPPROTO_ICMP;
2221 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2222
2223 if (rta[RTA_SRC - 1])
2224 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2225 if (rta[RTA_DST - 1])
2226 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2227 if (rta[RTA_IIF - 1])
2228 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2229
2230 if (iif) {
2231 struct net_device *dev = __dev_get_by_index(iif);
2232 err = -ENODEV;
2233 if (!dev)
2234 goto out_free;
2235 skb->protocol = htons(ETH_P_IP);
2236 skb->dev = dev;
2237 local_bh_disable();
2238 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2239 local_bh_enable();
2240 rt = (struct rtable*)skb->dst;
2241 if (!err && rt->u.dst.error)
2242 err = -rt->u.dst.error;
2243 } else {
2244 int oif = 0;
2245 if (rta[RTA_OIF - 1])
2246 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2247 err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
2248 }
2249 if (err)
2250 goto out_free;
2251
2252 skb->dst = &rt->u.dst;
2253 if (rtm->rtm_flags & RTM_F_NOTIFY)
2254 rt->rt_flags |= RTCF_NOTIFY;
2255
2256 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2257
2258 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2259 RTM_NEWROUTE, 0);
2260 if (!err)
2261 goto out_free;
2262 if (err < 0) {
2263 err = -EMSGSIZE;
2264 goto out_free;
2265 }
2266
2267 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2268 if (err > 0)
2269 err = 0;
2270 out: return err;
2271
2272 out_free:
2273 kfree_skb(skb);
2274 goto out;
2275 }
2276
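/* Netlink dump of the whole routing cache; cb->args[] records the
   hash bucket and chain position so the dump can be resumed. */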
2277 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2278 {
2279 struct rtable *rt;
2280 int h, s_h;
2281 int idx, s_idx;
2282
2283 s_h = cb->args[0];
2284 s_idx = idx = cb->args[1];
2285 for (h = 0; h <= rt_hash_mask; h++) {
2286 if (h < s_h) continue;
2287 if (h > s_h)
2288 s_idx = 0;
2289 read_lock_bh(&rt_hash_table[h].lock);
2290 for (rt = rt_hash_table[h].chain, idx = 0; rt;
2291 rt = rt->u.rt_next, idx++) {
2292 if (idx < s_idx)
2293 continue;
2294 skb->dst = dst_clone(&rt->u.dst);
2295 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2296 cb->nlh->nlmsg_seq,
2297 RTM_NEWROUTE, 1) <= 0) {
2298 dst_release(xchg(&skb->dst, NULL));
2299 read_unlock_bh(&rt_hash_table[h].lock);
2300 goto done;
2301 }
2302 dst_release(xchg(&skb->dst, NULL));
2303 }
2304 read_unlock_bh(&rt_hash_table[h].lock);
2305 }
2306
2307 done:
2308 cb->args[0] = h;
2309 cb->args[1] = idx;
2310 return skb->len;
2311 }
2312
2313 void ip_rt_multicast_event(struct in_device *in_dev)
2314 {
2315 rt_cache_flush(0);
2316 }
2317
2318 #ifdef CONFIG_SYSCTL
2319 static int flush_delay;
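/* Writing a delay to the "flush" sysctl flushes the routing cache;
   reads are not supported and return -EINVAL. */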
2320
2321 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2322 struct file *filp, void *buffer,
2323 size_t *lenp)
2324 {
2325 if (write) {
2326 proc_dointvec(ctl, write, filp, buffer, lenp);
2327 rt_cache_flush(flush_delay);
2328 return 0;
2329 }
2330
2331 return -EINVAL;
2332 }
2333
2334 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
2335 int nlen, void *oldval,
2336 size_t *oldlenp, void *newval,
2337 size_t newlen, void **context)
2338 {
2339 int delay;
2340 if (newlen != sizeof(int))
2341 return -EINVAL;
2342 if (get_user(delay, (int *)newval))
2343 return -EFAULT;
2344 rt_cache_flush(delay);
2345 return 0;
2346 }
2347
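/* Route-cache tunables exported under /proc/sys/net/ipv4/route/.
   Entries handled by proc_dointvec_jiffies are read and written in
   seconds and converted to jiffies internally. */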
2348 ctl_table ipv4_route_table[] = {
2349 {
2350 ctl_name: NET_IPV4_ROUTE_FLUSH,
2351 procname: "flush",
2352 data: &flush_delay,
2353 maxlen: sizeof(int),
2354 mode: 0644,
2355 proc_handler: &ipv4_sysctl_rtcache_flush,
2356 strategy: &ipv4_sysctl_rtcache_flush_strategy,
2357 },
2358 {
2359 ctl_name: NET_IPV4_ROUTE_MIN_DELAY,
2360 procname: "min_delay",
2361 data: &ip_rt_min_delay,
2362 maxlen: sizeof(int),
2363 mode: 0644,
2364 proc_handler: &proc_dointvec_jiffies,
2365 strategy: &sysctl_jiffies,
2366 },
2367 {
2368 ctl_name: NET_IPV4_ROUTE_MAX_DELAY,
2369 procname: "max_delay",
2370 data: &ip_rt_max_delay,
2371 maxlen: sizeof(int),
2372 mode: 0644,
2373 proc_handler: &proc_dointvec_jiffies,
2374 strategy: &sysctl_jiffies,
2375 },
2376 {
2377 ctl_name: NET_IPV4_ROUTE_GC_THRESH,
2378 procname: "gc_thresh",
2379 data: &ipv4_dst_ops.gc_thresh,
2380 maxlen: sizeof(int),
2381 mode: 0644,
2382 proc_handler: &proc_dointvec,
2383 },
2384 {
2385 ctl_name: NET_IPV4_ROUTE_MAX_SIZE,
2386 procname: "max_size",
2387 data: &ip_rt_max_size,
2388 maxlen: sizeof(int),
2389 mode: 0644,
2390 proc_handler: &proc_dointvec,
2391 },
2392 {
2393 ctl_name: NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2394 procname: "gc_min_interval",
2395 data: &ip_rt_gc_min_interval,
2396 maxlen: sizeof(int),
2397 mode: 0644,
2398 proc_handler: &proc_dointvec_jiffies,
2399 strategy: &sysctl_jiffies,
2400 },
2401 {
2402 ctl_name: NET_IPV4_ROUTE_GC_TIMEOUT,
2403 procname: "gc_timeout",
2404 data: &ip_rt_gc_timeout,
2405 maxlen: sizeof(int),
2406 mode: 0644,
2407 proc_handler: &proc_dointvec_jiffies,
2408 strategy: &sysctl_jiffies,
2409 },
2410 {
2411 ctl_name: NET_IPV4_ROUTE_GC_INTERVAL,
2412 procname: "gc_interval",
2413 data: &ip_rt_gc_interval,
2414 maxlen: sizeof(int),
2415 mode: 0644,
2416 proc_handler: &proc_dointvec_jiffies,
2417 strategy: &sysctl_jiffies,
2418 },
2419 {
2420 ctl_name: NET_IPV4_ROUTE_REDIRECT_LOAD,
2421 procname: "redirect_load",
2422 data: &ip_rt_redirect_load,
2423 maxlen: sizeof(int),
2424 mode: 0644,
2425 proc_handler: &proc_dointvec,
2426 },
2427 {
2428 ctl_name: NET_IPV4_ROUTE_REDIRECT_NUMBER,
2429 procname: "redirect_number",
2430 data: &ip_rt_redirect_number,
2431 maxlen: sizeof(int),
2432 mode: 0644,
2433 proc_handler: &proc_dointvec,
2434 },
2435 {
2436 ctl_name: NET_IPV4_ROUTE_REDIRECT_SILENCE,
2437 procname: "redirect_silence",
2438 data: &ip_rt_redirect_silence,
2439 maxlen: sizeof(int),
2440 mode: 0644,
2441 proc_handler: &proc_dointvec,
2442 },
2443 {
2444 ctl_name: NET_IPV4_ROUTE_ERROR_COST,
2445 procname: "error_cost",
2446 data: &ip_rt_error_cost,
2447 maxlen: sizeof(int),
2448 mode: 0644,
2449 proc_handler: &proc_dointvec,
2450 },
2451 {
2452 ctl_name: NET_IPV4_ROUTE_ERROR_BURST,
2453 procname: "error_burst",
2454 data: &ip_rt_error_burst,
2455 maxlen: sizeof(int),
2456 mode: 0644,
2457 proc_handler: &proc_dointvec,
2458 },
2459 {
2460 ctl_name: NET_IPV4_ROUTE_GC_ELASTICITY,
2461 procname: "gc_elasticity",
2462 data: &ip_rt_gc_elasticity,
2463 maxlen: sizeof(int),
2464 mode: 0644,
2465 proc_handler: &proc_dointvec,
2466 },
2467 {
2468 ctl_name: NET_IPV4_ROUTE_MTU_EXPIRES,
2469 procname: "mtu_expires",
2470 data: &ip_rt_mtu_expires,
2471 maxlen: sizeof(int),
2472 mode: 0644,
2473 proc_handler: &proc_dointvec_jiffies,
2474 strategy: &sysctl_jiffies,
2475 },
2476 {
2477 ctl_name: NET_IPV4_ROUTE_MIN_PMTU,
2478 procname: "min_pmtu",
2479 data: &ip_rt_min_pmtu,
2480 maxlen: sizeof(int),
2481 mode: 0644,
2482 proc_handler: &proc_dointvec,
2483 },
2484 {
2485 ctl_name: NET_IPV4_ROUTE_MIN_ADVMSS,
2486 procname: "min_adv_mss",
2487 data: &ip_rt_min_advmss,
2488 maxlen: sizeof(int),
2489 mode: 0644,
2490 proc_handler: &proc_dointvec,
2491 },
2492 {
2493 ctl_name: NET_IPV4_ROUTE_SECRET_INTERVAL,
2494 procname: "secret_interval",
2495 data: &ip_rt_secret_interval,
2496 maxlen: sizeof(int),
2497 mode: 0644,
2498 proc_handler: &proc_dointvec_jiffies,
2499 strategy: &sysctl_jiffies,
2500 },
2501 { 0 }
2502 };
2503 #endif
2504
2505 #ifdef CONFIG_NET_CLS_ROUTE
2506 struct ip_rt_acct *ip_rt_acct;
2507
2508 /* This code sucks. But you should have seen it before! --RR */
2509
2510 /* IP route accounting ptr for this logical cpu number. */
2511 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + cpu_logical_map(i) * 256)
2512
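/* /proc/net/rt_acct read handler: returns the per-CPU route accounting
   counters summed over all CPUs, 32 bits at a time. */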
2513 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2514 int length, int *eof, void *data)
2515 {
2516 unsigned int i;
2517
2518 if ((offset & 3) || (length & 3))
2519 return -EIO;
2520
2521 if (offset >= sizeof(struct ip_rt_acct) * 256) {
2522 *eof = 1;
2523 return 0;
2524 }
2525
2526 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2527 length = sizeof(struct ip_rt_acct) * 256 - offset;
2528 *eof = 1;
2529 }
2530
2531 offset /= sizeof(u32);
2532
2533 if (length > 0) {
2534 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2535 u32 *dst = (u32 *) buffer;
2536
2537 /* Copy first cpu. */
2538 *start = buffer;
2539 memcpy(dst, src, length);
2540
2541 /* Add the other cpus in, one int at a time */
2542 for (i = 1; i < smp_num_cpus; i++) {
2543 unsigned int j;
2544
2545 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2546
2547 for (j = 0; j < length/4; j++)
2548 dst[j] += src[j];
2549 }
2550 }
2551 return length;
2552 }
2553 #endif
2554
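/* Boot-time initialization: allocate the dst slab cache and the route
   hash table (sized from available memory), set up the cache timers
   (the periodic expiry and secret-rebuild timers are started here),
   and register the /proc entries. */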
2555 void __init ip_rt_init(void)
2556 {
2557 int i, order, goal;
2558
2559 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2560 (jiffies ^ (jiffies >> 7)));
2561
2562 #ifdef CONFIG_NET_CLS_ROUTE
2563 for (order = 0;
2564 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2565 /* NOTHING */;
2566 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2567 if (!ip_rt_acct)
2568 panic("IP: failed to allocate ip_rt_acct\n");
2569 memset(ip_rt_acct, 0, PAGE_SIZE << order);
2570 #endif
2571
2572 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2573 sizeof(struct rtable),
2574 0, SLAB_HWCACHE_ALIGN,
2575 NULL, NULL);
2576
2577 if (!ipv4_dst_ops.kmem_cachep)
2578 panic("IP: failed to allocate ip_dst_cache\n");
2579
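	/* Aim for roughly one page of hash buckets per 64MB of physical
	   memory, rounded up to a power-of-two page order. */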
2580 goal = num_physpages >> (26 - PAGE_SHIFT);
2581
2582 for (order = 0; (1UL << order) < goal; order++)
2583 /* NOTHING */;
2584
2585 do {
2586 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2587 sizeof(struct rt_hash_bucket);
2588 while (rt_hash_mask & (rt_hash_mask - 1))
2589 rt_hash_mask--;
2590 rt_hash_table = (struct rt_hash_bucket *)
2591 __get_free_pages(GFP_ATOMIC, order);
2592 } while (rt_hash_table == NULL && --order > 0);
2593
2594 if (!rt_hash_table)
2595 panic("Failed to allocate IP route cache hash table\n");
2596
2597 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2598 rt_hash_mask,
2599 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2600
2601 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2602 /* NOTHING */;
2603
2604 rt_hash_mask--;
2605 for (i = 0; i <= rt_hash_mask; i++) {
2606 rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
2607 rt_hash_table[i].chain = NULL;
2608 }
2609
2610 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2611 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2612
2613 devinet_init();
2614 ip_fib_init();
2615
2616 rt_flush_timer.function = rt_run_flush;
2617 rt_periodic_timer.function = rt_check_expire;
2618 rt_secret_timer.function = rt_secret_rebuild;
2619
2620 /* All the timers started at system startup tend
2621 to synchronize. Perturb them a bit.
2622 */
2623 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2624 ip_rt_gc_interval;
2625 add_timer(&rt_periodic_timer);
2626
2627 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2628 ip_rt_secret_interval;
2629 add_timer(&rt_secret_timer);
2630
2631 proc_net_create ("rt_cache", 0, rt_cache_get_info);
2632 create_proc_info_entry ("rt_cache", 0, proc_net_stat,
2633 rt_cache_stat_get_info);
2634 #ifdef CONFIG_NET_CLS_ROUTE
2635 create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
2636 #endif
2637 }
2638