1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Version:	$Id: route.c,v 1.102.2.1 2002/01/12 07:43:57 davem Exp $
9  *
10  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *		Alan Cox	:	Verify area fixes.
18  *		Alan Cox	:	cli() protects routing changes
19  *		Rui Oliveira	:	ICMP routing table updates
20  *		(rco@di.uminho.pt)	Routing table insertion and update
21  *		Linus Torvalds	:	Rewrote bits to be sensible
22  *		Alan Cox	:	Added BSD route gw semantics
23  *		Alan Cox	:	Super /proc >4K
24  *		Alan Cox	:	MTU in route table
25  *		Alan Cox	: 	MSS actually. Also added the window
26  *					clamper.
27  *		Sam Lantinga	:	Fixed route matching in rt_del()
28  *		Alan Cox	:	Routing cache support.
29  *		Alan Cox	:	Removed compatibility cruft.
30  *		Alan Cox	:	RTF_REJECT support.
31  *		Alan Cox	:	TCP irtt support.
32  *		Jonathan Naylor	:	Added Metric support.
33  *	Miquel van Smoorenburg	:	BSD API fixes.
34  *	Miquel van Smoorenburg	:	Metrics.
35  *		Alan Cox	:	Use __u32 properly
36  *		Alan Cox	:	Aligned routing errors more closely with BSD
37  *					though our system is still very different.
38  *		Alan Cox	:	Faster /proc handling
39  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
40  *					routing caches and better behaviour.
41  *
42  *		Olaf Erb	:	irtt wasn't being copied right.
43  *		Bjorn Ekwall	:	Kerneld route support.
44  *		Alan Cox	:	Multicast fixed (I hope)
45  * 		Pavel Krauz	:	Limited broadcast fixed
46  *		Mike McLagan	:	Routing by source
47  *	Alexey Kuznetsov	:	End of old history. Split into fib.c and
48  *					route.c and rewritten from scratch.
49  *		Andi Kleen	:	Load-limit warning messages.
50  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
51  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
52  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
53  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
54  *		Marc Boucher	:	routing by fwmark
55  *	Robert Olsson		:	Added rt_cache statistics
56  *
57  *		This program is free software; you can redistribute it and/or
58  *		modify it under the terms of the GNU General Public License
59  *		as published by the Free Software Foundation; either version
60  *		2 of the License, or (at your option) any later version.
61  */
62 
63 #include <linux/config.h>
64 #include <asm/uaccess.h>
65 #include <asm/system.h>
66 #include <asm/bitops.h>
67 #include <linux/types.h>
68 #include <linux/kernel.h>
69 #include <linux/sched.h>
70 #include <linux/mm.h>
71 #include <linux/string.h>
72 #include <linux/socket.h>
73 #include <linux/sockios.h>
74 #include <linux/errno.h>
75 #include <linux/in.h>
76 #include <linux/inet.h>
77 #include <linux/netdevice.h>
78 #include <linux/proc_fs.h>
79 #include <linux/init.h>
80 #include <linux/skbuff.h>
81 #include <linux/rtnetlink.h>
82 #include <linux/inetdevice.h>
83 #include <linux/igmp.h>
84 #include <linux/pkt_sched.h>
85 #include <linux/mroute.h>
86 #include <linux/netfilter_ipv4.h>
87 #include <linux/random.h>
88 #include <linux/jhash.h>
89 #include <net/protocol.h>
90 #include <net/ip.h>
91 #include <net/route.h>
92 #include <net/inetpeer.h>
93 #include <net/sock.h>
94 #include <net/ip_fib.h>
95 #include <net/arp.h>
96 #include <net/tcp.h>
97 #include <net/icmp.h>
98 #ifdef CONFIG_SYSCTL
99 #include <linux/sysctl.h>
100 #endif
101 
102 #define IP_MAX_MTU	0xFFF0
103 
104 #define RT_GC_TIMEOUT (300*HZ)
105 
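/*
 * Default values for the route cache tuning knobs.  Most of the
 * intervals below are multiples of HZ, i.e. expressed in jiffies.
 */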
106 int ip_rt_min_delay		= 2 * HZ;
107 int ip_rt_max_delay		= 10 * HZ;
108 int ip_rt_max_size;
109 int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
110 int ip_rt_gc_interval		= 60 * HZ;
111 int ip_rt_gc_min_interval	= HZ / 2;
112 int ip_rt_redirect_number	= 9;
113 int ip_rt_redirect_load		= HZ / 50;
114 int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
115 int ip_rt_error_cost		= HZ;
116 int ip_rt_error_burst		= 5 * HZ;
117 int ip_rt_gc_elasticity		= 8;
118 int ip_rt_mtu_expires		= 10 * 60 * HZ;
119 int ip_rt_min_pmtu		= 512 + 20 + 20;
120 int ip_rt_min_advmss		= 256;
121 int ip_rt_secret_interval	= 10 * 60 * HZ;
122 static unsigned long rt_deadline;
123 
124 #define RTprint(a...)	printk(KERN_DEBUG a)
125 
126 static struct timer_list rt_flush_timer;
127 static struct timer_list rt_periodic_timer;
128 static struct timer_list rt_secret_timer;
129 
130 /*
131  *	Interface to generic destination cache.
132  */
133 
134 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
135 static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
136 					   struct sk_buff *skb);
137 static void		 ipv4_dst_destroy(struct dst_entry *dst);
138 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
139 static void		 ipv4_link_failure(struct sk_buff *skb);
140 static int rt_garbage_collect(void);
141 
142 
143 struct dst_ops ipv4_dst_ops = {
144 	family:			AF_INET,
145 	protocol:		__constant_htons(ETH_P_IP),
146 	gc:			rt_garbage_collect,
147 	check:			ipv4_dst_check,
148 	reroute:		ipv4_dst_reroute,
149 	destroy:		ipv4_dst_destroy,
150 	negative_advice:	ipv4_negative_advice,
151 	link_failure:		ipv4_link_failure,
152 	entry_size:		sizeof(struct rtable),
153 };
154 
155 #define ECN_OR_COST(class)	TC_PRIO_##class
156 
157 __u8 ip_tos2prio[16] = {
158 	TC_PRIO_BESTEFFORT,
159 	ECN_OR_COST(FILLER),
160 	TC_PRIO_BESTEFFORT,
161 	ECN_OR_COST(BESTEFFORT),
162 	TC_PRIO_BULK,
163 	ECN_OR_COST(BULK),
164 	TC_PRIO_BULK,
165 	ECN_OR_COST(BULK),
166 	TC_PRIO_INTERACTIVE,
167 	ECN_OR_COST(INTERACTIVE),
168 	TC_PRIO_INTERACTIVE,
169 	ECN_OR_COST(INTERACTIVE),
170 	TC_PRIO_INTERACTIVE_BULK,
171 	ECN_OR_COST(INTERACTIVE_BULK),
172 	TC_PRIO_INTERACTIVE_BULK,
173 	ECN_OR_COST(INTERACTIVE_BULK)
174 };
175 
176 
177 /*
178  * Route cache.
179  */
180 
181 /* The locking scheme is rather straight forward:
182  *
183  * 1) BH-protected rwlocks protect the buckets of the central route hash.
184  * 2) Only writers remove entries, and they hold the lock
185  *    as they look at rtable reference counts.
186  * 3) Only readers acquire references to rtable entries;
187  *    they do so with atomic increments and with the
188  *    lock held.
189  */
190 
191 struct rt_hash_bucket {
192 	struct rtable	*chain;
193 	rwlock_t	lock;
194 } __attribute__((__aligned__(8)));
195 
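/*
 * The central route cache hash table.  rt_hash_mask and rt_hash_log
 * describe its size (a power-of-two number of buckets), and rt_hash_rnd
 * perturbs the hash so chain placement is hard to predict from outside.
 */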
196 static struct rt_hash_bucket 	*rt_hash_table;
197 static unsigned			rt_hash_mask;
198 static int			rt_hash_log;
199 static unsigned int		rt_hash_rnd;
200 
201 struct rt_cache_stat rt_cache_stat[NR_CPUS];
202 
203 static int rt_intern_hash(unsigned hash, struct rtable *rth,
204 				struct rtable **res);
205 
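/* Fold the (daddr, saddr, tos) lookup key into a bucket index using jhash
 * and the random seed rt_hash_rnd (re-chosen on every cache flush).
 */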
206 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
207 {
208 	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
209 		& rt_hash_mask);
210 }
211 
212 static int rt_cache_get_info(char *buffer, char **start, off_t offset,
213 				int length)
214 {
215 	int len = 0;
216 	off_t pos = 128;
217 	char temp[256];
218 	struct rtable *r;
219 	int i;
220 
221 	if (offset < 128) {
222 		sprintf(buffer, "%-127s\n",
223 			"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 			"Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 			"HHUptod\tSpecDst");
226 		len = 128;
227   	}
228 
229 	for (i = rt_hash_mask; i >= 0; i--) {
230 		read_lock_bh(&rt_hash_table[i].lock);
231 		for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
232 			/*
233 			 *	Spin through entries until we are ready
234 			 */
235 			pos += 128;
236 
237 			if (pos <= offset) {
238 				len = 0;
239 				continue;
240 			}
241 			sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
242 				"%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
243 				r->u.dst.dev ? r->u.dst.dev->name : "*",
244 				(unsigned long)r->rt_dst,
245 				(unsigned long)r->rt_gateway,
246 				r->rt_flags,
247 				atomic_read(&r->u.dst.__refcnt),
248 				r->u.dst.__use,
249 				0,
250 				(unsigned long)r->rt_src,
251 				(r->u.dst.advmss ?
252 				 (int) r->u.dst.advmss + 40 : 0),
253 				r->u.dst.window,
254 				(int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
255 				r->key.tos,
256 				r->u.dst.hh ?
257 					atomic_read(&r->u.dst.hh->hh_refcnt) :
258 					-1,
259 				r->u.dst.hh ?
260 			       		(r->u.dst.hh->hh_output ==
261 					 dev_queue_xmit) : 0,
262 				r->rt_spec_dst);
263 			sprintf(buffer + len, "%-127s\n", temp);
264 			len += 128;
265 			if (pos >= offset+length) {
266 				read_unlock_bh(&rt_hash_table[i].lock);
267 				goto done;
268 			}
269 		}
270 		read_unlock_bh(&rt_hash_table[i].lock);
271         }
272 
273 done:
274   	*start = buffer + len - (pos - offset);
275   	len = pos - offset;
276   	if (len > length)
277   		len = length;
278   	return len;
279 }
280 
281 static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
282 {
283 	unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
284 	int i, lcpu;
285 	int len = 0;
286 
287  	len += sprintf(buffer+len, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
288         for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) {
289                 i = cpu_logical_map(lcpu);
290 
291 		len += sprintf(buffer+len, "%08x  %08x %08x %08x %08x %08x %08x %08x  %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
292 			       dst_entries,
293 			       rt_cache_stat[i].in_hit,
294 			       rt_cache_stat[i].in_slow_tot,
295 			       rt_cache_stat[i].in_slow_mc,
296 			       rt_cache_stat[i].in_no_route,
297 			       rt_cache_stat[i].in_brd,
298 			       rt_cache_stat[i].in_martian_dst,
299 			       rt_cache_stat[i].in_martian_src,
300 
301 			       rt_cache_stat[i].out_hit,
302 			       rt_cache_stat[i].out_slow_tot,
303 			       rt_cache_stat[i].out_slow_mc,
304 
305 			       rt_cache_stat[i].gc_total,
306 			       rt_cache_stat[i].gc_ignored,
307 			       rt_cache_stat[i].gc_goal_miss,
308 			       rt_cache_stat[i].gc_dst_overflow,
309 			       rt_cache_stat[i].in_hlist_search,
310 			       rt_cache_stat[i].out_hlist_search
311 
312 			);
313 	}
314 	len -= offset;
315 
316 	if (len > length)
317 		len = length;
318 	if (len < 0)
319 		len = 0;
320 
321 	*start = buffer + offset;
322   	return len;
323 }
324 
325 static __inline__ void rt_free(struct rtable *rt)
326 {
327 	dst_free(&rt->u.dst);
328 }
329 
330 static __inline__ void rt_drop(struct rtable *rt)
331 {
332 	ip_rt_put(rt);
333 	dst_free(&rt->u.dst);
334 }
335 
336 static __inline__ int rt_fast_clean(struct rtable *rth)
337 {
338 	/* Kill broadcast/multicast entries very aggressively if they
339 	   collide in the hash table with more useful entries */
340 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
341 		rth->key.iif && rth->u.rt_next;
342 }
343 
344 static __inline__ int rt_valuable(struct rtable *rth)
345 {
346 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
347 		rth->u.dst.expires;
348 }
349 
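/* An entry may expire only while it is unreferenced, and then only if its
 * hard expiry (dst.expires) has passed or it has been idle for longer than
 * tmo1 (tmo2 for "valuable" entries); rt_fast_clean() entries get no tmo1
 * grace period at all.
 */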
350 static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
351 {
352 	unsigned long age;
353 	int ret = 0;
354 
355 	if (atomic_read(&rth->u.dst.__refcnt))
356 		goto out;
357 
358 	ret = 1;
359 	if (rth->u.dst.expires &&
360 	    time_after_eq(jiffies, rth->u.dst.expires))
361 		goto out;
362 
363 	age = jiffies - rth->u.dst.lastuse;
364 	ret = 0;
365 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
366 	    (age <= tmo2 && rt_valuable(rth)))
367 		goto out;
368 	ret = 1;
369 out:	return ret;
370 }
371 
372 /* Bits of score are:
373  * 31: very valuable
374  * 30: not quite useless
375  * 29..0: usage counter
376  */
377 static inline u32 rt_score(struct rtable *rt)
378 {
379 	u32 score = jiffies - rt->u.dst.lastuse;
380 
381 	score = ~score & ~(3<<30);
382 
383 	if (rt_valuable(rt))
384 		score |= (1<<31);
385 
386 	if (!rt->key.iif ||
387 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
388 		score |= (1<<30);
389 
390 	return score;
391 }
392 
393 /* This runs via a timer and thus is always in BH context. */
394 static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
395 {
396 	static int rover;
397 	int i = rover, t;
398 	struct rtable *rth, **rthp;
399 	unsigned long now = jiffies;
400 
401 	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
402 	     t -= ip_rt_gc_timeout) {
403 		unsigned long tmo = ip_rt_gc_timeout;
404 
405 		i = (i + 1) & rt_hash_mask;
406 		rthp = &rt_hash_table[i].chain;
407 
408 		write_lock(&rt_hash_table[i].lock);
409 		while ((rth = *rthp) != NULL) {
410 			if (rth->u.dst.expires) {
411 				/* Entry is expired even if it is in use */
412 				if (time_before_eq(now, rth->u.dst.expires)) {
413 					tmo >>= 1;
414 					rthp = &rth->u.rt_next;
415 					continue;
416 				}
417 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
418 				tmo >>= 1;
419 				rthp = &rth->u.rt_next;
420 				continue;
421 			}
422 
423 			/* Cleanup aged off entries. */
424 			*rthp = rth->u.rt_next;
425 			rt_free(rth);
426 		}
427 		write_unlock(&rt_hash_table[i].lock);
428 
429 		/* Fallback loop breaker. */
430 		if (time_after(jiffies, now))
431 			break;
432 	}
433 	rover = i;
434 	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
435 }
436 
437 SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);
438 
439 /* This can run from both BH and non-BH contexts, the latter
440  * in the case of a forced flush event.
441  */
442 static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
443 {
444 	int i;
445 	struct rtable *rth, *next;
446 
447 	rt_deadline = 0;
448 
449 	get_random_bytes(&rt_hash_rnd, 4);
450 
451 	for (i = rt_hash_mask; i >= 0; i--) {
452 		write_lock_bh(&rt_hash_table[i].lock);
453 		rth = rt_hash_table[i].chain;
454 		if (rth)
455 			rt_hash_table[i].chain = NULL;
456 		write_unlock_bh(&rt_hash_table[i].lock);
457 
458 		for (; rth; rth = next) {
459 			next = rth->u.rt_next;
460 			rt_free(rth);
461 		}
462 	}
463 }
464 
465 SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);
466 
467 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
468 
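/* Flush the route cache.  delay < 0 means "use ip_rt_min_delay",
 * delay == 0 flushes synchronously, and a positive delay arms
 * rt_flush_timer while guaranteeing that the flush happens no later
 * than ip_rt_max_delay after the first pending request.
 */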
469 void rt_cache_flush(int delay)
470 {
471 	unsigned long now = jiffies;
472 	int user_mode = !in_softirq();
473 
474 	if (delay < 0)
475 		delay = ip_rt_min_delay;
476 
477 	spin_lock_bh(&rt_flush_lock);
478 
479 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
480 		long tmo = (long)(rt_deadline - now);
481 
482 		/* If the flush timer is already running
483 		   and the flush request is not immediate (delay > 0):
484 
485 		   if the deadline has not been reached yet, extend the timer to "delay";
486 		   otherwise fire it at the deadline.
487 		 */
488 
489 		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
490 			tmo = 0;
491 
492 		if (delay > tmo)
493 			delay = tmo;
494 	}
495 
496 	if (delay <= 0) {
497 		spin_unlock_bh(&rt_flush_lock);
498 		SMP_TIMER_NAME(rt_run_flush)(0);
499 		return;
500 	}
501 
502 	if (rt_deadline == 0)
503 		rt_deadline = now + ip_rt_max_delay;
504 
505 	mod_timer(&rt_flush_timer, now+delay);
506 	spin_unlock_bh(&rt_flush_lock);
507 }
508 
509 static void rt_secret_rebuild(unsigned long dummy)
510 {
511 	unsigned long now = jiffies;
512 
513 	rt_cache_flush(0);
514 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
515 }
516 
517 /*
518    Short description of GC goals.
519 
520    We want to build an algorithm which keeps the routing cache
521    at an equilibrium point, where the number of aged-off entries
522    stays approximately equal to the number of newly generated ones.
523 
524    The current expiration strength is the variable "expire".
525    We try to adjust it dynamically, so that when the network
526    is idle "expire" is large enough to keep plenty of warm entries,
527    and when the load increases it shrinks to limit the cache size.
528  */
529 
530 static int rt_garbage_collect(void)
531 {
532 	static unsigned long expire = RT_GC_TIMEOUT;
533 	static unsigned long last_gc;
534 	static int rover;
535 	static int equilibrium;
536 	struct rtable *rth, **rthp;
537 	unsigned long now = jiffies;
538 	int goal;
539 
540 	/*
541 	 * Garbage collection is pretty expensive,
542 	 * do not make it too frequently.
543 	 */
544 
545 	rt_cache_stat[smp_processor_id()].gc_total++;
546 
547 	if (now - last_gc < ip_rt_gc_min_interval &&
548 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
549 		rt_cache_stat[smp_processor_id()].gc_ignored++;
550 		goto out;
551 	}
552 
553 	/* Calculate number of entries, which we want to expire now. */
554 	goal = atomic_read(&ipv4_dst_ops.entries) -
555 		(ip_rt_gc_elasticity << rt_hash_log);
556 	if (goal <= 0) {
557 		if (equilibrium < ipv4_dst_ops.gc_thresh)
558 			equilibrium = ipv4_dst_ops.gc_thresh;
559 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
560 		if (goal > 0) {
561 			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
562 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
563 		}
564 	} else {
565 		/* We are in dangerous area. Try to reduce cache really
566 		 * aggressively.
567 		 */
568 		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
569 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
570 	}
571 
572 	if (now - last_gc >= ip_rt_gc_min_interval)
573 		last_gc = now;
574 
575 	if (goal <= 0) {
576 		equilibrium += goal;
577 		goto work_done;
578 	}
579 
580 	do {
581 		int i, k;
582 
583 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
584 			unsigned long tmo = expire;
585 
586 			k = (k + 1) & rt_hash_mask;
587 			rthp = &rt_hash_table[k].chain;
588 			write_lock_bh(&rt_hash_table[k].lock);
589 			while ((rth = *rthp) != NULL) {
590 				if (!rt_may_expire(rth, tmo, expire)) {
591 					tmo >>= 1;
592 					rthp = &rth->u.rt_next;
593 					continue;
594 				}
595 				*rthp = rth->u.rt_next;
596 				rt_free(rth);
597 				goal--;
598 			}
599 			write_unlock_bh(&rt_hash_table[k].lock);
600 			if (goal <= 0)
601 				break;
602 		}
603 		rover = k;
604 
605 		if (goal <= 0)
606 			goto work_done;
607 
608 		/* Goal is not achieved. We stop the process if:
609 
610 		   - expire has been reduced to zero (otherwise expire is halved);
611 		   - the table is not full;
612 		   - we are called from interrupt context;
613 		   - the jiffies check is just a fallback/debug loop breaker.
614 		     We will not spin here for a long time in any case.
615 		 */
616 
617 		rt_cache_stat[smp_processor_id()].gc_goal_miss++;
618 
619 		if (expire == 0)
620 			break;
621 
622 		expire >>= 1;
623 #if RT_CACHE_DEBUG >= 2
624 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
625 				atomic_read(&ipv4_dst_ops.entries), goal, i);
626 #endif
627 
628 		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
629 			goto out;
630 	} while (!in_softirq() && time_before_eq(jiffies, now));
631 
632 	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
633 		goto out;
634 	if (net_ratelimit())
635 		printk(KERN_WARNING "dst cache overflow\n");
636 	rt_cache_stat[smp_processor_id()].gc_dst_overflow++;
637 	return 1;
638 
639 work_done:
640 	expire += ip_rt_gc_min_interval;
641 	if (expire > ip_rt_gc_timeout ||
642 	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
643 		expire = ip_rt_gc_timeout;
644 #if RT_CACHE_DEBUG >= 2
645 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
646 			atomic_read(&ipv4_dst_ops.entries), goal, rover);
647 #endif
648 out:	return 0;
649 }
650 
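/* Insert rt into bucket "hash".  If an entry with an identical key already
 * exists it is moved to the front and returned via *rp, and the new route
 * is dropped.  Otherwise an over-long chain may first lose its lowest-scoring
 * unreferenced entry, the route is bound to an ARP neighbour where required
 * (retrying after an emergency GC if the neighbour table is full), and the
 * new entry is linked in at the head of the chain.
 */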
651 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
652 {
653 	struct rtable	*rth, **rthp;
654 	unsigned long	now;
655 	struct rtable *cand, **candp;
656 	u32 		min_score;
657 	int		chain_length;
658 	int attempts = !in_softirq();
659 
660 restart:
661 	chain_length = 0;
662 	min_score = ~(u32)0;
663 	cand = NULL;
664 	candp = NULL;
665 	now = jiffies;
666 
667 	rthp = &rt_hash_table[hash].chain;
668 
669 	write_lock_bh(&rt_hash_table[hash].lock);
670 	while ((rth = *rthp) != NULL) {
671 		if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
672 			/* Put it first */
673 			*rthp = rth->u.rt_next;
674 			rth->u.rt_next = rt_hash_table[hash].chain;
675 			rt_hash_table[hash].chain = rth;
676 
677 			rth->u.dst.__use++;
678 			dst_hold(&rth->u.dst);
679 			rth->u.dst.lastuse = now;
680 			write_unlock_bh(&rt_hash_table[hash].lock);
681 
682 			rt_drop(rt);
683 			*rp = rth;
684 			return 0;
685 		}
686 
687 		if (!atomic_read(&rth->u.dst.__refcnt)) {
688 			u32 score = rt_score(rth);
689 
690 			if (score <= min_score) {
691 				cand = rth;
692 				candp = rthp;
693 				min_score = score;
694 			}
695 		}
696 
697 		chain_length++;
698 
699 		rthp = &rth->u.rt_next;
700 	}
701 
702 	if (cand) {
703 		/* ip_rt_gc_elasticity used to be the average chain length;
704 		 * once it is exceeded, gc becomes really aggressive.
705 		 *
706 		 * The second limit is less certain. At the moment it allows
707 		 * only 2 entries per bucket. We will see.
708 		 */
709 		if (chain_length > ip_rt_gc_elasticity) {
710 			*candp = cand->u.rt_next;
711 			rt_free(cand);
712 		}
713 	}
714 
715 	/* Try to bind the route to an ARP neighbour only if it is an
716 	   output route or on the unicast forwarding path.
717 	 */
718 	if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
719 		int err = arp_bind_neighbour(&rt->u.dst);
720 		if (err) {
721 			write_unlock_bh(&rt_hash_table[hash].lock);
722 
723 			if (err != -ENOBUFS) {
724 				rt_drop(rt);
725 				return err;
726 			}
727 
728 			/* Neighbour tables are full and nothing
729 			   can be released. Try to shrink the route cache;
730 			   most likely it holds some neighbour records.
731 			 */
732 			if (attempts-- > 0) {
733 				int saved_elasticity = ip_rt_gc_elasticity;
734 				int saved_int = ip_rt_gc_min_interval;
735 				ip_rt_gc_elasticity	= 1;
736 				ip_rt_gc_min_interval	= 0;
737 				rt_garbage_collect();
738 				ip_rt_gc_min_interval	= saved_int;
739 				ip_rt_gc_elasticity	= saved_elasticity;
740 				goto restart;
741 			}
742 
743 			if (net_ratelimit())
744 				printk(KERN_WARNING "Neighbour table overflow.\n");
745 			rt_drop(rt);
746 			return -ENOBUFS;
747 		}
748 	}
749 
750 	rt->u.rt_next = rt_hash_table[hash].chain;
751 #if RT_CACHE_DEBUG >= 2
752 	if (rt->u.rt_next) {
753 		struct rtable *trt;
754 		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
755 		       NIPQUAD(rt->rt_dst));
756 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
757 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
758 		printk("\n");
759 	}
760 #endif
761 	rt_hash_table[hash].chain = rt;
762 	write_unlock_bh(&rt_hash_table[hash].lock);
763 	*rp = rt;
764 	return 0;
765 }
766 
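/* Attach the long-lived inet_peer entry for rt_dst to this route (used by
 * __ip_select_ident() below for IP ID generation); rt_peer_lock resolves
 * the race so that only a single peer reference is kept.
 */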
767 void rt_bind_peer(struct rtable *rt, int create)
768 {
769 	static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
770 	struct inet_peer *peer;
771 
772 	peer = inet_getpeer(rt->rt_dst, create);
773 
774 	spin_lock_bh(&rt_peer_lock);
775 	if (rt->peer == NULL) {
776 		rt->peer = peer;
777 		peer = NULL;
778 	}
779 	spin_unlock_bh(&rt_peer_lock);
780 	if (peer)
781 		inet_putpeer(peer);
782 }
783 
784 /*
785  * Peer allocation may fail only in serious out-of-memory conditions.  However,
786  * we can still generate some output.
787  * Random ID selection looks a bit dangerous because we have no chance of
788  * selecting an ID that is unique within a reasonable period of time.
789  * But a broken packet identifier may be better than no packet at all.
790  */
791 static void ip_select_fb_ident(struct iphdr *iph)
792 {
793 	static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
794 	static u32 ip_fallback_id;
795 	u32 salt;
796 
797 	spin_lock_bh(&ip_fb_id_lock);
798 	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
799 	iph->id = htons(salt & 0xFFFF);
800 	ip_fallback_id = salt;
801 	spin_unlock_bh(&ip_fb_id_lock);
802 }
803 
804 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
805 {
806 	struct rtable *rt = (struct rtable *) dst;
807 
808 	if (rt) {
809 		if (rt->peer == NULL)
810 			rt_bind_peer(rt, 1);
811 
812 		/* If a peer is attached to the destination, it is never detached,
813 		   so we need not grab a lock to dereference it.
814 		 */
815 		if (rt->peer) {
816 			iph->id = htons(inet_getid(rt->peer));
817 			return;
818 		}
819 	} else
820 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
821 
822 	ip_select_fb_ident(iph);
823 }
824 
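/* Drop one reference with ip_rt_put() and, if the entry is still chained
 * in bucket "hash", unlink and free it.
 */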
825 static void rt_del(unsigned hash, struct rtable *rt)
826 {
827 	struct rtable **rthp;
828 
829 	write_lock_bh(&rt_hash_table[hash].lock);
830 	ip_rt_put(rt);
831 	for (rthp = &rt_hash_table[hash].chain; *rthp;
832 	     rthp = &(*rthp)->u.rt_next)
833 		if (*rthp == rt) {
834 			*rthp = rt->u.rt_next;
835 			rt_free(rt);
836 			break;
837 		}
838 	write_unlock_bh(&rt_hash_table[hash].lock);
839 }
840 
841 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
842 		    u32 saddr, u8 tos, struct net_device *dev)
843 {
844 	int i, k;
845 	struct in_device *in_dev = in_dev_get(dev);
846 	struct rtable *rth, **rthp;
847 	u32  skeys[2] = { saddr, 0 };
848 	int  ikeys[2] = { dev->ifindex, 0 };
849 
850 	tos &= IPTOS_RT_MASK;
851 
852 	if (!in_dev)
853 		return;
854 
855 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
856 	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
857 		goto reject_redirect;
858 
859 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
860 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
861 			goto reject_redirect;
862 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
863 			goto reject_redirect;
864 	} else {
865 		if (inet_addr_type(new_gw) != RTN_UNICAST)
866 			goto reject_redirect;
867 	}
868 
869 	for (i = 0; i < 2; i++) {
870 		for (k = 0; k < 2; k++) {
871 			unsigned hash = rt_hash_code(daddr,
872 						     skeys[i] ^ (ikeys[k] << 5),
873 						     tos);
874 
875 			rthp=&rt_hash_table[hash].chain;
876 
877 			read_lock(&rt_hash_table[hash].lock);
878 			while ((rth = *rthp) != NULL) {
879 				struct rtable *rt;
880 
881 				if (rth->key.dst != daddr ||
882 				    rth->key.src != skeys[i] ||
883 				    rth->key.tos != tos ||
884 				    rth->key.oif != ikeys[k] ||
885 				    rth->key.iif != 0) {
886 					rthp = &rth->u.rt_next;
887 					continue;
888 				}
889 
890 				if (rth->rt_dst != daddr ||
891 				    rth->rt_src != saddr ||
892 				    rth->u.dst.error ||
893 				    rth->rt_gateway != old_gw ||
894 				    rth->u.dst.dev != dev)
895 					break;
896 
897 				dst_hold(&rth->u.dst);
898 				read_unlock(&rt_hash_table[hash].lock);
899 
900 				rt = dst_alloc(&ipv4_dst_ops);
901 				if (rt == NULL) {
902 					ip_rt_put(rth);
903 					in_dev_put(in_dev);
904 					return;
905 				}
906 
907 				/* Copy all the information. */
908 				*rt = *rth;
909 				rt->u.dst.__use		= 1;
910 				atomic_set(&rt->u.dst.__refcnt, 1);
911 				if (rt->u.dst.dev)
912 					dev_hold(rt->u.dst.dev);
913 				rt->u.dst.lastuse	= jiffies;
914 				rt->u.dst.neighbour	= NULL;
915 				rt->u.dst.hh		= NULL;
916 				rt->u.dst.obsolete	= 0;
917 
918 				rt->rt_flags		|= RTCF_REDIRECTED;
919 
920 				/* Gateway is different ... */
921 				rt->rt_gateway		= new_gw;
922 
923 				/* Redirect received -> path was valid */
924 				dst_confirm(&rth->u.dst);
925 
926 				if (rt->peer)
927 					atomic_inc(&rt->peer->refcnt);
928 
929 				if (arp_bind_neighbour(&rt->u.dst) ||
930 				    !(rt->u.dst.neighbour->nud_state &
931 					    NUD_VALID)) {
932 					if (rt->u.dst.neighbour)
933 						neigh_event_send(rt->u.dst.neighbour, NULL);
934 					ip_rt_put(rth);
935 					rt_drop(rt);
936 					goto do_next;
937 				}
938 
939 				rt_del(hash, rth);
940 				if (!rt_intern_hash(hash, rt, &rt))
941 					ip_rt_put(rt);
942 				goto do_next;
943 			}
944 			read_unlock(&rt_hash_table[hash].lock);
945 		do_next:
946 			;
947 		}
948 	}
949 	in_dev_put(in_dev);
950 	return;
951 
952 reject_redirect:
953 #ifdef CONFIG_IP_ROUTE_VERBOSE
954 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
955 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
956 			"%u.%u.%u.%u ignored.\n"
957 			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
958 			"tos %02x\n",
959 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
960 		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
961 #endif
962 	in_dev_put(in_dev);
963 }
964 
965 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
966 {
967 	struct rtable *rt = (struct rtable*)dst;
968 	struct dst_entry *ret = dst;
969 
970 	if (rt) {
971 		if (dst->obsolete) {
972 			ip_rt_put(rt);
973 			ret = NULL;
974 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
975 			   rt->u.dst.expires) {
976 			unsigned hash = rt_hash_code(rt->key.dst,
977 						     rt->key.src ^
978 							(rt->key.oif << 5),
979 						     rt->key.tos);
980 #if RT_CACHE_DEBUG >= 1
981 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
982 					  "%u.%u.%u.%u/%02x dropped\n",
983 				NIPQUAD(rt->rt_dst), rt->key.tos);
984 #endif
985 			rt_del(hash, rt);
986 			ret = NULL;
987 		}
988 	}
989 	return ret;
990 }
991 
992 /*
993  * Algorithm:
994  *	1. The first ip_rt_redirect_number redirects are sent
995  *	   with exponential backoff, then we stop sending them at all,
996  *	   assuming that the host ignores our redirects.
997  *	2. If we did not see packets requiring redirects
998  *	   during ip_rt_redirect_silence, we assume that the host
999  *	   has forgotten the redirected route and start sending redirects again.
1000  *
1001  * This algorithm is much cheaper and more intelligent than dumb load limiting
1002  * in icmp.c.
1003  *
1004  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1005  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1006  */
1007 
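/* With the default values above (ip_rt_redirect_number = 9,
 * ip_rt_redirect_load = HZ/50) the required gap between successive
 * redirects doubles from HZ/50 up to (HZ/50) << 8, roughly 5 seconds,
 * and ip_rt_redirect_silence = (HZ/50) << 10 is roughly 20 seconds.
 */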
1008 void ip_rt_send_redirect(struct sk_buff *skb)
1009 {
1010 	struct rtable *rt = (struct rtable*)skb->dst;
1011 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1012 
1013 	if (!in_dev)
1014 		return;
1015 
1016 	if (!IN_DEV_TX_REDIRECTS(in_dev))
1017 		goto out;
1018 
1019 	/* No redirected packets during ip_rt_redirect_silence;
1020 	 * reset the algorithm.
1021 	 */
1022 	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1023 		rt->u.dst.rate_tokens = 0;
1024 
1025 	/* Too many ignored redirects; do not send anything and
1026 	 * set u.dst.rate_last to the last seen redirected packet.
1027 	 */
1028 	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1029 		rt->u.dst.rate_last = jiffies;
1030 		goto out;
1031 	}
1032 
1033 	/* Check for load limit; set rate_last to the latest sent
1034 	 * redirect.
1035 	 */
1036 	if (time_after(jiffies,
1037 		       (rt->u.dst.rate_last +
1038 			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1039 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1040 		rt->u.dst.rate_last = jiffies;
1041 		++rt->u.dst.rate_tokens;
1042 #ifdef CONFIG_IP_ROUTE_VERBOSE
1043 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
1044 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1045 		    net_ratelimit())
1046 			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1047 				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1048 				NIPQUAD(rt->rt_src), rt->rt_iif,
1049 				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1050 #endif
1051 	}
1052 out:
1053         in_dev_put(in_dev);
1054 }
1055 
1056 static int ip_error(struct sk_buff *skb)
1057 {
1058 	struct rtable *rt = (struct rtable*)skb->dst;
1059 	unsigned long now;
1060 	int code;
1061 
1062 	switch (rt->u.dst.error) {
1063 		case EINVAL:
1064 		default:
1065 			goto out;
1066 		case EHOSTUNREACH:
1067 			code = ICMP_HOST_UNREACH;
1068 			break;
1069 		case ENETUNREACH:
1070 			code = ICMP_NET_UNREACH;
1071 			break;
1072 		case EACCES:
1073 			code = ICMP_PKT_FILTERED;
1074 			break;
1075 	}
1076 
1077 	now = jiffies;
1078 	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1079 	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1080 		rt->u.dst.rate_tokens = ip_rt_error_burst;
1081 	rt->u.dst.rate_last = now;
1082 	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1083 		rt->u.dst.rate_tokens -= ip_rt_error_cost;
1084 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1085 	}
1086 
1087 out:	kfree_skb(skb);
1088 	return 0;
1089 }
1090 
1091 /*
1092  *	The last two values are not from the RFC but
1093  *	are needed for AMPRnet AX.25 paths.
1094  */
1095 
1096 static unsigned short mtu_plateau[] =
1097 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1098 
1099 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1100 {
1101 	int i;
1102 
1103 	for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
1104 		if (old_mtu > mtu_plateau[i])
1105 			return mtu_plateau[i];
1106 	return 68;
1107 }
1108 
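/* Handle an incoming ICMP "fragmentation needed": for every cached route
 * matching the offending packet, lower the path MTU (guessed from the
 * plateau table when the ICMP carries no usable MTU, clamped to
 * ip_rt_min_pmtu and aged out after ip_rt_mtu_expires) and return the
 * estimate actually applied, or new_mtu if nothing matched.
 */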
1109 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1110 {
1111 	int i;
1112 	unsigned short old_mtu = ntohs(iph->tot_len);
1113 	struct rtable *rth;
1114 	u32  skeys[2] = { iph->saddr, 0, };
1115 	u32  daddr = iph->daddr;
1116 	u8   tos = iph->tos & IPTOS_RT_MASK;
1117 	unsigned short est_mtu = 0;
1118 
1119 	if (ipv4_config.no_pmtu_disc)
1120 		return 0;
1121 
1122 	for (i = 0; i < 2; i++) {
1123 		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1124 
1125 		read_lock(&rt_hash_table[hash].lock);
1126 		for (rth = rt_hash_table[hash].chain; rth;
1127 		     rth = rth->u.rt_next) {
1128 			if (rth->key.dst == daddr &&
1129 			    rth->key.src == skeys[i] &&
1130 			    rth->rt_dst  == daddr &&
1131 			    rth->rt_src  == iph->saddr &&
1132 			    rth->key.tos == tos &&
1133 			    rth->key.iif == 0 &&
1134 			    !(rth->u.dst.mxlock & (1 << RTAX_MTU))) {
1135 				unsigned short mtu = new_mtu;
1136 
1137 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1138 
1139 					/* BSD 4.2 compatibility hack :-( */
1140 					if (mtu == 0 &&
1141 					    old_mtu >= rth->u.dst.pmtu &&
1142 					    old_mtu >= 68 + (iph->ihl << 2))
1143 						old_mtu -= iph->ihl << 2;
1144 
1145 					mtu = guess_mtu(old_mtu);
1146 				}
1147 				if (mtu <= rth->u.dst.pmtu) {
1148 					if (mtu < rth->u.dst.pmtu) {
1149 						dst_confirm(&rth->u.dst);
1150 						if (mtu < ip_rt_min_pmtu) {
1151 							mtu = ip_rt_min_pmtu;
1152 							rth->u.dst.mxlock |=
1153 								(1 << RTAX_MTU);
1154 						}
1155 						rth->u.dst.pmtu = mtu;
1156 						dst_set_expires(&rth->u.dst,
1157 							ip_rt_mtu_expires);
1158 					}
1159 					est_mtu = mtu;
1160 				}
1161 			}
1162 		}
1163 		read_unlock(&rt_hash_table[hash].lock);
1164 	}
1165 	return est_mtu ? : new_mtu;
1166 }
1167 
1168 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
1169 {
1170 	if (dst->pmtu > mtu && mtu >= 68 &&
1171 	    !(dst->mxlock & (1 << RTAX_MTU))) {
1172 		if (mtu < ip_rt_min_pmtu) {
1173 			mtu = ip_rt_min_pmtu;
1174 			dst->mxlock |= (1 << RTAX_MTU);
1175 		}
1176 		dst->pmtu = mtu;
1177 		dst_set_expires(dst, ip_rt_mtu_expires);
1178 	}
1179 }
1180 
1181 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1182 {
1183 	dst_release(dst);
1184 	return NULL;
1185 }
1186 
1187 static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
1188 					  struct sk_buff *skb)
1189 {
1190 	return NULL;
1191 }
1192 
1193 static void ipv4_dst_destroy(struct dst_entry *dst)
1194 {
1195 	struct rtable *rt = (struct rtable *) dst;
1196 	struct inet_peer *peer = rt->peer;
1197 
1198 	if (peer) {
1199 		rt->peer = NULL;
1200 		inet_putpeer(peer);
1201 	}
1202 }
1203 
1204 static void ipv4_link_failure(struct sk_buff *skb)
1205 {
1206 	struct rtable *rt;
1207 
1208 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1209 
1210 	rt = (struct rtable *) skb->dst;
1211 	if (rt)
1212 		dst_set_expires(&rt->u.dst, 0);
1213 }
1214 
1215 static int ip_rt_bug(struct sk_buff *skb)
1216 {
1217 	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1218 		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1219 		skb->dev ? skb->dev->name : "?");
1220 	kfree_skb(skb);
1221 	return 0;
1222 }
1223 
1224 /*
1225    We do not cache the source address of the outgoing interface,
1226    because it is used only by the IP RR, TS and SRR options,
1227    so it is out of the fast path.
1228 
1229    BTW remember: "addr" is allowed to be unaligned
1230    in IP options!
1231  */
1232 
1233 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1234 {
1235 	u32 src;
1236 	struct fib_result res;
1237 
1238 	if (rt->key.iif == 0)
1239 		src = rt->rt_src;
1240 	else if (fib_lookup(&rt->key, &res) == 0) {
1241 #ifdef CONFIG_IP_ROUTE_NAT
1242 		if (res.type == RTN_NAT)
1243 			src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1244 						RT_SCOPE_UNIVERSE);
1245 		else
1246 #endif
1247 			src = FIB_RES_PREFSRC(res);
1248 		fib_res_put(&res);
1249 	} else
1250 		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1251 					RT_SCOPE_UNIVERSE);
1252 	memcpy(addr, &src, 4);
1253 }
1254 
1255 #ifdef CONFIG_NET_CLS_ROUTE
1256 static void set_class_tag(struct rtable *rt, u32 tag)
1257 {
1258 	if (!(rt->u.dst.tclassid & 0xFFFF))
1259 		rt->u.dst.tclassid |= tag & 0xFFFF;
1260 	if (!(rt->u.dst.tclassid & 0xFFFF0000))
1261 		rt->u.dst.tclassid |= tag & 0xFFFF0000;
1262 }
1263 #endif
1264 
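/* Fill the cached route from the FIB result: the gateway for link-scope
 * next hops, the metrics array (with pmtu/advmss defaults derived from the
 * device MTU) and the routing class tag, then clamp pmtu and advmss to
 * sane bounds and copy the route type.
 */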
1265 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1266 {
1267 	struct fib_info *fi = res->fi;
1268 
1269 	if (fi) {
1270 		if (FIB_RES_GW(*res) &&
1271 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1272 			rt->rt_gateway = FIB_RES_GW(*res);
1273 		memcpy(&rt->u.dst.mxlock, fi->fib_metrics,
1274 			sizeof(fi->fib_metrics));
1275 		if (fi->fib_mtu == 0) {
1276 			rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1277 			if (rt->u.dst.mxlock & (1 << RTAX_MTU) &&
1278 			    rt->rt_gateway != rt->rt_dst &&
1279 			    rt->u.dst.pmtu > 576)
1280 				rt->u.dst.pmtu = 576;
1281 		}
1282 #ifdef CONFIG_NET_CLS_ROUTE
1283 		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1284 #endif
1285 	} else
1286 		rt->u.dst.pmtu	= rt->u.dst.dev->mtu;
1287 
1288 	if (rt->u.dst.pmtu > IP_MAX_MTU)
1289 		rt->u.dst.pmtu = IP_MAX_MTU;
1290 	if (rt->u.dst.advmss == 0)
1291 		rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1292 				       ip_rt_min_advmss);
1293 	if (rt->u.dst.advmss > 65535 - 40)
1294 		rt->u.dst.advmss = 65535 - 40;
1295 
1296 #ifdef CONFIG_NET_CLS_ROUTE
1297 #ifdef CONFIG_IP_MULTIPLE_TABLES
1298 	set_class_tag(rt, fib_rules_tclass(res));
1299 #endif
1300 	set_class_tag(rt, itag);
1301 #endif
1302         rt->rt_type = res->type;
1303 }
1304 
1305 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1306 				u8 tos, struct net_device *dev, int our)
1307 {
1308 	unsigned hash;
1309 	struct rtable *rth;
1310 	u32 spec_dst;
1311 	struct in_device *in_dev = in_dev_get(dev);
1312 	u32 itag = 0;
1313 
1314 	/* Primary sanity checks. */
1315 
1316 	if (in_dev == NULL)
1317 		return -EINVAL;
1318 
1319 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1320 	    skb->protocol != htons(ETH_P_IP))
1321 		goto e_inval;
1322 
1323 	if (ZERONET(saddr)) {
1324 		if (!LOCAL_MCAST(daddr))
1325 			goto e_inval;
1326 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1327 	} else if (fib_validate_source(saddr, 0, tos, 0,
1328 					dev, &spec_dst, &itag) < 0)
1329 		goto e_inval;
1330 
1331 	rth = dst_alloc(&ipv4_dst_ops);
1332 	if (!rth)
1333 		goto e_nobufs;
1334 
1335 	rth->u.dst.output= ip_rt_bug;
1336 
1337 	atomic_set(&rth->u.dst.__refcnt, 1);
1338 	rth->u.dst.flags= DST_HOST;
1339 	rth->key.dst	= daddr;
1340 	rth->rt_dst	= daddr;
1341 	rth->key.tos	= tos;
1342 #ifdef CONFIG_IP_ROUTE_FWMARK
1343 	rth->key.fwmark	= skb->nfmark;
1344 #endif
1345 	rth->key.src	= saddr;
1346 	rth->rt_src	= saddr;
1347 #ifdef CONFIG_IP_ROUTE_NAT
1348 	rth->rt_dst_map	= daddr;
1349 	rth->rt_src_map	= saddr;
1350 #endif
1351 #ifdef CONFIG_NET_CLS_ROUTE
1352 	rth->u.dst.tclassid = itag;
1353 #endif
1354 	rth->rt_iif	=
1355 	rth->key.iif	= dev->ifindex;
1356 	rth->u.dst.dev	= &loopback_dev;
1357 	dev_hold(rth->u.dst.dev);
1358 	rth->key.oif	= 0;
1359 	rth->rt_gateway	= daddr;
1360 	rth->rt_spec_dst= spec_dst;
1361 	rth->rt_type	= RTN_MULTICAST;
1362 	rth->rt_flags	= RTCF_MULTICAST;
1363 	if (our) {
1364 		rth->u.dst.input= ip_local_deliver;
1365 		rth->rt_flags |= RTCF_LOCAL;
1366 	}
1367 
1368 #ifdef CONFIG_IP_MROUTE
1369 	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1370 		rth->u.dst.input = ip_mr_input;
1371 #endif
1372 	rt_cache_stat[smp_processor_id()].in_slow_mc++;
1373 
1374 	in_dev_put(in_dev);
1375 	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1376 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1377 
1378 e_nobufs:
1379 	in_dev_put(in_dev);
1380 	return -ENOBUFS;
1381 
1382 e_inval:
1383 	in_dev_put(in_dev);
1384 	return -EINVAL;
1385 }
1386 
1387 /*
1388  *	NOTE. We drop all packets that have local source
1389  *	addresses, because every properly looped-back packet
1390  *	must already have the correct destination attached by the output routine.
1391  *
1392  *	This approach solves two big problems:
1393  *	1. Non-simplex devices are handled properly.
1394  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1395  */
1396 
1397 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1398 			u8 tos, struct net_device *dev)
1399 {
1400 	struct rt_key	key;
1401 	struct fib_result res;
1402 	struct in_device *in_dev = in_dev_get(dev);
1403 	struct in_device *out_dev = NULL;
1404 	unsigned	flags = 0;
1405 	u32		itag = 0;
1406 	struct rtable * rth;
1407 	unsigned	hash;
1408 	u32		spec_dst;
1409 	int		err = -EINVAL;
1410 	int		free_res = 0;
1411 
1412 	/* IP on this device is disabled. */
1413 
1414 	if (!in_dev)
1415 		goto out;
1416 
1417 	key.dst		= daddr;
1418 	key.src		= saddr;
1419 	key.tos		= tos;
1420 #ifdef CONFIG_IP_ROUTE_FWMARK
1421 	key.fwmark	= skb->nfmark;
1422 #endif
1423 	key.iif		= dev->ifindex;
1424 	key.oif		= 0;
1425 	key.scope	= RT_SCOPE_UNIVERSE;
1426 
1427 	hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos);
1428 
1429 	/* Check for the weirdest martians, which cannot be detected
1430 	   by fib_lookup.
1431 	 */
1432 
1433 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1434 		goto martian_source;
1435 
1436 	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1437 		goto brd_input;
1438 
1439 	/* Accept zero addresses only for limited broadcast;
1440 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1441 	 */
1442 	if (ZERONET(saddr))
1443 		goto martian_source;
1444 
1445 	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1446 		goto martian_destination;
1447 
1448 	/*
1449 	 *	Now we are ready to route packet.
1450 	 */
1451 	if ((err = fib_lookup(&key, &res)) != 0) {
1452 		if (!IN_DEV_FORWARD(in_dev))
1453 			goto e_inval;
1454 		goto no_route;
1455 	}
1456 	free_res = 1;
1457 
1458 	rt_cache_stat[smp_processor_id()].in_slow_tot++;
1459 
1460 #ifdef CONFIG_IP_ROUTE_NAT
1461 	/* Policy is applied before mapping the destination,
1462 	   but rerouting after the mapping should be done with the old source.
1463 	 */
1464 
1465 	if (1) {
1466 		u32 src_map = saddr;
1467 		if (res.r)
1468 			src_map = fib_rules_policy(saddr, &res, &flags);
1469 
1470 		if (res.type == RTN_NAT) {
1471 			key.dst = fib_rules_map_destination(daddr, &res);
1472 			fib_res_put(&res);
1473 			free_res = 0;
1474 			if (fib_lookup(&key, &res))
1475 				goto e_inval;
1476 			free_res = 1;
1477 			if (res.type != RTN_UNICAST)
1478 				goto e_inval;
1479 			flags |= RTCF_DNAT;
1480 		}
1481 		key.src = src_map;
1482 	}
1483 #endif
1484 
1485 	if (res.type == RTN_BROADCAST)
1486 		goto brd_input;
1487 
1488 	if (res.type == RTN_LOCAL) {
1489 		int result;
1490 		result = fib_validate_source(saddr, daddr, tos,
1491 					     loopback_dev.ifindex,
1492 					     dev, &spec_dst, &itag);
1493 		if (result < 0)
1494 			goto martian_source;
1495 		if (result)
1496 			flags |= RTCF_DIRECTSRC;
1497 		spec_dst = daddr;
1498 		goto local_input;
1499 	}
1500 
1501 	if (!IN_DEV_FORWARD(in_dev))
1502 		goto e_inval;
1503 	if (res.type != RTN_UNICAST)
1504 		goto martian_destination;
1505 
1506 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1507 	if (res.fi->fib_nhs > 1 && key.oif == 0)
1508 		fib_select_multipath(&key, &res);
1509 #endif
1510 	out_dev = in_dev_get(FIB_RES_DEV(res));
1511 	if (out_dev == NULL) {
1512 		if (net_ratelimit())
1513 			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1514 					 "Please, report\n");
1515 		goto e_inval;
1516 	}
1517 
1518 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1519 				  &spec_dst, &itag);
1520 	if (err < 0)
1521 		goto martian_source;
1522 
1523 	if (err)
1524 		flags |= RTCF_DIRECTSRC;
1525 
1526 	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1527 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1528 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1529 		flags |= RTCF_DOREDIRECT;
1530 
1531 	if (skb->protocol != htons(ETH_P_IP)) {
1532 		/* Not IP (i.e. ARP). Do not create route, if it is
1533 		 * invalid for proxy arp. DNAT routes are always valid.
1534 		 */
1535 		if (out_dev == in_dev && !(flags & RTCF_DNAT))
1536 			goto e_inval;
1537 	}
1538 
1539 	rth = dst_alloc(&ipv4_dst_ops);
1540 	if (!rth)
1541 		goto e_nobufs;
1542 
1543 	atomic_set(&rth->u.dst.__refcnt, 1);
1544 	rth->u.dst.flags= DST_HOST;
1545 	rth->key.dst	= daddr;
1546 	rth->rt_dst	= daddr;
1547 	rth->key.tos	= tos;
1548 #ifdef CONFIG_IP_ROUTE_FWMARK
1549 	rth->key.fwmark	= skb->nfmark;
1550 #endif
1551 	rth->key.src	= saddr;
1552 	rth->rt_src	= saddr;
1553 	rth->rt_gateway	= daddr;
1554 #ifdef CONFIG_IP_ROUTE_NAT
1555 	rth->rt_src_map	= key.src;
1556 	rth->rt_dst_map	= key.dst;
1557 	if (flags&RTCF_DNAT)
1558 		rth->rt_gateway	= key.dst;
1559 #endif
1560 	rth->rt_iif 	=
1561 	rth->key.iif	= dev->ifindex;
1562 	rth->u.dst.dev	= out_dev->dev;
1563 	dev_hold(rth->u.dst.dev);
1564 	rth->key.oif 	= 0;
1565 	rth->rt_spec_dst= spec_dst;
1566 
1567 	rth->u.dst.input = ip_forward;
1568 	rth->u.dst.output = ip_output;
1569 
1570 	rt_set_nexthop(rth, &res, itag);
1571 
1572 	rth->rt_flags = flags;
1573 
1574 #ifdef CONFIG_NET_FASTROUTE
1575 	if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1576 		struct net_device *odev = rth->u.dst.dev;
1577 		if (odev != dev &&
1578 		    dev->accept_fastpath &&
1579 		    odev->mtu >= dev->mtu &&
1580 		    dev->accept_fastpath(dev, &rth->u.dst) == 0)
1581 			rth->rt_flags |= RTCF_FAST;
1582 	}
1583 #endif
1584 
1585 intern:
1586 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1587 done:
1588 	in_dev_put(in_dev);
1589 	if (out_dev)
1590 		in_dev_put(out_dev);
1591 	if (free_res)
1592 		fib_res_put(&res);
1593 out:	return err;
1594 
1595 brd_input:
1596 	if (skb->protocol != htons(ETH_P_IP))
1597 		goto e_inval;
1598 
1599 	if (ZERONET(saddr))
1600 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1601 	else {
1602 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1603 					  &itag);
1604 		if (err < 0)
1605 			goto martian_source;
1606 		if (err)
1607 			flags |= RTCF_DIRECTSRC;
1608 	}
1609 	flags |= RTCF_BROADCAST;
1610 	res.type = RTN_BROADCAST;
1611 	rt_cache_stat[smp_processor_id()].in_brd++;
1612 
1613 local_input:
1614 	rth = dst_alloc(&ipv4_dst_ops);
1615 	if (!rth)
1616 		goto e_nobufs;
1617 
1618 	rth->u.dst.output= ip_rt_bug;
1619 
1620 	atomic_set(&rth->u.dst.__refcnt, 1);
1621 	rth->u.dst.flags= DST_HOST;
1622 	rth->key.dst	= daddr;
1623 	rth->rt_dst	= daddr;
1624 	rth->key.tos	= tos;
1625 #ifdef CONFIG_IP_ROUTE_FWMARK
1626 	rth->key.fwmark	= skb->nfmark;
1627 #endif
1628 	rth->key.src	= saddr;
1629 	rth->rt_src	= saddr;
1630 #ifdef CONFIG_IP_ROUTE_NAT
1631 	rth->rt_dst_map	= key.dst;
1632 	rth->rt_src_map	= key.src;
1633 #endif
1634 #ifdef CONFIG_NET_CLS_ROUTE
1635 	rth->u.dst.tclassid = itag;
1636 #endif
1637 	rth->rt_iif	=
1638 	rth->key.iif	= dev->ifindex;
1639 	rth->u.dst.dev	= &loopback_dev;
1640 	dev_hold(rth->u.dst.dev);
1641 	rth->key.oif 	= 0;
1642 	rth->rt_gateway	= daddr;
1643 	rth->rt_spec_dst= spec_dst;
1644 	rth->u.dst.input= ip_local_deliver;
1645 	rth->rt_flags 	= flags|RTCF_LOCAL;
1646 	if (res.type == RTN_UNREACHABLE) {
1647 		rth->u.dst.input= ip_error;
1648 		rth->u.dst.error= -err;
1649 		rth->rt_flags 	&= ~RTCF_LOCAL;
1650 	}
1651 	rth->rt_type	= res.type;
1652 	goto intern;
1653 
1654 no_route:
1655 	rt_cache_stat[smp_processor_id()].in_no_route++;
1656 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1657 	res.type = RTN_UNREACHABLE;
1658 	goto local_input;
1659 
1660 	/*
1661 	 *	Do not cache martian addresses: they should be logged (RFC1812)
1662 	 */
1663 martian_destination:
1664 	rt_cache_stat[smp_processor_id()].in_martian_dst++;
1665 #ifdef CONFIG_IP_ROUTE_VERBOSE
1666 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1667 		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1668 			"%u.%u.%u.%u, dev %s\n",
1669 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1670 #endif
1671 e_inval:
1672 	err = -EINVAL;
1673 	goto done;
1674 
1675 e_nobufs:
1676 	err = -ENOBUFS;
1677 	goto done;
1678 
1679 martian_source:
1680 
1681 	rt_cache_stat[smp_processor_id()].in_martian_src++;
1682 #ifdef CONFIG_IP_ROUTE_VERBOSE
1683 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1684 		/*
1685 		 *	RFC1812 recommendation: if the source is martian,
1686 		 *	the only hint is the MAC header.
1687 		 */
1688 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1689 			"%u.%u.%u.%u, on dev %s\n",
1690 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1691 		if (dev->hard_header_len) {
1692 			int i;
1693 			unsigned char *p = skb->mac.raw;
1694 			printk(KERN_WARNING "ll header: ");
1695 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1696 				printk("%02x", *p);
1697 				if (i < (dev->hard_header_len - 1))
1698 					printk(":");
1699 			}
1700 			printk("\n");
1701 		}
1702 	}
1703 #endif
1704 	goto e_inval;
1705 }
1706 
1707 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1708 		   u8 tos, struct net_device *dev)
1709 {
1710 	struct rtable * rth;
1711 	unsigned	hash;
1712 	int iif = dev->ifindex;
1713 
1714 	tos &= IPTOS_RT_MASK;
1715 	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1716 
1717 	read_lock(&rt_hash_table[hash].lock);
1718 	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1719 		if (rth->key.dst == daddr &&
1720 		    rth->key.src == saddr &&
1721 		    rth->key.iif == iif &&
1722 		    rth->key.oif == 0 &&
1723 #ifdef CONFIG_IP_ROUTE_FWMARK
1724 		    rth->key.fwmark == skb->nfmark &&
1725 #endif
1726 		    rth->key.tos == tos) {
1727 			rth->u.dst.lastuse = jiffies;
1728 			dst_hold(&rth->u.dst);
1729 			rth->u.dst.__use++;
1730 			rt_cache_stat[smp_processor_id()].in_hit++;
1731 			read_unlock(&rt_hash_table[hash].lock);
1732 			skb->dst = (struct dst_entry*)rth;
1733 			return 0;
1734 		}
1735 		rt_cache_stat[smp_processor_id()].in_hlist_search++;
1736 	}
1737 	read_unlock(&rt_hash_table[hash].lock);
1738 
1739 	/* Multicast recognition logic is moved from the route cache to here.
1740 	   The problem was that too many Ethernet cards have broken/missing
1741 	   hardware multicast filters :-( As a result, a host on a multicast
1742 	   network acquires a lot of useless route cache entries, e.g. for
1743 	   SDR messages from all over the world. Now we try to get rid of them.
1744 	   Really, provided the software IP multicast filter is organized
1745 	   reasonably (at least, hashed), it does not cause a slowdown
1746 	   compared with route cache reject entries.
1747 	   Note that multicast routers are not affected, because
1748 	   a route cache entry is created eventually.
1749 	 */
1750 	if (MULTICAST(daddr)) {
1751 		struct in_device *in_dev;
1752 
1753 		read_lock(&inetdev_lock);
1754 		if ((in_dev = __in_dev_get(dev)) != NULL) {
1755 			int our = ip_check_mc(in_dev, daddr, saddr);
1756 			if (our
1757 #ifdef CONFIG_IP_MROUTE
1758 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1759 #endif
1760 			    ) {
1761 				read_unlock(&inetdev_lock);
1762 				return ip_route_input_mc(skb, daddr, saddr,
1763 							 tos, dev, our);
1764 			}
1765 		}
1766 		read_unlock(&inetdev_lock);
1767 		return -EINVAL;
1768 	}
1769 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1770 }
1771 
1772 /*
1773  * Major route resolver routine.
1774  */
1775 
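/* ip_route_output_slow() validates an explicitly requested source address,
 * honours an explicit output interface, falls back to a loopback/local
 * route when the destination is empty, and otherwise consults the FIB
 * (treating the destination as on-link when the lookup fails but an oif
 * was given) before building and interning the cache entry.
 */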
1776 int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
1777 {
1778 	struct rt_key key;
1779 	struct fib_result res;
1780 	unsigned flags = 0;
1781 	struct rtable *rth;
1782 	struct net_device *dev_out = NULL;
1783 	unsigned hash;
1784 	int free_res = 0;
1785 	int err;
1786 	u32 tos;
1787 
1788 	tos		= oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK);
1789 	key.dst		= oldkey->dst;
1790 	key.src		= oldkey->src;
1791 	key.tos		= tos & IPTOS_RT_MASK;
1792 	key.iif		= loopback_dev.ifindex;
1793 	key.oif		= oldkey->oif;
1794 #ifdef CONFIG_IP_ROUTE_FWMARK
1795 	key.fwmark	= oldkey->fwmark;
1796 #endif
1797 	key.scope	= (tos & RTO_ONLINK) ? RT_SCOPE_LINK :
1798 						RT_SCOPE_UNIVERSE;
1799 	res.fi		= NULL;
1800 #ifdef CONFIG_IP_MULTIPLE_TABLES
1801 	res.r		= NULL;
1802 #endif
1803 
1804 	if (oldkey->src) {
1805 		err = -EINVAL;
1806 		if (MULTICAST(oldkey->src) ||
1807 		    BADCLASS(oldkey->src) ||
1808 		    ZERONET(oldkey->src))
1809 			goto out;
1810 
1811 		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1812 		dev_out = ip_dev_find(oldkey->src);
1813 		if (dev_out == NULL)
1814 			goto out;
1815 
1816 		/* I removed check for oif == dev_out->oif here.
1817 		   It was wrong for two reasons:
1818 		   1. ip_dev_find(saddr) can return wrong iface, if saddr is
1819 		      assigned to multiple interfaces.
1820 		   2. Moreover, we are allowed to send packets with saddr
1821 		      of another iface. --ANK
1822 		 */
1823 
1824 		if (oldkey->oif == 0
1825 		    && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
1826 			/* Special hack: user can direct multicasts
1827 			   and limited broadcast via necessary interface
1828 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1829 			   This hack is not just for fun, it allows
1830 			   vic,vat and friends to work.
1831 			   They bind socket to loopback, set ttl to zero
1832 			   and expect that it will work.
1833 			   From the viewpoint of routing cache they are broken,
1834 			   because we are not allowed to build multicast path
1835 			   with loopback source addr (look, routing cache
1836 			   cannot know, that ttl is zero, so that packet
1837 			   will not leave this host and route is valid).
1838 			   Luckily, this hack is a good workaround.
1839 			 */
1840 
1841 			key.oif = dev_out->ifindex;
1842 			goto make_route;
1843 		}
1844 		if (dev_out)
1845 			dev_put(dev_out);
1846 		dev_out = NULL;
1847 	}
1848 	if (oldkey->oif) {
1849 		dev_out = dev_get_by_index(oldkey->oif);
1850 		err = -ENODEV;
1851 		if (dev_out == NULL)
1852 			goto out;
1853 		if (__in_dev_get(dev_out) == NULL) {
1854 			dev_put(dev_out);
1855 			goto out;	/* Wrong error code */
1856 		}
1857 
1858 		if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
1859 			if (!key.src)
1860 				key.src = inet_select_addr(dev_out, 0,
1861 								RT_SCOPE_LINK);
1862 			goto make_route;
1863 		}
1864 		if (!key.src) {
1865 			if (MULTICAST(oldkey->dst))
1866 				key.src = inet_select_addr(dev_out, 0,
1867 								key.scope);
1868 			else if (!oldkey->dst)
1869 				key.src = inet_select_addr(dev_out, 0,
1870 								RT_SCOPE_HOST);
1871 		}
1872 	}
1873 
1874 	if (!key.dst) {
1875 		key.dst = key.src;
1876 		if (!key.dst)
1877 			key.dst = key.src = htonl(INADDR_LOOPBACK);
1878 		if (dev_out)
1879 			dev_put(dev_out);
1880 		dev_out = &loopback_dev;
1881 		dev_hold(dev_out);
1882 		key.oif = loopback_dev.ifindex;
1883 		res.type = RTN_LOCAL;
1884 		flags |= RTCF_LOCAL;
1885 		goto make_route;
1886 	}
1887 
1888 	if (fib_lookup(&key, &res)) {
1889 		res.fi = NULL;
1890 		if (oldkey->oif) {
1891 			/* Apparently, routing tables are wrong. Assume
1892 			   that the destination is on link.
1893 
1894 			   WHY? DW.
1895 			   Because we are allowed to send to an iface
1896 			   even if it has NO routes and NO assigned
1897 			   addresses. When oif is specified, routing
1898 			   tables are looked up with only one purpose:
1899 			   to catch whether the destination is gatewayed rather
1900 			   than direct. Moreover, if MSG_DONTROUTE is set,
1901 			   we send the packet, ignoring both routing tables
1902 			   and ifaddr state. --ANK
1903 
1904 
1905 			   We could do this even when oif is unknown
1906 			   (as IPv6 likely does), but we do not.
1907 			 */
1908 
1909 			if (key.src == 0)
1910 				key.src = inet_select_addr(dev_out, 0,
1911 							   RT_SCOPE_LINK);
1912 			res.type = RTN_UNICAST;
1913 			goto make_route;
1914 		}
1915 		if (dev_out)
1916 			dev_put(dev_out);
1917 		err = -ENETUNREACH;
1918 		goto out;
1919 	}
1920 	free_res = 1;
1921 
1922 	if (res.type == RTN_NAT)
1923 		goto e_inval;
1924 
1925 	if (res.type == RTN_LOCAL) {
1926 		if (!key.src)
1927 			key.src = key.dst;
1928 		if (dev_out)
1929 			dev_put(dev_out);
1930 		dev_out = &loopback_dev;
1931 		dev_hold(dev_out);
1932 		key.oif = dev_out->ifindex;
1933 		if (res.fi)
1934 			fib_info_put(res.fi);
1935 		res.fi = NULL;
1936 		flags |= RTCF_LOCAL;
1937 		goto make_route;
1938 	}
1939 
1940 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1941 	if (res.fi->fib_nhs > 1 && key.oif == 0)
1942 		fib_select_multipath(&key, &res);
1943 	else
1944 #endif
1945 	if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif)
1946 		fib_select_default(&key, &res);
1947 
1948 	if (!key.src)
1949 		key.src = FIB_RES_PREFSRC(res);
1950 
1951 	if (dev_out)
1952 		dev_put(dev_out);
1953 	dev_out = FIB_RES_DEV(res);
1954 	dev_hold(dev_out);
1955 	key.oif = dev_out->ifindex;
1956 
1957 make_route:
1958 	if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1959 		goto e_inval;
1960 
1961 	if (key.dst == 0xFFFFFFFF)
1962 		res.type = RTN_BROADCAST;
1963 	else if (MULTICAST(key.dst))
1964 		res.type = RTN_MULTICAST;
1965 	else if (BADCLASS(key.dst) || ZERONET(key.dst))
1966 		goto e_inval;
1967 
1968 	if (dev_out->flags & IFF_LOOPBACK)
1969 		flags |= RTCF_LOCAL;
1970 
1971 	if (res.type == RTN_BROADCAST) {
1972 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1973 		if (res.fi) {
1974 			fib_info_put(res.fi);
1975 			res.fi = NULL;
1976 		}
1977 	} else if (res.type == RTN_MULTICAST) {
1978 		flags |= RTCF_MULTICAST|RTCF_LOCAL;
1979 		read_lock(&inetdev_lock);
1980 		if (!__in_dev_get(dev_out) ||
1981 		    !ip_check_mc(__in_dev_get(dev_out),oldkey->dst,oldkey->src))
1982 			flags &= ~RTCF_LOCAL;
1983 		read_unlock(&inetdev_lock);
1984 		/* If the multicast route does not exist, use the
1985 		   default one, but do not gateway in this case.
1986 		   Yes, it is a hack.
1987 		 */
1988 		if (res.fi && res.prefixlen < 4) {
1989 			fib_info_put(res.fi);
1990 			res.fi = NULL;
1991 		}
1992 	}
1993 
1994 	rth = dst_alloc(&ipv4_dst_ops);
1995 	if (!rth)
1996 		goto e_nobufs;
1997 
1998 	atomic_set(&rth->u.dst.__refcnt, 1);
1999 	rth->u.dst.flags= DST_HOST;
2000 	rth->key.dst	= oldkey->dst;
2001 	rth->key.tos	= tos;
2002 	rth->key.src	= oldkey->src;
2003 	rth->key.iif	= 0;
2004 	rth->key.oif	= oldkey->oif;
2005 #ifdef CONFIG_IP_ROUTE_FWMARK
2006 	rth->key.fwmark	= oldkey->fwmark;
2007 #endif
2008 	rth->rt_dst	= key.dst;
2009 	rth->rt_src	= key.src;
2010 #ifdef CONFIG_IP_ROUTE_NAT
2011 	rth->rt_dst_map	= key.dst;
2012 	rth->rt_src_map	= key.src;
2013 #endif
2014 	rth->rt_iif	= oldkey->oif ? : dev_out->ifindex;
2015 	rth->u.dst.dev	= dev_out;
2016 	dev_hold(dev_out);
2017 	rth->rt_gateway = key.dst;
2018 	rth->rt_spec_dst= key.src;
2019 
2020 	rth->u.dst.output=ip_output;
2021 
2022 	rt_cache_stat[smp_processor_id()].out_slow_tot++;
2023 
2024 	if (flags & RTCF_LOCAL) {
2025 		rth->u.dst.input = ip_local_deliver;
2026 		rth->rt_spec_dst = key.dst;
2027 	}
2028 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2029 		rth->rt_spec_dst = key.src;
2030 		if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2031 			rth->u.dst.output = ip_mc_output;
2032 			rt_cache_stat[smp_processor_id()].out_slow_mc++;
2033 		}
2034 #ifdef CONFIG_IP_MROUTE
2035 		if (res.type == RTN_MULTICAST) {
2036 			struct in_device *in_dev = in_dev_get(dev_out);
2037 			if (in_dev) {
2038 				if (IN_DEV_MFORWARD(in_dev) &&
2039 				    !LOCAL_MCAST(oldkey->dst)) {
2040 					rth->u.dst.input = ip_mr_input;
2041 					rth->u.dst.output = ip_mc_output;
2042 				}
2043 				in_dev_put(in_dev);
2044 			}
2045 		}
2046 #endif
2047 	}
2048 
2049 	rt_set_nexthop(rth, &res, 0);
2050 
2051 	rth->rt_flags = flags;
2052 
2053 	hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos);
2054 	err = rt_intern_hash(hash, rth, rp);
2055 done:
2056 	if (free_res)
2057 		fib_res_put(&res);
2058 	if (dev_out)
2059 		dev_put(dev_out);
2060 out:	return err;
2061 
2062 e_inval:
2063 	err = -EINVAL;
2064 	goto done;
2065 e_nobufs:
2066 	err = -ENOBUFS;
2067 	goto done;
2068 }
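
/*
 * Illustrative userspace sketch (assumed example addresses and port, not
 * kernel code, kept under #if 0) of the vic/vat pattern described in the
 * "Special hack" comment above: bind to the loopback address, set the
 * multicast TTL to zero, and still send to a multicast group, relying on
 * the routing key to pick the outgoing interface.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	unsigned char ttl = 0;
	struct sockaddr_in local, group;

	memset(&local, 0, sizeof(local));
	local.sin_family = AF_INET;
	local.sin_addr.s_addr = htonl(INADDR_LOOPBACK);	/* bind to 127.0.0.1 */
	bind(fd, (struct sockaddr *)&local, sizeof(local));

	/* ttl 0: the datagram never actually leaves this host */
	setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));

	memset(&group, 0, sizeof(group));
	group.sin_family = AF_INET;
	group.sin_addr.s_addr = inet_addr("224.2.127.254");	/* example group */
	group.sin_port = htons(9875);				/* example port */
	sendto(fd, "hi", 2, 0, (struct sockaddr *)&group, sizeof(group));

	close(fd);
	return 0;
}
#endif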
2069 
2070 int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
2071 {
2072 	unsigned hash;
2073 	struct rtable *rth;
2074 
2075 	hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);
2076 
2077 	read_lock_bh(&rt_hash_table[hash].lock);
2078 	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2079 		if (rth->key.dst == key->dst &&
2080 		    rth->key.src == key->src &&
2081 		    rth->key.iif == 0 &&
2082 		    rth->key.oif == key->oif &&
2083 #ifdef CONFIG_IP_ROUTE_FWMARK
2084 		    rth->key.fwmark == key->fwmark &&
2085 #endif
2086 		    !((rth->key.tos ^ key->tos) &
2087 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
2088 			rth->u.dst.lastuse = jiffies;
2089 			dst_hold(&rth->u.dst);
2090 			rth->u.dst.__use++;
2091 			rt_cache_stat[smp_processor_id()].out_hit++;
2092 			read_unlock_bh(&rt_hash_table[hash].lock);
2093 			*rp = rth;
2094 			return 0;
2095 		}
2096 		rt_cache_stat[smp_processor_id()].out_hlist_search++;
2097 	}
2098 	read_unlock_bh(&rt_hash_table[hash].lock);
2099 
2100 	return ip_route_output_slow(rp, key);
2101 }
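
/*
 * Illustrative sketch (kept under #if 0, not compiled): the usual calling
 * convention for the resolver above.  The ip_route_output() wrapper from
 * <net/route.h> fills a struct rt_key and calls ip_route_output_key(); the
 * caller gets one reference on the returned cache entry and must drop it
 * with ip_rt_put().  The helper name below is made up for illustration.
 */
#if 0
static int example_get_route(u32 daddr, u32 saddr, int oif)
{
	struct rtable *rt;
	int err = ip_route_output(&rt, daddr, saddr, 0, oif);

	if (err)
		return err;			/* e.g. -ENETUNREACH */

	/* ... use rt->rt_src as the preferred source address, attach
	 * &rt->u.dst to a socket or skb, read path metrics, ... */

	ip_rt_put(rt);				/* release the reference */
	return 0;
}
#endif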
2102 
2103 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2104 			int nowait)
2105 {
2106 	struct rtable *rt = (struct rtable*)skb->dst;
2107 	struct rtmsg *r;
2108 	struct nlmsghdr  *nlh;
2109 	unsigned char	 *b = skb->tail;
2110 	struct rta_cacheinfo ci;
2111 #ifdef CONFIG_IP_MROUTE
2112 	struct rtattr *eptr;
2113 #endif
2114 	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2115 	r = NLMSG_DATA(nlh);
2116 	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2117 	r->rtm_family	 = AF_INET;
2118 	r->rtm_dst_len	= 32;
2119 	r->rtm_src_len	= 0;
2120 	r->rtm_tos	= rt->key.tos;
2121 	r->rtm_table	= RT_TABLE_MAIN;
2122 	r->rtm_type	= rt->rt_type;
2123 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2124 	r->rtm_protocol = RTPROT_UNSPEC;
2125 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2126 	if (rt->rt_flags & RTCF_NOTIFY)
2127 		r->rtm_flags |= RTM_F_NOTIFY;
2128 	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2129 	if (rt->key.src) {
2130 		r->rtm_src_len = 32;
2131 		RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
2132 	}
2133 	if (rt->u.dst.dev)
2134 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2135 #ifdef CONFIG_NET_CLS_ROUTE
2136 	if (rt->u.dst.tclassid)
2137 		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2138 #endif
2139 	if (rt->key.iif)
2140 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2141 	else if (rt->rt_src != rt->key.src)
2142 		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2143 	if (rt->rt_dst != rt->rt_gateway)
2144 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2145 	if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
2146 		goto rtattr_failure;
2147 	ci.rta_lastuse	= jiffies - rt->u.dst.lastuse;
2148 	ci.rta_used	= rt->u.dst.__use;
2149 	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
2150 	if (rt->u.dst.expires)
2151 		ci.rta_expires = rt->u.dst.expires - jiffies;
2152 	else
2153 		ci.rta_expires = 0;
2154 	ci.rta_error	= rt->u.dst.error;
2155 	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
2156 	if (rt->peer) {
2157 		ci.rta_id = rt->peer->ip_id_count;
2158 		if (rt->peer->tcp_ts_stamp) {
2159 			ci.rta_ts = rt->peer->tcp_ts;
2160 			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2161 		}
2162 	}
2163 #ifdef CONFIG_IP_MROUTE
2164 	eptr = (struct rtattr*)skb->tail;
2165 #endif
2166 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2167 	if (rt->key.iif) {
2168 #ifdef CONFIG_IP_MROUTE
2169 		u32 dst = rt->rt_dst;
2170 
2171 		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2172 		    ipv4_devconf.mc_forwarding) {
2173 			int err = ipmr_get_route(skb, r, nowait);
2174 			if (err <= 0) {
2175 				if (!nowait) {
2176 					if (err == 0)
2177 						return 0;
2178 					goto nlmsg_failure;
2179 				} else {
2180 					if (err == -EMSGSIZE)
2181 						goto nlmsg_failure;
2182 					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2183 				}
2184 			}
2185 		} else
2186 #endif
2187 			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
2188 	}
2189 
2190 	nlh->nlmsg_len = skb->tail - b;
2191 	return skb->len;
2192 
2193 nlmsg_failure:
2194 rtattr_failure:
2195 	skb_trim(skb, b - skb->data);
2196 	return -1;
2197 }
2198 
2199 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2200 {
2201 	struct rtattr **rta = arg;
2202 	struct rtmsg *rtm = NLMSG_DATA(nlh);
2203 	struct rtable *rt = NULL;
2204 	u32 dst = 0;
2205 	u32 src = 0;
2206 	int iif = 0;
2207 	int err = -ENOBUFS;
2208 	struct sk_buff *skb;
2209 
2210 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2211 	if (!skb)
2212 		goto out;
2213 
2214 	/* Reserve room for dummy headers; this skb can pass
2215 	   through a good chunk of the routing engine.
2216 	 */
2217 	skb->mac.raw = skb->nh.raw = skb->data;
2218 
2219 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2220 	skb->nh.iph->protocol = IPPROTO_ICMP;
2221 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2222 
2223 	if (rta[RTA_SRC - 1])
2224 		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2225 	if (rta[RTA_DST - 1])
2226 		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2227 	if (rta[RTA_IIF - 1])
2228 		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2229 
2230 	if (iif) {
2231 		struct net_device *dev = __dev_get_by_index(iif);
2232 		err = -ENODEV;
2233 		if (!dev)
2234 			goto out_free;
2235 		skb->protocol	= htons(ETH_P_IP);
2236 		skb->dev	= dev;
2237 		local_bh_disable();
2238 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2239 		local_bh_enable();
2240 		rt = (struct rtable*)skb->dst;
2241 		if (!err && rt->u.dst.error)
2242 			err = -rt->u.dst.error;
2243 	} else {
2244 		int oif = 0;
2245 		if (rta[RTA_OIF - 1])
2246 			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2247 		err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
2248 	}
2249 	if (err)
2250 		goto out_free;
2251 
2252 	skb->dst = &rt->u.dst;
2253 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2254 		rt->rt_flags |= RTCF_NOTIFY;
2255 
2256 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2257 
2258 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2259 				RTM_NEWROUTE, 0);
2260 	if (!err)
2261 		goto out_free;
2262 	if (err < 0) {
2263 		err = -EMSGSIZE;
2264 		goto out_free;
2265 	}
2266 
2267 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2268 	if (err > 0)
2269 		err = 0;
2270 out:	return err;
2271 
2272 out_free:
2273 	kfree_skb(skb);
2274 	goto out;
2275 }
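
/*
 * Illustrative userspace sketch (error handling omitted, destination
 * address is an example, kept under #if 0) of the RTM_GETROUTE request
 * that the handler above serves: the kernel resolves the route and answers
 * with an RTM_NEWROUTE message built by rt_fill_info().
 */
#if 0
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	struct sockaddr_nl kernel;
	struct {
		struct nlmsghdr	nlh;
		struct rtmsg	rtm;
		char		attrbuf[64];
	} req;
	struct rtattr *rta;
	__u32 dst = inet_addr("192.0.2.1");		/* example destination */
	char reply[4096];

	memset(&kernel, 0, sizeof(kernel));
	kernel.nl_family = AF_NETLINK;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type  = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family  = AF_INET;
	req.rtm.rtm_dst_len = 32;

	/* append an RTA_DST attribute carrying the destination address */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len  = RTA_LENGTH(4);
	memcpy(RTA_DATA(rta), &dst, 4);
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

	sendto(fd, &req, req.nlh.nlmsg_len, 0,
	       (struct sockaddr *)&kernel, sizeof(kernel));
	recv(fd, reply, sizeof(reply), 0);	/* RTM_NEWROUTE or an error */

	close(fd);
	return 0;
}
#endif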
2276 
2277 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2278 {
2279 	struct rtable *rt;
2280 	int h, s_h;
2281 	int idx, s_idx;
2282 
2283 	s_h = cb->args[0];
2284 	s_idx = idx = cb->args[1];
2285 	for (h = 0; h <= rt_hash_mask; h++) {
2286 		if (h < s_h) continue;
2287 		if (h > s_h)
2288 			s_idx = 0;
2289 		read_lock_bh(&rt_hash_table[h].lock);
2290 		for (rt = rt_hash_table[h].chain, idx = 0; rt;
2291 		     rt = rt->u.rt_next, idx++) {
2292 			if (idx < s_idx)
2293 				continue;
2294 			skb->dst = dst_clone(&rt->u.dst);
2295 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2296 					 cb->nlh->nlmsg_seq,
2297 					 RTM_NEWROUTE, 1) <= 0) {
2298 				dst_release(xchg(&skb->dst, NULL));
2299 				read_unlock_bh(&rt_hash_table[h].lock);
2300 				goto done;
2301 			}
2302 			dst_release(xchg(&skb->dst, NULL));
2303 		}
2304 		read_unlock_bh(&rt_hash_table[h].lock);
2305 	}
2306 
2307 done:
2308 	cb->args[0] = h;
2309 	cb->args[1] = idx;
2310 	return skb->len;
2311 }
2312 
2313 void ip_rt_multicast_event(struct in_device *in_dev)
2314 {
2315 	rt_cache_flush(0);
2316 }
2317 
2318 #ifdef CONFIG_SYSCTL
2319 static int flush_delay;
2320 
2321 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2322 					struct file *filp, void *buffer,
2323 					size_t *lenp)
2324 {
2325 	if (write) {
2326 		proc_dointvec(ctl, write, filp, buffer, lenp);
2327 		rt_cache_flush(flush_delay);
2328 		return 0;
2329 	}
2330 
2331 	return -EINVAL;
2332 }
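
/*
 * Illustrative userspace sketch (not kernel code, kept under #if 0):
 * writing a delay value to /proc/sys/net/ipv4/route/flush lands in the
 * handler above, which parses the integer into flush_delay and then calls
 * rt_cache_flush() with it; reads are rejected with -EINVAL.  Equivalent
 * to "echo 0 > /proc/sys/net/ipv4/route/flush".
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd >= 0) {
		write(fd, "0\n", 2);	/* flush the routing cache immediately */
		close(fd);
	}
	return 0;
}
#endif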
2333 
2334 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
2335 						int nlen, void *oldval,
2336 						size_t *oldlenp, void *newval,
2337 						size_t newlen, void **context)
2338 {
2339 	int delay;
2340 	if (newlen != sizeof(int))
2341 		return -EINVAL;
2342 	if (get_user(delay, (int *)newval))
2343 		return -EFAULT;
2344 	rt_cache_flush(delay);
2345 	return 0;
2346 }
2347 
2348 ctl_table ipv4_route_table[] = {
2349         {
2350 		ctl_name:	NET_IPV4_ROUTE_FLUSH,
2351 		procname:	"flush",
2352 		data:		&flush_delay,
2353 		maxlen:		sizeof(int),
2354 		mode:		0644,
2355 		proc_handler:	&ipv4_sysctl_rtcache_flush,
2356 		strategy:	&ipv4_sysctl_rtcache_flush_strategy,
2357 	},
2358 	{
2359 		ctl_name:	NET_IPV4_ROUTE_MIN_DELAY,
2360 		procname:	"min_delay",
2361 		data:		&ip_rt_min_delay,
2362 		maxlen:		sizeof(int),
2363 		mode:		0644,
2364 		proc_handler:	&proc_dointvec_jiffies,
2365 		strategy:	&sysctl_jiffies,
2366 	},
2367 	{
2368 		ctl_name:	NET_IPV4_ROUTE_MAX_DELAY,
2369 		procname:	"max_delay",
2370 		data:		&ip_rt_max_delay,
2371 		maxlen:		sizeof(int),
2372 		mode:		0644,
2373 		proc_handler:	&proc_dointvec_jiffies,
2374 		strategy:	&sysctl_jiffies,
2375 	},
2376 	{
2377 		ctl_name:	NET_IPV4_ROUTE_GC_THRESH,
2378 		procname:	"gc_thresh",
2379 		data:		&ipv4_dst_ops.gc_thresh,
2380 		maxlen:		sizeof(int),
2381 		mode:		0644,
2382 		proc_handler:	&proc_dointvec,
2383 	},
2384 	{
2385 		ctl_name:	NET_IPV4_ROUTE_MAX_SIZE,
2386 		procname:	"max_size",
2387 		data:		&ip_rt_max_size,
2388 		maxlen:		sizeof(int),
2389 		mode:		0644,
2390 		proc_handler:	&proc_dointvec,
2391 	},
2392 	{
2393 		ctl_name:	NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2394 		procname:	"gc_min_interval",
2395 		data:		&ip_rt_gc_min_interval,
2396 		maxlen:		sizeof(int),
2397 		mode:		0644,
2398 		proc_handler:	&proc_dointvec_jiffies,
2399 		strategy:	&sysctl_jiffies,
2400 	},
2401 	{
2402 		ctl_name:	NET_IPV4_ROUTE_GC_TIMEOUT,
2403 		procname:	"gc_timeout",
2404 		data:		&ip_rt_gc_timeout,
2405 		maxlen:		sizeof(int),
2406 		mode:		0644,
2407 		proc_handler:	&proc_dointvec_jiffies,
2408 		strategy:	&sysctl_jiffies,
2409 	},
2410 	{
2411 		ctl_name:	NET_IPV4_ROUTE_GC_INTERVAL,
2412 		procname:	"gc_interval",
2413 		data:		&ip_rt_gc_interval,
2414 		maxlen:		sizeof(int),
2415 		mode:		0644,
2416 		proc_handler:	&proc_dointvec_jiffies,
2417 		strategy:	&sysctl_jiffies,
2418 	},
2419 	{
2420 		ctl_name:	NET_IPV4_ROUTE_REDIRECT_LOAD,
2421 		procname:	"redirect_load",
2422 		data:		&ip_rt_redirect_load,
2423 		maxlen:		sizeof(int),
2424 		mode:		0644,
2425 		proc_handler:	&proc_dointvec,
2426 	},
2427 	{
2428 		ctl_name:	NET_IPV4_ROUTE_REDIRECT_NUMBER,
2429 		procname:	"redirect_number",
2430 		data:		&ip_rt_redirect_number,
2431 		maxlen:		sizeof(int),
2432 		mode:		0644,
2433 		proc_handler:	&proc_dointvec,
2434 	},
2435 	{
2436 		ctl_name:	NET_IPV4_ROUTE_REDIRECT_SILENCE,
2437 		procname:	"redirect_silence",
2438 		data:		&ip_rt_redirect_silence,
2439 		maxlen:		sizeof(int),
2440 		mode:		0644,
2441 		proc_handler:	&proc_dointvec,
2442 	},
2443 	{
2444 		ctl_name:	NET_IPV4_ROUTE_ERROR_COST,
2445 		procname:	"error_cost",
2446 		data:		&ip_rt_error_cost,
2447 		maxlen:		sizeof(int),
2448 		mode:		0644,
2449 		proc_handler:	&proc_dointvec,
2450 	},
2451 	{
2452 		ctl_name:	NET_IPV4_ROUTE_ERROR_BURST,
2453 		procname:	"error_burst",
2454 		data:		&ip_rt_error_burst,
2455 		maxlen:		sizeof(int),
2456 		mode:		0644,
2457 		proc_handler:	&proc_dointvec,
2458 	},
2459 	{
2460 		ctl_name:	NET_IPV4_ROUTE_GC_ELASTICITY,
2461 		procname:	"gc_elasticity",
2462 		data:		&ip_rt_gc_elasticity,
2463 		maxlen:		sizeof(int),
2464 		mode:		0644,
2465 		proc_handler:	&proc_dointvec,
2466 	},
2467 	{
2468 		ctl_name:	NET_IPV4_ROUTE_MTU_EXPIRES,
2469 		procname:	"mtu_expires",
2470 		data:		&ip_rt_mtu_expires,
2471 		maxlen:		sizeof(int),
2472 		mode:		0644,
2473 		proc_handler:	&proc_dointvec_jiffies,
2474 		strategy:	&sysctl_jiffies,
2475 	},
2476 	{
2477 		ctl_name:	NET_IPV4_ROUTE_MIN_PMTU,
2478 		procname:	"min_pmtu",
2479 		data:		&ip_rt_min_pmtu,
2480 		maxlen:		sizeof(int),
2481 		mode:		0644,
2482 		proc_handler:	&proc_dointvec,
2483 	},
2484 	{
2485 		ctl_name:	NET_IPV4_ROUTE_MIN_ADVMSS,
2486 		procname:	"min_adv_mss",
2487 		data:		&ip_rt_min_advmss,
2488 		maxlen:		sizeof(int),
2489 		mode:		0644,
2490 		proc_handler:	&proc_dointvec,
2491 	},
2492 	{
2493 		ctl_name:	NET_IPV4_ROUTE_SECRET_INTERVAL,
2494 		procname:	"secret_interval",
2495 		data:		&ip_rt_secret_interval,
2496 		maxlen:		sizeof(int),
2497 		mode:		0644,
2498 		proc_handler:	&proc_dointvec_jiffies,
2499 		strategy:	&sysctl_jiffies,
2500 	},
2501 	 { 0 }
2502 };
2503 #endif
2504 
2505 #ifdef CONFIG_NET_CLS_ROUTE
2506 struct ip_rt_acct *ip_rt_acct;
2507 
2508 /* This code sucks.  But you should have seen it before! --RR */
2509 
2510 /* IP route accounting ptr for this logical cpu number. */
2511 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + cpu_logical_map(i) * 256)
2512 
2513 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2514 			   int length, int *eof, void *data)
2515 {
2516 	unsigned int i;
2517 
2518 	if ((offset & 3) || (length & 3))
2519 		return -EIO;
2520 
2521 	if (offset >= sizeof(struct ip_rt_acct) * 256) {
2522 		*eof = 1;
2523 		return 0;
2524 	}
2525 
2526 	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2527 		length = sizeof(struct ip_rt_acct) * 256 - offset;
2528 		*eof = 1;
2529 	}
2530 
2531 	offset /= sizeof(u32);
2532 
2533 	if (length > 0) {
2534 		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2535 		u32 *dst = (u32 *) buffer;
2536 
2537 		/* Copy first cpu. */
2538 		*start = buffer;
2539 		memcpy(dst, src, length);
2540 
2541 		/* Add the other cpus in, one int at a time */
2542 		for (i = 1; i < smp_num_cpus; i++) {
2543 			unsigned int j;
2544 
2545 			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2546 
2547 			for (j = 0; j < length/4; j++)
2548 				dst[j] += src[j];
2549 		}
2550 	}
2551 	return length;
2552 }
2553 #endif
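
/*
 * Illustrative userspace sketch (assumed four-counter layout mirroring
 * struct ip_rt_acct, minimal error handling, kept under #if 0) of consuming
 * the already CPU-summed dump that ip_rt_acct_read() above produces via
 * /proc/net/rt_acct: one record per routing realm.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

struct rt_acct_entry {			/* assumed mirror of struct ip_rt_acct */
	uint32_t o_bytes;
	uint32_t o_packets;
	uint32_t i_bytes;
	uint32_t i_packets;
};

int main(void)
{
	struct rt_acct_entry acct[256];
	FILE *f = fopen("/proc/net/rt_acct", "r");
	int i;

	if (!f || fread(acct, sizeof(acct), 1, f) != 1)
		return 1;
	fclose(f);

	for (i = 0; i < 256; i++)
		if (acct[i].i_packets || acct[i].o_packets)
			printf("realm %3d: in %u pkts / %u bytes, out %u pkts / %u bytes\n",
			       i, acct[i].i_packets, acct[i].i_bytes,
			       acct[i].o_packets, acct[i].o_bytes);
	return 0;
}
#endif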
2554 
2555 void __init ip_rt_init(void)
2556 {
2557 	int i, order, goal;
2558 
2559 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2560 			     (jiffies ^ (jiffies >> 7)));
2561 
2562 #ifdef CONFIG_NET_CLS_ROUTE
2563 	for (order = 0;
2564 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2565 		/* NOTHING */;
2566 	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2567 	if (!ip_rt_acct)
2568 		panic("IP: failed to allocate ip_rt_acct\n");
2569 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
2570 #endif
2571 
2572 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2573 						     sizeof(struct rtable),
2574 						     0, SLAB_HWCACHE_ALIGN,
2575 						     NULL, NULL);
2576 
2577 	if (!ipv4_dst_ops.kmem_cachep)
2578 		panic("IP: failed to allocate ip_dst_cache\n");
2579 
2580 	goal = num_physpages >> (26 - PAGE_SHIFT);
2581 
2582 	for (order = 0; (1UL << order) < goal; order++)
2583 		/* NOTHING */;
2584 
2585 	do {
2586 		rt_hash_mask = (1UL << order) * PAGE_SIZE /
2587 			sizeof(struct rt_hash_bucket);
2588 		while (rt_hash_mask & (rt_hash_mask - 1))
2589 			rt_hash_mask--;
2590 		rt_hash_table = (struct rt_hash_bucket *)
2591 			__get_free_pages(GFP_ATOMIC, order);
2592 	} while (rt_hash_table == NULL && --order > 0);
2593 
2594 	if (!rt_hash_table)
2595 		panic("Failed to allocate IP route cache hash table\n");
2596 
2597 	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2598 	       rt_hash_mask,
2599 	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2600 
2601 	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2602 		/* NOTHING */;
2603 
2604 	rt_hash_mask--;
2605 	for (i = 0; i <= rt_hash_mask; i++) {
2606 		rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
2607 		rt_hash_table[i].chain = NULL;
2608 	}
2609 
2610 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2611 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
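
	/* Worked example of the sizing above (assumed figures, for
	 * illustration only): with 4 KB pages and 128 MB of RAM,
	 * num_physpages = 32768, so goal = 32768 >> (26 - 12) = 2 pages
	 * and order = 1.  Assuming sizeof(struct rt_hash_bucket) == 8,
	 * that gives 2 * 4096 / 8 = 1024 buckets, i.e. rt_hash_log = 10
	 * and rt_hash_mask = 1023 after the decrement; gc_thresh becomes
	 * 1024 and ip_rt_max_size 16384.
	 */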
2612 
2613 	devinet_init();
2614 	ip_fib_init();
2615 
2616 	rt_flush_timer.function = rt_run_flush;
2617 	rt_periodic_timer.function = rt_check_expire;
2618 	rt_secret_timer.function = rt_secret_rebuild;
2619 
2620 	/* All the timers, started at system startup, tend
2621 	   to synchronize. Perturb them a bit.
2622 	 */
2623 	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2624 					ip_rt_gc_interval;
2625 	add_timer(&rt_periodic_timer);
2626 
2627 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2628 		ip_rt_secret_interval;
2629 	add_timer(&rt_secret_timer);
2630 
2631 	proc_net_create ("rt_cache", 0, rt_cache_get_info);
2632 	create_proc_info_entry ("rt_cache", 0, proc_net_stat,
2633 				rt_cache_stat_get_info);
2634 #ifdef CONFIG_NET_CLS_ROUTE
2635 	create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
2636 #endif
2637 }
2638