1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #endif
113 #include <net/secure_seq.h>
114 
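/*
 * RT_FL_TOS() keeps only the TOS bits that matter for routing
 * (IPTOS_RT_MASK) plus RTO_ONLINK, which callers stash in the low bit
 * of flowi4_tos to request an on-link (scope-link) lookup.
 */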
115 #define RT_FL_TOS(oldflp4) \
116 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117 
118 #define IP_MAX_MTU	0xFFF0
119 
120 #define RT_GC_TIMEOUT (300*HZ)
121 
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
125 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
126 static int ip_rt_redirect_number __read_mostly	= 9;
127 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
128 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost __read_mostly	= HZ;
130 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
131 static int ip_rt_gc_elasticity __read_mostly	= 8;
132 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
133 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
134 static int ip_rt_min_advmss __read_mostly	= 256;
135 static int rt_chain_length_max __read_mostly	= 20;
136 
137 static struct delayed_work expires_work;
138 static unsigned long expires_ljiffies;
139 
140 /*
141  *	Interface to generic destination cache.
142  */
143 
144 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
145 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
146 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
147 static void		 ipv4_dst_destroy(struct dst_entry *dst);
148 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149 static void		 ipv4_link_failure(struct sk_buff *skb);
150 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
151 static int rt_garbage_collect(struct dst_ops *ops);
152 
153 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
154 			    int how)
155 {
156 }
157 
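/*
 * Copy-on-write of dst metrics: a route starts out sharing the
 * read-only metrics of its fib_info; the first writer copies them into
 * the bound inet_peer and swings dst->_metrics over with cmpxchg(),
 * after which the fib_info reference can be dropped.
 */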
158 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159 {
160 	struct rtable *rt = (struct rtable *) dst;
161 	struct inet_peer *peer;
162 	u32 *p = NULL;
163 
164 	if (!rt->peer)
165 		rt_bind_peer(rt, rt->rt_dst, 1);
166 
167 	peer = rt->peer;
168 	if (peer) {
169 		u32 *old_p = __DST_METRICS_PTR(old);
170 		unsigned long prev, new;
171 
172 		p = peer->metrics;
173 		if (inet_metrics_new(peer))
174 			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
175 
176 		new = (unsigned long) p;
177 		prev = cmpxchg(&dst->_metrics, old, new);
178 
179 		if (prev != old) {
180 			p = __DST_METRICS_PTR(prev);
181 			if (prev & DST_METRICS_READ_ONLY)
182 				p = NULL;
183 		} else {
184 			if (rt->fi) {
185 				fib_info_put(rt->fi);
186 				rt->fi = NULL;
187 			}
188 		}
189 	}
190 	return p;
191 }
192 
193 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
194 
195 static struct dst_ops ipv4_dst_ops = {
196 	.family =		AF_INET,
197 	.protocol =		cpu_to_be16(ETH_P_IP),
198 	.gc =			rt_garbage_collect,
199 	.check =		ipv4_dst_check,
200 	.default_advmss =	ipv4_default_advmss,
201 	.mtu =			ipv4_mtu,
202 	.cow_metrics =		ipv4_cow_metrics,
203 	.destroy =		ipv4_dst_destroy,
204 	.ifdown =		ipv4_dst_ifdown,
205 	.negative_advice =	ipv4_negative_advice,
206 	.link_failure =		ipv4_link_failure,
207 	.update_pmtu =		ip_rt_update_pmtu,
208 	.local_out =		__ip_local_out,
209 	.neigh_lookup =		ipv4_neigh_lookup,
210 };
211 
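/*
 * ip_tos2prio[] maps the IPv4 TOS field to a packet scheduler band
 * (TC_PRIO_*).  It is indexed via rt_tos2priority(), i.e. with the
 * TOS bits masked by IPTOS_TOS_MASK and shifted right by one, hence
 * the 16 entries.
 */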
212 #define ECN_OR_COST(class)	TC_PRIO_##class
213 
214 const __u8 ip_tos2prio[16] = {
215 	TC_PRIO_BESTEFFORT,
216 	ECN_OR_COST(BESTEFFORT),
217 	TC_PRIO_BESTEFFORT,
218 	ECN_OR_COST(BESTEFFORT),
219 	TC_PRIO_BULK,
220 	ECN_OR_COST(BULK),
221 	TC_PRIO_BULK,
222 	ECN_OR_COST(BULK),
223 	TC_PRIO_INTERACTIVE,
224 	ECN_OR_COST(INTERACTIVE),
225 	TC_PRIO_INTERACTIVE,
226 	ECN_OR_COST(INTERACTIVE),
227 	TC_PRIO_INTERACTIVE_BULK,
228 	ECN_OR_COST(INTERACTIVE_BULK),
229 	TC_PRIO_INTERACTIVE_BULK,
230 	ECN_OR_COST(INTERACTIVE_BULK)
231 };
232 
233 
234 /*
235  * Route cache.
236  */
237 
238 /* The locking scheme is rather straightforward:
239  *
240  * 1) Read-Copy Update protects the buckets of the central route hash.
241  * 2) Only writers remove entries, and they hold the lock
242  *    as they look at rtable reference counts.
243  * 3) Only readers acquire references to rtable entries,
244  *    they do so with atomic increments and with the
245  *    lock held.
246  */
247 
248 struct rt_hash_bucket {
249 	struct rtable __rcu	*chain;
250 };
251 
252 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
253 	defined(CONFIG_PROVE_LOCKING)
254 /*
255  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
256  * The size of this table is a power of two and depends on the number of CPUs.
257  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
258  */
259 #ifdef CONFIG_LOCKDEP
260 # define RT_HASH_LOCK_SZ	256
261 #else
262 # if NR_CPUS >= 32
263 #  define RT_HASH_LOCK_SZ	4096
264 # elif NR_CPUS >= 16
265 #  define RT_HASH_LOCK_SZ	2048
266 # elif NR_CPUS >= 8
267 #  define RT_HASH_LOCK_SZ	1024
268 # elif NR_CPUS >= 4
269 #  define RT_HASH_LOCK_SZ	512
270 # else
271 #  define RT_HASH_LOCK_SZ	256
272 # endif
273 #endif
274 
275 static spinlock_t	*rt_hash_locks;
276 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
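/* Several hash buckets share one lock: the bucket index is folded into
 * the lock table with a mask, which is why RT_HASH_LOCK_SZ must be a
 * power of two.
 */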
277 
278 static __init void rt_hash_lock_init(void)
279 {
280 	int i;
281 
282 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
283 			GFP_KERNEL);
284 	if (!rt_hash_locks)
285 		panic("IP: failed to allocate rt_hash_locks\n");
286 
287 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
288 		spin_lock_init(&rt_hash_locks[i]);
289 }
290 #else
291 # define rt_hash_lock_addr(slot) NULL
292 
293 static inline void rt_hash_lock_init(void)
294 {
295 }
296 #endif
297 
298 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
299 static unsigned			rt_hash_mask __read_mostly;
300 static unsigned int		rt_hash_log  __read_mostly;
301 
302 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
303 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
304 
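/*
 * The per-namespace generation id is mixed into the hash, so bumping it
 * on a cache flush scatters new entries over different buckets; stale
 * entries are recognised by rt_is_expired() and reaped lazily.
 */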
305 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
306 				   int genid)
307 {
308 	return jhash_3words((__force u32)daddr, (__force u32)saddr,
309 			    idx, genid)
310 		& rt_hash_mask;
311 }
312 
313 static inline int rt_genid(struct net *net)
314 {
315 	return atomic_read(&net->ipv4.rt_genid);
316 }
317 
318 #ifdef CONFIG_PROC_FS
319 struct rt_cache_iter_state {
320 	struct seq_net_private p;
321 	int bucket;
322 	int genid;
323 };
324 
325 static struct rtable *rt_cache_get_first(struct seq_file *seq)
326 {
327 	struct rt_cache_iter_state *st = seq->private;
328 	struct rtable *r = NULL;
329 
330 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
331 		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
332 			continue;
333 		rcu_read_lock_bh();
334 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
335 		while (r) {
336 			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
337 			    r->rt_genid == st->genid)
338 				return r;
339 			r = rcu_dereference_bh(r->dst.rt_next);
340 		}
341 		rcu_read_unlock_bh();
342 	}
343 	return r;
344 }
345 
346 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
347 					  struct rtable *r)
348 {
349 	struct rt_cache_iter_state *st = seq->private;
350 
351 	r = rcu_dereference_bh(r->dst.rt_next);
352 	while (!r) {
353 		rcu_read_unlock_bh();
354 		do {
355 			if (--st->bucket < 0)
356 				return NULL;
357 		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
358 		rcu_read_lock_bh();
359 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
360 	}
361 	return r;
362 }
363 
364 static struct rtable *rt_cache_get_next(struct seq_file *seq,
365 					struct rtable *r)
366 {
367 	struct rt_cache_iter_state *st = seq->private;
368 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
369 		if (dev_net(r->dst.dev) != seq_file_net(seq))
370 			continue;
371 		if (r->rt_genid == st->genid)
372 			break;
373 	}
374 	return r;
375 }
376 
377 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
378 {
379 	struct rtable *r = rt_cache_get_first(seq);
380 
381 	if (r)
382 		while (pos && (r = rt_cache_get_next(seq, r)))
383 			--pos;
384 	return pos ? NULL : r;
385 }
386 
387 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
388 {
389 	struct rt_cache_iter_state *st = seq->private;
390 	if (*pos)
391 		return rt_cache_get_idx(seq, *pos - 1);
392 	st->genid = rt_genid(seq_file_net(seq));
393 	return SEQ_START_TOKEN;
394 }
395 
396 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
397 {
398 	struct rtable *r;
399 
400 	if (v == SEQ_START_TOKEN)
401 		r = rt_cache_get_first(seq);
402 	else
403 		r = rt_cache_get_next(seq, v);
404 	++*pos;
405 	return r;
406 }
407 
408 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
409 {
410 	if (v && v != SEQ_START_TOKEN)
411 		rcu_read_unlock_bh();
412 }
413 
414 static int rt_cache_seq_show(struct seq_file *seq, void *v)
415 {
416 	if (v == SEQ_START_TOKEN)
417 		seq_printf(seq, "%-127s\n",
418 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
419 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
420 			   "HHUptod\tSpecDst");
421 	else {
422 		struct rtable *r = v;
423 		struct neighbour *n;
424 		int len, HHUptod;
425 
426 		rcu_read_lock();
427 		n = dst_get_neighbour_noref(&r->dst);
428 		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
429 		rcu_read_unlock();
430 
431 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
432 			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
433 			r->dst.dev ? r->dst.dev->name : "*",
434 			(__force u32)r->rt_dst,
435 			(__force u32)r->rt_gateway,
436 			r->rt_flags, atomic_read(&r->dst.__refcnt),
437 			r->dst.__use, 0, (__force u32)r->rt_src,
438 			dst_metric_advmss(&r->dst) + 40,
439 			dst_metric(&r->dst, RTAX_WINDOW),
440 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
441 			      dst_metric(&r->dst, RTAX_RTTVAR)),
442 			r->rt_key_tos,
443 			-1,
444 			HHUptod,
445 			r->rt_spec_dst, &len);
446 
447 		seq_printf(seq, "%*s\n", 127 - len, "");
448 	}
449 	return 0;
450 }
451 
452 static const struct seq_operations rt_cache_seq_ops = {
453 	.start  = rt_cache_seq_start,
454 	.next   = rt_cache_seq_next,
455 	.stop   = rt_cache_seq_stop,
456 	.show   = rt_cache_seq_show,
457 };
458 
459 static int rt_cache_seq_open(struct inode *inode, struct file *file)
460 {
461 	return seq_open_net(inode, file, &rt_cache_seq_ops,
462 			sizeof(struct rt_cache_iter_state));
463 }
464 
465 static const struct file_operations rt_cache_seq_fops = {
466 	.owner	 = THIS_MODULE,
467 	.open	 = rt_cache_seq_open,
468 	.read	 = seq_read,
469 	.llseek	 = seq_lseek,
470 	.release = seq_release_net,
471 };
472 
473 
474 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
475 {
476 	int cpu;
477 
478 	if (*pos == 0)
479 		return SEQ_START_TOKEN;
480 
481 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
482 		if (!cpu_possible(cpu))
483 			continue;
484 		*pos = cpu+1;
485 		return &per_cpu(rt_cache_stat, cpu);
486 	}
487 	return NULL;
488 }
489 
490 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
491 {
492 	int cpu;
493 
494 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
495 		if (!cpu_possible(cpu))
496 			continue;
497 		*pos = cpu+1;
498 		return &per_cpu(rt_cache_stat, cpu);
499 	}
500 	return NULL;
501 
502 }
503 
504 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
505 {
506 
507 }
508 
509 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
510 {
511 	struct rt_cache_stat *st = v;
512 
513 	if (v == SEQ_START_TOKEN) {
514 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
515 		return 0;
516 	}
517 
518 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
519 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
520 		   dst_entries_get_slow(&ipv4_dst_ops),
521 		   st->in_hit,
522 		   st->in_slow_tot,
523 		   st->in_slow_mc,
524 		   st->in_no_route,
525 		   st->in_brd,
526 		   st->in_martian_dst,
527 		   st->in_martian_src,
528 
529 		   st->out_hit,
530 		   st->out_slow_tot,
531 		   st->out_slow_mc,
532 
533 		   st->gc_total,
534 		   st->gc_ignored,
535 		   st->gc_goal_miss,
536 		   st->gc_dst_overflow,
537 		   st->in_hlist_search,
538 		   st->out_hlist_search
539 		);
540 	return 0;
541 }
542 
543 static const struct seq_operations rt_cpu_seq_ops = {
544 	.start  = rt_cpu_seq_start,
545 	.next   = rt_cpu_seq_next,
546 	.stop   = rt_cpu_seq_stop,
547 	.show   = rt_cpu_seq_show,
548 };
549 
550 
551 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
552 {
553 	return seq_open(file, &rt_cpu_seq_ops);
554 }
555 
556 static const struct file_operations rt_cpu_seq_fops = {
557 	.owner	 = THIS_MODULE,
558 	.open	 = rt_cpu_seq_open,
559 	.read	 = seq_read,
560 	.llseek	 = seq_lseek,
561 	.release = seq_release,
562 };
563 
564 #ifdef CONFIG_IP_ROUTE_CLASSID
565 static int rt_acct_proc_show(struct seq_file *m, void *v)
566 {
567 	struct ip_rt_acct *dst, *src;
568 	unsigned int i, j;
569 
570 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
571 	if (!dst)
572 		return -ENOMEM;
573 
574 	for_each_possible_cpu(i) {
575 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
576 		for (j = 0; j < 256; j++) {
577 			dst[j].o_bytes   += src[j].o_bytes;
578 			dst[j].o_packets += src[j].o_packets;
579 			dst[j].i_bytes   += src[j].i_bytes;
580 			dst[j].i_packets += src[j].i_packets;
581 		}
582 	}
583 
584 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
585 	kfree(dst);
586 	return 0;
587 }
588 
589 static int rt_acct_proc_open(struct inode *inode, struct file *file)
590 {
591 	return single_open(file, rt_acct_proc_show, NULL);
592 }
593 
594 static const struct file_operations rt_acct_proc_fops = {
595 	.owner		= THIS_MODULE,
596 	.open		= rt_acct_proc_open,
597 	.read		= seq_read,
598 	.llseek		= seq_lseek,
599 	.release	= single_release,
600 };
601 #endif
602 
603 static int __net_init ip_rt_do_proc_init(struct net *net)
604 {
605 	struct proc_dir_entry *pde;
606 
607 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
608 			&rt_cache_seq_fops);
609 	if (!pde)
610 		goto err1;
611 
612 	pde = proc_create("rt_cache", S_IRUGO,
613 			  net->proc_net_stat, &rt_cpu_seq_fops);
614 	if (!pde)
615 		goto err2;
616 
617 #ifdef CONFIG_IP_ROUTE_CLASSID
618 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
619 	if (!pde)
620 		goto err3;
621 #endif
622 	return 0;
623 
624 #ifdef CONFIG_IP_ROUTE_CLASSID
625 err3:
626 	remove_proc_entry("rt_cache", net->proc_net_stat);
627 #endif
628 err2:
629 	remove_proc_entry("rt_cache", net->proc_net);
630 err1:
631 	return -ENOMEM;
632 }
633 
634 static void __net_exit ip_rt_do_proc_exit(struct net *net)
635 {
636 	remove_proc_entry("rt_cache", net->proc_net_stat);
637 	remove_proc_entry("rt_cache", net->proc_net);
638 #ifdef CONFIG_IP_ROUTE_CLASSID
639 	remove_proc_entry("rt_acct", net->proc_net);
640 #endif
641 }
642 
643 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
644 	.init = ip_rt_do_proc_init,
645 	.exit = ip_rt_do_proc_exit,
646 };
647 
648 static int __init ip_rt_proc_init(void)
649 {
650 	return register_pernet_subsys(&ip_rt_proc_ops);
651 }
652 
653 #else
654 static inline int ip_rt_proc_init(void)
655 {
656 	return 0;
657 }
658 #endif /* CONFIG_PROC_FS */
659 
660 static inline void rt_free(struct rtable *rt)
661 {
662 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
663 }
664 
665 static inline void rt_drop(struct rtable *rt)
666 {
667 	ip_rt_put(rt);
668 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
669 }
670 
671 static inline int rt_fast_clean(struct rtable *rth)
672 {
673 	/* Kill broadcast/multicast entries very aggressively, if they
674 	   collide in hash table with more useful entries */
675 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
676 		rt_is_input_route(rth) && rth->dst.rt_next;
677 }
678 
679 static inline int rt_valuable(struct rtable *rth)
680 {
681 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
682 		(rth->peer && rth->peer->pmtu_expires);
683 }
684 
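/*
 * May an unreferenced cache entry be reclaimed?  Ordinary entries are
 * kept while younger than tmo1, "valuable" ones (see rt_valuable())
 * while younger than tmo2; broadcast/multicast entries colliding in the
 * hash (rt_fast_clean) get no tmo1 grace.
 */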
685 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
686 {
687 	unsigned long age;
688 	int ret = 0;
689 
690 	if (atomic_read(&rth->dst.__refcnt))
691 		goto out;
692 
693 	age = jiffies - rth->dst.lastuse;
694 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
695 	    (age <= tmo2 && rt_valuable(rth)))
696 		goto out;
697 	ret = 1;
698 out:	return ret;
699 }
700 
701 /* Bits of score are:
702  * 31: very valuable
703  * 30: not quite useless
704  * 29..0: usage counter
705  */
706 static inline u32 rt_score(struct rtable *rt)
707 {
708 	u32 score = jiffies - rt->dst.lastuse;
709 
710 	score = ~score & ~(3<<30);
711 
712 	if (rt_valuable(rt))
713 		score |= (1<<31);
714 
715 	if (rt_is_output_route(rt) ||
716 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
717 		score |= (1<<30);
718 
719 	return score;
720 }
721 
722 static inline bool rt_caching(const struct net *net)
723 {
724 	return net->ipv4.current_rt_cache_rebuild_count <=
725 		net->ipv4.sysctl_rt_cache_rebuild_count;
726 }
727 
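/* Branch-free key comparison: XOR each pair of fields and OR the
 * results together; the keys match iff the final value is zero.
 */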
728 static inline bool compare_hash_inputs(const struct rtable *rt1,
729 				       const struct rtable *rt2)
730 {
731 	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
732 		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
733 		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
734 }
735 
736 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
737 {
738 	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
739 		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
740 		(rt1->rt_mark ^ rt2->rt_mark) |
741 		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
742 		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
743 		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
744 }
745 
746 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
747 {
748 	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
749 }
750 
751 static inline int rt_is_expired(struct rtable *rth)
752 {
753 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
754 }
755 
756 /*
757  * Perform a full scan of hash table and free all entries.
758  * Can be called by a softirq or a process.
759  * In the latter case, we want to reschedule if necessary.
760  */
761 static void rt_do_flush(struct net *net, int process_context)
762 {
763 	unsigned int i;
764 	struct rtable *rth, *next;
765 
766 	for (i = 0; i <= rt_hash_mask; i++) {
767 		struct rtable __rcu **pprev;
768 		struct rtable *list;
769 
770 		if (process_context && need_resched())
771 			cond_resched();
772 		rth = rcu_access_pointer(rt_hash_table[i].chain);
773 		if (!rth)
774 			continue;
775 
776 		spin_lock_bh(rt_hash_lock_addr(i));
777 
778 		list = NULL;
779 		pprev = &rt_hash_table[i].chain;
780 		rth = rcu_dereference_protected(*pprev,
781 			lockdep_is_held(rt_hash_lock_addr(i)));
782 
783 		while (rth) {
784 			next = rcu_dereference_protected(rth->dst.rt_next,
785 				lockdep_is_held(rt_hash_lock_addr(i)));
786 
787 			if (!net ||
788 			    net_eq(dev_net(rth->dst.dev), net)) {
789 				rcu_assign_pointer(*pprev, next);
790 				rcu_assign_pointer(rth->dst.rt_next, list);
791 				list = rth;
792 			} else {
793 				pprev = &rth->dst.rt_next;
794 			}
795 			rth = next;
796 		}
797 
798 		spin_unlock_bh(rt_hash_lock_addr(i));
799 
800 		for (; list; list = next) {
801 			next = rcu_dereference_protected(list->dst.rt_next, 1);
802 			rt_free(list);
803 		}
804 	}
805 }
806 
807 /*
808  * While freeing expired entries, we compute average chain length
809  * and standard deviation, using fixed-point arithmetic.
810  * This is done to obtain an estimate for rt_chain_length_max:
811  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
812  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
813  */
814 
815 #define FRACT_BITS 3
816 #define ONE (1UL << FRACT_BITS)
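/*
 * Example: has_noalias() contributes ONE (= 8) per counted entry, so
 * chain lengths accumulate in units of 1/8 entry.  With an average
 * chain of two entries (avg = 16) and a standard deviation of one
 * entry (sd = 8), (avg + 4*sd) >> FRACT_BITS = 6, and
 * rt_chain_length_max stays at the ip_rt_gc_elasticity floor (8 by
 * default).
 */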
817 
818 /*
819  * Given a hash chain and an item in this hash chain,
820  * find whether a previous entry has the same hash_inputs
821  * (but differs on tos, mark or oif).
822  * Returns 0 if an alias is found.
823  * Returns ONE if rth has no alias before itself.
824  */
825 static int has_noalias(const struct rtable *head, const struct rtable *rth)
826 {
827 	const struct rtable *aux = head;
828 
829 	while (aux != rth) {
830 		if (compare_hash_inputs(aux, rth))
831 			return 0;
832 		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
833 	}
834 	return ONE;
835 }
836 
837 static void rt_check_expire(void)
838 {
839 	static unsigned int rover;
840 	unsigned int i = rover, goal;
841 	struct rtable *rth;
842 	struct rtable __rcu **rthp;
843 	unsigned long samples = 0;
844 	unsigned long sum = 0, sum2 = 0;
845 	unsigned long delta;
846 	u64 mult;
847 
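	/*
	 * Scale the work to the time since the last run: scanning
	 * delta * (hash table size) / ip_rt_gc_timeout buckets per call
	 * sweeps the whole table roughly once per ip_rt_gc_timeout.
	 */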
848 	delta = jiffies - expires_ljiffies;
849 	expires_ljiffies = jiffies;
850 	mult = ((u64)delta) << rt_hash_log;
851 	if (ip_rt_gc_timeout > 1)
852 		do_div(mult, ip_rt_gc_timeout);
853 	goal = (unsigned int)mult;
854 	if (goal > rt_hash_mask)
855 		goal = rt_hash_mask + 1;
856 	for (; goal > 0; goal--) {
857 		unsigned long tmo = ip_rt_gc_timeout;
858 		unsigned long length;
859 
860 		i = (i + 1) & rt_hash_mask;
861 		rthp = &rt_hash_table[i].chain;
862 
863 		if (need_resched())
864 			cond_resched();
865 
866 		samples++;
867 
868 		if (rcu_dereference_raw(*rthp) == NULL)
869 			continue;
870 		length = 0;
871 		spin_lock_bh(rt_hash_lock_addr(i));
872 		while ((rth = rcu_dereference_protected(*rthp,
873 					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
874 			prefetch(rth->dst.rt_next);
875 			if (rt_is_expired(rth)) {
876 				*rthp = rth->dst.rt_next;
877 				rt_free(rth);
878 				continue;
879 			}
880 			if (rth->dst.expires) {
881 				/* Entry is expired even if it is in use */
882 				if (time_before_eq(jiffies, rth->dst.expires)) {
883 nofree:
884 					tmo >>= 1;
885 					rthp = &rth->dst.rt_next;
886 					/*
887 					 * We only count entries on
888 					 * a chain with equal hash inputs once
889 					 * so that entries for different QOS
890 					 * levels, and other non-hash input
891 					 * attributes don't unfairly skew
892 					 * the length computation
893 					 */
894 					length += has_noalias(rt_hash_table[i].chain, rth);
895 					continue;
896 				}
897 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
898 				goto nofree;
899 
900 			/* Cleanup aged off entries. */
901 			*rthp = rth->dst.rt_next;
902 			rt_free(rth);
903 		}
904 		spin_unlock_bh(rt_hash_lock_addr(i));
905 		sum += length;
906 		sum2 += length*length;
907 	}
908 	if (samples) {
909 		unsigned long avg = sum / samples;
910 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
911 		rt_chain_length_max = max_t(unsigned long,
912 					ip_rt_gc_elasticity,
913 					(avg + 4*sd) >> FRACT_BITS);
914 	}
915 	rover = i;
916 }
917 
918 /*
919  * rt_worker_func() is run in process context.
920  * we call rt_check_expire() to scan part of the hash table
921  */
922 static void rt_worker_func(struct work_struct *work)
923 {
924 	rt_check_expire();
925 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
926 }
927 
928 /*
929  * Perturbation of rt_genid by a small quantity [1..256]
930  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
931  * many times (2^24) without reusing a recent rt_genid.
932  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
933  */
934 static void rt_cache_invalidate(struct net *net)
935 {
936 	unsigned char shuffle;
937 
938 	get_random_bytes(&shuffle, sizeof(shuffle));
939 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
940 	inetpeer_invalidate_tree(AF_INET);
941 }
942 
943 /*
944  * delay < 0  : invalidate cache (fast : entries will be deleted later)
945  * delay >= 0 : invalidate & flush cache (can be long)
946  */
947 void rt_cache_flush(struct net *net, int delay)
948 {
949 	rt_cache_invalidate(net);
950 	if (delay >= 0)
951 		rt_do_flush(net, !in_softirq());
952 }
953 
954 /* Flush previous cache invalidated entries from the cache */
955 void rt_cache_flush_batch(struct net *net)
956 {
957 	rt_do_flush(net, !in_softirq());
958 }
959 
960 static void rt_emergency_hash_rebuild(struct net *net)
961 {
962 	if (net_ratelimit())
963 		pr_warn("Route hash chain too long!\n");
964 	rt_cache_invalidate(net);
965 }
966 
967 /*
968    Short description of GC goals.
969 
970    We want an algorithm which keeps the routing cache at an
971    equilibrium point, where the number of aged-off entries is
972    approximately equal to the number of newly generated ones.
973 
974    The current expiration strength is the variable "expire".
975    We try to adjust it dynamically, so that when the network is
976    idle, expire stays large enough to keep plenty of warm entries,
977    and when load increases it shrinks to limit the cache size.
978  */
979 
980 static int rt_garbage_collect(struct dst_ops *ops)
981 {
982 	static unsigned long expire = RT_GC_TIMEOUT;
983 	static unsigned long last_gc;
984 	static int rover;
985 	static int equilibrium;
986 	struct rtable *rth;
987 	struct rtable __rcu **rthp;
988 	unsigned long now = jiffies;
989 	int goal;
990 	int entries = dst_entries_get_fast(&ipv4_dst_ops);
991 
992 	/*
993 	 * Garbage collection is pretty expensive,
994 	 * do not make it too frequently.
995 	 */
996 
997 	RT_CACHE_STAT_INC(gc_total);
998 
999 	if (now - last_gc < ip_rt_gc_min_interval &&
1000 	    entries < ip_rt_max_size) {
1001 		RT_CACHE_STAT_INC(gc_ignored);
1002 		goto out;
1003 	}
1004 
1005 	entries = dst_entries_get_slow(&ipv4_dst_ops);
1006 	/* Calculate number of entries, which we want to expire now. */
1007 	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008 	if (goal <= 0) {
1009 		if (equilibrium < ipv4_dst_ops.gc_thresh)
1010 			equilibrium = ipv4_dst_ops.gc_thresh;
1011 		goal = entries - equilibrium;
1012 		if (goal > 0) {
1013 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014 			goal = entries - equilibrium;
1015 		}
1016 	} else {
1017 		/* We are in a dangerous area. Try to reduce the cache really
1018 		 * aggressively.
1019 		 */
1020 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021 		equilibrium = entries - goal;
1022 	}
1023 
1024 	if (now - last_gc >= ip_rt_gc_min_interval)
1025 		last_gc = now;
1026 
1027 	if (goal <= 0) {
1028 		equilibrium += goal;
1029 		goto work_done;
1030 	}
1031 
1032 	do {
1033 		int i, k;
1034 
1035 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036 			unsigned long tmo = expire;
1037 
1038 			k = (k + 1) & rt_hash_mask;
1039 			rthp = &rt_hash_table[k].chain;
1040 			spin_lock_bh(rt_hash_lock_addr(k));
1041 			while ((rth = rcu_dereference_protected(*rthp,
1042 					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043 				if (!rt_is_expired(rth) &&
1044 					!rt_may_expire(rth, tmo, expire)) {
1045 					tmo >>= 1;
1046 					rthp = &rth->dst.rt_next;
1047 					continue;
1048 				}
1049 				*rthp = rth->dst.rt_next;
1050 				rt_free(rth);
1051 				goal--;
1052 			}
1053 			spin_unlock_bh(rt_hash_lock_addr(k));
1054 			if (goal <= 0)
1055 				break;
1056 		}
1057 		rover = k;
1058 
1059 		if (goal <= 0)
1060 			goto work_done;
1061 
1062 		/* Goal is not achieved. We stop the process if:
1063 
1064 		   - expire is reduced to zero; otherwise, expire is halved.
1065 		   - if table is not full.
1066 		   - if we are called from interrupt.
1067 		   - jiffies check is just fallback/debug loop breaker.
1068 		     We will not spin here for long time in any case.
1069 		 */
1070 
1071 		RT_CACHE_STAT_INC(gc_goal_miss);
1072 
1073 		if (expire == 0)
1074 			break;
1075 
1076 		expire >>= 1;
1077 
1078 		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079 			goto out;
1080 	} while (!in_softirq() && time_before_eq(jiffies, now));
1081 
1082 	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083 		goto out;
1084 	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085 		goto out;
1086 	if (net_ratelimit())
1087 		pr_warn("dst cache overflow\n");
1088 	RT_CACHE_STAT_INC(gc_dst_overflow);
1089 	return 1;
1090 
1091 work_done:
1092 	expire += ip_rt_gc_min_interval;
1093 	if (expire > ip_rt_gc_timeout ||
1094 	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1095 	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1096 		expire = ip_rt_gc_timeout;
1097 out:	return 0;
1098 }
1099 
1100 /*
1101  * Returns number of entries in a hash chain that have different hash_inputs
1102  */
1103 static int slow_chain_length(const struct rtable *head)
1104 {
1105 	int length = 0;
1106 	const struct rtable *rth = head;
1107 
1108 	while (rth) {
1109 		length += has_noalias(head, rth);
1110 		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1111 	}
1112 	return length >> FRACT_BITS;
1113 }
1114 
1115 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1116 {
1117 	static const __be32 inaddr_any = 0;
1118 	struct net_device *dev = dst->dev;
1119 	const __be32 *pkey = daddr;
1120 	const struct rtable *rt;
1121 	struct neighbour *n;
1122 
1123 	rt = (const struct rtable *) dst;
1124 
1125 	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1126 		pkey = &inaddr_any;
1127 	else if (rt->rt_gateway)
1128 		pkey = (const __be32 *) &rt->rt_gateway;
1129 
1130 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1131 	if (n)
1132 		return n;
1133 	return neigh_create(&arp_tbl, pkey, dev);
1134 }
1135 
1136 static int rt_bind_neighbour(struct rtable *rt)
1137 {
1138 	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1139 	if (IS_ERR(n))
1140 		return PTR_ERR(n);
1141 	dst_set_neighbour(&rt->dst, n);
1142 
1143 	return 0;
1144 }
1145 
1146 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1147 				     struct sk_buff *skb, int ifindex)
1148 {
1149 	struct rtable	*rth, *cand;
1150 	struct rtable __rcu **rthp, **candp;
1151 	unsigned long	now;
1152 	u32 		min_score;
1153 	int		chain_length;
1154 	int attempts = !in_softirq();
1155 
1156 restart:
1157 	chain_length = 0;
1158 	min_score = ~(u32)0;
1159 	cand = NULL;
1160 	candp = NULL;
1161 	now = jiffies;
1162 
1163 	if (!rt_caching(dev_net(rt->dst.dev))) {
1164 		/*
1165 		 * If we're not caching, just tell the caller we
1166 		 * were successful and don't touch the route.  The
1167 		 * caller holds the sole reference to the cache entry, and
1168 		 * it will be released when the caller is done with it.
1169 		 * If we drop it here, the callers have no way to resolve routes
1170 		 * when we're not caching.  Instead, just point *rp at rt, so
1171 		 * the caller gets a single use out of the route
1172 		 * Note that we do rt_free on this new route entry, so that
1173 		 * once its refcount hits zero, we are still able to reap it
1174 		 * (Thanks Alexey)
1175 		 * Note: To avoid expensive rcu stuff for this uncached dst,
1176 		 * we set DST_NOCACHE so that dst_release() can free dst without
1177 		 * waiting a grace period.
1178 		 */
1179 
1180 		rt->dst.flags |= DST_NOCACHE;
1181 		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1182 			int err = rt_bind_neighbour(rt);
1183 			if (err) {
1184 				if (net_ratelimit())
1185 					pr_warn("Neighbour table failure & not caching routes\n");
1186 				ip_rt_put(rt);
1187 				return ERR_PTR(err);
1188 			}
1189 		}
1190 
1191 		goto skip_hashing;
1192 	}
1193 
1194 	rthp = &rt_hash_table[hash].chain;
1195 
1196 	spin_lock_bh(rt_hash_lock_addr(hash));
1197 	while ((rth = rcu_dereference_protected(*rthp,
1198 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1199 		if (rt_is_expired(rth)) {
1200 			*rthp = rth->dst.rt_next;
1201 			rt_free(rth);
1202 			continue;
1203 		}
1204 		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1205 			/* Put it first */
1206 			*rthp = rth->dst.rt_next;
1207 			/*
1208 			 * Since lookup is lockfree, the deletion
1209 			 * must be visible to another weakly ordered CPU before
1210 			 * the insertion at the start of the hash chain.
1211 			 */
1212 			rcu_assign_pointer(rth->dst.rt_next,
1213 					   rt_hash_table[hash].chain);
1214 			/*
1215 			 * Since lookup is lockfree, the update writes
1216 			 * must be ordered for consistency on SMP.
1217 			 */
1218 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1219 
1220 			dst_use(&rth->dst, now);
1221 			spin_unlock_bh(rt_hash_lock_addr(hash));
1222 
1223 			rt_drop(rt);
1224 			if (skb)
1225 				skb_dst_set(skb, &rth->dst);
1226 			return rth;
1227 		}
1228 
1229 		if (!atomic_read(&rth->dst.__refcnt)) {
1230 			u32 score = rt_score(rth);
1231 
1232 			if (score <= min_score) {
1233 				cand = rth;
1234 				candp = rthp;
1235 				min_score = score;
1236 			}
1237 		}
1238 
1239 		chain_length++;
1240 
1241 		rthp = &rth->dst.rt_next;
1242 	}
1243 
1244 	if (cand) {
1245 		/* ip_rt_gc_elasticity used to be the average chain length;
1246 		 * when it is exceeded, gc becomes really aggressive.
1247 		 *
1248 		 * The second limit is less certain. At the moment it allows
1249 		 * only 2 entries per bucket. We will see.
1250 		 */
1251 		if (chain_length > ip_rt_gc_elasticity) {
1252 			*candp = cand->dst.rt_next;
1253 			rt_free(cand);
1254 		}
1255 	} else {
1256 		if (chain_length > rt_chain_length_max &&
1257 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1258 			struct net *net = dev_net(rt->dst.dev);
1259 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1260 			if (!rt_caching(net)) {
1261 				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1262 					rt->dst.dev->name, num);
1263 			}
1264 			rt_emergency_hash_rebuild(net);
1265 			spin_unlock_bh(rt_hash_lock_addr(hash));
1266 
1267 			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1268 					ifindex, rt_genid(net));
1269 			goto restart;
1270 		}
1271 	}
1272 
1273 	/* Try to bind route to arp only if it is output
1274 	   route or unicast forwarding path.
1275 	 */
1276 	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1277 		int err = rt_bind_neighbour(rt);
1278 		if (err) {
1279 			spin_unlock_bh(rt_hash_lock_addr(hash));
1280 
1281 			if (err != -ENOBUFS) {
1282 				rt_drop(rt);
1283 				return ERR_PTR(err);
1284 			}
1285 
1286 			/* Neighbour tables are full and nothing
1287 			   can be released. Try to shrink the route cache;
1288 			   it most likely holds some neighbour records.
1289 			 */
1290 			if (attempts-- > 0) {
1291 				int saved_elasticity = ip_rt_gc_elasticity;
1292 				int saved_int = ip_rt_gc_min_interval;
1293 				ip_rt_gc_elasticity	= 1;
1294 				ip_rt_gc_min_interval	= 0;
1295 				rt_garbage_collect(&ipv4_dst_ops);
1296 				ip_rt_gc_min_interval	= saved_int;
1297 				ip_rt_gc_elasticity	= saved_elasticity;
1298 				goto restart;
1299 			}
1300 
1301 			if (net_ratelimit())
1302 				pr_warn("Neighbour table overflow\n");
1303 			rt_drop(rt);
1304 			return ERR_PTR(-ENOBUFS);
1305 		}
1306 	}
1307 
1308 	rt->dst.rt_next = rt_hash_table[hash].chain;
1309 
1310 	/*
1311 	 * Since lookup is lockfree, we must make sure
1312 	 * previous writes to rt are committed to memory
1313 	 * before making rt visible to other CPUS.
1314 	 */
1315 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1316 
1317 	spin_unlock_bh(rt_hash_lock_addr(hash));
1318 
1319 skip_hashing:
1320 	if (skb)
1321 		skb_dst_set(skb, &rt->dst);
1322 	return rt;
1323 }
1324 
1325 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1326 
1327 static u32 rt_peer_genid(void)
1328 {
1329 	return atomic_read(&__rt_peer_genid);
1330 }
1331 
1332 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1333 {
1334 	struct inet_peer *peer;
1335 
1336 	peer = inet_getpeer_v4(daddr, create);
1337 
1338 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1339 		inet_putpeer(peer);
1340 	else
1341 		rt->rt_peer_genid = rt_peer_genid();
1342 }
1343 
1344 /*
1345  * Peer allocation may fail only in serious out-of-memory conditions.  However
1346  * we still can generate some output.
1347  * Random ID selection looks a bit dangerous because we have no chance of
1348  * selecting an ID that is unique within a reasonable period of time.
1349  * But a broken packet identifier may be better than no packet at all.
1350  */
1351 static void ip_select_fb_ident(struct iphdr *iph)
1352 {
1353 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1354 	static u32 ip_fallback_id;
1355 	u32 salt;
1356 
1357 	spin_lock_bh(&ip_fb_id_lock);
1358 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1359 	iph->id = htons(salt & 0xFFFF);
1360 	ip_fallback_id = salt;
1361 	spin_unlock_bh(&ip_fb_id_lock);
1362 }
1363 
1364 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1365 {
1366 	struct rtable *rt = (struct rtable *) dst;
1367 
1368 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
1369 		if (rt->peer == NULL)
1370 			rt_bind_peer(rt, rt->rt_dst, 1);
1371 
1372 		/* If peer is attached to destination, it is never detached,
1373 		   so we do not need to grab a lock to dereference it.
1374 		 */
1375 		if (rt->peer) {
1376 			iph->id = htons(inet_getid(rt->peer, more));
1377 			return;
1378 		}
1379 	} else if (!rt)
1380 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1381 		       __builtin_return_address(0));
1382 
1383 	ip_select_fb_ident(iph);
1384 }
1385 EXPORT_SYMBOL(__ip_select_ident);
1386 
1387 static void rt_del(unsigned hash, struct rtable *rt)
1388 {
1389 	struct rtable __rcu **rthp;
1390 	struct rtable *aux;
1391 
1392 	rthp = &rt_hash_table[hash].chain;
1393 	spin_lock_bh(rt_hash_lock_addr(hash));
1394 	ip_rt_put(rt);
1395 	while ((aux = rcu_dereference_protected(*rthp,
1396 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1397 		if (aux == rt || rt_is_expired(aux)) {
1398 			*rthp = aux->dst.rt_next;
1399 			rt_free(aux);
1400 			continue;
1401 		}
1402 		rthp = &aux->dst.rt_next;
1403 	}
1404 	spin_unlock_bh(rt_hash_lock_addr(hash));
1405 }
1406 
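/* Apply a gateway learned from an ICMP redirect: rebind the neighbour
 * entry for the new gateway, and fall back to the old gateway if the
 * neighbour cannot be created.
 */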
1407 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1408 {
1409 	struct rtable *rt = (struct rtable *) dst;
1410 	__be32 orig_gw = rt->rt_gateway;
1411 	struct neighbour *n, *old_n;
1412 
1413 	dst_confirm(&rt->dst);
1414 
1415 	rt->rt_gateway = peer->redirect_learned.a4;
1416 
1417 	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1418 	if (IS_ERR(n)) {
1419 		rt->rt_gateway = orig_gw;
1420 		return;
1421 	}
1422 	old_n = xchg(&rt->dst._neighbour, n);
1423 	if (old_n)
1424 		neigh_release(old_n);
1425 	if (!(n->nud_state & NUD_VALID)) {
1426 		neigh_event_send(n, NULL);
1427 	} else {
1428 		rt->rt_flags |= RTCF_REDIRECTED;
1429 		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1430 	}
1431 }
1432 
1433 /* called in rcu_read_lock() section */
1434 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1435 		    __be32 saddr, struct net_device *dev)
1436 {
1437 	int s, i;
1438 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1439 	__be32 skeys[2] = { saddr, 0 };
1440 	int    ikeys[2] = { dev->ifindex, 0 };
1441 	struct inet_peer *peer;
1442 	struct net *net;
1443 
1444 	if (!in_dev)
1445 		return;
1446 
1447 	net = dev_net(dev);
1448 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1449 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1450 	    ipv4_is_zeronet(new_gw))
1451 		goto reject_redirect;
1452 
1453 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1454 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1455 			goto reject_redirect;
1456 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1457 			goto reject_redirect;
1458 	} else {
1459 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1460 			goto reject_redirect;
1461 	}
1462 
1463 	for (s = 0; s < 2; s++) {
1464 		for (i = 0; i < 2; i++) {
1465 			unsigned int hash;
1466 			struct rtable __rcu **rthp;
1467 			struct rtable *rt;
1468 
1469 			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1470 
1471 			rthp = &rt_hash_table[hash].chain;
1472 
1473 			while ((rt = rcu_dereference(*rthp)) != NULL) {
1474 				rthp = &rt->dst.rt_next;
1475 
1476 				if (rt->rt_key_dst != daddr ||
1477 				    rt->rt_key_src != skeys[s] ||
1478 				    rt->rt_oif != ikeys[i] ||
1479 				    rt_is_input_route(rt) ||
1480 				    rt_is_expired(rt) ||
1481 				    !net_eq(dev_net(rt->dst.dev), net) ||
1482 				    rt->dst.error ||
1483 				    rt->dst.dev != dev ||
1484 				    rt->rt_gateway != old_gw)
1485 					continue;
1486 
1487 				if (!rt->peer)
1488 					rt_bind_peer(rt, rt->rt_dst, 1);
1489 
1490 				peer = rt->peer;
1491 				if (peer) {
1492 					if (peer->redirect_learned.a4 != new_gw) {
1493 						peer->redirect_learned.a4 = new_gw;
1494 						atomic_inc(&__rt_peer_genid);
1495 					}
1496 					check_peer_redir(&rt->dst, peer);
1497 				}
1498 			}
1499 		}
1500 	}
1501 	return;
1502 
1503 reject_redirect:
1504 #ifdef CONFIG_IP_ROUTE_VERBOSE
1505 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1506 		pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
1507 			"  Advised path = %pI4 -> %pI4\n",
1508 			&old_gw, dev->name, &new_gw,
1509 			&saddr, &daddr);
1510 #endif
1511 	;
1512 }
1513 
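/* Has the PMTU learned for this peer expired?  The cmpxchg() clears
 * pmtu_expires, so only one caller wins and restores the original MTU.
 */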
1514 static bool peer_pmtu_expired(struct inet_peer *peer)
1515 {
1516 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1517 
1518 	return orig &&
1519 	       time_after_eq(jiffies, orig) &&
1520 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1521 }
1522 
1523 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1524 {
1525 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1526 
1527 	return orig &&
1528 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1529 }
1530 
1531 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1532 {
1533 	struct rtable *rt = (struct rtable *)dst;
1534 	struct dst_entry *ret = dst;
1535 
1536 	if (rt) {
1537 		if (dst->obsolete > 0) {
1538 			ip_rt_put(rt);
1539 			ret = NULL;
1540 		} else if (rt->rt_flags & RTCF_REDIRECTED) {
1541 			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1542 						rt->rt_oif,
1543 						rt_genid(dev_net(dst->dev)));
1544 			rt_del(hash, rt);
1545 			ret = NULL;
1546 		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1547 			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1548 		}
1549 	}
1550 	return ret;
1551 }
1552 
1553 /*
1554  * Algorithm:
1555  *	1. The first ip_rt_redirect_number redirects are sent
1556  *	   with exponential backoff, then we stop sending them at all,
1557  *	   assuming that the host ignores our redirects.
1558  *	2. If we did not see packets requiring redirects
1559  *	   during ip_rt_redirect_silence, we assume that the host
1560  *	   forgot the redirected route and start sending redirects again.
1561  *
1562  * This algorithm is much cheaper and more intelligent than dumb load limiting
1563  * in icmp.c.
1564  *
1565  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1566  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1567  */
1568 
1569 void ip_rt_send_redirect(struct sk_buff *skb)
1570 {
1571 	struct rtable *rt = skb_rtable(skb);
1572 	struct in_device *in_dev;
1573 	struct inet_peer *peer;
1574 	int log_martians;
1575 
1576 	rcu_read_lock();
1577 	in_dev = __in_dev_get_rcu(rt->dst.dev);
1578 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1579 		rcu_read_unlock();
1580 		return;
1581 	}
1582 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1583 	rcu_read_unlock();
1584 
1585 	if (!rt->peer)
1586 		rt_bind_peer(rt, rt->rt_dst, 1);
1587 	peer = rt->peer;
1588 	if (!peer) {
1589 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1590 		return;
1591 	}
1592 
1593 	/* No redirected packets during ip_rt_redirect_silence;
1594 	 * reset the algorithm.
1595 	 */
1596 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1597 		peer->rate_tokens = 0;
1598 
1599 	/* Too many ignored redirects; do not send anything,
1600 	 * just update rate_last to the time of the last seen redirected packet.
1601 	 */
1602 	if (peer->rate_tokens >= ip_rt_redirect_number) {
1603 		peer->rate_last = jiffies;
1604 		return;
1605 	}
1606 
1607 	/* Check for load limit; set rate_last to the latest sent
1608 	 * redirect.
1609 	 */
1610 	if (peer->rate_tokens == 0 ||
1611 	    time_after(jiffies,
1612 		       (peer->rate_last +
1613 			(ip_rt_redirect_load << peer->rate_tokens)))) {
1614 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1615 		peer->rate_last = jiffies;
1616 		++peer->rate_tokens;
1617 #ifdef CONFIG_IP_ROUTE_VERBOSE
1618 		if (log_martians &&
1619 		    peer->rate_tokens == ip_rt_redirect_number &&
1620 		    net_ratelimit())
1621 			pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1622 				&ip_hdr(skb)->saddr, rt->rt_iif,
1623 				&rt->rt_dst, &rt->rt_gateway);
1624 #endif
1625 	}
1626 }
1627 
1628 static int ip_error(struct sk_buff *skb)
1629 {
1630 	struct rtable *rt = skb_rtable(skb);
1631 	struct inet_peer *peer;
1632 	unsigned long now;
1633 	bool send;
1634 	int code;
1635 
1636 	switch (rt->dst.error) {
1637 	case EINVAL:
1638 	default:
1639 		goto out;
1640 	case EHOSTUNREACH:
1641 		code = ICMP_HOST_UNREACH;
1642 		break;
1643 	case ENETUNREACH:
1644 		code = ICMP_NET_UNREACH;
1645 		IP_INC_STATS_BH(dev_net(rt->dst.dev),
1646 				IPSTATS_MIB_INNOROUTES);
1647 		break;
1648 	case EACCES:
1649 		code = ICMP_PKT_FILTERED;
1650 		break;
1651 	}
1652 
1653 	if (!rt->peer)
1654 		rt_bind_peer(rt, rt->rt_dst, 1);
1655 	peer = rt->peer;
1656 
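	/* Per-peer token bucket: roughly one token accrues per jiffy up to
	 * ip_rt_error_burst, and each ICMP error sent costs
	 * ip_rt_error_cost tokens.
	 */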
1657 	send = true;
1658 	if (peer) {
1659 		now = jiffies;
1660 		peer->rate_tokens += now - peer->rate_last;
1661 		if (peer->rate_tokens > ip_rt_error_burst)
1662 			peer->rate_tokens = ip_rt_error_burst;
1663 		peer->rate_last = now;
1664 		if (peer->rate_tokens >= ip_rt_error_cost)
1665 			peer->rate_tokens -= ip_rt_error_cost;
1666 		else
1667 			send = false;
1668 	}
1669 	if (send)
1670 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1671 
1672 out:	kfree_skb(skb);
1673 	return 0;
1674 }
1675 
1676 /*
1677  *	The last two values are not from the RFC but
1678  *	are needed for AMPRnet AX.25 paths.
1679  */
1680 
1681 static const unsigned short mtu_plateau[] =
1682 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1683 
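/* RFC 1191 style plateau search: return the largest plateau below the
 * old MTU, falling back to 68, the minimum IPv4 link MTU.
 */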
1684 static inline unsigned short guess_mtu(unsigned short old_mtu)
1685 {
1686 	int i;
1687 
1688 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1689 		if (old_mtu > mtu_plateau[i])
1690 			return mtu_plateau[i];
1691 	return 68;
1692 }
1693 
1694 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1695 				 unsigned short new_mtu,
1696 				 struct net_device *dev)
1697 {
1698 	unsigned short old_mtu = ntohs(iph->tot_len);
1699 	unsigned short est_mtu = 0;
1700 	struct inet_peer *peer;
1701 
1702 	peer = inet_getpeer_v4(iph->daddr, 1);
1703 	if (peer) {
1704 		unsigned short mtu = new_mtu;
1705 
1706 		if (new_mtu < 68 || new_mtu >= old_mtu) {
1707 			/* BSD 4.2 derived systems incorrectly adjust
1708 			 * tot_len by the IP header length, and report
1709 			 * a zero MTU in the ICMP message.
1710 			 */
1711 			if (mtu == 0 &&
1712 			    old_mtu >= 68 + (iph->ihl << 2))
1713 				old_mtu -= iph->ihl << 2;
1714 			mtu = guess_mtu(old_mtu);
1715 		}
1716 
1717 		if (mtu < ip_rt_min_pmtu)
1718 			mtu = ip_rt_min_pmtu;
1719 		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1720 			unsigned long pmtu_expires;
1721 
1722 			pmtu_expires = jiffies + ip_rt_mtu_expires;
1723 			if (!pmtu_expires)
1724 				pmtu_expires = 1UL;
1725 
1726 			est_mtu = mtu;
1727 			peer->pmtu_learned = mtu;
1728 			peer->pmtu_expires = pmtu_expires;
1729 			atomic_inc(&__rt_peer_genid);
1730 		}
1731 
1732 		inet_putpeer(peer);
1733 	}
1734 	return est_mtu ? : new_mtu;
1735 }
1736 
1737 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1738 {
1739 	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1740 
1741 	if (!expires)
1742 		return;
1743 	if (time_before(jiffies, expires)) {
1744 		u32 orig_dst_mtu = dst_mtu(dst);
1745 		if (peer->pmtu_learned < orig_dst_mtu) {
1746 			if (!peer->pmtu_orig)
1747 				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1748 			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1749 		}
1750 	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1751 		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1752 }
1753 
1754 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1755 {
1756 	struct rtable *rt = (struct rtable *) dst;
1757 	struct inet_peer *peer;
1758 
1759 	dst_confirm(dst);
1760 
1761 	if (!rt->peer)
1762 		rt_bind_peer(rt, rt->rt_dst, 1);
1763 	peer = rt->peer;
1764 	if (peer) {
1765 		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1766 
1767 		if (mtu < ip_rt_min_pmtu)
1768 			mtu = ip_rt_min_pmtu;
1769 		if (!pmtu_expires || mtu < peer->pmtu_learned) {
1770 
1771 			pmtu_expires = jiffies + ip_rt_mtu_expires;
1772 			if (!pmtu_expires)
1773 				pmtu_expires = 1UL;
1774 
1775 			peer->pmtu_learned = mtu;
1776 			peer->pmtu_expires = pmtu_expires;
1777 
1778 			atomic_inc(&__rt_peer_genid);
1779 			rt->rt_peer_genid = rt_peer_genid();
1780 		}
1781 		check_peer_pmtu(dst, peer);
1782 	}
1783 }
1784 
1785 
1786 static void ipv4_validate_peer(struct rtable *rt)
1787 {
1788 	if (rt->rt_peer_genid != rt_peer_genid()) {
1789 		struct inet_peer *peer;
1790 
1791 		if (!rt->peer)
1792 			rt_bind_peer(rt, rt->rt_dst, 0);
1793 
1794 		peer = rt->peer;
1795 		if (peer) {
1796 			check_peer_pmtu(&rt->dst, peer);
1797 
1798 			if (peer->redirect_learned.a4 &&
1799 			    peer->redirect_learned.a4 != rt->rt_gateway)
1800 				check_peer_redir(&rt->dst, peer);
1801 		}
1802 
1803 		rt->rt_peer_genid = rt_peer_genid();
1804 	}
1805 }
1806 
1807 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1808 {
1809 	struct rtable *rt = (struct rtable *) dst;
1810 
1811 	if (rt_is_expired(rt))
1812 		return NULL;
1813 	ipv4_validate_peer(rt);
1814 	return dst;
1815 }
1816 
1817 static void ipv4_dst_destroy(struct dst_entry *dst)
1818 {
1819 	struct rtable *rt = (struct rtable *) dst;
1820 	struct inet_peer *peer = rt->peer;
1821 
1822 	if (rt->fi) {
1823 		fib_info_put(rt->fi);
1824 		rt->fi = NULL;
1825 	}
1826 	if (peer) {
1827 		rt->peer = NULL;
1828 		inet_putpeer(peer);
1829 	}
1830 }
1831 
1832 
1833 static void ipv4_link_failure(struct sk_buff *skb)
1834 {
1835 	struct rtable *rt;
1836 
1837 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1838 
1839 	rt = skb_rtable(skb);
1840 	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1841 		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1842 }
1843 
1844 static int ip_rt_bug(struct sk_buff *skb)
1845 {
1846 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1847 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1848 		skb->dev ? skb->dev->name : "?");
1849 	kfree_skb(skb);
1850 	WARN_ON(1);
1851 	return 0;
1852 }
1853 
1854 /*
1855    We do not cache the source address of the outgoing interface,
1856    because it is used only by the IP RR, TS and SRR options,
1857    so it is out of the fast path.
1858 
1859    BTW remember: "addr" may be unaligned when it points into
1860    IP options!
1861  */
1862 
1863 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1864 {
1865 	__be32 src;
1866 
1867 	if (rt_is_output_route(rt))
1868 		src = ip_hdr(skb)->saddr;
1869 	else {
1870 		struct fib_result res;
1871 		struct flowi4 fl4;
1872 		struct iphdr *iph;
1873 
1874 		iph = ip_hdr(skb);
1875 
1876 		memset(&fl4, 0, sizeof(fl4));
1877 		fl4.daddr = iph->daddr;
1878 		fl4.saddr = iph->saddr;
1879 		fl4.flowi4_tos = RT_TOS(iph->tos);
1880 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1881 		fl4.flowi4_iif = skb->dev->ifindex;
1882 		fl4.flowi4_mark = skb->mark;
1883 
1884 		rcu_read_lock();
1885 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1886 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1887 		else
1888 			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1889 					RT_SCOPE_UNIVERSE);
1890 		rcu_read_unlock();
1891 	}
1892 	memcpy(addr, &src, 4);
1893 }
1894 
1895 #ifdef CONFIG_IP_ROUTE_CLASSID
1896 static void set_class_tag(struct rtable *rt, u32 tag)
1897 {
1898 	if (!(rt->dst.tclassid & 0xFFFF))
1899 		rt->dst.tclassid |= tag & 0xFFFF;
1900 	if (!(rt->dst.tclassid & 0xFFFF0000))
1901 		rt->dst.tclassid |= tag & 0xFFFF0000;
1902 }
1903 #endif
1904 
1905 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1906 {
1907 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1908 
1909 	if (advmss == 0) {
1910 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1911 			       ip_rt_min_advmss);
1912 		if (advmss > 65535 - 40)
1913 			advmss = 65535 - 40;
1914 	}
1915 	return advmss;
1916 }
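/*
 * Editorial note: with no explicit RTAX_ADVMSS metric, a standard
 * 1500-byte Ethernet device advertises an MSS of 1500 - 40 = 1460
 * (40 bytes for the minimal IPv4 + TCP headers), clamped to at least
 * ip_rt_min_advmss and at most 65535 - 40 = 65495.
 */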
1917 
1918 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1919 {
1920 	const struct rtable *rt = (const struct rtable *) dst;
1921 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1922 
1923 	if (mtu && rt_is_output_route(rt))
1924 		return mtu;
1925 
1926 	mtu = dst->dev->mtu;
1927 
1928 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1929 
1930 		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1931 			mtu = 576;
1932 	}
1933 
1934 	if (mtu > IP_MAX_MTU)
1935 		mtu = IP_MAX_MTU;
1936 
1937 	return mtu;
1938 }
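/*
 * Editorial note: for cached output routes an explicit RTAX_MTU metric
 * wins.  Otherwise the device MTU is used, except that a route whose
 * MTU metric is administratively locked and which goes via a gateway
 * (rt_gateway != rt_dst) is clamped down to the classical 576-byte
 * default; the result is always capped at IP_MAX_MTU.
 */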
1939 
1940 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1941 			    struct fib_info *fi)
1942 {
1943 	struct inet_peer *peer;
1944 	int create = 0;
1945 
1946 	/* If a peer entry exists for this destination, we must hook
1947 	 * it up in order to get at cached metrics.
1948 	 */
1949 	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1950 		create = 1;
1951 
1952 	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1953 	if (peer) {
1954 		rt->rt_peer_genid = rt_peer_genid();
1955 		if (inet_metrics_new(peer))
1956 			memcpy(peer->metrics, fi->fib_metrics,
1957 			       sizeof(u32) * RTAX_MAX);
1958 		dst_init_metrics(&rt->dst, peer->metrics, false);
1959 
1960 		check_peer_pmtu(&rt->dst, peer);
1961 
1962 		if (peer->redirect_learned.a4 &&
1963 		    peer->redirect_learned.a4 != rt->rt_gateway) {
1964 			rt->rt_gateway = peer->redirect_learned.a4;
1965 			rt->rt_flags |= RTCF_REDIRECTED;
1966 		}
1967 	} else {
1968 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1969 			rt->fi = fi;
1970 			atomic_inc(&fi->fib_clntref);
1971 		}
1972 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1973 	}
1974 }
1975 
1976 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1977 			   const struct fib_result *res,
1978 			   struct fib_info *fi, u16 type, u32 itag)
1979 {
1980 	struct dst_entry *dst = &rt->dst;
1981 
1982 	if (fi) {
1983 		if (FIB_RES_GW(*res) &&
1984 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1985 			rt->rt_gateway = FIB_RES_GW(*res);
1986 		rt_init_metrics(rt, fl4, fi);
1987 #ifdef CONFIG_IP_ROUTE_CLASSID
1988 		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1989 #endif
1990 	}
1991 
1992 	if (dst_mtu(dst) > IP_MAX_MTU)
1993 		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1994 	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1995 		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1996 
1997 #ifdef CONFIG_IP_ROUTE_CLASSID
1998 #ifdef CONFIG_IP_MULTIPLE_TABLES
1999 	set_class_tag(rt, fib_rules_tclass(res));
2000 #endif
2001 	set_class_tag(rt, itag);
2002 #endif
2003 }
2004 
2005 static struct rtable *rt_dst_alloc(struct net_device *dev,
2006 				   bool nopolicy, bool noxfrm)
2007 {
2008 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2009 			 DST_HOST |
2010 			 (nopolicy ? DST_NOPOLICY : 0) |
2011 			 (noxfrm ? DST_NOXFRM : 0));
2012 }
2013 
2014 /* called in rcu_read_lock() section */
2015 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2016 				u8 tos, struct net_device *dev, int our)
2017 {
2018 	unsigned int hash;
2019 	struct rtable *rth;
2020 	__be32 spec_dst;
2021 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2022 	u32 itag = 0;
2023 	int err;
2024 
2025 	/* Primary sanity checks. */
2026 
2027 	if (in_dev == NULL)
2028 		return -EINVAL;
2029 
2030 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2031 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2032 		goto e_inval;
2033 
2034 	if (ipv4_is_zeronet(saddr)) {
2035 		if (!ipv4_is_local_multicast(daddr))
2036 			goto e_inval;
2037 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2038 	} else {
2039 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2040 					  &itag);
2041 		if (err < 0)
2042 			goto e_err;
2043 	}
2044 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2045 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2046 	if (!rth)
2047 		goto e_nobufs;
2048 
2049 #ifdef CONFIG_IP_ROUTE_CLASSID
2050 	rth->dst.tclassid = itag;
2051 #endif
2052 	rth->dst.output = ip_rt_bug;
2053 
2054 	rth->rt_key_dst	= daddr;
2055 	rth->rt_key_src	= saddr;
2056 	rth->rt_genid	= rt_genid(dev_net(dev));
2057 	rth->rt_flags	= RTCF_MULTICAST;
2058 	rth->rt_type	= RTN_MULTICAST;
2059 	rth->rt_key_tos	= tos;
2060 	rth->rt_dst	= daddr;
2061 	rth->rt_src	= saddr;
2062 	rth->rt_route_iif = dev->ifindex;
2063 	rth->rt_iif	= dev->ifindex;
2064 	rth->rt_oif	= 0;
2065 	rth->rt_mark    = skb->mark;
2066 	rth->rt_gateway	= daddr;
2067 	rth->rt_spec_dst= spec_dst;
2068 	rth->rt_peer_genid = 0;
2069 	rth->peer = NULL;
2070 	rth->fi = NULL;
2071 	if (our) {
2072 		rth->dst.input= ip_local_deliver;
2073 		rth->rt_flags |= RTCF_LOCAL;
2074 	}
2075 
2076 #ifdef CONFIG_IP_MROUTE
2077 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2078 		rth->dst.input = ip_mr_input;
2079 #endif
2080 	RT_CACHE_STAT_INC(in_slow_mc);
2081 
2082 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2083 	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2084 	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2085 
2086 e_nobufs:
2087 	return -ENOBUFS;
2088 e_inval:
2089 	return -EINVAL;
2090 e_err:
2091 	return err;
2092 }
2093 
2094 
2095 static void ip_handle_martian_source(struct net_device *dev,
2096 				     struct in_device *in_dev,
2097 				     struct sk_buff *skb,
2098 				     __be32 daddr,
2099 				     __be32 saddr)
2100 {
2101 	RT_CACHE_STAT_INC(in_martian_src);
2102 #ifdef CONFIG_IP_ROUTE_VERBOSE
2103 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2104 		/*
2105 		 *	RFC1812 recommendation: if the source is martian,
2106 		 *	the only hint is the MAC header.
2107 		 */
2108 		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2109 			&daddr, &saddr, dev->name);
2110 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2111 			print_hex_dump(KERN_WARNING, "ll header: ",
2112 				       DUMP_PREFIX_OFFSET, 16, 1,
2113 				       skb_mac_header(skb),
2114 				       dev->hard_header_len, true);
2115 		}
2116 	}
2117 #endif
2118 }
2119 
2120 /* called in rcu_read_lock() section */
2121 static int __mkroute_input(struct sk_buff *skb,
2122 			   const struct fib_result *res,
2123 			   struct in_device *in_dev,
2124 			   __be32 daddr, __be32 saddr, u32 tos,
2125 			   struct rtable **result)
2126 {
2127 	struct rtable *rth;
2128 	int err;
2129 	struct in_device *out_dev;
2130 	unsigned int flags = 0;
2131 	__be32 spec_dst;
2132 	u32 itag = 0;
2133 
2134 	/* get a working reference to the output device */
2135 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2136 	if (out_dev == NULL) {
2137 		if (net_ratelimit())
2138 			pr_crit("Bug in ip_route_input_slow(). Please report.\n");
2139 		return -EINVAL;
2140 	}
2141 
2142 
2143 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2144 				  in_dev->dev, &spec_dst, &itag);
2145 	if (err < 0) {
2146 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2147 					 saddr);
2148 
2149 		goto cleanup;
2150 	}
2151 
2152 	if (err)
2153 		flags |= RTCF_DIRECTSRC;
2154 
2155 	if (out_dev == in_dev && err &&
2156 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2157 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2158 		flags |= RTCF_DOREDIRECT;
2159 
2160 	if (skb->protocol != htons(ETH_P_IP)) {
2161 		/* Not IP (i.e. ARP). Do not create a route if it is
2162 		 * invalid for proxy arp. DNAT routes are always valid.
2163 		 *
2164 		 * The proxy arp feature has been extended to allow ARP
2165 		 * replies back out the same interface, to support
2166 		 * Private VLAN switch technologies. See arp.c.
2167 		 */
2168 		if (out_dev == in_dev &&
2169 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2170 			err = -EINVAL;
2171 			goto cleanup;
2172 		}
2173 	}
2174 
2175 	rth = rt_dst_alloc(out_dev->dev,
2176 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2177 			   IN_DEV_CONF_GET(out_dev, NOXFRM));
2178 	if (!rth) {
2179 		err = -ENOBUFS;
2180 		goto cleanup;
2181 	}
2182 
2183 	rth->rt_key_dst	= daddr;
2184 	rth->rt_key_src	= saddr;
2185 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2186 	rth->rt_flags = flags;
2187 	rth->rt_type = res->type;
2188 	rth->rt_key_tos	= tos;
2189 	rth->rt_dst	= daddr;
2190 	rth->rt_src	= saddr;
2191 	rth->rt_route_iif = in_dev->dev->ifindex;
2192 	rth->rt_iif 	= in_dev->dev->ifindex;
2193 	rth->rt_oif 	= 0;
2194 	rth->rt_mark    = skb->mark;
2195 	rth->rt_gateway	= daddr;
2196 	rth->rt_spec_dst= spec_dst;
2197 	rth->rt_peer_genid = 0;
2198 	rth->peer = NULL;
2199 	rth->fi = NULL;
2200 
2201 	rth->dst.input = ip_forward;
2202 	rth->dst.output = ip_output;
2203 
2204 	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2205 
2206 	*result = rth;
2207 	err = 0;
2208  cleanup:
2209 	return err;
2210 }
2211 
2212 static int ip_mkroute_input(struct sk_buff *skb,
2213 			    struct fib_result *res,
2214 			    const struct flowi4 *fl4,
2215 			    struct in_device *in_dev,
2216 			    __be32 daddr, __be32 saddr, u32 tos)
2217 {
2218 	struct rtable* rth = NULL;
2219 	int err;
2220 	unsigned hash;
2221 
2222 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2223 	if (res->fi && res->fi->fib_nhs > 1)
2224 		fib_select_multipath(res);
2225 #endif
2226 
2227 	/* create a routing cache entry */
2228 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2229 	if (err)
2230 		return err;
2231 
2232 	/* put it into the cache */
2233 	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2234 		       rt_genid(dev_net(rth->dst.dev)));
2235 	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2236 	if (IS_ERR(rth))
2237 		return PTR_ERR(rth);
2238 	return 0;
2239 }
2240 
2241 /*
2242  *	NOTE. We drop all packets that have local source
2243  *	addresses, because every properly looped-back packet
2244  *	must already have the correct destination attached by the output routine.
2245  *
2246  *	This approach solves two big problems:
2247  *	1. Non-simplex devices are handled properly.
2248  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2249  *	Called with rcu_read_lock().
2250  */
2251 
2252 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2253 			       u8 tos, struct net_device *dev)
2254 {
2255 	struct fib_result res;
2256 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2257 	struct flowi4	fl4;
2258 	unsigned	flags = 0;
2259 	u32		itag = 0;
2260 	struct rtable * rth;
2261 	unsigned	hash;
2262 	__be32		spec_dst;
2263 	int		err = -EINVAL;
2264 	struct net    * net = dev_net(dev);
2265 
2266 	/* IP on this device is disabled. */
2267 
2268 	if (!in_dev)
2269 		goto out;
2270 
2271 	/* Check for the most weird martians, which cannot be detected
2272 	   by fib_lookup.
2273 	 */
2274 
2275 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2276 	    ipv4_is_loopback(saddr))
2277 		goto martian_source;
2278 
2279 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2280 		goto brd_input;
2281 
2282 	/* Accept zero addresses only for limited broadcast;
2283 	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2284 	 */
2285 	if (ipv4_is_zeronet(saddr))
2286 		goto martian_source;
2287 
2288 	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2289 		goto martian_destination;
2290 
2291 	/*
2292 	 *	Now we are ready to route packet.
2293 	 */
2294 	fl4.flowi4_oif = 0;
2295 	fl4.flowi4_iif = dev->ifindex;
2296 	fl4.flowi4_mark = skb->mark;
2297 	fl4.flowi4_tos = tos;
2298 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2299 	fl4.daddr = daddr;
2300 	fl4.saddr = saddr;
2301 	err = fib_lookup(net, &fl4, &res);
2302 	if (err != 0) {
2303 		if (!IN_DEV_FORWARD(in_dev))
2304 			goto e_hostunreach;
2305 		goto no_route;
2306 	}
2307 
2308 	RT_CACHE_STAT_INC(in_slow_tot);
2309 
2310 	if (res.type == RTN_BROADCAST)
2311 		goto brd_input;
2312 
2313 	if (res.type == RTN_LOCAL) {
2314 		err = fib_validate_source(skb, saddr, daddr, tos,
2315 					  net->loopback_dev->ifindex,
2316 					  dev, &spec_dst, &itag);
2317 		if (err < 0)
2318 			goto martian_source_keep_err;
2319 		if (err)
2320 			flags |= RTCF_DIRECTSRC;
2321 		spec_dst = daddr;
2322 		goto local_input;
2323 	}
2324 
2325 	if (!IN_DEV_FORWARD(in_dev))
2326 		goto e_hostunreach;
2327 	if (res.type != RTN_UNICAST)
2328 		goto martian_destination;
2329 
2330 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2331 out:	return err;
2332 
2333 brd_input:
2334 	if (skb->protocol != htons(ETH_P_IP))
2335 		goto e_inval;
2336 
2337 	if (ipv4_is_zeronet(saddr))
2338 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2339 	else {
2340 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2341 					  &itag);
2342 		if (err < 0)
2343 			goto martian_source_keep_err;
2344 		if (err)
2345 			flags |= RTCF_DIRECTSRC;
2346 	}
2347 	flags |= RTCF_BROADCAST;
2348 	res.type = RTN_BROADCAST;
2349 	RT_CACHE_STAT_INC(in_brd);
2350 
2351 local_input:
2352 	rth = rt_dst_alloc(net->loopback_dev,
2353 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2354 	if (!rth)
2355 		goto e_nobufs;
2356 
2357 	rth->dst.input= ip_local_deliver;
2358 	rth->dst.output= ip_rt_bug;
2359 #ifdef CONFIG_IP_ROUTE_CLASSID
2360 	rth->dst.tclassid = itag;
2361 #endif
2362 
2363 	rth->rt_key_dst	= daddr;
2364 	rth->rt_key_src	= saddr;
2365 	rth->rt_genid = rt_genid(net);
2366 	rth->rt_flags 	= flags|RTCF_LOCAL;
2367 	rth->rt_type	= res.type;
2368 	rth->rt_key_tos	= tos;
2369 	rth->rt_dst	= daddr;
2370 	rth->rt_src	= saddr;
2371 #ifdef CONFIG_IP_ROUTE_CLASSID
2372 	rth->dst.tclassid = itag;
2373 #endif
2374 	rth->rt_route_iif = dev->ifindex;
2375 	rth->rt_iif	= dev->ifindex;
2376 	rth->rt_oif	= 0;
2377 	rth->rt_mark    = skb->mark;
2378 	rth->rt_gateway	= daddr;
2379 	rth->rt_spec_dst= spec_dst;
2380 	rth->rt_peer_genid = 0;
2381 	rth->peer = NULL;
2382 	rth->fi = NULL;
2383 	if (res.type == RTN_UNREACHABLE) {
2384 		rth->dst.input= ip_error;
2385 		rth->dst.error= -err;
2386 		rth->rt_flags 	&= ~RTCF_LOCAL;
2387 	}
2388 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2389 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2390 	err = 0;
2391 	if (IS_ERR(rth))
2392 		err = PTR_ERR(rth);
2393 	goto out;
2394 
2395 no_route:
2396 	RT_CACHE_STAT_INC(in_no_route);
2397 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2398 	res.type = RTN_UNREACHABLE;
2399 	if (err == -ESRCH)
2400 		err = -ENETUNREACH;
2401 	goto local_input;
2402 
2403 	/*
2404 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2405 	 */
2406 martian_destination:
2407 	RT_CACHE_STAT_INC(in_martian_dst);
2408 #ifdef CONFIG_IP_ROUTE_VERBOSE
2409 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2410 		pr_warn("martian destination %pI4 from %pI4, dev %s\n",
2411 			&daddr, &saddr, dev->name);
2412 #endif
2413 
2414 e_hostunreach:
2415 	err = -EHOSTUNREACH;
2416 	goto out;
2417 
2418 e_inval:
2419 	err = -EINVAL;
2420 	goto out;
2421 
2422 e_nobufs:
2423 	err = -ENOBUFS;
2424 	goto out;
2425 
2426 martian_source:
2427 	err = -EINVAL;
2428 martian_source_keep_err:
2429 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2430 	goto out;
2431 }
2432 
2433 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2434 			   u8 tos, struct net_device *dev, bool noref)
2435 {
2436 	struct rtable * rth;
2437 	unsigned	hash;
2438 	int iif = dev->ifindex;
2439 	struct net *net;
2440 	int res;
2441 
2442 	net = dev_net(dev);
2443 
2444 	rcu_read_lock();
2445 
2446 	if (!rt_caching(net))
2447 		goto skip_cache;
2448 
2449 	tos &= IPTOS_RT_MASK;
2450 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2451 
2452 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2453 	     rth = rcu_dereference(rth->dst.rt_next)) {
2454 		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2455 		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2456 		     (rth->rt_route_iif ^ iif) |
2457 		     (rth->rt_key_tos ^ tos)) == 0 &&
2458 		    rth->rt_mark == skb->mark &&
2459 		    net_eq(dev_net(rth->dst.dev), net) &&
2460 		    !rt_is_expired(rth)) {
2461 			ipv4_validate_peer(rth);
2462 			if (noref) {
2463 				dst_use_noref(&rth->dst, jiffies);
2464 				skb_dst_set_noref(skb, &rth->dst);
2465 			} else {
2466 				dst_use(&rth->dst, jiffies);
2467 				skb_dst_set(skb, &rth->dst);
2468 			}
2469 			RT_CACHE_STAT_INC(in_hit);
2470 			rcu_read_unlock();
2471 			return 0;
2472 		}
2473 		RT_CACHE_STAT_INC(in_hlist_search);
2474 	}
2475 
2476 skip_cache:
2477 	/* Multicast recognition logic was moved from the route cache to here.
2478 	   The problem was that too many Ethernet cards have broken/missing
2479 	   hardware multicast filters :-( As a result, a host on a multicast
2480 	   network acquires a lot of useless route cache entries, e.g. from
2481 	   SDR messages from all over the world. Now we try to get rid of them.
2482 	   Really, provided the software IP multicast filter is organized
2483 	   reasonably (at least, hashed), this does not cause a slowdown
2484 	   compared with route cache reject entries.
2485 	   Note that multicast routers are not affected, because a
2486 	   route cache entry is created eventually.
2487 	 */
2488 	if (ipv4_is_multicast(daddr)) {
2489 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2490 
2491 		if (in_dev) {
2492 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2493 						  ip_hdr(skb)->protocol);
2494 			if (our
2495 #ifdef CONFIG_IP_MROUTE
2496 				||
2497 			    (!ipv4_is_local_multicast(daddr) &&
2498 			     IN_DEV_MFORWARD(in_dev))
2499 #endif
2500 			   ) {
2501 				int res = ip_route_input_mc(skb, daddr, saddr,
2502 							    tos, dev, our);
2503 				rcu_read_unlock();
2504 				return res;
2505 			}
2506 		}
2507 		rcu_read_unlock();
2508 		return -EINVAL;
2509 	}
2510 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2511 	rcu_read_unlock();
2512 	return res;
2513 }
2514 EXPORT_SYMBOL(ip_route_input_common);
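/*
 * Editorial sketch, not part of the original source: callers reach
 * ip_route_input_common() through the ip_route_input() /
 * ip_route_input_noref() inline wrappers in include/net/route.h
 * (noref = false / true respectively).  A typical receive-path user
 * routes the skb and then hands it to the attached dst:
 *
 *	err = ip_route_input_noref(skb, ip_hdr(skb)->daddr,
 *				   ip_hdr(skb)->saddr, ip_hdr(skb)->tos,
 *				   skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	return dst_input(skb);
 *
 * dst_input() then invokes the dst.input hook set up by the slow-path
 * functions above (ip_local_deliver, ip_forward, ip_mr_input or ip_error).
 */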
2515 
2516 /* called with rcu_read_lock() */
2517 static struct rtable *__mkroute_output(const struct fib_result *res,
2518 				       const struct flowi4 *fl4,
2519 				       __be32 orig_daddr, __be32 orig_saddr,
2520 				       int orig_oif, __u8 orig_rtos,
2521 				       struct net_device *dev_out,
2522 				       unsigned int flags)
2523 {
2524 	struct fib_info *fi = res->fi;
2525 	struct in_device *in_dev;
2526 	u16 type = res->type;
2527 	struct rtable *rth;
2528 
2529 	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2530 		return ERR_PTR(-EINVAL);
2531 
2532 	if (ipv4_is_lbcast(fl4->daddr))
2533 		type = RTN_BROADCAST;
2534 	else if (ipv4_is_multicast(fl4->daddr))
2535 		type = RTN_MULTICAST;
2536 	else if (ipv4_is_zeronet(fl4->daddr))
2537 		return ERR_PTR(-EINVAL);
2538 
2539 	if (dev_out->flags & IFF_LOOPBACK)
2540 		flags |= RTCF_LOCAL;
2541 
2542 	in_dev = __in_dev_get_rcu(dev_out);
2543 	if (!in_dev)
2544 		return ERR_PTR(-EINVAL);
2545 
2546 	if (type == RTN_BROADCAST) {
2547 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2548 		fi = NULL;
2549 	} else if (type == RTN_MULTICAST) {
2550 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2551 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2552 				     fl4->flowi4_proto))
2553 			flags &= ~RTCF_LOCAL;
2554 		/* If a multicast route does not exist, use
2555 		 * the default one, but do not gateway in this case.
2556 		 * Yes, it is a hack.
2557 		 */
2558 		if (fi && res->prefixlen < 4)
2559 			fi = NULL;
2560 	}
2561 
2562 	rth = rt_dst_alloc(dev_out,
2563 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2564 			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2565 	if (!rth)
2566 		return ERR_PTR(-ENOBUFS);
2567 
2568 	rth->dst.output = ip_output;
2569 
2570 	rth->rt_key_dst	= orig_daddr;
2571 	rth->rt_key_src	= orig_saddr;
2572 	rth->rt_genid = rt_genid(dev_net(dev_out));
2573 	rth->rt_flags	= flags;
2574 	rth->rt_type	= type;
2575 	rth->rt_key_tos	= orig_rtos;
2576 	rth->rt_dst	= fl4->daddr;
2577 	rth->rt_src	= fl4->saddr;
2578 	rth->rt_route_iif = 0;
2579 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
2580 	rth->rt_oif	= orig_oif;
2581 	rth->rt_mark    = fl4->flowi4_mark;
2582 	rth->rt_gateway = fl4->daddr;
2583 	rth->rt_spec_dst= fl4->saddr;
2584 	rth->rt_peer_genid = 0;
2585 	rth->peer = NULL;
2586 	rth->fi = NULL;
2587 
2588 	RT_CACHE_STAT_INC(out_slow_tot);
2589 
2590 	if (flags & RTCF_LOCAL) {
2591 		rth->dst.input = ip_local_deliver;
2592 		rth->rt_spec_dst = fl4->daddr;
2593 	}
2594 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2595 		rth->rt_spec_dst = fl4->saddr;
2596 		if (flags & RTCF_LOCAL &&
2597 		    !(dev_out->flags & IFF_LOOPBACK)) {
2598 			rth->dst.output = ip_mc_output;
2599 			RT_CACHE_STAT_INC(out_slow_mc);
2600 		}
2601 #ifdef CONFIG_IP_MROUTE
2602 		if (type == RTN_MULTICAST) {
2603 			if (IN_DEV_MFORWARD(in_dev) &&
2604 			    !ipv4_is_local_multicast(fl4->daddr)) {
2605 				rth->dst.input = ip_mr_input;
2606 				rth->dst.output = ip_mc_output;
2607 			}
2608 		}
2609 #endif
2610 	}
2611 
2612 	rt_set_nexthop(rth, fl4, res, fi, type, 0);
2613 
2614 	return rth;
2615 }
2616 
2617 /*
2618  * Major route resolver routine.
2619  * called with rcu_read_lock();
2620  */
2621 
2622 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2623 {
2624 	struct net_device *dev_out = NULL;
2625 	__u8 tos = RT_FL_TOS(fl4);
2626 	unsigned int flags = 0;
2627 	struct fib_result res;
2628 	struct rtable *rth;
2629 	__be32 orig_daddr;
2630 	__be32 orig_saddr;
2631 	int orig_oif;
2632 
2633 	res.fi		= NULL;
2634 #ifdef CONFIG_IP_MULTIPLE_TABLES
2635 	res.r		= NULL;
2636 #endif
2637 
2638 	orig_daddr = fl4->daddr;
2639 	orig_saddr = fl4->saddr;
2640 	orig_oif = fl4->flowi4_oif;
2641 
2642 	fl4->flowi4_iif = net->loopback_dev->ifindex;
2643 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2644 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2645 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2646 
2647 	rcu_read_lock();
2648 	if (fl4->saddr) {
2649 		rth = ERR_PTR(-EINVAL);
2650 		if (ipv4_is_multicast(fl4->saddr) ||
2651 		    ipv4_is_lbcast(fl4->saddr) ||
2652 		    ipv4_is_zeronet(fl4->saddr))
2653 			goto out;
2654 
2655 		/* I removed the check for oif == dev_out->oif here.
2656 		   It was wrong for two reasons:
2657 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2658 		      is assigned to multiple interfaces.
2659 		   2. Moreover, we are allowed to send packets with the saddr
2660 		      of another iface. --ANK
2661 		 */
2662 
2663 		if (fl4->flowi4_oif == 0 &&
2664 		    (ipv4_is_multicast(fl4->daddr) ||
2665 		     ipv4_is_lbcast(fl4->daddr))) {
2666 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2667 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2668 			if (dev_out == NULL)
2669 				goto out;
2670 
2671 			/* Special hack: the user can direct multicasts
2672 			   and limited broadcast via the necessary interface
2673 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2674 			   This hack is not just for fun, it allows
2675 			   vic, vat and friends to work.
2676 			   They bind the socket to loopback, set the ttl to zero
2677 			   and expect that it will work.
2678 			   From the viewpoint of the routing cache they are broken,
2679 			   because we are not allowed to build a multicast path
2680 			   with a loopback source addr (look, the routing cache
2681 			   cannot know that the ttl is zero, so the packet
2682 			   will not leave this host and the route is valid).
2683 			   Luckily, this hack is a good workaround.
2684 			 */
2685 
2686 			fl4->flowi4_oif = dev_out->ifindex;
2687 			goto make_route;
2688 		}
2689 
2690 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2691 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2692 			if (!__ip_dev_find(net, fl4->saddr, false))
2693 				goto out;
2694 		}
2695 	}
2696 
2697 
2698 	if (fl4->flowi4_oif) {
2699 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2700 		rth = ERR_PTR(-ENODEV);
2701 		if (dev_out == NULL)
2702 			goto out;
2703 
2704 		/* RACE: Check return value of inet_select_addr instead. */
2705 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2706 			rth = ERR_PTR(-ENETUNREACH);
2707 			goto out;
2708 		}
2709 		if (ipv4_is_local_multicast(fl4->daddr) ||
2710 		    ipv4_is_lbcast(fl4->daddr)) {
2711 			if (!fl4->saddr)
2712 				fl4->saddr = inet_select_addr(dev_out, 0,
2713 							      RT_SCOPE_LINK);
2714 			goto make_route;
2715 		}
2716 		if (!fl4->saddr) {
2717 			if (ipv4_is_multicast(fl4->daddr))
2718 				fl4->saddr = inet_select_addr(dev_out, 0,
2719 							      fl4->flowi4_scope);
2720 			else if (!fl4->daddr)
2721 				fl4->saddr = inet_select_addr(dev_out, 0,
2722 							      RT_SCOPE_HOST);
2723 		}
2724 	}
2725 
2726 	if (!fl4->daddr) {
2727 		fl4->daddr = fl4->saddr;
2728 		if (!fl4->daddr)
2729 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2730 		dev_out = net->loopback_dev;
2731 		fl4->flowi4_oif = net->loopback_dev->ifindex;
2732 		res.type = RTN_LOCAL;
2733 		flags |= RTCF_LOCAL;
2734 		goto make_route;
2735 	}
2736 
2737 	if (fib_lookup(net, fl4, &res)) {
2738 		res.fi = NULL;
2739 		if (fl4->flowi4_oif) {
2740 			/* Apparently, the routing tables are wrong. Assume
2741 			   that the destination is on-link.
2742 
2743 			   WHY? DW.
2744 			   Because we are allowed to send to an iface
2745 			   even if it has NO routes and NO assigned
2746 			   addresses. When oif is specified, the routing
2747 			   tables are looked up with only one purpose:
2748 			   to catch whether the destination is gatewayed rather
2749 			   than direct. Moreover, if MSG_DONTROUTE is set,
2750 			   we send the packet ignoring both the routing tables
2751 			   and the ifaddr state. --ANK
2752 
2753 
2754 			   We could do this even when oif is unknown,
2755 			   as IPv6 likely does, but we do not.
2756 			 */
2757 
2758 			if (fl4->saddr == 0)
2759 				fl4->saddr = inet_select_addr(dev_out, 0,
2760 							      RT_SCOPE_LINK);
2761 			res.type = RTN_UNICAST;
2762 			goto make_route;
2763 		}
2764 		rth = ERR_PTR(-ENETUNREACH);
2765 		goto out;
2766 	}
2767 
2768 	if (res.type == RTN_LOCAL) {
2769 		if (!fl4->saddr) {
2770 			if (res.fi->fib_prefsrc)
2771 				fl4->saddr = res.fi->fib_prefsrc;
2772 			else
2773 				fl4->saddr = fl4->daddr;
2774 		}
2775 		dev_out = net->loopback_dev;
2776 		fl4->flowi4_oif = dev_out->ifindex;
2777 		res.fi = NULL;
2778 		flags |= RTCF_LOCAL;
2779 		goto make_route;
2780 	}
2781 
2782 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2783 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2784 		fib_select_multipath(&res);
2785 	else
2786 #endif
2787 	if (!res.prefixlen &&
2788 	    res.table->tb_num_default > 1 &&
2789 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2790 		fib_select_default(&res);
2791 
2792 	if (!fl4->saddr)
2793 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2794 
2795 	dev_out = FIB_RES_DEV(res);
2796 	fl4->flowi4_oif = dev_out->ifindex;
2797 
2798 
2799 make_route:
2800 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2801 			       tos, dev_out, flags);
2802 	if (!IS_ERR(rth)) {
2803 		unsigned int hash;
2804 
2805 		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2806 			       rt_genid(dev_net(dev_out)));
2807 		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2808 	}
2809 
2810 out:
2811 	rcu_read_unlock();
2812 	return rth;
2813 }
2814 
2815 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2816 {
2817 	struct rtable *rth;
2818 	unsigned int hash;
2819 
2820 	if (!rt_caching(net))
2821 		goto slow_output;
2822 
2823 	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2824 
2825 	rcu_read_lock_bh();
2826 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2827 		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2828 		if (rth->rt_key_dst == flp4->daddr &&
2829 		    rth->rt_key_src == flp4->saddr &&
2830 		    rt_is_output_route(rth) &&
2831 		    rth->rt_oif == flp4->flowi4_oif &&
2832 		    rth->rt_mark == flp4->flowi4_mark &&
2833 		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2834 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2835 		    net_eq(dev_net(rth->dst.dev), net) &&
2836 		    !rt_is_expired(rth)) {
2837 			ipv4_validate_peer(rth);
2838 			dst_use(&rth->dst, jiffies);
2839 			RT_CACHE_STAT_INC(out_hit);
2840 			rcu_read_unlock_bh();
2841 			if (!flp4->saddr)
2842 				flp4->saddr = rth->rt_src;
2843 			if (!flp4->daddr)
2844 				flp4->daddr = rth->rt_dst;
2845 			return rth;
2846 		}
2847 		RT_CACHE_STAT_INC(out_hlist_search);
2848 	}
2849 	rcu_read_unlock_bh();
2850 
2851 slow_output:
2852 	return ip_route_output_slow(net, flp4);
2853 }
2854 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2855 
2856 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2857 {
2858 	return NULL;
2859 }
2860 
2861 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2862 {
2863 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2864 
2865 	return mtu ? : dst->dev->mtu;
2866 }
2867 
2868 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2869 {
2870 }
2871 
2872 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2873 					  unsigned long old)
2874 {
2875 	return NULL;
2876 }
2877 
2878 static struct dst_ops ipv4_dst_blackhole_ops = {
2879 	.family			=	AF_INET,
2880 	.protocol		=	cpu_to_be16(ETH_P_IP),
2881 	.destroy		=	ipv4_dst_destroy,
2882 	.check			=	ipv4_blackhole_dst_check,
2883 	.mtu			=	ipv4_blackhole_mtu,
2884 	.default_advmss		=	ipv4_default_advmss,
2885 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2886 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2887 	.neigh_lookup		=	ipv4_neigh_lookup,
2888 };
2889 
2890 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2891 {
2892 	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2893 	struct rtable *ort = (struct rtable *) dst_orig;
2894 
2895 	if (rt) {
2896 		struct dst_entry *new = &rt->dst;
2897 
2898 		new->__use = 1;
2899 		new->input = dst_discard;
2900 		new->output = dst_discard;
2901 		dst_copy_metrics(new, &ort->dst);
2902 
2903 		new->dev = ort->dst.dev;
2904 		if (new->dev)
2905 			dev_hold(new->dev);
2906 
2907 		rt->rt_key_dst = ort->rt_key_dst;
2908 		rt->rt_key_src = ort->rt_key_src;
2909 		rt->rt_key_tos = ort->rt_key_tos;
2910 		rt->rt_route_iif = ort->rt_route_iif;
2911 		rt->rt_iif = ort->rt_iif;
2912 		rt->rt_oif = ort->rt_oif;
2913 		rt->rt_mark = ort->rt_mark;
2914 
2915 		rt->rt_genid = rt_genid(net);
2916 		rt->rt_flags = ort->rt_flags;
2917 		rt->rt_type = ort->rt_type;
2918 		rt->rt_dst = ort->rt_dst;
2919 		rt->rt_src = ort->rt_src;
2920 		rt->rt_gateway = ort->rt_gateway;
2921 		rt->rt_spec_dst = ort->rt_spec_dst;
2922 		rt->peer = ort->peer;
2923 		if (rt->peer)
2924 			atomic_inc(&rt->peer->refcnt);
2925 		rt->fi = ort->fi;
2926 		if (rt->fi)
2927 			atomic_inc(&rt->fi->fib_clntref);
2928 
2929 		dst_free(new);
2930 	}
2931 
2932 	dst_release(dst_orig);
2933 
2934 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2935 }
2936 
2937 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2938 				    struct sock *sk)
2939 {
2940 	struct rtable *rt = __ip_route_output_key(net, flp4);
2941 
2942 	if (IS_ERR(rt))
2943 		return rt;
2944 
2945 	if (flp4->flowi4_proto)
2946 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2947 						   flowi4_to_flowi(flp4),
2948 						   sk, 0);
2949 
2950 	return rt;
2951 }
2952 EXPORT_SYMBOL_GPL(ip_route_output_flow);
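/*
 * Editorial sketch (the flowi4 setup mirrors the one built in
 * inet_rtm_getroute() below; variable names are illustrative, not taken
 * from a specific caller): an output-path user fills in a flow key,
 * resolves it, and attaches the result to the skb:
 *
 *	struct flowi4 fl4 = {
 *		.daddr        = daddr,
 *		.saddr        = saddr,
 *		.flowi4_tos   = RT_TOS(tos),
 *		.flowi4_oif   = oif,
 *		.flowi4_mark  = mark,
 *		.flowi4_proto = IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	skb_dst_set(skb, &rt->dst);
 *
 * Because flowi4_proto is non-zero, the result above has already been
 * passed through xfrm_lookup(), so IPsec policy is honoured.
 */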
2953 
2954 static int rt_fill_info(struct net *net,
2955 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2956 			int nowait, unsigned int flags)
2957 {
2958 	struct rtable *rt = skb_rtable(skb);
2959 	struct rtmsg *r;
2960 	struct nlmsghdr *nlh;
2961 	unsigned long expires = 0;
2962 	const struct inet_peer *peer = rt->peer;
2963 	u32 id = 0, ts = 0, tsage = 0, error;
2964 
2965 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2966 	if (nlh == NULL)
2967 		return -EMSGSIZE;
2968 
2969 	r = nlmsg_data(nlh);
2970 	r->rtm_family	 = AF_INET;
2971 	r->rtm_dst_len	= 32;
2972 	r->rtm_src_len	= 0;
2973 	r->rtm_tos	= rt->rt_key_tos;
2974 	r->rtm_table	= RT_TABLE_MAIN;
2975 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2976 	r->rtm_type	= rt->rt_type;
2977 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2978 	r->rtm_protocol = RTPROT_UNSPEC;
2979 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2980 	if (rt->rt_flags & RTCF_NOTIFY)
2981 		r->rtm_flags |= RTM_F_NOTIFY;
2982 
2983 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2984 
2985 	if (rt->rt_key_src) {
2986 		r->rtm_src_len = 32;
2987 		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2988 	}
2989 	if (rt->dst.dev)
2990 		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2991 #ifdef CONFIG_IP_ROUTE_CLASSID
2992 	if (rt->dst.tclassid)
2993 		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2994 #endif
2995 	if (rt_is_input_route(rt))
2996 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2997 	else if (rt->rt_src != rt->rt_key_src)
2998 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2999 
3000 	if (rt->rt_dst != rt->rt_gateway)
3001 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3002 
3003 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3004 		goto nla_put_failure;
3005 
3006 	if (rt->rt_mark)
3007 		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3008 
3009 	error = rt->dst.error;
3010 	if (peer) {
3011 		inet_peer_refcheck(rt->peer);
3012 		id = atomic_read(&peer->ip_id_count) & 0xffff;
3013 		if (peer->tcp_ts_stamp) {
3014 			ts = peer->tcp_ts;
3015 			tsage = get_seconds() - peer->tcp_ts_stamp;
3016 		}
3017 		expires = ACCESS_ONCE(peer->pmtu_expires);
3018 		if (expires) {
3019 			if (time_before(jiffies, expires))
3020 				expires -= jiffies;
3021 			else
3022 				expires = 0;
3023 		}
3024 	}
3025 
3026 	if (rt_is_input_route(rt)) {
3027 #ifdef CONFIG_IP_MROUTE
3028 		__be32 dst = rt->rt_dst;
3029 
3030 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3031 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3032 			int err = ipmr_get_route(net, skb,
3033 						 rt->rt_src, rt->rt_dst,
3034 						 r, nowait);
3035 			if (err <= 0) {
3036 				if (!nowait) {
3037 					if (err == 0)
3038 						return 0;
3039 					goto nla_put_failure;
3040 				} else {
3041 					if (err == -EMSGSIZE)
3042 						goto nla_put_failure;
3043 					error = err;
3044 				}
3045 			}
3046 		} else
3047 #endif
3048 			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3049 	}
3050 
3051 	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3052 			       expires, error) < 0)
3053 		goto nla_put_failure;
3054 
3055 	return nlmsg_end(skb, nlh);
3056 
3057 nla_put_failure:
3058 	nlmsg_cancel(skb, nlh);
3059 	return -EMSGSIZE;
3060 }
3061 
3062 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3063 {
3064 	struct net *net = sock_net(in_skb->sk);
3065 	struct rtmsg *rtm;
3066 	struct nlattr *tb[RTA_MAX+1];
3067 	struct rtable *rt = NULL;
3068 	__be32 dst = 0;
3069 	__be32 src = 0;
3070 	u32 iif;
3071 	int err;
3072 	int mark;
3073 	struct sk_buff *skb;
3074 
3075 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3076 	if (err < 0)
3077 		goto errout;
3078 
3079 	rtm = nlmsg_data(nlh);
3080 
3081 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3082 	if (skb == NULL) {
3083 		err = -ENOBUFS;
3084 		goto errout;
3085 	}
3086 
3087 	/* Reserve room for dummy headers; this skb can pass
3088 	   through a good chunk of the routing engine.
3089 	 */
3090 	skb_reset_mac_header(skb);
3091 	skb_reset_network_header(skb);
3092 
3093 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3094 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
3095 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3096 
3097 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3098 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3099 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3100 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3101 
3102 	if (iif) {
3103 		struct net_device *dev;
3104 
3105 		dev = __dev_get_by_index(net, iif);
3106 		if (dev == NULL) {
3107 			err = -ENODEV;
3108 			goto errout_free;
3109 		}
3110 
3111 		skb->protocol	= htons(ETH_P_IP);
3112 		skb->dev	= dev;
3113 		skb->mark	= mark;
3114 		local_bh_disable();
3115 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3116 		local_bh_enable();
3117 
3118 		rt = skb_rtable(skb);
3119 		if (err == 0 && rt->dst.error)
3120 			err = -rt->dst.error;
3121 	} else {
3122 		struct flowi4 fl4 = {
3123 			.daddr = dst,
3124 			.saddr = src,
3125 			.flowi4_tos = rtm->rtm_tos,
3126 			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3127 			.flowi4_mark = mark,
3128 		};
3129 		rt = ip_route_output_key(net, &fl4);
3130 
3131 		err = 0;
3132 		if (IS_ERR(rt))
3133 			err = PTR_ERR(rt);
3134 	}
3135 
3136 	if (err)
3137 		goto errout_free;
3138 
3139 	skb_dst_set(skb, &rt->dst);
3140 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3141 		rt->rt_flags |= RTCF_NOTIFY;
3142 
3143 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3144 			   RTM_NEWROUTE, 0, 0);
3145 	if (err <= 0)
3146 		goto errout_free;
3147 
3148 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3149 errout:
3150 	return err;
3151 
3152 errout_free:
3153 	kfree_skb(skb);
3154 	goto errout;
3155 }
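/*
 * Editorial note: this handler services RTM_GETROUTE requests, e.g. what
 * "ip route get <addr>" sends from userspace; with RTA_IIF present it
 * simulates the input path via ip_route_input(), otherwise it resolves
 * an output route, and it replies with a single RTM_NEWROUTE message
 * built by rt_fill_info().
 */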
3156 
3157 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3158 {
3159 	struct rtable *rt;
3160 	int h, s_h;
3161 	int idx, s_idx;
3162 	struct net *net;
3163 
3164 	net = sock_net(skb->sk);
3165 
3166 	s_h = cb->args[0];
3167 	if (s_h < 0)
3168 		s_h = 0;
3169 	s_idx = idx = cb->args[1];
3170 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3171 		if (!rt_hash_table[h].chain)
3172 			continue;
3173 		rcu_read_lock_bh();
3174 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3175 		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3176 			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3177 				continue;
3178 			if (rt_is_expired(rt))
3179 				continue;
3180 			skb_dst_set_noref(skb, &rt->dst);
3181 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3182 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3183 					 1, NLM_F_MULTI) <= 0) {
3184 				skb_dst_drop(skb);
3185 				rcu_read_unlock_bh();
3186 				goto done;
3187 			}
3188 			skb_dst_drop(skb);
3189 		}
3190 		rcu_read_unlock_bh();
3191 	}
3192 
3193 done:
3194 	cb->args[0] = h;
3195 	cb->args[1] = idx;
3196 	return skb->len;
3197 }
3198 
3199 void ip_rt_multicast_event(struct in_device *in_dev)
3200 {
3201 	rt_cache_flush(dev_net(in_dev->dev), 0);
3202 }
3203 
3204 #ifdef CONFIG_SYSCTL
3205 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3206 					void __user *buffer,
3207 					size_t *lenp, loff_t *ppos)
3208 {
3209 	if (write) {
3210 		int flush_delay;
3211 		ctl_table ctl;
3212 		struct net *net;
3213 
3214 		memcpy(&ctl, __ctl, sizeof(ctl));
3215 		ctl.data = &flush_delay;
3216 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3217 
3218 		net = (struct net *)__ctl->extra1;
3219 		rt_cache_flush(net, flush_delay);
3220 		return 0;
3221 	}
3222 
3223 	return -EINVAL;
3224 }
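/*
 * Editorial note: this handler backs the write-only
 * /proc/sys/net/ipv4/route/flush file registered below; writing an
 * integer to it triggers rt_cache_flush() for that network namespace,
 * with the written value passed along as the delay argument.
 */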
3225 
3226 static ctl_table ipv4_route_table[] = {
3227 	{
3228 		.procname	= "gc_thresh",
3229 		.data		= &ipv4_dst_ops.gc_thresh,
3230 		.maxlen		= sizeof(int),
3231 		.mode		= 0644,
3232 		.proc_handler	= proc_dointvec,
3233 	},
3234 	{
3235 		.procname	= "max_size",
3236 		.data		= &ip_rt_max_size,
3237 		.maxlen		= sizeof(int),
3238 		.mode		= 0644,
3239 		.proc_handler	= proc_dointvec,
3240 	},
3241 	{
3242 		/*  Deprecated. Use gc_min_interval_ms */
3243 
3244 		.procname	= "gc_min_interval",
3245 		.data		= &ip_rt_gc_min_interval,
3246 		.maxlen		= sizeof(int),
3247 		.mode		= 0644,
3248 		.proc_handler	= proc_dointvec_jiffies,
3249 	},
3250 	{
3251 		.procname	= "gc_min_interval_ms",
3252 		.data		= &ip_rt_gc_min_interval,
3253 		.maxlen		= sizeof(int),
3254 		.mode		= 0644,
3255 		.proc_handler	= proc_dointvec_ms_jiffies,
3256 	},
3257 	{
3258 		.procname	= "gc_timeout",
3259 		.data		= &ip_rt_gc_timeout,
3260 		.maxlen		= sizeof(int),
3261 		.mode		= 0644,
3262 		.proc_handler	= proc_dointvec_jiffies,
3263 	},
3264 	{
3265 		.procname	= "gc_interval",
3266 		.data		= &ip_rt_gc_interval,
3267 		.maxlen		= sizeof(int),
3268 		.mode		= 0644,
3269 		.proc_handler	= proc_dointvec_jiffies,
3270 	},
3271 	{
3272 		.procname	= "redirect_load",
3273 		.data		= &ip_rt_redirect_load,
3274 		.maxlen		= sizeof(int),
3275 		.mode		= 0644,
3276 		.proc_handler	= proc_dointvec,
3277 	},
3278 	{
3279 		.procname	= "redirect_number",
3280 		.data		= &ip_rt_redirect_number,
3281 		.maxlen		= sizeof(int),
3282 		.mode		= 0644,
3283 		.proc_handler	= proc_dointvec,
3284 	},
3285 	{
3286 		.procname	= "redirect_silence",
3287 		.data		= &ip_rt_redirect_silence,
3288 		.maxlen		= sizeof(int),
3289 		.mode		= 0644,
3290 		.proc_handler	= proc_dointvec,
3291 	},
3292 	{
3293 		.procname	= "error_cost",
3294 		.data		= &ip_rt_error_cost,
3295 		.maxlen		= sizeof(int),
3296 		.mode		= 0644,
3297 		.proc_handler	= proc_dointvec,
3298 	},
3299 	{
3300 		.procname	= "error_burst",
3301 		.data		= &ip_rt_error_burst,
3302 		.maxlen		= sizeof(int),
3303 		.mode		= 0644,
3304 		.proc_handler	= proc_dointvec,
3305 	},
3306 	{
3307 		.procname	= "gc_elasticity",
3308 		.data		= &ip_rt_gc_elasticity,
3309 		.maxlen		= sizeof(int),
3310 		.mode		= 0644,
3311 		.proc_handler	= proc_dointvec,
3312 	},
3313 	{
3314 		.procname	= "mtu_expires",
3315 		.data		= &ip_rt_mtu_expires,
3316 		.maxlen		= sizeof(int),
3317 		.mode		= 0644,
3318 		.proc_handler	= proc_dointvec_jiffies,
3319 	},
3320 	{
3321 		.procname	= "min_pmtu",
3322 		.data		= &ip_rt_min_pmtu,
3323 		.maxlen		= sizeof(int),
3324 		.mode		= 0644,
3325 		.proc_handler	= proc_dointvec,
3326 	},
3327 	{
3328 		.procname	= "min_adv_mss",
3329 		.data		= &ip_rt_min_advmss,
3330 		.maxlen		= sizeof(int),
3331 		.mode		= 0644,
3332 		.proc_handler	= proc_dointvec,
3333 	},
3334 	{ }
3335 };
3336 
3337 static struct ctl_table empty[1];
3338 
3339 static struct ctl_table ipv4_skeleton[] =
3340 {
3341 	{ .procname = "route",
3342 	  .mode = 0555, .child = ipv4_route_table},
3343 	{ .procname = "neigh",
3344 	  .mode = 0555, .child = empty},
3345 	{ }
3346 };
3347 
3348 static __net_initdata struct ctl_path ipv4_path[] = {
3349 	{ .procname = "net", },
3350 	{ .procname = "ipv4", },
3351 	{ },
3352 };
3353 
3354 static struct ctl_table ipv4_route_flush_table[] = {
3355 	{
3356 		.procname	= "flush",
3357 		.maxlen		= sizeof(int),
3358 		.mode		= 0200,
3359 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3360 	},
3361 	{ },
3362 };
3363 
3364 static __net_initdata struct ctl_path ipv4_route_path[] = {
3365 	{ .procname = "net", },
3366 	{ .procname = "ipv4", },
3367 	{ .procname = "route", },
3368 	{ },
3369 };
3370 
3371 static __net_init int sysctl_route_net_init(struct net *net)
3372 {
3373 	struct ctl_table *tbl;
3374 
3375 	tbl = ipv4_route_flush_table;
3376 	if (!net_eq(net, &init_net)) {
3377 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3378 		if (tbl == NULL)
3379 			goto err_dup;
3380 	}
3381 	tbl[0].extra1 = net;
3382 
3383 	net->ipv4.route_hdr =
3384 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3385 	if (net->ipv4.route_hdr == NULL)
3386 		goto err_reg;
3387 	return 0;
3388 
3389 err_reg:
3390 	if (tbl != ipv4_route_flush_table)
3391 		kfree(tbl);
3392 err_dup:
3393 	return -ENOMEM;
3394 }
3395 
3396 static __net_exit void sysctl_route_net_exit(struct net *net)
3397 {
3398 	struct ctl_table *tbl;
3399 
3400 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3401 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3402 	BUG_ON(tbl == ipv4_route_flush_table);
3403 	kfree(tbl);
3404 }
3405 
3406 static __net_initdata struct pernet_operations sysctl_route_ops = {
3407 	.init = sysctl_route_net_init,
3408 	.exit = sysctl_route_net_exit,
3409 };
3410 #endif
3411 
3412 static __net_init int rt_genid_init(struct net *net)
3413 {
3414 	get_random_bytes(&net->ipv4.rt_genid,
3415 			 sizeof(net->ipv4.rt_genid));
3416 	get_random_bytes(&net->ipv4.dev_addr_genid,
3417 			 sizeof(net->ipv4.dev_addr_genid));
3418 	return 0;
3419 }
3420 
3421 static __net_initdata struct pernet_operations rt_genid_ops = {
3422 	.init = rt_genid_init,
3423 };
3424 
3425 
3426 #ifdef CONFIG_IP_ROUTE_CLASSID
3427 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3428 #endif /* CONFIG_IP_ROUTE_CLASSID */
3429 
3430 static __initdata unsigned long rhash_entries;
3431 static int __init set_rhash_entries(char *str)
3432 {
3433 	if (!str)
3434 		return 0;
3435 	rhash_entries = simple_strtoul(str, &str, 0);
3436 	return 1;
3437 }
3438 __setup("rhash_entries=", set_rhash_entries);
3439 
3440 int __init ip_rt_init(void)
3441 {
3442 	int rc = 0;
3443 
3444 #ifdef CONFIG_IP_ROUTE_CLASSID
3445 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3446 	if (!ip_rt_acct)
3447 		panic("IP: failed to allocate ip_rt_acct\n");
3448 #endif
3449 
3450 	ipv4_dst_ops.kmem_cachep =
3451 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3452 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3453 
3454 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3455 
3456 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3457 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3458 
3459 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3460 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3461 
3462 	rt_hash_table = (struct rt_hash_bucket *)
3463 		alloc_large_system_hash("IP route cache",
3464 					sizeof(struct rt_hash_bucket),
3465 					rhash_entries,
3466 					(totalram_pages >= 128 * 1024) ?
3467 					15 : 17,
3468 					0,
3469 					&rt_hash_log,
3470 					&rt_hash_mask,
3471 					rhash_entries ? 0 : 512 * 1024);
3472 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3473 	rt_hash_lock_init();
3474 
3475 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3476 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3477 
3478 	devinet_init();
3479 	ip_fib_init();
3480 
3481 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3482 	expires_ljiffies = jiffies;
3483 	schedule_delayed_work(&expires_work,
3484 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3485 
3486 	if (ip_rt_proc_init())
3487 		pr_err("Unable to create route proc files\n");
3488 #ifdef CONFIG_XFRM
3489 	xfrm_init();
3490 	xfrm4_init(ip_rt_max_size);
3491 #endif
3492 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3493 
3494 #ifdef CONFIG_SYSCTL
3495 	register_pernet_subsys(&sysctl_route_ops);
3496 #endif
3497 	register_pernet_subsys(&rt_genid_ops);
3498 	return rc;
3499 }
3500 
3501 #ifdef CONFIG_SYSCTL
3502 /*
3503  * We really need to sanitize the damn ipv4 init order, then all
3504  * this nonsense will go away.
3505  */
3506 void __init ip_static_sysctl_init(void)
3507 {
3508 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3509 }
3510 #endif
3511