1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14 /* Changes:
15 *
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
23 * Ville Nuorvala
24 * Fixed routing subtrees.
25 */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void ip6_dst_destroy(struct dst_entry *);
81 static void ip6_dst_ifdown(struct dst_entry *,
82 struct net_device *dev, int how);
83 static int ip6_dst_gc(struct dst_ops *ops);
84
85 static int ip6_pkt_discard(struct sk_buff *skb);
86 static int ip6_pkt_discard_out(struct sk_buff *skb);
87 static void ip6_link_failure(struct sk_buff *skb);
88 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 struct in6_addr *prefix, int prefixlen,
93 struct in6_addr *gwaddr, int ifindex,
94 unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 struct in6_addr *prefix, int prefixlen,
97 struct in6_addr *gwaddr, int ifindex);
98 #endif
99
/*
 * dst_ops->cow_metrics: copy-on-write the dst metrics into the route's
 * inet_peer so per-destination metric writes do not modify a shared
 * read-only metrics template.
 *
 * Returns a writable metrics array, or NULL when the metrics must stay
 * read-only (no peer available, or a concurrent writer installed a
 * read-only set first).
 */
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	/* Lazily bind an inet_peer for this destination (create = 1). */
	if (!rt->rt6i_peer)
		rt6_bind_peer(rt, 1);

	peer = rt->rt6i_peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		/* First user of this peer's metrics: seed from the old set. */
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			/* Lost the race: use whatever the winner installed,
			 * unless it is marked read-only. */
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		}
	}
	return p;
}
129
/* dst_ops template for ordinary IPv6 routes (per-netns copies are set up
 * outside this view — presumably at namespace init; confirm there). */
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.protocol		= cpu_to_be16(ETH_P_IPV6),
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.default_mtu		= ip6_default_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.local_out		= __ip6_local_out,
};
146
/* A blackhole route advertises no usable MTU of its own. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = 0;

	return mtu;
}
151
/* Blackhole routes deliberately ignore path-MTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
155
/* Blackhole metrics are never writable: always refuse the COW request. */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
					 unsigned long old)
{
	return NULL;
}
161
/* dst_ops for blackhole copies made by ip6_blackhole_route(): no GC
 * callback, PMTU updates ignored, metrics never made writable. */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.protocol		= cpu_to_be16(ETH_P_IPV6),
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.default_mtu		= ip6_blackhole_default_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.cow_metrics		= ip6_rt_blackhole_cow_metrics,
};
172
/* Read-only metrics for the template routes below: only a default hop
 * limit of 255 is set; everything else stays 0 (unset). */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 255,
};
176
/*
 * Template for the null route, returned by lookups when no real route
 * matches (see rt6_select()): rejects traffic with -ENETUNREACH.
 */
static struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
191
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/* Template for the "prohibit" policy-routing action: rejects traffic
 * with -EACCES via the ip6_pkt_prohibit handlers. */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Template for the "blackhole" policy-routing action: silently drops
 * traffic (dst_discard), error -EINVAL. */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
228
/* Allocate a fresh rt6_info backed by the given dst_ops (normally the
 * per-netns ip6_dst_ops). */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct dst_entry *entry = dst_alloc(ops, 0);

	return (struct rt6_info *) entry;
}
234
ip6_dst_destroy(struct dst_entry * dst)235 static void ip6_dst_destroy(struct dst_entry *dst)
236 {
237 struct rt6_info *rt = (struct rt6_info *)dst;
238 struct inet6_dev *idev = rt->rt6i_idev;
239 struct inet_peer *peer = rt->rt6i_peer;
240
241 if (idev != NULL) {
242 rt->rt6i_idev = NULL;
243 in6_dev_put(idev);
244 }
245 if (peer) {
246 rt->rt6i_peer = NULL;
247 inet_putpeer(peer);
248 }
249 }
250
/* Generation counter for inet_peer bindings.  NOTE(review): appears to be
 * bumped elsewhere to force ip6_dst_check() to refresh peer state —
 * confirm against the code that increments it (outside this view). */
static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);

/* Current peer-binding generation; compared in ip6_dst_check(). */
static u32 rt6_peer_genid(void)
{
	return atomic_read(&__rt6_peer_genid);
}
257
/*
 * Bind an inet_peer entry for the route's destination address.  Safe
 * against concurrent binders: only the cmpxchg winner keeps its peer
 * reference and records the current generation; losers drop theirs.
 */
void rt6_bind_peer(struct rt6_info *rt, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
		inet_putpeer(peer);	/* lost the race; another peer is bound */
	else
		rt->rt6i_peer_genid = rt6_peer_genid();
}
268
ip6_dst_ifdown(struct dst_entry * dst,struct net_device * dev,int how)269 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
270 int how)
271 {
272 struct rt6_info *rt = (struct rt6_info *)dst;
273 struct inet6_dev *idev = rt->rt6i_idev;
274 struct net_device *loopback_dev =
275 dev_net(dev)->loopback_dev;
276
277 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
278 struct inet6_dev *loopback_idev =
279 in6_dev_get(loopback_dev);
280 if (loopback_idev != NULL) {
281 rt->rt6i_idev = loopback_idev;
282 in6_dev_put(idev);
283 }
284 }
285 }
286
rt6_check_expired(const struct rt6_info * rt)287 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
288 {
289 return (rt->rt6i_flags & RTF_EXPIRES) &&
290 time_after(jiffies, rt->rt6i_expires);
291 }
292
rt6_need_strict(struct in6_addr * daddr)293 static inline int rt6_need_strict(struct in6_addr *daddr)
294 {
295 return ipv6_addr_type(daddr) &
296 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
297 }
298
299 /*
300 * Route lookup. Any table->tb6_lock is implied.
301 */
302
/*
 * From a chain of equal-prefix routes, pick the one whose device matches
 * the requested outgoing interface (@oif), or — when no oif is given —
 * whose device owns the source address @saddr.  A loopback device may
 * stand in for the oif via its rt6i_idev.  Falls back to the chain head
 * when nothing is required to match.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* Nothing to match against: keep the head of the chain. */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->rt6i_dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;	/* exact device match */
			if (dev->flags & IFF_LOOPBACK) {
				if (sprt->rt6i_idev == NULL ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					/* Strict mode: this loopback route
					 * does not represent oif — skip. */
					if (flags & RT6_LOOKUP_F_IFACE && oif)
						continue;
					/* Keep an earlier loopback candidate
					 * whose idev already matches oif. */
					if (local && (!oif ||
					    local->rt6i_idev->dev->ifindex == oif))
						continue;
				}
				local = sprt;
			}
		} else {
			/* No oif: match on source-address ownership. */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		/* Strict interface matching with no candidate: no route. */
		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
349
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: when the route's next-hop neighbour is
 * not known-valid, send a neighbour solicitation to it, rate-limited by
 * the per-device rtr_probe_interval.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;
	read_lock_bh(&neigh->lock);
	/* Re-check under the lock, and enforce the probe interval. */
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		/* Stamp before dropping the lock so concurrent callers
		 * remain rate-limited. */
		neigh->updated = jiffies;
		read_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else
		read_unlock_bh(&neigh->lock);
}
#else
/* Probing is a no-op without router-preference support. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
384
385 /*
386 * Default Router Selection (RFC 2461 6.3.6)
387 */
rt6_check_dev(struct rt6_info * rt,int oif)388 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
389 {
390 struct net_device *dev = rt->rt6i_dev;
391 if (!oif || dev->ifindex == oif)
392 return 2;
393 if ((dev->flags & IFF_LOOPBACK) &&
394 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
395 return 1;
396 return 0;
397 }
398
/*
 * Neighbour-reachability contribution to the route score:
 * 2 = next hop known reachable (NUD_VALID), 1 = state unknown, or no
 * next hop needed (non-gateway / RTF_NONEXTHOP), 0 = no neighbour entry
 * (or NUD_FAILED when router preferences are enabled).
 */
static inline int rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh = rt->rt6i_nexthop;
	int m;

	/* On-link or explicitly next-hop-less routes need no neighbour. */
	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		m = 1;
	else if (neigh) {
		read_lock_bh(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			m = 2;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (neigh->nud_state & NUD_FAILED)
			m = 0;
#endif
		else
			m = 1;
		read_unlock_bh(&neigh->lock);
	} else
		m = 0;
	return m;
}
421
rt6_score_route(struct rt6_info * rt,int oif,int strict)422 static int rt6_score_route(struct rt6_info *rt, int oif,
423 int strict)
424 {
425 int m, n;
426
427 m = rt6_check_dev(rt, oif);
428 if (!m && (strict & RT6_LOOKUP_F_IFACE))
429 return -1;
430 #ifdef CONFIG_IPV6_ROUTER_PREF
431 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
432 #endif
433 n = rt6_check_neigh(rt);
434 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
435 return -1;
436 return m;
437 }
438
/*
 * Compare @rt against the best candidate so far; update *mpri and return
 * the (possibly new) best route.  Under RT6_LOOKUP_F_REACHABLE the loser
 * of each comparison gets a reachability probe, so alternates stay warm.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match)
{
	int m;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m < 0)
		goto out;

	if (m > *mpri) {
		/* Probe the previous best that is being displaced. */
		if (strict & RT6_LOOKUP_F_REACHABLE)
			rt6_probe(match);
		*mpri = m;
		match = rt;
	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
		rt6_probe(rt);
	}

out:
	return match;
}
463
/*
 * Scan all routes of the given metric in this fib6 node — first from the
 * round-robin head to the end of the equal-metric run, then from the
 * leaf head up to the round-robin head — and return the best-scoring
 * candidate (or NULL when nothing qualifies).
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict)
{
	struct rt6_info *best = NULL;
	struct rt6_info *cur;
	int best_score = -1;

	cur = rr_head;
	while (cur && cur->rt6i_metric == metric) {
		best = find_match(cur, oif, strict, &best_score, best);
		cur = cur->dst.rt6_next;
	}

	cur = fn->leaf;
	while (cur && cur != rr_head && cur->rt6i_metric == metric) {
		best = find_match(cur, oif, strict, &best_score, best);
		cur = cur->dst.rt6_next;
	}

	return best;
}
481
/*
 * Default Router Selection (RFC 2461 6.3.6): pick the best route in @fn
 * among the lowest-metric entries, starting from the node's round-robin
 * pointer.  When nothing matches under RT6_LOOKUP_F_REACHABLE, advance
 * the round-robin pointer so the next lookup tries a different router.
 * Never returns NULL (falls back to the netns null entry).
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;

	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
		  __func__, fn->leaf, oif);

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);

	if (!match &&
	    (strict & RT6_LOOKUP_F_REACHABLE)) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	RT6_TRACE("%s() => %p\n",
		  __func__, match);

	net = dev_net(rt0->rt6i_dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
514
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received in a Router Advertisement
 * (RFC 4191): validate the option, then add, refresh, or — on a zero
 * lifetime — delete the corresponding RTF_ROUTEINFO route via @gwaddr.
 * Returns 0 on success, -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* Prefixes longer than 64 bits need the larger option body. */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws the route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime)) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			/* NOTE(review): HZ * lifetime may overflow for very
			 * large finite lifetimes — confirm that
			 * addrconf_timeout_fixup() bounds the value. */
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->dst);
	}
	return 0;
}
#endif
588
/*
 * Backtrack up the fib tree after a lookup that resolved to the null
 * entry.  Relies on local variables "rt" and "fn" and labels "out" and
 * "restart" existing in the invoking function.  When the parent node has
 * a source-address subtree (CONFIG_IPV6_SUBTREES), re-descend into it.
 */
#define BACKTRACK(__net, saddr) \
do { \
	if (rt == __net->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
606
/*
 * Non-caching policy lookup used by rt6_lookup(): longest-prefix match
 * plus device matching, backtracking up the tree when the result is the
 * null entry.  Takes a usage reference on the returned route, which is
 * never NULL (it may be the null entry).  The "restart"/"out" labels are
 * jump targets of the BACKTRACK() macro.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	BACKTRACK(net, &fl6->saddr);
out:
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);
	return rt;

}
626
rt6_lookup(struct net * net,const struct in6_addr * daddr,const struct in6_addr * saddr,int oif,int strict)627 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
628 const struct in6_addr *saddr, int oif, int strict)
629 {
630 struct flowi6 fl6 = {
631 .flowi6_oif = oif,
632 .daddr = *daddr,
633 };
634 struct dst_entry *dst;
635 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
636
637 if (saddr) {
638 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
639 flags |= RT6_LOOKUP_F_HAS_SADDR;
640 }
641
642 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
643 if (dst->error == 0)
644 return (struct rt6_info *) dst;
645
646 dst_release(dst);
647
648 return NULL;
649 }
650
651 EXPORT_SYMBOL(rt6_lookup);
652
653 /* ip6_ins_rt is called with FREE table->tb6_lock.
654 It takes new route entry, the addition fails by any reason the
655 route is freed. In any case, if caller does not hold it, it may
656 be destroyed.
657 */
658
__ip6_ins_rt(struct rt6_info * rt,struct nl_info * info)659 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
660 {
661 int err;
662 struct fib6_table *table;
663
664 table = rt->rt6i_table;
665 write_lock_bh(&table->tb6_lock);
666 err = fib6_add(&table->tb6_root, rt, info);
667 write_unlock_bh(&table->tb6_lock);
668
669 return err;
670 }
671
ip6_ins_rt(struct rt6_info * rt)672 int ip6_ins_rt(struct rt6_info *rt)
673 {
674 struct nl_info info = {
675 .nl_net = dev_net(rt->rt6i_dev),
676 };
677 return __ip6_ins_rt(rt, &info);
678 }
679
/*
 * Create a host-route (/128) cache clone of @ort for @daddr (and @saddr
 * with subtrees) and resolve its next-hop neighbour.  On neighbour-table
 * overflow, temporarily relaxes the GC sysctls to force a collection and
 * retries (only when not in softirq).  Returns NULL on failure; the
 * half-built clone is freed.
 */
static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
				      struct in6_addr *saddr)
{
	struct rt6_info *rt;

	/*
	 * Clone the route.
	 */

	rt = ip6_rt_copy(ort);

	if (rt) {
		struct neighbour *neigh;
		int attempts = !in_softirq();

		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
			/* On-link route: the destination itself becomes the
			 * next hop. */
			if (rt->rt6i_dst.plen != 128 &&
			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
				rt->rt6i_flags |= RTF_ANYCAST;
			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
		}

		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
		rt->rt6i_dst.plen = 128;
		rt->rt6i_flags |= RTF_CACHE;
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
			rt->rt6i_src.plen = 128;
		}
#endif

	retry:
		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
		if (IS_ERR(neigh)) {
			struct net *net = dev_net(rt->rt6i_dev);
			int saved_rt_min_interval =
				net->ipv6.sysctl.ip6_rt_gc_min_interval;
			int saved_rt_elasticity =
				net->ipv6.sysctl.ip6_rt_gc_elasticity;

			/* Drop the GC limits so ip6_dst_gc() can free
			 * entries, then restore them and retry once. */
			if (attempts-- > 0) {
				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;

				ip6_dst_gc(&net->ipv6.ip6_dst_ops);

				net->ipv6.sysctl.ip6_rt_gc_elasticity =
					saved_rt_elasticity;
				net->ipv6.sysctl.ip6_rt_gc_min_interval =
					saved_rt_min_interval;
				goto retry;
			}

			if (net_ratelimit())
				printk(KERN_WARNING
				       "ipv6: Neighbour table overflow.\n");
			dst_free(&rt->dst);
			return NULL;
		}
		rt->rt6i_nexthop = neigh;

	}

	return rt;
}
748
rt6_alloc_clone(struct rt6_info * ort,struct in6_addr * daddr)749 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
750 {
751 struct rt6_info *rt = ip6_rt_copy(ort);
752 if (rt) {
753 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
754 rt->rt6i_dst.plen = 128;
755 rt->rt6i_flags |= RTF_CACHE;
756 rt->dst.flags |= DST_HOST;
757 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
758 }
759 return rt;
760 }
761
/*
 * Core caching route lookup shared by the input and output paths.
 * Selects the best route; for a non-RTF_CACHE result it builds and
 * inserts a /128 cache clone outside the table lock (COW with neighbour
 * resolution, or a plain clone for non-host routes), retrying on insert
 * races.  The first pass prefers reachable routers unless forwarding is
 * enabled; on failure it retries accepting any router.  Returns a held
 * route — never NULL (the null entry on total failure).
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
				      struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int strict = 0;
	int attempts = 3;
	int err;
	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

	strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
	read_lock_bh(&table->tb6_lock);

restart_2:
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

restart:
	rt = rt6_select(fn, oif, strict | reachable);

	BACKTRACK(net, &fl6->saddr);
	if (rt == net->ipv6.ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)
		goto out;

	/* Not a cached entry: hold it and clone outside the table lock. */
	dst_hold(&rt->dst);
	read_unlock_bh(&table->tb6_lock);

	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
	else if (!(rt->dst.flags & DST_HOST))
		nrt = rt6_alloc_clone(rt, &fl6->daddr);
	else
		goto out2;

	dst_release(&rt->dst);
	rt = nrt ? : net->ipv6.ip6_null_entry;

	dst_hold(&rt->dst);
	if (nrt) {
		err = ip6_ins_rt(nrt);
		if (!err)
			goto out2;
	}

	if (--attempts <= 0)
		goto out2;

	/*
	 * Race condition! In the gap, when table->tb6_lock was
	 * released someone could insert this route. Relookup.
	 */
	dst_release(&rt->dst);
	goto relookup;

out:
	if (reachable) {
		/* Nothing reachable matched: retry accepting any router. */
		reachable = 0;
		goto restart_2;
	}
	dst_hold(&rt->dst);
	read_unlock_bh(&table->tb6_lock);
out2:
	rt->dst.lastuse = jiffies;
	rt->dst.__use++;

	return rt;
}
831
ip6_pol_route_input(struct net * net,struct fib6_table * table,struct flowi6 * fl6,int flags)832 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
833 struct flowi6 *fl6, int flags)
834 {
835 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
836 }
837
/*
 * Attach a route to an incoming packet: build a flow from the IPv6
 * header and resolve it through the policy tables, storing the result
 * in the skb's dst.
 */
void ip6_route_input(struct sk_buff *skb)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		/* Flow label lives in the first 32 bits of the header. */
		.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	/* Scoped destinations need strict device matching, except on PIM
	 * register pseudo-devices. */
	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
}
857
ip6_pol_route_output(struct net * net,struct fib6_table * table,struct flowi6 * fl6,int flags)858 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
859 struct flowi6 *fl6, int flags)
860 {
861 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
862 }
863
ip6_route_output(struct net * net,const struct sock * sk,struct flowi6 * fl6)864 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
865 struct flowi6 *fl6)
866 {
867 int flags = 0;
868
869 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
870 flags |= RT6_LOOKUP_F_IFACE;
871
872 if (!ipv6_addr_any(&fl6->saddr))
873 flags |= RT6_LOOKUP_F_HAS_SADDR;
874 else if (sk)
875 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
876
877 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
878 }
879
880 EXPORT_SYMBOL(ip6_route_output);
881
/*
 * Build a blackhole copy of @dst_orig: same device, metrics and
 * addresses, but input/output discard all packets.  Consumes the
 * caller's reference on @dst_orig.  Returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt = dst_alloc(&ip6_dst_blackhole_ops, 1);
	struct rt6_info *ort = (struct rt6_info *) dst_orig;
	struct dst_entry *new = NULL;

	if (rt) {
		new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;

		dst_copy_metrics(new, &ort->dst);
		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);
		rt->rt6i_expires = 0;

		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		/* NOTE(review): dst_free() on a dst that is still returned
		 * appears intentional — presumably it defers destruction
		 * until the refcount drops; confirm against dst_free()
		 * semantics before touching this. */
		dst_free(new);
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
919
920 /*
921 * Destination cache support functions
922 */
923
/*
 * dst_ops->check: a cached route is still valid while its fib node's
 * serial number matches the cookie; when the peer generation has moved
 * on, re-bind and re-stamp the inet_peer before declaring it valid.
 * Returns @dst when valid, NULL when the caller must re-look-up.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
		if (rt->rt6i_peer_genid != rt6_peer_genid()) {
			if (!rt->rt6i_peer)
				rt6_bind_peer(rt, 0);
			rt->rt6i_peer_genid = rt6_peer_genid();
		}
		return dst;
	}
	return NULL;
}
940
ip6_negative_advice(struct dst_entry * dst)941 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
942 {
943 struct rt6_info *rt = (struct rt6_info *) dst;
944
945 if (rt) {
946 if (rt->rt6i_flags & RTF_CACHE) {
947 if (rt6_check_expired(rt)) {
948 ip6_del_rt(rt);
949 dst = NULL;
950 }
951 } else {
952 dst_release(dst);
953 dst = NULL;
954 }
955 }
956 return dst;
957 }
958
/*
 * dst_ops->link_failure: report unreachability to the sender and make
 * sure the failed route is not reused — cache clones are expired
 * immediately; for default routes the fib node's serial number is
 * invalidated so ip6_dst_check() fails for all its users.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags&RTF_CACHE) {
			dst_set_expires(&rt->dst, 0);
			rt->rt6i_flags |= RTF_EXPIRES;
		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
			rt->rt6i_node->fn_sernum = -1;
	}
}
974
/*
 * dst_ops->update_pmtu: record a smaller path MTU on a host (/128)
 * route.  MTUs below IPV6_MIN_MTU are clamped to the minimum and
 * RTAX_FEATURE_ALLFRAG is set so subsequent packets carry a fragment
 * header.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info*)dst;

	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
		rt6->rt6i_flags |= RTF_MODIFIED;
		if (mtu < IPV6_MIN_MTU) {
			u32 features = dst_metric(dst, RTAX_FEATURES);
			mtu = IPV6_MIN_MTU;
			features |= RTAX_FEATURE_ALLFRAG;
			dst_metric_set(dst, RTAX_FEATURES, features);
		}
		dst_metric_set(dst, RTAX_MTU, mtu);
	}
}
990
ip6_default_advmss(const struct dst_entry * dst)991 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
992 {
993 struct net_device *dev = dst->dev;
994 unsigned int mtu = dst_mtu(dst);
995 struct net *net = dev_net(dev);
996
997 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
998
999 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1000 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1001
1002 /*
1003 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1004 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1005 * IPV6_MAXPLEN is also valid and means: "any MSS,
1006 * rely only on pmtu discovery"
1007 */
1008 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1009 mtu = IPV6_MAXPLEN;
1010 return mtu;
1011 }
1012
ip6_default_mtu(const struct dst_entry * dst)1013 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1014 {
1015 unsigned int mtu = IPV6_MIN_MTU;
1016 struct inet6_dev *idev;
1017
1018 rcu_read_lock();
1019 idev = __in6_dev_get(dst->dev);
1020 if (idev)
1021 mtu = idev->cnf.mtu6;
1022 rcu_read_unlock();
1023
1024 return mtu;
1025 }
1026
/* Singly-linked list of dsts handed out by icmp6_dst_alloc() (they are
 * not in the fib tree), protected by icmp6_dst_lock. */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1029
icmp6_dst_alloc(struct net_device * dev,struct neighbour * neigh,const struct in6_addr * addr)1030 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1031 struct neighbour *neigh,
1032 const struct in6_addr *addr)
1033 {
1034 struct rt6_info *rt;
1035 struct inet6_dev *idev = in6_dev_get(dev);
1036 struct net *net = dev_net(dev);
1037
1038 if (unlikely(idev == NULL))
1039 return NULL;
1040
1041 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1042 if (unlikely(rt == NULL)) {
1043 in6_dev_put(idev);
1044 goto out;
1045 }
1046
1047 dev_hold(dev);
1048 if (neigh)
1049 neigh_hold(neigh);
1050 else {
1051 neigh = ndisc_get_neigh(dev, addr);
1052 if (IS_ERR(neigh))
1053 neigh = NULL;
1054 }
1055
1056 rt->rt6i_dev = dev;
1057 rt->rt6i_idev = idev;
1058 rt->rt6i_nexthop = neigh;
1059 atomic_set(&rt->dst.__refcnt, 1);
1060 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1061 rt->dst.output = ip6_output;
1062
1063 #if 0 /* there's no chance to use these for ndisc */
1064 rt->dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1065 ? DST_HOST
1066 : 0;
1067 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1068 rt->rt6i_dst.plen = 128;
1069 #endif
1070
1071 spin_lock_bh(&icmp6_dst_lock);
1072 rt->dst.next = icmp6_dst_gc_list;
1073 icmp6_dst_gc_list = &rt->dst;
1074 spin_unlock_bh(&icmp6_dst_lock);
1075
1076 fib6_force_start_gc(net);
1077
1078 out:
1079 return &rt->dst;
1080 }
1081
icmp6_dst_gc(void)1082 int icmp6_dst_gc(void)
1083 {
1084 struct dst_entry *dst, **pprev;
1085 int more = 0;
1086
1087 spin_lock_bh(&icmp6_dst_lock);
1088 pprev = &icmp6_dst_gc_list;
1089
1090 while ((dst = *pprev) != NULL) {
1091 if (!atomic_read(&dst->__refcnt)) {
1092 *pprev = dst->next;
1093 dst_free(dst);
1094 } else {
1095 pprev = &dst->next;
1096 ++more;
1097 }
1098 }
1099
1100 spin_unlock_bh(&icmp6_dst_lock);
1101
1102 return more;
1103 }
1104
icmp6_clean_all(int (* func)(struct rt6_info * rt,void * arg),void * arg)1105 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1106 void *arg)
1107 {
1108 struct dst_entry *dst, **pprev;
1109
1110 spin_lock_bh(&icmp6_dst_lock);
1111 pprev = &icmp6_dst_gc_list;
1112 while ((dst = *pprev) != NULL) {
1113 struct rt6_info *rt = (struct rt6_info *) dst;
1114 if (func(rt, arg)) {
1115 *pprev = dst->next;
1116 dst_free(dst);
1117 } else {
1118 pprev = &dst->next;
1119 }
1120 }
1121 spin_unlock_bh(&icmp6_dst_lock);
1122 }
1123
/*
 * dst_ops->gc: run the fib6 garbage collector when the previous run is
 * old enough or the table exceeds ip6_rt_max_size.  ip6_rt_gc_expire
 * grows on each invocation (making collection more aggressive) and
 * decays by the elasticity sysctl on every call.  Returns non-zero when
 * the table is still over its size limit (allocation should fail).
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	unsigned long now = jiffies;
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* Rate-limit: skip unless forced by table size. */
	if (time_after(rt_last_gc + rt_min_interval, now) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
	net->ipv6.ip6_rt_last_gc = now;
	entries = dst_entries_get_slow(ops);
	/* Back under threshold: reset aggressiveness to half the timeout. */
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
1150
/* Clean the host part of a prefix.  Not strictly necessary with a radix
   tree, but it results in cleaner routing tables.

   Remove this only once everything else is known to work!
 */
1156
ip6_dst_hoplimit(struct dst_entry * dst)1157 int ip6_dst_hoplimit(struct dst_entry *dst)
1158 {
1159 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1160 if (hoplimit == 0) {
1161 struct net_device *dev = dst->dev;
1162 struct inet6_dev *idev;
1163
1164 rcu_read_lock();
1165 idev = __in6_dev_get(dev);
1166 if (idev)
1167 hoplimit = idev->cnf.hop_limit;
1168 else
1169 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1170 rcu_read_unlock();
1171 }
1172 return hoplimit;
1173 }
1174 EXPORT_SYMBOL(ip6_dst_hoplimit);
1175
1176 /*
1177 *
1178 */
1179
/*
 * ip6_route_add - add an IPv6 route described by @cfg to the FIB.
 *
 * Validates the configuration, allocates and fills in a new rt6_info,
 * resolves the output device/idev and (for gatewayed routes) the
 * nexthop neighbour, then inserts the route into the proper table via
 * __ip6_ins_rt().
 *
 * Returns 0 on success or a negative errno.  On the error path every
 * reference taken here (dev, idev, rt) is dropped before returning.
 */
int ip6_route_add(struct fib6_config *cfg)
{
	int err;
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;

	/* IPv6 prefix lengths cannot exceed 128 bits. */
	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	/* Source-based routing requires subtree support compiled in. */
	if (cfg->fc_src_len)
		return -EINVAL;
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	table = fib6_new_table(net, cfg->fc_table);
	if (table == NULL) {
		err = -ENOBUFS;
		goto out;
	}

	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);

	if (rt == NULL) {
		err = -ENOMEM;
		goto out;
	}

	rt->dst.obsolete = -1;
	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
				0;

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Choose the input handler based on the destination type. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags = DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
	    && !(cfg->fc_flags&RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		rt->dst.error = -ENETUNREACH;
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			err = -EINVAL;
			if (!(gwa_type&IPV6_ADDR_UNICAST))
				goto out;

			/* The gateway itself must be reachable via a
			 * non-gatewayed route on the same interface. */
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (grt == NULL)
				goto out;
			if (dev) {
				if (dev != grt->rt6i_dev) {
					dst_release(&grt->dst);
					goto out;
				}
			} else {
				/* inherit device/idev from the gateway route */
				dev = grt->rt6i_dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			if (!(grt->rt6i_flags&RTF_GATEWAY))
				err = 0;
			dst_release(&grt->dst);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (dev == NULL)
		goto out;

	/* Resolve the nexthop neighbour entry up front. */
	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(rt->rt6i_nexthop)) {
			err = PTR_ERR(rt->rt6i_nexthop);
			rt->rt6i_nexthop = NULL;
			goto out;
		}
	}

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		/* Apply user-supplied route metrics (RTA_METRICS nest). */
		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX) {
					err = -EINVAL;
					goto out;
				}

				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
			}
		}
	}

	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);
	return err;
}
1382
/*
 * Remove @rt from its table under tb6_lock and drop the caller's
 * reference.  The netns null entry is immortal and yields -ENOENT.
 * Returns the fib6_del() result otherwise.
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;
	struct net *net = dev_net(rt->rt6i_dev);

	if (rt == net->ipv6.ip6_null_entry)
		return -ENOENT;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);

	err = fib6_del(rt, info);
	/* drop the reference the caller held on @rt */
	dst_release(&rt->dst);

	write_unlock_bh(&table->tb6_lock);

	return err;
}
1402
ip6_del_rt(struct rt6_info * rt)1403 int ip6_del_rt(struct rt6_info *rt)
1404 {
1405 struct nl_info info = {
1406 .nl_net = dev_net(rt->rt6i_dev),
1407 };
1408 return __ip6_del_rt(rt, &info);
1409 }
1410
/*
 * Delete the route matching @cfg from its table.  The prefix selects
 * the fib6 node; ifindex, gateway and metric are matched only when
 * the caller supplied them (non-zero / RTF_GATEWAY set).  Returns
 * -ESRCH when nothing matches.
 */
static int ip6_route_del(struct fib6_config *cfg)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (table == NULL)
		return err;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			if (cfg->fc_ifindex &&
			    (rt->rt6i_dev == NULL ||
			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			/* hold before dropping the lock; __ip6_del_rt
			 * consumes this reference */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
1449
1450 /*
1451 * Handle redirects
1452 */
/*
 * Flow descriptor for redirect lookups: a plain flowi6 extended with
 * the address of the router that sent the redirect.  It is passed to
 * fib6_rule_lookup() cast to flowi6, so fl6 must stay the first member.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
1457
__ip6_route_redirect(struct net * net,struct fib6_table * table,struct flowi6 * fl6,int flags)1458 static struct rt6_info *__ip6_route_redirect(struct net *net,
1459 struct fib6_table *table,
1460 struct flowi6 *fl6,
1461 int flags)
1462 {
1463 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1464 struct rt6_info *rt;
1465 struct fib6_node *fn;
1466
1467 /*
1468 * Get the "current" route for this destination and
1469 * check if the redirect has come from approriate router.
1470 *
1471 * RFC 2461 specifies that redirects should only be
1472 * accepted if they come from the nexthop to the target.
1473 * Due to the way the routes are chosen, this notion
1474 * is a bit fuzzy and one might need to check all possible
1475 * routes.
1476 */
1477
1478 read_lock_bh(&table->tb6_lock);
1479 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1480 restart:
1481 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1482 /*
1483 * Current route is on-link; redirect is always invalid.
1484 *
1485 * Seems, previous statement is not true. It could
1486 * be node, which looks for us as on-link (f.e. proxy ndisc)
1487 * But then router serving it might decide, that we should
1488 * know truth 8)8) --ANK (980726).
1489 */
1490 if (rt6_check_expired(rt))
1491 continue;
1492 if (!(rt->rt6i_flags & RTF_GATEWAY))
1493 continue;
1494 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1495 continue;
1496 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1497 continue;
1498 break;
1499 }
1500
1501 if (!rt)
1502 rt = net->ipv6.ip6_null_entry;
1503 BACKTRACK(net, &fl6->saddr);
1504 out:
1505 dst_hold(&rt->dst);
1506
1507 read_unlock_bh(&table->tb6_lock);
1508
1509 return rt;
1510 };
1511
ip6_route_redirect(struct in6_addr * dest,struct in6_addr * src,struct in6_addr * gateway,struct net_device * dev)1512 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1513 struct in6_addr *src,
1514 struct in6_addr *gateway,
1515 struct net_device *dev)
1516 {
1517 int flags = RT6_LOOKUP_F_HAS_SADDR;
1518 struct net *net = dev_net(dev);
1519 struct ip6rd_flowi rdfl = {
1520 .fl6 = {
1521 .flowi6_oif = dev->ifindex,
1522 .daddr = *dest,
1523 .saddr = *src,
1524 },
1525 };
1526
1527 ipv6_addr_copy(&rdfl.gateway, gateway);
1528
1529 if (rt6_need_strict(dest))
1530 flags |= RT6_LOOKUP_F_IFACE;
1531
1532 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1533 flags, __ip6_route_redirect);
1534 }
1535
/*
 * Process a validated ICMPv6 redirect for @dest received from @neigh.
 * Updates the neighbour cache with @lladdr and installs a cloned
 * RTF_CACHE host route through the new nexthop; an old cached route
 * for the destination is removed.  @on_link indicates the target is
 * directly reachable (redirect to an on-link destination).
 */
void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
		  struct in6_addr *saddr,
		  struct neighbour *neigh, u8 *lladdr, int on_link)
{
	struct rt6_info *rt, *nrt = NULL;
	struct netevent_redirect netevent;
	struct net *net = dev_net(neigh->dev);

	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);

	if (rt == net->ipv6.ip6_null_entry) {
		if (net_ratelimit())
			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
			       "for redirect target\n");
		goto out;
	}

	/*
	 *	We have finally decided to accept it.
	 */

	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

	/*
	 * Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	/* Duplicate redirect: silently ignore. */
	if (neigh == rt->dst.neighbour)
		goto out;

	nrt = ip6_rt_copy(rt);
	if (nrt == NULL)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	/* the clone is a /128 host route to the redirected target */
	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
	nrt->rt6i_dst.plen = 128;
	nrt->dst.flags |= DST_HOST;

	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
	nrt->rt6i_nexthop = neigh_clone(neigh);

	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	if (rt->rt6i_flags&RTF_CACHE) {
		/* ip6_del_rt() consumes our reference on rt */
		ip6_del_rt(rt);
		return;
	}

out:
	dst_release(&rt->dst);
}
1605
1606 /*
1607 * Handle ICMP "packet too big" messages
1608 * i.e. Path MTU discovery
1609 */
1610
/*
 * Apply a Packet Too Big report: lower the cached path MTU for
 * @daddr/@saddr (looked up via @ifindex; 0 means any interface) to
 * @pmtu.  An existing RTF_CACHE route is updated in place; otherwise
 * a host clone/COW copy of the matching network route is created so
 * the network route itself keeps its MTU.
 */
static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
			     struct net *net, u32 pmtu, int ifindex)
{
	struct rt6_info *rt, *nrt;
	int allfrag = 0;
again:
	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
	if (rt == NULL)
		return;

	if (rt6_check_expired(rt)) {
		/* stale cache entry: delete and retry the lookup */
		ip6_del_rt(rt);
		goto again;
	}

	if (pmtu >= dst_mtu(&rt->dst))
		goto out;

	if (pmtu < IPV6_MIN_MTU) {
		/*
		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
		 * MTU (1280) and a fragment header should always be included
		 * after a node receiving Too Big message reporting PMTU is
		 * less than the IPv6 Minimum Link MTU.
		 */
		pmtu = IPV6_MIN_MTU;
		allfrag = 1;
	}

	/* New mtu received -> path was valid.
	   They are sent only in response to data packets,
	   so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	/* Host route. If it is static, it would be better
	   not to override it, but add new one, so that
	   when cache entry will expire old pmtu
	   would return automatically.
	 */
	if (rt->rt6i_flags & RTF_CACHE) {
		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
		if (allfrag) {
			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
			features |= RTAX_FEATURE_ALLFRAG;
			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
		}
		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
		goto out;
	}

	/* Network route.
	   Two cases are possible:
	   1. It is connected route. Action: COW
	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
	 */
	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, daddr, saddr);
	else
		nrt = rt6_alloc_clone(rt, daddr);

	if (nrt) {
		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
		if (allfrag) {
			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
			features |= RTAX_FEATURE_ALLFRAG;
			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
		}

		/* According to RFC 1981, detecting PMTU increase shouldn't be
		 * happened within 5 mins, the recommended timer is 10 mins.
		 * Here this route expiration time is set to ip6_rt_mtu_expires
		 * which is 10 mins. After 10 mins the decreased pmtu is expired
		 * and detecting PMTU increase will be automatically happened.
		 */
		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;

		ip6_ins_rt(nrt);
	}
out:
	dst_release(&rt->dst);
}
1695
/*
 * Entry point for ICMPv6 Packet Too Big handling: propagate the new
 * @pmtu for @daddr/@saddr both globally and for the interface that
 * received the message.
 */
void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
			struct net_device *dev, u32 pmtu)
{
	struct net *net = dev_net(dev);

	/*
	 * RFC 1981 states that a node "MUST reduce the size of the packets it
	 * is sending along the path" that caused the Packet Too Big message.
	 * Since it's not possible in the general case to determine which
	 * interface was used to send the original packet, we update the MTU
	 * on the interface that will be used to send future packets. We also
	 * update the MTU on the interface that received the Packet Too Big in
	 * case the original packet was forced out that interface with
	 * SO_BINDTODEVICE or similar. This is the next best thing to the
	 * correct behaviour, which would be to update the MTU on all
	 * interfaces.
	 */
	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
}
1716
1717 /*
1718 * Misc support functions
1719 */
1720
/*
 * Allocate a new rt6_info mirroring @ort: handlers, metrics, device
 * and idev references (with holds taken), gateway and keys.  Expiry
 * and metric are reset and RTF_EXPIRES cleared; the copy is not yet
 * linked into any table.  Returns NULL on allocation failure.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
{
	struct net *net = dev_net(ort->rt6i_dev);
	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);

	if (rt) {
		rt->dst.input = ort->dst.input;
		rt->dst.output = ort->dst.output;

		dst_copy_metrics(&rt->dst, &ort->dst);
		rt->dst.error = ort->dst.error;
		rt->dst.dev = ort->dst.dev;
		if (rt->dst.dev)
			dev_hold(rt->dst.dev);
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);
		rt->dst.lastuse = jiffies;
		rt->rt6i_expires = 0;

		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
		rt->rt6i_table = ort->rt6i_table;
	}
	return rt;
}
1753
1754 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Find the RA route-information route for @prefix/@prefixlen learned
 * via @gwaddr on interface @ifindex in the RT6_TABLE_INFO table.
 * Returns the route with a reference held, or NULL.
 *
 * NOTE(review): this takes the write lock although it only reads the
 * tree — presumably to serialise against concurrent RA processing;
 * confirm before relaxing to read_lock_bh().
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex)
{
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, RT6_TABLE_INFO);
	if (table == NULL)
		return NULL;

	write_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
	if (!fn)
		goto out;

	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		dst_hold(&rt->dst);
		break;
	}
out:
	write_unlock_bh(&table->tb6_lock);
	return rt;
}
1786
rt6_add_route_info(struct net * net,struct in6_addr * prefix,int prefixlen,struct in6_addr * gwaddr,int ifindex,unsigned pref)1787 static struct rt6_info *rt6_add_route_info(struct net *net,
1788 struct in6_addr *prefix, int prefixlen,
1789 struct in6_addr *gwaddr, int ifindex,
1790 unsigned pref)
1791 {
1792 struct fib6_config cfg = {
1793 .fc_table = RT6_TABLE_INFO,
1794 .fc_metric = IP6_RT_PRIO_USER,
1795 .fc_ifindex = ifindex,
1796 .fc_dst_len = prefixlen,
1797 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1798 RTF_UP | RTF_PREF(pref),
1799 .fc_nlinfo.pid = 0,
1800 .fc_nlinfo.nlh = NULL,
1801 .fc_nlinfo.nl_net = net,
1802 };
1803
1804 ipv6_addr_copy(&cfg.fc_dst, prefix);
1805 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1806
1807 /* We should treat it as a default route if prefix length is 0. */
1808 if (!prefixlen)
1809 cfg.fc_flags |= RTF_DEFAULT;
1810
1811 ip6_route_add(&cfg);
1812
1813 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1814 }
1815 #endif
1816
/*
 * Find the RA-learned default route via router @addr on @dev in the
 * default table.  Returns the route with a reference held, or NULL.
 *
 * NOTE(review): uses the write lock for a read-only scan — presumably
 * to serialise with RA processing; confirm before changing.
 */
struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
	if (table == NULL)
		return NULL;

	write_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
		if (dev == rt->rt6i_dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		dst_hold(&rt->dst);
	write_unlock_bh(&table->tb6_lock);
	return rt;
}
1838
rt6_add_dflt_router(struct in6_addr * gwaddr,struct net_device * dev,unsigned int pref)1839 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1840 struct net_device *dev,
1841 unsigned int pref)
1842 {
1843 struct fib6_config cfg = {
1844 .fc_table = RT6_TABLE_DFLT,
1845 .fc_metric = IP6_RT_PRIO_USER,
1846 .fc_ifindex = dev->ifindex,
1847 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1848 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1849 .fc_nlinfo.pid = 0,
1850 .fc_nlinfo.nlh = NULL,
1851 .fc_nlinfo.nl_net = dev_net(dev),
1852 };
1853
1854 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1855
1856 ip6_route_add(&cfg);
1857
1858 return rt6_get_dflt_router(gwaddr, dev);
1859 }
1860
/*
 * Delete every route flagged RTF_DEFAULT or RTF_ADDRCONF from the
 * default table.  The table lock cannot be held across ip6_del_rt(),
 * so the scan restarts from the head after every deletion.
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
	table = fib6_get_table(net, RT6_TABLE_DFLT);
	if (table == NULL)
		return;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
			/* hold for ip6_del_rt(), which consumes the ref */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);
}
1883
rtmsg_to_fib6_config(struct net * net,struct in6_rtmsg * rtmsg,struct fib6_config * cfg)1884 static void rtmsg_to_fib6_config(struct net *net,
1885 struct in6_rtmsg *rtmsg,
1886 struct fib6_config *cfg)
1887 {
1888 memset(cfg, 0, sizeof(*cfg));
1889
1890 cfg->fc_table = RT6_TABLE_MAIN;
1891 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1892 cfg->fc_metric = rtmsg->rtmsg_metric;
1893 cfg->fc_expires = rtmsg->rtmsg_info;
1894 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1895 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1896 cfg->fc_flags = rtmsg->rtmsg_flags;
1897
1898 cfg->fc_nlinfo.nl_net = net;
1899
1900 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1901 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1902 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1903 }
1904
/*
 * Legacy SIOCADDRT/SIOCDELRT ioctl handler.  Requires CAP_NET_ADMIN;
 * copies the in6_rtmsg from userspace, converts it to a fib6_config
 * and performs the add/delete under the RTNL.  Returns -EINVAL for
 * any other command.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct in6_rtmsg rtmsg;
	struct fib6_config cfg;
	int err;

	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
		return -EINVAL;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (copy_from_user(&rtmsg, arg, sizeof(struct in6_rtmsg)))
		return -EFAULT;

	rtmsg_to_fib6_config(net, &rtmsg, &cfg);

	rtnl_lock();
	if (cmd == SIOCADDRT)
		err = ip6_route_add(&cfg);
	else
		err = ip6_route_del(&cfg);
	rtnl_unlock();

	return err;
}
1941
1942 /*
1943 * Drop the packet on the floor
1944 */
1945
/*
 * Common drop path for blackhole/reject routes: bump the relevant
 * SNMP counter, send an ICMPv6 Destination Unreachable with @code and
 * free the skb.  @ipstats_mib_noroutes selects the input or output
 * "no route" counter.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination counts as address error */
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
1968
ip6_pkt_discard(struct sk_buff * skb)1969 static int ip6_pkt_discard(struct sk_buff *skb)
1970 {
1971 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1972 }
1973
ip6_pkt_discard_out(struct sk_buff * skb)1974 static int ip6_pkt_discard_out(struct sk_buff *skb)
1975 {
1976 skb->dev = skb_dst(skb)->dev;
1977 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1978 }
1979
1980 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1981
ip6_pkt_prohibit(struct sk_buff * skb)1982 static int ip6_pkt_prohibit(struct sk_buff *skb)
1983 {
1984 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1985 }
1986
ip6_pkt_prohibit_out(struct sk_buff * skb)1987 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1988 {
1989 skb->dev = skb_dst(skb)->dev;
1990 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1991 }
1992
1993 #endif
1994
1995 /*
1996 * Allocate a dst for local (unicast / anycast) address.
1997 */
1998
/*
 * Allocate the rt6_info backing a local unicast (@anycast == 0) or
 * anycast (@anycast != 0) address on @idev: a /128 host route through
 * the loopback device, destined for the RT6_TABLE_LOCAL table.
 * Returns the route with refcnt 1, or an ERR_PTR on allocation /
 * neighbour-lookup failure.
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    int anycast)
{
	struct net *net = dev_net(idev->dev);
	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
	struct neighbour *neigh;

	if (rt == NULL) {
		if (net_ratelimit())
			pr_warning("IPv6:  Maximum number of routes reached,"
				   " consider increasing route/max_size.\n");
		return ERR_PTR(-ENOMEM);
	}

	/* the route owns references on loopback dev and @idev;
	 * dst_destroy drops them later */
	dev_hold(net->loopback_dev);
	in6_dev_hold(idev);

	rt->dst.flags = DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_dev = net->loopback_dev;
	rt->rt6i_idev = idev;
	rt->dst.obsolete = -1;

	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;
	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
	if (IS_ERR(neigh)) {
		dst_free(&rt->dst);

		return ERR_CAST(neigh);
	}
	rt->rt6i_nexthop = neigh;

	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
	rt->rt6i_dst.plen = 128;
	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);

	atomic_set(&rt->dst.__refcnt, 1);

	return rt;
}
2045
/* Callback argument for fib6_ifdown(): the device going down (NULL
 * means "all devices") and its network namespace. */
struct arg_dev_net {
	struct net_device *dev;
	struct net *net;
};
2050
fib6_ifdown(struct rt6_info * rt,void * arg)2051 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2052 {
2053 const struct arg_dev_net *adn = arg;
2054 const struct net_device *dev = adn->dev;
2055
2056 if ((rt->rt6i_dev == dev || dev == NULL) &&
2057 rt != adn->net->ipv6.ip6_null_entry) {
2058 RT6_TRACE("deleted by ifdown %p\n", rt);
2059 return -1;
2060 }
2061 return 0;
2062 }
2063
rt6_ifdown(struct net * net,struct net_device * dev)2064 void rt6_ifdown(struct net *net, struct net_device *dev)
2065 {
2066 struct arg_dev_net adn = {
2067 .dev = dev,
2068 .net = net,
2069 };
2070
2071 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2072 icmp6_clean_all(fib6_ifdown, &adn);
2073 }
2074
/* Callback argument for rt6_mtu_change_route(): the device whose MTU
 * changed and its new value. */
struct rt6_mtu_change_arg
{
	struct net_device *dev;
	unsigned mtu;
};
2080
/*
 * fib6_clean_all() callback invoked for every route when a device MTU
 * changes: update the route's RTAX_MTU metric when appropriate.
 * Always returns 0 (never deletes the route).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (idev == NULL)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discouvery.
	 */
	if (rt->rt6i_dev == arg->dev &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
	    (dst_mtu(&rt->dst) >= arg->mtu ||
	     (dst_mtu(&rt->dst) < arg->mtu &&
	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
	}
	return 0;
}
2119
rt6_mtu_change(struct net_device * dev,unsigned mtu)2120 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2121 {
2122 struct rt6_mtu_change_arg arg = {
2123 .dev = dev,
2124 .mtu = mtu,
2125 };
2126
2127 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2128 }
2129
/* Netlink attribute policy for RTM_NEWROUTE/RTM_DELROUTE requests. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
};
2137
/*
 * Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config.  Returns 0 on success or a negative errno (e.g.
 * -EINVAL for truncated RTA_DST/RTA_SRC payloads).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;

	if (rtm->rtm_type == RTN_UNREACHABLE)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* only rtm_dst_len bits of the address are supplied */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		/* only rtm_src_len bits of the address are supplied */
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		/* RTA_TABLE overrides the 8-bit rtm_table field */
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	err = 0;
errout:
	return err;
}
2210
inet6_rtm_delroute(struct sk_buff * skb,struct nlmsghdr * nlh,void * arg)2211 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2212 {
2213 struct fib6_config cfg;
2214 int err;
2215
2216 err = rtm_to_fib6_config(skb, nlh, &cfg);
2217 if (err < 0)
2218 return err;
2219
2220 return ip6_route_del(&cfg);
2221 }
2222
inet6_rtm_newroute(struct sk_buff * skb,struct nlmsghdr * nlh,void * arg)2223 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2224 {
2225 struct fib6_config cfg;
2226 int err;
2227
2228 err = rtm_to_fib6_config(skb, nlh, &cfg);
2229 if (err < 0)
2230 return err;
2231
2232 return ip6_route_add(&cfg);
2233 }
2234
rt6_nlmsg_size(void)2235 static inline size_t rt6_nlmsg_size(void)
2236 {
2237 return NLMSG_ALIGN(sizeof(struct rtmsg))
2238 + nla_total_size(16) /* RTA_SRC */
2239 + nla_total_size(16) /* RTA_DST */
2240 + nla_total_size(16) /* RTA_GATEWAY */
2241 + nla_total_size(16) /* RTA_PREFSRC */
2242 + nla_total_size(4) /* RTA_TABLE */
2243 + nla_total_size(4) /* RTA_IIF */
2244 + nla_total_size(4) /* RTA_OIF */
2245 + nla_total_size(4) /* RTA_PRIORITY */
2246 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2247 + nla_total_size(sizeof(struct rta_cacheinfo));
2248 }
2249
rt6_fill_node(struct net * net,struct sk_buff * skb,struct rt6_info * rt,struct in6_addr * dst,struct in6_addr * src,int iif,int type,u32 pid,u32 seq,int prefix,int nowait,unsigned int flags)2250 static int rt6_fill_node(struct net *net,
2251 struct sk_buff *skb, struct rt6_info *rt,
2252 struct in6_addr *dst, struct in6_addr *src,
2253 int iif, int type, u32 pid, u32 seq,
2254 int prefix, int nowait, unsigned int flags)
2255 {
2256 struct rtmsg *rtm;
2257 struct nlmsghdr *nlh;
2258 long expires;
2259 u32 table;
2260
2261 if (prefix) { /* user wants prefix routes only */
2262 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2263 /* success since this is not a prefix route */
2264 return 1;
2265 }
2266 }
2267
2268 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2269 if (nlh == NULL)
2270 return -EMSGSIZE;
2271
2272 rtm = nlmsg_data(nlh);
2273 rtm->rtm_family = AF_INET6;
2274 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2275 rtm->rtm_src_len = rt->rt6i_src.plen;
2276 rtm->rtm_tos = 0;
2277 if (rt->rt6i_table)
2278 table = rt->rt6i_table->tb6_id;
2279 else
2280 table = RT6_TABLE_UNSPEC;
2281 rtm->rtm_table = table;
2282 NLA_PUT_U32(skb, RTA_TABLE, table);
2283 if (rt->rt6i_flags&RTF_REJECT)
2284 rtm->rtm_type = RTN_UNREACHABLE;
2285 else if (rt->rt6i_flags&RTF_LOCAL)
2286 rtm->rtm_type = RTN_LOCAL;
2287 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2288 rtm->rtm_type = RTN_LOCAL;
2289 else
2290 rtm->rtm_type = RTN_UNICAST;
2291 rtm->rtm_flags = 0;
2292 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2293 rtm->rtm_protocol = rt->rt6i_protocol;
2294 if (rt->rt6i_flags&RTF_DYNAMIC)
2295 rtm->rtm_protocol = RTPROT_REDIRECT;
2296 else if (rt->rt6i_flags & RTF_ADDRCONF)
2297 rtm->rtm_protocol = RTPROT_KERNEL;
2298 else if (rt->rt6i_flags&RTF_DEFAULT)
2299 rtm->rtm_protocol = RTPROT_RA;
2300
2301 if (rt->rt6i_flags&RTF_CACHE)
2302 rtm->rtm_flags |= RTM_F_CLONED;
2303
2304 if (dst) {
2305 NLA_PUT(skb, RTA_DST, 16, dst);
2306 rtm->rtm_dst_len = 128;
2307 } else if (rtm->rtm_dst_len)
2308 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2309 #ifdef CONFIG_IPV6_SUBTREES
2310 if (src) {
2311 NLA_PUT(skb, RTA_SRC, 16, src);
2312 rtm->rtm_src_len = 128;
2313 } else if (rtm->rtm_src_len)
2314 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2315 #endif
2316 if (iif) {
2317 #ifdef CONFIG_IPV6_MROUTE
2318 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2319 int err = ip6mr_get_route(net, skb, rtm, nowait);
2320 if (err <= 0) {
2321 if (!nowait) {
2322 if (err == 0)
2323 return 0;
2324 goto nla_put_failure;
2325 } else {
2326 if (err == -EMSGSIZE)
2327 goto nla_put_failure;
2328 }
2329 }
2330 } else
2331 #endif
2332 NLA_PUT_U32(skb, RTA_IIF, iif);
2333 } else if (dst) {
2334 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2335 struct in6_addr saddr_buf;
2336 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2337 dst, 0, &saddr_buf) == 0)
2338 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2339 }
2340
2341 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2342 goto nla_put_failure;
2343
2344 if (rt->dst.neighbour)
2345 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2346
2347 if (rt->dst.dev)
2348 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2349
2350 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2351
2352 if (!(rt->rt6i_flags & RTF_EXPIRES))
2353 expires = 0;
2354 else if (rt->rt6i_expires - jiffies < INT_MAX)
2355 expires = rt->rt6i_expires - jiffies;
2356 else
2357 expires = INT_MAX;
2358
2359 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2360 expires, rt->dst.error) < 0)
2361 goto nla_put_failure;
2362
2363 return nlmsg_end(skb, nlh);
2364
2365 nla_put_failure:
2366 nlmsg_cancel(skb, nlh);
2367 return -EMSGSIZE;
2368 }
2369
rt6_dump_route(struct rt6_info * rt,void * p_arg)2370 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2371 {
2372 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2373 int prefix;
2374
2375 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2376 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2377 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2378 } else
2379 prefix = 0;
2380
2381 return rt6_fill_node(arg->net,
2382 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2383 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2384 prefix, 0, NLM_F_MULTI);
2385 }
2386
inet6_rtm_getroute(struct sk_buff * in_skb,struct nlmsghdr * nlh,void * arg)2387 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2388 {
2389 struct net *net = sock_net(in_skb->sk);
2390 struct nlattr *tb[RTA_MAX+1];
2391 struct rt6_info *rt;
2392 struct sk_buff *skb;
2393 struct rtmsg *rtm;
2394 struct flowi6 fl6;
2395 int err, iif = 0;
2396
2397 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2398 if (err < 0)
2399 goto errout;
2400
2401 err = -EINVAL;
2402 memset(&fl6, 0, sizeof(fl6));
2403
2404 if (tb[RTA_SRC]) {
2405 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2406 goto errout;
2407
2408 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2409 }
2410
2411 if (tb[RTA_DST]) {
2412 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2413 goto errout;
2414
2415 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2416 }
2417
2418 if (tb[RTA_IIF])
2419 iif = nla_get_u32(tb[RTA_IIF]);
2420
2421 if (tb[RTA_OIF])
2422 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2423
2424 if (iif) {
2425 struct net_device *dev;
2426 dev = __dev_get_by_index(net, iif);
2427 if (!dev) {
2428 err = -ENODEV;
2429 goto errout;
2430 }
2431 }
2432
2433 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2434 if (skb == NULL) {
2435 err = -ENOBUFS;
2436 goto errout;
2437 }
2438
2439 /* Reserve room for dummy headers, this skb can pass
2440 through good chunk of routing engine.
2441 */
2442 skb_reset_mac_header(skb);
2443 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2444
2445 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2446 skb_dst_set(skb, &rt->dst);
2447
2448 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2449 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2450 nlh->nlmsg_seq, 0, 0, 0);
2451 if (err < 0) {
2452 kfree_skb(skb);
2453 goto errout;
2454 }
2455
2456 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2457 errout:
2458 return err;
2459 }
2460
inet6_rt_notify(int event,struct rt6_info * rt,struct nl_info * info)2461 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2462 {
2463 struct sk_buff *skb;
2464 struct net *net = info->nl_net;
2465 u32 seq;
2466 int err;
2467
2468 err = -ENOBUFS;
2469 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2470
2471 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2472 if (skb == NULL)
2473 goto errout;
2474
2475 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2476 event, info->pid, seq, 0, 0, 0);
2477 if (err < 0) {
2478 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2479 WARN_ON(err == -EMSGSIZE);
2480 kfree_skb(skb);
2481 goto errout;
2482 }
2483 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2484 info->nlh, gfp_any());
2485 return;
2486 errout:
2487 if (err < 0)
2488 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2489 }
2490
ip6_route_dev_notify(struct notifier_block * this,unsigned long event,void * data)2491 static int ip6_route_dev_notify(struct notifier_block *this,
2492 unsigned long event, void *data)
2493 {
2494 struct net_device *dev = (struct net_device *)data;
2495 struct net *net = dev_net(dev);
2496
2497 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2498 net->ipv6.ip6_null_entry->dst.dev = dev;
2499 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2500 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2501 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2502 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2503 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2504 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2505 #endif
2506 }
2507
2508 return NOTIFY_OK;
2509 }
2510
2511 /*
2512 * /proc
2513 */
2514
2515 #ifdef CONFIG_PROC_FS
2516
/* Argument block for the old read_proc-style /proc interface.
 * NOTE(review): nothing in this file references it any more (the
 * /proc files below use seq_file) -- looks like a removal candidate;
 * confirm no out-of-file user exists.
 */
struct rt6_proc_arg
{
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2525
rt6_info_route(struct rt6_info * rt,void * p_arg)2526 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2527 {
2528 struct seq_file *m = p_arg;
2529
2530 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2531
2532 #ifdef CONFIG_IPV6_SUBTREES
2533 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2534 #else
2535 seq_puts(m, "00000000000000000000000000000000 00 ");
2536 #endif
2537
2538 if (rt->rt6i_nexthop) {
2539 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2540 } else {
2541 seq_puts(m, "00000000000000000000000000000000");
2542 }
2543 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2544 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2545 rt->dst.__use, rt->rt6i_flags,
2546 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2547 return 0;
2548 }
2549
ipv6_route_show(struct seq_file * m,void * v)2550 static int ipv6_route_show(struct seq_file *m, void *v)
2551 {
2552 struct net *net = (struct net *)m->private;
2553 fib6_clean_all(net, rt6_info_route, 0, m);
2554 return 0;
2555 }
2556
ipv6_route_open(struct inode * inode,struct file * file)2557 static int ipv6_route_open(struct inode *inode, struct file *file)
2558 {
2559 return single_open_net(inode, file, ipv6_route_show);
2560 }
2561
/* File operations for /proc/net/ipv6_route (seq_file based). */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
2569
rt6_stats_seq_show(struct seq_file * seq,void * v)2570 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2571 {
2572 struct net *net = (struct net *)seq->private;
2573 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2574 net->ipv6.rt6_stats->fib_nodes,
2575 net->ipv6.rt6_stats->fib_route_nodes,
2576 net->ipv6.rt6_stats->fib_rt_alloc,
2577 net->ipv6.rt6_stats->fib_rt_entries,
2578 net->ipv6.rt6_stats->fib_rt_cache,
2579 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2580 net->ipv6.rt6_stats->fib_discarded_routes);
2581
2582 return 0;
2583 }
2584
rt6_stats_seq_open(struct inode * inode,struct file * file)2585 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2586 {
2587 return single_open_net(inode, file, rt6_stats_seq_show);
2588 }
2589
/* File operations for /proc/net/rt6_stats (seq_file based). */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
2597 #endif /* CONFIG_PROC_FS */
2598
2599 #ifdef CONFIG_SYSCTL
2600
2601 static
ipv6_sysctl_rtcache_flush(ctl_table * ctl,int write,void __user * buffer,size_t * lenp,loff_t * ppos)2602 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2603 void __user *buffer, size_t *lenp, loff_t *ppos)
2604 {
2605 struct net *net;
2606 int delay;
2607 if (!write)
2608 return -EINVAL;
2609
2610 net = (struct net *)ctl->extra1;
2611 delay = net->ipv6.sysctl.flush_delay;
2612 proc_dointvec(ctl, write, buffer, lenp, ppos);
2613 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2614 return 0;
2615 }
2616
/* Template for the per-netns net.ipv6.route.* sysctl table.
 * The entry ORDER matters: ipv6_route_sysctl_init() patches the .data
 * pointers by numeric index (table[0]..table[9]); keep both in sync
 * when adding or reordering entries.
 */
ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",	/* write-only trigger, see ipv6_sysctl_rtcache_flush() */
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, exposed in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
2690
ipv6_route_sysctl_init(struct net * net)2691 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2692 {
2693 struct ctl_table *table;
2694
2695 table = kmemdup(ipv6_route_table_template,
2696 sizeof(ipv6_route_table_template),
2697 GFP_KERNEL);
2698
2699 if (table) {
2700 table[0].data = &net->ipv6.sysctl.flush_delay;
2701 table[0].extra1 = net;
2702 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2703 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2704 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2705 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2706 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2707 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2708 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2709 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2710 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2711 }
2712
2713 return table;
2714 }
2715 #endif
2716
ip6_route_net_init(struct net * net)2717 static int __net_init ip6_route_net_init(struct net *net)
2718 {
2719 int ret = -ENOMEM;
2720
2721 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2722 sizeof(net->ipv6.ip6_dst_ops));
2723
2724 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2725 goto out_ip6_dst_ops;
2726
2727 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2728 sizeof(*net->ipv6.ip6_null_entry),
2729 GFP_KERNEL);
2730 if (!net->ipv6.ip6_null_entry)
2731 goto out_ip6_dst_entries;
2732 net->ipv6.ip6_null_entry->dst.path =
2733 (struct dst_entry *)net->ipv6.ip6_null_entry;
2734 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2735 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2736 ip6_template_metrics, true);
2737
2738 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2739 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2740 sizeof(*net->ipv6.ip6_prohibit_entry),
2741 GFP_KERNEL);
2742 if (!net->ipv6.ip6_prohibit_entry)
2743 goto out_ip6_null_entry;
2744 net->ipv6.ip6_prohibit_entry->dst.path =
2745 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2746 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2747 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2748 ip6_template_metrics, true);
2749
2750 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2751 sizeof(*net->ipv6.ip6_blk_hole_entry),
2752 GFP_KERNEL);
2753 if (!net->ipv6.ip6_blk_hole_entry)
2754 goto out_ip6_prohibit_entry;
2755 net->ipv6.ip6_blk_hole_entry->dst.path =
2756 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2757 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2758 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2759 ip6_template_metrics, true);
2760 #endif
2761
2762 net->ipv6.sysctl.flush_delay = 0;
2763 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2764 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2765 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2766 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2767 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2768 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2769 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2770
2771 #ifdef CONFIG_PROC_FS
2772 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2773 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2774 #endif
2775 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2776
2777 ret = 0;
2778 out:
2779 return ret;
2780
2781 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2782 out_ip6_prohibit_entry:
2783 kfree(net->ipv6.ip6_prohibit_entry);
2784 out_ip6_null_entry:
2785 kfree(net->ipv6.ip6_null_entry);
2786 #endif
2787 out_ip6_dst_entries:
2788 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2789 out_ip6_dst_ops:
2790 goto out;
2791 }
2792
ip6_route_net_exit(struct net * net)2793 static void __net_exit ip6_route_net_exit(struct net *net)
2794 {
2795 #ifdef CONFIG_PROC_FS
2796 proc_net_remove(net, "ipv6_route");
2797 proc_net_remove(net, "rt6_stats");
2798 #endif
2799 kfree(net->ipv6.ip6_null_entry);
2800 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2801 kfree(net->ipv6.ip6_prohibit_entry);
2802 kfree(net->ipv6.ip6_blk_hole_entry);
2803 #endif
2804 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2805 }
2806
/* Per-network-namespace init/exit hooks for the IPv6 routing core. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
2811
/* Netdevice notifier: wires the loopback device into the special
 * route entries of newly created namespaces (see ip6_route_dev_notify).
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};
2816
ip6_route_init(void)2817 int __init ip6_route_init(void)
2818 {
2819 int ret;
2820
2821 ret = -ENOMEM;
2822 ip6_dst_ops_template.kmem_cachep =
2823 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2824 SLAB_HWCACHE_ALIGN, NULL);
2825 if (!ip6_dst_ops_template.kmem_cachep)
2826 goto out;
2827
2828 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2829 if (ret)
2830 goto out_kmem_cache;
2831
2832 ret = register_pernet_subsys(&ip6_route_net_ops);
2833 if (ret)
2834 goto out_dst_entries;
2835
2836 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2837
2838 /* Registering of the loopback is done before this portion of code,
2839 * the loopback reference in rt6_info will not be taken, do it
2840 * manually for init_net */
2841 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2842 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2843 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2844 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2845 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2846 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2847 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2848 #endif
2849 ret = fib6_init();
2850 if (ret)
2851 goto out_register_subsys;
2852
2853 ret = xfrm6_init();
2854 if (ret)
2855 goto out_fib6_init;
2856
2857 ret = fib6_rules_init();
2858 if (ret)
2859 goto xfrm6_init;
2860
2861 ret = -ENOBUFS;
2862 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2863 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2864 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2865 goto fib6_rules_init;
2866
2867 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2868 if (ret)
2869 goto fib6_rules_init;
2870
2871 out:
2872 return ret;
2873
2874 fib6_rules_init:
2875 fib6_rules_cleanup();
2876 xfrm6_init:
2877 xfrm6_fini();
2878 out_fib6_init:
2879 fib6_gc_cleanup();
2880 out_register_subsys:
2881 unregister_pernet_subsys(&ip6_route_net_ops);
2882 out_dst_entries:
2883 dst_entries_destroy(&ip6_dst_blackhole_ops);
2884 out_kmem_cache:
2885 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2886 goto out;
2887 }
2888
ip6_route_cleanup(void)2889 void ip6_route_cleanup(void)
2890 {
2891 unregister_netdevice_notifier(&ip6_route_dev_notifier);
2892 fib6_rules_cleanup();
2893 xfrm6_fini();
2894 fib6_gc_cleanup();
2895 unregister_pernet_subsys(&ip6_route_net_ops);
2896 dst_entries_destroy(&ip6_dst_blackhole_ops);
2897 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2898 }
2899