/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/inet_ecn.h>

#ifdef CONFIG_IPV6
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is the best
   solution, but it requires maintaining a new variable in ALL
   skbs, even when no tunneling is used.

   Current solution: the t->recursion lock breaks dead loops. It looks
   like the dev->tbusy flag, but I preferred a new variable, because
   the semantics are different. One day, when hard_start_xmit
   becomes multithreaded, we will have to use skb->encapsulation.



   2. Networking dead loops would not kill routers, but they would
   really kill the network. The IP hop limit plays the role of
   "t->recursion" in this case, if we copy it from the packet being
   encapsulated to the upper header. It is a very good solution,
   but it introduces two problems:

   - Routing protocols using packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and the traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least those in my
     neighbourhood) return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner
   encapsulation made by our node. This is difficult or even impossible,
   especially taking fragmentation into account. In short, it is no
   solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   quickly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us; we did
   all we could. Even if it is your gated that injected the
   fatal route to the network, even if it were you who configured
   the fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctls etc.)
   into a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */

static int ipgre_tunnel_init(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

static struct net_device ipgre_fb_tunnel_dev = {
	"gre0", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init,
};

static struct ip_tunnel ipgre_fb_tunnel = {
	NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre0", }
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched to a configured keyless tunnel,
   will match the fallback tunnel.
 */

#define HASH_SIZE	16
#define HASH(addr)	((addr^(addr>>4))&0xF)
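
/* A worked example of the hash (the address is chosen purely for
   illustration): for addr = 0xC0A80101 (192.168.1.1), the two low
   nibbles are 0x1 and 0x0, so HASH(addr) = (0x1 ^ 0x0) & 0xF = 1,
   i.e. bucket 1 of 16. Only the low byte of the address participates;
   with tables this small that is good enough. */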

static struct ip_tunnel *tunnels[4][HASH_SIZE];

#define tunnels_r_l	(tunnels[3])
#define tunnels_r	(tunnels[2])
#define tunnels_l	(tunnels[1])
#define tunnels_wc	(tunnels[0])

static rwlock_t ipgre_lock = RW_LOCK_UNLOCKED;

/* Given src, dst and key, find the appropriate tunnel for input. */

static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
{
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t;

	for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		    (local == t->parms.iph.daddr && MULTICAST(local))) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
			return t;
	}
	if (ipgre_fb_tunnel_dev.flags&IFF_UP)
		return &ipgre_fb_tunnel;
	return NULL;
}

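/* Select the hash chain where a configured tunnel lives. The prio
   index (0..3) mirrors the lookup order above: bit 0 is set when a
   local address is configured, bit 1 when a unicast remote address is,
   so link/unlink and lookup always agree on the bucket. */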
static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
{
	u32 remote = t->parms.iph.daddr;
	u32 local = t->parms.iph.saddr;
	u32 key = t->parms.i_key;
	unsigned h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !MULTICAST(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &tunnels[prio][h];
}

static void ipgre_tunnel_link(struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}

static void ipgre_tunnel_unlink(struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}

static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
{
	u32 remote = parms->iph.daddr;
	u32 local = parms->iph.saddr;
	u32 key = parms->i_key;
	struct ip_tunnel *t, **tp, *nt;
	struct net_device *dev;
	unsigned h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !MULTICAST(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}
	for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (key == t->parms.i_key)
				return t;
		}
	}
	if (!create)
		return NULL;

	MOD_INC_USE_COUNT;
	dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL);
	if (dev == NULL) {
		MOD_DEC_USE_COUNT;
		return NULL;
	}
	memset(dev, 0, sizeof(*dev) + sizeof(*t));
	dev->priv = (void*)(dev+1);
	nt = (struct ip_tunnel*)dev->priv;
	nt->dev = dev;
	dev->init = ipgre_tunnel_init;
	dev->features |= NETIF_F_DYNALLOC;
	memcpy(&nt->parms, parms, sizeof(*parms));
	nt->parms.name[IFNAMSIZ-1] = '\0';
	strcpy(dev->name, nt->parms.name);
	if (dev->name[0] == 0) {
		int i;
		for (i=1; i<100; i++) {
			sprintf(dev->name, "gre%d", i);
			if (__dev_get_by_name(dev->name) == NULL)
				break;
		}
		if (i==100)
			goto failed;
		memcpy(nt->parms.name, dev->name, IFNAMSIZ);
	}
	if (register_netdevice(dev) < 0)
		goto failed;

	dev_hold(dev);
	ipgre_tunnel_link(nt);
	/* Do not decrement MOD_USE_COUNT here. */
	return nt;

failed:
	kfree(dev);
	MOD_DEC_USE_COUNT;
	return NULL;
}

static void ipgre_tunnel_destructor(struct net_device *dev)
{
	if (dev != &ipgre_fb_tunnel_dev) {
		MOD_DEC_USE_COUNT;
	}
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
	dev_put(dev);
}


void ipgre_err(struct sk_buff *skb, u32 info)
{
#ifndef I_WISH_WORLD_WERE_PERFECT

/* It is not :-( All the routers (except for Linux) return only
   8 bytes of packet payload. This means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco's "wise men" put the GRE key in the third word
   of the GRE header. That makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksums enabled. Tell them
   "thank you".

   Well, I wonder: rfc1812 was written by a Cisco employee;
   why the hell do these idiots break the standards established
   by themselves???
 */

	struct iphdr *iph = (struct iphdr*)skb->data;
	u16 *p = (u16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct ip_tunnel *t;
	u16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes were returned, a keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by the IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH;
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

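	/* iph here is the header of the packet we originally sent, echoed
	   back inside the ICMP error, so its daddr is the tunnel remote
	   and its saddr the tunnel local endpoint. */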
	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
	if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
#else
	struct iphdr *iph = (struct iphdr*)skb->data;
	struct iphdr *eiph;
	u16 *p = (u16*)(skb->data+(iph->ihl<<2));
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	int rel_type = 0;
	int rel_code = 0;
	int rel_info = 0;
	u16 flags;
	int grehlen = (iph->ihl<<2) + 4;
	struct sk_buff *skb2;
	struct rtable *rt;

	if (p[1] != htons(ETH_P_IP))
		return;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_CSUM)
			grehlen += 4;
		if (flags&GRE_KEY)
			grehlen += 4;
		if (flags&GRE_SEQ)
			grehlen += 4;
	}
	if (skb->len < grehlen + sizeof(struct iphdr))
		return;
	eiph = (struct iphdr*)(skb->data + grehlen);

	switch (type) {
	default:
		return;
	case ICMP_PARAMETERPROB:
		if (skb->h.icmph->un.gateway < (iph->ihl<<2))
			return;

		/* So... This guy found something strange INSIDE the
		   encapsulated packet. Well, he is a fool, but what
		   can we do?
		 */
		rel_type = ICMP_PARAMETERPROB;
		rel_info = skb->h.icmph->un.gateway - grehlen;
		break;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* And it is the only really necessary thing :-) */
			rel_info = ntohs(skb->h.icmph->un.frag.mtu);
			if (rel_info < grehlen+68)
				return;
			rel_info -= grehlen;
			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
			if (rel_info > ntohs(eiph->tot_len))
				return;
			break;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH;
			   I believe it is just ether pollution. --ANK
			 */
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	/* Prepare a fake skb to feed to icmp_send */
	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (skb2 == NULL)
		return;
	dst_release(skb2->dst);
	skb2->dst = NULL;
	skb_pull(skb2, skb->data - (u8*)eiph);
	skb2->nh.raw = skb2->data;

	/* Try to guess the incoming interface */
	if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) {
		kfree_skb(skb2);
		return;
	}
	skb2->dev = rt->u.dst.dev;

	/* route the "incoming" packet */
	if (rt->rt_flags&RTCF_LOCAL) {
		ip_rt_put(rt);
		rt = NULL;
		if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) ||
		    rt->u.dst.dev->type != ARPHRD_IPGRE) {
			ip_rt_put(rt);
			kfree_skb(skb2);
			return;
		}
	} else {
		ip_rt_put(rt);
		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
		    skb2->dst->dev->type != ARPHRD_IPGRE) {
			kfree_skb(skb2);
			return;
		}
	}

	/* change mtu on this route */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		if (rel_info > skb2->dst->pmtu) {
			kfree_skb(skb2);
			return;
		}
		skb2->dst->pmtu = rel_info;
		rel_info = htonl(rel_info);
	} else if (type == ICMP_TIME_EXCEEDED) {
		struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
		if (t->parms.iph.ttl) {
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;
		}
	}

	icmp_send(skb2, rel_type, rel_code, rel_info);
	kfree_skb(skb2);
#endif
}

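/* Propagate ECN: if the outer IP header arrived with CE (congestion
   experienced) set, copy the mark into the inner IPv4 or IPv6 header
   so the congestion signal survives decapsulation. */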
static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			if (INET_ECN_is_not_ce(skb->nh.iph->tos))
				IP_ECN_set_ce(skb->nh.iph);
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			if (INET_ECN_is_not_ce(ip6_get_dsfield(skb->nh.ipv6h)))
				IP6_ECN_set_ce(skb->nh.ipv6h);
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ip6_get_dsfield((struct ipv6hdr*)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}

int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8 *h;
	u16 flags;
	u16 csum = 0;
	u32 key = 0;
	u32 seqno = 0;
	struct ip_tunnel *tunnel;
	int offset = 4;
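
	/* RFC 1701 GRE header layout, as parsed below: 2 bytes of flags/
	   version, 2 bytes of protocol type, then optional 4-byte words in
	   this order: checksum+offset (GRE_CSUM), key (GRE_KEY), sequence
	   number (GRE_SEQ). "offset" starts past the mandatory 4 bytes and
	   advances over each optional field that is present. */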

	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = skb->nh.iph;
	h = skb->data;
	flags = *(u16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

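		/* A valid GRE checksum makes the ones'-complement sum of
		   the whole GRE packet fold to zero, so any nonzero csum
		   obtained here is treated as an rx_crc_error below. */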
		if (flags&GRE_CSUM) {
			if (skb->ip_summed == CHECKSUM_HW) {
				csum = (u16)csum_fold(skb->csum);
				if (csum)
					skb->ip_summed = CHECKSUM_NONE;
			}
			if (skb->ip_summed == CHECKSUM_NONE) {
				skb->csum = skb_checksum(skb, 0, skb->len, 0);
				skb->ip_summed = CHECKSUM_HW;
				csum = (u16)csum_fold(skb->csum);
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(u32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(u32*)(h + offset));
			offset += 4;
		}
	}

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
		skb->mac.raw = skb->nh.raw;
		skb->nh.raw = __pskb_pull(skb, offset);
		memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
		if (skb->ip_summed == CHECKSUM_HW)
			skb->csum = csum_sub(skb->csum,
					     csum_partial(skb->mac.raw, skb->nh.raw-skb->mac.raw, 0));
		skb->protocol = *(u16*)(h + 2);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (MULTICAST(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (((struct rtable*)skb->dst)->key.iif == 0)
				goto drop;
			tunnel->stat.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->stat.rx_crc_errors++;
			tunnel->stat.rx_errors++;
			goto drop;
		}
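		/* Enforce in-order delivery when sequencing is configured;
		   the signed 32-bit difference keeps the comparison correct
		   across sequence-number wraparound. */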
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->stat.rx_fifo_errors++;
				tunnel->stat.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}
		tunnel->stat.rx_packets++;
		tunnel->stat.rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		nf_reset(skb);
		ipgre_ecn_decapsulate(iph, skb);
		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return 0;
}

/* Need this wrapper because NF_HOOK takes the function address */
static inline int do_ip_send(struct sk_buff *skb)
{
	return ip_send(skb);
}

static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
	struct net_device_stats *stats = &tunnel->stat;
	struct iphdr *old_iph = skb->nh.iph;
	struct iphdr *tiph;
	u8 tos;
	u16 df;
	struct rtable *rt;		/* Route to the other host */
	struct net_device *tdev;	/* Device to other host */
	struct iphdr *iph;		/* Our new IP header */
	int max_headroom;		/* The extra header space needed */
	int gre_hlen;
	u32 dst;
	int mtu;

	if (tunnel->recursion++) {
		tunnel->stat.collisions++;
		goto tx_error;
	}

	if (dev->hard_header) {
		gre_hlen = 0;
		tiph = (struct iphdr*)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb->dst == NULL) {
			tunnel->stat.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = (struct rtable*)skb->dst;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &skb->nh.ipv6h->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

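	/* An odd TOS on the tunnel means "inherit the inner TOS": the low
	   bit is only a flag and is cleared before the value is used. */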
	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) {
		tunnel->stat.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->u.dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		tunnel->stat.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = rt->u.dst.pmtu - tunnel->hlen;
	else
		mtu = skb->dst ? skb->dst->pmtu : dev->mtu;

	if (skb->protocol == htons(ETH_P_IP)) {
		if (skb->dst && mtu < skb->dst->pmtu && mtu >= 68)
			skb->dst->pmtu = mtu;

		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->pmtu = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = ((tdev->hard_header_len+15)&~15) + gre_hlen;

	if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = skb->nh.iph;
	}

	skb->h.raw = skb->nh.raw;
	skb->nh.raw = skb_push(skb, gre_hlen);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the outer IP and GRE headers.
	 */

	iph = skb->nh.iph;
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->frag_off = df;
	iph->protocol = IPPROTO_GRE;
	iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
		else
			iph->ttl = sysctl_ip_default_ttl;
	}

	((u16*)(iph+1))[0] = tunnel->parms.o_flags;
	((u16*)(iph+1))[1] = skb->protocol;

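	/* The GRE options appear in the order checksum, key, sequence
	   number; ptr starts at the last option word and walks backwards,
	   so the fields are filled in reverse and the checksum is computed
	   last, once every other word is in place. */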
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}

static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;

	MOD_INC_USE_COUNT;

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == &ipgre_fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(&p, 0);
		}
		if (t == NULL)
			t = (struct ip_tunnel*)dev->priv;
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);

		if (dev != &ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
		    t != &ipgre_fb_tunnel) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags = 0;

				t = (struct ip_tunnel*)dev->priv;

				if (MULTICAST(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == &ipgre_fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == &ipgre_fb_tunnel)
				goto done;
			dev = t->dev;
		}
		err = unregister_netdevice(dev);
		break;

	default:
		err = -EINVAL;
	}

done:
	MOD_DEC_USE_COUNT;
	return err;
}

static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
{
	return &(((struct ip_tunnel*)dev->priv)->stat);
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
	if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

#ifdef CONFIG_NET_IPGRE_BROADCAST
/* A nice toy. Unfortunately, useless in real life :-)
   It allows constructing virtual multiprotocol broadcast "LAN"s
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
			void *daddr, void *saddr, unsigned len)
{
	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	u16 *p = (u16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0] = t->parms.o_flags;
	p[1] = htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	if (iph->daddr && !MULTICAST(iph->daddr))
		return t->hlen;

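	/* A negative length tells the caller that the header is incomplete
	   (the destination is not yet known) and must be finished by the
	   neighbour layer before transmission. */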
	return -t->hlen;
}

static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;

	MOD_INC_USE_COUNT;
	if (MULTICAST(t->parms.iph.daddr)) {
		struct rtable *rt;
		if (ip_route_output(&rt, t->parms.iph.daddr,
				    t->parms.iph.saddr, RT_TOS(t->parms.iph.tos),
				    t->parms.link)) {
			MOD_DEC_USE_COUNT;
			return -EADDRNOTAVAIL;
		}
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get(dev) == NULL) {
			MOD_DEC_USE_COUNT;
			return -EADDRNOTAVAIL;
		}
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
	if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev = inetdev_by_index(t->mlink);
		if (in_dev) {
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
			in_dev_put(in_dev);
		}
	}
	MOD_DEC_USE_COUNT;
	return 0;
}

#endif

static void ipgre_tunnel_init_gen(struct net_device *dev)
{
	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;

	dev->uninit = ipgre_tunnel_uninit;
	dev->destructor = ipgre_tunnel_destructor;
	dev->hard_start_xmit = ipgre_tunnel_xmit;
	dev->get_stats = ipgre_tunnel_get_stats;
	dev->do_ioctl = ipgre_tunnel_ioctl;
	dev->change_mtu = ipgre_tunnel_change_mtu;

	dev->type = ARPHRD_IPGRE;
	dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu = 1500 - sizeof(struct iphdr) - 4;
	dev->flags = IFF_NOARP;
	dev->iflink = 0;
	dev->addr_len = 4;
	memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = 1500;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = (struct ip_tunnel*)dev->priv;
	iph = &tunnel->parms.iph;

	ipgre_tunnel_init_gen(dev);

	/* Guess the output device to choose a reasonable mtu and hard_header_len */

	if (iph->daddr) {
		struct rtable *rt;
		if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		dev->flags |= IFF_POINTOPOINT;

#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (MULTICAST(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->hard_header = ipgre_header;
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
		}
#endif
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate the GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
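	/* Account for the outer IP header plus GRE options both in the
	   link-layer header estimate and in the advertised MTU. */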
	dev->hard_header_len = hlen + addend;
	dev->mtu = mtu - addend;
	tunnel->hlen = addend;
	return 0;
}

#ifdef MODULE
static int ipgre_fb_tunnel_open(struct net_device *dev)
{
	MOD_INC_USE_COUNT;
	return 0;
}

static int ipgre_fb_tunnel_close(struct net_device *dev)
{
	MOD_DEC_USE_COUNT;
	return 0;
}
#endif

int __init ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
	struct iphdr *iph;

	ipgre_tunnel_init_gen(dev);
#ifdef MODULE
	dev->open = ipgre_fb_tunnel_open;
	dev->stop = ipgre_fb_tunnel_close;
#endif

	iph = &ipgre_fb_tunnel.parms.iph;
	iph->version = 4;
	iph->protocol = IPPROTO_GRE;
	iph->ihl = 5;
	tunnel->hlen = sizeof(struct iphdr) + 4;

	dev_hold(dev);
	tunnels_wc[0] = &ipgre_fb_tunnel;
	return 0;
}


static struct inet_protocol ipgre_protocol = {
	ipgre_rcv,	/* GRE handler		*/
	ipgre_err,	/* TUNNEL error control	*/
	0,		/* next			*/
	IPPROTO_GRE,	/* protocol ID		*/
	0,		/* copy			*/
	NULL,		/* data			*/
	"GRE"		/* name			*/
};


/*
 *	And now the module code and kernel interface.
 */

#ifdef MODULE
int init_module(void)
#else
int __init ipgre_init(void)
#endif
{
	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	ipgre_fb_tunnel_dev.priv = (void*)&ipgre_fb_tunnel;
	register_netdev(&ipgre_fb_tunnel_dev);
	inet_add_protocol(&ipgre_protocol);
	return 0;
}

#ifdef MODULE

void cleanup_module(void)
{
	if (inet_del_protocol(&ipgre_protocol) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");

	unregister_netdev(&ipgre_fb_tunnel_dev);
}

#endif
MODULE_LICENSE("GPL");