1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Version: $Id: ip_output.c,v 1.99.2.1 2002/03/10 04:26:08 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Richard Underwood
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 *
19 * See ip_input.c for original log
20 *
21 * Fixes:
22 * Alan Cox : Missing nonblock feature in ip_build_xmit.
23 * Mike Kilburn : htons() missing in ip_build_xmit.
24 * Bradford Johnson: Fix faulty handling of some frames when
25 * no route is found.
26 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
27 * (in case if packet not accepted by
28 * output firewall rules)
29 * Mike McLagan : Routing by source
30 * Alexey Kuznetsov: use new route cache
31 * Andi Kleen: Fix broken PMTU recovery and remove
32 * some redundant tests.
33 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
34 * Andi Kleen : Replace ip_reply with ip_send_reply.
35 * Andi Kleen : Split fast and slow ip_build_xmit path
36 * for decreased register pressure on x86
37 * and more readability.
38 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
39 * silently drop skb instead of failing with -EPERM.
40 * Detlev Wengorz : Copy protocol for fragments.
41 */
42
43 #include <asm/uaccess.h>
44 #include <asm/system.h>
45 #include <linux/types.h>
46 #include <linux/kernel.h>
47 #include <linux/sched.h>
48 #include <linux/mm.h>
49 #include <linux/string.h>
50 #include <linux/errno.h>
51 #include <linux/config.h>
52
53 #include <linux/socket.h>
54 #include <linux/sockios.h>
55 #include <linux/in.h>
56 #include <linux/inet.h>
57 #include <linux/netdevice.h>
58 #include <linux/etherdevice.h>
59 #include <linux/proc_fs.h>
60 #include <linux/stat.h>
61 #include <linux/init.h>
62
63 #include <net/snmp.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <net/route.h>
67 #include <net/tcp.h>
68 #include <net/udp.h>
69 #include <linux/skbuff.h>
70 #include <net/sock.h>
71 #include <net/arp.h>
72 #include <net/icmp.h>
73 #include <net/raw.h>
74 #include <net/checksum.h>
75 #include <net/inetpeer.h>
76 #include <linux/igmp.h>
77 #include <linux/netfilter_ipv4.h>
78 #include <linux/mroute.h>
79 #include <linux/netlink.h>
80
81 /*
82 * Shall we try to damage output packets if routing dev changes?
83 */
84
85 int sysctl_ip_dynaddr = 0;
86 int sysctl_ip_default_ttl = IPDEFTTL;
87
88 /* Generate a checksum for an outgoing IP datagram. */
89 __inline__ void ip_send_check(struct iphdr *iph)
90 {
91 iph->check = 0;
92 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
93 }
94
95 /* dev_loopback_xmit for use with netfilter. */
96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
97 {
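/* Feed a clone of a locally generated multicast/broadcast frame back
 * through the receive path so local listeners see it.  The checksum is
 * marked as already verified because the data never left this host.
 */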
98 newskb->mac.raw = newskb->data;
99 __skb_pull(newskb, newskb->nh.raw - newskb->data);
100 newskb->pkt_type = PACKET_LOOPBACK;
101 newskb->ip_summed = CHECKSUM_UNNECESSARY;
102 BUG_TRAP(newskb->dst);
103
104 #ifdef CONFIG_NETFILTER_DEBUG
105 nf_debug_ip_loopback_xmit(newskb);
106 #endif
107 netif_rx(newskb);
108 return 0;
109 }
110
111 /* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
112 changes route */
113 static inline int
114 output_maybe_reroute(struct sk_buff *skb)
115 {
116 return skb->dst->output(skb);
117 }
118
119 /*
120 * Add an ip header to a skbuff and send it out.
121 */
122 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
123 u32 saddr, u32 daddr, struct ip_options *opt)
124 {
125 struct rtable *rt = (struct rtable *)skb->dst;
126 struct iphdr *iph;
127
128 /* Build the IP header. */
129 if (opt)
130 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
131 else
132 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
133
134 iph->version = 4;
135 iph->ihl = 5;
136 iph->tos = sk->protinfo.af_inet.tos;
137 if (ip_dont_fragment(sk, &rt->u.dst))
138 iph->frag_off = htons(IP_DF);
139 else
140 iph->frag_off = 0;
141 iph->ttl = sk->protinfo.af_inet.ttl;
142 iph->daddr = rt->rt_dst;
143 iph->saddr = rt->rt_src;
144 iph->protocol = sk->protocol;
145 iph->tot_len = htons(skb->len);
146 ip_select_ident(iph, &rt->u.dst, sk);
147 skb->nh.iph = iph;
148
149 if (opt && opt->optlen) {
150 iph->ihl += opt->optlen>>2;
151 ip_options_build(skb, opt, daddr, rt, 0);
152 }
153 ip_send_check(iph);
154
155 /* Send it out. */
156 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
157 output_maybe_reroute);
158 }
159
160 static inline int ip_finish_output2(struct sk_buff *skb)
161 {
162 struct dst_entry *dst = skb->dst;
163 struct hh_cache *hh = dst->hh;
164
165 #ifdef CONFIG_NETFILTER_DEBUG
166 nf_debug_ip_finish_output2(skb);
167 #endif /*CONFIG_NETFILTER_DEBUG*/
168
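	/* If the destination has a cached link-layer header (e.g. an
	 * Ethernet header already resolved via ARP), copy it in front of
	 * the IP header under the read lock and hand the frame straight
	 * to hh_output(); otherwise fall back to the neighbour's output
	 * routine, which resolves the address first.
	 */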
169 if (hh) {
170 int hh_alen;
171
172 read_lock_bh(&hh->hh_lock);
173 hh_alen = HH_DATA_ALIGN(hh->hh_len);
174 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
175 read_unlock_bh(&hh->hh_lock);
176 skb_push(skb, hh->hh_len);
177 return hh->hh_output(skb);
178 } else if (dst->neighbour)
179 return dst->neighbour->output(skb);
180
181 if (net_ratelimit())
182 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
183 kfree_skb(skb);
184 return -EINVAL;
185 }
186
187 static __inline__ int __ip_finish_output(struct sk_buff *skb)
188 {
189 struct net_device *dev = skb->dst->dev;
190
191 skb->dev = dev;
192 skb->protocol = htons(ETH_P_IP);
193
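	/* Run the NF_IP_POST_ROUTING netfilter hook; if the packet is
	 * accepted it continues to ip_finish_output2(), which attaches
	 * the link-layer header and hands it to the device.
	 */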
194 return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
195 ip_finish_output2);
196 }
197
198 int ip_finish_output(struct sk_buff *skb)
199 {
200 return __ip_finish_output(skb);
201 }
202
203 int ip_mc_output(struct sk_buff *skb)
204 {
205 struct sock *sk = skb->sk;
206 struct rtable *rt = (struct rtable*)skb->dst;
207 struct net_device *dev = rt->u.dst.dev;
208
209 /*
210 * If the indicated interface is up and running, send the packet.
211 */
212 IP_INC_STATS(IpOutRequests);
213 #ifdef CONFIG_IP_ROUTE_NAT
214 if (rt->rt_flags & RTCF_NAT)
215 ip_do_nat(skb);
216 #endif
217
218 skb->dev = dev;
219 skb->protocol = htons(ETH_P_IP);
220
221 /*
222 * Multicasts are looped back for other local users
223 */
224
225 if (rt->rt_flags&RTCF_MULTICAST) {
226 if ((!sk || sk->protinfo.af_inet.mc_loop)
227 #ifdef CONFIG_IP_MROUTE
228 /* Small optimization: do not loop back non-local frames,
229 which were returned after forwarding; they will be dropped
230 by ip_mr_input in any case.
231 Note that local frames are looped back to be delivered
232 to local recipients.
233
234 This check is duplicated in ip_mr_input at the moment.
235 */
236 && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
237 #endif
238 ) {
239 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
240 if (newskb)
241 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
242 newskb->dev,
243 ip_dev_loopback_xmit);
244 }
245
246 /* Multicasts with ttl 0 must not go beyond the host */
247
248 if (skb->nh.iph->ttl == 0) {
249 kfree_skb(skb);
250 return 0;
251 }
252 }
253
254 if (rt->rt_flags&RTCF_BROADCAST) {
255 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
256 if (newskb)
257 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
258 newskb->dev, ip_dev_loopback_xmit);
259 }
260
261 return __ip_finish_output(skb);
262 }
263
264 int ip_output(struct sk_buff *skb)
265 {
266 #ifdef CONFIG_IP_ROUTE_NAT
267 struct rtable *rt = (struct rtable*)skb->dst;
268 #endif
269
270 IP_INC_STATS(IpOutRequests);
271
272 #ifdef CONFIG_IP_ROUTE_NAT
273 if (rt->rt_flags&RTCF_NAT)
274 ip_do_nat(skb);
275 #endif
276
277 return __ip_finish_output(skb);
278 }
279
280 /* Queues a packet to be sent, and starts the transmitter if necessary.
281 * This routine also needs to put in the total length and compute the
282 * checksum. We used to do this in two stages, ip_build_header() then
283 * this, but that scheme created a mess when routes disappeared etc.
284 * So we do it all here, and the TCP send engine has been changed to
285 * match. (No more unroutable FIN disasters, etc. wheee...) This will
286 * most likely make other reliable transport layers above IP easier
287 * to implement under Linux.
288 */
289 static inline int ip_queue_xmit2(struct sk_buff *skb)
290 {
291 struct sock *sk = skb->sk;
292 struct rtable *rt = (struct rtable *)skb->dst;
293 struct net_device *dev;
294 struct iphdr *iph = skb->nh.iph;
295
296 dev = rt->u.dst.dev;
297
298 /* This can happen when the transport layer has segments queued
299 * with a cached route, and by the time we get here things are
300 * re-routed to a device with a different MTU than the original
301 * device. Sick, but we must cover it.
302 */
303 if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
304 struct sk_buff *skb2;
305
306 skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
307 kfree_skb(skb);
308 if (skb2 == NULL)
309 return -ENOMEM;
310 if (sk)
311 skb_set_owner_w(skb2, sk);
312 skb = skb2;
313 iph = skb->nh.iph;
314 }
315
316 if (skb->len > rt->u.dst.pmtu)
317 goto fragment;
318
319 ip_select_ident(iph, &rt->u.dst, sk);
320
321 /* Add an IP checksum. */
322 ip_send_check(iph);
323
324 skb->priority = sk->priority;
325 return skb->dst->output(skb);
326
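/* Slow path: the packet exceeds the path MTU.  If DF applies we must
 * not fragment, so report "fragmentation needed" back to the local
 * sender via ICMP and drop the packet; otherwise select an IP ID,
 * resolve any deferred hardware checksum and let ip_fragment() split
 * and transmit the pieces.
 */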
327 fragment:
328 if (ip_dont_fragment(sk, &rt->u.dst)) {
329 /* Reject packet ONLY if TCP might fragment
330 * it itself, if we were careful enough.
331 */
332 NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big (len[%u] pmtu[%u]) to self\n",
333 skb->len, rt->u.dst.pmtu));
334
335 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
336 htonl(rt->u.dst.pmtu));
337 kfree_skb(skb);
338 return -EMSGSIZE;
339 }
340 ip_select_ident(iph, &rt->u.dst, sk);
341 if (skb->ip_summed == CHECKSUM_HW &&
342 (skb = skb_checksum_help(skb)) == NULL)
343 return -ENOMEM;
344 return ip_fragment(skb, skb->dst->output);
345 }
346
347 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
348 {
349 struct sock *sk = skb->sk;
350 struct ip_options *opt = sk->protinfo.af_inet.opt;
351 struct rtable *rt;
352 struct iphdr *iph;
353
354 /* Skip all of this if the packet is already routed,
355 * e.g. by something like SCTP.
356 */
357 rt = (struct rtable *) skb->dst;
358 if (rt != NULL)
359 goto packet_routed;
360
361 /* Make sure we can route this packet. */
362 rt = (struct rtable *)__sk_dst_check(sk, 0);
363 if (rt == NULL) {
364 u32 daddr;
365
366 /* Use correct destination address if we have options. */
367 daddr = sk->daddr;
368 if(opt && opt->srr)
369 daddr = opt->faddr;
370
371 /* If this fails, the retransmit mechanism of the transport layer
372 * will keep trying until the route appears or the connection
373 * times out.
374 */
375 if (ip_route_output(&rt, daddr, sk->saddr,
376 RT_CONN_FLAGS(sk),
377 sk->bound_dev_if))
378 goto no_route;
379 __sk_dst_set(sk, &rt->u.dst);
380 sk->route_caps = rt->u.dst.dev->features;
381 }
382 skb->dst = dst_clone(&rt->u.dst);
383
384 packet_routed:
385 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
386 goto no_route;
387
388 /* OK, we know where to send it, allocate and build IP header. */
389 iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
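/* The first 16-bit word of the header packs version (4), ihl (5) and
 * the TOS byte in a single store; ihl is corrected below if options
 * follow.
 */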
390 *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff));
391 iph->tot_len = htons(skb->len);
392 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
393 iph->frag_off = htons(IP_DF);
394 else
395 iph->frag_off = 0;
396 iph->ttl = sk->protinfo.af_inet.ttl;
397 iph->protocol = sk->protocol;
398 iph->saddr = rt->rt_src;
399 iph->daddr = rt->rt_dst;
400 skb->nh.iph = iph;
401 /* Transport layer set skb->h.foo itself. */
402
403 if(opt && opt->optlen) {
404 iph->ihl += opt->optlen >> 2;
405 ip_options_build(skb, opt, sk->daddr, rt, 0);
406 }
407
408 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
409 ip_queue_xmit2);
410
411 no_route:
412 IP_INC_STATS(IpOutNoRoutes);
413 kfree_skb(skb);
414 return -EHOSTUNREACH;
415 }
416
417 /*
418 * Build and send a packet, with as little as one copy
419 *
420 * Doesn't care much about IP options... the option length can be
421 * different for the fragment at offset 0 and the other fragments.
422 *
423 * Note that the fragment at the highest offset is sent first,
424 * so the getfrag routine can fill in the TCP/UDP checksum header
425 * field in the last fragment it sends... actually it also helps
426 * the reassemblers, they can put most packets in at the head of
427 * the fragment queue, and they know the total size in advance. This
428 * last feature will measurably improve the Linux fragment handler one
429 * day.
430 *
431 * The callback takes five args: an arbitrary pointer (a copy of
432 * frag), the destination buffer (char *), the offset to copy
433 * from, the length to be copied, and the sk_buff that is being
434 * filled in.
435 */
436
437 static int ip_build_xmit_slow(struct sock *sk,
438 int getfrag (const void *,
439 char *,
440 unsigned int,
441 unsigned int,
442 struct sk_buff *),
443 const void *frag,
444 unsigned length,
445 struct ipcm_cookie *ipc,
446 struct rtable *rt,
447 int flags)
448 {
449 unsigned int fraglen, maxfraglen, fragheaderlen;
450 int err;
451 int offset, mf;
452 int mtu;
453 u16 id;
454
455 int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
456 int nfrags=0;
457 struct ip_options *opt = ipc->opt;
458 int df = 0;
459
460 mtu = rt->u.dst.pmtu;
461 if (ip_dont_fragment(sk, &rt->u.dst))
462 df = htons(IP_DF);
463
464 length -= sizeof(struct iphdr);
465
466 if (opt) {
467 fragheaderlen = sizeof(struct iphdr) + opt->optlen;
468 maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
469 } else {
470 fragheaderlen = sizeof(struct iphdr);
471
472 /*
473 * Fragheaderlen is the size of 'overhead' on each buffer. Now work
474 * out the size of the frames to send.
475 */
476
477 maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
478 }
479
480 if (length + fragheaderlen > 0xFFFF) {
481 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
482 return -EMSGSIZE;
483 }
484
485 /*
486 * Start at the end of the frame by handling the remainder.
487 */
488
489 offset = length - (length % (maxfraglen - fragheaderlen));
490
491 /*
492 * Amount of memory to allocate for final fragment.
493 */
494
495 fraglen = length - offset + fragheaderlen;
496
497 if (length-offset==0) {
498 fraglen = maxfraglen;
499 offset -= maxfraglen-fragheaderlen;
500 }
501
502 /*
503 * The last fragment will not have MF (more fragments) set.
504 */
505
506 mf = 0;
507
508 /*
509 * Don't fragment packets for path mtu discovery.
510 */
511
512 if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) {
513 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
514 return -EMSGSIZE;
515 }
516 if (flags&MSG_PROBE)
517 goto out;
518
519 /*
520 * Begin outputting the bytes.
521 */
522
523 id = sk->protinfo.af_inet.id++;
524
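/* Build and send the fragments starting from the one with the highest
 * offset (computed above) and working back to offset 0.  Each pass
 * allocates an skb, writes a fresh IP header and lets getfrag() copy
 * the payload slice for that fragment.
 */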
525 do {
526 char *data;
527 struct sk_buff * skb;
528
529 /*
530 * Get the memory we require with some space left for alignment.
531 */
532 if (!(flags & MSG_DONTWAIT) || nfrags == 0) {
533 skb = sock_alloc_send_skb(sk, fraglen + hh_len + 15,
534 (flags & MSG_DONTWAIT), &err);
535 } else {
536 /* On a non-blocking write, we check for send buffer
537 * usage on the first fragment only.
538 */
539 skb = sock_wmalloc(sk, fraglen + hh_len + 15, 1,
540 sk->allocation);
541 if (!skb)
542 err = -ENOBUFS;
543 }
544 if (skb == NULL)
545 goto error;
546
547 /*
548 * Fill in the control structures
549 */
550
551 skb->priority = sk->priority;
552 skb->dst = dst_clone(&rt->u.dst);
553 skb_reserve(skb, hh_len);
554
555 /*
556 * Find where to start putting bytes.
557 */
558
559 data = skb_put(skb, fraglen);
560 skb->nh.iph = (struct iphdr *)data;
561
562 /*
563 * Only write IP header onto non-raw packets
564 */
565
566 {
567 struct iphdr *iph = (struct iphdr *)data;
568
569 iph->version = 4;
570 iph->ihl = 5;
571 if (opt) {
572 iph->ihl += opt->optlen>>2;
573 ip_options_build(skb, opt,
574 ipc->addr, rt, offset);
575 }
576 iph->tos = sk->protinfo.af_inet.tos;
577 iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
578 iph->frag_off = htons(offset>>3)|mf|df;
579 iph->id = id;
580 if (!mf) {
581 if (offset || !df) {
582 /* Select an unpredictable ident only
583 * for packets without DF or having
584 * been fragmented.
585 */
586 __ip_select_ident(iph, &rt->u.dst);
587 id = iph->id;
588 }
589
590 /*
591 * Any further fragments will have MF set.
592 */
593 mf = htons(IP_MF);
594 }
595 if (rt->rt_type == RTN_MULTICAST)
596 iph->ttl = sk->protinfo.af_inet.mc_ttl;
597 else
598 iph->ttl = sk->protinfo.af_inet.ttl;
599 iph->protocol = sk->protocol;
600 iph->check = 0;
601 iph->saddr = rt->rt_src;
602 iph->daddr = rt->rt_dst;
603 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
604 data += iph->ihl*4;
605 }
606
607 /*
608 * User data callback
609 */
610
611 if (getfrag(frag, data, offset, fraglen-fragheaderlen, skb)) {
612 err = -EFAULT;
613 kfree_skb(skb);
614 goto error;
615 }
616
617 offset -= (maxfraglen-fragheaderlen);
618 fraglen = maxfraglen;
619
620 nfrags++;
621
622 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
623 skb->dst->dev, output_maybe_reroute);
624 if (err) {
625 if (err > 0)
626 err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
627 if (err)
628 goto error;
629 }
630 } while (offset >= 0);
631
632 if (nfrags>1)
633 ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
634 out:
635 return 0;
636
637 error:
638 IP_INC_STATS(IpOutDiscards);
639 if (nfrags>1)
640 ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
641 return err;
642 }
643
644 /*
645 * Fast path for unfragmented packets.
646 */
647 int ip_build_xmit(struct sock *sk,
648 int getfrag (const void *,
649 char *,
650 unsigned int,
651 unsigned int,
652 struct sk_buff *),
653 const void *frag,
654 unsigned length,
655 struct ipcm_cookie *ipc,
656 struct rtable *rt,
657 int flags)
658 {
659 int err;
660 struct sk_buff *skb;
661 int df;
662 struct iphdr *iph;
663
664 /*
665 * Try the simple case first. This leaves fragmented frames, and by
666 * choice RAW frames within 20 bytes of maximum size (rare) to the long path
667 */
668
669 if (!sk->protinfo.af_inet.hdrincl) {
670 length += sizeof(struct iphdr);
671
672 /*
673 * Check for slow path.
674 */
675 if (length > rt->u.dst.pmtu || ipc->opt != NULL)
676 return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
677 } else {
678 if (length > rt->u.dst.dev->mtu) {
679 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
680 return -EMSGSIZE;
681 }
682 }
683 if (flags&MSG_PROBE)
684 goto out;
685
686 /*
687 * Do path mtu discovery if needed.
688 */
689 df = 0;
690 if (ip_dont_fragment(sk, &rt->u.dst))
691 df = htons(IP_DF);
692
693 /*
694 * Fast path for unfragmented frames without options.
695 */
696 {
697 int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
698
699 skb = sock_alloc_send_skb(sk, length+hh_len+15,
700 flags&MSG_DONTWAIT, &err);
701 if(skb==NULL)
702 goto error;
703 skb_reserve(skb, hh_len);
704 }
705
706 skb->priority = sk->priority;
707 skb->dst = dst_clone(&rt->u.dst);
708
709 skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
710
711 if(!sk->protinfo.af_inet.hdrincl) {
712 iph->version=4;
713 iph->ihl=5;
714 iph->tos=sk->protinfo.af_inet.tos;
715 iph->tot_len = htons(length);
716 iph->frag_off = df;
717 iph->ttl=sk->protinfo.af_inet.mc_ttl;
718 ip_select_ident(iph, &rt->u.dst, sk);
719 if (rt->rt_type != RTN_MULTICAST)
720 iph->ttl=sk->protinfo.af_inet.ttl;
721 iph->protocol=sk->protocol;
722 iph->saddr=rt->rt_src;
723 iph->daddr=rt->rt_dst;
724 iph->check=0;
725 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
726 err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4, skb);
727 }
728 else
729 err = getfrag(frag, (void *)iph, 0, length, skb);
730
731 if (err)
732 goto error_fault;
733
734 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
735 output_maybe_reroute);
736 if (err > 0)
737 err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
738 if (err)
739 goto error;
740 out:
741 return 0;
742
743 error_fault:
744 err = -EFAULT;
745 kfree_skb(skb);
746 error:
747 IP_INC_STATS(IpOutDiscards);
748 return err;
749 }
750
751 /*
752 * This IP datagram is too large to be sent in one piece. Break it up into
753 * smaller pieces (each consisting of an IP header plus
754 * a block of the data of the original IP datagram) that will still fit in a
755 * single device frame, and queue each such frame for sending.
756 *
757 * Yes this is inefficient, feel free to submit a quicker one.
758 */
759
760 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
761 {
762 struct iphdr *iph;
763 int raw = 0;
764 int ptr;
765 struct net_device *dev;
766 struct sk_buff *skb2;
767 unsigned int mtu, hlen, left, len;
768 int offset;
769 int not_last_frag;
770 struct rtable *rt = (struct rtable*)skb->dst;
771 int err = 0;
772
773 dev = rt->u.dst.dev;
774
775 /*
776 * Point into the IP datagram header.
777 */
778
779 iph = skb->nh.iph;
780
781 /*
782 * Setup starting values.
783 */
784
785 hlen = iph->ihl * 4;
786 left = skb->len - hlen; /* Space per frame */
787 mtu = rt->u.dst.pmtu - hlen; /* Size of data space */
788 ptr = raw + hlen; /* Where to start from */
789
790 /*
791 * Fragment the datagram.
792 */
793
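/* If the datagram being split is itself a fragment, continue from its
 * existing offset and remember whether MF was already set, so the flag
 * can be carried over to the last piece produced here.
 */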
794 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
795 not_last_frag = iph->frag_off & htons(IP_MF);
796
797 /*
798 * Keep copying data until we run out.
799 */
800
801 while(left > 0) {
802 len = left;
803 /* IF: it doesn't fit, use 'mtu' - the data space left */
804 if (len > mtu)
805 len = mtu;
806 /* IF: we are not sending up to and including the packet end
807 then align the next start on an eight byte boundary */
808 if (len < left) {
809 len &= ~7;
810 }
811 /*
812 * Allocate buffer.
813 */
814
815 if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
816 NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
817 err = -ENOMEM;
818 goto fail;
819 }
820
821 /*
822 * Set up data on packet
823 */
824
825 skb2->pkt_type = skb->pkt_type;
826 skb2->priority = skb->priority;
827 skb_reserve(skb2, (dev->hard_header_len+15)&~15);
828 skb_put(skb2, len + hlen);
829 skb2->nh.raw = skb2->data;
830 skb2->h.raw = skb2->data + hlen;
831 skb2->protocol = skb->protocol;
832 skb2->security = skb->security;
833
834 /*
835 * Charge the memory for the fragment to any owner
836 * it might possess
837 */
838
839 if (skb->sk)
840 skb_set_owner_w(skb2, skb->sk);
841 skb2->dst = dst_clone(skb->dst);
842 skb2->dev = skb->dev;
843
844 /*
845 * Copy the packet header into the new buffer.
846 */
847
848 memcpy(skb2->nh.raw, skb->data, hlen);
849
850 /*
851 * Copy a block of the IP datagram.
852 */
853 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
854 BUG();
855 left -= len;
856
857 /*
858 * Fill in the new header fields.
859 */
860 iph = skb2->nh.iph;
861 iph->frag_off = htons((offset >> 3));
862
863 /* ANK: dirty, but effective trick. Upgrade options only if
864 * the segment to be fragmented was THE FIRST (otherwise,
865 * options are already fixed) and make it ONCE
866 * on the initial skb, so that all the following fragments
867 * will inherit fixed options.
868 */
869 if (offset == 0)
870 ip_options_fragment(skb);
871
872 /* Copy the flags to each fragment. */
873 IPCB(skb2)->flags = IPCB(skb)->flags;
874
875 /*
876 * Added AC : If we are fragmenting a fragment that's not the
877 * last fragment then keep MF set on each fragment
878 */
879 if (left > 0 || not_last_frag)
880 iph->frag_off |= htons(IP_MF);
881 ptr += len;
882 offset += len;
883
884 #ifdef CONFIG_NET_SCHED
885 skb2->tc_index = skb->tc_index;
886 #endif
887 #ifdef CONFIG_NETFILTER
888 skb2->nfmark = skb->nfmark;
889 skb2->nfcache = skb->nfcache;
890 /* Connection association is same as pre-frag packet */
891 skb2->nfct = skb->nfct;
892 nf_conntrack_get(skb2->nfct);
893 #ifdef CONFIG_NETFILTER_DEBUG
894 skb2->nf_debug = skb->nf_debug;
895 #endif
896 #endif
897
898 /*
899 * Put this fragment into the sending queue.
900 */
901
902 IP_INC_STATS(IpFragCreates);
903
904 iph->tot_len = htons(len + hlen);
905
906 ip_send_check(iph);
907
908 err = output(skb2);
909 if (err)
910 goto fail;
911 }
912 kfree_skb(skb);
913 IP_INC_STATS(IpFragOKs);
914 return err;
915
916 fail:
917 kfree_skb(skb);
918 IP_INC_STATS(IpFragFails);
919 return err;
920 }
921
922 /*
923 * Fetch data from kernel space and fill in checksum if needed.
924 */
925 static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
926 unsigned int fraglen, struct sk_buff *skb)
927 {
928 struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
929 u16 *pktp = (u16 *)to;
930 struct iovec *iov;
931 int len;
932 int hdrflag = 1;
933
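	/* dp->iov[0] holds the pre-built transport header (e.g. a TCP RST)
	 * and dp->iov[1] an optional payload.  Copy the requested slice
	 * while accumulating a checksum; for the slice that contains the
	 * header, fold the running checksum into the 16-bit word
	 * csumoffset words into the copied header.
	 */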
934 iov = &dp->iov[0];
935 if (offset >= iov->iov_len) {
936 offset -= iov->iov_len;
937 iov++;
938 hdrflag = 0;
939 }
940 len = iov->iov_len - offset;
941 if (fraglen > len) { /* overlapping. */
942 dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
943 dp->csum);
944 offset = 0;
945 fraglen -= len;
946 to += len;
947 iov++;
948 }
949
950 dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
951 dp->csum);
952
953 if (hdrflag && dp->csumoffset)
954 *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
955 return 0;
956 }
957
958 /*
959 * Generic function to send a packet as reply to another packet.
960 * Used to send TCP resets so far. ICMP should use this function too.
961 *
962 * Should run single threaded per socket because it uses the sock
963 * structure to pass arguments.
964 */
965 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
966 unsigned int len)
967 {
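	/* 40 bytes is the maximum space IP options can occupy (a 60-byte
	 * header minus the 20-byte fixed part), so options echoed from the
	 * incoming packet always fit in this on-stack buffer.
	 */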
968 struct {
969 struct ip_options opt;
970 char data[40];
971 } replyopts;
972 struct ipcm_cookie ipc;
973 u32 daddr;
974 struct rtable *rt = (struct rtable*)skb->dst;
975
976 if (ip_options_echo(&replyopts.opt, skb))
977 return;
978
979 daddr = ipc.addr = rt->rt_src;
980 ipc.opt = NULL;
981
982 if (replyopts.opt.optlen) {
983 ipc.opt = &replyopts.opt;
984
985 if (ipc.opt->srr)
986 daddr = replyopts.opt.faddr;
987 }
988
989 if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
990 return;
991
992 /* And let IP do all the hard work.
993
994 This chunk is not reentrant, hence the spinlock.
995 Note that it relies on the fact that this function is called
996 with locally disabled BHs and that sk cannot already be spinlocked.
997 */
998 bh_lock_sock(sk);
999 sk->protinfo.af_inet.tos = skb->nh.iph->tos;
1000 sk->priority = skb->priority;
1001 sk->protocol = skb->nh.iph->protocol;
1002 ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
1003 bh_unlock_sock(sk);
1004
1005 ip_rt_put(rt);
1006 }
1007
1008 /*
1009 * IP protocol layer initialiser
1010 */
1011
1012 static struct packet_type ip_packet_type =
1013 {
1014 __constant_htons(ETH_P_IP),
1015 NULL, /* All devices */
1016 ip_rcv,
1017 (void*)1,
1018 NULL,
1019 };
1020
1021 /*
1022 * IP registers the packet type and then calls the subprotocol initialisers
1023 */
1024
1025 void __init ip_init(void)
1026 {
1027 dev_add_pack(&ip_packet_type);
1028
1029 ip_rt_init();
1030 inet_initpeers();
1031
1032 #ifdef CONFIG_IP_MULTICAST
1033 proc_net_create("igmp", 0, ip_mc_procinfo);
1034 #endif
1035 proc_net_create("mcfilter", 0, ip_mcf_procinfo);
1036 }
1037