/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.99.2.1 2002/03/10 04:26:08 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit paths
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/mroute.h>
#include <linux/netlink.h>

/*
 *      Shall we try to damage output packets if routing dev changes?
 */

int sysctl_ip_dynaddr = 0;
int sysctl_ip_default_ttl = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
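
/*
 * Illustrative sketch only (not part of this file's logic): any code that
 * rewrites IP header fields must recompute the checksum before the packet
 * goes out, so a hypothetical caller adjusting the TTL by hand would do
 *
 *	iph->ttl--;
 *	ip_send_check(iph);
 *
 * The real forwarding path uses ip_decrease_ttl(), which updates the
 * checksum incrementally instead of recomputing it from scratch.
 */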

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_loopback_xmit(newskb);
#endif
	netif_rx(newskb);
	return 0;
}

/* Don't just hand NF_HOOK skb->dst->output, in case the netfilter hook
   changes the route */
static inline int
output_maybe_reroute(struct sk_buff *skb)
{
	return skb->dst->output(skb);
}

/*
 *		Add an IP header to an skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Build the IP header. */
	if (opt)
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
	else
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = sk->protinfo.af_inet.tos;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = sk->protinfo.af_inet.ttl;
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->protocol;
	iph->tot_len  = htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, sk);
	skb->nh.iph   = iph;

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}
	ip_send_check(iph);

	/* Send it out. */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       output_maybe_reroute);
}
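
/*
 * Usage sketch (an assumption for illustration, not a specific call site):
 * a caller that already owns a routed skb with the payload in place, such
 * as a transport protocol answering a connection request, would invoke the
 * function above roughly as
 *
 *	skb->dst = dst_clone(&rt->u.dst);
 *	err = ip_build_and_send_pkt(skb, sk, rt->rt_src, rt->rt_dst,
 *				    sk->protinfo.af_inet.opt);
 *
 * where rt is the route the caller looked up; the IP header itself is
 * pushed and filled in by ip_build_and_send_pkt().
 */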

static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_finish_output2(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static __inline__ int __ip_finish_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
		       ip_finish_output2);
}

int ip_finish_output(struct sk_buff *skb)
{
	return __ip_finish_output(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable*)skb->dst;
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_INC_STATS(IpOutRequests);
#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags & RTCF_NAT)
		ip_do_nat(skb);
#endif

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if ((!sk || sk->protinfo.af_inet.mc_loop)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
		) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (skb->nh.iph->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	return __ip_finish_output(skb);
}

int ip_output(struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_NAT
	struct rtable *rt = (struct rtable*)skb->dst;
#endif

	IP_INC_STATS(IpOutRequests);

#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags&RTCF_NAT)
		ip_do_nat(skb);
#endif

	return __ip_finish_output(skb);
}

/* Queues a packet to be sent, and starts the transmitter if necessary.
 * This routine also needs to put in the total length and compute the
 * checksum.  We used to do this in two stages, ip_build_header() then
 * this, but that scheme created a mess when routes disappeared etc.
 * So we do it all here, and the TCP send engine has been changed to
 * match. (No more unroutable FIN disasters, etc. wheee...)  This will
 * most likely make other reliable transport layers above IP easier
 * to implement under Linux.
 */
static inline int ip_queue_xmit2(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable *)skb->dst;
	struct net_device *dev;
	struct iphdr *iph = skb->nh.iph;

	dev = rt->u.dst.dev;

	/* This can happen when the transport layer has segments queued
	 * with a cached route, and by the time we get here things are
	 * re-routed to a device with a different MTU than the original
	 * device.  Sick, but we must cover it.
	 */
	if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
		kfree_skb(skb);
		if (skb2 == NULL)
			return -ENOMEM;
		if (sk)
			skb_set_owner_w(skb2, sk);
		skb = skb2;
		iph = skb->nh.iph;
	}

	if (skb->len > rt->u.dst.pmtu)
		goto fragment;

	ip_select_ident(iph, &rt->u.dst, sk);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->priority;
	return skb->dst->output(skb);

fragment:
	if (ip_dont_fragment(sk, &rt->u.dst)) {
		/* Reject the packet ONLY if TCP might fragment
		 * it itself, if we were careful enough.
		 */
		NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big (len[%u] pmtu[%u]) to self\n",
				skb->len, rt->u.dst.pmtu));

		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(rt->u.dst.pmtu));
		kfree_skb(skb);
		return -EMSGSIZE;
	}
	ip_select_ident(iph, &rt->u.dst, sk);
	if (skb->ip_summed == CHECKSUM_HW &&
	    (skb = skb_checksum_help(skb)) == NULL)
		return -ENOMEM;
	return ip_fragment(skb, skb->dst->output);
}

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
	struct sock *sk = skb->sk;
	struct ip_options *opt = sk->protinfo.af_inet.opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rt = (struct rtable *) skb->dst;
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		u32 daddr;

		/* Use the correct destination address if we have options. */
		daddr = sk->daddr;
		if(opt && opt->srr)
			daddr = opt->faddr;

		/* If this fails, the retransmit mechanism of the transport
		 * layer will keep trying until the route appears or the
		 * connection times itself out.
		 */
		if (ip_route_output(&rt, daddr, sk->saddr,
				    RT_CONN_FLAGS(sk),
				    sk->bound_dev_if))
			goto no_route;
		__sk_dst_set(sk, &rt->u.dst);
		sk->route_caps = rt->u.dst.dev->features;
	}
	skb->dst = dst_clone(&rt->u.dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	*((__u16 *)iph)	= htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff));
	iph->tot_len = htons(skb->len);
	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = sk->protinfo.af_inet.ttl;
	iph->protocol = sk->protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	skb->nh.iph   = iph;
	/* The transport layer sets skb->h.foo itself. */

	if(opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, sk->daddr, rt, 0);
	}

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       ip_queue_xmit2);

no_route:
	IP_INC_STATS(IpOutNoRoutes);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
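
/*
 * Usage sketch (illustrative assumption, not a definitive call site): a
 * connected transport protocol hands a fully built segment to IP like
 *
 *	skb->sk = sk;
 *	err = ip_queue_xmit(skb, 0);
 *
 * and leaves routing, IP header construction and any fragmentation to the
 * code above.  Passing ipfragok == 0 keeps the DF bit under the socket's
 * path MTU discovery setting; a caller that can tolerate fragmentation
 * would pass 1.
 */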

/*
 *	Build and send a packet, with as little as one copy
 *
 *	Doesn't care much about ip options... option length can be
 *	different for the fragment at offset 0 and the other fragments.
 *
 *	Note that the fragment at the highest offset is sent first,
 *	so the getfrag routine can fill in the TCP/UDP checksum header
 *	field in the last fragment it sends... actually it also helps
 * 	the reassemblers, they can put most packets in at the head of
 *	the fragment queue, and they know the total size in advance. This
 *	last feature will measurably improve the Linux fragment handler one
 *	day.
 *
 *	The callback takes five args: an arbitrary pointer (copy of frag),
 *	the destination buffer to copy into (char *), the offset to copy
 *	from, the length to be copied, and the sk_buff being built.
 */
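
/*
 * Sketch of a conforming getfrag callback (hypothetical, for illustration
 * only; the real implementations live with the UDP/raw send paths and in
 * ip_reply_glue_bits() below).  It must copy 'len' bytes of payload,
 * starting at 'offset' within the caller's data, into 'to', and return 0
 * on success or nonzero to abort the send with -EFAULT:
 *
 *	static int example_getfrag(const void *from, char *to,
 *				   unsigned int offset, unsigned int len,
 *				   struct sk_buff *skb)
 *	{
 *		memcpy(to, (const char *)from + offset, len);
 *		return 0;
 *	}
 *
 * A real datagram protocol would typically copy from an iovec in user
 * space (and fold the copy into its checksum) rather than use memcpy().
 */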

static int ip_build_xmit_slow(struct sock *sk,
		  int getfrag (const void *,
			       char *,
			       unsigned int,
			       unsigned int,
			       struct sk_buff *),
		  const void *frag,
		  unsigned length,
		  struct ipcm_cookie *ipc,
		  struct rtable *rt,
		  int flags)
{
	unsigned int fraglen, maxfraglen, fragheaderlen;
	int err;
	int offset, mf;
	int mtu;
	u16 id;

	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
	int nfrags = 0;
	struct ip_options *opt = ipc->opt;
	int df = 0;

	mtu = rt->u.dst.pmtu;
	if (ip_dont_fragment(sk, &rt->u.dst))
		df = htons(IP_DF);

	length -= sizeof(struct iphdr);

	if (opt) {
		fragheaderlen = sizeof(struct iphdr) + opt->optlen;
		maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
	} else {
		fragheaderlen = sizeof(struct iphdr);

		/*
		 *	Fragheaderlen is the size of 'overhead' on each buffer. Now work
		 *	out the size of the frames to send.
		 */

		maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
	}

	if (length + fragheaderlen > 0xFFFF) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
		return -EMSGSIZE;
	}

	/*
	 *	Start at the end of the frame by handling the remainder.
	 */

	offset = length - (length % (maxfraglen - fragheaderlen));

	/*
	 *	Amount of memory to allocate for final fragment.
	 */

	fraglen = length - offset + fragheaderlen;

	if (length-offset==0) {
		fraglen = maxfraglen;
		offset -= maxfraglen-fragheaderlen;
	}

	/*
	 *	The last fragment will not have MF (more fragments) set.
	 */

	mf = 0;

	/*
	 *	Don't fragment packets for path mtu discovery.
	 */

	if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
		return -EMSGSIZE;
	}
	if (flags&MSG_PROBE)
		goto out;

	/*
	 *	Begin outputting the bytes.
	 */

	id = sk->protinfo.af_inet.id++;

	do {
		char *data;
		struct sk_buff *skb;

		/*
		 *	Get the memory we require with some space left for alignment.
		 */
		if (!(flags & MSG_DONTWAIT) || nfrags == 0) {
			skb = sock_alloc_send_skb(sk, fraglen + hh_len + 15,
						  (flags & MSG_DONTWAIT), &err);
		} else {
			/* On a non-blocking write, we check for send buffer
			 * usage on the first fragment only.
			 */
			skb = sock_wmalloc(sk, fraglen + hh_len + 15, 1,
					   sk->allocation);
			if (!skb)
				err = -ENOBUFS;
		}
		if (skb == NULL)
			goto error;

		/*
		 *	Fill in the control structures
		 */

		skb->priority = sk->priority;
		skb->dst = dst_clone(&rt->u.dst);
		skb_reserve(skb, hh_len);

		/*
		 *	Find where to start putting bytes.
		 */

		data = skb_put(skb, fraglen);
		skb->nh.iph = (struct iphdr *)data;

		/*
		 *	Only write IP header onto non-raw packets
		 */

		{
			struct iphdr *iph = (struct iphdr *)data;

			iph->version = 4;
			iph->ihl = 5;
			if (opt) {
				iph->ihl += opt->optlen>>2;
				ip_options_build(skb, opt,
						 ipc->addr, rt, offset);
			}
			iph->tos = sk->protinfo.af_inet.tos;
			iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
			iph->frag_off = htons(offset>>3)|mf|df;
			iph->id = id;
			if (!mf) {
				if (offset || !df) {
					/* Select an unpredictable ident only
					 * for packets without DF or having
					 * been fragmented.
					 */
					__ip_select_ident(iph, &rt->u.dst);
					id = iph->id;
				}

				/*
				 *	Any further fragments will have MF set.
				 */
				mf = htons(IP_MF);
			}
			if (rt->rt_type == RTN_MULTICAST)
				iph->ttl = sk->protinfo.af_inet.mc_ttl;
			else
				iph->ttl = sk->protinfo.af_inet.ttl;
			iph->protocol = sk->protocol;
			iph->check = 0;
			iph->saddr = rt->rt_src;
			iph->daddr = rt->rt_dst;
			iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
			data += iph->ihl*4;
		}

		/*
		 *	User data callback
		 */

		if (getfrag(frag, data, offset, fraglen-fragheaderlen, skb)) {
			err = -EFAULT;
			kfree_skb(skb);
			goto error;
		}

		offset -= (maxfraglen-fragheaderlen);
		fraglen = maxfraglen;

		nfrags++;

		err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
			      skb->dst->dev, output_maybe_reroute);
		if (err) {
			if (err > 0)
				err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
			if (err)
				goto error;
		}
	} while (offset >= 0);

	if (nfrags>1)
		ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
out:
	return 0;

error:
	IP_INC_STATS(IpOutDiscards);
	if (nfrags>1)
		ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
	return err;
}

/*
 *	Fast path for unfragmented packets.
 */
int ip_build_xmit(struct sock *sk,
		  int getfrag (const void *,
			       char *,
			       unsigned int,
			       unsigned int,
			       struct sk_buff *),
		  const void *frag,
		  unsigned length,
		  struct ipcm_cookie *ipc,
		  struct rtable *rt,
		  int flags)
{
	int err;
	struct sk_buff *skb;
	int df;
	struct iphdr *iph;

	/*
	 *	Try the simple case first. This leaves fragmented frames, and by
	 *	choice RAW frames within 20 bytes of maximum size (rare) to the long path
	 */

	if (!sk->protinfo.af_inet.hdrincl) {
		length += sizeof(struct iphdr);

		/*
		 * 	Check for slow path.
		 */
		if (length > rt->u.dst.pmtu || ipc->opt != NULL)
			return ip_build_xmit_slow(sk, getfrag, frag, length, ipc, rt, flags);
	} else {
		if (length > rt->u.dst.dev->mtu) {
			ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
			return -EMSGSIZE;
		}
	}
	if (flags&MSG_PROBE)
		goto out;

	/*
	 *	Do path mtu discovery if needed.
	 */
	df = 0;
	if (ip_dont_fragment(sk, &rt->u.dst))
		df = htons(IP_DF);

	/*
	 *	Fast path for unfragmented frames without options.
	 */
	{
	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;

	skb = sock_alloc_send_skb(sk, length+hh_len+15,
				  flags&MSG_DONTWAIT, &err);
	if(skb==NULL)
		goto error;
	skb_reserve(skb, hh_len);
	}

	skb->priority = sk->priority;
	skb->dst = dst_clone(&rt->u.dst);

	skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);

	if(!sk->protinfo.af_inet.hdrincl) {
		iph->version=4;
		iph->ihl=5;
		iph->tos=sk->protinfo.af_inet.tos;
		iph->tot_len = htons(length);
		iph->frag_off = df;
		iph->ttl=sk->protinfo.af_inet.mc_ttl;
		ip_select_ident(iph, &rt->u.dst, sk);
		if (rt->rt_type != RTN_MULTICAST)
			iph->ttl=sk->protinfo.af_inet.ttl;
		iph->protocol=sk->protocol;
		iph->saddr=rt->rt_src;
		iph->daddr=rt->rt_dst;
		iph->check=0;
		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
		err = getfrag(frag, ((char *)iph)+iph->ihl*4, 0, length-iph->ihl*4, skb);
	}
	else
		err = getfrag(frag, (void *)iph, 0, length, skb);

	if (err)
		goto error_fault;

	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		      output_maybe_reroute);
	if (err > 0)
		err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
	if (err)
		goto error;
out:
	return 0;

error_fault:
	err = -EFAULT;
	kfree_skb(skb);
error:
	IP_INC_STATS(IpOutDiscards);
	return err;
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to the IP header plus a block of the
 *	data of the original IP data part) that will still fit in a single device
 *	frame, and queue such a frame for sending.
 *
 *	Yes this is inefficient, feel free to submit a quicker one.
 */
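
/*
 * Worked example (figures chosen for illustration): with a path MTU of
 * 1500 bytes and a 20-byte IP header, each fragment carries at most 1480
 * data bytes, and every fragment except the last is trimmed down to a
 * multiple of 8 bytes.  A 3020-byte payload therefore becomes three
 * fragments of 1480, 1480 and 60 data bytes, carrying frag_off values of
 * 0, 185 and 370 (offsets are in 8-byte units), with IP_MF set on all but
 * the last.
 */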

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
	struct iphdr *iph;
	int raw = 0;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len;
	int offset;
	int not_last_frag;
	struct rtable *rt = (struct rtable*)skb->dst;
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = skb->nh.iph;

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	left = skb->len - hlen;		/* Space per frame */
	mtu = rt->u.dst.pmtu - hlen;	/* Size of data space */
	ptr = raw + hlen;		/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while(left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		skb2->pkt_type = skb->pkt_type;
		skb2->priority = skb->priority;
		skb_reserve(skb2, (dev->hard_header_len+15)&~15);
		skb_put(skb2, len + hlen);
		skb2->nh.raw = skb2->data;
		skb2->h.raw = skb2->data + hlen;
		skb2->protocol = skb->protocol;
		skb2->security = skb->security;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		skb2->dst = dst_clone(skb->dst);
		skb2->dev = skb->dev;

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->nh.raw, skb->data, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = skb2->nh.iph;
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/* Copy the flags to each fragment. */
		IPCB(skb2)->flags = IPCB(skb)->flags;

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF set on each fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

#ifdef CONFIG_NET_SCHED
		skb2->tc_index = skb->tc_index;
#endif
#ifdef CONFIG_NETFILTER
		skb2->nfmark = skb->nfmark;
		skb2->nfcache = skb->nfcache;
		/* Connection association is the same as for the pre-frag packet */
		skb2->nfct = skb->nfct;
		nf_conntrack_get(skb2->nfct);
#ifdef CONFIG_NETFILTER_DEBUG
		skb2->nf_debug = skb->nf_debug;
#endif
#endif

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP_INC_STATS(IpFragCreates);

		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP_INC_STATS(IpFragOKs);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(IpFragFails);
	return err;
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
			      unsigned int fraglen, struct sk_buff *skb)
{
	struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
	u16 *pktp = (u16 *)to;
	struct iovec *iov;
	int len;
	int hdrflag = 1;

	iov = &dp->iov[0];
	if (offset >= iov->iov_len) {
		offset -= iov->iov_len;
		iov++;
		hdrflag = 0;
	}
	len = iov->iov_len - offset;
	if (fraglen > len) { /* overlapping. */
		dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
						     dp->csum);
		offset = 0;
		fraglen -= len;
		to += len;
		iov++;
	}

	dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
					     dp->csum);

	if (hdrflag && dp->csumoffset)
		*(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
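
/*
 * Usage sketch (assumptions for illustration only; the field names are
 * those of struct ip_reply_arg as consumed by ip_reply_glue_bits() above):
 * a reply path such as the TCP reset code points the first iovec at a
 * prebuilt header, seeds the partial checksum, and records where the
 * folded checksum should be stuffed, measured in 16-bit words:
 *
 *	struct ip_reply_arg arg;
 *
 *	memset(&arg, 0, sizeof(arg));
 *	arg.iov[0].iov_base = (char *)&rep_hdr;
 *	arg.iov[0].iov_len  = sizeof(rep_hdr);
 *	arg.csum = csum_partial((char *)&rep_hdr, sizeof(rep_hdr), 0);
 *	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 *	ip_send_reply(ctl_sk, skb, &arg, sizeof(rep_hdr));
 *
 * rep_hdr and ctl_sk are hypothetical stand-ins for the caller's reply
 * header and control socket.
 */
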
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct {
		struct ip_options	opt;
		char			data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable*)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the lock.
	   Note that it relies on the fact that this function is called
	   with BH disabled locally and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	sk->protinfo.af_inet.tos = skb->nh.iph->tos;
	sk->priority = skb->priority;
	sk->protocol = skb->nh.iph->protocol;
	ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

/*
 *	IP protocol layer initialiser
 */

static struct packet_type ip_packet_type =
{
	__constant_htons(ETH_P_IP),
	NULL,	/* All devices */
	ip_rcv,
	(void*)1,
	NULL,
};

/*
 *	IP registers the packet type and then calls the subprotocol initialisers
 */

void __init ip_init(void)
{
	dev_add_pack(&ip_packet_type);

	ip_rt_init();
	inet_initpeers();

#ifdef CONFIG_IP_MULTICAST
	proc_net_create("igmp", 0, ip_mc_procinfo);
#endif
	proc_net_create("mcfilter", 0, ip_mcf_procinfo);
}