1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Donald Becker, <becker@super.org>
11  *		Alan Cox, <Alan.Cox@linux.org>
12  *		Richard Underwood
13  *		Stefan Becker, <stefanb@yello.ping.de>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16  *		Hirokazu Takahashi, <taka@valinux.co.jp>
17  *
18  *	See ip_input.c for original log
19  *
20  *	Fixes:
21  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23  *		Bradford Johnson:	Fix faulty handling of some frames when
24  *					no route is found.
25  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26  *					(in case a packet is not accepted by
27  *					output firewall rules)
28  *		Mike McLagan	:	Routing by source
29  *		Alexey Kuznetsov:	use new route cache
30  *		Andi Kleen:		Fix broken PMTU recovery and remove
31  *					some redundant tests.
32  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
33  *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35  *					for decreased register pressure on x86
36  *					and more readability.
37  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38  *					silently drop skb instead of failing with -EPERM.
39  *		Detlev Wengorz	:	Copy protocol for fragments.
40  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41  *					datagrams.
42  *		Hirokazu Takahashi:	sendfile() on UDP works now.
43  */
44 
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 #include <linux/slab.h>
55 
56 #include <linux/socket.h>
57 #include <linux/sockios.h>
58 #include <linux/in.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h>
63 #include <linux/stat.h>
64 #include <linux/init.h>
65 
66 #include <net/snmp.h>
67 #include <net/ip.h>
68 #include <net/protocol.h>
69 #include <net/route.h>
70 #include <net/xfrm.h>
71 #include <linux/skbuff.h>
72 #include <net/sock.h>
73 #include <net/arp.h>
74 #include <net/icmp.h>
75 #include <net/checksum.h>
76 #include <net/inetpeer.h>
77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h>
81 #include <linux/netlink.h>
82 #include <linux/tcp.h>
83 
84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
86 
87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__ void ip_send_check(struct iphdr *iph)
89 {
90 	iph->check = 0;
91 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92 }
93 EXPORT_SYMBOL(ip_send_check);
94 
95 int __ip_local_out(struct sk_buff *skb)
96 {
97 	struct iphdr *iph = ip_hdr(skb);
98 
99 	iph->tot_len = htons(skb->len);
100 	ip_send_check(iph);
101 	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102 		       skb_dst(skb)->dev, dst_output);
103 }
104 
105 int ip_local_out(struct sk_buff *skb)
106 {
107 	int err;
108 
109 	err = __ip_local_out(skb);
110 	if (likely(err == 1))
111 		err = dst_output(skb);
112 
113 	return err;
114 }
115 EXPORT_SYMBOL_GPL(ip_local_out);
116 
117 /* dev_loopback_xmit for use with netfilter. */
118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 {
120 	skb_reset_mac_header(newskb);
121 	__skb_pull(newskb, skb_network_offset(newskb));
122 	newskb->pkt_type = PACKET_LOOPBACK;
123 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 	WARN_ON(!skb_dst(newskb));
125 	netif_rx_ni(newskb);
126 	return 0;
127 }
128 
129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 {
131 	int ttl = inet->uc_ttl;
132 
133 	if (ttl < 0)
134 		ttl = ip4_dst_hoplimit(dst);
135 	return ttl;
136 }
137 
138 /*
139  *		Add an ip header to a skbuff and send it out.
140  *
141  */
142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 			  __be32 saddr, __be32 daddr, struct ip_options *opt)
144 {
145 	struct inet_sock *inet = inet_sk(sk);
146 	struct rtable *rt = skb_rtable(skb);
147 	struct iphdr *iph;
148 
149 	/* Build the IP header. */
150 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
151 	skb_reset_network_header(skb);
152 	iph = ip_hdr(skb);
153 	iph->version  = 4;
154 	iph->ihl      = 5;
155 	iph->tos      = inet->tos;
156 	if (ip_dont_fragment(sk, &rt->dst))
157 		iph->frag_off = htons(IP_DF);
158 	else
159 		iph->frag_off = 0;
160 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
161 	iph->daddr    = rt->rt_dst;
162 	iph->saddr    = rt->rt_src;
163 	iph->protocol = sk->sk_protocol;
164 	ip_select_ident(iph, &rt->dst, sk);
165 
166 	if (opt && opt->optlen) {
167 		iph->ihl += opt->optlen>>2;
168 		ip_options_build(skb, opt, daddr, rt, 0);
169 	}
170 
171 	skb->priority = sk->sk_priority;
172 	skb->mark = sk->sk_mark;
173 
174 	/* Send it out. */
175 	return ip_local_out(skb);
176 }
177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178 
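/*
 * Final link-layer output step: make sure the skb has enough headroom
 * for the hardware header, then hand it to the neighbour layer, either
 * via the cached hardware header (dst->hh) or via the neighbour's
 * output function.
 */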
179 static inline int ip_finish_output2(struct sk_buff *skb)
180 {
181 	struct dst_entry *dst = skb_dst(skb);
182 	struct rtable *rt = (struct rtable *)dst;
183 	struct net_device *dev = dst->dev;
184 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 
186 	if (rt->rt_type == RTN_MULTICAST) {
187 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
188 	} else if (rt->rt_type == RTN_BROADCAST)
189 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
190 
191 	/* Be paranoid, rather than too clever. */
192 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
193 		struct sk_buff *skb2;
194 
195 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
196 		if (skb2 == NULL) {
197 			kfree_skb(skb);
198 			return -ENOMEM;
199 		}
200 		if (skb->sk)
201 			skb_set_owner_w(skb2, skb->sk);
202 		kfree_skb(skb);
203 		skb = skb2;
204 	}
205 
206 	if (dst->hh)
207 		return neigh_hh_output(dst->hh, skb);
208 	else if (dst->neighbour)
209 		return dst->neighbour->output(skb);
210 
211 	if (net_ratelimit())
212 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213 	kfree_skb(skb);
214 	return -EINVAL;
215 }
216 
217 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
218 {
219 	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
220 
221 	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
222 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
223 }
224 
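/*
 * Last IP-level step before the link layer: if a new xfrm policy was
 * attached after SNAT, re-run dst_output(); otherwise fragment packets
 * that exceed the path MTU (unless they are GSO) and pass the result
 * to ip_finish_output2().
 */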
225 static int ip_finish_output(struct sk_buff *skb)
226 {
227 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
228 	/* Policy lookup after SNAT yielded a new policy */
229 	if (skb_dst(skb)->xfrm != NULL) {
230 		IPCB(skb)->flags |= IPSKB_REROUTED;
231 		return dst_output(skb);
232 	}
233 #endif
234 	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
235 		return ip_fragment(skb, ip_finish_output2);
236 	else
237 		return ip_finish_output2(skb);
238 }
239 
240 int ip_mc_output(struct sk_buff *skb)
241 {
242 	struct sock *sk = skb->sk;
243 	struct rtable *rt = skb_rtable(skb);
244 	struct net_device *dev = rt->dst.dev;
245 
246 	/*
247 	 *	If the indicated interface is up and running, send the packet.
248 	 */
249 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
250 
251 	skb->dev = dev;
252 	skb->protocol = htons(ETH_P_IP);
253 
254 	/*
255 	 *	Multicasts are looped back for other local users
256 	 */
257 
258 	if (rt->rt_flags&RTCF_MULTICAST) {
259 		if (sk_mc_loop(sk)
260 #ifdef CONFIG_IP_MROUTE
261 		/* Small optimization: do not loop back non-local frames,
262 		   which returned after forwarding; they will be dropped
263 		   by ip_mr_input in any case.
264 		   Note that local frames are looped back to be delivered
265 		   to local recipients.
266 
267 		   This check is duplicated in ip_mr_input at the moment.
268 		 */
269 		    &&
270 		    ((rt->rt_flags & RTCF_LOCAL) ||
271 		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
272 #endif
273 		   ) {
274 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
275 			if (newskb)
276 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
277 					newskb, NULL, newskb->dev,
278 					ip_dev_loopback_xmit);
279 		}
280 
281 		/* Multicasts with ttl 0 must not go beyond the host */
282 
283 		if (ip_hdr(skb)->ttl == 0) {
284 			kfree_skb(skb);
285 			return 0;
286 		}
287 	}
288 
289 	if (rt->rt_flags&RTCF_BROADCAST) {
290 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
291 		if (newskb)
292 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
293 				NULL, newskb->dev, ip_dev_loopback_xmit);
294 	}
295 
296 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
297 			    skb->dev, ip_finish_output,
298 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
299 }
300 
301 int ip_output(struct sk_buff *skb)
302 {
303 	struct net_device *dev = skb_dst(skb)->dev;
304 
305 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
306 
307 	skb->dev = dev;
308 	skb->protocol = htons(ETH_P_IP);
309 
310 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
311 			    ip_finish_output,
312 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
313 }
314 
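/*
 * Main transmit routine for connection-oriented sockets (e.g. TCP).
 * Reuses an skb that is already routed (e.g. by SCTP); otherwise it
 * looks up or revalidates the socket's cached route, builds the IP
 * header (ToS, TTL, DF, options, IP ID) and hands the packet to
 * ip_local_out().
 */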
315 int ip_queue_xmit(struct sk_buff *skb)
316 {
317 	struct sock *sk = skb->sk;
318 	struct inet_sock *inet = inet_sk(sk);
319 	struct ip_options *opt = inet->opt;
320 	struct rtable *rt;
321 	struct iphdr *iph;
322 	int res;
323 
324 	/* Skip all of this if the packet is already routed,
325 	 * e.g. by something like SCTP.
326 	 */
327 	rcu_read_lock();
328 	rt = skb_rtable(skb);
329 	if (rt != NULL)
330 		goto packet_routed;
331 
332 	/* Make sure we can route this packet. */
333 	rt = (struct rtable *)__sk_dst_check(sk, 0);
334 	if (rt == NULL) {
335 		__be32 daddr;
336 
337 		/* Use correct destination address if we have options. */
338 		daddr = inet->inet_daddr;
339 		if(opt && opt->srr)
340 			daddr = opt->faddr;
341 
342 		/* If this fails, the retransmit mechanism of the transport
343 		 * layer will keep trying until a route appears or the
344 		 * connection times itself out.
345 		 */
346 		rt = ip_route_output_ports(sock_net(sk), sk,
347 					   daddr, inet->inet_saddr,
348 					   inet->inet_dport,
349 					   inet->inet_sport,
350 					   sk->sk_protocol,
351 					   RT_CONN_FLAGS(sk),
352 					   sk->sk_bound_dev_if);
353 		if (IS_ERR(rt))
354 			goto no_route;
355 		sk_setup_caps(sk, &rt->dst);
356 	}
357 	skb_dst_set_noref(skb, &rt->dst);
358 
359 packet_routed:
360 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
361 		goto no_route;
362 
363 	/* OK, we know where to send it, allocate and build IP header. */
364 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
365 	skb_reset_network_header(skb);
366 	iph = ip_hdr(skb);
367 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
368 	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
369 		iph->frag_off = htons(IP_DF);
370 	else
371 		iph->frag_off = 0;
372 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
373 	iph->protocol = sk->sk_protocol;
374 	iph->saddr    = rt->rt_src;
375 	iph->daddr    = rt->rt_dst;
376 	/* The transport layer has set skb->h.foo itself. */
377 
378 	if (opt && opt->optlen) {
379 		iph->ihl += opt->optlen >> 2;
380 		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
381 	}
382 
383 	ip_select_ident_more(iph, &rt->dst, sk,
384 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
385 
386 	skb->priority = sk->sk_priority;
387 	skb->mark = sk->sk_mark;
388 
389 	res = ip_local_out(skb);
390 	rcu_read_unlock();
391 	return res;
392 
393 no_route:
394 	rcu_read_unlock();
395 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
396 	kfree_skb(skb);
397 	return -EHOSTUNREACH;
398 }
399 EXPORT_SYMBOL(ip_queue_xmit);
400 
401 
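/*
 * Copy the per-packet metadata (packet type, priority, dst, device,
 * mark, IPCB flags, traffic-control index, netfilter/IPVS state and
 * security mark) from the original skb to a freshly built fragment.
 */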
402 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
403 {
404 	to->pkt_type = from->pkt_type;
405 	to->priority = from->priority;
406 	to->protocol = from->protocol;
407 	skb_dst_drop(to);
408 	skb_dst_copy(to, from);
409 	to->dev = from->dev;
410 	to->mark = from->mark;
411 
412 	/* Copy the flags to each fragment. */
413 	IPCB(to)->flags = IPCB(from)->flags;
414 
415 #ifdef CONFIG_NET_SCHED
416 	to->tc_index = from->tc_index;
417 #endif
418 	nf_copy(to, from);
419 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
420     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
421 	to->nf_trace = from->nf_trace;
422 #endif
423 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
424 	to->ipvs_property = from->ipvs_property;
425 #endif
426 	skb_copy_secmark(to, from);
427 }
428 
429 /*
430  *	This IP datagram is too large to be sent in one piece.  Break it up into
431  *	smaller pieces (each consisting of an IP header plus a block of the
432  *	data of the original IP datagram) that will still fit in a
433  *	single device frame, and queue each such frame for sending.
434  */
435 
436 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
437 {
438 	struct iphdr *iph;
439 	int ptr;
440 	struct net_device *dev;
441 	struct sk_buff *skb2;
442 	unsigned int mtu, hlen, left, len, ll_rs;
443 	int offset;
444 	__be16 not_last_frag;
445 	struct rtable *rt = skb_rtable(skb);
446 	int err = 0;
447 
448 	dev = rt->dst.dev;
449 
450 	/*
451 	 *	Point into the IP datagram header.
452 	 */
453 
454 	iph = ip_hdr(skb);
455 
456 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
457 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
458 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
459 			  htonl(ip_skb_dst_mtu(skb)));
460 		kfree_skb(skb);
461 		return -EMSGSIZE;
462 	}
463 
464 	/*
465 	 *	Setup starting values.
466 	 */
467 
468 	hlen = iph->ihl * 4;
469 	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
470 #ifdef CONFIG_BRIDGE_NETFILTER
471 	if (skb->nf_bridge)
472 		mtu -= nf_bridge_mtu_reduction(skb);
473 #endif
474 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
475 
476 	/* When a frag_list is given, use it. First, check its validity:
477 	 * some transformers could create a wrong frag_list or break an
478 	 * existing one; that is not prohibited. In this case fall back to copying.
479 	 *
480 	 * LATER: this step can be merged into the real generation of fragments;
481 	 * we can switch to copying when we see the first bad fragment.
482 	 */
483 	if (skb_has_frag_list(skb)) {
484 		struct sk_buff *frag, *frag2;
485 		int first_len = skb_pagelen(skb);
486 
487 		if (first_len - hlen > mtu ||
488 		    ((first_len - hlen) & 7) ||
489 		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
490 		    skb_cloned(skb))
491 			goto slow_path;
492 
493 		skb_walk_frags(skb, frag) {
494 			/* Correct geometry. */
495 			if (frag->len > mtu ||
496 			    ((frag->len & 7) && frag->next) ||
497 			    skb_headroom(frag) < hlen)
498 				goto slow_path_clean;
499 
500 			/* Partially cloned skb? */
501 			if (skb_shared(frag))
502 				goto slow_path_clean;
503 
504 			BUG_ON(frag->sk);
505 			if (skb->sk) {
506 				frag->sk = skb->sk;
507 				frag->destructor = sock_wfree;
508 			}
509 			skb->truesize -= frag->truesize;
510 		}
511 
512 		/* Everything is OK. Generate! */
513 
514 		err = 0;
515 		offset = 0;
516 		frag = skb_shinfo(skb)->frag_list;
517 		skb_frag_list_init(skb);
518 		skb->data_len = first_len - skb_headlen(skb);
519 		skb->len = first_len;
520 		iph->tot_len = htons(first_len);
521 		iph->frag_off = htons(IP_MF);
522 		ip_send_check(iph);
523 
524 		for (;;) {
525 			/* Prepare the header of the next frame
526 			 * before the previous one goes down. */
527 			if (frag) {
528 				frag->ip_summed = CHECKSUM_NONE;
529 				skb_reset_transport_header(frag);
530 				__skb_push(frag, hlen);
531 				skb_reset_network_header(frag);
532 				memcpy(skb_network_header(frag), iph, hlen);
533 				iph = ip_hdr(frag);
534 				iph->tot_len = htons(frag->len);
535 				ip_copy_metadata(frag, skb);
536 				if (offset == 0)
537 					ip_options_fragment(frag);
538 				offset += skb->len - hlen;
539 				iph->frag_off = htons(offset>>3);
540 				if (frag->next != NULL)
541 					iph->frag_off |= htons(IP_MF);
542 				/* Ready, complete checksum */
543 				ip_send_check(iph);
544 			}
545 
546 			err = output(skb);
547 
548 			if (!err)
549 				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
550 			if (err || !frag)
551 				break;
552 
553 			skb = frag;
554 			frag = skb->next;
555 			skb->next = NULL;
556 		}
557 
558 		if (err == 0) {
559 			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
560 			return 0;
561 		}
562 
563 		while (frag) {
564 			skb = frag->next;
565 			kfree_skb(frag);
566 			frag = skb;
567 		}
568 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
569 		return err;
570 
571 slow_path_clean:
572 		skb_walk_frags(skb, frag2) {
573 			if (frag2 == frag)
574 				break;
575 			frag2->sk = NULL;
576 			frag2->destructor = NULL;
577 			skb->truesize += frag2->truesize;
578 		}
579 	}
580 
581 slow_path:
582 	left = skb->len - hlen;		/* Space per frame */
583 	ptr = hlen;		/* Where to start from */
584 
585 	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
586 	 * we need to make room for the encapsulating header
587 	 */
588 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
589 
590 	/*
591 	 *	Fragment the datagram.
592 	 */
593 
594 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
595 	not_last_frag = iph->frag_off & htons(IP_MF);
596 
597 	/*
598 	 *	Keep copying data until we run out.
599 	 */
600 
601 	while (left > 0) {
602 		len = left;
603 		/* IF: it doesn't fit, use 'mtu' - the data space left */
604 		if (len > mtu)
605 			len = mtu;
606 		/* IF: we are not sending up to and including the packet end
607 		   then align the next start on an eight byte boundary */
608 		if (len < left)	{
609 			len &= ~7;
610 		}
611 		/*
612 		 *	Allocate buffer.
613 		 */
614 
615 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
616 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
617 			err = -ENOMEM;
618 			goto fail;
619 		}
620 
621 		/*
622 		 *	Set up data on packet
623 		 */
624 
625 		ip_copy_metadata(skb2, skb);
626 		skb_reserve(skb2, ll_rs);
627 		skb_put(skb2, len + hlen);
628 		skb_reset_network_header(skb2);
629 		skb2->transport_header = skb2->network_header + hlen;
630 
631 		/*
632 		 *	Charge the memory for the fragment to any owner
633 		 *	it might possess
634 		 */
635 
636 		if (skb->sk)
637 			skb_set_owner_w(skb2, skb->sk);
638 
639 		/*
640 		 *	Copy the packet header into the new buffer.
641 		 */
642 
643 		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
644 
645 		/*
646 		 *	Copy a block of the IP datagram.
647 		 */
648 		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
649 			BUG();
650 		left -= len;
651 
652 		/*
653 		 *	Fill in the new header fields.
654 		 */
655 		iph = ip_hdr(skb2);
656 		iph->frag_off = htons((offset >> 3));
657 
658 		/* ANK: dirty, but effective trick. Upgrade options only if
659 		 * the segment to be fragmented was THE FIRST (otherwise,
660 		 * options are already fixed) and make it ONCE
661 		 * on the initial skb, so that all the following fragments
662 		 * will inherit fixed options.
663 		 */
664 		if (offset == 0)
665 			ip_options_fragment(skb);
666 
667 		/*
668 		 *	Added AC : If we are fragmenting a fragment that's not the
669 		 *		   last fragment then keep MF set on each fragment
670 		 */
671 		if (left > 0 || not_last_frag)
672 			iph->frag_off |= htons(IP_MF);
673 		ptr += len;
674 		offset += len;
675 
676 		/*
677 		 *	Put this fragment into the sending queue.
678 		 */
679 		iph->tot_len = htons(len + hlen);
680 
681 		ip_send_check(iph);
682 
683 		err = output(skb2);
684 		if (err)
685 			goto fail;
686 
687 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
688 	}
689 	kfree_skb(skb);
690 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
691 	return err;
692 
693 fail:
694 	kfree_skb(skb);
695 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
696 	return err;
697 }
698 EXPORT_SYMBOL(ip_fragment);
699 
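/*
 * getfrag() helper for user data held in an iovec: copy 'len' bytes
 * starting at 'offset' into the skb.  If the device will checksum the
 * packet (CHECKSUM_PARTIAL) a plain copy is enough; otherwise compute
 * the partial checksum while copying and fold it into skb->csum.
 */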
700 int
701 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
702 {
703 	struct iovec *iov = from;
704 
705 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
706 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
707 			return -EFAULT;
708 	} else {
709 		__wsum csum = 0;
710 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
711 			return -EFAULT;
712 		skb->csum = csum_block_add(skb->csum, csum, odd);
713 	}
714 	return 0;
715 }
716 EXPORT_SYMBOL(ip_generic_getfrag);
717 
718 static inline __wsum
719 csum_page(struct page *page, int offset, int copy)
720 {
721 	char *kaddr;
722 	__wsum csum;
723 	kaddr = kmap(page);
724 	csum = csum_partial(kaddr + offset, copy, 0);
725 	kunmap(page);
726 	return csum;
727 }
728 
729 static inline int ip_ufo_append_data(struct sock *sk,
730 			struct sk_buff_head *queue,
731 			int getfrag(void *from, char *to, int offset, int len,
732 			       int odd, struct sk_buff *skb),
733 			void *from, int length, int hh_len, int fragheaderlen,
734 			int transhdrlen, int mtu, unsigned int flags)
735 {
736 	struct sk_buff *skb;
737 	int err;
738 
739 	/* The network device supports UDP fragmentation offload, so
740 	 * create one single skb packet containing the complete
741 	 * UDP datagram.
742 	 */
743 	if ((skb = skb_peek_tail(queue)) == NULL) {
744 		skb = sock_alloc_send_skb(sk,
745 			hh_len + fragheaderlen + transhdrlen + 20,
746 			(flags & MSG_DONTWAIT), &err);
747 
748 		if (skb == NULL)
749 			return err;
750 
751 		/* reserve space for Hardware header */
752 		skb_reserve(skb, hh_len);
753 
754 		/* create space for UDP/IP header */
755 		skb_put(skb, fragheaderlen + transhdrlen);
756 
757 		/* initialize network header pointer */
758 		skb_reset_network_header(skb);
759 
760 		/* initialize protocol header pointer */
761 		skb->transport_header = skb->network_header + fragheaderlen;
762 
763 		skb->ip_summed = CHECKSUM_PARTIAL;
764 		skb->csum = 0;
765 
766 		/* specify the length of each IP datagram fragment */
767 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
768 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
769 		__skb_queue_tail(queue, skb);
770 	}
771 
772 	return skb_append_datato_frags(sk, skb, getfrag, from,
773 				       (length - transhdrlen));
774 }
775 
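/*
 * Core append routine: chop the caller's data into MTU-sized skbs (or
 * hand the whole datagram to the device via UFO), charge them to the
 * socket and queue them on 'queue'.  Room is reserved for the IP
 * header, which is filled in later by __ip_make_skb().
 */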
776 static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
777 			    struct inet_cork *cork,
778 			    int getfrag(void *from, char *to, int offset,
779 					int len, int odd, struct sk_buff *skb),
780 			    void *from, int length, int transhdrlen,
781 			    unsigned int flags)
782 {
783 	struct inet_sock *inet = inet_sk(sk);
784 	struct sk_buff *skb;
785 
786 	struct ip_options *opt = cork->opt;
787 	int hh_len;
788 	int exthdrlen;
789 	int mtu;
790 	int copy;
791 	int err;
792 	int offset = 0;
793 	unsigned int maxfraglen, fragheaderlen;
794 	int csummode = CHECKSUM_NONE;
795 	struct rtable *rt = (struct rtable *)cork->dst;
796 
797 	exthdrlen = transhdrlen ? rt->dst.header_len : 0;
798 	length += exthdrlen;
799 	transhdrlen += exthdrlen;
800 	mtu = cork->fragsize;
801 
802 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
803 
804 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
805 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
806 
807 	if (cork->length + length > 0xFFFF - fragheaderlen) {
808 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
809 			       mtu-exthdrlen);
810 		return -EMSGSIZE;
811 	}
812 
813 	/*
814 	 * transhdrlen > 0 means that this is the first fragment and we want
815 	 * it not to be fragmented in the future.
816 	 */
817 	if (transhdrlen &&
818 	    length + fragheaderlen <= mtu &&
819 	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
820 	    !exthdrlen)
821 		csummode = CHECKSUM_PARTIAL;
822 
823 	skb = skb_peek_tail(queue);
824 
825 	cork->length += length;
826 	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
827 	    (sk->sk_protocol == IPPROTO_UDP) &&
828 	    (rt->dst.dev->features & NETIF_F_UFO)) {
829 		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
830 					 hh_len, fragheaderlen, transhdrlen,
831 					 mtu, flags);
832 		if (err)
833 			goto error;
834 		return 0;
835 	}
836 
837 	/* So, what's going on in the loop below?
838 	 *
839 	 * We use the calculated fragment length to generate a chained skb;
840 	 * each of the segments is an IP fragment ready for sending to the
841 	 * network once the appropriate IP header has been added.
842 	 */
843 
844 	if (!skb)
845 		goto alloc_new_skb;
846 
847 	while (length > 0) {
848 		/* Check if the remaining data fits into current packet. */
849 		copy = mtu - skb->len;
850 		if (copy < length)
851 			copy = maxfraglen - skb->len;
852 		if (copy <= 0) {
853 			char *data;
854 			unsigned int datalen;
855 			unsigned int fraglen;
856 			unsigned int fraggap;
857 			unsigned int alloclen;
858 			struct sk_buff *skb_prev;
859 alloc_new_skb:
860 			skb_prev = skb;
861 			if (skb_prev)
862 				fraggap = skb_prev->len - maxfraglen;
863 			else
864 				fraggap = 0;
865 
866 			/*
867 			 * If remaining data exceeds the mtu,
868 			 * we know we need more fragment(s).
869 			 */
870 			datalen = length + fraggap;
871 			if (datalen > mtu - fragheaderlen)
872 				datalen = maxfraglen - fragheaderlen;
873 			fraglen = datalen + fragheaderlen;
874 
875 			if ((flags & MSG_MORE) &&
876 			    !(rt->dst.dev->features&NETIF_F_SG))
877 				alloclen = mtu;
878 			else
879 				alloclen = fraglen;
880 
881 			/* The last fragment gets additional space at the tail.
882 			 * Note that with MSG_MORE we overallocate on fragments,
883 			 * because we have no idea which fragment will be
884 			 * the last.
885 			 */
886 			if (datalen == length + fraggap) {
887 				alloclen += rt->dst.trailer_len;
888 				/* make sure mtu is not reached */
889 				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
890 					datalen -= ALIGN(rt->dst.trailer_len, 8);
891 			}
892 			if (transhdrlen) {
893 				skb = sock_alloc_send_skb(sk,
894 						alloclen + hh_len + 15,
895 						(flags & MSG_DONTWAIT), &err);
896 			} else {
897 				skb = NULL;
898 				if (atomic_read(&sk->sk_wmem_alloc) <=
899 				    2 * sk->sk_sndbuf)
900 					skb = sock_wmalloc(sk,
901 							   alloclen + hh_len + 15, 1,
902 							   sk->sk_allocation);
903 				if (unlikely(skb == NULL))
904 					err = -ENOBUFS;
905 				else
906 					/* only the initial fragment is
907 					   time stamped */
908 					cork->tx_flags = 0;
909 			}
910 			if (skb == NULL)
911 				goto error;
912 
913 			/*
914 			 *	Fill in the control structures
915 			 */
916 			skb->ip_summed = csummode;
917 			skb->csum = 0;
918 			skb_reserve(skb, hh_len);
919 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
920 
921 			/*
922 			 *	Find where to start putting bytes.
923 			 */
924 			data = skb_put(skb, fraglen);
925 			skb_set_network_header(skb, exthdrlen);
926 			skb->transport_header = (skb->network_header +
927 						 fragheaderlen);
928 			data += fragheaderlen;
929 
930 			if (fraggap) {
931 				skb->csum = skb_copy_and_csum_bits(
932 					skb_prev, maxfraglen,
933 					data + transhdrlen, fraggap, 0);
934 				skb_prev->csum = csum_sub(skb_prev->csum,
935 							  skb->csum);
936 				data += fraggap;
937 				pskb_trim_unique(skb_prev, maxfraglen);
938 			}
939 
940 			copy = datalen - transhdrlen - fraggap;
941 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
942 				err = -EFAULT;
943 				kfree_skb(skb);
944 				goto error;
945 			}
946 
947 			offset += copy;
948 			length -= datalen - fraggap;
949 			transhdrlen = 0;
950 			exthdrlen = 0;
951 			csummode = CHECKSUM_NONE;
952 
953 			/*
954 			 * Put the packet on the pending queue.
955 			 */
956 			__skb_queue_tail(queue, skb);
957 			continue;
958 		}
959 
960 		if (copy > length)
961 			copy = length;
962 
963 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
964 			unsigned int off;
965 
966 			off = skb->len;
967 			if (getfrag(from, skb_put(skb, copy),
968 					offset, copy, off, skb) < 0) {
969 				__skb_trim(skb, off);
970 				err = -EFAULT;
971 				goto error;
972 			}
973 		} else {
974 			int i = skb_shinfo(skb)->nr_frags;
975 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
976 			struct page *page = cork->page;
977 			int off = cork->off;
978 			unsigned int left;
979 
980 			if (page && (left = PAGE_SIZE - off) > 0) {
981 				if (copy >= left)
982 					copy = left;
983 				if (page != frag->page) {
984 					if (i == MAX_SKB_FRAGS) {
985 						err = -EMSGSIZE;
986 						goto error;
987 					}
988 					get_page(page);
989 					skb_fill_page_desc(skb, i, page, off, 0);
990 					frag = &skb_shinfo(skb)->frags[i];
991 				}
992 			} else if (i < MAX_SKB_FRAGS) {
993 				if (copy > PAGE_SIZE)
994 					copy = PAGE_SIZE;
995 				page = alloc_pages(sk->sk_allocation, 0);
996 				if (page == NULL)  {
997 					err = -ENOMEM;
998 					goto error;
999 				}
1000 				cork->page = page;
1001 				cork->off = 0;
1002 
1003 				skb_fill_page_desc(skb, i, page, 0, 0);
1004 				frag = &skb_shinfo(skb)->frags[i];
1005 			} else {
1006 				err = -EMSGSIZE;
1007 				goto error;
1008 			}
1009 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1010 				err = -EFAULT;
1011 				goto error;
1012 			}
1013 			cork->off += copy;
1014 			frag->size += copy;
1015 			skb->len += copy;
1016 			skb->data_len += copy;
1017 			skb->truesize += copy;
1018 			atomic_add(copy, &sk->sk_wmem_alloc);
1019 		}
1020 		offset += copy;
1021 		length -= copy;
1022 	}
1023 
1024 	return 0;
1025 
1026 error:
1027 	cork->length -= length;
1028 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1029 	return err;
1030 }
1031 
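/*
 * Prepare the cork for a new pending datagram: stash a copy of the IP
 * options, steal the route reference from *rtp and record the fragment
 * size and transmit flags used while data is being appended.
 */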
1032 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1033 			 struct ipcm_cookie *ipc, struct rtable **rtp)
1034 {
1035 	struct inet_sock *inet = inet_sk(sk);
1036 	struct ip_options *opt;
1037 	struct rtable *rt;
1038 
1039 	/*
1040 	 * setup for corking.
1041 	 */
1042 	opt = ipc->opt;
1043 	if (opt) {
1044 		if (cork->opt == NULL) {
1045 			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1046 					    sk->sk_allocation);
1047 			if (unlikely(cork->opt == NULL))
1048 				return -ENOBUFS;
1049 		}
1050 		memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
1051 		cork->flags |= IPCORK_OPT;
1052 		cork->addr = ipc->addr;
1053 	}
1054 	rt = *rtp;
1055 	if (unlikely(!rt))
1056 		return -EFAULT;
1057 	/*
1058 	 * We steal the reference to this route; the caller must not release it.
1059 	 */
1060 	*rtp = NULL;
1061 	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1062 			 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1063 	cork->dst = &rt->dst;
1064 	cork->length = 0;
1065 	cork->tx_flags = ipc->tx_flags;
1066 	cork->page = NULL;
1067 	cork->off = 0;
1068 
1069 	return 0;
1070 }
1071 
1072 /*
1073  *	ip_append_data() and ip_append_page() can make one large IP datagram
1074  *	from many pieces of data. Each piece will be held on the socket
1075  *	until ip_push_pending_frames() is called. Each piece can be a page
1076  *	or non-page data.
1077  *
1078  *	Besides UDP, other transport protocols - e.g. raw sockets - can
1079  *	potentially use this interface.
1080  *
1081  *	LATER: length must be adjusted by pad at tail, when it is required.
1082  */
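/*
 *	For example, a datagram protocol such as UDP typically drives this
 *	interface roughly as follows (simplified sketch; real callers such
 *	as udp_sendmsg add protocol-specific handling):
 *
 *		lock_sock(sk);
 *		err = ip_append_data(sk, getfrag, data, len, transhdrlen,
 *				     &ipc, &rt, msg->msg_flags);
 *		if (err)
 *			ip_flush_pending_frames(sk);
 *		else if (!(msg->msg_flags & MSG_MORE))
 *			err = ip_push_pending_frames(sk);
 *		release_sock(sk);
 */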
1083 int ip_append_data(struct sock *sk,
1084 		   int getfrag(void *from, char *to, int offset, int len,
1085 			       int odd, struct sk_buff *skb),
1086 		   void *from, int length, int transhdrlen,
1087 		   struct ipcm_cookie *ipc, struct rtable **rtp,
1088 		   unsigned int flags)
1089 {
1090 	struct inet_sock *inet = inet_sk(sk);
1091 	int err;
1092 
1093 	if (flags&MSG_PROBE)
1094 		return 0;
1095 
1096 	if (skb_queue_empty(&sk->sk_write_queue)) {
1097 		err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
1098 		if (err)
1099 			return err;
1100 	} else {
1101 		transhdrlen = 0;
1102 	}
1103 
1104 	return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
1105 				from, length, transhdrlen, flags);
1106 }
1107 
1108 ssize_t	ip_append_page(struct sock *sk, struct page *page,
1109 		       int offset, size_t size, int flags)
1110 {
1111 	struct inet_sock *inet = inet_sk(sk);
1112 	struct sk_buff *skb;
1113 	struct rtable *rt;
1114 	struct ip_options *opt = NULL;
1115 	int hh_len;
1116 	int mtu;
1117 	int len;
1118 	int err;
1119 	unsigned int maxfraglen, fragheaderlen, fraggap;
1120 
1121 	if (inet->hdrincl)
1122 		return -EPERM;
1123 
1124 	if (flags&MSG_PROBE)
1125 		return 0;
1126 
1127 	if (skb_queue_empty(&sk->sk_write_queue))
1128 		return -EINVAL;
1129 
1130 	rt = (struct rtable *)inet->cork.dst;
1131 	if (inet->cork.flags & IPCORK_OPT)
1132 		opt = inet->cork.opt;
1133 
1134 	if (!(rt->dst.dev->features&NETIF_F_SG))
1135 		return -EOPNOTSUPP;
1136 
1137 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1138 	mtu = inet->cork.fragsize;
1139 
1140 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1141 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1142 
1143 	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1144 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1145 		return -EMSGSIZE;
1146 	}
1147 
1148 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1149 		return -EINVAL;
1150 
1151 	inet->cork.length += size;
1152 	if ((size + skb->len > mtu) &&
1153 	    (sk->sk_protocol == IPPROTO_UDP) &&
1154 	    (rt->dst.dev->features & NETIF_F_UFO)) {
1155 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1156 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1157 	}
1158 
1159 
1160 	while (size > 0) {
1161 		int i;
1162 
1163 		if (skb_is_gso(skb))
1164 			len = size;
1165 		else {
1166 
1167 			/* Check if the remaining data fits into current packet. */
1168 			len = mtu - skb->len;
1169 			if (len < size)
1170 				len = maxfraglen - skb->len;
1171 		}
1172 		if (len <= 0) {
1173 			struct sk_buff *skb_prev;
1174 			int alloclen;
1175 
1176 			skb_prev = skb;
1177 			fraggap = skb_prev->len - maxfraglen;
1178 
1179 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1180 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1181 			if (unlikely(!skb)) {
1182 				err = -ENOBUFS;
1183 				goto error;
1184 			}
1185 
1186 			/*
1187 			 *	Fill in the control structures
1188 			 */
1189 			skb->ip_summed = CHECKSUM_NONE;
1190 			skb->csum = 0;
1191 			skb_reserve(skb, hh_len);
1192 
1193 			/*
1194 			 *	Find where to start putting bytes.
1195 			 */
1196 			skb_put(skb, fragheaderlen + fraggap);
1197 			skb_reset_network_header(skb);
1198 			skb->transport_header = (skb->network_header +
1199 						 fragheaderlen);
1200 			if (fraggap) {
1201 				skb->csum = skb_copy_and_csum_bits(skb_prev,
1202 								   maxfraglen,
1203 						    skb_transport_header(skb),
1204 								   fraggap, 0);
1205 				skb_prev->csum = csum_sub(skb_prev->csum,
1206 							  skb->csum);
1207 				pskb_trim_unique(skb_prev, maxfraglen);
1208 			}
1209 
1210 			/*
1211 			 * Put the packet on the pending queue.
1212 			 */
1213 			__skb_queue_tail(&sk->sk_write_queue, skb);
1214 			continue;
1215 		}
1216 
1217 		i = skb_shinfo(skb)->nr_frags;
1218 		if (len > size)
1219 			len = size;
1220 		if (skb_can_coalesce(skb, i, page, offset)) {
1221 			skb_shinfo(skb)->frags[i-1].size += len;
1222 		} else if (i < MAX_SKB_FRAGS) {
1223 			get_page(page);
1224 			skb_fill_page_desc(skb, i, page, offset, len);
1225 		} else {
1226 			err = -EMSGSIZE;
1227 			goto error;
1228 		}
1229 
1230 		if (skb->ip_summed == CHECKSUM_NONE) {
1231 			__wsum csum;
1232 			csum = csum_page(page, offset, len);
1233 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1234 		}
1235 
1236 		skb->len += len;
1237 		skb->data_len += len;
1238 		skb->truesize += len;
1239 		atomic_add(len, &sk->sk_wmem_alloc);
1240 		offset += len;
1241 		size -= len;
1242 	}
1243 	return 0;
1244 
1245 error:
1246 	inet->cork.length -= size;
1247 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1248 	return err;
1249 }
1250 
1251 static void ip_cork_release(struct inet_cork *cork)
1252 {
1253 	cork->flags &= ~IPCORK_OPT;
1254 	kfree(cork->opt);
1255 	cork->opt = NULL;
1256 	dst_release(cork->dst);
1257 	cork->dst = NULL;
1258 }
1259 
1260 /*
1261  *	Combine all pending IP fragments on the socket into one IP datagram
1262  *	and push them out.
1263  */
1264 struct sk_buff *__ip_make_skb(struct sock *sk,
1265 			      struct sk_buff_head *queue,
1266 			      struct inet_cork *cork)
1267 {
1268 	struct sk_buff *skb, *tmp_skb;
1269 	struct sk_buff **tail_skb;
1270 	struct inet_sock *inet = inet_sk(sk);
1271 	struct net *net = sock_net(sk);
1272 	struct ip_options *opt = NULL;
1273 	struct rtable *rt = (struct rtable *)cork->dst;
1274 	struct iphdr *iph;
1275 	__be16 df = 0;
1276 	__u8 ttl;
1277 
1278 	if ((skb = __skb_dequeue(queue)) == NULL)
1279 		goto out;
1280 	tail_skb = &(skb_shinfo(skb)->frag_list);
1281 
1282 	/* move skb->data to ip header from ext header */
1283 	if (skb->data < skb_network_header(skb))
1284 		__skb_pull(skb, skb_network_offset(skb));
1285 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1286 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1287 		*tail_skb = tmp_skb;
1288 		tail_skb = &(tmp_skb->next);
1289 		skb->len += tmp_skb->len;
1290 		skb->data_len += tmp_skb->len;
1291 		skb->truesize += tmp_skb->truesize;
1292 		tmp_skb->destructor = NULL;
1293 		tmp_skb->sk = NULL;
1294 	}
1295 
1296 	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1297 	 * the frame generated here to be fragmented. No matter how transforms
1298 	 * change the size of the packet, it will come out.
1299 	 */
1300 	if (inet->pmtudisc < IP_PMTUDISC_DO)
1301 		skb->local_df = 1;
1302 
1303 	/* The DF bit is set when we want to see DF on outgoing frames.
1304 	 * If local_df is set too, we still allow this frame to be
1305 	 * fragmented locally. */
1306 	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1307 	    (skb->len <= dst_mtu(&rt->dst) &&
1308 	     ip_dont_fragment(sk, &rt->dst)))
1309 		df = htons(IP_DF);
1310 
1311 	if (cork->flags & IPCORK_OPT)
1312 		opt = cork->opt;
1313 
1314 	if (rt->rt_type == RTN_MULTICAST)
1315 		ttl = inet->mc_ttl;
1316 	else
1317 		ttl = ip_select_ttl(inet, &rt->dst);
1318 
1319 	iph = (struct iphdr *)skb->data;
1320 	iph->version = 4;
1321 	iph->ihl = 5;
1322 	if (opt) {
1323 		iph->ihl += opt->optlen>>2;
1324 		ip_options_build(skb, opt, cork->addr, rt, 0);
1325 	}
1326 	iph->tos = inet->tos;
1327 	iph->frag_off = df;
1328 	ip_select_ident(iph, &rt->dst, sk);
1329 	iph->ttl = ttl;
1330 	iph->protocol = sk->sk_protocol;
1331 	iph->saddr = rt->rt_src;
1332 	iph->daddr = rt->rt_dst;
1333 
1334 	skb->priority = sk->sk_priority;
1335 	skb->mark = sk->sk_mark;
1336 	/*
1337 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1338 	 * on dst refcount
1339 	 */
1340 	cork->dst = NULL;
1341 	skb_dst_set(skb, &rt->dst);
1342 
1343 	if (iph->protocol == IPPROTO_ICMP)
1344 		icmp_out_count(net, ((struct icmphdr *)
1345 			skb_transport_header(skb))->type);
1346 
1347 	ip_cork_release(cork);
1348 out:
1349 	return skb;
1350 }
1351 
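/*
 * Send a fully built datagram: pass it to ip_local_out(), translate
 * positive (congestion notification) return codes with net_xmit_errno()
 * and account real failures as IPSTATS_MIB_OUTDISCARDS.
 */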
1352 int ip_send_skb(struct sk_buff *skb)
1353 {
1354 	struct net *net = sock_net(skb->sk);
1355 	int err;
1356 
1357 	err = ip_local_out(skb);
1358 	if (err) {
1359 		if (err > 0)
1360 			err = net_xmit_errno(err);
1361 		if (err)
1362 			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1363 	}
1364 
1365 	return err;
1366 }
1367 
1368 int ip_push_pending_frames(struct sock *sk)
1369 {
1370 	struct sk_buff *skb;
1371 
1372 	skb = ip_finish_skb(sk);
1373 	if (!skb)
1374 		return 0;
1375 
1376 	/* Netfilter gets the whole, not yet fragmented skb. */
1377 	return ip_send_skb(skb);
1378 }
1379 
1380 /*
1381  *	Throw away all pending data on the socket.
1382  */
1383 static void __ip_flush_pending_frames(struct sock *sk,
1384 				      struct sk_buff_head *queue,
1385 				      struct inet_cork *cork)
1386 {
1387 	struct sk_buff *skb;
1388 
1389 	while ((skb = __skb_dequeue_tail(queue)) != NULL)
1390 		kfree_skb(skb);
1391 
1392 	ip_cork_release(cork);
1393 }
1394 
1395 void ip_flush_pending_frames(struct sock *sk)
1396 {
1397 	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
1398 }
1399 
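/*
 * Single-shot variant of the append interface: build a complete
 * datagram on a private queue with its own cork, without touching the
 * socket's write queue, and return the resulting skb (or an ERR_PTR).
 */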
1400 struct sk_buff *ip_make_skb(struct sock *sk,
1401 			    int getfrag(void *from, char *to, int offset,
1402 					int len, int odd, struct sk_buff *skb),
1403 			    void *from, int length, int transhdrlen,
1404 			    struct ipcm_cookie *ipc, struct rtable **rtp,
1405 			    unsigned int flags)
1406 {
1407 	struct inet_cork cork = {};
1408 	struct sk_buff_head queue;
1409 	int err;
1410 
1411 	if (flags & MSG_PROBE)
1412 		return NULL;
1413 
1414 	__skb_queue_head_init(&queue);
1415 
1416 	err = ip_setup_cork(sk, &cork, ipc, rtp);
1417 	if (err)
1418 		return ERR_PTR(err);
1419 
1420 	err = __ip_append_data(sk, &queue, &cork, getfrag,
1421 			       from, length, transhdrlen, flags);
1422 	if (err) {
1423 		__ip_flush_pending_frames(sk, &queue, &cork);
1424 		return ERR_PTR(err);
1425 	}
1426 
1427 	return __ip_make_skb(sk, &queue, &cork);
1428 }
1429 
1430 /*
1431  *	Fetch data from kernel space and fill in checksum if needed.
1432  */
1433 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1434 			      int len, int odd, struct sk_buff *skb)
1435 {
1436 	__wsum csum;
1437 
1438 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1439 	skb->csum = csum_block_add(skb->csum, csum, odd);
1440 	return 0;
1441 }
1442 
1443 /*
1444  *	Generic function to send a packet as a reply to another packet.
1445  *	Used to send TCP resets so far. ICMP should use this function too.
1446  *
1447  *	Should run single-threaded per socket because it uses the sock
1448  *	structure to pass arguments.
1449  */
1450 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1451 		   unsigned int len)
1452 {
1453 	struct inet_sock *inet = inet_sk(sk);
1454 	struct {
1455 		struct ip_options	opt;
1456 		char			data[40];
1457 	} replyopts;
1458 	struct ipcm_cookie ipc;
1459 	__be32 daddr;
1460 	struct rtable *rt = skb_rtable(skb);
1461 
1462 	if (ip_options_echo(&replyopts.opt, skb))
1463 		return;
1464 
1465 	daddr = ipc.addr = rt->rt_src;
1466 	ipc.opt = NULL;
1467 	ipc.tx_flags = 0;
1468 
1469 	if (replyopts.opt.optlen) {
1470 		ipc.opt = &replyopts.opt;
1471 
1472 		if (ipc.opt->srr)
1473 			daddr = replyopts.opt.faddr;
1474 	}
1475 
1476 	{
1477 		struct flowi4 fl4 = {
1478 			.flowi4_oif = arg->bound_dev_if,
1479 			.daddr = daddr,
1480 			.saddr = rt->rt_spec_dst,
1481 			.flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
1482 			.fl4_sport = tcp_hdr(skb)->dest,
1483 			.fl4_dport = tcp_hdr(skb)->source,
1484 			.flowi4_proto = sk->sk_protocol,
1485 			.flowi4_flags = ip_reply_arg_flowi_flags(arg),
1486 		};
1487 		security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1488 		rt = ip_route_output_key(sock_net(sk), &fl4);
1489 		if (IS_ERR(rt))
1490 			return;
1491 	}
1492 
1493 	/* And let IP do all the hard work.
1494 
1495 	   This chunk is not reentrant, hence the spinlock.
1496 	   Note that it relies on the fact that this function is called
1497 	   with BH locally disabled and that sk cannot already be spinlocked.
1498 	 */
1499 	bh_lock_sock(sk);
1500 	inet->tos = ip_hdr(skb)->tos;
1501 	sk->sk_priority = skb->priority;
1502 	sk->sk_protocol = ip_hdr(skb)->protocol;
1503 	sk->sk_bound_dev_if = arg->bound_dev_if;
1504 	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1505 		       &ipc, &rt, MSG_DONTWAIT);
1506 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1507 		if (arg->csumoffset >= 0)
1508 			*((__sum16 *)skb_transport_header(skb) +
1509 			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1510 								arg->csum));
1511 		skb->ip_summed = CHECKSUM_NONE;
1512 		ip_push_pending_frames(sk);
1513 	}
1514 
1515 	bh_unlock_sock(sk);
1516 
1517 	ip_rt_put(rt);
1518 }
1519 
1520 void __init ip_init(void)
1521 {
1522 	ip_rt_init();
1523 	inet_initpeers();
1524 
1525 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1526 	igmp_mc_proc_init();
1527 #endif
1528 }
1529