1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Donald Becker, <becker@super.org>
11  *		Alan Cox, <Alan.Cox@linux.org>
12  *		Richard Underwood
13  *		Stefan Becker, <stefanb@yello.ping.de>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16  *		Hirokazu Takahashi, <taka@valinux.co.jp>
17  *
18  *	See ip_input.c for original log
19  *
20  *	Fixes:
21  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23  *		Bradford Johnson:	Fix faulty handling of some frames when
24  *					no route is found.
25  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26  *					(in case if packet not accepted by
27  *					output firewall rules)
28  *		Mike McLagan	:	Routing by source
29  *		Alexey Kuznetsov:	use new route cache
30  *		Andi Kleen:		Fix broken PMTU recovery and remove
31  *					some redundant tests.
32  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
33  *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35  *					for decreased register pressure on x86
36  *					and more readability.
37  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38  *					silently drop skb instead of failing with -EPERM.
39  *		Detlev Wengorz	:	Copy protocol for fragments.
40  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41  *					datagrams.
42  *		Hirokazu Takahashi:	sendfile() on UDP works now.
43  */
44 
45 #include <asm/uaccess.h>
46 #include <linux/module.h>
47 #include <linux/types.h>
48 #include <linux/kernel.h>
49 #include <linux/mm.h>
50 #include <linux/string.h>
51 #include <linux/errno.h>
52 #include <linux/highmem.h>
53 #include <linux/slab.h>
54 
55 #include <linux/socket.h>
56 #include <linux/sockios.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/etherdevice.h>
61 #include <linux/proc_fs.h>
62 #include <linux/stat.h>
63 #include <linux/init.h>
64 
65 #include <net/snmp.h>
66 #include <net/ip.h>
67 #include <net/protocol.h>
68 #include <net/route.h>
69 #include <net/xfrm.h>
70 #include <linux/skbuff.h>
71 #include <net/sock.h>
72 #include <net/arp.h>
73 #include <net/icmp.h>
74 #include <net/checksum.h>
75 #include <net/inetpeer.h>
76 #include <linux/igmp.h>
77 #include <linux/netfilter_ipv4.h>
78 #include <linux/netfilter_bridge.h>
79 #include <linux/mroute.h>
80 #include <linux/netlink.h>
81 #include <linux/tcp.h>
82 
83 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84 EXPORT_SYMBOL(sysctl_ip_default_ttl);
85 
86 /* Generate a checksum for an outgoing IP datagram. */
87 __inline__ void ip_send_check(struct iphdr *iph)
88 {
89 	iph->check = 0;
90 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
91 }
92 EXPORT_SYMBOL(ip_send_check);
93 
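/*
 * Finalize the IP header (total length and checksum) and run the packet
 * through the NF_INET_LOCAL_OUT netfilter hook.  ip_local_out() below
 * calls dst_output() itself when the hook verdict is "continue" (return 1).
 */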
94 int __ip_local_out(struct sk_buff *skb)
95 {
96 	struct iphdr *iph = ip_hdr(skb);
97 
98 	iph->tot_len = htons(skb->len);
99 	ip_send_check(iph);
100 	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
101 		       skb_dst(skb)->dev, dst_output);
102 }
103 
104 int ip_local_out(struct sk_buff *skb)
105 {
106 	int err;
107 
108 	err = __ip_local_out(skb);
109 	if (likely(err == 1))
110 		err = dst_output(skb);
111 
112 	return err;
113 }
114 EXPORT_SYMBOL_GPL(ip_local_out);
115 
116 /* dev_loopback_xmit for use with netfilter. */
117 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
118 {
119 	skb_reset_mac_header(newskb);
120 	__skb_pull(newskb, skb_network_offset(newskb));
121 	newskb->pkt_type = PACKET_LOOPBACK;
122 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
123 	WARN_ON(!skb_dst(newskb));
124 	skb_dst_force(newskb);
125 	netif_rx_ni(newskb);
126 	return 0;
127 }
128 
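/*
 * Pick the TTL for an outgoing packet: the socket's unicast TTL if it has
 * been set explicitly (uc_ttl >= 0), otherwise the default hop limit for
 * the route.
 */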
129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 {
131 	int ttl = inet->uc_ttl;
132 
133 	if (ttl < 0)
134 		ttl = ip4_dst_hoplimit(dst);
135 	return ttl;
136 }
137 
138 /*
139  *		Add an ip header to a skbuff and send it out.
140  *
141  */
142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
144 {
145 	struct inet_sock *inet = inet_sk(sk);
146 	struct rtable *rt = skb_rtable(skb);
147 	struct iphdr *iph;
148 
149 	/* Build the IP header. */
150 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151 	skb_reset_network_header(skb);
152 	iph = ip_hdr(skb);
153 	iph->version  = 4;
154 	iph->ihl      = 5;
155 	iph->tos      = inet->tos;
156 	if (ip_dont_fragment(sk, &rt->dst))
157 		iph->frag_off = htons(IP_DF);
158 	else
159 		iph->frag_off = 0;
160 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
161 	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162 	iph->saddr    = saddr;
163 	iph->protocol = sk->sk_protocol;
164 	ip_select_ident(skb, &rt->dst, sk);
165 
166 	if (opt && opt->opt.optlen) {
167 		iph->ihl += opt->opt.optlen>>2;
168 		ip_options_build(skb, &opt->opt, daddr, rt, 0);
169 	}
170 
171 	skb->priority = sk->sk_priority;
172 	skb->mark = sk->sk_mark;
173 
174 	/* Send it out. */
175 	return ip_local_out(skb);
176 }
177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178 
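/*
 * Final transmission step: make sure there is enough headroom for the
 * device's link-layer header, then hand the packet to the neighbour entry
 * attached to the route for output on the wire.
 */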
179 static inline int ip_finish_output2(struct sk_buff *skb)
180 {
181 	struct dst_entry *dst = skb_dst(skb);
182 	struct rtable *rt = (struct rtable *)dst;
183 	struct net_device *dev = dst->dev;
184 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 	struct neighbour *neigh;
186 
187 	if (rt->rt_type == RTN_MULTICAST) {
188 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
189 	} else if (rt->rt_type == RTN_BROADCAST)
190 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
191 
192 	/* Be paranoid, rather than too clever. */
193 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
194 		struct sk_buff *skb2;
195 
196 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
197 		if (skb2 == NULL) {
198 			kfree_skb(skb);
199 			return -ENOMEM;
200 		}
201 		if (skb->sk)
202 			skb_set_owner_w(skb2, skb->sk);
203 		kfree_skb(skb);
204 		skb = skb2;
205 	}
206 
207 	rcu_read_lock();
208 	neigh = dst_get_neighbour_noref(dst);
209 	if (neigh) {
210 		int res = neigh_output(neigh, skb);
211 
212 		rcu_read_unlock();
213 		return res;
214 	}
215 	rcu_read_unlock();
216 
217 	if (net_ratelimit())
218 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
219 	kfree_skb(skb);
220 	return -EINVAL;
221 }
222 
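/*
 * MTU to use when deciding whether to fragment: the raw device MTU if the
 * sending socket is probing the path MTU (IP_PMTUDISC_PROBE), otherwise the
 * path MTU cached in the destination entry.
 */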
223 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
224 {
225 	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
226 
227 	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
228 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
229 }
230 
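/*
 * Called once the POST_ROUTING hook has accepted the packet (or when that
 * hook is skipped): re-enter dst_output() if an xfrm policy was attached
 * after SNAT, fragment oversized non-GSO packets, and otherwise transmit
 * directly via ip_finish_output2().
 */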
231 static int ip_finish_output(struct sk_buff *skb)
232 {
233 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
234 	/* Policy lookup after SNAT yielded a new policy */
235 	if (skb_dst(skb)->xfrm != NULL) {
236 		IPCB(skb)->flags |= IPSKB_REROUTED;
237 		return dst_output(skb);
238 	}
239 #endif
240 	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
241 		return ip_fragment(skb, ip_finish_output2);
242 	else
243 		return ip_finish_output2(skb);
244 }
245 
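/*
 * Output path for multicast and broadcast packets: loop a clone back to
 * local listeners where required, drop multicasts with TTL 0 rather than
 * letting them leave the host, and send the original through the
 * POST_ROUTING hook to ip_finish_output().
 */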
246 int ip_mc_output(struct sk_buff *skb)
247 {
248 	struct sock *sk = skb->sk;
249 	struct rtable *rt = skb_rtable(skb);
250 	struct net_device *dev = rt->dst.dev;
251 
252 	/*
253 	 *	If the indicated interface is up and running, send the packet.
254 	 */
255 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
256 
257 	skb->dev = dev;
258 	skb->protocol = htons(ETH_P_IP);
259 
260 	/*
261 	 *	Multicasts are looped back for other local users
262 	 */
263 
264 	if (rt->rt_flags&RTCF_MULTICAST) {
265 		if (sk_mc_loop(sk)
266 #ifdef CONFIG_IP_MROUTE
267 		/* Small optimization: do not loop back non-local frames
268 		   that were returned after forwarding; they will be dropped
269 		   by ip_mr_input in any case.
270 		   Note that local frames are looped back to be delivered
271 		   to local recipients.
272 
273 		   This check is duplicated in ip_mr_input at the moment.
274 		 */
275 		    &&
276 		    ((rt->rt_flags & RTCF_LOCAL) ||
277 		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
278 #endif
279 		   ) {
280 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
281 			if (newskb)
282 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
283 					newskb, NULL, newskb->dev,
284 					ip_dev_loopback_xmit);
285 		}
286 
287 		/* Multicasts with ttl 0 must not go beyond the host */
288 
289 		if (ip_hdr(skb)->ttl == 0) {
290 			kfree_skb(skb);
291 			return 0;
292 		}
293 	}
294 
295 	if (rt->rt_flags&RTCF_BROADCAST) {
296 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
297 		if (newskb)
298 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
299 				NULL, newskb->dev, ip_dev_loopback_xmit);
300 	}
301 
302 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
303 			    skb->dev, ip_finish_output,
304 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
305 }
306 
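/*
 * Standard output entry point for locally generated unicast packets:
 * update statistics and run the POST_ROUTING hook, skipping the hook when
 * the packet was already rerouted (IPSKB_REROUTED).
 */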
307 int ip_output(struct sk_buff *skb)
308 {
309 	struct net_device *dev = skb_dst(skb)->dev;
310 
311 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
312 
313 	skb->dev = dev;
314 	skb->protocol = htons(ETH_P_IP);
315 
316 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
317 			    ip_finish_output,
318 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
319 }
320 
321 /*
322  * copy saddr and daddr, possibly using 64bit load/stores
323  * Equivalent to :
324  *   iph->saddr = fl4->saddr;
325  *   iph->daddr = fl4->daddr;
326  */
327 static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
328 {
329 	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
330 		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
331 	memcpy(&iph->saddr, &fl4->saddr,
332 	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
333 }
334 
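/*
 * Queue a packet from a connected socket (e.g. TCP) for transmission:
 * route it unless it is already routed, build the IP header including any
 * options, select the IP ID, and hand the result to ip_local_out().
 */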
335 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
336 {
337 	struct sock *sk = skb->sk;
338 	struct inet_sock *inet = inet_sk(sk);
339 	struct ip_options_rcu *inet_opt;
340 	struct flowi4 *fl4;
341 	struct rtable *rt;
342 	struct iphdr *iph;
343 	int res;
344 
345 	/* Skip all of this if the packet is already routed,
346 	 * e.g. by something like SCTP.
347 	 */
348 	rcu_read_lock();
349 	inet_opt = rcu_dereference(inet->inet_opt);
350 	fl4 = &fl->u.ip4;
351 	rt = skb_rtable(skb);
352 	if (rt != NULL)
353 		goto packet_routed;
354 
355 	/* Make sure we can route this packet. */
356 	rt = (struct rtable *)__sk_dst_check(sk, 0);
357 	if (rt == NULL) {
358 		__be32 daddr;
359 
360 		/* Use correct destination address if we have options. */
361 		daddr = inet->inet_daddr;
362 		if (inet_opt && inet_opt->opt.srr)
363 			daddr = inet_opt->opt.faddr;
364 
365 	/* If this fails, the retransmit mechanism of the transport layer
366 	 * will keep trying until the route appears or the connection
367 	 * times itself out.
368 		 */
369 		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
370 					   daddr, inet->inet_saddr,
371 					   inet->inet_dport,
372 					   inet->inet_sport,
373 					   sk->sk_protocol,
374 					   RT_CONN_FLAGS(sk),
375 					   sk->sk_bound_dev_if);
376 		if (IS_ERR(rt))
377 			goto no_route;
378 		sk_setup_caps(sk, &rt->dst);
379 	}
380 	skb_dst_set_noref(skb, &rt->dst);
381 
382 packet_routed:
383 	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
384 		goto no_route;
385 
386 	/* OK, we know where to send it, allocate and build IP header. */
387 	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
388 	skb_reset_network_header(skb);
389 	iph = ip_hdr(skb);
390 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
391 	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
392 		iph->frag_off = htons(IP_DF);
393 	else
394 		iph->frag_off = 0;
395 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
396 	iph->protocol = sk->sk_protocol;
397 	ip_copy_addrs(iph, fl4);
398 
399 	/* Transport layer set skb->h.foo itself. */
400 
401 	if (inet_opt && inet_opt->opt.optlen) {
402 		iph->ihl += inet_opt->opt.optlen >> 2;
403 		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
404 	}
405 
406 	ip_select_ident_more(skb, &rt->dst, sk,
407 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
408 
409 	skb->priority = sk->sk_priority;
410 	skb->mark = sk->sk_mark;
411 
412 	res = ip_local_out(skb);
413 	rcu_read_unlock();
414 	return res;
415 
416 no_route:
417 	rcu_read_unlock();
418 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
419 	kfree_skb(skb);
420 	return -EHOSTUNREACH;
421 }
422 EXPORT_SYMBOL(ip_queue_xmit);
423 
424 
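/*
 * Copy per-packet metadata (packet type, priority, dst, device, marks,
 * netfilter and traffic-control state) from the original skb to a freshly
 * built fragment.
 */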
425 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
426 {
427 	to->pkt_type = from->pkt_type;
428 	to->priority = from->priority;
429 	to->protocol = from->protocol;
430 	skb_dst_drop(to);
431 	skb_dst_copy(to, from);
432 	to->dev = from->dev;
433 	to->mark = from->mark;
434 
435 	/* Copy the flags to each fragment. */
436 	IPCB(to)->flags = IPCB(from)->flags;
437 
438 #ifdef CONFIG_NET_SCHED
439 	to->tc_index = from->tc_index;
440 #endif
441 	nf_copy(to, from);
442 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
443     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
444 	to->nf_trace = from->nf_trace;
445 #endif
446 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
447 	to->ipvs_property = from->ipvs_property;
448 #endif
449 	skb_copy_secmark(to, from);
450 }
451 
452 /*
453  *	This IP datagram is too large to be sent in one piece.  Break it up into
454  *	smaller pieces (each of size equal to the IP header plus
455  *	a block of the data of the original IP data part) that will fit in a
456  *	single device frame, and queue each such frame for sending.
457  */
458 
459 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
460 {
461 	struct iphdr *iph;
462 	int ptr;
463 	struct net_device *dev;
464 	struct sk_buff *skb2;
465 	unsigned int mtu, hlen, left, len, ll_rs;
466 	int offset;
467 	__be16 not_last_frag;
468 	struct rtable *rt = skb_rtable(skb);
469 	int err = 0;
470 
471 	dev = rt->dst.dev;
472 
473 	/*
474 	 *	Point into the IP datagram header.
475 	 */
476 
477 	iph = ip_hdr(skb);
478 
479 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
480 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
481 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
482 			  htonl(ip_skb_dst_mtu(skb)));
483 		kfree_skb(skb);
484 		return -EMSGSIZE;
485 	}
486 
487 	/*
488 	 *	Setup starting values.
489 	 */
490 
491 	hlen = iph->ihl * 4;
492 	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
493 #ifdef CONFIG_BRIDGE_NETFILTER
494 	if (skb->nf_bridge)
495 		mtu -= nf_bridge_mtu_reduction(skb);
496 #endif
497 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
498 
499 	/* When frag_list is given, use it. First, check its validity:
500 	 * some transformers could create a wrong frag_list or break an existing
501 	 * one; this is not prohibited. In that case, fall back to copying.
502 	 *
503 	 * LATER: this step can be merged into the real generation of fragments;
504 	 * we can switch to copying when we see the first bad fragment.
505 	 */
506 	if (skb_has_frag_list(skb)) {
507 		struct sk_buff *frag, *frag2;
508 		int first_len = skb_pagelen(skb);
509 
510 		if (first_len - hlen > mtu ||
511 		    ((first_len - hlen) & 7) ||
512 		    ip_is_fragment(iph) ||
513 		    skb_cloned(skb))
514 			goto slow_path;
515 
516 		skb_walk_frags(skb, frag) {
517 			/* Correct geometry. */
518 			if (frag->len > mtu ||
519 			    ((frag->len & 7) && frag->next) ||
520 			    skb_headroom(frag) < hlen)
521 				goto slow_path_clean;
522 
523 			/* Partially cloned skb? */
524 			if (skb_shared(frag))
525 				goto slow_path_clean;
526 
527 			BUG_ON(frag->sk);
528 			if (skb->sk) {
529 				frag->sk = skb->sk;
530 				frag->destructor = sock_wfree;
531 			}
532 			skb->truesize -= frag->truesize;
533 		}
534 
535 		/* Everything is OK. Generate! */
536 
537 		err = 0;
538 		offset = 0;
539 		frag = skb_shinfo(skb)->frag_list;
540 		skb_frag_list_init(skb);
541 		skb->data_len = first_len - skb_headlen(skb);
542 		skb->len = first_len;
543 		iph->tot_len = htons(first_len);
544 		iph->frag_off = htons(IP_MF);
545 		ip_send_check(iph);
546 
547 		for (;;) {
548 			/* Prepare the header of the next frame
549 			 * before the previous one goes down. */
550 			if (frag) {
551 				frag->ip_summed = CHECKSUM_NONE;
552 				skb_reset_transport_header(frag);
553 				__skb_push(frag, hlen);
554 				skb_reset_network_header(frag);
555 				memcpy(skb_network_header(frag), iph, hlen);
556 				iph = ip_hdr(frag);
557 				iph->tot_len = htons(frag->len);
558 				ip_copy_metadata(frag, skb);
559 				if (offset == 0)
560 					ip_options_fragment(frag);
561 				offset += skb->len - hlen;
562 				iph->frag_off = htons(offset>>3);
563 				if (frag->next != NULL)
564 					iph->frag_off |= htons(IP_MF);
565 				/* Ready, complete checksum */
566 				ip_send_check(iph);
567 			}
568 
569 			err = output(skb);
570 
571 			if (!err)
572 				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
573 			if (err || !frag)
574 				break;
575 
576 			skb = frag;
577 			frag = skb->next;
578 			skb->next = NULL;
579 		}
580 
581 		if (err == 0) {
582 			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
583 			return 0;
584 		}
585 
586 		while (frag) {
587 			skb = frag->next;
588 			kfree_skb(frag);
589 			frag = skb;
590 		}
591 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
592 		return err;
593 
594 slow_path_clean:
595 		skb_walk_frags(skb, frag2) {
596 			if (frag2 == frag)
597 				break;
598 			frag2->sk = NULL;
599 			frag2->destructor = NULL;
600 			skb->truesize += frag2->truesize;
601 		}
602 	}
603 
604 slow_path:
605 	left = skb->len - hlen;		/* Space per frame */
606 	ptr = hlen;		/* Where to start from */
607 
608 	/* For bridged IP traffic encapsulated inside e.g. a VLAN header,
609 	 * we need to make room for the encapsulating header.
610 	 */
611 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
612 
613 	/*
614 	 *	Fragment the datagram.
615 	 */
616 
617 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
618 	not_last_frag = iph->frag_off & htons(IP_MF);
619 
620 	/*
621 	 *	Keep copying data until we run out.
622 	 */
623 
624 	while (left > 0) {
625 		len = left;
626 		/* IF: it doesn't fit, use 'mtu' - the data space left */
627 		if (len > mtu)
628 			len = mtu;
629 		/* IF: we are not sending up to and including the packet end
630 		   then align the next start on an eight byte boundary */
631 		if (len < left)	{
632 			len &= ~7;
633 		}
634 		/*
635 		 *	Allocate buffer.
636 		 */
637 
638 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
639 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
640 			err = -ENOMEM;
641 			goto fail;
642 		}
643 
644 		/*
645 		 *	Set up data on packet
646 		 */
647 
648 		ip_copy_metadata(skb2, skb);
649 		skb_reserve(skb2, ll_rs);
650 		skb_put(skb2, len + hlen);
651 		skb_reset_network_header(skb2);
652 		skb2->transport_header = skb2->network_header + hlen;
653 
654 		/*
655 		 *	Charge the memory for the fragment to any owner
656 		 *	it might possess
657 		 */
658 
659 		if (skb->sk)
660 			skb_set_owner_w(skb2, skb->sk);
661 
662 		/*
663 		 *	Copy the packet header into the new buffer.
664 		 */
665 
666 		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
667 
668 		/*
669 		 *	Copy a block of the IP datagram.
670 		 */
671 		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
672 			BUG();
673 		left -= len;
674 
675 		/*
676 		 *	Fill in the new header fields.
677 		 */
678 		iph = ip_hdr(skb2);
679 		iph->frag_off = htons((offset >> 3));
680 
681 		/* ANK: dirty, but effective trick. Upgrade options only if
682 		 * the segment to be fragmented was THE FIRST (otherwise,
683 		 * options are already fixed) and make it ONCE
684 		 * on the initial skb, so that all the following fragments
685 		 * will inherit fixed options.
686 		 */
687 		if (offset == 0)
688 			ip_options_fragment(skb);
689 
690 		/*
691 		 *	Added AC : If we are fragmenting a fragment that's not the
692 		 *		   last fragment then keep the MF bit set on each fragment
693 		 */
694 		if (left > 0 || not_last_frag)
695 			iph->frag_off |= htons(IP_MF);
696 		ptr += len;
697 		offset += len;
698 
699 		/*
700 		 *	Put this fragment into the sending queue.
701 		 */
702 		iph->tot_len = htons(len + hlen);
703 
704 		ip_send_check(iph);
705 
706 		err = output(skb2);
707 		if (err)
708 			goto fail;
709 
710 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
711 	}
712 	kfree_skb(skb);
713 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
714 	return err;
715 
716 fail:
717 	kfree_skb(skb);
718 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
719 	return err;
720 }
721 EXPORT_SYMBOL(ip_fragment);
722 
723 int
724 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
725 {
726 	struct iovec *iov = from;
727 
728 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
729 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
730 			return -EFAULT;
731 	} else {
732 		__wsum csum = 0;
733 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
734 			return -EFAULT;
735 		skb->csum = csum_block_add(skb->csum, csum, odd);
736 	}
737 	return 0;
738 }
739 EXPORT_SYMBOL(ip_generic_getfrag);
740 
741 static inline __wsum
742 csum_page(struct page *page, int offset, int copy)
743 {
744 	char *kaddr;
745 	__wsum csum;
746 	kaddr = kmap(page);
747 	csum = csum_partial(kaddr + offset, copy, 0);
748 	kunmap(page);
749 	return csum;
750 }
751 
752 static inline int ip_ufo_append_data(struct sock *sk,
753 			struct sk_buff_head *queue,
754 			int getfrag(void *from, char *to, int offset, int len,
755 			       int odd, struct sk_buff *skb),
756 			void *from, int length, int hh_len, int fragheaderlen,
757 			int transhdrlen, int maxfraglen, unsigned int flags)
758 {
759 	struct sk_buff *skb;
760 	int err;
761 
762 	/* The network device supports UDP fragmentation offload, so
763 	 * create one single skb packet containing the complete UDP
764 	 * datagram.
765 	 */
766 	if ((skb = skb_peek_tail(queue)) == NULL) {
767 		skb = sock_alloc_send_skb(sk,
768 			hh_len + fragheaderlen + transhdrlen + 20,
769 			(flags & MSG_DONTWAIT), &err);
770 
771 		if (skb == NULL)
772 			return err;
773 
774 		/* reserve space for Hardware header */
775 		skb_reserve(skb, hh_len);
776 
777 		/* create space for UDP/IP header */
778 		skb_put(skb, fragheaderlen + transhdrlen);
779 
780 		/* initialize network header pointer */
781 		skb_reset_network_header(skb);
782 
783 		/* initialize protocol header pointer */
784 		skb->transport_header = skb->network_header + fragheaderlen;
785 
786 		skb->ip_summed = CHECKSUM_PARTIAL;
787 		skb->csum = 0;
788 
789 		/* specify the length of each IP datagram fragment */
790 		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
791 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
792 		__skb_queue_tail(queue, skb);
793 	}
794 
795 	return skb_append_datato_frags(sk, skb, getfrag, from,
796 				       (length - transhdrlen));
797 }
798 
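/*
 * Core of ip_append_data(): append user or kernel data to the pending
 * queue as a chain of fragment-sized skbs (or one large UFO skb), leaving
 * room for the IP header that __ip_make_skb() fills in later.
 */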
799 static int __ip_append_data(struct sock *sk,
800 			    struct flowi4 *fl4,
801 			    struct sk_buff_head *queue,
802 			    struct inet_cork *cork,
803 			    int getfrag(void *from, char *to, int offset,
804 					int len, int odd, struct sk_buff *skb),
805 			    void *from, int length, int transhdrlen,
806 			    unsigned int flags)
807 {
808 	struct inet_sock *inet = inet_sk(sk);
809 	struct sk_buff *skb;
810 
811 	struct ip_options *opt = cork->opt;
812 	int hh_len;
813 	int exthdrlen;
814 	int mtu;
815 	int copy;
816 	int err;
817 	int offset = 0;
818 	unsigned int maxfraglen, fragheaderlen;
819 	int csummode = CHECKSUM_NONE;
820 	struct rtable *rt = (struct rtable *)cork->dst;
821 
822 	skb = skb_peek_tail(queue);
823 
824 	exthdrlen = !skb ? rt->dst.header_len : 0;
825 	mtu = cork->fragsize;
826 
827 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
828 
829 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
830 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
831 
832 	if (cork->length + length > 0xFFFF - fragheaderlen) {
833 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
834 			       mtu-exthdrlen);
835 		return -EMSGSIZE;
836 	}
837 
838 	/*
839 	 * transhdrlen > 0 means that this is the first fragment and we wish
840 	 * it not to be fragmented in the future.
841 	 */
842 	if (transhdrlen &&
843 	    length + fragheaderlen <= mtu &&
844 	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
845 	    !exthdrlen)
846 		csummode = CHECKSUM_PARTIAL;
847 
848 	cork->length += length;
849 	if (((length > mtu) || (skb && skb_has_frags(skb))) &&
850 	    (sk->sk_protocol == IPPROTO_UDP) &&
851 	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
852 		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
853 					 hh_len, fragheaderlen, transhdrlen,
854 					 maxfraglen, flags);
855 		if (err)
856 			goto error;
857 		return 0;
858 	}
859 
860 	/* So, what's going on in the loop below?
861 	 *
862 	 * We use the calculated fragment length to generate a chain of skbs;
863 	 * each segment is an IP fragment ready to be sent to the network once
864 	 * the appropriate IP header has been added.
865 	 */
866 
867 	if (!skb)
868 		goto alloc_new_skb;
869 
870 	while (length > 0) {
871 		/* Check if the remaining data fits into current packet. */
872 		copy = mtu - skb->len;
873 		if (copy < length)
874 			copy = maxfraglen - skb->len;
875 		if (copy <= 0) {
876 			char *data;
877 			unsigned int datalen;
878 			unsigned int fraglen;
879 			unsigned int fraggap;
880 			unsigned int alloclen;
881 			struct sk_buff *skb_prev;
882 alloc_new_skb:
883 			skb_prev = skb;
884 			if (skb_prev)
885 				fraggap = skb_prev->len - maxfraglen;
886 			else
887 				fraggap = 0;
888 
889 			/*
890 			 * If remaining data exceeds the mtu,
891 			 * we know we need more fragment(s).
892 			 */
893 			datalen = length + fraggap;
894 			if (datalen > mtu - fragheaderlen)
895 				datalen = maxfraglen - fragheaderlen;
896 			fraglen = datalen + fragheaderlen;
897 
898 			if ((flags & MSG_MORE) &&
899 			    !(rt->dst.dev->features&NETIF_F_SG))
900 				alloclen = mtu;
901 			else
902 				alloclen = fraglen;
903 
904 			alloclen += exthdrlen;
905 
906 			/* The last fragment gets additional space at tail.
907 			 * Note, with MSG_MORE we overallocate on fragments,
908 			 * because we have no idea which fragment will be
909 			 * the last.
910 			 */
911 			if (datalen == length + fraggap)
912 				alloclen += rt->dst.trailer_len;
913 
914 			if (transhdrlen) {
915 				skb = sock_alloc_send_skb(sk,
916 						alloclen + hh_len + 15,
917 						(flags & MSG_DONTWAIT), &err);
918 			} else {
919 				skb = NULL;
920 				if (atomic_read(&sk->sk_wmem_alloc) <=
921 				    2 * sk->sk_sndbuf)
922 					skb = sock_wmalloc(sk,
923 							   alloclen + hh_len + 15, 1,
924 							   sk->sk_allocation);
925 				if (unlikely(skb == NULL))
926 					err = -ENOBUFS;
927 				else
928 					/* only the initial fragment is
929 					   time stamped */
930 					cork->tx_flags = 0;
931 			}
932 			if (skb == NULL)
933 				goto error;
934 
935 			/*
936 			 *	Fill in the control structures
937 			 */
938 			skb->ip_summed = csummode;
939 			skb->csum = 0;
940 			skb_reserve(skb, hh_len);
941 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
942 
943 			/*
944 			 *	Find where to start putting bytes.
945 			 */
946 			data = skb_put(skb, fraglen + exthdrlen);
947 			skb_set_network_header(skb, exthdrlen);
948 			skb->transport_header = (skb->network_header +
949 						 fragheaderlen);
950 			data += fragheaderlen + exthdrlen;
951 
952 			if (fraggap) {
953 				skb->csum = skb_copy_and_csum_bits(
954 					skb_prev, maxfraglen,
955 					data + transhdrlen, fraggap, 0);
956 				skb_prev->csum = csum_sub(skb_prev->csum,
957 							  skb->csum);
958 				data += fraggap;
959 				pskb_trim_unique(skb_prev, maxfraglen);
960 			}
961 
962 			copy = datalen - transhdrlen - fraggap;
963 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
964 				err = -EFAULT;
965 				kfree_skb(skb);
966 				goto error;
967 			}
968 
969 			offset += copy;
970 			length -= datalen - fraggap;
971 			transhdrlen = 0;
972 			exthdrlen = 0;
973 			csummode = CHECKSUM_NONE;
974 
975 			/*
976 			 * Put the packet on the pending queue.
977 			 */
978 			__skb_queue_tail(queue, skb);
979 			continue;
980 		}
981 
982 		if (copy > length)
983 			copy = length;
984 
985 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
986 			unsigned int off;
987 
988 			off = skb->len;
989 			if (getfrag(from, skb_put(skb, copy),
990 					offset, copy, off, skb) < 0) {
991 				__skb_trim(skb, off);
992 				err = -EFAULT;
993 				goto error;
994 			}
995 		} else {
996 			int i = skb_shinfo(skb)->nr_frags;
997 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
998 			struct page *page = cork->page;
999 			int off = cork->off;
1000 			unsigned int left;
1001 
1002 			if (page && (left = PAGE_SIZE - off) > 0) {
1003 				if (copy >= left)
1004 					copy = left;
1005 				if (page != skb_frag_page(frag)) {
1006 					if (i == MAX_SKB_FRAGS) {
1007 						err = -EMSGSIZE;
1008 						goto error;
1009 					}
1010 					skb_fill_page_desc(skb, i, page, off, 0);
1011 					skb_frag_ref(skb, i);
1012 					frag = &skb_shinfo(skb)->frags[i];
1013 				}
1014 			} else if (i < MAX_SKB_FRAGS) {
1015 				if (copy > PAGE_SIZE)
1016 					copy = PAGE_SIZE;
1017 				page = alloc_pages(sk->sk_allocation, 0);
1018 				if (page == NULL)  {
1019 					err = -ENOMEM;
1020 					goto error;
1021 				}
1022 				cork->page = page;
1023 				cork->off = 0;
1024 
1025 				skb_fill_page_desc(skb, i, page, 0, 0);
1026 				frag = &skb_shinfo(skb)->frags[i];
1027 			} else {
1028 				err = -EMSGSIZE;
1029 				goto error;
1030 			}
1031 			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1032 				    offset, copy, skb->len, skb) < 0) {
1033 				err = -EFAULT;
1034 				goto error;
1035 			}
1036 			cork->off += copy;
1037 			skb_frag_size_add(frag, copy);
1038 			skb->len += copy;
1039 			skb->data_len += copy;
1040 			skb->truesize += copy;
1041 			atomic_add(copy, &sk->sk_wmem_alloc);
1042 		}
1043 		offset += copy;
1044 		length -= copy;
1045 	}
1046 
1047 	return 0;
1048 
1049 error:
1050 	cork->length -= length;
1051 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1052 	return err;
1053 }
1054 
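/*
 * Initialise the cork for a new corked sequence: stash a copy of the IP
 * options, steal the caller's route reference and record the fragment size
 * to use for this datagram.
 */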
1055 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1056 			 struct ipcm_cookie *ipc, struct rtable **rtp)
1057 {
1058 	struct inet_sock *inet = inet_sk(sk);
1059 	struct ip_options_rcu *opt;
1060 	struct rtable *rt;
1061 
1062 	/*
1063 	 * setup for corking.
1064 	 */
1065 	opt = ipc->opt;
1066 	if (opt) {
1067 		if (cork->opt == NULL) {
1068 			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1069 					    sk->sk_allocation);
1070 			if (unlikely(cork->opt == NULL))
1071 				return -ENOBUFS;
1072 		}
1073 		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1074 		cork->flags |= IPCORK_OPT;
1075 		cork->addr = ipc->addr;
1076 	}
1077 	rt = *rtp;
1078 	if (unlikely(!rt))
1079 		return -EFAULT;
1080 	/*
1081 	 * We steal a reference to this route; the caller should not release it.
1082 	 */
1083 	*rtp = NULL;
1084 	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1085 			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1086 	cork->dst = &rt->dst;
1087 	cork->length = 0;
1088 	cork->tx_flags = ipc->tx_flags;
1089 	cork->page = NULL;
1090 	cork->off = 0;
1091 
1092 	return 0;
1093 }
1094 
1095 /*
1096  *	ip_append_data() and ip_append_page() can make one large IP datagram
1097  *	from many pieces of data. Each piece will be held on the socket
1098  *	until ip_push_pending_frames() is called. Each piece can be a page
1099  *	or non-page data.
1100  *
1101  *	Besides UDP, other transport protocols - e.g. raw sockets - can
1102  *	potentially use this interface.
1103  *
1104  *	LATER: length must be adjusted by pad at tail, when it is required.
1105  */
1106 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1107 		   int getfrag(void *from, char *to, int offset, int len,
1108 			       int odd, struct sk_buff *skb),
1109 		   void *from, int length, int transhdrlen,
1110 		   struct ipcm_cookie *ipc, struct rtable **rtp,
1111 		   unsigned int flags)
1112 {
1113 	struct inet_sock *inet = inet_sk(sk);
1114 	int err;
1115 
1116 	if (flags&MSG_PROBE)
1117 		return 0;
1118 
1119 	if (skb_queue_empty(&sk->sk_write_queue)) {
1120 		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1121 		if (err)
1122 			return err;
1123 	} else {
1124 		transhdrlen = 0;
1125 	}
1126 
1127 	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1128 				from, length, transhdrlen, flags);
1129 }
1130 
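/*
 * Zero-copy companion to ip_append_data(): append a page of data to the
 * pending datagram (this is how sendfile() on UDP, noted in the changelog
 * above, is supported). Only valid on scatter-gather capable devices.
 */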
1131 ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1132 		       int offset, size_t size, int flags)
1133 {
1134 	struct inet_sock *inet = inet_sk(sk);
1135 	struct sk_buff *skb;
1136 	struct rtable *rt;
1137 	struct ip_options *opt = NULL;
1138 	struct inet_cork *cork;
1139 	int hh_len;
1140 	int mtu;
1141 	int len;
1142 	int err;
1143 	unsigned int maxfraglen, fragheaderlen, fraggap;
1144 
1145 	if (inet->hdrincl)
1146 		return -EPERM;
1147 
1148 	if (flags&MSG_PROBE)
1149 		return 0;
1150 
1151 	if (skb_queue_empty(&sk->sk_write_queue))
1152 		return -EINVAL;
1153 
1154 	cork = &inet->cork.base;
1155 	rt = (struct rtable *)cork->dst;
1156 	if (cork->flags & IPCORK_OPT)
1157 		opt = cork->opt;
1158 
1159 	if (!(rt->dst.dev->features&NETIF_F_SG))
1160 		return -EOPNOTSUPP;
1161 
1162 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1163 	mtu = cork->fragsize;
1164 
1165 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1166 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1167 
1168 	if (cork->length + size > 0xFFFF - fragheaderlen) {
1169 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1170 		return -EMSGSIZE;
1171 	}
1172 
1173 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1174 		return -EINVAL;
1175 
1176 	cork->length += size;
1177 	if ((size + skb->len > mtu) &&
1178 	    (sk->sk_protocol == IPPROTO_UDP) &&
1179 	    (rt->dst.dev->features & NETIF_F_UFO)) {
1180 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1181 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1182 	}
1183 
1184 
1185 	while (size > 0) {
1186 		int i;
1187 
1188 		if (skb_is_gso(skb))
1189 			len = size;
1190 		else {
1191 
1192 			/* Check if the remaining data fits into current packet. */
1193 			len = mtu - skb->len;
1194 			if (len < size)
1195 				len = maxfraglen - skb->len;
1196 		}
1197 		if (len <= 0) {
1198 			struct sk_buff *skb_prev;
1199 			int alloclen;
1200 
1201 			skb_prev = skb;
1202 			fraggap = skb_prev->len - maxfraglen;
1203 
1204 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1205 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1206 			if (unlikely(!skb)) {
1207 				err = -ENOBUFS;
1208 				goto error;
1209 			}
1210 
1211 			/*
1212 			 *	Fill in the control structures
1213 			 */
1214 			skb->ip_summed = CHECKSUM_NONE;
1215 			skb->csum = 0;
1216 			skb_reserve(skb, hh_len);
1217 
1218 			/*
1219 			 *	Find where to start putting bytes.
1220 			 */
1221 			skb_put(skb, fragheaderlen + fraggap);
1222 			skb_reset_network_header(skb);
1223 			skb->transport_header = (skb->network_header +
1224 						 fragheaderlen);
1225 			if (fraggap) {
1226 				skb->csum = skb_copy_and_csum_bits(skb_prev,
1227 								   maxfraglen,
1228 						    skb_transport_header(skb),
1229 								   fraggap, 0);
1230 				skb_prev->csum = csum_sub(skb_prev->csum,
1231 							  skb->csum);
1232 				pskb_trim_unique(skb_prev, maxfraglen);
1233 			}
1234 
1235 			/*
1236 			 * Put the packet on the pending queue.
1237 			 */
1238 			__skb_queue_tail(&sk->sk_write_queue, skb);
1239 			continue;
1240 		}
1241 
1242 		i = skb_shinfo(skb)->nr_frags;
1243 		if (len > size)
1244 			len = size;
1245 		if (skb_can_coalesce(skb, i, page, offset)) {
1246 			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
1247 		} else if (i < MAX_SKB_FRAGS) {
1248 			get_page(page);
1249 			skb_fill_page_desc(skb, i, page, offset, len);
1250 		} else {
1251 			err = -EMSGSIZE;
1252 			goto error;
1253 		}
1254 
1255 		if (skb->ip_summed == CHECKSUM_NONE) {
1256 			__wsum csum;
1257 			csum = csum_page(page, offset, len);
1258 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1259 		}
1260 
1261 		skb->len += len;
1262 		skb->data_len += len;
1263 		skb->truesize += len;
1264 		atomic_add(len, &sk->sk_wmem_alloc);
1265 		offset += len;
1266 		size -= len;
1267 	}
1268 	return 0;
1269 
1270 error:
1271 	cork->length -= size;
1272 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1273 	return err;
1274 }
1275 
1276 static void ip_cork_release(struct inet_cork *cork)
1277 {
1278 	cork->flags &= ~IPCORK_OPT;
1279 	kfree(cork->opt);
1280 	cork->opt = NULL;
1281 	dst_release(cork->dst);
1282 	cork->dst = NULL;
1283 }
1284 
1285 /*
1286  *	Combine all pending IP fragments on the socket into one IP datagram
1287  *	and push them out.
1288  */
1289 struct sk_buff *__ip_make_skb(struct sock *sk,
1290 			      struct flowi4 *fl4,
1291 			      struct sk_buff_head *queue,
1292 			      struct inet_cork *cork)
1293 {
1294 	struct sk_buff *skb, *tmp_skb;
1295 	struct sk_buff **tail_skb;
1296 	struct inet_sock *inet = inet_sk(sk);
1297 	struct net *net = sock_net(sk);
1298 	struct ip_options *opt = NULL;
1299 	struct rtable *rt = (struct rtable *)cork->dst;
1300 	struct iphdr *iph;
1301 	__be16 df = 0;
1302 	__u8 ttl;
1303 
1304 	if ((skb = __skb_dequeue(queue)) == NULL)
1305 		goto out;
1306 	tail_skb = &(skb_shinfo(skb)->frag_list);
1307 
1308 	/* move skb->data to ip header from ext header */
1309 	if (skb->data < skb_network_header(skb))
1310 		__skb_pull(skb, skb_network_offset(skb));
1311 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1312 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1313 		*tail_skb = tmp_skb;
1314 		tail_skb = &(tmp_skb->next);
1315 		skb->len += tmp_skb->len;
1316 		skb->data_len += tmp_skb->len;
1317 		skb->truesize += tmp_skb->truesize;
1318 		tmp_skb->destructor = NULL;
1319 		tmp_skb->sk = NULL;
1320 	}
1321 
1322 	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we allow
1323 	 * the frame generated here to be fragmented. No matter how transforms
1324 	 * change the size of the packet, it will come out.
1325 	 */
1326 	if (inet->pmtudisc < IP_PMTUDISC_DO)
1327 		skb->local_df = 1;
1328 
1329 	/* DF bit is set when we want to see DF on outgoing frames.
1330 	 * If local_df is set too, we still allow this frame to be fragmented
1331 	 * locally. */
1332 	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1333 	    (skb->len <= dst_mtu(&rt->dst) &&
1334 	     ip_dont_fragment(sk, &rt->dst)))
1335 		df = htons(IP_DF);
1336 
1337 	if (cork->flags & IPCORK_OPT)
1338 		opt = cork->opt;
1339 
1340 	if (rt->rt_type == RTN_MULTICAST)
1341 		ttl = inet->mc_ttl;
1342 	else
1343 		ttl = ip_select_ttl(inet, &rt->dst);
1344 
1345 	iph = ip_hdr(skb);
1346 	iph->version = 4;
1347 	iph->ihl = 5;
1348 	iph->tos = inet->tos;
1349 	iph->frag_off = df;
1350 	ip_select_ident(skb, &rt->dst, sk);
1351 	iph->ttl = ttl;
1352 	iph->protocol = sk->sk_protocol;
1353 	ip_copy_addrs(iph, fl4);
1354 
1355 	if (opt) {
1356 		iph->ihl += opt->optlen>>2;
1357 		ip_options_build(skb, opt, cork->addr, rt, 0);
1358 	}
1359 
1360 	skb->priority = sk->sk_priority;
1361 	skb->mark = sk->sk_mark;
1362 	/*
1363 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1364 	 * on dst refcount
1365 	 */
1366 	cork->dst = NULL;
1367 	skb_dst_set(skb, &rt->dst);
1368 
1369 	if (iph->protocol == IPPROTO_ICMP)
1370 		icmp_out_count(net, ((struct icmphdr *)
1371 			skb_transport_header(skb))->type);
1372 
1373 	ip_cork_release(cork);
1374 out:
1375 	return skb;
1376 }
1377 
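/*
 * Transmit a datagram built by __ip_make_skb(): pass it to ip_local_out(),
 * translate positive NET_XMIT return codes via net_xmit_errno() and count
 * discarded packets.
 */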
1378 int ip_send_skb(struct sk_buff *skb)
1379 {
1380 	struct net *net = sock_net(skb->sk);
1381 	int err;
1382 
1383 	err = ip_local_out(skb);
1384 	if (err) {
1385 		if (err > 0)
1386 			err = net_xmit_errno(err);
1387 		if (err)
1388 			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1389 	}
1390 
1391 	return err;
1392 }
1393 
1394 int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1395 {
1396 	struct sk_buff *skb;
1397 
1398 	skb = ip_finish_skb(sk, fl4);
1399 	if (!skb)
1400 		return 0;
1401 
1402 	/* Netfilter gets the whole, not yet fragmented skb. */
1403 	return ip_send_skb(skb);
1404 }
1405 
1406 /*
1407  *	Throw away all pending data on the socket.
1408  */
1409 static void __ip_flush_pending_frames(struct sock *sk,
1410 				      struct sk_buff_head *queue,
1411 				      struct inet_cork *cork)
1412 {
1413 	struct sk_buff *skb;
1414 
1415 	while ((skb = __skb_dequeue_tail(queue)) != NULL)
1416 		kfree_skb(skb);
1417 
1418 	ip_cork_release(cork);
1419 }
1420 
1421 void ip_flush_pending_frames(struct sock *sk)
1422 {
1423 	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1424 }
1425 
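/*
 * Single-shot variant of ip_append_data() plus ip_push_pending_frames():
 * build the complete datagram on a private queue and return it, without
 * touching the socket's write queue or cork state.
 */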
1426 struct sk_buff *ip_make_skb(struct sock *sk,
1427 			    struct flowi4 *fl4,
1428 			    int getfrag(void *from, char *to, int offset,
1429 					int len, int odd, struct sk_buff *skb),
1430 			    void *from, int length, int transhdrlen,
1431 			    struct ipcm_cookie *ipc, struct rtable **rtp,
1432 			    unsigned int flags)
1433 {
1434 	struct inet_cork cork;
1435 	struct sk_buff_head queue;
1436 	int err;
1437 
1438 	if (flags & MSG_PROBE)
1439 		return NULL;
1440 
1441 	__skb_queue_head_init(&queue);
1442 
1443 	cork.flags = 0;
1444 	cork.addr = 0;
1445 	cork.opt = NULL;
1446 	err = ip_setup_cork(sk, &cork, ipc, rtp);
1447 	if (err)
1448 		return ERR_PTR(err);
1449 
1450 	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1451 			       from, length, transhdrlen, flags);
1452 	if (err) {
1453 		__ip_flush_pending_frames(sk, &queue, &cork);
1454 		return ERR_PTR(err);
1455 	}
1456 
1457 	return __ip_make_skb(sk, fl4, &queue, &cork);
1458 }
1459 
1460 /*
1461  *	Fetch data from kernel space and fill in checksum if needed.
1462  */
1463 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1464 			      int len, int odd, struct sk_buff *skb)
1465 {
1466 	__wsum csum;
1467 
1468 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1469 	skb->csum = csum_block_add(skb->csum, csum, odd);
1470 	return 0;
1471 }
1472 
1473 /*
1474  *	Generic function to send a packet as reply to another packet.
1475  *	Used to send TCP resets so far. ICMP should use this function too.
1476  *
1477  *	Should run single threaded per socket because it uses the sock
1478  *     	structure to pass arguments.
1479  */
1480 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1481 		   const struct ip_reply_arg *arg, unsigned int len)
1482 {
1483 	struct inet_sock *inet = inet_sk(sk);
1484 	struct ip_options_data replyopts;
1485 	struct ipcm_cookie ipc;
1486 	struct flowi4 fl4;
1487 	struct rtable *rt = skb_rtable(skb);
1488 
1489 	if (ip_options_echo(&replyopts.opt.opt, skb))
1490 		return;
1491 
1492 	ipc.addr = daddr;
1493 	ipc.opt = NULL;
1494 	ipc.tx_flags = 0;
1495 
1496 	if (replyopts.opt.opt.optlen) {
1497 		ipc.opt = &replyopts.opt;
1498 
1499 		if (replyopts.opt.opt.srr)
1500 			daddr = replyopts.opt.opt.faddr;
1501 	}
1502 
1503 	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1504 			   RT_TOS(arg->tos),
1505 			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
1506 			   ip_reply_arg_flowi_flags(arg),
1507 			   daddr, rt->rt_spec_dst,
1508 			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1509 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1510 	rt = ip_route_output_key(sock_net(sk), &fl4);
1511 	if (IS_ERR(rt))
1512 		return;
1513 
1514 	/* And let IP do all the hard work.
1515 
1516 	   This chunk is not reentrant, hence the spinlock.
1517 	   Note that it relies on the fact that this function is called
1518 	   with BH disabled locally and that sk cannot already be spinlocked.
1519 	 */
1520 	bh_lock_sock(sk);
1521 	inet->tos = arg->tos;
1522 	sk->sk_priority = skb->priority;
1523 	sk->sk_protocol = ip_hdr(skb)->protocol;
1524 	sk->sk_bound_dev_if = arg->bound_dev_if;
1525 	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1526 		       &ipc, &rt, MSG_DONTWAIT);
1527 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1528 		if (arg->csumoffset >= 0)
1529 			*((__sum16 *)skb_transport_header(skb) +
1530 			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1531 								arg->csum));
1532 		skb->ip_summed = CHECKSUM_NONE;
1533 		ip_push_pending_frames(sk, &fl4);
1534 	}
1535 
1536 	bh_unlock_sock(sk);
1537 
1538 	ip_rt_put(rt);
1539 }
1540 
1541 void __init ip_init(void)
1542 {
1543 	ip_rt_init();
1544 	inet_initpeers();
1545 
1546 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1547 	igmp_mc_proc_init();
1548 #endif
1549 }
1550