1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 #include <linux/ethtool.h>
143 
144 #include "dev.h"
145 
/*
 * File-local protocol list and its mutex.
 * NOTE(review): presumably proto_list_mutex serialises access to
 * proto_list; neither is used in this part of the file - confirm
 * against the code that walks or modifies the list.
 */
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148 
/* Forward declarations; the definitions are not in this part of the file. */
149 static void sock_def_write_space_wfree(struct sock *sk);
150 static void sock_def_write_space(struct sock *sk);
151 
152 /**
153  * sk_ns_capable - General socket capability test
154  * @sk: Socket to use a capability on or through
155  * @user_ns: The user namespace of the capability to use
156  * @cap: The capability to use
157  *
158  * Test to see if the opener of the socket had when the socket was
159  * created and the current process has the capability @cap in the user
160  * namespace @user_ns.
161  */
sk_ns_capable(const struct sock * sk,struct user_namespace * user_ns,int cap)162 bool sk_ns_capable(const struct sock *sk,
163 		   struct user_namespace *user_ns, int cap)
164 {
165 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
166 		ns_capable(user_ns, cap);
167 }
168 EXPORT_SYMBOL(sk_ns_capable);
169 
170 /**
171  * sk_capable - Socket global capability test
172  * @sk: Socket to use a capability on or through
173  * @cap: The global capability to use
174  *
175  * Test to see if the opener of the socket had when the socket was
176  * created and the current process has the capability @cap in all user
177  * namespaces.
178  */
sk_capable(const struct sock * sk,int cap)179 bool sk_capable(const struct sock *sk, int cap)
180 {
181 	return sk_ns_capable(sk, &init_user_ns, cap);
182 }
183 EXPORT_SYMBOL(sk_capable);
184 
185 /**
186  * sk_net_capable - Network namespace socket capability test
187  * @sk: Socket to use a capability on or through
188  * @cap: The capability to use
189  *
190  * Test to see if the opener of the socket had when the socket was created
191  * and the current process has the capability @cap over the network namespace
192  * the socket is a member of.
193  */
sk_net_capable(const struct sock * sk,int cap)194 bool sk_net_capable(const struct sock *sk, int cap)
195 {
196 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
197 }
198 EXPORT_SYMBOL(sk_net_capable);
199 
200 /*
201  * Each address family might have different locking rules, so we have
202  * one slock key per address family and separate keys for internal and
203  * userspace sockets.
204  */
/*
 * All four arrays have AF_MAX entries.
 * NOTE(review): the "_kern_" arrays are presumably the ones used for the
 * internal (kernel) sockets mentioned above - confirm at the point where
 * the keys are handed to lockdep, which is outside this part of the file.
 */
205 static struct lock_class_key af_family_keys[AF_MAX];
206 static struct lock_class_key af_family_kern_keys[AF_MAX];
207 static struct lock_class_key af_family_slock_keys[AF_MAX];
208 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
209 
210 /*
211  * Make lock validator output more readable. (we pre-construct these
212  * strings build-time, so that runtime initialization of socket
213  * locks is fast):
214  */
215 
/*
 * _sock_locks(x) expands to a comma-separated list of string literals.
 * @x is itself a string literal: it is placed directly in front of every
 * family name so that the compiler concatenates the two adjacent literals
 * (e.g. x "AF_INET").  The list ends with an "AF_MAX" entry, matching the
 * AF_MAX+1 sized tables that are initialised from it.
 */
216 #define _sock_locks(x)						  \
217   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
218   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
219   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
220   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
221   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
222   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
223   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
224   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
225   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
226   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
227   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
228   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
229   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
230   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
231   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
232   x "AF_MCTP"  , \
233   x "AF_MAX"
234 
/*
 * Name tables built from _sock_locks(): one string per entry of that
 * list, each table using a different prefix ("sk_lock-", "slock-",
 * "clock-", their "k-" prefixed variants, "rlock-", "wlock-", "elock-").
 * Every table has AF_MAX+1 slots.
 */
235 static const char *const af_family_key_strings[AF_MAX+1] = {
236 	_sock_locks("sk_lock-")
237 };
238 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
239 	_sock_locks("slock-")
240 };
241 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
242 	_sock_locks("clock-")
243 };
244 
/* "k-" prefixed variants of the three tables above. */
245 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-sk_lock-")
247 };
248 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
249 	_sock_locks("k-slock-")
250 };
251 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
252 	_sock_locks("k-clock-")
253 };
/* These three have no "k-" counterpart in this file section. */
254 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
255 	_sock_locks("rlock-")
256 };
257 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
258 	_sock_locks("wlock-")
259 };
260 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
261 	_sock_locks("elock-")
262 };
263 
264 /*
265  * sk_callback_lock and sk queues locking rules are per-address-family,
266  * so split the lock classes by using a per-AF key:
267  */
/*
 * One key per address family (AF_MAX entries each).  Only the callback
 * key has a separate "kern" array here; the rlock/wlock/elock keys do not.
 */
268 static struct lock_class_key af_callback_keys[AF_MAX];
269 static struct lock_class_key af_rlock_keys[AF_MAX];
270 static struct lock_class_key af_wlock_keys[AF_MAX];
271 static struct lock_class_key af_elock_keys[AF_MAX];
272 static struct lock_class_key af_kern_callback_keys[AF_MAX];
273 
274 /* Run time adjustable parameters. */
/* Maximum buffer sizes; both symbols are exported. */
275 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
276 EXPORT_SYMBOL(sysctl_wmem_max);
277 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
278 EXPORT_SYMBOL(sysctl_rmem_max);
/*
 * Default buffer sizes; they start out equal to the maxima above and are
 * not exported here.
 */
279 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
280 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
281 
282 /* Maximal space eaten by iovec or ancillary data plus some space */
283 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
284 EXPORT_SYMBOL(sysctl_optmem_max);
285 
/* Initialised to 1. NOTE(review): consumers are outside this part of the file. */
286 int sysctl_tstamp_allow_data __read_mostly = 1;
287 
/*
 * Static key, initially false.  sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
288 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
289 EXPORT_SYMBOL_GPL(memalloc_socks_key);
290 
291 /**
292  * sk_set_memalloc - sets %SOCK_MEMALLOC
293  * @sk: socket to set it on
294  *
295  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
296  * It's the responsibility of the admin to adjust min_free_kbytes
297  * to meet the requirements
298  */
sk_set_memalloc(struct sock * sk)299 void sk_set_memalloc(struct sock *sk)
300 {
301 	sock_set_flag(sk, SOCK_MEMALLOC);
302 	sk->sk_allocation |= __GFP_MEMALLOC;
303 	static_branch_inc(&memalloc_socks_key);
304 }
305 EXPORT_SYMBOL_GPL(sk_set_memalloc);
306 
sk_clear_memalloc(struct sock * sk)307 void sk_clear_memalloc(struct sock *sk)
308 {
309 	sock_reset_flag(sk, SOCK_MEMALLOC);
310 	sk->sk_allocation &= ~__GFP_MEMALLOC;
311 	static_branch_dec(&memalloc_socks_key);
312 
313 	/*
314 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
315 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
316 	 * it has rmem allocations due to the last swapfile being deactivated
317 	 * but there is a risk that the socket is unusable due to exceeding
318 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
319 	 */
320 	sk_mem_reclaim(sk);
321 }
322 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
323 
__sk_backlog_rcv(struct sock * sk,struct sk_buff * skb)324 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
325 {
326 	int ret;
327 	unsigned int noreclaim_flag;
328 
329 	/* these should have been dropped before queueing */
330 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
331 
332 	noreclaim_flag = memalloc_noreclaim_save();
333 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
334 				 tcp_v6_do_rcv,
335 				 tcp_v4_do_rcv,
336 				 sk, skb);
337 	memalloc_noreclaim_restore(noreclaim_flag);
338 
339 	return ret;
340 }
341 EXPORT_SYMBOL(__sk_backlog_rcv);
342 
sk_error_report(struct sock * sk)343 void sk_error_report(struct sock *sk)
344 {
345 	sk->sk_error_report(sk);
346 
347 	switch (sk->sk_family) {
348 	case AF_INET:
349 		fallthrough;
350 	case AF_INET6:
351 		trace_inet_sk_error_report(sk);
352 		break;
353 	default:
354 		break;
355 	}
356 }
357 EXPORT_SYMBOL(sk_error_report);
358 
/**
 * sock_get_timeout - convert a timeout in jiffies to a timeval layout
 * @timeo: timeout in jiffies; MAX_SCHEDULE_TIMEOUT is reported as 0s 0us
 * @optval: destination buffer, written directly (not a user copy)
 * @old_timeval: true to produce one of the "old" layouts
 *
 * With @old_timeval set, a struct old_timeval32 is written for compat
 * syscalls without 64-bit time, a struct __kernel_old_timeval otherwise.
 * Without it a struct __kernel_sock_timeval is written.
 *
 * Return: the size of the structure stored in @optval.
 */
int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	struct __kernel_old_timeval old_tv;

	if (timeo != MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	} else {
		/* "No timeout" is reported as an all-zero timeval. */
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	}

	if (!old_timeval) {
		*(struct __kernel_sock_timeval *)optval = tv;
		return sizeof(tv);
	}

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };

		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	old_tv.tv_sec = tv.tv_sec;
	old_tv.tv_usec = tv.tv_usec;
	*(struct __kernel_old_timeval *)optval = old_tv;
	return sizeof(old_tv);
}
388 EXPORT_SYMBOL(sock_get_timeout);
389 
/**
 * sock_copy_user_timeval - fetch a timeval option value
 * @tv: result, always in struct __kernel_sock_timeval form
 * @optval: source of the option value
 * @optlen: length supplied for @optval
 * @old_timeval: true if the caller passed one of the "old" layouts
 *
 * The source layout is chosen exactly as in sock_get_timeout():
 * struct old_timeval32 (compat without 64-bit time),
 * struct __kernel_old_timeval, or struct __kernel_sock_timeval.
 *
 * Return: 0 on success, -EINVAL if @optlen is smaller than the expected
 * structure, -EFAULT if the copy fails.
 */
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
			   sockptr_t optval, int optlen, bool old_timeval)
{
	struct __kernel_old_timeval old_tv;

	if (!old_timeval) {
		/* New layout: copy straight into the result. */
		if (optlen < sizeof(*tv))
			return -EINVAL;
		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
			return -EFAULT;
		return 0;
	}

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv->tv_sec = tv32.tv_sec;
		tv->tv_usec = tv32.tv_usec;
		return 0;
	}

	if (optlen < sizeof(old_tv))
		return -EINVAL;
	if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
		return -EFAULT;
	tv->tv_sec = old_tv.tv_sec;
	tv->tv_usec = old_tv.tv_usec;

	return 0;
}
421 EXPORT_SYMBOL(sock_copy_user_timeval);
422 
/*
 * sock_set_timeout - parse a timeval option and store it as jiffies
 * @timeo_p: where the resulting timeout (in jiffies) is stored
 * @optval: option value, read with sock_copy_user_timeval()
 * @optlen: length supplied for @optval
 * @old_timeval: true if @optval uses one of the "old" timeval layouts
 *
 * A negative tv_sec stores 0 and logs a rate-limited notice (at most ten
 * times).  A zero timeval, or a tv_sec too large to be expressed in
 * jiffies, stores MAX_SCHEDULE_TIMEOUT.
 *
 * The result is computed in a local and published with a single
 * WRITE_ONCE(), so a reader that samples *@timeo_p concurrently never
 * observes an intermediate MAX_SCHEDULE_TIMEOUT that is about to be
 * replaced by the real value.
 *
 * Returns 0 on success, the error from sock_copy_user_timeval(), or
 * -EDOM if tv_usec is outside [0, USEC_PER_SEC).
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
	long val;

	if (err)
		return err;

	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		WRITE_ONCE(*timeo_p, 0);
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}

	/* 0s 0us means "no timeout"; so does anything too large for jiffies. */
	val = MAX_SCHEDULE_TIMEOUT;
	if ((tv.tv_sec || tv.tv_usec) &&
	    tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		val = tv.tv_sec * HZ +
		      DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);

	WRITE_ONCE(*timeo_p, val);
	return 0;
}
453 
sock_needs_netstamp(const struct sock * sk)454 static bool sock_needs_netstamp(const struct sock *sk)
455 {
456 	switch (sk->sk_family) {
457 	case AF_UNSPEC:
458 	case AF_UNIX:
459 		return false;
460 	default:
461 		return true;
462 	}
463 }
464 
sock_disable_timestamp(struct sock * sk,unsigned long flags)465 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
466 {
467 	if (sk->sk_flags & flags) {
468 		sk->sk_flags &= ~flags;
469 		if (sock_needs_netstamp(sk) &&
470 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
471 			net_disable_timestamp();
472 	}
473 }
474 
475 
__sock_queue_rcv_skb(struct sock * sk,struct sk_buff * skb)476 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
477 {
478 	unsigned long flags;
479 	struct sk_buff_head *list = &sk->sk_receive_queue;
480 
481 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
482 		atomic_inc(&sk->sk_drops);
483 		trace_sock_rcvqueue_full(sk, skb);
484 		return -ENOMEM;
485 	}
486 
487 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
488 		atomic_inc(&sk->sk_drops);
489 		return -ENOBUFS;
490 	}
491 
492 	skb->dev = NULL;
493 	skb_set_owner_r(skb, sk);
494 
495 	/* we escape from rcu protected region, make sure we dont leak
496 	 * a norefcounted dst
497 	 */
498 	skb_dst_force(skb);
499 
500 	spin_lock_irqsave(&list->lock, flags);
501 	sock_skb_set_dropcount(sk, skb);
502 	__skb_queue_tail(list, skb);
503 	spin_unlock_irqrestore(&list->lock, flags);
504 
505 	if (!sock_flag(sk, SOCK_DEAD))
506 		sk->sk_data_ready(sk);
507 	return 0;
508 }
509 EXPORT_SYMBOL(__sock_queue_rcv_skb);
510 
sock_queue_rcv_skb_reason(struct sock * sk,struct sk_buff * skb,enum skb_drop_reason * reason)511 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
512 			      enum skb_drop_reason *reason)
513 {
514 	enum skb_drop_reason drop_reason;
515 	int err;
516 
517 	err = sk_filter(sk, skb);
518 	if (err) {
519 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
520 		goto out;
521 	}
522 	err = __sock_queue_rcv_skb(sk, skb);
523 	switch (err) {
524 	case -ENOMEM:
525 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
526 		break;
527 	case -ENOBUFS:
528 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
529 		break;
530 	default:
531 		drop_reason = SKB_NOT_DROPPED_YET;
532 		break;
533 	}
534 out:
535 	if (reason)
536 		*reason = drop_reason;
537 	return err;
538 }
539 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
540 
/**
 * __sk_receive_skb - deliver a buffer to a socket or to its backlog
 * @sk: receiving socket
 * @skb: buffer to deliver
 * @nested: non-zero to take the socket spinlock with bh_lock_sock_nested()
 * @trim_cap: passed through to sk_filter_trim_cap()
 * @refcounted: if true, sock_put(@sk) is called before returning
 *
 * If the socket is not owned by a user context the buffer is processed
 * right away with sk_backlog_rcv(); otherwise it is added to the backlog.
 * The buffer is freed with kfree_skb() when the filter rejects it, when
 * the receive queues are full, or when it cannot be added to the backlog;
 * the last two cases also increment sk->sk_drops.
 *
 * Return: the result of sk_backlog_rcv() when it was called, otherwise
 * %NET_RX_SUCCESS (including the cases where the buffer was discarded).
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	/*
	 * The socket lock is not held here, so read sk_rcvbuf with
	 * READ_ONCE(), as the backlog path below already does.
	 */
	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
583 EXPORT_SYMBOL(__sk_receive_skb);
584 
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							  u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));

/**
 * __sk_dst_check - validate the socket's cached route
 * @sk: socket whose cached dst is examined (fetched with __sk_dst_get())
 * @cookie: passed to the dst's ->check() operation
 *
 * If the cached dst is marked obsolete and its ->check() operation
 * returns NULL, the cache is dropped: the tx queue mapping is cleared,
 * sk_dst_pending_confirm is reset, sk_dst_cache is set to NULL and the
 * dst is released.
 *
 * Return: the cached dst (possibly NULL), or NULL if it was just dropped.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	/* Nothing cached, or not obsolete: hand it back unchanged. */
	if (!dst || !dst->obsolete)
		return dst;

	if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) != NULL)
		return dst;

	sk_tx_queue_clear(sk);
	sk->sk_dst_pending_confirm = 0;
	RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
	dst_release(dst);

	return NULL;
}
605 EXPORT_SYMBOL(__sk_dst_check);
606 
/**
 * sk_dst_check - validate the socket's cached route
 * @sk: socket whose cached dst is examined (fetched with sk_dst_get())
 * @cookie: passed to the dst's ->check() operation
 *
 * If the cached dst is marked obsolete and its ->check() operation
 * returns NULL, the socket's dst cache is reset with sk_dst_reset() and
 * the dst obtained here is released.
 *
 * Return: the dst obtained from sk_dst_get() (possibly NULL), or NULL if
 * it failed the check.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	/* Nothing cached, or not obsolete: hand it back unchanged. */
	if (!dst || !dst->obsolete)
		return dst;

	if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) != NULL)
		return dst;

	sk_dst_reset(sk);
	dst_release(dst);

	return NULL;
}
621 EXPORT_SYMBOL(sk_dst_check);
622 
sock_bindtoindex_locked(struct sock * sk,int ifindex)623 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
624 {
625 	int ret = -ENOPROTOOPT;
626 #ifdef CONFIG_NETDEVICES
627 	struct net *net = sock_net(sk);
628 
629 	/* Sorry... */
630 	ret = -EPERM;
631 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
632 		goto out;
633 
634 	ret = -EINVAL;
635 	if (ifindex < 0)
636 		goto out;
637 
638 	/* Paired with all READ_ONCE() done locklessly. */
639 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
640 
641 	if (sk->sk_prot->rehash)
642 		sk->sk_prot->rehash(sk);
643 	sk_dst_reset(sk);
644 
645 	ret = 0;
646 
647 out:
648 #endif
649 
650 	return ret;
651 }
652 
/**
 * sock_bindtoindex - bind a socket to an interface index
 * @sk: socket to bind
 * @ifindex: interface index
 * @lock_sk: true to take and release the socket lock around the update
 *
 * Return: the result of sock_bindtoindex_locked().
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int rc;

	if (!lock_sk)
		return sock_bindtoindex_locked(sk, ifindex);

	lock_sock(sk);
	rc = sock_bindtoindex_locked(sk, ifindex);
	release_sock(sk);

	return rc;
}
665 EXPORT_SYMBOL(sock_bindtoindex);
666 
/*
 * sock_setbindtodevice - bind a socket to a device given by name
 * @sk: socket to bind
 * @optval: interface name
 * @optlen: length of @optval; anything above IFNAMSIZ - 1 is truncated
 *
 * Bind this socket to a particular device like "eth0", as specified in
 * the passed interface name.  If the name is "" or the option length is
 * zero the socket is not bound (index 0 is used).
 *
 * Returns the result of sock_bindtoindex() on success, -EINVAL for a
 * negative @optlen, -EFAULT if the name cannot be copied, -ENODEV if no
 * device has that name, or -ENOPROTOOPT without CONFIG_NETDEVICES.
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index = 0;

	if (optlen < 0)
		return -EINVAL;

	/* Leave room for the terminating NUL supplied by the memset. */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	if (copy_from_sockptr(devname, optval, optlen))
		return -EFAULT;

	if (devname[0] != '\0') {
		struct net_device *dev;

		/* Only the ifindex is kept beyond the RCU section. */
		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();

		if (!dev)
			return -ENODEV;
	}

	return sock_bindtoindex(sk, index, true);
#else
	return -ENOPROTOOPT;
#endif
}
712 
/*
 * sock_getbindtodevice - report the name of the device a socket is bound to
 * @sk: socket to query
 * @optval: user buffer receiving the NUL-terminated name
 * @optlen: user location receiving the number of bytes written
 * @len: size of the user buffer
 *
 * An unbound socket (sk_bound_dev_if == 0) writes nothing to @optval and
 * stores a length of 0.
 *
 * Returns 0 on success, -EINVAL if a bound socket is queried with
 * @len < IFNAMSIZ, the error from netdev_get_name(), -EFAULT if a user
 * copy fails, or -ENOPROTOOPT without CONFIG_NETDEVICES.
 */
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
#ifdef CONFIG_NETDEVICES
	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int err;

	if (bound_dev_if == 0) {
		len = 0;
	} else {
		if (len < IFNAMSIZ)
			return -EINVAL;

		err = netdev_get_name(net, devname, bound_dev_if);
		if (err)
			return err;

		/* Include the terminating NUL in the reported length. */
		len = strlen(devname) + 1;

		if (copy_to_user(optval, devname, len))
			return -EFAULT;
	}

	if (put_user(len, optlen))
		return -EFAULT;

	return 0;
#else
	return -ENOPROTOOPT;
#endif
}
753 
sk_mc_loop(struct sock * sk)754 bool sk_mc_loop(struct sock *sk)
755 {
756 	if (dev_recursion_level())
757 		return false;
758 	if (!sk)
759 		return true;
760 	switch (sk->sk_family) {
761 	case AF_INET:
762 		return inet_sk(sk)->mc_loop;
763 #if IS_ENABLED(CONFIG_IPV6)
764 	case AF_INET6:
765 		return inet6_sk(sk)->mc_loop;
766 #endif
767 	}
768 	WARN_ON_ONCE(1);
769 	return true;
770 }
771 EXPORT_SYMBOL(sk_mc_loop);
772 
sock_set_reuseaddr(struct sock * sk)773 void sock_set_reuseaddr(struct sock *sk)
774 {
775 	lock_sock(sk);
776 	sk->sk_reuse = SK_CAN_REUSE;
777 	release_sock(sk);
778 }
779 EXPORT_SYMBOL(sock_set_reuseaddr);
780 
sock_set_reuseport(struct sock * sk)781 void sock_set_reuseport(struct sock *sk)
782 {
783 	lock_sock(sk);
784 	sk->sk_reuseport = true;
785 	release_sock(sk);
786 }
787 EXPORT_SYMBOL(sock_set_reuseport);
788 
sock_no_linger(struct sock * sk)789 void sock_no_linger(struct sock *sk)
790 {
791 	lock_sock(sk);
792 	sk->sk_lingertime = 0;
793 	sock_set_flag(sk, SOCK_LINGER);
794 	release_sock(sk);
795 }
796 EXPORT_SYMBOL(sock_no_linger);
797 
/**
 * sock_set_priority - set sk_priority on a socket
 * @sk: socket to update
 * @priority: value stored in sk->sk_priority
 *
 * The update is done with the socket lock held.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
	lock_sock(sk);

	sk->sk_priority = priority;

	release_sock(sk);
}
804 EXPORT_SYMBOL(sock_set_priority);
805 
/**
 * sock_set_sndtimeo - set the send timeout of a socket
 * @sk: socket to update
 * @secs: timeout in seconds
 *
 * A @secs of zero, or a value that is not below
 * MAX_SCHEDULE_TIMEOUT / HZ - 1, selects MAX_SCHEDULE_TIMEOUT; any other
 * value is stored as @secs * HZ.  The store is done with the socket lock
 * held.
 */
void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	const bool finite = secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1;

	lock_sock(sk);
	if (finite)
		sk->sk_sndtimeo = secs * HZ;
	else
		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
	release_sock(sk);
}
815 EXPORT_SYMBOL(sock_set_sndtimeo);
816 
/*
 * __sock_set_timestamps - switch receive timestamps on or off
 * @sk: socket to update
 * @val: true to enable, false to disable
 * @new: value for SOCK_TSTAMP_NEW (only applied when enabling)
 * @ns: value for SOCK_RCVTSTAMPNS (only applied when enabling)
 *
 * Disabling resets SOCK_RCVTSTAMP and SOCK_RCVTSTAMPNS and leaves
 * SOCK_TSTAMP_NEW untouched.  Enabling also sets SOCK_RCVTSTAMP and calls
 * sock_enable_timestamp() for SOCK_TIMESTAMP.
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (!val) {
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		return;
	}

	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
	sock_set_flag(sk, SOCK_RCVTSTAMP);
	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
}
829 
sock_enable_timestamps(struct sock * sk)830 void sock_enable_timestamps(struct sock *sk)
831 {
832 	lock_sock(sk);
833 	__sock_set_timestamps(sk, true, false, true);
834 	release_sock(sk);
835 }
836 EXPORT_SYMBOL(sock_enable_timestamps);
837 
/**
 * sock_set_timestamp - apply one of the SO_TIMESTAMP* options
 * @sk: socket to update
 * @optname: SO_TIMESTAMP_OLD, SO_TIMESTAMP_NEW, SO_TIMESTAMPNS_OLD or
 *           SO_TIMESTAMPNS_NEW; any other value is ignored
 * @valbool: true to enable, false to disable
 *
 * The *_NEW options select SOCK_TSTAMP_NEW and the TIMESTAMPNS options
 * select SOCK_RCVTSTAMPNS.  The socket lock is not taken here.
 */
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	bool new, ns;

	switch (optname) {
	case SO_TIMESTAMP_OLD:
		new = false;
		ns = false;
		break;
	case SO_TIMESTAMP_NEW:
		new = true;
		ns = false;
		break;
	case SO_TIMESTAMPNS_OLD:
		new = false;
		ns = true;
		break;
	case SO_TIMESTAMPNS_NEW:
		new = true;
		ns = true;
		break;
	default:
		return;
	}

	__sock_set_timestamps(sk, valbool, new, ns);
}
855 
sock_timestamping_bind_phc(struct sock * sk,int phc_index)856 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
857 {
858 	struct net *net = sock_net(sk);
859 	struct net_device *dev = NULL;
860 	bool match = false;
861 	int *vclock_index;
862 	int i, num;
863 
864 	if (sk->sk_bound_dev_if)
865 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
866 
867 	if (!dev) {
868 		pr_err("%s: sock not bind to device\n", __func__);
869 		return -EOPNOTSUPP;
870 	}
871 
872 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
873 	dev_put(dev);
874 
875 	for (i = 0; i < num; i++) {
876 		if (*(vclock_index + i) == phc_index) {
877 			match = true;
878 			break;
879 		}
880 	}
881 
882 	if (num > 0)
883 		kfree(vclock_index);
884 
885 	if (!match)
886 		return -EINVAL;
887 
888 	sk->sk_bind_phc = phc_index;
889 
890 	return 0;
891 }
892 
sock_set_timestamping(struct sock * sk,int optname,struct so_timestamping timestamping)893 int sock_set_timestamping(struct sock *sk, int optname,
894 			  struct so_timestamping timestamping)
895 {
896 	int val = timestamping.flags;
897 	int ret;
898 
899 	if (val & ~SOF_TIMESTAMPING_MASK)
900 		return -EINVAL;
901 
902 	if (val & SOF_TIMESTAMPING_OPT_ID &&
903 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
904 		if (sk_is_tcp(sk)) {
905 			if ((1 << sk->sk_state) &
906 			    (TCPF_CLOSE | TCPF_LISTEN))
907 				return -EINVAL;
908 			atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
909 		} else {
910 			atomic_set(&sk->sk_tskey, 0);
911 		}
912 	}
913 
914 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
915 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
916 		return -EINVAL;
917 
918 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
919 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
920 		if (ret)
921 			return ret;
922 	}
923 
924 	sk->sk_tsflags = val;
925 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
926 
927 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
928 		sock_enable_timestamp(sk,
929 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
930 	else
931 		sock_disable_timestamp(sk,
932 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
933 	return 0;
934 }
935 
sock_set_keepalive(struct sock * sk)936 void sock_set_keepalive(struct sock *sk)
937 {
938 	lock_sock(sk);
939 	if (sk->sk_prot->keepalive)
940 		sk->sk_prot->keepalive(sk, true);
941 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
942 	release_sock(sk);
943 }
944 EXPORT_SYMBOL(sock_set_keepalive);
945 
__sock_set_rcvbuf(struct sock * sk,int val)946 static void __sock_set_rcvbuf(struct sock *sk, int val)
947 {
948 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
949 	 * as a negative value.
950 	 */
951 	val = min_t(int, val, INT_MAX / 2);
952 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
953 
954 	/* We double it on the way in to account for "struct sk_buff" etc.
955 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
956 	 * will allow that much actual data to be received on that socket.
957 	 *
958 	 * Applications are unaware that "struct sk_buff" and other overheads
959 	 * allocate from the receive buffer during socket buffer allocation.
960 	 *
961 	 * And after considering the possible alternatives, returning the value
962 	 * we actually used in getsockopt is the most desirable behavior.
963 	 */
964 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
965 }
966 
/**
 * sock_set_rcvbuf - set the receive buffer size of a socket
 * @sk: socket to update
 * @val: requested size, handled as in __sock_set_rcvbuf()
 *
 * The update is done with the socket lock held.
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
973 EXPORT_SYMBOL(sock_set_rcvbuf);
974 
/*
 * __sock_set_mark - change sk_mark
 * @sk: socket to update
 * @val: new mark
 *
 * When the mark actually changes, the socket's dst cache is reset as
 * well; setting the current value again does nothing.
 */
static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val == sk->sk_mark)
		return;

	sk->sk_mark = val;
	sk_dst_reset(sk);
}
982 
/**
 * sock_set_mark - change sk_mark
 * @sk: socket to update
 * @val: new mark, handled as in __sock_set_mark()
 *
 * The update is done with the socket lock held.
 */
void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);

	__sock_set_mark(sk, val);

	release_sock(sk);
}
989 EXPORT_SYMBOL(sock_set_mark);
990 
sock_release_reserved_memory(struct sock * sk,int bytes)991 static void sock_release_reserved_memory(struct sock *sk, int bytes)
992 {
993 	/* Round down bytes to multiple of pages */
994 	bytes &= ~(SK_MEM_QUANTUM - 1);
995 
996 	WARN_ON(bytes > sk->sk_reserved_mem);
997 	sk->sk_reserved_mem -= bytes;
998 	sk_mem_reclaim(sk);
999 }
1000 
sock_reserve_memory(struct sock * sk,int bytes)1001 static int sock_reserve_memory(struct sock *sk, int bytes)
1002 {
1003 	long allocated;
1004 	bool charged;
1005 	int pages;
1006 
1007 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1008 		return -EOPNOTSUPP;
1009 
1010 	if (!bytes)
1011 		return 0;
1012 
1013 	pages = sk_mem_pages(bytes);
1014 
1015 	/* pre-charge to memcg */
1016 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1017 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1018 	if (!charged)
1019 		return -ENOMEM;
1020 
1021 	/* pre-charge to forward_alloc */
1022 	allocated = sk_memory_allocated_add(sk, pages);
1023 	/* If the system goes into memory pressure with this
1024 	 * precharge, give up and return error.
1025 	 */
1026 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1027 		sk_memory_allocated_sub(sk, pages);
1028 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1029 		return -ENOMEM;
1030 	}
1031 	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1032 
1033 	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1034 
1035 	return 0;
1036 }
1037 
1038 /*
1039  *	This is meant for all protocols to use and covers goings on
1040  *	at the socket level. Everything here is generic.
1041  */
1042 
sock_setsockopt(struct socket * sock,int level,int optname,sockptr_t optval,unsigned int optlen)1043 int sock_setsockopt(struct socket *sock, int level, int optname,
1044 		    sockptr_t optval, unsigned int optlen)
1045 {
1046 	struct so_timestamping timestamping;
1047 	struct sock_txtime sk_txtime;
1048 	struct sock *sk = sock->sk;
1049 	int val;
1050 	int valbool;
1051 	struct linger ling;
1052 	int ret = 0;
1053 
1054 	/*
1055 	 *	Options without arguments
1056 	 */
1057 
1058 	if (optname == SO_BINDTODEVICE)
1059 		return sock_setbindtodevice(sk, optval, optlen);
1060 
1061 	if (optlen < sizeof(int))
1062 		return -EINVAL;
1063 
1064 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1065 		return -EFAULT;
1066 
1067 	valbool = val ? 1 : 0;
1068 
1069 	lock_sock(sk);
1070 
1071 	switch (optname) {
1072 	case SO_DEBUG:
1073 		if (val && !capable(CAP_NET_ADMIN))
1074 			ret = -EACCES;
1075 		else
1076 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1077 		break;
1078 	case SO_REUSEADDR:
1079 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1080 		break;
1081 	case SO_REUSEPORT:
1082 		sk->sk_reuseport = valbool;
1083 		break;
1084 	case SO_TYPE:
1085 	case SO_PROTOCOL:
1086 	case SO_DOMAIN:
1087 	case SO_ERROR:
1088 		ret = -ENOPROTOOPT;
1089 		break;
1090 	case SO_DONTROUTE:
1091 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1092 		sk_dst_reset(sk);
1093 		break;
1094 	case SO_BROADCAST:
1095 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1096 		break;
1097 	case SO_SNDBUF:
1098 		/* Don't error on this BSD doesn't and if you think
1099 		 * about it this is right. Otherwise apps have to
1100 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1101 		 * are treated in BSD as hints
1102 		 */
1103 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1104 set_sndbuf:
1105 		/* Ensure val * 2 fits into an int, to prevent max_t()
1106 		 * from treating it as a negative value.
1107 		 */
1108 		val = min_t(int, val, INT_MAX / 2);
1109 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1110 		WRITE_ONCE(sk->sk_sndbuf,
1111 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1112 		/* Wake up sending tasks if we upped the value. */
1113 		sk->sk_write_space(sk);
1114 		break;
1115 
1116 	case SO_SNDBUFFORCE:
1117 		if (!capable(CAP_NET_ADMIN)) {
1118 			ret = -EPERM;
1119 			break;
1120 		}
1121 
1122 		/* No negative values (to prevent underflow, as val will be
1123 		 * multiplied by 2).
1124 		 */
1125 		if (val < 0)
1126 			val = 0;
1127 		goto set_sndbuf;
1128 
1129 	case SO_RCVBUF:
1130 		/* Don't error on this BSD doesn't and if you think
1131 		 * about it this is right. Otherwise apps have to
1132 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1133 		 * are treated in BSD as hints
1134 		 */
1135 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1136 		break;
1137 
1138 	case SO_RCVBUFFORCE:
1139 		if (!capable(CAP_NET_ADMIN)) {
1140 			ret = -EPERM;
1141 			break;
1142 		}
1143 
1144 		/* No negative values (to prevent underflow, as val will be
1145 		 * multiplied by 2).
1146 		 */
1147 		__sock_set_rcvbuf(sk, max(val, 0));
1148 		break;
1149 
1150 	case SO_KEEPALIVE:
1151 		if (sk->sk_prot->keepalive)
1152 			sk->sk_prot->keepalive(sk, valbool);
1153 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1154 		break;
1155 
1156 	case SO_OOBINLINE:
1157 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1158 		break;
1159 
1160 	case SO_NO_CHECK:
1161 		sk->sk_no_check_tx = valbool;
1162 		break;
1163 
1164 	case SO_PRIORITY:
1165 		if ((val >= 0 && val <= 6) ||
1166 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1167 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1168 			sk->sk_priority = val;
1169 		else
1170 			ret = -EPERM;
1171 		break;
1172 
1173 	case SO_LINGER:
1174 		if (optlen < sizeof(ling)) {
1175 			ret = -EINVAL;	/* 1003.1g */
1176 			break;
1177 		}
1178 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1179 			ret = -EFAULT;
1180 			break;
1181 		}
1182 		if (!ling.l_onoff)
1183 			sock_reset_flag(sk, SOCK_LINGER);
1184 		else {
1185 #if (BITS_PER_LONG == 32)
1186 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1187 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1188 			else
1189 #endif
1190 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1191 			sock_set_flag(sk, SOCK_LINGER);
1192 		}
1193 		break;
1194 
1195 	case SO_BSDCOMPAT:
1196 		break;
1197 
1198 	case SO_PASSCRED:
1199 		if (valbool)
1200 			set_bit(SOCK_PASSCRED, &sock->flags);
1201 		else
1202 			clear_bit(SOCK_PASSCRED, &sock->flags);
1203 		break;
1204 
1205 	case SO_TIMESTAMP_OLD:
1206 	case SO_TIMESTAMP_NEW:
1207 	case SO_TIMESTAMPNS_OLD:
1208 	case SO_TIMESTAMPNS_NEW:
1209 		sock_set_timestamp(sk, optname, valbool);
1210 		break;
1211 
1212 	case SO_TIMESTAMPING_NEW:
1213 	case SO_TIMESTAMPING_OLD:
1214 		if (optlen == sizeof(timestamping)) {
1215 			if (copy_from_sockptr(&timestamping, optval,
1216 					      sizeof(timestamping))) {
1217 				ret = -EFAULT;
1218 				break;
1219 			}
1220 		} else {
1221 			memset(&timestamping, 0, sizeof(timestamping));
1222 			timestamping.flags = val;
1223 		}
1224 		ret = sock_set_timestamping(sk, optname, timestamping);
1225 		break;
1226 
1227 	case SO_RCVLOWAT:
1228 		if (val < 0)
1229 			val = INT_MAX;
1230 		if (sock->ops->set_rcvlowat)
1231 			ret = sock->ops->set_rcvlowat(sk, val);
1232 		else
1233 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1234 		break;
1235 
1236 	case SO_RCVTIMEO_OLD:
1237 	case SO_RCVTIMEO_NEW:
1238 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1239 				       optlen, optname == SO_RCVTIMEO_OLD);
1240 		break;
1241 
1242 	case SO_SNDTIMEO_OLD:
1243 	case SO_SNDTIMEO_NEW:
1244 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1245 				       optlen, optname == SO_SNDTIMEO_OLD);
1246 		break;
1247 
1248 	case SO_ATTACH_FILTER: {
1249 		struct sock_fprog fprog;
1250 
1251 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1252 		if (!ret)
1253 			ret = sk_attach_filter(&fprog, sk);
1254 		break;
1255 	}
1256 	case SO_ATTACH_BPF:
1257 		ret = -EINVAL;
1258 		if (optlen == sizeof(u32)) {
1259 			u32 ufd;
1260 
1261 			ret = -EFAULT;
1262 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1263 				break;
1264 
1265 			ret = sk_attach_bpf(ufd, sk);
1266 		}
1267 		break;
1268 
1269 	case SO_ATTACH_REUSEPORT_CBPF: {
1270 		struct sock_fprog fprog;
1271 
1272 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1273 		if (!ret)
1274 			ret = sk_reuseport_attach_filter(&fprog, sk);
1275 		break;
1276 	}
1277 	case SO_ATTACH_REUSEPORT_EBPF:
1278 		ret = -EINVAL;
1279 		if (optlen == sizeof(u32)) {
1280 			u32 ufd;
1281 
1282 			ret = -EFAULT;
1283 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1284 				break;
1285 
1286 			ret = sk_reuseport_attach_bpf(ufd, sk);
1287 		}
1288 		break;
1289 
1290 	case SO_DETACH_REUSEPORT_BPF:
1291 		ret = reuseport_detach_prog(sk);
1292 		break;
1293 
1294 	case SO_DETACH_FILTER:
1295 		ret = sk_detach_filter(sk);
1296 		break;
1297 
1298 	case SO_LOCK_FILTER:
1299 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1300 			ret = -EPERM;
1301 		else
1302 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1303 		break;
1304 
1305 	case SO_PASSSEC:
1306 		if (valbool)
1307 			set_bit(SOCK_PASSSEC, &sock->flags);
1308 		else
1309 			clear_bit(SOCK_PASSSEC, &sock->flags);
1310 		break;
1311 	case SO_MARK:
1312 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1313 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1314 			ret = -EPERM;
1315 			break;
1316 		}
1317 
1318 		__sock_set_mark(sk, val);
1319 		break;
1320 	case SO_RCVMARK:
1321 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1322 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1323 			ret = -EPERM;
1324 			break;
1325 		}
1326 
1327 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1328 		break;
1329 
1330 	case SO_RXQ_OVFL:
1331 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1332 		break;
1333 
1334 	case SO_WIFI_STATUS:
1335 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1336 		break;
1337 
1338 	case SO_PEEK_OFF:
1339 		if (sock->ops->set_peek_off)
1340 			ret = sock->ops->set_peek_off(sk, val);
1341 		else
1342 			ret = -EOPNOTSUPP;
1343 		break;
1344 
1345 	case SO_NOFCS:
1346 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1347 		break;
1348 
1349 	case SO_SELECT_ERR_QUEUE:
1350 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1351 		break;
1352 
1353 #ifdef CONFIG_NET_RX_BUSY_POLL
1354 	case SO_BUSY_POLL:
1355 		/* allow unprivileged users to decrease the value */
1356 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1357 			ret = -EPERM;
1358 		else {
1359 			if (val < 0)
1360 				ret = -EINVAL;
1361 			else
1362 				WRITE_ONCE(sk->sk_ll_usec, val);
1363 		}
1364 		break;
1365 	case SO_PREFER_BUSY_POLL:
1366 		if (valbool && !capable(CAP_NET_ADMIN))
1367 			ret = -EPERM;
1368 		else
1369 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1370 		break;
1371 	case SO_BUSY_POLL_BUDGET:
1372 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1373 			ret = -EPERM;
1374 		} else {
1375 			if (val < 0 || val > U16_MAX)
1376 				ret = -EINVAL;
1377 			else
1378 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1379 		}
1380 		break;
1381 #endif
1382 
1383 	case SO_MAX_PACING_RATE:
1384 		{
1385 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1386 
1387 		if (sizeof(ulval) != sizeof(val) &&
1388 		    optlen >= sizeof(ulval) &&
1389 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1390 			ret = -EFAULT;
1391 			break;
1392 		}
1393 		if (ulval != ~0UL)
1394 			cmpxchg(&sk->sk_pacing_status,
1395 				SK_PACING_NONE,
1396 				SK_PACING_NEEDED);
1397 		sk->sk_max_pacing_rate = ulval;
1398 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1399 		break;
1400 		}
1401 	case SO_INCOMING_CPU:
1402 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1403 		break;
1404 
1405 	case SO_CNX_ADVICE:
1406 		if (val == 1)
1407 			dst_negative_advice(sk);
1408 		break;
1409 
1410 	case SO_ZEROCOPY:
1411 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1412 			if (!(sk_is_tcp(sk) ||
1413 			      (sk->sk_type == SOCK_DGRAM &&
1414 			       sk->sk_protocol == IPPROTO_UDP)))
1415 				ret = -EOPNOTSUPP;
1416 		} else if (sk->sk_family != PF_RDS) {
1417 			ret = -EOPNOTSUPP;
1418 		}
1419 		if (!ret) {
1420 			if (val < 0 || val > 1)
1421 				ret = -EINVAL;
1422 			else
1423 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1424 		}
1425 		break;
1426 
1427 	case SO_TXTIME:
1428 		if (optlen != sizeof(struct sock_txtime)) {
1429 			ret = -EINVAL;
1430 			break;
1431 		} else if (copy_from_sockptr(&sk_txtime, optval,
1432 			   sizeof(struct sock_txtime))) {
1433 			ret = -EFAULT;
1434 			break;
1435 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1436 			ret = -EINVAL;
1437 			break;
1438 		}
1439 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1440 		 * scheduler has enough safe guards.
1441 		 */
1442 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1443 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1444 			ret = -EPERM;
1445 			break;
1446 		}
1447 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1448 		sk->sk_clockid = sk_txtime.clockid;
1449 		sk->sk_txtime_deadline_mode =
1450 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1451 		sk->sk_txtime_report_errors =
1452 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1453 		break;
1454 
1455 	case SO_BINDTOIFINDEX:
1456 		ret = sock_bindtoindex_locked(sk, val);
1457 		break;
1458 
1459 	case SO_BUF_LOCK:
1460 		if (val & ~SOCK_BUF_LOCK_MASK) {
1461 			ret = -EINVAL;
1462 			break;
1463 		}
1464 		sk->sk_userlocks = val | (sk->sk_userlocks &
1465 					  ~SOCK_BUF_LOCK_MASK);
1466 		break;
1467 
1468 	case SO_RESERVE_MEM:
1469 	{
1470 		int delta;
1471 
1472 		if (val < 0) {
1473 			ret = -EINVAL;
1474 			break;
1475 		}
1476 
1477 		delta = val - sk->sk_reserved_mem;
1478 		if (delta < 0)
1479 			sock_release_reserved_memory(sk, -delta);
1480 		else
1481 			ret = sock_reserve_memory(sk, delta);
1482 		break;
1483 	}
1484 
1485 	case SO_TXREHASH:
1486 		if (val < -1 || val > 1) {
1487 			ret = -EINVAL;
1488 			break;
1489 		}
1490 		/* Paired with READ_ONCE() in tcp_rtx_synack() */
1491 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1492 		break;
1493 
1494 	default:
1495 		ret = -ENOPROTOOPT;
1496 		break;
1497 	}
1498 	release_sock(sk);
1499 	return ret;
1500 }
1501 EXPORT_SYMBOL(sock_setsockopt);
1502 
sk_get_peer_cred(struct sock * sk)1503 static const struct cred *sk_get_peer_cred(struct sock *sk)
1504 {
1505 	const struct cred *cred;
1506 
1507 	spin_lock(&sk->sk_peer_lock);
1508 	cred = get_cred(sk->sk_peer_cred);
1509 	spin_unlock(&sk->sk_peer_lock);
1510 
1511 	return cred;
1512 }
1513 
cred_to_ucred(struct pid * pid,const struct cred * cred,struct ucred * ucred)1514 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1515 			  struct ucred *ucred)
1516 {
1517 	ucred->pid = pid_vnr(pid);
1518 	ucred->uid = ucred->gid = -1;
1519 	if (cred) {
1520 		struct user_namespace *current_ns = current_user_ns();
1521 
1522 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1523 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1524 	}
1525 }
1526 
/* Copy every group id in @src to the user array @dst, mapping each one into
 * the current user namespace first.  Returns 0, or -EFAULT as soon as one
 * put_user() fails (earlier entries have already been written by then).
 */
static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *ns = current_user_ns();
	int idx;

	for (idx = 0; idx < src->ngroups; idx++) {
		gid_t __user *slot = dst + idx;

		if (put_user(from_kgid_munged(ns, src->gid[idx]), slot))
			return -EFAULT;
	}

	return 0;
}
1538 
/**
 *	sock_getsockopt - read a generic (socket level) option
 *	@sock: socket to query
 *	@level: option level; not consulted by this function
 *	@optname: SO_* option to read
 *	@optval: user buffer receiving the option value
 *	@optlen: user pointer holding the buffer size on entry and updated
 *		 with the number of bytes written on success
 *
 *	Most options are staged in the zero-initialised union @v and copied
 *	out at the end, truncated to the smaller of the user length and the
 *	option's natural size (@lv).  Options with their own output format
 *	copy to user space themselves and jump to the length update, or
 *	return directly.
 *
 *	Return: 0 on success, otherwise a negative errno value.
 */
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct __kernel_sock_timeval stm;
		struct sock_txtime txtime;
		struct so_timestamping timestamping;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	/* Zeroed so that bytes not set by a case are never leaked */
	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		/* Fall back to (and clear) the soft error when sock_error()
		 * reports nothing.
		 */
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		/* Always reads back as 0 (v is zeroed above) */
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		lv = sizeof(v.timestamping);
		/* sock_setsockopt() accepts both variants, so both must be
		 * readable.  The _OLD behaviour is unchanged; for _NEW the
		 * flags are only reported when SOCK_TSTAMP_NEW is set,
		 * mirroring the strictness of the other *_NEW timestamp
		 * options above.  Otherwise a zeroed struct is returned.
		 */
		if (optname == SO_TIMESTAMPING_OLD ||
		    sock_flag(sk, SOCK_TSTAMP_NEW)) {
			v.timestamping.flags = sk->sk_tsflags;
			v.timestamping.bind_phc = sk->sk_bind_phc;
		}
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);

		/* sk_peer_pid/sk_peer_cred are read under sk_peer_lock */
		spin_lock(&sk->sk_peer_lock);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		spin_unlock(&sk->sk_peer_lock);

		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		const struct cred *cred;
		int ret, n;

		cred = sk_get_peer_cred(sk);
		if (!cred)
			return -ENODATA;

		n = cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			/* Buffer too small: report the size that is needed */
			len = n * sizeof(gid_t);
			put_cred(cred);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
		put_cred(cred);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RCVMARK:
		v.val = sock_flag(sk, SOCK_RCVMARK);
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
#endif

	case SO_MAX_PACING_RATE:
		/* Full-width value only where long is wider than int and the
		 * caller's buffer can hold it.
		 */
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = sk->sk_max_pacing_rate;
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		}
		break;

	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = READ_ONCE(sk->sk_bound_dev_if);
		break;

	case SO_NETNS_COOKIE:
		lv = sizeof(u64);
		if (len != lv)
			return -EINVAL;
		v.val64 = sock_net(sk)->net_cookie;
		break;

	case SO_BUF_LOCK:
		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
		break;

	case SO_RESERVE_MEM:
		v.val = sk->sk_reserved_mem;
		break;

	case SO_TXREHASH:
		v.val = sk->sk_txrehash;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	/* Never copy out more than the option's natural size */
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
1905 
1906 /*
1907  * Initialize an sk_lock.
1908  *
1909  * (We also register the sk_lock with the lock validator.)
1910  */
sock_lock_init(struct sock * sk)1911 static inline void sock_lock_init(struct sock *sk)
1912 {
1913 	if (sk->sk_kern_sock)
1914 		sock_lock_init_class_and_name(
1915 			sk,
1916 			af_family_kern_slock_key_strings[sk->sk_family],
1917 			af_family_kern_slock_keys + sk->sk_family,
1918 			af_family_kern_key_strings[sk->sk_family],
1919 			af_family_kern_keys + sk->sk_family);
1920 	else
1921 		sock_lock_init_class_and_name(
1922 			sk,
1923 			af_family_slock_key_strings[sk->sk_family],
1924 			af_family_slock_keys + sk->sk_family,
1925 			af_family_key_strings[sk->sk_family],
1926 			af_family_keys + sk->sk_family);
1927 }
1928 
1929 /*
1930  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1931  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1932  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1933  */
sock_copy(struct sock * nsk,const struct sock * osk)1934 static void sock_copy(struct sock *nsk, const struct sock *osk)
1935 {
1936 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1937 #ifdef CONFIG_SECURITY_NETWORK
1938 	void *sptr = nsk->sk_security;
1939 #endif
1940 
1941 	/* If we move sk_tx_queue_mapping out of the private section,
1942 	 * we must check if sk_tx_queue_clear() is called after
1943 	 * sock_copy() in sk_clone_lock().
1944 	 */
1945 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1946 		     offsetof(struct sock, sk_dontcopy_begin) ||
1947 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1948 		     offsetof(struct sock, sk_dontcopy_end));
1949 
1950 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1951 
1952 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1953 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1954 
1955 #ifdef CONFIG_SECURITY_NETWORK
1956 	nsk->sk_security = sptr;
1957 	security_sk_clone(osk, nsk);
1958 #endif
1959 }
1960 
/* Allocate the memory for a new sock of protocol @prot.
 *
 * The object comes from the protocol's slab cache when it has one and from
 * kmalloc() otherwise.  On top of the raw allocation, the security blob is
 * set up and a reference on the protocol's owning module is taken; if either
 * step fails everything is undone and NULL is returned.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct kmem_cache *cache = prot->slab;
	struct sock *sk;

	if (cache) {
		/* __GFP_ZERO is masked for the slab allocation; zeroing is
		 * done through sk_prot_clear_nulls() when requested.
		 */
		sk = kmem_cache_alloc(cache, priority & ~__GFP_ZERO);
		if (sk && want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else {
		sk = kmalloc(prot->obj_size, priority);
	}

	if (!sk)
		return NULL;

	if (security_sk_alloc(sk, family, priority))
		goto out_free;

	if (!try_module_get(prot->owner))
		goto out_free_sec;

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (cache)
		kmem_cache_free(cache, sk);
	else
		kfree(sk);
	return NULL;
}
1996 
sk_prot_free(struct proto * prot,struct sock * sk)1997 static void sk_prot_free(struct proto *prot, struct sock *sk)
1998 {
1999 	struct kmem_cache *slab;
2000 	struct module *owner;
2001 
2002 	owner = prot->owner;
2003 	slab = prot->slab;
2004 
2005 	cgroup_sk_free(&sk->sk_cgrp_data);
2006 	mem_cgroup_sk_free(sk);
2007 	security_sk_free(sk);
2008 	if (slab != NULL)
2009 		kmem_cache_free(slab, sk);
2010 	else
2011 		kfree(sk);
2012 	module_put(owner);
2013 }
2014 
2015 /**
2016  *	sk_alloc - All socket objects are allocated here
2017  *	@net: the applicable net namespace
2018  *	@family: protocol family
2019  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2020  *	@prot: struct proto associated with this new sock instance
2021  *	@kern: is this to be a kernel socket?
2022  */
sk_alloc(struct net * net,int family,gfp_t priority,struct proto * prot,int kern)2023 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2024 		      struct proto *prot, int kern)
2025 {
2026 	struct sock *sk;
2027 
2028 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2029 	if (sk) {
2030 		sk->sk_family = family;
2031 		/*
2032 		 * See comment in struct sock definition to understand
2033 		 * why we need sk_prot_creator -acme
2034 		 */
2035 		sk->sk_prot = sk->sk_prot_creator = prot;
2036 		sk->sk_kern_sock = kern;
2037 		sock_lock_init(sk);
2038 		sk->sk_net_refcnt = kern ? 0 : 1;
2039 		if (likely(sk->sk_net_refcnt)) {
2040 			get_net_track(net, &sk->ns_tracker, priority);
2041 			sock_inuse_add(net, 1);
2042 		}
2043 
2044 		sock_net_set(sk, net);
2045 		refcount_set(&sk->sk_wmem_alloc, 1);
2046 
2047 		mem_cgroup_sk_alloc(sk);
2048 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2049 		sock_update_classid(&sk->sk_cgrp_data);
2050 		sock_update_netprioidx(&sk->sk_cgrp_data);
2051 		sk_tx_queue_clear(sk);
2052 	}
2053 
2054 	return sk;
2055 }
2056 EXPORT_SYMBOL(sk_alloc);
2057 
2058 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2059  * grace period. This is the case for UDP sockets and TCP listeners.
2060  */
__sk_destruct(struct rcu_head * head)2061 static void __sk_destruct(struct rcu_head *head)
2062 {
2063 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2064 	struct sk_filter *filter;
2065 
2066 	if (sk->sk_destruct)
2067 		sk->sk_destruct(sk);
2068 
2069 	filter = rcu_dereference_check(sk->sk_filter,
2070 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2071 	if (filter) {
2072 		sk_filter_uncharge(sk, filter);
2073 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2074 	}
2075 
2076 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2077 
2078 #ifdef CONFIG_BPF_SYSCALL
2079 	bpf_sk_storage_free(sk);
2080 #endif
2081 
2082 	if (atomic_read(&sk->sk_omem_alloc))
2083 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2084 			 __func__, atomic_read(&sk->sk_omem_alloc));
2085 
2086 	if (sk->sk_frag.page) {
2087 		put_page(sk->sk_frag.page);
2088 		sk->sk_frag.page = NULL;
2089 	}
2090 
2091 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2092 	put_cred(sk->sk_peer_cred);
2093 	put_pid(sk->sk_peer_pid);
2094 
2095 	if (likely(sk->sk_net_refcnt))
2096 		put_net_track(sock_net(sk), &sk->ns_tracker);
2097 	sk_prot_free(sk->sk_prot_creator, sk);
2098 }
2099 
sk_destruct(struct sock * sk)2100 void sk_destruct(struct sock *sk)
2101 {
2102 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2103 
2104 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2105 		reuseport_detach_sock(sk);
2106 		use_call_rcu = true;
2107 	}
2108 
2109 	if (use_call_rcu)
2110 		call_rcu(&sk->sk_rcu, __sk_destruct);
2111 	else
2112 		__sk_destruct(&sk->sk_rcu);
2113 }
2114 
__sk_free(struct sock * sk)2115 static void __sk_free(struct sock *sk)
2116 {
2117 	if (likely(sk->sk_net_refcnt))
2118 		sock_inuse_add(sock_net(sk), -1);
2119 
2120 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2121 		sock_diag_broadcast_destroy(sk);
2122 	else
2123 		sk_destruct(sk);
2124 }
2125 
/**
 *	sk_free - drop the base sk_wmem_alloc reference of a socket
 *	@sk: socket to release
 *
 *	sk_alloc() initialises sk_wmem_alloc to 1; this drops that count and
 *	calls __sk_free() only when it reaches zero.
 */
void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
2137 
sk_init_common(struct sock * sk)2138 static void sk_init_common(struct sock *sk)
2139 {
2140 	skb_queue_head_init(&sk->sk_receive_queue);
2141 	skb_queue_head_init(&sk->sk_write_queue);
2142 	skb_queue_head_init(&sk->sk_error_queue);
2143 
2144 	rwlock_init(&sk->sk_callback_lock);
2145 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2146 			af_rlock_keys + sk->sk_family,
2147 			af_family_rlock_key_strings[sk->sk_family]);
2148 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2149 			af_wlock_keys + sk->sk_family,
2150 			af_family_wlock_key_strings[sk->sk_family]);
2151 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2152 			af_elock_keys + sk->sk_family,
2153 			af_family_elock_key_strings[sk->sk_family]);
2154 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2155 			af_callback_keys + sk->sk_family,
2156 			af_family_clock_key_strings[sk->sk_family]);
2157 }
2158 
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Allocates a new socket from @sk's protocol, copies @sk into it and
 *	then resets the state that must not be inherited from the parent
 *	(backlog, queues, memory accounting, dst cache, error state, socket
 *	and wait-queue pointers, ...).
 *
 *	Returns the clone with its bh lock held and sk_refcnt set to 2, or
 *	NULL when the allocation or one of the cloning steps failed (in that
 *	case the partially built clone has already been unlocked and freed).
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	/* Read sk_prot once so allocation and sk_prot_creator agree. */
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt)) {
		/* The clone takes its own netns reference and in-use count. */
		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
		sock_inuse_add(sock_net(newsk), 1);
	}
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	/* The clone stays bh-locked until the caller (or the error path
	 * below) unlocks it.
	 */
	bh_lock_sock(newsk);
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	newsk->sk_reserved_mem  = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head	= NULL;
	/* Inherit the parent's user locks except SOCK_BINDPORT_LOCK. */
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err	   = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	/* Increment the counter in the same struct proto as the master
	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
	 * is the same as sk->sk_prot->socks, as this field was copied
	 * with memcpy).
	 *
	 * This _changes_ the previous behaviour, where
	 * tcp_create_openreq_child always was incrementing the
	 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
	 * to be taken into account in all callers. -acme
	 */
	sk_refcnt_debug_inc(newsk);
	/* The clone is not attached to any struct socket or wait queue yet. */
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
2288 
/*
 * Dispose of a clone that is still bh-locked: clear its destructor,
 * release the bh lock and free it through sk_free().
 */
void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still a raw copy of the parent, so invalidate the
	 * destructor and do a plain sk_free().
	 */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2298 
sk_trim_gso_size(struct sock * sk)2299 static void sk_trim_gso_size(struct sock *sk)
2300 {
2301 	if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
2302 		return;
2303 #if IS_ENABLED(CONFIG_IPV6)
2304 	if (sk->sk_family == AF_INET6 &&
2305 	    sk_is_tcp(sk) &&
2306 	    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
2307 		return;
2308 #endif
2309 	sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
2310 }
2311 
sk_setup_caps(struct sock * sk,struct dst_entry * dst)2312 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2313 {
2314 	u32 max_segs = 1;
2315 
2316 	sk_dst_set(sk, dst);
2317 	sk->sk_route_caps = dst->dev->features;
2318 	if (sk_is_tcp(sk))
2319 		sk->sk_route_caps |= NETIF_F_GSO;
2320 	if (sk->sk_route_caps & NETIF_F_GSO)
2321 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2322 	if (unlikely(sk->sk_gso_disabled))
2323 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2324 	if (sk_can_gso(sk)) {
2325 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2326 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2327 		} else {
2328 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2329 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2330 			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2331 			sk_trim_gso_size(sk);
2332 			sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2333 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2334 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2335 		}
2336 	}
2337 	sk->sk_gso_max_segs = max_segs;
2338 }
2339 EXPORT_SYMBOL_GPL(sk_setup_caps);
2340 
2341 /*
2342  *	Simple resource managers for sockets.
2343  */
2344 
2345 
/*
 * Write buffer destructor automatically called from kfree_skb.
 *
 * Subtracts skb->truesize from the owning socket's sk_wmem_alloc, invokes
 * the socket's write-space notification unless SOCK_USE_WRITE_QUEUE is
 * set, and frees the socket via __sk_free() when sk_wmem_alloc drops to
 * zero.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;
	bool free;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * SOCK_RCU_FREE sockets using the default write_space
		 * callback: the whole length is subtracted at once and the
		 * wakeup runs inside the RCU read-side section.
		 */
		if (sock_flag(sk, SOCK_RCU_FREE) &&
		    sk->sk_write_space == sock_def_write_space) {
			rcu_read_lock();
			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
			sock_def_write_space_wfree(sk);
			rcu_read_unlock();
			if (unlikely(free))
				__sk_free(sk);
			return;
		}

		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		/* Only the reference kept above remains to be dropped. */
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
2383 
2384 /* This variant of sock_wfree() is used by TCP,
2385  * since it sets SOCK_USE_WRITE_QUEUE.
2386  */
__sock_wfree(struct sk_buff * skb)2387 void __sock_wfree(struct sk_buff *skb)
2388 {
2389 	struct sock *sk = skb->sk;
2390 
2391 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2392 		__sk_free(sk);
2393 }
2394 
/*
 * Make @sk the owner of @skb on the write side.
 *
 * Any previous owner is dropped with skb_orphan(). For a full socket the
 * skb's truesize is charged to sk_wmem_alloc and sock_wfree() becomes the
 * destructor. With CONFIG_INET, a non-full socket instead gets a regular
 * reference (sock_hold()) released by the sock_edemux() destructor.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() wont free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
2416 
can_skb_orphan_partial(const struct sk_buff * skb)2417 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2418 {
2419 #ifdef CONFIG_TLS_DEVICE
2420 	/* Drivers depend on in-order delivery for crypto offload,
2421 	 * partial orphan breaks out-of-order-OK logic.
2422 	 */
2423 	if (skb->decrypted)
2424 		return false;
2425 #endif
2426 	return (skb->destructor == sock_wfree ||
2427 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2428 }
2429 
2430 /* This helper is used by netem, as it can hold packets in its
2431  * delay queue. We want to allow the owner socket to send more
2432  * packets, as if they were already TX completed by a typical driver.
2433  * But we also want to keep skb->sk set because some packet schedulers
2434  * rely on it (sch_fq for example).
2435  */
skb_orphan_partial(struct sk_buff * skb)2436 void skb_orphan_partial(struct sk_buff *skb)
2437 {
2438 	if (skb_is_tcp_pure_ack(skb))
2439 		return;
2440 
2441 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2442 		return;
2443 
2444 	skb_orphan(skb);
2445 }
2446 EXPORT_SYMBOL(skb_orphan_partial);
2447 
2448 /*
2449  * Read buffer destructor automatically called from kfree_skb.
2450  */
sock_rfree(struct sk_buff * skb)2451 void sock_rfree(struct sk_buff *skb)
2452 {
2453 	struct sock *sk = skb->sk;
2454 	unsigned int len = skb->truesize;
2455 
2456 	atomic_sub(len, &sk->sk_rmem_alloc);
2457 	sk_mem_uncharge(sk, len);
2458 }
2459 EXPORT_SYMBOL(sock_rfree);
2460 
2461 /*
2462  * Buffer destructor for skbs that are not used directly in read or write
2463  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2464  */
sock_efree(struct sk_buff * skb)2465 void sock_efree(struct sk_buff *skb)
2466 {
2467 	sock_put(skb->sk);
2468 }
2469 EXPORT_SYMBOL(sock_efree);
2470 
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The reference is dropped only when sk_is_refcounted() reports that the
 * socket is reference counted.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *owner = skb->sk;

	if (!sk_is_refcounted(owner))
		return;

	sock_gen_put(owner);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2482 
sock_i_uid(struct sock * sk)2483 kuid_t sock_i_uid(struct sock *sk)
2484 {
2485 	kuid_t uid;
2486 
2487 	read_lock_bh(&sk->sk_callback_lock);
2488 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2489 	read_unlock_bh(&sk->sk_callback_lock);
2490 	return uid;
2491 }
2492 EXPORT_SYMBOL(sock_i_uid);
2493 
sock_i_ino(struct sock * sk)2494 unsigned long sock_i_ino(struct sock *sk)
2495 {
2496 	unsigned long ino;
2497 
2498 	read_lock_bh(&sk->sk_callback_lock);
2499 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2500 	read_unlock_bh(&sk->sk_callback_lock);
2501 	return ino;
2502 }
2503 EXPORT_SYMBOL(sock_i_ino);
2504 
2505 /*
2506  * Allocate a skb from the socket's send buffer.
2507  */
sock_wmalloc(struct sock * sk,unsigned long size,int force,gfp_t priority)2508 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2509 			     gfp_t priority)
2510 {
2511 	if (force ||
2512 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2513 		struct sk_buff *skb = alloc_skb(size, priority);
2514 
2515 		if (skb) {
2516 			skb_set_owner_w(skb, sk);
2517 			return skb;
2518 		}
2519 	}
2520 	return NULL;
2521 }
2522 EXPORT_SYMBOL(sock_wmalloc);
2523 
sock_ofree(struct sk_buff * skb)2524 static void sock_ofree(struct sk_buff *skb)
2525 {
2526 	struct sock *sk = skb->sk;
2527 
2528 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2529 }
2530 
/*
 * Allocate a skb charged to the socket's option memory (sk_omem_alloc).
 *
 * Returns NULL when the charge would push sk_omem_alloc above
 * sysctl_optmem_max or when alloc_skb() fails. The charge is released by
 * the sock_ofree() destructor.
 */
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    READ_ONCE(sysctl_optmem_max))
		return NULL;

	skb = alloc_skb(size, priority);
	if (skb) {
		atomic_add(skb->truesize, &sk->sk_omem_alloc);
		skb->sk = sk;
		skb->destructor = sock_ofree;
	}

	return skb;
}
2550 
2551 /*
2552  * Allocate a memory block from the socket's option memory buffer.
2553  */
sock_kmalloc(struct sock * sk,int size,gfp_t priority)2554 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2555 {
2556 	int optmem_max = READ_ONCE(sysctl_optmem_max);
2557 
2558 	if ((unsigned int)size <= optmem_max &&
2559 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2560 		void *mem;
2561 		/* First do the add, to avoid the race if kmalloc
2562 		 * might sleep.
2563 		 */
2564 		atomic_add(size, &sk->sk_omem_alloc);
2565 		mem = kmalloc(size, priority);
2566 		if (mem)
2567 			return mem;
2568 		atomic_sub(size, &sk->sk_omem_alloc);
2569 	}
2570 	return NULL;
2571 }
2572 EXPORT_SYMBOL(sock_kmalloc);
2573 
2574 /* Free an option memory block. Note, we actually want the inline
2575  * here as this allows gcc to detect the nullify and fold away the
2576  * condition entirely.
2577  */
__sock_kfree_s(struct sock * sk,void * mem,int size,const bool nullify)2578 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2579 				  const bool nullify)
2580 {
2581 	if (WARN_ON_ONCE(!mem))
2582 		return;
2583 	if (nullify)
2584 		kfree_sensitive(mem);
2585 	else
2586 		kfree(mem);
2587 	atomic_sub(size, &sk->sk_omem_alloc);
2588 }
2589 
/*
 * Free an option memory block of @size bytes with kfree() and uncharge it
 * from sk_omem_alloc.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2595 
/*
 * Same as sock_kfree_s(), but the block is released with
 * kfree_sensitive() instead of kfree().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2601 
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 *
 * Sleep interruptibly until write memory is available (sk_wmem_alloc
 * below sk_sndbuf), the send side is shut down, a socket error is
 * pending, a signal is pending, or @timeo runs out.
 *
 * Returns the remaining timeout.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		/* Queue ourselves before re-checking the wake-up conditions. */
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
2628 
2629 
2630 /*
2631  *	Generic send/receive buffer handlers
2632  */
2633 
/*
 * Allocate a send skb (linear part @header_len, paged part @data_len, page
 * order capped by @max_page_order) once the socket has send-buffer space,
 * waiting for space according to the send timeout (sock_sndtimeo()).
 *
 * Returns the skb, owned by @sk through skb_set_owner_w(), or NULL with
 * *@errcode set:
 *   - the pending socket error reported by sock_error(),
 *   - -EPIPE when the send side is shut down,
 *   - -EAGAIN when no space is available and the timeout is zero,
 *   - sock_intr_errno() when a signal is pending.
 * If the skb allocation itself fails, *@errcode is whatever
 * alloc_skb_with_frags() stored there.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		/* Enough room in the send buffer: go allocate. */
		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
2677 
__sock_cmsg_send(struct sock * sk,struct msghdr * msg,struct cmsghdr * cmsg,struct sockcm_cookie * sockc)2678 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2679 		     struct sockcm_cookie *sockc)
2680 {
2681 	u32 tsflags;
2682 
2683 	switch (cmsg->cmsg_type) {
2684 	case SO_MARK:
2685 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2686 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2687 			return -EPERM;
2688 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2689 			return -EINVAL;
2690 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2691 		break;
2692 	case SO_TIMESTAMPING_OLD:
2693 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2694 			return -EINVAL;
2695 
2696 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2697 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2698 			return -EINVAL;
2699 
2700 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2701 		sockc->tsflags |= tsflags;
2702 		break;
2703 	case SCM_TXTIME:
2704 		if (!sock_flag(sk, SOCK_TXTIME))
2705 			return -EINVAL;
2706 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2707 			return -EINVAL;
2708 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2709 		break;
2710 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2711 	case SCM_RIGHTS:
2712 	case SCM_CREDENTIALS:
2713 		break;
2714 	default:
2715 		return -EINVAL;
2716 	}
2717 	return 0;
2718 }
2719 EXPORT_SYMBOL(__sock_cmsg_send);
2720 
sock_cmsg_send(struct sock * sk,struct msghdr * msg,struct sockcm_cookie * sockc)2721 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2722 		   struct sockcm_cookie *sockc)
2723 {
2724 	struct cmsghdr *cmsg;
2725 	int ret;
2726 
2727 	for_each_cmsghdr(cmsg, msg) {
2728 		if (!CMSG_OK(msg, cmsg))
2729 			return -EINVAL;
2730 		if (cmsg->cmsg_level != SOL_SOCKET)
2731 			continue;
2732 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2733 		if (ret)
2734 			return ret;
2735 	}
2736 	return 0;
2737 }
2738 EXPORT_SYMBOL(sock_cmsg_send);
2739 
sk_enter_memory_pressure(struct sock * sk)2740 static void sk_enter_memory_pressure(struct sock *sk)
2741 {
2742 	if (!sk->sk_prot->enter_memory_pressure)
2743 		return;
2744 
2745 	sk->sk_prot->enter_memory_pressure(sk);
2746 }
2747 
sk_leave_memory_pressure(struct sock * sk)2748 static void sk_leave_memory_pressure(struct sock *sk)
2749 {
2750 	if (sk->sk_prot->leave_memory_pressure) {
2751 		sk->sk_prot->leave_memory_pressure(sk);
2752 	} else {
2753 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2754 
2755 		if (memory_pressure && READ_ONCE(*memory_pressure))
2756 			WRITE_ONCE(*memory_pressure, 0);
2757 	}
2758 }
2759 
2760 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2761 
2762 /**
2763  * skb_page_frag_refill - check that a page_frag contains enough room
2764  * @sz: minimum size of the fragment we want to get
2765  * @pfrag: pointer to page_frag
2766  * @gfp: priority for memory allocation
2767  *
2768  * Note: While this allocator tries to use high order pages, there is
2769  * no guarantee that allocations succeed. Therefore, @sz MUST be
2770  * less or equal than PAGE_SIZE.
2771  */
skb_page_frag_refill(unsigned int sz,struct page_frag * pfrag,gfp_t gfp)2772 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2773 {
2774 	if (pfrag->page) {
2775 		if (page_ref_count(pfrag->page) == 1) {
2776 			pfrag->offset = 0;
2777 			return true;
2778 		}
2779 		if (pfrag->offset + sz <= pfrag->size)
2780 			return true;
2781 		put_page(pfrag->page);
2782 	}
2783 
2784 	pfrag->offset = 0;
2785 	if (SKB_FRAG_PAGE_ORDER &&
2786 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2787 		/* Avoid direct reclaim but allow kswapd to wake */
2788 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2789 					  __GFP_COMP | __GFP_NOWARN |
2790 					  __GFP_NORETRY,
2791 					  SKB_FRAG_PAGE_ORDER);
2792 		if (likely(pfrag->page)) {
2793 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2794 			return true;
2795 		}
2796 	}
2797 	pfrag->page = alloc_page(gfp);
2798 	if (likely(pfrag->page)) {
2799 		pfrag->size = PAGE_SIZE;
2800 		return true;
2801 	}
2802 	return false;
2803 }
2804 EXPORT_SYMBOL(skb_page_frag_refill);
2805 
sk_page_frag_refill(struct sock * sk,struct page_frag * pfrag)2806 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2807 {
2808 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2809 		return true;
2810 
2811 	sk_enter_memory_pressure(sk);
2812 	sk_stream_moderate_sndbuf(sk);
2813 	return false;
2814 }
2815 EXPORT_SYMBOL(sk_page_frag_refill);
2816 
__lock_sock(struct sock * sk)2817 void __lock_sock(struct sock *sk)
2818 	__releases(&sk->sk_lock.slock)
2819 	__acquires(&sk->sk_lock.slock)
2820 {
2821 	DEFINE_WAIT(wait);
2822 
2823 	for (;;) {
2824 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2825 					TASK_UNINTERRUPTIBLE);
2826 		spin_unlock_bh(&sk->sk_lock.slock);
2827 		schedule();
2828 		spin_lock_bh(&sk->sk_lock.slock);
2829 		if (!sock_owned_by_user(sk))
2830 			break;
2831 	}
2832 	finish_wait(&sk->sk_lock.wq, &wait);
2833 }
2834 
/*
 * Process every skb queued on the socket backlog through sk_backlog_rcv().
 *
 * Entered and left with sk->sk_lock.slock held. The backlog list is
 * detached under the lock and then processed with the lock dropped, so
 * new packets may be queued meanwhile; the outer loop repeats until the
 * backlog is found empty.
 */
void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		/* Detach the whole list before dropping the lock. */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Zeroing the length only here guarantees that we cannot loop
	 * forever while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
2867 
/*
 * Take the socket spinlock (bottom halves disabled) and process the
 * backlog through __release_sock().
 */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2874 
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * The wait condition is that the tail of sk_receive_queue differs from
 * @skb. SOCKWQ_ASYNC_WAITDATA is set for the duration of the wait.
 *
 * Return: the result of sk_wait_event() for that condition.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
2899 
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *	@amt is added to the protocol's memory accounting (and charged to
 *	the socket's memcg when one is attached) up front; both are undone
 *	again when the allocation is refused.
 *
 *	Return: 1 if the allocation is allowed, 0 if it is suppressed.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	long allocated = sk_memory_allocated_add(sk, amt);
	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
	bool charged = true;

	/* A failed memcg charge goes straight to the suppression logic. */
	if (memcg_charge &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
						gfp_memcg_charge())))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		/* Stream sockets are judged on sk_wmem_queued, all other
		 * socket types on sk_wmem_alloc.
		 */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		/* Allow the allocation while the hard limit exceeds the
		 * socket count multiplied by this socket's own usage in
		 * pages.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg_charge && !charged) {
				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			}
			return 1;
		}
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Refused: undo the accounting done at the top of the function. */
	sk_memory_allocated_sub(sk, amt);

	if (memcg_charge && charged)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_raise_allocated);
2993 
2994 /**
2995  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2996  *	@sk: socket
2997  *	@size: memory size to allocate
2998  *	@kind: allocation type
2999  *
3000  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3001  *	rmem allocation. This function assumes that protocols which have
3002  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3003  */
__sk_mem_schedule(struct sock * sk,int size,int kind)3004 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3005 {
3006 	int ret, amt = sk_mem_pages(size);
3007 
3008 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
3009 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3010 	if (!ret)
3011 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
3012 	return ret;
3013 }
3014 EXPORT_SYMBOL(__sk_mem_schedule);
3015 
3016 /**
3017  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3018  *	@sk: socket
3019  *	@amount: number of quanta
3020  *
3021  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3022  */
__sk_mem_reduce_allocated(struct sock * sk,int amount)3023 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3024 {
3025 	sk_memory_allocated_sub(sk, amount);
3026 
3027 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3028 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3029 
3030 	if (sk_under_memory_pressure(sk) &&
3031 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3032 		sk_leave_memory_pressure(sk);
3033 }
3034 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
3035 
3036 /**
3037  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3038  *	@sk: socket
3039  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
3040  */
__sk_mem_reclaim(struct sock * sk,int amount)3041 void __sk_mem_reclaim(struct sock *sk, int amount)
3042 {
3043 	amount >>= SK_MEM_QUANTUM_SHIFT;
3044 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
3045 	__sk_mem_reduce_allocated(sk, amount);
3046 }
3047 EXPORT_SYMBOL(__sk_mem_reclaim);
3048 
sk_set_peek_off(struct sock * sk,int val)3049 int sk_set_peek_off(struct sock *sk, int val)
3050 {
3051 	sk->sk_peek_off = val;
3052 	return 0;
3053 }
3054 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3055 
3056 /*
3057  * Set of default routines for initialising struct proto_ops when
3058  * the protocol does not support a particular function. In certain
3059  * cases where it makes no sense for a protocol to have a "do nothing"
3060  * function, some default processing is provided.
3061  */
3062 
/* Default bind handler for protocols without bind support: -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
3068 
/* Default connect handler for protocols without connect support: -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
3075 
/* Default socketpair handler for protocols without socketpair support: -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
3081 
/* Default accept handler for protocols without accept support: -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
3088 
/* Default getname handler for protocols without getname support: -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
3095 
/* Default ioctl handler for protocols without ioctl support: -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
3101 
/* Default listen handler for protocols without listen support: -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
3107 
/* Default shutdown handler for protocols without shutdown support: -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
3113 
/* Default sendmsg handler for protocols without sendmsg support: -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
3119 
/*
 * Variant of sock_no_sendmsg() taking a struct sock instead of a struct
 * socket: always fails with -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
3125 
/*
 * Placeholder recvmsg handler: nothing is received and the call always
 * fails with -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
3131 EXPORT_SYMBOL(sock_no_recvmsg);
3132 
/*
 * Placeholder mmap handler.  Unlike the other sock_no_*() stubs this one
 * reports -ENODEV, so that it matches the error code seen when no mmap
 * method is present at all.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	return -ENODEV;
}
3138 EXPORT_SYMBOL(sock_no_mmap);
3139 
3140 /*
3141  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3142  * various sock-based usage counts.
3143  */
__receive_sock(struct file * file)3144 void __receive_sock(struct file *file)
3145 {
3146 	struct socket *sock;
3147 
3148 	sock = sock_from_file(file);
3149 	if (sock) {
3150 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3151 		sock_update_classid(&sock->sk->sk_cgrp_data);
3152 	}
3153 }
3154 
sock_no_sendpage(struct socket * sock,struct page * page,int offset,size_t size,int flags)3155 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3156 {
3157 	ssize_t res;
3158 	struct msghdr msg = {.msg_flags = flags};
3159 	struct kvec iov;
3160 	char *kaddr = kmap(page);
3161 	iov.iov_base = kaddr + offset;
3162 	iov.iov_len = size;
3163 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3164 	kunmap(page);
3165 	return res;
3166 }
3167 EXPORT_SYMBOL(sock_no_sendpage);
3168 
sock_no_sendpage_locked(struct sock * sk,struct page * page,int offset,size_t size,int flags)3169 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3170 				int offset, size_t size, int flags)
3171 {
3172 	ssize_t res;
3173 	struct msghdr msg = {.msg_flags = flags};
3174 	struct kvec iov;
3175 	char *kaddr = kmap(page);
3176 
3177 	iov.iov_base = kaddr + offset;
3178 	iov.iov_len = size;
3179 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3180 	kunmap(page);
3181 	return res;
3182 }
3183 EXPORT_SYMBOL(sock_no_sendpage_locked);
3184 
3185 /*
3186  *	Default Socket Callbacks
3187  */
3188 
sock_def_wakeup(struct sock * sk)3189 static void sock_def_wakeup(struct sock *sk)
3190 {
3191 	struct socket_wq *wq;
3192 
3193 	rcu_read_lock();
3194 	wq = rcu_dereference(sk->sk_wq);
3195 	if (skwq_has_sleeper(wq))
3196 		wake_up_interruptible_all(&wq->wait);
3197 	rcu_read_unlock();
3198 }
3199 
sock_def_error_report(struct sock * sk)3200 static void sock_def_error_report(struct sock *sk)
3201 {
3202 	struct socket_wq *wq;
3203 
3204 	rcu_read_lock();
3205 	wq = rcu_dereference(sk->sk_wq);
3206 	if (skwq_has_sleeper(wq))
3207 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3208 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3209 	rcu_read_unlock();
3210 }
3211 
sock_def_readable(struct sock * sk)3212 void sock_def_readable(struct sock *sk)
3213 {
3214 	struct socket_wq *wq;
3215 
3216 	rcu_read_lock();
3217 	wq = rcu_dereference(sk->sk_wq);
3218 	if (skwq_has_sleeper(wq))
3219 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3220 						EPOLLRDNORM | EPOLLRDBAND);
3221 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3222 	rcu_read_unlock();
3223 }
3224 
sock_def_write_space(struct sock * sk)3225 static void sock_def_write_space(struct sock *sk)
3226 {
3227 	struct socket_wq *wq;
3228 
3229 	rcu_read_lock();
3230 
3231 	/* Do not wake up a writer until he can make "significant"
3232 	 * progress.  --DaveM
3233 	 */
3234 	if (sock_writeable(sk)) {
3235 		wq = rcu_dereference(sk->sk_wq);
3236 		if (skwq_has_sleeper(wq))
3237 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3238 						EPOLLWRNORM | EPOLLWRBAND);
3239 
3240 		/* Should agree with poll, otherwise some programs break */
3241 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3242 	}
3243 
3244 	rcu_read_unlock();
3245 }
3246 
3247 /* An optimised version of sock_def_write_space(), should only be called
3248  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3249  * ->sk_wmem_alloc.
3250  */
sock_def_write_space_wfree(struct sock * sk)3251 static void sock_def_write_space_wfree(struct sock *sk)
3252 {
3253 	/* Do not wake up a writer until he can make "significant"
3254 	 * progress.  --DaveM
3255 	 */
3256 	if (sock_writeable(sk)) {
3257 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3258 
3259 		/* rely on refcount_sub from sock_wfree() */
3260 		smp_mb__after_atomic();
3261 		if (wq && waitqueue_active(&wq->wait))
3262 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3263 						EPOLLWRNORM | EPOLLWRBAND);
3264 
3265 		/* Should agree with poll, otherwise some programs break */
3266 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3267 	}
3268 }
3269 
/*
 * Default sk_destruct callback (installed by sock_init_data()):
 * deliberately empty.
 */
static void sock_def_destruct(struct sock *sk)
{
	/* nothing to tear down by default */
}
3273 
sk_send_sigurg(struct sock * sk)3274 void sk_send_sigurg(struct sock *sk)
3275 {
3276 	if (sk->sk_socket && sk->sk_socket->file)
3277 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3278 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3279 }
3280 EXPORT_SYMBOL(sk_send_sigurg);
3281 
/*
 * (Re)arm @timer to fire at @expires on behalf of @sk.
 *
 * A reference on @sk is taken with sock_hold() only when mod_timer()
 * returns zero; a non-zero return leaves the reference count untouched.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int ret = mod_timer(timer, expires);

	if (ret == 0)
		sock_hold(sk);
}
3288 EXPORT_SYMBOL(sk_reset_timer);
3289 
/*
 * Cancel @timer with del_timer().  When del_timer() returns non-zero the
 * reference on @sk is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int ret = del_timer(timer);

	if (ret != 0)
		__sock_put(sk);
}
3295 EXPORT_SYMBOL(sk_stop_timer);
3296 
/*
 * Same as sk_stop_timer() but cancels through del_timer_sync().  When
 * that returns non-zero the reference on @sk is dropped with
 * __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	int ret = del_timer_sync(timer);

	if (ret != 0)
		__sock_put(sk);
}
3302 EXPORT_SYMBOL(sk_stop_timer_sync);
3303 
sock_init_data(struct socket * sock,struct sock * sk)3304 void sock_init_data(struct socket *sock, struct sock *sk)
3305 {
3306 	sk_init_common(sk);
3307 	sk->sk_send_head	=	NULL;
3308 
3309 	timer_setup(&sk->sk_timer, NULL, 0);
3310 
3311 	sk->sk_allocation	=	GFP_KERNEL;
3312 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3313 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3314 	sk->sk_state		=	TCP_CLOSE;
3315 	sk_set_socket(sk, sock);
3316 
3317 	sock_set_flag(sk, SOCK_ZAPPED);
3318 
3319 	if (sock) {
3320 		sk->sk_type	=	sock->type;
3321 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3322 		sock->sk	=	sk;
3323 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
3324 	} else {
3325 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3326 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
3327 	}
3328 
3329 	rwlock_init(&sk->sk_callback_lock);
3330 	if (sk->sk_kern_sock)
3331 		lockdep_set_class_and_name(
3332 			&sk->sk_callback_lock,
3333 			af_kern_callback_keys + sk->sk_family,
3334 			af_family_kern_clock_key_strings[sk->sk_family]);
3335 	else
3336 		lockdep_set_class_and_name(
3337 			&sk->sk_callback_lock,
3338 			af_callback_keys + sk->sk_family,
3339 			af_family_clock_key_strings[sk->sk_family]);
3340 
3341 	sk->sk_state_change	=	sock_def_wakeup;
3342 	sk->sk_data_ready	=	sock_def_readable;
3343 	sk->sk_write_space	=	sock_def_write_space;
3344 	sk->sk_error_report	=	sock_def_error_report;
3345 	sk->sk_destruct		=	sock_def_destruct;
3346 
3347 	sk->sk_frag.page	=	NULL;
3348 	sk->sk_frag.offset	=	0;
3349 	sk->sk_peek_off		=	-1;
3350 
3351 	sk->sk_peer_pid 	=	NULL;
3352 	sk->sk_peer_cred	=	NULL;
3353 	spin_lock_init(&sk->sk_peer_lock);
3354 
3355 	sk->sk_write_pending	=	0;
3356 	sk->sk_rcvlowat		=	1;
3357 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3358 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3359 
3360 	sk->sk_stamp = SK_DEFAULT_STAMP;
3361 #if BITS_PER_LONG==32
3362 	seqlock_init(&sk->sk_stamp_seq);
3363 #endif
3364 	atomic_set(&sk->sk_zckey, 0);
3365 
3366 #ifdef CONFIG_NET_RX_BUSY_POLL
3367 	sk->sk_napi_id		=	0;
3368 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3369 #endif
3370 
3371 	sk->sk_max_pacing_rate = ~0UL;
3372 	sk->sk_pacing_rate = ~0UL;
3373 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3374 	sk->sk_incoming_cpu = -1;
3375 	sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
3376 
3377 	sk_rx_queue_clear(sk);
3378 	/*
3379 	 * Before updating sk_refcnt, we must commit prior changes to memory
3380 	 * (Documentation/RCU/rculist_nulls.rst for details)
3381 	 */
3382 	smp_wmb();
3383 	refcount_set(&sk->sk_refcnt, 1);
3384 	atomic_set(&sk->sk_drops, 0);
3385 }
3386 EXPORT_SYMBOL(sock_init_data);
3387 
lock_sock_nested(struct sock * sk,int subclass)3388 void lock_sock_nested(struct sock *sk, int subclass)
3389 {
3390 	/* The sk_lock has mutex_lock() semantics here. */
3391 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3392 
3393 	might_sleep();
3394 	spin_lock_bh(&sk->sk_lock.slock);
3395 	if (sock_owned_by_user_nocheck(sk))
3396 		__lock_sock(sk);
3397 	sk->sk_lock.owned = 1;
3398 	spin_unlock_bh(&sk->sk_lock.slock);
3399 }
3400 EXPORT_SYMBOL(lock_sock_nested);
3401 
release_sock(struct sock * sk)3402 void release_sock(struct sock *sk)
3403 {
3404 	spin_lock_bh(&sk->sk_lock.slock);
3405 	if (sk->sk_backlog.tail)
3406 		__release_sock(sk);
3407 
3408 	/* Warning : release_cb() might need to release sk ownership,
3409 	 * ie call sock_release_ownership(sk) before us.
3410 	 */
3411 	if (sk->sk_prot->release_cb)
3412 		sk->sk_prot->release_cb(sk);
3413 
3414 	sock_release_ownership(sk);
3415 	if (waitqueue_active(&sk->sk_lock.wq))
3416 		wake_up(&sk->sk_lock.wq);
3417 	spin_unlock_bh(&sk->sk_lock.slock);
3418 }
3419 EXPORT_SYMBOL(release_sock);
3420 
__lock_sock_fast(struct sock * sk)3421 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3422 {
3423 	might_sleep();
3424 	spin_lock_bh(&sk->sk_lock.slock);
3425 
3426 	if (!sock_owned_by_user_nocheck(sk)) {
3427 		/*
3428 		 * Fast path return with bottom halves disabled and
3429 		 * sock::sk_lock.slock held.
3430 		 *
3431 		 * The 'mutex' is not contended and holding
3432 		 * sock::sk_lock.slock prevents all other lockers to
3433 		 * proceed so the corresponding unlock_sock_fast() can
3434 		 * avoid the slow path of release_sock() completely and
3435 		 * just release slock.
3436 		 *
3437 		 * From a semantical POV this is equivalent to 'acquiring'
3438 		 * the 'mutex', hence the corresponding lockdep
3439 		 * mutex_release() has to happen in the fast path of
3440 		 * unlock_sock_fast().
3441 		 */
3442 		return false;
3443 	}
3444 
3445 	__lock_sock(sk);
3446 	sk->sk_lock.owned = 1;
3447 	__acquire(&sk->sk_lock.slock);
3448 	spin_unlock_bh(&sk->sk_lock.slock);
3449 	return true;
3450 }
3451 EXPORT_SYMBOL(__lock_sock_fast);
3452 
/*
 * Copy the socket's stored timestamp to user space.
 *
 * @sock:      socket whose sock timestamp is reported
 * @userstamp: user-space destination buffer
 * @timeval:   when true the sub-second part is reported in microseconds
 *             (tv_nsec is divided by 1000) instead of nanoseconds
 * @time32:    when true and CONFIG_COMPAT_32BIT_TIME is set, the result is
 *             written with put_old_timespec32()
 *
 * SOCK_TIMESTAMP is enabled on the sock as a side effect.  A stored stamp
 * whose seconds field is -1 yields -ENOENT.  A stored stamp whose seconds
 * field is 0 is replaced by the current real time, which is also written
 * back to the sock.
 *
 * Returns -ENOENT as described above, -EFAULT when the SPARC64 path fails
 * to copy to user space, otherwise the result of the put_*() helper used.
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		ktime_t kt = ktime_get_real();

		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	if (timeval)
		ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		/*
		 * tv lives on the kernel stack and is the *source* of the
		 * copy, so it must not carry the __user address-space
		 * annotation.
		 */
		struct __kernel_old_timeval tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};

		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
3490 EXPORT_SYMBOL(sock_gettstamp);
3491 
sock_enable_timestamp(struct sock * sk,enum sock_flags flag)3492 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3493 {
3494 	if (!sock_flag(sk, flag)) {
3495 		unsigned long previous_flags = sk->sk_flags;
3496 
3497 		sock_set_flag(sk, flag);
3498 		/*
3499 		 * we just set one of the two flags which require net
3500 		 * time stamping, but time stamping might have been on
3501 		 * already because of the other one
3502 		 */
3503 		if (sock_needs_netstamp(sk) &&
3504 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3505 			net_enable_timestamp();
3506 	}
3507 }
3508 
sock_recv_errqueue(struct sock * sk,struct msghdr * msg,int len,int level,int type)3509 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3510 		       int level, int type)
3511 {
3512 	struct sock_exterr_skb *serr;
3513 	struct sk_buff *skb;
3514 	int copied, err;
3515 
3516 	err = -EAGAIN;
3517 	skb = sock_dequeue_err_skb(sk);
3518 	if (skb == NULL)
3519 		goto out;
3520 
3521 	copied = skb->len;
3522 	if (copied > len) {
3523 		msg->msg_flags |= MSG_TRUNC;
3524 		copied = len;
3525 	}
3526 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3527 	if (err)
3528 		goto out_free_skb;
3529 
3530 	sock_recv_timestamp(msg, sk, skb);
3531 
3532 	serr = SKB_EXT_ERR(skb);
3533 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3534 
3535 	msg->msg_flags |= MSG_ERRQUEUE;
3536 	err = copied;
3537 
3538 out_free_skb:
3539 	kfree_skb(skb);
3540 out:
3541 	return err;
3542 }
3543 EXPORT_SYMBOL(sock_recv_errqueue);
3544 
3545 /*
3546  *	Get a socket option on an socket.
3547  *
3548  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3549  *	asynchronous errors should be reported by getsockopt. We assume
3550  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3551  */
sock_common_getsockopt(struct socket * sock,int level,int optname,char __user * optval,int __user * optlen)3552 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3553 			   char __user *optval, int __user *optlen)
3554 {
3555 	struct sock *sk = sock->sk;
3556 
3557 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3558 }
3559 EXPORT_SYMBOL(sock_common_getsockopt);
3560 
/*
 * Forward recvmsg to the sock's protocol.
 *
 * The protocol reports the address length through a local that starts
 * out as 0; it is stored into msg->msg_namelen only when the protocol
 * returned a non-negative value.  Returns the protocol's result.
 */
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int ret;

	ret = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
	if (ret < 0)
		return ret;

	msg->msg_namelen = addr_len;
	return ret;
}
3573 EXPORT_SYMBOL(sock_common_recvmsg);
3574 
3575 /*
3576  *	Set socket options on an inet socket.
3577  */
sock_common_setsockopt(struct socket * sock,int level,int optname,sockptr_t optval,unsigned int optlen)3578 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3579 			   sockptr_t optval, unsigned int optlen)
3580 {
3581 	struct sock *sk = sock->sk;
3582 
3583 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3584 }
3585 EXPORT_SYMBOL(sock_common_setsockopt);
3586 
sk_common_release(struct sock * sk)3587 void sk_common_release(struct sock *sk)
3588 {
3589 	if (sk->sk_prot->destroy)
3590 		sk->sk_prot->destroy(sk);
3591 
3592 	/*
3593 	 * Observation: when sk_common_release is called, processes have
3594 	 * no access to socket. But net still has.
3595 	 * Step one, detach it from networking:
3596 	 *
3597 	 * A. Remove from hash tables.
3598 	 */
3599 
3600 	sk->sk_prot->unhash(sk);
3601 
3602 	/*
3603 	 * In this point socket cannot receive new packets, but it is possible
3604 	 * that some packets are in flight because some CPU runs receiver and
3605 	 * did hash table lookup before we unhashed socket. They will achieve
3606 	 * receive queue and will be purged by socket destructor.
3607 	 *
3608 	 * Also we still have packets pending on receive queue and probably,
3609 	 * our own packets waiting in device queues. sock_destroy will drain
3610 	 * receive queue, but transmitted packets will delay socket destruction
3611 	 * until the last reference will be released.
3612 	 */
3613 
3614 	sock_orphan(sk);
3615 
3616 	xfrm_sk_free_policy(sk);
3617 
3618 	sk_refcnt_debug_release(sk);
3619 
3620 	sock_put(sk);
3621 }
3622 EXPORT_SYMBOL(sk_common_release);
3623 
/*
 * Fill @mem with a snapshot of the sock's memory accounting.
 *
 * @mem must have room for SK_MEMINFO_VARS u32 entries; the whole array is
 * zeroed first and then the SK_MEMINFO_* slots below are filled in.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, SK_MEMINFO_VARS * sizeof(mem[0]));

	/* receive side */
	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);

	/* send side */
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);

	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
3638 
3639 #ifdef CONFIG_PROC_FS
3640 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3641 
sock_prot_inuse_get(struct net * net,struct proto * prot)3642 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3643 {
3644 	int cpu, idx = prot->inuse_idx;
3645 	int res = 0;
3646 
3647 	for_each_possible_cpu(cpu)
3648 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3649 
3650 	return res >= 0 ? res : 0;
3651 }
3652 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3653 
sock_inuse_get(struct net * net)3654 int sock_inuse_get(struct net *net)
3655 {
3656 	int cpu, res = 0;
3657 
3658 	for_each_possible_cpu(cpu)
3659 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3660 
3661 	return res;
3662 }
3663 
3664 EXPORT_SYMBOL_GPL(sock_inuse_get);
3665 
sock_inuse_init_net(struct net * net)3666 static int __net_init sock_inuse_init_net(struct net *net)
3667 {
3668 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3669 	if (net->core.prot_inuse == NULL)
3670 		return -ENOMEM;
3671 	return 0;
3672 }
3673 
sock_inuse_exit_net(struct net * net)3674 static void __net_exit sock_inuse_exit_net(struct net *net)
3675 {
3676 	free_percpu(net->core.prot_inuse);
3677 }
3678 
3679 static struct pernet_operations net_inuse_ops = {
3680 	.init = sock_inuse_init_net,
3681 	.exit = sock_inuse_exit_net,
3682 };
3683 
net_inuse_init(void)3684 static __init int net_inuse_init(void)
3685 {
3686 	if (register_pernet_subsys(&net_inuse_ops))
3687 		panic("Cannot initialize net inuse counters");
3688 
3689 	return 0;
3690 }
3691 
3692 core_initcall(net_inuse_init);
3693 
assign_proto_idx(struct proto * prot)3694 static int assign_proto_idx(struct proto *prot)
3695 {
3696 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3697 
3698 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3699 		pr_err("PROTO_INUSE_NR exhausted\n");
3700 		return -ENOSPC;
3701 	}
3702 
3703 	set_bit(prot->inuse_idx, proto_inuse_idx);
3704 	return 0;
3705 }
3706 
release_proto_idx(struct proto * prot)3707 static void release_proto_idx(struct proto *prot)
3708 {
3709 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3710 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3711 }
3712 #else
/* !CONFIG_PROC_FS variant: no index bookkeeping, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3717 
/* !CONFIG_PROC_FS variant: nothing was assigned, nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3721 
3722 #endif
3723 
tw_prot_cleanup(struct timewait_sock_ops * twsk_prot)3724 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3725 {
3726 	if (!twsk_prot)
3727 		return;
3728 	kfree(twsk_prot->twsk_slab_name);
3729 	twsk_prot->twsk_slab_name = NULL;
3730 	kmem_cache_destroy(twsk_prot->twsk_slab);
3731 	twsk_prot->twsk_slab = NULL;
3732 }
3733 
tw_prot_init(const struct proto * prot)3734 static int tw_prot_init(const struct proto *prot)
3735 {
3736 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3737 
3738 	if (!twsk_prot)
3739 		return 0;
3740 
3741 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3742 					      prot->name);
3743 	if (!twsk_prot->twsk_slab_name)
3744 		return -ENOMEM;
3745 
3746 	twsk_prot->twsk_slab =
3747 		kmem_cache_create(twsk_prot->twsk_slab_name,
3748 				  twsk_prot->twsk_obj_size, 0,
3749 				  SLAB_ACCOUNT | prot->slab_flags,
3750 				  NULL);
3751 	if (!twsk_prot->twsk_slab) {
3752 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3753 			prot->name);
3754 		return -ENOMEM;
3755 	}
3756 
3757 	return 0;
3758 }
3759 
req_prot_cleanup(struct request_sock_ops * rsk_prot)3760 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3761 {
3762 	if (!rsk_prot)
3763 		return;
3764 	kfree(rsk_prot->slab_name);
3765 	rsk_prot->slab_name = NULL;
3766 	kmem_cache_destroy(rsk_prot->slab);
3767 	rsk_prot->slab = NULL;
3768 }
3769 
req_prot_init(const struct proto * prot)3770 static int req_prot_init(const struct proto *prot)
3771 {
3772 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3773 
3774 	if (!rsk_prot)
3775 		return 0;
3776 
3777 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3778 					prot->name);
3779 	if (!rsk_prot->slab_name)
3780 		return -ENOMEM;
3781 
3782 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3783 					   rsk_prot->obj_size, 0,
3784 					   SLAB_ACCOUNT | prot->slab_flags,
3785 					   NULL);
3786 
3787 	if (!rsk_prot->slab) {
3788 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3789 			prot->name);
3790 		return -ENOMEM;
3791 	}
3792 	return 0;
3793 }
3794 
proto_register(struct proto * prot,int alloc_slab)3795 int proto_register(struct proto *prot, int alloc_slab)
3796 {
3797 	int ret = -ENOBUFS;
3798 
3799 	if (prot->memory_allocated && !prot->sysctl_mem) {
3800 		pr_err("%s: missing sysctl_mem\n", prot->name);
3801 		return -EINVAL;
3802 	}
3803 	if (alloc_slab) {
3804 		prot->slab = kmem_cache_create_usercopy(prot->name,
3805 					prot->obj_size, 0,
3806 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3807 					prot->slab_flags,
3808 					prot->useroffset, prot->usersize,
3809 					NULL);
3810 
3811 		if (prot->slab == NULL) {
3812 			pr_crit("%s: Can't create sock SLAB cache!\n",
3813 				prot->name);
3814 			goto out;
3815 		}
3816 
3817 		if (req_prot_init(prot))
3818 			goto out_free_request_sock_slab;
3819 
3820 		if (tw_prot_init(prot))
3821 			goto out_free_timewait_sock_slab;
3822 	}
3823 
3824 	mutex_lock(&proto_list_mutex);
3825 	ret = assign_proto_idx(prot);
3826 	if (ret) {
3827 		mutex_unlock(&proto_list_mutex);
3828 		goto out_free_timewait_sock_slab;
3829 	}
3830 	list_add(&prot->node, &proto_list);
3831 	mutex_unlock(&proto_list_mutex);
3832 	return ret;
3833 
3834 out_free_timewait_sock_slab:
3835 	if (alloc_slab)
3836 		tw_prot_cleanup(prot->twsk_prot);
3837 out_free_request_sock_slab:
3838 	if (alloc_slab) {
3839 		req_prot_cleanup(prot->rsk_prot);
3840 
3841 		kmem_cache_destroy(prot->slab);
3842 		prot->slab = NULL;
3843 	}
3844 out:
3845 	return ret;
3846 }
3847 EXPORT_SYMBOL(proto_register);
3848 
proto_unregister(struct proto * prot)3849 void proto_unregister(struct proto *prot)
3850 {
3851 	mutex_lock(&proto_list_mutex);
3852 	release_proto_idx(prot);
3853 	list_del(&prot->node);
3854 	mutex_unlock(&proto_list_mutex);
3855 
3856 	kmem_cache_destroy(prot->slab);
3857 	prot->slab = NULL;
3858 
3859 	req_prot_cleanup(prot->rsk_prot);
3860 	tw_prot_cleanup(prot->twsk_prot);
3861 }
3862 EXPORT_SYMBOL(proto_unregister);
3863 
sock_load_diag_module(int family,int protocol)3864 int sock_load_diag_module(int family, int protocol)
3865 {
3866 	if (!protocol) {
3867 		if (!sock_is_registered(family))
3868 			return -ENOENT;
3869 
3870 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3871 				      NETLINK_SOCK_DIAG, family);
3872 	}
3873 
3874 #ifdef CONFIG_INET
3875 	if (family == AF_INET &&
3876 	    protocol != IPPROTO_RAW &&
3877 	    protocol < MAX_INET_PROTOS &&
3878 	    !rcu_access_pointer(inet_protos[protocol]))
3879 		return -ENOENT;
3880 #endif
3881 
3882 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3883 			      NETLINK_SOCK_DIAG, family, protocol);
3884 }
3885 EXPORT_SYMBOL(sock_load_diag_module);
3886 
3887 #ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex (released by proto_seq_stop())
 * and position the iterator at *pos within proto_list, list head
 * included.
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);

	return seq_list_start_head(&proto_list, *pos);
}
3894 
/* seq_file ->next: step from @v to the following entry of proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *next = seq_list_next(v, &proto_list, pos);

	return next;
}
3899 
proto_seq_stop(struct seq_file * seq,void * v)3900 static void proto_seq_stop(struct seq_file *seq, void *v)
3901 	__releases(proto_list_mutex)
3902 {
3903 	mutex_unlock(&proto_list_mutex);
3904 }
3905 
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the proc listing. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
sock_prot_memory_allocated(struct proto * proto)3910 static long sock_prot_memory_allocated(struct proto *proto)
3911 {
3912 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3913 }
3914 
sock_prot_memory_pressure(struct proto * proto)3915 static const char *sock_prot_memory_pressure(struct proto *proto)
3916 {
3917 	return proto->memory_pressure != NULL ?
3918 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3919 }
3920 
proto_seq_printf(struct seq_file * seq,struct proto * proto)3921 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3922 {
3923 
3924 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3925 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3926 		   proto->name,
3927 		   proto->obj_size,
3928 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3929 		   sock_prot_memory_allocated(proto),
3930 		   sock_prot_memory_pressure(proto),
3931 		   proto->max_header,
3932 		   proto->slab == NULL ? "no" : "yes",
3933 		   module_name(proto->owner),
3934 		   proto_method_implemented(proto->close),
3935 		   proto_method_implemented(proto->connect),
3936 		   proto_method_implemented(proto->disconnect),
3937 		   proto_method_implemented(proto->accept),
3938 		   proto_method_implemented(proto->ioctl),
3939 		   proto_method_implemented(proto->init),
3940 		   proto_method_implemented(proto->destroy),
3941 		   proto_method_implemented(proto->shutdown),
3942 		   proto_method_implemented(proto->setsockopt),
3943 		   proto_method_implemented(proto->getsockopt),
3944 		   proto_method_implemented(proto->sendmsg),
3945 		   proto_method_implemented(proto->recvmsg),
3946 		   proto_method_implemented(proto->sendpage),
3947 		   proto_method_implemented(proto->bind),
3948 		   proto_method_implemented(proto->backlog_rcv),
3949 		   proto_method_implemented(proto->hash),
3950 		   proto_method_implemented(proto->unhash),
3951 		   proto_method_implemented(proto->get_port),
3952 		   proto_method_implemented(proto->enter_memory_pressure));
3953 }
3954 
proto_seq_show(struct seq_file * seq,void * v)3955 static int proto_seq_show(struct seq_file *seq, void *v)
3956 {
3957 	if (v == &proto_list)
3958 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3959 			   "protocol",
3960 			   "size",
3961 			   "sockets",
3962 			   "memory",
3963 			   "press",
3964 			   "maxhdr",
3965 			   "slab",
3966 			   "module",
3967 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3968 	else
3969 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3970 	return 0;
3971 }
3972 
3973 static const struct seq_operations proto_seq_ops = {
3974 	.start  = proto_seq_start,
3975 	.next   = proto_seq_next,
3976 	.stop   = proto_seq_stop,
3977 	.show   = proto_seq_show,
3978 };
3979 
proto_init_net(struct net * net)3980 static __net_init int proto_init_net(struct net *net)
3981 {
3982 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3983 			sizeof(struct seq_net_private)))
3984 		return -ENOMEM;
3985 
3986 	return 0;
3987 }
3988 
proto_exit_net(struct net * net)3989 static __net_exit void proto_exit_net(struct net *net)
3990 {
3991 	remove_proc_entry("protocols", net->proc_net);
3992 }
3993 
3994 
3995 static __net_initdata struct pernet_operations proto_net_ops = {
3996 	.init = proto_init_net,
3997 	.exit = proto_exit_net,
3998 };
3999 
proto_init(void)4000 static int __init proto_init(void)
4001 {
4002 	return register_pernet_subsys(&proto_net_ops);
4003 }
4004 
4005 subsys_initcall(proto_init);
4006 
4007 #endif /* PROC_FS */
4008 
4009 #ifdef CONFIG_NET_RX_BUSY_POLL
sk_busy_loop_end(void * p,unsigned long start_time)4010 bool sk_busy_loop_end(void *p, unsigned long start_time)
4011 {
4012 	struct sock *sk = p;
4013 
4014 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4015 	       sk_busy_loop_timeout(sk, start_time);
4016 }
4017 EXPORT_SYMBOL(sk_busy_loop_end);
4018 #endif /* CONFIG_NET_RX_BUSY_POLL */
4019 
sock_bind_add(struct sock * sk,struct sockaddr * addr,int addr_len)4020 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4021 {
4022 	if (!sk->sk_prot->bind_add)
4023 		return -EOPNOTSUPP;
4024 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4025 }
4026 EXPORT_SYMBOL(sock_bind_add);
4027