// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replaced inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect
 *					algorithm. Lots of bug fixes.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen
 *					socket has been reached. This won't
 *					break old apps and it will avoid
 *					hashing a huge number of sockets
 *					(for unix_gc() performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs were
 *					introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT;
 *					the core infrastructure does that
 *					for all net proto families now
 *					(2.5.69+).
 *
 * Known differences from the reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected socket to the
 *	other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as the high
 *	water mark and a fake inode identifier (nor the BSD first-socket
 *	fstat-twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *	in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *	and a null first byte in the path (but not for gethost/peername -
 *	BSD bug??).
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *	(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS-based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with 0, so that this name space does not intersect
 *		  with BSD names.
 */
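
/* Example (userspace), as a sketch of the abstract namespace described
 * above: the name is the byte sequence after the leading NUL, its length
 * implied by the addr_len passed to bind() - there is no terminating NUL:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	memcpy(sun.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */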

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
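
/* Hash-space layout, as implied by the hash helpers below: unbound and
 * pathname (BSD) sockets hash into the lower range [0, UNIX_HASH_MOD] of
 * the per-netns table, while abstract sockets hash into the range above
 * it, so the two namespaces never share a bucket. Pathname sockets are
 * additionally chained in bsd_socket_buckets, keyed by inode number, for
 * unix_find_socket_byinode().
 */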

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}
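
/* Taking the two bucket locks in ascending index order gives a global lock
 * order, so two concurrent double-locks cannot deadlock (ABBA); the
 * SINGLE_DEPTH_NESTING annotation tells lockdep that taking two locks of
 * the same class here is intentional.
 */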

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(const struct sock *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) >
		READ_ONCE(sk->sk_max_ack_backlog);
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 * Check unix socket name:
 *	- it must not be zero length.
 *	- if it does not start with a NUL byte, it must be NUL-terminated
 *	  (a filesystem object)
 *	- if it starts with a NUL byte, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}
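
/* For reference, the three address forms this file handles (see unix_bind()
 * further below): addr_len == offsetof(struct sockaddr_un, sun_path) with
 * sun_family == AF_UNIX requests autobind; sun_path[0] == '\0' selects the
 * abstract namespace; anything else is a filesystem (BSD-style) pathname.
 */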

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle. 108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist. However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108. Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
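
/* As a sketch, the resulting call flow through the helpers below is:
 *
 *	sendmsg()/poll()  ->  unix_dgram_peer_wake_me(sk, other)
 *		enqueues sk's peer_wake entry on other's peer_wait;
 *	the receiver dequeues a datagram and wakes other's peer_wait
 *		->  unix_dgram_peer_wake_relay()
 *		->  wake_up_interruptible_poll() on sk's own wait queue,
 *		    where the blocked writer (or poller) is sleeping.
 */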

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its queue is full, we will hang waiting for
	 * POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

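/* A socket counts as writable while its write-side memory charge stays
 * under one quarter of sk_sndbuf (the "<< 2" below) and it is not
 * listening.
 */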
static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows
 * flow control based only on wmem_alloc; second, an sk connected to a peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal an error. Messages are lost. Don't do this when
		 * the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
	other->sk_state = TCP_CLOSE;
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();	/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	if (sk < peersk) {
		spin_lock(&sk->sk_peer_lock);
		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&peersk->sk_peer_lock);
		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	}
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

	spin_unlock(&sk->sk_peer_lock);
	spin_unlock(&peersk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}
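
/* Like unix_table_double_lock(), copy_peercred() orders the two peer locks
 * by socket address so that two concurrent cross-copies cannot deadlock.
 */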

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->inflight = 0;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
	/*
	 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
	 *	nothing uses it.
	 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
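
/* An autobound name, per the code above, is an abstract name of the form
 * NUL followed by five lowercase hex digits (e.g. "a1b2c" after the leading
 * NUL), giving addr->len == offsetof(struct sockaddr_un, sun_path) + 6.
 */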

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod? unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock_nested(sk2, U_LOCK_SECOND);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 * 1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we were to allocate after the state is locked, we would have
	 * to recheck everything again anyway.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.
	 *
	 * This is a tricky place. We need to grab our state lock and cannot
	 * drop the lock on peer. It is dangerous because a deadlock is
	 * possible. Connect-to-self and simultaneous connect attempts are
	 * eliminated by checking socket state. other is TCP_LISTEN; if sk is
	 * TCP_LISTEN, we check this before attempting to grab the lock.
	 *
	 * Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk, U_LOCK_SECOND);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock. Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire(). IOW, the same guarantees
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state  = SS_CONNECTED;
	sockb->state  = SS_CONNECTED;
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}

static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);

		if (peer)
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETPEERNAME);
		else
			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
					       CGROUP_UNIX_GETSOCKNAME);
	}
	sock_put(sk);
out:
	return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	/* Need to duplicate file references for the sake of garbage
	 * collection. Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->user, scm->fp->fp[i]);

	return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs). This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored. While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeueing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd. The following lock/unlock
	 * pair is to ensure serialization with garbage collection. It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate. If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS.
 * We include credentials if the source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}
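
/*
 * For illustration, a minimal userspace sketch (assuming "fd" is a
 * bound or connected AF_UNIX socket; not part of the original source)
 * of how a receiver opts in to the credentials maybe_add_creds()
 * attaches:
 *
 *	int on = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	// Subsequent recvmsg() calls on fd can then read an
 *	// SCM_CREDENTIALS cmsg (struct ucred: pid, uid, gid) even when
 *	// the sender used plain write().
 */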

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count))
		atomic_add(fp->count, &u->scm_stat.nr_fds);
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count))
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
}

/*
 * Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	wait_for_unix_gc(scm.fp);

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
							    msg->msg_name,
							    &msg->msg_namelen,
							    NULL);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 * Check with 1003.1g - what should a
		 * datagram error do here?
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock(),
			 * which is clearing @other. Never change state to TCP_CLOSE,
			 * unlike what SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			sk->sk_state = TCP_CLOSE;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
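
/*
 * For illustration (assuming common configurations, not stated in this
 * file): with 4 KiB pages, get_order(32768) == 3, so UNIX_SKB_FRAGS_SZ
 * is 4096 << 3 == 32768 bytes; with 64 KiB pages the order is 0 and
 * the limit becomes one full page, i.e. the larger of 32768 bytes and
 * PAGE_SIZE.
 */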

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	skb_get(skb);

	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);

	WRITE_ONCE(ousk->oob_skb, skb);

	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
#endif
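
/*
 * For illustration, a minimal userspace sketch (assuming a connected
 * SOCK_STREAM AF_UNIX pair on a kernel with CONFIG_AF_UNIX_OOB; not
 * part of the original source) of the out-of-band byte queue_oob()
 * handles:
 *
 *	send(fd, "!", 1, MSG_OOB);	// sender marks one OOB byte
 *
 *	char c;
 *	recv(peer, &c, 1, MSG_OOB);	// receiver fetches it out of band
 *	// ...or, after setsockopt(peer, SOL_SOCKET, SO_OOBINLINE, ...),
 *	// the byte is read in sequence and SIOCATMARK reports the mark.
 */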

static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	wait_for_unix_gc(scm.fp);

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}

static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}

int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name) {
		unix_copy_addr(msg, skb->sk);

		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
						      msg->msg_name,
						      &msg->msg_namelen);
	}

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could
		   - not return fds - good, but too simple 8)
		   - return fds, and not return them on read (the old
		     strategy, apparently wrong)
		   - clone fds (I chose this for now, it is the most
		     universal solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly, however!
		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
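
/*
 * For illustration, a minimal userspace sketch (assuming "fd" is a
 * datagram AF_UNIX socket with an SCM_RIGHTS message queued; not part
 * of the original source) of the PEEK behaviour chosen above - peeking
 * clones the passed fds, so the process gets descriptors even though
 * the message stays queued:
 *
 *	char data[64], ctl[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctl, .msg_controllen = sizeof(ctl),
 *	};
 *
 *	recvmsg(fd, &msg, MSG_PEEK);	// installs cloned fds
 *	recvmsg(fd, &msg, 0);		// dequeues; installs fds again
 */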

static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}

static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

	return recv_actor(sk, skb);
}

/*
 * Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);
	else
		skb_get(oob_skb);
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	consume_skb(oob_skb);

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}

static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		if (skb == u->oob_skb) {
			if (copied) {
				skb = NULL;
			} else if (sock_flag(sk, SOCK_URGINLINE)) {
				if (!(flags & MSG_PEEK)) {
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				}
			} else if (flags & MSG_PEEK) {
				skb = NULL;
			} else {
				skb_unlink(skb, &sk->sk_receive_queue);
				WRITE_ONCE(u->oob_skb, NULL);
				if (!WARN_ON_ONCE(skb_unref(skb)))
					kfree_skb(skb);
				skb = skb_peek(&sk->sk_receive_queue);
			}
		}
	}
	return skb;
}
#endif

static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
		return -ENOTCONN;

	return unix_read_skb(sk, recv_actor);
}

static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while we sleep in memcpy_to_msg().
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 * POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);

			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
							      state->msg->msg_name,
							      &state->msg->msg_namelen);

			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}

static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}

int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
			  size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sk->sk_socket,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}

static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}

static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}

static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}

static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		if (prot->unhash)
			prot->unhash(other);
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
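
/*
 * For illustration, a minimal userspace sketch (assuming a connected
 * SOCK_STREAM AF_UNIX pair; not part of the original source) of the
 * mapping implemented above:
 *
 *	shutdown(fd, SHUT_WR);	// mode 1 -> SEND_SHUTDOWN on fd and
 *				// RCV_SHUTDOWN on the peer: the peer's
 *				// read() now returns 0 (EOF), while fd
 *				// can still read data already queued.
 */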

long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);

static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}

static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct sk_buff *skb;
			int answ = 0;

			skb = skb_peek(&sk->sk_receive_queue);
			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
				answ = 1;
			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
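
/*
 * For illustration, a minimal userspace sketch (assuming "fd" is an
 * AF_UNIX socket and <sys/ioctl.h> plus <linux/sockios.h> are included;
 * not part of the original source) of the queue-length ioctls handled
 * above:
 *
 *	int pending, unsent;
 *
 *	ioctl(fd, SIOCINQ, &pending);	// bytes waiting in the receive queue
 *	ioctl(fd, SIOCOUTQ, &unsent);	// bytes not yet consumed by the peer
 */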

#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}

static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}

#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x)		((x) >> BUCKET_SPACE)
#define get_offset(x)		((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o)	((b) << BUCKET_SPACE | (o))
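
/*
 * For illustration, a worked example with hypothetical numbers (the
 * real UNIX_HASH_BITS value lives in af_unix.h): on 64-bit with
 * UNIX_HASH_BITS == 8, BUCKET_SPACE == 64 - 9 - 1 == 54, so
 * set_bucket_offset(3, 7) == (3UL << 54) | 7, and get_bucket() and
 * get_offset() recover 3 and 7 from that single loff_t cursor.
 */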

static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	unsigned long count = 0;
	struct sock *sk;

	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
	     sk; sk = sk_next(sk)) {
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}

static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;

	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	return unix_get_first(seq, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;

	if (v == SEQ_START_TOKEN)
		return unix_get_first(seq, pos);

	return unix_get_next(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
{
	struct sock *sk = v;

	if (sk)
		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}

static int unix_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under a hash table lock here */
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				len--;
			} else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}

static const struct seq_operations unix_seq_ops = {
	.start = unix_seq_start,
	.next  = unix_seq_next,
	.stop  = unix_seq_stop,
	.show  = unix_seq_show,
};

#ifdef CONFIG_BPF_SYSCALL
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
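
/*
 * For illustration, a minimal sketch of a BPF program consuming this
 * iterator context (assumptions: built against vmlinux.h with libbpf
 * and modeled on the kernel selftests; not part of this file):
 *
 *	SEC("iter/unix")
 *	int dump_unix(struct bpf_iter__unix *ctx)
 *	{
 *		struct unix_sock *unix_sk = ctx->unix_sk;
 *
 *		if (!unix_sk)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "uid=%u\n", ctx->uid);
 *		return 0;
 *	}
 *
 * Once pinned (e.g. "bpftool iter pin prog.o /sys/fs/bpf/unix"),
 * reading the pinned file walks every AF_UNIX socket in batches.
 */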

static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			      struct unix_sock *unix_sk, uid_t uid)
{
	struct bpf_iter__unix ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.unix_sk = unix_sk;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
		if (iter->end_sk < iter->max_sk) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}

		expected++;
	}

	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);

	return expected;
}

static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
				       unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_unix_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}

static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	return bpf_iter_unix_batch(seq, pos);
}

static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_unix_batch(seq, pos);

	return sk;
}

static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}

static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}

static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
#endif
#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};

static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}

static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

#define INIT_BATCH_SZ 16

static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}

static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
			     const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};

static void __init bpf_iter_register(void)
{
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
#endif

static int __init af_unix_init(void)
{
	int i, rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		proto_unregister(&unix_dgram_proto);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}

/* Later than subsys_initcall() because we depend on stuff initialised there */
fs_initcall(af_unix_init);
