af_packet.c source code [linux/net/packet/af_packet.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* INET An implementation of the TCP/IP protocol suite for the LINUX
4	* operating system. INET is implemented using the BSD Socket
5	* interface as the means of communication with the user level.
6	*
7	* PACKET - implements raw packet sockets.
8	*
9	* Authors: Ross Biro
10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11	* Alan Cox, <gw4pts@gw4pts.ampr.org>
12	*
13	* Fixes:
14	* Alan Cox : verify_area() now used correctly
15	* Alan Cox : new skbuff lists, look ma no backlogs!
16	* Alan Cox : tidied skbuff lists.
17	* Alan Cox : Now uses generic datagram routines I
18	* added. Also fixed the peek/read crash
19	* from all old Linux datagram code.
20	* Alan Cox : Uses the improved datagram code.
21	* Alan Cox : Added NULL's for socket options.
22	* Alan Cox : Re-commented the code.
23	* Alan Cox : Use new kernel side addressing
24	* Rob Janssen : Correct MTU usage.
25	* Dave Platt : Counter leaks caused by incorrect
26	* interrupt locking and some slightly
27	* dubious gcc output. Can you read
28	* compiler: it said _VOLATILE_
29	* Richard Kooijman : Timestamp fixes.
30	* Alan Cox : New buffers. Use sk->mac.raw.
31	* Alan Cox : sendmsg/recvmsg support.
32	* Alan Cox : Protocol setting support
33	* Alexey Kuznetsov : Untied from IPv4 stack.
34	* Cyrus Durgin : Fixed kerneld for kmod.
35	* Michal Ostrowski : Module initialization cleanup.
36	* Ulises Alonso : Frame number limit removal and
37	* packet_set_ring memory leak.
38	* Eric Biederman : Allow for > 8 byte hardware addresses.
39	* The convention is that longer addresses
40	* will simply extend the hardware address
41	* byte arrays at the end of sockaddr_ll
42	* and packet_mreq.
43	* Johann Baudy : Added TX RING.
44	* Chetan Loke : Implemented TPACKET_V3 block abstraction
45	* layer.
46	* Copyright (C) 2011, <lokec@ccs.neu.edu>
47	*/
48
49	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
50
51	#include <linux/ethtool.h>
52	#include <linux/filter.h>
53	#include <linux/types.h>
54	#include <linux/mm.h>
55	#include <linux/capability.h>
56	#include <linux/fcntl.h>
57	#include <linux/socket.h>
58	#include <linux/in.h>
59	#include <linux/inet.h>
60	#include <linux/netdevice.h>
61	#include <linux/if_packet.h>
62	#include <linux/wireless.h>
63	#include <linux/kernel.h>
64	#include <linux/kmod.h>
65	#include <linux/slab.h>
66	#include <linux/vmalloc.h>
67	#include <net/net_namespace.h>
68	#include <net/ip.h>
69	#include <net/protocol.h>
70	#include <linux/skbuff.h>
71	#include <net/sock.h>
72	#include <linux/errno.h>
73	#include <linux/timer.h>
74	#include <linux/uaccess.h>
75	#include <asm/ioctls.h>
76	#include <asm/page.h>
77	#include <asm/cacheflush.h>
78	#include <asm/io.h>
79	#include <linux/proc_fs.h>
80	#include <linux/seq_file.h>
81	#include <linux/poll.h>
82	#include <linux/module.h>
83	#include <linux/init.h>
84	#include <linux/mutex.h>
85	#include <linux/if_vlan.h>
86	#include <linux/virtio_net.h>
87	#include <linux/errqueue.h>
88	#include <linux/net_tstamp.h>
89	#include <linux/percpu.h>
90	#ifdef CONFIG_INET
91	#include <net/inet_common.h>
92	#endif
93	#include <linux/bpf.h>
94	#include <net/compat.h>
95	#include <linux/netfilter_netdev.h>
96
97	#include "internal.h"
98
99	/*
100	Assumptions:
101	- If the device has no dev->header_ops->create, there is no LL header
102	visible above the device. In this case, its hard_header_len should be 0.
103	The device may prepend its own header internally. In this case, its
104	needed_headroom should be set to the space needed for it to add its
105	internal header.
106	For example, a WiFi driver pretending to be an Ethernet driver should
107	set its hard_header_len to be the Ethernet header length, and set its
108	needed_headroom to be (the real WiFi header length - the fake Ethernet
109	header length).
110	- packet socket receives packets with pulled ll header,
111	so that SOCK_RAW should push it back.
112
113	On receive:
114	-----------
115
116	Incoming, dev_has_header(dev) == true
117	mac_header -> ll header
118	data -> data
119
120	Outgoing, dev_has_header(dev) == true
121	mac_header -> ll header
122	data -> ll header
123
124	Incoming, dev_has_header(dev) == false
125	mac_header -> data
126	However drivers often make it point to the ll header.
127	This is incorrect because the ll header should be invisible to us.
128	data -> data
129
130	Outgoing, dev_has_header(dev) == false
131	mac_header -> data. ll header is invisible to us.
132	data -> data
133
134	Resume
135	If dev_has_header(dev) == false we are unable to restore the ll header,
136	because it is invisible to us.
137
138
139	On transmit:
140	------------
141
142	dev_has_header(dev) == true
143	mac_header -> ll header
144	data -> ll header
145
146	dev_has_header(dev) == false (ll header is invisible to us)
147	mac_header -> data
148	data -> data
149
150	We should set network_header on output to the correct position,
151	packet classifier depends on it.
152	*/
153
154	/ Private packet socket structures. /
155
156	/ identical to struct packet_mreq except it has*
157	* a longer address field.
158	*/
159	struct packet_mreq_max {
160	int mr_ifindex;
161	unsigned short mr_type;
162	unsigned short mr_alen;
163	unsigned char mr_address[MAX_ADDR_LEN];
164	};
165
166	union tpacket_uhdr {
167	struct tpacket_hdr *h1;
168	struct tpacket2_hdr *h2;
169	struct tpacket3_hdr *h3;
170	void *raw;
171	};
172
173	static int packet_set_ring(struct sock sk, union* tpacket_req_u *req_u,
174	int closing, int tx_ring);
175
176	#define V3_ALIGNMENT (8)
177
178	#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
179
180	#define BLK_PLUS_PRIV(sz_of_priv) \
181	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
182
183	#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
184	#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
185	#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
186	#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
187	#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
188	#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
189
190	struct packet_sock;
191	static int tpacket_rcv(struct sk_buff skb, struct* net_device *dev,
192	struct packet_type pt, struct* net_device *orig_dev);
193
194	static void packet_previous_frame(struct* packet_sock *po,
195	struct packet_ring_buffer *rb,
196	int status);
197	static void packet_increment_head(struct packet_ring_buffer *buff);
198	static int prb_curr_blk_in_use(struct tpacket_block_desc *);
199	static void prb_dispatch_next_block(struct* tpacket_kbdq_core *,
200	struct packet_sock *);
201	static void prb_retire_current_block(struct tpacket_kbdq_core *,
202	struct packet_sock , unsigned* int status);
203	static int prb_queue_frozen(struct tpacket_kbdq_core *);
204	static void prb_open_block(struct tpacket_kbdq_core *,
205	struct tpacket_block_desc *);
206	static void prb_retire_rx_blk_timer_expired(struct timer_list *);
207	static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
208	static void prb_fill_rxhash(struct tpacket_kbdq_core , struct* tpacket3_hdr *);
209	static void prb_clear_rxhash(struct tpacket_kbdq_core *,
210	struct tpacket3_hdr *);
211	static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
212	struct tpacket3_hdr *);
213	static void packet_flush_mclist(struct sock *sk);
214	static u16 packet_pick_tx_queue(struct sk_buff *skb);
215
216	struct packet_skb_cb {
217	union {
218	struct sockaddr_pkt pkt;
219	union {
220	/ Trick: alias skb original length with*
221	* ll.sll_family and ll.protocol in order
222	* to save room.
223	*/
224	unsigned int origlen;
225	struct sockaddr_ll ll;
226	};
227	} sa;
228	};
229
230	#define vio_le() virtio_legacy_is_little_endian()
231
232	#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
233
234	#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
235	#define GET_PBLOCK_DESC(x, bid) \
236	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
237	#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
238	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
239	#define GET_NEXT_PRB_BLK_NUM(x) \
240	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
241	((x)->kactive_blk_num+1) : 0)
242
243	static void __fanout_unlink(struct sock sk, struct* packet_sock *po);
244	static void __fanout_link(struct sock sk, struct* packet_sock *po);
245
246	#ifdef CONFIG_NETFILTER_EGRESS
247	static noinline struct sk_buff nf_hook_direct_egress(struct* sk_buff *skb)
248	{
249	struct sk_buff next, head = NULL, *tail;
250	int rc;
251
252	rcu_read_lock();
253	for (; skb != NULL; skb = next) {
254	next = skb->next;
255	skb_mark_not_on_list(skb);
256
257	if (!nf_hook_egress(skb, rc: &rc, dev: skb->dev))
258	continue;
259
260	if (!head)
261	head = skb;
262	else
263	tail->next = skb;
264
265	tail = skb;
266	}
267	rcu_read_unlock();
268
269	return head;
270	}
271	#endif
272
273	static int packet_xmit(const struct packet_sock po, struct* sk_buff *skb)
274	{
275	if (!packet_sock_flag(po, flag: PACKET_SOCK_QDISC_BYPASS))
276	return dev_queue_xmit(skb);
277
278	#ifdef CONFIG_NETFILTER_EGRESS
279	if (nf_hook_egress_active()) {
280	skb = nf_hook_direct_egress(skb);
281	if (!skb)
282	return NET_XMIT_DROP;
283	}
284	#endif
285	return dev_direct_xmit(skb, queue_id: packet_pick_tx_queue(skb));
286	}
287
288	static struct net_device packet_cached_dev_get(struct* packet_sock *po)
289	{
290	struct net_device *dev;
291
292	rcu_read_lock();
293	dev = rcu_dereference(po->cached_dev);
294	dev_hold(dev);
295	rcu_read_unlock();
296
297	return dev;
298	}
299
300	static void packet_cached_dev_assign(struct packet_sock *po,
301	struct net_device *dev)
302	{
303	rcu_assign_pointer(po->cached_dev, dev);
304	}
305
306	static void packet_cached_dev_reset(struct packet_sock *po)
307	{
308	RCU_INIT_POINTER(po->cached_dev, NULL);
309	}
310
311	static u16 packet_pick_tx_queue(struct sk_buff *skb)
312	{
313	struct net_device *dev = skb->dev;
314	const struct net_device_ops *ops = dev->netdev_ops;
315	int cpu = raw_smp_processor_id();
316	u16 queue_index;
317
318	#ifdef CONFIG_XPS
319	skb->sender_cpu = cpu + `1`;
320	#endif
321	skb_record_rx_queue(skb, rx_queue: cpu % dev->real_num_tx_queues);
322	if (ops->ndo_select_queue) {
323	queue_index = ops->ndo_select_queue(dev, skb, NULL);
324	queue_index = netdev_cap_txqueue(dev, queue_index);
325	} else {
326	queue_index = netdev_pick_tx(dev, skb, NULL);
327	}
328
329	return queue_index;
330	}
331
332	/ __register_prot_hook must be invoked through register_prot_hook*
333	* or from a context in which asynchronous accesses to the packet
334	* socket is not possible (packet_create()).
335	*/
336	static void __register_prot_hook(struct sock *sk)
337	{
338	struct packet_sock *po = pkt_sk(sk);
339
340	if (!packet_sock_flag(po, flag: PACKET_SOCK_RUNNING)) {
341	if (po->fanout)
342	__fanout_link(sk, po);
343	else
344	dev_add_pack(pt: &po->prot_hook);
345
346	sock_hold(sk);
347	packet_sock_flag_set(po, flag: PACKET_SOCK_RUNNING, val: `1`);
348	}
349	}
350
351	static void register_prot_hook(struct sock *sk)
352	{
353	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
354	__register_prot_hook(sk);
355	}
356
357	/ If the sync parameter is true, we will temporarily drop*
358	* the po->bind_lock and do a synchronize_net to make sure no
359	* asynchronous packet processing paths still refer to the elements
360	* of po->prot_hook. If the sync parameter is false, it is the
361	* callers responsibility to take care of this.
362	*/
363	static void __unregister_prot_hook(struct sock *sk, bool sync)
364	{
365	struct packet_sock *po = pkt_sk(sk);
366
367	lockdep_assert_held_once(&po->bind_lock);
368
369	packet_sock_flag_set(po, flag: PACKET_SOCK_RUNNING, val: `0`);
370
371	if (po->fanout)
372	__fanout_unlink(sk, po);
373	else
374	__dev_remove_pack(pt: &po->prot_hook);
375
376	__sock_put(sk);
377
378	if (sync) {
379	spin_unlock(lock: &po->bind_lock);
380	synchronize_net();
381	spin_lock(lock: &po->bind_lock);
382	}
383	}
384
385	static void unregister_prot_hook(struct sock *sk, bool sync)
386	{
387	struct packet_sock *po = pkt_sk(sk);
388
389	if (packet_sock_flag(po, flag: PACKET_SOCK_RUNNING))
390	__unregister_prot_hook(sk, sync);
391	}
392
393	static inline struct page * __pure pgv_to_page(void *addr)
394	{
395	if (is_vmalloc_addr(x: addr))
396	return vmalloc_to_page(addr);
397	return virt_to_page(addr);
398	}
399
400	static void __packet_set_status(struct packet_sock po, void* frame, int* status)
401	{
402	union tpacket_uhdr h;
403
404	/ WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status /
405
406	h.raw = frame;
407	switch (po->tp_version) {
408	case TPACKET_V1:
409	WRITE_ONCE(h.h1->tp_status, status);
410	flush_dcache_page(page: pgv_to_page(addr: &h.h1->tp_status));
411	break;
412	case TPACKET_V2:
413	WRITE_ONCE(h.h2->tp_status, status);
414	flush_dcache_page(page: pgv_to_page(addr: &h.h2->tp_status));
415	break;
416	case TPACKET_V3:
417	WRITE_ONCE(h.h3->tp_status, status);
418	flush_dcache_page(page: pgv_to_page(addr: &h.h3->tp_status));
419	break;
420	default:
421	WARN(`1`, "TPACKET version not supported.\n");
422	BUG();
423	}
424
425	smp_wmb();
426	}
427
428	static int __packet_get_status(const struct packet_sock po, void* *frame)
429	{
430	union tpacket_uhdr h;
431
432	smp_rmb();
433
434	/ READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status /
435
436	h.raw = frame;
437	switch (po->tp_version) {
438	case TPACKET_V1:
439	flush_dcache_page(page: pgv_to_page(addr: &h.h1->tp_status));
440	return READ_ONCE(h.h1->tp_status);
441	case TPACKET_V2:
442	flush_dcache_page(page: pgv_to_page(addr: &h.h2->tp_status));
443	return READ_ONCE(h.h2->tp_status);
444	case TPACKET_V3:
445	flush_dcache_page(page: pgv_to_page(addr: &h.h3->tp_status));
446	return READ_ONCE(h.h3->tp_status);
447	default:
448	WARN(`1`, "TPACKET version not supported.\n");
449	BUG();
450	return `0`;
451	}
452	}
453
454	static __u32 tpacket_get_timestamp(struct sk_buff skb, struct* timespec64 *ts,
455	unsigned int flags)
456	{
457	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
458
459	if (shhwtstamps &&
460	(flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
461	ktime_to_timespec64_cond(kt: shhwtstamps->hwtstamp, ts))
462	return TP_STATUS_TS_RAW_HARDWARE;
463
464	if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
465	ktime_to_timespec64_cond(kt: skb_tstamp(skb), ts))
466	return TP_STATUS_TS_SOFTWARE;
467
468	return `0`;
469	}
470
471	static __u32 __packet_set_timestamp(struct packet_sock po, void* *frame,
472	struct sk_buff *skb)
473	{
474	union tpacket_uhdr h;
475	struct timespec64 ts;
476	__u32 ts_status;
477
478	if (!(ts_status = tpacket_get_timestamp(skb, ts: &ts, READ_ONCE(po->tp_tstamp))))
479	return `0`;
480
481	h.raw = frame;
482	/*
483	* versions 1 through 3 overflow the timestamps in y2106, since they
484	* all store the seconds in a 32-bit unsigned integer.
485	* If we create a version 4, that should have a 64-bit timestamp,
486	* either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
487	* nanoseconds.
488	*/
489	switch (po->tp_version) {
490	case TPACKET_V1:
491	h.h1->tp_sec = ts.tv_sec;
492	h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
493	break;
494	case TPACKET_V2:
495	h.h2->tp_sec = ts.tv_sec;
496	h.h2->tp_nsec = ts.tv_nsec;
497	break;
498	case TPACKET_V3:
499	h.h3->tp_sec = ts.tv_sec;
500	h.h3->tp_nsec = ts.tv_nsec;
501	break;
502	default:
503	WARN(`1`, "TPACKET version not supported.\n");
504	BUG();
505	}
506
507	/ one flush is safe, as both fields always lie on the same cacheline /
508	flush_dcache_page(page: pgv_to_page(addr: &h.h1->tp_sec));
509	smp_wmb();
510
511	return ts_status;
512	}
513
514	static void packet_lookup_frame(const* struct packet_sock *po,
515	const struct packet_ring_buffer *rb,
516	unsigned int position,
517	int status)
518	{
519	unsigned int pg_vec_pos, frame_offset;
520	union tpacket_uhdr h;
521
522	pg_vec_pos = position / rb->frames_per_block;
523	frame_offset = position % rb->frames_per_block;
524
525	h.raw = rb->pg_vec[pg_vec_pos].buffer +
526	(frame_offset * rb->frame_size);
527
528	if (status != __packet_get_status(po, frame: h.raw))
529	return NULL;
530
531	return h.raw;
532	}
533
534	static void packet_current_frame(struct* packet_sock *po,
535	struct packet_ring_buffer *rb,
536	int status)
537	{
538	return packet_lookup_frame(po, rb, position: rb->head, status);
539	}
540
541	static u16 vlan_get_tci(const struct sk_buff skb, struct* net_device *dev)
542	{
543	struct vlan_hdr vhdr, *vh;
544	unsigned int header_len;
545
546	if (!dev)
547	return `0`;
548
549	/ In the SOCK_DGRAM scenario, skb data starts at the network*
550	* protocol, which is after the VLAN headers. The outer VLAN
551	* header is at the hard_header_len offset in non-variable
552	* length link layer headers. If it's a VLAN device, the
553	* min_header_len should be used to exclude the VLAN header
554	* size.
555	*/
556	if (dev->min_header_len == dev->hard_header_len)
557	header_len = dev->hard_header_len;
558	else if (is_vlan_dev(dev))
559	header_len = dev->min_header_len;
560	else
561	return `0`;
562
563	vh = skb_header_pointer(skb, offset: skb_mac_offset(skb) + header_len,
564	len: sizeof(vhdr), buffer: &vhdr);
565	if (unlikely(!vh))
566	return `0`;
567
568	return ntohs(vh->h_vlan_TCI);
569	}
570
571	static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
572	{
573	__be16 proto = skb->protocol;
574
575	if (unlikely(eth_type_vlan(proto)))
576	proto = __vlan_get_protocol_offset(skb, type: proto,
577	mac_offset: skb_mac_offset(skb), NULL);
578
579	return proto;
580	}
581
582	static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
583	{
584	timer_delete_sync(timer: &pkc->retire_blk_timer);
585	}
586
587	static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
588	struct sk_buff_head *rb_queue)
589	{
590	struct tpacket_kbdq_core *pkc;
591
592	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
593
594	spin_lock_bh(lock: &rb_queue->lock);
595	pkc->delete_blk_timer = `1`;
596	spin_unlock_bh(lock: &rb_queue->lock);
597
598	prb_del_retire_blk_timer(pkc);
599	}
600
601	static void prb_setup_retire_blk_timer(struct packet_sock *po)
602	{
603	struct tpacket_kbdq_core *pkc;
604
605	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
606	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
607	`0`);
608	pkc->retire_blk_timer.expires = jiffies;
609	}
610
611	static int prb_calc_retire_blk_tmo(struct packet_sock *po,
612	int blk_size_in_bytes)
613	{
614	struct net_device *dev;
615	unsigned int mbits, div;
616	struct ethtool_link_ksettings ecmd;
617	int err;
618
619	rtnl_lock();
620	dev = __dev_get_by_index(net: sock_net(sk: &po->sk), ifindex: po->ifindex);
621	if (unlikely(!dev)) {
622	rtnl_unlock();
623	return DEFAULT_PRB_RETIRE_TOV;
624	}
625	err = __ethtool_get_link_ksettings(dev, link_ksettings: &ecmd);
626	rtnl_unlock();
627	if (err)
628	return DEFAULT_PRB_RETIRE_TOV;
629
630	/ If the link speed is so slow you don't really*
631	* need to worry about perf anyways
632	*/
633	if (ecmd.base.speed < SPEED_1000 \|\|
634	ecmd.base.speed == SPEED_UNKNOWN)
635	return DEFAULT_PRB_RETIRE_TOV;
636
637	div = ecmd.base.speed / `1000`;
638	mbits = (blk_size_in_bytes * `8`) / (`1024` * `1024`);
639
640	if (div)
641	mbits /= div;
642
643	if (div)
644	return mbits + `1`;
645	return mbits;
646	}
647
648	static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
649	union tpacket_req_u *req_u)
650	{
651	p1->feature_req_word = req_u->req3.tp_feature_req_word;
652	}
653
654	static void init_prb_bdqc(struct packet_sock *po,
655	struct packet_ring_buffer *rb,
656	struct pgv *pg_vec,
657	union tpacket_req_u *req_u)
658	{
659	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
660	struct tpacket_block_desc *pbd;
661
662	memset(p1, `0x0`, sizeof(*p1));
663
664	p1->knxt_seq_num = `1`;
665	p1->pkbdq = pg_vec;
666	pbd = (struct tpacket_block_desc *)pg_vec[`0`].buffer;
667	p1->pkblk_start = pg_vec[`0`].buffer;
668	p1->kblk_size = req_u->req3.tp_block_size;
669	p1->knum_blocks = req_u->req3.tp_block_nr;
670	p1->hdrlen = po->tp_hdrlen;
671	p1->version = po->tp_version;
672	p1->last_kactive_blk_num = `0`;
673	po->stats.stats3.tp_freeze_q_cnt = `0`;
674	if (req_u->req3.tp_retire_blk_tov)
675	p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
676	else
677	p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
678	blk_size_in_bytes: req_u->req3.tp_block_size);
679	p1->tov_in_jiffies = msecs_to_jiffies(m: p1->retire_blk_tov);
680	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
681	rwlock_init(&p1->blk_fill_in_prog_lock);
682
683	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
684	prb_init_ft_ops(p1, req_u);
685	prb_setup_retire_blk_timer(po);
686	prb_open_block(p1, pbd);
687	}
688
689	/ Do NOT update the last_blk_num first.*
690	* Assumes sk_buff_head lock is held.
691	*/
692	static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
693	{
694	mod_timer(timer: &pkc->retire_blk_timer,
695	expires: jiffies + pkc->tov_in_jiffies);
696	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
697	}
698
699	/*
700	* Timer logic:
701	* 1) We refresh the timer only when we open a block.
702	* By doing this we don't waste cycles refreshing the timer
703	* on packet-by-packet basis.
704	*
705	* With a 1MB block-size, on a 1Gbps line, it will take
706	* i) ~8 ms to fill a block + ii) memcpy etc.
707	* In this cut we are not accounting for the memcpy time.
708	*
709	* So, if the user sets the 'tmo' to 10ms then the timer
710	* will never fire while the block is still getting filled
711	* (which is what we want). However, the user could choose
712	* to close a block early and that's fine.
713	*
714	* But when the timer does fire, we check whether or not to refresh it.
715	* Since the tmo granularity is in msecs, it is not too expensive
716	* to refresh the timer, lets say every '8' msecs.
717	* Either the user can set the 'tmo' or we can derive it based on
718	* a) line-speed and b) block-size.
719	* prb_calc_retire_blk_tmo() calculates the tmo.
720	*
721	*/
722	static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
723	{
724	struct packet_sock *po =
725	timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
726	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
727	unsigned int frozen;
728	struct tpacket_block_desc *pbd;
729
730	spin_lock(lock: &po->sk.sk_receive_queue.lock);
731
732	frozen = prb_queue_frozen(pkc);
733	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
734
735	if (unlikely(pkc->delete_blk_timer))
736	goto out;
737
738	/ We only need to plug the race when the block is partially filled.*
739	* tpacket_rcv:
740	* lock(); increment BLOCK_NUM_PKTS; unlock()
741	* copy_bits() is in progress ...
742	* timer fires on other cpu:
743	* we can't retire the current block because copy_bits
744	* is in progress.
745	*
746	*/
747	if (BLOCK_NUM_PKTS(pbd)) {
748	/ Waiting for skb_copy_bits to finish... /
749	write_lock(&pkc->blk_fill_in_prog_lock);
750	write_unlock(&pkc->blk_fill_in_prog_lock);
751	}
752
753	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
754	if (!frozen) {
755	if (!BLOCK_NUM_PKTS(pbd)) {
756	/ An empty block. Just refresh the timer. /
757	goto refresh_timer;
758	}
759	prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
760	if (!prb_dispatch_next_block(pkc, po))
761	goto refresh_timer;
762	else
763	goto out;
764	} else {
765	/ Case 1. Queue was frozen because user-space was*
766	* lagging behind.
767	*/
768	if (prb_curr_blk_in_use(pbd)) {
769	/*
770	* Ok, user-space is still behind.
771	* So just refresh the timer.
772	*/
773	goto refresh_timer;
774	} else {
775	/ Case 2. queue was frozen,user-space caught up,*
776	* now the link went idle && the timer fired.
777	* We don't have a block to close.So we open this
778	* block and restart the timer.
779	* opening a block thaws the queue,restarts timer
780	* Thawing/timer-refresh is a side effect.
781	*/
782	prb_open_block(pkc, pbd);
783	goto out;
784	}
785	}
786	}
787
788	refresh_timer:
789	_prb_refresh_rx_retire_blk_timer(pkc);
790
791	out:
792	spin_unlock(lock: &po->sk.sk_receive_queue.lock);
793	}
794
795	static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
796	struct tpacket_block_desc *pbd1, __u32 status)
797	{
798	/ Flush everything minus the block header /
799
800	#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
801	u8 start, end;
802
803	start = (u8 *)pbd1;
804
805	/ Skip the block header(we know header WILL fit in 4K) /
806	start += PAGE_SIZE;
807
808	end = (u8 )PAGE_ALIGN((unsigned* long)pkc1->pkblk_end);
809	for (; start < end; start += PAGE_SIZE)
810	flush_dcache_page(pgv_to_page(start));
811
812	smp_wmb();
813	#endif
814
815	/ Now update the block status. /
816
817	BLOCK_STATUS(pbd1) = status;
818
819	/ Flush the block header /
820
821	#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
822	start = (u8 *)pbd1;
823	flush_dcache_page(pgv_to_page(start));
824
825	smp_wmb();
826	#endif
827	}
828
829	/*
830	* Side effect:
831	*
832	* 1) flush the block
833	* 2) Increment active_blk_num
834	*
835	* Note:We DONT refresh the timer on purpose.
836	* Because almost always the next block will be opened.
837	*/
838	static void prb_close_block(struct tpacket_kbdq_core *pkc1,
839	struct tpacket_block_desc *pbd1,
840	struct packet_sock po, unsigned* int stat)
841	{
842	__u32 status = TP_STATUS_USER \| stat;
843
844	struct tpacket3_hdr *last_pkt;
845	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
846	struct sock *sk = &po->sk;
847
848	if (atomic_read(v: &po->tp_drops))
849	status \|= TP_STATUS_LOSING;
850
851	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
852	last_pkt->tp_next_offset = `0`;
853
854	/ Get the ts of the last pkt /
855	if (BLOCK_NUM_PKTS(pbd1)) {
856	h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
857	h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
858	} else {
859	/ Ok, we tmo'd - so get the current time.*
860	*
861	* It shouldn't really happen as we don't close empty
862	* blocks. See prb_retire_rx_blk_timer_expired().
863	*/
864	struct timespec64 ts;
865	ktime_get_real_ts64(tv: &ts);
866	h1->ts_last_pkt.ts_sec = ts.tv_sec;
867	h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
868	}
869
870	smp_wmb();
871
872	/ Flush the block /
873	prb_flush_block(pkc1, pbd1, status);
874
875	sk->sk_data_ready(sk);
876
877	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
878	}
879
880	static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
881	{
882	pkc->reset_pending_on_curr_blk = `0`;
883	}
884
885	/*
886	* Side effect of opening a block:
887	*
888	* 1) prb_queue is thawed.
889	* 2) retire_blk_timer is refreshed.
890	*
891	*/
892	static void prb_open_block(struct tpacket_kbdq_core *pkc1,
893	struct tpacket_block_desc *pbd1)
894	{
895	struct timespec64 ts;
896	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
897
898	smp_rmb();
899
900	/ We could have just memset this but we will lose the*
901	* flexibility of making the priv area sticky
902	*/
903
904	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
905	BLOCK_NUM_PKTS(pbd1) = `0`;
906	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
907
908	ktime_get_real_ts64(tv: &ts);
909
910	h1->ts_first_pkt.ts_sec = ts.tv_sec;
911	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
912
913	pkc1->pkblk_start = (char *)pbd1;
914	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
915
916	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
917	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
918
919	pbd1->version = pkc1->version;
920	pkc1->prev = pkc1->nxt_offset;
921	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
922
923	prb_thaw_queue(pkc: pkc1);
924	_prb_refresh_rx_retire_blk_timer(pkc: pkc1);
925
926	smp_wmb();
927	}
928
929	/*
930	* Queue freeze logic:
931	* 1) Assume tp_block_nr = 8 blocks.
932	* 2) At time 't0', user opens Rx ring.
933	* 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
934	* 4) user-space is either sleeping or processing block '0'.
935	* 5) tpacket_rcv is currently filling block '7', since there is no space left,
936	* it will close block-7,loop around and try to fill block '0'.
937	* call-flow:
938	* __packet_lookup_frame_in_block
939	* prb_retire_current_block()
940	* prb_dispatch_next_block()
941	* \|->(BLOCK_STATUS == USER) evaluates to true
942	* 5.1) Since block-0 is currently in-use, we just freeze the queue.
943	* 6) Now there are two cases:
944	* 6.1) Link goes idle right after the queue is frozen.
945	* But remember, the last open_block() refreshed the timer.
946	* When this timer expires,it will refresh itself so that we can
947	* re-open block-0 in near future.
948	* 6.2) Link is busy and keeps on receiving packets. This is a simple
949	* case and __packet_lookup_frame_in_block will check if block-0
950	* is free and can now be re-used.
951	*/
952	static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
953	struct packet_sock *po)
954	{
955	pkc->reset_pending_on_curr_blk = `1`;
956	po->stats.stats3.tp_freeze_q_cnt++;
957	}
958
959	#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
960
961	/*
962	* If the next block is free then we will dispatch it
963	* and return a good offset.
964	* Else, we will freeze the queue.
965	* So, caller must check the return value.
966	*/
967	static void prb_dispatch_next_block(struct* tpacket_kbdq_core *pkc,
968	struct packet_sock *po)
969	{
970	struct tpacket_block_desc *pbd;
971
972	smp_rmb();
973
974	/ 1. Get current block num /
975	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
976
977	/ 2. If this block is currently in_use then freeze the queue /
978	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
979	prb_freeze_queue(pkc, po);
980	return NULL;
981	}
982
983	/*
984	* 3.
985	* open this block and return the offset where the first packet
986	* needs to get stored.
987	*/
988	prb_open_block(pkc1: pkc, pbd1: pbd);
989	return (void *)pkc->nxt_offset;
990	}
991
992	static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
993	struct packet_sock po, unsigned* int status)
994	{
995	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
996
997	/ retire/close the current block /
998	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
999	/*
1000	* Plug the case where copy_bits() is in progress on
1001	* cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
1002	* have space to copy the pkt in the current block and
1003	* called prb_retire_current_block()
1004	*
1005	* We don't need to worry about the TMO case because
1006	* the timer-handler already handled this case.
1007	*/
1008	if (!(status & TP_STATUS_BLK_TMO)) {
1009	/ Waiting for skb_copy_bits to finish... /
1010	write_lock(&pkc->blk_fill_in_prog_lock);
1011	write_unlock(&pkc->blk_fill_in_prog_lock);
1012	}
1013	prb_close_block(pkc1: pkc, pbd1: pbd, po, stat: status);
1014	return;
1015	}
1016	}
1017
1018	static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
1019	{
1020	return TP_STATUS_USER & BLOCK_STATUS(pbd);
1021	}
1022
1023	static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
1024	{
1025	return pkc->reset_pending_on_curr_blk;
1026	}
1027
1028	static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
1029	__releases(&pkc->blk_fill_in_prog_lock)
1030	{
1031	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1032
1033	read_unlock(&pkc->blk_fill_in_prog_lock);
1034	}
1035
1036	static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
1037	struct tpacket3_hdr *ppd)
1038	{
1039	ppd->hv1.tp_rxhash = skb_get_hash(skb: pkc->skb);
1040	}
1041
1042	static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
1043	struct tpacket3_hdr *ppd)
1044	{
1045	ppd->hv1.tp_rxhash = `0`;
1046	}
1047
1048	static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
1049	struct tpacket3_hdr *ppd)
1050	{
1051	struct packet_sock po = container_of(pkc, struct* packet_sock, rx_ring.prb_bdqc);
1052
1053	if (skb_vlan_tag_present(pkc->skb)) {
1054	ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
1055	ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1056	ppd->tp_status = TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
1057	} else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) {
1058	ppd->hv1.tp_vlan_tci = vlan_get_tci(skb: pkc->skb, dev: pkc->skb->dev);
1059	ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol);
1060	ppd->tp_status = TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
1061	} else {
1062	ppd->hv1.tp_vlan_tci = `0`;
1063	ppd->hv1.tp_vlan_tpid = `0`;
1064	ppd->tp_status = TP_STATUS_AVAILABLE;
1065	}
1066	}
1067
1068	static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
1069	struct tpacket3_hdr *ppd)
1070	{
1071	ppd->hv1.tp_padding = `0`;
1072	prb_fill_vlan_info(pkc, ppd);
1073
1074	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1075	prb_fill_rxhash(pkc, ppd);
1076	else
1077	prb_clear_rxhash(pkc, ppd);
1078	}
1079
1080	static void prb_fill_curr_block(char *curr,
1081	struct tpacket_kbdq_core *pkc,
1082	struct tpacket_block_desc *pbd,
1083	unsigned int len)
1084	__acquires(&pkc->blk_fill_in_prog_lock)
1085	{
1086	struct tpacket3_hdr *ppd;
1087
1088	ppd = (struct tpacket3_hdr *)curr;
1089	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1090	pkc->prev = curr;
1091	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1092	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1093	BLOCK_NUM_PKTS(pbd) += `1`;
1094	read_lock(&pkc->blk_fill_in_prog_lock);
1095	prb_run_all_ft_ops(pkc, ppd);
1096	}
1097
1098	/ Assumes caller has the sk->rx_queue.lock /
1099	static void __packet_lookup_frame_in_block(struct* packet_sock *po,
1100	struct sk_buff *skb,
1101	unsigned int len
1102	)
1103	{
1104	struct tpacket_kbdq_core *pkc;
1105	struct tpacket_block_desc *pbd;
1106	char curr, end;
1107
1108	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1109	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1110
1111	/ Queue is frozen when user space is lagging behind /
1112	if (prb_queue_frozen(pkc)) {
1113	/*
1114	* Check if that last block which caused the queue to freeze,
1115	* is still in_use by user-space.
1116	*/
1117	if (prb_curr_blk_in_use(pbd)) {
1118	/ Can't record this packet /
1119	return NULL;
1120	} else {
1121	/*
1122	* Ok, the block was released by user-space.
1123	* Now let's open that block.
1124	* opening a block also thaws the queue.
1125	* Thawing is a side effect.
1126	*/
1127	prb_open_block(pkc1: pkc, pbd1: pbd);
1128	}
1129	}
1130
1131	smp_mb();
1132	curr = pkc->nxt_offset;
1133	pkc->skb = skb;
1134	end = (char *)pbd + pkc->kblk_size;
1135
1136	/ first try the current block /
1137	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1138	prb_fill_curr_block(curr, pkc, pbd, len);
1139	return (void *)curr;
1140	}
1141
1142	/ Ok, close the current block /
1143	prb_retire_current_block(pkc, po, status: `0`);
1144
1145	/ Now, try to dispatch the next block /
1146	curr = (char *)prb_dispatch_next_block(pkc, po);
1147	if (curr) {
1148	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1149	prb_fill_curr_block(curr, pkc, pbd, len);
1150	return (void *)curr;
1151	}
1152
1153	/*
1154	* No free blocks are available.user_space hasn't caught up yet.
1155	* Queue was just frozen and now this packet will get dropped.
1156	*/
1157	return NULL;
1158	}
1159
1160	static void packet_current_rx_frame(struct* packet_sock *po,
1161	struct sk_buff *skb,
1162	int status, unsigned int len)
1163	{
1164	char *curr = NULL;
1165	switch (po->tp_version) {
1166	case TPACKET_V1:
1167	case TPACKET_V2:
1168	curr = packet_lookup_frame(po, rb: &po->rx_ring,
1169	position: po->rx_ring.head, status);
1170	return curr;
1171	case TPACKET_V3:
1172	return __packet_lookup_frame_in_block(po, skb, len);
1173	default:
1174	WARN(`1`, "TPACKET version not supported\n");
1175	BUG();
1176	return NULL;
1177	}
1178	}
1179
1180	static void prb_lookup_block(const* struct packet_sock *po,
1181	const struct packet_ring_buffer *rb,
1182	unsigned int idx,
1183	int status)
1184	{
1185	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1186	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1187
1188	if (status != BLOCK_STATUS(pbd))
1189	return NULL;
1190	return pbd;
1191	}
1192
1193	static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1194	{
1195	unsigned int prev;
1196	if (rb->prb_bdqc.kactive_blk_num)
1197	prev = rb->prb_bdqc.kactive_blk_num-`1`;
1198	else
1199	prev = rb->prb_bdqc.knum_blocks-`1`;
1200	return prev;
1201	}
1202
1203	/ Assumes caller has held the rx_queue.lock /
1204	static void __prb_previous_block(struct* packet_sock *po,
1205	struct packet_ring_buffer *rb,
1206	int status)
1207	{
1208	unsigned int previous = prb_previous_blk_num(rb);
1209	return prb_lookup_block(po, rb, idx: previous, status);
1210	}
1211
1212	static void packet_previous_rx_frame(struct* packet_sock *po,
1213	struct packet_ring_buffer *rb,
1214	int status)
1215	{
1216	if (po->tp_version <= TPACKET_V2)
1217	return packet_previous_frame(po, rb, status);
1218
1219	return __prb_previous_block(po, rb, status);
1220	}
1221
1222	static void packet_increment_rx_head(struct packet_sock *po,
1223	struct packet_ring_buffer *rb)
1224	{
1225	switch (po->tp_version) {
1226	case TPACKET_V1:
1227	case TPACKET_V2:
1228	return packet_increment_head(buff: rb);
1229	case TPACKET_V3:
1230	default:
1231	WARN(`1`, "TPACKET version not supported.\n");
1232	BUG();
1233	return;
1234	}
1235	}
1236
1237	static void packet_previous_frame(struct* packet_sock *po,
1238	struct packet_ring_buffer *rb,
1239	int status)
1240	{
1241	unsigned int previous = rb->head ? rb->head - `1` : rb->frame_max;
1242	return packet_lookup_frame(po, rb, position: previous, status);
1243	}
1244
1245	static void packet_increment_head(struct packet_ring_buffer *buff)
1246	{
1247	buff->head = buff->head != buff->frame_max ? buff->head+`1` : `0`;
1248	}
1249
1250	static void packet_inc_pending(struct packet_ring_buffer *rb)
1251	{
1252	this_cpu_inc(*rb->pending_refcnt);
1253	}
1254
1255	static void packet_dec_pending(struct packet_ring_buffer *rb)
1256	{
1257	this_cpu_dec(*rb->pending_refcnt);
1258	}
1259
1260	static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1261	{
1262	unsigned int refcnt = `0`;
1263	int cpu;
1264
1265	/ We don't use pending refcount in rx_ring. /
1266	if (rb->pending_refcnt == NULL)
1267	return `0`;
1268
1269	for_each_possible_cpu(cpu)
1270	refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1271
1272	return refcnt;
1273	}
1274
1275	static int packet_alloc_pending(struct packet_sock *po)
1276	{
1277	po->rx_ring.pending_refcnt = NULL;
1278
1279	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1280	if (unlikely(po->tx_ring.pending_refcnt == NULL))
1281	return -ENOBUFS;
1282
1283	return `0`;
1284	}
1285
1286	static void packet_free_pending(struct packet_sock *po)
1287	{
1288	free_percpu(pdata: po->tx_ring.pending_refcnt);
1289	}
1290
1291	#define ROOM_POW_OFF 2
1292	#define ROOM_NONE 0x0
1293	#define ROOM_LOW 0x1
1294	#define ROOM_NORMAL 0x2
1295
1296	static bool __tpacket_has_room(const struct packet_sock po, int* pow_off)
1297	{
1298	int idx, len;
1299
1300	len = READ_ONCE(po->rx_ring.frame_max) + `1`;
1301	idx = READ_ONCE(po->rx_ring.head);
1302	if (pow_off)
1303	idx += len >> pow_off;
1304	if (idx >= len)
1305	idx -= len;
1306	return packet_lookup_frame(po, rb: &po->rx_ring, position: idx, TP_STATUS_KERNEL);
1307	}
1308
1309	static bool __tpacket_v3_has_room(const struct packet_sock po, int* pow_off)
1310	{
1311	int idx, len;
1312
1313	len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1314	idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
1315	if (pow_off)
1316	idx += len >> pow_off;
1317	if (idx >= len)
1318	idx -= len;
1319	return prb_lookup_block(po, rb: &po->rx_ring, idx, TP_STATUS_KERNEL);
1320	}
1321
1322	static int __packet_rcv_has_room(const struct packet_sock *po,
1323	const struct sk_buff *skb)
1324	{
1325	const struct sock *sk = &po->sk;
1326	int ret = ROOM_NONE;
1327
1328	if (po->prot_hook.func != tpacket_rcv) {
1329	int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1330	int avail = rcvbuf - atomic_read(v: &sk->sk_rmem_alloc)
1331	- (skb ? skb->truesize : `0`);
1332
1333	if (avail > (rcvbuf >> ROOM_POW_OFF))
1334	return ROOM_NORMAL;
1335	else if (avail > `0`)
1336	return ROOM_LOW;
1337	else
1338	return ROOM_NONE;
1339	}
1340
1341	if (po->tp_version == TPACKET_V3) {
1342	if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1343	ret = ROOM_NORMAL;
1344	else if (__tpacket_v3_has_room(po, pow_off: `0`))
1345	ret = ROOM_LOW;
1346	} else {
1347	if (__tpacket_has_room(po, ROOM_POW_OFF))
1348	ret = ROOM_NORMAL;
1349	else if (__tpacket_has_room(po, pow_off: `0`))
1350	ret = ROOM_LOW;
1351	}
1352
1353	return ret;
1354	}
1355
1356	static int packet_rcv_has_room(struct packet_sock po, struct* sk_buff *skb)
1357	{
1358	bool pressure;
1359	int ret;
1360
1361	ret = __packet_rcv_has_room(po, skb);
1362	pressure = ret != ROOM_NORMAL;
1363
1364	if (packet_sock_flag(po, flag: PACKET_SOCK_PRESSURE) != pressure)
1365	packet_sock_flag_set(po, flag: PACKET_SOCK_PRESSURE, val: pressure);
1366
1367	return ret;
1368	}
1369
1370	static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1371	{
1372	if (packet_sock_flag(po, flag: PACKET_SOCK_PRESSURE) &&
1373	__packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1374	packet_sock_flag_set(po, flag: PACKET_SOCK_PRESSURE, val: false);
1375	}
1376
1377	static void packet_sock_destruct(struct sock *sk)
1378	{
1379	skb_queue_purge(list: &sk->sk_error_queue);
1380
1381	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1382	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1383
1384	if (!sock_flag(sk, flag: SOCK_DEAD)) {
1385	pr_err("Attempt to release alive packet socket: %p\n", sk);
1386	return;
1387	}
1388	}
1389
1390	static bool fanout_flow_is_huge(struct packet_sock po, struct* sk_buff *skb)
1391	{
1392	u32 *history = po->rollover->history;
1393	u32 victim, rxhash;
1394	int i, count = `0`;
1395
1396	rxhash = skb_get_hash(skb);
1397	for (i = `0`; i < ROLLOVER_HLEN; i++)
1398	if (READ_ONCE(history[i]) == rxhash)
1399	count++;
1400
1401	victim = get_random_u32_below(ROLLOVER_HLEN);
1402
1403	/ Avoid dirtying the cache line if possible /
1404	if (READ_ONCE(history[victim]) != rxhash)
1405	WRITE_ONCE(history[victim], rxhash);
1406
1407	return count > (ROLLOVER_HLEN >> `1`);
1408	}
1409
1410	static unsigned int fanout_demux_hash(struct packet_fanout *f,
1411	struct sk_buff *skb,
1412	unsigned int num)
1413	{
1414	return reciprocal_scale(val: __skb_get_hash_symmetric(skb), ep_ro: num);
1415	}
1416
1417	static unsigned int fanout_demux_lb(struct packet_fanout *f,
1418	struct sk_buff *skb,
1419	unsigned int num)
1420	{
1421	unsigned int val = atomic_inc_return(v: &f->rr_cur);
1422
1423	return val % num;
1424	}
1425
1426	static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1427	struct sk_buff *skb,
1428	unsigned int num)
1429	{
1430	return smp_processor_id() % num;
1431	}
1432
1433	static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1434	struct sk_buff *skb,
1435	unsigned int num)
1436	{
1437	return get_random_u32_below(ceil: num);
1438	}
1439
1440	static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1441	struct sk_buff *skb,
1442	unsigned int idx, bool try_self,
1443	unsigned int num)
1444	{
1445	struct packet_sock po, po_next, *po_skip = NULL;
1446	unsigned int i, j, room = ROOM_NONE;
1447
1448	po = pkt_sk(rcu_dereference(f->arr[idx]));
1449
1450	if (try_self) {
1451	room = packet_rcv_has_room(po, skb);
1452	if (room == ROOM_NORMAL \|\|
1453	(room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1454	return idx;
1455	po_skip = po;
1456	}
1457
1458	i = j = min_t(int, po->rollover->sock, num - `1`);
1459	do {
1460	po_next = pkt_sk(rcu_dereference(f->arr[i]));
1461	if (po_next != po_skip &&
1462	!packet_sock_flag(po: po_next, flag: PACKET_SOCK_PRESSURE) &&
1463	packet_rcv_has_room(po: po_next, skb) == ROOM_NORMAL) {
1464	if (i != j)
1465	po->rollover->sock = i;
1466	atomic_long_inc(v: &po->rollover->num);
1467	if (room == ROOM_LOW)
1468	atomic_long_inc(v: &po->rollover->num_huge);
1469	return i;
1470	}
1471
1472	if (++i == num)
1473	i = `0`;
1474	} while (i != j);
1475
1476	atomic_long_inc(v: &po->rollover->num_failed);
1477	return idx;
1478	}
1479
1480	static unsigned int fanout_demux_qm(struct packet_fanout *f,
1481	struct sk_buff *skb,
1482	unsigned int num)
1483	{
1484	return skb_get_queue_mapping(skb) % num;
1485	}
1486
1487	static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1488	struct sk_buff *skb,
1489	unsigned int num)
1490	{
1491	struct bpf_prog *prog;
1492	unsigned int ret = `0`;
1493
1494	rcu_read_lock();
1495	prog = rcu_dereference(f->bpf_prog);
1496	if (prog)
1497	ret = bpf_prog_run_clear_cb(prog, skb) % num;
1498	rcu_read_unlock();
1499
1500	return ret;
1501	}
1502
1503	static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1504	{
1505	return f->flags & (flag >> `8`);
1506	}
1507
1508	static int packet_rcv_fanout(struct sk_buff skb, struct* net_device *dev,
1509	struct packet_type pt, struct* net_device *orig_dev)
1510	{
1511	struct packet_fanout *f = pt->af_packet_priv;
1512	unsigned int num = READ_ONCE(f->num_members);
1513	struct net *net = read_pnet(pnet: &f->net);
1514	struct packet_sock *po;
1515	unsigned int idx;
1516
1517	if (!net_eq(net1: dev_net(dev), net2: net) \|\| !num) {
1518	kfree_skb(skb);
1519	return `0`;
1520	}
1521
1522	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1523	skb = ip_check_defrag(net, skb, user: IP_DEFRAG_AF_PACKET);
1524	if (!skb)
1525	return `0`;
1526	}
1527	switch (f->type) {
1528	case PACKET_FANOUT_HASH:
1529	default:
1530	idx = fanout_demux_hash(f, skb, num);
1531	break;
1532	case PACKET_FANOUT_LB:
1533	idx = fanout_demux_lb(f, skb, num);
1534	break;
1535	case PACKET_FANOUT_CPU:
1536	idx = fanout_demux_cpu(f, skb, num);
1537	break;
1538	case PACKET_FANOUT_RND:
1539	idx = fanout_demux_rnd(f, skb, num);
1540	break;
1541	case PACKET_FANOUT_QM:
1542	idx = fanout_demux_qm(f, skb, num);
1543	break;
1544	case PACKET_FANOUT_ROLLOVER:
1545	idx = fanout_demux_rollover(f, skb, idx: `0`, try_self: false, num);
1546	break;
1547	case PACKET_FANOUT_CBPF:
1548	case PACKET_FANOUT_EBPF:
1549	idx = fanout_demux_bpf(f, skb, num);
1550	break;
1551	}
1552
1553	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1554	idx = fanout_demux_rollover(f, skb, idx, try_self: true, num);
1555
1556	po = pkt_sk(rcu_dereference(f->arr[idx]));
1557	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1558	}
1559
1560	DEFINE_MUTEX(fanout_mutex);
1561	EXPORT_SYMBOL_GPL(fanout_mutex);
1562	static LIST_HEAD(fanout_list);
1563	static u16 fanout_next_id;
1564
1565	static void __fanout_link(struct sock sk, struct* packet_sock *po)
1566	{
1567	struct packet_fanout *f = po->fanout;
1568
1569	spin_lock(lock: &f->lock);
1570	rcu_assign_pointer(f->arr[f->num_members], sk);
1571	smp_wmb();
1572	f->num_members++;
1573	if (f->num_members == `1`)
1574	dev_add_pack(pt: &f->prot_hook);
1575	spin_unlock(lock: &f->lock);
1576	}
1577
1578	static void __fanout_unlink(struct sock sk, struct* packet_sock *po)
1579	{
1580	struct packet_fanout *f = po->fanout;
1581	int i;
1582
1583	spin_lock(lock: &f->lock);
1584	for (i = `0`; i < f->num_members; i++) {
1585	if (rcu_dereference_protected(f->arr[i],
1586	lockdep_is_held(&f->lock)) == sk)
1587	break;
1588	}
1589	BUG_ON(i >= f->num_members);
1590	rcu_assign_pointer(f->arr[i],
1591	rcu_dereference_protected(f->arr[f->num_members - `1`],
1592	lockdep_is_held(&f->lock)));
1593	f->num_members--;
1594	if (f->num_members == `0`)
1595	__dev_remove_pack(pt: &f->prot_hook);
1596	spin_unlock(lock: &f->lock);
1597	}
1598
1599	static bool match_fanout_group(struct packet_type ptype, struct* sock *sk)
1600	{
1601	if (sk->sk_family != PF_PACKET)
1602	return false;
1603
1604	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1605	}
1606
1607	static void fanout_init_data(struct packet_fanout *f)
1608	{
1609	switch (f->type) {
1610	case PACKET_FANOUT_LB:
1611	atomic_set(v: &f->rr_cur, i: `0`);
1612	break;
1613	case PACKET_FANOUT_CBPF:
1614	case PACKET_FANOUT_EBPF:
1615	RCU_INIT_POINTER(f->bpf_prog, NULL);
1616	break;
1617	}
1618	}
1619
1620	static void __fanout_set_data_bpf(struct packet_fanout f, struct* bpf_prog *new)
1621	{
1622	struct bpf_prog *old;
1623
1624	spin_lock(lock: &f->lock);
1625	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1626	rcu_assign_pointer(f->bpf_prog, new);
1627	spin_unlock(lock: &f->lock);
1628
1629	if (old) {
1630	synchronize_net();
1631	bpf_prog_destroy(fp: old);
1632	}
1633	}
1634
1635	static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
1636	unsigned int len)
1637	{
1638	struct bpf_prog *new;
1639	struct sock_fprog fprog;
1640	int ret;
1641
1642	if (sock_flag(sk: &po->sk, flag: SOCK_FILTER_LOCKED))
1643	return -EPERM;
1644
1645	ret = copy_bpf_fprog_from_user(dst: &fprog, src: data, len);
1646	if (ret)
1647	return ret;
1648
1649	ret = bpf_prog_create_from_user(pfp: &new, fprog: &fprog, NULL, save_orig: false);
1650	if (ret)
1651	return ret;
1652
1653	__fanout_set_data_bpf(f: po->fanout, new);
1654	return `0`;
1655	}
1656
1657	static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
1658	unsigned int len)
1659	{
1660	struct bpf_prog *new;
1661	u32 fd;
1662
1663	if (sock_flag(sk: &po->sk, flag: SOCK_FILTER_LOCKED))
1664	return -EPERM;
1665	if (len != sizeof(fd))
1666	return -EINVAL;
1667	if (copy_from_sockptr(dst: &fd, src: data, size: len))
1668	return -EFAULT;
1669
1670	new = bpf_prog_get_type(ufd: fd, type: BPF_PROG_TYPE_SOCKET_FILTER);
1671	if (IS_ERR(ptr: new))
1672	return PTR_ERR(ptr: new);
1673
1674	__fanout_set_data_bpf(f: po->fanout, new);
1675	return `0`;
1676	}
1677
1678	static int fanout_set_data(struct packet_sock *po, sockptr_t data,
1679	unsigned int len)
1680	{
1681	switch (po->fanout->type) {
1682	case PACKET_FANOUT_CBPF:
1683	return fanout_set_data_cbpf(po, data, len);
1684	case PACKET_FANOUT_EBPF:
1685	return fanout_set_data_ebpf(po, data, len);
1686	default:
1687	return -EINVAL;
1688	}
1689	}
1690
1691	static void fanout_release_data(struct packet_fanout *f)
1692	{
1693	switch (f->type) {
1694	case PACKET_FANOUT_CBPF:
1695	case PACKET_FANOUT_EBPF:
1696	__fanout_set_data_bpf(f, NULL);
1697	}
1698	}
1699
1700	static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1701	{
1702	struct packet_fanout *f;
1703
1704	list_for_each_entry(f, &fanout_list, list) {
1705	if (f->id == candidate_id &&
1706	read_pnet(pnet: &f->net) == sock_net(sk)) {
1707	return false;
1708	}
1709	}
1710	return true;
1711	}
1712
1713	static bool fanout_find_new_id(struct sock sk, u16 new_id)
1714	{
1715	u16 id = fanout_next_id;
1716
1717	do {
1718	if (__fanout_id_is_free(sk, candidate_id: id)) {
1719	*new_id = id;
1720	fanout_next_id = id + `1`;
1721	return true;
1722	}
1723
1724	id++;
1725	} while (id != fanout_next_id);
1726
1727	return false;
1728	}
1729
1730	static int fanout_add(struct sock sk, struct* fanout_args *args)
1731	{
1732	struct packet_rollover *rollover = NULL;
1733	struct packet_sock *po = pkt_sk(sk);
1734	u16 type_flags = args->type_flags;
1735	struct packet_fanout f, match;
1736	u8 type = type_flags & `0xff`;
1737	u8 flags = type_flags >> `8`;
1738	u16 id = args->id;
1739	int err;
1740
1741	switch (type) {
1742	case PACKET_FANOUT_ROLLOVER:
1743	if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1744	return -EINVAL;
1745	break;
1746	case PACKET_FANOUT_HASH:
1747	case PACKET_FANOUT_LB:
1748	case PACKET_FANOUT_CPU:
1749	case PACKET_FANOUT_RND:
1750	case PACKET_FANOUT_QM:
1751	case PACKET_FANOUT_CBPF:
1752	case PACKET_FANOUT_EBPF:
1753	break;
1754	default:
1755	return -EINVAL;
1756	}
1757
1758	mutex_lock(&fanout_mutex);
1759
1760	err = -EALREADY;
1761	if (po->fanout)
1762	goto out;
1763
1764	if (type == PACKET_FANOUT_ROLLOVER \|\|
1765	(type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1766	err = -ENOMEM;
1767	rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1768	if (!rollover)
1769	goto out;
1770	atomic_long_set(v: &rollover->num, i: `0`);
1771	atomic_long_set(v: &rollover->num_huge, i: `0`);
1772	atomic_long_set(v: &rollover->num_failed, i: `0`);
1773	}
1774
1775	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1776	if (id != `0`) {
1777	err = -EINVAL;
1778	goto out;
1779	}
1780	if (!fanout_find_new_id(sk, new_id: &id)) {
1781	err = -ENOMEM;
1782	goto out;
1783	}
1784	/ ephemeral flag for the first socket in the group: drop it /
1785	flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> `8`);
1786	}
1787
1788	match = NULL;
1789	list_for_each_entry(f, &fanout_list, list) {
1790	if (f->id == id &&
1791	read_pnet(pnet: &f->net) == sock_net(sk)) {
1792	match = f;
1793	break;
1794	}
1795	}
1796	err = -EINVAL;
1797	if (match) {
1798	if (match->flags != flags)
1799	goto out;
1800	if (args->max_num_members &&
1801	args->max_num_members != match->max_num_members)
1802	goto out;
1803	} else {
1804	if (args->max_num_members > PACKET_FANOUT_MAX)
1805	goto out;
1806	if (!args->max_num_members)
1807	/ legacy PACKET_FANOUT_MAX /
1808	args->max_num_members = `256`;
1809	err = -ENOMEM;
1810	match = kvzalloc(struct_size(match, arr, args->max_num_members),
1811	GFP_KERNEL);
1812	if (!match)
1813	goto out;
1814	write_pnet(pnet: &match->net, net: sock_net(sk));
1815	match->id = id;
1816	match->type = type;
1817	match->flags = flags;
1818	INIT_LIST_HEAD(list: &match->list);
1819	spin_lock_init(&match->lock);
1820	refcount_set(r: &match->sk_ref, n: `0`);
1821	fanout_init_data(f: match);
1822	match->prot_hook.type = po->prot_hook.type;
1823	match->prot_hook.dev = po->prot_hook.dev;
1824	match->prot_hook.func = packet_rcv_fanout;
1825	match->prot_hook.af_packet_priv = match;
1826	match->prot_hook.af_packet_net = read_pnet(pnet: &match->net);
1827	match->prot_hook.id_match = match_fanout_group;
1828	match->max_num_members = args->max_num_members;
1829	match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING;
1830	list_add(new: &match->list, head: &fanout_list);
1831	}
1832	err = -EINVAL;
1833
1834	spin_lock(lock: &po->bind_lock);
1835	if (po->num &&
1836	match->type == type &&
1837	match->prot_hook.type == po->prot_hook.type &&
1838	match->prot_hook.dev == po->prot_hook.dev) {
1839	err = -ENOSPC;
1840	if (refcount_read(r: &match->sk_ref) < match->max_num_members) {
1841	/ Paired with packet_setsockopt(PACKET_FANOUT_DATA) /
1842	WRITE_ONCE(po->fanout, match);
1843
1844	po->rollover = rollover;
1845	rollover = NULL;
1846	refcount_set(r: &match->sk_ref, n: refcount_read(r: &match->sk_ref) + `1`);
1847	if (packet_sock_flag(po, flag: PACKET_SOCK_RUNNING)) {
1848	__dev_remove_pack(pt: &po->prot_hook);
1849	__fanout_link(sk, po);
1850	}
1851	err = `0`;
1852	}
1853	}
1854	spin_unlock(lock: &po->bind_lock);
1855
1856	if (err && !refcount_read(r: &match->sk_ref)) {
1857	list_del(entry: &match->list);
1858	kvfree(addr: match);
1859	}
1860
1861	out:
1862	kfree(objp: rollover);
1863	mutex_unlock(lock: &fanout_mutex);
1864	return err;
1865	}
1866
1867	/ If pkt_sk(sk)->fanout->sk_ref is zero, this function removes*
1868	* pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1869	* It is the responsibility of the caller to call fanout_release_data() and
1870	* free the returned packet_fanout (after synchronize_net())
1871	*/
1872	static struct packet_fanout fanout_release(struct* sock *sk)
1873	{
1874	struct packet_sock *po = pkt_sk(sk);
1875	struct packet_fanout *f;
1876
1877	mutex_lock(&fanout_mutex);
1878	f = po->fanout;
1879	if (f) {
1880	po->fanout = NULL;
1881
1882	if (refcount_dec_and_test(r: &f->sk_ref))
1883	list_del(entry: &f->list);
1884	else
1885	f = NULL;
1886	}
1887	mutex_unlock(lock: &fanout_mutex);
1888
1889	return f;
1890	}
1891
1892	static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1893	struct sk_buff *skb)
1894	{
1895	/ Earlier code assumed this would be a VLAN pkt, double-check*
1896	* this now that we have the actual packet in hand. We can only
1897	* do this check on Ethernet devices.
1898	*/
1899	if (unlikely(dev->type != ARPHRD_ETHER))
1900	return false;
1901
1902	skb_reset_mac_header(skb);
1903	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1904	}
1905
1906	static const struct proto_ops packet_ops;
1907
1908	static const struct proto_ops packet_ops_spkt;
1909
1910	static int packet_rcv_spkt(struct sk_buff skb, struct* net_device *dev,
1911	struct packet_type pt, struct* net_device *orig_dev)
1912	{
1913	struct sock *sk;
1914	struct sockaddr_pkt *spkt;
1915
1916	/*
1917	* When we registered the protocol we saved the socket in the data
1918	* field for just this event.
1919	*/
1920
1921	sk = pt->af_packet_priv;
1922
1923	/*
1924	* Yank back the headers [hope the device set this
1925	* right or kerboom...]
1926	*
1927	* Incoming packets have ll header pulled,
1928	* push it back.
1929	*
1930	* For outgoing ones skb->data == skb_mac_header(skb)
1931	* so that this procedure is noop.
1932	*/
1933
1934	if (skb->pkt_type == PACKET_LOOPBACK)
1935	goto out;
1936
1937	if (!net_eq(net1: dev_net(dev), net2: sock_net(sk)))
1938	goto out;
1939
1940	skb = skb_share_check(skb, GFP_ATOMIC);
1941	if (skb == NULL)
1942	goto oom;
1943
1944	/ drop any routing info /
1945	skb_dst_drop(skb);
1946
1947	/ drop conntrack reference /
1948	nf_reset_ct(skb);
1949
1950	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1951
1952	skb_push(skb, len: skb->data - skb_mac_header(skb));
1953
1954	/*
1955	* The SOCK_PACKET socket receives _all_ frames.
1956	*/
1957
1958	spkt->spkt_family = dev->type;
1959	strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1960	spkt->spkt_protocol = skb->protocol;
1961
1962	/*
1963	* Charge the memory to the socket. This is done specifically
1964	* to prevent sockets using all the memory up.
1965	*/
1966
1967	if (sock_queue_rcv_skb(sk, skb) == `0`)
1968	return `0`;
1969
1970	out:
1971	kfree_skb(skb);
1972	oom:
1973	return `0`;
1974	}
1975
1976	static void packet_parse_headers(struct sk_buff skb, struct* socket *sock)
1977	{
1978	int depth;
1979
1980	if ((!skb->protocol \|\| skb->protocol == htons(ETH_P_ALL)) &&
1981	sock->type == SOCK_RAW) {
1982	skb_reset_mac_header(skb);
1983	skb->protocol = dev_parse_header_protocol(skb);
1984	}
1985
1986	/ Move network header to the right position for VLAN tagged packets /
1987	if (likely(skb->dev->type == ARPHRD_ETHER) &&
1988	eth_type_vlan(ethertype: skb->protocol) &&
1989	vlan_get_protocol_and_depth(skb, type: skb->protocol, depth: &depth) != `0`)
1990	skb_set_network_header(skb, offset: depth);
1991
1992	skb_probe_transport_header(skb);
1993	}
1994
1995	/*
1996	* Output a raw packet to a device layer. This bypasses all the other
1997	* protocol layers and you must therefore supply it with a complete frame
1998	*/
1999
2000	static int packet_sendmsg_spkt(struct socket sock, struct* msghdr *msg,
2001	size_t len)
2002	{
2003	struct sock *sk = sock->sk;
2004	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
2005	struct sk_buff *skb = NULL;
2006	struct net_device *dev;
2007	struct sockcm_cookie sockc;
2008	__be16 proto = `0`;
2009	int err;
2010	int extra_len = `0`;
2011
2012	/*
2013	* Get and verify the address.
2014	*/
2015
2016	if (saddr) {
2017	if (msg->msg_namelen < sizeof(struct sockaddr))
2018	return -EINVAL;
2019	if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
2020	proto = saddr->spkt_protocol;
2021	} else
2022	return -ENOTCONN; / SOCK_PACKET must be sent giving an address /
2023
2024	/*
2025	* Find the device first to size check it
2026	*/
2027
2028	saddr->spkt_device[sizeof(saddr->spkt_device) - `1`] = `0`;
2029	retry:
2030	rcu_read_lock();
2031	dev = dev_get_by_name_rcu(net: sock_net(sk), name: saddr->spkt_device);
2032	err = -ENODEV;
2033	if (dev == NULL)
2034	goto out_unlock;
2035
2036	err = -ENETDOWN;
2037	if (!(dev->flags & IFF_UP))
2038	goto out_unlock;
2039
2040	/*
2041	* You may not queue a frame bigger than the mtu. This is the lowest level
2042	* raw protocol and you must do your own fragmentation at this level.
2043	*/
2044
2045	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2046	if (!netif_supports_nofcs(dev)) {
2047	err = -EPROTONOSUPPORT;
2048	goto out_unlock;
2049	}
2050	extra_len = `4`; / We're doing our own CRC /
2051	}
2052
2053	err = -EMSGSIZE;
2054	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
2055	goto out_unlock;
2056
2057	if (!skb) {
2058	size_t reserved = LL_RESERVED_SPACE(dev);
2059	int tlen = dev->needed_tailroom;
2060	unsigned int hhlen = dev->header_ops ? dev->hard_header_len : `0`;
2061
2062	rcu_read_unlock();
2063	skb = sock_wmalloc(sk, size: len + reserved + tlen, force: `0`, GFP_KERNEL);
2064	if (skb == NULL)
2065	return -ENOBUFS;
2066	/ FIXME: Save some space for broken drivers that write a hard*
2067	* header at transmission time by themselves. PPP is the notable
2068	* one here. This should really be fixed at the driver level.
2069	*/
2070	skb_reserve(skb, len: reserved);
2071	skb_reset_network_header(skb);
2072
2073	/ Try to align data part correctly /
2074	if (hhlen) {
2075	skb->data -= hhlen;
2076	skb->tail -= hhlen;
2077	if (len < hhlen)
2078	skb_reset_network_header(skb);
2079	}
2080	err = memcpy_from_msg(data: skb_put(skb, len), msg, len);
2081	if (err)
2082	goto out_free;
2083	goto retry;
2084	}
2085
2086	if (!dev_validate_header(dev, ll_header: skb->data, len) \|\| !skb->len) {
2087	err = -EINVAL;
2088	goto out_unlock;
2089	}
2090	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
2091	!packet_extra_vlan_len_allowed(dev, skb)) {
2092	err = -EMSGSIZE;
2093	goto out_unlock;
2094	}
2095
2096	sockcm_init(sockc: &sockc, sk);
2097	if (msg->msg_controllen) {
2098	err = sock_cmsg_send(sk, msg, sockc: &sockc);
2099	if (unlikely(err))
2100	goto out_unlock;
2101	}
2102
2103	skb->protocol = proto;
2104	skb->dev = dev;
2105	skb->priority = sockc.priority;
2106	skb->mark = sockc.mark;
2107	skb_set_delivery_type_by_clockid(skb, kt: sockc.transmit_time, clockid: sk->sk_clockid);
2108	skb_setup_tx_timestamp(skb, sockc: &sockc);
2109
2110	if (unlikely(extra_len == `4`))
2111	skb->no_fcs = `1`;
2112
2113	packet_parse_headers(skb, sock);
2114
2115	dev_queue_xmit(skb);
2116	rcu_read_unlock();
2117	return len;
2118
2119	out_unlock:
2120	rcu_read_unlock();
2121	out_free:
2122	kfree_skb(skb);
2123	return err;
2124	}
2125
2126	static unsigned int run_filter(struct sk_buff *skb,
2127	const struct sock *sk,
2128	unsigned int res)
2129	{
2130	struct sk_filter *filter;
2131
2132	rcu_read_lock();
2133	filter = rcu_dereference(sk->sk_filter);
2134	if (filter != NULL)
2135	res = bpf_prog_run_clear_cb(prog: filter->prog, skb);
2136	rcu_read_unlock();
2137
2138	return res;
2139	}
2140
2141	static int packet_rcv_vnet(struct msghdr msg, const* struct sk_buff *skb,
2142	size_t len, int* vnet_hdr_sz)
2143	{
2144	struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = `0` };
2145
2146	if (*len < vnet_hdr_sz)
2147	return -EINVAL;
2148	*len -= vnet_hdr_sz;
2149
2150	if (virtio_net_hdr_from_skb(skb, hdr: (struct virtio_net_hdr *)&vnet_hdr, vio_le(), has_data_valid: true, vlan_hlen: `0`))
2151	return -EINVAL;
2152
2153	return memcpy_to_msg(msg, data: (void *)&vnet_hdr, len: vnet_hdr_sz);
2154	}
2155
2156	/*
2157	* This function makes lazy skb cloning in hope that most of packets
2158	* are discarded by BPF.
2159	*
2160	* Note tricky part: we DO mangle shared skb! skb->data, skb->len
2161	* and skb->cb are mangled. It works because (and until) packets
2162	* falling here are owned by current CPU. Output packets are cloned
2163	* by dev_queue_xmit_nit(), input packets are processed by net_bh
2164	* sequentially, so that if we return skb to original state on exit,
2165	* we will not harm anyone.
2166	*/
2167
2168	static int packet_rcv(struct sk_buff skb, struct* net_device *dev,
2169	struct packet_type pt, struct* net_device *orig_dev)
2170	{
2171	enum skb_drop_reason drop_reason = SKB_CONSUMED;
2172	struct sock *sk = NULL;
2173	struct sockaddr_ll *sll;
2174	struct packet_sock *po;
2175	u8 *skb_head = skb->data;
2176	int skb_len = skb->len;
2177	unsigned int snaplen, res;
2178
2179	if (skb->pkt_type == PACKET_LOOPBACK)
2180	goto drop;
2181
2182	sk = pt->af_packet_priv;
2183	po = pkt_sk(sk);
2184
2185	if (!net_eq(net1: dev_net(dev), net2: sock_net(sk)))
2186	goto drop;
2187
2188	skb->dev = dev;
2189
2190	if (dev_has_header(dev)) {
2191	/ The device has an explicit notion of ll header,*
2192	* exported to higher levels.
2193	*
2194	* Otherwise, the device hides details of its frame
2195	* structure, so that corresponding packet head is
2196	* never delivered to user.
2197	*/
2198	if (sk->sk_type != SOCK_DGRAM)
2199	skb_push(skb, len: skb->data - skb_mac_header(skb));
2200	else if (skb->pkt_type == PACKET_OUTGOING) {
2201	/ Special case: outgoing packets have ll header at head /
2202	skb_pull(skb, len: skb_network_offset(skb));
2203	}
2204	}
2205
2206	snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
2207
2208	res = run_filter(skb, sk, res: snaplen);
2209	if (!res)
2210	goto drop_n_restore;
2211	if (snaplen > res)
2212	snaplen = res;
2213
2214	if (atomic_read(v: &sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2215	goto drop_n_acct;
2216
2217	if (skb_shared(skb)) {
2218	struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2219	if (nskb == NULL)
2220	goto drop_n_acct;
2221
2222	if (skb_head != skb->data) {
2223	skb->data = skb_head;
2224	skb->len = skb_len;
2225	}
2226	consume_skb(skb);
2227	skb = nskb;
2228	}
2229
2230	sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - `8`);
2231
2232	sll = &PACKET_SKB_CB(skb)->sa.ll;
2233	sll->sll_hatype = dev->type;
2234	sll->sll_pkttype = skb->pkt_type;
2235	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
2236	sll->sll_ifindex = orig_dev->ifindex;
2237	else
2238	sll->sll_ifindex = dev->ifindex;
2239
2240	sll->sll_halen = dev_parse_header(skb, haddr: sll->sll_addr);
2241
2242	/ sll->sll_family and sll->sll_protocol are set in packet_recvmsg().*
2243	* Use their space for storing the original skb length.
2244	*/
2245	PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2246
2247	if (pskb_trim(skb, len: snaplen))
2248	goto drop_n_acct;
2249
2250	skb_set_owner_r(skb, sk);
2251	skb->dev = NULL;
2252	skb_dst_drop(skb);
2253
2254	/ drop conntrack reference /
2255	nf_reset_ct(skb);
2256
2257	spin_lock(lock: &sk->sk_receive_queue.lock);
2258	po->stats.stats1.tp_packets++;
2259	sock_skb_set_dropcount(sk, skb);
2260	skb_clear_delivery_time(skb);
2261	__skb_queue_tail(list: &sk->sk_receive_queue, newsk: skb);
2262	spin_unlock(lock: &sk->sk_receive_queue.lock);
2263	sk->sk_data_ready(sk);
2264	return `0`;
2265
2266	drop_n_acct:
2267	atomic_inc(v: &po->tp_drops);
2268	atomic_inc(v: &sk->sk_drops);
2269	drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;
2270
2271	drop_n_restore:
2272	if (skb_head != skb->data && skb_shared(skb)) {
2273	skb->data = skb_head;
2274	skb->len = skb_len;
2275	}
2276	drop:
2277	sk_skb_reason_drop(sk, skb, reason: drop_reason);
2278	return `0`;
2279	}
2280
2281	static int tpacket_rcv(struct sk_buff skb, struct* net_device *dev,
2282	struct packet_type pt, struct* net_device *orig_dev)
2283	{
2284	enum skb_drop_reason drop_reason = SKB_CONSUMED;
2285	struct sock *sk = NULL;
2286	struct packet_sock *po;
2287	struct sockaddr_ll *sll;
2288	union tpacket_uhdr h;
2289	u8 *skb_head = skb->data;
2290	int skb_len = skb->len;
2291	unsigned int snaplen, res;
2292	unsigned long status = TP_STATUS_USER;
2293	unsigned short macoff, hdrlen;
2294	unsigned int netoff;
2295	struct sk_buff *copy_skb = NULL;
2296	struct timespec64 ts;
2297	__u32 ts_status;
2298	unsigned int slot_id = `0`;
2299	int vnet_hdr_sz = `0`;
2300
2301	/ struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.*
2302	* We may add members to them until current aligned size without forcing
2303	* userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2304	*/
2305	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != `32`);
2306	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != `48`);
2307
2308	if (skb->pkt_type == PACKET_LOOPBACK)
2309	goto drop;
2310
2311	sk = pt->af_packet_priv;
2312	po = pkt_sk(sk);
2313
2314	if (!net_eq(net1: dev_net(dev), net2: sock_net(sk)))
2315	goto drop;
2316
2317	if (dev_has_header(dev)) {
2318	if (sk->sk_type != SOCK_DGRAM)
2319	skb_push(skb, len: skb->data - skb_mac_header(skb));
2320	else if (skb->pkt_type == PACKET_OUTGOING) {
2321	/ Special case: outgoing packets have ll header at head /
2322	skb_pull(skb, len: skb_network_offset(skb));
2323	}
2324	}
2325
2326	snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
2327
2328	res = run_filter(skb, sk, res: snaplen);
2329	if (!res)
2330	goto drop_n_restore;
2331
2332	/ If we are flooded, just give up /
2333	if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2334	atomic_inc(v: &po->tp_drops);
2335	goto drop_n_restore;
2336	}
2337
2338	if (skb->ip_summed == CHECKSUM_PARTIAL)
2339	status \|= TP_STATUS_CSUMNOTREADY;
2340	else if (skb->pkt_type != PACKET_OUTGOING &&
2341	skb_csum_unnecessary(skb))
2342	status \|= TP_STATUS_CSUM_VALID;
2343	if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
2344	status \|= TP_STATUS_GSO_TCP;
2345
2346	if (snaplen > res)
2347	snaplen = res;
2348
2349	if (sk->sk_type == SOCK_DGRAM) {
2350	macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + `16` +
2351	po->tp_reserve;
2352	} else {
2353	unsigned int maclen = skb_network_offset(skb);
2354	netoff = TPACKET_ALIGN(po->tp_hdrlen +
2355	(maclen < `16` ? `16` : maclen)) +
2356	po->tp_reserve;
2357	vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
2358	if (vnet_hdr_sz)
2359	netoff += vnet_hdr_sz;
2360	macoff = netoff - maclen;
2361	}
2362	if (netoff > USHRT_MAX) {
2363	atomic_inc(v: &po->tp_drops);
2364	goto drop_n_restore;
2365	}
2366	if (po->tp_version <= TPACKET_V2) {
2367	if (macoff + snaplen > po->rx_ring.frame_size) {
2368	if (READ_ONCE(po->copy_thresh) &&
2369	atomic_read(v: &sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2370	if (skb_shared(skb)) {
2371	copy_skb = skb_clone(skb, GFP_ATOMIC);
2372	} else {
2373	copy_skb = skb_get(skb);
2374	skb_head = skb->data;
2375	}
2376	if (copy_skb) {
2377	memset(&PACKET_SKB_CB(copy_skb)->sa.ll, `0`,
2378	sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
2379	skb_set_owner_r(skb: copy_skb, sk);
2380	}
2381	}
2382	snaplen = po->rx_ring.frame_size - macoff;
2383	if ((int)snaplen < `0`) {
2384	snaplen = `0`;
2385	vnet_hdr_sz = `0`;
2386	}
2387	}
2388	} else if (unlikely(macoff + snaplen >
2389	GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2390	u32 nval;
2391
2392	nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2393	pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2394	snaplen, nval, macoff);
2395	snaplen = nval;
2396	if (unlikely((int)snaplen < `0`)) {
2397	snaplen = `0`;
2398	macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2399	vnet_hdr_sz = `0`;
2400	}
2401	}
2402	spin_lock(lock: &sk->sk_receive_queue.lock);
2403	h.raw = packet_current_rx_frame(po, skb,
2404	TP_STATUS_KERNEL, len: (macoff+snaplen));
2405	if (!h.raw)
2406	goto drop_n_account;
2407
2408	if (po->tp_version <= TPACKET_V2) {
2409	slot_id = po->rx_ring.head;
2410	if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2411	goto drop_n_account;
2412	__set_bit(slot_id, po->rx_ring.rx_owner_map);
2413	}
2414
2415	if (vnet_hdr_sz &&
2416	virtio_net_hdr_from_skb(skb, hdr: h.raw + macoff -
2417	sizeof(struct virtio_net_hdr),
2418	vio_le(), has_data_valid: true, vlan_hlen: `0`)) {
2419	if (po->tp_version == TPACKET_V3)
2420	prb_clear_blk_fill_status(rb: &po->rx_ring);
2421	goto drop_n_account;
2422	}
2423
2424	if (po->tp_version <= TPACKET_V2) {
2425	packet_increment_rx_head(po, rb: &po->rx_ring);
2426	/*
2427	* LOSING will be reported till you read the stats,
2428	* because it's COR - Clear On Read.
2429	* Anyways, moving it for V1/V2 only as V3 doesn't need this
2430	* at packet level.
2431	*/
2432	if (atomic_read(v: &po->tp_drops))
2433	status \|= TP_STATUS_LOSING;
2434	}
2435
2436	po->stats.stats1.tp_packets++;
2437	if (copy_skb) {
2438	status \|= TP_STATUS_COPY;
2439	skb_clear_delivery_time(skb: copy_skb);
2440	__skb_queue_tail(list: &sk->sk_receive_queue, newsk: copy_skb);
2441	}
2442	spin_unlock(lock: &sk->sk_receive_queue.lock);
2443
2444	skb_copy_bits(skb, offset: `0`, to: h.raw + macoff, len: snaplen);
2445
2446	/ Always timestamp; prefer an existing software timestamp taken*
2447	* closer to the time of capture.
2448	*/
2449	ts_status = tpacket_get_timestamp(skb, ts: &ts,
2450	READ_ONCE(po->tp_tstamp) \|
2451	SOF_TIMESTAMPING_SOFTWARE);
2452	if (!ts_status)
2453	ktime_get_real_ts64(tv: &ts);
2454
2455	status \|= ts_status;
2456
2457	switch (po->tp_version) {
2458	case TPACKET_V1:
2459	h.h1->tp_len = skb->len;
2460	h.h1->tp_snaplen = snaplen;
2461	h.h1->tp_mac = macoff;
2462	h.h1->tp_net = netoff;
2463	h.h1->tp_sec = ts.tv_sec;
2464	h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2465	hdrlen = sizeof(*h.h1);
2466	break;
2467	case TPACKET_V2:
2468	h.h2->tp_len = skb->len;
2469	h.h2->tp_snaplen = snaplen;
2470	h.h2->tp_mac = macoff;
2471	h.h2->tp_net = netoff;
2472	h.h2->tp_sec = ts.tv_sec;
2473	h.h2->tp_nsec = ts.tv_nsec;
2474	if (skb_vlan_tag_present(skb)) {
2475	h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2476	h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2477	status \|= TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
2478	} else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
2479	h.h2->tp_vlan_tci = vlan_get_tci(skb, dev: skb->dev);
2480	h.h2->tp_vlan_tpid = ntohs(skb->protocol);
2481	status \|= TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
2482	} else {
2483	h.h2->tp_vlan_tci = `0`;
2484	h.h2->tp_vlan_tpid = `0`;
2485	}
2486	memset(h.h2->tp_padding, `0`, sizeof(h.h2->tp_padding));
2487	hdrlen = sizeof(*h.h2);
2488	break;
2489	case TPACKET_V3:
2490	/ tp_nxt_offset,vlan are already populated above.*
2491	* So DONT clear those fields here
2492	*/
2493	h.h3->tp_status \|= status;
2494	h.h3->tp_len = skb->len;
2495	h.h3->tp_snaplen = snaplen;
2496	h.h3->tp_mac = macoff;
2497	h.h3->tp_net = netoff;
2498	h.h3->tp_sec = ts.tv_sec;
2499	h.h3->tp_nsec = ts.tv_nsec;
2500	memset(h.h3->tp_padding, `0`, sizeof(h.h3->tp_padding));
2501	hdrlen = sizeof(*h.h3);
2502	break;
2503	default:
2504	BUG();
2505	}
2506
2507	sll = h.raw + TPACKET_ALIGN(hdrlen);
2508	sll->sll_halen = dev_parse_header(skb, haddr: sll->sll_addr);
2509	sll->sll_family = AF_PACKET;
2510	sll->sll_hatype = dev->type;
2511	sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ?
2512	vlan_get_protocol_dgram(skb) : skb->protocol;
2513	sll->sll_pkttype = skb->pkt_type;
2514	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
2515	sll->sll_ifindex = orig_dev->ifindex;
2516	else
2517	sll->sll_ifindex = dev->ifindex;
2518
2519	smp_mb();
2520
2521	#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2522	if (po->tp_version <= TPACKET_V2) {
2523	u8 start, end;
2524
2525	end = (u8 ) PAGE_ALIGN((unsigned* long) h.raw +
2526	macoff + snaplen);
2527
2528	for (start = h.raw; start < end; start += PAGE_SIZE)
2529	flush_dcache_page(pgv_to_page(start));
2530	}
2531	smp_wmb();
2532	#endif
2533
2534	if (po->tp_version <= TPACKET_V2) {
2535	spin_lock(lock: &sk->sk_receive_queue.lock);
2536	__packet_set_status(po, frame: h.raw, status);
2537	__clear_bit(slot_id, po->rx_ring.rx_owner_map);
2538	spin_unlock(lock: &sk->sk_receive_queue.lock);
2539	sk->sk_data_ready(sk);
2540	} else if (po->tp_version == TPACKET_V3) {
2541	prb_clear_blk_fill_status(rb: &po->rx_ring);
2542	}
2543
2544	drop_n_restore:
2545	if (skb_head != skb->data && skb_shared(skb)) {
2546	skb->data = skb_head;
2547	skb->len = skb_len;
2548	}
2549	drop:
2550	sk_skb_reason_drop(sk, skb, reason: drop_reason);
2551	return `0`;
2552
2553	drop_n_account:
2554	spin_unlock(lock: &sk->sk_receive_queue.lock);
2555	atomic_inc(v: &po->tp_drops);
2556	drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;
2557
2558	sk->sk_data_ready(sk);
2559	sk_skb_reason_drop(sk, skb: copy_skb, reason: drop_reason);
2560	goto drop_n_restore;
2561	}
2562
2563	static void tpacket_destruct_skb(struct sk_buff *skb)
2564	{
2565	struct packet_sock *po = pkt_sk(skb->sk);
2566
2567	if (likely(po->tx_ring.pg_vec)) {
2568	void *ph;
2569	__u32 ts;
2570
2571	ph = skb_zcopy_get_nouarg(skb);
2572	packet_dec_pending(rb: &po->tx_ring);
2573
2574	ts = __packet_set_timestamp(po, frame: ph, skb);
2575	__packet_set_status(po, frame: ph, TP_STATUS_AVAILABLE \| ts);
2576
2577	complete(&po->skb_completion);
2578	}
2579
2580	sock_wfree(skb);
2581	}
2582
2583	static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2584	{
2585	if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2586	(__virtio16_to_cpu(vio_le(), val: vnet_hdr->csum_start) +
2587	__virtio16_to_cpu(vio_le(), val: vnet_hdr->csum_offset) + `2` >
2588	__virtio16_to_cpu(vio_le(), val: vnet_hdr->hdr_len)))
2589	vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2590	val: __virtio16_to_cpu(vio_le(), val: vnet_hdr->csum_start) +
2591	__virtio16_to_cpu(vio_le(), val: vnet_hdr->csum_offset) + `2`);
2592
2593	if (__virtio16_to_cpu(vio_le(), val: vnet_hdr->hdr_len) > len)
2594	return -EINVAL;
2595
2596	return `0`;
2597	}
2598
2599	static int packet_snd_vnet_parse(struct msghdr msg, size_t len,
2600	struct virtio_net_hdr vnet_hdr, int* vnet_hdr_sz)
2601	{
2602	int ret;
2603
2604	if (*len < vnet_hdr_sz)
2605	return -EINVAL;
2606	*len -= vnet_hdr_sz;
2607
2608	if (!copy_from_iter_full(addr: vnet_hdr, bytes: sizeof(*vnet_hdr), i: &msg->msg_iter))
2609	return -EFAULT;
2610
2611	ret = __packet_snd_vnet_parse(vnet_hdr, len: *len);
2612	if (ret)
2613	return ret;
2614
2615	/ move iter to point to the start of mac header /
2616	if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
2617	iov_iter_advance(i: &msg->msg_iter, bytes: vnet_hdr_sz - sizeof(struct virtio_net_hdr));
2618
2619	return `0`;
2620	}
2621
2622	static int tpacket_fill_skb(struct packet_sock po, struct* sk_buff *skb,
2623	void frame, struct* net_device dev, void* data, int* tp_len,
2624	__be16 proto, unsigned char addr, int* hlen, int copylen,
2625	const struct sockcm_cookie *sockc)
2626	{
2627	union tpacket_uhdr ph;
2628	int to_write, offset, len, nr_frags, len_max;
2629	struct socket *sock = po->sk.sk_socket;
2630	struct page *page;
2631	int err;
2632
2633	ph.raw = frame;
2634
2635	skb->protocol = proto;
2636	skb->dev = dev;
2637	skb->priority = sockc->priority;
2638	skb->mark = sockc->mark;
2639	skb_set_delivery_type_by_clockid(skb, kt: sockc->transmit_time, clockid: po->sk.sk_clockid);
2640	skb_setup_tx_timestamp(skb, sockc);
2641	skb_zcopy_set_nouarg(skb, val: ph.raw);
2642
2643	skb_reserve(skb, len: hlen);
2644	skb_reset_network_header(skb);
2645
2646	to_write = tp_len;
2647
2648	if (sock->type == SOCK_DGRAM) {
2649	err = dev_hard_header(skb, dev, ntohs(proto), daddr: addr,
2650	NULL, len: tp_len);
2651	if (unlikely(err < `0`))
2652	return -EINVAL;
2653	} else if (copylen) {
2654	int hdrlen = min_t(int, copylen, tp_len);
2655
2656	skb_push(skb, len: dev->hard_header_len);
2657	skb_put(skb, len: copylen - dev->hard_header_len);
2658	err = skb_store_bits(skb, offset: `0`, from: data, len: hdrlen);
2659	if (unlikely(err))
2660	return err;
2661	if (!dev_validate_header(dev, ll_header: skb->data, len: hdrlen))
2662	return -EINVAL;
2663
2664	data += hdrlen;
2665	to_write -= hdrlen;
2666	}
2667
2668	offset = offset_in_page(data);
2669	len_max = PAGE_SIZE - offset;
2670	len = ((to_write > len_max) ? len_max : to_write);
2671
2672	skb->data_len = to_write;
2673	skb->len += to_write;
2674	skb->truesize += to_write;
2675	refcount_add(i: to_write, r: &po->sk.sk_wmem_alloc);
2676
2677	while (likely(to_write)) {
2678	nr_frags = skb_shinfo(skb)->nr_frags;
2679
2680	if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2681	pr_err("Packet exceed the number of skb frags(%u)\n",
2682	(unsigned int)MAX_SKB_FRAGS);
2683	return -EFAULT;
2684	}
2685
2686	page = pgv_to_page(addr: data);
2687	data += len;
2688	flush_dcache_page(page);
2689	get_page(page);
2690	skb_fill_page_desc(skb, i: nr_frags, page, off: offset, size: len);
2691	to_write -= len;
2692	offset = `0`;
2693	len_max = PAGE_SIZE;
2694	len = ((to_write > len_max) ? len_max : to_write);
2695	}
2696
2697	packet_parse_headers(skb, sock);
2698
2699	return tp_len;
2700	}
2701
2702	static int tpacket_parse_header(struct packet_sock po, void* *frame,
2703	int size_max, void **data)
2704	{
2705	union tpacket_uhdr ph;
2706	int tp_len, off;
2707
2708	ph.raw = frame;
2709
2710	switch (po->tp_version) {
2711	case TPACKET_V3:
2712	if (ph.h3->tp_next_offset != `0`) {
2713	pr_warn_once("variable sized slot not supported");
2714	return -EINVAL;
2715	}
2716	tp_len = ph.h3->tp_len;
2717	break;
2718	case TPACKET_V2:
2719	tp_len = ph.h2->tp_len;
2720	break;
2721	default:
2722	tp_len = ph.h1->tp_len;
2723	break;
2724	}
2725	if (unlikely(tp_len > size_max)) {
2726	pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2727	return -EMSGSIZE;
2728	}
2729
2730	if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) {
2731	int off_min, off_max;
2732
2733	off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2734	off_max = po->tx_ring.frame_size - tp_len;
2735	if (po->sk.sk_type == SOCK_DGRAM) {
2736	switch (po->tp_version) {
2737	case TPACKET_V3:
2738	off = ph.h3->tp_net;
2739	break;
2740	case TPACKET_V2:
2741	off = ph.h2->tp_net;
2742	break;
2743	default:
2744	off = ph.h1->tp_net;
2745	break;
2746	}
2747	} else {
2748	switch (po->tp_version) {
2749	case TPACKET_V3:
2750	off = ph.h3->tp_mac;
2751	break;
2752	case TPACKET_V2:
2753	off = ph.h2->tp_mac;
2754	break;
2755	default:
2756	off = ph.h1->tp_mac;
2757	break;
2758	}
2759	}
2760	if (unlikely((off < off_min) \|\| (off_max < off)))
2761	return -EINVAL;
2762	} else {
2763	off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2764	}
2765
2766	*data = frame + off;
2767	return tp_len;
2768	}
2769
2770	static int tpacket_snd(struct packet_sock po, struct* msghdr *msg)
2771	{
2772	struct sk_buff *skb = NULL;
2773	struct net_device *dev;
2774	struct virtio_net_hdr *vnet_hdr = NULL;
2775	struct sockcm_cookie sockc;
2776	__be16 proto;
2777	int err, reserve = `0`;
2778	void *ph;
2779	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2780	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2781	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
2782	unsigned char *addr = NULL;
2783	int tp_len, size_max;
2784	void *data;
2785	int len_sum = `0`;
2786	int status = TP_STATUS_AVAILABLE;
2787	int hlen, tlen, copylen = `0`;
2788	long timeo = `0`;
2789
2790	mutex_lock(&po->pg_vec_lock);
2791
2792	/ packet_sendmsg() check on tx_ring.pg_vec was lockless,*
2793	* we need to confirm it under protection of pg_vec_lock.
2794	*/
2795	if (unlikely(!po->tx_ring.pg_vec)) {
2796	err = -EBUSY;
2797	goto out;
2798	}
2799	if (likely(saddr == NULL)) {
2800	dev = packet_cached_dev_get(po);
2801	proto = READ_ONCE(po->num);
2802	} else {
2803	err = -EINVAL;
2804	if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2805	goto out;
2806	if (msg->msg_namelen < (saddr->sll_halen
2807	+ offsetof(struct sockaddr_ll,
2808	sll_addr)))
2809	goto out;
2810	proto = saddr->sll_protocol;
2811	dev = dev_get_by_index(net: sock_net(sk: &po->sk), ifindex: saddr->sll_ifindex);
2812	if (po->sk.sk_socket->type == SOCK_DGRAM) {
2813	if (dev && msg->msg_namelen < dev->addr_len +
2814	offsetof(struct sockaddr_ll, sll_addr))
2815	goto out_put;
2816	addr = saddr->sll_addr;
2817	}
2818	}
2819
2820	err = -ENXIO;
2821	if (unlikely(dev == NULL))
2822	goto out;
2823	err = -ENETDOWN;
2824	if (unlikely(!(dev->flags & IFF_UP)))
2825	goto out_put;
2826
2827	sockcm_init(sockc: &sockc, sk: &po->sk);
2828	if (msg->msg_controllen) {
2829	err = sock_cmsg_send(sk: &po->sk, msg, sockc: &sockc);
2830	if (unlikely(err))
2831	goto out_put;
2832	}
2833
2834	if (po->sk.sk_socket->type == SOCK_RAW)
2835	reserve = dev->hard_header_len;
2836	size_max = po->tx_ring.frame_size
2837	- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2838
2839	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
2840	size_max = dev->mtu + reserve + VLAN_HLEN;
2841
2842	reinit_completion(x: &po->skb_completion);
2843
2844	do {
2845	ph = packet_current_frame(po, rb: &po->tx_ring,
2846	TP_STATUS_SEND_REQUEST);
2847	if (unlikely(ph == NULL)) {
2848	if (need_wait && skb) {
2849	timeo = sock_sndtimeo(sk: &po->sk, noblock: msg->msg_flags & MSG_DONTWAIT);
2850	timeo = wait_for_completion_interruptible_timeout(x: &po->skb_completion, timeout: timeo);
2851	if (timeo <= `0`) {
2852	err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2853	goto out_put;
2854	}
2855	}
2856	/ check for additional frames /
2857	continue;
2858	}
2859
2860	skb = NULL;
2861	tp_len = tpacket_parse_header(po, frame: ph, size_max, data: &data);
2862	if (tp_len < `0`)
2863	goto tpacket_error;
2864
2865	status = TP_STATUS_SEND_REQUEST;
2866	hlen = LL_RESERVED_SPACE(dev);
2867	tlen = dev->needed_tailroom;
2868	if (vnet_hdr_sz) {
2869	vnet_hdr = data;
2870	data += vnet_hdr_sz;
2871	tp_len -= vnet_hdr_sz;
2872	if (tp_len < `0` \|\|
2873	__packet_snd_vnet_parse(vnet_hdr, len: tp_len)) {
2874	tp_len = -EINVAL;
2875	goto tpacket_error;
2876	}
2877	copylen = __virtio16_to_cpu(vio_le(),
2878	val: vnet_hdr->hdr_len);
2879	}
2880	copylen = max_t(int, copylen, dev->hard_header_len);
2881	skb = sock_alloc_send_skb(sk: &po->sk,
2882	size: hlen + tlen + sizeof(struct sockaddr_ll) +
2883	(copylen - dev->hard_header_len),
2884	noblock: !need_wait, errcode: &err);
2885
2886	if (unlikely(skb == NULL)) {
2887	/ we assume the socket was initially writeable ... /
2888	if (likely(len_sum > `0`))
2889	err = len_sum;
2890	goto out_status;
2891	}
2892	tp_len = tpacket_fill_skb(po, skb, frame: ph, dev, data, tp_len, proto,
2893	addr, hlen, copylen, sockc: &sockc);
2894	if (likely(tp_len >= `0`) &&
2895	tp_len > dev->mtu + reserve &&
2896	!vnet_hdr_sz &&
2897	!packet_extra_vlan_len_allowed(dev, skb))
2898	tp_len = -EMSGSIZE;
2899
2900	if (unlikely(tp_len < `0`)) {
2901	tpacket_error:
2902	if (packet_sock_flag(po, flag: PACKET_SOCK_TP_LOSS)) {
2903	__packet_set_status(po, frame: ph,
2904	TP_STATUS_AVAILABLE);
2905	packet_increment_head(buff: &po->tx_ring);
2906	kfree_skb(skb);
2907	continue;
2908	} else {
2909	status = TP_STATUS_WRONG_FORMAT;
2910	err = tp_len;
2911	goto out_status;
2912	}
2913	}
2914
2915	if (vnet_hdr_sz) {
2916	if (virtio_net_hdr_to_skb(skb, hdr: vnet_hdr, vio_le())) {
2917	tp_len = -EINVAL;
2918	goto tpacket_error;
2919	}
2920	virtio_net_hdr_set_proto(skb, hdr: vnet_hdr);
2921	}
2922
2923	skb->destructor = tpacket_destruct_skb;
2924	__packet_set_status(po, frame: ph, TP_STATUS_SENDING);
2925	packet_inc_pending(rb: &po->tx_ring);
2926
2927	status = TP_STATUS_SEND_REQUEST;
2928	err = packet_xmit(po, skb);
2929	if (unlikely(err != `0`)) {
2930	if (err > `0`)
2931	err = net_xmit_errno(err);
2932	if (err && __packet_get_status(po, frame: ph) ==
2933	TP_STATUS_AVAILABLE) {
2934	/ skb was destructed already /
2935	skb = NULL;
2936	goto out_status;
2937	}
2938	/*
2939	* skb was dropped but not destructed yet;
2940	* let's treat it like congestion or err < 0
2941	*/
2942	err = `0`;
2943	}
2944	packet_increment_head(buff: &po->tx_ring);
2945	len_sum += tp_len;
2946	} while (likely((ph != NULL) \|\|
2947	/ Note: packet_read_pending() might be slow if we have*
2948	* to call it as it's per_cpu variable, but in fast-path
2949	* we already short-circuit the loop with the first
2950	* condition, and luckily don't have to go that path
2951	* anyway.
2952	*/
2953	(need_wait && packet_read_pending(&po->tx_ring))));
2954
2955	err = len_sum;
2956	goto out_put;
2957
2958	out_status:
2959	__packet_set_status(po, frame: ph, status);
2960	kfree_skb(skb);
2961	out_put:
2962	dev_put(dev);
2963	out:
2964	mutex_unlock(lock: &po->pg_vec_lock);
2965	return err;
2966	}
2967
2968	static struct sk_buff packet_alloc_skb(struct* sock *sk, size_t prepad,
2969	size_t reserve, size_t len,
2970	size_t linear, int noblock,
2971	int *err)
2972	{
2973	struct sk_buff *skb;
2974
2975	/ Under a page? Don't bother with paged skb. /
2976	if (prepad + len < PAGE_SIZE \|\| !linear)
2977	linear = len;
2978
2979	if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
2980	linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
2981	skb = sock_alloc_send_pskb(sk, header_len: prepad + linear, data_len: len - linear, noblock,
2982	errcode: err, PAGE_ALLOC_COSTLY_ORDER);
2983	if (!skb)
2984	return NULL;
2985
2986	skb_reserve(skb, len: reserve);
2987	skb_put(skb, len: linear);
2988	skb->data_len = len - linear;
2989	skb->len += len - linear;
2990
2991	return skb;
2992	}
2993
2994	static int packet_snd(struct socket sock, struct* msghdr *msg, size_t len)
2995	{
2996	struct sock *sk = sock->sk;
2997	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2998	struct sk_buff *skb;
2999	struct net_device *dev;
3000	__be16 proto;
3001	unsigned char *addr = NULL;
3002	int err, reserve = `0`;
3003	struct sockcm_cookie sockc;
3004	struct virtio_net_hdr vnet_hdr = { `0` };
3005	int offset = `0`;
3006	struct packet_sock *po = pkt_sk(sk);
3007	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
3008	int hlen, tlen, linear;
3009	int extra_len = `0`;
3010
3011	/*
3012	* Get and verify the address.
3013	*/
3014
3015	if (likely(saddr == NULL)) {
3016	dev = packet_cached_dev_get(po);
3017	proto = READ_ONCE(po->num);
3018	} else {
3019	err = -EINVAL;
3020	if (msg->msg_namelen < sizeof(struct sockaddr_ll))
3021	goto out;
3022	if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
3023	goto out;
3024	proto = saddr->sll_protocol;
3025	dev = dev_get_by_index(net: sock_net(sk), ifindex: saddr->sll_ifindex);
3026	if (sock->type == SOCK_DGRAM) {
3027	if (dev && msg->msg_namelen < dev->addr_len +
3028	offsetof(struct sockaddr_ll, sll_addr))
3029	goto out_unlock;
3030	addr = saddr->sll_addr;
3031	}
3032	}
3033
3034	err = -ENXIO;
3035	if (unlikely(dev == NULL))
3036	goto out_unlock;
3037	err = -ENETDOWN;
3038	if (unlikely(!(dev->flags & IFF_UP)))
3039	goto out_unlock;
3040
3041	sockcm_init(sockc: &sockc, sk);
3042	if (msg->msg_controllen) {
3043	err = sock_cmsg_send(sk, msg, sockc: &sockc);
3044	if (unlikely(err))
3045	goto out_unlock;
3046	}
3047
3048	if (sock->type == SOCK_RAW)
3049	reserve = dev->hard_header_len;
3050	if (vnet_hdr_sz) {
3051	err = packet_snd_vnet_parse(msg, len: &len, vnet_hdr: &vnet_hdr, vnet_hdr_sz);
3052	if (err)
3053	goto out_unlock;
3054	}
3055
3056	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
3057	if (!netif_supports_nofcs(dev)) {
3058	err = -EPROTONOSUPPORT;
3059	goto out_unlock;
3060	}
3061	extra_len = `4`; / We're doing our own CRC /
3062	}
3063
3064	err = -EMSGSIZE;
3065	if (!vnet_hdr.gso_type &&
3066	(len > dev->mtu + reserve + VLAN_HLEN + extra_len))
3067	goto out_unlock;
3068
3069	err = -ENOBUFS;
3070	hlen = LL_RESERVED_SPACE(dev);
3071	tlen = dev->needed_tailroom;
3072	linear = __virtio16_to_cpu(vio_le(), val: vnet_hdr.hdr_len);
3073	linear = max(linear, min_t(int, len, dev->hard_header_len));
3074	skb = packet_alloc_skb(sk, prepad: hlen + tlen, reserve: hlen, len, linear,
3075	noblock: msg->msg_flags & MSG_DONTWAIT, err: &err);
3076	if (skb == NULL)
3077	goto out_unlock;
3078
3079	skb_reset_network_header(skb);
3080
3081	err = -EINVAL;
3082	if (sock->type == SOCK_DGRAM) {
3083	offset = dev_hard_header(skb, dev, ntohs(proto), daddr: addr, NULL, len);
3084	if (unlikely(offset < `0`))
3085	goto out_free;
3086	} else if (reserve) {
3087	skb_reserve(skb, len: -reserve);
3088	if (len < reserve + sizeof(struct ipv6hdr) &&
3089	dev->min_header_len != dev->hard_header_len)
3090	skb_reset_network_header(skb);
3091	}
3092
3093	/ Returns -EFAULT on error /
3094	err = skb_copy_datagram_from_iter(skb, offset, from: &msg->msg_iter, len);
3095	if (err)
3096	goto out_free;
3097
3098	if ((sock->type == SOCK_RAW &&
3099	!dev_validate_header(dev, ll_header: skb->data, len)) \|\| !skb->len) {
3100	err = -EINVAL;
3101	goto out_free;
3102	}
3103
3104	skb_setup_tx_timestamp(skb, sockc: &sockc);
3105
3106	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3107	!packet_extra_vlan_len_allowed(dev, skb)) {
3108	err = -EMSGSIZE;
3109	goto out_free;
3110	}
3111
3112	skb->protocol = proto;
3113	skb->dev = dev;
3114	skb->priority = sockc.priority;
3115	skb->mark = sockc.mark;
3116	skb_set_delivery_type_by_clockid(skb, kt: sockc.transmit_time, clockid: sk->sk_clockid);
3117
3118	if (unlikely(extra_len == `4`))
3119	skb->no_fcs = `1`;
3120
3121	packet_parse_headers(skb, sock);
3122
3123	if (vnet_hdr_sz) {
3124	err = virtio_net_hdr_to_skb(skb, hdr: &vnet_hdr, vio_le());
3125	if (err)
3126	goto out_free;
3127	len += vnet_hdr_sz;
3128	virtio_net_hdr_set_proto(skb, hdr: &vnet_hdr);
3129	}
3130
3131	err = packet_xmit(po, skb);
3132
3133	if (unlikely(err != `0`)) {
3134	if (err > `0`)
3135	err = net_xmit_errno(err);
3136	if (err)
3137	goto out_unlock;
3138	}
3139
3140	dev_put(dev);
3141
3142	return len;
3143
3144	out_free:
3145	kfree_skb(skb);
3146	out_unlock:
3147	dev_put(dev);
3148	out:
3149	return err;
3150	}
3151
3152	static int packet_sendmsg(struct socket sock, struct* msghdr *msg, size_t len)
3153	{
3154	struct sock *sk = sock->sk;
3155	struct packet_sock *po = pkt_sk(sk);
3156
3157	/ Reading tx_ring.pg_vec without holding pg_vec_lock is racy.*
3158	* tpacket_snd() will redo the check safely.
3159	*/
3160	if (data_race(po->tx_ring.pg_vec))
3161	return tpacket_snd(po, msg);
3162
3163	return packet_snd(sock, msg, len);
3164	}
3165
3166	/*
3167	* Close a PACKET socket. This is fairly simple. We immediately go
3168	* to 'closed' state and remove our protocol entry in the device list.
3169	*/
3170
3171	static int packet_release(struct socket *sock)
3172	{
3173	struct sock *sk = sock->sk;
3174	struct packet_sock *po;
3175	struct packet_fanout *f;
3176	struct net *net;
3177	union tpacket_req_u req_u;
3178
3179	if (!sk)
3180	return `0`;
3181
3182	net = sock_net(sk);
3183	po = pkt_sk(sk);
3184
3185	mutex_lock(&net->packet.sklist_lock);
3186	sk_del_node_init_rcu(sk);
3187	mutex_unlock(lock: &net->packet.sklist_lock);
3188
3189	sock_prot_inuse_add(net, prot: sk->sk_prot, val: -`1`);
3190
3191	spin_lock(lock: &po->bind_lock);
3192	unregister_prot_hook(sk, sync: false);
3193	packet_cached_dev_reset(po);
3194
3195	if (po->prot_hook.dev) {
3196	netdev_put(dev: po->prot_hook.dev, tracker: &po->prot_hook.dev_tracker);
3197	po->prot_hook.dev = NULL;
3198	}
3199	spin_unlock(lock: &po->bind_lock);
3200
3201	packet_flush_mclist(sk);
3202
3203	lock_sock(sk);
3204	if (po->rx_ring.pg_vec) {
3205	memset(&req_u, `0`, sizeof(req_u));
3206	packet_set_ring(sk, req_u: &req_u, closing: `1`, tx_ring: `0`);
3207	}
3208
3209	if (po->tx_ring.pg_vec) {
3210	memset(&req_u, `0`, sizeof(req_u));
3211	packet_set_ring(sk, req_u: &req_u, closing: `1`, tx_ring: `1`);
3212	}
3213	release_sock(sk);
3214
3215	f = fanout_release(sk);
3216
3217	synchronize_net();
3218
3219	kfree(objp: po->rollover);
3220	if (f) {
3221	fanout_release_data(f);
3222	kvfree(addr: f);
3223	}
3224	/*
3225	* Now the socket is dead. No more input will appear.
3226	*/
3227	sock_orphan(sk);
3228	sock->sk = NULL;
3229
3230	/ Purge queues /
3231
3232	skb_queue_purge(list: &sk->sk_receive_queue);
3233	packet_free_pending(po);
3234
3235	sock_put(sk);
3236	return `0`;
3237	}
3238
3239	/*
3240	* Attach a packet hook.
3241	*/
3242
3243	static int packet_do_bind(struct sock sk, const* char name, int* ifindex,
3244	__be16 proto)
3245	{
3246	struct packet_sock *po = pkt_sk(sk);
3247	struct net_device *dev = NULL;
3248	bool unlisted = false;
3249	bool need_rehook;
3250	int ret = `0`;
3251
3252	lock_sock(sk);
3253	spin_lock(lock: &po->bind_lock);
3254	if (!proto)
3255	proto = po->num;
3256
3257	rcu_read_lock();
3258
3259	if (po->fanout) {
3260	ret = -EINVAL;
3261	goto out_unlock;
3262	}
3263
3264	if (name) {
3265	dev = dev_get_by_name_rcu(net: sock_net(sk), name);
3266	if (!dev) {
3267	ret = -ENODEV;
3268	goto out_unlock;
3269	}
3270	} else if (ifindex) {
3271	dev = dev_get_by_index_rcu(net: sock_net(sk), ifindex);
3272	if (!dev) {
3273	ret = -ENODEV;
3274	goto out_unlock;
3275	}
3276	}
3277
3278	need_rehook = po->prot_hook.type != proto \|\| po->prot_hook.dev != dev;
3279
3280	if (need_rehook) {
3281	dev_hold(dev);
3282	if (packet_sock_flag(po, flag: PACKET_SOCK_RUNNING)) {
3283	rcu_read_unlock();
3284	/ prevents packet_notifier() from calling*
3285	* register_prot_hook()
3286	*/
3287	WRITE_ONCE(po->num, `0`);
3288	__unregister_prot_hook(sk, sync: true);
3289	rcu_read_lock();
3290	if (dev)
3291	unlisted = !dev_get_by_index_rcu(net: sock_net(sk),
3292	ifindex: dev->ifindex);
3293	}
3294
3295	BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING));
3296	WRITE_ONCE(po->num, proto);
3297	po->prot_hook.type = proto;
3298
3299	netdev_put(dev: po->prot_hook.dev, tracker: &po->prot_hook.dev_tracker);
3300
3301	if (unlikely(unlisted)) {
3302	po->prot_hook.dev = NULL;
3303	WRITE_ONCE(po->ifindex, -`1`);
3304	packet_cached_dev_reset(po);
3305	} else {
3306	netdev_hold(dev, tracker: &po->prot_hook.dev_tracker,
3307	GFP_ATOMIC);
3308	po->prot_hook.dev = dev;
3309	WRITE_ONCE(po->ifindex, dev ? dev->ifindex : `0`);
3310	packet_cached_dev_assign(po, dev);
3311	}
3312	dev_put(dev);
3313	}
3314
3315	if (proto == `0` \|\| !need_rehook)
3316	goto out_unlock;
3317
3318	if (!unlisted && (!dev \|\| (dev->flags & IFF_UP))) {
3319	register_prot_hook(sk);
3320	} else {
3321	sk->sk_err = ENETDOWN;
3322	if (!sock_flag(sk, flag: SOCK_DEAD))
3323	sk_error_report(sk);
3324	}
3325
3326	out_unlock:
3327	rcu_read_unlock();
3328	spin_unlock(lock: &po->bind_lock);
3329	release_sock(sk);
3330	return ret;
3331	}
3332
3333	/*
3334	* Bind a packet socket to a device
3335	*/
3336
3337	static int packet_bind_spkt(struct socket sock, struct* sockaddr *uaddr,
3338	int addr_len)
3339	{
3340	struct sock *sk = sock->sk;
3341	char name[sizeof(uaddr->sa_data_min) + `1`];
3342
3343	/*
3344	* Check legality
3345	*/
3346
3347	if (addr_len != sizeof(struct sockaddr))
3348	return -EINVAL;
3349	/ uaddr->sa_data comes from the userspace, it's not guaranteed to be*
3350	* zero-terminated.
3351	*/
3352	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min));
3353	name[sizeof(uaddr->sa_data_min)] = `0`;
3354
3355	return packet_do_bind(sk, name, ifindex: `0`, proto: `0`);
3356	}
3357
3358	static int packet_bind(struct socket sock, struct* sockaddr uaddr, int* addr_len)
3359	{
3360	struct sockaddr_ll sll = (struct* sockaddr_ll *)uaddr;
3361	struct sock *sk = sock->sk;
3362
3363	/*
3364	* Check legality
3365	*/
3366
3367	if (addr_len < sizeof(struct sockaddr_ll))
3368	return -EINVAL;
3369	if (sll->sll_family != AF_PACKET)
3370	return -EINVAL;
3371
3372	return packet_do_bind(sk, NULL, ifindex: sll->sll_ifindex, proto: sll->sll_protocol);
3373	}
3374
3375	static struct proto packet_proto = {
3376	.name = "PACKET",
3377	.owner = THIS_MODULE,
3378	.obj_size = sizeof(struct packet_sock),
3379	};
3380
3381	/*
3382	* Create a packet of type SOCK_PACKET.
3383	*/
3384
3385	static int packet_create(struct net net, struct* socket sock, int* protocol,
3386	int kern)
3387	{
3388	struct sock *sk;
3389	struct packet_sock *po;
3390	__be16 proto = (__force __be16)protocol; / weird, but documented /
3391	int err;
3392
3393	if (!ns_capable(ns: net->user_ns, CAP_NET_RAW))
3394	return -EPERM;
3395	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3396	sock->type != SOCK_PACKET)
3397	return -ESOCKTNOSUPPORT;
3398
3399	sock->state = SS_UNCONNECTED;
3400
3401	err = -ENOBUFS;
3402	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, prot: &packet_proto, kern);
3403	if (sk == NULL)
3404	goto out;
3405
3406	sock->ops = &packet_ops;
3407	if (sock->type == SOCK_PACKET)
3408	sock->ops = &packet_ops_spkt;
3409
3410	po = pkt_sk(sk);
3411	err = packet_alloc_pending(po);
3412	if (err)
3413	goto out_sk_free;
3414
3415	sock_init_data(sock, sk);
3416
3417	init_completion(x: &po->skb_completion);
3418	sk->sk_family = PF_PACKET;
3419	po->num = proto;
3420
3421	packet_cached_dev_reset(po);
3422
3423	sk->sk_destruct = packet_sock_destruct;
3424
3425	/*
3426	* Attach a protocol block
3427	*/
3428
3429	spin_lock_init(&po->bind_lock);
3430	mutex_init(&po->pg_vec_lock);
3431	po->rollover = NULL;
3432	po->prot_hook.func = packet_rcv;
3433
3434	if (sock->type == SOCK_PACKET)
3435	po->prot_hook.func = packet_rcv_spkt;
3436
3437	po->prot_hook.af_packet_priv = sk;
3438	po->prot_hook.af_packet_net = sock_net(sk);
3439
3440	if (proto) {
3441	po->prot_hook.type = proto;
3442	__register_prot_hook(sk);
3443	}
3444
3445	mutex_lock(&net->packet.sklist_lock);
3446	sk_add_node_tail_rcu(sk, list: &net->packet.sklist);
3447	mutex_unlock(lock: &net->packet.sklist_lock);
3448
3449	sock_prot_inuse_add(net, prot: &packet_proto, val: `1`);
3450
3451	return `0`;
3452	out_sk_free:
3453	sk_free(sk);
3454	out:
3455	return err;
3456	}
3457
3458	/*
3459	* Pull a packet from our receive queue and hand it to the user.
3460	* If necessary we block.
3461	*/
3462
3463	static int packet_recvmsg(struct socket sock, struct* msghdr *msg, size_t len,
3464	int flags)
3465	{
3466	struct sock *sk = sock->sk;
3467	struct sk_buff *skb;
3468	int copied, err;
3469	int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
3470	unsigned int origlen = `0`;
3471
3472	err = -EINVAL;
3473	if (flags & ~(MSG_PEEK\|MSG_DONTWAIT\|MSG_TRUNC\|MSG_CMSG_COMPAT\|MSG_ERRQUEUE))
3474	goto out;
3475
3476	#if 0
3477	/ What error should we return now? EUNATTACH? /
3478	if (pkt_sk(sk)->ifindex < `0`)
3479	return -ENODEV;
3480	#endif
3481
3482	if (flags & MSG_ERRQUEUE) {
3483	err = sock_recv_errqueue(sk, msg, len,
3484	SOL_PACKET, PACKET_TX_TIMESTAMP);
3485	goto out;
3486	}
3487
3488	/*
3489	* Call the generic datagram receiver. This handles all sorts
3490	* of horrible races and re-entrancy so we can forget about it
3491	* in the protocol layers.
3492	*
3493	* Now it will return ENETDOWN, if device have just gone down,
3494	* but then it will block.
3495	*/
3496
3497	skb = skb_recv_datagram(sk, flags, err: &err);
3498
3499	/*
3500	* An error occurred so return it. Because skb_recv_datagram()
3501	* handles the blocking we don't see and worry about blocking
3502	* retries.
3503	*/
3504
3505	if (skb == NULL)
3506	goto out;
3507
3508	packet_rcv_try_clear_pressure(pkt_sk(sk));
3509
3510	if (vnet_hdr_len) {
3511	err = packet_rcv_vnet(msg, skb, len: &len, vnet_hdr_sz: vnet_hdr_len);
3512	if (err)
3513	goto out_free;
3514	}
3515
3516	/ You lose any data beyond the buffer you gave. If it worries*
3517	* a user program they can ask the device for its MTU
3518	* anyway.
3519	*/
3520	copied = skb->len;
3521	if (copied > len) {
3522	copied = len;
3523	msg->msg_flags \|= MSG_TRUNC;
3524	}
3525
3526	err = skb_copy_datagram_msg(from: skb, offset: `0`, msg, size: copied);
3527	if (err)
3528	goto out_free;
3529
3530	if (sock->type != SOCK_PACKET) {
3531	struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3532
3533	/ Original length was stored in sockaddr_ll fields /
3534	origlen = PACKET_SKB_CB(skb)->sa.origlen;
3535	sll->sll_family = AF_PACKET;
3536	sll->sll_protocol = (sock->type == SOCK_DGRAM) ?
3537	vlan_get_protocol_dgram(skb) : skb->protocol;
3538	}
3539
3540	sock_recv_cmsgs(msg, sk, skb);
3541
3542	if (msg->msg_name) {
3543	const size_t max_len = min(sizeof(skb->cb),
3544	sizeof(struct sockaddr_storage));
3545	int copy_len;
3546
3547	/ If the address length field is there to be filled*
3548	* in, we fill it in now.
3549	*/
3550	if (sock->type == SOCK_PACKET) {
3551	__sockaddr_check_size(sizeof(struct sockaddr_pkt));
3552	msg->msg_namelen = sizeof(struct sockaddr_pkt);
3553	copy_len = msg->msg_namelen;
3554	} else {
3555	struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3556
3557	msg->msg_namelen = sll->sll_halen +
3558	offsetof(struct sockaddr_ll, sll_addr);
3559	copy_len = msg->msg_namelen;
3560	if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3561	memset(msg->msg_name +
3562	offsetof(struct sockaddr_ll, sll_addr),
3563	`0`, sizeof(sll->sll_addr));
3564	msg->msg_namelen = sizeof(struct sockaddr_ll);
3565	}
3566	}
3567	if (WARN_ON_ONCE(copy_len > max_len)) {
3568	copy_len = max_len;
3569	msg->msg_namelen = copy_len;
3570	}
3571	memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3572	}
3573
3574	if (packet_sock_flag(pkt_sk(sk), flag: PACKET_SOCK_AUXDATA)) {
3575	struct tpacket_auxdata aux;
3576
3577	aux.tp_status = TP_STATUS_USER;
3578	if (skb->ip_summed == CHECKSUM_PARTIAL)
3579	aux.tp_status \|= TP_STATUS_CSUMNOTREADY;
3580	else if (skb->pkt_type != PACKET_OUTGOING &&
3581	skb_csum_unnecessary(skb))
3582	aux.tp_status \|= TP_STATUS_CSUM_VALID;
3583	if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
3584	aux.tp_status \|= TP_STATUS_GSO_TCP;
3585
3586	aux.tp_len = origlen;
3587	aux.tp_snaplen = skb->len;
3588	aux.tp_mac = `0`;
3589	aux.tp_net = skb_network_offset(skb);
3590	if (skb_vlan_tag_present(skb)) {
3591	aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3592	aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3593	aux.tp_status \|= TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
3594	} else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
3595	struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3596	struct net_device *dev;
3597
3598	rcu_read_lock();
3599	dev = dev_get_by_index_rcu(net: sock_net(sk), ifindex: sll->sll_ifindex);
3600	if (dev) {
3601	aux.tp_vlan_tci = vlan_get_tci(skb, dev);
3602	aux.tp_vlan_tpid = ntohs(skb->protocol);
3603	aux.tp_status \|= TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
3604	} else {
3605	aux.tp_vlan_tci = `0`;
3606	aux.tp_vlan_tpid = `0`;
3607	}
3608	rcu_read_unlock();
3609	} else {
3610	aux.tp_vlan_tci = `0`;
3611	aux.tp_vlan_tpid = `0`;
3612	}
3613	put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, len: sizeof(aux), data: &aux);
3614	}
3615
3616	/*
3617	* Free or return the buffer as appropriate. Again this
3618	* hides all the races and re-entrancy issues from us.
3619	*/
3620	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3621
3622	out_free:
3623	skb_free_datagram(sk, skb);
3624	out:
3625	return err;
3626	}
3627
3628	static int packet_getname_spkt(struct socket sock, struct* sockaddr *uaddr,
3629	int peer)
3630	{
3631	struct net_device *dev;
3632	struct sock *sk = sock->sk;
3633
3634	if (peer)
3635	return -EOPNOTSUPP;
3636
3637	uaddr->sa_family = AF_PACKET;
3638	memset(uaddr->sa_data, `0`, sizeof(uaddr->sa_data_min));
3639	rcu_read_lock();
3640	dev = dev_get_by_index_rcu(net: sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
3641	if (dev)
3642	strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min));
3643	rcu_read_unlock();
3644
3645	return sizeof(*uaddr);
3646	}
3647
3648	static int packet_getname(struct socket sock, struct* sockaddr *uaddr,
3649	int peer)
3650	{
3651	struct net_device *dev;
3652	struct sock *sk = sock->sk;
3653	struct packet_sock *po = pkt_sk(sk);
3654	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3655	int ifindex;
3656
3657	if (peer)
3658	return -EOPNOTSUPP;
3659
3660	ifindex = READ_ONCE(po->ifindex);
3661	sll->sll_family = AF_PACKET;
3662	sll->sll_ifindex = ifindex;
3663	sll->sll_protocol = READ_ONCE(po->num);
3664	sll->sll_pkttype = `0`;
3665	rcu_read_lock();
3666	dev = dev_get_by_index_rcu(net: sock_net(sk), ifindex);
3667	if (dev) {
3668	sll->sll_hatype = dev->type;
3669	sll->sll_halen = dev->addr_len;
3670
3671	/ Let __fortify_memcpy_chk() know the actual buffer size. /
3672	memcpy(((struct sockaddr_storage *)sll)->__data +
3673	offsetof(struct sockaddr_ll, sll_addr) -
3674	offsetofend(struct sockaddr_ll, sll_family),
3675	dev->dev_addr, dev->addr_len);
3676	} else {
3677	sll->sll_hatype = `0`; / Bad: we have no ARPHRD_UNSPEC /
3678	sll->sll_halen = `0`;
3679	}
3680	rcu_read_unlock();
3681
3682	return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3683	}
3684
3685	static int packet_dev_mc(struct net_device dev, struct* packet_mclist *i,
3686	int what)
3687	{
3688	switch (i->type) {
3689	case PACKET_MR_MULTICAST:
3690	if (i->alen != dev->addr_len)
3691	return -EINVAL;
3692	if (what > `0`)
3693	return dev_mc_add(dev, addr: i->addr);
3694	else
3695	return dev_mc_del(dev, addr: i->addr);
3696	break;
3697	case PACKET_MR_PROMISC:
3698	return dev_set_promiscuity(dev, inc: what);
3699	case PACKET_MR_ALLMULTI:
3700	return dev_set_allmulti(dev, inc: what);
3701	case PACKET_MR_UNICAST:
3702	if (i->alen != dev->addr_len)
3703	return -EINVAL;
3704	if (what > `0`)
3705	return dev_uc_add(dev, addr: i->addr);
3706	else
3707	return dev_uc_del(dev, addr: i->addr);
3708	break;
3709	default:
3710	break;
3711	}
3712	return `0`;
3713	}
3714
3715	static void packet_dev_mclist_delete(struct net_device *dev,
3716	struct packet_mclist **mlp,
3717	struct list_head *list)
3718	{
3719	struct packet_mclist *ml;
3720
3721	while ((ml = *mlp) != NULL) {
3722	if (ml->ifindex == dev->ifindex) {
3723	list_add(new: &ml->remove_list, head: list);
3724	*mlp = ml->next;
3725	} else
3726	mlp = &ml->next;
3727	}
3728	}
3729
3730	static int packet_mc_add(struct sock sk, struct* packet_mreq_max *mreq)
3731	{
3732	struct packet_sock *po = pkt_sk(sk);
3733	struct packet_mclist ml, i;
3734	struct net_device *dev;
3735	int err;
3736
3737	rtnl_lock();
3738
3739	err = -ENODEV;
3740	dev = __dev_get_by_index(net: sock_net(sk), ifindex: mreq->mr_ifindex);
3741	if (!dev)
3742	goto done;
3743
3744	err = -EINVAL;
3745	if (mreq->mr_alen > dev->addr_len)
3746	goto done;
3747
3748	err = -ENOBUFS;
3749	i = kmalloc(sizeof(*i), GFP_KERNEL);
3750	if (i == NULL)
3751	goto done;
3752
3753	err = `0`;
3754	for (ml = po->mclist; ml; ml = ml->next) {
3755	if (ml->ifindex == mreq->mr_ifindex &&
3756	ml->type == mreq->mr_type &&
3757	ml->alen == mreq->mr_alen &&
3758	memcmp(p: ml->addr, q: mreq->mr_address, size: ml->alen) == `0`) {
3759	ml->count++;
3760	/ Free the new element ... /
3761	kfree(objp: i);
3762	goto done;
3763	}
3764	}
3765
3766	i->type = mreq->mr_type;
3767	i->ifindex = mreq->mr_ifindex;
3768	i->alen = mreq->mr_alen;
3769	memcpy(i->addr, mreq->mr_address, i->alen);
3770	memset(i->addr + i->alen, `0`, sizeof(i->addr) - i->alen);
3771	i->count = `1`;
3772	INIT_LIST_HEAD(list: &i->remove_list);
3773	i->next = po->mclist;
3774	po->mclist = i;
3775	err = packet_dev_mc(dev, i, what: `1`);
3776	if (err) {
3777	po->mclist = i->next;
3778	kfree(objp: i);
3779	}
3780
3781	done:
3782	rtnl_unlock();
3783	return err;
3784	}
3785
3786	static int packet_mc_drop(struct sock sk, struct* packet_mreq_max *mreq)
3787	{
3788	struct packet_mclist ml, *mlp;
3789
3790	rtnl_lock();
3791
3792	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3793	if (ml->ifindex == mreq->mr_ifindex &&
3794	ml->type == mreq->mr_type &&
3795	ml->alen == mreq->mr_alen &&
3796	memcmp(p: ml->addr, q: mreq->mr_address, size: ml->alen) == `0`) {
3797	if (--ml->count == `0`) {
3798	struct net_device *dev;
3799	*mlp = ml->next;
3800	dev = __dev_get_by_index(net: sock_net(sk), ifindex: ml->ifindex);
3801	if (dev)
3802	packet_dev_mc(dev, i: ml, what: -`1`);
3803	kfree(objp: ml);
3804	}
3805	break;
3806	}
3807	}
3808	rtnl_unlock();
3809	return `0`;
3810	}
3811
3812	static void packet_flush_mclist(struct sock *sk)
3813	{
3814	struct packet_sock *po = pkt_sk(sk);
3815	struct packet_mclist *ml;
3816
3817	if (!po->mclist)
3818	return;
3819
3820	rtnl_lock();
3821	while ((ml = po->mclist) != NULL) {
3822	struct net_device *dev;
3823
3824	po->mclist = ml->next;
3825	dev = __dev_get_by_index(net: sock_net(sk), ifindex: ml->ifindex);
3826	if (dev != NULL)
3827	packet_dev_mc(dev, i: ml, what: -`1`);
3828	kfree(objp: ml);
3829	}
3830	rtnl_unlock();
3831	}
3832
3833	static int
3834	packet_setsockopt(struct socket sock, int* level, int optname, sockptr_t optval,
3835	unsigned int optlen)
3836	{
3837	struct sock *sk = sock->sk;
3838	struct packet_sock *po = pkt_sk(sk);
3839	int ret;
3840
3841	if (level != SOL_PACKET)
3842	return -ENOPROTOOPT;
3843
3844	switch (optname) {
3845	case PACKET_ADD_MEMBERSHIP:
3846	case PACKET_DROP_MEMBERSHIP:
3847	{
3848	struct packet_mreq_max mreq;
3849	int len = optlen;
3850	memset(&mreq, `0`, sizeof(mreq));
3851	if (len < sizeof(struct packet_mreq))
3852	return -EINVAL;
3853	if (len > sizeof(mreq))
3854	len = sizeof(mreq);
3855	if (copy_from_sockptr(dst: &mreq, src: optval, size: len))
3856	return -EFAULT;
3857	if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3858	return -EINVAL;
3859	if (optname == PACKET_ADD_MEMBERSHIP)
3860	ret = packet_mc_add(sk, mreq: &mreq);
3861	else
3862	ret = packet_mc_drop(sk, mreq: &mreq);
3863	return ret;
3864	}
3865
3866	case PACKET_RX_RING:
3867	case PACKET_TX_RING:
3868	{
3869	union tpacket_req_u req_u;
3870
3871	ret = -EINVAL;
3872	lock_sock(sk);
3873	switch (po->tp_version) {
3874	case TPACKET_V1:
3875	case TPACKET_V2:
3876	if (optlen < sizeof(req_u.req))
3877	break;
3878	ret = copy_from_sockptr(dst: &req_u.req, src: optval,
3879	size: sizeof(req_u.req)) ?
3880	-EINVAL : `0`;
3881	break;
3882	case TPACKET_V3:
3883	default:
3884	if (optlen < sizeof(req_u.req3))
3885	break;
3886	ret = copy_from_sockptr(dst: &req_u.req3, src: optval,
3887	size: sizeof(req_u.req3)) ?
3888	-EINVAL : `0`;
3889	break;
3890	}
3891	if (!ret)
3892	ret = packet_set_ring(sk, req_u: &req_u, closing: `0`,
3893	tx_ring: optname == PACKET_TX_RING);
3894	release_sock(sk);
3895	return ret;
3896	}
3897	case PACKET_COPY_THRESH:
3898	{
3899	int val;
3900
3901	if (optlen != sizeof(val))
3902	return -EINVAL;
3903	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
3904	return -EFAULT;
3905
3906	WRITE_ONCE(pkt_sk(sk)->copy_thresh, val);
3907	return `0`;
3908	}
3909	case PACKET_VERSION:
3910	{
3911	int val;
3912
3913	if (optlen != sizeof(val))
3914	return -EINVAL;
3915	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
3916	return -EFAULT;
3917	switch (val) {
3918	case TPACKET_V1:
3919	case TPACKET_V2:
3920	case TPACKET_V3:
3921	break;
3922	default:
3923	return -EINVAL;
3924	}
3925	lock_sock(sk);
3926	if (po->rx_ring.pg_vec \|\| po->tx_ring.pg_vec) {
3927	ret = -EBUSY;
3928	} else {
3929	po->tp_version = val;
3930	ret = `0`;
3931	}
3932	release_sock(sk);
3933	return ret;
3934	}
3935	case PACKET_RESERVE:
3936	{
3937	unsigned int val;
3938
3939	if (optlen != sizeof(val))
3940	return -EINVAL;
3941	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
3942	return -EFAULT;
3943	if (val > INT_MAX)
3944	return -EINVAL;
3945	lock_sock(sk);
3946	if (po->rx_ring.pg_vec \|\| po->tx_ring.pg_vec) {
3947	ret = -EBUSY;
3948	} else {
3949	po->tp_reserve = val;
3950	ret = `0`;
3951	}
3952	release_sock(sk);
3953	return ret;
3954	}
3955	case PACKET_LOSS:
3956	{
3957	unsigned int val;
3958
3959	if (optlen != sizeof(val))
3960	return -EINVAL;
3961	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
3962	return -EFAULT;
3963
3964	lock_sock(sk);
3965	if (po->rx_ring.pg_vec \|\| po->tx_ring.pg_vec) {
3966	ret = -EBUSY;
3967	} else {
3968	packet_sock_flag_set(po, flag: PACKET_SOCK_TP_LOSS, val);
3969	ret = `0`;
3970	}
3971	release_sock(sk);
3972	return ret;
3973	}
3974	case PACKET_AUXDATA:
3975	{
3976	int val;
3977
3978	if (optlen < sizeof(val))
3979	return -EINVAL;
3980	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
3981	return -EFAULT;
3982
3983	packet_sock_flag_set(po, flag: PACKET_SOCK_AUXDATA, val);
3984	return `0`;
3985	}
3986	case PACKET_ORIGDEV:
3987	{
3988	int val;
3989
3990	if (optlen < sizeof(val))
3991	return -EINVAL;
3992	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
3993	return -EFAULT;
3994
3995	packet_sock_flag_set(po, flag: PACKET_SOCK_ORIGDEV, val);
3996	return `0`;
3997	}
3998	case PACKET_VNET_HDR:
3999	case PACKET_VNET_HDR_SZ:
4000	{
4001	int val, hdr_len;
4002
4003	if (sock->type != SOCK_RAW)
4004	return -EINVAL;
4005	if (optlen < sizeof(val))
4006	return -EINVAL;
4007	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
4008	return -EFAULT;
4009
4010	if (optname == PACKET_VNET_HDR_SZ) {
4011	if (val && val != sizeof(struct virtio_net_hdr) &&
4012	val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
4013	return -EINVAL;
4014	hdr_len = val;
4015	} else {
4016	hdr_len = val ? sizeof(struct virtio_net_hdr) : `0`;
4017	}
4018	lock_sock(sk);
4019	if (po->rx_ring.pg_vec \|\| po->tx_ring.pg_vec) {
4020	ret = -EBUSY;
4021	} else {
4022	WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
4023	ret = `0`;
4024	}
4025	release_sock(sk);
4026	return ret;
4027	}
4028	case PACKET_TIMESTAMP:
4029	{
4030	int val;
4031
4032	if (optlen != sizeof(val))
4033	return -EINVAL;
4034	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
4035	return -EFAULT;
4036
4037	WRITE_ONCE(po->tp_tstamp, val);
4038	return `0`;
4039	}
4040	case PACKET_FANOUT:
4041	{
4042	struct fanout_args args = { `0` };
4043
4044	if (optlen != sizeof(int) && optlen != sizeof(args))
4045	return -EINVAL;
4046	if (copy_from_sockptr(dst: &args, src: optval, size: optlen))
4047	return -EFAULT;
4048
4049	return fanout_add(sk, args: &args);
4050	}
4051	case PACKET_FANOUT_DATA:
4052	{
4053	/ Paired with the WRITE_ONCE() in fanout_add() /
4054	if (!READ_ONCE(po->fanout))
4055	return -EINVAL;
4056
4057	return fanout_set_data(po, data: optval, len: optlen);
4058	}
4059	case PACKET_IGNORE_OUTGOING:
4060	{
4061	int val;
4062
4063	if (optlen != sizeof(val))
4064	return -EINVAL;
4065	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
4066	return -EFAULT;
4067	if (val < `0` \|\| val > `1`)
4068	return -EINVAL;
4069
4070	WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val);
4071	return `0`;
4072	}
4073	case PACKET_TX_HAS_OFF:
4074	{
4075	unsigned int val;
4076
4077	if (optlen != sizeof(val))
4078	return -EINVAL;
4079	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
4080	return -EFAULT;
4081
4082	lock_sock(sk);
4083	if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
4084	packet_sock_flag_set(po, flag: PACKET_SOCK_TX_HAS_OFF, val);
4085
4086	release_sock(sk);
4087	return `0`;
4088	}
4089	case PACKET_QDISC_BYPASS:
4090	{
4091	int val;
4092
4093	if (optlen != sizeof(val))
4094	return -EINVAL;
4095	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
4096	return -EFAULT;
4097
4098	packet_sock_flag_set(po, flag: PACKET_SOCK_QDISC_BYPASS, val);
4099	return `0`;
4100	}
4101	default:
4102	return -ENOPROTOOPT;
4103	}
4104	}
4105
4106	static int packet_getsockopt(struct socket sock, int* level, int optname,
4107	char __user optval, int* __user *optlen)
4108	{
4109	int len;
4110	int val, lv = sizeof(val);
4111	struct sock *sk = sock->sk;
4112	struct packet_sock *po = pkt_sk(sk);
4113	void *data = &val;
4114	union tpacket_stats_u st;
4115	struct tpacket_rollover_stats rstats;
4116	int drops;
4117
4118	if (level != SOL_PACKET)
4119	return -ENOPROTOOPT;
4120
4121	if (get_user(len, optlen))
4122	return -EFAULT;
4123
4124	if (len < `0`)
4125	return -EINVAL;
4126
4127	switch (optname) {
4128	case PACKET_STATISTICS:
4129	spin_lock_bh(lock: &sk->sk_receive_queue.lock);
4130	memcpy(&st, &po->stats, sizeof(st));
4131	memset(&po->stats, `0`, sizeof(po->stats));
4132	spin_unlock_bh(lock: &sk->sk_receive_queue.lock);
4133	drops = atomic_xchg(v: &po->tp_drops, new: `0`);
4134
4135	if (po->tp_version == TPACKET_V3) {
4136	lv = sizeof(struct tpacket_stats_v3);
4137	st.stats3.tp_drops = drops;
4138	st.stats3.tp_packets += drops;
4139	data = &st.stats3;
4140	} else {
4141	lv = sizeof(struct tpacket_stats);
4142	st.stats1.tp_drops = drops;
4143	st.stats1.tp_packets += drops;
4144	data = &st.stats1;
4145	}
4146
4147	break;
4148	case PACKET_AUXDATA:
4149	val = packet_sock_flag(po, flag: PACKET_SOCK_AUXDATA);
4150	break;
4151	case PACKET_ORIGDEV:
4152	val = packet_sock_flag(po, flag: PACKET_SOCK_ORIGDEV);
4153	break;
4154	case PACKET_VNET_HDR:
4155	val = !!READ_ONCE(po->vnet_hdr_sz);
4156	break;
4157	case PACKET_VNET_HDR_SZ:
4158	val = READ_ONCE(po->vnet_hdr_sz);
4159	break;
4160	case PACKET_COPY_THRESH:
4161	val = READ_ONCE(pkt_sk(sk)->copy_thresh);
4162	break;
4163	case PACKET_VERSION:
4164	val = po->tp_version;
4165	break;
4166	case PACKET_HDRLEN:
4167	if (len > sizeof(int))
4168	len = sizeof(int);
4169	if (len < sizeof(int))
4170	return -EINVAL;
4171	if (copy_from_user(to: &val, from: optval, n: len))
4172	return -EFAULT;
4173	switch (val) {
4174	case TPACKET_V1:
4175	val = sizeof(struct tpacket_hdr);
4176	break;
4177	case TPACKET_V2:
4178	val = sizeof(struct tpacket2_hdr);
4179	break;
4180	case TPACKET_V3:
4181	val = sizeof(struct tpacket3_hdr);
4182	break;
4183	default:
4184	return -EINVAL;
4185	}
4186	break;
4187	case PACKET_RESERVE:
4188	val = po->tp_reserve;
4189	break;
4190	case PACKET_LOSS:
4191	val = packet_sock_flag(po, flag: PACKET_SOCK_TP_LOSS);
4192	break;
4193	case PACKET_TIMESTAMP:
4194	val = READ_ONCE(po->tp_tstamp);
4195	break;
4196	case PACKET_FANOUT:
4197	val = (po->fanout ?
4198	((u32)po->fanout->id \|
4199	((u32)po->fanout->type << `16`) \|
4200	((u32)po->fanout->flags << `24`)) :
4201	`0`);
4202	break;
4203	case PACKET_IGNORE_OUTGOING:
4204	val = READ_ONCE(po->prot_hook.ignore_outgoing);
4205	break;
4206	case PACKET_ROLLOVER_STATS:
4207	if (!po->rollover)
4208	return -EINVAL;
4209	rstats.tp_all = atomic_long_read(v: &po->rollover->num);
4210	rstats.tp_huge = atomic_long_read(v: &po->rollover->num_huge);
4211	rstats.tp_failed = atomic_long_read(v: &po->rollover->num_failed);
4212	data = &rstats;
4213	lv = sizeof(rstats);
4214	break;
4215	case PACKET_TX_HAS_OFF:
4216	val = packet_sock_flag(po, flag: PACKET_SOCK_TX_HAS_OFF);
4217	break;
4218	case PACKET_QDISC_BYPASS:
4219	val = packet_sock_flag(po, flag: PACKET_SOCK_QDISC_BYPASS);
4220	break;
4221	default:
4222	return -ENOPROTOOPT;
4223	}
4224
4225	if (len > lv)
4226	len = lv;
4227	if (put_user(len, optlen))
4228	return -EFAULT;
4229	if (copy_to_user(to: optval, from: data, n: len))
4230	return -EFAULT;
4231	return `0`;
4232	}
4233
4234	static int packet_notifier(struct notifier_block *this,
4235	unsigned long msg, void *ptr)
4236	{
4237	struct net_device *dev = netdev_notifier_info_to_dev(info: ptr);
4238	struct net *net = dev_net(dev);
4239	struct packet_mclist ml, tmp;
4240	LIST_HEAD(mclist);
4241	struct sock *sk;
4242
4243	rcu_read_lock();
4244	sk_for_each_rcu(sk, &net->packet.sklist) {
4245	struct packet_sock *po = pkt_sk(sk);
4246
4247	switch (msg) {
4248	case NETDEV_UNREGISTER:
4249	if (po->mclist)
4250	packet_dev_mclist_delete(dev, mlp: &po->mclist,
4251	list: &mclist);
4252	fallthrough;
4253
4254	case NETDEV_DOWN:
4255	if (dev->ifindex == po->ifindex) {
4256	spin_lock(lock: &po->bind_lock);
4257	if (packet_sock_flag(po, flag: PACKET_SOCK_RUNNING)) {
4258	__unregister_prot_hook(sk, sync: false);
4259	sk->sk_err = ENETDOWN;
4260	if (!sock_flag(sk, flag: SOCK_DEAD))
4261	sk_error_report(sk);
4262	}
4263	if (msg == NETDEV_UNREGISTER) {
4264	packet_cached_dev_reset(po);
4265	WRITE_ONCE(po->ifindex, -`1`);
4266	netdev_put(dev: po->prot_hook.dev,
4267	tracker: &po->prot_hook.dev_tracker);
4268	po->prot_hook.dev = NULL;
4269	}
4270	spin_unlock(lock: &po->bind_lock);
4271	}
4272	break;
4273	case NETDEV_UP:
4274	if (dev->ifindex == po->ifindex) {
4275	spin_lock(lock: &po->bind_lock);
4276	if (po->num)
4277	register_prot_hook(sk);
4278	spin_unlock(lock: &po->bind_lock);
4279	}
4280	break;
4281	}
4282	}
4283	rcu_read_unlock();
4284
4285	/ packet_dev_mc might grab instance locks so can't run under rcu /
4286	list_for_each_entry_safe(ml, tmp, &mclist, remove_list) {
4287	packet_dev_mc(dev, i: ml, what: -`1`);
4288	kfree(objp: ml);
4289	}
4290
4291	return NOTIFY_DONE;
4292	}
4293
4294
4295	static int packet_ioctl(struct socket sock, unsigned* int cmd,
4296	unsigned long arg)
4297	{
4298	struct sock *sk = sock->sk;
4299
4300	switch (cmd) {
4301	case SIOCOUTQ:
4302	{
4303	int amount = sk_wmem_alloc_get(sk);
4304
4305	return put_user(amount, (int __user *)arg);
4306	}
4307	case SIOCINQ:
4308	{
4309	struct sk_buff *skb;
4310	int amount = `0`;
4311
4312	spin_lock_bh(lock: &sk->sk_receive_queue.lock);
4313	skb = skb_peek(list_: &sk->sk_receive_queue);
4314	if (skb)
4315	amount = skb->len;
4316	spin_unlock_bh(lock: &sk->sk_receive_queue.lock);
4317	return put_user(amount, (int __user *)arg);
4318	}
4319	#ifdef CONFIG_INET
4320	case SIOCADDRT:
4321	case SIOCDELRT:
4322	case SIOCDARP:
4323	case SIOCGARP:
4324	case SIOCSARP:
4325	case SIOCGIFADDR:
4326	case SIOCSIFADDR:
4327	case SIOCGIFBRDADDR:
4328	case SIOCSIFBRDADDR:
4329	case SIOCGIFNETMASK:
4330	case SIOCSIFNETMASK:
4331	case SIOCGIFDSTADDR:
4332	case SIOCSIFDSTADDR:
4333	case SIOCSIFFLAGS:
4334	return inet_dgram_ops.ioctl(sock, cmd, arg);
4335	#endif
4336
4337	default:
4338	return -ENOIOCTLCMD;
4339	}
4340	return `0`;
4341	}
4342
4343	static __poll_t packet_poll(struct file file, struct* socket *sock,
4344	poll_table *wait)
4345	{
4346	struct sock *sk = sock->sk;
4347	struct packet_sock *po = pkt_sk(sk);
4348	__poll_t mask = datagram_poll(file, sock, wait);
4349
4350	spin_lock_bh(lock: &sk->sk_receive_queue.lock);
4351	if (po->rx_ring.pg_vec) {
4352	if (!packet_previous_rx_frame(po, rb: &po->rx_ring,
4353	TP_STATUS_KERNEL))
4354	mask \|= EPOLLIN \| EPOLLRDNORM;
4355	}
4356	packet_rcv_try_clear_pressure(po);
4357	spin_unlock_bh(lock: &sk->sk_receive_queue.lock);
4358	spin_lock_bh(lock: &sk->sk_write_queue.lock);
4359	if (po->tx_ring.pg_vec) {
4360	if (packet_current_frame(po, rb: &po->tx_ring, TP_STATUS_AVAILABLE))
4361	mask \|= EPOLLOUT \| EPOLLWRNORM;
4362	}
4363	spin_unlock_bh(lock: &sk->sk_write_queue.lock);
4364	return mask;
4365	}
4366
4367
4368	/ Dirty? Well, I still did not learn better way to account*
4369	* for user mmaps.
4370	*/
4371
4372	static void packet_mm_open(struct vm_area_struct *vma)
4373	{
4374	struct file *file = vma->vm_file;
4375	struct socket *sock = file->private_data;
4376	struct sock *sk = sock->sk;
4377
4378	if (sk)
4379	atomic_long_inc(v: &pkt_sk(sk)->mapped);
4380	}
4381
4382	static void packet_mm_close(struct vm_area_struct *vma)
4383	{
4384	struct file *file = vma->vm_file;
4385	struct socket *sock = file->private_data;
4386	struct sock *sk = sock->sk;
4387
4388	if (sk)
4389	atomic_long_dec(v: &pkt_sk(sk)->mapped);
4390	}
4391
4392	static const struct vm_operations_struct packet_mmap_ops = {
4393	.open = packet_mm_open,
4394	.close = packet_mm_close,
4395	};
4396
4397	static void free_pg_vec(struct pgv pg_vec, unsigned* int order,
4398	unsigned int len)
4399	{
4400	int i;
4401
4402	for (i = `0`; i < len; i++) {
4403	if (likely(pg_vec[i].buffer)) {
4404	if (is_vmalloc_addr(x: pg_vec[i].buffer))
4405	vfree(addr: pg_vec[i].buffer);
4406	else
4407	free_pages(addr: (unsigned long)pg_vec[i].buffer,
4408	order);
4409	pg_vec[i].buffer = NULL;
4410	}
4411	}
4412	kfree(objp: pg_vec);
4413	}
4414
4415	static char alloc_one_pg_vec_page(unsigned* long order)
4416	{
4417	char *buffer;
4418	gfp_t gfp_flags = GFP_KERNEL \| __GFP_COMP \|
4419	__GFP_ZERO \| __GFP_NOWARN \| __GFP_NORETRY;
4420
4421	buffer = (char *) __get_free_pages(gfp_flags, order);
4422	if (buffer)
4423	return buffer;
4424
4425	/ __get_free_pages failed, fall back to vmalloc /
4426	buffer = vzalloc(array_size((`1` << order), PAGE_SIZE));
4427	if (buffer)
4428	return buffer;
4429
4430	/ vmalloc failed, lets dig into swap here /
4431	gfp_flags &= ~__GFP_NORETRY;
4432	buffer = (char *) __get_free_pages(gfp_flags, order);
4433	if (buffer)
4434	return buffer;
4435
4436	/ complete and utter failure /
4437	return NULL;
4438	}
4439
4440	static struct pgv alloc_pg_vec(struct* tpacket_req req, int* order)
4441	{
4442	unsigned int block_nr = req->tp_block_nr;
4443	struct pgv *pg_vec;
4444	int i;
4445
4446	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL \| __GFP_NOWARN);
4447	if (unlikely(!pg_vec))
4448	goto out;
4449
4450	for (i = `0`; i < block_nr; i++) {
4451	pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4452	if (unlikely(!pg_vec[i].buffer))
4453	goto out_free_pgvec;
4454	}
4455
4456	out:
4457	return pg_vec;
4458
4459	out_free_pgvec:
4460	free_pg_vec(pg_vec, order, len: block_nr);
4461	pg_vec = NULL;
4462	goto out;
4463	}
4464
4465	static int packet_set_ring(struct sock sk, union* tpacket_req_u *req_u,
4466	int closing, int tx_ring)
4467	{
4468	struct pgv *pg_vec = NULL;
4469	struct packet_sock *po = pkt_sk(sk);
4470	unsigned long *rx_owner_map = NULL;
4471	int was_running, order = `0`;
4472	struct packet_ring_buffer *rb;
4473	struct sk_buff_head *rb_queue;
4474	__be16 num;
4475	int err;
4476	/ Added to avoid minimal code churn /
4477	struct tpacket_req *req = &req_u->req;
4478
4479	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4480	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4481
4482	err = -EBUSY;
4483	if (!closing) {
4484	if (atomic_long_read(v: &po->mapped))
4485	goto out;
4486	if (packet_read_pending(rb))
4487	goto out;
4488	}
4489
4490	if (req->tp_block_nr) {
4491	unsigned int min_frame_size;
4492
4493	/ Sanity tests and some calculations /
4494	err = -EBUSY;
4495	if (unlikely(rb->pg_vec))
4496	goto out;
4497
4498	switch (po->tp_version) {
4499	case TPACKET_V1:
4500	po->tp_hdrlen = TPACKET_HDRLEN;
4501	break;
4502	case TPACKET_V2:
4503	po->tp_hdrlen = TPACKET2_HDRLEN;
4504	break;
4505	case TPACKET_V3:
4506	po->tp_hdrlen = TPACKET3_HDRLEN;
4507	break;
4508	}
4509
4510	err = -EINVAL;
4511	if (unlikely((int)req->tp_block_size <= `0`))
4512	goto out;
4513	if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4514	goto out;
4515	min_frame_size = po->tp_hdrlen + po->tp_reserve;
4516	if (po->tp_version >= TPACKET_V3 &&
4517	req->tp_block_size <
4518	BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4519	goto out;
4520	if (unlikely(req->tp_frame_size < min_frame_size))
4521	goto out;
4522	if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - `1`)))
4523	goto out;
4524
4525	rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4526	if (unlikely(rb->frames_per_block == `0`))
4527	goto out;
4528	if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
4529	goto out;
4530	if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4531	req->tp_frame_nr))
4532	goto out;
4533
4534	err = -ENOMEM;
4535	order = get_order(size: req->tp_block_size);
4536	pg_vec = alloc_pg_vec(req, order);
4537	if (unlikely(!pg_vec))
4538	goto out;
4539	switch (po->tp_version) {
4540	case TPACKET_V3:
4541	/ Block transmit is not supported yet /
4542	if (!tx_ring) {
4543	init_prb_bdqc(po, rb, pg_vec, req_u);
4544	} else {
4545	struct tpacket_req3 *req3 = &req_u->req3;
4546
4547	if (req3->tp_retire_blk_tov \|\|
4548	req3->tp_sizeof_priv \|\|
4549	req3->tp_feature_req_word) {
4550	err = -EINVAL;
4551	goto out_free_pg_vec;
4552	}
4553	}
4554	break;
4555	default:
4556	if (!tx_ring) {
4557	rx_owner_map = bitmap_alloc(nbits: req->tp_frame_nr,
4558	GFP_KERNEL \| __GFP_NOWARN \| __GFP_ZERO);
4559	if (!rx_owner_map)
4560	goto out_free_pg_vec;
4561	}
4562	break;
4563	}
4564	}
4565	/ Done /
4566	else {
4567	err = -EINVAL;
4568	if (unlikely(req->tp_frame_nr))
4569	goto out;
4570	}
4571
4572
4573	/ Detach socket from network /
4574	spin_lock(lock: &po->bind_lock);
4575	was_running = packet_sock_flag(po, flag: PACKET_SOCK_RUNNING);
4576	num = po->num;
4577	if (was_running) {
4578	WRITE_ONCE(po->num, `0`);
4579	__unregister_prot_hook(sk, sync: false);
4580	}
4581	spin_unlock(lock: &po->bind_lock);
4582
4583	synchronize_net();
4584
4585	err = -EBUSY;
4586	mutex_lock(&po->pg_vec_lock);
4587	if (closing \|\| atomic_long_read(v: &po->mapped) == `0`) {
4588	err = `0`;
4589	spin_lock_bh(lock: &rb_queue->lock);
4590	swap(rb->pg_vec, pg_vec);
4591	if (po->tp_version <= TPACKET_V2)
4592	swap(rb->rx_owner_map, rx_owner_map);
4593	rb->frame_max = (req->tp_frame_nr - `1`);
4594	rb->head = `0`;
4595	rb->frame_size = req->tp_frame_size;
4596	spin_unlock_bh(lock: &rb_queue->lock);
4597
4598	swap(rb->pg_vec_order, order);
4599	swap(rb->pg_vec_len, req->tp_block_nr);
4600
4601	rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4602	po->prot_hook.func = (po->rx_ring.pg_vec) ?
4603	tpacket_rcv : packet_rcv;
4604	skb_queue_purge(list: rb_queue);
4605	if (atomic_long_read(v: &po->mapped))
4606	pr_err("packet_mmap: vma is busy: %ld\n",
4607	atomic_long_read(&po->mapped));
4608	}
4609	mutex_unlock(lock: &po->pg_vec_lock);
4610
4611	spin_lock(lock: &po->bind_lock);
4612	if (was_running) {
4613	WRITE_ONCE(po->num, num);
4614	register_prot_hook(sk);
4615	}
4616	spin_unlock(lock: &po->bind_lock);
4617	if (pg_vec && (po->tp_version > TPACKET_V2)) {
4618	/ Because we don't support block-based V3 on tx-ring /
4619	if (!tx_ring)
4620	prb_shutdown_retire_blk_timer(po, rb_queue);
4621	}
4622
4623	out_free_pg_vec:
4624	if (pg_vec) {
4625	bitmap_free(bitmap: rx_owner_map);
4626	free_pg_vec(pg_vec, order, len: req->tp_block_nr);
4627	}
4628	out:
4629	return err;
4630	}
4631
4632	static int packet_mmap(struct file file, struct* socket *sock,
4633	struct vm_area_struct *vma)
4634	{
4635	struct sock *sk = sock->sk;
4636	struct packet_sock *po = pkt_sk(sk);
4637	unsigned long size, expected_size;
4638	struct packet_ring_buffer *rb;
4639	unsigned long start;
4640	int err = -EINVAL;
4641	int i;
4642
4643	if (vma->vm_pgoff)
4644	return -EINVAL;
4645
4646	mutex_lock(&po->pg_vec_lock);
4647
4648	expected_size = `0`;
4649	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4650	if (rb->pg_vec) {
4651	expected_size += rb->pg_vec_len
4652	* rb->pg_vec_pages
4653	* PAGE_SIZE;
4654	}
4655	}
4656
4657	if (expected_size == `0`)
4658	goto out;
4659
4660	size = vma->vm_end - vma->vm_start;
4661	if (size != expected_size)
4662	goto out;
4663
4664	start = vma->vm_start;
4665	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4666	if (rb->pg_vec == NULL)
4667	continue;
4668
4669	for (i = `0`; i < rb->pg_vec_len; i++) {
4670	struct page *page;
4671	void *kaddr = rb->pg_vec[i].buffer;
4672	int pg_num;
4673
4674	for (pg_num = `0`; pg_num < rb->pg_vec_pages; pg_num++) {
4675	page = pgv_to_page(addr: kaddr);
4676	err = vm_insert_page(vma, addr: start, page);
4677	if (unlikely(err))
4678	goto out;
4679	start += PAGE_SIZE;
4680	kaddr += PAGE_SIZE;
4681	}
4682	}
4683	}
4684
4685	atomic_long_inc(v: &po->mapped);
4686	vma->vm_ops = &packet_mmap_ops;
4687	err = `0`;
4688
4689	out:
4690	mutex_unlock(lock: &po->pg_vec_lock);
4691	return err;
4692	}
4693
4694	static const struct proto_ops packet_ops_spkt = {
4695	.family = PF_PACKET,
4696	.owner = THIS_MODULE,
4697	.release = packet_release,
4698	.bind = packet_bind_spkt,
4699	.connect = sock_no_connect,
4700	.socketpair = sock_no_socketpair,
4701	.accept = sock_no_accept,
4702	.getname = packet_getname_spkt,
4703	.poll = datagram_poll,
4704	.ioctl = packet_ioctl,
4705	.gettstamp = sock_gettstamp,
4706	.listen = sock_no_listen,
4707	.shutdown = sock_no_shutdown,
4708	.sendmsg = packet_sendmsg_spkt,
4709	.recvmsg = packet_recvmsg,
4710	.mmap = sock_no_mmap,
4711	};
4712
4713	static const struct proto_ops packet_ops = {
4714	.family = PF_PACKET,
4715	.owner = THIS_MODULE,
4716	.release = packet_release,
4717	.bind = packet_bind,
4718	.connect = sock_no_connect,
4719	.socketpair = sock_no_socketpair,
4720	.accept = sock_no_accept,
4721	.getname = packet_getname,
4722	.poll = packet_poll,
4723	.ioctl = packet_ioctl,
4724	.gettstamp = sock_gettstamp,
4725	.listen = sock_no_listen,
4726	.shutdown = sock_no_shutdown,
4727	.setsockopt = packet_setsockopt,
4728	.getsockopt = packet_getsockopt,
4729	.sendmsg = packet_sendmsg,
4730	.recvmsg = packet_recvmsg,
4731	.mmap = packet_mmap,
4732	};
4733
4734	static const struct net_proto_family packet_family_ops = {
4735	.family = PF_PACKET,
4736	.create = packet_create,
4737	.owner = THIS_MODULE,
4738	};
4739
4740	static struct notifier_block packet_netdev_notifier = {
4741	.notifier_call = packet_notifier,
4742	};
4743
4744	#ifdef CONFIG_PROC_FS
4745
4746	static void packet_seq_start(struct* seq_file seq, loff_t pos)
4747	__acquires(RCU)
4748	{
4749	struct net *net = seq_file_net(seq);
4750
4751	rcu_read_lock();
4752	return seq_hlist_start_head_rcu(head: &net->packet.sklist, pos: *pos);
4753	}
4754
4755	static void packet_seq_next(struct* seq_file seq, void* v, loff_t pos)
4756	{
4757	struct net *net = seq_file_net(seq);
4758	return seq_hlist_next_rcu(v, head: &net->packet.sklist, ppos: pos);
4759	}
4760
4761	static void packet_seq_stop(struct seq_file seq, void* *v)
4762	__releases(RCU)
4763	{
4764	rcu_read_unlock();
4765	}
4766
4767	static int packet_seq_show(struct seq_file seq, void* *v)
4768	{
4769	if (v == SEQ_START_TOKEN)
4770	seq_printf(m: seq,
4771	fmt: "%*sRefCnt Type Proto Iface R Rmem User Inode\n",
4772	IS_ENABLED(CONFIG_64BIT) ? -`17` : -`9`, "sk");
4773	else {
4774	struct sock *s = sk_entry(node: v);
4775	const struct packet_sock *po = pkt_sk(s);
4776
4777	seq_printf(m: seq,
4778	fmt: "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4779	s,
4780	refcount_read(r: &s->sk_refcnt),
4781	s->sk_type,
4782	ntohs(READ_ONCE(po->num)),
4783	READ_ONCE(po->ifindex),
4784	packet_sock_flag(po, flag: PACKET_SOCK_RUNNING),
4785	atomic_read(v: &s->sk_rmem_alloc),
4786	from_kuid_munged(to: seq_user_ns(seq), uid: sock_i_uid(sk: s)),
4787	sock_i_ino(sk: s));
4788	}
4789
4790	return `0`;
4791	}
4792
4793	static const struct seq_operations packet_seq_ops = {
4794	.start = packet_seq_start,
4795	.next = packet_seq_next,
4796	.stop = packet_seq_stop,
4797	.show = packet_seq_show,
4798	};
4799	#endif
4800
4801	static int __net_init packet_net_init(struct net *net)
4802	{
4803	mutex_init(&net->packet.sklist_lock);
4804	INIT_HLIST_HEAD(&net->packet.sklist);
4805
4806	#ifdef CONFIG_PROC_FS
4807	if (!proc_create_net("packet", `0`, net->proc_net, &packet_seq_ops,
4808	sizeof(struct seq_net_private)))
4809	return -ENOMEM;
4810	#endif /* CONFIG_PROC_FS */
4811
4812	return `0`;
4813	}
4814
4815	static void __net_exit packet_net_exit(struct net *net)
4816	{
4817	remove_proc_entry("packet", net->proc_net);
4818	WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
4819	}
4820
4821	static struct pernet_operations packet_net_ops = {
4822	.init = packet_net_init,
4823	.exit = packet_net_exit,
4824	};
4825
4826
4827	static void __exit packet_exit(void)
4828	{
4829	sock_unregister(PF_PACKET);
4830	proto_unregister(prot: &packet_proto);
4831	unregister_netdevice_notifier(nb: &packet_netdev_notifier);
4832	unregister_pernet_subsys(&packet_net_ops);
4833	}
4834
4835	static int __init packet_init(void)
4836	{
4837	int rc;
4838
4839	rc = register_pernet_subsys(&packet_net_ops);
4840	if (rc)
4841	goto out;
4842	rc = register_netdevice_notifier(nb: &packet_netdev_notifier);
4843	if (rc)
4844	goto out_pernet;
4845	rc = proto_register(prot: &packet_proto, alloc_slab: `0`);
4846	if (rc)
4847	goto out_notifier;
4848	rc = sock_register(fam: &packet_family_ops);
4849	if (rc)
4850	goto out_proto;
4851
4852	return `0`;
4853
4854	out_proto:
4855	proto_unregister(prot: &packet_proto);
4856	out_notifier:
4857	unregister_netdevice_notifier(nb: &packet_netdev_notifier);
4858	out_pernet:
4859	unregister_pernet_subsys(&packet_net_ops);
4860	out:
4861	return rc;
4862	}
4863
4864	module_init(packet_init);
4865	module_exit(packet_exit);
4866	MODULE_DESCRIPTION("Packet socket support (AF_PACKET)");
4867	MODULE_LICENSE("GPL");
4868	MODULE_ALIAS_NETPROTO(PF_PACKET);
4869

source code of linux/net/packet/af_packet.c