// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/vmalloc.h>
#include <linux/skbuff.h>
#include <net/xdp_sock_drv.h>

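/* Ring the queue's doorbell by writing the producer index (big endian, as
 * the device expects) to this queue's slot in the doorbell BAR. The slot
 * index itself is stored big endian in the queue resources.
 */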
static inline void gve_tx_put_doorbell(struct gve_priv *priv,
				       struct gve_queue_resources *q_resources,
				       u32 val)
{
	iowrite32be(val, &priv->db_bar2[be32_to_cpu(q_resources->db_index)]);
}

void gve_xdp_tx_flush(struct gve_priv *priv, u32 xdp_qid)
{
	u32 tx_qid = gve_xdp_tx_queue_id(priv, xdp_qid);
	struct gve_tx_ring *tx = &priv->tx[tx_qid];

	gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
}

/* gvnic can only transmit from a Registered Segment.
 * We copy skb payloads into the registered segment before writing Tx
 * descriptors and ringing the Tx doorbell.
 *
 * gve_tx_fifo_* manages the Registered Segment as a FIFO - clients must
 * free allocations in the order they were allocated.
 */

static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_fifo *fifo)
{
	fifo->base = vmap(fifo->qpl->pages, fifo->qpl->num_entries, VM_MAP,
			  PAGE_KERNEL);
	if (unlikely(!fifo->base)) {
		netif_err(priv, drv, priv->dev, "Failed to vmap fifo, qpl_id = %d\n",
			  fifo->qpl->id);
		return -ENOMEM;
	}

	fifo->size = fifo->qpl->num_entries * PAGE_SIZE;
	atomic_set(&fifo->available, fifo->size);
	fifo->head = 0;
	return 0;
}

static void gve_tx_fifo_release(struct gve_priv *priv, struct gve_tx_fifo *fifo)
{
	WARN(atomic_read(&fifo->available) != fifo->size,
	     "Releasing non-empty fifo");

	vunmap(fifo->base);
}

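/* Returns the number of pad bytes needed to avoid splitting a fragment
 * across the end of the FIFO: 0 if @bytes fits before the end, otherwise
 * the bytes remaining up to the end of the FIFO, which the caller can skip
 * over so the fragment starts back at offset 0.
 */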
static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo,
					  size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_read(&fifo->available) <= bytes) ? false : true;
}

/* gve_tx_alloc_fifo - Allocate fragment(s) from Tx FIFO
 * @fifo: FIFO to allocate from
 * @bytes: Allocation size
 * @iov: Scatter-gather elements to fill with allocation fragment base/len
 *
 * Returns number of valid elements in iov[] or negative on error.
 *
 * Allocations from a given FIFO must be externally synchronized but concurrent
 * allocation and frees are allowed.
 */
static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
			     struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	u32 aligned_head;
	int nfrags = 0;

	if (!bytes)
		return 0;

	/* This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine
	 * because the FIFO head always starts aligned, and the FIFO's
	 * boundaries are aligned, so if there is space for the data, there
	 * is space for the padding to the next alignment.
	 */
	WARN(!gve_tx_fifo_can_alloc(fifo, bytes),
	     "Reached %s when there's not enough space in the fifo", __func__);

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/* If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of fifo */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = L1_CACHE_ALIGN(fifo->head);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_sub(bytes + padding, &fifo->available);
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return nfrags;
}

/* gve_tx_free_fifo - Return space to Tx FIFO
 * @fifo: FIFO to return fragments to
 * @bytes: Bytes to free
 */
static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add(bytes, &fifo->available);
}

static size_t gve_tx_clear_buffer_state(struct gve_tx_buffer_state *info)
{
	size_t space_freed = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(info->iov); i++) {
		space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
		info->iov[i].iov_len = 0;
		info->iov[i].iov_padding = 0;
	}
	return space_freed;
}

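/* Reap up to @to_do completed XDP descriptors: return any xdp_frames to
 * their originator, count xsk completions so the pool can be credited,
 * and release the FIFO space the packets were copied into.
 */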
static int gve_clean_xdp_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			      u32 to_do)
{
	struct gve_tx_buffer_state *info;
	u32 clean_end = tx->done + to_do;
	u64 pkts = 0, bytes = 0;
	size_t space_freed = 0;
	u32 xsk_complete = 0;
	u32 idx;

	for (; tx->done < clean_end; tx->done++) {
		idx = tx->done & tx->mask;
		info = &tx->info[idx];

		if (unlikely(!info->xdp.size))
			continue;

		bytes += info->xdp.size;
		pkts++;
		xsk_complete += info->xdp.is_xsk;

		info->xdp.size = 0;
		if (info->xdp_frame) {
			xdp_return_frame(info->xdp_frame);
			info->xdp_frame = NULL;
		}
		space_freed += gve_tx_clear_buffer_state(info);
	}

	gve_tx_free_fifo(&tx->tx_fifo, space_freed);
	if (xsk_complete > 0 && tx->xsk_pool)
		xsk_tx_completed(tx->xsk_pool, xsk_complete);
	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += bytes;
	tx->pkt_done += pkts;
	u64_stats_update_end(&tx->statss);
	return pkts;
}

static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			     u32 to_do, bool try_to_wake);

void gve_tx_stop_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
	struct gve_tx_ring *tx = &priv->tx[idx];

	if (!gve_tx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false);
	netdev_tx_reset_queue(tx->netdev_txq);
	gve_tx_remove_from_block(priv, idx);
}

static void gve_tx_free_ring_gqi(struct gve_priv *priv, struct gve_tx_ring *tx,
				 struct gve_tx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	int idx = tx->q_num;
	size_t bytes;
	u32 slots;

	slots = tx->mask + 1;
	dma_free_coherent(hdev, sizeof(*tx->q_resources),
			  tx->q_resources, tx->q_resources_bus);
	tx->q_resources = NULL;

	if (!tx->raw_addressing) {
		gve_tx_fifo_release(priv, &tx->tx_fifo);
		gve_unassign_qpl(cfg->qpl_cfg, tx->tx_fifo.qpl->id);
		tx->tx_fifo.qpl = NULL;
	}

	bytes = sizeof(*tx->desc) * slots;
	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
	tx->desc = NULL;

	vfree(tx->info);
	tx->info = NULL;

	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

void gve_tx_start_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
	struct gve_tx_ring *tx = &priv->tx[idx];

	gve_tx_add_to_block(priv, idx);

	tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll);
}

static int gve_tx_alloc_ring_gqi(struct gve_priv *priv,
				 struct gve_tx_alloc_rings_cfg *cfg,
				 struct gve_tx_ring *tx,
				 int idx)
{
	struct device *hdev = &priv->pdev->dev;
	size_t bytes;

	/* Make sure everything is zeroed to start */
	memset(tx, 0, sizeof(*tx));
	spin_lock_init(&tx->clean_lock);
	spin_lock_init(&tx->xdp_lock);
	tx->q_num = idx;

	tx->mask = cfg->ring_size - 1;

	/* alloc metadata */
	tx->info = vcalloc(cfg->ring_size, sizeof(*tx->info));
	if (!tx->info)
		return -ENOMEM;

	/* alloc tx queue */
	bytes = sizeof(*tx->desc) * cfg->ring_size;
	tx->desc = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
	if (!tx->desc)
		goto abort_with_info;

	tx->raw_addressing = cfg->raw_addressing;
	tx->dev = hdev;
	if (!tx->raw_addressing) {
		tx->tx_fifo.qpl = gve_assign_tx_qpl(cfg, idx);
		if (!tx->tx_fifo.qpl)
			goto abort_with_desc;
		/* map Tx FIFO */
		if (gve_tx_fifo_init(priv, &tx->tx_fifo))
			goto abort_with_qpl;
	}

	tx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*tx->q_resources),
				   &tx->q_resources_bus,
				   GFP_KERNEL);
	if (!tx->q_resources)
		goto abort_with_fifo;

	return 0;

abort_with_fifo:
	if (!tx->raw_addressing)
		gve_tx_fifo_release(priv, &tx->tx_fifo);
abort_with_qpl:
	if (!tx->raw_addressing)
		gve_unassign_qpl(cfg->qpl_cfg, tx->tx_fifo.qpl->id);
abort_with_desc:
	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
	tx->desc = NULL;
abort_with_info:
	vfree(tx->info);
	tx->info = NULL;
	return -ENOMEM;
}

int gve_tx_alloc_rings_gqi(struct gve_priv *priv,
			   struct gve_tx_alloc_rings_cfg *cfg)
{
	struct gve_tx_ring *tx = cfg->tx;
	int err = 0;
	int i, j;

	if (!cfg->raw_addressing && !cfg->qpls) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc QPL ring before allocing QPLs\n");
		return -EINVAL;
	}

	if (cfg->start_idx + cfg->num_rings > cfg->qcfg->max_queues) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc more than the max num of Tx rings\n");
		return -EINVAL;
	}

	if (cfg->start_idx == 0) {
		tx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_tx_ring),
			      GFP_KERNEL);
		if (!tx)
			return -ENOMEM;
	} else if (!tx) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc tx rings from a nonzero start idx without tx array\n");
		return -EINVAL;
	}

	for (i = cfg->start_idx; i < cfg->start_idx + cfg->num_rings; i++) {
		err = gve_tx_alloc_ring_gqi(priv, cfg, &tx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc tx ring=%d: err=%d\n",
				  i, err);
			goto cleanup;
		}
	}

	cfg->tx = tx;
	return 0;

cleanup:
	for (j = 0; j < i; j++)
		gve_tx_free_ring_gqi(priv, &tx[j], cfg);
	if (cfg->start_idx == 0)
		kvfree(tx);
	return err;
}

void gve_tx_free_rings_gqi(struct gve_priv *priv,
			   struct gve_tx_alloc_rings_cfg *cfg)
{
	struct gve_tx_ring *tx = cfg->tx;
	int i;

	if (!tx)
		return;

	for (i = cfg->start_idx; i < cfg->start_idx + cfg->num_rings; i++)
		gve_tx_free_ring_gqi(priv, &tx[i], cfg);

	if (cfg->start_idx == 0) {
		kvfree(tx);
		cfg->tx = NULL;
	}
}

/* gve_tx_avail - Calculates the number of slots available in the ring
 * @tx: tx ring to check
 *
 * Returns the number of slots available
 *
 * The capacity of the queue is mask + 1. We don't need to reserve an entry.
 **/
static inline u32 gve_tx_avail(struct gve_tx_ring *tx)
{
	return tx->mask + 1 - (tx->req - tx->done);
}

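/* Bytes of FIFO space needed to copy @skb out: the packet itself, plus any
 * end-of-FIFO padding needed to keep the header contiguous, plus the
 * alignment padding that follows the header.
 */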
static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx,
					      struct sk_buff *skb)
{
	int pad_bytes, align_hdr_pad;
	int bytes;
	int hlen;

	hlen = skb_is_gso(skb) ? skb_checksum_start_offset(skb) + tcp_hdrlen(skb) :
				 min_t(int, GVE_GQ_TX_MIN_PKT_DESC_BYTES, skb->len);

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo,
						   hlen);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = L1_CACHE_ALIGN(hlen) - hlen;
	bytes = align_hdr_pad + pad_bytes + skb->len;

	return bytes;
}

/* The most descriptors we could need is MAX_SKB_FRAGS + 4 :
 * 1 for each skb frag
 * 1 for the skb linear portion
 * 1 for when tcp hdr needs to be in separate descriptor
 * 1 if the payload wraps to the beginning of the FIFO
 * 1 for metadata descriptor
 */
#define MAX_TX_DESC_NEEDED	(MAX_SKB_FRAGS + 4)
static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info)
{
	if (info->skb) {
		dma_unmap_single(dev, dma_unmap_addr(info, dma),
				 dma_unmap_len(info, len),
				 DMA_TO_DEVICE);
		dma_unmap_len_set(info, len, 0);
	} else {
		dma_unmap_page(dev, dma_unmap_addr(info, dma),
			       dma_unmap_len(info, len),
			       DMA_TO_DEVICE);
		dma_unmap_len_set(info, len, 0);
	}
}

/* Check if sufficient resources (descriptor ring space, FIFO space) are
 * available to transmit the given number of bytes.
 */
static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	bool can_alloc = true;

	if (!tx->raw_addressing)
		can_alloc = gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required);

	return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && can_alloc);
}

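/* gve_maybe_stop_tx() cleans at most NAPI_POLL_WEIGHT descriptors per
 * attempt, which must be enough to free the worst-case descriptor count
 * for a single packet.
 */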
static_assert(NAPI_POLL_WEIGHT >= MAX_TX_DESC_NEEDED);

/* Stops the queue if the skb cannot be transmitted. */
static int gve_maybe_stop_tx(struct gve_priv *priv, struct gve_tx_ring *tx,
			     struct sk_buff *skb)
{
	int bytes_required = 0;
	u32 nic_done;
	u32 to_do;
	int ret;

	if (!tx->raw_addressing)
		bytes_required = gve_skb_fifo_bytes_required(tx, skb);

	if (likely(gve_can_tx(tx, bytes_required)))
		return 0;

	ret = -EBUSY;
	spin_lock(&tx->clean_lock);
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = nic_done - tx->done;

	/* Only try to clean if there is hope for TX */
	if (to_do + gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED) {
		if (to_do > 0) {
			to_do = min_t(u32, to_do, NAPI_POLL_WEIGHT);
			gve_clean_tx_done(priv, tx, to_do, false);
		}
		if (likely(gve_can_tx(tx, bytes_required)))
			ret = 0;
	}
	if (ret) {
		/* No space, so stop the queue */
		tx->stop_queue++;
		netif_tx_stop_queue(tx->netdev_txq);
	}
	spin_unlock(&tx->clean_lock);

	return ret;
}

static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc,
				 u16 csum_offset, u8 ip_summed, bool is_gso,
				 int l4_hdr_offset, u32 desc_cnt,
				 u16 hlen, u64 addr, u16 pkt_len)
{
	/* l4_hdr_offset and csum_offset are in units of 16-bit words */
	if (is_gso) {
		pkt_desc->pkt.type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->pkt.l4_csum_offset = csum_offset >> 1;
		pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (likely(ip_summed == CHECKSUM_PARTIAL)) {
		pkt_desc->pkt.type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->pkt.l4_csum_offset = csum_offset >> 1;
		pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->pkt.type_flags = GVE_TXD_STD;
		pkt_desc->pkt.l4_csum_offset = 0;
		pkt_desc->pkt.l4_hdr_offset = 0;
	}
	pkt_desc->pkt.desc_cnt = desc_cnt;
	pkt_desc->pkt.len = cpu_to_be16(pkt_len);
	pkt_desc->pkt.seg_len = cpu_to_be16(hlen);
	pkt_desc->pkt.seg_addr = cpu_to_be64(addr);
}

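/* The optional metadata descriptor carries the skb's L4 hash so the device
 * can keep this flow on a consistent egress path.
 */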
static void gve_tx_fill_mtd_desc(union gve_tx_desc *mtd_desc,
				 struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(mtd_desc->mtd) != sizeof(mtd_desc->pkt));

	mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT |
				   GVE_MTD_PATH_HASH_L4;
	mtd_desc->mtd.path_hash = cpu_to_be32(skb->hash);
	mtd_desc->mtd.reserved0 = 0;
	mtd_desc->mtd.reserved1 = 0;
}

static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc,
				 u16 l3_offset, u16 gso_size,
				 bool is_gso_v6, bool is_gso,
				 u16 len, u64 addr)
{
	seg_desc->seg.type_flags = GVE_TXD_SEG;
	if (is_gso) {
		if (is_gso_v6)
			seg_desc->seg.type_flags |= GVE_TXSF_IPV6;
		seg_desc->seg.l3_offset = l3_offset >> 1;
		seg_desc->seg.mss = cpu_to_be16(gso_size);
	}
	seg_desc->seg.seg_len = cpu_to_be16(len);
	seg_desc->seg.seg_addr = cpu_to_be64(addr);
}

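/* A FIFO fragment may span several QPL pages; sync each page the
 * [iov_offset, iov_offset + iov_len) range touches for device access.
 */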
static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses,
				    u64 iov_offset, u64 iov_len)
{
	u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	u64 first_page = iov_offset / PAGE_SIZE;
	u64 page;

	for (page = first_page; page <= last_page; page++)
		dma_sync_single_for_device(dev, page_buses[page], PAGE_SIZE, DMA_TO_DEVICE);
}

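/* Copy the skb into FIFO space and write descriptors for the QPL
 * (copy-based) datapath. Returns the number of descriptors posted.
 */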
static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, struct sk_buff *skb)
{
	int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;
	union gve_tx_desc *pkt_desc, *seg_desc;
	struct gve_tx_buffer_state *info;
	int mtd_desc_nr = !!skb->l4_hash;
	bool is_gso = skb_is_gso(skb);
	u32 idx = tx->req & tx->mask;
	int payload_iov = 2;
	int copy_offset;
	u32 next_idx;
	int i;

	info = &tx->info[idx];
	pkt_desc = &tx->desc[idx];

	l4_hdr_offset = skb_checksum_start_offset(skb);
	/* If the skb is gso, then we want the tcp header alone in the first segment
	 * otherwise we want the minimum required by the gVNIC spec.
	 */
	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) :
			min_t(int, GVE_GQ_TX_MIN_PKT_DESC_BYTES, skb->len);

	info->skb = skb;
	/* We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, hlen);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, hlen + pad_bytes,
				       &info->iov[0]);
	WARN(!hdr_nfrags, "hdr_nfrags should never be 0!");
	payload_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, skb->len - hlen,
					   &info->iov[payload_iov]);

	gve_tx_fill_pkt_desc(pkt_desc, skb->csum_offset, skb->ip_summed,
			     is_gso, l4_hdr_offset,
			     1 + mtd_desc_nr + payload_nfrags, hlen,
			     info->iov[hdr_nfrags - 1].iov_offset, skb->len);

	skb_copy_bits(skb, 0,
		      tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset,
		      hlen);
	gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,
				info->iov[hdr_nfrags - 1].iov_offset,
				info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = hlen;

	if (mtd_desc_nr) {
		next_idx = (tx->req + 1) & tx->mask;
		gve_tx_fill_mtd_desc(&tx->desc[next_idx], skb);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc[next_idx];

		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso,
				     info->iov[i].iov_len,
				     info->iov[i].iov_offset);

		skb_copy_bits(skb, copy_offset,
			      tx->tx_fifo.base + info->iov[i].iov_offset,
			      info->iov[i].iov_len);
		gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,
					info->iov[i].iov_offset,
					info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	return 1 + mtd_desc_nr + payload_nfrags;
}

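/* DMA-map the skb in place and write descriptors for the raw-addressing
 * (zero-copy) datapath. Returns the number of descriptors posted, or 0 if
 * the packet was dropped on a mapping error.
 */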
static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
				  struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	int hlen, num_descriptors, l4_hdr_offset;
	union gve_tx_desc *pkt_desc, *mtd_desc, *seg_desc;
	struct gve_tx_buffer_state *info;
	int mtd_desc_nr = !!skb->l4_hash;
	bool is_gso = skb_is_gso(skb);
	u32 idx = tx->req & tx->mask;
	u64 addr;
	u32 len;
	int i;

	info = &tx->info[idx];
	pkt_desc = &tx->desc[idx];

	l4_hdr_offset = skb_checksum_start_offset(skb);
	/* If the skb is gso, then we want only up to the tcp header in the first segment
	 * to efficiently replicate on each segment otherwise we want the linear portion
	 * of the skb (which will contain the checksum because skb->csum_start and
	 * skb->csum_offset are given relative to skb->head) in the first segment.
	 */
	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) : skb_headlen(skb);
	len = skb_headlen(skb);

	info->skb = skb;

	addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
	if (unlikely(dma_mapping_error(tx->dev, addr))) {
		tx->dma_mapping_error++;
		goto drop;
	}
	dma_unmap_len_set(info, len, len);
	dma_unmap_addr_set(info, dma, addr);

	num_descriptors = 1 + shinfo->nr_frags;
	if (hlen < len)
		num_descriptors++;
	if (mtd_desc_nr)
		num_descriptors++;

	gve_tx_fill_pkt_desc(pkt_desc, skb->csum_offset, skb->ip_summed,
			     is_gso, l4_hdr_offset,
			     num_descriptors, hlen, addr, skb->len);

	if (mtd_desc_nr) {
		idx = (idx + 1) & tx->mask;
		mtd_desc = &tx->desc[idx];
		gve_tx_fill_mtd_desc(mtd_desc, skb);
	}

	if (hlen < len) {
		/* For gso the rest of the linear portion of the skb needs to
		 * be in its own descriptor.
		 */
		len -= hlen;
		addr += hlen;
		idx = (idx + 1) & tx->mask;
		seg_desc = &tx->desc[idx];
		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso, len, addr);
	}

	for (i = 0; i < shinfo->nr_frags; i++) {
		const skb_frag_t *frag = &shinfo->frags[i];

		idx = (idx + 1) & tx->mask;
		seg_desc = &tx->desc[idx];
		len = skb_frag_size(frag);
		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr))) {
			tx->dma_mapping_error++;
			goto unmap_drop;
		}
		tx->info[idx].skb = NULL;
		dma_unmap_len_set(&tx->info[idx], len, len);
		dma_unmap_addr_set(&tx->info[idx], dma, addr);

		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso, len, addr);
	}

	return num_descriptors;

unmap_drop:
	i += num_descriptors - shinfo->nr_frags;
	while (i--) {
		/* Skip metadata descriptor, if set */
		if (i == 1 && mtd_desc_nr == 1)
			continue;
		idx--;
		gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]);
	}
drop:
	tx->dropped_pkt++;
	return 0;
}

netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;
	int nsegs;

	WARN(skb_get_queue_mapping(skb) >= priv->tx_cfg.num_queues,
	     "skb queue index out of range");
	tx = &priv->tx[skb_get_queue_mapping(skb)];
	if (unlikely(gve_maybe_stop_tx(priv, tx, skb))) {
		/* We need to ring the txq doorbell -- we have stopped the Tx
		 * queue for want of resources, but prior calls to gve_tx()
		 * may have added descriptors without ringing the doorbell.
		 */

		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
		return NETDEV_TX_BUSY;
	}
	if (tx->raw_addressing)
		nsegs = gve_tx_add_skb_no_copy(priv, tx, skb);
	else
		nsegs = gve_tx_add_skb_copy(priv, tx, skb);

	/* If the packet is getting sent, we need to update the skb */
	if (nsegs) {
		netdev_tx_sent_queue(tx->netdev_txq, skb->len);
		skb_tx_timestamp(skb);
		tx->req += nsegs;
	} else {
		dev_kfree_skb_any(skb);
	}

	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
		return NETDEV_TX_OK;

	/* Give packets to NIC. Even if this packet failed to send the doorbell
	 * might need to be rung because of xmit_more.
	 */
	gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
	return NETDEV_TX_OK;
}

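/* Copy an XDP buffer into FIFO space and write descriptors for it. If
 * fewer than GVE_GQ_TX_MIN_PKT_DESC_BYTES remain before the end of the
 * FIFO, pad past them so the first fragment is not too small; otherwise
 * let the copy wrap. Returns the number of descriptors written.
 */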
static int gve_tx_fill_xdp(struct gve_priv *priv, struct gve_tx_ring *tx,
			   void *data, int len, void *frame_p, bool is_xsk)
{
	int pad, nfrags, ndescs, iovi, offset;
	struct gve_tx_buffer_state *info;
	u32 reqi = tx->req;

	pad = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, len);
	if (pad >= GVE_GQ_TX_MIN_PKT_DESC_BYTES)
		pad = 0;
	info = &tx->info[reqi & tx->mask];
	info->xdp_frame = frame_p;
	info->xdp.size = len;
	info->xdp.is_xsk = is_xsk;

	nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, pad + len,
				   &info->iov[0]);
	iovi = pad > 0;
	ndescs = nfrags - iovi;
	offset = 0;

	while (iovi < nfrags) {
		if (!offset)
			gve_tx_fill_pkt_desc(&tx->desc[reqi & tx->mask], 0,
					     CHECKSUM_NONE, false, 0, ndescs,
					     info->iov[iovi].iov_len,
					     info->iov[iovi].iov_offset, len);
		else
			gve_tx_fill_seg_desc(&tx->desc[reqi & tx->mask],
					     0, 0, false, false,
					     info->iov[iovi].iov_len,
					     info->iov[iovi].iov_offset);

		memcpy(tx->tx_fifo.base + info->iov[iovi].iov_offset,
		       data + offset, info->iov[iovi].iov_len);
		gve_dma_sync_for_device(&priv->pdev->dev,
					tx->tx_fifo.qpl->page_buses,
					info->iov[iovi].iov_offset,
					info->iov[iovi].iov_len);
		offset += info->iov[iovi].iov_len;
		iovi++;
		reqi++;
	}

	return ndescs;
}

int gve_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
		 u32 flags)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;
	int i, err = 0, qid;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	qid = gve_xdp_tx_queue_id(priv,
				  smp_processor_id() % priv->num_xdp_queues);

	tx = &priv->tx[qid];

	spin_lock(&tx->xdp_lock);
	for (i = 0; i < n; i++) {
		err = gve_xdp_xmit_one(priv, tx, frames[i]->data,
				       frames[i]->len, frames[i]);
		if (err)
			break;
	}

	if (flags & XDP_XMIT_FLUSH)
		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);

	spin_unlock(&tx->xdp_lock);

	u64_stats_update_begin(&tx->statss);
	tx->xdp_xmit += n;
	tx->xdp_xmit_errors += n - i;
	u64_stats_update_end(&tx->statss);

	return i ? i : err;
}

int gve_xdp_xmit_one(struct gve_priv *priv, struct gve_tx_ring *tx,
		     void *data, int len, void *frame_p)
{
	int nsegs;

	if (!gve_can_tx(tx, len + GVE_GQ_TX_MIN_PKT_DESC_BYTES - 1))
		return -EBUSY;

	nsegs = gve_tx_fill_xdp(priv, tx, data, len, frame_p, false);
	tx->req += nsegs;

	return 0;
}

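/* FIFO bytes that must be allocatable (along with descriptor ring
 * headroom, via gve_can_tx()) before a stopped queue is woken or another
 * XSK descriptor is pulled.
 */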
#define GVE_TX_START_THRESH	4096

static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			     u32 to_do, bool try_to_wake)
{
	struct gve_tx_buffer_state *info;
	u64 pkts = 0, bytes = 0;
	size_t space_freed = 0;
	struct sk_buff *skb;
	u32 idx;
	int j;

	for (j = 0; j < to_do; j++) {
		idx = tx->done & tx->mask;
		netif_info(priv, tx_done, priv->dev,
			   "[%d] %s: idx=%d (req=%u done=%u)\n",
			   tx->q_num, __func__, idx, tx->req, tx->done);
		info = &tx->info[idx];
		skb = info->skb;

		/* Unmap the buffer */
		if (tx->raw_addressing)
			gve_tx_unmap_buf(tx->dev, info);
		tx->done++;
		/* Mark as free */
		if (skb) {
			info->skb = NULL;
			bytes += skb->len;
			pkts++;
			dev_consume_skb_any(skb);
			if (tx->raw_addressing)
				continue;
			space_freed += gve_tx_clear_buffer_state(info);
		}
	}

	if (!tx->raw_addressing)
		gve_tx_free_fifo(&tx->tx_fifo, space_freed);
	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += bytes;
	tx->pkt_done += pkts;
	u64_stats_update_end(&tx->statss);
	netdev_tx_completed_queue(tx->netdev_txq, pkts, bytes);

	/* start the queue if we've stopped it */
#ifndef CONFIG_BQL
	/* Make sure that the doorbells are synced */
	smp_mb();
#endif
	if (try_to_wake && netif_tx_queue_stopped(tx->netdev_txq) &&
	    likely(gve_can_tx(tx, GVE_TX_START_THRESH))) {
		tx->wake_queue++;
		netif_tx_wake_queue(tx->netdev_txq);
	}

	return pkts;
}

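/* The NIC reports Tx completion progress as a running descriptor count in
 * this queue's slot of the event counter array; the delta from tx->done
 * is the work left to clean.
 */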
u32 gve_tx_load_event_counter(struct gve_priv *priv,
			      struct gve_tx_ring *tx)
{
	u32 counter_index = be32_to_cpu(tx->q_resources->counter_index);
	__be32 counter = READ_ONCE(priv->counter_array[counter_index]);

	return be32_to_cpu(counter);
}

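/* Pull up to @budget descriptors from the XSK Tx ring, copy each into
 * FIFO space, and ring the doorbell once for the whole batch. Returns the
 * number of descriptors sent.
 */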
static int gve_xsk_tx(struct gve_priv *priv, struct gve_tx_ring *tx,
		      int budget)
{
	struct xdp_desc desc;
	int sent = 0, nsegs;
	void *data;

	spin_lock(&tx->xdp_lock);
	while (sent < budget) {
		if (!gve_can_tx(tx, GVE_TX_START_THRESH))
			goto out;

		if (!xsk_tx_peek_desc(tx->xsk_pool, &desc)) {
			tx->xdp_xsk_done = tx->xdp_xsk_wakeup;
			goto out;
		}

		data = xsk_buff_raw_get_data(tx->xsk_pool, desc.addr);
		nsegs = gve_tx_fill_xdp(priv, tx, data, desc.len, NULL, true);
		tx->req += nsegs;
		sent++;
	}
out:
	if (sent > 0) {
		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
		xsk_tx_release(tx->xsk_pool);
	}
	spin_unlock(&tx->xdp_lock);
	return sent;
}

bool gve_xdp_poll(struct gve_notify_block *block, int budget)
{
	struct gve_priv *priv = block->priv;
	struct gve_tx_ring *tx = block->tx;
	u32 nic_done;
	bool repoll;
	u32 to_do;

	/* Find out how much work there is to be done */
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = min_t(u32, (nic_done - tx->done), budget);
	gve_clean_xdp_done(priv, tx, to_do);
	repoll = nic_done != tx->done;

	if (tx->xsk_pool) {
		int sent = gve_xsk_tx(priv, tx, budget);

		u64_stats_update_begin(&tx->statss);
		tx->xdp_xsk_sent += sent;
		u64_stats_update_end(&tx->statss);
		repoll |= (sent == budget);
		if (xsk_uses_need_wakeup(tx->xsk_pool))
			xsk_set_tx_need_wakeup(tx->xsk_pool);
	}

	/* If we still have work we want to repoll */
	return repoll;
}

bool gve_tx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_priv *priv = block->priv;
	struct gve_tx_ring *tx = block->tx;
	u32 nic_done;
	u32 to_do;

	/* If budget is 0, do all the work */
	if (budget == 0)
		budget = INT_MAX;

	/* The xmit path may also clean completed packets in order to make
	 * room to transmit. Take the spin lock to avoid conflicting cleans;
	 * it yields better xmit/clean concurrency than the netif lock.
	 */
	spin_lock(&tx->clean_lock);
	/* Find out how much work there is to be done */
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = min_t(u32, (nic_done - tx->done), budget);
	gve_clean_tx_done(priv, tx, to_do, true);
	spin_unlock(&tx->clean_lock);
	/* If we still have work we want to repoll */
	return nic_done != tx->done;
}

bool gve_tx_clean_pending(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	u32 nic_done = gve_tx_load_event_counter(priv, tx);

	return nic_done != tx->done;
}
1030 | |