// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/vmalloc.h>
#include <linux/skbuff.h>
#include <net/xdp_sock_drv.h>

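/* Ring the queue's doorbell by writing the producer index (big endian, as
 * the device expects) to this queue's slot in the doorbell BAR. The slot
 * index itself is stored big endian in the queue resources.
 */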
static inline void gve_tx_put_doorbell(struct gve_priv *priv,
				       struct gve_queue_resources *q_resources,
				       u32 val)
{
	iowrite32be(val, &priv->db_bar2[be32_to_cpu(q_resources->db_index)]);
}

void gve_xdp_tx_flush(struct gve_priv *priv, u32 xdp_qid)
{
	u32 tx_qid = gve_xdp_tx_queue_id(priv, xdp_qid);
	struct gve_tx_ring *tx = &priv->tx[tx_qid];

	gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
}

/* gvnic can only transmit from a Registered Segment.
 * We copy skb payloads into the registered segment before writing Tx
 * descriptors and ringing the Tx doorbell.
 *
 * gve_tx_fifo_* manages the Registered Segment as a FIFO - clients must
 * free allocations in the order they were allocated.
 */

static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_fifo *fifo)
{
	fifo->base = vmap(fifo->qpl->pages, fifo->qpl->num_entries, VM_MAP,
			  PAGE_KERNEL);
	if (unlikely(!fifo->base)) {
		netif_err(priv, drv, priv->dev, "Failed to vmap fifo, qpl_id = %d\n",
			  fifo->qpl->id);
		return -ENOMEM;
	}

	fifo->size = fifo->qpl->num_entries * PAGE_SIZE;
	atomic_set(&fifo->available, fifo->size);
	fifo->head = 0;
	return 0;
}

static void gve_tx_fifo_release(struct gve_priv *priv, struct gve_tx_fifo *fifo)
{
	WARN(atomic_read(&fifo->available) != fifo->size,
	     "Releasing non-empty fifo");

	vunmap(fifo->base);
}

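/* Returns the number of pad bytes needed to avoid splitting a fragment
 * across the end of the FIFO: 0 if @bytes fits before the end, otherwise
 * the bytes remaining up to the end of the FIFO, which the caller can skip
 * over so the fragment starts back at offset 0.
 */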
static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo,
					  size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_read(&fifo->available) <= bytes) ? false : true;
}

/* gve_tx_alloc_fifo - Allocate fragment(s) from Tx FIFO
 * @fifo: FIFO to allocate from
 * @bytes: Allocation size
 * @iov: Scatter-gather elements to fill with allocation fragment base/len
 *
 * Returns number of valid elements in iov[] or negative on error.
 *
 * Allocations from a given FIFO must be externally synchronized but concurrent
 * allocation and frees are allowed.
 */
static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
			     struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	u32 aligned_head;
	int nfrags = 0;

	if (!bytes)
		return 0;

	/* This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine
	 * because the FIFO head always starts aligned, and the FIFO's
	 * boundaries are aligned, so if there is space for the data, there
	 * is space for the padding to the next alignment.
	 */
	WARN(!gve_tx_fifo_can_alloc(fifo, bytes),
	     "Reached %s when there's not enough space in the fifo", __func__);

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/* If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of fifo */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = L1_CACHE_ALIGN(fifo->head);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_sub(bytes + padding, &fifo->available);
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return nfrags;
}

/* gve_tx_free_fifo - Return space to Tx FIFO
 * @fifo: FIFO to return fragments to
 * @bytes: Bytes to free
 */
static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add(bytes, &fifo->available);
}

static size_t gve_tx_clear_buffer_state(struct gve_tx_buffer_state *info)
{
	size_t space_freed = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(info->iov); i++) {
		space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
		info->iov[i].iov_len = 0;
		info->iov[i].iov_padding = 0;
	}
	return space_freed;
}

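/* Reap up to @to_do completed XDP descriptors: return any xdp_frames to
 * their originator, count xsk completions so the pool can be credited,
 * and release the FIFO space the packets were copied into.
 */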
static int gve_clean_xdp_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			      u32 to_do)
{
	struct gve_tx_buffer_state *info;
	u32 clean_end = tx->done + to_do;
	u64 pkts = 0, bytes = 0;
	size_t space_freed = 0;
	u32 xsk_complete = 0;
	u32 idx;

	for (; tx->done < clean_end; tx->done++) {
		idx = tx->done & tx->mask;
		info = &tx->info[idx];

		if (unlikely(!info->xdp.size))
			continue;

		bytes += info->xdp.size;
		pkts++;
		xsk_complete += info->xdp.is_xsk;

		info->xdp.size = 0;
		if (info->xdp_frame) {
			xdp_return_frame(info->xdp_frame);
			info->xdp_frame = NULL;
		}
		space_freed += gve_tx_clear_buffer_state(info);
	}

	gve_tx_free_fifo(&tx->tx_fifo, space_freed);
	if (xsk_complete > 0 && tx->xsk_pool)
		xsk_tx_completed(tx->xsk_pool, xsk_complete);
	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += bytes;
	tx->pkt_done += pkts;
	u64_stats_update_end(&tx->statss);
	return pkts;
}

static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			     u32 to_do, bool try_to_wake);

void gve_tx_stop_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
	struct gve_tx_ring *tx = &priv->tx[idx];

	if (!gve_tx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false);
	netdev_tx_reset_queue(tx->netdev_txq);
	gve_tx_remove_from_block(priv, idx);
}

static void gve_tx_free_ring_gqi(struct gve_priv *priv, struct gve_tx_ring *tx,
				 struct gve_tx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	int idx = tx->q_num;
	size_t bytes;
	u32 slots;

	slots = tx->mask + 1;
	dma_free_coherent(hdev, sizeof(*tx->q_resources),
			  tx->q_resources, tx->q_resources_bus);
	tx->q_resources = NULL;

	if (!tx->raw_addressing) {
		gve_tx_fifo_release(priv, &tx->tx_fifo);
		gve_unassign_qpl(cfg->qpl_cfg, tx->tx_fifo.qpl->id);
		tx->tx_fifo.qpl = NULL;
	}

	bytes = sizeof(*tx->desc) * slots;
	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
	tx->desc = NULL;

	vfree(tx->info);
	tx->info = NULL;

	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

void gve_tx_start_ring_gqi(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
	struct gve_tx_ring *tx = &priv->tx[idx];

	gve_tx_add_to_block(priv, idx);

	tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll);
}

static int gve_tx_alloc_ring_gqi(struct gve_priv *priv,
				 struct gve_tx_alloc_rings_cfg *cfg,
				 struct gve_tx_ring *tx,
				 int idx)
{
	struct device *hdev = &priv->pdev->dev;
	size_t bytes;

	/* Make sure everything is zeroed to start */
	memset(tx, 0, sizeof(*tx));
	spin_lock_init(&tx->clean_lock);
	spin_lock_init(&tx->xdp_lock);
	tx->q_num = idx;

	tx->mask = cfg->ring_size - 1;

	/* alloc metadata */
	tx->info = vcalloc(cfg->ring_size, sizeof(*tx->info));
	if (!tx->info)
		return -ENOMEM;

	/* alloc tx queue */
	bytes = sizeof(*tx->desc) * cfg->ring_size;
	tx->desc = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
	if (!tx->desc)
		goto abort_with_info;

	tx->raw_addressing = cfg->raw_addressing;
	tx->dev = hdev;
	if (!tx->raw_addressing) {
		tx->tx_fifo.qpl = gve_assign_tx_qpl(cfg, idx);
		if (!tx->tx_fifo.qpl)
			goto abort_with_desc;
		/* map Tx FIFO */
		if (gve_tx_fifo_init(priv, &tx->tx_fifo))
			goto abort_with_qpl;
	}

	tx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*tx->q_resources),
				   &tx->q_resources_bus,
				   GFP_KERNEL);
	if (!tx->q_resources)
		goto abort_with_fifo;

	return 0;

abort_with_fifo:
	if (!tx->raw_addressing)
		gve_tx_fifo_release(priv, &tx->tx_fifo);
abort_with_qpl:
	if (!tx->raw_addressing)
		gve_unassign_qpl(cfg->qpl_cfg, tx->tx_fifo.qpl->id);
abort_with_desc:
	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
	tx->desc = NULL;
abort_with_info:
	vfree(tx->info);
	tx->info = NULL;
	return -ENOMEM;
}

int gve_tx_alloc_rings_gqi(struct gve_priv *priv,
			   struct gve_tx_alloc_rings_cfg *cfg)
{
	struct gve_tx_ring *tx = cfg->tx;
	int err = 0;
	int i, j;

	if (!cfg->raw_addressing && !cfg->qpls) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc QPL ring before allocing QPLs\n");
		return -EINVAL;
	}

	if (cfg->start_idx + cfg->num_rings > cfg->qcfg->max_queues) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc more than the max num of Tx rings\n");
		return -EINVAL;
	}

	if (cfg->start_idx == 0) {
		tx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_tx_ring),
			      GFP_KERNEL);
		if (!tx)
			return -ENOMEM;
	} else if (!tx) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc tx rings from a nonzero start idx without tx array\n");
		return -EINVAL;
	}

	for (i = cfg->start_idx; i < cfg->start_idx + cfg->num_rings; i++) {
		err = gve_tx_alloc_ring_gqi(priv, cfg, &tx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc tx ring=%d: err=%d\n",
				  i, err);
			goto cleanup;
		}
	}

	cfg->tx = tx;
	return 0;

cleanup:
	for (j = 0; j < i; j++)
		gve_tx_free_ring_gqi(priv, &tx[j], cfg);
	if (cfg->start_idx == 0)
		kvfree(tx);
	return err;
}

void gve_tx_free_rings_gqi(struct gve_priv *priv,
			   struct gve_tx_alloc_rings_cfg *cfg)
{
	struct gve_tx_ring *tx = cfg->tx;
	int i;

	if (!tx)
		return;

	for (i = cfg->start_idx; i < cfg->start_idx + cfg->num_rings; i++)
		gve_tx_free_ring_gqi(priv, &tx[i], cfg);

	if (cfg->start_idx == 0) {
		kvfree(tx);
		cfg->tx = NULL;
	}
}

/* gve_tx_avail - Calculates the number of slots available in the ring
 * @tx: tx ring to check
 *
 * Returns the number of slots available
 *
 * The capacity of the queue is mask + 1. We don't need to reserve an entry.
 **/
static inline u32 gve_tx_avail(struct gve_tx_ring *tx)
{
	return tx->mask + 1 - (tx->req - tx->done);
}

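/* Bytes of FIFO space needed to copy @skb out: the packet itself, plus any
 * end-of-FIFO padding needed to keep the header contiguous, plus the
 * alignment padding that follows the header.
 */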
static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx,
					      struct sk_buff *skb)
{
	int pad_bytes, align_hdr_pad;
	int bytes;
	int hlen;

	hlen = skb_is_gso(skb) ? skb_checksum_start_offset(skb) + tcp_hdrlen(skb) :
				 min_t(int, GVE_GQ_TX_MIN_PKT_DESC_BYTES, skb->len);

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo,
						   hlen);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = L1_CACHE_ALIGN(hlen) - hlen;
	bytes = align_hdr_pad + pad_bytes + skb->len;

	return bytes;
}

/* The most descriptors we could need is MAX_SKB_FRAGS + 4 :
 * 1 for each skb frag
 * 1 for the skb linear portion
 * 1 for when tcp hdr needs to be in separate descriptor
 * 1 if the payload wraps to the beginning of the FIFO
 * 1 for metadata descriptor
 */
#define MAX_TX_DESC_NEEDED	(MAX_SKB_FRAGS + 4)
static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info)
{
	if (info->skb) {
		dma_unmap_single(dev, dma_unmap_addr(info, dma),
				 dma_unmap_len(info, len),
				 DMA_TO_DEVICE);
		dma_unmap_len_set(info, len, 0);
	} else {
		dma_unmap_page(dev, dma_unmap_addr(info, dma),
			       dma_unmap_len(info, len),
			       DMA_TO_DEVICE);
		dma_unmap_len_set(info, len, 0);
	}
}

/* Check if sufficient resources (descriptor ring space, FIFO space) are
 * available to transmit the given number of bytes.
 */
static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	bool can_alloc = true;

	if (!tx->raw_addressing)
		can_alloc = gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required);

	return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && can_alloc);
}

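/* gve_maybe_stop_tx() cleans at most NAPI_POLL_WEIGHT descriptors per
 * attempt, which must be enough to free the worst-case descriptor count
 * for a single packet.
 */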
static_assert(NAPI_POLL_WEIGHT >= MAX_TX_DESC_NEEDED);

/* Stops the queue if the skb cannot be transmitted. */
static int gve_maybe_stop_tx(struct gve_priv *priv, struct gve_tx_ring *tx,
			     struct sk_buff *skb)
{
	int bytes_required = 0;
	u32 nic_done;
	u32 to_do;
	int ret;

	if (!tx->raw_addressing)
		bytes_required = gve_skb_fifo_bytes_required(tx, skb);

	if (likely(gve_can_tx(tx, bytes_required)))
		return 0;

	ret = -EBUSY;
	spin_lock(&tx->clean_lock);
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = nic_done - tx->done;

	/* Only try to clean if there is hope for TX */
	if (to_do + gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED) {
		if (to_do > 0) {
			to_do = min_t(u32, to_do, NAPI_POLL_WEIGHT);
			gve_clean_tx_done(priv, tx, to_do, false);
		}
		if (likely(gve_can_tx(tx, bytes_required)))
			ret = 0;
	}
	if (ret) {
		/* No space, so stop the queue */
		tx->stop_queue++;
		netif_tx_stop_queue(tx->netdev_txq);
	}
	spin_unlock(&tx->clean_lock);

	return ret;
}

static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc,
				 u16 csum_offset, u8 ip_summed, bool is_gso,
				 int l4_hdr_offset, u32 desc_cnt,
				 u16 hlen, u64 addr, u16 pkt_len)
{
	/* l4_hdr_offset and csum_offset are in units of 16-bit words */
	if (is_gso) {
		pkt_desc->pkt.type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->pkt.l4_csum_offset = csum_offset >> 1;
		pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (likely(ip_summed == CHECKSUM_PARTIAL)) {
		pkt_desc->pkt.type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->pkt.l4_csum_offset = csum_offset >> 1;
		pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->pkt.type_flags = GVE_TXD_STD;
		pkt_desc->pkt.l4_csum_offset = 0;
		pkt_desc->pkt.l4_hdr_offset = 0;
	}
	pkt_desc->pkt.desc_cnt = desc_cnt;
	pkt_desc->pkt.len = cpu_to_be16(pkt_len);
	pkt_desc->pkt.seg_len = cpu_to_be16(hlen);
	pkt_desc->pkt.seg_addr = cpu_to_be64(addr);
}

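/* The optional metadata descriptor carries the skb's L4 hash so the device
 * can keep this flow on a consistent egress path.
 */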
static void gve_tx_fill_mtd_desc(union gve_tx_desc *mtd_desc,
				 struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(mtd_desc->mtd) != sizeof(mtd_desc->pkt));

	mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT |
				   GVE_MTD_PATH_HASH_L4;
	mtd_desc->mtd.path_hash = cpu_to_be32(skb->hash);
	mtd_desc->mtd.reserved0 = 0;
	mtd_desc->mtd.reserved1 = 0;
}

static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc,
				 u16 l3_offset, u16 gso_size,
				 bool is_gso_v6, bool is_gso,
				 u16 len, u64 addr)
{
	seg_desc->seg.type_flags = GVE_TXD_SEG;
	if (is_gso) {
		if (is_gso_v6)
			seg_desc->seg.type_flags |= GVE_TXSF_IPV6;
		seg_desc->seg.l3_offset = l3_offset >> 1;
		seg_desc->seg.mss = cpu_to_be16(gso_size);
	}
	seg_desc->seg.seg_len = cpu_to_be16(len);
	seg_desc->seg.seg_addr = cpu_to_be64(addr);
}

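/* A FIFO fragment may span several QPL pages; sync each page the
 * [iov_offset, iov_offset + iov_len) range touches for device access.
 */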
static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses,
				    u64 iov_offset, u64 iov_len)
{
	u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	u64 first_page = iov_offset / PAGE_SIZE;
	u64 page;

	for (page = first_page; page <= last_page; page++)
		dma_sync_single_for_device(dev, page_buses[page], PAGE_SIZE, DMA_TO_DEVICE);
}

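/* Copy the skb into FIFO space and write descriptors for the QPL
 * (copy-based) datapath. Returns the number of descriptors posted.
 */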
static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, struct sk_buff *skb)
{
	int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;
	union gve_tx_desc *pkt_desc, *seg_desc;
	struct gve_tx_buffer_state *info;
	int mtd_desc_nr = !!skb->l4_hash;
	bool is_gso = skb_is_gso(skb);
	u32 idx = tx->req & tx->mask;
	int payload_iov = 2;
	int copy_offset;
	u32 next_idx;
	int i;

	info = &tx->info[idx];
	pkt_desc = &tx->desc[idx];

	l4_hdr_offset = skb_checksum_start_offset(skb);
	/* If the skb is gso, then we want the tcp header alone in the first segment
	 * otherwise we want the minimum required by the gVNIC spec.
	 */
	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) :
			min_t(int, GVE_GQ_TX_MIN_PKT_DESC_BYTES, skb->len);

	info->skb = skb;
	/* We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, hlen);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, hlen + pad_bytes,
				       &info->iov[0]);
	WARN(!hdr_nfrags, "hdr_nfrags should never be 0!");
	payload_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, skb->len - hlen,
					   &info->iov[payload_iov]);

	gve_tx_fill_pkt_desc(pkt_desc, skb->csum_offset, skb->ip_summed,
			     is_gso, l4_hdr_offset,
			     1 + mtd_desc_nr + payload_nfrags, hlen,
			     info->iov[hdr_nfrags - 1].iov_offset, skb->len);

	skb_copy_bits(skb, 0,
		      tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset,
		      hlen);
	gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,
				info->iov[hdr_nfrags - 1].iov_offset,
				info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = hlen;

	if (mtd_desc_nr) {
		next_idx = (tx->req + 1) & tx->mask;
		gve_tx_fill_mtd_desc(&tx->desc[next_idx], skb);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc[next_idx];

		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso,
				     info->iov[i].iov_len,
				     info->iov[i].iov_offset);

		skb_copy_bits(skb, copy_offset,
			      tx->tx_fifo.base + info->iov[i].iov_offset,
			      info->iov[i].iov_len);
		gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,
					info->iov[i].iov_offset,
					info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	return 1 + mtd_desc_nr + payload_nfrags;
}

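/* DMA-map the skb in place and write descriptors for the raw-addressing
 * (zero-copy) datapath. Returns the number of descriptors posted, or 0 if
 * the packet was dropped on a mapping error.
 */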
static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
				  struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	int hlen, num_descriptors, l4_hdr_offset;
	union gve_tx_desc *pkt_desc, *mtd_desc, *seg_desc;
	struct gve_tx_buffer_state *info;
	int mtd_desc_nr = !!skb->l4_hash;
	bool is_gso = skb_is_gso(skb);
	u32 idx = tx->req & tx->mask;
	u64 addr;
	u32 len;
	int i;

	info = &tx->info[idx];
	pkt_desc = &tx->desc[idx];

	l4_hdr_offset = skb_checksum_start_offset(skb);
	/* If the skb is gso, then we want only up to the tcp header in the first segment
	 * to efficiently replicate on each segment otherwise we want the linear portion
	 * of the skb (which will contain the checksum because skb->csum_start and
	 * skb->csum_offset are given relative to skb->head) in the first segment.
	 */
	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) : skb_headlen(skb);
	len = skb_headlen(skb);

	info->skb = skb;

	addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
	if (unlikely(dma_mapping_error(tx->dev, addr))) {
		tx->dma_mapping_error++;
		goto drop;
	}
	dma_unmap_len_set(info, len, len);
	dma_unmap_addr_set(info, dma, addr);

	num_descriptors = 1 + shinfo->nr_frags;
	if (hlen < len)
		num_descriptors++;
	if (mtd_desc_nr)
		num_descriptors++;

	gve_tx_fill_pkt_desc(pkt_desc, skb->csum_offset, skb->ip_summed,
			     is_gso, l4_hdr_offset,
			     num_descriptors, hlen, addr, skb->len);

	if (mtd_desc_nr) {
		idx = (idx + 1) & tx->mask;
		mtd_desc = &tx->desc[idx];
		gve_tx_fill_mtd_desc(mtd_desc, skb);
	}

	if (hlen < len) {
		/* For gso the rest of the linear portion of the skb needs to
		 * be in its own descriptor.
		 */
		len -= hlen;
		addr += hlen;
		idx = (idx + 1) & tx->mask;
		seg_desc = &tx->desc[idx];
		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso, len, addr);
	}

	for (i = 0; i < shinfo->nr_frags; i++) {
		const skb_frag_t *frag = &shinfo->frags[i];

		idx = (idx + 1) & tx->mask;
		seg_desc = &tx->desc[idx];
		len = skb_frag_size(frag);
		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr))) {
			tx->dma_mapping_error++;
			goto unmap_drop;
		}
		tx->info[idx].skb = NULL;
		dma_unmap_len_set(&tx->info[idx], len, len);
		dma_unmap_addr_set(&tx->info[idx], dma, addr);

		gve_tx_fill_seg_desc(seg_desc, skb_network_offset(skb),
				     skb_shinfo(skb)->gso_size,
				     skb_is_gso_v6(skb), is_gso, len, addr);
	}

	return num_descriptors;

unmap_drop:
	i += num_descriptors - shinfo->nr_frags;
	while (i--) {
		/* Skip metadata descriptor, if set */
		if (i == 1 && mtd_desc_nr == 1)
			continue;
		idx--;
		gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]);
	}
drop:
	tx->dropped_pkt++;
	return 0;
}

netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;
	int nsegs;

	WARN(skb_get_queue_mapping(skb) >= priv->tx_cfg.num_queues,
	     "skb queue index out of range");
	tx = &priv->tx[skb_get_queue_mapping(skb)];
	if (unlikely(gve_maybe_stop_tx(priv, tx, skb))) {
		/* We need to ring the txq doorbell -- we have stopped the Tx
		 * queue for want of resources, but prior calls to gve_tx()
		 * may have added descriptors without ringing the doorbell.
		 */

		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
		return NETDEV_TX_BUSY;
	}
	if (tx->raw_addressing)
		nsegs = gve_tx_add_skb_no_copy(priv, tx, skb);
	else
		nsegs = gve_tx_add_skb_copy(priv, tx, skb);

	/* If the packet is getting sent, we need to update the skb */
	if (nsegs) {
		netdev_tx_sent_queue(tx->netdev_txq, skb->len);
		skb_tx_timestamp(skb);
		tx->req += nsegs;
	} else {
		dev_kfree_skb_any(skb);
	}

	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
		return NETDEV_TX_OK;

	/* Give packets to NIC. Even if this packet failed to send the doorbell
	 * might need to be rung because of xmit_more.
	 */
	gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
	return NETDEV_TX_OK;
}

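/* Copy an XDP buffer into FIFO space and write descriptors for it. If
 * fewer than GVE_GQ_TX_MIN_PKT_DESC_BYTES remain before the end of the
 * FIFO, pad past them so the first fragment is not too small; otherwise
 * let the copy wrap. Returns the number of descriptors written.
 */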
static int gve_tx_fill_xdp(struct gve_priv *priv, struct gve_tx_ring *tx,
			   void *data, int len, void *frame_p, bool is_xsk)
{
	int pad, nfrags, ndescs, iovi, offset;
	struct gve_tx_buffer_state *info;
	u32 reqi = tx->req;

	pad = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, len);
	if (pad >= GVE_GQ_TX_MIN_PKT_DESC_BYTES)
		pad = 0;
	info = &tx->info[reqi & tx->mask];
	info->xdp_frame = frame_p;
	info->xdp.size = len;
	info->xdp.is_xsk = is_xsk;

	nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, pad + len,
				   &info->iov[0]);
	iovi = pad > 0;
	ndescs = nfrags - iovi;
	offset = 0;

	while (iovi < nfrags) {
		if (!offset)
			gve_tx_fill_pkt_desc(&tx->desc[reqi & tx->mask], 0,
					     CHECKSUM_NONE, false, 0, ndescs,
					     info->iov[iovi].iov_len,
					     info->iov[iovi].iov_offset, len);
		else
			gve_tx_fill_seg_desc(&tx->desc[reqi & tx->mask],
					     0, 0, false, false,
					     info->iov[iovi].iov_len,
					     info->iov[iovi].iov_offset);

		memcpy(tx->tx_fifo.base + info->iov[iovi].iov_offset,
		       data + offset, info->iov[iovi].iov_len);
		gve_dma_sync_for_device(&priv->pdev->dev,
					tx->tx_fifo.qpl->page_buses,
					info->iov[iovi].iov_offset,
					info->iov[iovi].iov_len);
		offset += info->iov[iovi].iov_len;
		iovi++;
		reqi++;
	}

	return ndescs;
}

int gve_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
		 u32 flags)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;
	int i, err = 0, qid;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	qid = gve_xdp_tx_queue_id(priv,
				  smp_processor_id() % priv->num_xdp_queues);

	tx = &priv->tx[qid];

	spin_lock(&tx->xdp_lock);
	for (i = 0; i < n; i++) {
		err = gve_xdp_xmit_one(priv, tx, frames[i]->data,
				       frames[i]->len, frames[i]);
		if (err)
			break;
	}

	if (flags & XDP_XMIT_FLUSH)
		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);

	spin_unlock(&tx->xdp_lock);

	u64_stats_update_begin(&tx->statss);
	tx->xdp_xmit += n;
	tx->xdp_xmit_errors += n - i;
	u64_stats_update_end(&tx->statss);

	return i ? i : err;
}

int gve_xdp_xmit_one(struct gve_priv *priv, struct gve_tx_ring *tx,
		     void *data, int len, void *frame_p)
{
	int nsegs;

	if (!gve_can_tx(tx, len + GVE_GQ_TX_MIN_PKT_DESC_BYTES - 1))
		return -EBUSY;

	nsegs = gve_tx_fill_xdp(priv, tx, data, len, frame_p, false);
	tx->req += nsegs;

	return 0;
}

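/* FIFO bytes that must be allocatable (along with descriptor ring
 * headroom, via gve_can_tx()) before a stopped queue is woken or another
 * XSK descriptor is pulled.
 */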
#define GVE_TX_START_THRESH	4096

static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
			     u32 to_do, bool try_to_wake)
{
	struct gve_tx_buffer_state *info;
	u64 pkts = 0, bytes = 0;
	size_t space_freed = 0;
	struct sk_buff *skb;
	u32 idx;
	int j;

	for (j = 0; j < to_do; j++) {
		idx = tx->done & tx->mask;
		netif_info(priv, tx_done, priv->dev,
			   "[%d] %s: idx=%d (req=%u done=%u)\n",
			   tx->q_num, __func__, idx, tx->req, tx->done);
		info = &tx->info[idx];
		skb = info->skb;

		/* Unmap the buffer */
		if (tx->raw_addressing)
			gve_tx_unmap_buf(tx->dev, info);
		tx->done++;
		/* Mark as free */
		if (skb) {
			info->skb = NULL;
			bytes += skb->len;
			pkts++;
			dev_consume_skb_any(skb);
			if (tx->raw_addressing)
				continue;
			space_freed += gve_tx_clear_buffer_state(info);
		}
	}

	if (!tx->raw_addressing)
		gve_tx_free_fifo(&tx->tx_fifo, space_freed);
	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += bytes;
	tx->pkt_done += pkts;
	u64_stats_update_end(&tx->statss);
	netdev_tx_completed_queue(tx->netdev_txq, pkts, bytes);

	/* start the queue if we've stopped it */
#ifndef CONFIG_BQL
	/* Make sure that the doorbells are synced */
	smp_mb();
#endif
	if (try_to_wake && netif_tx_queue_stopped(tx->netdev_txq) &&
	    likely(gve_can_tx(tx, GVE_TX_START_THRESH))) {
		tx->wake_queue++;
		netif_tx_wake_queue(tx->netdev_txq);
	}

	return pkts;
}

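/* The NIC reports Tx completion progress as a running descriptor count in
 * this queue's slot of the event counter array; the delta from tx->done
 * is the work left to clean.
 */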
u32 gve_tx_load_event_counter(struct gve_priv *priv,
			      struct gve_tx_ring *tx)
{
	u32 counter_index = be32_to_cpu(tx->q_resources->counter_index);
	__be32 counter = READ_ONCE(priv->counter_array[counter_index]);

	return be32_to_cpu(counter);
}

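/* Pull up to @budget descriptors from the XSK Tx ring, copy each into
 * FIFO space, and ring the doorbell once for the whole batch. Returns the
 * number of descriptors sent.
 */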
static int gve_xsk_tx(struct gve_priv *priv, struct gve_tx_ring *tx,
		      int budget)
{
	struct xdp_desc desc;
	int sent = 0, nsegs;
	void *data;

	spin_lock(&tx->xdp_lock);
	while (sent < budget) {
		if (!gve_can_tx(tx, GVE_TX_START_THRESH))
			goto out;

		if (!xsk_tx_peek_desc(tx->xsk_pool, &desc)) {
			tx->xdp_xsk_done = tx->xdp_xsk_wakeup;
			goto out;
		}

		data = xsk_buff_raw_get_data(tx->xsk_pool, desc.addr);
		nsegs = gve_tx_fill_xdp(priv, tx, data, desc.len, NULL, true);
		tx->req += nsegs;
		sent++;
	}
out:
	if (sent > 0) {
		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
		xsk_tx_release(tx->xsk_pool);
	}
	spin_unlock(&tx->xdp_lock);
	return sent;
}

bool gve_xdp_poll(struct gve_notify_block *block, int budget)
{
	struct gve_priv *priv = block->priv;
	struct gve_tx_ring *tx = block->tx;
	u32 nic_done;
	bool repoll;
	u32 to_do;

	/* Find out how much work there is to be done */
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = min_t(u32, (nic_done - tx->done), budget);
	gve_clean_xdp_done(priv, tx, to_do);
	repoll = nic_done != tx->done;

	if (tx->xsk_pool) {
		int sent = gve_xsk_tx(priv, tx, budget);

		u64_stats_update_begin(&tx->statss);
		tx->xdp_xsk_sent += sent;
		u64_stats_update_end(&tx->statss);
		repoll |= (sent == budget);
		if (xsk_uses_need_wakeup(tx->xsk_pool))
			xsk_set_tx_need_wakeup(tx->xsk_pool);
	}

	/* If we still have work we want to repoll */
	return repoll;
}

bool gve_tx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_priv *priv = block->priv;
	struct gve_tx_ring *tx = block->tx;
	u32 nic_done;
	u32 to_do;

	/* If budget is 0, do all the work */
	if (budget == 0)
		budget = INT_MAX;

	/* The xmit path may also clean completed packets in order to make
	 * room to transmit. Take the spin lock to avoid conflicting cleans;
	 * it yields better xmit/clean concurrency than the netif lock.
	 */
	spin_lock(&tx->clean_lock);
	/* Find out how much work there is to be done */
	nic_done = gve_tx_load_event_counter(priv, tx);
	to_do = min_t(u32, (nic_done - tx->done), budget);
	gve_clean_tx_done(priv, tx, to_do, true);
	spin_unlock(&tx->clean_lock);
	/* If we still have work we want to repoll */
	return nic_done != tx->done;
}

bool gve_tx_clean_pending(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	u32 nic_done = gve_tx_load_event_counter(priv, tx);

	return nic_done != tx->done;
}
1030 | |