1 | // SPDX-License-Identifier: (GPL-2.0 OR MIT) |
2 | /* Google virtual Ethernet (gve) driver |
3 | * |
4 | * Copyright (C) 2015-2021 Google, Inc. |
5 | */ |
6 | |
7 | #include "gve.h" |
8 | #include "gve_adminq.h" |
9 | #include "gve_utils.h" |
10 | #include <linux/etherdevice.h> |
11 | #include <linux/filter.h> |
12 | #include <net/xdp.h> |
13 | #include <net/xdp_sock_drv.h> |
14 | |
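/* Release the extra page references taken for this buffer and hand the
 * DMA-mapped page back to the system via gve_free_page().
 */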
15 | static void gve_rx_free_buffer(struct device *dev, |
16 | struct gve_rx_slot_page_info *page_info, |
17 | union gve_rx_data_slot *data_slot) |
18 | { |
19 | dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) & |
20 | GVE_DATA_SLOT_ADDR_PAGE_MASK); |
21 | |
	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
24 | } |
25 | |
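/* Undo gve_rx_prefill_pages(): in raw addressing (RDA) mode each slot owns
 * its own DMA page and is freed outright; in QPL mode only the page-count
 * bias is dropped (the pages belong to the registered page list), the QPL is
 * unassigned, and the copy-pool pages are released.
 */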
26 | static void gve_rx_unfill_pages(struct gve_priv *priv, |
27 | struct gve_rx_ring *rx, |
28 | struct gve_rx_alloc_rings_cfg *cfg) |
29 | { |
30 | u32 slots = rx->mask + 1; |
31 | int i; |
32 | |
33 | if (rx->data.raw_addressing) { |
34 | for (i = 0; i < slots; i++) |
			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
					   &rx->data.data_ring[i]);
37 | } else { |
38 | for (i = 0; i < slots; i++) |
			page_ref_sub(rx->data.page_info[i].page,
				     rx->data.page_info[i].pagecnt_bias - 1);
		gve_unassign_qpl(cfg->qpl_cfg, rx->data.qpl->id);
42 | rx->data.qpl = NULL; |
43 | |
44 | for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) { |
			page_ref_sub(rx->qpl_copy_pool[i].page,
				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
			put_page(rx->qpl_copy_pool[i].page);
48 | } |
49 | } |
	kvfree(rx->data.page_info);
51 | rx->data.page_info = NULL; |
52 | } |
53 | |
54 | void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx) |
55 | { |
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
57 | |
	if (!gve_rx_was_added_to_block(priv, idx))
59 | return; |
60 | |
61 | gve_remove_napi(priv, ntfy_idx); |
	gve_rx_remove_from_block(priv, idx);
63 | } |
64 | |
65 | static void gve_rx_free_ring_gqi(struct gve_priv *priv, struct gve_rx_ring *rx, |
66 | struct gve_rx_alloc_rings_cfg *cfg) |
67 | { |
68 | struct device *dev = &priv->pdev->dev; |
69 | u32 slots = rx->mask + 1; |
70 | int idx = rx->q_num; |
71 | size_t bytes; |
72 | |
73 | bytes = sizeof(struct gve_rx_desc) * cfg->ring_size; |
	dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
75 | rx->desc.desc_ring = NULL; |
76 | |
	dma_free_coherent(dev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
79 | rx->q_resources = NULL; |
80 | |
81 | gve_rx_unfill_pages(priv, rx, cfg); |
82 | |
83 | bytes = sizeof(*rx->data.data_ring) * slots; |
	dma_free_coherent(dev, bytes, rx->data.data_ring,
			  rx->data.data_bus);
86 | rx->data.data_ring = NULL; |
87 | |
	kvfree(rx->qpl_copy_pool);
89 | rx->qpl_copy_pool = NULL; |
90 | |
	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
92 | } |
93 | |
94 | static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info, |
95 | dma_addr_t addr, struct page *page, __be64 *slot_addr) |
96 | { |
97 | page_info->page = page; |
98 | page_info->page_offset = 0; |
99 | page_info->page_address = page_address(page); |
100 | *slot_addr = cpu_to_be64(addr); |
101 | /* The page already has 1 ref */ |
102 | page_ref_add(page, INT_MAX - 1); |
103 | page_info->pagecnt_bias = INT_MAX; |
104 | } |
105 | |
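/* Allocate and DMA-map a fresh page for one ring slot, recording the mapping
 * in the slot's address field. Failures are counted in rx_buf_alloc_fail.
 */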
106 | static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev, |
107 | struct gve_rx_slot_page_info *page_info, |
108 | union gve_rx_data_slot *data_slot, |
109 | struct gve_rx_ring *rx) |
110 | { |
111 | struct page *page; |
112 | dma_addr_t dma; |
113 | int err; |
114 | |
	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
116 | GFP_ATOMIC); |
117 | if (err) { |
		u64_stats_update_begin(&rx->statss);
		rx->rx_buf_alloc_fail++;
		u64_stats_update_end(&rx->statss);
121 | return err; |
122 | } |
123 | |
	gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
125 | return 0; |
126 | } |
127 | |
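/* Post a buffer in every ring slot. Returns the number of slots filled on
 * success or a negative errno, releasing the buffers it managed to fill on
 * failure.
 */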
128 | static int gve_rx_prefill_pages(struct gve_rx_ring *rx, |
129 | struct gve_rx_alloc_rings_cfg *cfg) |
130 | { |
131 | struct gve_priv *priv = rx->gve; |
132 | u32 slots; |
133 | int err; |
134 | int i; |
135 | int j; |
136 | |
137 | /* Allocate one page per Rx queue slot. Each page is split into two |
138 | * packet buffers, when possible we "page flip" between the two. |
139 | */ |
140 | slots = rx->mask + 1; |
141 | |
	rx->data.page_info = kvzalloc(slots *
				      sizeof(*rx->data.page_info), GFP_KERNEL);
144 | if (!rx->data.page_info) |
145 | return -ENOMEM; |
146 | |
147 | if (!rx->data.raw_addressing) { |
		rx->data.qpl = gve_assign_rx_qpl(cfg, rx->q_num);
		if (!rx->data.qpl) {
			kvfree(rx->data.page_info);
151 | rx->data.page_info = NULL; |
152 | return -ENOMEM; |
153 | } |
154 | } |
155 | for (i = 0; i < slots; i++) { |
156 | if (!rx->data.raw_addressing) { |
157 | struct page *page = rx->data.qpl->pages[i]; |
158 | dma_addr_t addr = i * PAGE_SIZE; |
159 | |
			gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
					    &rx->data.data_ring[i].qpl_offset);
162 | continue; |
163 | } |
		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev,
					  &rx->data.page_info[i],
					  &rx->data.data_ring[i], rx);
167 | if (err) |
168 | goto alloc_err_rda; |
169 | } |
170 | |
171 | if (!rx->data.raw_addressing) { |
172 | for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) { |
173 | struct page *page = alloc_page(GFP_KERNEL); |
174 | |
175 | if (!page) { |
176 | err = -ENOMEM; |
177 | goto alloc_err_qpl; |
178 | } |
179 | |
180 | rx->qpl_copy_pool[j].page = page; |
181 | rx->qpl_copy_pool[j].page_offset = 0; |
182 | rx->qpl_copy_pool[j].page_address = page_address(page); |
183 | |
184 | /* The page already has 1 ref. */ |
185 | page_ref_add(page, INT_MAX - 1); |
186 | rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX; |
187 | } |
188 | } |
189 | |
190 | return slots; |
191 | |
192 | alloc_err_qpl: |
193 | /* Fully free the copy pool pages. */ |
194 | while (j--) { |
		page_ref_sub(rx->qpl_copy_pool[j].page,
			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
		put_page(rx->qpl_copy_pool[j].page);
198 | } |
199 | |
200 | /* Do not fully free QPL pages - only remove the bias added in this |
201 | * function with gve_setup_rx_buffer. |
202 | */ |
203 | while (i--) |
		page_ref_sub(rx->data.page_info[i].page,
			     rx->data.page_info[i].pagecnt_bias - 1);
206 | |
	gve_unassign_qpl(cfg->qpl_cfg, rx->data.qpl->id);
208 | rx->data.qpl = NULL; |
209 | |
210 | return err; |
211 | |
212 | alloc_err_rda: |
213 | while (i--) |
		gve_rx_free_buffer(&priv->pdev->dev,
				   &rx->data.page_info[i],
				   &rx->data.data_ring[i]);
217 | return err; |
218 | } |
219 | |
220 | static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx) |
221 | { |
222 | ctx->skb_head = NULL; |
223 | ctx->skb_tail = NULL; |
224 | ctx->total_size = 0; |
225 | ctx->frag_cnt = 0; |
226 | ctx->drop_pkt = false; |
227 | } |
228 | |
229 | void gve_rx_start_ring_gqi(struct gve_priv *priv, int idx) |
230 | { |
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
232 | |
	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll);
235 | } |
236 | |
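/* Allocate one GQI RX ring: the data slot ring, the QPL copy pool, the
 * prefilled packet buffers, the queue resources block and the descriptor
 * ring, tearing everything down again on failure.
 */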
237 | static int gve_rx_alloc_ring_gqi(struct gve_priv *priv, |
238 | struct gve_rx_alloc_rings_cfg *cfg, |
239 | struct gve_rx_ring *rx, |
240 | int idx) |
241 | { |
242 | struct device *hdev = &priv->pdev->dev; |
243 | u32 slots = priv->rx_data_slot_cnt; |
244 | int filled_pages; |
245 | size_t bytes; |
246 | int err; |
247 | |
	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
249 | /* Make sure everything is zeroed to start with */ |
250 | memset(rx, 0, sizeof(*rx)); |
251 | |
252 | rx->gve = priv; |
253 | rx->q_num = idx; |
254 | |
255 | rx->mask = slots - 1; |
256 | rx->data.raw_addressing = cfg->raw_addressing; |
257 | |
258 | /* alloc rx data ring */ |
259 | bytes = sizeof(*rx->data.data_ring) * slots; |
	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
						&rx->data.data_bus,
262 | GFP_KERNEL); |
263 | if (!rx->data.data_ring) |
264 | return -ENOMEM; |
265 | |
266 | rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1; |
267 | rx->qpl_copy_pool_head = 0; |
	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
				     sizeof(rx->qpl_copy_pool[0]),
270 | GFP_KERNEL); |
271 | |
272 | if (!rx->qpl_copy_pool) { |
273 | err = -ENOMEM; |
274 | goto abort_with_slots; |
275 | } |
276 | |
277 | filled_pages = gve_rx_prefill_pages(rx, cfg); |
278 | if (filled_pages < 0) { |
279 | err = -ENOMEM; |
280 | goto abort_with_copy_pool; |
281 | } |
282 | rx->fill_cnt = filled_pages; |
283 | /* Ensure data ring slots (packet buffers) are visible. */ |
284 | dma_wmb(); |
285 | |
286 | /* Alloc gve_queue_resources */ |
287 | rx->q_resources = |
		dma_alloc_coherent(hdev,
				   sizeof(*rx->q_resources),
				   &rx->q_resources_bus,
291 | GFP_KERNEL); |
292 | if (!rx->q_resources) { |
293 | err = -ENOMEM; |
294 | goto abort_filled; |
295 | } |
	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
297 | (unsigned long)rx->data.data_bus); |
298 | |
299 | /* alloc rx desc ring */ |
300 | bytes = sizeof(struct gve_rx_desc) * cfg->ring_size; |
	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
302 | GFP_KERNEL); |
303 | if (!rx->desc.desc_ring) { |
304 | err = -ENOMEM; |
305 | goto abort_with_q_resources; |
306 | } |
307 | rx->cnt = 0; |
308 | rx->db_threshold = slots / 2; |
309 | rx->desc.seqno = 1; |
310 | |
311 | rx->packet_buffer_size = GVE_DEFAULT_RX_BUFFER_SIZE; |
	gve_rx_ctx_clear(&rx->ctx);
313 | |
314 | return 0; |
315 | |
316 | abort_with_q_resources: |
	dma_free_coherent(hdev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
319 | rx->q_resources = NULL; |
320 | abort_filled: |
321 | gve_rx_unfill_pages(priv, rx, cfg); |
322 | abort_with_copy_pool: |
	kvfree(rx->qpl_copy_pool);
324 | rx->qpl_copy_pool = NULL; |
325 | abort_with_slots: |
326 | bytes = sizeof(*rx->data.data_ring) * slots; |
	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
328 | rx->data.data_ring = NULL; |
329 | |
330 | return err; |
331 | } |
332 | |
333 | int gve_rx_alloc_rings_gqi(struct gve_priv *priv, |
334 | struct gve_rx_alloc_rings_cfg *cfg) |
335 | { |
336 | struct gve_rx_ring *rx; |
337 | int err = 0; |
338 | int i, j; |
339 | |
340 | if (!cfg->raw_addressing && !cfg->qpls) { |
341 | netif_err(priv, drv, priv->dev, |
342 | "Cannot alloc QPL ring before allocing QPLs\n" ); |
343 | return -EINVAL; |
344 | } |
345 | |
	rx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_rx_ring),
347 | GFP_KERNEL); |
348 | if (!rx) |
349 | return -ENOMEM; |
350 | |
351 | for (i = 0; i < cfg->qcfg->num_queues; i++) { |
		err = gve_rx_alloc_ring_gqi(priv, cfg, &rx[i], i);
353 | if (err) { |
354 | netif_err(priv, drv, priv->dev, |
355 | "Failed to alloc rx ring=%d: err=%d\n" , |
356 | i, err); |
357 | goto cleanup; |
358 | } |
359 | } |
360 | |
361 | cfg->rx = rx; |
362 | return 0; |
363 | |
364 | cleanup: |
365 | for (j = 0; j < i; j++) |
		gve_rx_free_ring_gqi(priv, &rx[j], cfg);
	kvfree(rx);
368 | return err; |
369 | } |
370 | |
371 | void gve_rx_free_rings_gqi(struct gve_priv *priv, |
372 | struct gve_rx_alloc_rings_cfg *cfg) |
373 | { |
374 | struct gve_rx_ring *rx = cfg->rx; |
375 | int i; |
376 | |
377 | if (!rx) |
378 | return; |
379 | |
380 | for (i = 0; i < cfg->qcfg->num_queues; i++) |
		gve_rx_free_ring_gqi(priv, &rx[i], cfg);

	kvfree(rx);
384 | cfg->rx = NULL; |
385 | } |
386 | |
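/* Tell the device how many buffers have been posted by writing fill_cnt to
 * this ring's doorbell.
 */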
387 | void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx) |
388 | { |
389 | u32 db_idx = be32_to_cpu(rx->q_resources->db_index); |
390 | |
391 | iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]); |
392 | } |
393 | |
static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
395 | { |
396 | if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP))) |
397 | return PKT_HASH_TYPE_L4; |
398 | if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) |
399 | return PKT_HASH_TYPE_L3; |
400 | return PKT_HASH_TYPE_L2; |
401 | } |
402 | |
403 | static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi, |
404 | struct gve_rx_slot_page_info *page_info, |
405 | unsigned int truesize, u16 len, |
406 | struct gve_rx_ctx *ctx) |
407 | { |
408 | u32 offset = page_info->page_offset + page_info->pad; |
409 | struct sk_buff *skb = ctx->skb_tail; |
410 | int num_frags = 0; |
411 | |
412 | if (!skb) { |
413 | skb = napi_get_frags(napi); |
414 | if (unlikely(!skb)) |
415 | return NULL; |
416 | |
417 | ctx->skb_head = skb; |
418 | ctx->skb_tail = skb; |
419 | } else { |
420 | num_frags = skb_shinfo(ctx->skb_tail)->nr_frags; |
421 | if (num_frags == MAX_SKB_FRAGS) { |
			skb = napi_alloc_skb(napi, 0);
423 | if (!skb) |
424 | return NULL; |
425 | |
426 | // We will never chain more than two SKBs: 2 * 16 * 2k > 64k |
427 | // which is why we do not need to chain by using skb->next |
428 | skb_shinfo(ctx->skb_tail)->frag_list = skb; |
429 | |
430 | ctx->skb_tail = skb; |
431 | num_frags = 0; |
432 | } |
433 | } |
434 | |
435 | if (skb != ctx->skb_head) { |
436 | ctx->skb_head->len += len; |
437 | ctx->skb_head->data_len += len; |
438 | ctx->skb_head->truesize += truesize; |
439 | } |
	skb_add_rx_frag(skb, num_frags, page_info->page,
			offset, len, truesize);
442 | |
443 | return ctx->skb_head; |
444 | } |
445 | |
446 | static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) |
447 | { |
448 | const __be64 offset = cpu_to_be64(GVE_DEFAULT_RX_BUFFER_OFFSET); |
449 | |
450 | /* "flip" to other packet buffer on this page */ |
451 | page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET; |
452 | *(slot_addr) ^= offset; |
453 | } |
454 | |
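/* Returns 1 if only the driver's bias references remain and the page can be
 * reused, 0 if the networking stack still holds a reference, or -1 if the
 * reference count has somehow dropped below the bias.
 */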
455 | static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info) |
456 | { |
	int pagecount = page_count(page_info->page);
458 | |
459 | /* This page is not being used by any SKBs - reuse */ |
460 | if (pagecount == page_info->pagecnt_bias) |
461 | return 1; |
462 | /* This page is still being used by an SKB - we can't reuse */ |
463 | else if (pagecount > page_info->pagecnt_bias) |
464 | return 0; |
465 | WARN(pagecount < page_info->pagecnt_bias, |
466 | "Pagecount should never be less than the bias." ); |
467 | return -1; |
468 | } |
469 | |
470 | static struct sk_buff * |
471 | gve_rx_raw_addressing(struct device *dev, struct net_device *netdev, |
472 | struct gve_rx_slot_page_info *page_info, u16 len, |
473 | struct napi_struct *napi, |
474 | union gve_rx_data_slot *data_slot, |
475 | u16 packet_buffer_size, struct gve_rx_ctx *ctx) |
476 | { |
	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);
478 | |
479 | if (!skb) |
480 | return NULL; |
481 | |
482 | /* Optimistically stop the kernel from freeing the page. |
483 | * We will check again in refill to determine if we need to alloc a |
484 | * new page. |
485 | */ |
486 | gve_dec_pagecnt_bias(page_info); |
487 | |
488 | return skb; |
489 | } |
490 | |
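/* QPL fallback path: the receive buffer cannot be flipped, so copy the
 * fragment into a page from the per-ring copy pool. Pool pages are used
 * round-robin; one that is still held by the stack is skipped and replaced
 * with a freshly allocated page.
 */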
491 | static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx, |
492 | struct gve_rx_slot_page_info *page_info, |
493 | u16 len, struct napi_struct *napi) |
494 | { |
495 | u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask; |
496 | void *src = page_info->page_address + page_info->page_offset; |
497 | struct gve_rx_slot_page_info *copy_page_info; |
498 | struct gve_rx_ctx *ctx = &rx->ctx; |
499 | bool alloc_page = false; |
500 | struct sk_buff *skb; |
501 | void *dst; |
502 | |
503 | copy_page_info = &rx->qpl_copy_pool[pool_idx]; |
504 | if (!copy_page_info->can_flip) { |
		int recycle = gve_rx_can_recycle_buffer(copy_page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(rx->gve);
509 | return NULL; |
510 | } |
511 | alloc_page = !recycle; |
512 | } |
513 | |
514 | if (alloc_page) { |
515 | struct gve_rx_slot_page_info alloc_page_info; |
516 | struct page *page; |
517 | |
518 | /* The least recently used page turned out to be |
519 | * still in use by the kernel. Ignoring it and moving |
520 | * on alleviates head-of-line blocking. |
521 | */ |
522 | rx->qpl_copy_pool_head++; |
523 | |
524 | page = alloc_page(GFP_ATOMIC); |
525 | if (!page) |
526 | return NULL; |
527 | |
528 | alloc_page_info.page = page; |
529 | alloc_page_info.page_offset = 0; |
530 | alloc_page_info.page_address = page_address(page); |
531 | alloc_page_info.pad = page_info->pad; |
532 | |
533 | memcpy(alloc_page_info.page_address, src, page_info->pad + len); |
		skb = gve_rx_add_frags(napi, &alloc_page_info,
				       PAGE_SIZE,
				       len, ctx);
537 | |
		u64_stats_update_begin(&rx->statss);
		rx->rx_frag_copy_cnt++;
		rx->rx_frag_alloc_cnt++;
		u64_stats_update_end(&rx->statss);
542 | |
543 | return skb; |
544 | } |
545 | |
546 | dst = copy_page_info->page_address + copy_page_info->page_offset; |
547 | memcpy(dst, src, page_info->pad + len); |
548 | copy_page_info->pad = page_info->pad; |
549 | |
	skb = gve_rx_add_frags(napi, copy_page_info,
			       rx->packet_buffer_size, len, ctx);
552 | if (unlikely(!skb)) |
553 | return NULL; |
554 | |
	gve_dec_pagecnt_bias(copy_page_info);
556 | copy_page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET; |
557 | |
558 | if (copy_page_info->can_flip) { |
559 | /* We have used both halves of this copy page, it |
560 | * is time for it to go to the back of the queue. |
561 | */ |
562 | copy_page_info->can_flip = false; |
563 | rx->qpl_copy_pool_head++; |
564 | prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page); |
565 | } else { |
566 | copy_page_info->can_flip = true; |
567 | } |
568 | |
	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_copy_cnt++;
	u64_stats_update_end(&rx->statss);
572 | |
573 | return skb; |
574 | } |
575 | |
576 | static struct sk_buff * |
577 | gve_rx_qpl(struct device *dev, struct net_device *netdev, |
578 | struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info, |
579 | u16 len, struct napi_struct *napi, |
580 | union gve_rx_data_slot *data_slot) |
581 | { |
582 | struct gve_rx_ctx *ctx = &rx->ctx; |
583 | struct sk_buff *skb; |
584 | |
585 | /* if raw_addressing mode is not enabled gvnic can only receive into |
586 | * registered segments. If the buffer can't be recycled, our only |
587 | * choice is to copy the data out of it so that we can return it to the |
588 | * device. |
589 | */ |
590 | if (page_info->can_flip) { |
		skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
592 | /* No point in recycling if we didn't get the skb */ |
593 | if (skb) { |
594 | /* Make sure that the page isn't freed. */ |
595 | gve_dec_pagecnt_bias(page_info); |
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
597 | } |
598 | } else { |
599 | skb = gve_rx_copy_to_pool(rx, page_info, len, napi); |
600 | } |
601 | return skb; |
602 | } |
603 | |
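/* Build or extend the skb for one fragment: small single-fragment packets
 * (up to rx_copybreak) are copied into a new skb, everything else attaches
 * the buffer page as a frag via the raw-addressing or QPL path.
 */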
604 | static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx, |
605 | struct gve_rx_slot_page_info *page_info, struct napi_struct *napi, |
606 | u16 len, union gve_rx_data_slot *data_slot, |
607 | bool is_only_frag) |
608 | { |
609 | struct net_device *netdev = priv->dev; |
610 | struct gve_rx_ctx *ctx = &rx->ctx; |
611 | struct sk_buff *skb = NULL; |
612 | |
613 | if (len <= priv->rx_copybreak && is_only_frag) { |
614 | /* Just copy small packets */ |
		skb = gve_rx_copy(netdev, napi, page_info, len);
616 | if (skb) { |
			u64_stats_update_begin(&rx->statss);
			rx->rx_copied_pkt++;
			rx->rx_frag_copy_cnt++;
			rx->rx_copybreak_pkt++;
			u64_stats_update_end(&rx->statss);
622 | } |
623 | } else { |
624 | int recycle = gve_rx_can_recycle_buffer(page_info); |
625 | |
626 | if (unlikely(recycle < 0)) { |
627 | gve_schedule_reset(priv); |
628 | return NULL; |
629 | } |
630 | page_info->can_flip = recycle; |
631 | if (page_info->can_flip) { |
			u64_stats_update_begin(&rx->statss);
			rx->rx_frag_flip_cnt++;
			u64_stats_update_end(&rx->statss);
635 | } |
636 | |
637 | if (rx->data.raw_addressing) { |
			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
						    page_info, len, napi,
						    data_slot,
						    rx->packet_buffer_size, ctx);
642 | } else { |
			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
					 page_info, len, napi, data_slot);
645 | } |
646 | } |
647 | return skb; |
648 | } |
649 | |
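/* Used when an XSK buffer pool is attached to this ring: copy the packet
 * into a buffer allocated from the pool and hand it to xdp_do_redirect().
 */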
650 | static int gve_xsk_pool_redirect(struct net_device *dev, |
651 | struct gve_rx_ring *rx, |
652 | void *data, int len, |
653 | struct bpf_prog *xdp_prog) |
654 | { |
655 | struct xdp_buff *xdp; |
656 | int err; |
657 | |
658 | if (rx->xsk_pool->frame_len < len) |
659 | return -E2BIG; |
	xdp = xsk_buff_alloc(rx->xsk_pool);
	if (!xdp) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
665 | return -ENOMEM; |
666 | } |
667 | xdp->data_end = xdp->data + len; |
668 | memcpy(xdp->data, data, len); |
	err = xdp_do_redirect(dev, xdp, xdp_prog);
670 | if (err) |
671 | xsk_buff_free(xdp); |
672 | return err; |
673 | } |
674 | |
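/* XDP_REDIRECT is always served from a copy: the frame is duplicated into a
 * page_frag buffer with XDP headroom and room for the skb_shared_info so the
 * original ring buffer can be recycled immediately.
 */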
675 | static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx, |
676 | struct xdp_buff *orig, struct bpf_prog *xdp_prog) |
677 | { |
678 | int total_len, len = orig->data_end - orig->data; |
679 | int headroom = XDP_PACKET_HEADROOM; |
680 | struct xdp_buff new; |
681 | void *frame; |
682 | int err; |
683 | |
684 | if (rx->xsk_pool) |
		return gve_xsk_pool_redirect(dev, rx, orig->data,
					     len, xdp_prog);
687 | |
688 | total_len = headroom + SKB_DATA_ALIGN(len) + |
689 | SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); |
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
695 | return -ENOMEM; |
696 | } |
	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
	xdp_prepare_buff(&new, frame, headroom, len, false);
699 | memcpy(new.data, orig->data, len); |
700 | |
	err = xdp_do_redirect(dev, &new, xdp_prog);
	if (err)
		page_frag_free(frame);
704 | |
705 | return err; |
706 | } |
707 | |
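/* Carry out the verdict of the XDP program for everything other than
 * XDP_PASS and account the action in the per-ring stats.
 */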
708 | static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx, |
709 | struct xdp_buff *xdp, struct bpf_prog *xprog, |
710 | int xdp_act) |
711 | { |
712 | struct gve_tx_ring *tx; |
713 | int tx_qid; |
714 | int err; |
715 | |
716 | switch (xdp_act) { |
717 | case XDP_ABORTED: |
718 | case XDP_DROP: |
719 | default: |
720 | break; |
721 | case XDP_TX: |
		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
		tx = &priv->tx[tx_qid];
		spin_lock(&tx->xdp_lock);
		err = gve_xdp_xmit_one(priv, tx, xdp->data,
				       xdp->data_end - xdp->data, NULL);
		spin_unlock(&tx->xdp_lock);
728 | |
729 | if (unlikely(err)) { |
			u64_stats_update_begin(&rx->statss);
			rx->xdp_tx_errors++;
			u64_stats_update_end(&rx->statss);
733 | } |
734 | break; |
735 | case XDP_REDIRECT: |
		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);
737 | |
738 | if (unlikely(err)) { |
			u64_stats_update_begin(&rx->statss);
			rx->xdp_redirect_errors++;
			u64_stats_update_end(&rx->statss);
742 | } |
743 | break; |
744 | } |
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
749 | } |
750 | |
751 | #define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x)) |
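/* Process a single descriptor (one fragment of a packet). Fragments of a
 * multi-descriptor packet accumulate in rx->ctx until the last fragment,
 * when the assembled skb is handed to GRO.
 */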
752 | static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat, |
753 | struct gve_rx_desc *desc, u32 idx, |
754 | struct gve_rx_cnts *cnts) |
755 | { |
756 | bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq); |
757 | struct gve_rx_slot_page_info *page_info; |
758 | u16 frag_size = be16_to_cpu(desc->len); |
759 | struct gve_rx_ctx *ctx = &rx->ctx; |
760 | union gve_rx_data_slot *data_slot; |
761 | struct gve_priv *priv = rx->gve; |
762 | struct sk_buff *skb = NULL; |
763 | struct bpf_prog *xprog; |
764 | struct xdp_buff xdp; |
765 | dma_addr_t page_bus; |
766 | void *va; |
767 | |
768 | u16 len = frag_size; |
769 | struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi; |
770 | bool is_first_frag = ctx->frag_cnt == 0; |
771 | |
772 | bool is_only_frag = is_first_frag && is_last_frag; |
773 | |
774 | if (unlikely(ctx->drop_pkt)) |
775 | goto finish_frag; |
776 | |
777 | if (desc->flags_seq & GVE_RXF_ERR) { |
778 | ctx->drop_pkt = true; |
779 | cnts->desc_err_pkt_cnt++; |
780 | napi_free_frags(napi); |
781 | goto finish_frag; |
782 | } |
783 | |
784 | if (unlikely(frag_size > rx->packet_buffer_size)) { |
		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
			    frag_size, rx->packet_buffer_size);
		ctx->drop_pkt = true;
		napi_free_frags(napi);
		gve_schedule_reset(rx->gve);
790 | goto finish_frag; |
791 | } |
792 | |
793 | /* Prefetch two packet buffers ahead, we will need it soon. */ |
794 | page_info = &rx->data.page_info[(idx + 2) & rx->mask]; |
795 | va = page_info->page_address + page_info->page_offset; |
796 | prefetch(page_info->page); /* Kernel page struct. */ |
797 | prefetch(va); /* Packet header. */ |
798 | prefetch(va + 64); /* Next cacheline too. */ |
799 | |
800 | page_info = &rx->data.page_info[idx]; |
801 | data_slot = &rx->data.data_ring[idx]; |
802 | page_bus = (rx->data.raw_addressing) ? |
803 | be64_to_cpu(data_slot->addr) - page_info->page_offset : |
804 | rx->data.qpl->page_buses[idx]; |
	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
				PAGE_SIZE, DMA_FROM_DEVICE);
807 | page_info->pad = is_first_frag ? GVE_RX_PAD : 0; |
808 | len -= page_info->pad; |
809 | frag_size -= page_info->pad; |
810 | |
811 | xprog = READ_ONCE(priv->xdp_prog); |
812 | if (xprog && is_only_frag) { |
813 | void *old_data; |
814 | int xdp_act; |
815 | |
		xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
		xdp_prepare_buff(&xdp, page_info->page_address +
				 page_info->page_offset, GVE_RX_PAD,
				 len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		if (xdp_act != XDP_PASS) {
			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
824 | ctx->total_size += frag_size; |
825 | goto finish_ok_pkt; |
826 | } |
827 | |
828 | page_info->pad += xdp.data - old_data; |
829 | len = xdp.data_end - xdp.data; |
830 | |
		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
834 | } |
835 | |
836 | skb = gve_rx_skb(priv, rx, page_info, napi, len, |
837 | data_slot, is_only_frag); |
838 | if (!skb) { |
		u64_stats_update_begin(&rx->statss);
		rx->rx_skb_alloc_fail++;
		u64_stats_update_end(&rx->statss);
842 | |
843 | napi_free_frags(napi); |
844 | ctx->drop_pkt = true; |
845 | goto finish_frag; |
846 | } |
847 | ctx->total_size += frag_size; |
848 | |
849 | if (is_first_frag) { |
850 | if (likely(feat & NETIF_F_RXCSUM)) { |
851 | /* NIC passes up the partial sum */ |
852 | if (desc->csum) |
853 | skb->ip_summed = CHECKSUM_COMPLETE; |
854 | else |
855 | skb->ip_summed = CHECKSUM_NONE; |
			skb->csum = csum_unfold(desc->csum);
857 | } |
858 | |
859 | /* parse flags & pass relevant info up */ |
860 | if (likely(feat & NETIF_F_RXHASH) && |
		    gve_needs_rss(desc->flags_seq))
			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
				     gve_rss_type(desc->flags_seq));
864 | } |
865 | |
866 | if (is_last_frag) { |
		skb_record_rx_queue(skb, rx->q_num);
868 | if (skb_is_nonlinear(skb)) |
869 | napi_gro_frags(napi); |
870 | else |
871 | napi_gro_receive(napi, skb); |
872 | goto finish_ok_pkt; |
873 | } |
874 | |
875 | goto finish_frag; |
876 | |
877 | finish_ok_pkt: |
878 | cnts->ok_pkt_bytes += ctx->total_size; |
879 | cnts->ok_pkt_cnt++; |
880 | finish_frag: |
881 | ctx->frag_cnt++; |
882 | if (is_last_frag) { |
883 | cnts->total_pkt_cnt++; |
884 | cnts->cont_pkt_cnt += (ctx->frag_cnt > 1); |
885 | gve_rx_ctx_clear(ctx); |
886 | } |
887 | } |
888 | |
889 | bool gve_rx_work_pending(struct gve_rx_ring *rx) |
890 | { |
891 | struct gve_rx_desc *desc; |
892 | __be16 flags_seq; |
893 | u32 next_idx; |
894 | |
895 | next_idx = rx->cnt & rx->mask; |
896 | desc = rx->desc.desc_ring + next_idx; |
897 | |
898 | flags_seq = desc->flags_seq; |
899 | |
900 | return (GVE_SEQNO(flags_seq) == rx->desc.seqno); |
901 | } |
902 | |
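/* Restock consumed slots in raw addressing mode: flip to the unused half of
 * the page when possible, reuse a fully released page in place, or free it
 * and allocate a replacement. Returns false if an inconsistent page refcount
 * was found.
 */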
903 | static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx) |
904 | { |
905 | int refill_target = rx->mask + 1; |
906 | u32 fill_cnt = rx->fill_cnt; |
907 | |
908 | while (fill_cnt - rx->cnt < refill_target) { |
909 | struct gve_rx_slot_page_info *page_info; |
910 | u32 idx = fill_cnt & rx->mask; |
911 | |
912 | page_info = &rx->data.page_info[idx]; |
913 | if (page_info->can_flip) { |
914 | /* The other half of the page is free because it was |
915 | * free when we processed the descriptor. Flip to it. |
916 | */ |
917 | union gve_rx_data_slot *data_slot = |
918 | &rx->data.data_ring[idx]; |
919 | |
			gve_rx_flip_buff(page_info, &data_slot->addr);
921 | page_info->can_flip = 0; |
922 | } else { |
923 | /* It is possible that the networking stack has already |
924 | * finished processing all outstanding packets in the buffer |
925 | * and it can be reused. |
926 | * Flipping is unnecessary here - if the networking stack still |
927 | * owns half the page it is impossible to tell which half. Either |
928 | * the whole page is free or it needs to be replaced. |
929 | */ |
930 | int recycle = gve_rx_can_recycle_buffer(page_info); |
931 | |
932 | if (recycle < 0) { |
933 | if (!rx->data.raw_addressing) |
934 | gve_schedule_reset(priv); |
935 | return false; |
936 | } |
937 | if (!recycle) { |
938 | /* We can't reuse the buffer - alloc a new one*/ |
939 | union gve_rx_data_slot *data_slot = |
940 | &rx->data.data_ring[idx]; |
941 | struct device *dev = &priv->pdev->dev; |
942 | gve_rx_free_buffer(dev, page_info, data_slot); |
943 | page_info->page = NULL; |
944 | if (gve_rx_alloc_buffer(priv, dev, page_info, |
945 | data_slot, rx)) { |
946 | break; |
947 | } |
948 | } |
949 | } |
950 | fill_cnt++; |
951 | } |
952 | rx->fill_cnt = fill_cnt; |
953 | return true; |
954 | } |
955 | |
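/* Main NAPI receive path: walk in-sequence descriptors up to budget (going
 * past budget only to finish a packet that is already partially processed),
 * then flush any XDP TX/redirect work, restock buffers and ring the
 * doorbell.
 */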
956 | static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget, |
957 | netdev_features_t feat) |
958 | { |
959 | u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT]; |
960 | u64 xdp_txs = rx->xdp_actions[XDP_TX]; |
961 | struct gve_rx_ctx *ctx = &rx->ctx; |
962 | struct gve_priv *priv = rx->gve; |
963 | struct gve_rx_cnts cnts = {0}; |
964 | struct gve_rx_desc *next_desc; |
965 | u32 idx = rx->cnt & rx->mask; |
966 | u32 work_done = 0; |
967 | |
968 | struct gve_rx_desc *desc = &rx->desc.desc_ring[idx]; |
969 | |
970 | // Exceed budget only if (and till) the inflight packet is consumed. |
971 | while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) && |
972 | (work_done < budget || ctx->frag_cnt)) { |
973 | next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask]; |
974 | prefetch(next_desc); |
975 | |
		gve_rx(rx, feat, desc, idx, &cnts);
977 | |
978 | rx->cnt++; |
979 | idx = rx->cnt & rx->mask; |
980 | desc = &rx->desc.desc_ring[idx]; |
		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
982 | work_done++; |
983 | } |
984 | |
985 | // The device will only send whole packets. |
986 | if (unlikely(ctx->frag_cnt)) { |
987 | struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi; |
988 | |
989 | napi_free_frags(napi); |
		gve_rx_ctx_clear(&rx->ctx);
		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
		gve_schedule_reset(rx->gve);
994 | } |
995 | |
996 | if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold) |
997 | return 0; |
998 | |
999 | if (work_done) { |
		u64_stats_update_begin(&rx->statss);
		rx->rpackets += cnts.ok_pkt_cnt;
		rx->rbytes += cnts.ok_pkt_bytes;
		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
		u64_stats_update_end(&rx->statss);
1006 | } |
1007 | |
1008 | if (xdp_txs != rx->xdp_actions[XDP_TX]) |
		gve_xdp_tx_flush(priv, rx->q_num);
1010 | |
1011 | if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT]) |
1012 | xdp_do_flush(); |
1013 | |
1014 | /* restock ring slots */ |
1015 | if (!rx->data.raw_addressing) { |
1016 | /* In QPL mode buffs are refilled as the desc are processed */ |
1017 | rx->fill_cnt += work_done; |
1018 | } else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) { |
1019 | /* In raw addressing mode buffs are only refilled if the avail |
1020 | * falls below a threshold. |
1021 | */ |
1022 | if (!gve_rx_refill_buffers(priv, rx)) |
1023 | return 0; |
1024 | |
1025 | /* If we were not able to completely refill buffers, we'll want |
1026 | * to schedule this queue for work again to refill buffers. |
1027 | */ |
1028 | if (rx->fill_cnt - rx->cnt <= rx->db_threshold) { |
1029 | gve_rx_write_doorbell(priv, rx); |
1030 | return budget; |
1031 | } |
1032 | } |
1033 | |
1034 | gve_rx_write_doorbell(priv, rx); |
1035 | return cnts.total_pkt_cnt; |
1036 | } |
1037 | |
1038 | int gve_rx_poll(struct gve_notify_block *block, int budget) |
1039 | { |
1040 | struct gve_rx_ring *rx = block->rx; |
1041 | netdev_features_t feat; |
1042 | int work_done = 0; |
1043 | |
1044 | feat = block->napi.dev->features; |
1045 | |
1046 | if (budget > 0) |
1047 | work_done = gve_clean_rx_done(rx, budget, feat); |
1048 | |
1049 | return work_done; |
1050 | } |
1051 | |