1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* Copyright(c) 2013 - 2018 Intel Corporation. */ |
3 | |
4 | #include <linux/prefetch.h> |
5 | |
6 | #include "iavf.h" |
7 | #include "iavf_trace.h" |
8 | #include "iavf_prototype.h" |
9 | |
10 | static __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, |
11 | u32 td_tag) |
12 | { |
13 | return cpu_to_le64(IAVF_TX_DESC_DTYPE_DATA | |
14 | ((u64)td_cmd << IAVF_TXD_QW1_CMD_SHIFT) | |
15 | ((u64)td_offset << IAVF_TXD_QW1_OFFSET_SHIFT) | |
16 | ((u64)size << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT) | |
17 | ((u64)td_tag << IAVF_TXD_QW1_L2TAG1_SHIFT)); |
18 | } |
19 | |
20 | #define IAVF_TXD_CMD (IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_RS) |
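
/* Illustrative sketch (not driver code): for a frame carried in a single
 * 256-byte buffer with no offloads and no VLAN insertion, the data
 * descriptor quadword would be composed roughly as
 *
 *	desc->cmd_type_offset_bsz = build_ctob(IAVF_TXD_CMD, 0, 256, 0);
 *
 * i.e. DTYPE_DATA with EOP | RS in the command field, a zero offset word,
 * 256 in the buffer-size field and no L2TAG1. "desc" here is a
 * hypothetical pointer to the descriptor being filled.
 */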
21 | |
22 | /** |
23 | * iavf_unmap_and_free_tx_resource - Release a Tx buffer |
24 | * @ring: the ring that owns the buffer |
25 | * @tx_buffer: the buffer to free |
26 | **/ |
27 | static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring, |
28 | struct iavf_tx_buffer *tx_buffer) |
29 | { |
30 | if (tx_buffer->skb) { |
31 | if (tx_buffer->tx_flags & IAVF_TX_FLAGS_FD_SB) |
			kfree(tx_buffer->raw_buf);
		else
			dev_kfree_skb_any(tx_buffer->skb);
35 | if (dma_unmap_len(tx_buffer, len)) |
36 | dma_unmap_single(ring->dev, |
37 | dma_unmap_addr(tx_buffer, dma), |
38 | dma_unmap_len(tx_buffer, len), |
39 | DMA_TO_DEVICE); |
40 | } else if (dma_unmap_len(tx_buffer, len)) { |
41 | dma_unmap_page(ring->dev, |
42 | dma_unmap_addr(tx_buffer, dma), |
43 | dma_unmap_len(tx_buffer, len), |
44 | DMA_TO_DEVICE); |
45 | } |
46 | |
47 | tx_buffer->next_to_watch = NULL; |
48 | tx_buffer->skb = NULL; |
49 | dma_unmap_len_set(tx_buffer, len, 0); |
50 | /* tx_buffer must be completely set up in the transmit path */ |
51 | } |
52 | |
53 | /** |
 * iavf_clean_tx_ring - Free any pending Tx buffers
55 | * @tx_ring: ring to be cleaned |
56 | **/ |
57 | static void iavf_clean_tx_ring(struct iavf_ring *tx_ring) |
58 | { |
59 | unsigned long bi_size; |
60 | u16 i; |
61 | |
62 | /* ring already cleared, nothing to do */ |
63 | if (!tx_ring->tx_bi) |
64 | return; |
65 | |
66 | /* Free all the Tx ring sk_buffs */ |
67 | for (i = 0; i < tx_ring->count; i++) |
		iavf_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
69 | |
70 | bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count; |
71 | memset(tx_ring->tx_bi, 0, bi_size); |
72 | |
73 | /* Zero out the descriptor ring */ |
74 | memset(tx_ring->desc, 0, tx_ring->size); |
75 | |
76 | tx_ring->next_to_use = 0; |
77 | tx_ring->next_to_clean = 0; |
78 | |
79 | if (!tx_ring->netdev) |
80 | return; |
81 | |
82 | /* cleanup Tx queue statistics */ |
	netdev_tx_reset_queue(txring_txq(tx_ring));
84 | } |
85 | |
86 | /** |
87 | * iavf_free_tx_resources - Free Tx resources per queue |
88 | * @tx_ring: Tx descriptor ring for a specific queue |
89 | * |
90 | * Free all transmit software resources |
91 | **/ |
92 | void iavf_free_tx_resources(struct iavf_ring *tx_ring) |
93 | { |
94 | iavf_clean_tx_ring(tx_ring); |
	kfree(tx_ring->tx_bi);
96 | tx_ring->tx_bi = NULL; |
97 | |
98 | if (tx_ring->desc) { |
		dma_free_coherent(tx_ring->dev, tx_ring->size,
				  tx_ring->desc, tx_ring->dma);
101 | tx_ring->desc = NULL; |
102 | } |
103 | } |
104 | |
105 | /** |
106 | * iavf_get_tx_pending - how many Tx descriptors not processed |
107 | * @ring: the ring of descriptors |
108 | * @in_sw: is tx_pending being checked in SW or HW |
109 | * |
110 | * Since there is no access to the ring head register |
111 | * in XL710, we need to use our local copies |
112 | **/ |
113 | static u32 iavf_get_tx_pending(struct iavf_ring *ring, bool in_sw) |
114 | { |
115 | u32 head, tail; |
116 | |
117 | /* underlying hardware might not allow access and/or always return |
118 | * 0 for the head/tail registers so just use the cached values |
119 | */ |
120 | head = ring->next_to_clean; |
121 | tail = ring->next_to_use; |
122 | |
123 | if (head != tail) |
124 | return (head < tail) ? |
125 | tail - head : (tail + ring->count - head); |
126 | |
127 | return 0; |
128 | } |
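
/* Worked example (illustrative): with a 512-descriptor ring where
 * next_to_clean == 500 and next_to_use == 10, head > tail so the count
 * wraps around the ring:
 *
 *	pending = tail + ring->count - head = 10 + 512 - 500 = 22
 *
 * i.e. 22 descriptors have been handed to hardware but not yet cleaned.
 */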
129 | |
130 | /** |
131 | * iavf_force_wb - Issue SW Interrupt so HW does a wb |
132 | * @vsi: the VSI we care about |
133 | * @q_vector: the vector on which to force writeback |
134 | **/ |
135 | static void iavf_force_wb(struct iavf_vsi *vsi, struct iavf_q_vector *q_vector) |
136 | { |
137 | u32 val = IAVF_VFINT_DYN_CTLN1_INTENA_MASK | |
138 | IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK | /* set noitr */ |
139 | IAVF_VFINT_DYN_CTLN1_SWINT_TRIG_MASK | |
140 | IAVF_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_MASK |
141 | /* allow 00 to be written to the index */; |
142 | |
143 | wr32(&vsi->back->hw, |
144 | IAVF_VFINT_DYN_CTLN1(q_vector->reg_idx), |
145 | val); |
146 | } |
147 | |
148 | /** |
 * iavf_detect_recover_hung - Function to detect and recover hung queues
150 | * @vsi: pointer to vsi struct with tx queues |
151 | * |
152 | * VSI has netdev and netdev has TX queues. This function is to check each of |
153 | * those TX queues if they are hung, trigger recovery by issuing SW interrupt. |
154 | **/ |
155 | void iavf_detect_recover_hung(struct iavf_vsi *vsi) |
156 | { |
157 | struct iavf_ring *tx_ring = NULL; |
158 | struct net_device *netdev; |
159 | unsigned int i; |
160 | int packets; |
161 | |
162 | if (!vsi) |
163 | return; |
164 | |
165 | if (test_bit(__IAVF_VSI_DOWN, vsi->state)) |
166 | return; |
167 | |
168 | netdev = vsi->netdev; |
169 | if (!netdev) |
170 | return; |
171 | |
	if (!netif_carrier_ok(netdev))
173 | return; |
174 | |
175 | for (i = 0; i < vsi->back->num_active_queues; i++) { |
176 | tx_ring = &vsi->back->tx_rings[i]; |
177 | if (tx_ring && tx_ring->desc) { |
178 | /* If packet counter has not changed the queue is |
179 | * likely stalled, so force an interrupt for this |
180 | * queue. |
181 | * |
182 | * prev_pkt_ctr would be negative if there was no |
183 | * pending work. |
184 | */ |
185 | packets = tx_ring->stats.packets & INT_MAX; |
186 | if (tx_ring->tx_stats.prev_pkt_ctr == packets) { |
				iavf_force_wb(vsi, tx_ring->q_vector);
188 | continue; |
189 | } |
190 | |
191 | /* Memory barrier between read of packet count and call |
192 | * to iavf_get_tx_pending() |
193 | */ |
194 | smp_rmb(); |
195 | tx_ring->tx_stats.prev_pkt_ctr = |
				iavf_get_tx_pending(tx_ring, true) ? packets : -1;
197 | } |
198 | } |
199 | } |
200 | |
201 | #define WB_STRIDE 4 |
202 | |
203 | /** |
204 | * iavf_clean_tx_irq - Reclaim resources after transmit completes |
205 | * @vsi: the VSI we care about |
206 | * @tx_ring: Tx ring to clean |
207 | * @napi_budget: Used to determine if we are in netpoll |
208 | * |
209 | * Returns true if there's any budget left (e.g. the clean is finished) |
210 | **/ |
211 | static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, |
212 | struct iavf_ring *tx_ring, int napi_budget) |
213 | { |
214 | int i = tx_ring->next_to_clean; |
215 | struct iavf_tx_buffer *tx_buf; |
216 | struct iavf_tx_desc *tx_desc; |
217 | unsigned int total_bytes = 0, total_packets = 0; |
218 | unsigned int budget = IAVF_DEFAULT_IRQ_WORK; |
219 | |
220 | tx_buf = &tx_ring->tx_bi[i]; |
221 | tx_desc = IAVF_TX_DESC(tx_ring, i); |
222 | i -= tx_ring->count; |
223 | |
224 | do { |
225 | struct iavf_tx_desc *eop_desc = tx_buf->next_to_watch; |
226 | |
227 | /* if next_to_watch is not set then there is no work pending */ |
228 | if (!eop_desc) |
229 | break; |
230 | |
231 | /* prevent any other reads prior to eop_desc */ |
232 | smp_rmb(); |
233 | |
234 | iavf_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf); |
235 | /* if the descriptor isn't done, no work yet to do */ |
236 | if (!(eop_desc->cmd_type_offset_bsz & |
237 | cpu_to_le64(IAVF_TX_DESC_DTYPE_DESC_DONE))) |
238 | break; |
239 | |
240 | /* clear next_to_watch to prevent false hangs */ |
241 | tx_buf->next_to_watch = NULL; |
242 | |
243 | /* update the statistics for this packet */ |
244 | total_bytes += tx_buf->bytecount; |
245 | total_packets += tx_buf->gso_segs; |
246 | |
247 | /* free the skb */ |
		napi_consume_skb(tx_buf->skb, napi_budget);
249 | |
250 | /* unmap skb header data */ |
251 | dma_unmap_single(tx_ring->dev, |
252 | dma_unmap_addr(tx_buf, dma), |
253 | dma_unmap_len(tx_buf, len), |
254 | DMA_TO_DEVICE); |
255 | |
256 | /* clear tx_buffer data */ |
257 | tx_buf->skb = NULL; |
258 | dma_unmap_len_set(tx_buf, len, 0); |
259 | |
260 | /* unmap remaining buffers */ |
261 | while (tx_desc != eop_desc) { |
262 | iavf_trace(clean_tx_irq_unmap, |
263 | tx_ring, tx_desc, tx_buf); |
264 | |
265 | tx_buf++; |
266 | tx_desc++; |
267 | i++; |
268 | if (unlikely(!i)) { |
269 | i -= tx_ring->count; |
270 | tx_buf = tx_ring->tx_bi; |
271 | tx_desc = IAVF_TX_DESC(tx_ring, 0); |
272 | } |
273 | |
274 | /* unmap any remaining paged data */ |
275 | if (dma_unmap_len(tx_buf, len)) { |
276 | dma_unmap_page(tx_ring->dev, |
277 | dma_unmap_addr(tx_buf, dma), |
278 | dma_unmap_len(tx_buf, len), |
279 | DMA_TO_DEVICE); |
280 | dma_unmap_len_set(tx_buf, len, 0); |
281 | } |
282 | } |
283 | |
284 | /* move us one more past the eop_desc for start of next pkt */ |
285 | tx_buf++; |
286 | tx_desc++; |
287 | i++; |
288 | if (unlikely(!i)) { |
289 | i -= tx_ring->count; |
290 | tx_buf = tx_ring->tx_bi; |
291 | tx_desc = IAVF_TX_DESC(tx_ring, 0); |
292 | } |
293 | |
294 | prefetch(tx_desc); |
295 | |
296 | /* update budget accounting */ |
297 | budget--; |
298 | } while (likely(budget)); |
299 | |
300 | i += tx_ring->count; |
301 | tx_ring->next_to_clean = i; |
	u64_stats_update_begin(&tx_ring->syncp);
	tx_ring->stats.bytes += total_bytes;
	tx_ring->stats.packets += total_packets;
	u64_stats_update_end(&tx_ring->syncp);
306 | tx_ring->q_vector->tx.total_bytes += total_bytes; |
307 | tx_ring->q_vector->tx.total_packets += total_packets; |
308 | |
309 | if (tx_ring->flags & IAVF_TXR_FLAGS_WB_ON_ITR) { |
310 | /* check to see if there are < 4 descriptors |
311 | * waiting to be written back, then kick the hardware to force |
312 | * them to be written back in case we stay in NAPI. |
313 | * In this mode on X722 we do not enable Interrupt. |
314 | */ |
		unsigned int j = iavf_get_tx_pending(tx_ring, false);
316 | |
317 | if (budget && |
318 | ((j / WB_STRIDE) == 0) && (j > 0) && |
319 | !test_bit(__IAVF_VSI_DOWN, vsi->state) && |
320 | (IAVF_DESC_UNUSED(tx_ring) != tx_ring->count)) |
321 | tx_ring->arm_wb = true; |
322 | } |
323 | |
324 | /* notify netdev of completed buffers */ |
	netdev_tx_completed_queue(txring_txq(tx_ring),
				  total_packets, total_bytes);
327 | |
328 | #define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2)) |
329 | if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) && |
330 | (IAVF_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) { |
331 | /* Make sure that anybody stopping the queue after this |
332 | * sees the new next_to_clean. |
333 | */ |
334 | smp_mb(); |
		if (__netif_subqueue_stopped(tx_ring->netdev,
					     tx_ring->queue_index) &&
		    !test_bit(__IAVF_VSI_DOWN, vsi->state)) {
			netif_wake_subqueue(tx_ring->netdev,
					    tx_ring->queue_index);
340 | ++tx_ring->tx_stats.restart_queue; |
341 | } |
342 | } |
343 | |
344 | return !!budget; |
345 | } |
346 | |
347 | /** |
348 | * iavf_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled |
349 | * @vsi: the VSI we care about |
350 | * @q_vector: the vector on which to enable writeback |
351 | * |
352 | **/ |
353 | static void iavf_enable_wb_on_itr(struct iavf_vsi *vsi, |
354 | struct iavf_q_vector *q_vector) |
355 | { |
356 | u16 flags = q_vector->tx.ring[0].flags; |
357 | u32 val; |
358 | |
359 | if (!(flags & IAVF_TXR_FLAGS_WB_ON_ITR)) |
360 | return; |
361 | |
362 | if (q_vector->arm_wb_state) |
363 | return; |
364 | |
365 | val = IAVF_VFINT_DYN_CTLN1_WB_ON_ITR_MASK | |
366 | IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK; /* set noitr */ |
367 | |
368 | wr32(&vsi->back->hw, |
369 | IAVF_VFINT_DYN_CTLN1(q_vector->reg_idx), val); |
370 | q_vector->arm_wb_state = true; |
371 | } |
372 | |
373 | static bool iavf_container_is_rx(struct iavf_q_vector *q_vector, |
374 | struct iavf_ring_container *rc) |
375 | { |
376 | return &q_vector->rx == rc; |
377 | } |
378 | |
379 | #define IAVF_AIM_MULTIPLIER_100G 2560 |
380 | #define IAVF_AIM_MULTIPLIER_50G 1280 |
381 | #define IAVF_AIM_MULTIPLIER_40G 1024 |
382 | #define IAVF_AIM_MULTIPLIER_20G 512 |
383 | #define IAVF_AIM_MULTIPLIER_10G 256 |
384 | #define IAVF_AIM_MULTIPLIER_1G 32 |
385 | |
386 | static unsigned int iavf_mbps_itr_multiplier(u32 speed_mbps) |
387 | { |
388 | switch (speed_mbps) { |
389 | case SPEED_100000: |
390 | return IAVF_AIM_MULTIPLIER_100G; |
391 | case SPEED_50000: |
392 | return IAVF_AIM_MULTIPLIER_50G; |
393 | case SPEED_40000: |
394 | return IAVF_AIM_MULTIPLIER_40G; |
395 | case SPEED_25000: |
396 | case SPEED_20000: |
397 | return IAVF_AIM_MULTIPLIER_20G; |
398 | case SPEED_10000: |
399 | default: |
400 | return IAVF_AIM_MULTIPLIER_10G; |
401 | case SPEED_1000: |
402 | case SPEED_100: |
403 | return IAVF_AIM_MULTIPLIER_1G; |
404 | } |
405 | } |
406 | |
407 | static unsigned int |
408 | iavf_virtchnl_itr_multiplier(enum virtchnl_link_speed speed_virtchnl) |
409 | { |
410 | switch (speed_virtchnl) { |
411 | case VIRTCHNL_LINK_SPEED_40GB: |
412 | return IAVF_AIM_MULTIPLIER_40G; |
413 | case VIRTCHNL_LINK_SPEED_25GB: |
414 | case VIRTCHNL_LINK_SPEED_20GB: |
415 | return IAVF_AIM_MULTIPLIER_20G; |
416 | case VIRTCHNL_LINK_SPEED_10GB: |
417 | default: |
418 | return IAVF_AIM_MULTIPLIER_10G; |
419 | case VIRTCHNL_LINK_SPEED_1GB: |
420 | case VIRTCHNL_LINK_SPEED_100MB: |
421 | return IAVF_AIM_MULTIPLIER_1G; |
422 | } |
423 | } |
424 | |
425 | static unsigned int iavf_itr_divisor(struct iavf_adapter *adapter) |
426 | { |
427 | if (ADV_LINK_SUPPORT(adapter)) |
428 | return IAVF_ITR_ADAPTIVE_MIN_INC * |
			iavf_mbps_itr_multiplier(adapter->link_speed_mbps);
430 | else |
431 | return IAVF_ITR_ADAPTIVE_MIN_INC * |
			iavf_virtchnl_itr_multiplier(adapter->link_speed);
433 | } |
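
/* Example (a sketch; the exact constants live in iavf_txrx.h): on a 40G
 * link with ADV_LINK_SUPPORT the divisor becomes
 *
 *	IAVF_ITR_ADAPTIVE_MIN_INC * IAVF_AIM_MULTIPLIER_40G
 *
 * so, assuming IAVF_ITR_ADAPTIVE_MIN_INC is 0x0002, a divisor of 2048.
 * Faster links get a larger divisor, which shrinks the ITR increment
 * computed from avg_wire_size in iavf_update_itr() below.
 */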
434 | |
435 | /** |
436 | * iavf_update_itr - update the dynamic ITR value based on statistics |
437 | * @q_vector: structure containing interrupt and ring information |
438 | * @rc: structure containing ring performance data |
439 | * |
440 | * Stores a new ITR value based on packets and byte |
441 | * counts during the last interrupt. The advantage of per interrupt |
442 | * computation is faster updates and more accurate ITR for the current |
443 | * traffic pattern. Constants in this function were computed |
444 | * based on theoretical maximum wire speed and thresholds were set based |
445 | * on testing data as well as attempting to minimize response time |
446 | * while increasing bulk throughput. |
447 | **/ |
448 | static void iavf_update_itr(struct iavf_q_vector *q_vector, |
449 | struct iavf_ring_container *rc) |
450 | { |
451 | unsigned int avg_wire_size, packets, bytes, itr; |
452 | unsigned long next_update = jiffies; |
453 | |
454 | /* If we don't have any rings just leave ourselves set for maximum |
455 | * possible latency so we take ourselves out of the equation. |
456 | */ |
457 | if (!rc->ring || !ITR_IS_DYNAMIC(rc->ring->itr_setting)) |
458 | return; |
459 | |
460 | /* For Rx we want to push the delay up and default to low latency. |
461 | * for Tx we want to pull the delay down and default to high latency. |
462 | */ |
463 | itr = iavf_container_is_rx(q_vector, rc) ? |
464 | IAVF_ITR_ADAPTIVE_MIN_USECS | IAVF_ITR_ADAPTIVE_LATENCY : |
465 | IAVF_ITR_ADAPTIVE_MAX_USECS | IAVF_ITR_ADAPTIVE_LATENCY; |
466 | |
467 | /* If we didn't update within up to 1 - 2 jiffies we can assume |
468 | * that either packets are coming in so slow there hasn't been |
469 | * any work, or that there is so much work that NAPI is dealing |
470 | * with interrupt moderation and we don't need to do anything. |
471 | */ |
472 | if (time_after(next_update, rc->next_update)) |
473 | goto clear_counts; |
474 | |
475 | /* If itr_countdown is set it means we programmed an ITR within |
476 | * the last 4 interrupt cycles. This has a side effect of us |
477 | * potentially firing an early interrupt. In order to work around |
478 | * this we need to throw out any data received for a few |
479 | * interrupts following the update. |
480 | */ |
481 | if (q_vector->itr_countdown) { |
482 | itr = rc->target_itr; |
483 | goto clear_counts; |
484 | } |
485 | |
486 | packets = rc->total_packets; |
487 | bytes = rc->total_bytes; |
488 | |
489 | if (iavf_container_is_rx(q_vector, rc)) { |
		/* If Rx and there are 1 to 4 packets and bytes are less than
491 | * 9000 assume insufficient data to use bulk rate limiting |
492 | * approach unless Tx is already in bulk rate limiting. We |
493 | * are likely latency driven. |
494 | */ |
495 | if (packets && packets < 4 && bytes < 9000 && |
496 | (q_vector->tx.target_itr & IAVF_ITR_ADAPTIVE_LATENCY)) { |
497 | itr = IAVF_ITR_ADAPTIVE_LATENCY; |
498 | goto adjust_by_size; |
499 | } |
500 | } else if (packets < 4) { |
501 | /* If we have Tx and Rx ITR maxed and Tx ITR is running in |
502 | * bulk mode and we are receiving 4 or fewer packets just |
503 | * reset the ITR_ADAPTIVE_LATENCY bit for latency mode so |
504 | * that the Rx can relax. |
505 | */ |
506 | if (rc->target_itr == IAVF_ITR_ADAPTIVE_MAX_USECS && |
507 | (q_vector->rx.target_itr & IAVF_ITR_MASK) == |
508 | IAVF_ITR_ADAPTIVE_MAX_USECS) |
509 | goto clear_counts; |
510 | } else if (packets > 32) { |
511 | /* If we have processed over 32 packets in a single interrupt |
512 | * for Tx assume we need to switch over to "bulk" mode. |
513 | */ |
514 | rc->target_itr &= ~IAVF_ITR_ADAPTIVE_LATENCY; |
515 | } |
516 | |
517 | /* We have no packets to actually measure against. This means |
518 | * either one of the other queues on this vector is active or |
519 | * we are a Tx queue doing TSO with too high of an interrupt rate. |
520 | * |
521 | * Between 4 and 56 we can assume that our current interrupt delay |
522 | * is only slightly too low. As such we should increase it by a small |
523 | * fixed amount. |
524 | */ |
525 | if (packets < 56) { |
526 | itr = rc->target_itr + IAVF_ITR_ADAPTIVE_MIN_INC; |
527 | if ((itr & IAVF_ITR_MASK) > IAVF_ITR_ADAPTIVE_MAX_USECS) { |
528 | itr &= IAVF_ITR_ADAPTIVE_LATENCY; |
529 | itr += IAVF_ITR_ADAPTIVE_MAX_USECS; |
530 | } |
531 | goto clear_counts; |
532 | } |
533 | |
534 | if (packets <= 256) { |
535 | itr = min(q_vector->tx.current_itr, q_vector->rx.current_itr); |
536 | itr &= IAVF_ITR_MASK; |
537 | |
538 | /* Between 56 and 112 is our "goldilocks" zone where we are |
539 | * working out "just right". Just report that our current |
540 | * ITR is good for us. |
541 | */ |
542 | if (packets <= 112) |
543 | goto clear_counts; |
544 | |
545 | /* If packet count is 128 or greater we are likely looking |
546 | * at a slight overrun of the delay we want. Try halving |
547 | * our delay to see if that will cut the number of packets |
548 | * in half per interrupt. |
549 | */ |
550 | itr /= 2; |
551 | itr &= IAVF_ITR_MASK; |
552 | if (itr < IAVF_ITR_ADAPTIVE_MIN_USECS) |
553 | itr = IAVF_ITR_ADAPTIVE_MIN_USECS; |
554 | |
555 | goto clear_counts; |
556 | } |
557 | |
558 | /* The paths below assume we are dealing with a bulk ITR since |
559 | * number of packets is greater than 256. We are just going to have |
560 | * to compute a value and try to bring the count under control, |
561 | * though for smaller packet sizes there isn't much we can do as |
562 | * NAPI polling will likely be kicking in sooner rather than later. |
563 | */ |
564 | itr = IAVF_ITR_ADAPTIVE_BULK; |
565 | |
566 | adjust_by_size: |
567 | /* If packet counts are 256 or greater we can assume we have a gross |
568 | * overestimation of what the rate should be. Instead of trying to fine |
569 | * tune it just use the formula below to try and dial in an exact value |
	 * given the current packet size of the frame.
571 | */ |
572 | avg_wire_size = bytes / packets; |
573 | |
574 | /* The following is a crude approximation of: |
575 | * wmem_default / (size + overhead) = desired_pkts_per_int |
576 | * rate / bits_per_byte / (size + ethernet overhead) = pkt_rate |
577 | * (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value |
578 | * |
579 | * Assuming wmem_default is 212992 and overhead is 640 bytes per |
580 | * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the |
581 | * formula down to |
582 | * |
583 | * (170 * (size + 24)) / (size + 640) = ITR |
584 | * |
585 | * We first do some math on the packet size and then finally bitshift |
586 | * by 8 after rounding up. We also have to account for PCIe link speed |
587 | * difference as ITR scales based on this. |
588 | */ |
589 | if (avg_wire_size <= 60) { |
590 | /* Start at 250k ints/sec */ |
591 | avg_wire_size = 4096; |
592 | } else if (avg_wire_size <= 380) { |
593 | /* 250K ints/sec to 60K ints/sec */ |
594 | avg_wire_size *= 40; |
595 | avg_wire_size += 1696; |
596 | } else if (avg_wire_size <= 1084) { |
597 | /* 60K ints/sec to 36K ints/sec */ |
598 | avg_wire_size *= 15; |
599 | avg_wire_size += 11452; |
600 | } else if (avg_wire_size <= 1980) { |
601 | /* 36K ints/sec to 30K ints/sec */ |
602 | avg_wire_size *= 5; |
603 | avg_wire_size += 22420; |
604 | } else { |
605 | /* plateau at a limit of 30K ints/sec */ |
606 | avg_wire_size = 32256; |
607 | } |
608 | |
609 | /* If we are in low latency mode halve our delay which doubles the |
610 | * rate to somewhere between 100K to 16K ints/sec |
611 | */ |
612 | if (itr & IAVF_ITR_ADAPTIVE_LATENCY) |
613 | avg_wire_size /= 2; |
614 | |
615 | /* Resultant value is 256 times larger than it needs to be. This |
616 | * gives us room to adjust the value as needed to either increase |
617 | * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc. |
618 | * |
619 | * Use addition as we have already recorded the new latency flag |
620 | * for the ITR value. |
621 | */ |
622 | itr += DIV_ROUND_UP(avg_wire_size, |
623 | iavf_itr_divisor(q_vector->adapter)) * |
624 | IAVF_ITR_ADAPTIVE_MIN_INC; |
625 | |
626 | if ((itr & IAVF_ITR_MASK) > IAVF_ITR_ADAPTIVE_MAX_USECS) { |
627 | itr &= IAVF_ITR_ADAPTIVE_LATENCY; |
628 | itr += IAVF_ITR_ADAPTIVE_MAX_USECS; |
629 | } |
630 | |
631 | clear_counts: |
632 | /* write back value */ |
633 | rc->target_itr = itr; |
634 | |
635 | /* next update should occur within next jiffy */ |
636 | rc->next_update = next_update + 1; |
637 | |
638 | rc->total_bytes = 0; |
639 | rc->total_packets = 0; |
640 | } |
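
/* Worked example for the adjust_by_size path above (illustrative only):
 * a bulk Rx stream of full-size frames with avg_wire_size of about 1500
 * falls in the 1084..1980 bucket, so
 *
 *	avg_wire_size = 1500 * 5 + 22420 = 29920	(near the 30K ints/sec plateau)
 *
 * With the latency flag clear the value is not halved, and the final ITR
 * becomes roughly DIV_ROUND_UP(29920, iavf_itr_divisor(adapter)) *
 * IAVF_ITR_ADAPTIVE_MIN_INC, i.e. a larger (slower) interval on slow
 * links and a smaller (faster) one on 100G links.
 */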
641 | |
642 | /** |
643 | * iavf_setup_tx_descriptors - Allocate the Tx descriptors |
644 | * @tx_ring: the tx ring to set up |
645 | * |
646 | * Return 0 on success, negative on error |
647 | **/ |
648 | int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) |
649 | { |
650 | struct device *dev = tx_ring->dev; |
651 | int bi_size; |
652 | |
653 | if (!dev) |
654 | return -ENOMEM; |
655 | |
656 | /* warn if we are about to overwrite the pointer */ |
657 | WARN_ON(tx_ring->tx_bi); |
658 | bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count; |
	tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
660 | if (!tx_ring->tx_bi) |
661 | goto err; |
662 | |
663 | /* round up to nearest 4K */ |
664 | tx_ring->size = tx_ring->count * sizeof(struct iavf_tx_desc); |
665 | tx_ring->size = ALIGN(tx_ring->size, 4096); |
	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
					   &tx_ring->dma, GFP_KERNEL);
668 | if (!tx_ring->desc) { |
		dev_info(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
670 | tx_ring->size); |
671 | goto err; |
672 | } |
673 | |
674 | tx_ring->next_to_use = 0; |
675 | tx_ring->next_to_clean = 0; |
676 | tx_ring->tx_stats.prev_pkt_ctr = -1; |
677 | return 0; |
678 | |
679 | err: |
	kfree(tx_ring->tx_bi);
681 | tx_ring->tx_bi = NULL; |
682 | return -ENOMEM; |
683 | } |
684 | |
685 | /** |
686 | * iavf_clean_rx_ring - Free Rx buffers |
687 | * @rx_ring: ring to be cleaned |
688 | **/ |
689 | static void iavf_clean_rx_ring(struct iavf_ring *rx_ring) |
690 | { |
691 | unsigned long bi_size; |
692 | u16 i; |
693 | |
694 | /* ring already cleared, nothing to do */ |
695 | if (!rx_ring->rx_bi) |
696 | return; |
697 | |
698 | if (rx_ring->skb) { |
699 | dev_kfree_skb(rx_ring->skb); |
700 | rx_ring->skb = NULL; |
701 | } |
702 | |
703 | /* Free all the Rx ring sk_buffs */ |
704 | for (i = 0; i < rx_ring->count; i++) { |
705 | struct iavf_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; |
706 | |
707 | if (!rx_bi->page) |
708 | continue; |
709 | |
710 | /* Invalidate cache lines that may have been written to by |
711 | * device so that we avoid corrupting memory. |
712 | */ |
		dma_sync_single_range_for_cpu(rx_ring->dev,
					      rx_bi->dma,
					      rx_bi->page_offset,
					      rx_ring->rx_buf_len,
					      DMA_FROM_DEVICE);

		/* free resources associated with mapping */
		dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma,
				     iavf_rx_pg_size(rx_ring),
				     DMA_FROM_DEVICE,
				     IAVF_RX_DMA_ATTR);

		__page_frag_cache_drain(rx_bi->page, rx_bi->pagecnt_bias);
726 | |
727 | rx_bi->page = NULL; |
728 | rx_bi->page_offset = 0; |
729 | } |
730 | |
731 | bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count; |
732 | memset(rx_ring->rx_bi, 0, bi_size); |
733 | |
734 | /* Zero out the descriptor ring */ |
735 | memset(rx_ring->desc, 0, rx_ring->size); |
736 | |
737 | rx_ring->next_to_alloc = 0; |
738 | rx_ring->next_to_clean = 0; |
739 | rx_ring->next_to_use = 0; |
740 | } |
741 | |
742 | /** |
743 | * iavf_free_rx_resources - Free Rx resources |
744 | * @rx_ring: ring to clean the resources from |
745 | * |
746 | * Free all receive software resources |
747 | **/ |
748 | void iavf_free_rx_resources(struct iavf_ring *rx_ring) |
749 | { |
750 | iavf_clean_rx_ring(rx_ring); |
	kfree(rx_ring->rx_bi);
	rx_ring->rx_bi = NULL;

	if (rx_ring->desc) {
		dma_free_coherent(rx_ring->dev, rx_ring->size,
				  rx_ring->desc, rx_ring->dma);
757 | rx_ring->desc = NULL; |
758 | } |
759 | } |
760 | |
761 | /** |
762 | * iavf_setup_rx_descriptors - Allocate Rx descriptors |
763 | * @rx_ring: Rx descriptor ring (for a specific queue) to setup |
764 | * |
765 | * Returns 0 on success, negative on failure |
766 | **/ |
767 | int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) |
768 | { |
769 | struct device *dev = rx_ring->dev; |
770 | int bi_size; |
771 | |
772 | /* warn if we are about to overwrite the pointer */ |
773 | WARN_ON(rx_ring->rx_bi); |
774 | bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count; |
	rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
776 | if (!rx_ring->rx_bi) |
777 | goto err; |
778 | |
	u64_stats_init(&rx_ring->syncp);
780 | |
781 | /* Round up to nearest 4K */ |
782 | rx_ring->size = rx_ring->count * sizeof(union iavf_32byte_rx_desc); |
783 | rx_ring->size = ALIGN(rx_ring->size, 4096); |
	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
					   &rx_ring->dma, GFP_KERNEL);
786 | |
787 | if (!rx_ring->desc) { |
		dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
789 | rx_ring->size); |
790 | goto err; |
791 | } |
792 | |
793 | rx_ring->next_to_alloc = 0; |
794 | rx_ring->next_to_clean = 0; |
795 | rx_ring->next_to_use = 0; |
796 | |
797 | return 0; |
798 | err: |
	kfree(rx_ring->rx_bi);
800 | rx_ring->rx_bi = NULL; |
801 | return -ENOMEM; |
802 | } |
803 | |
804 | /** |
805 | * iavf_release_rx_desc - Store the new tail and head values |
806 | * @rx_ring: ring to bump |
807 | * @val: new head index |
808 | **/ |
809 | static void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) |
810 | { |
811 | rx_ring->next_to_use = val; |
812 | |
813 | /* update next to alloc since we have filled the ring */ |
814 | rx_ring->next_to_alloc = val; |
815 | |
816 | /* Force memory writes to complete before letting h/w |
817 | * know there are new descriptors to fetch. (Only |
818 | * applicable for weak-ordered memory model archs, |
819 | * such as IA-64). |
820 | */ |
821 | wmb(); |
	writel(val, rx_ring->tail);
823 | } |
824 | |
825 | /** |
826 | * iavf_rx_offset - Return expected offset into page to access data |
827 | * @rx_ring: Ring we are requesting offset of |
828 | * |
829 | * Returns the offset value for ring into the data buffer. |
830 | */ |
831 | static unsigned int iavf_rx_offset(struct iavf_ring *rx_ring) |
832 | { |
	return ring_uses_build_skb(rx_ring) ? IAVF_SKB_PAD : 0;
834 | } |
835 | |
836 | /** |
837 | * iavf_alloc_mapped_page - recycle or make a new page |
838 | * @rx_ring: ring to use |
839 | * @bi: rx_buffer struct to modify |
840 | * |
841 | * Returns true if the page was successfully allocated or |
842 | * reused. |
843 | **/ |
844 | static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, |
845 | struct iavf_rx_buffer *bi) |
846 | { |
847 | struct page *page = bi->page; |
848 | dma_addr_t dma; |
849 | |
850 | /* since we are recycling buffers we should seldom need to alloc */ |
851 | if (likely(page)) { |
852 | rx_ring->rx_stats.page_reuse_count++; |
853 | return true; |
854 | } |
855 | |
856 | /* alloc new page for storage */ |
	page = dev_alloc_pages(iavf_rx_pg_order(rx_ring));
858 | if (unlikely(!page)) { |
859 | rx_ring->rx_stats.alloc_page_failed++; |
860 | return false; |
861 | } |
862 | |
863 | /* map page for use */ |
	dma = dma_map_page_attrs(rx_ring->dev, page, 0,
				 iavf_rx_pg_size(rx_ring),
				 DMA_FROM_DEVICE,
				 IAVF_RX_DMA_ATTR);
868 | |
869 | /* if mapping failed free memory back to system since |
870 | * there isn't much point in holding memory we can't use |
871 | */ |
	if (dma_mapping_error(rx_ring->dev, dma)) {
		__free_pages(page, iavf_rx_pg_order(rx_ring));
874 | rx_ring->rx_stats.alloc_page_failed++; |
875 | return false; |
876 | } |
877 | |
878 | bi->dma = dma; |
879 | bi->page = page; |
880 | bi->page_offset = iavf_rx_offset(rx_ring); |
881 | |
882 | /* initialize pagecnt_bias to 1 representing we fully own page */ |
883 | bi->pagecnt_bias = 1; |
884 | |
885 | return true; |
886 | } |
887 | |
888 | /** |
889 | * iavf_receive_skb - Send a completed packet up the stack |
890 | * @rx_ring: rx ring in play |
891 | * @skb: packet to send up |
892 | * @vlan_tag: vlan tag for packet |
893 | **/ |
894 | static void iavf_receive_skb(struct iavf_ring *rx_ring, |
895 | struct sk_buff *skb, u16 vlan_tag) |
896 | { |
897 | struct iavf_q_vector *q_vector = rx_ring->q_vector; |
898 | |
899 | if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) && |
900 | (vlan_tag & VLAN_VID_MASK)) |
		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
	else if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_STAG_RX) &&
		 vlan_tag & VLAN_VID_MASK)
		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag);

	napi_gro_receive(&q_vector->napi, skb);
907 | } |
908 | |
909 | /** |
910 | * iavf_alloc_rx_buffers - Replace used receive buffers |
911 | * @rx_ring: ring to place buffers on |
912 | * @cleaned_count: number of buffers to replace |
913 | * |
914 | * Returns false if all allocations were successful, true if any fail |
915 | **/ |
916 | bool iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u16 cleaned_count) |
917 | { |
918 | u16 ntu = rx_ring->next_to_use; |
919 | union iavf_rx_desc *rx_desc; |
920 | struct iavf_rx_buffer *bi; |
921 | |
922 | /* do nothing if no valid netdev defined */ |
923 | if (!rx_ring->netdev || !cleaned_count) |
924 | return false; |
925 | |
926 | rx_desc = IAVF_RX_DESC(rx_ring, ntu); |
927 | bi = &rx_ring->rx_bi[ntu]; |
928 | |
929 | do { |
930 | if (!iavf_alloc_mapped_page(rx_ring, bi)) |
931 | goto no_buffers; |
932 | |
933 | /* sync the buffer for use by the device */ |
		dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
						 bi->page_offset,
						 rx_ring->rx_buf_len,
						 DMA_FROM_DEVICE);
938 | |
939 | /* Refresh the desc even if buffer_addrs didn't change |
940 | * because each write-back erases this info. |
941 | */ |
942 | rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset); |
943 | |
944 | rx_desc++; |
945 | bi++; |
946 | ntu++; |
947 | if (unlikely(ntu == rx_ring->count)) { |
948 | rx_desc = IAVF_RX_DESC(rx_ring, 0); |
949 | bi = rx_ring->rx_bi; |
950 | ntu = 0; |
951 | } |
952 | |
953 | /* clear the status bits for the next_to_use descriptor */ |
954 | rx_desc->wb.qword1.status_error_len = 0; |
955 | |
956 | cleaned_count--; |
957 | } while (cleaned_count); |
958 | |
959 | if (rx_ring->next_to_use != ntu) |
		iavf_release_rx_desc(rx_ring, ntu);
961 | |
962 | return false; |
963 | |
964 | no_buffers: |
965 | if (rx_ring->next_to_use != ntu) |
		iavf_release_rx_desc(rx_ring, ntu);
967 | |
968 | /* make sure to come back via polling to try again after |
969 | * allocation failure |
970 | */ |
971 | return true; |
972 | } |
973 | |
974 | /** |
975 | * iavf_rx_checksum - Indicate in skb if hw indicated a good cksum |
976 | * @vsi: the VSI we care about |
977 | * @skb: skb currently being received and modified |
978 | * @rx_desc: the receive descriptor |
979 | **/ |
980 | static void iavf_rx_checksum(struct iavf_vsi *vsi, |
981 | struct sk_buff *skb, |
982 | union iavf_rx_desc *rx_desc) |
983 | { |
984 | struct iavf_rx_ptype_decoded decoded; |
985 | u32 rx_error, rx_status; |
986 | bool ipv4, ipv6; |
987 | u8 ptype; |
988 | u64 qword; |
989 | |
990 | qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); |
991 | ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> IAVF_RXD_QW1_PTYPE_SHIFT; |
992 | rx_error = (qword & IAVF_RXD_QW1_ERROR_MASK) >> |
993 | IAVF_RXD_QW1_ERROR_SHIFT; |
994 | rx_status = (qword & IAVF_RXD_QW1_STATUS_MASK) >> |
995 | IAVF_RXD_QW1_STATUS_SHIFT; |
996 | decoded = decode_rx_desc_ptype(ptype); |
997 | |
998 | skb->ip_summed = CHECKSUM_NONE; |
999 | |
1000 | skb_checksum_none_assert(skb); |
1001 | |
1002 | /* Rx csum enabled and ip headers found? */ |
1003 | if (!(vsi->netdev->features & NETIF_F_RXCSUM)) |
1004 | return; |
1005 | |
1006 | /* did the hardware decode the packet and checksum? */ |
1007 | if (!(rx_status & BIT(IAVF_RX_DESC_STATUS_L3L4P_SHIFT))) |
1008 | return; |
1009 | |
1010 | /* both known and outer_ip must be set for the below code to work */ |
1011 | if (!(decoded.known && decoded.outer_ip)) |
1012 | return; |
1013 | |
1014 | ipv4 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) && |
1015 | (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV4); |
1016 | ipv6 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) && |
1017 | (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV6); |
1018 | |
1019 | if (ipv4 && |
1020 | (rx_error & (BIT(IAVF_RX_DESC_ERROR_IPE_SHIFT) | |
1021 | BIT(IAVF_RX_DESC_ERROR_EIPE_SHIFT)))) |
1022 | goto checksum_fail; |
1023 | |
1024 | /* likely incorrect csum if alternate IP extension headers found */ |
1025 | if (ipv6 && |
1026 | rx_status & BIT(IAVF_RX_DESC_STATUS_IPV6EXADD_SHIFT)) |
1027 | /* don't increment checksum err here, non-fatal err */ |
1028 | return; |
1029 | |
1030 | /* there was some L4 error, count error and punt packet to the stack */ |
1031 | if (rx_error & BIT(IAVF_RX_DESC_ERROR_L4E_SHIFT)) |
1032 | goto checksum_fail; |
1033 | |
1034 | /* handle packets that were not able to be checksummed due |
1035 | * to arrival speed, in this case the stack can compute |
1036 | * the csum. |
1037 | */ |
1038 | if (rx_error & BIT(IAVF_RX_DESC_ERROR_PPRS_SHIFT)) |
1039 | return; |
1040 | |
1041 | /* Only report checksum unnecessary for TCP, UDP, or SCTP */ |
1042 | switch (decoded.inner_prot) { |
1043 | case IAVF_RX_PTYPE_INNER_PROT_TCP: |
1044 | case IAVF_RX_PTYPE_INNER_PROT_UDP: |
1045 | case IAVF_RX_PTYPE_INNER_PROT_SCTP: |
1046 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
1047 | fallthrough; |
1048 | default: |
1049 | break; |
1050 | } |
1051 | |
1052 | return; |
1053 | |
1054 | checksum_fail: |
1055 | vsi->back->hw_csum_rx_error++; |
1056 | } |
1057 | |
1058 | /** |
1059 | * iavf_ptype_to_htype - get a hash type |
1060 | * @ptype: the ptype value from the descriptor |
1061 | * |
1062 | * Returns a hash type to be used by skb_set_hash |
1063 | **/ |
1064 | static int iavf_ptype_to_htype(u8 ptype) |
1065 | { |
1066 | struct iavf_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype); |
1067 | |
1068 | if (!decoded.known) |
1069 | return PKT_HASH_TYPE_NONE; |
1070 | |
1071 | if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP && |
1072 | decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY4) |
1073 | return PKT_HASH_TYPE_L4; |
1074 | else if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP && |
1075 | decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY3) |
1076 | return PKT_HASH_TYPE_L3; |
1077 | else |
1078 | return PKT_HASH_TYPE_L2; |
1079 | } |
1080 | |
1081 | /** |
1082 | * iavf_rx_hash - set the hash value in the skb |
1083 | * @ring: descriptor ring |
1084 | * @rx_desc: specific descriptor |
1085 | * @skb: skb currently being received and modified |
1086 | * @rx_ptype: Rx packet type |
1087 | **/ |
1088 | static void iavf_rx_hash(struct iavf_ring *ring, |
1089 | union iavf_rx_desc *rx_desc, |
1090 | struct sk_buff *skb, |
1091 | u8 rx_ptype) |
1092 | { |
1093 | u32 hash; |
	const __le64 rss_mask =
1095 | cpu_to_le64((u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH << |
1096 | IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT); |
1097 | |
1098 | if (!(ring->netdev->features & NETIF_F_RXHASH)) |
1099 | return; |
1100 | |
1101 | if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) { |
1102 | hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); |
		skb_set_hash(skb, hash, iavf_ptype_to_htype(rx_ptype));
1104 | } |
1105 | } |
1106 | |
1107 | /** |
1108 | * iavf_process_skb_fields - Populate skb header fields from Rx descriptor |
1109 | * @rx_ring: rx descriptor ring packet is being transacted on |
1110 | * @rx_desc: pointer to the EOP Rx descriptor |
1111 | * @skb: pointer to current skb being populated |
1112 | * @rx_ptype: the packet type decoded by hardware |
1113 | * |
1114 | * This function checks the ring, descriptor, and packet information in |
1115 | * order to populate the hash, checksum, VLAN, protocol, and |
1116 | * other fields within the skb. |
1117 | **/ |
1118 | static void |
1119 | iavf_process_skb_fields(struct iavf_ring *rx_ring, |
1120 | union iavf_rx_desc *rx_desc, struct sk_buff *skb, |
1121 | u8 rx_ptype) |
1122 | { |
	iavf_rx_hash(rx_ring, rx_desc, skb, rx_ptype);

	iavf_rx_checksum(rx_ring->vsi, skb, rx_desc);

	skb_record_rx_queue(skb, rx_ring->queue_index);

	/* modifies the skb - consumes the enet header */
	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
1131 | } |
1132 | |
1133 | /** |
1134 | * iavf_cleanup_headers - Correct empty headers |
1135 | * @rx_ring: rx descriptor ring packet is being transacted on |
1136 | * @skb: pointer to current skb being fixed |
1137 | * |
1138 | * Also address the case where we are pulling data in on pages only |
1139 | * and as such no data is present in the skb header. |
1140 | * |
1141 | * In addition if skb is not at least 60 bytes we need to pad it so that |
1142 | * it is large enough to qualify as a valid Ethernet frame. |
1143 | * |
1144 | * Returns true if an error was encountered and skb was freed. |
1145 | **/ |
static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb)
1147 | { |
1148 | /* if eth_skb_pad returns an error the skb was freed */ |
1149 | if (eth_skb_pad(skb)) |
1150 | return true; |
1151 | |
1152 | return false; |
1153 | } |
1154 | |
1155 | /** |
1156 | * iavf_reuse_rx_page - page flip buffer and store it back on the ring |
1157 | * @rx_ring: rx descriptor ring to store buffers on |
1158 | * @old_buff: donor buffer to have page reused |
1159 | * |
1160 | * Synchronizes page for reuse by the adapter |
1161 | **/ |
1162 | static void iavf_reuse_rx_page(struct iavf_ring *rx_ring, |
1163 | struct iavf_rx_buffer *old_buff) |
1164 | { |
1165 | struct iavf_rx_buffer *new_buff; |
1166 | u16 nta = rx_ring->next_to_alloc; |
1167 | |
1168 | new_buff = &rx_ring->rx_bi[nta]; |
1169 | |
1170 | /* update, and store next to alloc */ |
1171 | nta++; |
1172 | rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; |
1173 | |
1174 | /* transfer page from old buffer to new buffer */ |
1175 | new_buff->dma = old_buff->dma; |
1176 | new_buff->page = old_buff->page; |
1177 | new_buff->page_offset = old_buff->page_offset; |
1178 | new_buff->pagecnt_bias = old_buff->pagecnt_bias; |
1179 | } |
1180 | |
1181 | /** |
1182 | * iavf_can_reuse_rx_page - Determine if this page can be reused by |
1183 | * the adapter for another receive |
1184 | * |
1185 | * @rx_buffer: buffer containing the page |
1186 | * |
1187 | * If page is reusable, rx_buffer->page_offset is adjusted to point to |
1188 | * an unused region in the page. |
1189 | * |
1190 | * For small pages, @truesize will be a constant value, half the size |
1191 | * of the memory at page. We'll attempt to alternate between high and |
1192 | * low halves of the page, with one half ready for use by the hardware |
1193 | * and the other half being consumed by the stack. We use the page |
1194 | * ref count to determine whether the stack has finished consuming the |
1195 | * portion of this page that was passed up with a previous packet. If |
1196 | * the page ref count is >1, we'll assume the "other" half page is |
1197 | * still busy, and this page cannot be reused. |
1198 | * |
1199 | * For larger pages, @truesize will be the actual space used by the |
1200 | * received packet (adjusted upward to an even multiple of the cache |
1201 | * line size). This will advance through the page by the amount |
1202 | * actually consumed by the received packets while there is still |
1203 | * space for a buffer. Each region of larger pages will be used at |
1204 | * most once, after which the page will not be reused. |
1205 | * |
1206 | * In either case, if the page is reusable its refcount is increased. |
1207 | **/ |
1208 | static bool iavf_can_reuse_rx_page(struct iavf_rx_buffer *rx_buffer) |
1209 | { |
1210 | unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; |
1211 | struct page *page = rx_buffer->page; |
1212 | |
1213 | /* Is any reuse possible? */ |
1214 | if (!dev_page_is_reusable(page)) |
1215 | return false; |
1216 | |
1217 | #if (PAGE_SIZE < 8192) |
1218 | /* if we are only owner of page we can reuse it */ |
1219 | if (unlikely((page_count(page) - pagecnt_bias) > 1)) |
1220 | return false; |
1221 | #else |
1222 | #define IAVF_LAST_OFFSET \ |
1223 | (SKB_WITH_OVERHEAD(PAGE_SIZE) - IAVF_RXBUFFER_2048) |
1224 | if (rx_buffer->page_offset > IAVF_LAST_OFFSET) |
1225 | return false; |
1226 | #endif |
1227 | |
1228 | /* If we have drained the page fragment pool we need to update |
1229 | * the pagecnt_bias and page count so that we fully restock the |
1230 | * number of references the driver holds. |
1231 | */ |
1232 | if (unlikely(!pagecnt_bias)) { |
1233 | page_ref_add(page, USHRT_MAX); |
1234 | rx_buffer->pagecnt_bias = USHRT_MAX; |
1235 | } |
1236 | |
1237 | return true; |
1238 | } |
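
/* Bias accounting sketch (illustrative): a freshly mapped page starts with
 * page_count(page) == 1 and pagecnt_bias == 1. Handing a buffer up the
 * stack decrements pagecnt_bias (see iavf_get_rx_buffer()), so the
 * difference tracks how many buffers carved from this page the stack
 * still owns:
 *
 *	after allocation:	page_count - pagecnt_bias == 0
 *	one half in flight:	page_count - pagecnt_bias == 1	(reusable)
 *	both halves in flight:	page_count - pagecnt_bias == 2	(not reusable)
 */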
1239 | |
1240 | /** |
1241 | * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff |
1242 | * @rx_ring: rx descriptor ring to transact packets on |
1243 | * @rx_buffer: buffer containing page to add |
1244 | * @skb: sk_buff to place the data into |
1245 | * @size: packet length from rx_desc |
1246 | * |
1247 | * This function will add the data contained in rx_buffer->page to the skb. |
1248 | * It will just attach the page as a frag to the skb. |
1249 | * |
1250 | * The function will then update the page offset. |
1251 | **/ |
1252 | static void iavf_add_rx_frag(struct iavf_ring *rx_ring, |
1253 | struct iavf_rx_buffer *rx_buffer, |
1254 | struct sk_buff *skb, |
1255 | unsigned int size) |
1256 | { |
1257 | #if (PAGE_SIZE < 8192) |
1258 | unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; |
1259 | #else |
1260 | unsigned int truesize = SKB_DATA_ALIGN(size + iavf_rx_offset(rx_ring)); |
1261 | #endif |
1262 | |
1263 | if (!size) |
1264 | return; |
1265 | |
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
			rx_buffer->page_offset, size, truesize);
1268 | |
1269 | /* page is being used so we must update the page offset */ |
1270 | #if (PAGE_SIZE < 8192) |
1271 | rx_buffer->page_offset ^= truesize; |
1272 | #else |
1273 | rx_buffer->page_offset += truesize; |
1274 | #endif |
1275 | } |
1276 | |
1277 | /** |
1278 | * iavf_get_rx_buffer - Fetch Rx buffer and synchronize data for use |
1279 | * @rx_ring: rx descriptor ring to transact packets on |
1280 | * @size: size of buffer to add to skb |
1281 | * |
1282 | * This function will pull an Rx buffer from the ring and synchronize it |
1283 | * for use by the CPU. |
1284 | */ |
1285 | static struct iavf_rx_buffer *iavf_get_rx_buffer(struct iavf_ring *rx_ring, |
1286 | const unsigned int size) |
1287 | { |
1288 | struct iavf_rx_buffer *rx_buffer; |
1289 | |
1290 | rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean]; |
	prefetchw(rx_buffer->page);
1292 | if (!size) |
1293 | return rx_buffer; |
1294 | |
1295 | /* we are reusing so sync this buffer for CPU use */ |
	dma_sync_single_range_for_cpu(rx_ring->dev,
				      rx_buffer->dma,
				      rx_buffer->page_offset,
				      size,
				      DMA_FROM_DEVICE);
1301 | |
1302 | /* We have pulled a buffer for use, so decrement pagecnt_bias */ |
1303 | rx_buffer->pagecnt_bias--; |
1304 | |
1305 | return rx_buffer; |
1306 | } |
1307 | |
1308 | /** |
1309 | * iavf_construct_skb - Allocate skb and populate it |
1310 | * @rx_ring: rx descriptor ring to transact packets on |
1311 | * @rx_buffer: rx buffer to pull data from |
1312 | * @size: size of buffer to add to skb |
1313 | * |
1314 | * This function allocates an skb. It then populates it with the page |
1315 | * data from the current receive descriptor, taking care to set up the |
1316 | * skb correctly. |
1317 | */ |
1318 | static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring, |
1319 | struct iavf_rx_buffer *rx_buffer, |
1320 | unsigned int size) |
1321 | { |
1322 | void *va; |
1323 | #if (PAGE_SIZE < 8192) |
1324 | unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; |
1325 | #else |
1326 | unsigned int truesize = SKB_DATA_ALIGN(size); |
1327 | #endif |
1328 | unsigned int headlen; |
1329 | struct sk_buff *skb; |
1330 | |
1331 | if (!rx_buffer) |
1332 | return NULL; |
1333 | /* prefetch first cache line of first page */ |
1334 | va = page_address(rx_buffer->page) + rx_buffer->page_offset; |
	net_prefetch(va);
1336 | |
1337 | /* allocate a skb to store the frags */ |
	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
			       IAVF_RX_HDR_SIZE,
			       GFP_ATOMIC | __GFP_NOWARN);
1341 | if (unlikely(!skb)) |
1342 | return NULL; |
1343 | |
1344 | /* Determine available headroom for copy */ |
1345 | headlen = size; |
1346 | if (headlen > IAVF_RX_HDR_SIZE) |
		headlen = eth_get_headlen(skb->dev, va, IAVF_RX_HDR_SIZE);
1348 | |
1349 | /* align pull length to size of long to optimize memcpy performance */ |
1350 | memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long))); |
1351 | |
1352 | /* update all of the pointers */ |
1353 | size -= headlen; |
1354 | if (size) { |
		skb_add_rx_frag(skb, 0, rx_buffer->page,
				rx_buffer->page_offset + headlen,
				size, truesize);
1358 | |
1359 | /* buffer is used by skb, update page_offset */ |
1360 | #if (PAGE_SIZE < 8192) |
1361 | rx_buffer->page_offset ^= truesize; |
1362 | #else |
1363 | rx_buffer->page_offset += truesize; |
1364 | #endif |
1365 | } else { |
1366 | /* buffer is unused, reset bias back to rx_buffer */ |
1367 | rx_buffer->pagecnt_bias++; |
1368 | } |
1369 | |
1370 | return skb; |
1371 | } |
1372 | |
1373 | /** |
1374 | * iavf_build_skb - Build skb around an existing buffer |
1375 | * @rx_ring: Rx descriptor ring to transact packets on |
1376 | * @rx_buffer: Rx buffer to pull data from |
1377 | * @size: size of buffer to add to skb |
1378 | * |
1379 | * This function builds an skb around an existing Rx buffer, taking care |
1380 | * to set up the skb correctly and avoid any memcpy overhead. |
1381 | */ |
1382 | static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring, |
1383 | struct iavf_rx_buffer *rx_buffer, |
1384 | unsigned int size) |
1385 | { |
1386 | void *va; |
1387 | #if (PAGE_SIZE < 8192) |
1388 | unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; |
1389 | #else |
1390 | unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + |
1391 | SKB_DATA_ALIGN(IAVF_SKB_PAD + size); |
1392 | #endif |
1393 | struct sk_buff *skb; |
1394 | |
1395 | if (!rx_buffer || !size) |
1396 | return NULL; |
1397 | /* prefetch first cache line of first page */ |
1398 | va = page_address(rx_buffer->page) + rx_buffer->page_offset; |
	net_prefetch(va);
1400 | |
1401 | /* build an skb around the page buffer */ |
	skb = napi_build_skb(va - IAVF_SKB_PAD, truesize);
1403 | if (unlikely(!skb)) |
1404 | return NULL; |
1405 | |
1406 | /* update pointers within the skb to store the data */ |
1407 | skb_reserve(skb, IAVF_SKB_PAD); |
	__skb_put(skb, size);
1409 | |
1410 | /* buffer is used by skb, update page_offset */ |
1411 | #if (PAGE_SIZE < 8192) |
1412 | rx_buffer->page_offset ^= truesize; |
1413 | #else |
1414 | rx_buffer->page_offset += truesize; |
1415 | #endif |
1416 | |
1417 | return skb; |
1418 | } |
1419 | |
1420 | /** |
1421 | * iavf_put_rx_buffer - Clean up used buffer and either recycle or free |
1422 | * @rx_ring: rx descriptor ring to transact packets on |
1423 | * @rx_buffer: rx buffer to pull data from |
1424 | * |
1425 | * This function will clean up the contents of the rx_buffer. It will |
1426 | * either recycle the buffer or unmap it and free the associated resources. |
1427 | */ |
1428 | static void iavf_put_rx_buffer(struct iavf_ring *rx_ring, |
1429 | struct iavf_rx_buffer *rx_buffer) |
1430 | { |
1431 | if (!rx_buffer) |
1432 | return; |
1433 | |
1434 | if (iavf_can_reuse_rx_page(rx_buffer)) { |
1435 | /* hand second half of page back to the ring */ |
		iavf_reuse_rx_page(rx_ring, rx_buffer);
1437 | rx_ring->rx_stats.page_reuse_count++; |
1438 | } else { |
1439 | /* we are not reusing the buffer so unmap it */ |
		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
				     iavf_rx_pg_size(rx_ring),
				     DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR);
		__page_frag_cache_drain(rx_buffer->page,
					rx_buffer->pagecnt_bias);
1445 | } |
1446 | |
1447 | /* clear contents of buffer_info */ |
1448 | rx_buffer->page = NULL; |
1449 | } |
1450 | |
1451 | /** |
1452 | * iavf_is_non_eop - process handling of non-EOP buffers |
1453 | * @rx_ring: Rx ring being processed |
1454 | * @rx_desc: Rx descriptor for current buffer |
1455 | * @skb: Current socket buffer containing buffer in progress |
1456 | * |
1457 | * This function updates next to clean. If the buffer is an EOP buffer |
1458 | * this function exits returning false, otherwise it will place the |
1459 | * sk_buff in the next buffer to be chained and return true indicating |
1460 | * that this is in fact a non-EOP buffer. |
1461 | **/ |
1462 | static bool iavf_is_non_eop(struct iavf_ring *rx_ring, |
1463 | union iavf_rx_desc *rx_desc, |
1464 | struct sk_buff *skb) |
1465 | { |
1466 | u32 ntc = rx_ring->next_to_clean + 1; |
1467 | |
1468 | /* fetch, update, and store next to clean */ |
1469 | ntc = (ntc < rx_ring->count) ? ntc : 0; |
1470 | rx_ring->next_to_clean = ntc; |
1471 | |
1472 | prefetch(IAVF_RX_DESC(rx_ring, ntc)); |
1473 | |
1474 | /* if we are the last buffer then there is nothing else to do */ |
1475 | #define IAVF_RXD_EOF BIT(IAVF_RX_DESC_STATUS_EOF_SHIFT) |
1476 | if (likely(iavf_test_staterr(rx_desc, IAVF_RXD_EOF))) |
1477 | return false; |
1478 | |
1479 | rx_ring->rx_stats.non_eop_descs++; |
1480 | |
1481 | return true; |
1482 | } |
1483 | |
1484 | /** |
1485 | * iavf_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf |
1486 | * @rx_ring: rx descriptor ring to transact packets on |
1487 | * @budget: Total limit on number of packets to process |
1488 | * |
1489 | * This function provides a "bounce buffer" approach to Rx interrupt |
1490 | * processing. The advantage to this is that on systems that have |
1491 | * expensive overhead for IOMMU access this provides a means of avoiding |
1492 | * it by maintaining the mapping of the page to the system. |
1493 | * |
1494 | * Returns amount of work completed |
1495 | **/ |
1496 | static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) |
1497 | { |
1498 | unsigned int total_rx_bytes = 0, total_rx_packets = 0; |
1499 | struct sk_buff *skb = rx_ring->skb; |
1500 | u16 cleaned_count = IAVF_DESC_UNUSED(rx_ring); |
1501 | bool failure = false; |
1502 | |
1503 | while (likely(total_rx_packets < (unsigned int)budget)) { |
1504 | struct iavf_rx_buffer *rx_buffer; |
1505 | union iavf_rx_desc *rx_desc; |
1506 | unsigned int size; |
1507 | u16 vlan_tag = 0; |
1508 | u8 rx_ptype; |
1509 | u64 qword; |
1510 | |
1511 | /* return some buffers to hardware, one at a time is too slow */ |
1512 | if (cleaned_count >= IAVF_RX_BUFFER_WRITE) { |
1513 | failure = failure || |
1514 | iavf_alloc_rx_buffers(rx_ring, cleaned_count); |
1515 | cleaned_count = 0; |
1516 | } |
1517 | |
1518 | rx_desc = IAVF_RX_DESC(rx_ring, rx_ring->next_to_clean); |
1519 | |
1520 | /* status_error_len will always be zero for unused descriptors |
1521 | * because it's cleared in cleanup, and overlaps with hdr_addr |
1522 | * which is always zero because packet split isn't used, if the |
1523 | * hardware wrote DD then the length will be non-zero |
1524 | */ |
1525 | qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); |
1526 | |
1527 | /* This memory barrier is needed to keep us from reading |
1528 | * any other fields out of the rx_desc until we have |
1529 | * verified the descriptor has been written back. |
1530 | */ |
1531 | dma_rmb(); |
1532 | #define IAVF_RXD_DD BIT(IAVF_RX_DESC_STATUS_DD_SHIFT) |
1533 | if (!iavf_test_staterr(rx_desc, IAVF_RXD_DD)) |
1534 | break; |
1535 | |
1536 | size = (qword & IAVF_RXD_QW1_LENGTH_PBUF_MASK) >> |
1537 | IAVF_RXD_QW1_LENGTH_PBUF_SHIFT; |
1538 | |
1539 | iavf_trace(clean_rx_irq, rx_ring, rx_desc, skb); |
1540 | rx_buffer = iavf_get_rx_buffer(rx_ring, size); |
1541 | |
1542 | /* retrieve a buffer from the ring */ |
1543 | if (skb) |
1544 | iavf_add_rx_frag(rx_ring, rx_buffer, skb, size); |
		else if (ring_uses_build_skb(rx_ring))
1546 | skb = iavf_build_skb(rx_ring, rx_buffer, size); |
1547 | else |
1548 | skb = iavf_construct_skb(rx_ring, rx_buffer, size); |
1549 | |
1550 | /* exit if we failed to retrieve a buffer */ |
1551 | if (!skb) { |
1552 | rx_ring->rx_stats.alloc_buff_failed++; |
1553 | if (rx_buffer && size) |
1554 | rx_buffer->pagecnt_bias++; |
1555 | break; |
1556 | } |
1557 | |
1558 | iavf_put_rx_buffer(rx_ring, rx_buffer); |
1559 | cleaned_count++; |
1560 | |
1561 | if (iavf_is_non_eop(rx_ring, rx_desc, skb)) |
1562 | continue; |
1563 | |
1564 | /* ERR_MASK will only have valid bits if EOP set, and |
1565 | * what we are doing here is actually checking |
1566 | * IAVF_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in |
1567 | * the error field |
1568 | */ |
1569 | if (unlikely(iavf_test_staterr(rx_desc, BIT(IAVF_RXD_QW1_ERROR_SHIFT)))) { |
1570 | dev_kfree_skb_any(skb); |
1571 | skb = NULL; |
1572 | continue; |
1573 | } |
1574 | |
1575 | if (iavf_cleanup_headers(rx_ring, skb)) { |
1576 | skb = NULL; |
1577 | continue; |
1578 | } |
1579 | |
1580 | /* probably a little skewed due to removing CRC */ |
1581 | total_rx_bytes += skb->len; |
1582 | |
1583 | qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); |
1584 | rx_ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> |
1585 | IAVF_RXD_QW1_PTYPE_SHIFT; |
1586 | |
1587 | /* populate checksum, VLAN, and protocol */ |
1588 | iavf_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); |
1589 | |
1590 | if (qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT) && |
1591 | rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1) |
1592 | vlan_tag = le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1); |
1593 | if (rx_desc->wb.qword2.ext_status & |
1594 | cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) && |
1595 | rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2) |
1596 | vlan_tag = le16_to_cpu(rx_desc->wb.qword2.l2tag2_2); |
1597 | |
1598 | iavf_trace(clean_rx_irq_rx, rx_ring, rx_desc, skb); |
1599 | iavf_receive_skb(rx_ring, skb, vlan_tag); |
1600 | skb = NULL; |
1601 | |
1602 | /* update budget accounting */ |
1603 | total_rx_packets++; |
1604 | } |
1605 | |
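	/* stash any partially-built skb so the next poll can finish it */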
1606 | rx_ring->skb = skb; |
1607 | |
	u64_stats_update_begin(&rx_ring->syncp);
	rx_ring->stats.packets += total_rx_packets;
	rx_ring->stats.bytes += total_rx_bytes;
	u64_stats_update_end(&rx_ring->syncp);
1612 | rx_ring->q_vector->rx.total_packets += total_rx_packets; |
1613 | rx_ring->q_vector->rx.total_bytes += total_rx_bytes; |
1614 | |
1615 | /* guarantee a trip back through this routine if there was a failure */ |
1616 | return failure ? budget : (int)total_rx_packets; |
1617 | } |
1618 | |
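/**
 * iavf_buildreg_itr - build a value for writing to the dynamic ITR register
 * @type: ITR index to program (IAVF_RX_ITR, IAVF_TX_ITR or IAVF_ITR_NONE)
 * @itr: interval in usecs
 *
 * Returns the register value used to re-enable the interrupt with the
 * requested ITR setting.
 **/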
1619 | static inline u32 iavf_buildreg_itr(const int type, u16 itr) |
1620 | { |
1621 | u32 val; |
1622 | |
1623 | /* We don't bother with setting the CLEARPBA bit as the data sheet |
1624 | * points out doing so is "meaningless since it was already |
1625 | * auto-cleared". The auto-clearing happens when the interrupt is |
1626 | * asserted. |
1627 | * |
	 * Hardware errata 28 also indicates that writing to a
	 * xxINT_DYN_CTLx CSR with INTENA_MSK (bit 31) set to 0 will clear
	 * an event in the PBA anyway so we need to rely on the automask
	 * to hold pending events for us until the interrupt is re-enabled.
1632 | * |
1633 | * The itr value is reported in microseconds, and the register |
1634 | * value is recorded in 2 microsecond units. For this reason we |
1635 | * only need to shift by the interval shift - 1 instead of the |
1636 | * full value. |
1637 | */ |
1638 | itr &= IAVF_ITR_MASK; |
1639 | |
1640 | val = IAVF_VFINT_DYN_CTLN1_INTENA_MASK | |
1641 | (type << IAVF_VFINT_DYN_CTLN1_ITR_INDX_SHIFT) | |
1642 | (itr << (IAVF_VFINT_DYN_CTLN1_INTERVAL_SHIFT - 1)); |
1643 | |
1644 | return val; |
1645 | } |
1646 | |
1647 | /* a small macro to shorten up some long lines */ |
1648 | #define INTREG IAVF_VFINT_DYN_CTLN1 |
1649 | |
1650 | /* The act of updating the ITR will cause it to immediately trigger. In order |
1651 | * to prevent this from throwing off adaptive update statistics we defer the |
1652 | * update so that it can only happen so often. So after either Tx or Rx are |
1653 | * updated we make the adaptive scheme wait until either the ITR completely |
1654 | * expires via the next_update expiration or we have been through at least |
1655 | * 3 interrupts. |
1656 | */ |
1657 | #define ITR_COUNTDOWN_START 3 |
1658 | |
1659 | /** |
1660 | * iavf_update_enable_itr - Update itr and re-enable MSIX interrupt |
1661 | * @vsi: the VSI we care about |
1662 | * @q_vector: q_vector for which itr is being updated and interrupt enabled |
1663 | * |
1664 | **/ |
1665 | static void iavf_update_enable_itr(struct iavf_vsi *vsi, |
1666 | struct iavf_q_vector *q_vector) |
1667 | { |
1668 | struct iavf_hw *hw = &vsi->back->hw; |
1669 | u32 intval; |
1670 | |
1671 | /* These will do nothing if dynamic updates are not enabled */ |
	iavf_update_itr(q_vector, &q_vector->tx);
	iavf_update_itr(q_vector, &q_vector->rx);
1674 | |
1675 | /* This block of logic allows us to get away with only updating |
1676 | * one ITR value with each interrupt. The idea is to perform a |
1677 | * pseudo-lazy update with the following criteria. |
1678 | * |
1679 | * 1. Rx is given higher priority than Tx if both are in same state |
1680 | * 2. If we must reduce an ITR that is given highest priority. |
1681 | * 3. We then give priority to increasing ITR based on amount. |
1682 | */ |
1683 | if (q_vector->rx.target_itr < q_vector->rx.current_itr) { |
1684 | /* Rx ITR needs to be reduced, this is highest priority */ |
		intval = iavf_buildreg_itr(IAVF_RX_ITR,
					   q_vector->rx.target_itr);
1687 | q_vector->rx.current_itr = q_vector->rx.target_itr; |
1688 | q_vector->itr_countdown = ITR_COUNTDOWN_START; |
1689 | } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) || |
1690 | ((q_vector->rx.target_itr - q_vector->rx.current_itr) < |
1691 | (q_vector->tx.target_itr - q_vector->tx.current_itr))) { |
1692 | /* Tx ITR needs to be reduced, this is second priority |
1693 | * Tx ITR needs to be increased more than Rx, fourth priority |
1694 | */ |
		intval = iavf_buildreg_itr(IAVF_TX_ITR,
					   q_vector->tx.target_itr);
1697 | q_vector->tx.current_itr = q_vector->tx.target_itr; |
1698 | q_vector->itr_countdown = ITR_COUNTDOWN_START; |
1699 | } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) { |
1700 | /* Rx ITR needs to be increased, third priority */ |
		intval = iavf_buildreg_itr(IAVF_RX_ITR,
					   q_vector->rx.target_itr);
1703 | q_vector->rx.current_itr = q_vector->rx.target_itr; |
1704 | q_vector->itr_countdown = ITR_COUNTDOWN_START; |
1705 | } else { |
1706 | /* No ITR update, lowest priority */ |
		intval = iavf_buildreg_itr(IAVF_ITR_NONE, 0);
1708 | if (q_vector->itr_countdown) |
1709 | q_vector->itr_countdown--; |
1710 | } |
1711 | |
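	/* don't touch the interrupt register if the VSI is down */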
1712 | if (!test_bit(__IAVF_VSI_DOWN, vsi->state)) |
1713 | wr32(hw, INTREG(q_vector->reg_idx), intval); |
1714 | } |
1715 | |
1716 | /** |
1717 | * iavf_napi_poll - NAPI polling Rx/Tx cleanup routine |
1718 | * @napi: napi struct with our devices info in it |
1719 | * @budget: amount of work driver is allowed to do this pass, in packets |
1720 | * |
1721 | * This function will clean all queues associated with a q_vector. |
1722 | * |
1723 | * Returns the amount of work done |
1724 | **/ |
1725 | int iavf_napi_poll(struct napi_struct *napi, int budget) |
1726 | { |
1727 | struct iavf_q_vector *q_vector = |
1728 | container_of(napi, struct iavf_q_vector, napi); |
1729 | struct iavf_vsi *vsi = q_vector->vsi; |
1730 | struct iavf_ring *ring; |
1731 | bool clean_complete = true; |
1732 | bool arm_wb = false; |
1733 | int budget_per_ring; |
1734 | int work_done = 0; |
1735 | |
1736 | if (test_bit(__IAVF_VSI_DOWN, vsi->state)) { |
		napi_complete(napi);
1738 | return 0; |
1739 | } |
1740 | |
1741 | /* Since the actual Tx work is minimal, we can give the Tx a larger |
1742 | * budget and be more aggressive about cleaning up the Tx descriptors. |
1743 | */ |
1744 | iavf_for_each_ring(ring, q_vector->tx) { |
		if (!iavf_clean_tx_irq(vsi, ring, budget)) {
1746 | clean_complete = false; |
1747 | continue; |
1748 | } |
1749 | arm_wb |= ring->arm_wb; |
1750 | ring->arm_wb = false; |
1751 | } |
1752 | |
1753 | /* Handle case where we are called by netpoll with a budget of 0 */ |
1754 | if (budget <= 0) |
1755 | goto tx_only; |
1756 | |
1757 | /* We attempt to distribute budget to each Rx queue fairly, but don't |
1758 | * allow the budget to go below 1 because that would exit polling early. |
1759 | */ |
1760 | budget_per_ring = max(budget/q_vector->num_ringpairs, 1); |
1761 | |
1762 | iavf_for_each_ring(ring, q_vector->rx) { |
		int cleaned = iavf_clean_rx_irq(ring, budget_per_ring);
1764 | |
1765 | work_done += cleaned; |
1766 | /* if we clean as many as budgeted, we must not be done */ |
1767 | if (cleaned >= budget_per_ring) |
1768 | clean_complete = false; |
1769 | } |
1770 | |
1771 | /* If work not completed, return budget and polling will return */ |
1772 | if (!clean_complete) { |
1773 | int cpu_id = smp_processor_id(); |
1774 | |
1775 | /* It is possible that the interrupt affinity has changed but, |
1776 | * if the cpu is pegged at 100%, polling will never exit while |
1777 | * traffic continues and the interrupt will be stuck on this |
1778 | * cpu. We check to make sure affinity is correct before we |
1779 | * continue to poll, otherwise we must stop polling so the |
1780 | * interrupt can move to the correct cpu. |
1781 | */ |
		if (!cpumask_test_cpu(cpu_id, &q_vector->affinity_mask)) {
1783 | /* Tell napi that we are done polling */ |
			napi_complete_done(napi, work_done);
1785 | |
1786 | /* Force an interrupt */ |
1787 | iavf_force_wb(vsi, q_vector); |
1788 | |
1789 | /* Return budget-1 so that polling stops */ |
1790 | return budget - 1; |
1791 | } |
1792 | tx_only: |
1793 | if (arm_wb) { |
1794 | q_vector->tx.ring[0].tx_stats.tx_force_wb++; |
1795 | iavf_enable_wb_on_itr(vsi, q_vector); |
1796 | } |
1797 | return budget; |
1798 | } |
1799 | |
1800 | if (vsi->back->flags & IAVF_TXR_FLAGS_WB_ON_ITR) |
1801 | q_vector->arm_wb_state = false; |
1802 | |
1803 | /* Exit the polling mode, but don't re-enable interrupts if stack might |
1804 | * poll us due to busy-polling |
1805 | */ |
1806 | if (likely(napi_complete_done(napi, work_done))) |
1807 | iavf_update_enable_itr(vsi, q_vector); |
1808 | |
1809 | return min_t(int, work_done, budget - 1); |
1810 | } |
1811 | |
1812 | /** |
1813 | * iavf_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW |
1814 | * @skb: send buffer |
1815 | * @tx_ring: ring to send buffer on |
1816 | * @flags: the tx flags to be set |
1817 | * |
 * Checks the skb and sets up the generic transmit flags related to VLAN
 * tagging for the HW, such as VLAN, DCB, etc., and stores them in @flags.
1823 | **/ |
1824 | static void iavf_tx_prepare_vlan_flags(struct sk_buff *skb, |
1825 | struct iavf_ring *tx_ring, u32 *flags) |
1826 | { |
	u32 tx_flags = 0;

1830 | /* stack will only request hardware VLAN insertion offload for protocols |
1831 | * that the driver supports and has enabled |
1832 | */ |
1833 | if (!skb_vlan_tag_present(skb)) |
1834 | return; |
1835 | |
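	/* stash the VLAN tag in the Tx flags for use when building descriptors */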
1836 | tx_flags |= skb_vlan_tag_get(skb) << IAVF_TX_FLAGS_VLAN_SHIFT; |
1837 | if (tx_ring->flags & IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2) { |
1838 | tx_flags |= IAVF_TX_FLAGS_HW_OUTER_SINGLE_VLAN; |
1839 | } else if (tx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1) { |
1840 | tx_flags |= IAVF_TX_FLAGS_HW_VLAN; |
1841 | } else { |
		dev_dbg(tx_ring->dev, "Unsupported Tx VLAN tag location requested\n");
1843 | return; |
1844 | } |
1845 | |
1846 | *flags = tx_flags; |
1847 | } |
1848 | |
1849 | /** |
1850 | * iavf_tso - set up the tso context descriptor |
1851 | * @first: pointer to first Tx buffer for xmit |
1852 | * @hdr_len: ptr to the size of the packet header |
1853 | * @cd_type_cmd_tso_mss: Quad Word 1 |
1854 | * |
 * Returns 0 if no TSO can happen, 1 if TSO is set up, or a negative error code
1856 | **/ |
1857 | static int iavf_tso(struct iavf_tx_buffer *first, u8 *hdr_len, |
1858 | u64 *cd_type_cmd_tso_mss) |
1859 | { |
1860 | struct sk_buff *skb = first->skb; |
1861 | u64 cd_cmd, cd_tso_len, cd_mss; |
1862 | union { |
1863 | struct iphdr *v4; |
1864 | struct ipv6hdr *v6; |
1865 | unsigned char *hdr; |
1866 | } ip; |
1867 | union { |
1868 | struct tcphdr *tcp; |
1869 | struct udphdr *udp; |
1870 | unsigned char *hdr; |
1871 | } l4; |
1872 | u32 paylen, l4_offset; |
1873 | u16 gso_segs, gso_size; |
1874 | int err; |
1875 | |
1876 | if (skb->ip_summed != CHECKSUM_PARTIAL) |
1877 | return 0; |
1878 | |
1879 | if (!skb_is_gso(skb)) |
1880 | return 0; |
1881 | |
	err = skb_cow_head(skb, 0);
1883 | if (err < 0) |
1884 | return err; |
1885 | |
1886 | ip.hdr = skb_network_header(skb); |
1887 | l4.hdr = skb_transport_header(skb); |
1888 | |
1889 | /* initialize outer IP header fields */ |
1890 | if (ip.v4->version == 4) { |
1891 | ip.v4->tot_len = 0; |
1892 | ip.v4->check = 0; |
1893 | } else { |
1894 | ip.v6->payload_len = 0; |
1895 | } |
1896 | |
1897 | if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | |
1898 | SKB_GSO_GRE_CSUM | |
1899 | SKB_GSO_IPXIP4 | |
1900 | SKB_GSO_IPXIP6 | |
1901 | SKB_GSO_UDP_TUNNEL | |
1902 | SKB_GSO_UDP_TUNNEL_CSUM)) { |
1903 | if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) && |
1904 | (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) { |
1905 | l4.udp->len = 0; |
1906 | |
1907 | /* determine offset of outer transport header */ |
1908 | l4_offset = l4.hdr - skb->data; |
1909 | |
1910 | /* remove payload length from outer checksum */ |
1911 | paylen = skb->len - l4_offset; |
			csum_replace_by_diff(&l4.udp->check,
					     (__force __wsum)htonl(paylen));
1914 | } |
1915 | |
1916 | /* reset pointers to inner headers */ |
1917 | ip.hdr = skb_inner_network_header(skb); |
1918 | l4.hdr = skb_inner_transport_header(skb); |
1919 | |
1920 | /* initialize inner IP header fields */ |
1921 | if (ip.v4->version == 4) { |
1922 | ip.v4->tot_len = 0; |
1923 | ip.v4->check = 0; |
1924 | } else { |
1925 | ip.v6->payload_len = 0; |
1926 | } |
1927 | } |
1928 | |
1929 | /* determine offset of inner transport header */ |
1930 | l4_offset = l4.hdr - skb->data; |
1931 | /* remove payload length from inner checksum */ |
1932 | paylen = skb->len - l4_offset; |
1933 | |
1934 | if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { |
		csum_replace_by_diff(&l4.udp->check,
				     (__force __wsum)htonl(paylen));
1937 | /* compute length of UDP segmentation header */ |
1938 | *hdr_len = (u8)sizeof(l4.udp) + l4_offset; |
1939 | } else { |
		csum_replace_by_diff(&l4.tcp->check,
				     (__force __wsum)htonl(paylen));
1942 | /* compute length of TCP segmentation header */ |
1943 | *hdr_len = (u8)((l4.tcp->doff * 4) + l4_offset); |
1944 | } |
1945 | |
1946 | /* pull values out of skb_shinfo */ |
1947 | gso_size = skb_shinfo(skb)->gso_size; |
1948 | gso_segs = skb_shinfo(skb)->gso_segs; |
1949 | |
1950 | /* update GSO size and bytecount with header size */ |
1951 | first->gso_segs = gso_segs; |
1952 | first->bytecount += (first->gso_segs - 1) * *hdr_len; |
1953 | |
1954 | /* find the field values */ |
1955 | cd_cmd = IAVF_TX_CTX_DESC_TSO; |
1956 | cd_tso_len = skb->len - *hdr_len; |
1957 | cd_mss = gso_size; |
1958 | *cd_type_cmd_tso_mss |= (cd_cmd << IAVF_TXD_CTX_QW1_CMD_SHIFT) | |
1959 | (cd_tso_len << IAVF_TXD_CTX_QW1_TSO_LEN_SHIFT) | |
1960 | (cd_mss << IAVF_TXD_CTX_QW1_MSS_SHIFT); |
1961 | return 1; |
1962 | } |
1963 | |
1964 | /** |
1965 | * iavf_tx_enable_csum - Enable Tx checksum offloads |
1966 | * @skb: send buffer |
1967 | * @tx_flags: pointer to Tx flags currently set |
1968 | * @td_cmd: Tx descriptor command bits to set |
1969 | * @td_offset: Tx descriptor header offsets to set |
1970 | * @tx_ring: Tx descriptor ring |
1971 | * @cd_tunneling: ptr to context desc bits |
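 *
 * Returns 1 if checksum offload was programmed, 0 if no offload was needed
 * or software checksumming was used, or a negative value if the frame
 * should be dropped.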
1972 | **/ |
1973 | static int iavf_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags, |
1974 | u32 *td_cmd, u32 *td_offset, |
1975 | struct iavf_ring *tx_ring, |
1976 | u32 *cd_tunneling) |
1977 | { |
1978 | union { |
1979 | struct iphdr *v4; |
1980 | struct ipv6hdr *v6; |
1981 | unsigned char *hdr; |
1982 | } ip; |
1983 | union { |
1984 | struct tcphdr *tcp; |
1985 | struct udphdr *udp; |
1986 | unsigned char *hdr; |
1987 | } l4; |
1988 | unsigned char *exthdr; |
1989 | u32 offset, cmd = 0; |
1990 | __be16 frag_off; |
1991 | u8 l4_proto = 0; |
1992 | |
1993 | if (skb->ip_summed != CHECKSUM_PARTIAL) |
1994 | return 0; |
1995 | |
1996 | ip.hdr = skb_network_header(skb); |
1997 | l4.hdr = skb_transport_header(skb); |
1998 | |
1999 | /* compute outer L2 header size */ |
2000 | offset = ((ip.hdr - skb->data) / 2) << IAVF_TX_DESC_LENGTH_MACLEN_SHIFT; |
2001 | |
2002 | if (skb->encapsulation) { |
2003 | u32 tunnel = 0; |
2004 | /* define outer network header type */ |
2005 | if (*tx_flags & IAVF_TX_FLAGS_IPV4) { |
2006 | tunnel |= (*tx_flags & IAVF_TX_FLAGS_TSO) ? |
2007 | IAVF_TX_CTX_EXT_IP_IPV4 : |
2008 | IAVF_TX_CTX_EXT_IP_IPV4_NO_CSUM; |
2009 | |
2010 | l4_proto = ip.v4->protocol; |
2011 | } else if (*tx_flags & IAVF_TX_FLAGS_IPV6) { |
2012 | tunnel |= IAVF_TX_CTX_EXT_IP_IPV6; |
2013 | |
2014 | exthdr = ip.hdr + sizeof(*ip.v6); |
2015 | l4_proto = ip.v6->nexthdr; |
2016 | if (l4.hdr != exthdr) |
				ipv6_skip_exthdr(skb, exthdr - skb->data,
						 &l4_proto, &frag_off);
2019 | } |
2020 | |
2021 | /* define outer transport */ |
2022 | switch (l4_proto) { |
2023 | case IPPROTO_UDP: |
2024 | tunnel |= IAVF_TXD_CTX_UDP_TUNNELING; |
2025 | *tx_flags |= IAVF_TX_FLAGS_VXLAN_TUNNEL; |
2026 | break; |
2027 | case IPPROTO_GRE: |
2028 | tunnel |= IAVF_TXD_CTX_GRE_TUNNELING; |
2029 | *tx_flags |= IAVF_TX_FLAGS_VXLAN_TUNNEL; |
2030 | break; |
2031 | case IPPROTO_IPIP: |
2032 | case IPPROTO_IPV6: |
2033 | *tx_flags |= IAVF_TX_FLAGS_VXLAN_TUNNEL; |
2034 | l4.hdr = skb_inner_network_header(skb); |
2035 | break; |
2036 | default: |
2037 | if (*tx_flags & IAVF_TX_FLAGS_TSO) |
2038 | return -1; |
2039 | |
2040 | skb_checksum_help(skb); |
2041 | return 0; |
2042 | } |
2043 | |
2044 | /* compute outer L3 header size */ |
2045 | tunnel |= ((l4.hdr - ip.hdr) / 4) << |
2046 | IAVF_TXD_CTX_QW0_EXT_IPLEN_SHIFT; |
2047 | |
2048 | /* switch IP header pointer from outer to inner header */ |
2049 | ip.hdr = skb_inner_network_header(skb); |
2050 | |
2051 | /* compute tunnel header size */ |
2052 | tunnel |= ((ip.hdr - l4.hdr) / 2) << |
2053 | IAVF_TXD_CTX_QW0_NATLEN_SHIFT; |
2054 | |
2055 | /* indicate if we need to offload outer UDP header */ |
2056 | if ((*tx_flags & IAVF_TX_FLAGS_TSO) && |
2057 | !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) && |
2058 | (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) |
2059 | tunnel |= IAVF_TXD_CTX_QW0_L4T_CS_MASK; |
2060 | |
2061 | /* record tunnel offload values */ |
2062 | *cd_tunneling |= tunnel; |
2063 | |
2064 | /* switch L4 header pointer from outer to inner */ |
2065 | l4.hdr = skb_inner_transport_header(skb); |
2066 | l4_proto = 0; |
2067 | |
2068 | /* reset type as we transition from outer to inner headers */ |
2069 | *tx_flags &= ~(IAVF_TX_FLAGS_IPV4 | IAVF_TX_FLAGS_IPV6); |
2070 | if (ip.v4->version == 4) |
2071 | *tx_flags |= IAVF_TX_FLAGS_IPV4; |
2072 | if (ip.v6->version == 6) |
2073 | *tx_flags |= IAVF_TX_FLAGS_IPV6; |
2074 | } |
2075 | |
2076 | /* Enable IP checksum offloads */ |
2077 | if (*tx_flags & IAVF_TX_FLAGS_IPV4) { |
2078 | l4_proto = ip.v4->protocol; |
2079 | /* the stack computes the IP header already, the only time we |
2080 | * need the hardware to recompute it is in the case of TSO. |
2081 | */ |
2082 | cmd |= (*tx_flags & IAVF_TX_FLAGS_TSO) ? |
2083 | IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM : |
2084 | IAVF_TX_DESC_CMD_IIPT_IPV4; |
2085 | } else if (*tx_flags & IAVF_TX_FLAGS_IPV6) { |
2086 | cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6; |
2087 | |
2088 | exthdr = ip.hdr + sizeof(*ip.v6); |
2089 | l4_proto = ip.v6->nexthdr; |
2090 | if (l4.hdr != exthdr) |
			ipv6_skip_exthdr(skb, exthdr - skb->data,
					 &l4_proto, &frag_off);
2093 | } |
2094 | |
2095 | /* compute inner L3 header size */ |
2096 | offset |= ((l4.hdr - ip.hdr) / 4) << IAVF_TX_DESC_LENGTH_IPLEN_SHIFT; |
2097 | |
2098 | /* Enable L4 checksum offloads */ |
2099 | switch (l4_proto) { |
2100 | case IPPROTO_TCP: |
2101 | /* enable checksum offloads */ |
2102 | cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP; |
2103 | offset |= l4.tcp->doff << IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; |
2104 | break; |
2105 | case IPPROTO_SCTP: |
2106 | /* enable SCTP checksum offload */ |
2107 | cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP; |
2108 | offset |= (sizeof(struct sctphdr) >> 2) << |
2109 | IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; |
2110 | break; |
2111 | case IPPROTO_UDP: |
2112 | /* enable UDP checksum offload */ |
2113 | cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP; |
2114 | offset |= (sizeof(struct udphdr) >> 2) << |
2115 | IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; |
2116 | break; |
2117 | default: |
2118 | if (*tx_flags & IAVF_TX_FLAGS_TSO) |
2119 | return -1; |
2120 | skb_checksum_help(skb); |
2121 | return 0; |
2122 | } |
2123 | |
2124 | *td_cmd |= cmd; |
2125 | *td_offset |= offset; |
2126 | |
2127 | return 1; |
2128 | } |
2129 | |
2130 | /** |
2131 | * iavf_create_tx_ctx - Build the Tx context descriptor |
2132 | * @tx_ring: ring to create the descriptor on |
2133 | * @cd_type_cmd_tso_mss: Quad Word 1 |
2134 | * @cd_tunneling: Quad Word 0 - bits 0-31 |
2135 | * @cd_l2tag2: Quad Word 0 - bits 32-63 |
2136 | **/ |
2137 | static void iavf_create_tx_ctx(struct iavf_ring *tx_ring, |
2138 | const u64 cd_type_cmd_tso_mss, |
2139 | const u32 cd_tunneling, const u32 cd_l2tag2) |
2140 | { |
2141 | struct iavf_tx_context_desc *context_desc; |
2142 | int i = tx_ring->next_to_use; |
2143 | |
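	/* nothing to program, so don't burn a descriptor on an empty context */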
2144 | if ((cd_type_cmd_tso_mss == IAVF_TX_DESC_DTYPE_CONTEXT) && |
2145 | !cd_tunneling && !cd_l2tag2) |
2146 | return; |
2147 | |
2148 | /* grab the next descriptor */ |
2149 | context_desc = IAVF_TX_CTXTDESC(tx_ring, i); |
2150 | |
2151 | i++; |
2152 | tx_ring->next_to_use = (i < tx_ring->count) ? i : 0; |
2153 | |
2154 | /* cpu_to_le32 and assign to struct fields */ |
2155 | context_desc->tunneling_params = cpu_to_le32(cd_tunneling); |
2156 | context_desc->l2tag2 = cpu_to_le16(cd_l2tag2); |
2157 | context_desc->rsvd = cpu_to_le16(0); |
2158 | context_desc->type_cmd_tso_mss = cpu_to_le64(cd_type_cmd_tso_mss); |
2159 | } |
2160 | |
2161 | /** |
2162 | * __iavf_chk_linearize - Check if there are more than 8 buffers per packet |
2163 | * @skb: send buffer |
2164 | * |
2165 | * Note: Our HW can't DMA more than 8 buffers to build a packet on the wire |
2166 | * and so we need to figure out the cases where we need to linearize the skb. |
2167 | * |
2168 | * For TSO we need to count the TSO header and segment payload separately. |
2169 | * As such we need to check cases where we have 7 fragments or more as we |
2170 | * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for |
2171 | * the segment payload in the first descriptor, and another 7 for the |
2172 | * fragments. |
2173 | **/ |
2174 | bool __iavf_chk_linearize(struct sk_buff *skb) |
2175 | { |
2176 | const skb_frag_t *frag, *stale; |
2177 | int nr_frags, sum; |
2178 | |
2179 | /* no need to check if number of frags is less than 7 */ |
2180 | nr_frags = skb_shinfo(skb)->nr_frags; |
2181 | if (nr_frags < (IAVF_MAX_BUFFER_TXD - 1)) |
2182 | return false; |
2183 | |
2184 | /* We need to walk through the list and validate that each group |
2185 | * of 6 fragments totals at least gso_size. |
2186 | */ |
2187 | nr_frags -= IAVF_MAX_BUFFER_TXD - 2; |
2188 | frag = &skb_shinfo(skb)->frags[0]; |
2189 | |
2190 | /* Initialize size to the negative value of gso_size minus 1. We |
	 * use this as the worst case scenario in which the frag ahead
2192 | * of us only provides one byte which is why we are limited to 6 |
2193 | * descriptors for a single transmit as the header and previous |
2194 | * fragment are already consuming 2 descriptors. |
2195 | */ |
2196 | sum = 1 - skb_shinfo(skb)->gso_size; |
2197 | |
2198 | /* Add size of frags 0 through 4 to create our initial sum */ |
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
2204 | |
2205 | /* Walk through fragments adding latest fragment, testing it, and |
2206 | * then removing stale fragments from the sum. |
2207 | */ |
2208 | for (stale = &skb_shinfo(skb)->frags[0];; stale++) { |
		int stale_size = skb_frag_size(stale);

		sum += skb_frag_size(frag++);
2212 | |
2213 | /* The stale fragment may present us with a smaller |
2214 | * descriptor than the actual fragment size. To account |
2215 | * for that we need to remove all the data on the front and |
2216 | * figure out what the remainder would be in the last |
2217 | * descriptor associated with the fragment. |
2218 | */ |
2219 | if (stale_size > IAVF_MAX_DATA_PER_TXD) { |
			int align_pad = -(skb_frag_off(stale)) &
					(IAVF_MAX_READ_REQ_SIZE - 1);
2222 | |
2223 | sum -= align_pad; |
2224 | stale_size -= align_pad; |
2225 | |
2226 | do { |
2227 | sum -= IAVF_MAX_DATA_PER_TXD_ALIGNED; |
2228 | stale_size -= IAVF_MAX_DATA_PER_TXD_ALIGNED; |
2229 | } while (stale_size > IAVF_MAX_DATA_PER_TXD); |
2230 | } |
2231 | |
2232 | /* if sum is negative we failed to make sufficient progress */ |
2233 | if (sum < 0) |
2234 | return true; |
2235 | |
2236 | if (!nr_frags--) |
2237 | break; |
2238 | |
2239 | sum -= stale_size; |
2240 | } |
2241 | |
2242 | return false; |
2243 | } |
2244 | |
2245 | /** |
2246 | * __iavf_maybe_stop_tx - 2nd level check for tx stop conditions |
2247 | * @tx_ring: the ring to be checked |
2248 | * @size: the size buffer we want to assure is available |
2249 | * |
2250 | * Returns -EBUSY if a stop is needed, else 0 |
2251 | **/ |
2252 | int __iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size) |
2253 | { |
	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
2255 | /* Memory barrier before checking head and tail */ |
2256 | smp_mb(); |
2257 | |
2258 | /* Check again in a case another CPU has just made room available. */ |
2259 | if (likely(IAVF_DESC_UNUSED(tx_ring) < size)) |
2260 | return -EBUSY; |
2261 | |
2262 | /* A reprieve! - use start_queue because it doesn't call schedule */ |
	netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
2264 | ++tx_ring->tx_stats.restart_queue; |
2265 | return 0; |
2266 | } |
2267 | |
2268 | /** |
2269 | * iavf_tx_map - Build the Tx descriptor |
2270 | * @tx_ring: ring to send buffer on |
2271 | * @skb: send buffer |
2272 | * @first: first buffer info buffer to use |
2273 | * @tx_flags: collected send information |
2274 | * @hdr_len: size of the packet header |
2275 | * @td_cmd: the command field in the descriptor |
2276 | * @td_offset: offset for checksum or crc |
2277 | **/ |
2278 | static void iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb, |
2279 | struct iavf_tx_buffer *first, u32 tx_flags, |
2280 | const u8 hdr_len, u32 td_cmd, u32 td_offset) |
2281 | { |
2282 | unsigned int data_len = skb->data_len; |
2283 | unsigned int size = skb_headlen(skb); |
2284 | skb_frag_t *frag; |
2285 | struct iavf_tx_buffer *tx_bi; |
2286 | struct iavf_tx_desc *tx_desc; |
2287 | u16 i = tx_ring->next_to_use; |
2288 | u32 td_tag = 0; |
2289 | dma_addr_t dma; |
2290 | |
2291 | if (tx_flags & IAVF_TX_FLAGS_HW_VLAN) { |
2292 | td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1; |
2293 | td_tag = (tx_flags & IAVF_TX_FLAGS_VLAN_MASK) >> |
2294 | IAVF_TX_FLAGS_VLAN_SHIFT; |
2295 | } |
2296 | |
2297 | first->tx_flags = tx_flags; |
2298 | |
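	/* map the linear portion of the skb for DMA */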
2299 | dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE); |
2300 | |
2301 | tx_desc = IAVF_TX_DESC(tx_ring, i); |
2302 | tx_bi = first; |
2303 | |
2304 | for (frag = &skb_shinfo(skb)->frags[0];; frag++) { |
2305 | unsigned int max_data = IAVF_MAX_DATA_PER_TXD_ALIGNED; |
2306 | |
		if (dma_mapping_error(tx_ring->dev, dma))
2308 | goto dma_error; |
2309 | |
2310 | /* record length, and DMA address */ |
2311 | dma_unmap_len_set(tx_bi, len, size); |
2312 | dma_unmap_addr_set(tx_bi, dma, dma); |
2313 | |
2314 | /* align size to end of page */ |
2315 | max_data += -dma & (IAVF_MAX_READ_REQ_SIZE - 1); |
2316 | tx_desc->buffer_addr = cpu_to_le64(dma); |
2317 | |
2318 | while (unlikely(size > IAVF_MAX_DATA_PER_TXD)) { |
2319 | tx_desc->cmd_type_offset_bsz = |
				build_ctob(td_cmd, td_offset,
					   max_data, td_tag);
2322 | |
2323 | tx_desc++; |
2324 | i++; |
2325 | |
2326 | if (i == tx_ring->count) { |
2327 | tx_desc = IAVF_TX_DESC(tx_ring, 0); |
2328 | i = 0; |
2329 | } |
2330 | |
2331 | dma += max_data; |
2332 | size -= max_data; |
2333 | |
2334 | max_data = IAVF_MAX_DATA_PER_TXD_ALIGNED; |
2335 | tx_desc->buffer_addr = cpu_to_le64(dma); |
2336 | } |
2337 | |
2338 | if (likely(!data_len)) |
2339 | break; |
2340 | |
2341 | tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset, |
2342 | size, td_tag); |
2343 | |
2344 | tx_desc++; |
2345 | i++; |
2346 | |
2347 | if (i == tx_ring->count) { |
2348 | tx_desc = IAVF_TX_DESC(tx_ring, 0); |
2349 | i = 0; |
2350 | } |
2351 | |
2352 | size = skb_frag_size(frag); |
2353 | data_len -= size; |
2354 | |
		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
				       DMA_TO_DEVICE);
2357 | |
2358 | tx_bi = &tx_ring->tx_bi[i]; |
2359 | } |
2360 | |
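	/* record the bytes handed to hardware for BQL accounting */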
	netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);
2362 | |
2363 | i++; |
2364 | if (i == tx_ring->count) |
2365 | i = 0; |
2366 | |
2367 | tx_ring->next_to_use = i; |
2368 | |
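	/* stop the queue now if a full worst-case frame might not fit next time */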
2369 | iavf_maybe_stop_tx(tx_ring, DESC_NEEDED); |
2370 | |
2371 | /* write last descriptor with RS and EOP bits */ |
2372 | td_cmd |= IAVF_TXD_CMD; |
2373 | tx_desc->cmd_type_offset_bsz = |
2374 | build_ctob(td_cmd, td_offset, size, td_tag); |
2375 | |
2376 | skb_tx_timestamp(skb); |
2377 | |
2378 | /* Force memory writes to complete before letting h/w know there |
2379 | * are new descriptors to fetch. |
2380 | * |
2381 | * We also use this memory barrier to make certain all of the |
2382 | * status bits have been updated before next_to_watch is written. |
2383 | */ |
2384 | wmb(); |
2385 | |
2386 | /* set next_to_watch value indicating a packet is present */ |
2387 | first->next_to_watch = tx_desc; |
2388 | |
2389 | /* notify HW of packet */ |
	if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) {
		writel(i, tx_ring->tail);
2392 | } |
2393 | |
2394 | return; |
2395 | |
2396 | dma_error: |
	dev_info(tx_ring->dev, "TX DMA map failed\n");
2398 | |
2399 | /* clear dma mappings for failed tx_bi map */ |
2400 | for (;;) { |
2401 | tx_bi = &tx_ring->tx_bi[i]; |
		iavf_unmap_and_free_tx_resource(tx_ring, tx_bi);
2403 | if (tx_bi == first) |
2404 | break; |
2405 | if (i == 0) |
2406 | i = tx_ring->count; |
2407 | i--; |
2408 | } |
2409 | |
2410 | tx_ring->next_to_use = i; |
2411 | } |
2412 | |
2413 | /** |
2414 | * iavf_xmit_frame_ring - Sends buffer on Tx ring |
2415 | * @skb: send buffer |
2416 | * @tx_ring: ring to send buffer on |
2417 | * |
2418 | * Returns NETDEV_TX_OK if sent, else an error code |
2419 | **/ |
2420 | static netdev_tx_t iavf_xmit_frame_ring(struct sk_buff *skb, |
2421 | struct iavf_ring *tx_ring) |
2422 | { |
2423 | u64 cd_type_cmd_tso_mss = IAVF_TX_DESC_DTYPE_CONTEXT; |
2424 | u32 cd_tunneling = 0, cd_l2tag2 = 0; |
2425 | struct iavf_tx_buffer *first; |
2426 | u32 td_offset = 0; |
2427 | u32 tx_flags = 0; |
2428 | __be16 protocol; |
2429 | u32 td_cmd = 0; |
2430 | u8 hdr_len = 0; |
2431 | int tso, count; |
2432 | |
2433 | /* prefetch the data, we'll need it later */ |
2434 | prefetch(skb->data); |
2435 | |
2436 | iavf_trace(xmit_frame_ring, skb, tx_ring); |
2437 | |
2438 | count = iavf_xmit_descriptor_count(skb); |
2439 | if (iavf_chk_linearize(skb, count)) { |
2440 | if (__skb_linearize(skb)) { |
2441 | dev_kfree_skb_any(skb); |
2442 | return NETDEV_TX_OK; |
2443 | } |
		count = iavf_txd_use_count(skb->len);
2445 | tx_ring->tx_stats.tx_linearize++; |
2446 | } |
2447 | |
2448 | /* need: 1 descriptor per page * PAGE_SIZE/IAVF_MAX_DATA_PER_TXD, |
2449 | * + 1 desc for skb_head_len/IAVF_MAX_DATA_PER_TXD, |
2450 | * + 4 desc gap to avoid the cache line where head is, |
2451 | * + 1 desc for context descriptor, |
2452 | * otherwise try next time |
2453 | */ |
	if (iavf_maybe_stop_tx(tx_ring, count + 4 + 1)) {
2455 | tx_ring->tx_stats.tx_busy++; |
2456 | return NETDEV_TX_BUSY; |
2457 | } |
2458 | |
2459 | /* record the location of the first descriptor for this packet */ |
2460 | first = &tx_ring->tx_bi[tx_ring->next_to_use]; |
2461 | first->skb = skb; |
2462 | first->bytecount = skb->len; |
2463 | first->gso_segs = 1; |
2464 | |
2465 | /* prepare the xmit flags */ |
	iavf_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags);
2467 | if (tx_flags & IAVF_TX_FLAGS_HW_OUTER_SINGLE_VLAN) { |
2468 | cd_type_cmd_tso_mss |= IAVF_TX_CTX_DESC_IL2TAG2 << |
2469 | IAVF_TXD_CTX_QW1_CMD_SHIFT; |
2470 | cd_l2tag2 = (tx_flags & IAVF_TX_FLAGS_VLAN_MASK) >> |
2471 | IAVF_TX_FLAGS_VLAN_SHIFT; |
2472 | } |
2473 | |
2474 | /* obtain protocol of skb */ |
2475 | protocol = vlan_get_protocol(skb); |
2476 | |
2477 | /* setup IPv4/IPv6 offloads */ |
2478 | if (protocol == htons(ETH_P_IP)) |
2479 | tx_flags |= IAVF_TX_FLAGS_IPV4; |
2480 | else if (protocol == htons(ETH_P_IPV6)) |
2481 | tx_flags |= IAVF_TX_FLAGS_IPV6; |
2482 | |
	tso = iavf_tso(first, &hdr_len, &cd_type_cmd_tso_mss);
2484 | |
2485 | if (tso < 0) |
2486 | goto out_drop; |
2487 | else if (tso) |
2488 | tx_flags |= IAVF_TX_FLAGS_TSO; |
2489 | |
2490 | /* Always offload the checksum, since it's in the data descriptor */ |
	tso = iavf_tx_enable_csum(skb, &tx_flags, &td_cmd, &td_offset,
				  tx_ring, &cd_tunneling);
2493 | if (tso < 0) |
2494 | goto out_drop; |
2495 | |
2496 | /* always enable CRC insertion offload */ |
2497 | td_cmd |= IAVF_TX_DESC_CMD_ICRC; |
2498 | |
2499 | iavf_create_tx_ctx(tx_ring, cd_type_cmd_tso_mss, |
2500 | cd_tunneling, cd_l2tag2); |
2501 | |
2502 | iavf_tx_map(tx_ring, skb, first, tx_flags, hdr_len, |
2503 | td_cmd, td_offset); |
2504 | |
2505 | return NETDEV_TX_OK; |
2506 | |
2507 | out_drop: |
2508 | iavf_trace(xmit_frame_ring_drop, first->skb, tx_ring); |
	dev_kfree_skb_any(first->skb);
2510 | first->skb = NULL; |
2511 | return NETDEV_TX_OK; |
2512 | } |
2513 | |
2514 | /** |
2515 | * iavf_xmit_frame - Selects the correct VSI and Tx queue to send buffer |
2516 | * @skb: send buffer |
2517 | * @netdev: network interface device structure |
2518 | * |
2519 | * Returns NETDEV_TX_OK if sent, else an error code |
2520 | **/ |
2521 | netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev) |
2522 | { |
	struct iavf_adapter *adapter = netdev_priv(netdev);
2524 | struct iavf_ring *tx_ring = &adapter->tx_rings[skb->queue_mapping]; |
2525 | |
2526 | /* hardware can't handle really short frames, hardware padding works |
2527 | * beyond this point |
2528 | */ |
2529 | if (unlikely(skb->len < IAVF_MIN_TX_LEN)) { |
2530 | if (skb_pad(skb, IAVF_MIN_TX_LEN - skb->len)) |
2531 | return NETDEV_TX_OK; |
2532 | skb->len = IAVF_MIN_TX_LEN; |
2533 | skb_set_tail_pointer(skb, IAVF_MIN_TX_LEN); |
2534 | } |
2535 | |
2536 | return iavf_xmit_frame_ring(skb, tx_ring); |
2537 | } |
2538 | |