/* SPDX-License-Identifier: GPL-2.0 */
/* XDP user-space ring structure
 * Copyright(c) 2018 Intel Corporation.
 */

#ifndef _LINUX_XSK_QUEUE_H
#define _LINUX_XSK_QUEUE_H

#include <linux/types.h>
#include <linux/if_xdp.h>
#include <net/xdp_sock.h>
#include <net/xsk_buff_pool.h>

#include "xsk.h"

struct xdp_ring {
	u32 producer ____cacheline_aligned_in_smp;
	/* Hinder the adjacent cache prefetcher from prefetching the consumer
	 * pointer if the producer pointer is touched and vice versa.
	 */
	u32 pad1 ____cacheline_aligned_in_smp;
	u32 consumer ____cacheline_aligned_in_smp;
	u32 pad2 ____cacheline_aligned_in_smp;
	u32 flags;
	u32 pad3 ____cacheline_aligned_in_smp;
};

/* Used for the RX and TX queues for packets */
struct xdp_rxtx_ring {
	struct xdp_ring ptrs;
	struct xdp_desc desc[] ____cacheline_aligned_in_smp;
};

/* Used for the fill and completion queues for buffers */
struct xdp_umem_ring {
	struct xdp_ring ptrs;
	u64 desc[] ____cacheline_aligned_in_smp;
};

struct xsk_queue {
	u32 ring_mask;
	u32 nentries;
	u32 cached_prod;
	u32 cached_cons;
	struct xdp_ring *ring;
	u64 invalid_descs;
	u64 queue_empty_descs;
	size_t ring_vmalloc_size;
};
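
/* nentries is expected to be a power of two, so ring_mask == nentries - 1
 * and a slot index is derived by masking a free-running 32-bit counter.
 * A small worked example (illustrative values only):
 *
 *	nentries = 8, ring_mask = 7
 *	cached_cons = 9  ->  idx = 9 & 7 = 1
 *
 * The producer and consumer counters are never reset; the difference
 * cached_prod - cached_cons is the number of filled entries, and with
 * unsigned arithmetic this holds even across 32-bit wrap-around.
 */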

struct parsed_desc {
	u32 mb;
	u32 valid;
};

/* The shared state of each ring is a simple circular buffer, as
 * outlined in Documentation/core-api/circular-buffers.rst. For the Rx
 * and completion rings, the kernel is the producer and user space is
 * the consumer. For the Tx and fill rings, the kernel is the consumer
 * and user space is the producer.
 *
 * producer                            consumer
 *
 * if (LOAD ->consumer) {  (A)         LOAD.acq ->producer  (C)
 *         STORE $data                 LOAD $data
 *         STORE.rel ->producer  (B)   STORE.rel ->consumer  (D)
 * }
 *
 * (A) pairs with (D), and (B) pairs with (C).
 *
 * Starting with (B), it ensures that the data is not written after the
 * store to the producer pointer. If this barrier was missing, the
 * consumer could observe the producer pointer being set and thus load
 * the data before the producer has written the new data. The consumer
 * would in this case load the old data.
 *
 * (C) protects the consumer from speculatively loading the data before
 * the producer pointer actually has been read. If we did not have this
 * barrier, some architectures could load old data as speculative loads
 * are not discarded as the CPU does not know there is a dependency
 * between ->producer and data.
 *
 * (A) is a control dependency that separates the load of ->consumer
 * from the stores of $data. If ->consumer indicates that there is no
 * room in the buffer to store $data, we do not store it. The control
 * dependency orders the stores after the load, so no explicit barrier
 * is needed.
 *
 * (D) ensures that the load of the data is observed to happen before
 * the store of the consumer pointer. If we did not have this memory
 * barrier, the producer could observe the consumer pointer being set
 * and overwrite the data with a new value before the consumer got the
 * chance to read the old value. The consumer would thus miss reading
 * the old entry and very likely read the new entry twice, once right
 * away and again after circling through the ring.
 */
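
/* A minimal sketch (illustrative pseudo-code, not compiled as part of
 * this header) of how the pairs above map onto the ordering primitives
 * used later in this file. 'prod', 'cons', 'mask' and 'nentries' stand
 * in for the xsk_queue and xdp_ring fields.
 *
 * Producer side:
 *
 *	cons = READ_ONCE(ring->consumer);
 *	if (prod - cons < nentries) {                            (A)
 *		ring->desc[prod & mask] = data;
 *		smp_store_release(&ring->producer, prod + 1);    (B)
 *	}
 *
 * Consumer side:
 *
 *	prod = smp_load_acquire(&ring->producer);                (C)
 *	if (prod != cons) {
 *		data = ring->desc[cons & mask];
 *		smp_store_release(&ring->consumer, cons + 1);    (D)
 *	}
 *
 * smp_store_release() implements (B) and (D), smp_load_acquire()
 * implements (C), and the conditional store on the producer side is the
 * control dependency (A).
 */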

/* The operations on the rings are the following:
 *
 * producer                           consumer
 *
 * RESERVE entries                    PEEK in the ring for entries
 * WRITE data into the ring           READ data from the ring
 * SUBMIT entries                     RELEASE entries
 *
 * The producer reserves one or more entries in the ring. It can then
 * fill in these entries and finally submit them so that they can be
 * seen and read by the consumer.
 *
 * The consumer peeks into the ring to see if the producer has written
 * any new entries. If so, the consumer can then read these entries
 * and when it is done reading them release them back to the producer
 * so that the producer can use these slots to fill in new entries.
 *
 * The function names below reflect these operations.
 */
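
/* A hedged usage sketch of these operation pairs, using the helpers
 * defined below in a hypothetical driver context ('rx_q', 'tx_q',
 * 'pool', 'addr' and 'len' are stand-ins, and error handling is
 * omitted). Illustrative only, not compiled.
 *
 * Kernel as producer, e.g. posting a received buffer on the Rx ring
 * (RESERVE + WRITE, then SUBMIT):
 *
 *	if (!xskq_prod_reserve_desc(rx_q, addr, len, 0))
 *		xskq_prod_submit(rx_q);
 *
 * Kernel as consumer, e.g. pulling a descriptor off the Tx ring
 * (PEEK + READ, then RELEASE):
 *
 *	struct xdp_desc desc;
 *
 *	if (xskq_cons_peek_desc(tx_q, &desc, pool)) {
 *		... transmit the buffer described by desc ...
 *		xskq_cons_release(tx_q);
 *	}
 *
 * Note that xskq_cons_release() only advances the local cached_cons;
 * the global consumer pointer is published later, see the comment above
 * xskq_cons_release() below.
 */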

/* Functions that read and validate content from consumer rings. */

static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr)
{
	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
	u32 idx = cached_cons & q->ring_mask;

	*addr = ring->desc[idx];
}

static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
{
	if (q->cached_cons != q->cached_prod) {
		__xskq_cons_read_addr_unchecked(q, q->cached_cons, addr);
		return true;
	}

	return false;
}

static inline bool xp_unused_options_set(u32 options)
{
	return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA);
}

static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
					    struct xdp_desc *desc)
{
	u64 addr = desc->addr - pool->tx_metadata_len;
	u64 len = desc->len + pool->tx_metadata_len;
	u64 offset = addr & (pool->chunk_size - 1);

	if (!desc->len)
		return false;

	if (offset + len > pool->chunk_size)
		return false;

	if (addr >= pool->addrs_cnt)
		return false;

	if (xp_unused_options_set(desc->options))
		return false;
	return true;
}

static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
					      struct xdp_desc *desc)
{
	u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len;
	u64 len = desc->len + pool->tx_metadata_len;

	if (!desc->len)
		return false;

	if (len > pool->chunk_size)
		return false;

	if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt ||
	    xp_desc_crosses_non_contig_pg(pool, addr, len))
		return false;

	if (xp_unused_options_set(desc->options))
		return false;
	return true;
}

static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
				    struct xdp_desc *desc)
{
	return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) :
	       xp_aligned_validate_desc(pool, desc);
}

static inline bool xskq_has_descs(struct xsk_queue *q)
{
	return q->cached_cons != q->cached_prod;
}

static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
					   struct xdp_desc *d,
					   struct xsk_buff_pool *pool)
{
	if (!xp_validate_desc(pool, d)) {
		q->invalid_descs++;
		return false;
	}
	return true;
}

static inline bool xskq_cons_read_desc(struct xsk_queue *q,
				       struct xdp_desc *desc,
				       struct xsk_buff_pool *pool)
{
	if (q->cached_cons != q->cached_prod) {
		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
		u32 idx = q->cached_cons & q->ring_mask;

		*desc = ring->desc[idx];
		return xskq_cons_is_valid_desc(q, desc, pool);
	}

	q->queue_empty_descs++;
	return false;
}

static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
{
	q->cached_cons += cnt;
}

static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool,
			      struct xdp_desc *desc, struct parsed_desc *parsed)
{
	parsed->valid = xskq_cons_is_valid_desc(q, desc, pool);
	parsed->mb = xp_mb_desc(desc);
}

static inline
u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
			      u32 max)
{
	u32 cached_cons = q->cached_cons, nb_entries = 0;
	struct xdp_desc *descs = pool->tx_descs;
	u32 total_descs = 0, nr_frags = 0;

	/* Track the first entry; if we stumble upon *any* invalid descriptor,
	 * rewind the current packet that consists of frags and stop the
	 * processing.
	 */
	while (cached_cons != q->cached_prod && nb_entries < max) {
		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
		u32 idx = cached_cons & q->ring_mask;
		struct parsed_desc parsed;

		descs[nb_entries] = ring->desc[idx];
		cached_cons++;
		parse_desc(q, pool, &descs[nb_entries], &parsed);
		if (unlikely(!parsed.valid))
			break;

		if (likely(!parsed.mb)) {
			total_descs += (nr_frags + 1);
			nr_frags = 0;
		} else {
			nr_frags++;
			if (nr_frags == pool->netdev->xdp_zc_max_segs) {
				nr_frags = 0;
				break;
			}
		}
		nb_entries++;
	}

	cached_cons -= nr_frags;
	/* Release valid plus any invalid entries */
	xskq_cons_release_n(q, cached_cons - q->cached_cons);
	return total_descs;
}

/* Functions for consumers */

static inline void __xskq_cons_release(struct xsk_queue *q)
{
	smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matches A */
}

static inline void __xskq_cons_peek(struct xsk_queue *q)
{
	/* Refresh the local pointer */
	q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */
}

static inline void xskq_cons_get_entries(struct xsk_queue *q)
{
	__xskq_cons_release(q);
	__xskq_cons_peek(q);
}

static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max)
{
	u32 entries = q->cached_prod - q->cached_cons;

	if (entries >= max)
		return max;

	__xskq_cons_peek(q);
	entries = q->cached_prod - q->cached_cons;

	return entries >= max ? max : entries;
}

static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
{
	return xskq_cons_nb_entries(q, cnt) >= cnt;
}

static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
{
	if (q->cached_prod == q->cached_cons)
		xskq_cons_get_entries(q);
	return xskq_cons_read_addr_unchecked(q, addr);
}

static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
				       struct xdp_desc *desc,
				       struct xsk_buff_pool *pool)
{
	if (q->cached_prod == q->cached_cons)
		xskq_cons_get_entries(q);
	return xskq_cons_read_desc(q, desc, pool);
}

/* To improve performance in the xskq_cons_release functions, only update
 * local state here. Reflect this to global state when we get new entries
 * from the ring in xskq_cons_get_entries() and whenever Rx or Tx
 * processing is completed in the NAPI loop.
 */
static inline void xskq_cons_release(struct xsk_queue *q)
{
	q->cached_cons++;
}

static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt)
{
	q->cached_cons -= cnt;
}

static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
{
	/* No barriers needed since data is not accessed */
	return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
}

/* Functions for producers */

static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
{
	u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);

	if (free_entries >= max)
		return max;

	/* Refresh the local tail pointer */
	q->cached_cons = READ_ONCE(q->ring->consumer);
	free_entries = q->nentries - (q->cached_prod - q->cached_cons);

	return free_entries >= max ? max : free_entries;
}

static inline bool xskq_prod_is_full(struct xsk_queue *q)
{
	return xskq_prod_nb_free(q, 1) ? false : true;
}

static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt)
{
	q->cached_prod -= cnt;
}

static inline int xskq_prod_reserve(struct xsk_queue *q)
{
	if (xskq_prod_is_full(q))
		return -ENOSPC;

	/* A, matches D */
	q->cached_prod++;
	return 0;
}

static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
{
	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;

	if (xskq_prod_is_full(q))
		return -ENOSPC;

	/* A, matches D */
	ring->desc[q->cached_prod++ & q->ring_mask] = addr;
	return 0;
}

static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
					      u32 nb_entries)
{
	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
	u32 i, cached_prod;

	/* A, matches D */
	cached_prod = q->cached_prod;
	for (i = 0; i < nb_entries; i++)
		ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr;
	q->cached_prod = cached_prod;
}

static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
					 u64 addr, u32 len, u32 flags)
{
	struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
	u32 idx;

	if (xskq_prod_is_full(q))
		return -ENOBUFS;

	/* A, matches D */
	idx = q->cached_prod++ & q->ring_mask;
	ring->desc[idx].addr = addr;
	ring->desc[idx].len = len;
	ring->desc[idx].options = flags;

	return 0;
}

static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx)
{
	smp_store_release(&q->ring->producer, idx); /* B, matches C */
}

static inline void xskq_prod_submit(struct xsk_queue *q)
{
	__xskq_prod_submit(q, q->cached_prod);
}

static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries)
{
	__xskq_prod_submit(q, q->ring->producer + nb_entries);
}

static inline bool xskq_prod_is_empty(struct xsk_queue *q)
{
	/* No barriers needed since data is not accessed */
	return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer);
}

/* For both producers and consumers */

static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
{
	return q ? q->invalid_descs : 0;
}

static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q)
{
	return q ? q->queue_empty_descs : 0;
}

struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
void xskq_destroy(struct xsk_queue *q_ops);

#endif /* _LINUX_XSK_QUEUE_H */