/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool/helpers.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#include "page_pool_priv.h"

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	(LONG_MAX >> 1)

#ifdef CONFIG_PAGE_POOL_STATS
static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);

/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)						\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_inc(s->__stat);					\
	} while (0)

#define recycle_stat_add(pool, __stat, val)					\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_add(s->__stat, val);					\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool: pool from which page was allocated
 * @stats: struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * A pointer to a caller-allocated struct page_pool_stats structure
 * is passed to this API which is filled in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(const struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
{
	struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif
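
/* Example (illustrative only; my_get_strings(), my_get_sset_count(),
 * my_get_ethtool_stats(), "my_netdev_priv" and "pp" are hypothetical driver
 * names, not part of this file): a sketch of how the three ethtool helpers
 * above can be wired together to expose per-pool stats.
 *
 *	static void my_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static int my_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		return sset == ETH_SS_STATS ?
 *		       page_pool_ethtool_stats_get_count() : -EOPNOTSUPP;
 *	}
 *
 *	static void my_get_ethtool_stats(struct net_device *dev,
 *					 struct ethtool_stats *es, u64 *data)
 *	{
 *		struct my_netdev_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats stats = { };
 *
 *		if (page_pool_get_stats(priv->pp, &stats))
 *			data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 */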

static bool page_pool_producer_lock(struct page_pool *pool)
	__acquires(&pool->ring.producer_lock)
{
	bool in_softirq = in_softirq();

	if (in_softirq)
		spin_lock(&pool->ring.producer_lock);
	else
		spin_lock_bh(&pool->ring.producer_lock);

	return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
				      bool in_softirq)
	__releases(&pool->ring.producer_lock)
{
	if (in_softirq)
		spin_unlock(&pool->ring.producer_lock);
	else
		spin_unlock_bh(&pool->ring.producer_lock);
}

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params,
			  int cpuid)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, &params->fast, sizeof(pool->p));
	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));

	pool->cpuid = cpuid;

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * sending, which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	pool->has_init_callback = !!pool->slow.init_callback;

#ifdef CONFIG_PAGE_POOL_STATS
	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) {
		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
		if (!pool->recycle_stats)
			return -ENOMEM;
	} else {
		/* For system page pool instance we use a singular stats object
		 * instead of allocating a separate percpu variable for each
		 * (also percpu) page pool instance.
		 */
		pool->recycle_stats = &pp_system_recycle_stats;
	}
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
			free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

static void page_pool_uninit(struct page_pool *pool)
{
	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
		free_percpu(pool->recycle_stats);
#endif
}

/**
 * page_pool_create_percpu() - create a page pool for a given cpu.
 * @params: parameters, see struct page_pool_params
 * @cpuid: cpu identifier
 */
struct page_pool *
page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params, cpuid);
	if (err < 0)
		goto err_free;

	err = page_pool_list(pool);
	if (err)
		goto err_uninit;

	return pool;

err_uninit:
	page_pool_uninit(pool);
err_free:
	pr_warn("%s() gave up with errno %d\n", __func__, err);
	kfree(pool);
	return ERR_PTR(err);
}
EXPORT_SYMBOL(page_pool_create_percpu);

/**
 * page_pool_create() - create a page pool
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);
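
/* Example (illustrative only; the sizes, the offset and "priv->dev" are
 * hypothetical driver choices, not defined here): a typical RX setup where
 * the pool handles both DMA mapping and DMA-sync-for-device.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= 1024,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= priv->dev,	// device performing the DMA
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.offset		= XDP_PACKET_HEADROOM,
 *		.max_len	= PAGE_SIZE - XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */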

static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees the CPU, and thus the NUMA node, is stable. This
	 * assumes the CPU refilling the driver RX-ring will also run RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fallthrough to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. 32bit cpu with 64bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it leaves
	 * the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
						  DMA_ATTR_WEAK_ORDERING);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	if (page_pool_set_dma_addr(page, dma))
		goto unmap_failed;

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;

unmap_failed:
	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	return false;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;

	/* Ensuring all pages have been split into one fragment initially:
	 * page_pool_set_pp_info() is only called once for every page when it
	 * is allocated from the page allocator and page_pool_fragment_page()
	 * is dirtying the same cache line as the page->pp_magic above, so
	 * the overhead is negligible.
	 */
	page_pool_fragment_page(page, 1);
	if (pool->has_init_callback)
		pool->slow.init_callback(page, pool->slow.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count is
	 * zero and the pages have not (possibly) been DMA mapped yet.
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* A page just allocated should/must have refcnt 1. */
	return page;
}

/* Use page_pool to replace alloc_pages() API calls, but provide a
 * synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
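
/* Example (illustrative only; "rxq", "desc" and their members are
 * hypothetical driver structures): refilling an RX descriptor from the pool
 * inside NAPI/softirq context, which is the caller context this fast path
 * assumes.
 *
 *	struct page *page = page_pool_alloc_pages(rxq->page_pool, GFP_ATOMIC);
 *
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	desc->addr = page_pool_get_dma_addr(page) + rxq->rx_offset;
 */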

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))
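
/* Worked example of the wrap-around behaviour: with hold_cnt == 2 and
 * release_cnt == 0xfffffffe, _distance(2, 0xfffffffe) evaluates to (s32)4,
 * i.e. 4 pages still in flight, even though hold_cnt has numerically wrapped
 * past release_cnt.
 */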

s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	if (strict) {
		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
		     inflight);
	} else {
		inflight = max(0, inflight);
	}

	return inflight;
}

static __always_inline
void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		return;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	page_pool_set_dma_addr(page, 0);
}

/* Disconnects a page (from a page_pool). API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	int count;

	__page_pool_release_page_dma(pool, page);

	page_pool_clear_pp_info(page);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is softirq */
	if (in_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	recycle_stat_inc(pool, cached);
	return true;
}

static bool __page_pool_page_can_be_recycled(const struct page *page)
{
	return page_ref_count(page) == 1 && !page_is_pfmemalloc(page);
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	lockdep_assert_no_hardirq();

	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 *
	 * A page is NOT reusable when it was allocated while the system
	 * was under memory pressure (page_is_pfmemalloc).
	 */
	if (likely(__page_pool_page_can_be_recycled(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt
	 * case, the DMA is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	recycle_stat_inc(pool, released_refcnt);
	page_pool_return_page(pool, page);

	return NULL;
}

void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
				unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Cache full, fallback to free pages */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_unrefed_page);
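
/* Example (illustrative only; "rxq" is a hypothetical driver structure): a
 * driver dropping a frame inside its NAPI poll loop and recycling the page
 * directly into the alloc-side cache.
 *
 *	page_pool_put_full_page(rxq->page_pool, page, true);
 *
 * page_pool_put_full_page() is a wrapper from <net/page_pool/helpers.h>
 * that, for the last page reference, lands in page_pool_put_unrefed_page()
 * above with dma_sync_size == -1, i.e. "sync up to pool->p.max_len".
 */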

/**
 * page_pool_put_page_bulk() - release references on multiple pages
 * @pool: pool from which pages were allocated
 * @data: array holding page pointers
 * @count: number of pages in @data
 *
 * Tries to refill a number of pages into the ptr_ring cache holding the
 * ptr_ring producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 * will release leftover pages to the page allocator.
 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 * completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the data array after running
 * page_pool_put_page_bulk(), as this function overwrites it.
 */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;
	bool in_softirq;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		/* It is not the last user for the page frag case */
		if (!page_pool_is_last_ref(page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	in_softirq = page_pool_producer_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_producer_unlock(pool, in_softirq);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
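
/* Example (illustrative only; the TX-completion loop is hypothetical and
 * "xdpf" stands for each completed frame): drivers normally reach
 * page_pool_put_page_bulk() through the XDP frame-bulk helpers rather than
 * calling it directly.
 *
 *	struct xdp_frame_bulk bq;
 *
 *	xdp_frame_bulk_init(&bq);
 *	// for each completed XDP_REDIRECT descriptor:
 *	xdp_return_frame_bulk(xdpf, &bq);
 *	// once the completion loop is done:
 *	xdp_flush_frame_bulk(&bq);
 */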

static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_unref_page(page, drain_count)))
		return NULL;

	if (__page_pool_page_can_be_recycled(page)) {
		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page, -1);

		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_unref_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);
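
/* Example (illustrative only; the 256-byte size and "rxq"/"buf" naming are
 * hypothetical): a driver packing several small receive buffers into one
 * pool page.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(rxq->page_pool, &offset, 256, GFP_ATOMIC);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	buf->dma = page_pool_get_dma_addr(page) + offset;
 *
 * Outstanding fragments are tracked via frag_users against BIAS_MAX; the
 * page only goes back to the pool (or the page allocator) once every
 * fragment has been released, e.g. via page_pool_put_full_page().
 */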

static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void __page_pool_destroy(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	page_pool_unlist(pool);
	page_pool_uninit(pool);
	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool, true);
	if (!inflight)
		__page_pool_destroy(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	void *netdev;
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning for page pools the user can't see */
	netdev = READ_ONCE(pool->slow.netdev);
	if (time_after_eq(jiffies, pool->defer_warn) &&
	    (!netdev || netdev == NET_PTR_POISON)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
			__func__, pool->user.id, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

static void page_pool_disable_direct_recycling(struct page_pool *pool)
{
	/* Disable direct recycling based on pool->cpuid.
	 * Paired with READ_ONCE() in napi_pp_put_page().
	 */
	WRITE_ONCE(pool->cpuid, -1);

	if (!pool->p.napi)
		return;

	/* To avoid races with recycling and additional barriers make sure
	 * pool and NAPI are unlinked when NAPI is disabled.
	 */
	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
		READ_ONCE(pool->p.napi->list_owner) != -1);

	WRITE_ONCE(pool->p.napi, NULL);
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_disable_direct_recycling(pool);
	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	page_pool_detached(pool);
	pool->defer_start = jiffies;
	pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
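
/* Example (illustrative only; "priv" and my_free_rx_buffers() are
 * hypothetical): typical teardown order once RX has been stopped, so no
 * allocation can race with the destroy. Pages still in flight keep the pool
 * alive until they are returned; page_pool_release_retry() above handles
 * that deferred case.
 *
 *	my_free_rx_buffers(priv);	// returns held pages, e.g. via page_pool_put_full_page()
 *	page_pool_destroy(priv->page_pool);
 *	priv->page_pool = NULL;
 */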

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);