// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu-vm.c - vmalloc area based chunk allocation
 *
 * Copyright (C) 2010 SUSE Linux Products GmbH
 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
 *
 * Chunks are mapped into vmalloc areas and populated page by page.
 * This is the default chunk allocator.
 */
#include "internal.h"

static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
                                    unsigned int cpu, int page_idx)
{
        /* must not be used on pre-mapped chunk */
        WARN_ON(chunk->immutable);

        return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}

/**
 * pcpu_get_pages - get temp pages array
 *
 * Returns pointer to array of pointers to struct page which can be indexed
 * with pcpu_page_idx().  Note that there is only one array and accesses
 * should be serialized by pcpu_alloc_mutex.
 *
 * RETURNS:
 * Pointer to temp pages array on success.
 */
static struct page **pcpu_get_pages(void)
{
        static struct page **pages;
        size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);

        lockdep_assert_held(&pcpu_alloc_mutex);

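        /* allocated lazily on first use and kept for the lifetime of the system */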
        if (!pages)
                pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL);
        return pages;
}

/**
 * pcpu_free_pages - free pages which were allocated for @chunk
 * @chunk: chunk pages were allocated for
 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
 * @page_start: page index of the first page to be freed
 * @page_end: page index of the last page to be freed + 1
 *
 * Free pages [@page_start, @page_end) in @pages for all units.
 * The pages were allocated for @chunk.
 */
static void pcpu_free_pages(struct pcpu_chunk *chunk,
                            struct page **pages, int page_start, int page_end)
{
        unsigned int cpu;
        int i;

        for_each_possible_cpu(cpu) {
                for (i = page_start; i < page_end; i++) {
                        struct page *page = pages[pcpu_page_idx(cpu, i)];

                        if (page)
                                __free_page(page);
                }
        }
}

/**
 * pcpu_alloc_pages - allocates pages for @chunk
 * @chunk: target chunk
 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
 * @page_start: page index of the first page to be allocated
 * @page_end: page index of the last page to be allocated + 1
 * @gfp: allocation flags passed to the underlying allocator
 *
 * Allocate pages [@page_start,@page_end) into @pages for all units.
 * The allocation is for @chunk.  Percpu core doesn't care about the
 * content of @pages and will pass it verbatim to pcpu_map_pages().
 */
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
                            struct page **pages, int page_start, int page_end,
                            gfp_t gfp)
{
        unsigned int cpu, tcpu;
        int i;

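        /* the pages are mapped through the vmalloc area, so highmem pages are fine */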
        gfp |= __GFP_HIGHMEM;

        for_each_possible_cpu(cpu) {
                for (i = page_start; i < page_end; i++) {
                        struct page **pagep = &pages[pcpu_page_idx(cpu, i)];

                        *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
                        if (!*pagep)
                                goto err;
                }
        }
        return 0;

err:
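        /*
         * Free what was allocated so far for the failing cpu, then
         * everything allocated for the cpus handled before it.
         */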
        while (--i >= page_start)
                __free_page(pages[pcpu_page_idx(cpu, i)]);

        for_each_possible_cpu(tcpu) {
                if (tcpu == cpu)
                        break;
                for (i = page_start; i < page_end; i++)
                        __free_page(pages[pcpu_page_idx(tcpu, i)]);
        }
        return -ENOMEM;
}

/**
 * pcpu_pre_unmap_flush - flush cache prior to unmapping
 * @chunk: chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages in [@page_start,@page_end) of @chunk are about to be
 * unmapped.  Flush cache.  As each flush can be very expensive,
 * flush the whole region at once rather than doing it for each cpu.
 * This may be overkill but is more scalable.
 */
static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
                                 int page_start, int page_end)
{
        flush_cache_vunmap(
                pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
                pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
        vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT));
}

/**
 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array which can be used to pass information to free
 * @page_start: page index of the first page to unmap
 * @page_end: page index of the last page to unmap + 1
 *
 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 * Corresponding elements in @pages were cleared by the caller and can
 * be used to carry information to pcpu_free_pages() which will be
 * called after all unmaps are finished.  The caller should call
 * proper pre/post flush functions.
 */
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
                             struct page **pages, int page_start, int page_end)
{
        unsigned int cpu;
        int i;

        for_each_possible_cpu(cpu) {
                for (i = page_start; i < page_end; i++) {
                        struct page *page;

                        page = pcpu_chunk_page(chunk, cpu, i);
                        WARN_ON(!page);
                        pages[pcpu_page_idx(cpu, i)] = page;
                }
                __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
                                   page_end - page_start);
        }
}

/**
 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
 * TLB for the regions.  This can be skipped if the area is to be
 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
 *
 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
 * for the whole region.
 */
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
                                      int page_start, int page_end)
{
        flush_tlb_kernel_range(
                pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
                pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

static int __pcpu_map_pages(unsigned long addr, struct page **pages,
                            int nr_pages)
{
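        /* map at base-page granularity; cache/TLB flushes are done by the callers */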
        return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
                                        PAGE_KERNEL, pages, PAGE_SHIFT);
}

/**
 * pcpu_map_pages - map pages into a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array containing pages to be mapped
 * @page_start: page index of the first page to map
 * @page_end: page index of the last page to map + 1
 *
 * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
 * caller is responsible for calling pcpu_post_map_flush() after all
 * mappings are complete.
 *
 * This function is responsible for setting up whatever is necessary for
 * reverse lookup (addr -> chunk).
 */
static int pcpu_map_pages(struct pcpu_chunk *chunk,
                          struct page **pages, int page_start, int page_end)
{
        unsigned int cpu, tcpu;
        int i, err;

        for_each_possible_cpu(cpu) {
                err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
                                       &pages[pcpu_page_idx(cpu, page_start)],
                                       page_end - page_start);
                if (err < 0)
                        goto err;

                for (i = page_start; i < page_end; i++)
                        pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
                                            chunk);
        }
        return 0;
err:
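        /*
         * Unmap whatever was mapped for the cpus handled before the
         * failure and flush the TLB before reporting the error.
         */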
        for_each_possible_cpu(tcpu) {
                if (tcpu == cpu)
                        break;
                __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
                                   page_end - page_start);
        }
        pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
        return err;
}

/**
 * pcpu_post_map_flush - flush cache after mapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
 * cache.
 *
 * As with pcpu_pre_unmap_flush(), the cache flush is done at once
 * for the whole region.
 */
static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
                                int page_start, int page_end)
{
        flush_cache_vmap(
                pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
                pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

/**
 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: the start page
 * @page_end: the end page
 * @gfp: allocation flags passed to the underlying memory allocator
 *
 * For each cpu, populate and map pages [@page_start,@page_end) into
 * @chunk.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
                               int page_start, int page_end, gfp_t gfp)
{
        struct page **pages;

        pages = pcpu_get_pages();
        if (!pages)
                return -ENOMEM;

        if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
                return -ENOMEM;

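        /*
         * pcpu_alloc_pages() frees what it allocated on failure; only a
         * mapping failure leaves pages for us to release here.
         */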
        if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
                pcpu_free_pages(chunk, pages, page_start, page_end);
                return -ENOMEM;
        }
        pcpu_post_map_flush(chunk, page_start, page_end);

        return 0;
}

/**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
 * @page_start: the start page
 * @page_end: the end page
 *
 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 * from @chunk.
 *
 * The caller is required to call pcpu_post_unmap_tlb_flush() unless the
 * region is returned to vmalloc(), which flushes the TLB lazily.
 *
 * CONTEXT:
 * pcpu_alloc_mutex.
 */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                                  int page_start, int page_end)
{
        struct page **pages;

        /*
         * If control reaches here, there must have been at least one
         * successful population attempt so the temp pages array must
         * be available now.
         */
        pages = pcpu_get_pages();
        BUG_ON(!pages);

        /* unmap and free */
        pcpu_pre_unmap_flush(chunk, page_start, page_end);

        pcpu_unmap_pages(chunk, pages, page_start, page_end);

        pcpu_free_pages(chunk, pages, page_start, page_end);
}

static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
        struct pcpu_chunk *chunk;
        struct vm_struct **vms;

        chunk = pcpu_alloc_chunk(gfp);
        if (!chunk)
                return NULL;

        vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
                                pcpu_nr_groups, pcpu_atom_size);
        if (!vms) {
                pcpu_free_chunk(chunk);
                return NULL;
        }

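        /*
         * The chunk base address is the start of group 0's vm area minus
         * that group's offset, so each unit ends up at base + unit offset.
         */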
        chunk->data = vms;
        chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];

        pcpu_stats_chunk_alloc();
        trace_percpu_create_chunk(chunk->base_addr);

        return chunk;
}

static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
{
        if (!chunk)
                return;

        pcpu_stats_chunk_dealloc();
        trace_percpu_destroy_chunk(chunk->base_addr);

        if (chunk->data)
                pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
        pcpu_free_chunk(chunk);
}

static struct page *pcpu_addr_to_page(void *addr)
{
        return vmalloc_to_page(addr);
}

static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
{
        /* no extra restriction */
        return 0;
}

/**
 * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim
 * @chunk: chunk of interest
 *
 * This is the entry point for percpu reclaim.  If a chunk qualifies, it is then
 * isolated and managed in separate lists at the back of pcpu_slot: sidelined
 * and to_depopulate respectively.  The to_depopulate list holds chunks slated
 * for depopulation.  They no longer contribute to pcpu_nr_empty_pop_pages once
 * they are on this list.  Once depopulated, they are moved onto the sidelined
 * list, which allows them to be pulled back in for allocation if no other chunk
 * can satisfy the allocation.
 */
static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
{
        /* do not reclaim either the first chunk or reserved chunk */
        if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk)
                return false;

        /*
         * If it is isolated, it may be on the sidelined list so move it back to
         * the to_depopulate list.  If at least 1/4 of its pages are empty AND
         * there is no system-wide shortage of empty pages aside from this
         * chunk, move it to the to_depopulate list.
         */
        return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
                (pcpu_nr_empty_pop_pages >
                 (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) &&
                 chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
}