// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate.h>

/*
 * struct page extension
 *
 * This feature manages memory for extended data per page.
 *
 * Without it, storing extra per-page data means modifying struct page
 * itself, which requires rebuilding the kernel - a really time consuming
 * process - and is sometimes impossible due to third party module
 * dependencies. Moreover, enlarging struct page could cause unwanted
 * changes in system behaviour.
 *
 * This feature is intended to overcome the problems mentioned above. It
 * allocates memory for extended per-page data in a separate place rather
 * than in struct page itself, and that memory is reached through the
 * accessor functions provided by this code. During boot it checks whether
 * a huge chunk of memory actually needs to be allocated; if not, it avoids
 * allocating memory at all. This lets the feature be included in the kernel
 * by default, avoiding rebuilds and the related problems.
 *
 * To make this work, clients provide two callbacks. One is the need
 * callback, which is mandatory if a client wants to avoid useless memory
 * allocation at boot-time. The other, the init callback, is optional and
 * is used to do proper initialization after the memory is allocated.
 *
 * The need callback decides whether extended memory allocation is needed.
 * Sometimes users deactivate features for a particular boot and the extra
 * memory would be unnecessary, so, to avoid allocating a huge chunk of
 * memory in that case, each client reports its need for extra memory
 * through the need callback. If any need callback returns true, someone
 * needs extra memory and the page extension core must allocate it. If none
 * of the need callbacks returns true, memory isn't needed at all for this
 * boot and the core can skip the allocation entirely, so no memory is
 * wasted.
 *
 * When a need callback returns true, page_ext checks whether the client
 * requests extra memory through ->size in struct page_ext_operations. If it
 * is non-zero, extra space is reserved in each page_ext entry and its
 * location is reported back to the client through ->offset.
 *
 * The init callback is used to do proper initialization after page extension
 * is completely set up. On sparse memory systems the extra memory is
 * allocated some time after the memmap is; in other words, the lifetime of
 * the page extension memory is not the same as that of the memmap for
 * struct page. Clients therefore can't store extra data until page
 * extension is initialized, even though pages are already allocated and
 * used freely. This could leave the extra per-page data in an inconsistent
 * state, so clients can use this callback to initialize it correctly.
 *
 * A sketch of a hypothetical client is shown right after this comment.
 */
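/*
 * A minimal sketch of how a hypothetical client could plug into page_ext.
 * The "foo" names below (struct foo_data, foo_enabled, foo_ops,
 * get_foo_data()) are illustrative assumptions only; real clients such as
 * page_owner follow the same pattern from their own files and are listed
 * in the page_ext_ops[] array below. The block is deliberately compiled
 * out.
 */
#if 0	/* illustrative example only */
struct foo_data {
	unsigned long stamp;		/* per-page data owned by "foo" */
};

static bool foo_enabled;		/* imagined boot-time switch */

static bool __init need_foo(void)
{
	/* Only ask for per-page memory when the feature is enabled. */
	return foo_enabled;
}

static void __init init_foo(void)
{
	/* Called once page_ext is fully set up; entries are usable now. */
}

static struct page_ext_operations foo_ops = {
	.size = sizeof(struct foo_data),
	.need = need_foo,
	.init = init_foo,
};

/* ->offset is filled in by invoke_need_callbacks() before any lookup. */
static struct foo_data *get_foo_data(struct page_ext *page_ext)
{
	return (void *)page_ext + foo_ops.offset;
}
#endif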

#ifdef CONFIG_SPARSEMEM
#define PAGE_EXT_INVALID (0x1)
#endif

#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
static bool need_page_idle(void)
{
	return true;
}
static struct page_ext_operations page_idle_ops __initdata = {
	.need = need_page_idle,
	.need_shared_flags = true,
};
#endif

static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
	&page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
	&page_idle_ops,
#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
	&page_table_check_ops,
#endif
};

unsigned long page_ext_size;

static unsigned long total_usage;

bool early_page_ext __meminitdata;
static int __init setup_early_page_ext(char *str)
{
	early_page_ext = true;
	return 0;
}
early_param("early_page_ext", setup_early_page_ext);

static bool __init invoke_need_callbacks(void)
{
	int i;
	int entries = ARRAY_SIZE(page_ext_ops);
	bool need = false;

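	/*
	 * First pass: if any enabled client relies on the shared page_ext
	 * flags word, reserve room for struct page_ext itself at the start
	 * of every entry.
	 */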
	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need()) {
			if (page_ext_ops[i]->need_shared_flags) {
				page_ext_size = sizeof(struct page_ext);
				break;
			}
		}
	}

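	/*
	 * Second pass: hand each enabled client its offset into the entry
	 * and grow the per-page entry size by its requested ->size.
	 */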
	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need()) {
			page_ext_ops[i]->offset = page_ext_size;
			page_ext_size += page_ext_ops[i]->size;
			need = true;
		}
	}

	return need;
}

static void __init invoke_init_callbacks(void)
{
	int i;
	int entries = ARRAY_SIZE(page_ext_ops);

	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->init)
			page_ext_ops[i]->init();
	}
}

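/*
 * Entries are laid out back to back, page_ext_size bytes apart, so the
 * n-th entry of a table lives at base + n * page_ext_size.
 */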
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
	return base + page_ext_size * index;
}

#ifndef CONFIG_SPARSEMEM
void __init page_ext_init_flatmem_late(void)
{
	invoke_init_callbacks();
}

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
	pgdat->node_page_ext = NULL;
}

static struct page_ext *lookup_page_ext(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long index;
	struct page_ext *base;

	WARN_ON_ONCE(!rcu_read_lock_held());
	base = NODE_DATA(page_to_nid(page))->node_page_ext;
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (unlikely(!base))
		return NULL;
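	/*
	 * The node's table starts at its start pfn rounded down to
	 * MAX_ORDER_NR_PAGES, matching the slack allocated in
	 * alloc_node_page_ext() for unaligned node ranges.
	 */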
	index = pfn - round_down(node_start_pfn(page_to_nid(page)),
					MAX_ORDER_NR_PAGES);
	return get_entry(base, index);
}

static int __init alloc_node_page_ext(int nid)
{
	struct page_ext *base;
	unsigned long table_size;
	unsigned long nr_pages;

	nr_pages = NODE_DATA(nid)->node_spanned_pages;
	if (!nr_pages)
		return 0;

	/*
	 * Need extra space if node range is not aligned with
	 * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
	 * checks buddy's status, range could be out of exact node range.
	 */
	if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
		!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
		nr_pages += MAX_ORDER_NR_PAGES;

	table_size = page_ext_size * nr_pages;

	base = memblock_alloc_try_nid(
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!base)
		return -ENOMEM;
	NODE_DATA(nid)->node_page_ext = base;
	total_usage += table_size;
	return 0;
}

void __init page_ext_init_flatmem(void)
{
	int nid, fail;

	if (!invoke_need_callbacks())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_ext(nid);
		if (fail)
			goto fail;
	}
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
	return;

fail:
	pr_crit("allocation of page_ext failed.\n");
	panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */
static bool page_ext_invalid(struct page_ext *page_ext)
{
	return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
}

static struct page_ext *lookup_page_ext(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);
	struct page_ext *page_ext = READ_ONCE(section->page_ext);

	WARN_ON_ONCE(!rcu_read_lock_held());
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (page_ext_invalid(page_ext))
		return NULL;
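	/*
	 * section->page_ext is stored pre-offset by the section's first
	 * pfn (see init_section_page_ext()), so the raw pfn can be used
	 * as the index directly.
	 */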
	return get_entry(page_ext, pfn);
}

static void *__meminit alloc_page_ext(size_t size, int nid)
{
	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
	void *addr = NULL;

	addr = alloc_pages_exact_nid(nid, size, flags);
	if (addr) {
		kmemleak_alloc(addr, size, 1, flags);
		return addr;
	}

	addr = vzalloc_node(size, nid);

	return addr;
}

static int __meminit init_section_page_ext(unsigned long pfn, int nid)
{
	struct mem_section *section;
	struct page_ext *base;
	unsigned long table_size;

	section = __pfn_to_section(pfn);

	if (section->page_ext)
		return 0;

	table_size = page_ext_size * PAGES_PER_SECTION;
	base = alloc_page_ext(table_size, nid);

	/*
	 * The value stored in section->page_ext is (base - pfn)
	 * and it does not point to the memory block allocated above,
	 * causing kmemleak false positives.
	 */
	kmemleak_not_leak(base);

	if (!base) {
		pr_err("page ext allocation failure\n");
		return -ENOMEM;
	}

	/*
	 * The passed "pfn" may not be aligned to SECTION. For the calculation
	 * we need to apply a mask.
	 */
	pfn &= PAGE_SECTION_MASK;
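	/*
	 * Store the base pre-offset by the section's first pfn so that
	 * lookup_page_ext() can index the table with an unadjusted pfn.
	 */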
	section->page_ext = (void *)base - page_ext_size * pfn;
	total_usage += table_size;
	return 0;
}

static void free_page_ext(void *addr)
{
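	/*
	 * Mirror alloc_page_ext(): vmalloc'ed tables are returned with
	 * vfree(), page-allocator backed ones with free_pages_exact().
	 */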
	if (is_vmalloc_addr(addr)) {
		vfree(addr);
	} else {
		struct page *page = virt_to_page(addr);
		size_t table_size;

		table_size = page_ext_size * PAGES_PER_SECTION;

		BUG_ON(PageReserved(page));
		kmemleak_free(addr);
		free_pages_exact(addr, table_size);
	}
}

static void __free_page_ext(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_ext *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;

	base = READ_ONCE(ms->page_ext);
	/*
	 * page_ext here can be valid while doing the roll back
	 * operation in online_page_ext().
	 */
	if (page_ext_invalid(base))
		base = (void *)base - PAGE_EXT_INVALID;
	WRITE_ONCE(ms->page_ext, NULL);

	base = get_entry(base, pfn);
	free_page_ext(base);
}

static void __invalidate_page_ext(unsigned long pfn)
{
	struct mem_section *ms;
	void *val;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;
	val = (void *)ms->page_ext + PAGE_EXT_INVALID;
	WRITE_ONCE(ms->page_ext, val);
}

static int __meminit online_page_ext(unsigned long start_pfn,
				unsigned long nr_pages,
				int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	if (nid == NUMA_NO_NODE) {
		/*
		 * In this case the node already exists and contains valid
		 * memory. "start_pfn" was passed to online_pages() as well,
		 * so it must correspond to an existing pfn.
		 */
		nid = pfn_to_nid(start_pfn);
		VM_BUG_ON(!node_online(nid));
	}

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
		fail = init_section_page_ext(pfn, nid);
	if (!fail)
		return 0;

	/* rollback */
	end = pfn - PAGES_PER_SECTION;
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);

	return -ENOMEM;
}

static void __meminit offline_page_ext(unsigned long start_pfn,
				unsigned long nr_pages)
{
	unsigned long start, end, pfn;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	/*
	 * Freeing of page_ext is done in 3 steps to avoid
	 * use-after-free of it:
	 * 1) Traverse all the sections and mark their page_ext
	 *    as invalid.
	 * 2) Wait for all the existing users of page_ext who
	 *    started before invalidation to finish.
	 * 3) Free the page_ext.
	 */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__invalidate_page_ext(pfn);

	synchronize_rcu();

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);
}

static int __meminit page_ext_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_ext(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_ext(mn->start_pfn,
				mn->nr_pages);
		break;
	case MEM_CANCEL_ONLINE:
		offline_page_ext(mn->start_pfn,
				mn->nr_pages);
		break;
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	return notifier_from_errno(ret);
}

void __init page_ext_init(void)
{
	unsigned long pfn;
	int nid;

	if (!invoke_need_callbacks())
		return;

	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		start_pfn = node_start_pfn(nid);
		end_pfn = node_end_pfn(nid);
		/*
		 * start_pfn and end_pfn may not be aligned to SECTION and the
		 * page->flags of pages outside the node are not initialized,
		 * so scan [start_pfn, the largest section boundary below
		 * end_pfn) here.
		 */
		for (pfn = start_pfn; pfn < end_pfn;
			pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

			if (!pfn_valid(pfn))
				continue;
			/*
			 * Nodes' pfn ranges can overlap. Some architectures
			 * have a node layout such as
			 * -------------pfn-------------->
			 * N0 | N1 | N2 | N0 | N1 | N2|....
			 */
			if (pfn_to_nid(pfn) != nid)
				continue;
			if (init_section_page_ext(pfn, nid))
				goto oom;
			cond_resched();
		}
	}
	hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
	invoke_init_callbacks();
	return;

oom:
	panic("Out of memory");
}

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}

#endif

/**
 * page_ext_get() - Get the extended information for a page.
 * @page: The page we're interested in.
 *
 * Ensures that the page_ext will remain valid until page_ext_put()
 * is called.
 *
 * Return: NULL if no page_ext exists for this page.
 * Context: Any context. Caller may not sleep until they have called
 * page_ext_put().
 */
struct page_ext *page_ext_get(struct page *page)
{
	struct page_ext *page_ext;

	rcu_read_lock();
	page_ext = lookup_page_ext(page);
	if (!page_ext) {
		rcu_read_unlock();
		return NULL;
	}

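	/* Stay in the RCU read-side section; page_ext_put() will drop it. */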
	return page_ext;
}

/**
 * page_ext_put() - Finished working with a page's extended information.
 * @page_ext: Page extended information received from page_ext_get().
 *
 * The page extended information of the page may not be valid after this
 * function is called.
 *
 * Return: None.
 * Context: Any context where the matching page_ext_get() was called.
 */
void page_ext_put(struct page_ext *page_ext)
{
	if (unlikely(!page_ext))
		return;

	rcu_read_unlock();
}