/* hugetlbpage.c source code [linux/arch/powerpc/mm/hugetlbpage.c] */

/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */
10
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/printk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>
#include <asm/firmware.h>
28
/*
 * When true, hugetlbpage_init() only logs that HugeTLB support is
 * disabled and registers no huge page sizes.
 */
bool hugetlb_disabled = false;

/* True when the hugepd entry holds the value 0, i.e. nothing is installed. */
#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

/*
 * Difference between the 1-based lowest-set-bit positions of
 * sizeof(pte_basic_t) and sizeof(void *).  Used in this file as the
 * index handed to PGT_CACHE() / pgtable_cache_add() for hugepte tables
 * whose huge page shift is at least the directory shift.
 */
#define PTE_T_ORDER	(__builtin_ffs(sizeof(pte_basic_t)) - \
			 __builtin_ffs(sizeof(void *)))
35
36	pte_t huge_pte_offset(struct* mm_struct mm, unsigned* long addr, unsigned long sz)
37	{
38	/*
39	* Only called for hugetlbfs pages, hence can ignore THP and the
40	* irq disabled walk.
41	*/
42	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
43	}
44
45	static int __hugepte_alloc(struct mm_struct mm, hugepd_t hpdp,
46	unsigned long address, unsigned int pdshift,
47	unsigned int pshift, spinlock_t *ptl)
48	{
49	struct kmem_cache *cachep;
50	pte_t *new;
51	int i;
52	int num_hugepd;
53
54	if (pshift >= pdshift) {
55	cachep = PGT_CACHE(PTE_T_ORDER);
56	num_hugepd = `1` << (pshift - pdshift);
57	} else {
58	cachep = PGT_CACHE(pdshift - pshift);
59	num_hugepd = `1`;
60	}
61
62	if (!cachep) {
63	WARN_ONCE(`1`, "No page table cache created for hugetlb tables");
64	return -ENOMEM;
65	}
66
67	new = kmem_cache_alloc(cachep, flags: pgtable_gfp_flags(mm, GFP_KERNEL));
68
69	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
70	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
71
72	if (!new)
73	return -ENOMEM;
74
75	/*
76	* Make sure other cpus find the hugepd set only after a
77	* properly initialized page table is visible to them.
78	* For more details look for comment in __pte_alloc().
79	*/
80	smp_wmb();
81
82	spin_lock(lock: ptl);
83	/*
84	* We have multiple higher-level entries that point to the same
85	* actual pte location. Fill in each as we go and backtrack on error.
86	* We need all of these so the DTLB pgtable walk code can find the
87	* right higher-level entry without knowing if it's a hugepage or not.
88	*/
89	for (i = `0`; i < num_hugepd; i++, hpdp++) {
90	if (unlikely(!hugepd_none(*hpdp)))
91	break;
92	hugepd_populate(hpdp, new, pshift);
93	}
94	/ If we bailed from the for loop early, an error occurred, clean up /
95	if (i < num_hugepd) {
96	for (i = i - `1` ; i >= `0`; i--, hpdp--)
97	*hpdp = __hugepd(`0`);
98	kmem_cache_free(s: cachep, objp: new);
99	} else {
100	kmemleak_ignore(ptr: new);
101	}
102	spin_unlock(lock: ptl);
103	return `0`;
104	}
105
106	/*
107	* At this point we do the placement change only for BOOK3S 64. This would
108	* possibly work on other subarchs.
109	*/
110	pte_t huge_pte_alloc(struct* mm_struct mm, struct* vm_area_struct *vma,
111	unsigned long addr, unsigned long sz)
112	{
113	pgd_t *pg;
114	p4d_t *p4;
115	pud_t *pu;
116	pmd_t *pm;
117	hugepd_t *hpdp = NULL;
118	unsigned pshift = __ffs(sz);
119	unsigned pdshift = PGDIR_SHIFT;
120	spinlock_t *ptl;
121
122	addr &= ~(sz-`1`);
123	pg = pgd_offset(mm, addr);
124	p4 = p4d_offset(pgd: pg, address: addr);
125
126	#ifdef CONFIG_PPC_BOOK3S_64
127	if (pshift == PGDIR_SHIFT)
128	/ 16GB huge page /
129	return (pte_t *) p4;
130	else if (pshift > PUD_SHIFT) {
131	/*
132	* We need to use hugepd table
133	*/
134	ptl = &mm->page_table_lock;
135	hpdp = (hugepd_t *)p4;
136	} else {
137	pdshift = PUD_SHIFT;
138	pu = pud_alloc(mm, p4, addr);
139	if (!pu)
140	return NULL;
141	if (pshift == PUD_SHIFT)
142	return (pte_t *)pu;
143	else if (pshift > PMD_SHIFT) {
144	ptl = pud_lockptr(mm, pu);
145	hpdp = (hugepd_t *)pu;
146	} else {
147	pdshift = PMD_SHIFT;
148	pm = pmd_alloc(mm, pu, addr);
149	if (!pm)
150	return NULL;
151	if (pshift == PMD_SHIFT)
152	/ 16MB hugepage /
153	return (pte_t *)pm;
154	else {
155	ptl = pmd_lockptr(mm, pm);
156	hpdp = (hugepd_t *)pm;
157	}
158	}
159	}
160	#else
161	if (pshift >= PGDIR_SHIFT) {
162	ptl = &mm->page_table_lock;
163	hpdp = (hugepd_t *)p4;
164	} else {
165	pdshift = PUD_SHIFT;
166	pu = pud_alloc(mm, p4d: p4, address: addr);
167	if (!pu)
168	return NULL;
169	if (pshift >= PUD_SHIFT) {
170	ptl = pud_lockptr(mm, pud: pu);
171	hpdp = (hugepd_t *)pu;
172	} else {
173	pdshift = PMD_SHIFT;
174	pm = pmd_alloc(mm, pud: pu, address: addr);
175	if (!pm)
176	return NULL;
177	ptl = pmd_lockptr(mm, pmd: pm);
178	hpdp = (hugepd_t *)pm;
179	}
180	}
181	#endif
182	if (!hpdp)
183	return NULL;
184
185	if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
186	return pte_alloc_huge(mm, pmd: (pmd_t *)hpdp, address: addr);
187
188	BUG_ON(!hugepd_none(hpdp) && !hugepd_ok(hpdp));
189
190	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, address: addr,
191	pdshift, pshift, ptl))
192	return NULL;
193
194	return hugepte_offset(*hpdp, addr, pdshift);
195	}
196
197	#ifdef CONFIG_PPC_BOOK3S_64
198	/*
199	* Tracks gpages after the device tree is scanned and before the
200	* huge_boot_pages list is ready on pseries.
201	*/
202	#define MAX_NUMBER_GPAGES 1024
203	__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
204	__initdata static unsigned nr_gpages;
205
206	/*
207	* Build list of addresses of gigantic pages. This function is used in early
208	* boot before the buddy allocator is setup.
209	*/
210	void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
211	{
212	if (!addr)
213	return;
214	while (number_of_pages > `0`) {
215	gpage_freearray[nr_gpages] = addr;
216	nr_gpages++;
217	number_of_pages--;
218	addr += page_size;
219	}
220	}
221
222	static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
223	{
224	struct huge_bootmem_page *m;
225	if (nr_gpages == `0`)
226	return `0`;
227	m = phys_to_virt(gpage_freearray[--nr_gpages]);
228	gpage_freearray[nr_gpages] = `0`;
229	list_add(&m->list, &huge_boot_pages[`0`]);
230	m->hstate = hstate;
231	return `1`;
232	}
233
234	bool __init hugetlb_node_alloc_supported(void)
235	{
236	return false;
237	}
238	#endif
239
240
241	int __init alloc_bootmem_huge_page(struct hstate h, int* nid)
242	{
243
244	#ifdef CONFIG_PPC_BOOK3S_64
245	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
246	return pseries_alloc_bootmem_huge_page(h);
247	#endif
248	return __alloc_bootmem_huge_page(h, nid);
249	}
250
251	#ifndef CONFIG_PPC_BOOK3S_64
252	#define HUGEPD_FREELIST_SIZE \
253	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
254
255	struct hugepd_freelist {
256	struct rcu_head rcu;
257	unsigned int index;
258	void *ptes[];
259	};
260
261	static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
262
263	static void hugepd_free_rcu_callback(struct rcu_head *head)
264	{
265	struct hugepd_freelist *batch =
266	container_of(head, struct hugepd_freelist, rcu);
267	unsigned int i;
268
269	for (i = `0`; i < batch->index; i++)
270	kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]);
271
272	free_page((unsigned long)batch);
273	}
274
275	static void hugepd_free(struct mmu_gather tlb, void* *hugepte)
276	{
277	struct hugepd_freelist **batchp;
278
279	batchp = &get_cpu_var(hugepd_freelist_cur);
280
281	if (atomic_read(v: &tlb->mm->mm_users) < `2` \|\|
282	mm_is_thread_local(tlb->mm)) {
283	kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte);
284	put_cpu_var(hugepd_freelist_cur);
285	return;
286	}
287
288	if (*batchp == NULL) {
289	batchp = (struct* hugepd_freelist *)__get_free_page(GFP_ATOMIC);
290	(*batchp)->index = `0`;
291	}
292
293	(batchp)->ptes[(batchp)->index++] = hugepte;
294	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
295	call_rcu(head: &(*batchp)->rcu, func: hugepd_free_rcu_callback);
296	*batchp = NULL;
297	}
298	put_cpu_var(hugepd_freelist_cur);
299	}
300	#else
/* CONFIG_PPC_BOOK3S_64 variant: intentionally empty. */
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
302	#endif
303
/*
 * Return true when the entry to be freed maps more than the area being freed.
 *
 * @start:   first address of the range being torn down
 * @end:     end of that range; the comparison uses @end - 1
 * @floor:   lowest address the caller allows to be freed
 * @ceiling: upper limit, or 0 for "no limit"
 * @mask:    mask aligning an address down to the entry's coverage
 *
 * The entry is outside the limits when its aligned start lies below
 * @floor, when a non-zero @ceiling aligns down to 0, or when @end - 1
 * lies beyond the aligned @ceiling - 1.  With @ceiling == 0 the last
 * comparison is against ULONG_MAX and can never be true.
 */
static bool range_is_outside_limits(unsigned long start, unsigned long end,
				    unsigned long floor, unsigned long ceiling,
				    unsigned long mask)
{
	if ((start & mask) < floor)
		return true;
	if (ceiling) {
		ceiling &= mask;
		if (!ceiling)
			return true;
	}
	return end - 1 > ceiling - 1;
}
318
319	static void free_hugepd_range(struct mmu_gather tlb, hugepd_t hpdp, int pdshift,
320	unsigned long start, unsigned long end,
321	unsigned long floor, unsigned long ceiling)
322	{
323	pte_t hugepte = hugepd_page(hpdp);
324	int i;
325
326	unsigned long pdmask = ~((`1UL` << pdshift) - `1`);
327	unsigned int num_hugepd = `1`;
328	unsigned int shift = hugepd_shift(*hpdp);
329
330	/ Note: On fsl the hpdp may be the first of several /
331	if (shift > pdshift)
332	num_hugepd = `1` << (shift - pdshift);
333
334	if (range_is_outside_limits(start, end, floor, ceiling, mask: pdmask))
335	return;
336
337	for (i = `0`; i < num_hugepd; i++, hpdp++)
338	*hpdp = __hugepd(`0`);
339
340	if (shift >= pdshift)
341	hugepd_free(tlb, hugepte);
342	else
343	pgtable_free_tlb(tlb, hugepte,
344	get_hugepd_cache_index(pdshift - shift));
345	}
346
347	static void hugetlb_free_pte_range(struct mmu_gather tlb, pmd_t pmd,
348	unsigned long addr, unsigned long end,
349	unsigned long floor, unsigned long ceiling)
350	{
351	pgtable_t token = pmd_pgtable(*pmd);
352
353	if (range_is_outside_limits(start: addr, end, floor, ceiling, PMD_MASK))
354	return;
355
356	pmd_clear(pmdp: pmd);
357	pte_free_tlb(tlb, token, addr);
358	mm_dec_nr_ptes(mm: tlb->mm);
359	}
360
361	static void hugetlb_free_pmd_range(struct mmu_gather tlb, pud_t pud,
362	unsigned long addr, unsigned long end,
363	unsigned long floor, unsigned long ceiling)
364	{
365	pmd_t *pmd;
366	unsigned long next;
367	unsigned long start;
368
369	start = addr;
370	do {
371	unsigned long more;
372
373	pmd = pmd_offset(pud, address: addr);
374	next = pmd_addr_end(addr, end);
375	if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
376	if (pmd_none_or_clear_bad(pmd))
377	continue;
378
379	/*
380	* if it is not hugepd pointer, we should already find
381	* it cleared.
382	*/
383	WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx));
384
385	hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling);
386
387	continue;
388	}
389	/*
390	* Increment next by the size of the huge mapping since
391	* there may be more than one entry at this level for a
392	* single hugepage, but all of them point to
393	* the same kmem cache that holds the hugepte.
394	*/
395	more = addr + (`1UL` << hugepd_shift((hugepd_t )pmd));
396	if (more > next)
397	next = more;
398
399	free_hugepd_range(tlb, hpdp: (hugepd_t *)pmd, PMD_SHIFT,
400	start: addr, end: next, floor, ceiling);
401	} while (addr = next, addr != end);
402
403	if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK))
404	return;
405
406	pmd = pmd_offset(pud, address: start & PUD_MASK);
407	pud_clear(pudp: pud);
408	pmd_free_tlb(tlb, pmd, start & PUD_MASK);
409	mm_dec_nr_pmds(mm: tlb->mm);
410	}
411
412	static void hugetlb_free_pud_range(struct mmu_gather tlb, p4d_t p4d,
413	unsigned long addr, unsigned long end,
414	unsigned long floor, unsigned long ceiling)
415	{
416	pud_t *pud;
417	unsigned long next;
418	unsigned long start;
419
420	start = addr;
421	do {
422	pud = pud_offset(p4d, address: addr);
423	next = pud_addr_end(addr, end);
424	if (!is_hugepd(__hugepd(pud_val(*pud)))) {
425	if (pud_none_or_clear_bad(pud))
426	continue;
427	hugetlb_free_pmd_range(tlb, pud, addr, end: next, floor,
428	ceiling);
429	} else {
430	unsigned long more;
431	/*
432	* Increment next by the size of the huge mapping since
433	* there may be more than one entry at this level for a
434	* single hugepage, but all of them point to
435	* the same kmem cache that holds the hugepte.
436	*/
437	more = addr + (`1UL` << hugepd_shift((hugepd_t )pud));
438	if (more > next)
439	next = more;
440
441	free_hugepd_range(tlb, hpdp: (hugepd_t *)pud, PUD_SHIFT,
442	start: addr, end: next, floor, ceiling);
443	}
444	} while (addr = next, addr != end);
445
446	if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK))
447	return;
448
449	pud = pud_offset(p4d, address: start & PGDIR_MASK);
450	p4d_clear(p4dp: p4d);
451	pud_free_tlb(tlb, pud, start & PGDIR_MASK);
452	mm_dec_nr_puds(mm: tlb->mm);
453	}
454
455	/*
456	* This function frees user-level page tables of a process.
457	*/
458	void hugetlb_free_pgd_range(struct mmu_gather *tlb,
459	unsigned long addr, unsigned long end,
460	unsigned long floor, unsigned long ceiling)
461	{
462	pgd_t *pgd;
463	p4d_t *p4d;
464	unsigned long next;
465
466	/*
467	* Because there are a number of different possible pagetable
468	* layouts for hugepage ranges, we limit knowledge of how
469	* things should be laid out to the allocation path
470	* (huge_pte_alloc(), above). Everything else works out the
471	* structure as it goes from information in the hugepd
472	* pointers. That means that we can't here use the
473	* optimization used in the normal page free_pgd_range(), of
474	* checking whether we're actually covering a large enough
475	* range to have to do anything at the top level of the walk
476	* instead of at the bottom.
477	*
478	* To make sense of this, you should probably go read the big
479	* block comment at the top of the normal free_pgd_range(),
480	* too.
481	*/
482
483	do {
484	next = pgd_addr_end(addr, end);
485	pgd = pgd_offset(tlb->mm, addr);
486	p4d = p4d_offset(pgd, address: addr);
487	if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
488	if (p4d_none_or_clear_bad(p4d))
489	continue;
490	hugetlb_free_pud_range(tlb, p4d, addr, end: next, floor, ceiling);
491	} else {
492	unsigned long more;
493	/*
494	* Increment next by the size of the huge mapping since
495	* there may be more than one entry at the pgd level
496	* for a single hugepage, but all of them point to the
497	* same kmem cache that holds the hugepte.
498	*/
499	more = addr + (`1UL` << hugepd_shift((hugepd_t )pgd));
500	if (more > next)
501	next = more;
502
503	free_hugepd_range(tlb, hpdp: (hugepd_t *)p4d, PGDIR_SHIFT,
504	start: addr, end: next, floor, ceiling);
505	}
506	} while (addr = next, addr != end);
507	}
508
509	bool __init arch_hugetlb_valid_size(unsigned long size)
510	{
511	int shift = __ffs(size);
512	int mmu_psize;
513
514	/ Check that it is a page size supported by the hardware and*
515	* that it fits within pagetable and slice limits. */
516	if (size <= PAGE_SIZE \|\| !is_power_of_2(n: size))
517	return false;
518
519	mmu_psize = check_and_get_huge_psize(shift);
520	if (mmu_psize < `0`)
521	return false;
522
523	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
524
525	return true;
526	}
527
528	static int __init add_huge_page_size(unsigned long long size)
529	{
530	int shift = __ffs(size);
531
532	if (!arch_hugetlb_valid_size(size: (unsigned long)size))
533	return -EINVAL;
534
535	hugetlb_add_hstate(order: shift - PAGE_SHIFT);
536	return `0`;
537	}
538
539	static int __init hugetlbpage_init(void)
540	{
541	bool configured = false;
542	int psize;
543
544	if (hugetlb_disabled) {
545	pr_info("HugeTLB support is disabled!\n");
546	return `0`;
547	}
548
549	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() &&
550	!mmu_has_feature(MMU_FTR_16M_PAGE))
551	return -ENODEV;
552
553	for (psize = `0`; psize < MMU_PAGE_COUNT; ++psize) {
554	unsigned shift;
555	unsigned pdshift;
556
557	if (!mmu_psize_defs[psize].shift)
558	continue;
559
560	shift = mmu_psize_to_shift(psize);
561
562	#ifdef CONFIG_PPC_BOOK3S_64
563	if (shift > PGDIR_SHIFT)
564	continue;
565	else if (shift > PUD_SHIFT)
566	pdshift = PGDIR_SHIFT;
567	else if (shift > PMD_SHIFT)
568	pdshift = PUD_SHIFT;
569	else
570	pdshift = PMD_SHIFT;
571	#else
572	if (shift < PUD_SHIFT)
573	pdshift = PMD_SHIFT;
574	else if (shift < PGDIR_SHIFT)
575	pdshift = PUD_SHIFT;
576	else
577	pdshift = PGDIR_SHIFT;
578	#endif
579
580	if (add_huge_page_size(size: `1ULL` << shift) < `0`)
581	continue;
582	/*
583	* if we have pdshift and shift value same, we don't
584	* use pgt cache for hugepd.
585	*/
586	if (pdshift > shift) {
587	if (!IS_ENABLED(CONFIG_PPC_8xx))
588	pgtable_cache_add(pdshift - shift);
589	} else if (IS_ENABLED(CONFIG_PPC_E500) \|\|
590	IS_ENABLED(CONFIG_PPC_8xx)) {
591	pgtable_cache_add(PTE_T_ORDER);
592	}
593
594	configured = true;
595	}
596
597	if (!configured)
598	pr_info("Failed to initialize. Disabling HugeTLB");
599
600	return `0`;
601	}
602
603	arch_initcall(hugetlbpage_init);
604
605	void __init gigantic_hugetlb_cma_reserve(void)
606	{
607	unsigned long order = `0`;
608
609	if (radix_enabled())
610	order = PUD_SHIFT - PAGE_SHIFT;
611	else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift)
612	/*
613	* For pseries we do use ibm,expected#pages for reserving 16G pages.
614	*/
615	order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
616
617	if (order)
618	hugetlb_cma_reserve(order);
619	}
620

/* source code of linux/arch/powerpc/mm/hugetlbpage.c */