1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Copyright (C) 2008, 2009 Intel Corporation |
4 | * Authors: Andi Kleen, Fengguang Wu |
5 | * |
6 | * High level machine check handler. Handles pages reported by the |
7 | * hardware as being corrupted usually due to a multi-bit ECC memory or cache |
8 | * failure. |
9 | * |
 * In addition there is a "soft offline" entry point that allows stopping the
 * use of not-yet-corrupted but suspicious pages without killing anything.
12 | * |
13 | * Handles page cache pages in various states. The tricky part |
 * here is that we can access any page asynchronously with respect to
15 | * other VM users, because memory failures could happen anytime and |
16 | * anywhere. This could violate some of their assumptions. This is why |
17 | * this code has to be extremely careful. Generally it tries to use |
18 | * normal locking rules, as in get the standard locks, even if that means |
19 | * the error handling takes potentially a long time. |
20 | * |
21 | * It can be very tempting to add handling for obscure cases here. |
22 | * In general any code for handling new cases should only be added iff: |
23 | * - You know how to test it. |
24 | * - You have a test that can be added to mce-test |
25 | * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ |
26 | * - The case actually shows up as a frequent (top 10) page state in |
27 | * tools/mm/page-types when running a real workload. |
28 | * |
29 | * There are several operations here with exponential complexity because |
30 | * of unsuitable VM data structures. For example the operation to map back |
31 | * from RMAP chains to processes has to walk the complete process list and |
 * has non-linear complexity in the number of processes. But since memory corruptions
33 | * are rare we hope to get away with this. This avoids impacting the core |
34 | * VM. |
35 | */ |
36 | |
37 | #define pr_fmt(fmt) "Memory failure: " fmt |
38 | |
39 | #include <linux/kernel.h> |
40 | #include <linux/mm.h> |
41 | #include <linux/page-flags.h> |
42 | #include <linux/sched/signal.h> |
43 | #include <linux/sched/task.h> |
44 | #include <linux/dax.h> |
45 | #include <linux/ksm.h> |
46 | #include <linux/rmap.h> |
47 | #include <linux/export.h> |
48 | #include <linux/pagemap.h> |
49 | #include <linux/swap.h> |
50 | #include <linux/backing-dev.h> |
51 | #include <linux/migrate.h> |
52 | #include <linux/slab.h> |
53 | #include <linux/swapops.h> |
54 | #include <linux/hugetlb.h> |
55 | #include <linux/memory_hotplug.h> |
56 | #include <linux/mm_inline.h> |
57 | #include <linux/memremap.h> |
58 | #include <linux/kfifo.h> |
59 | #include <linux/ratelimit.h> |
60 | #include <linux/pagewalk.h> |
61 | #include <linux/shmem_fs.h> |
62 | #include <linux/sysctl.h> |
63 | #include "swap.h" |
64 | #include "internal.h" |
65 | #include "ras/ras_event.h" |
66 | |
67 | static int sysctl_memory_failure_early_kill __read_mostly; |
68 | |
69 | static int sysctl_memory_failure_recovery __read_mostly = 1; |
70 | |
71 | atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); |
72 | |
73 | static bool hw_memory_failure __read_mostly = false; |
74 | |
75 | static DEFINE_MUTEX(mf_mutex); |
76 | |
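/*
 * num_poisoned_pages_inc()/num_poisoned_pages_sub() keep the global
 * poisoned-page counter and the per-memory-block counter in sync; a pfn
 * of -1UL skips the per-memory-block adjustment.
 */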
77 | void num_poisoned_pages_inc(unsigned long pfn) |
78 | { |
	atomic_long_inc(&num_poisoned_pages);
80 | memblk_nr_poison_inc(pfn); |
81 | } |
82 | |
83 | void num_poisoned_pages_sub(unsigned long pfn, long i) |
84 | { |
	atomic_long_sub(i, &num_poisoned_pages);
86 | if (pfn != -1UL) |
87 | memblk_nr_poison_sub(pfn, i); |
88 | } |
89 | |
90 | /** |
 * MF_ATTR_RO - Create a sysfs entry for each memory failure statistic.
 * @_name: name of the file in the per-NUMA-node sysfs directory.
93 | */ |
94 | #define MF_ATTR_RO(_name) \ |
95 | static ssize_t _name##_show(struct device *dev, \ |
96 | struct device_attribute *attr, \ |
97 | char *buf) \ |
98 | { \ |
99 | struct memory_failure_stats *mf_stats = \ |
100 | &NODE_DATA(dev->id)->mf_stats; \ |
101 | return sprintf(buf, "%lu\n", mf_stats->_name); \ |
102 | } \ |
103 | static DEVICE_ATTR_RO(_name) |
104 | |
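/*
 * As an illustration, MF_ATTR_RO(total) below expands to a total_show()
 * callback plus a dev_attr_total read-only attribute backed by
 * NODE_DATA(dev->id)->mf_stats.total. The attributes are collected into the
 * per-node "memory_failure" group defined further down, so each counter is
 * presumably visible at a path like
 * /sys/devices/system/node/nodeN/memory_failure/total, depending on where
 * the group is registered.
 */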
105 | MF_ATTR_RO(total); |
106 | MF_ATTR_RO(ignored); |
107 | MF_ATTR_RO(failed); |
108 | MF_ATTR_RO(delayed); |
109 | MF_ATTR_RO(recovered); |
110 | |
111 | static struct attribute *memory_failure_attr[] = { |
112 | &dev_attr_total.attr, |
113 | &dev_attr_ignored.attr, |
114 | &dev_attr_failed.attr, |
115 | &dev_attr_delayed.attr, |
116 | &dev_attr_recovered.attr, |
117 | NULL, |
118 | }; |
119 | |
120 | const struct attribute_group memory_failure_attr_group = { |
121 | .name = "memory_failure" , |
122 | .attrs = memory_failure_attr, |
123 | }; |
124 | |
125 | static struct ctl_table memory_failure_table[] = { |
126 | { |
127 | .procname = "memory_failure_early_kill" , |
128 | .data = &sysctl_memory_failure_early_kill, |
129 | .maxlen = sizeof(sysctl_memory_failure_early_kill), |
130 | .mode = 0644, |
131 | .proc_handler = proc_dointvec_minmax, |
132 | .extra1 = SYSCTL_ZERO, |
133 | .extra2 = SYSCTL_ONE, |
134 | }, |
135 | { |
136 | .procname = "memory_failure_recovery" , |
137 | .data = &sysctl_memory_failure_recovery, |
138 | .maxlen = sizeof(sysctl_memory_failure_recovery), |
139 | .mode = 0644, |
140 | .proc_handler = proc_dointvec_minmax, |
141 | .extra1 = SYSCTL_ZERO, |
142 | .extra2 = SYSCTL_ONE, |
143 | }, |
144 | { } |
145 | }; |
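/*
 * The table above is expected to be registered under the "vm" sysctl
 * directory, making the knobs reachable as
 * /proc/sys/vm/memory_failure_early_kill and
 * /proc/sys/vm/memory_failure_recovery; see
 * Documentation/admin-guide/sysctl/vm.rst for their semantics.
 */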
146 | |
147 | /* |
148 | * Return values: |
149 | * 1: the page is dissolved (if needed) and taken off from buddy, |
150 | * 0: the page is dissolved (if needed) and not taken off from buddy, |
151 | * < 0: failed to dissolve. |
152 | */ |
153 | static int __page_handle_poison(struct page *page) |
154 | { |
155 | int ret; |
156 | |
	zone_pcp_disable(page_zone(page));
	ret = dissolve_free_huge_page(page);
	if (!ret)
		ret = take_page_off_buddy(page);
	zone_pcp_enable(page_zone(page));
162 | |
163 | return ret; |
164 | } |
165 | |
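/*
 * Helper for the soft-offline path: mark @page hwpoisoned and take an extra
 * refcount so it cannot be reused. For hugetlb or free pages the page is
 * first dissolved/taken off the buddy list via __page_handle_poison();
 * returns false when that isolation fails.
 */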
166 | static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) |
167 | { |
168 | if (hugepage_or_freepage) { |
169 | /* |
170 | * Doing this check for free pages is also fine since dissolve_free_huge_page |
171 | * returns 0 for non-hugetlb pages as well. |
172 | */ |
173 | if (__page_handle_poison(page) <= 0) |
174 | /* |
175 | * We could fail to take off the target page from buddy |
176 | * for example due to racy page allocation, but that's |
177 | * acceptable because soft-offlined page is not broken |
178 | * and if someone really want to use it, they should |
179 | * take it. |
180 | */ |
181 | return false; |
182 | } |
183 | |
184 | SetPageHWPoison(page); |
185 | if (release) |
186 | put_page(page); |
187 | page_ref_inc(page); |
188 | num_poisoned_pages_inc(page_to_pfn(page)); |
189 | |
190 | return true; |
191 | } |
192 | |
193 | #if IS_ENABLED(CONFIG_HWPOISON_INJECT) |
194 | |
195 | u32 hwpoison_filter_enable = 0; |
196 | u32 hwpoison_filter_dev_major = ~0U; |
197 | u32 hwpoison_filter_dev_minor = ~0U; |
198 | u64 hwpoison_filter_flags_mask; |
199 | u64 hwpoison_filter_flags_value; |
200 | EXPORT_SYMBOL_GPL(hwpoison_filter_enable); |
201 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); |
202 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); |
203 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); |
204 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); |
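/*
 * These filter knobs are written by the hwpoison error injector
 * (mm/hwpoison-inject.c), typically through the debugfs files it creates,
 * so stress tests can restrict which pages injected errors apply to.
 */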
205 | |
206 | static int hwpoison_filter_dev(struct page *p) |
207 | { |
208 | struct address_space *mapping; |
209 | dev_t dev; |
210 | |
211 | if (hwpoison_filter_dev_major == ~0U && |
212 | hwpoison_filter_dev_minor == ~0U) |
213 | return 0; |
214 | |
215 | mapping = page_mapping(p); |
216 | if (mapping == NULL || mapping->host == NULL) |
217 | return -EINVAL; |
218 | |
219 | dev = mapping->host->i_sb->s_dev; |
220 | if (hwpoison_filter_dev_major != ~0U && |
221 | hwpoison_filter_dev_major != MAJOR(dev)) |
222 | return -EINVAL; |
223 | if (hwpoison_filter_dev_minor != ~0U && |
224 | hwpoison_filter_dev_minor != MINOR(dev)) |
225 | return -EINVAL; |
226 | |
227 | return 0; |
228 | } |
229 | |
230 | static int hwpoison_filter_flags(struct page *p) |
231 | { |
232 | if (!hwpoison_filter_flags_mask) |
233 | return 0; |
234 | |
	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
236 | hwpoison_filter_flags_value) |
237 | return 0; |
238 | else |
239 | return -EINVAL; |
240 | } |
241 | |
242 | /* |
243 | * This allows stress tests to limit test scope to a collection of tasks |
244 | * by putting them under some memcg. This prevents killing unrelated/important |
245 | * processes such as /sbin/init. Note that the target task may share clean |
246 | * pages with init (eg. libc text), which is harmless. If the target task |
 * shares _dirty_ pages with another task B, the test scheme must make sure B
 * is also included in the memcg. Finally, due to race conditions this filter
249 | * can only guarantee that the page either belongs to the memcg tasks, or is |
250 | * a freed page. |
251 | */ |
252 | #ifdef CONFIG_MEMCG |
253 | u64 hwpoison_filter_memcg; |
254 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); |
255 | static int hwpoison_filter_task(struct page *p) |
256 | { |
257 | if (!hwpoison_filter_memcg) |
258 | return 0; |
259 | |
	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
261 | return -EINVAL; |
262 | |
263 | return 0; |
264 | } |
265 | #else |
266 | static int hwpoison_filter_task(struct page *p) { return 0; } |
267 | #endif |
268 | |
269 | int hwpoison_filter(struct page *p) |
270 | { |
271 | if (!hwpoison_filter_enable) |
272 | return 0; |
273 | |
274 | if (hwpoison_filter_dev(p)) |
275 | return -EINVAL; |
276 | |
277 | if (hwpoison_filter_flags(p)) |
278 | return -EINVAL; |
279 | |
280 | if (hwpoison_filter_task(p)) |
281 | return -EINVAL; |
282 | |
283 | return 0; |
284 | } |
285 | #else |
286 | int hwpoison_filter(struct page *p) |
287 | { |
288 | return 0; |
289 | } |
290 | #endif |
291 | |
292 | EXPORT_SYMBOL_GPL(hwpoison_filter); |
293 | |
294 | /* |
295 | * Kill all processes that have a poisoned page mapped and then isolate |
296 | * the page. |
297 | * |
298 | * General strategy: |
299 | * Find all processes having the page mapped and kill them. |
300 | * But we keep a page reference around so that the page is not |
301 | * actually freed yet. |
302 | * Then stash the page away |
303 | * |
304 | * There's no convenient way to get back to mapped processes |
305 | * from the VMAs. So do a brute-force search over all |
306 | * running processes. |
307 | * |
308 | * Remember that machine checks are not common (or rather |
309 | * if they are common you have other problems), so this shouldn't |
310 | * be a performance issue. |
311 | * |
312 | * Also there are some races possible while we get from the |
313 | * error detection to actually handle it. |
314 | */ |
315 | |
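/*
 * One entry per task to be signalled for a given poisoned page: @tsk is the
 * task, @addr the user virtual address at which the page is (or was) mapped
 * in that task, and @size_shift the log2 of the mapping size, which ends up
 * in the si_addr_lsb field of the SIGBUS siginfo (see kill_proc()).
 */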
316 | struct to_kill { |
317 | struct list_head nd; |
318 | struct task_struct *tsk; |
319 | unsigned long addr; |
320 | short size_shift; |
321 | }; |
322 | |
323 | /* |
324 | * Send all the processes who have the page mapped a signal. |
325 | * ``action optional'' if they are not immediately affected by the error |
326 | * ``action required'' if error happened in current execution context |
327 | */ |
328 | static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) |
329 | { |
330 | struct task_struct *t = tk->tsk; |
331 | short addr_lsb = tk->size_shift; |
332 | int ret = 0; |
333 | |
334 | pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n" , |
335 | pfn, t->comm, t->pid); |
336 | |
337 | if ((flags & MF_ACTION_REQUIRED) && (t == current)) |
338 | ret = force_sig_mceerr(BUS_MCEERR_AR, |
339 | (void __user *)tk->addr, addr_lsb); |
340 | else |
341 | /* |
342 | * Signal other processes sharing the page if they have |
343 | * PF_MCE_EARLY set. |
344 | * Don't use force here, it's convenient if the signal |
345 | * can be temporarily blocked. |
346 | * This could cause a loop when the user sets SIGBUS |
347 | * to SIG_IGN, but hopefully no one will do that? |
348 | */ |
349 | ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, |
350 | addr_lsb, t); |
351 | if (ret < 0) |
352 | pr_info("Error sending signal to %s:%d: %d\n" , |
353 | t->comm, t->pid, ret); |
354 | return ret; |
355 | } |
356 | |
357 | /* |
 * Unknown page type encountered. Try to turn it into an LRU page by
 * draining the per-CPU LRU caches (lru_add_drain_all()).
360 | */ |
361 | void shake_page(struct page *p) |
362 | { |
	if (PageHuge(p))
		return;
	/*
	 * TODO: Could shrink slab caches here if a lightweight range-based
	 * shrinker will be available.
	 */
	if (PageSlab(p))
370 | return; |
371 | |
372 | lru_add_drain_all(); |
373 | } |
374 | EXPORT_SYMBOL_GPL(shake_page); |
375 | |
376 | static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, |
377 | unsigned long address) |
378 | { |
379 | unsigned long ret = 0; |
380 | pgd_t *pgd; |
381 | p4d_t *p4d; |
382 | pud_t *pud; |
383 | pmd_t *pmd; |
384 | pte_t *pte; |
385 | pte_t ptent; |
386 | |
387 | VM_BUG_ON_VMA(address == -EFAULT, vma); |
388 | pgd = pgd_offset(vma->vm_mm, address); |
	if (!pgd_present(*pgd))
		return 0;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;
	if (pud_devmap(*pud))
		return PUD_SHIFT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;
	if (pmd_devmap(*pmd))
		return PMD_SHIFT;
	pte = pte_offset_map(pmd, address);
	if (!pte)
		return 0;
	ptent = ptep_get(pte);
	if (pte_present(ptent) && pte_devmap(ptent))
		ret = PAGE_SHIFT;
	pte_unmap(pte);
411 | return ret; |
412 | } |
413 | |
414 | /* |
415 | * Failure handling: if we can't find or can't kill a process there's |
416 | * not much we can do. We just print a message and ignore otherwise. |
417 | */ |
418 | |
419 | #define FSDAX_INVALID_PGOFF ULONG_MAX |
420 | |
421 | /* |
422 | * Schedule a process for later kill. |
423 | * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. |
424 | * |
425 | * Note: @fsdax_pgoff is used only when @p is a fsdax page and a |
426 | * filesystem with a memory failure handler has claimed the |
427 | * memory_failure event. In all other cases, page->index and |
428 | * page->mapping are sufficient for mapping the page back to its |
429 | * corresponding user virtual address. |
430 | */ |
431 | static void __add_to_kill(struct task_struct *tsk, struct page *p, |
432 | struct vm_area_struct *vma, struct list_head *to_kill, |
433 | unsigned long ksm_addr, pgoff_t fsdax_pgoff) |
434 | { |
435 | struct to_kill *tk; |
436 | |
	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
	if (!tk) {
		pr_err("Out of memory while machine check handling\n");
		return;
	}

	tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
	if (is_zone_device_page(p)) {
		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
448 | } else |
449 | tk->size_shift = page_shift(compound_head(p)); |
450 | |
451 | /* |
452 | * Send SIGKILL if "tk->addr == -EFAULT". Also, as |
453 | * "tk->size_shift" is always non-zero for !is_zone_device_page(), |
454 | * so "tk->size_shift == 0" effectively checks no mapping on |
455 | * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times |
456 | * to a process' address space, it's possible not all N VMAs |
457 | * contain mappings for the page, but at least one VMA does. |
458 | * Only deliver SIGBUS with payload derived from the VMA that |
459 | * has a mapping for the page. |
460 | */ |
461 | if (tk->addr == -EFAULT) { |
462 | pr_info("Unable to find user space address %lx in %s\n" , |
463 | page_to_pfn(p), tsk->comm); |
464 | } else if (tk->size_shift == 0) { |
		kfree(tk);
		return;
	}

	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
472 | } |
473 | |
474 | static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p, |
475 | struct vm_area_struct *vma, |
476 | struct list_head *to_kill) |
477 | { |
	__add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
479 | } |
480 | |
481 | #ifdef CONFIG_KSM |
482 | static bool task_in_to_kill_list(struct list_head *to_kill, |
483 | struct task_struct *tsk) |
484 | { |
485 | struct to_kill *tk, *next; |
486 | |
487 | list_for_each_entry_safe(tk, next, to_kill, nd) { |
488 | if (tk->tsk == tsk) |
489 | return true; |
490 | } |
491 | |
492 | return false; |
493 | } |
494 | void add_to_kill_ksm(struct task_struct *tsk, struct page *p, |
495 | struct vm_area_struct *vma, struct list_head *to_kill, |
496 | unsigned long ksm_addr) |
497 | { |
498 | if (!task_in_to_kill_list(to_kill, tsk)) |
499 | __add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF); |
500 | } |
501 | #endif |
502 | /* |
503 | * Kill the processes that have been collected earlier. |
504 | * |
505 | * Only do anything when FORCEKILL is set, otherwise just free the |
506 | * list (this is used for clean pages which do not need killing) |
507 | * Also when FAIL is set do a force kill because something went |
508 | * wrong earlier. |
509 | */ |
510 | static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, |
511 | unsigned long pfn, int flags) |
512 | { |
513 | struct to_kill *tk, *next; |
514 | |
515 | list_for_each_entry_safe(tk, next, to_kill, nd) { |
516 | if (forcekill) { |
517 | /* |
518 | * In case something went wrong with munmapping |
519 | * make sure the process doesn't catch the |
520 | * signal and then access the memory. Just kill it. |
521 | */ |
522 | if (fail || tk->addr == -EFAULT) { |
				pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
						 tk->tsk, PIDTYPE_PID);
527 | } |
528 | |
529 | /* |
530 | * In theory the process could have mapped |
531 | * something else on the address in-between. We could |
532 | * check for that, but we need to tell the |
533 | * process anyways. |
534 | */ |
535 | else if (kill_proc(tk, pfn, flags) < 0) |
				pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		list_del(&tk->nd);
		put_task_struct(tk->tsk);
		kfree(tk);
542 | } |
543 | } |
544 | |
545 | /* |
546 | * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) |
547 | * on behalf of the thread group. Return task_struct of the (first found) |
548 | * dedicated thread if found, and return NULL otherwise. |
549 | * |
550 | * We already hold rcu lock in the caller, so we don't have to call |
551 | * rcu_read_lock/unlock() in this function. |
552 | */ |
553 | static struct task_struct *find_early_kill_thread(struct task_struct *tsk) |
554 | { |
555 | struct task_struct *t; |
556 | |
557 | for_each_thread(tsk, t) { |
558 | if (t->flags & PF_MCE_PROCESS) { |
559 | if (t->flags & PF_MCE_EARLY) |
560 | return t; |
561 | } else { |
562 | if (sysctl_memory_failure_early_kill) |
563 | return t; |
564 | } |
565 | } |
566 | return NULL; |
567 | } |
568 | |
569 | /* |
 * Determine whether a given process is an "early kill" process which expects
571 | * to be signaled when some page under the process is hwpoisoned. |
572 | * Return task_struct of the dedicated thread (main thread unless explicitly |
573 | * specified) if the process is "early kill" and otherwise returns NULL. |
574 | * |
 * Note that the above is true for the Action Optional case. For the Action
 * Required case, only the current thread needs to be signaled with SIGBUS;
 * the error is Action Optional for other, non-current processes sharing the
 * same error page. If such a process is "early kill", the task_struct of its
 * dedicated thread will also be returned.
580 | */ |
581 | struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) |
582 | { |
583 | if (!tsk->mm) |
584 | return NULL; |
585 | /* |
586 | * Comparing ->mm here because current task might represent |
587 | * a subthread, while tsk always points to the main thread. |
588 | */ |
589 | if (force_early && tsk->mm == current->mm) |
590 | return current; |
591 | |
592 | return find_early_kill_thread(tsk); |
593 | } |
594 | |
595 | /* |
596 | * Collect processes when the error hit an anonymous page. |
597 | */ |
598 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, |
599 | int force_early) |
600 | { |
601 | struct folio *folio = page_folio(page); |
602 | struct vm_area_struct *vma; |
603 | struct task_struct *tsk; |
604 | struct anon_vma *av; |
605 | pgoff_t pgoff; |
606 | |
607 | av = folio_lock_anon_vma_read(folio, NULL); |
608 | if (av == NULL) /* Not actually mapped anymore */ |
609 | return; |
610 | |
611 | pgoff = page_to_pgoff(page); |
612 | rcu_read_lock(); |
613 | for_each_process(tsk) { |
614 | struct anon_vma_chain *vmac; |
615 | struct task_struct *t = task_early_kill(tsk, force_early); |
616 | |
617 | if (!t) |
618 | continue; |
619 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, |
620 | pgoff, pgoff) { |
621 | vma = vmac->vma; |
622 | if (vma->vm_mm != t->mm) |
623 | continue; |
624 | if (!page_mapped_in_vma(page, vma)) |
625 | continue; |
			add_to_kill_anon_file(t, page, vma, to_kill);
		}
	}
	rcu_read_unlock();
	anon_vma_unlock_read(av);
631 | } |
632 | |
633 | /* |
634 | * Collect processes when the error hit a file mapped page. |
635 | */ |
636 | static void collect_procs_file(struct page *page, struct list_head *to_kill, |
637 | int force_early) |
638 | { |
639 | struct vm_area_struct *vma; |
640 | struct task_struct *tsk; |
641 | struct address_space *mapping = page->mapping; |
642 | pgoff_t pgoff; |
643 | |
644 | i_mmap_lock_read(mapping); |
645 | rcu_read_lock(); |
646 | pgoff = page_to_pgoff(page); |
647 | for_each_process(tsk) { |
648 | struct task_struct *t = task_early_kill(tsk, force_early); |
649 | |
650 | if (!t) |
651 | continue; |
652 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, |
653 | pgoff) { |
654 | /* |
655 | * Send early kill signal to tasks where a vma covers |
656 | * the page but the corrupted page is not necessarily |
657 | * mapped in its pte. |
658 | * Assume applications who requested early kill want |
659 | * to be informed of all such data corruptions. |
660 | */ |
661 | if (vma->vm_mm == t->mm) |
				add_to_kill_anon_file(t, page, vma, to_kill);
663 | } |
664 | } |
665 | rcu_read_unlock(); |
666 | i_mmap_unlock_read(mapping); |
667 | } |
668 | |
669 | #ifdef CONFIG_FS_DAX |
670 | static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, |
671 | struct vm_area_struct *vma, |
672 | struct list_head *to_kill, pgoff_t pgoff) |
673 | { |
	__add_to_kill(tsk, p, vma, to_kill, 0, pgoff);
675 | } |
676 | |
677 | /* |
678 | * Collect processes when the error hit a fsdax page. |
679 | */ |
680 | static void collect_procs_fsdax(struct page *page, |
681 | struct address_space *mapping, pgoff_t pgoff, |
682 | struct list_head *to_kill) |
683 | { |
684 | struct vm_area_struct *vma; |
685 | struct task_struct *tsk; |
686 | |
687 | i_mmap_lock_read(mapping); |
688 | rcu_read_lock(); |
689 | for_each_process(tsk) { |
		struct task_struct *t = task_early_kill(tsk, true);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
			if (vma->vm_mm == t->mm)
				add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
697 | } |
698 | } |
699 | rcu_read_unlock(); |
700 | i_mmap_unlock_read(mapping); |
701 | } |
702 | #endif /* CONFIG_FS_DAX */ |
703 | |
704 | /* |
705 | * Collect the processes who have the corrupted page mapped to kill. |
706 | */ |
707 | static void collect_procs(struct page *page, struct list_head *tokill, |
708 | int force_early) |
709 | { |
710 | if (!page->mapping) |
711 | return; |
712 | if (unlikely(PageKsm(page))) |
		collect_procs_ksm(page, tokill, force_early);
	else if (PageAnon(page))
		collect_procs_anon(page, tokill, force_early);
	else
		collect_procs_file(page, tokill, force_early);
718 | } |
719 | |
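/*
 * Private state for the page table walk done by kill_accessing_process():
 * @pfn is the poisoned pfn being searched for, and @tk records the virtual
 * address and mapping size once a matching entry is found.
 */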
720 | struct hwpoison_walk { |
721 | struct to_kill tk; |
722 | unsigned long pfn; |
723 | int flags; |
724 | }; |
725 | |
726 | static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) |
727 | { |
728 | tk->addr = addr; |
729 | tk->size_shift = shift; |
730 | } |
731 | |
732 | static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, |
733 | unsigned long poisoned_pfn, struct to_kill *tk) |
734 | { |
735 | unsigned long pfn = 0; |
736 | |
	if (pte_present(pte)) {
		pfn = pte_pfn(pte);
	} else {
		swp_entry_t swp = pte_to_swp_entry(pte);

		if (is_hwpoison_entry(swp))
			pfn = swp_offset_pfn(swp);
744 | } |
745 | |
746 | if (!pfn || pfn != poisoned_pfn) |
747 | return 0; |
748 | |
749 | set_to_kill(tk, addr, shift); |
750 | return 1; |
751 | } |
752 | |
753 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
754 | static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, |
755 | struct hwpoison_walk *hwp) |
756 | { |
757 | pmd_t pmd = *pmdp; |
758 | unsigned long pfn; |
759 | unsigned long hwpoison_vaddr; |
760 | |
761 | if (!pmd_present(pmd)) |
762 | return 0; |
763 | pfn = pmd_pfn(pmd); |
764 | if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) { |
765 | hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT); |
		set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
767 | return 1; |
768 | } |
769 | return 0; |
770 | } |
771 | #else |
772 | static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, |
773 | struct hwpoison_walk *hwp) |
774 | { |
775 | return 0; |
776 | } |
777 | #endif |
778 | |
779 | static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, |
780 | unsigned long end, struct mm_walk *walk) |
781 | { |
782 | struct hwpoison_walk *hwp = walk->private; |
783 | int ret = 0; |
784 | pte_t *ptep, *mapped_pte; |
785 | spinlock_t *ptl; |
786 | |
	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
	if (ptl) {
		ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
						addr, &ptl);
	if (!ptep)
		goto out;

	for (; addr != end; ptep++, addr += PAGE_SIZE) {
		ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT,
					     hwp->pfn, &hwp->tk);
802 | if (ret == 1) |
803 | break; |
804 | } |
805 | pte_unmap_unlock(mapped_pte, ptl); |
806 | out: |
807 | cond_resched(); |
808 | return ret; |
809 | } |
810 | |
811 | #ifdef CONFIG_HUGETLB_PAGE |
812 | static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, |
813 | unsigned long addr, unsigned long end, |
814 | struct mm_walk *walk) |
815 | { |
816 | struct hwpoison_walk *hwp = walk->private; |
817 | pte_t pte = huge_ptep_get(ptep); |
	struct hstate *h = hstate_vma(walk->vma);

	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
				      hwp->pfn, &hwp->tk);
822 | } |
823 | #else |
824 | #define hwpoison_hugetlb_range NULL |
825 | #endif |
826 | |
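/*
 * Walk ops used by kill_accessing_process() below: walk_page_range() visits
 * each PMD (and hugetlb entry) of the faulting task's address space until a
 * mapping of the poisoned pfn is found, then stops so the recorded address
 * can be used for the SIGBUS payload.
 */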
827 | static const struct mm_walk_ops hwpoison_walk_ops = { |
828 | .pmd_entry = hwpoison_pte_range, |
829 | .hugetlb_entry = hwpoison_hugetlb_range, |
830 | .walk_lock = PGWALK_RDLOCK, |
831 | }; |
832 | |
833 | /* |
834 | * Sends SIGBUS to the current process with error info. |
835 | * |
836 | * This function is intended to handle "Action Required" MCEs on already |
837 | * hardware poisoned pages. They could happen, for example, when |
838 | * memory_failure() failed to unmap the error page at the first call, or |
839 | * when multiple local machine checks happened on different CPUs. |
840 | * |
841 | * MCE handler currently has no easy access to the error virtual address, |
842 | * so this function walks page table to find it. The returned virtual address |
843 | * is proper in most cases, but it could be wrong when the application |
844 | * process has multiple entries mapping the error page. |
845 | */ |
846 | static int kill_accessing_process(struct task_struct *p, unsigned long pfn, |
847 | int flags) |
848 | { |
849 | int ret; |
850 | struct hwpoison_walk priv = { |
851 | .pfn = pfn, |
852 | }; |
853 | priv.tk.tsk = p; |
854 | |
855 | if (!p->mm) |
856 | return -EFAULT; |
857 | |
	mmap_read_lock(p->mm);
	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
			      (void *)&priv);
	if (ret == 1 && priv.tk.addr)
		kill_proc(&priv.tk, pfn, flags);
	else
		ret = 0;
	mmap_read_unlock(p->mm);
866 | return ret > 0 ? -EHWPOISON : -EFAULT; |
867 | } |
868 | |
869 | static const char *action_name[] = { |
870 | [MF_IGNORED] = "Ignored" , |
871 | [MF_FAILED] = "Failed" , |
872 | [MF_DELAYED] = "Delayed" , |
873 | [MF_RECOVERED] = "Recovered" , |
874 | }; |
875 | |
876 | static const char * const action_page_types[] = { |
877 | [MF_MSG_KERNEL] = "reserved kernel page" , |
878 | [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page" , |
879 | [MF_MSG_SLAB] = "kernel slab page" , |
880 | [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking" , |
881 | [MF_MSG_HUGE] = "huge page" , |
882 | [MF_MSG_FREE_HUGE] = "free huge page" , |
883 | [MF_MSG_UNMAP_FAILED] = "unmapping failed page" , |
884 | [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page" , |
885 | [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page" , |
886 | [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page" , |
887 | [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page" , |
888 | [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page" , |
889 | [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page" , |
890 | [MF_MSG_DIRTY_LRU] = "dirty LRU page" , |
891 | [MF_MSG_CLEAN_LRU] = "clean LRU page" , |
892 | [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page" , |
893 | [MF_MSG_BUDDY] = "free buddy page" , |
894 | [MF_MSG_DAX] = "dax page" , |
895 | [MF_MSG_UNSPLIT_THP] = "unsplit thp" , |
896 | [MF_MSG_UNKNOWN] = "unknown page" , |
897 | }; |
898 | |
899 | /* |
900 | * XXX: It is possible that a page is isolated from LRU cache, |
901 | * and then kept in swap cache or failed to remove from page cache. |
902 | * The page count will stop it from being freed by unpoison. |
903 | * Stress tests should be aware of this memory leak problem. |
904 | */ |
905 | static int delete_from_lru_cache(struct page *p) |
906 | { |
	if (isolate_lru_page(p)) {
908 | /* |
909 | * Clear sensible page flags, so that the buddy system won't |
910 | * complain when the page is unpoison-and-freed. |
911 | */ |
		ClearPageActive(p);
		ClearPageUnevictable(p);
914 | |
915 | /* |
916 | * Poisoned page might never drop its ref count to 0 so we have |
917 | * to uncharge it manually from its memcg. |
918 | */ |
919 | mem_cgroup_uncharge(page_folio(p)); |
920 | |
921 | /* |
922 | * drop the page count elevated by isolate_lru_page() |
923 | */ |
		put_page(p);
925 | return 0; |
926 | } |
927 | return -EIO; |
928 | } |
929 | |
930 | static int truncate_error_page(struct page *p, unsigned long pfn, |
931 | struct address_space *mapping) |
932 | { |
933 | int ret = MF_FAILED; |
934 | |
935 | if (mapping->a_ops->error_remove_page) { |
936 | struct folio *folio = page_folio(p); |
937 | int err = mapping->a_ops->error_remove_page(mapping, p); |
938 | |
939 | if (err != 0) |
940 | pr_info("%#lx: Failed to punch page: %d\n" , pfn, err); |
941 | else if (!filemap_release_folio(folio, GFP_NOIO)) |
942 | pr_info("%#lx: failed to release buffers\n" , pfn); |
943 | else |
944 | ret = MF_RECOVERED; |
945 | } else { |
946 | /* |
947 | * If the file system doesn't support it just invalidate |
948 | * This fails on dirty or anything with private pages |
949 | */ |
		if (invalidate_inode_page(p))
951 | ret = MF_RECOVERED; |
952 | else |
953 | pr_info("%#lx: Failed to invalidate\n" , pfn); |
954 | } |
955 | |
956 | return ret; |
957 | } |
958 | |
959 | struct page_state { |
960 | unsigned long mask; |
961 | unsigned long res; |
962 | enum mf_action_page_type type; |
963 | |
964 | /* Callback ->action() has to unlock the relevant page inside it. */ |
965 | int (*action)(struct page_state *ps, struct page *p); |
966 | }; |
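/*
 * An error_states[] entry (defined below) applies to a page when
 * (page flags & mask) == res; identify_page_state() scans the table in
 * order, the first match wins, and a { 0, 0, MF_MSG_UNKNOWN, me_unknown }
 * entry terminates it as a catch-all.
 */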
967 | |
968 | /* |
969 | * Return true if page is still referenced by others, otherwise return |
970 | * false. |
971 | * |
972 | * The extra_pins is true when one extra refcount is expected. |
973 | */ |
static bool has_extra_refcount(struct page_state *ps, struct page *p,
			       bool extra_pins)
{
	int count = page_count(p) - 1;
978 | |
979 | if (extra_pins) |
980 | count -= 1; |
981 | |
982 | if (count > 0) { |
983 | pr_err("%#lx: %s still referenced by %d users\n" , |
984 | page_to_pfn(p), action_page_types[ps->type], count); |
985 | return true; |
986 | } |
987 | |
988 | return false; |
989 | } |
990 | |
991 | /* |
992 | * Error hit kernel page. |
993 | * Do nothing, try to be lucky and not touch this instead. For a few cases we |
994 | * could be more sophisticated. |
995 | */ |
996 | static int me_kernel(struct page_state *ps, struct page *p) |
997 | { |
	unlock_page(p);
999 | return MF_IGNORED; |
1000 | } |
1001 | |
1002 | /* |
1003 | * Page in unknown state. Do nothing. |
1004 | */ |
1005 | static int me_unknown(struct page_state *ps, struct page *p) |
1006 | { |
	pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
	unlock_page(p);
1009 | return MF_FAILED; |
1010 | } |
1011 | |
1012 | /* |
1013 | * Clean (or cleaned) page cache page. |
1014 | */ |
1015 | static int me_pagecache_clean(struct page_state *ps, struct page *p) |
1016 | { |
1017 | int ret; |
1018 | struct address_space *mapping; |
	bool extra_pins;
1020 | |
1021 | delete_from_lru_cache(p); |
1022 | |
1023 | /* |
	 * For anonymous pages we're done: the only reference left
	 * should be the one memory_failure() holds.
	 */
	if (PageAnon(p)) {
1028 | ret = MF_RECOVERED; |
1029 | goto out; |
1030 | } |
1031 | |
1032 | /* |
1033 | * Now truncate the page in the page cache. This is really |
1034 | * more like a "temporary hole punch" |
1035 | * Don't do this for block devices when someone else |
1036 | * has a reference, because it could be file system metadata |
1037 | * and that's not safe to truncate. |
1038 | */ |
1039 | mapping = page_mapping(p); |
1040 | if (!mapping) { |
1041 | /* |
		 * Page has been torn down in the meantime.
1043 | */ |
1044 | ret = MF_FAILED; |
1045 | goto out; |
1046 | } |
1047 | |
1048 | /* |
1049 | * The shmem page is kept in page cache instead of truncating |
1050 | * so is expected to have an extra refcount after error-handling. |
1051 | */ |
1052 | extra_pins = shmem_mapping(mapping); |
1053 | |
1054 | /* |
1055 | * Truncation is a bit tricky. Enable it per file system for now. |
1056 | * |
1057 | * Open: to take i_rwsem or not for this? Right now we don't. |
1058 | */ |
1059 | ret = truncate_error_page(p, page_to_pfn(p), mapping); |
1060 | if (has_extra_refcount(ps, p, extra_pins)) |
1061 | ret = MF_FAILED; |
1062 | |
1063 | out: |
	unlock_page(p);
1065 | |
1066 | return ret; |
1067 | } |
1068 | |
1069 | /* |
1070 | * Dirty pagecache page |
1071 | * Issues: when the error hit a hole page the error is not properly |
1072 | * propagated. |
1073 | */ |
1074 | static int me_pagecache_dirty(struct page_state *ps, struct page *p) |
1075 | { |
1076 | struct address_space *mapping = page_mapping(p); |
1077 | |
1078 | SetPageError(p); |
1079 | /* TBD: print more information about the file. */ |
1080 | if (mapping) { |
1081 | /* |
1082 | * IO error will be reported by write(), fsync(), etc. |
1083 | * who check the mapping. |
1084 | * This way the application knows that something went |
1085 | * wrong with its dirty file data. |
1086 | * |
1087 | * There's one open issue: |
1088 | * |
1089 | * The EIO will be only reported on the next IO |
1090 | * operation and then cleared through the IO map. |
1091 | * Normally Linux has two mechanisms to pass IO error |
1092 | * first through the AS_EIO flag in the address space |
1093 | * and then through the PageError flag in the page. |
1094 | * Since we drop pages on memory failure handling the |
		 * only mechanism open to use is through AS_EIO.
1096 | * |
1097 | * This has the disadvantage that it gets cleared on |
1098 | * the first operation that returns an error, while |
1099 | * the PageError bit is more sticky and only cleared |
1100 | * when the page is reread or dropped. If an |
1101 | * application assumes it will always get error on |
1102 | * fsync, but does other operations on the fd before |
1103 | * and the page is dropped between then the error |
1104 | * will not be properly reported. |
1105 | * |
1106 | * This can already happen even without hwpoisoned |
1107 | * pages: first on metadata IO errors (which only |
1108 | * report through AS_EIO) or when the page is dropped |
1109 | * at the wrong time. |
1110 | * |
1111 | * So right now we assume that the application DTRT on |
1112 | * the first EIO, but we're not worse than other parts |
1113 | * of the kernel. |
1114 | */ |
		mapping_set_error(mapping, -EIO);
1116 | } |
1117 | |
1118 | return me_pagecache_clean(ps, p); |
1119 | } |
1120 | |
1121 | /* |
1122 | * Clean and dirty swap cache. |
1123 | * |
1124 | * Dirty swap cache page is tricky to handle. The page could live both in page |
 * cache and swap cache (i.e. the page is freshly swapped in). So it could be
1126 | * referenced concurrently by 2 types of PTEs: |
1127 | * normal PTEs and swap PTEs. We try to handle them consistently by calling |
1128 | * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs, |
1129 | * and then |
1130 | * - clear dirty bit to prevent IO |
1131 | * - remove from LRU |
1132 | * - but keep in the swap cache, so that when we return to it on |
1133 | * a later page fault, we know the application is accessing |
1134 | * corrupted data and shall be killed (we installed simple |
1135 | * interception code in do_swap_page to catch it). |
1136 | * |
1137 | * Clean swap cache pages can be directly isolated. A later page fault will |
1138 | * bring in the known good data from disk. |
1139 | */ |
1140 | static int me_swapcache_dirty(struct page_state *ps, struct page *p) |
1141 | { |
1142 | int ret; |
	bool extra_pins = false;

	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
	unlock_page(p);
1151 | |
1152 | if (ret == MF_DELAYED) |
1153 | extra_pins = true; |
1154 | |
1155 | if (has_extra_refcount(ps, p, extra_pins)) |
1156 | ret = MF_FAILED; |
1157 | |
1158 | return ret; |
1159 | } |
1160 | |
1161 | static int me_swapcache_clean(struct page_state *ps, struct page *p) |
1162 | { |
1163 | struct folio *folio = page_folio(p); |
1164 | int ret; |
1165 | |
1166 | delete_from_swap_cache(folio); |
1167 | |
1168 | ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; |
1169 | folio_unlock(folio); |
1170 | |
	if (has_extra_refcount(ps, p, false))
1172 | ret = MF_FAILED; |
1173 | |
1174 | return ret; |
1175 | } |
1176 | |
1177 | /* |
1178 | * Huge pages. Needs work. |
1179 | * Issues: |
1180 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) |
1181 | * To narrow down kill region to one page, we need to break up pmd. |
1182 | */ |
1183 | static int me_huge_page(struct page_state *ps, struct page *p) |
1184 | { |
1185 | int res; |
1186 | struct page *hpage = compound_head(p); |
1187 | struct address_space *mapping; |
	bool extra_pins = false;

	mapping = page_mapping(hpage);
	if (mapping) {
		res = truncate_error_page(hpage, page_to_pfn(p), mapping);
		/* The page is kept in page cache. */
		extra_pins = true;
		unlock_page(hpage);
	} else {
		unlock_page(hpage);
		/*
		 * The migration entry prevents later access to the error
		 * hugepage, so we can free and dissolve it into buddy to
		 * save healthy subpages.
		 */
		put_page(hpage);
		if (__page_handle_poison(p) >= 0) {
			page_ref_inc(p);
1206 | res = MF_RECOVERED; |
1207 | } else { |
1208 | res = MF_FAILED; |
1209 | } |
1210 | } |
1211 | |
1212 | if (has_extra_refcount(ps, p, extra_pins)) |
1213 | res = MF_FAILED; |
1214 | |
1215 | return res; |
1216 | } |
1217 | |
1218 | /* |
1219 | * Various page states we can handle. |
1220 | * |
1221 | * A page state is defined by its current page->flags bits. |
1222 | * The table matches them in order and calls the right handler. |
1223 | * |
1224 | * This is quite tricky because we can access page at any time |
 * in its life cycle, so all accesses have to be extremely careful.
1226 | * |
1227 | * This is not complete. More states could be added. |
1228 | * For any missing state don't attempt recovery. |
1229 | */ |
1230 | |
1231 | #define dirty (1UL << PG_dirty) |
1232 | #define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked)) |
1233 | #define unevict (1UL << PG_unevictable) |
1234 | #define mlock (1UL << PG_mlocked) |
1235 | #define lru (1UL << PG_lru) |
1236 | #define head (1UL << PG_head) |
1237 | #define slab (1UL << PG_slab) |
1238 | #define reserved (1UL << PG_reserved) |
1239 | |
1240 | static struct page_state error_states[] = { |
1241 | { reserved, reserved, MF_MSG_KERNEL, me_kernel }, |
1242 | /* |
1243 | * free pages are specially detected outside this table: |
1244 | * PG_buddy pages only make a small fraction of all free pages. |
1245 | */ |
1246 | |
1247 | /* |
1248 | * Could in theory check if slab page is free or if we can drop |
1249 | * currently unused objects without touching them. But just |
1250 | * treat it as standard kernel for now. |
1251 | */ |
1252 | { slab, slab, MF_MSG_SLAB, me_kernel }, |
1253 | |
1254 | { head, head, MF_MSG_HUGE, me_huge_page }, |
1255 | |
1256 | { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, |
1257 | { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, |
1258 | |
1259 | { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, |
1260 | { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, |
1261 | |
1262 | { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, |
1263 | { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, |
1264 | |
1265 | { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty }, |
1266 | { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean }, |
1267 | |
1268 | /* |
1269 | * Catchall entry: must be at end. |
1270 | */ |
1271 | { 0, 0, MF_MSG_UNKNOWN, me_unknown }, |
1272 | }; |
1273 | |
1274 | #undef dirty |
1275 | #undef sc |
1276 | #undef unevict |
1277 | #undef mlock |
1278 | #undef lru |
1279 | #undef head |
1280 | #undef slab |
1281 | #undef reserved |
1282 | |
1283 | static void update_per_node_mf_stats(unsigned long pfn, |
1284 | enum mf_result result) |
1285 | { |
1286 | int nid = MAX_NUMNODES; |
1287 | struct memory_failure_stats *mf_stats = NULL; |
1288 | |
1289 | nid = pfn_to_nid(pfn); |
1290 | if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) { |
1291 | WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d" , pfn, nid); |
1292 | return; |
1293 | } |
1294 | |
1295 | mf_stats = &NODE_DATA(nid)->mf_stats; |
1296 | switch (result) { |
1297 | case MF_IGNORED: |
1298 | ++mf_stats->ignored; |
1299 | break; |
1300 | case MF_FAILED: |
1301 | ++mf_stats->failed; |
1302 | break; |
1303 | case MF_DELAYED: |
1304 | ++mf_stats->delayed; |
1305 | break; |
1306 | case MF_RECOVERED: |
1307 | ++mf_stats->recovered; |
1308 | break; |
1309 | default: |
1310 | WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled" , result); |
1311 | break; |
1312 | } |
1313 | ++mf_stats->total; |
1314 | } |
1315 | |
1316 | /* |
1317 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of |
1318 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). |
1319 | */ |
1320 | static int action_result(unsigned long pfn, enum mf_action_page_type type, |
1321 | enum mf_result result) |
1322 | { |
1323 | trace_memory_failure_event(pfn, type, result); |
1324 | |
1325 | num_poisoned_pages_inc(pfn); |
1326 | |
1327 | update_per_node_mf_stats(pfn, result); |
1328 | |
1329 | pr_err("%#lx: recovery action for %s: %s\n" , |
1330 | pfn, action_page_types[type], action_name[result]); |
1331 | |
1332 | return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; |
1333 | } |
1334 | |
1335 | static int page_action(struct page_state *ps, struct page *p, |
1336 | unsigned long pfn) |
1337 | { |
1338 | int result; |
1339 | |
1340 | /* page p should be unlocked after returning from ps->action(). */ |
1341 | result = ps->action(ps, p); |
1342 | |
1343 | /* Could do more checks here if page looks ok */ |
1344 | /* |
1345 | * Could adjust zone counters here to correct for the missing page. |
1346 | */ |
1347 | |
	return action_result(pfn, ps->type, result);
1349 | } |
1350 | |
1351 | static inline bool PageHWPoisonTakenOff(struct page *page) |
1352 | { |
1353 | return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON; |
1354 | } |
1355 | |
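/*
 * "Taken off" marks a hwpoisoned page that has also been removed from the
 * buddy allocator: page_private is set to MAGIC_HWPOISON so that
 * unpoison_memory() knows it has to undo both the poison flag and the buddy
 * isolation (see __get_unpoison_page()).
 */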
1356 | void SetPageHWPoisonTakenOff(struct page *page) |
1357 | { |
1358 | set_page_private(page, MAGIC_HWPOISON); |
1359 | } |
1360 | |
1361 | void ClearPageHWPoisonTakenOff(struct page *page) |
1362 | { |
1363 | if (PageHWPoison(page)) |
		set_page_private(page, 0);
1365 | } |
1366 | |
1367 | /* |
1368 | * Return true if a page type of a given page is supported by hwpoison |
1369 | * mechanism (while handling could fail), otherwise false. This function |
1370 | * does not return true for hugetlb or device memory pages, so it's assumed |
1371 | * to be called only in the context where we never have such pages. |
1372 | */ |
1373 | static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) |
1374 | { |
1375 | /* Soft offline could migrate non-LRU movable pages */ |
1376 | if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) |
1377 | return true; |
1378 | |
1379 | return PageLRU(page) || is_free_buddy_page(page); |
1380 | } |
1381 | |
1382 | static int __get_hwpoison_page(struct page *page, unsigned long flags) |
1383 | { |
1384 | struct folio *folio = page_folio(page); |
1385 | int ret = 0; |
1386 | bool hugetlb = false; |
1387 | |
	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
1389 | if (hugetlb) { |
1390 | /* Make sure hugetlb demotion did not happen from under us. */ |
1391 | if (folio == page_folio(page)) |
1392 | return ret; |
1393 | if (ret > 0) { |
1394 | folio_put(folio); |
1395 | folio = page_folio(page); |
1396 | } |
1397 | } |
1398 | |
1399 | /* |
1400 | * This check prevents from calling folio_try_get() for any |
1401 | * unsupported type of folio in order to reduce the risk of unexpected |
1402 | * races caused by taking a folio refcount. |
1403 | */ |
	if (!HWPoisonHandlable(&folio->page, flags))
1405 | return -EBUSY; |
1406 | |
1407 | if (folio_try_get(folio)) { |
1408 | if (folio == page_folio(page)) |
1409 | return 1; |
1410 | |
1411 | pr_info("%#lx cannot catch tail\n" , page_to_pfn(page)); |
1412 | folio_put(folio); |
1413 | } |
1414 | |
1415 | return 0; |
1416 | } |
1417 | |
1418 | static int get_any_page(struct page *p, unsigned long flags) |
1419 | { |
1420 | int ret = 0, pass = 0; |
1421 | bool count_increased = false; |
1422 | |
1423 | if (flags & MF_COUNT_INCREASED) |
1424 | count_increased = true; |
1425 | |
1426 | try_again: |
1427 | if (!count_increased) { |
		ret = __get_hwpoison_page(p, flags);
		if (!ret) {
			if (page_count(p)) {
				/* We raced with an allocation, retry. */
				if (pass++ < 3)
					goto try_again;
				ret = -EBUSY;
			} else if (!PageHuge(p) && !is_free_buddy_page(p)) {
1436 | /* We raced with put_page, retry. */ |
1437 | if (pass++ < 3) |
1438 | goto try_again; |
1439 | ret = -EIO; |
1440 | } |
1441 | goto out; |
1442 | } else if (ret == -EBUSY) { |
1443 | /* |
1444 | * We raced with (possibly temporary) unhandlable |
1445 | * page, retry. |
1446 | */ |
1447 | if (pass++ < 3) { |
1448 | shake_page(p); |
1449 | goto try_again; |
1450 | } |
1451 | ret = -EIO; |
1452 | goto out; |
1453 | } |
1454 | } |
1455 | |
	if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
1457 | ret = 1; |
1458 | } else { |
1459 | /* |
1460 | * A page we cannot handle. Check whether we can turn |
1461 | * it into something we can handle. |
1462 | */ |
1463 | if (pass++ < 3) { |
			put_page(p);
			shake_page(p);
			count_increased = false;
			goto try_again;
		}
		put_page(p);
1470 | ret = -EIO; |
1471 | } |
1472 | out: |
1473 | if (ret == -EIO) |
1474 | pr_err("%#lx: unhandlable page.\n" , page_to_pfn(p)); |
1475 | |
1476 | return ret; |
1477 | } |
1478 | |
1479 | static int __get_unpoison_page(struct page *page) |
1480 | { |
1481 | struct folio *folio = page_folio(page); |
1482 | int ret = 0; |
1483 | bool hugetlb = false; |
1484 | |
	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
1486 | if (hugetlb) { |
1487 | /* Make sure hugetlb demotion did not happen from under us. */ |
1488 | if (folio == page_folio(page)) |
1489 | return ret; |
1490 | if (ret > 0) |
1491 | folio_put(folio); |
1492 | } |
1493 | |
1494 | /* |
1495 | * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, |
1496 | * but also isolated from buddy freelist, so need to identify the |
1497 | * state and have to cancel both operations to unpoison. |
1498 | */ |
1499 | if (PageHWPoisonTakenOff(page)) |
1500 | return -EHWPOISON; |
1501 | |
1502 | return get_page_unless_zero(page) ? 1 : 0; |
1503 | } |
1504 | |
1505 | /** |
1506 | * get_hwpoison_page() - Get refcount for memory error handling |
1507 | * @p: Raw error page (hit by memory error) |
1508 | * @flags: Flags controlling behavior of error handling |
1509 | * |
1510 | * get_hwpoison_page() takes a page refcount of an error page to handle memory |
1511 | * error on it, after checking that the error page is in a well-defined state |
1512 | * (defined as a page-type we can successfully handle the memory error on it, |
1513 | * such as LRU page and hugetlb page). |
1514 | * |
1515 | * Memory error handling could be triggered at any time on any type of page, |
1516 | * so it's prone to race with typical memory management lifecycle (like |
1517 | * allocation and free). So to avoid such races, get_hwpoison_page() takes |
1518 | * extra care for the error page's state (as done in __get_hwpoison_page()), |
1519 | * and has some retry logic in get_any_page(). |
1520 | * |
1521 | * When called from unpoison_memory(), the caller should already ensure that |
1522 | * the given page has PG_hwpoison. So it's never reused for other page |
1523 | * allocations, and __get_unpoison_page() never races with them. |
1524 | * |
1525 | * Return: 0 on failure, |
1526 | * 1 on success for in-use pages in a well-defined state, |
1527 | * -EIO for pages on which we can not handle memory errors, |
1528 | * -EBUSY when get_hwpoison_page() has raced with page lifecycle |
1529 | * operations like allocation and free, |
1530 | * -EHWPOISON when the page is hwpoisoned and taken off from buddy. |
1531 | */ |
1532 | static int get_hwpoison_page(struct page *p, unsigned long flags) |
1533 | { |
1534 | int ret; |
1535 | |
	zone_pcp_disable(page_zone(p));
	if (flags & MF_UNPOISON)
		ret = __get_unpoison_page(p);
	else
		ret = get_any_page(p, flags);
	zone_pcp_enable(page_zone(p));
1542 | |
1543 | return ret; |
1544 | } |
1545 | |
1546 | /* |
1547 | * Do all that is necessary to remove user space mappings. Unmap |
1548 | * the pages and send SIGBUS to the processes if the data was dirty. |
1549 | */ |
1550 | static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, |
1551 | int flags, struct page *hpage) |
1552 | { |
1553 | struct folio *folio = page_folio(hpage); |
1554 | enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; |
1555 | struct address_space *mapping; |
1556 | LIST_HEAD(tokill); |
1557 | bool unmap_success; |
1558 | int forcekill; |
	bool mlocked = PageMlocked(hpage);
1560 | |
1561 | /* |
1562 | * Here we are interested only in user-mapped pages, so skip any |
1563 | * other types of pages. |
1564 | */ |
	if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p))
		return true;
	if (!(PageLRU(hpage) || PageHuge(p)))
1568 | return true; |
1569 | |
1570 | /* |
1571 | * This check implies we don't kill processes if their pages |
1572 | * are in the swap cache early. Those are always late kills. |
1573 | */ |
	if (!page_mapped(hpage))
		return true;

	if (PageSwapCache(p)) {
		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
1579 | ttu &= ~TTU_HWPOISON; |
1580 | } |
1581 | |
1582 | /* |
1583 | * Propagate the dirty bit from PTEs to struct page first, because we |
1584 | * need this to decide if we should kill or just drop the page. |
1585 | * XXX: the dirty test could be racy: set_page_dirty() may not always |
1586 | * be called inside page lock (it's recommended but not enforced). |
1587 | */ |
1588 | mapping = page_mapping(hpage); |
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_can_writeback(mapping)) {
		if (page_mkclean(hpage)) {
1592 | SetPageDirty(hpage); |
1593 | } else { |
1594 | ttu &= ~TTU_HWPOISON; |
1595 | pr_info("%#lx: corrupted page was clean: dropped without side effects\n" , |
1596 | pfn); |
1597 | } |
1598 | } |
1599 | |
1600 | /* |
1601 | * First collect all the processes that have the page |
1602 | * mapped in dirty form. This has to be done before try_to_unmap, |
1603 | * because ttu takes the rmap data structures down. |
1604 | */ |
	collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1606 | |
	if (PageHuge(hpage) && !PageAnon(hpage)) {
1608 | /* |
1609 | * For hugetlb pages in shared mappings, try_to_unmap |
1610 | * could potentially call huge_pmd_unshare. Because of |
1611 | * this, take semaphore in write mode here and set |
1612 | * TTU_RMAP_LOCKED to indicate we have taken the lock |
1613 | * at this higher level. |
1614 | */ |
1615 | mapping = hugetlb_page_mapping_lock_write(hpage); |
1616 | if (mapping) { |
			try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
			i_mmap_unlock_write(mapping);
		} else
			pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
	} else {
		try_to_unmap(folio, ttu);
1623 | } |
1624 | |
	unmap_success = !page_mapped(hpage);
	if (!unmap_success)
		pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(hpage));
1629 | |
1630 | /* |
1631 | * try_to_unmap() might put mlocked page in lru cache, so call |
1632 | * shake_page() again to ensure that it's flushed. |
1633 | */ |
1634 | if (mlocked) |
1635 | shake_page(hpage); |
1636 | |
1637 | /* |
1638 | * Now that the dirty bit has been propagated to the |
1639 | * struct page and all unmaps done we can decide if |
1640 | * killing is needed or not. Only kill when the page |
1641 | * was dirty or the process is not restartable, |
1642 | * otherwise the tokill list is merely |
1643 | * freed. When there was a problem unmapping earlier |
	 * use a more forceful, uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
		    !unmap_success;
	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
1650 | |
1651 | return unmap_success; |
1652 | } |
1653 | |
1654 | static int identify_page_state(unsigned long pfn, struct page *p, |
1655 | unsigned long page_flags) |
1656 | { |
1657 | struct page_state *ps; |
1658 | |
1659 | /* |
1660 | * The first check uses the current page flags which may not have any |
1661 | * relevant information. The second check with the saved page flags is |
1662 | * carried out only if the first check can't determine the page status. |
1663 | */ |
1664 | for (ps = error_states;; ps++) |
1665 | if ((p->flags & ps->mask) == ps->res) |
1666 | break; |
1667 | |
1668 | page_flags |= (p->flags & (1UL << PG_dirty)); |
1669 | |
1670 | if (!ps->mask) |
1671 | for (ps = error_states;; ps++) |
1672 | if ((page_flags & ps->mask) == ps->res) |
1673 | break; |
1674 | return page_action(ps, p, pfn); |
1675 | } |
1676 | |
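/*
 * Split a THP under the page lock. If the split fails, the page
 * reference held by the caller is dropped here, so the caller must not
 * touch the page again after a nonzero return.
 */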
1677 | static int try_to_split_thp_page(struct page *page) |
1678 | { |
1679 | int ret; |
1680 | |
1681 | lock_page(page); |
1682 | ret = split_huge_page(page); |
1683 | unlock_page(page); |
1684 | |
1685 | if (unlikely(ret)) |
1686 | put_page(page); |
1687 | |
1688 | return ret; |
1689 | } |
1690 | |
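/*
 * Used by the dev_pagemap/DAX paths: tear down the user mappings of the
 * poisoned range and kill the processes collected on @to_kill.
 */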
1691 | static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, |
1692 | struct address_space *mapping, pgoff_t index, int flags) |
1693 | { |
1694 | struct to_kill *tk; |
1695 | unsigned long size = 0; |
1696 | |
1697 | list_for_each_entry(tk, to_kill, nd) |
1698 | if (tk->size_shift) |
1699 | size = max(size, 1UL << tk->size_shift); |
1700 | |
1701 | if (size) { |
1702 | /* |
1703 | * Unmap the largest mapping to avoid breaking up device-dax |
1704 | * mappings which are constant size. The actual size of the |
1705 | * mapping being torn down is communicated in siginfo, see |
1706 | * kill_proc() |
1707 | */ |
1708 | loff_t start = (index << PAGE_SHIFT) & ~(size - 1); |
1709 | |
1710 | 		unmap_mapping_range(mapping, start, size, 0); |
1711 | 	} |
1712 |  |
1713 | 	kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags); |
1714 | } |
1715 | |
1716 | /* |
1717 |  * Only dev_pagemap pages get here, such as fsdax when the filesystem |
1718 |  * either does not claim or fails to claim a hwpoison event, or devdax. |
1719 |  * The fsdax pages are initialized per base page, and the devdax pages |
1720 |  * could be initialized either as base pages, or as compound pages with |
1721 |  * vmemmap optimization enabled. Devdax is simplistic in its handling of |
1722 |  * hwpoison, such that, if a subpage of a compound page is poisoned, |
1723 |  * simply marking the compound head page is by far sufficient. |
1724 | */ |
1725 | static int mf_generic_kill_procs(unsigned long long pfn, int flags, |
1726 | struct dev_pagemap *pgmap) |
1727 | { |
1728 | struct folio *folio = pfn_folio(pfn); |
1729 | LIST_HEAD(to_kill); |
1730 | dax_entry_t cookie; |
1731 | int rc = 0; |
1732 | |
1733 | /* |
1734 | * Prevent the inode from being freed while we are interrogating |
1735 | * the address_space, typically this would be handled by |
1736 | * lock_page(), but dax pages do not use the page lock. This |
1737 | * also prevents changes to the mapping of this pfn until |
1738 | * poison signaling is complete. |
1739 | */ |
1740 | cookie = dax_lock_folio(folio); |
1741 | if (!cookie) |
1742 | return -EBUSY; |
1743 | |
1744 | if (hwpoison_filter(&folio->page)) { |
1745 | rc = -EOPNOTSUPP; |
1746 | goto unlock; |
1747 | } |
1748 | |
1749 | switch (pgmap->type) { |
1750 | case MEMORY_DEVICE_PRIVATE: |
1751 | case MEMORY_DEVICE_COHERENT: |
1752 | /* |
1753 | * TODO: Handle device pages which may need coordination |
1754 | * with device-side memory. |
1755 | */ |
1756 | rc = -ENXIO; |
1757 | goto unlock; |
1758 | default: |
1759 | break; |
1760 | } |
1761 | |
1762 | /* |
1763 | * Use this flag as an indication that the dax page has been |
1764 | * remapped UC to prevent speculative consumption of poison. |
1765 | */ |
1766 | SetPageHWPoison(&folio->page); |
1767 | |
1768 | /* |
1769 | * Unlike System-RAM there is no possibility to swap in a |
1770 | * different physical page at a given virtual address, so all |
1771 | * userspace consumption of ZONE_DEVICE memory necessitates |
1772 | * SIGBUS (i.e. MF_MUST_KILL) |
1773 | */ |
1774 | flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; |
1775 | 	collect_procs(&folio->page, &to_kill, true); |
1776 |  |
1777 | 	unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags); |
1778 | unlock: |
1779 | dax_unlock_folio(folio, cookie); |
1780 | return rc; |
1781 | } |
1782 | |
1783 | #ifdef CONFIG_FS_DAX |
1784 | /** |
1785 | * mf_dax_kill_procs - Collect and kill processes who are using this file range |
1786 | * @mapping: address_space of the file in use |
1787 | * @index: start pgoff of the range within the file |
1788 | * @count: length of the range, in unit of PAGE_SIZE |
1789 | * @mf_flags: memory failure flags |
1790 | */ |
1791 | int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, |
1792 | unsigned long count, int mf_flags) |
1793 | { |
1794 | LIST_HEAD(to_kill); |
1795 | dax_entry_t cookie; |
1796 | struct page *page; |
1797 | size_t end = index + count; |
1798 | |
1799 | mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; |
1800 | |
1801 | for (; index < end; index++) { |
1802 | page = NULL; |
1803 | 		cookie = dax_lock_mapping_entry(mapping, index, &page); |
1804 | if (!cookie) |
1805 | return -EBUSY; |
1806 | if (!page) |
1807 | goto unlock; |
1808 | |
1809 | SetPageHWPoison(page); |
1810 | |
1811 | 		collect_procs_fsdax(page, mapping, index, &to_kill); |
1812 | 		unmap_and_kill(&to_kill, page_to_pfn(page), mapping, |
1813 | 				index, mf_flags); |
1814 | unlock: |
1815 | dax_unlock_mapping_entry(mapping, index, cookie); |
1816 | } |
1817 | return 0; |
1818 | } |
1819 | EXPORT_SYMBOL_GPL(mf_dax_kill_procs); |
1820 | #endif /* CONFIG_FS_DAX */ |
1821 | |
1822 | #ifdef CONFIG_HUGETLB_PAGE |
1823 | |
1824 | /* |
1825 |  * Struct raw_hwp_page represents information about a "raw error page", |
1826 |  * linked into a singly linked list rooted at the ->_hugetlb_hwpoison field of the folio. |
1827 | */ |
1828 | struct raw_hwp_page { |
1829 | struct llist_node node; |
1830 | struct page *page; |
1831 | }; |
1832 | |
1833 | static inline struct llist_head *raw_hwp_list_head(struct folio *folio) |
1834 | { |
1835 | return (struct llist_head *)&folio->_hugetlb_hwpoison; |
1836 | } |
1837 | |
1838 | bool is_raw_hwpoison_page_in_hugepage(struct page *page) |
1839 | { |
1840 | struct llist_head *raw_hwp_head; |
1841 | struct raw_hwp_page *p; |
1842 | struct folio *folio = page_folio(page); |
1843 | bool ret = false; |
1844 | |
1845 | if (!folio_test_hwpoison(folio)) |
1846 | return false; |
1847 | |
1848 | if (!folio_test_hugetlb(folio)) |
1849 | return PageHWPoison(page); |
1850 | |
1851 | /* |
1852 | * When RawHwpUnreliable is set, kernel lost track of which subpages |
1853 | * are HWPOISON. So return as if ALL subpages are HWPOISONed. |
1854 | */ |
1855 | if (folio_test_hugetlb_raw_hwp_unreliable(folio)) |
1856 | return true; |
1857 | |
1858 | mutex_lock(&mf_mutex); |
1859 | |
1860 | raw_hwp_head = raw_hwp_list_head(folio); |
1861 | llist_for_each_entry(p, raw_hwp_head->first, node) { |
1862 | if (page == p->page) { |
1863 | ret = true; |
1864 | break; |
1865 | } |
1866 | } |
1867 | |
1868 | 	mutex_unlock(&mf_mutex); |
1869 | |
1870 | return ret; |
1871 | } |
1872 | |
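/*
 * Drop all raw_hwp_page entries tracked by @folio. When @move_flag is
 * true the hwpoison state is transferred to each recorded subpage,
 * otherwise the poisoned page count is decremented for each entry.
 * Returns the number of entries that were freed.
 */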
1873 | static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) |
1874 | { |
1875 | struct llist_node *head; |
1876 | struct raw_hwp_page *p, *next; |
1877 | unsigned long count = 0; |
1878 | |
1879 | 	head = llist_del_all(raw_hwp_list_head(folio)); |
1880 | llist_for_each_entry_safe(p, next, head, node) { |
1881 | if (move_flag) |
1882 | SetPageHWPoison(p->page); |
1883 | else |
1884 | 			num_poisoned_pages_sub(page_to_pfn(p->page), 1); |
1885 | 		kfree(p); |
1886 | count++; |
1887 | } |
1888 | return count; |
1889 | } |
1890 | |
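/*
 * Record @page as a raw hwpoison page of the hugetlb @folio. Returns 0
 * if this is the first error hitting the folio, or -EHWPOISON if the
 * folio was already marked (including when this subpage is already on
 * the raw_hwp list or the list has become unreliable).
 */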
1891 | static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) |
1892 | { |
1893 | struct llist_head *head; |
1894 | struct raw_hwp_page *raw_hwp; |
1895 | struct raw_hwp_page *p, *next; |
1896 | int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0; |
1897 | |
1898 | /* |
1899 | 	 * Once the hwpoison hugepage has lost reliable raw error info, |
1900 | 	 * there is little point in keeping further error info precisely, |
1901 | 	 * so skip adding any more raw error info. |
1902 | */ |
1903 | if (folio_test_hugetlb_raw_hwp_unreliable(folio)) |
1904 | return -EHWPOISON; |
1905 | head = raw_hwp_list_head(folio); |
1906 | llist_for_each_entry_safe(p, next, head->first, node) { |
1907 | if (p->page == page) |
1908 | return -EHWPOISON; |
1909 | } |
1910 | |
1911 | 	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC); |
1912 | 	if (raw_hwp) { |
1913 | 		raw_hwp->page = page; |
1914 | 		llist_add(&raw_hwp->node, head); |
1915 | /* the first error event will be counted in action_result(). */ |
1916 | if (ret) |
1917 | num_poisoned_pages_inc(page_to_pfn(page)); |
1918 | } else { |
1919 | /* |
1920 | * Failed to save raw error info. We no longer trace all |
1921 | 		 * hwpoisoned subpages, and we must refuse to free/dissolve |
1922 | * this hwpoisoned hugepage. |
1923 | */ |
1924 | folio_set_hugetlb_raw_hwp_unreliable(folio); |
1925 | /* |
1926 | * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not |
1927 | * used any more, so free it. |
1928 | */ |
1929 | 		__folio_free_raw_hwp(folio, false); |
1930 | } |
1931 | return ret; |
1932 | } |
1933 | |
1934 | static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag) |
1935 | { |
1936 | /* |
1937 | * hugetlb_vmemmap_optimized hugepages can't be freed because struct |
1938 | * pages for tail pages are required but they don't exist. |
1939 | */ |
1940 | if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio)) |
1941 | return 0; |
1942 | |
1943 | /* |
1944 | * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by |
1945 | * definition. |
1946 | */ |
1947 | if (folio_test_hugetlb_raw_hwp_unreliable(folio)) |
1948 | return 0; |
1949 | |
1950 | return __folio_free_raw_hwp(folio, move_flag); |
1951 | } |
1952 | |
1953 | void folio_clear_hugetlb_hwpoison(struct folio *folio) |
1954 | { |
1955 | if (folio_test_hugetlb_raw_hwp_unreliable(folio)) |
1956 | return; |
1957 | if (folio_test_hugetlb_vmemmap_optimized(folio)) |
1958 | return; |
1959 | folio_clear_hwpoison(folio); |
1960 | 	folio_free_raw_hwp(folio, true); |
1961 | } |
1962 | |
1963 | /* |
1964 | * Called from hugetlb code with hugetlb_lock held. |
1965 | * |
1966 | * Return values: |
1967 | * 0 - free hugepage |
1968 | * 1 - in-use hugepage |
1969 | * 2 - not a hugepage |
1970 | * -EBUSY - the hugepage is busy (try to retry) |
1971 | * -EHWPOISON - the hugepage is already hwpoisoned |
1972 | */ |
1973 | int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, |
1974 | bool *migratable_cleared) |
1975 | { |
1976 | struct page *page = pfn_to_page(pfn); |
1977 | struct folio *folio = page_folio(page); |
1978 | int ret = 2; /* fallback to normal page handling */ |
1979 | bool count_increased = false; |
1980 | |
1981 | if (!folio_test_hugetlb(folio)) |
1982 | goto out; |
1983 | |
1984 | if (flags & MF_COUNT_INCREASED) { |
1985 | ret = 1; |
1986 | count_increased = true; |
1987 | } else if (folio_test_hugetlb_freed(folio)) { |
1988 | ret = 0; |
1989 | } else if (folio_test_hugetlb_migratable(folio)) { |
1990 | ret = folio_try_get(folio); |
1991 | if (ret) |
1992 | count_increased = true; |
1993 | } else { |
1994 | ret = -EBUSY; |
1995 | if (!(flags & MF_NO_RETRY)) |
1996 | goto out; |
1997 | } |
1998 | |
1999 | if (folio_set_hugetlb_hwpoison(folio, page)) { |
2000 | ret = -EHWPOISON; |
2001 | goto out; |
2002 | } |
2003 | |
2004 | /* |
2005 | * Clearing hugetlb_migratable for hwpoisoned hugepages to prevent them |
2006 | * from being migrated by memory hotremove. |
2007 | */ |
2008 | if (count_increased && folio_test_hugetlb_migratable(folio)) { |
2009 | folio_clear_hugetlb_migratable(folio); |
2010 | *migratable_cleared = true; |
2011 | } |
2012 | |
2013 | return ret; |
2014 | out: |
2015 | if (count_increased) |
2016 | folio_put(folio); |
2017 | return ret; |
2018 | } |
2019 | |
2020 | /* |
2021 |  * Taking a refcount on hugetlb pages needs extra care about race conditions |
2022 |  * with basic operations like hugepage allocation/free/demotion. |
2023 |  * So some of the prechecks for hwpoison (pinning, and testing/setting |
2024 |  * PageHWPoison) should be done within a single hugetlb_lock section. |
2025 | */ |
2026 | static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) |
2027 | { |
2028 | int res; |
2029 | struct page *p = pfn_to_page(pfn); |
2030 | struct folio *folio; |
2031 | unsigned long page_flags; |
2032 | bool migratable_cleared = false; |
2033 | |
2034 | *hugetlb = 1; |
2035 | retry: |
2036 | 	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); |
2037 | if (res == 2) { /* fallback to normal page handling */ |
2038 | *hugetlb = 0; |
2039 | return 0; |
2040 | } else if (res == -EHWPOISON) { |
2041 | 		pr_err("%#lx: already hardware poisoned\n", pfn); |
2042 | 		if (flags & MF_ACTION_REQUIRED) { |
2043 | 			folio = page_folio(p); |
2044 | 			res = kill_accessing_process(current, folio_pfn(folio), flags); |
2045 | } |
2046 | return res; |
2047 | } else if (res == -EBUSY) { |
2048 | if (!(flags & MF_NO_RETRY)) { |
2049 | flags |= MF_NO_RETRY; |
2050 | goto retry; |
2051 | } |
2052 | 		return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); |
2053 | } |
2054 | |
2055 | folio = page_folio(p); |
2056 | folio_lock(folio); |
2057 | |
2058 | if (hwpoison_filter(p)) { |
2059 | folio_clear_hugetlb_hwpoison(folio); |
2060 | if (migratable_cleared) |
2061 | folio_set_hugetlb_migratable(folio); |
2062 | folio_unlock(folio); |
2063 | if (res == 1) |
2064 | folio_put(folio); |
2065 | return -EOPNOTSUPP; |
2066 | } |
2067 | |
2068 | /* |
2069 | * Handling free hugepage. The possible race with hugepage allocation |
2070 | * or demotion can be prevented by PageHWPoison flag. |
2071 | */ |
2072 | if (res == 0) { |
2073 | folio_unlock(folio); |
2074 | 		if (__page_handle_poison(p) >= 0) { |
2075 | 			page_ref_inc(p); |
2076 | res = MF_RECOVERED; |
2077 | } else { |
2078 | res = MF_FAILED; |
2079 | } |
2080 | 		return action_result(pfn, MF_MSG_FREE_HUGE, res); |
2081 | } |
2082 | |
2083 | page_flags = folio->flags; |
2084 | |
2085 | 	if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) { |
2086 | 		folio_unlock(folio); |
2087 | 		return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); |
2088 | } |
2089 | |
2090 | return identify_page_state(pfn, p, page_flags); |
2091 | } |
2092 | |
2093 | #else |
2094 | static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) |
2095 | { |
2096 | return 0; |
2097 | } |
2098 | |
2099 | static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag) |
2100 | { |
2101 | return 0; |
2102 | } |
2103 | #endif /* CONFIG_HUGETLB_PAGE */ |
2104 | |
2105 | /* Drop the extra refcount in case we come from madvise() */ |
2106 | static void put_ref_page(unsigned long pfn, int flags) |
2107 | { |
2108 | struct page *page; |
2109 | |
2110 | if (!(flags & MF_COUNT_INCREASED)) |
2111 | return; |
2112 | |
2113 | page = pfn_to_page(pfn); |
2114 | if (page) |
2115 | put_page(page); |
2116 | } |
2117 | |
2118 | static int memory_failure_dev_pagemap(unsigned long pfn, int flags, |
2119 | struct dev_pagemap *pgmap) |
2120 | { |
2121 | int rc = -ENXIO; |
2122 | |
2123 | /* device metadata space is not recoverable */ |
2124 | if (!pgmap_pfn_valid(pgmap, pfn)) |
2125 | goto out; |
2126 | |
2127 | /* |
2128 | * Call driver's implementation to handle the memory failure, otherwise |
2129 | * fall back to generic handler. |
2130 | */ |
2131 | if (pgmap_has_memory_failure(pgmap)) { |
2132 | rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags); |
2133 | /* |
2134 | * Fall back to generic handler too if operation is not |
2135 | * supported inside the driver/device/filesystem. |
2136 | */ |
2137 | if (rc != -EOPNOTSUPP) |
2138 | goto out; |
2139 | } |
2140 | |
2141 | rc = mf_generic_kill_procs(pfn, flags, pgmap); |
2142 | out: |
2143 | /* drop pgmap ref acquired in caller */ |
2144 | put_dev_pagemap(pgmap); |
2145 | if (rc != -EOPNOTSUPP) |
2146 | 		action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED); |
2147 | return rc; |
2148 | } |
2149 | |
2150 | /** |
2151 | * memory_failure - Handle memory failure of a page. |
2152 | * @pfn: Page Number of the corrupted page |
2153 | * @flags: fine tune action taken |
2154 | * |
2155 | * This function is called by the low level machine check code |
2156 | * of an architecture when it detects hardware memory corruption |
2157 | * of a page. It tries its best to recover, which includes |
2158 | * dropping pages, killing processes etc. |
2159 | * |
2160 | * The function is primarily of use for corruptions that |
2161 | * happen outside the current execution context (e.g. when |
2162 | * detected by a background scrubber) |
2163 | * |
2164 | * Must run in process context (e.g. a work queue) with interrupts |
2165 | * enabled and no spinlocks held. |
2166 | * |
2167 | * Return: 0 for successfully handled the memory error, |
2168 | * -EOPNOTSUPP for hwpoison_filter() filtered the error event, |
2169 | * < 0(except -EOPNOTSUPP) on failure. |
2170 | */ |
2171 | int memory_failure(unsigned long pfn, int flags) |
2172 | { |
2173 | struct page *p; |
2174 | struct page *hpage; |
2175 | struct dev_pagemap *pgmap; |
2176 | int res = 0; |
2177 | unsigned long page_flags; |
2178 | bool retry = true; |
2179 | int hugetlb = 0; |
2180 | |
2181 | if (!sysctl_memory_failure_recovery) |
2182 | 		panic("Memory failure on page %lx", pfn); |
2183 | |
2184 | mutex_lock(&mf_mutex); |
2185 | |
2186 | if (!(flags & MF_SW_SIMULATED)) |
2187 | hw_memory_failure = true; |
2188 | |
2189 | p = pfn_to_online_page(pfn); |
2190 | if (!p) { |
2191 | res = arch_memory_failure(pfn, flags); |
2192 | if (res == 0) |
2193 | goto unlock_mutex; |
2194 | |
2195 | if (pfn_valid(pfn)) { |
2196 | pgmap = get_dev_pagemap(pfn, NULL); |
2197 | put_ref_page(pfn, flags); |
2198 | if (pgmap) { |
2199 | res = memory_failure_dev_pagemap(pfn, flags, |
2200 | pgmap); |
2201 | goto unlock_mutex; |
2202 | } |
2203 | } |
2204 | pr_err("%#lx: memory outside kernel control\n" , pfn); |
2205 | res = -ENXIO; |
2206 | goto unlock_mutex; |
2207 | } |
2208 | |
2209 | try_again: |
2210 | res = try_memory_failure_hugetlb(pfn, flags, hugetlb: &hugetlb); |
2211 | if (hugetlb) |
2212 | goto unlock_mutex; |
2213 | |
2214 | 	if (TestSetPageHWPoison(p)) { |
2215 | 		pr_err("%#lx: already hardware poisoned\n", pfn); |
2216 | res = -EHWPOISON; |
2217 | if (flags & MF_ACTION_REQUIRED) |
2218 | res = kill_accessing_process(current, pfn, flags); |
2219 | if (flags & MF_COUNT_INCREASED) |
2220 | 			put_page(p); |
2221 | goto unlock_mutex; |
2222 | } |
2223 | |
2224 | /* |
2225 | * We need/can do nothing about count=0 pages. |
2226 | * 1) it's a free page, and therefore in safe hand: |
2227 | * check_new_page() will be the gate keeper. |
2228 | * 2) it's part of a non-compound high order page. |
2229 | * Implies some kernel user: cannot stop them from |
2230 | * R/W the page; let's pray that the page has been |
2231 | * used and will be freed some time later. |
2232 | * In fact it's dangerous to directly bump up page count from 0, |
2233 | * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. |
2234 | */ |
2235 | if (!(flags & MF_COUNT_INCREASED)) { |
2236 | res = get_hwpoison_page(p, flags); |
2237 | if (!res) { |
2238 | 			if (is_free_buddy_page(p)) { |
2239 | 				if (take_page_off_buddy(p)) { |
2240 | 					page_ref_inc(p); |
2241 | res = MF_RECOVERED; |
2242 | } else { |
2243 | /* We lost the race, try again */ |
2244 | if (retry) { |
2245 | 						ClearPageHWPoison(p); |
2246 | retry = false; |
2247 | goto try_again; |
2248 | } |
2249 | res = MF_FAILED; |
2250 | } |
2251 | 				res = action_result(pfn, MF_MSG_BUDDY, res); |
2252 | 			} else { |
2253 | 				res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); |
2254 | } |
2255 | goto unlock_mutex; |
2256 | } else if (res < 0) { |
2257 | 			res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); |
2258 | goto unlock_mutex; |
2259 | } |
2260 | } |
2261 | |
2262 | hpage = compound_head(p); |
2263 | 	if (PageTransHuge(hpage)) { |
2264 | /* |
2265 | * The flag must be set after the refcount is bumped |
2266 | * otherwise it may race with THP split. |
2267 | * And the flag can't be set in get_hwpoison_page() since |
2268 | * it is called by soft offline too and it is just called |
2269 | * for !MF_COUNT_INCREASED. So here seems to be the best |
2270 | * place. |
2271 | * |
2272 | * Don't need care about the above error handling paths for |
2273 | * get_hwpoison_page() since they handle either free page |
2274 | * or unhandlable page. The refcount is bumped iff the |
2275 | * page is a valid handlable page. |
2276 | */ |
2277 | SetPageHasHWPoisoned(hpage); |
2278 | 		if (try_to_split_thp_page(p) < 0) { |
2279 | 			res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); |
2280 | goto unlock_mutex; |
2281 | } |
2282 | VM_BUG_ON_PAGE(!page_count(p), p); |
2283 | } |
2284 | |
2285 | /* |
2286 | * We ignore non-LRU pages for good reasons. |
2287 | * - PG_locked is only well defined for LRU pages and a few others |
2288 | * - to avoid races with __SetPageLocked() |
2289 | * - to avoid races with __SetPageSlab*() (and more non-atomic ops) |
2290 | * The check (unnecessarily) ignores LRU pages being isolated and |
2291 | * walked by the page reclaim code, however that's not a big loss. |
2292 | */ |
2293 | shake_page(p); |
2294 | |
2295 | 	lock_page(p); |
2296 | |
2297 | /* |
2298 | 	 * We only intend to deal with non-compound pages here. However, |
2299 | 	 * the page could have become part of a compound page due to a |
2300 | 	 * race window. If this happens, we can try again to hopefully |
2301 | 	 * handle the page in the next round. |
2302 | 	 */ |
2303 | 	if (PageCompound(p)) { |
2304 | 		if (retry) { |
2305 | 			ClearPageHWPoison(p); |
2306 | 			unlock_page(p); |
2307 | 			put_page(p); |
2308 | flags &= ~MF_COUNT_INCREASED; |
2309 | retry = false; |
2310 | goto try_again; |
2311 | } |
2312 | 		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); |
2313 | goto unlock_page; |
2314 | } |
2315 | |
2316 | /* |
2317 | * We use page flags to determine what action should be taken, but |
2318 | * the flags can be modified by the error containment action. One |
2319 | * example is an mlocked page, where PG_mlocked is cleared by |
2320 | * page_remove_rmap() in try_to_unmap_one(). So to determine page status |
2321 | * correctly, we save a copy of the page flags at this time. |
2322 | */ |
2323 | page_flags = p->flags; |
2324 | |
2325 | if (hwpoison_filter(p)) { |
2326 | 		ClearPageHWPoison(p); |
2327 | 		unlock_page(p); |
2328 | 		put_page(p); |
2329 | res = -EOPNOTSUPP; |
2330 | goto unlock_mutex; |
2331 | } |
2332 | |
2333 | /* |
2334 | 	 * __munlock_folio() may clear a writeback page's LRU flag without |
2335 | 	 * the page lock. We need to wait for writeback completion for this |
2336 | 	 * page or it may trigger a vfs BUG while evicting the inode. |
2337 | 	 */ |
2338 | 	if (!PageLRU(p) && !PageWriteback(p)) |
2339 | goto identify_page_state; |
2340 | |
2341 | /* |
2342 | * It's very difficult to mess with pages currently under IO |
2343 | * and in many cases impossible, so we just avoid it here. |
2344 | */ |
2345 | 	wait_on_page_writeback(p); |
2346 | |
2347 | /* |
2348 | * Now take care of user space mappings. |
2349 | * Abort on fail: __filemap_remove_folio() assumes unmapped page. |
2350 | */ |
2351 | 	if (!hwpoison_user_mappings(p, pfn, flags, p)) { |
2352 | 		res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); |
2353 | goto unlock_page; |
2354 | } |
2355 | |
2356 | /* |
2357 | * Torn down by someone else? |
2358 | */ |
2359 | 	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
2360 | 		res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); |
2361 | goto unlock_page; |
2362 | } |
2363 | |
2364 | identify_page_state: |
2365 | res = identify_page_state(pfn, p, page_flags); |
2366 | 	mutex_unlock(&mf_mutex); |
2367 | return res; |
2368 | unlock_page: |
2369 | 	unlock_page(p); |
2370 | unlock_mutex: |
2371 | 	mutex_unlock(&mf_mutex); |
2372 | return res; |
2373 | } |
2374 | EXPORT_SYMBOL_GPL(memory_failure); |
2375 | |
2376 | #define MEMORY_FAILURE_FIFO_ORDER 4 |
2377 | #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) |
2378 | |
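/*
 * Memory errors can be reported from contexts where memory_failure()
 * cannot be called directly (e.g. interrupt context), so the entries
 * are queued on a per-CPU kfifo and handled later from a workqueue.
 */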
2379 | struct memory_failure_entry { |
2380 | unsigned long pfn; |
2381 | int flags; |
2382 | }; |
2383 | |
2384 | struct memory_failure_cpu { |
2385 | DECLARE_KFIFO(fifo, struct memory_failure_entry, |
2386 | MEMORY_FAILURE_FIFO_SIZE); |
2387 | spinlock_t lock; |
2388 | struct work_struct work; |
2389 | }; |
2390 | |
2391 | static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); |
2392 | |
2393 | /** |
2394 | * memory_failure_queue - Schedule handling memory failure of a page. |
2395 | * @pfn: Page Number of the corrupted page |
2396 | * @flags: Flags for memory failure handling |
2397 | * |
2398 | * This function is called by the low level hardware error handler |
2399 | * when it detects hardware memory corruption of a page. It schedules |
2400 | * the recovering of error page, including dropping pages, killing |
2401 | * processes etc. |
2402 | * |
2403 | * The function is primarily of use for corruptions that |
2404 | * happen outside the current execution context (e.g. when |
2405 | * detected by a background scrubber) |
2406 | * |
2407 | * Can run in IRQ context. |
2408 | */ |
2409 | void memory_failure_queue(unsigned long pfn, int flags) |
2410 | { |
2411 | struct memory_failure_cpu *mf_cpu; |
2412 | unsigned long proc_flags; |
2413 | struct memory_failure_entry entry = { |
2414 | .pfn = pfn, |
2415 | .flags = flags, |
2416 | }; |
2417 | |
2418 | mf_cpu = &get_cpu_var(memory_failure_cpu); |
2419 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); |
2420 | if (kfifo_put(&mf_cpu->fifo, entry)) |
2421 | 		schedule_work_on(smp_processor_id(), &mf_cpu->work); |
2422 | 	else |
2423 | 		pr_err("buffer overflow when queuing memory failure at %#lx\n", |
2424 | 		       pfn); |
2425 | 	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); |
2426 | put_cpu_var(memory_failure_cpu); |
2427 | } |
2428 | EXPORT_SYMBOL_GPL(memory_failure_queue); |
2429 | |
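/*
 * Drain the per-CPU fifo and handle each queued entry, either as a soft
 * offline request or as a full memory_failure() event.
 */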
2430 | static void memory_failure_work_func(struct work_struct *work) |
2431 | { |
2432 | struct memory_failure_cpu *mf_cpu; |
2433 | struct memory_failure_entry entry = { 0, }; |
2434 | unsigned long proc_flags; |
2435 | int gotten; |
2436 | |
2437 | mf_cpu = container_of(work, struct memory_failure_cpu, work); |
2438 | for (;;) { |
2439 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); |
2440 | gotten = kfifo_get(&mf_cpu->fifo, &entry); |
2441 | 		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); |
2442 | if (!gotten) |
2443 | break; |
2444 | if (entry.flags & MF_SOFT_OFFLINE) |
2445 | 			soft_offline_page(entry.pfn, entry.flags); |
2446 | else |
2447 | memory_failure(entry.pfn, entry.flags); |
2448 | } |
2449 | } |
2450 | |
2451 | /* |
2452 | * Process memory_failure work queued on the specified CPU. |
2453 | * Used to avoid return-to-userspace racing with the memory_failure workqueue. |
2454 | */ |
2455 | void memory_failure_queue_kick(int cpu) |
2456 | { |
2457 | struct memory_failure_cpu *mf_cpu; |
2458 | |
2459 | mf_cpu = &per_cpu(memory_failure_cpu, cpu); |
2460 | 	cancel_work_sync(&mf_cpu->work); |
2461 | 	memory_failure_work_func(&mf_cpu->work); |
2462 | } |
2463 | |
2464 | static int __init memory_failure_init(void) |
2465 | { |
2466 | struct memory_failure_cpu *mf_cpu; |
2467 | int cpu; |
2468 | |
2469 | for_each_possible_cpu(cpu) { |
2470 | mf_cpu = &per_cpu(memory_failure_cpu, cpu); |
2471 | spin_lock_init(&mf_cpu->lock); |
2472 | INIT_KFIFO(mf_cpu->fifo); |
2473 | INIT_WORK(&mf_cpu->work, memory_failure_work_func); |
2474 | } |
2475 | |
2476 | 	register_sysctl_init("vm", memory_failure_table); |
2477 | |
2478 | return 0; |
2479 | } |
2480 | core_initcall(memory_failure_init); |
2481 | |
2482 | #undef pr_fmt |
2483 | #define pr_fmt(fmt) "" fmt |
2484 | #define unpoison_pr_info(fmt, pfn, rs) \ |
2485 | ({ \ |
2486 | if (__ratelimit(rs)) \ |
2487 | pr_info(fmt, pfn); \ |
2488 | }) |
2489 | |
2490 | /** |
2491 | * unpoison_memory - Unpoison a previously poisoned page |
2492 | * @pfn: Page number of the to be unpoisoned page |
2493 | * |
2494 | * Software-unpoison a page that has been poisoned by |
2495 | * memory_failure() earlier. |
2496 | * |
2497 | * This is only done on the software-level, so it only works |
2498 | * for linux injected failures, not real hardware failures |
2499 | * |
2500 | * Returns 0 for success, otherwise -errno. |
2501 | */ |
2502 | int unpoison_memory(unsigned long pfn) |
2503 | { |
2504 | struct folio *folio; |
2505 | struct page *p; |
2506 | int ret = -EBUSY, ghp; |
2507 | unsigned long count = 1; |
2508 | bool huge = false; |
2509 | static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, |
2510 | DEFAULT_RATELIMIT_BURST); |
2511 | |
2512 | if (!pfn_valid(pfn)) |
2513 | return -ENXIO; |
2514 | |
2515 | p = pfn_to_page(pfn); |
2516 | folio = page_folio(p); |
2517 | |
2518 | mutex_lock(&mf_mutex); |
2519 | |
2520 | if (hw_memory_failure) { |
2521 | unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n" , |
2522 | pfn, &unpoison_rs); |
2523 | ret = -EOPNOTSUPP; |
2524 | goto unlock_mutex; |
2525 | } |
2526 | |
2527 | 	if (!PageHWPoison(p)) { |
2528 | unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n" , |
2529 | pfn, &unpoison_rs); |
2530 | goto unlock_mutex; |
2531 | } |
2532 | |
2533 | if (folio_ref_count(folio) > 1) { |
2534 | unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n" , |
2535 | pfn, &unpoison_rs); |
2536 | goto unlock_mutex; |
2537 | } |
2538 | |
2539 | 	if (folio_test_slab(folio) || PageTable(&folio->page) || |
2540 | 	    folio_test_reserved(folio) || PageOffline(&folio->page)) |
2541 | goto unlock_mutex; |
2542 | |
2543 | /* |
2544 | * Note that folio->_mapcount is overloaded in SLAB, so the simple test |
2545 | * in folio_mapped() has to be done after folio_test_slab() is checked. |
2546 | */ |
2547 | if (folio_mapped(folio)) { |
2548 | unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n" , |
2549 | pfn, &unpoison_rs); |
2550 | goto unlock_mutex; |
2551 | } |
2552 | |
2553 | if (folio_mapping(folio)) { |
2554 | unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n" , |
2555 | pfn, &unpoison_rs); |
2556 | goto unlock_mutex; |
2557 | } |
2558 | |
2559 | 	ghp = get_hwpoison_page(p, MF_UNPOISON); |
2560 | 	if (!ghp) { |
2561 | 		if (PageHuge(p)) { |
2562 | 			huge = true; |
2563 | 			count = folio_free_raw_hwp(folio, false); |
2564 | if (count == 0) |
2565 | goto unlock_mutex; |
2566 | } |
2567 | ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY; |
2568 | } else if (ghp < 0) { |
2569 | if (ghp == -EHWPOISON) { |
2570 | 			ret = put_page_back_buddy(p) ? 0 : -EBUSY; |
2571 | } else { |
2572 | ret = ghp; |
2573 | unpoison_pr_info("Unpoison: failed to grab page %#lx\n" , |
2574 | pfn, &unpoison_rs); |
2575 | } |
2576 | } else { |
2577 | 		if (PageHuge(p)) { |
2578 | 			huge = true; |
2579 | 			count = folio_free_raw_hwp(folio, false); |
2580 | if (count == 0) { |
2581 | folio_put(folio); |
2582 | goto unlock_mutex; |
2583 | } |
2584 | } |
2585 | |
2586 | folio_put(folio); |
2587 | 		if (TestClearPageHWPoison(p)) { |
2588 | folio_put(folio); |
2589 | ret = 0; |
2590 | } |
2591 | } |
2592 | |
2593 | unlock_mutex: |
2594 | 	mutex_unlock(&mf_mutex); |
2595 | 	if (!ret) { |
2596 | 		if (!huge) |
2597 | 			num_poisoned_pages_sub(pfn, 1); |
2598 | unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n" , |
2599 | page_to_pfn(p), &unpoison_rs); |
2600 | } |
2601 | return ret; |
2602 | } |
2603 | EXPORT_SYMBOL(unpoison_memory); |
2604 | |
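/*
 * Isolate a page in preparation for migration. Hugetlb, LRU and non-LRU
 * movable pages each take their own isolation path; on success the page
 * ends up on @pagelist.
 */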
2605 | static bool isolate_page(struct page *page, struct list_head *pagelist) |
2606 | { |
2607 | bool isolated = false; |
2608 | |
2609 | if (PageHuge(page)) { |
2610 | 		isolated = isolate_hugetlb(page_folio(page), pagelist); |
2611 | } else { |
2612 | bool lru = !__PageMovable(page); |
2613 | |
2614 | if (lru) |
2615 | isolated = isolate_lru_page(page); |
2616 | else |
2617 | isolated = isolate_movable_page(page, |
2618 | ISOLATE_UNEVICTABLE); |
2619 | |
2620 | if (isolated) { |
2621 | 			list_add(&page->lru, pagelist); |
2622 | if (lru) |
2623 | inc_node_page_state(page, NR_ISOLATED_ANON + |
2624 | page_is_file_lru(page)); |
2625 | } |
2626 | } |
2627 | |
2628 | /* |
2629 | * If we succeed to isolate the page, we grabbed another refcount on |
2630 | * the page, so we can safely drop the one we got from get_any_page(). |
2631 | * If we failed to isolate the page, it means that we cannot go further |
2632 | * and we will return an error, so drop the reference we got from |
2633 | * get_any_page() as well. |
2634 | */ |
2635 | put_page(page); |
2636 | return isolated; |
2637 | } |
2638 | |
2639 | /* |
2640 | * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages. |
2641 | * If the page is a non-dirty unmapped page-cache page, it simply invalidates. |
2642 | * If the page is mapped, it migrates the contents over. |
2643 | */ |
2644 | static int soft_offline_in_use_page(struct page *page) |
2645 | { |
2646 | long ret = 0; |
2647 | unsigned long pfn = page_to_pfn(page); |
2648 | struct page *hpage = compound_head(page); |
2649 | 	char const *msg_page[] = {"page", "hugepage"}; |
2650 | bool huge = PageHuge(page); |
2651 | LIST_HEAD(pagelist); |
2652 | struct migration_target_control mtc = { |
2653 | .nid = NUMA_NO_NODE, |
2654 | .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, |
2655 | }; |
2656 | |
2657 | 	if (!huge && PageTransHuge(hpage)) { |
2658 | if (try_to_split_thp_page(page)) { |
2659 | pr_info("soft offline: %#lx: thp split failed\n" , pfn); |
2660 | return -EBUSY; |
2661 | } |
2662 | hpage = page; |
2663 | } |
2664 | |
2665 | lock_page(page); |
2666 | if (!huge) |
2667 | wait_on_page_writeback(page); |
2668 | if (PageHWPoison(page)) { |
2669 | unlock_page(page); |
2670 | put_page(page); |
2671 | pr_info("soft offline: %#lx page already poisoned\n" , pfn); |
2672 | return 0; |
2673 | } |
2674 | |
2675 | if (!huge && PageLRU(page) && !PageSwapCache(page)) |
2676 | /* |
2677 | * Try to invalidate first. This should work for |
2678 | * non dirty unmapped page cache pages. |
2679 | */ |
2680 | ret = invalidate_inode_page(page); |
2681 | unlock_page(page); |
2682 | |
2683 | if (ret) { |
2684 | pr_info("soft_offline: %#lx: invalidated\n" , pfn); |
2685 | 		page_handle_poison(page, false, true); |
2686 | return 0; |
2687 | } |
2688 | |
2689 | 	if (isolate_page(hpage, &pagelist)) { |
2690 | 		ret = migrate_pages(&pagelist, alloc_migration_target, NULL, |
2691 | 			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); |
2692 | if (!ret) { |
2693 | bool release = !huge; |
2694 | |
2695 | 			if (!page_handle_poison(page, huge, release)) |
2696 | 				ret = -EBUSY; |
2697 | 		} else { |
2698 | 			if (!list_empty(&pagelist)) |
2699 | 				putback_movable_pages(&pagelist); |
2700 | |
2701 | pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n" , |
2702 | pfn, msg_page[huge], ret, &page->flags); |
2703 | if (ret > 0) |
2704 | ret = -EBUSY; |
2705 | } |
2706 | } else { |
2707 | pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n" , |
2708 | pfn, msg_page[huge], page_count(page), &page->flags); |
2709 | ret = -EBUSY; |
2710 | } |
2711 | return ret; |
2712 | } |
2713 | |
2714 | /** |
2715 | * soft_offline_page - Soft offline a page. |
2716 | * @pfn: pfn to soft-offline |
2717 | * @flags: flags. Same as memory_failure(). |
2718 | * |
2719 | * Returns 0 on success |
2720 | * -EOPNOTSUPP for hwpoison_filter() filtered the error event |
2721 | * < 0 otherwise negated errno. |
2722 | * |
2723 | * Soft offline a page, by migration or invalidation, |
2724 | * without killing anything. This is for the case when |
2725 | * a page is not corrupted yet (so it's still valid to access), |
2726 | * but has had a number of corrected errors and is better taken |
2727 | * out. |
2728 | * |
2729 | * The actual policy on when to do that is maintained by |
2730 | * user space. |
2731 | * |
2732 | * This should never impact any application or cause data loss, |
2733 | * however it might take some time. |
2734 | * |
2735 | * This is not a 100% solution for all memory, but tries to be |
2736 | * ``good enough'' for the majority of memory. |
2737 | */ |
2738 | int soft_offline_page(unsigned long pfn, int flags) |
2739 | { |
2740 | int ret; |
2741 | bool try_again = true; |
2742 | struct page *page; |
2743 | |
2744 | if (!pfn_valid(pfn)) { |
2745 | WARN_ON_ONCE(flags & MF_COUNT_INCREASED); |
2746 | return -ENXIO; |
2747 | } |
2748 | |
2749 | /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ |
2750 | page = pfn_to_online_page(pfn); |
2751 | if (!page) { |
2752 | put_ref_page(pfn, flags); |
2753 | return -EIO; |
2754 | } |
2755 | |
2756 | mutex_lock(&mf_mutex); |
2757 | |
2758 | if (PageHWPoison(page)) { |
2759 | pr_info("%s: %#lx page already poisoned\n" , __func__, pfn); |
2760 | put_ref_page(pfn, flags); |
2761 | 		mutex_unlock(&mf_mutex); |
2762 | return 0; |
2763 | } |
2764 | |
2765 | retry: |
2766 | get_online_mems(); |
2767 | 	ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE); |
2768 | put_online_mems(); |
2769 | |
2770 | if (hwpoison_filter(page)) { |
2771 | if (ret > 0) |
2772 | put_page(page); |
2773 | |
2774 | 		mutex_unlock(&mf_mutex); |
2775 | return -EOPNOTSUPP; |
2776 | } |
2777 | |
2778 | if (ret > 0) { |
2779 | ret = soft_offline_in_use_page(page); |
2780 | } else if (ret == 0) { |
2781 | 		if (!page_handle_poison(page, true, false)) { |
2782 | if (try_again) { |
2783 | try_again = false; |
2784 | flags &= ~MF_COUNT_INCREASED; |
2785 | goto retry; |
2786 | } |
2787 | ret = -EBUSY; |
2788 | } |
2789 | } |
2790 | |
2791 | 	mutex_unlock(&mf_mutex); |
2792 | |
2793 | return ret; |
2794 | } |
2795 | |