oom_kill.c source code [linux/mm/oom_kill.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/mm/oom_kill.c
4	*
5	* Copyright (C) 1998,2000 Rik van Riel
6	* Thanks go out to Claus Fischer for some serious inspiration and
7	* for goading me into coding this file...
8	* Copyright (C) 2010 Google, Inc.
9	* Rewritten by David Rientjes
10	*
11	* The routines in this file are used to kill a process when
12	* we're seriously out of memory. This gets called from __alloc_pages()
13	* in mm/page_alloc.c when we really run out of memory.
14	*
15	* Since we won't call these routines often (on a well-configured
16	* machine) this file will double as a 'coding guide' and a signpost
17	* for newbie kernel hackers. It features several pointers to major
18	* kernel subsystems and hints as to where to find out what things do.
19	*/
20
21	#include <linux/oom.h>
22	#include <linux/mm.h>
23	#include <linux/err.h>
24	#include <linux/gfp.h>
25	#include <linux/sched.h>
26	#include <linux/sched/mm.h>
27	#include <linux/sched/task.h>
28	#include <linux/sched/debug.h>
29	#include <linux/swap.h>
30	#include <linux/syscalls.h>
31	#include <linux/timex.h>
32	#include <linux/jiffies.h>
33	#include <linux/cpuset.h>
34	#include <linux/export.h>
35	#include <linux/notifier.h>
36	#include <linux/memcontrol.h>
37	#include <linux/mempolicy.h>
38	#include <linux/security.h>
39	#include <linux/ptrace.h>
40	#include <linux/freezer.h>
41	#include <linux/ftrace.h>
42	#include <linux/ratelimit.h>
43	#include <linux/kthread.h>
44	#include <linux/init.h>
45	#include <linux/mmu_notifier.h>
46	#include <linux/cred.h>
47	#include <linux/nmi.h>
48
49	#include <asm/tlb.h>
50	#include "internal.h"
51	#include "slab.h"
52
53	#define CREATE_TRACE_POINTS
54	#include <trace/events/oom.h>
55
/*
 * Tunables exported through vm_oom_kill_table as "panic_on_oom",
 * "oom_kill_allocating_task" and "oom_dump_tasks". Task dumping is on
 * by default.
 */
static int sysctl_panic_on_oom;
static int sysctl_oom_kill_allocating_task;
static int sysctl_oom_dump_tasks = 1;
59
/*
 * Serializes oom killer invocations (out_of_memory()) from all contexts to
 * prevent from over eager oom killing (e.g. when the oom killer is invoked
 * from different domains).
 *
 * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
 * and mark_oom_victim
 */
DEFINE_MUTEX(oom_lock);
/* Serializes oom_score_adj and oom_score_adj_min updates */
DEFINE_MUTEX(oom_adj_mutex);
71
72	static inline bool is_memcg_oom(struct oom_control *oc)
73	{
74	return oc->memcg != NULL;
75	}
76
77	#ifdef CONFIG_NUMA
78	/**
79	* oom_cpuset_eligible() - check task eligibility for kill
80	* @start: task struct of which task to consider
81	* @oc: pointer to struct oom_control
82	*
83	* Task eligibility is determined by whether or not a candidate task, @tsk,
84	* shares the same mempolicy nodes as current if it is bound by such a policy
85	* and whether or not it has the same set of allowed cpuset nodes.
86	*
87	* This function is assuming oom-killer context and 'current' has triggered
88	* the oom-killer.
89	*/
90	static bool oom_cpuset_eligible(struct task_struct *start,
91	struct oom_control *oc)
92	{
93	struct task_struct *tsk;
94	bool ret = false;
95	const nodemask_t *mask = oc->nodemask;
96
97	rcu_read_lock();
98	for_each_thread(start, tsk) {
99	if (mask) {
100	/*
101	* If this is a mempolicy constrained oom, tsk's
102	* cpuset is irrelevant. Only return true if its
103	* mempolicy intersects current, otherwise it may be
104	* needlessly killed.
105	*/
106	ret = mempolicy_in_oom_domain(tsk, mask);
107	} else {
108	/*
109	* This is not a mempolicy constrained oom, so only
110	* check the mems of tsk's cpuset.
111	*/
112	ret = cpuset_mems_allowed_intersects(current, tsk2: tsk);
113	}
114	if (ret)
115	break;
116	}
117	rcu_read_unlock();
118
119	return ret;
120	}
121	#else
122	static bool oom_cpuset_eligible(struct task_struct tsk, struct* oom_control *oc)
123	{
124	return true;
125	}
126	#endif /* CONFIG_NUMA */
127
128	/*
129	* The process p may have detached its own ->mm while exiting or through
130	* kthread_use_mm(), but one or more of its subthreads may still have a valid
131	* pointer. Return p, or any of its subthreads with a valid ->mm, with
132	* task_lock() held.
133	*/
134	struct task_struct find_lock_task_mm(struct* task_struct *p)
135	{
136	struct task_struct *t;
137
138	rcu_read_lock();
139
140	for_each_thread(p, t) {
141	task_lock(p: t);
142	if (likely(t->mm))
143	goto found;
144	task_unlock(p: t);
145	}
146	t = NULL;
147	found:
148	rcu_read_unlock();
149
150	return t;
151	}
152
153	/*
154	* order == -1 means the oom kill is required by sysrq, otherwise only
155	* for display purposes.
156	*/
157	static inline bool is_sysrq_oom(struct oom_control *oc)
158	{
159	return oc->order == -`1`;
160	}
161
162	/ return true if the task is not adequate as candidate victim task. /
163	static bool oom_unkillable_task(struct task_struct *p)
164	{
165	if (is_global_init(tsk: p))
166	return true;
167	if (p->flags & PF_KTHREAD)
168	return true;
169	return false;
170	}
171
172	/*
173	* Check whether unreclaimable slab amount is greater than
174	* all user memory(LRU pages).
175	* dump_unreclaimable_slab() could help in the case that
176	* oom due to too much unreclaimable slab used by kernel.
177	*/
178	static bool should_dump_unreclaim_slab(void)
179	{
180	unsigned long nr_lru;
181
182	nr_lru = global_node_page_state(item: NR_ACTIVE_ANON) +
183	global_node_page_state(item: NR_INACTIVE_ANON) +
184	global_node_page_state(item: NR_ACTIVE_FILE) +
185	global_node_page_state(item: NR_INACTIVE_FILE) +
186	global_node_page_state(item: NR_ISOLATED_ANON) +
187	global_node_page_state(item: NR_ISOLATED_FILE) +
188	global_node_page_state(item: NR_UNEVICTABLE);
189
190	return (global_node_page_state_pages(item: NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
191	}
192
193	/**
194	* oom_badness - heuristic function to determine which candidate task to kill
195	* @p: task struct of which task we should calculate
196	* @totalpages: total present RAM allowed for page allocation
197	*
198	* The heuristic for determining which task to kill is made to be as simple and
199	* predictable as possible. The goal is to return the highest value for the
200	* task consuming the most memory to avoid subsequent oom failures.
201	*/
202	long oom_badness(struct task_struct p, unsigned* long totalpages)
203	{
204	long points;
205	long adj;
206
207	if (oom_unkillable_task(p))
208	return LONG_MIN;
209
210	p = find_lock_task_mm(p);
211	if (!p)
212	return LONG_MIN;
213
214	/*
215	* Do not even consider tasks which are explicitly marked oom
216	* unkillable or have been already oom reaped or the are in
217	* the middle of vfork
218	*/
219	adj = (long)p->signal->oom_score_adj;
220	if (adj == OOM_SCORE_ADJ_MIN \|\|
221	test_bit(MMF_OOM_SKIP, &p->mm->flags) \|\|
222	in_vfork(tsk: p)) {
223	task_unlock(p);
224	return LONG_MIN;
225	}
226
227	/*
228	* The baseline for the badness score is the proportion of RAM that each
229	* task's rss, pagetable and swap space use.
230	*/
231	points = get_mm_rss(mm: p->mm) + get_mm_counter(mm: p->mm, member: MM_SWAPENTS) +
232	mm_pgtables_bytes(mm: p->mm) / PAGE_SIZE;
233	task_unlock(p);
234
235	/ Normalize to oom_score_adj units /
236	adj *= totalpages / `1000`;
237	points += adj;
238
239	return points;
240	}
241
242	static const char * const oom_constraint_text[] = {
243	[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
244	[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
245	[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
246	[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
247	};
248
249	/*
250	* Determine the type of allocation constraint.
251	*/
252	static enum oom_constraint constrained_alloc(struct oom_control *oc)
253	{
254	struct zone *zone;
255	struct zoneref *z;
256	enum zone_type highest_zoneidx = gfp_zone(flags: oc->gfp_mask);
257	bool cpuset_limited = false;
258	int nid;
259
260	if (is_memcg_oom(oc)) {
261	oc->totalpages = mem_cgroup_get_max(memcg: oc->memcg) ?: `1`;
262	return CONSTRAINT_MEMCG;
263	}
264
265	/ Default to all available memory /
266	oc->totalpages = totalram_pages() + total_swap_pages;
267
268	if (!IS_ENABLED(CONFIG_NUMA))
269	return CONSTRAINT_NONE;
270
271	if (!oc->zonelist)
272	return CONSTRAINT_NONE;
273	/*
274	* Reach here only when __GFP_NOFAIL is used. So, we should avoid
275	* to kill current.We have to random task kill in this case.
276	* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
277	*/
278	if (oc->gfp_mask & __GFP_THISNODE)
279	return CONSTRAINT_NONE;
280
281	/*
282	* This is not a __GFP_THISNODE allocation, so a truncated nodemask in
283	* the page allocator means a mempolicy is in effect. Cpuset policy
284	* is enforced in get_page_from_freelist().
285	*/
286	if (oc->nodemask &&
287	!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
288	oc->totalpages = total_swap_pages;
289	for_each_node_mask(nid, *oc->nodemask)
290	oc->totalpages += node_present_pages(nid);
291	return CONSTRAINT_MEMORY_POLICY;
292	}
293
294	/ Check this allocation failure is caused by cpuset's wall function /
295	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
296	highest_zoneidx, oc->nodemask)
297	if (!cpuset_zone_allowed(z: zone, gfp_mask: oc->gfp_mask))
298	cpuset_limited = true;
299
300	if (cpuset_limited) {
301	oc->totalpages = total_swap_pages;
302	for_each_node_mask(nid, cpuset_current_mems_allowed)
303	oc->totalpages += node_present_pages(nid);
304	return CONSTRAINT_CPUSET;
305	}
306	return CONSTRAINT_NONE;
307	}
308
309	static int oom_evaluate_task(struct task_struct task, void* *arg)
310	{
311	struct oom_control *oc = arg;
312	long points;
313
314	if (oom_unkillable_task(p: task))
315	goto next;
316
317	/ p may not have freeable memory in nodemask /
318	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(start: task, oc))
319	goto next;
320
321	/*
322	* This task already has access to memory reserves and is being killed.
323	* Don't allow any other task to have access to the reserves unless
324	* the task has MMF_OOM_SKIP because chances that it would release
325	* any memory is quite low.
326	*/
327	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(tsk: task)) {
328	if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
329	goto next;
330	goto abort;
331	}
332
333	/*
334	* If task is allocating a lot of memory and has been marked to be
335	* killed first if it triggers an oom, then select it.
336	*/
337	if (oom_task_origin(p: task)) {
338	points = LONG_MAX;
339	goto select;
340	}
341
342	points = oom_badness(p: task, totalpages: oc->totalpages);
343	if (points == LONG_MIN \|\| points < oc->chosen_points)
344	goto next;
345
346	select:
347	if (oc->chosen)
348	put_task_struct(t: oc->chosen);
349	get_task_struct(t: task);
350	oc->chosen = task;
351	oc->chosen_points = points;
352	next:
353	return `0`;
354	abort:
355	if (oc->chosen)
356	put_task_struct(t: oc->chosen);
357	oc->chosen = (void *)-`1UL`;
358	return `1`;
359	}
360
361	/*
362	* Simple selection loop. We choose the process with the highest number of
363	* 'points'. In case scan was aborted, oc->chosen is set to -1.
364	*/
365	static void select_bad_process(struct oom_control *oc)
366	{
367	oc->chosen_points = LONG_MIN;
368
369	if (is_memcg_oom(oc))
370	mem_cgroup_scan_tasks(memcg: oc->memcg, oom_evaluate_task, arg: oc);
371	else {
372	struct task_struct *p;
373
374	rcu_read_lock();
375	for_each_process(p)
376	if (oom_evaluate_task(task: p, arg: oc))
377	break;
378	rcu_read_unlock();
379	}
380	}
381
382	static int dump_task(struct task_struct p, void* *arg)
383	{
384	struct oom_control *oc = arg;
385	struct task_struct *task;
386
387	if (oom_unkillable_task(p))
388	return `0`;
389
390	/ p may not have freeable memory in nodemask /
391	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(start: p, oc))
392	return `0`;
393
394	task = find_lock_task_mm(p);
395	if (!task) {
396	/*
397	* All of p's threads have already detached their mm's. There's
398	* no need to report them; they can't be oom killed anyway.
399	*/
400	return `0`;
401	}
402
403	pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8lu %9lu %8ld %8lu %5hd %s\n",
404	task->pid, from_kuid(&init_user_ns, task_uid(task)),
405	task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
406	get_mm_counter(task->mm, MM_ANONPAGES), get_mm_counter(task->mm, MM_FILEPAGES),
407	get_mm_counter(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm),
408	get_mm_counter(task->mm, MM_SWAPENTS),
409	task->signal->oom_score_adj, task->comm);
410	task_unlock(p: task);
411
412	return `0`;
413	}
414
415	/**
416	* dump_tasks - dump current memory state of all system tasks
417	* @oc: pointer to struct oom_control
418	*
419	* Dumps the current memory state of all eligible tasks. Tasks not in the same
420	* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
421	* are not shown.
422	* State information includes task's pid, uid, tgid, vm size, rss,
423	* pgtables_bytes, swapents, oom_score_adj value, and name.
424	*/
425	static void dump_tasks(struct oom_control *oc)
426	{
427	pr_info("Tasks state (memory values in pages):\n");
428	pr_info("[ pid ] uid tgid total_vm rss rss_anon rss_file rss_shmem pgtables_bytes swapents oom_score_adj name\n");
429
430	if (is_memcg_oom(oc))
431	mem_cgroup_scan_tasks(memcg: oc->memcg, dump_task, arg: oc);
432	else {
433	struct task_struct *p;
434	int i = `0`;
435
436	rcu_read_lock();
437	for_each_process(p) {
438	/ Avoid potential softlockup warning /
439	if ((++i & `1023`) == `0`)
440	touch_softlockup_watchdog();
441	dump_task(p, arg: oc);
442	}
443	rcu_read_unlock();
444	}
445	}
446
447	static void dump_oom_victim(struct oom_control oc, struct* task_struct *victim)
448	{
449	/ one line summary of the oom killer context. /
450	pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
451	oom_constraint_text[oc->constraint],
452	nodemask_pr_args(oc->nodemask));
453	cpuset_print_current_mems_allowed();
454	mem_cgroup_print_oom_context(memcg: oc->memcg, p: victim);
455	pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
456	from_kuid(&init_user_ns, task_uid(victim)));
457	}
458
459	static void dump_header(struct oom_control *oc)
460	{
461	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
462	current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
463	current->signal->oom_score_adj);
464	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
465	pr_warn("COMPACTION is disabled!!!\n");
466
467	dump_stack();
468	if (is_memcg_oom(oc))
469	mem_cgroup_print_oom_meminfo(memcg: oc->memcg);
470	else {
471	__show_mem(SHOW_MEM_FILTER_NODES, nodemask: oc->nodemask, max_zone_idx: gfp_zone(flags: oc->gfp_mask));
472	if (should_dump_unreclaim_slab())
473	dump_unreclaimable_slab();
474	}
475	if (sysctl_oom_dump_tasks)
476	dump_tasks(oc);
477	}
478
479	/*
480	* Number of OOM victims in flight
481	*/
482	static atomic_t oom_victims = ATOMIC_INIT(`0`);
483	static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
484
485	static bool oom_killer_disabled __read_mostly;
486
487	/*
488	* task->mm can be NULL if the task is the exited group leader. So to
489	* determine whether the task is using a particular mm, we examine all the
490	* task's threads: if one of those is using this mm then this task was also
491	* using it.
492	*/
493	bool process_shares_mm(struct task_struct p, struct* mm_struct *mm)
494	{
495	struct task_struct *t;
496
497	for_each_thread(p, t) {
498	struct mm_struct *t_mm = READ_ONCE(t->mm);
499	if (t_mm)
500	return t_mm == mm;
501	}
502	return false;
503	}
504
505	#ifdef CONFIG_MMU
506	/*
507	* OOM Reaper kernel thread which tries to reap the memory used by the OOM
508	* victim (if that is possible) to help the OOM killer to move on.
509	*/
510	static struct task_struct *oom_reaper_th;
511	static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
512	static struct task_struct *oom_reaper_list;
513	static DEFINE_SPINLOCK(oom_reaper_lock);
514
515	static bool __oom_reap_task_mm(struct mm_struct *mm)
516	{
517	struct vm_area_struct *vma;
518	bool ret = true;
519	VMA_ITERATOR(vmi, mm, `0`);
520
521	/*
522	* Tell all users of get_user/copy_from_user etc... that the content
523	* is no longer stable. No barriers really needed because unmapping
524	* should imply barriers already and the reader would hit a page fault
525	* if it stumbled over a reaped memory.
526	*/
527	set_bit(MMF_UNSTABLE, addr: &mm->flags);
528
529	for_each_vma(vmi, vma) {
530	if (vma->vm_flags & (VM_HUGETLB\|VM_PFNMAP))
531	continue;
532
533	/*
534	* Only anonymous pages have a good chance to be dropped
535	* without additional steps which we cannot afford as we
536	* are OOM already.
537	*
538	* We do not even care about fs backed pages because all
539	* which are reclaimable have already been reclaimed and
540	* we do not want to block exit_mmap by keeping mm ref
541	* count elevated without a good reason.
542	*/
543	if (vma_is_anonymous(vma) \|\| !(vma->vm_flags & VM_SHARED)) {
544	struct mmu_notifier_range range;
545	struct mmu_gather tlb;
546
547	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_UNMAP, flags: `0`,
548	mm, start: vma->vm_start,
549	end: vma->vm_end);
550	tlb_gather_mmu(tlb: &tlb, mm);
551	if (mmu_notifier_invalidate_range_start_nonblock(range: &range)) {
552	tlb_finish_mmu(tlb: &tlb);
553	ret = false;
554	continue;
555	}
556	unmap_page_range(tlb: &tlb, vma, addr: range.start, end: range.end, NULL);
557	mmu_notifier_invalidate_range_end(range: &range);
558	tlb_finish_mmu(tlb: &tlb);
559	}
560	}
561
562	return ret;
563	}
564
565	/*
566	* Reaps the address space of the given task.
567	*
568	* Returns true on success and false if none or part of the address space
569	* has been reclaimed and the caller should retry later.
570	*/
571	static bool oom_reap_task_mm(struct task_struct tsk, struct* mm_struct *mm)
572	{
573	bool ret = true;
574
575	if (!mmap_read_trylock(mm)) {
576	trace_skip_task_reaping(pid: tsk->pid);
577	return false;
578	}
579
580	/*
581	* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
582	* work on the mm anymore. The check for MMF_OOM_SKIP must run
583	* under mmap_lock for reading because it serializes against the
584	* mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
585	*/
586	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
587	trace_skip_task_reaping(pid: tsk->pid);
588	goto out_unlock;
589	}
590
591	trace_start_task_reaping(pid: tsk->pid);
592
593	/ failed to reap part of the address space. Try again later /
594	ret = __oom_reap_task_mm(mm);
595	if (!ret)
596	goto out_finish;
597
598	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
599	task_pid_nr(tsk), tsk->comm,
600	K(get_mm_counter(mm, MM_ANONPAGES)),
601	K(get_mm_counter(mm, MM_FILEPAGES)),
602	K(get_mm_counter(mm, MM_SHMEMPAGES)));
603	out_finish:
604	trace_finish_task_reaping(pid: tsk->pid);
605	out_unlock:
606	mmap_read_unlock(mm);
607
608	return ret;
609	}
610
611	#define MAX_OOM_REAP_RETRIES 10
612	static void oom_reap_task(struct task_struct *tsk)
613	{
614	int attempts = `0`;
615	struct mm_struct *mm = tsk->signal->oom_mm;
616
617	/ Retry the mmap_read_trylock(mm) a few times /
618	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
619	schedule_timeout_idle(HZ/`10`);
620
621	if (attempts <= MAX_OOM_REAP_RETRIES \|\|
622	test_bit(MMF_OOM_SKIP, &mm->flags))
623	goto done;
624
625	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
626	task_pid_nr(tsk), tsk->comm);
627	sched_show_task(p: tsk);
628	debug_show_all_locks();
629
630	done:
631	tsk->oom_reaper_list = NULL;
632
633	/*
634	* Hide this mm from OOM killer because it has been either reaped or
635	* somebody can't call mmap_write_unlock(mm).
636	*/
637	set_bit(MMF_OOM_SKIP, addr: &mm->flags);
638
639	/ Drop a reference taken by queue_oom_reaper /
640	put_task_struct(t: tsk);
641	}
642
643	static int oom_reaper(void *unused)
644	{
645	set_freezable();
646
647	while (true) {
648	struct task_struct *tsk = NULL;
649
650	wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
651	spin_lock_irq(lock: &oom_reaper_lock);
652	if (oom_reaper_list != NULL) {
653	tsk = oom_reaper_list;
654	oom_reaper_list = tsk->oom_reaper_list;
655	}
656	spin_unlock_irq(lock: &oom_reaper_lock);
657
658	if (tsk)
659	oom_reap_task(tsk);
660	}
661
662	return `0`;
663	}
664
665	static void wake_oom_reaper(struct timer_list *timer)
666	{
667	struct task_struct tsk = container_of(timer, struct* task_struct,
668	oom_reaper_timer);
669	struct mm_struct *mm = tsk->signal->oom_mm;
670	unsigned long flags;
671
672	/ The victim managed to terminate on its own - see exit_mmap /
673	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
674	put_task_struct(t: tsk);
675	return;
676	}
677
678	spin_lock_irqsave(&oom_reaper_lock, flags);
679	tsk->oom_reaper_list = oom_reaper_list;
680	oom_reaper_list = tsk;
681	spin_unlock_irqrestore(lock: &oom_reaper_lock, flags);
682	trace_wake_reaper(pid: tsk->pid);
683	wake_up(&oom_reaper_wait);
684	}
685
686	/*
687	* Give the OOM victim time to exit naturally before invoking the oom_reaping.
688	* The timers timeout is arbitrary... the longer it is, the longer the worst
689	* case scenario for the OOM can take. If it is too small, the oom_reaper can
690	* get in the way and release resources needed by the process exit path.
691	* e.g. The futex robust list can sit in Anon\|Private memory that gets reaped
692	* before the exit path is able to wake the futex waiters.
693	*/
694	#define OOM_REAPER_DELAY (2*HZ)
695	static void queue_oom_reaper(struct task_struct *tsk)
696	{
697	/ mm is already queued? /
698	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, addr: &tsk->signal->oom_mm->flags))
699	return;
700
701	get_task_struct(t: tsk);
702	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, `0`);
703	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
704	add_timer(timer: &tsk->oom_reaper_timer);
705	}
706
#ifdef CONFIG_SYSCTL
/*
 * sysctl entries registered under "vm" by oom_init(). panic_on_oom is
 * range-checked between SYSCTL_ZERO and SYSCTL_TWO; the other two entries
 * accept any int.
 */
static const struct ctl_table vm_oom_kill_table[] = {
	{
		.procname	= "panic_on_oom",
		.data		= &sysctl_panic_on_oom,
		.maxlen		= sizeof(sysctl_panic_on_oom),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "oom_kill_allocating_task",
		.data		= &sysctl_oom_kill_allocating_task,
		.maxlen		= sizeof(sysctl_oom_kill_allocating_task),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "oom_dump_tasks",
		.data		= &sysctl_oom_dump_tasks,
		.maxlen		= sizeof(sysctl_oom_dump_tasks),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};
#endif
734
735	static int __init oom_init(void)
736	{
737	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
738	#ifdef CONFIG_SYSCTL
739	register_sysctl_init("vm", vm_oom_kill_table);
740	#endif
741	return `0`;
742	}
743	subsys_initcall(oom_init)
744	#else
745	static inline void queue_oom_reaper(struct task_struct *tsk)
746	{
747	}
748	#endif /* CONFIG_MMU */
749
750	/**
751	* mark_oom_victim - mark the given task as OOM victim
752	* @tsk: task to mark
753	*
754	* Has to be called with oom_lock held and never after
755	* oom has been disabled already.
756	*
757	* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
758	* under task_lock or operate on the current).
759	*/
760	static void mark_oom_victim(struct task_struct *tsk)
761	{
762	const struct cred *cred;
763	struct mm_struct *mm = tsk->mm;
764
765	WARN_ON(oom_killer_disabled);
766	/ OOM killer might race with memcg OOM /
767	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
768	return;
769
770	/ oom_mm is bound to the signal struct life time. /
771	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
772	mmgrab(mm: tsk->signal->oom_mm);
773
774	/*
775	* Make sure that the task is woken up from uninterruptible sleep
776	* if it is frozen because OOM killer wouldn't be able to free
777	* any memory and livelock. freezing_slow_path will tell the freezer
778	* that TIF_MEMDIE tasks should be ignored.
779	*/
780	__thaw_task(t: tsk);
781	atomic_inc(v: &oom_victims);
782	cred = get_task_cred(tsk);
783	trace_mark_victim(task: tsk, uid: cred->uid.val);
784	put_cred(cred);
785	}
786
787	/**
788	* exit_oom_victim - note the exit of an OOM victim
789	*/
790	void exit_oom_victim(void)
791	{
792	clear_thread_flag(TIF_MEMDIE);
793
794	if (!atomic_dec_return(v: &oom_victims))
795	wake_up_all(&oom_victims_wait);
796	}
797
798	/**
799	* oom_killer_enable - enable OOM killer
800	*/
801	void oom_killer_enable(void)
802	{
803	oom_killer_disabled = false;
804	pr_info("OOM killer enabled.\n");
805	}
806
807	/**
808	* oom_killer_disable - disable OOM killer
809	* @timeout: maximum timeout to wait for oom victims in jiffies
810	*
811	* Forces all page allocations to fail rather than trigger OOM killer.
812	* Will block and wait until all OOM victims are killed or the given
813	* timeout expires.
814	*
815	* The function cannot be called when there are runnable user tasks because
816	* the userspace would see unexpected allocation failures as a result. Any
817	* new usage of this function should be consulted with MM people.
818	*
819	* Returns true if successful and false if the OOM killer cannot be
820	* disabled.
821	*/
822	bool oom_killer_disable(signed long timeout)
823	{
824	signed long ret;
825
826	/*
827	* Make sure to not race with an ongoing OOM killer. Check that the
828	* current is not killed (possibly due to sharing the victim's memory).
829	*/
830	if (mutex_lock_killable(&oom_lock))
831	return false;
832	oom_killer_disabled = true;
833	mutex_unlock(lock: &oom_lock);
834
835	ret = wait_event_interruptible_timeout(oom_victims_wait,
836	!atomic_read(&oom_victims), timeout);
837	if (ret <= `0`) {
838	oom_killer_enable();
839	return false;
840	}
841	pr_info("OOM killer disabled.\n");
842
843	return true;
844	}
845
846	static inline bool __task_will_free_mem(struct task_struct *task)
847	{
848	struct signal_struct *sig = task->signal;
849
850	/*
851	* A coredumping process may sleep for an extended period in
852	* coredump_task_exit(), so the oom killer cannot assume that
853	* the process will promptly exit and release memory.
854	*/
855	if (sig->core_state)
856	return false;
857
858	if (sig->flags & SIGNAL_GROUP_EXIT)
859	return true;
860
861	if (thread_group_empty(p: task) && (task->flags & PF_EXITING))
862	return true;
863
864	return false;
865	}
866
867	/*
868	* Checks whether the given task is dying or exiting and likely to
869	* release its address space. This means that all threads and processes
870	* sharing the same mm have to be killed or exiting.
871	* Caller has to make sure that task->mm is stable (hold task_lock or
872	* it operates on the current).
873	*/
874	static bool task_will_free_mem(struct task_struct *task)
875	{
876	struct mm_struct *mm = task->mm;
877	struct task_struct *p;
878	bool ret = true;
879
880	/*
881	* Skip tasks without mm because it might have passed its exit_mm and
882	* exit_oom_victim. oom_reaper could have rescued that but do not rely
883	* on that for now. We can consider find_lock_task_mm in future.
884	*/
885	if (!mm)
886	return false;
887
888	if (!__task_will_free_mem(task))
889	return false;
890
891	/*
892	* This task has already been drained by the oom reaper so there are
893	* only small chances it will free some more
894	*/
895	if (test_bit(MMF_OOM_SKIP, &mm->flags))
896	return false;
897
898	if (atomic_read(v: &mm->mm_users) <= `1`)
899	return true;
900
901	/*
902	* Make sure that all tasks which share the mm with the given tasks
903	* are dying as well to make sure that a) nobody pins its mm and
904	* b) the task is also reapable by the oom reaper.
905	*/
906	rcu_read_lock();
907	for_each_process(p) {
908	if (!process_shares_mm(p, mm))
909	continue;
910	if (same_thread_group(p1: task, p2: p))
911	continue;
912	ret = __task_will_free_mem(task: p);
913	if (!ret)
914	break;
915	}
916	rcu_read_unlock();
917
918	return ret;
919	}
920
921	static void __oom_kill_process(struct task_struct victim, const* char *message)
922	{
923	struct task_struct *p;
924	struct mm_struct *mm;
925	bool can_oom_reap = true;
926
927	p = find_lock_task_mm(p: victim);
928	if (!p) {
929	pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
930	message, task_pid_nr(victim), victim->comm);
931	put_task_struct(t: victim);
932	return;
933	} else if (victim != p) {
934	get_task_struct(t: p);
935	put_task_struct(t: victim);
936	victim = p;
937	}
938
939	/ Get a reference to safely compare mm after task_unlock(victim) /
940	mm = victim->mm;
941	mmgrab(mm);
942
943	/ Raise event before sending signal: task reaper must see this /
944	count_vm_event(item: OOM_KILL);
945	memcg_memory_event_mm(mm, event: MEMCG_OOM_KILL);
946
947	/*
948	* We should send SIGKILL before granting access to memory reserves
949	* in order to prevent the OOM victim from depleting the memory
950	* reserves from the user space under its control.
951	*/
952	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p: victim, type: PIDTYPE_TGID);
953	mark_oom_victim(tsk: victim);
954	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
955	message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
956	K(get_mm_counter(mm, MM_ANONPAGES)),
957	K(get_mm_counter(mm, MM_FILEPAGES)),
958	K(get_mm_counter(mm, MM_SHMEMPAGES)),
959	from_kuid(&init_user_ns, task_uid(victim)),
960	mm_pgtables_bytes(mm) >> `10`, victim->signal->oom_score_adj);
961	task_unlock(p: victim);
962
963	/*
964	* Kill all user processes sharing victim->mm in other thread groups, if
965	* any. They don't get access to memory reserves, though, to avoid
966	* depletion of all memory. This prevents mm->mmap_lock livelock when an
967	* oom killed thread cannot exit because it requires the semaphore and
968	* its contended by another thread trying to allocate memory itself.
969	* That thread will now get access to memory reserves since it has a
970	* pending fatal signal.
971	*/
972	rcu_read_lock();
973	for_each_process(p) {
974	if (!process_shares_mm(p, mm))
975	continue;
976	if (same_thread_group(p1: p, p2: victim))
977	continue;
978	if (is_global_init(tsk: p)) {
979	can_oom_reap = false;
980	set_bit(MMF_OOM_SKIP, addr: &mm->flags);
981	pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
982	task_pid_nr(victim), victim->comm,
983	task_pid_nr(p), p->comm);
984	continue;
985	}
986	/*
987	* No kthread_use_mm() user needs to read from the userspace so
988	* we are ok to reap it.
989	*/
990	if (unlikely(p->flags & PF_KTHREAD))
991	continue;
992	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, type: PIDTYPE_TGID);
993	}
994	rcu_read_unlock();
995
996	if (can_oom_reap)
997	queue_oom_reaper(tsk: victim);
998
999	mmdrop(mm);
1000	put_task_struct(t: victim);
1001	}
1002
1003	/*
1004	* Kill provided task unless it's secured by setting
1005	* oom_score_adj to OOM_SCORE_ADJ_MIN.
1006	*/
1007	static int oom_kill_memcg_member(struct task_struct task, void* *message)
1008	{
1009	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
1010	!is_global_init(tsk: task)) {
1011	get_task_struct(t: task);
1012	__oom_kill_process(victim: task, message);
1013	}
1014	return `0`;
1015	}
1016
1017	static void oom_kill_process(struct oom_control oc, const* char *message)
1018	{
1019	struct task_struct *victim = oc->chosen;
1020	struct mem_cgroup *oom_group;
1021	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
1022	DEFAULT_RATELIMIT_BURST);
1023
1024	/*
1025	* If the task is already exiting, don't alarm the sysadmin or kill
1026	* its children or threads, just give it access to memory reserves
1027	* so it can die quickly
1028	*/
1029	task_lock(p: victim);
1030	if (task_will_free_mem(task: victim)) {
1031	mark_oom_victim(tsk: victim);
1032	queue_oom_reaper(tsk: victim);
1033	task_unlock(p: victim);
1034	put_task_struct(t: victim);
1035	return;
1036	}
1037	task_unlock(p: victim);
1038
1039	if (__ratelimit(&oom_rs)) {
1040	dump_header(oc);
1041	dump_oom_victim(oc, victim);
1042	}
1043
1044	/*
1045	* Do we need to kill the entire memory cgroup?
1046	* Or even one of the ancestor memory cgroups?
1047	* Check this out before killing the victim task.
1048	*/
1049	oom_group = mem_cgroup_get_oom_group(victim, oom_domain: oc->memcg);
1050
1051	__oom_kill_process(victim, message);
1052
1053	/*
1054	* If necessary, kill all tasks in the selected memory cgroup.
1055	*/
1056	if (oom_group) {
1057	memcg_memory_event(memcg: oom_group, event: MEMCG_OOM_GROUP_KILL);
1058	mem_cgroup_print_oom_group(memcg: oom_group);
1059	mem_cgroup_scan_tasks(memcg: oom_group, oom_kill_memcg_member,
1060	arg: (void *)message);
1061	mem_cgroup_put(memcg: oom_group);
1062	}
1063	}
1064
1065	/*
1066	* Determines whether the kernel must panic because of the panic_on_oom sysctl.
1067	*/
1068	static void check_panic_on_oom(struct oom_control *oc)
1069	{
1070	if (likely(!sysctl_panic_on_oom))
1071	return;
1072	if (sysctl_panic_on_oom != `2`) {
1073	/*
1074	* panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
1075	* does not panic for cpuset, mempolicy, or memcg allocation
1076	* failures.
1077	*/
1078	if (oc->constraint != CONSTRAINT_NONE)
1079	return;
1080	}
1081	/ Do not panic for oom kills triggered by sysrq /
1082	if (is_sysrq_oom(oc))
1083	return;
1084	dump_header(oc);
1085	panic(fmt: "Out of memory: %s panic_on_oom is enabled\n",
1086	sysctl_panic_on_oom == `2` ? "compulsory" : "system-wide");
1087	}
1088
/*
 * Notifier chain called by out_of_memory() for non-memcg OOMs before a
 * victim is selected.  Callbacks receive a pointer to an unsigned long
 * ("freed"); if it is non-zero afterwards, out_of_memory() returns without
 * killing anything (unless the OOM was requested via sysrq).
 */
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
1090
1091	int register_oom_notifier(struct notifier_block *nb)
1092	{
1093	return blocking_notifier_chain_register(nh: &oom_notify_list, nb);
1094	}
1095	EXPORT_SYMBOL_GPL(register_oom_notifier);
1096
1097	int unregister_oom_notifier(struct notifier_block *nb)
1098	{
1099	return blocking_notifier_chain_unregister(nh: &oom_notify_list, nb);
1100	}
1101	EXPORT_SYMBOL_GPL(unregister_oom_notifier);
1102
1103	/**
1104	* out_of_memory - kill the "best" process when we run out of memory
1105	* @oc: pointer to struct oom_control
1106	*
1107	* If we run out of memory, we have the choice between either
1108	* killing a random task (bad), letting the system crash (worse)
1109	* OR try to be smart about which process to kill. Note that we
1110	* don't have to be perfect here, we just have to be good.
1111	*/
1112	bool out_of_memory(struct oom_control *oc)
1113	{
1114	unsigned long freed = `0`;
1115
1116	if (oom_killer_disabled)
1117	return false;
1118
1119	if (!is_memcg_oom(oc)) {
1120	blocking_notifier_call_chain(nh: &oom_notify_list, val: `0`, v: &freed);
1121	if (freed > `0` && !is_sysrq_oom(oc))
1122	/ Got some memory back in the last second. /
1123	return true;
1124	}
1125
1126	/*
1127	* If current has a pending SIGKILL or is exiting, then automatically
1128	* select it. The goal is to allow it to allocate so that it may
1129	* quickly exit and free its memory.
1130	*/
1131	if (task_will_free_mem(current)) {
1132	mark_oom_victim(current);
1133	queue_oom_reaper(current);
1134	return true;
1135	}
1136
1137	/*
1138	* The OOM killer does not compensate for IO-less reclaim.
1139	* But mem_cgroup_oom() has to invoke the OOM killer even
1140	* if it is a GFP_NOFS allocation.
1141	*/
1142	if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
1143	return true;
1144
1145	/*
1146	* Check if there were limitations on the allocation (only relevant for
1147	* NUMA and memcg) that may require different handling.
1148	*/
1149	oc->constraint = constrained_alloc(oc);
1150	if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
1151	oc->nodemask = NULL;
1152	check_panic_on_oom(oc);
1153
1154	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
1155	current->mm && !oom_unkillable_task(current) &&
1156	oom_cpuset_eligible(current, oc) &&
1157	current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
1158	get_task_struct(current);
1159	oc->chosen = current;
1160	oom_kill_process(oc, message: "Out of memory (oom_kill_allocating_task)");
1161	return true;
1162	}
1163
1164	select_bad_process(oc);
1165	/ Found nothing?!?! /
1166	if (!oc->chosen) {
1167	dump_header(oc);
1168	pr_warn("Out of memory and no killable processes...\n");
1169	/*
1170	* If we got here due to an actual allocation at the
1171	* system level, we cannot survive this and will enter
1172	* an endless loop in the allocator. Bail out now.
1173	*/
1174	if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
1175	panic(fmt: "System is deadlocked on memory\n");
1176	}
1177	if (oc->chosen && oc->chosen != (void *)-`1UL`)
1178	oom_kill_process(oc, message: !is_memcg_oom(oc) ? "Out of memory" :
1179	"Memory cgroup out of memory");
1180	return !!oc->chosen;
1181	}
1182
1183	/*
1184	* The pagefault handler calls here because some allocation has failed. We have
1185	* to take care of the memcg OOM here because this is the only safe context without
1186	* any locks held but let the oom killer triggered from the allocation context care
1187	* about the global OOM.
1188	*/
1189	void pagefault_out_of_memory(void)
1190	{
1191	static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
1192	DEFAULT_RATELIMIT_BURST);
1193
1194	if (mem_cgroup_oom_synchronize(wait: true))
1195	return;
1196
1197	if (fatal_signal_pending(current))
1198	return;
1199
1200	if (__ratelimit(&pfoom_rs))
1201	pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
1202	}
1203
1204	SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
1205	{
1206	#ifdef CONFIG_MMU
1207	struct mm_struct *mm = NULL;
1208	struct task_struct *task;
1209	struct task_struct *p;
1210	unsigned int f_flags;
1211	bool reap = false;
1212	long ret = `0`;
1213
1214	if (flags)
1215	return -EINVAL;
1216
1217	task = pidfd_get_task(pidfd, flags: &f_flags);
1218	if (IS_ERR(ptr: task))
1219	return PTR_ERR(ptr: task);
1220
1221	/*
1222	* Make sure to choose a thread which still has a reference to mm
1223	* during the group exit
1224	*/
1225	p = find_lock_task_mm(p: task);
1226	if (!p) {
1227	ret = -ESRCH;
1228	goto put_task;
1229	}
1230
1231	mm = p->mm;
1232	mmgrab(mm);
1233
1234	if (task_will_free_mem(task: p))
1235	reap = true;
1236	else {
1237	/ Error only if the work has not been done already /
1238	if (!test_bit(MMF_OOM_SKIP, &mm->flags))
1239	ret = -EINVAL;
1240	}
1241	task_unlock(p);
1242
1243	if (!reap)
1244	goto drop_mm;
1245
1246	if (mmap_read_lock_killable(mm)) {
1247	ret = -EINTR;
1248	goto drop_mm;
1249	}
1250	/*
1251	* Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
1252	* possible change in exit_mmap is seen
1253	*/
1254	if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
1255	ret = -EAGAIN;
1256	mmap_read_unlock(mm);
1257
1258	drop_mm:
1259	mmdrop(mm);
1260	put_task:
1261	put_task_struct(t: task);
1262	return ret;
1263	#else
1264	return -ENOSYS;
1265	#endif /* CONFIG_MMU */
1266	}
1267

/* source code of linux/mm/oom_kill.c */