vmstat.c source code [linux/mm/vmstat.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/mm/vmstat.c
4	*
5	* Manages VM statistics
6	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
7	*
8	* zoned VM statistics
9	* Copyright (C) 2006 Silicon Graphics, Inc.,
10	* Christoph Lameter <cl@gentwo.org>
11	* Copyright (C) 2008-2014 Christoph Lameter
12	*/
13	#include <linux/fs.h>
14	#include <linux/mm.h>
15	#include <linux/err.h>
16	#include <linux/module.h>
17	#include <linux/slab.h>
18	#include <linux/cpu.h>
19	#include <linux/cpumask.h>
20	#include <linux/vmstat.h>
21	#include <linux/proc_fs.h>
22	#include <linux/seq_file.h>
23	#include <linux/debugfs.h>
24	#include <linux/sched.h>
25	#include <linux/math64.h>
26	#include <linux/writeback.h>
27	#include <linux/compaction.h>
28	#include <linux/mm_inline.h>
29	#include <linux/page_owner.h>
30	#include <linux/sched/isolation.h>
31
32	#include "internal.h"
33
34	#ifdef CONFIG_PROC_FS
35	#ifdef CONFIG_NUMA
/* Value of sysctl_vm_numa_stat that selects "NUMA statistics enabled". */
#define ENABLE_NUMA_STAT   1
/* Current setting of the switch; compared by sysctl_vm_numa_stat_handler(). */
static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
38
39	/ zero numa counters within a zone /
40	static void zero_zone_numa_counters(struct zone *zone)
41	{
42	int item, cpu;
43
44	for (item = `0`; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
45	atomic_long_set(v: &zone->vm_numa_event[item], i: `0`);
46	for_each_online_cpu(cpu) {
47	per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
48	= `0`;
49	}
50	}
51	}
52
53	/ zero numa counters of all the populated zones /
54	static void zero_zones_numa_counters(void)
55	{
56	struct zone *zone;
57
58	for_each_populated_zone(zone)
59	zero_zone_numa_counters(zone);
60	}
61
62	/ zero global numa counters /
63	static void zero_global_numa_counters(void)
64	{
65	int item;
66
67	for (item = `0`; item < NR_VM_NUMA_EVENT_ITEMS; item++)
68	atomic_long_set(v: &vm_numa_event[item], i: `0`);
69	}
70
/* Invalidate all NUMA statistics by zeroing the per-zone and global counters. */
static void invalid_numa_statistics(void)
{
	zero_zones_numa_counters();
	zero_global_numa_counters();
}
76
77	static DEFINE_MUTEX(vm_numa_stat_lock);
78
79	static int sysctl_vm_numa_stat_handler(const struct ctl_table table, int* write,
80	void buffer, size_t length, loff_t *ppos)
81	{
82	int ret, oldval;
83
84	mutex_lock(&vm_numa_stat_lock);
85	if (write)
86	oldval = sysctl_vm_numa_stat;
87	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
88	if (ret \|\| !write)
89	goto out;
90
91	if (oldval == sysctl_vm_numa_stat)
92	goto out;
93	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
94	static_branch_enable(&vm_numa_stat_key);
95	pr_info("enable numa statistics\n");
96	} else {
97	static_branch_disable(&vm_numa_stat_key);
98	invalid_numa_statistics();
99	pr_info("disable numa statistics, and clear numa counters\n");
100	}
101
102	out:
103	mutex_unlock(lock: &vm_numa_stat_lock);
104	return ret;
105	}
106	#endif
107	#endif /* CONFIG_PROC_FS */
108
109	#ifdef CONFIG_VM_EVENT_COUNTERS
110	DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{`0`}};
111	EXPORT_PER_CPU_SYMBOL(vm_event_states);
112
113	static void sum_vm_events(unsigned long *ret)
114	{
115	int cpu;
116	int i;
117
118	memset(ret, `0`, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
119
120	for_each_online_cpu(cpu) {
121	struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
122
123	for (i = `0`; i < NR_VM_EVENT_ITEMS; i++)
124	ret[i] += this->event[i];
125	}
126	}
127
/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 *
 * The summation runs under cpus_read_lock().
 */
void all_vm_events(unsigned long *ret)
{
	cpus_read_lock();
	sum_vm_events(ret);
	cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(all_vm_events);
140
141	/*
142	* Fold the foreign cpu events into our own.
143	*
144	* This is adding to the events on one processor
145	* but keeps the global counts constant.
146	*/
147	void vm_events_fold_cpu(int cpu)
148	{
149	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
150	int i;
151
152	for (i = `0`; i < NR_VM_EVENT_ITEMS; i++) {
153	count_vm_events(item: i, delta: fold_state->event[i]);
154	fold_state->event[i] = `0`;
155	}
156	}
157
158	#endif /* CONFIG_VM_EVENT_COUNTERS */
159
160	/*
161	* Manage combined zone based / global counters
162	*
163	* vm_stat contains the global counters
164	*/
165	atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
166	atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
167	atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
168	EXPORT_SYMBOL(vm_zone_stat);
169	EXPORT_SYMBOL(vm_node_stat);
170
171	#ifdef CONFIG_NUMA
172	static void fold_vm_zone_numa_events(struct zone *zone)
173	{
174	unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { `0`, };
175	int cpu;
176	enum numa_stat_item item;
177
178	for_each_online_cpu(cpu) {
179	struct per_cpu_zonestat *pzstats;
180
181	pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
182	for (item = `0`; item < NR_VM_NUMA_EVENT_ITEMS; item++)
183	zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], `0`);
184	}
185
186	for (item = `0`; item < NR_VM_NUMA_EVENT_ITEMS; item++)
187	zone_numa_event_add(x: zone_numa_events[item], zone, item);
188	}
189
190	void fold_vm_numa_events(void)
191	{
192	struct zone *zone;
193
194	for_each_populated_zone(zone)
195	fold_vm_zone_numa_events(zone);
196	}
197	#endif
198
199	#ifdef CONFIG_SMP
200
/*
 * Compute the reduced per-cpu stat threshold for @zone: the distance
 * between the low and min watermarks divided by the number of online CPUs,
 * clamped to the range [1, 125].
 */
int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
224
225	int calculate_normal_threshold(struct zone *zone)
226	{
227	int threshold;
228	int mem; / memory in 128 MB units /
229
230	/*
231	* The threshold scales with the number of processors and the amount
232	* of memory per zone. More memory means that we can defer updates for
233	* longer, more processors could lead to more contention.
234	* fls() is used to have a cheap way of logarithmic scaling.
235	*
236	* Some sample thresholds:
237	*
238	* Threshold Processors (fls) Zonesize fls(mem)+1
239	* ------------------------------------------------------------------
240	* 8 1 1 0.9-1 GB 4
241	* 16 2 2 0.9-1 GB 4
242	* 20 2 2 1-2 GB 5
243	* 24 2 2 2-4 GB 6
244	* 28 2 2 4-8 GB 7
245	* 32 2 2 8-16 GB 8
246	* 4 2 2 <128M 1
247	* 30 4 3 2-4 GB 5
248	* 48 4 3 8-16 GB 8
249	* 32 8 4 1-2 GB 4
250	* 32 8 4 0.9-1GB 4
251	* 10 16 5 <128M 1
252	* 40 16 5 900M 4
253	* 70 64 7 2-4 GB 5
254	* 84 64 7 4-8 GB 6
255	* 108 512 9 4-8 GB 6
256	* 125 1024 10 8-16 GB 8
257	* 125 1024 10 16-32 GB 9
258	*/
259
260	mem = zone_managed_pages(zone) >> (`27` - PAGE_SHIFT);
261
262	threshold = `2` * fls(x: num_online_cpus()) * (`1` + fls(x: mem));
263
264	/*
265	* Maximum threshold is 125
266	*/
267	threshold = min(`125`, threshold);
268
269	return threshold;
270	}
271
272	/*
273	* Refresh the thresholds for each zone.
274	*/
275	void refresh_zone_stat_thresholds(void)
276	{
277	struct pglist_data *pgdat;
278	struct zone *zone;
279	int cpu;
280	int threshold;
281
282	/ Zero current pgdat thresholds /
283	for_each_online_pgdat(pgdat) {
284	for_each_online_cpu(cpu) {
285	per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = `0`;
286	}
287	}
288
289	for_each_populated_zone(zone) {
290	struct pglist_data *pgdat = zone->zone_pgdat;
291	unsigned long max_drift, tolerate_drift;
292
293	threshold = calculate_normal_threshold(zone);
294
295	for_each_online_cpu(cpu) {
296	int pgdat_threshold;
297
298	per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
299	= threshold;
300
301	/ Base nodestat threshold on the largest populated zone. /
302	pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
303	per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
304	= max(threshold, pgdat_threshold);
305	}
306
307	/*
308	* Only set percpu_drift_mark if there is a danger that
309	* NR_FREE_PAGES reports the low watermark is ok when in fact
310	* the min watermark could be breached by an allocation
311	*/
312	tolerate_drift = low_wmark_pages(z: zone) - min_wmark_pages(z: zone);
313	max_drift = num_online_cpus() * threshold;
314	if (max_drift > tolerate_drift)
315	zone->percpu_drift_mark = high_wmark_pages(z: zone) +
316	max_drift;
317	}
318	}
319
320	void set_pgdat_percpu_threshold(pg_data_t *pgdat,
321	int (calculate_pressure)(struct* zone *))
322	{
323	struct zone *zone;
324	int cpu;
325	int threshold;
326	int i;
327
328	for (i = `0`; i < pgdat->nr_zones; i++) {
329	zone = &pgdat->node_zones[i];
330	if (!zone->percpu_drift_mark)
331	continue;
332
333	threshold = (*calculate_pressure)(zone);
334	for_each_online_cpu(cpu)
335	per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
336	= threshold;
337	}
338	}
339
340	/*
341	* For use when we know that interrupts are disabled,
342	* or when we know that preemption is disabled and that
343	* particular counter cannot be updated from interrupt context.
344	*/
345	void __mod_zone_page_state(struct zone zone, enum* zone_stat_item item,
346	long delta)
347	{
348	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
349	s8 __percpu *p = pcp->vm_stat_diff + item;
350	long x;
351	long t;
352
353	/*
354	* Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
355	* atomicity is provided by IRQs being disabled -- either explicitly
356	* or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
357	* CPU migrations and preemption potentially corrupts a counter so
358	* disable preemption.
359	*/
360	preempt_disable_nested();
361
362	x = delta + __this_cpu_read(*p);
363
364	t = __this_cpu_read(pcp->stat_threshold);
365
366	if (unlikely(abs(x) > t)) {
367	zone_page_state_add(x, zone, item);
368	x = `0`;
369	}
370	__this_cpu_write(*p, x);
371
372	preempt_enable_nested();
373	}
374	EXPORT_SYMBOL(__mod_zone_page_state);
375
376	void __mod_node_page_state(struct pglist_data pgdat, enum* node_stat_item item,
377	long delta)
378	{
379	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
380	s8 __percpu *p = pcp->vm_node_stat_diff + item;
381	long x;
382	long t;
383
384	if (vmstat_item_in_bytes(idx: item)) {
385	/*
386	* Only cgroups use subpage accounting right now; at
387	* the global level, these items still change in
388	* multiples of whole pages. Store them as pages
389	* internally to keep the per-cpu counters compact.
390	*/
391	VM_WARN_ON_ONCE(delta & (PAGE_SIZE - `1`));
392	delta >>= PAGE_SHIFT;
393	}
394
395	/ See __mod_node_page_state /
396	preempt_disable_nested();
397
398	x = delta + __this_cpu_read(*p);
399
400	t = __this_cpu_read(pcp->stat_threshold);
401
402	if (unlikely(abs(x) > t)) {
403	node_page_state_add(x, pgdat, item);
404	x = `0`;
405	}
406	__this_cpu_write(*p, x);
407
408	preempt_enable_nested();
409	}
410	EXPORT_SYMBOL(__mod_node_page_state);
411
412	/*
413	* Optimized increment and decrement functions.
414	*
415	* These are only for a single page and therefore can take a struct page *
416	* argument instead of struct zone *. This allows the inclusion of the code
417	* generated for page_zone(page) into the optimized functions.
418	*
419	* No overflow check is necessary and therefore the differential can be
420	* incremented or decremented in place which may allow the compilers to
421	* generate better code.
422	* The increment or decrement is known and therefore one boundary check can
423	* be omitted.
424	*
425	* NOTE: These functions are very performance sensitive. Change only
426	* with care.
427	*
428	* Some processors have inc/dec instructions that are atomic vs an interrupt.
429	* However, the code must first determine the differential location in a zone
430	* based on the processor number and then inc/dec the counter. There is no
431	* guarantee without disabling preemption that the processor will not change
432	* in between and therefore the atomicity vs. interrupt cannot be exploited
433	* in a useful way here.
434	*/
435	void __inc_zone_state(struct zone zone, enum* zone_stat_item item)
436	{
437	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
438	s8 __percpu *p = pcp->vm_stat_diff + item;
439	s8 v, t;
440
441	/ See __mod_node_page_state /
442	preempt_disable_nested();
443
444	v = __this_cpu_inc_return(*p);
445	t = __this_cpu_read(pcp->stat_threshold);
446	if (unlikely(v > t)) {
447	s8 overstep = t >> `1`;
448
449	zone_page_state_add(x: v + overstep, zone, item);
450	__this_cpu_write(*p, -overstep);
451	}
452
453	preempt_enable_nested();
454	}
455
456	void __inc_node_state(struct pglist_data pgdat, enum* node_stat_item item)
457	{
458	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
459	s8 __percpu *p = pcp->vm_node_stat_diff + item;
460	s8 v, t;
461
462	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
463
464	/ See __mod_node_page_state /
465	preempt_disable_nested();
466
467	v = __this_cpu_inc_return(*p);
468	t = __this_cpu_read(pcp->stat_threshold);
469	if (unlikely(v > t)) {
470	s8 overstep = t >> `1`;
471
472	node_page_state_add(x: v + overstep, pgdat, item);
473	__this_cpu_write(*p, -overstep);
474	}
475
476	preempt_enable_nested();
477	}
478
479	void __inc_zone_page_state(struct page page, enum* zone_stat_item item)
480	{
481	__inc_zone_state(zone: page_zone(page), item);
482	}
483	EXPORT_SYMBOL(__inc_zone_page_state);
484
485	void __inc_node_page_state(struct page page, enum* node_stat_item item)
486	{
487	__inc_node_state(pgdat: page_pgdat(page), item);
488	}
489	EXPORT_SYMBOL(__inc_node_page_state);
490
491	void __dec_zone_state(struct zone zone, enum* zone_stat_item item)
492	{
493	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
494	s8 __percpu *p = pcp->vm_stat_diff + item;
495	s8 v, t;
496
497	/ See __mod_node_page_state /
498	preempt_disable_nested();
499
500	v = __this_cpu_dec_return(*p);
501	t = __this_cpu_read(pcp->stat_threshold);
502	if (unlikely(v < - t)) {
503	s8 overstep = t >> `1`;
504
505	zone_page_state_add(x: v - overstep, zone, item);
506	__this_cpu_write(*p, overstep);
507	}
508
509	preempt_enable_nested();
510	}
511
512	void __dec_node_state(struct pglist_data pgdat, enum* node_stat_item item)
513	{
514	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
515	s8 __percpu *p = pcp->vm_node_stat_diff + item;
516	s8 v, t;
517
518	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
519
520	/ See __mod_node_page_state /
521	preempt_disable_nested();
522
523	v = __this_cpu_dec_return(*p);
524	t = __this_cpu_read(pcp->stat_threshold);
525	if (unlikely(v < - t)) {
526	s8 overstep = t >> `1`;
527
528	node_page_state_add(x: v - overstep, pgdat, item);
529	__this_cpu_write(*p, overstep);
530	}
531
532	preempt_enable_nested();
533	}
534
535	void __dec_zone_page_state(struct page page, enum* zone_stat_item item)
536	{
537	__dec_zone_state(zone: page_zone(page), item);
538	}
539	EXPORT_SYMBOL(__dec_zone_page_state);
540
541	void __dec_node_page_state(struct page page, enum* node_stat_item item)
542	{
543	__dec_node_state(pgdat: page_pgdat(page), item);
544	}
545	EXPORT_SYMBOL(__dec_node_page_state);
546
547	#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
548	/*
549	* If we have cmpxchg_local support then we do not need to incur the overhead
550	* that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
551	*
552	* mod_state() modifies the zone counter state through atomic per cpu
553	* operations.
554	*
555	* Overstep mode specifies how overstep should handled:
556	* 0 No overstepping
557	* 1 Overstepping half of threshold
558	* -1 Overstepping minus half of threshold
559	*/
560	static inline void mod_zone_state(struct zone *zone,
561	enum zone_stat_item item, long delta, int overstep_mode)
562	{
563	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
564	s8 __percpu *p = pcp->vm_stat_diff + item;
565	long n, t, z;
566	s8 o;
567
568	o = this_cpu_read(*p);
569	do {
570	z = `0`; / overflow to zone counters /
571
572	/*
573	* The fetching of the stat_threshold is racy. We may apply
574	* a counter threshold to the wrong the cpu if we get
575	* rescheduled while executing here. However, the next
576	* counter update will apply the threshold again and
577	* therefore bring the counter under the threshold again.
578	*
579	* Most of the time the thresholds are the same anyways
580	* for all cpus in a zone.
581	*/
582	t = this_cpu_read(pcp->stat_threshold);
583
584	n = delta + (long)o;
585
586	if (abs(n) > t) {
587	int os = overstep_mode * (t >> `1`) ;
588
589	/ Overflow must be added to zone counters /
590	z = n + os;
591	n = -os;
592	}
593	} while (!this_cpu_try_cmpxchg(*p, &o, n));
594
595	if (z)
596	zone_page_state_add(x: z, zone, item);
597	}
598
599	void mod_zone_page_state(struct zone zone, enum* zone_stat_item item,
600	long delta)
601	{
602	mod_zone_state(zone, item, delta, overstep_mode: `0`);
603	}
604	EXPORT_SYMBOL(mod_zone_page_state);
605
606	void inc_zone_page_state(struct page page, enum* zone_stat_item item)
607	{
608	mod_zone_state(zone: page_zone(page), item, delta: `1`, overstep_mode: `1`);
609	}
610	EXPORT_SYMBOL(inc_zone_page_state);
611
612	void dec_zone_page_state(struct page page, enum* zone_stat_item item)
613	{
614	mod_zone_state(zone: page_zone(page), item, delta: -`1`, overstep_mode: -`1`);
615	}
616	EXPORT_SYMBOL(dec_zone_page_state);
617
618	static inline void mod_node_state(struct pglist_data *pgdat,
619	enum node_stat_item item, int delta, int overstep_mode)
620	{
621	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
622	s8 __percpu *p = pcp->vm_node_stat_diff + item;
623	long n, t, z;
624	s8 o;
625
626	if (vmstat_item_in_bytes(idx: item)) {
627	/*
628	* Only cgroups use subpage accounting right now; at
629	* the global level, these items still change in
630	* multiples of whole pages. Store them as pages
631	* internally to keep the per-cpu counters compact.
632	*/
633	VM_WARN_ON_ONCE(delta & (PAGE_SIZE - `1`));
634	delta >>= PAGE_SHIFT;
635	}
636
637	o = this_cpu_read(*p);
638	do {
639	z = `0`; / overflow to node counters /
640
641	/*
642	* The fetching of the stat_threshold is racy. We may apply
643	* a counter threshold to the wrong the cpu if we get
644	* rescheduled while executing here. However, the next
645	* counter update will apply the threshold again and
646	* therefore bring the counter under the threshold again.
647	*
648	* Most of the time the thresholds are the same anyways
649	* for all cpus in a node.
650	*/
651	t = this_cpu_read(pcp->stat_threshold);
652
653	n = delta + (long)o;
654
655	if (abs(n) > t) {
656	int os = overstep_mode * (t >> `1`) ;
657
658	/ Overflow must be added to node counters /
659	z = n + os;
660	n = -os;
661	}
662	} while (!this_cpu_try_cmpxchg(*p, &o, n));
663
664	if (z)
665	node_page_state_add(x: z, pgdat, item);
666	}
667
668	void mod_node_page_state(struct pglist_data pgdat, enum* node_stat_item item,
669	long delta)
670	{
671	mod_node_state(pgdat, item, delta, overstep_mode: `0`);
672	}
673	EXPORT_SYMBOL(mod_node_page_state);
674
675	void inc_node_state(struct pglist_data pgdat, enum* node_stat_item item)
676	{
677	mod_node_state(pgdat, item, delta: `1`, overstep_mode: `1`);
678	}
679
680	void inc_node_page_state(struct page page, enum* node_stat_item item)
681	{
682	mod_node_state(pgdat: page_pgdat(page), item, delta: `1`, overstep_mode: `1`);
683	}
684	EXPORT_SYMBOL(inc_node_page_state);
685
686	void dec_node_page_state(struct page page, enum* node_stat_item item)
687	{
688	mod_node_state(pgdat: page_pgdat(page), item, delta: -`1`, overstep_mode: -`1`);
689	}
690	EXPORT_SYMBOL(dec_node_page_state);
691	#else
692	/*
693	* Use interrupt disable to serialize counter updates
694	*/
695	void mod_zone_page_state(struct zone zone, enum* zone_stat_item item,
696	long delta)
697	{
698	unsigned long flags;
699
700	local_irq_save(flags);
701	__mod_zone_page_state(zone, item, delta);
702	local_irq_restore(flags);
703	}
704	EXPORT_SYMBOL(mod_zone_page_state);
705
706	void inc_zone_page_state(struct page page, enum* zone_stat_item item)
707	{
708	unsigned long flags;
709	struct zone *zone;
710
711	zone = page_zone(page);
712	local_irq_save(flags);
713	__inc_zone_state(zone, item);
714	local_irq_restore(flags);
715	}
716	EXPORT_SYMBOL(inc_zone_page_state);
717
718	void dec_zone_page_state(struct page page, enum* zone_stat_item item)
719	{
720	unsigned long flags;
721
722	local_irq_save(flags);
723	__dec_zone_page_state(page, item);
724	local_irq_restore(flags);
725	}
726	EXPORT_SYMBOL(dec_zone_page_state);
727
728	void inc_node_state(struct pglist_data pgdat, enum* node_stat_item item)
729	{
730	unsigned long flags;
731
732	local_irq_save(flags);
733	__inc_node_state(pgdat, item);
734	local_irq_restore(flags);
735	}
736	EXPORT_SYMBOL(inc_node_state);
737
738	void mod_node_page_state(struct pglist_data pgdat, enum* node_stat_item item,
739	long delta)
740	{
741	unsigned long flags;
742
743	local_irq_save(flags);
744	__mod_node_page_state(pgdat, item, delta);
745	local_irq_restore(flags);
746	}
747	EXPORT_SYMBOL(mod_node_page_state);
748
749	void inc_node_page_state(struct page page, enum* node_stat_item item)
750	{
751	unsigned long flags;
752	struct pglist_data *pgdat;
753
754	pgdat = page_pgdat(page);
755	local_irq_save(flags);
756	__inc_node_state(pgdat, item);
757	local_irq_restore(flags);
758	}
759	EXPORT_SYMBOL(inc_node_page_state);
760
761	void dec_node_page_state(struct page page, enum* node_stat_item item)
762	{
763	unsigned long flags;
764
765	local_irq_save(flags);
766	__dec_node_page_state(page, item);
767	local_irq_restore(flags);
768	}
769	EXPORT_SYMBOL(dec_node_page_state);
770	#endif
771
772	/*
773	* Fold a differential into the global counters.
774	* Returns the number of counters updated.
775	*/
776	static int fold_diff(int zone_diff, int* *node_diff)
777	{
778	int i;
779	int changes = `0`;
780
781	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++)
782	if (zone_diff[i]) {
783	atomic_long_add(i: zone_diff[i], v: &vm_zone_stat[i]);
784	changes++;
785	}
786
787	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++)
788	if (node_diff[i]) {
789	atomic_long_add(i: node_diff[i], v: &vm_node_stat[i]);
790	changes++;
791	}
792	return changes;
793	}
794
795	/*
796	* Update the zone counters for the current cpu.
797	*
798	* Note that refresh_cpu_vm_stats strives to only access
799	* node local memory. The per cpu pagesets on remote zones are placed
800	* in the memory local to the processor using that pageset. So the
801	* loop over all zones will access a series of cachelines local to
802	* the processor.
803	*
804	* The call to zone_page_state_add updates the cachelines with the
805	* statistics in the remote zone struct as well as the global cachelines
806	* with the global counters. These could cause remote node cache line
807	* bouncing and will have to be only done when necessary.
808	*
809	* The function returns the number of global counters updated.
810	*/
811	static int refresh_cpu_vm_stats(bool do_pagesets)
812	{
813	struct pglist_data *pgdat;
814	struct zone *zone;
815	int i;
816	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { `0`, };
817	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { `0`, };
818	int changes = `0`;
819
820	for_each_populated_zone(zone) {
821	struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
822	struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
823
824	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++) {
825	int v;
826
827	v = this_cpu_xchg(pzstats->vm_stat_diff[i], `0`);
828	if (v) {
829
830	atomic_long_add(i: v, v: &zone->vm_stat[i]);
831	global_zone_diff[i] += v;
832	#ifdef CONFIG_NUMA
833	/ 3 seconds idle till flush /
834	__this_cpu_write(pcp->expire, `3`);
835	#endif
836	}
837	}
838
839	if (do_pagesets) {
840	cond_resched();
841
842	changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
843	#ifdef CONFIG_NUMA
844	/*
845	* Deal with draining the remote pageset of this
846	* processor
847	*
848	* Check if there are pages remaining in this pageset
849	* if not then there is nothing to expire.
850	*/
851	if (!__this_cpu_read(pcp->expire) \|\|
852	!__this_cpu_read(pcp->count))
853	continue;
854
855	/*
856	* We never drain zones local to this processor.
857	*/
858	if (zone_to_nid(zone) == numa_node_id()) {
859	__this_cpu_write(pcp->expire, `0`);
860	continue;
861	}
862
863	if (__this_cpu_dec_return(pcp->expire)) {
864	changes++;
865	continue;
866	}
867
868	if (__this_cpu_read(pcp->count)) {
869	drain_zone_pages(zone, this_cpu_ptr(pcp));
870	changes++;
871	}
872	#endif
873	}
874	}
875
876	for_each_online_pgdat(pgdat) {
877	struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
878
879	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++) {
880	int v;
881
882	v = this_cpu_xchg(p->vm_node_stat_diff[i], `0`);
883	if (v) {
884	atomic_long_add(i: v, v: &pgdat->vm_stat[i]);
885	global_node_diff[i] += v;
886	}
887	}
888	}
889
890	changes += fold_diff(zone_diff: global_zone_diff, node_diff: global_node_diff);
891	return changes;
892	}
893
894	/*
895	* Fold the data for an offline cpu into the global array.
896	* There cannot be any access by the offline cpu and therefore
897	* synchronization is simplified.
898	*/
899	void cpu_vm_stats_fold(int cpu)
900	{
901	struct pglist_data *pgdat;
902	struct zone *zone;
903	int i;
904	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { `0`, };
905	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { `0`, };
906
907	for_each_populated_zone(zone) {
908	struct per_cpu_zonestat *pzstats;
909
910	pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
911
912	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++) {
913	if (pzstats->vm_stat_diff[i]) {
914	int v;
915
916	v = pzstats->vm_stat_diff[i];
917	pzstats->vm_stat_diff[i] = `0`;
918	atomic_long_add(i: v, v: &zone->vm_stat[i]);
919	global_zone_diff[i] += v;
920	}
921	}
922	#ifdef CONFIG_NUMA
923	for (i = `0`; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
924	if (pzstats->vm_numa_event[i]) {
925	unsigned long v;
926
927	v = pzstats->vm_numa_event[i];
928	pzstats->vm_numa_event[i] = `0`;
929	zone_numa_event_add(x: v, zone, item: i);
930	}
931	}
932	#endif
933	}
934
935	for_each_online_pgdat(pgdat) {
936	struct per_cpu_nodestat *p;
937
938	p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
939
940	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++)
941	if (p->vm_node_stat_diff[i]) {
942	int v;
943
944	v = p->vm_node_stat_diff[i];
945	p->vm_node_stat_diff[i] = `0`;
946	atomic_long_add(i: v, v: &pgdat->vm_stat[i]);
947	global_node_diff[i] += v;
948	}
949	}
950
951	fold_diff(zone_diff: global_zone_diff, node_diff: global_node_diff);
952	}
953
954	/*
955	* this is only called if !populated_zone(zone), which implies no other users of
956	* pset->vm_stat_diff[] exist.
957	*/
958	void drain_zonestat(struct zone zone, struct* per_cpu_zonestat *pzstats)
959	{
960	unsigned long v;
961	int i;
962
963	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++) {
964	if (pzstats->vm_stat_diff[i]) {
965	v = pzstats->vm_stat_diff[i];
966	pzstats->vm_stat_diff[i] = `0`;
967	zone_page_state_add(x: v, zone, item: i);
968	}
969	}
970
971	#ifdef CONFIG_NUMA
972	for (i = `0`; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
973	if (pzstats->vm_numa_event[i]) {
974	v = pzstats->vm_numa_event[i];
975	pzstats->vm_numa_event[i] = `0`;
976	zone_numa_event_add(x: v, zone, item: i);
977	}
978	}
979	#endif
980	}
981	#endif
982
983	#ifdef CONFIG_NUMA
984	/*
985	* Determine the per node value of a stat item. This function
986	* is called frequently in a NUMA machine, so try to be as
987	* frugal as possible.
988	*/
989	unsigned long sum_zone_node_page_state(int node,
990	enum zone_stat_item item)
991	{
992	struct zone *zones = NODE_DATA(node)->node_zones;
993	int i;
994	unsigned long count = `0`;
995
996	for (i = `0`; i < MAX_NR_ZONES; i++)
997	count += zone_page_state(zone: zones + i, item);
998
999	return count;
1000	}
1001
1002	/ Determine the per node value of a numa stat item. /
1003	unsigned long sum_zone_numa_event_state(int node,
1004	enum numa_stat_item item)
1005	{
1006	struct zone *zones = NODE_DATA(node)->node_zones;
1007	unsigned long count = `0`;
1008	int i;
1009
1010	for (i = `0`; i < MAX_NR_ZONES; i++)
1011	count += zone_numa_event_state(zone: zones + i, item);
1012
1013	return count;
1014	}
1015
1016	/*
1017	* Determine the per node value of a stat item.
1018	*/
1019	unsigned long node_page_state_pages(struct pglist_data *pgdat,
1020	enum node_stat_item item)
1021	{
1022	long x = atomic_long_read(v: &pgdat->vm_stat[item]);
1023	#ifdef CONFIG_SMP
1024	if (x < `0`)
1025	x = `0`;
1026	#endif
1027	return x;
1028	}
1029
1030	unsigned long node_page_state(struct pglist_data *pgdat,
1031	enum node_stat_item item)
1032	{
1033	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1034
1035	return node_page_state_pages(pgdat, item);
1036	}
1037	#endif
1038
1039	/*
1040	* Count number of pages "struct page" and "struct page_ext" consume.
1041	* nr_memmap_boot_pages: # of pages allocated by boot allocator
1042	* nr_memmap_pages: # of pages that were allocated by buddy allocator
1043	*/
1044	static atomic_long_t nr_memmap_boot_pages = ATOMIC_LONG_INIT(`0`);
1045	static atomic_long_t nr_memmap_pages = ATOMIC_LONG_INIT(`0`);
1046
1047	void memmap_boot_pages_add(long delta)
1048	{
1049	atomic_long_add(i: delta, v: &nr_memmap_boot_pages);
1050	}
1051
1052	void memmap_pages_add(long delta)
1053	{
1054	atomic_long_add(i: delta, v: &nr_memmap_pages);
1055	}
1056
1057	#ifdef CONFIG_COMPACTION
1058
/* Free-memory summary of a zone, as filled in by fill_contig_page_info(). */
struct contig_page_info {
	unsigned long free_pages;		/* free base pages */
	unsigned long free_blocks_total;	/* free blocks of any order */
	unsigned long free_blocks_suitable;	/* free blocks of the target order, counting larger ones split down */
};
1064
1065	/*
1066	* Calculate the number of free pages in a zone, how many contiguous
1067	* pages are free and how many are large enough to satisfy an allocation of
1068	* the target size. Note that this function makes no attempt to estimate
1069	* how many suitable free blocks there might be if MOVABLE pages were
1070	* migrated. Calculating that is possible, but expensive and can be
1071	* figured out from userspace
1072	*/
1073	static void fill_contig_page_info(struct zone *zone,
1074	unsigned int suitable_order,
1075	struct contig_page_info *info)
1076	{
1077	unsigned int order;
1078
1079	info->free_pages = `0`;
1080	info->free_blocks_total = `0`;
1081	info->free_blocks_suitable = `0`;
1082
1083	for (order = `0`; order < NR_PAGE_ORDERS; order++) {
1084	unsigned long blocks;
1085
1086	/*
1087	* Count number of free blocks.
1088	*
1089	* Access to nr_free is lockless as nr_free is used only for
1090	* diagnostic purposes. Use data_race to avoid KCSAN warning.
1091	*/
1092	blocks = data_race(zone->free_area[order].nr_free);
1093	info->free_blocks_total += blocks;
1094
1095	/ Count free base pages /
1096	info->free_pages += blocks << order;
1097
1098	/ Count the suitable free blocks /
1099	if (order >= suitable_order)
1100	info->free_blocks_suitable += blocks <<
1101	(order - suitable_order);
1102	}
1103	}
1104
1105	/*
1106	* A fragmentation index only makes sense if an allocation of a requested
1107	* size would fail. If that is true, the fragmentation index indicates
1108	* whether external fragmentation or a lack of memory was the problem.
1109	* The value can be used to determine if page reclaim or compaction
1110	* should be used
1111	*/
1112	static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
1113	{
1114	unsigned long requested = `1UL` << order;
1115
1116	if (WARN_ON_ONCE(order > MAX_PAGE_ORDER))
1117	return `0`;
1118
1119	if (!info->free_blocks_total)
1120	return `0`;
1121
1122	/ Fragmentation index only makes sense when a request would fail /
1123	if (info->free_blocks_suitable)
1124	return -`1000`;
1125
1126	/*
1127	* Index is between 0 and 1 so return within 3 decimal places
1128	*
1129	* 0 => allocation would fail due to lack of memory
1130	* 1 => allocation would fail due to fragmentation
1131	*/
1132	return `1000` - div_u64( dividend: (`1000`+(div_u64(dividend: info->free_pages * `1000ULL`, divisor: requested))), divisor: info->free_blocks_total);
1133	}
1134
1135	/*
1136	* Calculates external fragmentation within a zone wrt the given order.
1137	* It is defined as the percentage of pages found in blocks of size
1138	* less than 1 << order. It returns values in range [0, 100].
1139	*/
1140	unsigned int extfrag_for_order(struct zone zone, unsigned* int order)
1141	{
1142	struct contig_page_info info;
1143
1144	fill_contig_page_info(zone, suitable_order: order, info: &info);
1145	if (info.free_pages == `0`)
1146	return `0`;
1147
1148	return div_u64(dividend: (info.free_pages -
1149	(info.free_blocks_suitable << order)) * `100`,
1150	divisor: info.free_pages);
1151	}
1152
1153	/ Same as __fragmentation index but allocs contig_page_info on stack /
1154	int fragmentation_index(struct zone zone, unsigned* int order)
1155	{
1156	struct contig_page_info info;
1157
1158	fill_contig_page_info(zone, suitable_order: order, info: &info);
1159	return __fragmentation_index(order, info: &info);
1160	}
1161	#endif
1162
1163	#if defined(CONFIG_PROC_FS) \|\| defined(CONFIG_SYSFS) \|\| \
1164	defined(CONFIG_NUMA) \|\| defined(CONFIG_MEMCG)
1165	#ifdef CONFIG_ZONE_DMA
1166	#define TEXT_FOR_DMA(xx) xx "_dma",
1167	#else
1168	#define TEXT_FOR_DMA(xx)
1169	#endif
1170
1171	#ifdef CONFIG_ZONE_DMA32
1172	#define TEXT_FOR_DMA32(xx) xx "_dma32",
1173	#else
1174	#define TEXT_FOR_DMA32(xx)
1175	#endif
1176
1177	#ifdef CONFIG_HIGHMEM
1178	#define TEXT_FOR_HIGHMEM(xx) xx "_high",
1179	#else
1180	#define TEXT_FOR_HIGHMEM(xx)
1181	#endif
1182
1183	#ifdef CONFIG_ZONE_DEVICE
1184	#define TEXT_FOR_DEVICE(xx) xx "_device",
1185	#else
1186	#define TEXT_FOR_DEVICE(xx)
1187	#endif
1188
1189	#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1190	TEXT_FOR_HIGHMEM(xx) xx "_movable", \
1191	TEXT_FOR_DEVICE(xx)
1192
1193	const char * const vmstat_text[] = {
1194	/ enum zone_stat_item counters /
1195	"nr_free_pages",
1196	"nr_free_pages_blocks",
1197	"nr_zone_inactive_anon",
1198	"nr_zone_active_anon",
1199	"nr_zone_inactive_file",
1200	"nr_zone_active_file",
1201	"nr_zone_unevictable",
1202	"nr_zone_write_pending",
1203	"nr_mlock",
1204	#if IS_ENABLED(CONFIG_ZSMALLOC)
1205	"nr_zspages",
1206	#endif
1207	"nr_free_cma",
1208	#ifdef CONFIG_UNACCEPTED_MEMORY
1209	"nr_unaccepted",
1210	#endif
1211
1212	/ enum numa_stat_item counters /
1213	#ifdef CONFIG_NUMA
1214	"numa_hit",
1215	"numa_miss",
1216	"numa_foreign",
1217	"numa_interleave",
1218	"numa_local",
1219	"numa_other",
1220	#endif
1221
1222	/ enum node_stat_item counters /
1223	"nr_inactive_anon",
1224	"nr_active_anon",
1225	"nr_inactive_file",
1226	"nr_active_file",
1227	"nr_unevictable",
1228	"nr_slab_reclaimable",
1229	"nr_slab_unreclaimable",
1230	"nr_isolated_anon",
1231	"nr_isolated_file",
1232	"workingset_nodes",
1233	"workingset_refault_anon",
1234	"workingset_refault_file",
1235	"workingset_activate_anon",
1236	"workingset_activate_file",
1237	"workingset_restore_anon",
1238	"workingset_restore_file",
1239	"workingset_nodereclaim",
1240	"nr_anon_pages",
1241	"nr_mapped",
1242	"nr_file_pages",
1243	"nr_dirty",
1244	"nr_writeback",
1245	"nr_writeback_temp",
1246	"nr_shmem",
1247	"nr_shmem_hugepages",
1248	"nr_shmem_pmdmapped",
1249	"nr_file_hugepages",
1250	"nr_file_pmdmapped",
1251	"nr_anon_transparent_hugepages",
1252	"nr_vmscan_write",
1253	"nr_vmscan_immediate_reclaim",
1254	"nr_dirtied",
1255	"nr_written",
1256	"nr_throttled_written",
1257	"nr_kernel_misc_reclaimable",
1258	"nr_foll_pin_acquired",
1259	"nr_foll_pin_released",
1260	"nr_kernel_stack",
1261	#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1262	"nr_shadow_call_stack",
1263	#endif
1264	"nr_page_table_pages",
1265	"nr_sec_page_table_pages",
1266	#ifdef CONFIG_IOMMU_SUPPORT
1267	"nr_iommu_pages",
1268	#endif
1269	#ifdef CONFIG_SWAP
1270	"nr_swapcached",
1271	#endif
1272	#ifdef CONFIG_NUMA_BALANCING
1273	"pgpromote_success",
1274	"pgpromote_candidate",
1275	#endif
1276	"pgdemote_kswapd",
1277	"pgdemote_direct",
1278	"pgdemote_khugepaged",
1279	"pgdemote_proactive",
1280	#ifdef CONFIG_HUGETLB_PAGE
1281	"nr_hugetlb",
1282	#endif
1283	"nr_balloon_pages",
1284	/ system-wide enum vm_stat_item counters /
1285	"nr_dirty_threshold",
1286	"nr_dirty_background_threshold",
1287	"nr_memmap_pages",
1288	"nr_memmap_boot_pages",
1289
1290	#if defined(CONFIG_VM_EVENT_COUNTERS) \|\| defined(CONFIG_MEMCG)
1291	/ enum vm_event_item counters /
1292	"pgpgin",
1293	"pgpgout",
1294	"pswpin",
1295	"pswpout",
1296
1297	TEXTS_FOR_ZONES("pgalloc")
1298	TEXTS_FOR_ZONES("allocstall")
1299	TEXTS_FOR_ZONES("pgskip")
1300
1301	"pgfree",
1302	"pgactivate",
1303	"pgdeactivate",
1304	"pglazyfree",
1305
1306	"pgfault",
1307	"pgmajfault",
1308	"pglazyfreed",
1309
1310	"pgrefill",
1311	"pgreuse",
1312	"pgsteal_kswapd",
1313	"pgsteal_direct",
1314	"pgsteal_khugepaged",
1315	"pgsteal_proactive",
1316	"pgscan_kswapd",
1317	"pgscan_direct",
1318	"pgscan_khugepaged",
1319	"pgscan_proactive",
1320	"pgscan_direct_throttle",
1321	"pgscan_anon",
1322	"pgscan_file",
1323	"pgsteal_anon",
1324	"pgsteal_file",
1325
1326	#ifdef CONFIG_NUMA
1327	"zone_reclaim_success",
1328	"zone_reclaim_failed",
1329	#endif
1330	"pginodesteal",
1331	"slabs_scanned",
1332	"kswapd_inodesteal",
1333	"kswapd_low_wmark_hit_quickly",
1334	"kswapd_high_wmark_hit_quickly",
1335	"pageoutrun",
1336
1337	"pgrotated",
1338
1339	"drop_pagecache",
1340	"drop_slab",
1341	"oom_kill",
1342
1343	#ifdef CONFIG_NUMA_BALANCING
1344	"numa_pte_updates",
1345	"numa_huge_pte_updates",
1346	"numa_hint_faults",
1347	"numa_hint_faults_local",
1348	"numa_pages_migrated",
1349	"numa_task_migrated",
1350	"numa_task_swapped",
1351	#endif
1352	#ifdef CONFIG_MIGRATION
1353	"pgmigrate_success",
1354	"pgmigrate_fail",
1355	"thp_migration_success",
1356	"thp_migration_fail",
1357	"thp_migration_split",
1358	#endif
1359	#ifdef CONFIG_COMPACTION
1360	"compact_migrate_scanned",
1361	"compact_free_scanned",
1362	"compact_isolated",
1363	"compact_stall",
1364	"compact_fail",
1365	"compact_success",
1366	"compact_daemon_wake",
1367	"compact_daemon_migrate_scanned",
1368	"compact_daemon_free_scanned",
1369	#endif
1370
1371	#ifdef CONFIG_HUGETLB_PAGE
1372	"htlb_buddy_alloc_success",
1373	"htlb_buddy_alloc_fail",
1374	#endif
1375	#ifdef CONFIG_CMA
1376	"cma_alloc_success",
1377	"cma_alloc_fail",
1378	#endif
1379	"unevictable_pgs_culled",
1380	"unevictable_pgs_scanned",
1381	"unevictable_pgs_rescued",
1382	"unevictable_pgs_mlocked",
1383	"unevictable_pgs_munlocked",
1384	"unevictable_pgs_cleared",
1385	"unevictable_pgs_stranded",
1386
1387	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1388	"thp_fault_alloc",
1389	"thp_fault_fallback",
1390	"thp_fault_fallback_charge",
1391	"thp_collapse_alloc",
1392	"thp_collapse_alloc_failed",
1393	"thp_file_alloc",
1394	"thp_file_fallback",
1395	"thp_file_fallback_charge",
1396	"thp_file_mapped",
1397	"thp_split_page",
1398	"thp_split_page_failed",
1399	"thp_deferred_split_page",
1400	"thp_underused_split_page",
1401	"thp_split_pmd",
1402	"thp_scan_exceed_none_pte",
1403	"thp_scan_exceed_swap_pte",
1404	"thp_scan_exceed_share_pte",
1405	#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1406	"thp_split_pud",
1407	#endif
1408	"thp_zero_page_alloc",
1409	"thp_zero_page_alloc_failed",
1410	"thp_swpout",
1411	"thp_swpout_fallback",
1412	#endif
1413	#ifdef CONFIG_MEMORY_BALLOON
1414	"balloon_inflate",
1415	"balloon_deflate",
1416	#ifdef CONFIG_BALLOON_COMPACTION
1417	"balloon_migrate",
1418	#endif
1419	#endif /* CONFIG_MEMORY_BALLOON */
1420	#ifdef CONFIG_DEBUG_TLBFLUSH
1421	"nr_tlb_remote_flush",
1422	"nr_tlb_remote_flush_received",
1423	"nr_tlb_local_flush_all",
1424	"nr_tlb_local_flush_one",
1425	#endif /* CONFIG_DEBUG_TLBFLUSH */
1426
1427	#ifdef CONFIG_SWAP
1428	"swap_ra",
1429	"swap_ra_hit",
1430	"swpin_zero",
1431	"swpout_zero",
1432	#ifdef CONFIG_KSM
1433	"ksm_swpin_copy",
1434	#endif
1435	#endif
1436	#ifdef CONFIG_KSM
1437	"cow_ksm",
1438	#endif
1439	#ifdef CONFIG_ZSWAP
1440	"zswpin",
1441	"zswpout",
1442	"zswpwb",
1443	#endif
1444	#ifdef CONFIG_X86
1445	"direct_map_level2_splits",
1446	"direct_map_level3_splits",
1447	"direct_map_level2_collapses",
1448	"direct_map_level3_collapses",
1449	#endif
1450	#ifdef CONFIG_PER_VMA_LOCK_STATS
1451	"vma_lock_success",
1452	"vma_lock_abort",
1453	"vma_lock_retry",
1454	"vma_lock_miss",
1455	#endif
1456	#ifdef CONFIG_DEBUG_STACK_USAGE
1457	"kstack_1k",
1458	#if THREAD_SIZE > 1024
1459	"kstack_2k",
1460	#endif
1461	#if THREAD_SIZE > 2048
1462	"kstack_4k",
1463	#endif
1464	#if THREAD_SIZE > 4096
1465	"kstack_8k",
1466	#endif
1467	#if THREAD_SIZE > 8192
1468	"kstack_16k",
1469	#endif
1470	#if THREAD_SIZE > 16384
1471	"kstack_32k",
1472	#endif
1473	#if THREAD_SIZE > 32768
1474	"kstack_64k",
1475	#endif
1476	#if THREAD_SIZE > 65536
1477	"kstack_rest",
1478	#endif
1479	#endif
1480	#endif /* CONFIG_VM_EVENT_COUNTERS \|\| CONFIG_MEMCG */
1481	};
1482	#endif /* CONFIG_PROC_FS \|\| CONFIG_SYSFS \|\| CONFIG_NUMA \|\| CONFIG_MEMCG */
1483
1484	#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) \|\| \
1485	defined(CONFIG_PROC_FS)
1486	static void frag_start(struct* seq_file m, loff_t pos)
1487	{
1488	pg_data_t *pgdat;
1489	loff_t node = *pos;
1490
1491	for (pgdat = first_online_pgdat();
1492	pgdat && node;
1493	pgdat = next_online_pgdat(pgdat))
1494	--node;
1495
1496	return pgdat;
1497	}
1498
1499	static void frag_next(struct* seq_file m, void* arg, loff_t pos)
1500	{
1501	pg_data_t pgdat = (pg_data_t )arg;
1502
1503	(*pos)++;
1504	return next_online_pgdat(pgdat);
1505	}
1506
/* seq_file ->stop: nothing to release for the node iterators. */
static void frag_stop(struct seq_file *m, void *arg)
{
}
1510
1511	/*
1512	* Walk zones in a node and print using a callback.
1513	* If @assert_populated is true, only use callback for zones that are populated.
1514	*/
1515	static void walk_zones_in_node(struct seq_file m, pg_data_t pgdat,
1516	bool assert_populated, bool nolock,
1517	void (print)(struct* seq_file m, pg_data_t , struct zone *))
1518	{
1519	struct zone *zone;
1520	struct zone *node_zones = pgdat->node_zones;
1521	unsigned long flags;
1522
1523	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1524	if (assert_populated && !populated_zone(zone))
1525	continue;
1526
1527	if (!nolock)
1528	spin_lock_irqsave(&zone->lock, flags);
1529	print(m, pgdat, zone);
1530	if (!nolock)
1531	spin_unlock_irqrestore(lock: &zone->lock, flags);
1532	}
1533	}
1534	#endif
1535
1536	#ifdef CONFIG_PROC_FS
1537	static void frag_show_print(struct seq_file m, pg_data_t pgdat,
1538	struct zone *zone)
1539	{
1540	int order;
1541
1542	seq_printf(m, fmt: "Node %d, zone %8s ", pgdat->node_id, zone->name);
1543	for (order = `0`; order < NR_PAGE_ORDERS; ++order)
1544	/*
1545	* Access to nr_free is lockless as nr_free is used only for
1546	* printing purposes. Use data_race to avoid KCSAN warning.
1547	*/
1548	seq_printf(m, fmt: "%6lu ", data_race(zone->free_area[order].nr_free));
1549	seq_putc(m, c: `'\n'`);
1550	}
1551
1552	/*
1553	* This walks the free areas for each zone.
1554	*/
1555	static int frag_show(struct seq_file m, void* *arg)
1556	{
1557	pg_data_t pgdat = (pg_data_t )arg;
1558	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: false, print: frag_show_print);
1559	return `0`;
1560	}
1561
1562	static void pagetypeinfo_showfree_print(struct seq_file *m,
1563	pg_data_t pgdat, struct* zone *zone)
1564	{
1565	int order, mtype;
1566
1567	for (mtype = `0`; mtype < MIGRATE_TYPES; mtype++) {
1568	seq_printf(m, fmt: "Node %4d, zone %8s, type %12s ",
1569	pgdat->node_id,
1570	zone->name,
1571	migratetype_names[mtype]);
1572	for (order = `0`; order < NR_PAGE_ORDERS; ++order) {
1573	unsigned long freecount = `0`;
1574	struct free_area *area;
1575	struct list_head *curr;
1576	bool overflow = false;
1577
1578	area = &(zone->free_area[order]);
1579
1580	list_for_each(curr, &area->free_list[mtype]) {
1581	/*
1582	* Cap the free_list iteration because it might
1583	* be really large and we are under a spinlock
1584	* so a long time spent here could trigger a
1585	* hard lockup detector. Anyway this is a
1586	* debugging tool so knowing there is a handful
1587	* of pages of this order should be more than
1588	* sufficient.
1589	*/
1590	if (++freecount >= `100000`) {
1591	overflow = true;
1592	break;
1593	}
1594	}
1595	seq_printf(m, fmt: "%s%6lu ", overflow ? ">" : "", freecount);
1596	spin_unlock_irq(lock: &zone->lock);
1597	cond_resched();
1598	spin_lock_irq(lock: &zone->lock);
1599	}
1600	seq_putc(m, c: `'\n'`);
1601	}
1602	}
1603
1604	/ Print out the free pages at each order for each migatetype /
1605	static void pagetypeinfo_showfree(struct seq_file m, void* *arg)
1606	{
1607	int order;
1608	pg_data_t pgdat = (pg_data_t )arg;
1609
1610	/ Print header /
1611	seq_printf(m, fmt: "%-43s ", "Free pages count per migrate type at order");
1612	for (order = `0`; order < NR_PAGE_ORDERS; ++order)
1613	seq_printf(m, fmt: "%6d ", order);
1614	seq_putc(m, c: `'\n'`);
1615
1616	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: false, print: pagetypeinfo_showfree_print);
1617	}
1618
1619	static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1620	pg_data_t pgdat, struct* zone *zone)
1621	{
1622	int mtype;
1623	unsigned long pfn;
1624	unsigned long start_pfn = zone->zone_start_pfn;
1625	unsigned long end_pfn = zone_end_pfn(zone);
1626	unsigned long count[MIGRATE_TYPES] = { `0`, };
1627
1628	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1629	struct page *page;
1630
1631	page = pfn_to_online_page(pfn);
1632	if (!page)
1633	continue;
1634
1635	if (page_zone(page) != zone)
1636	continue;
1637
1638	mtype = get_pageblock_migratetype(page);
1639
1640	if (mtype < MIGRATE_TYPES)
1641	count[mtype]++;
1642	}
1643
1644	/ Print counts /
1645	seq_printf(m, fmt: "Node %d, zone %8s ", pgdat->node_id, zone->name);
1646	for (mtype = `0`; mtype < MIGRATE_TYPES; mtype++)
1647	seq_printf(m, fmt: "%12lu ", count[mtype]);
1648	seq_putc(m, c: `'\n'`);
1649	}
1650
1651	/ Print out the number of pageblocks for each migratetype /
1652	static void pagetypeinfo_showblockcount(struct seq_file m, void* *arg)
1653	{
1654	int mtype;
1655	pg_data_t pgdat = (pg_data_t )arg;
1656
1657	seq_printf(m, fmt: "\n%-23s", "Number of blocks type ");
1658	for (mtype = `0`; mtype < MIGRATE_TYPES; mtype++)
1659	seq_printf(m, fmt: "%12s ", migratetype_names[mtype]);
1660	seq_putc(m, c: `'\n'`);
1661	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: false,
1662	print: pagetypeinfo_showblockcount_print);
1663	}
1664
1665	/*
1666	* Print out the number of pageblocks for each migratetype that contain pages
1667	* of other types. This gives an indication of how well fallbacks are being
1668	* contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1669	* to determine what is going on
1670	*/
1671	static void pagetypeinfo_showmixedcount(struct seq_file m, pg_data_t pgdat)
1672	{
1673	#ifdef CONFIG_PAGE_OWNER
1674	int mtype;
1675
1676	if (!static_branch_unlikely(&page_owner_inited))
1677	return;
1678
1679	drain_all_pages(NULL);
1680
1681	seq_printf(m, fmt: "\n%-23s", "Number of mixed blocks ");
1682	for (mtype = `0`; mtype < MIGRATE_TYPES; mtype++)
1683	seq_printf(m, fmt: "%12s ", migratetype_names[mtype]);
1684	seq_putc(m, c: `'\n'`);
1685
1686	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: true,
1687	print: pagetypeinfo_showmixedcount_print);
1688	#endif /* CONFIG_PAGE_OWNER */
1689	}
1690
1691	/*
1692	* This prints out statistics in relation to grouping pages by mobility.
1693	* It is expensive to collect so do not constantly read the file.
1694	*/
1695	static int pagetypeinfo_show(struct seq_file m, void* *arg)
1696	{
1697	pg_data_t pgdat = (pg_data_t )arg;
1698
1699	/ check memoryless node /
1700	if (!node_state(node: pgdat->node_id, state: N_MEMORY))
1701	return `0`;
1702
1703	seq_printf(m, fmt: "Page block order: %d\n", pageblock_order);
1704	seq_printf(m, fmt: "Pages per block: %lu\n", pageblock_nr_pages);
1705	seq_putc(m, c: `'\n'`);
1706	pagetypeinfo_showfree(m, arg: pgdat);
1707	pagetypeinfo_showblockcount(m, arg: pgdat);
1708	pagetypeinfo_showmixedcount(m, pgdat);
1709
1710	return `0`;
1711	}
1712
/* seq_file operations: iterate online nodes, show each with frag_show(). */
static const struct seq_operations fragmentation_op = {
	.start = frag_start,
	.next = frag_next,
	.stop = frag_stop,
	.show = frag_show,
};
1719
/* seq_file operations: same node iteration, shown with pagetypeinfo_show(). */
static const struct seq_operations pagetypeinfo_op = {
	.start = frag_start,
	.next = frag_next,
	.stop = frag_stop,
	.show = pagetypeinfo_show,
};
1726
1727	static bool is_zone_first_populated(pg_data_t pgdat, struct* zone *zone)
1728	{
1729	int zid;
1730
1731	for (zid = `0`; zid < MAX_NR_ZONES; zid++) {
1732	struct zone *compare = &pgdat->node_zones[zid];
1733
1734	if (populated_zone(zone: compare))
1735	return zone == compare;
1736	}
1737
1738	return false;
1739	}
1740
1741	static void zoneinfo_show_print(struct seq_file m, pg_data_t pgdat,
1742	struct zone *zone)
1743	{
1744	int i;
1745	seq_printf(m, fmt: "Node %d, zone %8s", pgdat->node_id, zone->name);
1746	if (is_zone_first_populated(pgdat, zone)) {
1747	seq_printf(m, fmt: "\n per-node stats");
1748	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++) {
1749	unsigned long pages = node_page_state_pages(pgdat, item: i);
1750
1751	if (vmstat_item_print_in_thp(item: i))
1752	pages /= HPAGE_PMD_NR;
1753	seq_printf(m, fmt: "\n %-12s %lu", node_stat_name(item: i),
1754	pages);
1755	}
1756	}
1757	seq_printf(m,
1758	fmt: "\n pages free %lu"
1759	"\n boost %lu"
1760	"\n min %lu"
1761	"\n low %lu"
1762	"\n high %lu"
1763	"\n promo %lu"
1764	"\n spanned %lu"
1765	"\n present %lu"
1766	"\n managed %lu"
1767	"\n cma %lu",
1768	zone_page_state(zone, item: NR_FREE_PAGES),
1769	zone->watermark_boost,
1770	min_wmark_pages(z: zone),
1771	low_wmark_pages(z: zone),
1772	high_wmark_pages(z: zone),
1773	promo_wmark_pages(z: zone),
1774	zone->spanned_pages,
1775	zone->present_pages,
1776	zone_managed_pages(zone),
1777	zone_cma_pages(zone));
1778
1779	seq_printf(m,
1780	fmt: "\n protection: (%ld",
1781	zone->lowmem_reserve[`0`]);
1782	for (i = `1`; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1783	seq_printf(m, fmt: ", %ld", zone->lowmem_reserve[i]);
1784	seq_putc(m, c: `')'`);
1785
1786	/ If unpopulated, no other information is useful /
1787	if (!populated_zone(zone)) {
1788	seq_putc(m, c: `'\n'`);
1789	return;
1790	}
1791
1792	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++)
1793	seq_printf(m, fmt: "\n %-12s %lu", zone_stat_name(item: i),
1794	zone_page_state(zone, item: i));
1795
1796	#ifdef CONFIG_NUMA
1797	fold_vm_zone_numa_events(zone);
1798	for (i = `0`; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1799	seq_printf(m, fmt: "\n %-12s %lu", numa_stat_name(item: i),
1800	zone_numa_event_state(zone, item: i));
1801	#endif
1802
1803	seq_printf(m, fmt: "\n pagesets");
1804	for_each_online_cpu(i) {
1805	struct per_cpu_pages *pcp;
1806	struct per_cpu_zonestat __maybe_unused *pzstats;
1807
1808	pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
1809	seq_printf(m,
1810	fmt: "\n cpu: %i"
1811	"\n count: %i"
1812	"\n high: %i"
1813	"\n batch: %i"
1814	"\n high_min: %i"
1815	"\n high_max: %i",
1816	i,
1817	pcp->count,
1818	pcp->high,
1819	pcp->batch,
1820	pcp->high_min,
1821	pcp->high_max);
1822	#ifdef CONFIG_SMP
1823	pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
1824	seq_printf(m, fmt: "\n vm stats threshold: %d",
1825	pzstats->stat_threshold);
1826	#endif
1827	}
1828	seq_printf(m,
1829	fmt: "\n node_unreclaimable: %u"
1830	"\n start_pfn: %lu",
1831	pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1832	zone->zone_start_pfn);
1833	seq_putc(m, c: `'\n'`);
1834	}
1835
1836	/*
1837	* Output information about zones in @pgdat. All zones are printed regardless
1838	* of whether they are populated or not: lowmem_reserve_ratio operates on the
1839	* set of all zones and userspace would not be aware of such zones if they are
1840	* suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
1841	*/
1842	static int zoneinfo_show(struct seq_file m, void* *arg)
1843	{
1844	pg_data_t pgdat = (pg_data_t )arg;
1845	walk_zones_in_node(m, pgdat, assert_populated: false, nolock: false, print: zoneinfo_show_print);
1846	return `0`;
1847	}
1848
/* seq_file operations: one zoneinfo_show() record per online node. */
static const struct seq_operations zoneinfo_op = {
	.start = frag_start, /* iterate over all zones. The same as in
			      * fragmentation. */
	.next = frag_next,
	.stop = frag_stop,
	.show = zoneinfo_show,
};
1856
/*
 * Number of values reported by the vmstat iterator: zone, NUMA-event, node
 * and system-wide counters, plus the VM event counters when
 * CONFIG_VM_EVENT_COUNTERS is enabled.
 */
#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
			 NR_VM_NUMA_EVENT_ITEMS + \
			 NR_VM_NODE_STAT_ITEMS + \
			 NR_VM_STAT_ITEMS + \
			 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
			  NR_VM_EVENT_ITEMS : 0))
1863
1864	static void vmstat_start(struct* seq_file m, loff_t pos)
1865	{
1866	unsigned long *v;
1867	int i;
1868
1869	if (*pos >= NR_VMSTAT_ITEMS)
1870	return NULL;
1871
1872	BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
1873	fold_vm_numa_events();
1874	v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
1875	m->private = v;
1876	if (!v)
1877	return ERR_PTR(error: -ENOMEM);
1878	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++)
1879	v[i] = global_zone_page_state(item: i);
1880	v += NR_VM_ZONE_STAT_ITEMS;
1881
1882	#ifdef CONFIG_NUMA
1883	for (i = `0`; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1884	v[i] = global_numa_event_state(item: i);
1885	v += NR_VM_NUMA_EVENT_ITEMS;
1886	#endif
1887
1888	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++) {
1889	v[i] = global_node_page_state_pages(item: i);
1890	if (vmstat_item_print_in_thp(item: i))
1891	v[i] /= HPAGE_PMD_NR;
1892	}
1893	v += NR_VM_NODE_STAT_ITEMS;
1894
1895	global_dirty_limits(pbackground: v + NR_DIRTY_BG_THRESHOLD,
1896	pdirty: v + NR_DIRTY_THRESHOLD);
1897	v[NR_MEMMAP_PAGES] = atomic_long_read(v: &nr_memmap_pages);
1898	v[NR_MEMMAP_BOOT_PAGES] = atomic_long_read(v: &nr_memmap_boot_pages);
1899	v += NR_VM_STAT_ITEMS;
1900
1901	#ifdef CONFIG_VM_EVENT_COUNTERS
1902	all_vm_events(v);
1903	v[PGPGIN] /= `2`; / sectors -> kbytes /
1904	v[PGPGOUT] /= `2`;
1905	#endif
1906	return (unsigned long )m->private + pos;
1907	}
1908
1909	static void vmstat_next(struct* seq_file m, void* arg, loff_t pos)
1910	{
1911	(*pos)++;
1912	if (*pos >= NR_VMSTAT_ITEMS)
1913	return NULL;
1914	return (unsigned long )m->private + pos;
1915	}
1916
1917	static int vmstat_show(struct seq_file m, void* *arg)
1918	{
1919	unsigned long *l = arg;
1920	unsigned long off = l - (unsigned long *)m->private;
1921
1922	seq_puts(m, s: vmstat_text[off]);
1923	seq_put_decimal_ull(m, delimiter: " ", num: *l);
1924	seq_putc(m, c: `'\n'`);
1925
1926	if (off == NR_VMSTAT_ITEMS - `1`) {
1927	/*
1928	* We've come to the end - add any deprecated counters to avoid
1929	* breaking userspace which might depend on them being present.
1930	*/
1931	seq_puts(m, s: "nr_unstable 0\n");
1932	}
1933	return `0`;
1934	}
1935
1936	static void vmstat_stop(struct seq_file m, void* *arg)
1937	{
1938	kfree(objp: m->private);
1939	m->private = NULL;
1940	}
1941
/* seq_file operations that walk the counter snapshot one item at a time. */
static const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
1948	#endif /* CONFIG_PROC_FS */
1949
1950	#ifdef CONFIG_SMP
/* Per-CPU delayed work item; start_shepherd_timer() binds it to vmstat_update(). */
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
/* Delay passed to round_jiffies_relative() when (re)arming the stat work; defaults to HZ. */
static int sysctl_stat_interval __read_mostly = HZ;
/* Set to 1 by vmstat_late_init(); checked in vmstat_cpu_online(). */
static int vmstat_late_init_done;
1954
1955	#ifdef CONFIG_PROC_FS
1956	static void refresh_vm_stats(struct work_struct *work)
1957	{
1958	refresh_cpu_vm_stats(do_pagesets: true);
1959	}
1960
1961	static int vmstat_refresh(const struct ctl_table table, int* write,
1962	void buffer, size_t lenp, loff_t *ppos)
1963	{
1964	long val;
1965	int err;
1966	int i;
1967
1968	/*
1969	* The regular update, every sysctl_stat_interval, may come later
1970	* than expected: leaving a significant amount in per_cpu buckets.
1971	* This is particularly misleading when checking a quantity of HUGE
1972	* pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1973	* which can equally be echo'ed to or cat'ted from (by root),
1974	* can be used to update the stats just before reading them.
1975	*
1976	* Oh, and since global_zone_page_state() etc. are so careful to hide
1977	* transiently negative values, report an error here if any of
1978	* the stats is negative, so we know to go looking for imbalance.
1979	*/
1980	err = schedule_on_each_cpu(func: refresh_vm_stats);
1981	if (err)
1982	return err;
1983	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1984	/*
1985	* Skip checking stats known to go negative occasionally.
1986	*/
1987	switch (i) {
1988	case NR_ZONE_WRITE_PENDING:
1989	case NR_FREE_CMA_PAGES:
1990	continue;
1991	}
1992	val = atomic_long_read(v: &vm_zone_stat[i]);
1993	if (val < `0`) {
1994	pr_warn("%s: %s %ld\n",
1995	__func__, zone_stat_name(i), val);
1996	}
1997	}
1998	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++) {
1999	/*
2000	* Skip checking stats known to go negative occasionally.
2001	*/
2002	switch (i) {
2003	case NR_WRITEBACK:
2004	continue;
2005	}
2006	val = atomic_long_read(v: &vm_node_stat[i]);
2007	if (val < `0`) {
2008	pr_warn("%s: %s %ld\n",
2009	__func__, node_stat_name(i), val);
2010	}
2011	}
2012	if (write)
2013	ppos += lenp;
2014	else
2015	*lenp = `0`;
2016	return `0`;
2017	}
2018	#endif /* CONFIG_PROC_FS */
2019
2020	static void vmstat_update(struct work_struct *w)
2021	{
2022	if (refresh_cpu_vm_stats(do_pagesets: true)) {
2023	/*
2024	* Counters were updated so we expect more updates
2025	* to occur in the future. Keep on running the
2026	* update worker thread.
2027	*/
2028	queue_delayed_work_on(smp_processor_id(), wq: mm_percpu_wq,
2029	this_cpu_ptr(&vmstat_work),
2030	delay: round_jiffies_relative(j: sysctl_stat_interval));
2031	}
2032	}
2033
2034	/*
2035	* Check if the diffs for a certain cpu indicate that
2036	* an update is needed.
2037	*/
2038	static bool need_update(int cpu)
2039	{
2040	pg_data_t *last_pgdat = NULL;
2041	struct zone *zone;
2042
2043	for_each_populated_zone(zone) {
2044	struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2045	struct per_cpu_nodestat *n;
2046
2047	/*
2048	* The fast way of checking if there are any vmstat diffs.
2049	*/
2050	if (memchr_inv(p: pzstats->vm_stat_diff, c: `0`, size: sizeof(pzstats->vm_stat_diff)))
2051	return true;
2052
2053	if (last_pgdat == zone->zone_pgdat)
2054	continue;
2055	last_pgdat = zone->zone_pgdat;
2056	n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
2057	if (memchr_inv(p: n->vm_node_stat_diff, c: `0`, size: sizeof(n->vm_node_stat_diff)))
2058	return true;
2059	}
2060	return false;
2061	}
2062
2063	/*
2064	* Switch off vmstat processing and then fold all the remaining differentials
2065	* until the diffs stay at zero. The function is used by NOHZ and can only be
2066	* invoked when tick processing is not active.
2067	*/
2068	void quiet_vmstat(void)
2069	{
2070	if (system_state != SYSTEM_RUNNING)
2071	return;
2072
2073	if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
2074	return;
2075
2076	if (!need_update(smp_processor_id()))
2077	return;
2078
2079	/*
2080	* Just refresh counters and do not care about the pending delayed
2081	* vmstat_update. It doesn't fire that often to matter and canceling
2082	* it would be too expensive from this path.
2083	* vmstat_shepherd will take care about that for us.
2084	*/
2085	refresh_cpu_vm_stats(do_pagesets: false);
2086	}
2087
2088	/*
2089	* Shepherd worker thread that checks the
2090	* differentials of processors that have their worker
2091	* threads for vm statistics updates disabled because of
2092	* inactivity.
2093	*/
2094	static void vmstat_shepherd(struct work_struct *w);
2095
2096	static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
2097
2098	static void vmstat_shepherd(struct work_struct *w)
2099	{
2100	int cpu;
2101
2102	cpus_read_lock();
2103	/ Check processors whose vmstat worker threads have been disabled /
2104	for_each_online_cpu(cpu) {
2105	struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
2106
2107	/*
2108	* In kernel users of vmstat counters either require the precise value and
2109	* they are using zone_page_state_snapshot interface or they can live with
2110	* an imprecision as the regular flushing can happen at arbitrary time and
2111	* cumulative error can grow (see calculate_normal_threshold).
2112	*
2113	* From that POV the regular flushing can be postponed for CPUs that have
2114	* been isolated from the kernel interference without critical
2115	* infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
2116	* for all isolated CPUs to avoid interference with the isolated workload.
2117	*/
2118	if (cpu_is_isolated(cpu))
2119	continue;
2120
2121	if (!delayed_work_pending(dw) && need_update(cpu))
2122	queue_delayed_work_on(cpu, wq: mm_percpu_wq, work: dw, delay: `0`);
2123
2124	cond_resched();
2125	}
2126	cpus_read_unlock();
2127
2128	schedule_delayed_work(dwork: &shepherd,
2129	delay: round_jiffies_relative(j: sysctl_stat_interval));
2130	}
2131
2132	static void __init start_shepherd_timer(void)
2133	{
2134	int cpu;
2135
2136	for_each_possible_cpu(cpu) {
2137	INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
2138	vmstat_update);
2139
2140	/*
2141	* For secondary CPUs during CPU hotplug scenarios,
2142	* vmstat_cpu_online() will enable the work.
2143	* mm/vmstat:online enables and disables vmstat_work
2144	* symmetrically during CPU hotplug events.
2145	*/
2146	if (!cpu_online(cpu))
2147	disable_delayed_work_sync(dwork: &per_cpu(vmstat_work, cpu));
2148	}
2149
2150	schedule_delayed_work(dwork: &shepherd,
2151	delay: round_jiffies_relative(j: sysctl_stat_interval));
2152	}
2153
2154	static void __init init_cpu_node_state(void)
2155	{
2156	int node;
2157
2158	for_each_online_node(node) {
2159	if (!cpumask_empty(srcp: cpumask_of_node(node)))
2160	node_set_state(node, state: N_CPU);
2161	}
2162	}
2163
2164	static int vmstat_cpu_online(unsigned int cpu)
2165	{
2166	if (vmstat_late_init_done)
2167	refresh_zone_stat_thresholds();
2168
2169	if (!node_state(cpu_to_node(cpu), state: N_CPU)) {
2170	node_set_state(cpu_to_node(cpu), state: N_CPU);
2171	}
2172	enable_delayed_work(dwork: &per_cpu(vmstat_work, cpu));
2173
2174	return `0`;
2175	}
2176
2177	static int vmstat_cpu_down_prep(unsigned int cpu)
2178	{
2179	disable_delayed_work_sync(dwork: &per_cpu(vmstat_work, cpu));
2180	return `0`;
2181	}
2182
2183	static int vmstat_cpu_dead(unsigned int cpu)
2184	{
2185	const struct cpumask *node_cpus;
2186	int node;
2187
2188	node = cpu_to_node(cpu);
2189
2190	refresh_zone_stat_thresholds();
2191	node_cpus = cpumask_of_node(node);
2192	if (!cpumask_empty(srcp: node_cpus))
2193	return `0`;
2194
2195	node_clear_state(node, state: N_CPU);
2196
2197	return `0`;
2198	}
2199
2200	static int __init vmstat_late_init(void)
2201	{
2202	refresh_zone_stat_thresholds();
2203	vmstat_late_init_done = `1`;
2204
2205	return `0`;
2206	}
2207	late_initcall(vmstat_late_init);
2208	#endif
2209
#ifdef CONFIG_PROC_FS
/*
 * sysctl entries owned by vmstat; init_mm_internals() registers this table
 * under "vm".
 *
 *  stat_interval - sysctl_stat_interval, handled as an int in jiffies
 *  stat_refresh  - no backing data, handled entirely by vmstat_refresh()
 *  numa_stat     - sysctl_vm_numa_stat, with extra1/extra2 set to 0 and 1
 */
static const struct ctl_table vmstat_table[] = {
#ifdef CONFIG_SMP
	{
		.procname	= "stat_interval",
		.data		= &sysctl_stat_interval,
		.maxlen		= sizeof(sysctl_stat_interval),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "stat_refresh",
		.data		= NULL,
		.maxlen		= 0,
		.mode		= 0600,
		.proc_handler	= vmstat_refresh,
	},
#endif
#ifdef CONFIG_NUMA
	{
		.procname	= "numa_stat",
		.data		= &sysctl_vm_numa_stat,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= sysctl_vm_numa_stat_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif
};
#endif
2241
/* Workqueue allocated by init_mm_internals() under the name "mm_percpu_wq". */
struct workqueue_struct *mm_percpu_wq;
2243
2244	void __init init_mm_internals(void)
2245	{
2246	int ret __maybe_unused;
2247
2248	mm_percpu_wq = alloc_workqueue(fmt: "mm_percpu_wq", flags: WQ_MEM_RECLAIM, max_active: `0`);
2249
2250	#ifdef CONFIG_SMP
2251	ret = cpuhp_setup_state_nocalls(state: CPUHP_MM_VMSTAT_DEAD, name: "mm/vmstat:dead",
2252	NULL, teardown: vmstat_cpu_dead);
2253	if (ret < `0`)
2254	pr_err("vmstat: failed to register 'dead' hotplug state\n");
2255
2256	ret = cpuhp_setup_state_nocalls(state: CPUHP_AP_ONLINE_DYN, name: "mm/vmstat:online",
2257	startup: vmstat_cpu_online,
2258	teardown: vmstat_cpu_down_prep);
2259	if (ret < `0`)
2260	pr_err("vmstat: failed to register 'online' hotplug state\n");
2261
2262	cpus_read_lock();
2263	init_cpu_node_state();
2264	cpus_read_unlock();
2265
2266	start_shepherd_timer();
2267	#endif
2268	#ifdef CONFIG_PROC_FS
2269	proc_create_seq("buddyinfo", `0444`, NULL, &fragmentation_op);
2270	proc_create_seq("pagetypeinfo", `0400`, NULL, &pagetypeinfo_op);
2271	proc_create_seq("vmstat", `0444`, NULL, &vmstat_op);
2272	proc_create_seq("zoneinfo", `0444`, NULL, &zoneinfo_op);
2273	register_sysctl_init("vm", vmstat_table);
2274	#endif
2275	}
2276
2277	#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
2278
2279	/*
2280	* Return an index indicating how much of the available free memory is
2281	* unusable for an allocation of the requested size.
2282	*/
2283	static int unusable_free_index(unsigned int order,
2284	struct contig_page_info *info)
2285	{
2286	/ No free memory is interpreted as all free memory is unusable /
2287	if (info->free_pages == `0`)
2288	return `1000`;
2289
2290	/*
2291	* Index should be a value between 0 and 1. Return a value to 3
2292	* decimal places.
2293	*
2294	* 0 => no fragmentation
2295	* 1 => high fragmentation
2296	*/
2297	return div_u64(dividend: (info->free_pages - (info->free_blocks_suitable << order)) * `1000ULL`, divisor: info->free_pages);
2298
2299	}
2300
2301	static void unusable_show_print(struct seq_file *m,
2302	pg_data_t pgdat, struct* zone *zone)
2303	{
2304	unsigned int order;
2305	int index;
2306	struct contig_page_info info;
2307
2308	seq_printf(m, fmt: "Node %d, zone %8s ",
2309	pgdat->node_id,
2310	zone->name);
2311	for (order = `0`; order < NR_PAGE_ORDERS; ++order) {
2312	fill_contig_page_info(zone, suitable_order: order, info: &info);
2313	index = unusable_free_index(order, info: &info);
2314	seq_printf(m, fmt: "%d.%03d ", index / `1000`, index % `1000`);
2315	}
2316
2317	seq_putc(m, c: `'\n'`);
2318	}
2319
2320	/*
2321	* Display unusable free space index
2322	*
2323	* The unusable free space index measures how much of the available free
2324	* memory cannot be used to satisfy an allocation of a given size and is a
2325	* value between 0 and 1. The higher the value, the more of free memory is
2326	* unusable and by implication, the worse the external fragmentation is. This
2327	* can be expressed as a percentage by multiplying by 100.
2328	*/
2329	static int unusable_show(struct seq_file m, void* *arg)
2330	{
2331	pg_data_t pgdat = (pg_data_t )arg;
2332
2333	/ check memoryless node /
2334	if (!node_state(node: pgdat->node_id, state: N_MEMORY))
2335	return `0`;
2336
2337	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: false, print: unusable_show_print);
2338
2339	return `0`;
2340	}
2341
2342	static const struct seq_operations unusable_sops = {
2343	.start = frag_start,
2344	.next = frag_next,
2345	.stop = frag_stop,
2346	.show = unusable_show,
2347	};
2348
2349	DEFINE_SEQ_ATTRIBUTE(unusable);
2350
2351	static void extfrag_show_print(struct seq_file *m,
2352	pg_data_t pgdat, struct* zone *zone)
2353	{
2354	unsigned int order;
2355	int index;
2356
2357	/ Alloc on stack as interrupts are disabled for zone walk /
2358	struct contig_page_info info;
2359
2360	seq_printf(m, fmt: "Node %d, zone %8s ",
2361	pgdat->node_id,
2362	zone->name);
2363	for (order = `0`; order < NR_PAGE_ORDERS; ++order) {
2364	fill_contig_page_info(zone, suitable_order: order, info: &info);
2365	index = __fragmentation_index(order, info: &info);
2366	seq_printf(m, fmt: "%2d.%03d ", index / `1000`, index % `1000`);
2367	}
2368
2369	seq_putc(m, c: `'\n'`);
2370	}
2371
2372	/*
2373	* Display fragmentation index for orders that allocations would fail for
2374	*/
2375	static int extfrag_show(struct seq_file m, void* *arg)
2376	{
2377	pg_data_t pgdat = (pg_data_t )arg;
2378
2379	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: false, print: extfrag_show_print);
2380
2381	return `0`;
2382	}
2383
2384	static const struct seq_operations extfrag_sops = {
2385	.start = frag_start,
2386	.next = frag_next,
2387	.stop = frag_stop,
2388	.show = extfrag_show,
2389	};
2390
2391	DEFINE_SEQ_ATTRIBUTE(extfrag);
2392
2393	static int __init extfrag_debug_init(void)
2394	{
2395	struct dentry *extfrag_debug_root;
2396
2397	extfrag_debug_root = debugfs_create_dir(name: "extfrag", NULL);
2398
2399	debugfs_create_file("unusable_index", `0444`, extfrag_debug_root, NULL,
2400	&unusable_fops);
2401
2402	debugfs_create_file("extfrag_index", `0444`, extfrag_debug_root, NULL,
2403	&extfrag_fops);
2404
2405	return `0`;
2406	}
2407
2408	module_init(extfrag_debug_init);
2409
2410	#endif
2411

source code of linux/mm/vmstat.c