page_reporting.c source code [linux/mm/page_reporting.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/mm.h>
3	#include <linux/mmzone.h>
4	#include <linux/page_reporting.h>
5	#include <linux/gfp.h>
6	#include <linux/export.h>
7	#include <linux/module.h>
8	#include <linux/delay.h>
9	#include <linux/scatterlist.h>
10
11	#include "page_reporting.h"
12	#include "internal.h"
13
/*
 * Lowest buddy order that page reporting will process.
 *
 * Initialize to an unsupported value: -1 converts to UINT_MAX in this
 * unsigned variable and acts as the "not configured" sentinel that
 * page_reporting_register() checks before selecting a real order.
 */
unsigned int page_reporting_order = -1;

17	static int page_order_update_notify(const char val, const* struct kernel_param *kp)
18	{
19	/*
20	* If param is set beyond this limit, order is set to default
21	* pageblock_order value
22	*/
23	return param_set_uint_minmax(val, kp, min: `0`, MAX_ORDER);
24	}
25
26	static const struct kernel_param_ops page_reporting_param_ops = {
27	.set = &page_order_update_notify,
28	/*
29	* For the get op, use param_get_int instead of param_get_uint.
30	* This is to make sure that when unset the initialized value of
31	* -1 is shown correctly
32	*/
33	.get = &param_get_int,
34	};
35
36	module_param_cb(page_reporting_order, &page_reporting_param_ops,
37	&page_reporting_order, `0644`);
38	MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");
39
40	/*
41	* This symbol is also a kernel parameter. Export the page_reporting_order
42	* symbol so that other drivers can access it to control order values without
43	* having to introduce another configurable parameter. Only one driver can
44	* register with the page_reporting driver for the service, so we have just
45	* one control parameter for the use case(which can be accessed in both
46	* drivers)
47	*/
48	EXPORT_SYMBOL_GPL(page_reporting_order);
49
50	#define PAGE_REPORTING_DELAY (2 * HZ)
51	static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
52
/*
 * Values held in prdev->state.
 *
 * IDLE must stay 0: __page_reporting_request() documents that it tests
 * against 0 to detect the idle state.
 */
enum {
	PAGE_REPORTING_IDLE = 0,	/* no pass pending or running */
	PAGE_REPORTING_REQUESTED,	/* a pass has been asked for */
	PAGE_REPORTING_ACTIVE		/* the worker is running a pass */
};

59	/ request page reporting /
60	static void
61	__page_reporting_request(struct page_reporting_dev_info *prdev)
62	{
63	unsigned int state;
64
65	/ Check to see if we are in desired state /
66	state = atomic_read(v: &prdev->state);
67	if (state == PAGE_REPORTING_REQUESTED)
68	return;
69
70	/*
71	* If reporting is already active there is nothing we need to do.
72	* Test against 0 as that represents PAGE_REPORTING_IDLE.
73	*/
74	state = atomic_xchg(v: &prdev->state, new: PAGE_REPORTING_REQUESTED);
75	if (state != PAGE_REPORTING_IDLE)
76	return;
77
78	/*
79	* Delay the start of work to allow a sizable queue to build. For
80	* now we are limiting this to running no more than once every
81	* couple of seconds.
82	*/
83	schedule_delayed_work(dwork: &prdev->work, PAGE_REPORTING_DELAY);
84	}
85
86	/ notify prdev of free page reporting request /
87	void __page_reporting_notify(void)
88	{
89	struct page_reporting_dev_info *prdev;
90
91	/*
92	* We use RCU to protect the pr_dev_info pointer. In almost all
93	* cases this should be present, however in the unlikely case of
94	* a shutdown this will be NULL and we should exit.
95	*/
96	rcu_read_lock();
97	prdev = rcu_dereference(pr_dev_info);
98	if (likely(prdev))
99	__page_reporting_request(prdev);
100
101	rcu_read_unlock();
102	}
103
104	static void
105	page_reporting_drain(struct page_reporting_dev_info *prdev,
106	struct scatterlist sgl, unsigned* int nents, bool reported)
107	{
108	struct scatterlist *sg = sgl;
109
110	/*
111	* Drain the now reported pages back into their respective
112	* free lists/areas. We assume at least one page is populated.
113	*/
114	do {
115	struct page *page = sg_page(sg);
116	int mt = get_pageblock_migratetype(page);
117	unsigned int order = get_order(size: sg->length);
118
119	__putback_isolated_page(page, order, mt);
120
121	/ If the pages were not reported due to error skip flagging /
122	if (!reported)
123	continue;
124
125	/*
126	* If page was not comingled with another page we can
127	* consider the result to be "reported" since the page
128	* hasn't been modified, otherwise we will need to
129	* report on the new larger page when we make our way
130	* up to that higher order.
131	*/
132	if (PageBuddy(page) && buddy_order(page) == order)
133	__SetPageReported(page);
134	} while ((sg = sg_next(sg)));
135
136	/ reinitialize scatterlist now that it is empty /
137	sg_init_table(sgl, nents);
138	}
139
140	/*
141	* The page reporting cycle consists of 4 stages, fill, report, drain, and
142	* idle. We will cycle through the first 3 stages until we cannot obtain a
143	* full scatterlist of pages, in that case we will switch to idle.
144	*/
145	static int
146	page_reporting_cycle(struct page_reporting_dev_info prdev, struct* zone *zone,
147	unsigned int order, unsigned int mt,
148	struct scatterlist sgl, unsigned* int *offset)
149	{
150	struct free_area *area = &zone->free_area[order];
151	struct list_head *list = &area->free_list[mt];
152	unsigned int page_len = PAGE_SIZE << order;
153	struct page page, next;
154	long budget;
155	int err = `0`;
156
157	/*
158	* Perform early check, if free area is empty there is
159	* nothing to process so we can skip this free_list.
160	*/
161	if (list_empty(head: list))
162	return err;
163
164	spin_lock_irq(lock: &zone->lock);
165
166	/*
167	* Limit how many calls we will be making to the page reporting
168	* device for this list. By doing this we avoid processing any
169	* given list for too long.
170	*
171	* The current value used allows us enough calls to process over a
172	* sixteenth of the current list plus one additional call to handle
173	* any pages that may have already been present from the previous
174	* list processed. This should result in us reporting all pages on
175	* an idle system in about 30 seconds.
176	*
177	* The division here should be cheap since PAGE_REPORTING_CAPACITY
178	* should always be a power of 2.
179	*/
180	budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * `16`);
181
182	/ loop through free list adding unreported pages to sg list /
183	list_for_each_entry_safe(page, next, list, lru) {
184	/ We are going to skip over the reported pages. /
185	if (PageReported(page))
186	continue;
187
188	/*
189	* If we fully consumed our budget then update our
190	* state to indicate that we are requesting additional
191	* processing and exit this list.
192	*/
193	if (budget < `0`) {
194	atomic_set(v: &prdev->state, i: PAGE_REPORTING_REQUESTED);
195	next = page;
196	break;
197	}
198
199	/ Attempt to pull page from list and place in scatterlist /
200	if (*offset) {
201	if (!__isolate_free_page(page, order)) {
202	next = page;
203	break;
204	}
205
206	/ Add page to scatter list /
207	--(*offset);
208	sg_set_page(sg: &sgl[*offset], page, len: page_len, offset: `0`);
209
210	continue;
211	}
212
213	/*
214	* Make the first non-reported page in the free list
215	* the new head of the free list before we release the
216	* zone lock.
217	*/
218	if (!list_is_first(list: &page->lru, head: list))
219	list_rotate_to_front(list: &page->lru, head: list);
220
221	/ release lock before waiting on report processing /
222	spin_unlock_irq(lock: &zone->lock);
223
224	/ begin processing pages in local list /
225	err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
226
227	/ reset offset since the full list was reported /
228	*offset = PAGE_REPORTING_CAPACITY;
229
230	/ update budget to reflect call to report function /
231	budget--;
232
233	/ reacquire zone lock and resume processing /
234	spin_lock_irq(lock: &zone->lock);
235
236	/ flush reported pages from the sg list /
237	page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, reported: !err);
238
239	/*
240	* Reset next to first entry, the old next isn't valid
241	* since we dropped the lock to report the pages
242	*/
243	next = list_first_entry(list, struct page, lru);
244
245	/ exit on error /
246	if (err)
247	break;
248	}
249
250	/ Rotate any leftover pages to the head of the freelist /
251	if (!list_entry_is_head(next, list, lru) && !list_is_first(list: &next->lru, head: list))
252	list_rotate_to_front(list: &next->lru, head: list);
253
254	spin_unlock_irq(lock: &zone->lock);
255
256	return err;
257	}
258
259	static int
260	page_reporting_process_zone(struct page_reporting_dev_info *prdev,
261	struct scatterlist sgl, struct* zone *zone)
262	{
263	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
264	unsigned long watermark;
265	int err = `0`;
266
267	/ Generate minimum watermark to be able to guarantee progress /
268	watermark = low_wmark_pages(zone) +
269	(PAGE_REPORTING_CAPACITY << page_reporting_order);
270
271	/*
272	* Cancel request if insufficient free memory or if we failed
273	* to allocate page reporting statistics for the zone.
274	*/
275	if (!zone_watermark_ok(z: zone, order: `0`, mark: watermark, highest_zoneidx: `0`, ALLOC_CMA))
276	return err;
277
278	/ Process each free list starting from lowest order/mt /
279	for (order = page_reporting_order; order <= MAX_ORDER; order++) {
280	for (mt = `0`; mt < MIGRATE_TYPES; mt++) {
281	/ We do not pull pages from the isolate free list /
282	if (is_migrate_isolate(migratetype: mt))
283	continue;
284
285	err = page_reporting_cycle(prdev, zone, order, mt,
286	sgl, offset: &offset);
287	if (err)
288	return err;
289	}
290	}
291
292	/ report the leftover pages before going idle /
293	leftover = PAGE_REPORTING_CAPACITY - offset;
294	if (leftover) {
295	sgl = &sgl[offset];
296	err = prdev->report(prdev, sgl, leftover);
297
298	/ flush any remaining pages out from the last report /
299	spin_lock_irq(lock: &zone->lock);
300	page_reporting_drain(prdev, sgl, nents: leftover, reported: !err);
301	spin_unlock_irq(lock: &zone->lock);
302	}
303
304	return err;
305	}
306
307	static void page_reporting_process(struct work_struct *work)
308	{
309	struct delayed_work *d_work = to_delayed_work(work);
310	struct page_reporting_dev_info *prdev =
311	container_of(d_work, struct page_reporting_dev_info, work);
312	int err = `0`, state = PAGE_REPORTING_ACTIVE;
313	struct scatterlist *sgl;
314	struct zone *zone;
315
316	/*
317	* Change the state to "Active" so that we can track if there is
318	* anyone requests page reporting after we complete our pass. If
319	* the state is not altered by the end of the pass we will switch
320	* to idle and quit scheduling reporting runs.
321	*/
322	atomic_set(v: &prdev->state, i: state);
323
324	/ allocate scatterlist to store pages being reported on /
325	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, size: sizeof(*sgl), GFP_KERNEL);
326	if (!sgl)
327	goto err_out;
328
329	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
330
331	for_each_zone(zone) {
332	err = page_reporting_process_zone(prdev, sgl, zone);
333	if (err)
334	break;
335	}
336
337	kfree(objp: sgl);
338	err_out:
339	/*
340	* If the state has reverted back to requested then there may be
341	* additional pages to be processed. We will defer for 2s to allow
342	* more pages to accumulate.
343	*/
344	state = atomic_cmpxchg(v: &prdev->state, old: state, new: PAGE_REPORTING_IDLE);
345	if (state == PAGE_REPORTING_REQUESTED)
346	schedule_delayed_work(dwork: &prdev->work, PAGE_REPORTING_DELAY);
347	}
348
/* Serializes page_reporting_register() and page_reporting_unregister() */
static DEFINE_MUTEX(page_reporting_mutex);
/* Static key switched on by page_reporting_register() */
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

352	int page_reporting_register(struct page_reporting_dev_info *prdev)
353	{
354	int err = `0`;
355
356	mutex_lock(&page_reporting_mutex);
357
358	/ nothing to do if already in use /
359	if (rcu_dereference_protected(pr_dev_info,
360	lockdep_is_held(&page_reporting_mutex))) {
361	err = -EBUSY;
362	goto err_out;
363	}
364
365	/*
366	* If the page_reporting_order value is not set, we check if
367	* an order is provided from the driver that is performing the
368	* registration. If that is not provided either, we default to
369	* pageblock_order.
370	*/
371
372	if (page_reporting_order == -`1`) {
373	if (prdev->order > `0` && prdev->order <= MAX_ORDER)
374	page_reporting_order = prdev->order;
375	else
376	page_reporting_order = pageblock_order;
377	}
378
379	/ initialize state and work structures /
380	atomic_set(v: &prdev->state, i: PAGE_REPORTING_IDLE);
381	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
382
383	/ Begin initial flush of zones /
384	__page_reporting_request(prdev);
385
386	/ Assign device to allow notifications /
387	rcu_assign_pointer(pr_dev_info, prdev);
388
389	/ enable page reporting notification /
390	if (!static_key_enabled(&page_reporting_enabled)) {
391	static_branch_enable(&page_reporting_enabled);
392	pr_info("Free page reporting enabled\n");
393	}
394	err_out:
395	mutex_unlock(lock: &page_reporting_mutex);
396
397	return err;
398	}
399	EXPORT_SYMBOL_GPL(page_reporting_register);
400
401	void page_reporting_unregister(struct page_reporting_dev_info *prdev)
402	{
403	mutex_lock(&page_reporting_mutex);
404
405	if (prdev == rcu_dereference_protected(pr_dev_info,
406	lockdep_is_held(&page_reporting_mutex))) {
407	/ Disable page reporting notification /
408	RCU_INIT_POINTER(pr_dev_info, NULL);
409	synchronize_rcu();
410
411	/ Flush any existing work, and lock it out /
412	cancel_delayed_work_sync(dwork: &prdev->work);
413	}
414
415	mutex_unlock(lock: &page_reporting_mutex);
416	}
417	EXPORT_SYMBOL_GPL(page_reporting_unregister);
418

source code of linux/mm/page_reporting.c