// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/page_io.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95,
 *  Asynchronous swapping added 30.12.95. Stephen Tweedie
 *  Removed race in async swapping. 14.4.1996. Bruno Haible
 *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
 *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
 */

#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <linux/delayacct.h>
#include <linux/zswap.h>
#include "swap.h"

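/*
 * Completion handling shared by the sync and async swap write paths:
 * on error, redirty the folio so it cannot be lost, then end writeback.
 */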
static void __end_swap_bio_write(struct bio *bio)
{
	struct folio *folio = bio_first_folio_all(bio);

	if (bio->bi_status) {
		/*
		 * We failed to write the page out to swap-space.
		 * Re-dirty the page in order to avoid it being reclaimed.
		 * Also print a dire warning that things will go BAD (tm)
		 * very quickly.
		 *
		 * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
		 */
		folio_mark_dirty(folio);
		pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
				     (unsigned long long)bio->bi_iter.bi_sector);
		folio_clear_reclaim(folio);
	}
	folio_end_writeback(folio);
}

static void end_swap_bio_write(struct bio *bio)
{
	__end_swap_bio_write(bio);
	bio_put(bio);
}

static void __end_swap_bio_read(struct bio *bio)
{
	struct folio *folio = bio_first_folio_all(bio);

	if (bio->bi_status) {
		pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
				     (unsigned long long)bio->bi_iter.bi_sector);
	} else {
		folio_mark_uptodate(folio);
	}
	folio_unlock(folio);
}

static void end_swap_bio_read(struct bio *bio)
{
	__end_swap_bio_read(bio);
	bio_put(bio);
}

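/*
 * Walk the swap file block by block via bmap() and build the extent
 * tree describing where each PAGE_SIZE-sized, PAGE_SIZE-aligned run of
 * blocks lives on disk.  Fails with -EINVAL if the file has holes.
 */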
int generic_swapfile_activate(struct swap_info_struct *sis,
			      struct file *swap_file,
			      sector_t *span)
{
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	int nr_extents = 0;
	int ret;

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Map all the blocks into the extent tree.  This code doesn't try
	 * to be very smart.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		cond_resched();

		first_block = probe_block;
		ret = bmap(inode, &first_block);
		if (ret || !first_block)
			goto bad_bmap;

		/*
		 * It must be PAGE_SIZE aligned on-disk
		 */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = probe_block + block_in_page;
			ret = bmap(inode, &block);
			if (ret || !block)
				goto bad_bmap;

			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
				goto reprobe;
			}
		}

		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* exclude the header page */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/*
		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
		 */
		ret = add_swap_extent(sis, page_no, 1, first_block);
		if (ret < 0)
			goto out;
		nr_extents += ret;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = nr_extents;
	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* force Empty message */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
out:
	return ret;
bad_bmap:
	pr_err("swapon: swapfile has holes\n");
	ret = -EINVAL;
	goto out;
}

/*
 * We may have stale swap cache pages in memory: notice
 * them here and get rid of the unnecessary final write.
 */
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
	struct folio *folio = page_folio(page);
	int ret;

	if (folio_free_swap(folio)) {
		folio_unlock(folio);
		return 0;
	}
	/*
	 * Arch code may have to preserve more data than just the page
	 * contents, e.g. memory tags.
	 */
	ret = arch_prepare_to_swap(&folio->page);
	if (ret) {
		folio_mark_dirty(folio);
		folio_unlock(folio);
		return ret;
	}
	if (zswap_store(folio)) {
		folio_start_writeback(folio);
		folio_unlock(folio);
		folio_end_writeback(folio);
		return 0;
	}
	__swap_writepage(&folio->page, wbc);
	return 0;
}

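/*
 * Account a swap-out: bump PSWPOUT by the number of subpages, plus the
 * THP counters when a PMD-sized folio goes out in one piece.
 */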
static inline void count_swpout_vm_event(struct folio *folio)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (unlikely(folio_test_pmd_mappable(folio))) {
		count_memcg_folio_events(folio, THP_SWPOUT, 1);
		count_vm_event(THP_SWPOUT);
	}
#endif
	count_vm_events(PSWPOUT, folio_nr_pages(folio));
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
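/*
 * Charge the swap I/O to the block cgroup matching the folio's memory
 * cgroup, so it is throttled against the right group.
 */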
static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
{
	struct cgroup_subsys_state *css;
	struct mem_cgroup *memcg;

	memcg = folio_memcg(folio);
	if (!memcg)
		return;

	rcu_read_lock();
	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
	bio_associate_blkg_from_css(bio, css);
	rcu_read_unlock();
}
#else
#define bio_associate_blkg_from_page(bio, folio)	do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */

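/*
 * A swap_iocb batches up to SWAP_CLUSTER_MAX bio_vec segments of
 * filesystem-backed (SWP_FS_OPS) swap I/O into a single ->swap_rw()
 * call.  Callers pass a "plug" pointer so adjacent requests can be
 * merged before submission.
 */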
struct swap_iocb {
	struct kiocb		iocb;
	struct bio_vec		bvec[SWAP_CLUSTER_MAX];
	int			pages;
	int			len;
};
static mempool_t *sio_pool;

int sio_pool_init(void)
{
	if (!sio_pool) {
		mempool_t *pool = mempool_create_kmalloc_pool(
			SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
		if (cmpxchg(&sio_pool, NULL, pool))
			mempool_destroy(pool);
	}
	if (!sio_pool)
		return -ENOMEM;
	return 0;
}

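/*
 * Completion callback for a batched SWP_FS_OPS write: on a short
 * write, redirty every page in the batch so nothing is lost, then end
 * writeback on all of them and recycle the swap_iocb.
 */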
static void sio_write_complete(struct kiocb *iocb, long ret)
{
	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
	struct page *page = sio->bvec[0].bv_page;
	int p;

	if (ret != sio->len) {
		/*
		 * In the case of swap-over-nfs, this can be a
		 * temporary failure if the system has limited
		 * memory for allocating transmit buffers.
		 * Mark the pages dirty, avoid
		 * folio_rotate_reclaimable and rate-limit the
		 * messages, but do not flag PageError as in the
		 * normal direct-to-bio case, since the failure
		 * may be temporary.
		 */
		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
				   ret, page_file_offset(page));
		for (p = 0; p < sio->pages; p++) {
			page = sio->bvec[p].bv_page;
			set_page_dirty(page);
			ClearPageReclaim(page);
		}
	}

	for (p = 0; p < sio->pages; p++)
		end_page_writeback(sio->bvec[p].bv_page);

	mempool_free(sio, sio_pool);
}

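/*
 * Write a page to a filesystem-backed swap file.  The page is appended
 * to the caller's plugged swap_iocb when it is contiguous with the
 * pending batch; otherwise the old batch is submitted and a new one
 * started.  Unplugged callers get an immediate submission.
 */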
static void swap_writepage_fs(struct page *page, struct writeback_control *wbc)
{
	struct swap_iocb *sio = NULL;
	struct swap_info_struct *sis = page_swap_info(page);
	struct file *swap_file = sis->swap_file;
	loff_t pos = page_file_offset(page);

	count_swpout_vm_event(page_folio(page));
	set_page_writeback(page);
	unlock_page(page);
	if (wbc->swap_plug)
		sio = *wbc->swap_plug;
	if (sio) {
		if (sio->iocb.ki_filp != swap_file ||
		    sio->iocb.ki_pos + sio->len != pos) {
			swap_write_unplug(sio);
			sio = NULL;
		}
	}
	if (!sio) {
		sio = mempool_alloc(sio_pool, GFP_NOIO);
		init_sync_kiocb(&sio->iocb, swap_file);
		sio->iocb.ki_complete = sio_write_complete;
		sio->iocb.ki_pos = pos;
		sio->pages = 0;
		sio->len = 0;
	}
	bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
	sio->len += thp_size(page);
	sio->pages += 1;
	if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
		swap_write_unplug(sio);
		sio = NULL;
	}
	if (wbc->swap_plug)
		*wbc->swap_plug = sio;
}

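/*
 * Synchronous write to a block device: build the bio on the stack,
 * wait for it to complete and run the completion handler inline.
 */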
static void swap_writepage_bdev_sync(struct page *page,
		struct writeback_control *wbc, struct swap_info_struct *sis)
{
	struct bio_vec bv;
	struct bio bio;
	struct folio *folio = page_folio(page);

	bio_init(&bio, sis->bdev, &bv, 1,
		 REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
	bio.bi_iter.bi_sector = swap_page_sector(page);
	__bio_add_page(&bio, page, thp_size(page), 0);

	bio_associate_blkg_from_page(&bio, folio);
	count_swpout_vm_event(folio);

	folio_start_writeback(folio);
	folio_unlock(folio);

	submit_bio_wait(&bio);
	__end_swap_bio_write(&bio);
}

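/*
 * Asynchronous write to a block device: the bio is heap-allocated and
 * end_swap_bio_write() finishes writeback from the completion path.
 */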
static void swap_writepage_bdev_async(struct page *page,
		struct writeback_control *wbc, struct swap_info_struct *sis)
{
	struct bio *bio;
	struct folio *folio = page_folio(page);

	bio = bio_alloc(sis->bdev, 1,
			REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
			GFP_NOIO);
	bio->bi_iter.bi_sector = swap_page_sector(page);
	bio->bi_end_io = end_swap_bio_write;
	__bio_add_page(bio, page, thp_size(page), 0);

	bio_associate_blkg_from_page(bio, folio);
	count_swpout_vm_event(folio);
	folio_start_writeback(folio);
	folio_unlock(folio);
	submit_bio(bio);
}

void __swap_writepage(struct page *page, struct writeback_control *wbc)
{
	struct swap_info_struct *sis = page_swap_info(page);

	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	/*
	 * ->flags can be updated non-atomically (scan_swap_map_slots),
	 * but that will never affect SWP_FS_OPS, so the data_race
	 * is safe.
	 */
	if (data_race(sis->flags & SWP_FS_OPS))
		swap_writepage_fs(page, wbc);
	else if (sis->flags & SWP_SYNCHRONOUS_IO)
		swap_writepage_bdev_sync(page, wbc, sis);
	else
		swap_writepage_bdev_async(page, wbc, sis);
}

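/*
 * Submit a plugged batch of writes via the filesystem's ->swap_rw().
 * A synchronous return (anything but -EIOCBQUEUED) means ->ki_complete
 * will not be called, so run the completion handler directly.
 */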
void swap_write_unplug(struct swap_iocb *sio)
{
	struct iov_iter from;
	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
	int ret;

	iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
	if (ret != -EIOCBQUEUED)
		sio_write_complete(&sio->iocb, ret);
}

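/*
 * Completion callback for a batched SWP_FS_OPS read: mark every folio
 * uptodate only if the full length was transferred, then unlock them
 * all and recycle the swap_iocb.
 */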
static void sio_read_complete(struct kiocb *iocb, long ret)
{
	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
	int p;

	if (ret == sio->len) {
		for (p = 0; p < sio->pages; p++) {
			struct folio *folio = page_folio(sio->bvec[p].bv_page);

			folio_mark_uptodate(folio);
			folio_unlock(folio);
		}
		count_vm_events(PSWPIN, sio->pages);
	} else {
		for (p = 0; p < sio->pages; p++) {
			struct folio *folio = page_folio(sio->bvec[p].bv_page);

			folio_unlock(folio);
		}
		pr_alert_ratelimited("Read-error on swap-device\n");
	}
	mempool_free(sio, sio_pool);
}

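/*
 * Read a page from a filesystem-backed swap file, batching contiguous
 * requests on the caller's plug in the same way as swap_writepage_fs().
 */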
static void swap_readpage_fs(struct page *page,
			     struct swap_iocb **plug)
{
	struct swap_info_struct *sis = page_swap_info(page);
	struct swap_iocb *sio = NULL;
	loff_t pos = page_file_offset(page);

	if (plug)
		sio = *plug;
	if (sio) {
		if (sio->iocb.ki_filp != sis->swap_file ||
		    sio->iocb.ki_pos + sio->len != pos) {
			swap_read_unplug(sio);
			sio = NULL;
		}
	}
	if (!sio) {
		sio = mempool_alloc(sio_pool, GFP_KERNEL);
		init_sync_kiocb(&sio->iocb, sis->swap_file);
		sio->iocb.ki_pos = pos;
		sio->iocb.ki_complete = sio_read_complete;
		sio->pages = 0;
		sio->len = 0;
	}
	bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
	sio->len += thp_size(page);
	sio->pages += 1;
	if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
		swap_read_unplug(sio);
		sio = NULL;
	}
	if (plug)
		*plug = sio;
}

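/*
 * Synchronous read from a block device, with the bio built on the
 * stack and completed inline once submit_bio_wait() returns.
 */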
static void swap_readpage_bdev_sync(struct page *page,
		struct swap_info_struct *sis)
{
	struct bio_vec bv;
	struct bio bio;

	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = swap_page_sector(page);
	__bio_add_page(&bio, page, thp_size(page), 0);
	/*
	 * Keep this task valid during swap readpage because the oom killer may
	 * attempt to access it in the page fault retry time check.
	 */
	get_task_struct(current);
	count_vm_event(PSWPIN);
	submit_bio_wait(&bio);
	__end_swap_bio_read(&bio);
	put_task_struct(current);
}

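/*
 * Asynchronous read from a block device; end_swap_bio_read() marks the
 * folio uptodate and unlocks it from the completion path.
 */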
static void swap_readpage_bdev_async(struct page *page,
		struct swap_info_struct *sis)
{
	struct bio *bio;

	bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
	bio->bi_iter.bi_sector = swap_page_sector(page);
	bio->bi_end_io = end_swap_bio_read;
	__bio_add_page(bio, page, thp_size(page), 0);
	count_vm_event(PSWPIN);
	submit_bio(bio);
}

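/*
 * Read a page of swap in, trying zswap first and falling back to the
 * SWP_FS_OPS, synchronous-bdev or asynchronous-bdev path as
 * appropriate.  Submission time is accounted as a memory stall when
 * the folio is part of the workingset.
 */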
void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
{
	struct folio *folio = page_folio(page);
	struct swap_info_struct *sis = page_swap_info(page);
	bool workingset = folio_test_workingset(folio);
	unsigned long pflags;
	bool in_thrashing;

	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);

	/*
	 * Count submission time as memory stall and delay. When the device
	 * is congested, or the submitting cgroup IO-throttled, submission
	 * can be a significant part of overall IO time.
	 */
	if (workingset) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
	}
	delayacct_swapin_start();

	if (zswap_load(folio)) {
		folio_mark_uptodate(folio);
		folio_unlock(folio);
	} else if (data_race(sis->flags & SWP_FS_OPS)) {
		swap_readpage_fs(page, plug);
	} else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
		swap_readpage_bdev_sync(page, sis);
	} else {
		swap_readpage_bdev_async(page, sis);
	}

	if (workingset) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
	delayacct_swapin_end();
}

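/*
 * Submit a plugged batch of reads via the filesystem's ->swap_rw(),
 * completing inline when the call does not return -EIOCBQUEUED.
 */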
void __swap_read_unplug(struct swap_iocb *sio)
{
	struct iov_iter from;
	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
	int ret;

	iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
	if (ret != -EIOCBQUEUED)
		sio_read_complete(&sio->iocb, ret);
}