// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * zswap.c - zswap driver file
 *
 * zswap is a cache that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/mempolicy.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>

#include "swap.h"
#include "internal.h"

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */
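/*
 * All of these counters, along with the totals above, are exported
 * read-only through debugfs (see zswap_debugfs_init() below), normally
 * under /sys/kernel/debug/zswap/.
 */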

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Store failed due to compression algorithm failure */
static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;

/*********************************
* tunables
**********************************/

#define ZSWAP_PARAM_UNSET ""

static int zswap_setup(void);

/* Enable/disable zswap */
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static const struct kernel_param_ops zswap_enabled_param_ops = {
	.set =		zswap_enabled_param_set,
	.get =		param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static const struct kernel_param_ops zswap_compressor_param_ops = {
	.set =		zswap_compressor_param_set,
	.get =		param_get_charp,
	.free =		param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_zpool_param_ops = {
	.set =		zswap_zpool_param_set,
	.get =		param_get_charp,
	.free =		param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
		   uint, 0644);

/*
 * Enable/disable handling same-value filled pages (enabled by default).
 * If disabled every page is considered non-same-value filled.
 */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
		   bool, 0644);

/* Enable/disable handling non-same-value filled pages (enabled by default) */
static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
		   bool, 0644);

static bool zswap_exclusive_loads_enabled = IS_ENABLED(
		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
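
/*
 * All of the module parameters above are also writable at runtime via
 * sysfs, e.g. (paths assume the standard module-param sysfs layout):
 *
 *	echo 25 > /sys/module/zswap/parameters/max_pool_percent
 *	echo lz4 > /sys/module/zswap/parameters/compressor
 *
 * or can be set at boot time on the kernel command line, e.g.
 * zswap.enabled=1.
 */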

/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32

/*********************************
* data structures
**********************************/

struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *dstmem;
	struct mutex *mutex;
};

/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback; in that case it
 * needs to be verified that it's still valid in the tree.
 */
struct zswap_pool {
	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct kref kref;
	struct list_head list;
	struct work_struct release_work;
	struct work_struct shrink_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
	struct list_head lru;
	spinlock_t lru_lock;
};

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into the red-black tree for the appropriate
 *          swap type
 * swpentry - associated swap entry, the offset indexes into the red-black
 *            tree
 * refcount - the number of outstanding references to the entry.  This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression.  For a same-value filled page length is 0, and
 *          both pool and lru are invalid and must be ignored.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * value - the value that a same-value filled page is filled with
 * objcg - the obj_cgroup that the compressed memory is charged to
 * lru - handle to the pool's lru used to evict pages.
 */
struct zswap_entry {
	struct rb_node rbnode;
	swp_entry_t swpentry;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	union {
		unsigned long handle;
		unsigned long value;
	};
	struct obj_cgroup *objcg;
	struct list_head lru;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
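/*
 * Note that there is one tree per swap device, indexed by swap type, so
 * lookups for entries on different devices never contend on the same
 * tree lock.
 */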

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

enum zswap_init_type {
	ZSWAP_UNINIT,
	ZSWAP_INIT_SUCCEED,
	ZSWAP_INIT_FAILED
};

static enum zswap_init_type zswap_init_state;

/* used to ensure the integrity of initialization */
static DEFINE_MUTEX(zswap_init_lock);

/* init completed, but couldn't create the initial pool */
static bool zswap_has_pool;

/*********************************
* helpers and fwd declarations
**********************************/

#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpools[0]))

static int zswap_writeback_entry(struct zswap_entry *entry,
				 struct zswap_tree *tree);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);

static bool zswap_is_full(void)
{
	return totalram_pages() * zswap_max_pool_percent / 100 <
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static bool zswap_can_accept(void)
{
	return totalram_pages() * zswap_accept_thr_percent / 100 *
				zswap_max_pool_percent / 100 >
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
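
/*
 * Worked example of the two thresholds above (illustrative numbers):
 * with 4M pages of RAM (16 GiB at 4 KiB/page) and the defaults of
 * max_pool_percent=20 and accept_thr_percent=90, the pool is "full"
 * once it exceeds 4M * 20/100 = ~800k pages-worth of data (~3.2 GiB),
 * and new stores are accepted again only after the pool shrinks below
 * 90% of that limit, i.e. ~720k pages (~2.88 GiB).
 */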

static void zswap_update_total_size(void)
{
	struct zswap_pool *pool;
	u64 total = 0;
	int i;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
			total += zpool_get_total_size(pool->zpools[i]);

	rcu_read_unlock();

	zswap_pool_total_size = total;
}

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
{
	struct zswap_entry *entry;
	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
	if (!entry)
		return NULL;
	entry->refcount = 1;
	RB_CLEAR_NODE(&entry->rbnode);
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;
	pgoff_t entry_offset;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		entry_offset = swp_offset(entry->swpentry);
		if (entry_offset > offset)
			node = node->rb_left;
		else if (entry_offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST.
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;
	pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		myentry_offset = swp_offset(myentry->swpentry);
		if (myentry_offset > entry_offset)
			link = &(*link)->rb_left;
		else if (myentry_offset < entry_offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}

static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	if (!RB_EMPTY_NODE(&entry->rbnode)) {
		rb_erase(&entry->rbnode, root);
		RB_CLEAR_NODE(&entry->rbnode);
		return true;
	}
	return false;
}

static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
{
	int i = 0;

	if (ZSWAP_NR_ZPOOLS > 1)
		i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));

	return entry->pool->zpools[i];
}
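/*
 * Note: hash_ptr() maps the entry's address onto one of the
 * ZSWAP_NR_ZPOOLS backing zpools, so concurrent stores spread their
 * allocations across 32 independent pools rather than contending on a
 * single zpool (the scalability motivation noted at the #define above).
 */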

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_entry *entry)
{
	if (entry->objcg) {
		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
		obj_cgroup_put(entry->objcg);
	}
	if (!entry->length)
		atomic_dec(&zswap_same_filled_pages);
	else {
		spin_lock(&entry->pool->lru_lock);
		list_del(&entry->lru);
		spin_unlock(&entry->pool->lru_lock);
		zpool_free(zswap_find_zpool(entry), entry->handle);
		zswap_pool_put(entry->pool);
	}
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_update_total_size();
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}

/*
 * Caller must hold the tree lock.
 * Frees the entry once nobody references it anymore; at that point the
 * entry must already have been removed from the tree.
 */
static void zswap_entry_put(struct zswap_tree *tree,
			    struct zswap_entry *entry)
{
	int refcount = --entry->refcount;

	WARN_ON_ONCE(refcount < 0);
	if (refcount == 0) {
		WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
		zswap_free_entry(entry);
	}
}

/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
				pgoff_t offset)
{
	struct zswap_entry *entry;

	entry = zswap_rb_search(root, offset);
	if (entry)
		zswap_entry_get(entry);

	return entry;
}

/*********************************
* per-cpu code
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);
/*
 * If users dynamically change the zpool type and compressor at runtime, i.e.
 * while zswap is running, zswap can have more than one zpool on one cpu, but
 * they share the same dstmem.  So we need this mutex to be per-cpu.
 */
static DEFINE_PER_CPU(struct mutex *, zswap_mutex);

static int zswap_dstmem_prepare(unsigned int cpu)
{
	struct mutex *mutex;
	u8 *dst;

	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
	if (!dst)
		return -ENOMEM;

	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
	if (!mutex) {
		kfree(dst);
		return -ENOMEM;
	}

	mutex_init(mutex);
	per_cpu(zswap_dstmem, cpu) = dst;
	per_cpu(zswap_mutex, cpu) = mutex;
	return 0;
}

static int zswap_dstmem_dead(unsigned int cpu)
{
	struct mutex *mutex;
	u8 *dst;

	mutex = per_cpu(zswap_mutex, cpu);
	kfree(mutex);
	per_cpu(zswap_mutex, cpu) = NULL;

	dst = per_cpu(zswap_dstmem, cpu);
	kfree(dst);
	per_cpu(zswap_dstmem, cpu) = NULL;

	return 0;
}

static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
	struct crypto_acomp *acomp;
	struct acomp_req *req;

	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
	if (IS_ERR(acomp)) {
		pr_err("could not alloc crypto acomp %s : %ld\n",
				pool->tfm_name, PTR_ERR(acomp));
		return PTR_ERR(acomp);
	}
	acomp_ctx->acomp = acomp;

	req = acomp_request_alloc(acomp_ctx->acomp);
	if (!req) {
		pr_err("could not alloc crypto acomp_request %s\n",
		       pool->tfm_name);
		crypto_free_acomp(acomp_ctx->acomp);
		return -ENOMEM;
	}
	acomp_ctx->req = req;

	crypto_init_wait(&acomp_ctx->wait);
	/*
	 * If the acomp backend is an asynchronous compressor, crypto_req_done()
	 * will wake up crypto_wait_req(); if the backend is a synchronous
	 * scomp, the callback is never invoked and crypto_wait_req() returns
	 * without blocking.
	 */
	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &acomp_ctx->wait);

	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);

	return 0;
}

static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);

	if (!IS_ERR_OR_NULL(acomp_ctx)) {
		if (!IS_ERR_OR_NULL(acomp_ctx->req))
			acomp_request_free(acomp_ctx->req);
		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
			crypto_free_acomp(acomp_ctx->acomp);
	}

	return 0;
}

/*********************************
* pool functions
**********************************/

static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ONCE(!pool && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);

	return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!zswap_pool_get(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}

static struct zswap_pool *zswap_pool_last_get(void)
{
	struct zswap_pool *pool, *last = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		last = pool;
	WARN_ONCE(!last && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);
	if (!zswap_pool_get(last))
		last = NULL;

	rcu_read_unlock();

	return last;
}

/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strcmp(pool->tfm_name, compressor))
			continue;
		/* all zpools share the same type */
		if (strcmp(zpool_get_type(pool->zpools[0]), type))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_get(pool))
			continue;
		return pool;
	}

	return NULL;
}

/*
 * If the entry is still valid in the tree, drop the initial ref and remove it
 * from the tree.  This function must be called with an additional ref held,
 * otherwise it may race with another invalidation freeing the entry.
 */
static void zswap_invalidate_entry(struct zswap_tree *tree,
				   struct zswap_entry *entry)
{
	if (zswap_rb_erase(&tree->rbroot, entry))
		zswap_entry_put(tree, entry);
}

static int zswap_reclaim_entry(struct zswap_pool *pool)
{
	struct zswap_entry *entry;
	struct zswap_tree *tree;
	pgoff_t swpoffset;
	int ret;

	/* Get an entry off the LRU */
	spin_lock(&pool->lru_lock);
	if (list_empty(&pool->lru)) {
		spin_unlock(&pool->lru_lock);
		return -EINVAL;
	}
	entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
	list_del_init(&entry->lru);
	/*
	 * Once the lru lock is dropped, the entry might get freed. The
	 * swpoffset is copied to the stack, and entry isn't deref'd again
	 * until the entry is verified to still be alive in the tree.
	 */
	swpoffset = swp_offset(entry->swpentry);
	tree = zswap_trees[swp_type(entry->swpentry)];
	spin_unlock(&pool->lru_lock);

	/* Check for invalidate() race */
	spin_lock(&tree->lock);
	if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
		ret = -EAGAIN;
		goto unlock;
	}
	/* Hold a reference to prevent a free during writeback */
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);

	ret = zswap_writeback_entry(entry, tree);

	spin_lock(&tree->lock);
	if (ret) {
		/* Writeback failed, put entry back on LRU */
		spin_lock(&pool->lru_lock);
		list_move(&entry->lru, &pool->lru);
		spin_unlock(&pool->lru_lock);
		goto put_unlock;
	}

	/*
	 * Writeback started successfully, the page now belongs to the
	 * swapcache. Drop the entry from zswap - unless invalidate already
	 * took it out while we had the tree->lock released for IO.
	 */
	zswap_invalidate_entry(tree, entry);

put_unlock:
	/* Drop local reference */
	zswap_entry_put(tree, entry);
unlock:
	spin_unlock(&tree->lock);
	return ret ? -EAGAIN : 0;
}

static void shrink_worker(struct work_struct *w)
{
	struct zswap_pool *pool = container_of(w, typeof(*pool),
						shrink_work);
	int ret, failures = 0;

	do {
		ret = zswap_reclaim_entry(pool);
		if (ret) {
			zswap_reject_reclaim_fail++;
			if (ret != -EAGAIN)
				break;
			if (++failures == MAX_RECLAIM_RETRIES)
				break;
		}
		cond_resched();
	} while (!zswap_can_accept());
	zswap_pool_put(pool);
}

static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	int i;
	struct zswap_pool *pool;
	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	int ret;

	if (!zswap_has_pool) {
		/* if either are unset, pool initialization failed, and we
		 * need both params to be set correctly before trying to
		 * create a pool.
		 */
		if (!strcmp(type, ZSWAP_PARAM_UNSET))
			return NULL;
		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
			return NULL;
	}

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
		/* unique name for each pool specifically required by zsmalloc */
		snprintf(name, 38, "zswap%x",
			 atomic_inc_return(&zswap_pools_count));

		pool->zpools[i] = zpool_create_pool(type, name, gfp);
		if (!pool->zpools[i]) {
			pr_err("%s zpool not available\n", type);
			goto error;
		}
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));

	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));

	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
	if (!pool->acomp_ctx) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
				       &pool->node);
	if (ret)
		goto error;
	pr_debug("using %s compressor\n", pool->tfm_name);

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	kref_init(&pool->kref);
	INIT_LIST_HEAD(&pool->list);
	INIT_LIST_HEAD(&pool->lru);
	spin_lock_init(&pool->lru_lock);
	INIT_WORK(&pool->shrink_work, shrink_worker);

	zswap_pool_debug("created", pool);

	return pool;

error:
	if (pool->acomp_ctx)
		free_percpu(pool->acomp_ctx);
	while (i--)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
	return NULL;
}

static struct zswap_pool *__zswap_pool_create_fallback(void)
{
	bool has_comp, has_zpool;

	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	if (!has_comp && strcmp(zswap_compressor,
				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
		param_free_charp(&zswap_compressor);
		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	}
	if (!has_comp) {
		pr_err("default compressor %s not available\n",
		       zswap_compressor);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_PARAM_UNSET;
	}

	has_zpool = zpool_has_pool(zswap_zpool_type);
	if (!has_zpool && strcmp(zswap_zpool_type,
				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
		has_zpool = zpool_has_pool(zswap_zpool_type);
	}
	if (!has_zpool) {
		pr_err("default zpool %s not available\n",
		       zswap_zpool_type);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_PARAM_UNSET;
	}

	if (!has_comp || !has_zpool)
		return NULL;

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

static void zswap_pool_destroy(struct zswap_pool *pool)
{
	int i;

	zswap_pool_debug("destroying", pool);

	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
	free_percpu(pool->acomp_ctx);
	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
}

static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
	if (!pool)
		return 0;

	return kref_get_unless_zero(&pool->kref);
}

static void __zswap_pool_release(struct work_struct *work)
{
	struct zswap_pool *pool = container_of(work, typeof(*pool),
						release_work);

	synchronize_rcu();

	/* nobody should have been able to get a kref... */
	WARN_ON(kref_get_unless_zero(&pool->kref));

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}

static void __zswap_pool_empty(struct kref *kref)
{
	struct zswap_pool *pool;

	pool = container_of(kref, typeof(*pool), kref);

	spin_lock(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->release_work, __zswap_pool_release);
	schedule_work(&pool->release_work);

	spin_unlock(&zswap_pools_lock);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}

/*********************************
* param callbacks
**********************************/

static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
{
	/* no change required */
	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
		return false;
	return true;
}

/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret = 0;
	bool new_pool = false;

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		/* if this is load-time (pre-init) param setting,
		 * don't create a pool; that's done during init.
		 */
		ret = param_set_charp(s, kp);
		break;
	case ZSWAP_INIT_SUCCEED:
		new_pool = zswap_pool_changed(s, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't set param, initialization failed\n");
		ret = -ENODEV;
	}
	mutex_unlock(&zswap_init_lock);

	/* no need to create a new pool, return directly */
	if (!new_pool)
		return ret;

	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_acomp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock(&zswap_pools_lock);

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
		 * checks above.
		 */
		ret = param_set_charp(s, kp);
	}

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}

static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}

static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	int ret = -ENODEV;

	/* if this is load-time (pre-init) param setting, only set param. */
	if (system_state != SYSTEM_RUNNING)
		return param_set_bool(val, kp);

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		if (zswap_setup())
			break;
		fallthrough;
	case ZSWAP_INIT_SUCCEED:
		if (!zswap_has_pool)
			pr_err("can't enable, no pool configured\n");
		else
			ret = param_set_bool(val, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't enable, initialization failed\n");
	}
	mutex_unlock(&zswap_init_lock);

	return ret;
}

/*********************************
* writeback code
**********************************/
/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device.  We are basically resuming the same swap
 * writeback path that was intercepted by zswap_store() in the
 * first place.  After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
static int zswap_writeback_entry(struct zswap_entry *entry,
				 struct zswap_tree *tree)
{
	swp_entry_t swpentry = entry->swpentry;
	struct page *page;
	struct mempolicy *mpol;
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	struct zpool *pool = zswap_find_zpool(entry);
	bool page_was_allocated;
	u8 *src, *tmp = NULL;
	unsigned int dlen;
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	if (!zpool_can_sleep_mapped(pool)) {
		tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!tmp)
			return -ENOMEM;
	}

	/* try to allocate swap cache page */
	mpol = get_task_policy(current);
	page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
				NO_INTERLEAVE_INDEX, &page_was_allocated);
	if (!page) {
		ret = -ENOMEM;
		goto fail;
	}

	/* Found an existing page, we raced with load/swapin */
	if (!page_was_allocated) {
		put_page(page);
		ret = -EEXIST;
		goto fail;
	}

	/*
	 * Page is locked, and the swapcache is now secured against
	 * concurrent swapping to and from the slot. Verify that the
	 * swap entry hasn't been invalidated and recycled behind our
	 * backs (our zswap_entry reference doesn't prevent that), to
	 * avoid overwriting a new swap page with old compressed data.
	 */
	spin_lock(&tree->lock);
	if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
		spin_unlock(&tree->lock);
		delete_from_swap_cache(page_folio(page));
		ret = -ENOMEM;
		goto fail;
	}
	spin_unlock(&tree->lock);

	/* decompress */
	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
	dlen = PAGE_SIZE;

	src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
	if (!zpool_can_sleep_mapped(pool)) {
		memcpy(tmp, src, entry->length);
		src = tmp;
		zpool_unmap_handle(pool, entry->handle);
	}

	mutex_lock(acomp_ctx->mutex);
	sg_init_one(&input, src, entry->length);
	sg_init_table(&output, 1);
	sg_set_page(&output, page, PAGE_SIZE, 0);
	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
	dlen = acomp_ctx->req->dlen;
	mutex_unlock(acomp_ctx->mutex);

	if (!zpool_can_sleep_mapped(pool))
		kfree(tmp);
	else
		zpool_unmap_handle(pool, entry->handle);

	BUG_ON(ret);
	BUG_ON(dlen != PAGE_SIZE);

	/* page is up to date */
	SetPageUptodate(page);

	/* move it to the tail of the inactive list after end_writeback */
	SetPageReclaim(page);

	/* start writeback */
	__swap_writepage(page, &wbc);
	put_page(page);
	zswap_written_back_pages++;

	return ret;

fail:
	if (!zpool_can_sleep_mapped(pool))
		kfree(tmp);

	/*
	 * If we get here because the page is already in swapcache, a
	 * load may be happening concurrently. It is safe and okay to
	 * not free the entry. It is also okay to return !0.
	 */
	return ret;
}

static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
	unsigned long *page;
	unsigned long val;
	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;

	page = (unsigned long *)ptr;
	val = page[0];

	if (val != page[last_pos])
		return 0;

	for (pos = 1; pos < last_pos; pos++) {
		if (val != page[pos])
			return 0;
	}

	*value = val;

	return 1;
}
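/*
 * Example: a page of all zeroes, or all 0xff bytes, passes the check
 * above (with *value = 0 or ~0UL respectively) and is stored as just
 * that one word rather than a compressed buffer.  The test works at
 * unsigned long granularity, so any byte pattern that repeats every
 * sizeof(unsigned long) bytes qualifies as well.
 */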

static void zswap_fill_page(void *ptr, unsigned long value)
{
	unsigned long *page;

	page = (unsigned long *)ptr;
	memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}

bool zswap_store(struct folio *folio)
{
	swp_entry_t swp = folio->swap;
	int type = swp_type(swp);
	pgoff_t offset = swp_offset(swp);
	struct page *page = &folio->page;
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	struct obj_cgroup *objcg = NULL;
	struct zswap_pool *pool;
	struct zpool *zpool;
	unsigned int dlen = PAGE_SIZE;
	unsigned long handle, value;
	char *buf;
	u8 *src, *dst;
	gfp_t gfp;
	int ret;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));
	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));

	/* Large folios aren't supported */
	if (folio_test_large(folio))
		return false;

	if (!zswap_enabled || !tree)
		return false;

	/*
	 * If this is a duplicate, it must be removed before attempting to store
	 * it, otherwise, if the store fails, the old page won't be removed from
	 * the tree and it might be written back, overwriting the new data.
	 */
	spin_lock(&tree->lock);
	dupentry = zswap_rb_search(&tree->rbroot, offset);
	if (dupentry) {
		zswap_duplicate_entry++;
		zswap_invalidate_entry(tree, dupentry);
	}
	spin_unlock(&tree->lock);

	/*
	 * XXX: zswap reclaim does not work with cgroups yet. Without a
	 * cgroup-aware entry LRU, we will push out entries system-wide based on
	 * local cgroup limits.
	 */
	objcg = get_obj_cgroup_from_folio(folio);
	if (objcg && !obj_cgroup_may_zswap(objcg))
		goto reject;

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		zswap_pool_reached_full = true;
		goto shrink;
	}

	if (zswap_pool_reached_full) {
		if (!zswap_can_accept())
			goto shrink;
		else
			zswap_pool_reached_full = false;
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		goto reject;
	}

	if (zswap_same_filled_pages_enabled) {
		src = kmap_atomic(page);
		if (zswap_is_page_same_filled(src, &value)) {
			kunmap_atomic(src);
			entry->swpentry = swp_entry(type, offset);
			entry->length = 0;
			entry->value = value;
			atomic_inc(&zswap_same_filled_pages);
			goto insert_entry;
		}
		kunmap_atomic(src);
	}

	if (!zswap_non_same_filled_pages_enabled)
		goto freepage;

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool)
		goto freepage;

	/* compress */
	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);

	mutex_lock(acomp_ctx->mutex);

	dst = acomp_ctx->dstmem;
	sg_init_table(&input, 1);
	sg_set_page(&input, page, PAGE_SIZE, 0);

	/* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
	sg_init_one(&output, dst, PAGE_SIZE * 2);
	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
	/*
	 * It may look a little odd that we send an asynchronous request and
	 * then wait for its completion synchronously, making the process
	 * effectively synchronous.
	 * In theory, acomp supports sending multiple requests to one acomp
	 * instance and having them complete concurrently.  But zswap stores
	 * and loads page by page; a single thread doing zswap has no way to
	 * submit a second page before the first one is done.  Different
	 * threads running on different CPUs use different acomp instances,
	 * however, so multiple threads can still (de)compress in parallel.
	 */
	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
	dlen = acomp_ctx->req->dlen;

	if (ret) {
		zswap_reject_compress_fail++;
		goto put_dstmem;
	}

	/* store */
	zpool = zswap_find_zpool(entry);
	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	if (zpool_malloc_support_movable(zpool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
	ret = zpool_malloc(zpool, dlen, gfp, &handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto put_dstmem;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto put_dstmem;
	}
	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(zpool, handle);
	mutex_unlock(acomp_ctx->mutex);

	/* populate entry */
	entry->swpentry = swp_entry(type, offset);
	entry->handle = handle;
	entry->length = dlen;

insert_entry:
	entry->objcg = objcg;
	if (objcg) {
		obj_cgroup_charge_zswap(objcg, entry->length);
		/* Account before objcg ref is moved to tree */
		count_objcg_event(objcg, ZSWPOUT);
	}

	/* map */
	spin_lock(&tree->lock);
	/*
	 * A duplicate entry should have been removed at the beginning of this
	 * function. Since the swap entry should be pinned, if a duplicate is
	 * found again here it means that something went wrong in the swap
	 * cache.
	 */
	while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
		WARN_ON(1);
		zswap_duplicate_entry++;
		zswap_invalidate_entry(tree, dupentry);
	}
	if (entry->length) {
		spin_lock(&entry->pool->lru_lock);
		list_add(&entry->lru, &entry->pool->lru);
		spin_unlock(&entry->pool->lru_lock);
	}
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_update_total_size();
	count_vm_event(ZSWPOUT);

	return true;

put_dstmem:
	mutex_unlock(acomp_ctx->mutex);
	zswap_pool_put(entry->pool);
freepage:
	zswap_entry_cache_free(entry);
reject:
	if (objcg)
		obj_cgroup_put(objcg);
	return false;

shrink:
	pool = zswap_pool_last_get();
	if (pool && !queue_work(shrink_wq, &pool->shrink_work))
		zswap_pool_put(pool);
	goto reject;
}

bool zswap_load(struct folio *folio)
{
	swp_entry_t swp = folio->swap;
	int type = swp_type(swp);
	pgoff_t offset = swp_offset(swp);
	struct page *page = &folio->page;
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	u8 *src, *dst, *tmp;
	struct zpool *zpool;
	unsigned int dlen;
	bool ret;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		spin_unlock(&tree->lock);
		return false;
	}
	spin_unlock(&tree->lock);

	if (!entry->length) {
		dst = kmap_atomic(page);
		zswap_fill_page(dst, entry->value);
		kunmap_atomic(dst);
		ret = true;
		goto stats;
	}

	zpool = zswap_find_zpool(entry);
	if (!zpool_can_sleep_mapped(zpool)) {
		tmp = kmalloc(entry->length, GFP_KERNEL);
		if (!tmp) {
			ret = false;
			goto freeentry;
		}
	}

	/* decompress */
	dlen = PAGE_SIZE;
	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);

	if (!zpool_can_sleep_mapped(zpool)) {
		memcpy(tmp, src, entry->length);
		src = tmp;
		zpool_unmap_handle(zpool, entry->handle);
	}

	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
	mutex_lock(acomp_ctx->mutex);
	sg_init_one(&input, src, entry->length);
	sg_init_table(&output, 1);
	sg_set_page(&output, page, PAGE_SIZE, 0);
	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
	if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait))
		WARN_ON(1);
	mutex_unlock(acomp_ctx->mutex);

	if (zpool_can_sleep_mapped(zpool))
		zpool_unmap_handle(zpool, entry->handle);
	else
		kfree(tmp);

	ret = true;
stats:
	count_vm_event(ZSWPIN);
	if (entry->objcg)
		count_objcg_event(entry->objcg, ZSWPIN);
freeentry:
	spin_lock(&tree->lock);
	if (ret && zswap_exclusive_loads_enabled) {
		zswap_invalidate_entry(tree, entry);
		folio_mark_dirty(folio);
	} else if (entry->length) {
		spin_lock(&entry->pool->lru_lock);
		list_move(&entry->lru, &entry->pool->lru);
		spin_unlock(&entry->pool->lru_lock);
	}
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return ret;
}

void zswap_invalidate(int type, pgoff_t offset)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return;
	}
	zswap_invalidate_entry(tree, entry);
	spin_unlock(&tree->lock);
}

void zswap_swapon(int type)
{
	struct zswap_tree *tree;

	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
	if (!tree) {
		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
		return;
	}

	tree->rbroot = RB_ROOT;
	spin_lock_init(&tree->lock);
	zswap_trees[type] = tree;
}

void zswap_swapoff(int type)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *n;

	if (!tree)
		return;

	/* walk the tree and free everything */
	spin_lock(&tree->lock);
	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
		zswap_free_entry(entry);
	tree->rbroot = RB_ROOT;
	spin_unlock(&tree->lock);
	kfree(tree);
	zswap_trees[type] = NULL;
}

/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);

	debugfs_create_u64("pool_limit_hit", 0444,
			   zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_compress_fail);
	debugfs_create_u64("reject_compress_poor", 0444,
			   zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", 0444,
			   zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", 0444,
			   zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", 0444,
			   zswap_debugfs_root, &zswap_pool_total_size);
	debugfs_create_atomic_t("stored_pages", 0444,
				zswap_debugfs_root, &zswap_stored_pages);
	debugfs_create_atomic_t("same_filled_pages", 0444,
				zswap_debugfs_root, &zswap_same_filled_pages);

	return 0;
}
#else
static int zswap_debugfs_init(void)
{
	return 0;
}
#endif

/*********************************
* module init and exit
**********************************/
static int zswap_setup(void)
{
	struct zswap_pool *pool;
	int ret;

	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	if (!zswap_entry_cache) {
		pr_err("entry cache creation failed\n");
		goto cache_fail;
	}

	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
				zswap_dstmem_prepare, zswap_dstmem_dead);
	if (ret) {
		pr_err("dstmem alloc failed\n");
		goto dstmem_fail;
	}

	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
				      "mm/zswap_pool:prepare",
				      zswap_cpu_comp_prepare,
				      zswap_cpu_comp_dead);
	if (ret)
		goto hp_fail;

	pool = __zswap_pool_create_fallback();
	if (pool) {
		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
			zpool_get_type(pool->zpools[0]));
		list_add(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else {
		pr_err("pool creation failed\n");
		zswap_enabled = false;
	}

	shrink_wq = create_workqueue("zswap-shrink");
	if (!shrink_wq)
		goto fallback_fail;

	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	zswap_init_state = ZSWAP_INIT_SUCCEED;
	return 0;

fallback_fail:
	if (pool)
		zswap_pool_destroy(pool);
hp_fail:
	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:
	kmem_cache_destroy(zswap_entry_cache);
cache_fail:
	/* if built-in, we aren't unloaded on failure; don't allow use */
	zswap_init_state = ZSWAP_INIT_FAILED;
	zswap_enabled = false;
	return -ENOMEM;
}

static int __init zswap_init(void)
{
	if (!zswap_enabled)
		return 0;
	return zswap_setup();
}
/* must be late so crypto has time to come up */
late_initcall(zswap_init);

MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");