1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Copyright (C) 2018 HUAWEI, Inc. |
4 | * https://www.huawei.com/ |
5 | * Copyright (C) 2022 Alibaba Cloud |
6 | */ |
7 | #include "compress.h" |
8 | #include <linux/psi.h> |
9 | #include <linux/cpuhotplug.h> |
10 | #include <trace/events/erofs.h> |
11 | |
12 | #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) |
13 | #define Z_EROFS_INLINE_BVECS 2 |
14 | |
15 | /* |
16 | * let's leave a type here in case another tagged pointer is |
17 | * introduced later. |
18 | */ |
19 | typedef void *z_erofs_next_pcluster_t; |
20 | |
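/*
 * One bvec records a page (or folio) attached to a pcluster: @offset is
 * roughly the byte offset of this part relative to the start of the
 * pcluster's decompressed data (it can go negative for in-place pages),
 * and @end marks the end of the valid data within the page.
 */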
21 | struct z_erofs_bvec { |
22 | union { |
23 | struct page *page; |
24 | struct folio *folio; |
25 | }; |
26 | int offset; |
27 | unsigned int end; |
28 | }; |
29 | |
30 | #define __Z_EROFS_BVSET(name, total) \ |
31 | struct name { \ |
32 | /* point to the next page which contains the following bvecs */ \ |
33 | struct page *nextpage; \ |
34 | struct z_erofs_bvec bvec[total]; \ |
35 | } |
36 | __Z_EROFS_BVSET(z_erofs_bvset,); |
37 | __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); |
38 | |
39 | /* |
40 | * Structure fields follow one of the following exclusion rules. |
41 | * |
42 | * I: Modifiable by initialization/destruction paths and read-only |
43 | * for everyone else; |
44 | * |
45 | * L: Field should be protected by the pcluster lock; |
46 | * |
47 | * A: Field should be accessed / updated in atomic for parallelized code. |
48 | */ |
49 | struct z_erofs_pcluster { |
50 | struct erofs_workgroup obj; |
51 | struct mutex lock; |
52 | |
53 | /* A: point to next chained pcluster or TAILs */ |
54 | z_erofs_next_pcluster_t next; |
55 | |
56 | /* L: the maximum decompression size of this round */ |
57 | unsigned int length; |
58 | |
59 | /* L: total number of bvecs */ |
60 | unsigned int vcnt; |
61 | |
62 | /* I: pcluster size (compressed size) in bytes */ |
63 | unsigned int pclustersize; |
64 | |
65 | /* I: page offset of start position of decompression */ |
66 | unsigned short pageofs_out; |
67 | |
68 | /* I: page offset of inline compressed data */ |
69 | unsigned short pageofs_in; |
70 | |
71 | union { |
72 | /* L: inline a certain number of bvec for bootstrap */ |
73 | struct z_erofs_bvset_inline bvset; |
74 | |
75 | /* I: can be used to free the pcluster by RCU. */ |
76 | struct rcu_head rcu; |
77 | }; |
78 | |
79 | /* I: compression algorithm format */ |
80 | unsigned char algorithmformat; |
81 | |
82 | /* L: whether partial decompression or not */ |
83 | bool partial; |
84 | |
85 | /* L: indicate several pageofs_outs or not */ |
86 | bool multibases; |
87 | |
88 | /* L: whether extra buffer allocations are best-effort */ |
89 | bool besteffort; |
90 | |
91 | /* A: compressed bvecs (can be cached or inplaced pages) */ |
92 | struct z_erofs_bvec compressed_bvecs[]; |
93 | }; |
94 | |
95 | /* the end of a chain of pclusters */ |
96 | #define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) |
97 | #define Z_EROFS_PCLUSTER_NIL (NULL) |
98 | |
99 | struct z_erofs_decompressqueue { |
100 | struct super_block *sb; |
101 | atomic_t pending_bios; |
102 | z_erofs_next_pcluster_t head; |
103 | |
104 | union { |
105 | struct completion done; |
106 | struct work_struct work; |
107 | struct kthread_work kthread_work; |
108 | } u; |
109 | bool eio, sync; |
110 | }; |
111 | |
112 | static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) |
113 | { |
114 | return !pcl->obj.index; |
115 | } |
116 | |
117 | static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) |
118 | { |
119 | return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; |
120 | } |
121 | |
122 | #define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) |
123 | static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) |
124 | { |
125 | return fo->mapping == MNGD_MAPPING(sbi); |
126 | } |
127 | |
128 | /* |
129 | * bit 30: I/O error occurred on this folio |
130 | * bit 0 - 29: remaining parts to complete this folio |
131 | */ |
132 | #define Z_EROFS_FOLIO_EIO (1 << 30) |
133 | |
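/*
 * "Online" folios are file-backed folios currently under decompression:
 * ->private is used as an atomic counter of outstanding parts plus an
 * error flag (bit 30).  _init() sets it to 1, _split() adds one part, and
 * _end() drops one part; once all parts complete, the folio read is ended.
 */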
134 | static void z_erofs_onlinefolio_init(struct folio *folio) |
135 | { |
136 | union { |
137 | atomic_t o; |
138 | void *v; |
139 | } u = { .o = ATOMIC_INIT(1) }; |
140 | |
141 | folio->private = u.v; /* valid only if file-backed folio is locked */ |
142 | } |
143 | |
144 | static void z_erofs_onlinefolio_split(struct folio *folio) |
145 | { |
146 | atomic_inc((atomic_t *)&folio->private); |
147 | } |
148 | |
149 | static void z_erofs_onlinefolio_end(struct folio *folio, int err) |
150 | { |
151 | int orig, v; |
152 | |
153 | do { |
154 | orig = atomic_read((atomic_t *)&folio->private); |
155 | v = (orig - 1) | (err ? Z_EROFS_FOLIO_EIO : 0); |
156 | } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); |
157 | |
158 | if (v & ~Z_EROFS_FOLIO_EIO) |
159 | return; |
160 | folio->private = 0; |
161 | folio_end_read(folio, !(v & Z_EROFS_FOLIO_EIO)); |
162 | } |
163 | |
164 | #define Z_EROFS_ONSTACK_PAGES 32 |
165 | |
166 | /* |
167 | * since pclustersize is variable for the big pcluster feature, introduce |
168 | * slab pools for different pcluster sizes. |
169 | */ |
170 | struct z_erofs_pcluster_slab { |
171 | struct kmem_cache *slab; |
172 | unsigned int maxpages; |
173 | char name[48]; |
174 | }; |
175 | |
176 | #define _PCLP(n) { .maxpages = n } |
177 | |
178 | static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { |
179 | _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), |
180 | _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) |
181 | }; |
182 | |
183 | struct z_erofs_bvec_iter { |
184 | struct page *bvpage; |
185 | struct z_erofs_bvset *bvset; |
186 | unsigned int nr, cur; |
187 | }; |
188 | |
189 | static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) |
190 | { |
191 | if (iter->bvpage) |
192 | kunmap_local(iter->bvset); |
193 | return iter->bvpage; |
194 | } |
195 | |
196 | static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) |
197 | { |
198 | unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; |
199 | /* have to access nextpage in advance, otherwise it will be unmapped */ |
200 | struct page *nextpage = iter->bvset->nextpage; |
201 | struct page *oldpage; |
202 | |
203 | DBG_BUGON(!nextpage); |
204 | oldpage = z_erofs_bvec_iter_end(iter); |
205 | iter->bvpage = nextpage; |
206 | iter->bvset = kmap_local_page(nextpage); |
207 | iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); |
208 | iter->cur = 0; |
209 | return oldpage; |
210 | } |
211 | |
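/*
 * Set up an iterator over the bvec list of a pcluster: start from the
 * inline (bootstrap) bvecs and follow ->nextpage links until the @cur-th
 * recorded bvec is reached.
 */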
212 | static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, |
213 | struct z_erofs_bvset_inline *bvset, |
214 | unsigned int bootstrap_nr, |
215 | unsigned int cur) |
216 | { |
217 | *iter = (struct z_erofs_bvec_iter) { |
218 | .nr = bootstrap_nr, |
219 | .bvset = (struct z_erofs_bvset *)bvset, |
220 | }; |
221 | |
222 | while (cur > iter->nr) { |
223 | cur -= iter->nr; |
224 | z_erofs_bvset_flip(iter); |
225 | } |
226 | iter->cur = cur; |
227 | } |
228 | |
229 | static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, |
230 | struct z_erofs_bvec *bvec, |
231 | struct page **candidate_bvpage, |
232 | struct page **pagepool) |
233 | { |
234 | if (iter->cur >= iter->nr) { |
235 | struct page *nextpage = *candidate_bvpage; |
236 | |
237 | if (!nextpage) { |
238 | nextpage = erofs_allocpage(pagepool, GFP_KERNEL); |
239 | if (!nextpage) |
240 | return -ENOMEM; |
241 | set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); |
242 | } |
243 | DBG_BUGON(iter->bvset->nextpage); |
244 | iter->bvset->nextpage = nextpage; |
245 | z_erofs_bvset_flip(iter); |
246 | |
247 | iter->bvset->nextpage = NULL; |
248 | *candidate_bvpage = NULL; |
249 | } |
250 | iter->bvset->bvec[iter->cur++] = *bvec; |
251 | return 0; |
252 | } |
253 | |
254 | static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, |
255 | struct z_erofs_bvec *bvec, |
256 | struct page **old_bvpage) |
257 | { |
258 | if (iter->cur == iter->nr) |
259 | *old_bvpage = z_erofs_bvset_flip(iter); |
260 | else |
261 | *old_bvpage = NULL; |
262 | *bvec = iter->bvset->bvec[iter->cur++]; |
263 | } |
264 | |
265 | static void z_erofs_destroy_pcluster_pool(void) |
266 | { |
267 | int i; |
268 | |
269 | for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { |
270 | if (!pcluster_pool[i].slab) |
271 | continue; |
272 | kmem_cache_destroy(pcluster_pool[i].slab); |
273 | pcluster_pool[i].slab = NULL; |
274 | } |
275 | } |
276 | |
277 | static int z_erofs_create_pcluster_pool(void) |
278 | { |
279 | struct z_erofs_pcluster_slab *pcs; |
280 | struct z_erofs_pcluster *a; |
281 | unsigned int size; |
282 | |
283 | for (pcs = pcluster_pool; |
284 | pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { |
285 | size = struct_size(a, compressed_bvecs, pcs->maxpages); |
286 | |
287 | sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); |
288 | pcs->slab = kmem_cache_create(pcs->name, size, 0, |
289 | SLAB_RECLAIM_ACCOUNT, NULL); |
290 | if (pcs->slab) |
291 | continue; |
292 | |
293 | z_erofs_destroy_pcluster_pool(); |
294 | return -ENOMEM; |
295 | } |
296 | return 0; |
297 | } |
298 | |
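/*
 * Allocate a pcluster from the smallest slab pool that can hold @size
 * bytes of compressed data, e.g. a 3-page pcluster comes from the
 * "erofs_pcluster-4" cache.
 */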
299 | static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) |
300 | { |
301 | unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT; |
302 | struct z_erofs_pcluster_slab *pcs = pcluster_pool; |
303 | |
304 | for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { |
305 | struct z_erofs_pcluster *pcl; |
306 | |
307 | if (nrpages > pcs->maxpages) |
308 | continue; |
309 | |
310 | pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); |
311 | if (!pcl) |
312 | return ERR_PTR(-ENOMEM); |
313 | pcl->pclustersize = size; |
314 | return pcl; |
315 | } |
316 | return ERR_PTR(-EINVAL); |
317 | } |
318 | |
319 | static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) |
320 | { |
321 | unsigned int pclusterpages = z_erofs_pclusterpages(pcl); |
322 | int i; |
323 | |
324 | for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { |
325 | struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; |
326 | |
327 | if (pclusterpages > pcs->maxpages) |
328 | continue; |
329 | |
330 | kmem_cache_free(pcs->slab, pcl); |
331 | return; |
332 | } |
333 | DBG_BUGON(1); |
334 | } |
335 | |
336 | static struct workqueue_struct *z_erofs_workqueue __read_mostly; |
337 | |
338 | #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD |
339 | static struct kthread_worker __rcu **z_erofs_pcpu_workers; |
340 | |
341 | static void erofs_destroy_percpu_workers(void) |
342 | { |
343 | struct kthread_worker *worker; |
344 | unsigned int cpu; |
345 | |
346 | for_each_possible_cpu(cpu) { |
347 | worker = rcu_dereference_protected( |
348 | z_erofs_pcpu_workers[cpu], 1); |
349 | rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); |
350 | if (worker) |
351 | kthread_destroy_worker(worker); |
352 | } |
353 | kfree(z_erofs_pcpu_workers); |
354 | } |
355 | |
356 | static struct kthread_worker *erofs_init_percpu_worker(int cpu) |
357 | { |
358 | struct kthread_worker *worker = |
359 | kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); |
360 | |
361 | if (IS_ERR(worker)) |
362 | return worker; |
363 | if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) |
364 | sched_set_fifo_low(worker->task); |
365 | return worker; |
366 | } |
367 | |
368 | static int erofs_init_percpu_workers(void) |
369 | { |
370 | struct kthread_worker *worker; |
371 | unsigned int cpu; |
372 | |
373 | z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), |
374 | sizeof(struct kthread_worker *), GFP_ATOMIC); |
375 | if (!z_erofs_pcpu_workers) |
376 | return -ENOMEM; |
377 | |
378 | for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */ |
379 | worker = erofs_init_percpu_worker(cpu); |
380 | if (!IS_ERR(worker)) |
381 | rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); |
382 | } |
383 | return 0; |
384 | } |
385 | #else |
386 | static inline void erofs_destroy_percpu_workers(void) {} |
387 | static inline int erofs_init_percpu_workers(void) { return 0; } |
388 | #endif |
389 | |
390 | #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) |
391 | static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); |
392 | static enum cpuhp_state erofs_cpuhp_state; |
393 | |
394 | static int erofs_cpu_online(unsigned int cpu) |
395 | { |
396 | struct kthread_worker *worker, *old; |
397 | |
398 | worker = erofs_init_percpu_worker(cpu); |
399 | if (IS_ERR(worker)) |
400 | return PTR_ERR(worker); |
401 | |
402 | spin_lock(&z_erofs_pcpu_worker_lock); |
403 | old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], |
404 | lockdep_is_held(&z_erofs_pcpu_worker_lock)); |
405 | if (!old) |
406 | rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); |
407 | spin_unlock(&z_erofs_pcpu_worker_lock); |
408 | if (old) |
409 | kthread_destroy_worker(worker); |
410 | return 0; |
411 | } |
412 | |
413 | static int erofs_cpu_offline(unsigned int cpu) |
414 | { |
415 | struct kthread_worker *worker; |
416 | |
417 | spin_lock(&z_erofs_pcpu_worker_lock); |
418 | worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], |
419 | lockdep_is_held(&z_erofs_pcpu_worker_lock)); |
420 | rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); |
421 | spin_unlock(&z_erofs_pcpu_worker_lock); |
422 | |
423 | synchronize_rcu(); |
424 | if (worker) |
425 | kthread_destroy_worker(worker); |
426 | return 0; |
427 | } |
428 | |
429 | static int erofs_cpu_hotplug_init(void) |
430 | { |
431 | int state; |
432 | |
433 | state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, |
434 | "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); |
435 | if (state < 0) |
436 | return state; |
437 | |
438 | erofs_cpuhp_state = state; |
439 | return 0; |
440 | } |
441 | |
442 | static void erofs_cpu_hotplug_destroy(void) |
443 | { |
444 | if (erofs_cpuhp_state) |
445 | cpuhp_remove_state_nocalls(erofs_cpuhp_state); |
446 | } |
447 | #else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ |
448 | static inline int erofs_cpu_hotplug_init(void) { return 0; } |
449 | static inline void erofs_cpu_hotplug_destroy(void) {} |
450 | #endif |
451 | |
452 | void z_erofs_exit_zip_subsystem(void) |
453 | { |
454 | erofs_cpu_hotplug_destroy(); |
455 | erofs_destroy_percpu_workers(); |
456 | destroy_workqueue(z_erofs_workqueue); |
457 | z_erofs_destroy_pcluster_pool(); |
458 | } |
459 | |
460 | int __init z_erofs_init_zip_subsystem(void) |
461 | { |
462 | int err = z_erofs_create_pcluster_pool(); |
463 | |
464 | if (err) |
465 | goto out_error_pcluster_pool; |
466 | |
467 | z_erofs_workqueue = alloc_workqueue("erofs_worker", |
468 | WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); |
469 | if (!z_erofs_workqueue) { |
470 | err = -ENOMEM; |
471 | goto out_error_workqueue_init; |
472 | } |
473 | |
474 | err = erofs_init_percpu_workers(); |
475 | if (err) |
476 | goto out_error_pcpu_worker; |
477 | |
478 | err = erofs_cpu_hotplug_init(); |
479 | if (err < 0) |
480 | goto out_error_cpuhp_init; |
481 | return err; |
482 | |
483 | out_error_cpuhp_init: |
484 | erofs_destroy_percpu_workers(); |
485 | out_error_pcpu_worker: |
486 | destroy_workqueue(z_erofs_workqueue); |
487 | out_error_workqueue_init: |
488 | z_erofs_destroy_pcluster_pool(); |
489 | out_error_pcluster_pool: |
490 | return err; |
491 | } |
492 | |
493 | enum z_erofs_pclustermode { |
494 | Z_EROFS_PCLUSTER_INFLIGHT, |
495 | /* |
496 | * a weak form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it |
497 | * could be dispatched into the bypass queue later due to up-to-date |
498 | * managed pages. None of the related online pages can be reused for |
499 | * inplace I/O (or bvpage) since it can be decoded without I/O submission. |
500 | */ |
501 | Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, |
502 | /* |
503 | * The pcluster was just linked to a decompression chain by us. It can |
504 | * also be linked with the remaining pclusters, which means if the |
505 | * processing page is the tail page of a pcluster, this pcluster can |
506 | * safely use the whole page (since the previous pcluster is within the |
507 | * same chain) for in-place I/O, as illustrated below: |
508 | * ___________________________________________________ |
509 | * | tail (partial) page | head (partial) page | |
510 | * | (of the current pcl) | (of the previous pcl) | |
511 | * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| |
512 | * |
513 | * [ (*) the page above can be used as inplace I/O. ] |
514 | */ |
515 | Z_EROFS_PCLUSTER_FOLLOWED, |
516 | }; |
517 | |
518 | struct z_erofs_decompress_frontend { |
519 | struct inode *const inode; |
520 | struct erofs_map_blocks map; |
521 | struct z_erofs_bvec_iter biter; |
522 | |
523 | struct page *pagepool; |
524 | struct page *candidate_bvpage; |
525 | struct z_erofs_pcluster *pcl; |
526 | z_erofs_next_pcluster_t owned_head; |
527 | enum z_erofs_pclustermode mode; |
528 | |
529 | erofs_off_t headoffset; |
530 | |
531 | /* a pointer used to pick up inplace I/O pages */ |
532 | unsigned int icur; |
533 | }; |
534 | |
535 | #define DECOMPRESS_FRONTEND_INIT(__i) { \ |
536 | .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ |
537 | .mode = Z_EROFS_PCLUSTER_FOLLOWED } |
538 | |
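/*
 * Decide whether the compressed data of the current extent should be
 * added to the managed cache: cache it if caching is enabled and either
 * the extent isn't fully mapped or (with the readaround strategy) it
 * starts in front of the requested read position.
 */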
539 | static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) |
540 | { |
541 | unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; |
542 | |
543 | if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) |
544 | return false; |
545 | |
546 | if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) |
547 | return true; |
548 | |
549 | if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && |
550 | fe->map.m_la < fe->headoffset) |
551 | return true; |
552 | |
553 | return false; |
554 | } |
555 | |
556 | static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) |
557 | { |
558 | struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); |
559 | struct z_erofs_pcluster *pcl = fe->pcl; |
560 | unsigned int pclusterpages = z_erofs_pclusterpages(pcl); |
561 | bool shouldalloc = z_erofs_should_alloc_cache(fe); |
562 | bool standalone = true; |
563 | /* |
564 | * optimistic allocation without direct reclaim since inplace I/O |
565 | * can be used as a fallback under memory pressure instead. |
566 | */ |
567 | gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | |
568 | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; |
569 | unsigned int i; |
570 | |
571 | if (i_blocksize(fe->inode) != PAGE_SIZE || |
572 | fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) |
573 | return; |
574 | |
575 | for (i = 0; i < pclusterpages; ++i) { |
576 | struct page *page, *newpage; |
577 | |
578 | /* Inaccurate check w/o locking to avoid unneeded lookups */ |
579 | if (READ_ONCE(pcl->compressed_bvecs[i].page)) |
580 | continue; |
581 | |
582 | page = find_get_page(mc, pcl->obj.index + i); |
583 | if (!page) { |
584 | /* I/O is needed, not possible to decompress directly */ |
585 | standalone = false; |
586 | if (!shouldalloc) |
587 | continue; |
588 | |
589 | /* |
590 | * Try cached I/O if allocation succeeds or fallback to |
591 | * in-place I/O instead to avoid any direct reclaim. |
592 | */ |
593 | newpage = erofs_allocpage(&fe->pagepool, gfp); |
594 | if (!newpage) |
595 | continue; |
596 | set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); |
597 | } |
598 | spin_lock(&pcl->obj.lockref.lock); |
599 | if (!pcl->compressed_bvecs[i].page) { |
600 | pcl->compressed_bvecs[i].page = page ? page : newpage; |
601 | spin_unlock(&pcl->obj.lockref.lock); |
602 | continue; |
603 | } |
604 | spin_unlock(&pcl->obj.lockref.lock); |
605 | |
606 | if (page) |
607 | put_page(page); |
608 | else if (newpage) |
609 | erofs_pagepool_add(&fe->pagepool, newpage); |
610 | } |
611 | |
612 | /* |
613 | * don't do inplace I/O if all compressed pages are available in |
614 | * managed cache since it can be moved to the bypass queue instead. |
615 | */ |
616 | if (standalone) |
617 | fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; |
618 | } |
619 | |
620 | /* called by erofs_shrinker to get rid of all cached compressed bvecs */ |
621 | int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, |
622 | struct erofs_workgroup *grp) |
623 | { |
624 | struct z_erofs_pcluster *const pcl = |
625 | container_of(grp, struct z_erofs_pcluster, obj); |
626 | unsigned int pclusterpages = z_erofs_pclusterpages(pcl); |
627 | int i; |
628 | |
629 | DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); |
630 | /* There is no active user since the pcluster is now frozen */ |
631 | for (i = 0; i < pclusterpages; ++i) { |
632 | struct folio *folio = pcl->compressed_bvecs[i].folio; |
633 | |
634 | if (!folio) |
635 | continue; |
636 | |
637 | /* Avoid reclaiming or migrating this folio */ |
638 | if (!folio_trylock(folio)) |
639 | return -EBUSY; |
640 | |
641 | if (!erofs_folio_is_managed(sbi, folio)) |
642 | continue; |
643 | pcl->compressed_bvecs[i].folio = NULL; |
644 | folio_detach_private(folio); |
645 | folio_unlock(folio); |
646 | } |
647 | return 0; |
648 | } |
649 | |
650 | static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) |
651 | { |
652 | struct z_erofs_pcluster *pcl = folio_get_private(folio); |
653 | unsigned int pclusterpages = z_erofs_pclusterpages(pcl); |
654 | bool ret; |
655 | int i; |
656 | |
657 | if (!folio_test_private(folio)) |
658 | return true; |
659 | |
660 | ret = false; |
661 | spin_lock(&pcl->obj.lockref.lock); |
662 | if (pcl->obj.lockref.count <= 0) { |
663 | DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); |
664 | for (i = 0; i < pclusterpages; ++i) { |
665 | if (pcl->compressed_bvecs[i].folio == folio) { |
666 | pcl->compressed_bvecs[i].folio = NULL; |
667 | folio_detach_private(folio); |
668 | ret = true; |
669 | break; |
670 | } |
671 | } |
672 | } |
673 | spin_unlock(&pcl->obj.lockref.lock); |
674 | return ret; |
675 | } |
676 | |
677 | /* |
678 | * It will be called only on inode eviction. In case that there are still some |
679 | * decompression requests in progress, wait with rescheduling for a bit here. |
680 | * An extra lock could be introduced instead but it seems unnecessary. |
681 | */ |
682 | static void z_erofs_cache_invalidate_folio(struct folio *folio, |
683 | size_t offset, size_t length) |
684 | { |
685 | const size_t stop = length + offset; |
686 | |
687 | /* Check for potential overflow in debug mode */ |
688 | DBG_BUGON(stop > folio_size(folio) || stop < length); |
689 | |
690 | if (offset == 0 && stop == folio_size(folio)) |
691 | while (!z_erofs_cache_release_folio(folio, 0)) |
692 | cond_resched(); |
693 | } |
694 | |
695 | static const struct address_space_operations z_erofs_cache_aops = { |
696 | .release_folio = z_erofs_cache_release_folio, |
697 | .invalidate_folio = z_erofs_cache_invalidate_folio, |
698 | }; |
699 | |
700 | int erofs_init_managed_cache(struct super_block *sb) |
701 | { |
702 | struct inode *const inode = new_inode(sb); |
703 | |
704 | if (!inode) |
705 | return -ENOMEM; |
706 | |
707 | set_nlink(inode, 1); |
708 | inode->i_size = OFFSET_MAX; |
709 | inode->i_mapping->a_ops = &z_erofs_cache_aops; |
710 | mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); |
711 | EROFS_SB(sb)->managed_cache = inode; |
712 | return 0; |
713 | } |
714 | |
715 | /* callers must be with pcluster lock held */ |
716 | static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, |
717 | struct z_erofs_bvec *bvec, bool exclusive) |
718 | { |
719 | struct z_erofs_pcluster *pcl = fe->pcl; |
720 | int ret; |
721 | |
722 | if (exclusive) { |
723 | /* give priority for inplaceio to use file pages first */ |
724 | spin_lock(&pcl->obj.lockref.lock); |
725 | while (fe->icur > 0) { |
726 | if (pcl->compressed_bvecs[--fe->icur].page) |
727 | continue; |
728 | pcl->compressed_bvecs[fe->icur] = *bvec; |
729 | spin_unlock(&pcl->obj.lockref.lock); |
730 | return 0; |
731 | } |
732 | spin_unlock(&pcl->obj.lockref.lock); |
733 | |
734 | /* otherwise, check if it can be used as a bvpage */ |
735 | if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && |
736 | !fe->candidate_bvpage) |
737 | fe->candidate_bvpage = bvec->page; |
738 | } |
739 | ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, |
740 | &fe->pagepool); |
741 | fe->pcl->vcnt += (ret >= 0); |
742 | return ret; |
743 | } |
744 | |
745 | static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) |
746 | { |
747 | struct z_erofs_pcluster *pcl = f->pcl; |
748 | z_erofs_next_pcluster_t *owned_head = &f->owned_head; |
749 | |
750 | /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ |
751 | if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, |
752 | *owned_head) == Z_EROFS_PCLUSTER_NIL) { |
753 | *owned_head = &pcl->next; |
754 | /* so we can attach this pcluster to our submission chain. */ |
755 | f->mode = Z_EROFS_PCLUSTER_FOLLOWED; |
756 | return; |
757 | } |
758 | |
759 | /* type 2, it belongs to an ongoing chain */ |
760 | f->mode = Z_EROFS_PCLUSTER_INFLIGHT; |
761 | } |
762 | |
763 | static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) |
764 | { |
765 | struct erofs_map_blocks *map = &fe->map; |
766 | struct super_block *sb = fe->inode->i_sb; |
767 | bool ztailpacking = map->m_flags & EROFS_MAP_META; |
768 | struct z_erofs_pcluster *pcl; |
769 | struct erofs_workgroup *grp; |
770 | int err; |
771 | |
772 | if (!(map->m_flags & EROFS_MAP_ENCODED) || |
773 | (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { |
774 | DBG_BUGON(1); |
775 | return -EFSCORRUPTED; |
776 | } |
777 | |
778 | /* no available pcluster, let's allocate one */ |
779 | pcl = z_erofs_alloc_pcluster(map->m_plen); |
780 | if (IS_ERR(pcl)) |
781 | return PTR_ERR(pcl); |
782 | |
783 | spin_lock_init(&pcl->obj.lockref.lock); |
784 | pcl->obj.lockref.count = 1; /* one ref for this request */ |
785 | pcl->algorithmformat = map->m_algorithmformat; |
786 | pcl->length = 0; |
787 | pcl->partial = true; |
788 | |
789 | /* new pclusters should be claimed as type 1, primary and followed */ |
790 | pcl->next = fe->owned_head; |
791 | pcl->pageofs_out = map->m_la & ~PAGE_MASK; |
792 | fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; |
793 | |
794 | /* |
795 | * lock all primary followed works before they become visible to |
796 | * others; mutex_trylock *never* fails for a new pcluster. |
797 | */ |
798 | mutex_init(&pcl->lock); |
799 | DBG_BUGON(!mutex_trylock(&pcl->lock)); |
800 | |
801 | if (ztailpacking) { |
802 | pcl->obj.index = 0; /* which indicates ztailpacking */ |
803 | } else { |
804 | pcl->obj.index = erofs_blknr(sb, map->m_pa); |
805 | |
806 | grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); |
807 | if (IS_ERR(grp)) { |
808 | err = PTR_ERR(grp); |
809 | goto err_out; |
810 | } |
811 | |
812 | if (grp != &pcl->obj) { |
813 | fe->pcl = container_of(grp, |
814 | struct z_erofs_pcluster, obj); |
815 | err = -EEXIST; |
816 | goto err_out; |
817 | } |
818 | } |
819 | fe->owned_head = &pcl->next; |
820 | fe->pcl = pcl; |
821 | return 0; |
822 | |
823 | err_out: |
824 | mutex_unlock(&pcl->lock); |
825 | z_erofs_free_pcluster(pcl); |
826 | return err; |
827 | } |
828 | |
829 | static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) |
830 | { |
831 | struct erofs_map_blocks *map = &fe->map; |
832 | struct super_block *sb = fe->inode->i_sb; |
833 | erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); |
834 | struct erofs_workgroup *grp = NULL; |
835 | int ret; |
836 | |
837 | DBG_BUGON(fe->pcl); |
838 | |
839 | /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ |
840 | DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); |
841 | |
842 | if (!(map->m_flags & EROFS_MAP_META)) { |
843 | grp = erofs_find_workgroup(sb, blknr); |
844 | } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { |
845 | DBG_BUGON(1); |
846 | return -EFSCORRUPTED; |
847 | } |
848 | |
849 | if (grp) { |
850 | fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); |
851 | ret = -EEXIST; |
852 | } else { |
853 | ret = z_erofs_register_pcluster(fe); |
854 | } |
855 | |
856 | if (ret == -EEXIST) { |
857 | mutex_lock(&fe->pcl->lock); |
858 | z_erofs_try_to_claim_pcluster(fe); |
859 | } else if (ret) { |
860 | return ret; |
861 | } |
862 | |
863 | z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, |
864 | Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); |
865 | if (!z_erofs_is_inline_pcluster(fe->pcl)) { |
866 | /* bind cache first when cached decompression is preferred */ |
867 | z_erofs_bind_cache(fe); |
868 | } else { |
869 | void *mptr; |
870 | |
871 | mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); |
872 | if (IS_ERR(mptr)) { |
873 | ret = PTR_ERR(mptr); |
874 | erofs_err(sb, "failed to get inline data %d", ret); |
875 | return ret; |
876 | } |
877 | get_page(map->buf.page); |
878 | WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); |
879 | fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; |
880 | fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; |
881 | } |
882 | /* file-backed inplace I/O pages are traversed in reverse order */ |
883 | fe->icur = z_erofs_pclusterpages(fe->pcl); |
884 | return 0; |
885 | } |
886 | |
887 | /* |
888 | * keep in mind that referenced pclusters are freed only after |
889 | * an RCU grace period. |
890 | */ |
891 | static void z_erofs_rcu_callback(struct rcu_head *head) |
892 | { |
893 | z_erofs_free_pcluster(container_of(head, |
894 | struct z_erofs_pcluster, rcu)); |
895 | } |
896 | |
897 | void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) |
898 | { |
899 | struct z_erofs_pcluster *const pcl = |
900 | container_of(grp, struct z_erofs_pcluster, obj); |
901 | |
902 | call_rcu(&pcl->rcu, z_erofs_rcu_callback); |
903 | } |
904 | |
905 | static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) |
906 | { |
907 | struct z_erofs_pcluster *pcl = fe->pcl; |
908 | |
909 | if (!pcl) |
910 | return; |
911 | |
912 | z_erofs_bvec_iter_end(&fe->biter); |
913 | mutex_unlock(&pcl->lock); |
914 | |
915 | if (fe->candidate_bvpage) |
916 | fe->candidate_bvpage = NULL; |
917 | |
918 | /* |
919 | * if all pending pages are added, don't hold its reference |
920 | * any longer if the pcluster isn't hosted by ourselves. |
921 | */ |
922 | if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) |
923 | erofs_workgroup_put(&pcl->obj); |
924 | |
925 | fe->pcl = NULL; |
926 | } |
927 | |
928 | static int z_erofs_read_fragment(struct super_block *sb, struct page *page, |
929 | unsigned int cur, unsigned int end, erofs_off_t pos) |
930 | { |
931 | struct inode *packed_inode = EROFS_SB(sb)->packed_inode; |
932 | struct erofs_buf buf = __EROFS_BUF_INITIALIZER; |
933 | unsigned int cnt; |
934 | u8 *src; |
935 | |
936 | if (!packed_inode) |
937 | return -EFSCORRUPTED; |
938 | |
939 | buf.inode = packed_inode; |
940 | for (; cur < end; cur += cnt, pos += cnt) { |
941 | cnt = min_t(unsigned int, end - cur, |
942 | sb->s_blocksize - erofs_blkoff(sb, pos)); |
943 | src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); |
944 | if (IS_ERR(src)) { |
945 | erofs_put_metabuf(&buf); |
946 | return PTR_ERR(src); |
947 | } |
948 | memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); |
949 | } |
950 | erofs_put_metabuf(&buf); |
951 | return 0; |
952 | } |
953 | |
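/*
 * Scan a file-backed folio from its end towards its start: map each
 * logical extent it covers, zero out unmapped parts, copy fragment data
 * from the packed inode, and attach the remaining parts to their
 * pclusters for decompression.
 */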
954 | static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *fe, |
955 | struct folio *folio, bool ra) |
956 | { |
957 | struct inode *const inode = fe->inode; |
958 | struct erofs_map_blocks *const map = &fe->map; |
959 | const loff_t offset = folio_pos(folio); |
960 | const unsigned int bs = i_blocksize(inode), fs = folio_size(folio); |
961 | bool tight = true, exclusive; |
962 | unsigned int cur, end, len, split; |
963 | int err = 0; |
964 | |
965 | z_erofs_onlinefolio_init(folio); |
966 | split = 0; |
967 | end = fs; |
968 | repeat: |
969 | if (offset + end - 1 < map->m_la || |
970 | offset + end - 1 >= map->m_la + map->m_llen) { |
971 | z_erofs_pcluster_end(fe); |
972 | map->m_la = offset + end - 1; |
973 | map->m_llen = 0; |
974 | err = z_erofs_map_blocks_iter(inode, map, 0); |
975 | if (err) |
976 | goto out; |
977 | } |
978 | |
979 | cur = offset > map->m_la ? 0 : map->m_la - offset; |
980 | /* bump split parts first to avoid several separate cases */ |
981 | ++split; |
982 | |
983 | if (!(map->m_flags & EROFS_MAP_MAPPED)) { |
984 | folio_zero_segment(folio, cur, end); |
985 | tight = false; |
986 | goto next_part; |
987 | } |
988 | |
989 | if (map->m_flags & EROFS_MAP_FRAGMENT) { |
990 | erofs_off_t fpos = offset + cur - map->m_la; |
991 | |
992 | len = min_t(unsigned int, map->m_llen - fpos, end - cur); |
993 | err = z_erofs_read_fragment(inode->i_sb, &folio->page, cur, |
994 | cur + len, EROFS_I(inode)->z_fragmentoff + fpos); |
995 | if (err) |
996 | goto out; |
997 | tight = false; |
998 | goto next_part; |
999 | } |
1000 | |
1001 | if (!fe->pcl) { |
1002 | err = z_erofs_pcluster_begin(fe); |
1003 | if (err) |
1004 | goto out; |
1005 | fe->pcl->besteffort |= !ra; |
1006 | } |
1007 | |
1008 | /* |
1009 | * Ensure the current partial folio belongs to this submit chain rather |
1010 | * than other concurrent submit chains or the noio(bypass) chain since |
1011 | * those chains are handled asynchronously thus the folio cannot be used |
1012 | * for inplace I/O or bvpage (should be processed in a strict order.) |
1013 | */ |
1014 | tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); |
1015 | exclusive = (!cur && ((split <= 1) || (tight && bs == fs))); |
1016 | if (cur) |
1017 | tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); |
1018 | |
1019 | err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { |
1020 | .page = &folio->page, |
1021 | .offset = offset - map->m_la, |
1022 | .end = end, |
1023 | }), exclusive); |
1024 | if (err) |
1025 | goto out; |
1026 | |
1027 | z_erofs_onlinefolio_split(folio); |
1028 | if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) |
1029 | fe->pcl->multibases = true; |
1030 | if (fe->pcl->length < offset + end - map->m_la) { |
1031 | fe->pcl->length = offset + end - map->m_la; |
1032 | fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; |
1033 | } |
1034 | if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && |
1035 | !(map->m_flags & EROFS_MAP_PARTIAL_REF) && |
1036 | fe->pcl->length == map->m_llen) |
1037 | fe->pcl->partial = false; |
1038 | next_part: |
1039 | /* shorten the remaining extent to update progress */ |
1040 | map->m_llen = offset + cur - map->m_la; |
1041 | map->m_flags &= ~EROFS_MAP_FULL_MAPPED; |
1042 | |
1043 | end = cur; |
1044 | if (end > 0) |
1045 | goto repeat; |
1046 | |
1047 | out: |
1048 | z_erofs_onlinefolio_end(folio, err); |
1049 | return err; |
1050 | } |
1051 | |
1052 | static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, |
1053 | unsigned int readahead_pages) |
1054 | { |
1055 | /* auto: enable for read_folio, disable for readahead */ |
1056 | if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) && |
1057 | !readahead_pages) |
1058 | return true; |
1059 | |
1060 | if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) && |
1061 | (readahead_pages <= sbi->opt.max_sync_decompress_pages)) |
1062 | return true; |
1063 | |
1064 | return false; |
1065 | } |
1066 | |
1067 | static bool z_erofs_page_is_invalidated(struct page *page) |
1068 | { |
1069 | return !page->mapping && !z_erofs_is_shortlived_page(page); |
1070 | } |
1071 | |
1072 | struct z_erofs_decompress_backend { |
1073 | struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; |
1074 | struct super_block *sb; |
1075 | struct z_erofs_pcluster *pcl; |
1076 | |
1077 | /* pages with the longest decompressed length for deduplication */ |
1078 | struct page **decompressed_pages; |
1079 | /* pages to keep the compressed data */ |
1080 | struct page **compressed_pages; |
1081 | |
1082 | struct list_head decompressed_secondary_bvecs; |
1083 | struct page **pagepool; |
1084 | unsigned int onstack_used, nr_pages; |
1085 | }; |
1086 | |
1087 | struct z_erofs_bvec_item { |
1088 | struct z_erofs_bvec bvec; |
1089 | struct list_head list; |
1090 | }; |
1091 | |
1092 | static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, |
1093 | struct z_erofs_bvec *bvec) |
1094 | { |
1095 | struct z_erofs_bvec_item *item; |
1096 | unsigned int pgnr; |
1097 | |
1098 | if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) && |
1099 | (bvec->end == PAGE_SIZE || |
1100 | bvec->offset + bvec->end == be->pcl->length)) { |
1101 | pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; |
1102 | DBG_BUGON(pgnr >= be->nr_pages); |
1103 | if (!be->decompressed_pages[pgnr]) { |
1104 | be->decompressed_pages[pgnr] = bvec->page; |
1105 | return; |
1106 | } |
1107 | } |
1108 | |
1109 | /* (cold path) one pcluster is requested multiple times */ |
1110 | item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); |
1111 | item->bvec = *bvec; |
1112 | list_add(&item->list, &be->decompressed_secondary_bvecs); |
1113 | } |
1114 | |
1115 | static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, |
1116 | int err) |
1117 | { |
1118 | unsigned int off0 = be->pcl->pageofs_out; |
1119 | struct list_head *p, *n; |
1120 | |
1121 | list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { |
1122 | struct z_erofs_bvec_item *bvi; |
1123 | unsigned int end, cur; |
1124 | void *dst, *src; |
1125 | |
1126 | bvi = container_of(p, struct z_erofs_bvec_item, list); |
1127 | cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0; |
1128 | end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, |
1129 | bvi->bvec.end); |
1130 | dst = kmap_local_page(bvi->bvec.page); |
1131 | while (cur < end) { |
1132 | unsigned int pgnr, scur, len; |
1133 | |
1134 | pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; |
1135 | DBG_BUGON(pgnr >= be->nr_pages); |
1136 | |
1137 | scur = bvi->bvec.offset + cur - |
1138 | ((pgnr << PAGE_SHIFT) - off0); |
1139 | len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); |
1140 | if (!be->decompressed_pages[pgnr]) { |
1141 | err = -EFSCORRUPTED; |
1142 | cur += len; |
1143 | continue; |
1144 | } |
1145 | src = kmap_local_page(be->decompressed_pages[pgnr]); |
1146 | memcpy(dst + cur, src + scur, len); |
1147 | kunmap_local(src); |
1148 | cur += len; |
1149 | } |
1150 | kunmap_local(dst); |
1151 | z_erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); |
1152 | list_del(p); |
1153 | kfree(bvi); |
1154 | } |
1155 | } |
1156 | |
1157 | static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) |
1158 | { |
1159 | struct z_erofs_pcluster *pcl = be->pcl; |
1160 | struct z_erofs_bvec_iter biter; |
1161 | struct page *old_bvpage; |
1162 | int i; |
1163 | |
1164 | z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); |
1165 | for (i = 0; i < pcl->vcnt; ++i) { |
1166 | struct z_erofs_bvec bvec; |
1167 | |
1168 | z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); |
1169 | |
1170 | if (old_bvpage) |
1171 | z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); |
1172 | |
1173 | DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); |
1174 | z_erofs_do_decompressed_bvec(be, &bvec); |
1175 | } |
1176 | |
1177 | old_bvpage = z_erofs_bvec_iter_end(&biter); |
1178 | if (old_bvpage) |
1179 | z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); |
1180 | } |
1181 | |
1182 | static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, |
1183 | bool *overlapped) |
1184 | { |
1185 | struct z_erofs_pcluster *pcl = be->pcl; |
1186 | unsigned int pclusterpages = z_erofs_pclusterpages(pcl); |
1187 | int i, err = 0; |
1188 | |
1189 | *overlapped = false; |
1190 | for (i = 0; i < pclusterpages; ++i) { |
1191 | struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; |
1192 | struct page *page = bvec->page; |
1193 | |
1194 | /* compressed data ought to be valid before decompressing */ |
1195 | if (!page) { |
1196 | err = -EIO; |
1197 | continue; |
1198 | } |
1199 | be->compressed_pages[i] = page; |
1200 | |
1201 | if (z_erofs_is_inline_pcluster(pcl) || |
1202 | erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) { |
1203 | if (!PageUptodate(page)) |
1204 | err = -EIO; |
1205 | continue; |
1206 | } |
1207 | |
1208 | DBG_BUGON(z_erofs_page_is_invalidated(page)); |
1209 | if (z_erofs_is_shortlived_page(page)) |
1210 | continue; |
1211 | z_erofs_do_decompressed_bvec(be, bvec); |
1212 | *overlapped = true; |
1213 | } |
1214 | return err; |
1215 | } |
1216 | |
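/*
 * Decompress a single pcluster: collect the output and input page arrays
 * (kept on the stack if small enough), run the decompressor, then release
 * the compressed pages and complete all attached online folios.
 */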
1217 | static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, |
1218 | int err) |
1219 | { |
1220 | struct erofs_sb_info *const sbi = EROFS_SB(be->sb); |
1221 | struct z_erofs_pcluster *pcl = be->pcl; |
1222 | unsigned int pclusterpages = z_erofs_pclusterpages(pcl); |
1223 | const struct z_erofs_decompressor *decomp = |
1224 | &erofs_decompressors[pcl->algorithmformat]; |
1225 | int i, err2; |
1226 | struct page *page; |
1227 | bool overlapped; |
1228 | |
1229 | mutex_lock(&pcl->lock); |
1230 | be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; |
1231 | |
1232 | /* allocate (de)compressed page arrays if cannot be kept on stack */ |
1233 | be->decompressed_pages = NULL; |
1234 | be->compressed_pages = NULL; |
1235 | be->onstack_used = 0; |
1236 | if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { |
1237 | be->decompressed_pages = be->onstack_pages; |
1238 | be->onstack_used = be->nr_pages; |
1239 | memset(be->decompressed_pages, 0, |
1240 | sizeof(struct page *) * be->nr_pages); |
1241 | } |
1242 | |
1243 | if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) |
1244 | be->compressed_pages = be->onstack_pages + be->onstack_used; |
1245 | |
1246 | if (!be->decompressed_pages) |
1247 | be->decompressed_pages = |
1248 | kvcalloc(be->nr_pages, sizeof(struct page *), |
1249 | GFP_KERNEL | __GFP_NOFAIL); |
1250 | if (!be->compressed_pages) |
1251 | be->compressed_pages = |
1252 | kvcalloc(pclusterpages, sizeof(struct page *), |
1253 | GFP_KERNEL | __GFP_NOFAIL); |
1254 | |
1255 | z_erofs_parse_out_bvecs(be); |
1256 | err2 = z_erofs_parse_in_bvecs(be, &overlapped); |
1257 | if (err2) |
1258 | err = err2; |
1259 | if (!err) |
1260 | err = decomp->decompress(&(struct z_erofs_decompress_req) { |
1261 | .sb = be->sb, |
1262 | .in = be->compressed_pages, |
1263 | .out = be->decompressed_pages, |
1264 | .pageofs_in = pcl->pageofs_in, |
1265 | .pageofs_out = pcl->pageofs_out, |
1266 | .inputsize = pcl->pclustersize, |
1267 | .outputsize = pcl->length, |
1268 | .alg = pcl->algorithmformat, |
1269 | .inplace_io = overlapped, |
1270 | .partial_decoding = pcl->partial, |
1271 | .fillgaps = pcl->multibases, |
1272 | .gfp = pcl->besteffort ? |
1273 | GFP_KERNEL | __GFP_NOFAIL : |
1274 | GFP_NOWAIT | __GFP_NORETRY |
1275 | }, be->pagepool); |
1276 | |
1277 | /* must handle all compressed pages before actual file pages */ |
1278 | if (z_erofs_is_inline_pcluster(pcl)) { |
1279 | page = pcl->compressed_bvecs[0].page; |
1280 | WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); |
1281 | put_page(page); |
1282 | } else { |
1283 | for (i = 0; i < pclusterpages; ++i) { |
1284 | /* consider shortlived pages added when decompressing */ |
1285 | page = be->compressed_pages[i]; |
1286 | |
1287 | if (!page || |
1288 | erofs_folio_is_managed(sbi, page_folio(page))) |
1289 | continue; |
1290 | (void)z_erofs_put_shortlivedpage(be->pagepool, page); |
1291 | WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); |
1292 | } |
1293 | } |
1294 | if (be->compressed_pages < be->onstack_pages || |
1295 | be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) |
1296 | kvfree(be->compressed_pages); |
1297 | z_erofs_fill_other_copies(be, err); |
1298 | |
1299 | for (i = 0; i < be->nr_pages; ++i) { |
1300 | page = be->decompressed_pages[i]; |
1301 | if (!page) |
1302 | continue; |
1303 | |
1304 | DBG_BUGON(z_erofs_page_is_invalidated(page)); |
1305 | |
1306 | /* recycle all individual short-lived pages */ |
1307 | if (z_erofs_put_shortlivedpage(be->pagepool, page)) |
1308 | continue; |
1309 | z_erofs_onlinefolio_end(page_folio(page), err); |
1310 | } |
1311 | |
1312 | if (be->decompressed_pages != be->onstack_pages) |
1313 | kvfree(be->decompressed_pages); |
1314 | |
1315 | pcl->length = 0; |
1316 | pcl->partial = true; |
1317 | pcl->multibases = false; |
1318 | pcl->besteffort = false; |
1319 | pcl->bvset.nextpage = NULL; |
1320 | pcl->vcnt = 0; |
1321 | |
1322 | /* pcluster lock MUST be taken before the following line */ |
1323 | WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); |
1324 | mutex_unlock(&pcl->lock); |
1325 | return err; |
1326 | } |
1327 | |
1328 | static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, |
1329 | struct page **pagepool) |
1330 | { |
1331 | struct z_erofs_decompress_backend be = { |
1332 | .sb = io->sb, |
1333 | .pagepool = pagepool, |
1334 | .decompressed_secondary_bvecs = |
1335 | LIST_HEAD_INIT(be.decompressed_secondary_bvecs), |
1336 | }; |
1337 | z_erofs_next_pcluster_t owned = io->head; |
1338 | |
1339 | while (owned != Z_EROFS_PCLUSTER_TAIL) { |
1340 | DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); |
1341 | |
1342 | be.pcl = container_of(owned, struct z_erofs_pcluster, next); |
1343 | owned = READ_ONCE(be.pcl->next); |
1344 | |
1345 | z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); |
1346 | if (z_erofs_is_inline_pcluster(be.pcl)) |
1347 | z_erofs_free_pcluster(be.pcl); |
1348 | else |
1349 | erofs_workgroup_put(&be.pcl->obj); |
1350 | } |
1351 | } |
1352 | |
1353 | static void z_erofs_decompressqueue_work(struct work_struct *work) |
1354 | { |
1355 | struct z_erofs_decompressqueue *bgq = |
1356 | container_of(work, struct z_erofs_decompressqueue, u.work); |
1357 | struct page *pagepool = NULL; |
1358 | |
1359 | DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL); |
1360 | z_erofs_decompress_queue(bgq, &pagepool); |
1361 | erofs_release_pages(&pagepool); |
1362 | kvfree(bgq); |
1363 | } |
1364 | |
1365 | #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD |
1366 | static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) |
1367 | { |
1368 | z_erofs_decompressqueue_work((struct work_struct *)work); |
1369 | } |
1370 | #endif |
1371 | |
1372 | static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, |
1373 | int bios) |
1374 | { |
1375 | struct erofs_sb_info *const sbi = EROFS_SB(io->sb); |
1376 | |
1377 | /* wake up the caller thread for sync decompression */ |
1378 | if (io->sync) { |
1379 | if (!atomic_add_return(bios, &io->pending_bios)) |
1380 | complete(&io->u.done); |
1381 | return; |
1382 | } |
1383 | |
1384 | if (atomic_add_return(bios, &io->pending_bios)) |
1385 | return; |
1386 | /* Use (kthread_)work and sync decompression for atomic contexts only */ |
1387 | if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) { |
1388 | #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD |
1389 | struct kthread_worker *worker; |
1390 | |
1391 | rcu_read_lock(); |
1392 | worker = rcu_dereference( |
1393 | z_erofs_pcpu_workers[raw_smp_processor_id()]); |
1394 | if (!worker) { |
1395 | INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); |
1396 | queue_work(z_erofs_workqueue, &io->u.work); |
1397 | } else { |
1398 | kthread_queue_work(worker, &io->u.kthread_work); |
1399 | } |
1400 | rcu_read_unlock(); |
1401 | #else |
1402 | queue_work(z_erofs_workqueue, &io->u.work); |
1403 | #endif |
1404 | /* enable sync decompression for readahead */ |
1405 | if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) |
1406 | sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; |
1407 | return; |
1408 | } |
1409 | z_erofs_decompressqueue_work(&io->u.work); |
1410 | } |
1411 | |
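/*
 * Fill one bio_vec for the @nr-th compressed page of @pcl before I/O
 * submission: prefer a cached folio in the managed mapping or an in-place
 * file-backed folio if available, otherwise allocate a new page;
 * bvec->bv_page is set to NULL when the cached folio is already
 * up-to-date so no I/O is needed.
 */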
1412 | static void z_erofs_fill_bio_vec(struct bio_vec *bvec, |
1413 | struct z_erofs_decompress_frontend *f, |
1414 | struct z_erofs_pcluster *pcl, |
1415 | unsigned int nr, |
1416 | struct address_space *mc) |
1417 | { |
1418 | gfp_t gfp = mapping_gfp_mask(mc); |
1419 | bool tocache = false; |
1420 | struct z_erofs_bvec zbv; |
1421 | struct address_space *mapping; |
1422 | struct page *page; |
1423 | int bs = i_blocksize(f->inode); |
1424 | |
1425 | /* Except for inplace folios, the entire folio can be used for I/Os */ |
1426 | bvec->bv_offset = 0; |
1427 | bvec->bv_len = PAGE_SIZE; |
1428 | repeat: |
1429 | spin_lock(&pcl->obj.lockref.lock); |
1430 | zbv = pcl->compressed_bvecs[nr]; |
1431 | spin_unlock(&pcl->obj.lockref.lock); |
1432 | if (!zbv.folio) |
1433 | goto out_allocfolio; |
1434 | |
1435 | bvec->bv_page = &zbv.folio->page; |
1436 | DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page)); |
1437 | /* |
1438 | * Handle preallocated cached folios. We tried to allocate such folios |
1439 | * without triggering direct reclaim. If allocation failed, inplace |
1440 | * file-backed folios will be used instead. |
1441 | */ |
1442 | if (zbv.folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) { |
1443 | zbv.folio->private = 0; |
1444 | tocache = true; |
1445 | goto out_tocache; |
1446 | } |
1447 | |
1448 | mapping = READ_ONCE(zbv.folio->mapping); |
1449 | /* |
1450 | * File-backed folios for inplace I/Os are all locked steady, |
1451 | * therefore it is impossible for `mapping` to be NULL. |
1452 | */ |
1453 | if (mapping && mapping != mc) { |
1454 | if (zbv.offset < 0) |
1455 | bvec->bv_offset = round_up(-zbv.offset, bs); |
1456 | bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset; |
1457 | return; |
1458 | } |
1459 | |
1460 | folio_lock(zbv.folio); |
1461 | if (zbv.folio->mapping == mc) { |
1462 | /* |
1463 | * The cached folio is still in managed cache but without |
1464 | * a valid `->private` pcluster hint. Let's reconnect them. |
1465 | */ |
1466 | if (!folio_test_private(zbv.folio)) { |
1467 | folio_attach_private(zbv.folio, pcl); |
1468 | /* compressed_bvecs[] already takes a ref before */ |
1469 | folio_put(zbv.folio); |
1470 | } |
1471 | |
1472 | /* no need to submit if it is already up-to-date */ |
1473 | if (folio_test_uptodate(zbv.folio)) { |
1474 | folio_unlock(zbv.folio); |
1475 | bvec->bv_page = NULL; |
1476 | } |
1477 | return; |
1478 | } |
1479 | |
1480 | /* |
1481 | * It has been truncated, so it's unsafe to reuse this one. Let's |
1482 | * allocate a new page for compressed data. |
1483 | */ |
1484 | DBG_BUGON(zbv.folio->mapping); |
1485 | tocache = true; |
1486 | folio_unlock(zbv.folio); |
1487 | folio_put(zbv.folio); |
1488 | out_allocfolio: |
1489 | page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); |
1490 | spin_lock(&pcl->obj.lockref.lock); |
1491 | if (pcl->compressed_bvecs[nr].folio) { |
1492 | erofs_pagepool_add(&f->pagepool, page); |
1493 | spin_unlock(&pcl->obj.lockref.lock); |
1494 | cond_resched(); |
1495 | goto repeat; |
1496 | } |
1497 | pcl->compressed_bvecs[nr].folio = zbv.folio = page_folio(page); |
1498 | spin_unlock(&pcl->obj.lockref.lock); |
1499 | bvec->bv_page = page; |
1500 | out_tocache: |
1501 | if (!tocache || bs != PAGE_SIZE || |
1502 | filemap_add_folio(mc, zbv.folio, pcl->obj.index + nr, gfp)) { |
1503 | /* turn into a temporary shortlived folio (1 ref) */ |
1504 | zbv.folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; |
1505 | return; |
1506 | } |
1507 | folio_attach_private(zbv.folio, pcl); |
1508 | /* drop a refcount added by allocpage (then 2 refs in total here) */ |
1509 | folio_put(zbv.folio); |
1510 | } |
1511 | |
1512 | static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, |
1513 | struct z_erofs_decompressqueue *fgq, bool *fg) |
1514 | { |
1515 | struct z_erofs_decompressqueue *q; |
1516 | |
1517 | if (fg && !*fg) { |
1518 | q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN); |
1519 | if (!q) { |
1520 | *fg = true; |
1521 | goto fg_out; |
1522 | } |
1523 | #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD |
1524 | kthread_init_work(&q->u.kthread_work, |
1525 | z_erofs_decompressqueue_kthread_work); |
1526 | #else |
1527 | INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); |
1528 | #endif |
1529 | } else { |
1530 | fg_out: |
1531 | q = fgq; |
1532 | init_completion(&fgq->u.done); |
1533 | atomic_set(&fgq->pending_bios, 0); |
1534 | q->eio = false; |
1535 | q->sync = true; |
1536 | } |
1537 | q->sb = sb; |
1538 | q->head = Z_EROFS_PCLUSTER_TAIL; |
1539 | return q; |
1540 | } |
1541 | |
1542 | /* define decompression jobqueue types */ |
1543 | enum { |
1544 | JQ_BYPASS, |
1545 | JQ_SUBMIT, |
1546 | NR_JOBQUEUES, |
1547 | }; |
1548 | |
1549 | static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, |
1550 | z_erofs_next_pcluster_t qtail[], |
1551 | z_erofs_next_pcluster_t owned_head) |
1552 | { |
1553 | z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; |
1554 | z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; |
1555 | |
1556 | WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); |
1557 | |
1558 | WRITE_ONCE(*submit_qtail, owned_head); |
1559 | WRITE_ONCE(*bypass_qtail, &pcl->next); |
1560 | |
1561 | qtail[JQ_BYPASS] = &pcl->next; |
1562 | } |
1563 | |
1564 | static void z_erofs_endio(struct bio *bio) |
1565 | { |
1566 | struct z_erofs_decompressqueue *q = bio->bi_private; |
1567 | blk_status_t err = bio->bi_status; |
1568 | struct folio_iter fi; |
1569 | |
1570 | bio_for_each_folio_all(fi, bio) { |
1571 | struct folio *folio = fi.folio; |
1572 | |
1573 | DBG_BUGON(folio_test_uptodate(folio)); |
1574 | DBG_BUGON(z_erofs_page_is_invalidated(&folio->page)); |
1575 | if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio)) |
1576 | continue; |
1577 | |
1578 | if (!err) |
1579 | folio_mark_uptodate(folio); |
1580 | folio_unlock(folio); |
1581 | } |
1582 | if (err) |
1583 | q->eio = true; |
1584 | z_erofs_decompress_kickoff(q, -1); |
1585 | if (bio->bi_bdev) |
1586 | bio_put(bio); |
1587 | } |
1588 | |
1589 | static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, |
1590 | struct z_erofs_decompressqueue *fgq, |
1591 | bool *force_fg, bool readahead) |
1592 | { |
1593 | struct super_block *sb = f->inode->i_sb; |
1594 | struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); |
1595 | z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; |
1596 | struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; |
1597 | z_erofs_next_pcluster_t owned_head = f->owned_head; |
1598 | /* bio is NULL initially, so no need to initialize last_{index,bdev} */ |
1599 | erofs_off_t last_pa; |
1600 | unsigned int nr_bios = 0; |
1601 | struct bio *bio = NULL; |
1602 | unsigned long pflags; |
1603 | int memstall = 0; |
1604 | |
1605 | /* No need to read from device for pclusters in the bypass queue. */ |
1606 | q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); |
1607 | q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); |
1608 | |
1609 | qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; |
1610 | qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; |
1611 | |
1612 | /* by default, all need io submission */ |
1613 | q[JQ_SUBMIT]->head = owned_head; |
1614 | |
1615 | do { |
1616 | struct erofs_map_dev mdev; |
1617 | struct z_erofs_pcluster *pcl; |
1618 | erofs_off_t cur, end; |
1619 | struct bio_vec bvec; |
1620 | unsigned int i = 0; |
1621 | bool bypass = true; |
1622 | |
1623 | DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); |
1624 | pcl = container_of(owned_head, struct z_erofs_pcluster, next); |
1625 | owned_head = READ_ONCE(pcl->next); |
1626 | |
1627 | if (z_erofs_is_inline_pcluster(pcl)) { |
1628 | move_to_bypass_jobqueue(pcl, qtail, owned_head); |
1629 | continue; |
1630 | } |
1631 | |
1632 | /* no device id here, thus it will always succeed */ |
1633 | mdev = (struct erofs_map_dev) { |
1634 | .m_pa = erofs_pos(sb, pcl->obj.index), |
1635 | }; |
1636 | (void)erofs_map_dev(sb, &mdev); |
1637 | |
1638 | cur = mdev.m_pa; |
1639 | end = cur + pcl->pclustersize; |
1640 | do { |
1641 | z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); |
1642 | if (!bvec.bv_page) |
1643 | continue; |
1644 | |
			if (bio && (cur != last_pa ||
				    bio->bi_bdev != mdev.m_bdev)) {
io_retry:
				if (!erofs_is_fscache_mode(sb))
					submit_bio(bio);
				else
					erofs_fscache_submit_bio(bio);

				if (memstall) {
					psi_memstall_leave(&pflags);
					memstall = 0;
				}
				bio = NULL;
			}

			if (unlikely(PageWorkingset(bvec.bv_page)) &&
			    !memstall) {
				psi_memstall_enter(&pflags);
				memstall = 1;
			}

			if (!bio) {
				bio = erofs_is_fscache_mode(sb) ?
					erofs_fscache_bio_alloc(&mdev) :
					bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
						  REQ_OP_READ, GFP_NOIO);
				bio->bi_end_io = z_erofs_endio;
				bio->bi_iter.bi_sector = cur >> 9;
				bio->bi_private = q[JQ_SUBMIT];
				if (readahead)
					bio->bi_opf |= REQ_RAHEAD;
				++nr_bios;
			}

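			/*
			 * Clamp the last chunk to the pcluster end; if the bio
			 * is already full, flush it via the io_retry label and
			 * add the page again to a freshly allocated bio.
			 */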
			if (cur + bvec.bv_len > end)
				bvec.bv_len = end - cur;
			DBG_BUGON(bvec.bv_len < sb->s_blocksize);
			if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
					  bvec.bv_offset))
				goto io_retry;

			last_pa = cur + bvec.bv_len;
			bypass = false;
		} while ((cur += bvec.bv_len) < end);

		if (!bypass)
			qtail[JQ_SUBMIT] = &pcl->next;
		else
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);

	if (bio) {
		if (!erofs_is_fscache_mode(sb))
			submit_bio(bio);
		else
			erofs_fscache_submit_bio(bio);
		if (memstall)
			psi_memstall_leave(&pflags);
	}

	/*
	 * although background is preferred, no one is pending for submission.
	 * don't issue decompression but drop it directly instead.
	 */
	if (!*force_fg && !nr_bios) {
		kvfree(q[JQ_SUBMIT]);
		return;
	}
	z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}

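/*
 * Submit all collected pclusters and then decompress: the bypass queue needs
 * no I/O and is handled immediately, while the submit queue is either
 * decompressed here after waiting for I/O completion (foreground) or left to
 * be kicked off from the bio completion path (background).
 */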
static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
			     bool force_fg, bool ra)
{
	struct z_erofs_decompressqueue io[NR_JOBQUEUES];

	if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
		return;
	z_erofs_submit_queue(f, io, &force_fg, ra);

	/* handle bypass queue (no i/o pclusters) immediately */
	z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);

	if (!force_fg)
		return;

	/* wait until all bios are completed */
	wait_for_completion_io(&io[JQ_SUBMIT].u.done);

	/* handle synchronous decompress queue in the caller context */
	z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
}

/*
 * Since partial uptodate is still unimplemented for now, we have to use
 * approximate readmore strategies as a start.
 */
static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
				      struct readahead_control *rac,
				      bool backmost)
{
	struct inode *inode = f->inode;
	struct erofs_map_blocks *map = &f->map;
	erofs_off_t cur, end, headoffset = f->headoffset;
	int err;

	if (backmost) {
		if (rac)
			end = headoffset + readahead_length(rac) - 1;
		else
			end = headoffset + PAGE_SIZE - 1;
		map->m_la = end;
		err = z_erofs_map_blocks_iter(inode, map,
					      EROFS_GET_BLOCKS_READMORE);
		if (err)
			return;

		/* expand ra for the trailing edge if readahead */
		if (rac) {
			cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
			readahead_expand(rac, headoffset, cur - headoffset);
			return;
		}
		end = round_up(end, PAGE_SIZE);
	} else {
		end = round_up(map->m_la, PAGE_SIZE);

		if (!map->m_llen)
			return;
	}

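	/*
	 * Walk backwards from the end of the mapped extent towards @end and
	 * feed each page in that range which is not yet uptodate into the
	 * current decompression frontend (the readmore heuristic above).
	 */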
	cur = map->m_la + map->m_llen - 1;
	while ((cur >= end) && (cur < i_size_read(inode))) {
		pgoff_t index = cur >> PAGE_SHIFT;
		struct page *page;

		page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
		if (page) {
			if (PageUptodate(page))
				unlock_page(page);
			else
				z_erofs_scan_folio(f, page_folio(page), !!rac);
			put_page(page);
		}

		if (cur < PAGE_SIZE)
			break;
		cur = (index << PAGE_SHIFT) - 1;
	}
}

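/*
 * ->read_folio() entry for compressed inodes: scan the target folio together
 * with the readmore edges around it, then submit the collected pclusters;
 * decompression happens in this context if z_erofs_is_sync_decompress()
 * selects foreground mode.
 */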
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
	struct inode *const inode = folio->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	int err;

	trace_erofs_read_folio(folio, false);
	f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;

	z_erofs_pcluster_readmore(&f, NULL, true);
	err = z_erofs_scan_folio(&f, folio, false);
	z_erofs_pcluster_readmore(&f, NULL, false);
	z_erofs_pcluster_end(&f);

	/* if some compressed clusters are ready, submit them anyway */
	z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);

	if (err && err != -EINTR)
		erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
			  err, folio->index, EROFS_I(inode)->nid);

	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&f.pagepool);
	return err;
}

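/*
 * ->readahead() entry for compressed inodes: link all folios of the request
 * into a list first, then scan them in reverse order (see the comment below)
 * before submitting the collected pclusters.
 */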
static void z_erofs_readahead(struct readahead_control *rac)
{
	struct inode *const inode = rac->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	struct folio *head = NULL, *folio;
	unsigned int nr_folios;
	int err;

	f.headoffset = readahead_pos(rac);

	z_erofs_pcluster_readmore(&f, rac, true);
	nr_folios = readahead_count(rac);
	trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);

	while ((folio = readahead_folio(rac))) {
		folio->private = head;
		head = folio;
	}

	/* traverse in reverse order for best metadata I/O performance */
	while (head) {
		folio = head;
		head = folio_get_private(folio);

		err = z_erofs_scan_folio(&f, folio, true);
		if (err && err != -EINTR)
			erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
				  folio->index, EROFS_I(inode)->nid);
	}
	z_erofs_pcluster_readmore(&f, rac, false);
	z_erofs_pcluster_end(&f);

	z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&f.pagepool);
}

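/* address_space operations used by compressed (z_erofs) inodes */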
const struct address_space_operations z_erofs_aops = {
	.read_folio = z_erofs_read_folio,
	.readahead = z_erofs_readahead,
};