1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright (C) 2012 Fusion-io All rights reserved. |
4 | * Copyright (C) 2012 Intel Corp. All rights reserved. |
5 | */ |
6 | |
7 | #include <linux/sched.h> |
8 | #include <linux/bio.h> |
9 | #include <linux/slab.h> |
10 | #include <linux/blkdev.h> |
11 | #include <linux/raid/pq.h> |
12 | #include <linux/hash.h> |
13 | #include <linux/list_sort.h> |
14 | #include <linux/raid/xor.h> |
15 | #include <linux/mm.h> |
16 | #include "messages.h" |
17 | #include "ctree.h" |
18 | #include "disk-io.h" |
19 | #include "volumes.h" |
20 | #include "raid56.h" |
21 | #include "async-thread.h" |
22 | #include "file-item.h" |
23 | #include "btrfs_inode.h" |
24 | |
25 | /* set when additional merges to this rbio are not allowed */ |
26 | #define RBIO_RMW_LOCKED_BIT 1 |
27 | |
28 | /* |
29 | * set when this rbio is sitting in the hash, but it is just a cache |
30 | * of past RMW |
31 | */ |
32 | #define RBIO_CACHE_BIT 2 |
33 | |
34 | /* |
35 | * set when it is safe to trust the stripe_pages for caching |
36 | */ |
37 | #define RBIO_CACHE_READY_BIT 3 |
38 | |
39 | #define RBIO_CACHE_SIZE 1024 |
40 | |
41 | #define BTRFS_STRIPE_HASH_TABLE_BITS 11 |
42 | |
43 | /* Used by the raid56 code to lock stripes for read/modify/write */ |
44 | struct btrfs_stripe_hash { |
45 | struct list_head hash_list; |
46 | spinlock_t lock; |
47 | }; |
48 | |
49 | /* Used by the raid56 code to lock stripes for read/modify/write */ |
50 | struct btrfs_stripe_hash_table { |
51 | struct list_head stripe_cache; |
52 | spinlock_t cache_lock; |
53 | int cache_size; |
54 | struct btrfs_stripe_hash table[]; |
55 | }; |
56 | |
57 | /* |
58 | * A bvec like structure to present a sector inside a page. |
59 | * |
60 | * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. |
61 | */ |
62 | struct sector_ptr { |
63 | struct page *page; |
64 | unsigned int pgoff:24; |
65 | unsigned int uptodate:8; |
66 | }; |
67 | |
68 | static void rmw_rbio_work(struct work_struct *work); |
69 | static void rmw_rbio_work_locked(struct work_struct *work); |
70 | static void index_rbio_pages(struct btrfs_raid_bio *rbio); |
71 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); |
72 | |
73 | static int finish_parity_scrub(struct btrfs_raid_bio *rbio); |
74 | static void scrub_rbio_work_locked(struct work_struct *work); |
75 | |
76 | static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) |
77 | { |
	bitmap_free(rbio->error_bitmap);
	kfree(rbio->stripe_pages);
	kfree(rbio->bio_sectors);
	kfree(rbio->stripe_sectors);
	kfree(rbio->finish_pointers);
83 | } |
84 | |
85 | static void free_raid_bio(struct btrfs_raid_bio *rbio) |
86 | { |
87 | int i; |
88 | |
	if (!refcount_dec_and_test(&rbio->refs))
90 | return; |
91 | |
92 | WARN_ON(!list_empty(&rbio->stripe_cache)); |
93 | WARN_ON(!list_empty(&rbio->hash_list)); |
94 | WARN_ON(!bio_list_empty(&rbio->bio_list)); |
95 | |
96 | for (i = 0; i < rbio->nr_pages; i++) { |
97 | if (rbio->stripe_pages[i]) { |
98 | __free_page(rbio->stripe_pages[i]); |
99 | rbio->stripe_pages[i] = NULL; |
100 | } |
101 | } |
102 | |
	btrfs_put_bioc(rbio->bioc);
	free_raid_bio_pointers(rbio);
	kfree(rbio);
106 | } |
107 | |
108 | static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) |
109 | { |
110 | INIT_WORK(&rbio->work, work_func); |
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
112 | } |
113 | |
114 | /* |
115 | * the stripe hash table is used for locking, and to collect |
116 | * bios in hopes of making a full stripe |
117 | */ |
118 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) |
119 | { |
120 | struct btrfs_stripe_hash_table *table; |
121 | struct btrfs_stripe_hash_table *x; |
122 | struct btrfs_stripe_hash *cur; |
123 | struct btrfs_stripe_hash *h; |
124 | int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; |
125 | int i; |
126 | |
127 | if (info->stripe_hash_table) |
128 | return 0; |
129 | |
130 | /* |
131 | * The table is large, starting with order 4 and can go as high as |
132 | * order 7 in case lock debugging is turned on. |
133 | * |
134 | * Try harder to allocate and fallback to vmalloc to lower the chance |
135 | * of a failing mount. |
136 | */ |
137 | table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); |
138 | if (!table) |
139 | return -ENOMEM; |
140 | |
141 | spin_lock_init(&table->cache_lock); |
	INIT_LIST_HEAD(&table->stripe_cache);
143 | |
144 | h = table->table; |
145 | |
146 | for (i = 0; i < num_entries; i++) { |
147 | cur = h + i; |
		INIT_LIST_HEAD(&cur->hash_list);
149 | spin_lock_init(&cur->lock); |
150 | } |
151 | |
152 | x = cmpxchg(&info->stripe_hash_table, NULL, table); |
	kvfree(x);
154 | return 0; |
155 | } |
156 | |
157 | /* |
158 | * caching an rbio means to copy anything from the |
159 | * bio_sectors array into the stripe_pages array. We |
160 | * use the page uptodate bit in the stripe cache array |
161 | * to indicate if it has valid data |
162 | * |
163 | * once the caching is done, we set the cache ready |
164 | * bit. |
165 | */ |
166 | static void cache_rbio_pages(struct btrfs_raid_bio *rbio) |
167 | { |
168 | int i; |
169 | int ret; |
170 | |
171 | ret = alloc_rbio_pages(rbio); |
172 | if (ret) |
173 | return; |
174 | |
175 | for (i = 0; i < rbio->nr_sectors; i++) { |
176 | /* Some range not covered by bio (partial write), skip it */ |
177 | if (!rbio->bio_sectors[i].page) { |
178 | /* |
179 | * Even if the sector is not covered by bio, if it is |
180 | * a data sector it should still be uptodate as it is |
181 | * read from disk. |
182 | */ |
183 | if (i < rbio->nr_data * rbio->stripe_nsectors) |
184 | ASSERT(rbio->stripe_sectors[i].uptodate); |
185 | continue; |
186 | } |
187 | |
188 | ASSERT(rbio->stripe_sectors[i].page); |
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
197 | } |
198 | |
199 | /* |
200 | * we hash on the first logical address of the stripe |
201 | */ |
202 | static int rbio_bucket(struct btrfs_raid_bio *rbio) |
203 | { |
204 | u64 num = rbio->bioc->full_stripe_logical; |
205 | |
206 | /* |
207 | * we shift down quite a bit. We're using byte |
208 | * addressing, and most of the lower bits are zeros. |
209 | * This tends to upset hash_64, and it consistently |
210 | * returns just one or two different values. |
211 | * |
212 | * shifting off the lower bits fixes things. |
213 | */ |
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
215 | } |
216 | |
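/* Check if every sector inside the given stripe page is marked uptodate. */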
217 | static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, |
218 | unsigned int page_nr) |
219 | { |
220 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
221 | const u32 sectors_per_page = PAGE_SIZE / sectorsize; |
222 | int i; |
223 | |
224 | ASSERT(page_nr < rbio->nr_pages); |
225 | |
226 | for (i = sectors_per_page * page_nr; |
227 | i < sectors_per_page * page_nr + sectors_per_page; |
228 | i++) { |
229 | if (!rbio->stripe_sectors[i].uptodate) |
230 | return false; |
231 | } |
232 | return true; |
233 | } |
234 | |
235 | /* |
236 | * Update the stripe_sectors[] array to use correct page and pgoff |
237 | * |
238 | * Should be called every time any page pointer in stripes_pages[] got modified. |
239 | */ |
240 | static void index_stripe_sectors(struct btrfs_raid_bio *rbio) |
241 | { |
242 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
243 | u32 offset; |
244 | int i; |
245 | |
246 | for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { |
247 | int page_index = offset >> PAGE_SHIFT; |
248 | |
249 | ASSERT(page_index < rbio->nr_pages); |
250 | rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; |
251 | rbio->stripe_sectors[i].pgoff = offset_in_page(offset); |
252 | } |
253 | } |
254 | |
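/*
 * Move one stripe page from @src to @dest, and mark all sectors inside that
 * page as uptodate in @dest.
 */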
255 | static void steal_rbio_page(struct btrfs_raid_bio *src, |
256 | struct btrfs_raid_bio *dest, int page_nr) |
257 | { |
258 | const u32 sectorsize = src->bioc->fs_info->sectorsize; |
259 | const u32 sectors_per_page = PAGE_SIZE / sectorsize; |
260 | int i; |
261 | |
262 | if (dest->stripe_pages[page_nr]) |
263 | __free_page(dest->stripe_pages[page_nr]); |
264 | dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; |
265 | src->stripe_pages[page_nr] = NULL; |
266 | |
267 | /* Also update the sector->uptodate bits. */ |
268 | for (i = sectors_per_page * page_nr; |
269 | i < sectors_per_page * page_nr + sectors_per_page; i++) |
270 | dest->stripe_sectors[i].uptodate = true; |
271 | } |
272 | |
273 | static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) |
274 | { |
275 | const int sector_nr = (page_nr << PAGE_SHIFT) >> |
276 | rbio->bioc->fs_info->sectorsize_bits; |
277 | |
278 | /* |
279 | * We have ensured PAGE_SIZE is aligned with sectorsize, thus |
280 | * we won't have a page which is half data half parity. |
281 | * |
282 | * Thus if the first sector of the page belongs to data stripes, then |
283 | * the full page belongs to data stripes. |
284 | */ |
285 | return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); |
286 | } |
287 | |
288 | /* |
289 | * Stealing an rbio means taking all the uptodate pages from the stripe array |
290 | * in the source rbio and putting them into the destination rbio. |
291 | * |
292 | * This will also update the involved stripe_sectors[] which are referring to |
293 | * the old pages. |
294 | */ |
295 | static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) |
296 | { |
297 | int i; |
298 | |
299 | if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) |
300 | return; |
301 | |
302 | for (i = 0; i < dest->nr_pages; i++) { |
303 | struct page *p = src->stripe_pages[i]; |
304 | |
305 | /* |
306 | * We don't need to steal P/Q pages as they will always be |
307 | * regenerated for RMW or full write anyway. |
308 | */ |
		if (!is_data_stripe_page(src, i))
310 | continue; |
311 | |
312 | /* |
313 | * If @src already has RBIO_CACHE_READY_BIT, it should have |
314 | * all data stripe pages present and uptodate. |
315 | */ |
316 | ASSERT(p); |
317 | ASSERT(full_page_sectors_uptodate(src, i)); |
		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
322 | } |
323 | |
/*
 * Merging means we take the bio_list from the victim and splice it into
 * the destination.  The victim should be discarded afterwards.
 *
 * Must be called with dest->bio_list_lock held.
 */
331 | static void merge_rbio(struct btrfs_raid_bio *dest, |
332 | struct btrfs_raid_bio *victim) |
333 | { |
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
	bio_list_init(&victim->bio_list);
340 | } |
341 | |
342 | /* |
343 | * used to prune items that are in the cache. The caller |
344 | * must hold the hash table lock. |
345 | */ |
346 | static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) |
347 | { |
348 | int bucket = rbio_bucket(rbio); |
349 | struct btrfs_stripe_hash_table *table; |
350 | struct btrfs_stripe_hash *h; |
351 | int freeit = 0; |
352 | |
353 | /* |
354 | * check the bit again under the hash table lock. |
355 | */ |
356 | if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) |
357 | return; |
358 | |
359 | table = rbio->bioc->fs_info->stripe_hash_table; |
360 | h = table->table + bucket; |
361 | |
362 | /* hold the lock for the bucket because we may be |
363 | * removing it from the hash table |
364 | */ |
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
375 | table->cache_size -= 1; |
376 | freeit = 1; |
377 | |
378 | /* if the bio list isn't empty, this rbio is |
379 | * still involved in an IO. We take it out |
380 | * of the cache list, and drop the ref that |
381 | * was held for the list. |
382 | * |
383 | * If the bio_list was empty, we also remove |
384 | * the rbio from the hash_table, and drop |
385 | * the corresponding ref |
386 | */ |
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
391 | BUG_ON(!list_empty(&rbio->plug_list)); |
392 | } |
393 | } |
394 | } |
395 | |
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);
398 | |
399 | if (freeit) |
400 | free_raid_bio(rbio); |
401 | } |
402 | |
403 | /* |
404 | * prune a given rbio from the cache |
405 | */ |
406 | static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) |
407 | { |
408 | struct btrfs_stripe_hash_table *table; |
409 | |
410 | if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) |
411 | return; |
412 | |
413 | table = rbio->bioc->fs_info->stripe_hash_table; |
414 | |
	spin_lock(&table->cache_lock);
	__remove_rbio_from_cache(rbio);
	spin_unlock(&table->cache_lock);
418 | } |
419 | |
420 | /* |
421 | * remove everything in the cache |
422 | */ |
423 | static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) |
424 | { |
425 | struct btrfs_stripe_hash_table *table; |
426 | struct btrfs_raid_bio *rbio; |
427 | |
428 | table = info->stripe_hash_table; |
429 | |
	spin_lock(&table->cache_lock);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock(&table->cache_lock);
438 | } |
439 | |
440 | /* |
441 | * remove all cached entries and free the hash table |
442 | * used by unmount |
443 | */ |
444 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) |
445 | { |
446 | if (!info->stripe_hash_table) |
447 | return; |
448 | btrfs_clear_rbio_cache(info); |
	kvfree(info->stripe_hash_table);
450 | info->stripe_hash_table = NULL; |
451 | } |
452 | |
453 | /* |
454 | * insert an rbio into the stripe cache. It |
455 | * must have already been prepared by calling |
456 | * cache_rbio_pages |
457 | * |
458 | * If this rbio was already cached, it gets |
459 | * moved to the front of the lru. |
460 | * |
461 | * If the size of the rbio cache is too big, we |
462 | * prune an item. |
463 | */ |
464 | static void cache_rbio(struct btrfs_raid_bio *rbio) |
465 | { |
466 | struct btrfs_stripe_hash_table *table; |
467 | |
468 | if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) |
469 | return; |
470 | |
471 | table = rbio->bioc->fs_info->stripe_hash_table; |
472 | |
	spin_lock(&table->cache_lock);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);
488 | |
489 | if (table->cache_size > RBIO_CACHE_SIZE) { |
490 | struct btrfs_raid_bio *found; |
491 | |
492 | found = list_entry(table->stripe_cache.prev, |
493 | struct btrfs_raid_bio, |
494 | stripe_cache); |
495 | |
496 | if (found != rbio) |
			__remove_rbio_from_cache(found);
	}

	spin_unlock(&table->cache_lock);
501 | } |
502 | |
503 | /* |
504 | * helper function to run the xor_blocks api. It is only |
505 | * able to do MAX_XOR_BLOCKS at a time, so we need to |
506 | * loop through. |
507 | */ |
508 | static void run_xor(void **pages, int src_cnt, ssize_t len) |
509 | { |
510 | int src_off = 0; |
511 | int xor_src_cnt = 0; |
512 | void *dest = pages[src_cnt]; |
513 | |
	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
517 | |
518 | src_cnt -= xor_src_cnt; |
519 | src_off += xor_src_cnt; |
520 | } |
521 | } |
522 | |
523 | /* |
524 | * Returns true if the bio list inside this rbio covers an entire stripe (no |
525 | * rmw required). |
526 | */ |
527 | static int rbio_is_full(struct btrfs_raid_bio *rbio) |
528 | { |
529 | unsigned long size = rbio->bio_list_bytes; |
530 | int ret = 1; |
531 | |
	spin_lock(&rbio->bio_list_lock);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock(&rbio->bio_list_lock);
537 | |
538 | return ret; |
539 | } |
540 | |
541 | /* |
542 | * returns 1 if it is safe to merge two rbios together. |
543 | * The merging is safe if the two rbios correspond to |
544 | * the same stripe and if they are both going in the same |
545 | * direction (read vs write), and if neither one is |
546 | * locked for final IO |
547 | * |
548 | * The caller is responsible for locking such that |
549 | * rmw_locked is safe to test |
550 | */ |
551 | static int rbio_can_merge(struct btrfs_raid_bio *last, |
552 | struct btrfs_raid_bio *cur) |
553 | { |
554 | if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || |
555 | test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) |
556 | return 0; |
557 | |
558 | /* |
559 | * we can't merge with cached rbios, since the |
560 | * idea is that when we merge the destination |
561 | * rbio is going to run our IO for us. We can |
562 | * steal from cached rbios though, other functions |
563 | * handle that. |
564 | */ |
565 | if (test_bit(RBIO_CACHE_BIT, &last->flags) || |
566 | test_bit(RBIO_CACHE_BIT, &cur->flags)) |
567 | return 0; |
568 | |
569 | if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) |
570 | return 0; |
571 | |
572 | /* we can't merge with different operations */ |
573 | if (last->operation != cur->operation) |
574 | return 0; |
	/*
	 * We need to read the full stripe from the drive, check and repair
	 * the parity, and write out the new results.
	 *
	 * We're not allowed to add any new bios to the bio list here,
	 * anyone else that wants to change this stripe needs to do their
	 * own rmw.
	 */
583 | if (last->operation == BTRFS_RBIO_PARITY_SCRUB) |
584 | return 0; |
585 | |
586 | if (last->operation == BTRFS_RBIO_READ_REBUILD) |
587 | return 0; |
588 | |
589 | return 1; |
590 | } |
591 | |
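/*
 * Map a (stripe_nr, sector_nr) pair to an index into the per-stripe sector
 * arrays.
 */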
592 | static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, |
593 | unsigned int stripe_nr, |
594 | unsigned int sector_nr) |
595 | { |
596 | ASSERT(stripe_nr < rbio->real_stripes); |
597 | ASSERT(sector_nr < rbio->stripe_nsectors); |
598 | |
599 | return stripe_nr * rbio->stripe_nsectors + sector_nr; |
600 | } |
601 | |
602 | /* Return a sector from rbio->stripe_sectors, not from the bio list */ |
603 | static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, |
604 | unsigned int stripe_nr, |
605 | unsigned int sector_nr) |
606 | { |
607 | return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, |
608 | sector_nr)]; |
609 | } |
610 | |
611 | /* Grab a sector inside P stripe */ |
612 | static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, |
613 | unsigned int sector_nr) |
614 | { |
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
616 | } |
617 | |
618 | /* Grab a sector inside Q stripe, return NULL if not RAID6 */ |
619 | static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, |
620 | unsigned int sector_nr) |
621 | { |
622 | if (rbio->nr_data + 1 == rbio->real_stripes) |
623 | return NULL; |
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
625 | } |
626 | |
627 | /* |
628 | * The first stripe in the table for a logical address |
629 | * has the lock. rbios are added in one of three ways: |
630 | * |
631 | * 1) Nobody has the stripe locked yet. The rbio is given |
632 | * the lock and 0 is returned. The caller must start the IO |
633 | * themselves. |
634 | * |
635 | * 2) Someone has the stripe locked, but we're able to merge |
636 | * with the lock owner. The rbio is freed and the IO will |
637 | * start automatically along with the existing rbio. 1 is returned. |
638 | * |
639 | * 3) Someone has the stripe locked, but we're not able to merge. |
640 | * The rbio is added to the lock owner's plug list, or merged into |
641 | * an rbio already on the plug list. When the lock owner unlocks, |
642 | * the next rbio on the list is run and the IO is started automatically. |
643 | * 1 is returned |
644 | * |
645 | * If we return 0, the caller still owns the rbio and must continue with |
646 | * IO submission. If we return 1, the caller must assume the rbio has |
647 | * already been freed. |
648 | */ |
649 | static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) |
650 | { |
651 | struct btrfs_stripe_hash *h; |
652 | struct btrfs_raid_bio *cur; |
653 | struct btrfs_raid_bio *pending; |
654 | struct btrfs_raid_bio *freeit = NULL; |
655 | struct btrfs_raid_bio *cache_drop = NULL; |
656 | int ret = 0; |
657 | |
658 | h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); |
659 | |
660 | spin_lock(lock: &h->lock); |
661 | list_for_each_entry(cur, &h->hash_list, hash_list) { |
662 | if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) |
663 | continue; |
664 | |
665 | spin_lock(lock: &cur->bio_list_lock); |
666 | |
667 | /* Can we steal this cached rbio's pages? */ |
668 | if (bio_list_empty(bl: &cur->bio_list) && |
669 | list_empty(head: &cur->plug_list) && |
670 | test_bit(RBIO_CACHE_BIT, &cur->flags) && |
671 | !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { |
672 | list_del_init(entry: &cur->hash_list); |
673 | refcount_dec(r: &cur->refs); |
674 | |
675 | steal_rbio(src: cur, dest: rbio); |
676 | cache_drop = cur; |
677 | spin_unlock(lock: &cur->bio_list_lock); |
678 | |
679 | goto lockit; |
680 | } |
681 | |
682 | /* Can we merge into the lock owner? */ |
683 | if (rbio_can_merge(last: cur, cur: rbio)) { |
684 | merge_rbio(dest: cur, victim: rbio); |
685 | spin_unlock(lock: &cur->bio_list_lock); |
686 | freeit = rbio; |
687 | ret = 1; |
688 | goto out; |
689 | } |
690 | |
691 | |
692 | /* |
693 | * We couldn't merge with the running rbio, see if we can merge |
694 | * with the pending ones. We don't have to check for rmw_locked |
695 | * because there is no way they are inside finish_rmw right now |
696 | */ |
697 | list_for_each_entry(pending, &cur->plug_list, plug_list) { |
698 | if (rbio_can_merge(last: pending, cur: rbio)) { |
699 | merge_rbio(dest: pending, victim: rbio); |
700 | spin_unlock(lock: &cur->bio_list_lock); |
701 | freeit = rbio; |
702 | ret = 1; |
703 | goto out; |
704 | } |
705 | } |
706 | |
		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks.
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
713 | ret = 1; |
714 | goto out; |
715 | } |
716 | lockit: |
717 | refcount_inc(r: &rbio->refs); |
718 | list_add(new: &rbio->hash_list, head: &h->hash_list); |
719 | out: |
720 | spin_unlock(lock: &h->lock); |
721 | if (cache_drop) |
722 | remove_rbio_from_cache(rbio: cache_drop); |
723 | if (freeit) |
724 | free_raid_bio(rbio: freeit); |
725 | return ret; |
726 | } |
727 | |
728 | static void recover_rbio_work_locked(struct work_struct *work); |
729 | |
730 | /* |
731 | * called as rmw or parity rebuild is completed. If the plug list has more |
732 | * rbios waiting for this stripe, the next one on the list will be started |
733 | */ |
734 | static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) |
735 | { |
736 | int bucket; |
737 | struct btrfs_stripe_hash *h; |
738 | int keep_cache = 0; |
739 | |
740 | bucket = rbio_bucket(rbio); |
741 | h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; |
742 | |
743 | if (list_empty(head: &rbio->plug_list)) |
744 | cache_rbio(rbio); |
745 | |
746 | spin_lock(lock: &h->lock); |
747 | spin_lock(lock: &rbio->bio_list_lock); |
748 | |
749 | if (!list_empty(head: &rbio->hash_list)) { |
750 | /* |
751 | * if we're still cached and there is no other IO |
752 | * to perform, just leave this rbio here for others |
753 | * to steal from later |
754 | */ |
755 | if (list_empty(head: &rbio->plug_list) && |
756 | test_bit(RBIO_CACHE_BIT, &rbio->flags)) { |
757 | keep_cache = 1; |
758 | clear_bit(RBIO_RMW_LOCKED_BIT, addr: &rbio->flags); |
759 | BUG_ON(!bio_list_empty(&rbio->bio_list)); |
760 | goto done; |
761 | } |
762 | |
763 | list_del_init(entry: &rbio->hash_list); |
764 | refcount_dec(r: &rbio->refs); |
765 | |
766 | /* |
767 | * we use the plug list to hold all the rbios |
768 | * waiting for the chance to lock this stripe. |
769 | * hand the lock over to one of them. |
770 | */ |
771 | if (!list_empty(head: &rbio->plug_list)) { |
772 | struct btrfs_raid_bio *next; |
773 | struct list_head *head = rbio->plug_list.next; |
774 | |
775 | next = list_entry(head, struct btrfs_raid_bio, |
776 | plug_list); |
777 | |
778 | list_del_init(entry: &rbio->plug_list); |
779 | |
780 | list_add(new: &next->hash_list, head: &h->hash_list); |
781 | refcount_inc(r: &next->refs); |
782 | spin_unlock(lock: &rbio->bio_list_lock); |
783 | spin_unlock(lock: &h->lock); |
784 | |
785 | if (next->operation == BTRFS_RBIO_READ_REBUILD) { |
786 | start_async_work(rbio: next, work_func: recover_rbio_work_locked); |
787 | } else if (next->operation == BTRFS_RBIO_WRITE) { |
788 | steal_rbio(src: rbio, dest: next); |
789 | start_async_work(rbio: next, work_func: rmw_rbio_work_locked); |
790 | } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { |
791 | steal_rbio(src: rbio, dest: next); |
792 | start_async_work(rbio: next, work_func: scrub_rbio_work_locked); |
793 | } |
794 | |
795 | goto done_nolock; |
796 | } |
797 | } |
798 | done: |
799 | spin_unlock(lock: &rbio->bio_list_lock); |
800 | spin_unlock(lock: &h->lock); |
801 | |
802 | done_nolock: |
803 | if (!keep_cache) |
804 | remove_rbio_from_cache(rbio); |
805 | } |
806 | |
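/* Walk a bi_next chained list of bios and complete each one with @err. */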
807 | static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) |
808 | { |
809 | struct bio *next; |
810 | |
811 | while (cur) { |
812 | next = cur->bi_next; |
813 | cur->bi_next = NULL; |
814 | cur->bi_status = err; |
815 | bio_endio(cur); |
816 | cur = next; |
817 | } |
818 | } |
819 | |
820 | /* |
821 | * this frees the rbio and runs through all the bios in the |
822 | * bio_list and calls end_io on them |
823 | */ |
824 | static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) |
825 | { |
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;
833 | |
	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this rbio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
840 | |
841 | /* |
842 | * At this moment, rbio->bio_list is empty, however since rbio does not |
843 | * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the |
844 | * hash list, rbio may be merged with others so that rbio->bio_list |
845 | * becomes non-empty. |
846 | * Once unlock_stripe() is done, rbio->bio_list will not be updated any |
847 | * more and we can call bio_endio() on all queued bios. |
848 | */ |
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
856 | } |
857 | |
858 | /* |
859 | * Get a sector pointer specified by its @stripe_nr and @sector_nr. |
860 | * |
861 | * @rbio: The raid bio |
862 | * @stripe_nr: Stripe number, valid range [0, real_stripe) |
863 | * @sector_nr: Sector number inside the stripe, |
864 | * valid range [0, stripe_nsectors) |
865 | * @bio_list_only: Whether to use sectors inside the bio list only. |
866 | * |
867 | * The read/modify/write code wants to reuse the original bio page as much |
868 | * as possible, and only use stripe_sectors as fallback. |
869 | */ |
870 | static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, |
871 | int stripe_nr, int sector_nr, |
872 | bool bio_list_only) |
873 | { |
874 | struct sector_ptr *sector; |
875 | int index; |
876 | |
877 | ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); |
878 | ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); |
879 | |
880 | index = stripe_nr * rbio->stripe_nsectors + sector_nr; |
881 | ASSERT(index >= 0 && index < rbio->nr_sectors); |
882 | |
883 | spin_lock(lock: &rbio->bio_list_lock); |
884 | sector = &rbio->bio_sectors[index]; |
885 | if (sector->page || bio_list_only) { |
886 | /* Don't return sector without a valid page pointer */ |
887 | if (!sector->page) |
888 | sector = NULL; |
889 | spin_unlock(lock: &rbio->bio_list_lock); |
890 | return sector; |
891 | } |
892 | spin_unlock(lock: &rbio->bio_list_lock); |
893 | |
894 | return &rbio->stripe_sectors[index]; |
895 | } |
896 | |
/*
 * Allocation and initial setup for the btrfs_raid_bio.  Note that this does
 * not allocate any pages for rbio->stripe_pages.
 */
901 | static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, |
902 | struct btrfs_io_context *bioc) |
903 | { |
904 | const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; |
905 | const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; |
906 | const unsigned int num_pages = stripe_npages * real_stripes; |
907 | const unsigned int stripe_nsectors = |
908 | BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; |
909 | const unsigned int num_sectors = stripe_nsectors * real_stripes; |
910 | struct btrfs_raid_bio *rbio; |
911 | |
912 | /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ |
913 | ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); |
914 | /* |
915 | * Our current stripe len should be fixed to 64k thus stripe_nsectors |
916 | * (at most 16) should be no larger than BITS_PER_LONG. |
917 | */ |
918 | ASSERT(stripe_nsectors <= BITS_PER_LONG); |
919 | |
920 | /* |
921 | * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 |
922 | * (limited by u8). |
923 | */ |
924 | ASSERT(real_stripes >= 2); |
925 | ASSERT(real_stripes <= U8_MAX); |
926 | |
927 | rbio = kzalloc(size: sizeof(*rbio), GFP_NOFS); |
928 | if (!rbio) |
929 | return ERR_PTR(error: -ENOMEM); |
930 | rbio->stripe_pages = kcalloc(n: num_pages, size: sizeof(struct page *), |
931 | GFP_NOFS); |
932 | rbio->bio_sectors = kcalloc(n: num_sectors, size: sizeof(struct sector_ptr), |
933 | GFP_NOFS); |
934 | rbio->stripe_sectors = kcalloc(n: num_sectors, size: sizeof(struct sector_ptr), |
935 | GFP_NOFS); |
936 | rbio->finish_pointers = kcalloc(n: real_stripes, size: sizeof(void *), GFP_NOFS); |
937 | rbio->error_bitmap = bitmap_zalloc(nbits: num_sectors, GFP_NOFS); |
938 | |
939 | if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || |
940 | !rbio->finish_pointers || !rbio->error_bitmap) { |
941 | free_raid_bio_pointers(rbio); |
942 | kfree(objp: rbio); |
943 | return ERR_PTR(error: -ENOMEM); |
944 | } |
945 | |
946 | bio_list_init(bl: &rbio->bio_list); |
947 | init_waitqueue_head(&rbio->io_wait); |
948 | INIT_LIST_HEAD(list: &rbio->plug_list); |
949 | spin_lock_init(&rbio->bio_list_lock); |
950 | INIT_LIST_HEAD(list: &rbio->stripe_cache); |
951 | INIT_LIST_HEAD(list: &rbio->hash_list); |
952 | btrfs_get_bioc(bioc); |
953 | rbio->bioc = bioc; |
954 | rbio->nr_pages = num_pages; |
955 | rbio->nr_sectors = num_sectors; |
956 | rbio->real_stripes = real_stripes; |
957 | rbio->stripe_npages = stripe_npages; |
958 | rbio->stripe_nsectors = stripe_nsectors; |
959 | refcount_set(r: &rbio->refs, n: 1); |
960 | atomic_set(v: &rbio->stripes_pending, i: 0); |
961 | |
962 | ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); |
963 | rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(type: bioc->map_type); |
964 | ASSERT(rbio->nr_data > 0); |
965 | |
966 | return rbio; |
967 | } |
968 | |
969 | /* allocate pages for all the stripes in the bio, including parity */ |
970 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) |
971 | { |
972 | int ret; |
973 | |
	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
975 | if (ret < 0) |
976 | return ret; |
977 | /* Mapping all sectors */ |
978 | index_stripe_sectors(rbio); |
979 | return 0; |
980 | } |
981 | |
982 | /* only allocate pages for p/q stripes */ |
983 | static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) |
984 | { |
985 | const int data_pages = rbio->nr_data * rbio->stripe_npages; |
986 | int ret; |
987 | |
	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages, 0);
990 | if (ret < 0) |
991 | return ret; |
992 | |
993 | index_stripe_sectors(rbio); |
994 | return 0; |
995 | } |
996 | |
997 | /* |
998 | * Return the total number of errors found in the vertical stripe of @sector_nr. |
999 | * |
1000 | * @faila and @failb will also be updated to the first and second stripe |
1001 | * number of the errors. |
1002 | */ |
1003 | static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, |
1004 | int *faila, int *failb) |
1005 | { |
1006 | int stripe_nr; |
1007 | int found_errors = 0; |
1008 | |
1009 | if (faila || failb) { |
1010 | /* |
1011 | * Both @faila and @failb should be valid pointers if any of |
1012 | * them is specified. |
1013 | */ |
1014 | ASSERT(faila && failb); |
1015 | *faila = -1; |
1016 | *failb = -1; |
1017 | } |
1018 | |
1019 | for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { |
1020 | int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; |
1021 | |
1022 | if (test_bit(total_sector_nr, rbio->error_bitmap)) { |
1023 | found_errors++; |
1024 | if (faila) { |
1025 | /* Update faila and failb. */ |
1026 | if (*faila < 0) |
1027 | *faila = stripe_nr; |
1028 | else if (*failb < 0) |
1029 | *failb = stripe_nr; |
1030 | } |
1031 | } |
1032 | } |
1033 | return found_errors; |
1034 | } |
1035 | |
1036 | /* |
1037 | * Add a single sector @sector into our list of bios for IO. |
1038 | * |
1039 | * Return 0 if everything went well. |
1040 | * Return <0 for error. |
1041 | */ |
1042 | static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, |
1043 | struct bio_list *bio_list, |
1044 | struct sector_ptr *sector, |
1045 | unsigned int stripe_nr, |
1046 | unsigned int sector_nr, |
1047 | enum req_op op) |
1048 | { |
1049 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
1050 | struct bio *last = bio_list->tail; |
1051 | int ret; |
1052 | struct bio *bio; |
1053 | struct btrfs_io_stripe *stripe; |
1054 | u64 disk_start; |
1055 | |
	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
1061 | ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); |
1062 | ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); |
1063 | ASSERT(sector->page); |
1064 | |
1065 | stripe = &rbio->bioc->stripes[stripe_nr]; |
1066 | disk_start = stripe->physical + sector_nr * sectorsize; |
1067 | |
1068 | /* if the device is missing, just fail this stripe */ |
1069 | if (!stripe->dev->bdev) { |
1070 | int found_errors; |
1071 | |
1072 | set_bit(nr: stripe_nr * rbio->stripe_nsectors + sector_nr, |
1073 | addr: rbio->error_bitmap); |
1074 | |
1075 | /* Check if we have reached tolerance early. */ |
1076 | found_errors = get_rbio_veritical_errors(rbio, sector_nr, |
1077 | NULL, NULL); |
1078 | if (found_errors > rbio->bioc->max_errors) |
1079 | return -EIO; |
1080 | return 0; |
1081 | } |
1082 | |
1083 | /* see if we can add this page onto our existing bio */ |
1084 | if (last) { |
1085 | u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; |
1086 | last_end += last->bi_iter.bi_size; |
1087 | |
1088 | /* |
1089 | * we can't merge these if they are from different |
1090 | * devices or if they are not contiguous |
1091 | */ |
1092 | if (last_end == disk_start && !last->bi_status && |
1093 | last->bi_bdev == stripe->dev->bdev) { |
1094 | ret = bio_add_page(bio: last, page: sector->page, len: sectorsize, |
1095 | off: sector->pgoff); |
1096 | if (ret == sectorsize) |
1097 | return 0; |
1098 | } |
1099 | } |
1100 | |
1101 | /* put a new bio on the list */ |
1102 | bio = bio_alloc(bdev: stripe->dev->bdev, |
1103 | max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), |
1104 | opf: op, GFP_NOFS); |
1105 | bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; |
1106 | bio->bi_private = rbio; |
1107 | |
1108 | __bio_add_page(bio, page: sector->page, len: sectorsize, off: sector->pgoff); |
1109 | bio_list_add(bl: bio_list, bio); |
1110 | return 0; |
1111 | } |
1112 | |
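/*
 * Populate rbio->bio_sectors[] for every sector covered by @bio, based on
 * the bio's position inside the full stripe.
 */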
1113 | static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) |
1114 | { |
1115 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
1116 | struct bio_vec bvec; |
1117 | struct bvec_iter iter; |
1118 | u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - |
1119 | rbio->bioc->full_stripe_logical; |
1120 | |
1121 | bio_for_each_segment(bvec, bio, iter) { |
1122 | u32 bvec_offset; |
1123 | |
1124 | for (bvec_offset = 0; bvec_offset < bvec.bv_len; |
1125 | bvec_offset += sectorsize, offset += sectorsize) { |
1126 | int index = offset / sectorsize; |
1127 | struct sector_ptr *sector = &rbio->bio_sectors[index]; |
1128 | |
1129 | sector->page = bvec.bv_page; |
1130 | sector->pgoff = bvec.bv_offset + bvec_offset; |
1131 | ASSERT(sector->pgoff < PAGE_SIZE); |
1132 | } |
1133 | } |
1134 | } |
1135 | |
/*
 * Helper function to walk our bio list and populate the bio_sectors array
 * with the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from sector_in_rbio().
 */
1144 | static void index_rbio_pages(struct btrfs_raid_bio *rbio) |
1145 | { |
1146 | struct bio *bio; |
1147 | |
	spin_lock(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock(&rbio->bio_list_lock);
1153 | } |
1154 | |
1155 | static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, |
1156 | struct raid56_bio_trace_info *trace_info) |
1157 | { |
1158 | const struct btrfs_io_context *bioc = rbio->bioc; |
1159 | int i; |
1160 | |
1161 | ASSERT(bioc); |
1162 | |
1163 | /* We rely on bio->bi_bdev to find the stripe number. */ |
1164 | if (!bio->bi_bdev) |
1165 | goto not_found; |
1166 | |
1167 | for (i = 0; i < bioc->num_stripes; i++) { |
1168 | if (bio->bi_bdev != bioc->stripes[i].dev->bdev) |
1169 | continue; |
1170 | trace_info->stripe_nr = i; |
1171 | trace_info->devid = bioc->stripes[i].dev->devid; |
1172 | trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - |
1173 | bioc->stripes[i].physical; |
1174 | return; |
1175 | } |
1176 | |
1177 | not_found: |
1178 | trace_info->devid = -1; |
1179 | trace_info->offset = -1; |
1180 | trace_info->stripe_nr = -1; |
1181 | } |
1182 | |
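/* Drop our reference on every bio left on the list. */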
1183 | static inline void bio_list_put(struct bio_list *bio_list) |
1184 | { |
1185 | struct bio *bio; |
1186 | |
	while ((bio = bio_list_pop(bio_list)))
1188 | bio_put(bio); |
1189 | } |
1190 | |
1191 | static void assert_rbio(struct btrfs_raid_bio *rbio) |
1192 | { |
1193 | if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || |
1194 | !IS_ENABLED(CONFIG_BTRFS_ASSERT)) |
1195 | return; |
1196 | |
1197 | /* |
1198 | * At least two stripes (2 disks RAID5), and since real_stripes is U8, |
1199 | * we won't go beyond 256 disks anyway. |
1200 | */ |
1201 | ASSERT(rbio->real_stripes >= 2); |
1202 | ASSERT(rbio->nr_data > 0); |
1203 | |
1204 | /* |
1205 | * This is another check to make sure nr data stripes is smaller |
1206 | * than total stripes. |
1207 | */ |
1208 | ASSERT(rbio->nr_data < rbio->real_stripes); |
1209 | } |
1210 | |
1211 | /* Generate PQ for one vertical stripe. */ |
1212 | static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) |
1213 | { |
1214 | void **pointers = rbio->finish_pointers; |
1215 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
1216 | struct sector_ptr *sector; |
1217 | int stripe; |
1218 | const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; |
1219 | |
1220 | /* First collect one sector from each data stripe */ |
1221 | for (stripe = 0; stripe < rbio->nr_data; stripe++) { |
1222 | sector = sector_in_rbio(rbio, stripe_nr: stripe, sector_nr: sectornr, bio_list_only: 0); |
1223 | pointers[stripe] = kmap_local_page(page: sector->page) + |
1224 | sector->pgoff; |
1225 | } |
1226 | |
1227 | /* Then add the parity stripe */ |
1228 | sector = rbio_pstripe_sector(rbio, sector_nr: sectornr); |
1229 | sector->uptodate = 1; |
1230 | pointers[stripe++] = kmap_local_page(page: sector->page) + sector->pgoff; |
1231 | |
1232 | if (has_qstripe) { |
1233 | /* |
1234 | * RAID6, add the qstripe and call the library function |
1235 | * to fill in our p/q |
1236 | */ |
1237 | sector = rbio_qstripe_sector(rbio, sector_nr: sectornr); |
1238 | sector->uptodate = 1; |
1239 | pointers[stripe++] = kmap_local_page(page: sector->page) + |
1240 | sector->pgoff; |
1241 | |
1242 | assert_rbio(rbio); |
1243 | raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, |
1244 | pointers); |
1245 | } else { |
1246 | /* raid5 */ |
1247 | memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); |
1248 | run_xor(pages: pointers + 1, src_cnt: rbio->nr_data - 1, len: sectorsize); |
1249 | } |
1250 | for (stripe = stripe - 1; stripe >= 0; stripe--) |
1251 | kunmap_local(pointers[stripe]); |
1252 | } |
1253 | |
1254 | static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, |
1255 | struct bio_list *bio_list) |
1256 | { |
1257 | /* The total sector number inside the full stripe. */ |
1258 | int total_sector_nr; |
1259 | int sectornr; |
1260 | int stripe; |
1261 | int ret; |
1262 | |
1263 | ASSERT(bio_list_size(bio_list) == 0); |
1264 | |
1265 | /* We should have at least one data sector. */ |
1266 | ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); |
1267 | |
	/*
	 * Reset errors, as we may have errors inherited from a degraded
	 * write.
	 */
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
1273 | |
1274 | /* |
1275 | * Start assembly. Make bios for everything from the higher layers (the |
1276 | * bio_list in our rbio) and our P/Q. Ignore everything else. |
1277 | */ |
1278 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
1279 | total_sector_nr++) { |
1280 | struct sector_ptr *sector; |
1281 | |
1282 | stripe = total_sector_nr / rbio->stripe_nsectors; |
1283 | sectornr = total_sector_nr % rbio->stripe_nsectors; |
1284 | |
1285 | /* This vertical stripe has no data, skip it. */ |
1286 | if (!test_bit(sectornr, &rbio->dbitmap)) |
1287 | continue; |
1288 | |
1289 | if (stripe < rbio->nr_data) { |
1290 | sector = sector_in_rbio(rbio, stripe_nr: stripe, sector_nr: sectornr, bio_list_only: 1); |
1291 | if (!sector) |
1292 | continue; |
1293 | } else { |
1294 | sector = rbio_stripe_sector(rbio, stripe_nr: stripe, sector_nr: sectornr); |
1295 | } |
1296 | |
1297 | ret = rbio_add_io_sector(rbio, bio_list, sector, stripe_nr: stripe, |
1298 | sector_nr: sectornr, op: REQ_OP_WRITE); |
1299 | if (ret) |
1300 | goto error; |
1301 | } |
1302 | |
1303 | if (likely(!rbio->bioc->replace_nr_stripes)) |
1304 | return 0; |
1305 | |
1306 | /* |
1307 | * Make a copy for the replace target device. |
1308 | * |
1309 | * Thus the source stripe number (in replace_stripe_src) should be valid. |
1310 | */ |
1311 | ASSERT(rbio->bioc->replace_stripe_src >= 0); |
1312 | |
1313 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
1314 | total_sector_nr++) { |
1315 | struct sector_ptr *sector; |
1316 | |
1317 | stripe = total_sector_nr / rbio->stripe_nsectors; |
1318 | sectornr = total_sector_nr % rbio->stripe_nsectors; |
1319 | |
1320 | /* |
1321 | * For RAID56, there is only one device that can be replaced, |
1322 | * and replace_stripe_src[0] indicates the stripe number we |
1323 | * need to copy from. |
1324 | */ |
1325 | if (stripe != rbio->bioc->replace_stripe_src) { |
1326 | /* |
1327 | * We can skip the whole stripe completely, note |
1328 | * total_sector_nr will be increased by one anyway. |
1329 | */ |
1330 | ASSERT(sectornr == 0); |
1331 | total_sector_nr += rbio->stripe_nsectors - 1; |
1332 | continue; |
1333 | } |
1334 | |
1335 | /* This vertical stripe has no data, skip it. */ |
1336 | if (!test_bit(sectornr, &rbio->dbitmap)) |
1337 | continue; |
1338 | |
1339 | if (stripe < rbio->nr_data) { |
1340 | sector = sector_in_rbio(rbio, stripe_nr: stripe, sector_nr: sectornr, bio_list_only: 1); |
1341 | if (!sector) |
1342 | continue; |
1343 | } else { |
1344 | sector = rbio_stripe_sector(rbio, stripe_nr: stripe, sector_nr: sectornr); |
1345 | } |
1346 | |
1347 | ret = rbio_add_io_sector(rbio, bio_list, sector, |
1348 | stripe_nr: rbio->real_stripes, |
1349 | sector_nr: sectornr, op: REQ_OP_WRITE); |
1350 | if (ret) |
1351 | goto error; |
1352 | } |
1353 | |
1354 | return 0; |
1355 | error: |
1356 | bio_list_put(bio_list); |
1357 | return -EIO; |
1358 | } |
1359 | |
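/*
 * Mark the range covered by @bio as errored in the rbio's error bitmap.
 * An empty bio means a missing device, in which case the whole missing
 * stripe is marked instead.
 */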
1360 | static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) |
1361 | { |
1362 | struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
1363 | u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - |
1364 | rbio->bioc->full_stripe_logical; |
1365 | int total_nr_sector = offset >> fs_info->sectorsize_bits; |
1366 | |
1367 | ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); |
1368 | |
	bitmap_set(rbio->error_bitmap, total_nr_sector,
		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1371 | |
1372 | /* |
1373 | * Special handling for raid56_alloc_missing_rbio() used by |
1374 | * scrub/replace. Unlike call path in raid56_parity_recover(), they |
1375 | * pass an empty bio here. Thus we have to find out the missing device |
1376 | * and mark the stripe error instead. |
1377 | */ |
1378 | if (bio->bi_iter.bi_size == 0) { |
1379 | bool found_missing = false; |
1380 | int stripe_nr; |
1381 | |
1382 | for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { |
1383 | if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { |
1384 | found_missing = true; |
1385 | bitmap_set(map: rbio->error_bitmap, |
1386 | start: stripe_nr * rbio->stripe_nsectors, |
1387 | nbits: rbio->stripe_nsectors); |
1388 | } |
1389 | } |
1390 | ASSERT(found_missing); |
1391 | } |
1392 | } |
1393 | |
1394 | /* |
1395 | * For subpage case, we can no longer set page Up-to-date directly for |
1396 | * stripe_pages[], thus we need to locate the sector. |
1397 | */ |
1398 | static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, |
1399 | struct page *page, |
1400 | unsigned int pgoff) |
1401 | { |
1402 | int i; |
1403 | |
1404 | for (i = 0; i < rbio->nr_sectors; i++) { |
1405 | struct sector_ptr *sector = &rbio->stripe_sectors[i]; |
1406 | |
1407 | if (sector->page == page && sector->pgoff == pgoff) |
1408 | return sector; |
1409 | } |
1410 | return NULL; |
1411 | } |
1412 | |
1413 | /* |
1414 | * this sets each page in the bio uptodate. It should only be used on private |
1415 | * rbio pages, nothing that comes in from the higher layers |
1416 | */ |
1417 | static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) |
1418 | { |
1419 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
1420 | struct bio_vec *bvec; |
1421 | struct bvec_iter_all iter_all; |
1422 | |
1423 | ASSERT(!bio_flagged(bio, BIO_CLONED)); |
1424 | |
1425 | bio_for_each_segment_all(bvec, bio, iter_all) { |
1426 | struct sector_ptr *sector; |
1427 | int pgoff; |
1428 | |
1429 | for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; |
1430 | pgoff += sectorsize) { |
1431 | sector = find_stripe_sector(rbio, page: bvec->bv_page, pgoff); |
1432 | ASSERT(sector); |
1433 | if (sector) |
1434 | sector->uptodate = 1; |
1435 | } |
1436 | } |
1437 | } |
1438 | |
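/*
 * Return the full-stripe sector number of the first sector of @bio, found by
 * matching its first page and offset against our sector arrays.
 */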
1439 | static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) |
1440 | { |
1441 | struct bio_vec *bv = bio_first_bvec_all(bio); |
1442 | int i; |
1443 | |
1444 | for (i = 0; i < rbio->nr_sectors; i++) { |
1445 | struct sector_ptr *sector; |
1446 | |
1447 | sector = &rbio->stripe_sectors[i]; |
1448 | if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) |
1449 | break; |
1450 | sector = &rbio->bio_sectors[i]; |
1451 | if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) |
1452 | break; |
1453 | } |
1454 | ASSERT(i < rbio->nr_sectors); |
1455 | return i; |
1456 | } |
1457 | |
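/* Mark all sectors covered by @bio as errored in rbio->error_bitmap. */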
1458 | static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) |
1459 | { |
1460 | int total_sector_nr = get_bio_sector_nr(rbio, bio); |
1461 | u32 bio_size = 0; |
1462 | struct bio_vec *bvec; |
1463 | int i; |
1464 | |
1465 | bio_for_each_bvec_all(bvec, bio, i) |
1466 | bio_size += bvec->bv_len; |
1467 | |
1468 | /* |
1469 | * Since we can have multiple bios touching the error_bitmap, we cannot |
1470 | * call bitmap_set() without protection. |
1471 | * |
1472 | * Instead use set_bit() for each bit, as set_bit() itself is atomic. |
1473 | */ |
1474 | for (i = total_sector_nr; i < total_sector_nr + |
1475 | (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) |
		set_bit(i, rbio->error_bitmap);
1477 | } |
1478 | |
1479 | /* Verify the data sectors at read time. */ |
1480 | static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, |
1481 | struct bio *bio) |
1482 | { |
1483 | struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
1484 | int total_sector_nr = get_bio_sector_nr(rbio, bio); |
1485 | struct bio_vec *bvec; |
1486 | struct bvec_iter_all iter_all; |
1487 | |
1488 | /* No data csum for the whole stripe, no need to verify. */ |
1489 | if (!rbio->csum_bitmap || !rbio->csum_buf) |
1490 | return; |
1491 | |
1492 | /* P/Q stripes, they have no data csum to verify against. */ |
1493 | if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) |
1494 | return; |
1495 | |
1496 | bio_for_each_segment_all(bvec, bio, iter_all) { |
1497 | int bv_offset; |
1498 | |
1499 | for (bv_offset = bvec->bv_offset; |
1500 | bv_offset < bvec->bv_offset + bvec->bv_len; |
1501 | bv_offset += fs_info->sectorsize, total_sector_nr++) { |
1502 | u8 csum_buf[BTRFS_CSUM_SIZE]; |
1503 | u8 *expected_csum = rbio->csum_buf + |
1504 | total_sector_nr * fs_info->csum_size; |
1505 | int ret; |
1506 | |
1507 | /* No csum for this sector, skip to the next sector. */ |
1508 | if (!test_bit(total_sector_nr, rbio->csum_bitmap)) |
1509 | continue; |
1510 | |
1511 | ret = btrfs_check_sector_csum(fs_info, page: bvec->bv_page, |
1512 | pgoff: bv_offset, csum: csum_buf, csum_expected: expected_csum); |
1513 | if (ret < 0) |
1514 | set_bit(nr: total_sector_nr, addr: rbio->error_bitmap); |
1515 | } |
1516 | } |
1517 | } |
1518 | |
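/*
 * Read end io for the rmw/recover path: record errors, or mark the sectors
 * uptodate and verify their csums, then wake the waiter once the last read
 * completes.
 */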
1519 | static void raid_wait_read_end_io(struct bio *bio) |
1520 | { |
1521 | struct btrfs_raid_bio *rbio = bio->bi_private; |
1522 | |
1523 | if (bio->bi_status) { |
1524 | rbio_update_error_bitmap(rbio, bio); |
1525 | } else { |
1526 | set_bio_pages_uptodate(rbio, bio); |
1527 | verify_bio_data_sectors(rbio, bio); |
1528 | } |
1529 | |
1530 | bio_put(bio); |
	if (atomic_dec_and_test(&rbio->stripes_pending))
1532 | wake_up(&rbio->io_wait); |
1533 | } |
1534 | |
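/* Submit every read bio on @bio_list and wait for all of them to complete. */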
1535 | static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, |
1536 | struct bio_list *bio_list) |
1537 | { |
1538 | struct bio *bio; |
1539 | |
	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
1542 | bio->bi_end_io = raid_wait_read_end_io; |
1543 | |
1544 | if (trace_raid56_read_enabled()) { |
1545 | struct raid56_bio_trace_info trace_info = { 0 }; |
1546 | |
			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_read(rbio, bio, &trace_info);
1549 | } |
1550 | submit_bio(bio); |
1551 | } |
1552 | |
1553 | wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); |
1554 | } |
1555 | |
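/* Allocate pages for the data stripes only, P/Q pages are left untouched. */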
1556 | static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) |
1557 | { |
1558 | const int data_pages = rbio->nr_data * rbio->stripe_npages; |
1559 | int ret; |
1560 | |
	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0);
1562 | if (ret < 0) |
1563 | return ret; |
1564 | |
1565 | index_stripe_sectors(rbio); |
1566 | return 0; |
1567 | } |
1568 | |
1569 | /* |
1570 | * We use plugging call backs to collect full stripes. |
1571 | * Any time we get a partial stripe write while plugged |
1572 | * we collect it into a list. When the unplug comes down, |
1573 | * we sort the list by logical block number and merge |
1574 | * everything we can into the same rbios |
1575 | */ |
1576 | struct btrfs_plug_cb { |
1577 | struct blk_plug_cb cb; |
1578 | struct btrfs_fs_info *info; |
1579 | struct list_head rbio_list; |
1580 | }; |
1581 | |
1582 | /* |
1583 | * rbios on the plug list are sorted for easier merging. |
1584 | */ |
1585 | static int plug_cmp(void *priv, const struct list_head *a, |
1586 | const struct list_head *b) |
1587 | { |
1588 | const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, |
1589 | plug_list); |
1590 | const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, |
1591 | plug_list); |
1592 | u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; |
1593 | u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; |
1594 | |
1595 | if (a_sector < b_sector) |
1596 | return -1; |
1597 | if (a_sector > b_sector) |
1598 | return 1; |
1599 | return 0; |
1600 | } |
1601 | |
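/*
 * Unplug callback: sort the collected rbios by logical address, merge what
 * we can, and queue rmw work for each resulting rbio.
 */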
1602 | static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) |
1603 | { |
1604 | struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); |
1605 | struct btrfs_raid_bio *cur; |
1606 | struct btrfs_raid_bio *last = NULL; |
1607 | |
1608 | list_sort(NULL, head: &plug->rbio_list, cmp: plug_cmp); |
1609 | |
1610 | while (!list_empty(head: &plug->rbio_list)) { |
1611 | cur = list_entry(plug->rbio_list.next, |
1612 | struct btrfs_raid_bio, plug_list); |
1613 | list_del_init(entry: &cur->plug_list); |
1614 | |
1615 | if (rbio_is_full(rbio: cur)) { |
1616 | /* We have a full stripe, queue it down. */ |
1617 | start_async_work(rbio: cur, work_func: rmw_rbio_work); |
1618 | continue; |
1619 | } |
1620 | if (last) { |
1621 | if (rbio_can_merge(last, cur)) { |
1622 | merge_rbio(dest: last, victim: cur); |
1623 | free_raid_bio(rbio: cur); |
1624 | continue; |
1625 | } |
1626 | start_async_work(rbio: last, work_func: rmw_rbio_work); |
1627 | } |
1628 | last = cur; |
1629 | } |
1630 | if (last) |
1631 | start_async_work(rbio: last, work_func: rmw_rbio_work); |
1632 | kfree(objp: plug); |
1633 | } |
1634 | |
1635 | /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ |
1636 | static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) |
1637 | { |
1638 | const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
1639 | const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; |
1640 | const u64 full_stripe_start = rbio->bioc->full_stripe_logical; |
1641 | const u32 orig_len = orig_bio->bi_iter.bi_size; |
1642 | const u32 sectorsize = fs_info->sectorsize; |
1643 | u64 cur_logical; |
1644 | |
1645 | ASSERT(orig_logical >= full_stripe_start && |
1646 | orig_logical + orig_len <= full_stripe_start + |
1647 | rbio->nr_data * BTRFS_STRIPE_LEN); |
1648 | |
1649 | bio_list_add(bl: &rbio->bio_list, bio: orig_bio); |
1650 | rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; |
1651 | |
1652 | /* Update the dbitmap. */ |
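	/*
	 * Each bit in dbitmap covers one vertical stripe.  For example, with
	 * a 4K sectorsize and 64K BTRFS_STRIPE_LEN (stripe_nsectors == 16),
	 * a sector at full_stripe_start + 68K maps to bit (68K >> 12) % 16 == 1.
	 */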
1653 | for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; |
1654 | cur_logical += sectorsize) { |
1655 | int bit = ((u32)(cur_logical - full_stripe_start) >> |
1656 | fs_info->sectorsize_bits) % rbio->stripe_nsectors; |
1657 | |
1658 | set_bit(nr: bit, addr: &rbio->dbitmap); |
1659 | } |
1660 | } |
1661 | |
1662 | /* |
1663 | * our main entry point for writes from the rest of the FS. |
1664 | */ |
1665 | void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) |
1666 | { |
1667 | struct btrfs_fs_info *fs_info = bioc->fs_info; |
1668 | struct btrfs_raid_bio *rbio; |
1669 | struct btrfs_plug_cb *plug = NULL; |
1670 | struct blk_plug_cb *cb; |
1671 | |
1672 | rbio = alloc_rbio(fs_info, bioc); |
1673 | if (IS_ERR(ptr: rbio)) { |
1674 | bio->bi_status = errno_to_blk_status(errno: PTR_ERR(ptr: rbio)); |
1675 | bio_endio(bio); |
1676 | return; |
1677 | } |
1678 | rbio->operation = BTRFS_RBIO_WRITE; |
1679 | rbio_add_bio(rbio, orig_bio: bio); |
1680 | |
1681 | /* |
1682 | * Don't plug on full rbios, just get them out the door |
1683 | * as quickly as we can |
1684 | */ |
1685 | if (!rbio_is_full(rbio)) { |
1686 | cb = blk_check_plugged(unplug: raid_unplug, data: fs_info, size: sizeof(*plug)); |
1687 | if (cb) { |
1688 | plug = container_of(cb, struct btrfs_plug_cb, cb); |
1689 | if (!plug->info) { |
1690 | plug->info = fs_info; |
1691 | INIT_LIST_HEAD(list: &plug->rbio_list); |
1692 | } |
1693 | list_add_tail(new: &rbio->plug_list, head: &plug->rbio_list); |
1694 | return; |
1695 | } |
1696 | } |
1697 | |
1698 | /* |
1699 | * Either we don't have any existing plug, or we're doing a full stripe, |
1700 | * queue the rmw work now. |
1701 | */ |
1702 | start_async_work(rbio, work_func: rmw_rbio_work); |
1703 | } |
1704 | |
1705 | static int verify_one_sector(struct btrfs_raid_bio *rbio, |
1706 | int stripe_nr, int sector_nr) |
1707 | { |
1708 | struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
1709 | struct sector_ptr *sector; |
1710 | u8 csum_buf[BTRFS_CSUM_SIZE]; |
1711 | u8 *csum_expected; |
1712 | int ret; |
1713 | |
1714 | if (!rbio->csum_bitmap || !rbio->csum_buf) |
1715 | return 0; |
1716 | |
1717 | /* No way to verify P/Q as they are not covered by data csum. */ |
1718 | if (stripe_nr >= rbio->nr_data) |
1719 | return 0; |
1720 | /* |
1721 | * If we're rebuilding a read, we have to use pages from the |
1722 | * bio list if possible. |
1723 | */ |
1724 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { |
1725 | sector = sector_in_rbio(rbio, stripe_nr, sector_nr, bio_list_only: 0); |
1726 | } else { |
1727 | sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); |
1728 | } |
1729 | |
1730 | ASSERT(sector->page); |
1731 | |
1732 | csum_expected = rbio->csum_buf + |
1733 | (stripe_nr * rbio->stripe_nsectors + sector_nr) * |
1734 | fs_info->csum_size; |
1735 | ret = btrfs_check_sector_csum(fs_info, page: sector->page, pgoff: sector->pgoff, |
1736 | csum: csum_buf, csum_expected); |
1737 | return ret; |
1738 | } |
1739 | |
1740 | /* |
1741 | * Recover a vertical stripe specified by @sector_nr. |
 * @pointers are pre-allocated by the caller, so we don't need to
 * allocate/free them again and again.
1744 | */ |
1745 | static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, |
1746 | void **pointers, void **unmap_array) |
1747 | { |
1748 | struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
1749 | struct sector_ptr *sector; |
1750 | const u32 sectorsize = fs_info->sectorsize; |
1751 | int found_errors; |
1752 | int faila; |
1753 | int failb; |
1754 | int stripe_nr; |
1755 | int ret = 0; |
1756 | |
1757 | /* |
	 * Now we just use the bitmap to mark the horizontal stripes in
	 * which we have data when doing parity scrub.
1760 | */ |
1761 | if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && |
1762 | !test_bit(sector_nr, &rbio->dbitmap)) |
1763 | return 0; |
1764 | |
1765 | found_errors = get_rbio_veritical_errors(rbio, sector_nr, faila: &faila, |
1766 | failb: &failb); |
1767 | /* |
	 * No errors in the vertical stripe, skip it.  Can happen during
	 * recovery in which only part of a stripe failed the csum check.
1770 | */ |
1771 | if (!found_errors) |
1772 | return 0; |
1773 | |
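	/* bioc->max_errors is 1 for RAID5 and 2 for RAID6. */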
1774 | if (found_errors > rbio->bioc->max_errors) |
1775 | return -EIO; |
1776 | |
1777 | /* |
1778 | * Setup our array of pointers with sectors from each stripe |
1779 | * |
1780 | * NOTE: store a duplicate array of pointers to preserve the |
1781 | * pointer order. |
1782 | */ |
1783 | for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { |
1784 | /* |
1785 | * If we're rebuilding a read, we have to use pages from the |
1786 | * bio list if possible. |
1787 | */ |
1788 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { |
1789 | sector = sector_in_rbio(rbio, stripe_nr, sector_nr, bio_list_only: 0); |
1790 | } else { |
1791 | sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); |
1792 | } |
1793 | ASSERT(sector->page); |
1794 | pointers[stripe_nr] = kmap_local_page(page: sector->page) + |
1795 | sector->pgoff; |
1796 | unmap_array[stripe_nr] = pointers[stripe_nr]; |
1797 | } |
1798 | |
1799 | /* All raid6 handling here */ |
1800 | if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { |
1801 | /* Single failure, rebuild from parity raid5 style */ |
1802 | if (failb < 0) { |
1803 | if (faila == rbio->nr_data) |
1804 | /* |
1805 | * Just the P stripe has failed, without |
1806 | * a bad data or Q stripe. |
1807 | * We have nothing to do, just skip the |
1808 | * recovery for this stripe. |
1809 | */ |
1810 | goto cleanup; |
1811 | /* |
1812 | * a single failure in raid6 is rebuilt |
1813 | * in the pstripe code below |
1814 | */ |
1815 | goto pstripe; |
1816 | } |
1817 | |
1818 | /* |
		 * If the Q stripe has failed, do a P stripe reconstruction from
		 * the xors.
		 * If both the Q stripe and the P stripe have failed, we're
		 * here due to a crc mismatch and we can't give them the
		 * data they want.
1824 | */ |
1825 | if (failb == rbio->real_stripes - 1) { |
1826 | if (faila == rbio->real_stripes - 2) |
1827 | /* |
1828 | * Only P and Q are corrupted. |
1829 | * We only care about data stripes recovery, |
1830 | * can skip this vertical stripe. |
1831 | */ |
1832 | goto cleanup; |
1833 | /* |
1834 | * Otherwise we have one bad data stripe and |
1835 | * a good P stripe. raid5! |
1836 | */ |
1837 | goto pstripe; |
1838 | } |
1839 | |
1840 | if (failb == rbio->real_stripes - 2) { |
1841 | raid6_datap_recov(rbio->real_stripes, sectorsize, |
1842 | faila, pointers); |
1843 | } else { |
1844 | raid6_2data_recov(rbio->real_stripes, sectorsize, |
1845 | faila, failb, pointers); |
1846 | } |
1847 | } else { |
1848 | void *p; |
1849 | |
1850 | /* Rebuild from P stripe here (raid5 or raid6). */ |
1851 | ASSERT(failb == -1); |
1852 | pstripe: |
1853 | /* Copy parity block into failed block to start with */ |
1854 | memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); |
1855 | |
1856 | /* Rearrange the pointer array */ |
1857 | p = pointers[faila]; |
1858 | for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; |
1859 | stripe_nr++) |
1860 | pointers[stripe_nr] = pointers[stripe_nr + 1]; |
1861 | pointers[rbio->nr_data - 1] = p; |
1862 | |
1863 | /* Xor in the rest */ |
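		/*
		 * The xor rebuilds the missing block from the parity copy and
		 * the surviving data, e.g. with nr_data == 3 and faila == 1:
		 * D1 = P ^ D0 ^ D2.
		 */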
1864 | run_xor(pages: pointers, src_cnt: rbio->nr_data - 1, len: sectorsize); |
1865 | |
1866 | } |
1867 | |
1868 | /* |
	 * No matter whether this is RMW or recovery, we should have all
	 * failed sectors repaired in the vertical stripe, thus they are now
	 * uptodate.
	 * Especially if we decide to cache the rbio, we need to have at
	 * least all data sectors uptodate.
	 *
	 * If possible, also check whether the repaired sector matches its
	 * data checksum.
1877 | */ |
1878 | if (faila >= 0) { |
1879 | ret = verify_one_sector(rbio, stripe_nr: faila, sector_nr); |
1880 | if (ret < 0) |
1881 | goto cleanup; |
1882 | |
1883 | sector = rbio_stripe_sector(rbio, stripe_nr: faila, sector_nr); |
1884 | sector->uptodate = 1; |
1885 | } |
1886 | if (failb >= 0) { |
1887 | ret = verify_one_sector(rbio, stripe_nr: failb, sector_nr); |
1888 | if (ret < 0) |
1889 | goto cleanup; |
1890 | |
1891 | sector = rbio_stripe_sector(rbio, stripe_nr: failb, sector_nr); |
1892 | sector->uptodate = 1; |
1893 | } |
1894 | |
1895 | cleanup: |
1896 | for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) |
1897 | kunmap_local(unmap_array[stripe_nr]); |
1898 | return ret; |
1899 | } |
1900 | |
1901 | static int recover_sectors(struct btrfs_raid_bio *rbio) |
1902 | { |
1903 | void **pointers = NULL; |
1904 | void **unmap_array = NULL; |
1905 | int sectornr; |
1906 | int ret = 0; |
1907 | |
1908 | /* |
1909 | * @pointers array stores the pointer for each sector. |
1910 | * |
1911 | * @unmap_array stores copy of pointers that does not get reordered |
1912 | * during reconstruction so that kunmap_local works. |
1913 | */ |
1914 | pointers = kcalloc(n: rbio->real_stripes, size: sizeof(void *), GFP_NOFS); |
1915 | unmap_array = kcalloc(n: rbio->real_stripes, size: sizeof(void *), GFP_NOFS); |
1916 | if (!pointers || !unmap_array) { |
1917 | ret = -ENOMEM; |
1918 | goto out; |
1919 | } |
1920 | |
1921 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { |
1922 | spin_lock(lock: &rbio->bio_list_lock); |
1923 | set_bit(RBIO_RMW_LOCKED_BIT, addr: &rbio->flags); |
1924 | spin_unlock(lock: &rbio->bio_list_lock); |
1925 | } |
1926 | |
1927 | index_rbio_pages(rbio); |
1928 | |
1929 | for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { |
1930 | ret = recover_vertical(rbio, sector_nr: sectornr, pointers, unmap_array); |
1931 | if (ret < 0) |
1932 | break; |
1933 | } |
1934 | |
1935 | out: |
1936 | kfree(objp: pointers); |
1937 | kfree(objp: unmap_array); |
1938 | return ret; |
1939 | } |
1940 | |
1941 | static void recover_rbio(struct btrfs_raid_bio *rbio) |
1942 | { |
1943 | struct bio_list bio_list = BIO_EMPTY_LIST; |
1944 | int total_sector_nr; |
1945 | int ret = 0; |
1946 | |
1947 | /* |
	 * Whether we're doing recovery for a read failure or a degraded write,
	 * the caller should have set the error bitmap correctly.
1950 | */ |
1951 | ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); |
1952 | |
1953 | /* For recovery, we need to read all sectors including P/Q. */ |
1954 | ret = alloc_rbio_pages(rbio); |
1955 | if (ret < 0) |
1956 | goto out; |
1957 | |
1958 | index_rbio_pages(rbio); |
1959 | |
1960 | /* |
	 * Read everything that hasn't failed.  However this time we will
	 * not trust any cached sector, as it may contain stale data in parts
	 * that the higher layer is not reading.
	 *
	 * So here we always re-read everything in the recovery path.
1967 | */ |
1968 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
1969 | total_sector_nr++) { |
1970 | int stripe = total_sector_nr / rbio->stripe_nsectors; |
1971 | int sectornr = total_sector_nr % rbio->stripe_nsectors; |
1972 | struct sector_ptr *sector; |
1973 | |
1974 | /* |
		 * Skip the range which has an error.  It can be a range which
		 * is marked as error (for a csum mismatch), or it can be on a
		 * missing device.
1978 | */ |
1979 | if (!rbio->bioc->stripes[stripe].dev->bdev || |
1980 | test_bit(total_sector_nr, rbio->error_bitmap)) { |
1981 | /* |
1982 | * Also set the error bit for missing device, which |
1983 | * may not yet have its error bit set. |
1984 | */ |
1985 | set_bit(nr: total_sector_nr, addr: rbio->error_bitmap); |
1986 | continue; |
1987 | } |
1988 | |
1989 | sector = rbio_stripe_sector(rbio, stripe_nr: stripe, sector_nr: sectornr); |
1990 | ret = rbio_add_io_sector(rbio, bio_list: &bio_list, sector, stripe_nr: stripe, |
1991 | sector_nr: sectornr, op: REQ_OP_READ); |
1992 | if (ret < 0) { |
1993 | bio_list_put(bio_list: &bio_list); |
1994 | goto out; |
1995 | } |
1996 | } |
1997 | |
1998 | submit_read_wait_bio_list(rbio, bio_list: &bio_list); |
1999 | ret = recover_sectors(rbio); |
2000 | out: |
2001 | rbio_orig_end_io(rbio, err: errno_to_blk_status(errno: ret)); |
2002 | } |
2003 | |
2004 | static void recover_rbio_work(struct work_struct *work) |
2005 | { |
2006 | struct btrfs_raid_bio *rbio; |
2007 | |
2008 | rbio = container_of(work, struct btrfs_raid_bio, work); |
2009 | if (!lock_stripe_add(rbio)) |
2010 | recover_rbio(rbio); |
2011 | } |
2012 | |
2013 | static void recover_rbio_work_locked(struct work_struct *work) |
2014 | { |
2015 | recover_rbio(container_of(work, struct btrfs_raid_bio, work)); |
2016 | } |
2017 | |
static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
2019 | { |
2020 | bool found = false; |
2021 | int sector_nr; |
2022 | |
2023 | /* |
	 * This is for RAID6 extra recovery tries, thus the mirror number
	 * should be larger than 2.
2026 | * Mirror 1 means read from data stripes. Mirror 2 means rebuild using |
2027 | * RAID5 methods. |
2028 | */ |
2029 | ASSERT(mirror_num > 2); |
2030 | for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { |
2031 | int found_errors; |
2032 | int faila; |
2033 | int failb; |
2034 | |
2035 | found_errors = get_rbio_veritical_errors(rbio, sector_nr, |
2036 | faila: &faila, failb: &failb); |
2037 | /* This vertical stripe doesn't have errors. */ |
2038 | if (!found_errors) |
2039 | continue; |
2040 | |
2041 | /* |
2042 | * If we found errors, there should be only one error marked |
2043 | * by previous set_rbio_range_error(). |
2044 | */ |
2045 | ASSERT(found_errors == 1); |
2046 | found = true; |
2047 | |
2048 | /* Now select another stripe to mark as error. */ |
2049 | failb = rbio->real_stripes - (mirror_num - 1); |
2050 | if (failb <= faila) |
2051 | failb--; |
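		/*
		 * For example, with real_stripes == 4 (2 data + P + Q) and
		 * mirror_num == 3, failb becomes 2 (the P stripe), forcing the
		 * later rebuild to use the Q stripe instead.
		 */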
2052 | |
2053 | /* Set the extra bit in error bitmap. */ |
2054 | if (failb >= 0) |
2055 | set_bit(nr: failb * rbio->stripe_nsectors + sector_nr, |
2056 | addr: rbio->error_bitmap); |
2057 | } |
2058 | |
	/* We should have found at least one vertical stripe with an error. */
2060 | ASSERT(found); |
2061 | } |
2062 | |
2063 | /* |
2064 | * the main entry point for reads from the higher layers. This |
2065 | * is really only called when the normal read path had a failure, |
2066 | * so we assume the bio they send down corresponds to a failed part |
2067 | * of the drive. |
2068 | */ |
2069 | void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, |
2070 | int mirror_num) |
2071 | { |
2072 | struct btrfs_fs_info *fs_info = bioc->fs_info; |
2073 | struct btrfs_raid_bio *rbio; |
2074 | |
2075 | rbio = alloc_rbio(fs_info, bioc); |
2076 | if (IS_ERR(ptr: rbio)) { |
2077 | bio->bi_status = errno_to_blk_status(errno: PTR_ERR(ptr: rbio)); |
2078 | bio_endio(bio); |
2079 | return; |
2080 | } |
2081 | |
2082 | rbio->operation = BTRFS_RBIO_READ_REBUILD; |
2083 | rbio_add_bio(rbio, orig_bio: bio); |
2084 | |
2085 | set_rbio_range_error(rbio, bio); |
2086 | |
2087 | /* |
2088 | * Loop retry: |
	 * for 'mirror_num == 2', reconstruct from all other stripes.
2090 | * for 'mirror_num > 2', select a stripe to fail on every retry. |
2091 | */ |
2092 | if (mirror_num > 2) |
2093 | set_rbio_raid6_extra_error(rbio, mirror_num); |
2094 | |
2095 | start_async_work(rbio, work_func: recover_rbio_work); |
2096 | } |
2097 | |
2098 | static void fill_data_csums(struct btrfs_raid_bio *rbio) |
2099 | { |
2100 | struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
2101 | struct btrfs_root *csum_root = btrfs_csum_root(fs_info, |
2102 | bytenr: rbio->bioc->full_stripe_logical); |
2103 | const u64 start = rbio->bioc->full_stripe_logical; |
2104 | const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << |
2105 | fs_info->sectorsize_bits; |
2106 | int ret; |
2107 | |
2108 | /* The rbio should not have its csum buffer initialized. */ |
2109 | ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); |
2110 | |
2111 | /* |
2112 | * Skip the csum search if: |
2113 | * |
2114 | * - The rbio doesn't belong to data block groups |
2115 | * Then we are doing IO for tree blocks, no need to search csums. |
2116 | * |
2117 | * - The rbio belongs to mixed block groups |
	 *   This is to avoid a deadlock: we are already holding the full
	 *   stripe lock, so if we trigger a metadata read that itself needs
	 *   raid56 recovery, we would deadlock.
2121 | */ |
2122 | if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || |
2123 | rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) |
2124 | return; |
2125 | |
2126 | rbio->csum_buf = kzalloc(size: rbio->nr_data * rbio->stripe_nsectors * |
2127 | fs_info->csum_size, GFP_NOFS); |
2128 | rbio->csum_bitmap = bitmap_zalloc(nbits: rbio->nr_data * rbio->stripe_nsectors, |
2129 | GFP_NOFS); |
2130 | if (!rbio->csum_buf || !rbio->csum_bitmap) { |
2131 | ret = -ENOMEM; |
2132 | goto error; |
2133 | } |
2134 | |
2135 | ret = btrfs_lookup_csums_bitmap(root: csum_root, NULL, start, end: start + len - 1, |
2136 | csum_buf: rbio->csum_buf, csum_bitmap: rbio->csum_bitmap); |
2137 | if (ret < 0) |
2138 | goto error; |
2139 | if (bitmap_empty(src: rbio->csum_bitmap, nbits: len >> fs_info->sectorsize_bits)) |
2140 | goto no_csum; |
2141 | return; |
2142 | |
2143 | error: |
2144 | /* |
	 * We failed to allocate memory or grab the csum, but it's not fatal,
	 * we can still continue.  But it's better to warn users that RMW is
	 * no longer safe for this particular sub-stripe write.
2148 | */ |
2149 | btrfs_warn_rl(fs_info, |
"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2151 | rbio->bioc->full_stripe_logical, ret); |
2152 | no_csum: |
2153 | kfree(objp: rbio->csum_buf); |
2154 | bitmap_free(bitmap: rbio->csum_bitmap); |
2155 | rbio->csum_buf = NULL; |
2156 | rbio->csum_bitmap = NULL; |
2157 | } |
2158 | |
2159 | static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) |
2160 | { |
2161 | struct bio_list bio_list = BIO_EMPTY_LIST; |
2162 | int total_sector_nr; |
2163 | int ret = 0; |
2164 | |
2165 | /* |
2166 | * Fill the data csums we need for data verification. We need to fill |
2167 | * the csum_bitmap/csum_buf first, as our endio function will try to |
2168 | * verify the data sectors. |
2169 | */ |
2170 | fill_data_csums(rbio); |
2171 | |
2172 | /* |
2173 | * Build a list of bios to read all sectors (including data and P/Q). |
2174 | * |
	 * This is to support the later csum verification and recovery.
2176 | */ |
2177 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
2178 | total_sector_nr++) { |
2179 | struct sector_ptr *sector; |
2180 | int stripe = total_sector_nr / rbio->stripe_nsectors; |
2181 | int sectornr = total_sector_nr % rbio->stripe_nsectors; |
2182 | |
2183 | sector = rbio_stripe_sector(rbio, stripe_nr: stripe, sector_nr: sectornr); |
2184 | ret = rbio_add_io_sector(rbio, bio_list: &bio_list, sector, |
2185 | stripe_nr: stripe, sector_nr: sectornr, op: REQ_OP_READ); |
2186 | if (ret) { |
2187 | bio_list_put(bio_list: &bio_list); |
2188 | return ret; |
2189 | } |
2190 | } |
2191 | |
2192 | /* |
2193 | * We may or may not have any corrupted sectors (including missing dev |
	 * and csum mismatch), just let recover_sectors() handle them all.
2195 | */ |
2196 | submit_read_wait_bio_list(rbio, bio_list: &bio_list); |
2197 | return recover_sectors(rbio); |
2198 | } |
2199 | |
2200 | static void raid_wait_write_end_io(struct bio *bio) |
2201 | { |
2202 | struct btrfs_raid_bio *rbio = bio->bi_private; |
2203 | blk_status_t err = bio->bi_status; |
2204 | |
2205 | if (err) |
2206 | rbio_update_error_bitmap(rbio, bio); |
2207 | bio_put(bio); |
2208 | if (atomic_dec_and_test(v: &rbio->stripes_pending)) |
2209 | wake_up(&rbio->io_wait); |
2210 | } |
2211 | |
2212 | static void submit_write_bios(struct btrfs_raid_bio *rbio, |
2213 | struct bio_list *bio_list) |
2214 | { |
2215 | struct bio *bio; |
2216 | |
2217 | atomic_set(v: &rbio->stripes_pending, i: bio_list_size(bl: bio_list)); |
2218 | while ((bio = bio_list_pop(bl: bio_list))) { |
2219 | bio->bi_end_io = raid_wait_write_end_io; |
2220 | |
2221 | if (trace_raid56_write_enabled()) { |
2222 | struct raid56_bio_trace_info trace_info = { 0 }; |
2223 | |
2224 | bio_get_trace_info(rbio, bio, trace_info: &trace_info); |
2225 | trace_raid56_write(rbio, bio, trace_info: &trace_info); |
2226 | } |
2227 | submit_bio(bio); |
2228 | } |
2229 | } |
2230 | |
2231 | /* |
 * Determine if we need to read any sector from the disk.
 * Should only be used in the RMW path, to skip a cached rbio.
2234 | */ |
2235 | static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) |
2236 | { |
2237 | int i; |
2238 | |
2239 | for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { |
2240 | struct sector_ptr *sector = &rbio->stripe_sectors[i]; |
2241 | |
2242 | /* |
		 * We have a sector which either has no page or is not
		 * uptodate, thus this rbio can not be a cached one, as a
		 * cached one must have all its data sectors present and
		 * uptodate.
2246 | */ |
2247 | if (!sector->page || !sector->uptodate) |
2248 | return true; |
2249 | } |
2250 | return false; |
2251 | } |
2252 | |
2253 | static void rmw_rbio(struct btrfs_raid_bio *rbio) |
2254 | { |
2255 | struct bio_list bio_list; |
2256 | int sectornr; |
2257 | int ret = 0; |
2258 | |
2259 | /* |
2260 | * Allocate the pages for parity first, as P/Q pages will always be |
2261 | * needed for both full-stripe and sub-stripe writes. |
2262 | */ |
2263 | ret = alloc_rbio_parity_pages(rbio); |
2264 | if (ret < 0) |
2265 | goto out; |
2266 | |
2267 | /* |
	 * For a full stripe write, or when every data sector is already
	 * cached, we can go to the write path immediately.
2270 | */ |
2271 | if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { |
2272 | /* |
		 * Now we're doing a sub-stripe write, so we also need all the
		 * data stripes to do the full RMW.
2275 | */ |
2276 | ret = alloc_rbio_data_pages(rbio); |
2277 | if (ret < 0) |
2278 | goto out; |
2279 | |
2280 | index_rbio_pages(rbio); |
2281 | |
2282 | ret = rmw_read_wait_recover(rbio); |
2283 | if (ret < 0) |
2284 | goto out; |
2285 | } |
2286 | |
2287 | /* |
	 * At this stage we're not allowed to add any new bios to the
	 * bio list any more; anyone else that wants to change this stripe
	 * needs to do their own rmw.
2291 | */ |
2292 | spin_lock(lock: &rbio->bio_list_lock); |
2293 | set_bit(RBIO_RMW_LOCKED_BIT, addr: &rbio->flags); |
2294 | spin_unlock(lock: &rbio->bio_list_lock); |
2295 | |
2296 | bitmap_clear(map: rbio->error_bitmap, start: 0, nbits: rbio->nr_sectors); |
2297 | |
2298 | index_rbio_pages(rbio); |
2299 | |
2300 | /* |
2301 | * We don't cache full rbios because we're assuming |
2302 | * the higher layers are unlikely to use this area of |
2303 | * the disk again soon. If they do use it again, |
2304 | * hopefully they will send another full bio. |
2305 | */ |
2306 | if (!rbio_is_full(rbio)) |
2307 | cache_rbio_pages(rbio); |
2308 | else |
2309 | clear_bit(RBIO_CACHE_READY_BIT, addr: &rbio->flags); |
2310 | |
2311 | for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) |
2312 | generate_pq_vertical(rbio, sectornr); |
2313 | |
2314 | bio_list_init(bl: &bio_list); |
2315 | ret = rmw_assemble_write_bios(rbio, bio_list: &bio_list); |
2316 | if (ret < 0) |
2317 | goto out; |
2318 | |
2319 | /* We should have at least one bio assembled. */ |
2320 | ASSERT(bio_list_size(&bio_list)); |
2321 | submit_write_bios(rbio, bio_list: &bio_list); |
2322 | wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); |
2323 | |
2324 | /* We may have more errors than our tolerance during the read. */ |
2325 | for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { |
2326 | int found_errors; |
2327 | |
2328 | found_errors = get_rbio_veritical_errors(rbio, sector_nr: sectornr, NULL, NULL); |
2329 | if (found_errors > rbio->bioc->max_errors) { |
2330 | ret = -EIO; |
2331 | break; |
2332 | } |
2333 | } |
2334 | out: |
2335 | rbio_orig_end_io(rbio, err: errno_to_blk_status(errno: ret)); |
2336 | } |
2337 | |
2338 | static void rmw_rbio_work(struct work_struct *work) |
2339 | { |
2340 | struct btrfs_raid_bio *rbio; |
2341 | |
2342 | rbio = container_of(work, struct btrfs_raid_bio, work); |
2343 | if (lock_stripe_add(rbio) == 0) |
2344 | rmw_rbio(rbio); |
2345 | } |
2346 | |
2347 | static void rmw_rbio_work_locked(struct work_struct *work) |
2348 | { |
2349 | rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); |
2350 | } |
2351 | |
2352 | /* |
2353 | * The following code is used to scrub/replace the parity stripe |
2354 | * |
2355 | * Caller must have already increased bio_counter for getting @bioc. |
2356 | * |
 * Note: We need to make sure all the pages that are added into the
 * scrub/replace raid bio are correct and do not get changed during the
 * scrub/replace.  That is, those pages only hold metadata or file data
 * covered by a checksum.
2360 | */ |
2361 | |
2362 | struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, |
2363 | struct btrfs_io_context *bioc, |
2364 | struct btrfs_device *scrub_dev, |
2365 | unsigned long *dbitmap, int stripe_nsectors) |
2366 | { |
2367 | struct btrfs_fs_info *fs_info = bioc->fs_info; |
2368 | struct btrfs_raid_bio *rbio; |
2369 | int i; |
2370 | |
2371 | rbio = alloc_rbio(fs_info, bioc); |
2372 | if (IS_ERR(ptr: rbio)) |
2373 | return NULL; |
2374 | bio_list_add(bl: &rbio->bio_list, bio); |
2375 | /* |
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
2378 | */ |
2379 | ASSERT(!bio->bi_iter.bi_size); |
2380 | rbio->operation = BTRFS_RBIO_PARITY_SCRUB; |
2381 | |
2382 | /* |
2383 | * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted |
2384 | * to the end position, so this search can start from the first parity |
2385 | * stripe. |
2386 | */ |
2387 | for (i = rbio->nr_data; i < rbio->real_stripes; i++) { |
2388 | if (bioc->stripes[i].dev == scrub_dev) { |
2389 | rbio->scrubp = i; |
2390 | break; |
2391 | } |
2392 | } |
2393 | ASSERT(i < rbio->real_stripes); |
2394 | |
2395 | bitmap_copy(dst: &rbio->dbitmap, src: dbitmap, nbits: stripe_nsectors); |
2396 | return rbio; |
2397 | } |
2398 | |
2399 | /* |
 * We just scrub the parity for which we have correct data on the same
 * horizontal stripe, so we needn't allocate all pages for all the stripes.
2402 | */ |
2403 | static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) |
2404 | { |
2405 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
2406 | int total_sector_nr; |
2407 | |
2408 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
2409 | total_sector_nr++) { |
2410 | struct page *page; |
2411 | int sectornr = total_sector_nr % rbio->stripe_nsectors; |
2412 | int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; |
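		/*
		 * For example, with 4K sectorsize and 4K pages, index equals
		 * total_sector_nr; with 64K pages, 16 consecutive sectors
		 * share one stripe page.
		 */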
2413 | |
2414 | if (!test_bit(sectornr, &rbio->dbitmap)) |
2415 | continue; |
2416 | if (rbio->stripe_pages[index]) |
2417 | continue; |
2418 | page = alloc_page(GFP_NOFS); |
2419 | if (!page) |
2420 | return -ENOMEM; |
2421 | rbio->stripe_pages[index] = page; |
2422 | } |
2423 | index_stripe_sectors(rbio); |
2424 | return 0; |
2425 | } |
2426 | |
2427 | static int finish_parity_scrub(struct btrfs_raid_bio *rbio) |
2428 | { |
2429 | struct btrfs_io_context *bioc = rbio->bioc; |
2430 | const u32 sectorsize = bioc->fs_info->sectorsize; |
2431 | void **pointers = rbio->finish_pointers; |
2432 | unsigned long *pbitmap = &rbio->finish_pbitmap; |
2433 | int nr_data = rbio->nr_data; |
2434 | int stripe; |
2435 | int sectornr; |
2436 | bool has_qstripe; |
2437 | struct sector_ptr p_sector = { 0 }; |
2438 | struct sector_ptr q_sector = { 0 }; |
2439 | struct bio_list bio_list; |
2440 | int is_replace = 0; |
2441 | int ret; |
2442 | |
2443 | bio_list_init(bl: &bio_list); |
2444 | |
2445 | if (rbio->real_stripes - rbio->nr_data == 1) |
2446 | has_qstripe = false; |
2447 | else if (rbio->real_stripes - rbio->nr_data == 2) |
2448 | has_qstripe = true; |
2449 | else |
2450 | BUG(); |
2451 | |
2452 | /* |
	 * If replace is running and our P/Q stripe is being replaced, then we
	 * need to duplicate the final write to the replace target.
2455 | */ |
2456 | if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { |
2457 | is_replace = 1; |
2458 | bitmap_copy(dst: pbitmap, src: &rbio->dbitmap, nbits: rbio->stripe_nsectors); |
2459 | } |
2460 | |
2461 | /* |
	 * The higher layers (the scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
2465 | */ |
2466 | clear_bit(RBIO_CACHE_READY_BIT, addr: &rbio->flags); |
2467 | |
2468 | p_sector.page = alloc_page(GFP_NOFS); |
2469 | if (!p_sector.page) |
2470 | return -ENOMEM; |
2471 | p_sector.pgoff = 0; |
2472 | p_sector.uptodate = 1; |
2473 | |
2474 | if (has_qstripe) { |
2475 | /* RAID6, allocate and map temp space for the Q stripe */ |
2476 | q_sector.page = alloc_page(GFP_NOFS); |
2477 | if (!q_sector.page) { |
2478 | __free_page(p_sector.page); |
2479 | p_sector.page = NULL; |
2480 | return -ENOMEM; |
2481 | } |
2482 | q_sector.pgoff = 0; |
2483 | q_sector.uptodate = 1; |
2484 | pointers[rbio->real_stripes - 1] = kmap_local_page(page: q_sector.page); |
2485 | } |
2486 | |
2487 | bitmap_clear(map: rbio->error_bitmap, start: 0, nbits: rbio->nr_sectors); |
2488 | |
2489 | /* Map the parity stripe just once */ |
2490 | pointers[nr_data] = kmap_local_page(page: p_sector.page); |
2491 | |
2492 | for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { |
2493 | struct sector_ptr *sector; |
2494 | void *parity; |
2495 | |
2496 | /* first collect one page from each data stripe */ |
2497 | for (stripe = 0; stripe < nr_data; stripe++) { |
2498 | sector = sector_in_rbio(rbio, stripe_nr: stripe, sector_nr: sectornr, bio_list_only: 0); |
2499 | pointers[stripe] = kmap_local_page(page: sector->page) + |
2500 | sector->pgoff; |
2501 | } |
2502 | |
2503 | if (has_qstripe) { |
2504 | assert_rbio(rbio); |
2505 | /* RAID6, call the library function to fill in our P/Q */ |
2506 | raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, |
2507 | pointers); |
2508 | } else { |
2509 | /* raid5 */ |
2510 | memcpy(pointers[nr_data], pointers[0], sectorsize); |
2511 | run_xor(pages: pointers + 1, src_cnt: nr_data - 1, len: sectorsize); |
2512 | } |
2513 | |
2514 | /* Check scrubbing parity and repair it */ |
2515 | sector = rbio_stripe_sector(rbio, stripe_nr: rbio->scrubp, sector_nr: sectornr); |
2516 | parity = kmap_local_page(page: sector->page) + sector->pgoff; |
2517 | if (memcmp(p: parity, q: pointers[rbio->scrubp], size: sectorsize) != 0) |
2518 | memcpy(parity, pointers[rbio->scrubp], sectorsize); |
2519 | else |
2520 | /* Parity is right, needn't writeback */ |
2521 | bitmap_clear(map: &rbio->dbitmap, start: sectornr, nbits: 1); |
2522 | kunmap_local(parity); |
2523 | |
2524 | for (stripe = nr_data - 1; stripe >= 0; stripe--) |
2525 | kunmap_local(pointers[stripe]); |
2526 | } |
2527 | |
2528 | kunmap_local(pointers[nr_data]); |
2529 | __free_page(p_sector.page); |
2530 | p_sector.page = NULL; |
2531 | if (q_sector.page) { |
2532 | kunmap_local(pointers[rbio->real_stripes - 1]); |
2533 | __free_page(q_sector.page); |
2534 | q_sector.page = NULL; |
2535 | } |
2536 | |
2537 | /* |
2538 | * time to start writing. Make bios for everything from the |
2539 | * higher layers (the bio_list in our rbio) and our p/q. Ignore |
2540 | * everything else. |
2541 | */ |
2542 | for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { |
2543 | struct sector_ptr *sector; |
2544 | |
2545 | sector = rbio_stripe_sector(rbio, stripe_nr: rbio->scrubp, sector_nr: sectornr); |
2546 | ret = rbio_add_io_sector(rbio, bio_list: &bio_list, sector, stripe_nr: rbio->scrubp, |
2547 | sector_nr: sectornr, op: REQ_OP_WRITE); |
2548 | if (ret) |
2549 | goto cleanup; |
2550 | } |
2551 | |
2552 | if (!is_replace) |
2553 | goto submit_write; |
2554 | |
2555 | /* |
2556 | * Replace is running and our parity stripe needs to be duplicated to |
2557 | * the target device. Check we have a valid source stripe number. |
2558 | */ |
2559 | ASSERT(rbio->bioc->replace_stripe_src >= 0); |
2560 | for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { |
2561 | struct sector_ptr *sector; |
2562 | |
2563 | sector = rbio_stripe_sector(rbio, stripe_nr: rbio->scrubp, sector_nr: sectornr); |
2564 | ret = rbio_add_io_sector(rbio, bio_list: &bio_list, sector, |
2565 | stripe_nr: rbio->real_stripes, |
2566 | sector_nr: sectornr, op: REQ_OP_WRITE); |
2567 | if (ret) |
2568 | goto cleanup; |
2569 | } |
2570 | |
2571 | submit_write: |
2572 | submit_write_bios(rbio, bio_list: &bio_list); |
2573 | return 0; |
2574 | |
2575 | cleanup: |
2576 | bio_list_put(bio_list: &bio_list); |
2577 | return ret; |
2578 | } |
2579 | |
2580 | static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) |
2581 | { |
2582 | if (stripe >= 0 && stripe < rbio->nr_data) |
2583 | return 1; |
2584 | return 0; |
2585 | } |
2586 | |
2587 | static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) |
2588 | { |
2589 | void **pointers = NULL; |
2590 | void **unmap_array = NULL; |
2591 | int sector_nr; |
2592 | int ret = 0; |
2593 | |
2594 | /* |
2595 | * @pointers array stores the pointer for each sector. |
2596 | * |
2597 | * @unmap_array stores copy of pointers that does not get reordered |
2598 | * during reconstruction so that kunmap_local works. |
2599 | */ |
2600 | pointers = kcalloc(n: rbio->real_stripes, size: sizeof(void *), GFP_NOFS); |
2601 | unmap_array = kcalloc(n: rbio->real_stripes, size: sizeof(void *), GFP_NOFS); |
2602 | if (!pointers || !unmap_array) { |
2603 | ret = -ENOMEM; |
2604 | goto out; |
2605 | } |
2606 | |
2607 | for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { |
2608 | int dfail = 0, failp = -1; |
2609 | int faila; |
2610 | int failb; |
2611 | int found_errors; |
2612 | |
2613 | found_errors = get_rbio_veritical_errors(rbio, sector_nr, |
2614 | faila: &faila, failb: &failb); |
2615 | if (found_errors > rbio->bioc->max_errors) { |
2616 | ret = -EIO; |
2617 | goto out; |
2618 | } |
2619 | if (found_errors == 0) |
2620 | continue; |
2621 | |
2622 | /* We should have at least one error here. */ |
2623 | ASSERT(faila >= 0 || failb >= 0); |
2624 | |
2625 | if (is_data_stripe(rbio, stripe: faila)) |
2626 | dfail++; |
2627 | else if (is_parity_stripe(faila)) |
2628 | failp = faila; |
2629 | |
2630 | if (is_data_stripe(rbio, stripe: failb)) |
2631 | dfail++; |
2632 | else if (is_parity_stripe(failb)) |
2633 | failp = failb; |
2634 | /* |
		 * Because we can not use the parity being scrubbed to repair
		 * data, our repair capability is reduced.  (In the case of
		 * RAID5, we can not repair anything.)
2638 | */ |
2639 | if (dfail > rbio->bioc->max_errors - 1) { |
2640 | ret = -EIO; |
2641 | goto out; |
2642 | } |
2643 | /* |
		 * If all the data is good and only the parity is corrupted,
		 * just repair the parity, no need to recover data stripes.
2646 | */ |
2647 | if (dfail == 0) |
2648 | continue; |
2649 | |
2650 | /* |
		 * Here we have one corrupted data stripe and one corrupted
		 * parity on RAID6.  If the corrupted parity is the one being
		 * scrubbed, we can luckily use the other parity to repair the
		 * data; otherwise we can not repair the data stripe.
2655 | */ |
2656 | if (failp != rbio->scrubp) { |
2657 | ret = -EIO; |
2658 | goto out; |
2659 | } |
2660 | |
2661 | ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); |
2662 | if (ret < 0) |
2663 | goto out; |
2664 | } |
2665 | out: |
2666 | kfree(objp: pointers); |
2667 | kfree(objp: unmap_array); |
2668 | return ret; |
2669 | } |
2670 | |
2671 | static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) |
2672 | { |
2673 | struct bio_list bio_list = BIO_EMPTY_LIST; |
2674 | int total_sector_nr; |
2675 | int ret = 0; |
2676 | |
2677 | /* Build a list of bios to read all the missing parts. */ |
2678 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
2679 | total_sector_nr++) { |
2680 | int sectornr = total_sector_nr % rbio->stripe_nsectors; |
2681 | int stripe = total_sector_nr / rbio->stripe_nsectors; |
2682 | struct sector_ptr *sector; |
2683 | |
2684 | /* No data in the vertical stripe, no need to read. */ |
2685 | if (!test_bit(sectornr, &rbio->dbitmap)) |
2686 | continue; |
2687 | |
2688 | /* |
2689 | * We want to find all the sectors missing from the rbio and |
2690 | * read them from the disk. If sector_in_rbio() finds a sector |
2691 | * in the bio list we don't need to read it off the stripe. |
2692 | */ |
2693 | sector = sector_in_rbio(rbio, stripe_nr: stripe, sector_nr: sectornr, bio_list_only: 1); |
2694 | if (sector) |
2695 | continue; |
2696 | |
2697 | sector = rbio_stripe_sector(rbio, stripe_nr: stripe, sector_nr: sectornr); |
2698 | /* |
2699 | * The bio cache may have handed us an uptodate sector. If so, |
2700 | * use it. |
2701 | */ |
2702 | if (sector->uptodate) |
2703 | continue; |
2704 | |
2705 | ret = rbio_add_io_sector(rbio, bio_list: &bio_list, sector, stripe_nr: stripe, |
2706 | sector_nr: sectornr, op: REQ_OP_READ); |
2707 | if (ret) { |
2708 | bio_list_put(bio_list: &bio_list); |
2709 | return ret; |
2710 | } |
2711 | } |
2712 | |
2713 | submit_read_wait_bio_list(rbio, bio_list: &bio_list); |
2714 | return 0; |
2715 | } |
2716 | |
2717 | static void scrub_rbio(struct btrfs_raid_bio *rbio) |
2718 | { |
2719 | int sector_nr; |
2720 | int ret; |
2721 | |
2722 | ret = alloc_rbio_essential_pages(rbio); |
2723 | if (ret) |
2724 | goto out; |
2725 | |
2726 | bitmap_clear(map: rbio->error_bitmap, start: 0, nbits: rbio->nr_sectors); |
2727 | |
2728 | ret = scrub_assemble_read_bios(rbio); |
2729 | if (ret < 0) |
2730 | goto out; |
2731 | |
2732 | /* We may have some failures, recover the failed sectors first. */ |
2733 | ret = recover_scrub_rbio(rbio); |
2734 | if (ret < 0) |
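	/*
	 * csum_buf is indexed in per-data-stripe order, e.g. with
	 * stripe_nsectors == 16, data stripe 1 sector 3 uses csum slot 19.
	 */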
2735 | goto out; |
2736 | |
2737 | /* |
	 * We have every sector properly prepared and can finish the scrub
	 * and write back the good content.
2740 | */ |
2741 | ret = finish_parity_scrub(rbio); |
2742 | wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); |
2743 | for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { |
2744 | int found_errors; |
2745 | |
2746 | found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); |
2747 | if (found_errors > rbio->bioc->max_errors) { |
2748 | ret = -EIO; |
2749 | break; |
2750 | } |
2751 | } |
2752 | out: |
2753 | rbio_orig_end_io(rbio, err: errno_to_blk_status(errno: ret)); |
2754 | } |
2755 | |
2756 | static void scrub_rbio_work_locked(struct work_struct *work) |
2757 | { |
2758 | scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); |
2759 | } |
2760 | |
2761 | void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) |
2762 | { |
2763 | if (!lock_stripe_add(rbio)) |
2764 | start_async_work(rbio, work_func: scrub_rbio_work_locked); |
2765 | } |
2766 | |
2767 | /* |
2768 | * This is for scrub call sites where we already have correct data contents. |
2769 | * This allows us to avoid reading data stripes again. |
2770 | * |
 * Unfortunately here we have to copy the pages rather than reuse them,
 * because the rbio has its own page management for its cache.
2773 | */ |
2774 | void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, |
2775 | struct page **data_pages, u64 data_logical) |
2776 | { |
2777 | const u64 offset_in_full_stripe = data_logical - |
2778 | rbio->bioc->full_stripe_logical; |
2779 | const int page_index = offset_in_full_stripe >> PAGE_SHIFT; |
2780 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
2781 | const u32 sectors_per_page = PAGE_SIZE / sectorsize; |
2782 | int ret; |
2783 | |
2784 | /* |
	 * If we hit ENOMEM here but the allocation later at
	 * raid56_parity_submit_scrub_rbio() time succeeds, we just do an
	 * extra read, which is not a big deal.
	 *
	 * If we hit ENOMEM again at raid56_parity_submit_scrub_rbio() time,
	 * the bio will get a proper error number set.
2791 | */ |
2792 | ret = alloc_rbio_data_pages(rbio); |
2793 | if (ret < 0) |
2794 | return; |
2795 | |
2796 | /* data_logical must be at stripe boundary and inside the full stripe. */ |
2797 | ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); |
2798 | ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); |
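	/*
	 * For example, with 4K pages and data_logical exactly one
	 * BTRFS_STRIPE_LEN (64K) past full_stripe_logical, page_index is 16
	 * and the loop below copies into stripe pages 16-31 (the second data
	 * stripe).
	 */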
2799 | |
2800 | for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { |
2801 | struct page *dst = rbio->stripe_pages[page_nr + page_index]; |
2802 | struct page *src = data_pages[page_nr]; |
2803 | |
2804 | memcpy_page(dst_page: dst, dst_off: 0, src_page: src, src_off: 0, PAGE_SIZE); |
2805 | for (int sector_nr = sectors_per_page * page_index; |
2806 | sector_nr < sectors_per_page * (page_index + 1); |
2807 | sector_nr++) |
2808 | rbio->stripe_sectors[sector_nr].uptodate = true; |
2809 | } |
2810 | } |
2811 | |