raid1.c source code [linux/drivers/md/raid1.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* raid1.c : Multiple Devices driver for Linux
4	*
5	* Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
6	*
7	* Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
8	*
9	* RAID-1 management functions.
10	*
11	* Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
12	*
13	* Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
14	* Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
15	*
16	* Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
17	* bitmapped intelligence in resync:
18	*
19	* - bitmap marked during normal i/o
20	* - bitmap used to skip nondirty blocks during sync
21	*
22	* Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
23	* - persistent bitmap code
24	*/
25
26	#include <linux/slab.h>
27	#include <linux/delay.h>
28	#include <linux/blkdev.h>
29	#include <linux/module.h>
30	#include <linux/seq_file.h>
31	#include <linux/ratelimit.h>
32	#include <linux/interval_tree_generic.h>
33
34	#include <trace/events/block.h>
35
36	#include "md.h"
37	#include "raid1.h"
38	#include "md-bitmap.h"
39
/*
 * Bitmask built from the MD_HAS_JOURNAL, MD_JOURNAL_CLEAN, MD_HAS_PPL and
 * MD_HAS_MULTIPLE_PPLS mddev flag bits.  As the name says, these are the
 * mddev flags this personality treats as unsupported.
 */
#define UNSUPPORTED_MDDEV_FLAGS		\
	((1L << MD_HAS_JOURNAL) |	\
	 (1L << MD_JOURNAL_CLEAN) |	\
	 (1L << MD_HAS_PPL) |		\
	 (1L << MD_HAS_MULTIPLE_PPLS))
45
46	static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
47	static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
48
/*
 * Emit a block-trace message prefixed with "raid1 " on md's request queue.
 * Nothing is emitted when (md)->queue is NULL.
 */
#define raid1_log(md, fmt, args...)				\
	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
51
52	#include "raid1-10.c"
53
/*
 * Instantiate the generic interval tree for struct serial_info, keyed on the
 * closed range [node->start, node->last].  The generated helpers carry the
 * raid1_rb prefix (raid1_rb_insert(), raid1_rb_remove(),
 * raid1_rb_iter_first(), raid1_rb_iter_next()).
 */
#define START(node) ((node)->start)
#define LAST(node) ((node)->last)
INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
		     START, LAST, static inline, raid1_rb);
58
59	static int check_and_add_serial(struct md_rdev rdev, struct* r1bio *r1_bio,
60	struct serial_info si, int* idx)
61	{
62	unsigned long flags;
63	int ret = `0`;
64	sector_t lo = r1_bio->sector;
65	sector_t hi = lo + r1_bio->sectors;
66	struct serial_in_rdev *serial = &rdev->serial[idx];
67
68	spin_lock_irqsave(&serial->serial_lock, flags);
69	/ collision happened /
70	if (raid1_rb_iter_first(root: &serial->serial_rb, start: lo, last: hi))
71	ret = -EBUSY;
72	else {
73	si->start = lo;
74	si->last = hi;
75	raid1_rb_insert(node: si, root: &serial->serial_rb);
76	}
77	spin_unlock_irqrestore(lock: &serial->serial_lock, flags);
78
79	return ret;
80	}
81
82	static void wait_for_serialization(struct md_rdev rdev, struct* r1bio *r1_bio)
83	{
84	struct mddev *mddev = rdev->mddev;
85	struct serial_info *si;
86	int idx = sector_to_idx(sector: r1_bio->sector);
87	struct serial_in_rdev *serial = &rdev->serial[idx];
88
89	if (WARN_ON(!mddev->serial_info_pool))
90	return;
91	si = mempool_alloc(pool: mddev->serial_info_pool, GFP_NOIO);
92	wait_event(serial->serial_io_wait,
93	check_and_add_serial(rdev, r1_bio, si, idx) == `0`);
94	}
95
96	static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
97	{
98	struct serial_info *si;
99	unsigned long flags;
100	int found = `0`;
101	struct mddev *mddev = rdev->mddev;
102	int idx = sector_to_idx(sector: lo);
103	struct serial_in_rdev *serial = &rdev->serial[idx];
104
105	spin_lock_irqsave(&serial->serial_lock, flags);
106	for (si = raid1_rb_iter_first(root: &serial->serial_rb, start: lo, last: hi);
107	si; si = raid1_rb_iter_next(node: si, start: lo, last: hi)) {
108	if (si->start == lo && si->last == hi) {
109	raid1_rb_remove(node: si, root: &serial->serial_rb);
110	mempool_free(element: si, pool: mddev->serial_info_pool);
111	found = `1`;
112	break;
113	}
114	}
115	if (!found)
116	WARN(`1`, "The write IO is not recorded for serialization\n");
117	spin_unlock_irqrestore(lock: &serial->serial_lock, flags);
118	wake_up(&serial->serial_io_wait);
119	}
120
121	/*
122	* for resync bio, r1bio pointer can be retrieved from the per-bio
123	* 'struct resync_pages'.
124	*/
125	static inline struct r1bio get_resync_r1bio(struct* bio *bio)
126	{
127	return get_resync_pages(bio)->raid_bio;
128	}
129
130	static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
131	{
132	struct pool_info *pi = data;
133	int size = offsetof(struct r1bio, bios[pi->raid_disks]);
134
135	/ allocate a r1bio with room for raid_disks entries in the bios array /
136	return kzalloc(size, flags: gfp_flags);
137	}
138
/*
 * Resync sizing constants.
 *
 * RESYNC_DEPTH is also the limit raise_barrier() applies to
 * conf->barrier[idx].  The *_SECTORS variants are the corresponding byte
 * quantities shifted right by 9 (i.e. divided by 512).
 */
#define RESYNC_DEPTH 32
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
145
146	static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
147	{
148	struct pool_info *pi = data;
149	struct r1bio *r1_bio;
150	struct bio *bio;
151	int need_pages;
152	int j;
153	struct resync_pages *rps;
154
155	r1_bio = r1bio_pool_alloc(gfp_flags, data: pi);
156	if (!r1_bio)
157	return NULL;
158
159	rps = kmalloc_array(n: pi->raid_disks, size: sizeof(struct resync_pages),
160	flags: gfp_flags);
161	if (!rps)
162	goto out_free_r1bio;
163
164	/*
165	* Allocate bios : 1 for reading, n-1 for writing
166	*/
167	for (j = pi->raid_disks ; j-- ; ) {
168	bio = bio_kmalloc(RESYNC_PAGES, gfp_mask: gfp_flags);
169	if (!bio)
170	goto out_free_bio;
171	bio_init(bio, NULL, table: bio->bi_inline_vecs, RESYNC_PAGES, opf: `0`);
172	r1_bio->bios[j] = bio;
173	}
174	/*
175	* Allocate RESYNC_PAGES data pages and attach them to
176	* the first bio.
177	* If this is a user-requested check/repair, allocate
178	* RESYNC_PAGES for each bio.
179	*/
180	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
181	need_pages = pi->raid_disks;
182	else
183	need_pages = `1`;
184	for (j = `0`; j < pi->raid_disks; j++) {
185	struct resync_pages *rp = &rps[j];
186
187	bio = r1_bio->bios[j];
188
189	if (j < need_pages) {
190	if (resync_alloc_pages(rp, gfp_flags))
191	goto out_free_pages;
192	} else {
193	memcpy(rp, &rps[`0`], sizeof(*rp));
194	resync_get_all_pages(rp);
195	}
196
197	rp->raid_bio = r1_bio;
198	bio->bi_private = rp;
199	}
200
201	r1_bio->master_bio = NULL;
202
203	return r1_bio;
204
205	out_free_pages:
206	while (--j >= `0`)
207	resync_free_pages(rp: &rps[j]);
208
209	out_free_bio:
210	while (++j < pi->raid_disks) {
211	bio_uninit(r1_bio->bios[j]);
212	kfree(objp: r1_bio->bios[j]);
213	}
214	kfree(objp: rps);
215
216	out_free_r1bio:
217	rbio_pool_free(rbio: r1_bio, data);
218	return NULL;
219	}
220
221	static void r1buf_pool_free(void __r1_bio, void* *data)
222	{
223	struct pool_info *pi = data;
224	int i;
225	struct r1bio *r1bio = __r1_bio;
226	struct resync_pages *rp = NULL;
227
228	for (i = pi->raid_disks; i--; ) {
229	rp = get_resync_pages(bio: r1bio->bios[i]);
230	resync_free_pages(rp);
231	bio_uninit(r1bio->bios[i]);
232	kfree(objp: r1bio->bios[i]);
233	}
234
235	/ resync pages array stored in the 1st bio's .bi_private /
236	kfree(objp: rp);
237
238	rbio_pool_free(rbio: r1bio, data);
239	}
240
241	static void put_all_bios(struct r1conf conf, struct* r1bio *r1_bio)
242	{
243	int i;
244
245	for (i = `0`; i < conf->raid_disks * `2`; i++) {
246	struct bio **bio = r1_bio->bios + i;
247	if (!BIO_SPECIAL(*bio))
248	bio_put(*bio);
249	*bio = NULL;
250	}
251	}
252
253	static void free_r1bio(struct r1bio *r1_bio)
254	{
255	struct r1conf *conf = r1_bio->mddev->private;
256
257	put_all_bios(conf, r1_bio);
258	mempool_free(element: r1_bio, pool: &conf->r1bio_pool);
259	}
260
261	static void put_buf(struct r1bio *r1_bio)
262	{
263	struct r1conf *conf = r1_bio->mddev->private;
264	sector_t sect = r1_bio->sector;
265	int i;
266
267	for (i = `0`; i < conf->raid_disks * `2`; i++) {
268	struct bio *bio = r1_bio->bios[i];
269	if (bio->bi_end_io)
270	rdev_dec_pending(rdev: conf->mirrors[i].rdev, mddev: r1_bio->mddev);
271	}
272
273	mempool_free(element: r1_bio, pool: &conf->r1buf_pool);
274
275	lower_barrier(conf, sector_nr: sect);
276	}
277
278	static void reschedule_retry(struct r1bio *r1_bio)
279	{
280	unsigned long flags;
281	struct mddev *mddev = r1_bio->mddev;
282	struct r1conf *conf = mddev->private;
283	int idx;
284
285	idx = sector_to_idx(sector: r1_bio->sector);
286	spin_lock_irqsave(&conf->device_lock, flags);
287	list_add(new: &r1_bio->retry_list, head: &conf->retry_list);
288	atomic_inc(v: &conf->nr_queued[idx]);
289	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
290
291	wake_up(&conf->wait_barrier);
292	md_wakeup_thread(thread: mddev->thread);
293	}
294
295	/*
296	* raid_end_bio_io() is called when we have finished servicing a mirrored
297	* operation and are ready to return a success/failure code to the buffer
298	* cache layer.
299	*/
300	static void call_bio_endio(struct r1bio *r1_bio)
301	{
302	struct bio *bio = r1_bio->master_bio;
303
304	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
305	bio->bi_status = BLK_STS_IOERR;
306
307	bio_endio(bio);
308	}
309
310	static void raid_end_bio_io(struct r1bio *r1_bio)
311	{
312	struct bio *bio = r1_bio->master_bio;
313	struct r1conf *conf = r1_bio->mddev->private;
314	sector_t sector = r1_bio->sector;
315
316	/ if nobody has done the final endio yet, do it now /
317	if (!test_and_set_bit(nr: R1BIO_Returned, addr: &r1_bio->state)) {
318	pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
319	(bio_data_dir(bio) == WRITE) ? "write" : "read",
320	(unsigned long long) bio->bi_iter.bi_sector,
321	(unsigned long long) bio_end_sector(bio) - `1`);
322
323	call_bio_endio(r1_bio);
324	}
325
326	free_r1bio(r1_bio);
327	/*
328	* Wake up any possible resync thread that waits for the device
329	* to go idle. All I/Os, even write-behind writes, are done.
330	*/
331	allow_barrier(conf, sector_nr: sector);
332	}
333
334	/*
335	* Update disk head position estimator based on IRQ completion info.
336	*/
337	static inline void update_head_pos(int disk, struct r1bio *r1_bio)
338	{
339	struct r1conf *conf = r1_bio->mddev->private;
340
341	conf->mirrors[disk].head_position =
342	r1_bio->sector + (r1_bio->sectors);
343	}
344
345	/*
346	* Find the disk number which triggered given bio
347	*/
348	static int find_bio_disk(struct r1bio r1_bio, struct* bio *bio)
349	{
350	int mirror;
351	struct r1conf *conf = r1_bio->mddev->private;
352	int raid_disks = conf->raid_disks;
353
354	for (mirror = `0`; mirror < raid_disks * `2`; mirror++)
355	if (r1_bio->bios[mirror] == bio)
356	break;
357
358	BUG_ON(mirror == raid_disks * `2`);
359	update_head_pos(disk: mirror, r1_bio);
360
361	return mirror;
362	}
363
364	static void raid1_end_read_request(struct bio *bio)
365	{
366	int uptodate = !bio->bi_status;
367	struct r1bio *r1_bio = bio->bi_private;
368	struct r1conf *conf = r1_bio->mddev->private;
369	struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
370
371	/*
372	* this branch is our 'one mirror IO has finished' event handler:
373	*/
374	update_head_pos(disk: r1_bio->read_disk, r1_bio);
375
376	if (uptodate)
377	set_bit(nr: R1BIO_Uptodate, addr: &r1_bio->state);
378	else if (test_bit(FailFast, &rdev->flags) &&
379	test_bit(R1BIO_FailFast, &r1_bio->state))
380	/ This was a fail-fast read so we definitely*
381	* want to retry */
382	;
383	else {
384	/ If all other devices have failed, we want to return*
385	* the error upwards rather than fail the last device.
386	* Here we redefine "uptodate" to mean "Don't want to retry"
387	*/
388	unsigned long flags;
389	spin_lock_irqsave(&conf->device_lock, flags);
390	if (r1_bio->mddev->degraded == conf->raid_disks \|\|
391	(r1_bio->mddev->degraded == conf->raid_disks-`1` &&
392	test_bit(In_sync, &rdev->flags)))
393	uptodate = `1`;
394	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
395	}
396
397	if (uptodate) {
398	raid_end_bio_io(r1_bio);
399	rdev_dec_pending(rdev, mddev: conf->mddev);
400	} else {
401	/*
402	* oops, read error:
403	*/
404	pr_err_ratelimited("md/raid1:%s: %pg: rescheduling sector %llu\n",
405	mdname(conf->mddev),
406	rdev->bdev,
407	(unsigned long long)r1_bio->sector);
408	set_bit(nr: R1BIO_ReadError, addr: &r1_bio->state);
409	reschedule_retry(r1_bio);
410	/ don't drop the reference on read_disk yet /
411	}
412	}
413
414	static void close_write(struct r1bio *r1_bio)
415	{
416	/ it really is the end of this request /
417	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
418	bio_free_pages(bio: r1_bio->behind_master_bio);
419	bio_put(r1_bio->behind_master_bio);
420	r1_bio->behind_master_bio = NULL;
421	}
422	/ clear the bitmap if all writes complete successfully /
423	md_bitmap_endwrite(bitmap: r1_bio->mddev->bitmap, offset: r1_bio->sector,
424	sectors: r1_bio->sectors,
425	success: !test_bit(R1BIO_Degraded, &r1_bio->state),
426	test_bit(R1BIO_BehindIO, &r1_bio->state));
427	md_write_end(mddev: r1_bio->mddev);
428	}
429
430	static void r1_bio_write_done(struct r1bio *r1_bio)
431	{
432	if (!atomic_dec_and_test(v: &r1_bio->remaining))
433	return;
434
435	if (test_bit(R1BIO_WriteError, &r1_bio->state))
436	reschedule_retry(r1_bio);
437	else {
438	close_write(r1_bio);
439	if (test_bit(R1BIO_MadeGood, &r1_bio->state))
440	reschedule_retry(r1_bio);
441	else
442	raid_end_bio_io(r1_bio);
443	}
444	}
445
446	static void raid1_end_write_request(struct bio *bio)
447	{
448	struct r1bio *r1_bio = bio->bi_private;
449	int behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
450	struct r1conf *conf = r1_bio->mddev->private;
451	struct bio *to_put = NULL;
452	int mirror = find_bio_disk(r1_bio, bio);
453	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
454	bool discard_error;
455	sector_t lo = r1_bio->sector;
456	sector_t hi = r1_bio->sector + r1_bio->sectors;
457
458	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
459
460	/*
461	* 'one mirror IO has finished' event handler:
462	*/
463	if (bio->bi_status && !discard_error) {
464	set_bit(nr: WriteErrorSeen, addr: &rdev->flags);
465	if (!test_and_set_bit(nr: WantReplacement, addr: &rdev->flags))
466	set_bit(nr: MD_RECOVERY_NEEDED, addr: &
467	conf->mddev->recovery);
468
469	if (test_bit(FailFast, &rdev->flags) &&
470	(bio->bi_opf & MD_FAILFAST) &&
471	/ We never try FailFast to WriteMostly devices /
472	!test_bit(WriteMostly, &rdev->flags)) {
473	md_error(mddev: r1_bio->mddev, rdev);
474	}
475
476	/*
477	* When the device is faulty, it is not necessary to
478	* handle write error.
479	*/
480	if (!test_bit(Faulty, &rdev->flags))
481	set_bit(nr: R1BIO_WriteError, addr: &r1_bio->state);
482	else {
483	/ Fail the request /
484	set_bit(nr: R1BIO_Degraded, addr: &r1_bio->state);
485	/ Finished with this branch /
486	r1_bio->bios[mirror] = NULL;
487	to_put = bio;
488	}
489	} else {
490	/*
491	* Set R1BIO_Uptodate in our master bio, so that we
492	* will return a good error code for to the higher
493	* levels even if IO on some other mirrored buffer
494	* fails.
495	*
496	* The 'master' represents the composite IO operation
497	* to user-side. So if something waits for IO, then it
498	* will wait for the 'master' bio.
499	*/
500	sector_t first_bad;
501	int bad_sectors;
502
503	r1_bio->bios[mirror] = NULL;
504	to_put = bio;
505	/*
506	* Do not set R1BIO_Uptodate if the current device is
507	* rebuilding or Faulty. This is because we cannot use
508	* such device for properly reading the data back (we could
509	* potentially use it, if the current write would have felt
510	* before rdev->recovery_offset, but for simplicity we don't
511	* check this here.
512	*/
513	if (test_bit(In_sync, &rdev->flags) &&
514	!test_bit(Faulty, &rdev->flags))
515	set_bit(nr: R1BIO_Uptodate, addr: &r1_bio->state);
516
517	/ Maybe we can clear some bad blocks. /
518	if (is_badblock(rdev, s: r1_bio->sector, sectors: r1_bio->sectors,
519	first_bad: &first_bad, bad_sectors: &bad_sectors) && !discard_error) {
520	r1_bio->bios[mirror] = IO_MADE_GOOD;
521	set_bit(nr: R1BIO_MadeGood, addr: &r1_bio->state);
522	}
523	}
524
525	if (behind) {
526	if (test_bit(CollisionCheck, &rdev->flags))
527	remove_serial(rdev, lo, hi);
528	if (test_bit(WriteMostly, &rdev->flags))
529	atomic_dec(v: &r1_bio->behind_remaining);
530
531	/*
532	* In behind mode, we ACK the master bio once the I/O
533	* has safely reached all non-writemostly
534	* disks. Setting the Returned bit ensures that this
535	* gets done only once -- we don't ever want to return
536	* -EIO here, instead we'll wait
537	*/
538	if (atomic_read(v: &r1_bio->behind_remaining) >= (atomic_read(v: &r1_bio->remaining)-`1`) &&
539	test_bit(R1BIO_Uptodate, &r1_bio->state)) {
540	/ Maybe we can return now /
541	if (!test_and_set_bit(nr: R1BIO_Returned, addr: &r1_bio->state)) {
542	struct bio *mbio = r1_bio->master_bio;
543	pr_debug("raid1: behind end write sectors"
544	" %llu-%llu\n",
545	(unsigned long long) mbio->bi_iter.bi_sector,
546	(unsigned long long) bio_end_sector(mbio) - `1`);
547	call_bio_endio(r1_bio);
548	}
549	}
550	} else if (rdev->mddev->serialize_policy)
551	remove_serial(rdev, lo, hi);
552	if (r1_bio->bios[mirror] == NULL)
553	rdev_dec_pending(rdev, mddev: conf->mddev);
554
555	/*
556	* Let's see if all mirrored write operations have finished
557	* already.
558	*/
559	r1_bio_write_done(r1_bio);
560
561	if (to_put)
562	bio_put(to_put);
563	}
564
565	static sector_t align_to_barrier_unit_end(sector_t start_sector,
566	sector_t sectors)
567	{
568	sector_t len;
569
570	WARN_ON(sectors == `0`);
571	/*
572	* len is the number of sectors from start_sector to end of the
573	* barrier unit which start_sector belongs to.
574	*/
575	len = round_up(start_sector + `1`, BARRIER_UNIT_SECTOR_SIZE) -
576	start_sector;
577
578	if (len > sectors)
579	len = sectors;
580
581	return len;
582	}
583
584	/*
585	* This routine returns the disk from which the requested read should
586	* be done. There is a per-array 'next expected sequential IO' sector
587	* number - if this matches on the next IO then we use the last disk.
588	* There is also a per-disk 'last know head position' sector that is
589	* maintained from IRQ contexts, both the normal and the resync IO
590	* completion handlers update this position correctly. If there is no
591	* perfect sequential match then we pick the disk whose head is closest.
592	*
593	* If there are 2 mirrors in the same 2 devices, performance degrades
594	* because position is mirror, not device based.
595	*
596	* The rdev for the device selected will have nr_pending incremented.
597	*/
598	static int read_balance(struct r1conf conf, struct* r1bio r1_bio, int* *max_sectors)
599	{
600	const sector_t this_sector = r1_bio->sector;
601	int sectors;
602	int best_good_sectors;
603	int best_disk, best_dist_disk, best_pending_disk;
604	int has_nonrot_disk;
605	int disk;
606	sector_t best_dist;
607	unsigned int min_pending;
608	struct md_rdev *rdev;
609	int choose_first;
610	int choose_next_idle;
611
612	rcu_read_lock();
613	/*
614	* Check if we can balance. We can balance on the whole
615	* device if no resync is going on, or below the resync window.
616	* We take the first readable disk when above the resync window.
617	*/
618	retry:
619	sectors = r1_bio->sectors;
620	best_disk = -`1`;
621	best_dist_disk = -`1`;
622	best_dist = MaxSector;
623	best_pending_disk = -`1`;
624	min_pending = UINT_MAX;
625	best_good_sectors = `0`;
626	has_nonrot_disk = `0`;
627	choose_next_idle = `0`;
628	clear_bit(nr: R1BIO_FailFast, addr: &r1_bio->state);
629
630	if ((conf->mddev->recovery_cp < this_sector + sectors) \|\|
631	(mddev_is_clustered(mddev: conf->mddev) &&
632	md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
633	this_sector + sectors)))
634	choose_first = `1`;
635	else
636	choose_first = `0`;
637
638	for (disk = `0` ; disk < conf->raid_disks * `2` ; disk++) {
639	sector_t dist;
640	sector_t first_bad;
641	int bad_sectors;
642	unsigned int pending;
643	bool nonrot;
644
645	rdev = rcu_dereference(conf->mirrors[disk].rdev);
646	if (r1_bio->bios[disk] == IO_BLOCKED
647	\|\| rdev == NULL
648	\|\| test_bit(Faulty, &rdev->flags))
649	continue;
650	if (!test_bit(In_sync, &rdev->flags) &&
651	rdev->recovery_offset < this_sector + sectors)
652	continue;
653	if (test_bit(WriteMostly, &rdev->flags)) {
654	/ Don't balance among write-mostly, just*
655	* use the first as a last resort */
656	if (best_dist_disk < `0`) {
657	if (is_badblock(rdev, s: this_sector, sectors,
658	first_bad: &first_bad, bad_sectors: &bad_sectors)) {
659	if (first_bad <= this_sector)
660	/ Cannot use this /
661	continue;
662	best_good_sectors = first_bad - this_sector;
663	} else
664	best_good_sectors = sectors;
665	best_dist_disk = disk;
666	best_pending_disk = disk;
667	}
668	continue;
669	}
670	/ This is a reasonable device to use. It might*
671	* even be best.
672	*/
673	if (is_badblock(rdev, s: this_sector, sectors,
674	first_bad: &first_bad, bad_sectors: &bad_sectors)) {
675	if (best_dist < MaxSector)
676	/ already have a better device /
677	continue;
678	if (first_bad <= this_sector) {
679	/ cannot read here. If this is the 'primary'*
680	* device, then we must not read beyond
681	* bad_sectors from another device..
682	*/
683	bad_sectors -= (this_sector - first_bad);
684	if (choose_first && sectors > bad_sectors)
685	sectors = bad_sectors;
686	if (best_good_sectors > sectors)
687	best_good_sectors = sectors;
688
689	} else {
690	sector_t good_sectors = first_bad - this_sector;
691	if (good_sectors > best_good_sectors) {
692	best_good_sectors = good_sectors;
693	best_disk = disk;
694	}
695	if (choose_first)
696	break;
697	}
698	continue;
699	} else {
700	if ((sectors > best_good_sectors) && (best_disk >= `0`))
701	best_disk = -`1`;
702	best_good_sectors = sectors;
703	}
704
705	if (best_disk >= `0`)
706	/ At least two disks to choose from so failfast is OK /
707	set_bit(nr: R1BIO_FailFast, addr: &r1_bio->state);
708
709	nonrot = bdev_nonrot(bdev: rdev->bdev);
710	has_nonrot_disk \|= nonrot;
711	pending = atomic_read(v: &rdev->nr_pending);
712	dist = abs(this_sector - conf->mirrors[disk].head_position);
713	if (choose_first) {
714	best_disk = disk;
715	break;
716	}
717	/ Don't change to another disk for sequential reads /
718	if (conf->mirrors[disk].next_seq_sect == this_sector
719	\|\| dist == `0`) {
720	int opt_iosize = bdev_io_opt(bdev: rdev->bdev) >> `9`;
721	struct raid1_info *mirror = &conf->mirrors[disk];
722
723	best_disk = disk;
724	/*
725	* If buffered sequential IO size exceeds optimal
726	* iosize, check if there is idle disk. If yes, choose
727	* the idle disk. read_balance could already choose an
728	* idle disk before noticing it's a sequential IO in
729	* this disk. This doesn't matter because this disk
730	* will idle, next time it will be utilized after the
731	* first disk has IO size exceeds optimal iosize. In
732	* this way, iosize of the first disk will be optimal
733	* iosize at least. iosize of the second disk might be
734	* small, but not a big deal since when the second disk
735	* starts IO, the first disk is likely still busy.
736	*/
737	if (nonrot && opt_iosize > `0` &&
738	mirror->seq_start != MaxSector &&
739	mirror->next_seq_sect > opt_iosize &&
740	mirror->next_seq_sect - opt_iosize >=
741	mirror->seq_start) {
742	choose_next_idle = `1`;
743	continue;
744	}
745	break;
746	}
747
748	if (choose_next_idle)
749	continue;
750
751	if (min_pending > pending) {
752	min_pending = pending;
753	best_pending_disk = disk;
754	}
755
756	if (dist < best_dist) {
757	best_dist = dist;
758	best_dist_disk = disk;
759	}
760	}
761
762	/*
763	* If all disks are rotational, choose the closest disk. If any disk is
764	* non-rotational, choose the disk with less pending request even the
765	* disk is rotational, which might/might not be optimal for raids with
766	* mixed ratation/non-rotational disks depending on workload.
767	*/
768	if (best_disk == -`1`) {
769	if (has_nonrot_disk \|\| min_pending == `0`)
770	best_disk = best_pending_disk;
771	else
772	best_disk = best_dist_disk;
773	}
774
775	if (best_disk >= `0`) {
776	rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
777	if (!rdev)
778	goto retry;
779	atomic_inc(v: &rdev->nr_pending);
780	sectors = best_good_sectors;
781
782	if (conf->mirrors[best_disk].next_seq_sect != this_sector)
783	conf->mirrors[best_disk].seq_start = this_sector;
784
785	conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
786	}
787	rcu_read_unlock();
788	*max_sectors = sectors;
789
790	return best_disk;
791	}
792
793	static void wake_up_barrier(struct r1conf *conf)
794	{
795	if (wq_has_sleeper(wq_head: &conf->wait_barrier))
796	wake_up(&conf->wait_barrier);
797	}
798
799	static void flush_bio_list(struct r1conf conf, struct* bio *bio)
800	{
801	/ flush any pending bitmap writes to disk before proceeding w/ I/O /
802	raid1_prepare_flush_writes(bitmap: conf->mddev->bitmap);
803	wake_up_barrier(conf);
804
805	while (bio) { / submit pending writes /
806	struct bio *next = bio->bi_next;
807
808	raid1_submit_write(bio);
809	bio = next;
810	cond_resched();
811	}
812	}
813
814	static void flush_pending_writes(struct r1conf *conf)
815	{
816	/ Any writes that have been queued but are awaiting*
817	* bitmap updates get flushed here.
818	*/
819	spin_lock_irq(lock: &conf->device_lock);
820
821	if (conf->pending_bio_list.head) {
822	struct blk_plug plug;
823	struct bio *bio;
824
825	bio = bio_list_get(bl: &conf->pending_bio_list);
826	spin_unlock_irq(lock: &conf->device_lock);
827
828	/*
829	* As this is called in a wait_event() loop (see freeze_array),
830	* current->state might be TASK_UNINTERRUPTIBLE which will
831	* cause a warning when we prepare to wait again. As it is
832	* rare that this path is taken, it is perfectly safe to force
833	* us to go around the wait_event() loop again, so the warning
834	* is a false-positive. Silence the warning by resetting
835	* thread state
836	*/
837	__set_current_state(TASK_RUNNING);
838	blk_start_plug(&plug);
839	flush_bio_list(conf, bio);
840	blk_finish_plug(&plug);
841	} else
842	spin_unlock_irq(lock: &conf->device_lock);
843	}
844
845	/ Barriers....*
846	* Sometimes we need to suspend IO while we do something else,
847	* either some resync/recovery, or reconfigure the array.
848	* To do this we raise a 'barrier'.
849	* The 'barrier' is a counter that can be raised multiple times
850	* to count how many activities are happening which preclude
851	* normal IO.
852	* We can only raise the barrier if there is no pending IO.
853	* i.e. if nr_pending == 0.
854	* We choose only to raise the barrier if no-one is waiting for the
855	* barrier to go down. This means that as soon as an IO request
856	* is ready, no other operations which require a barrier will start
857	* until the IO request has had a chance.
858	*
859	* So: regular IO calls 'wait_barrier'. When that returns there
860	* is no backgroup IO happening, It must arrange to call
861	* allow_barrier when it has finished its IO.
862	* backgroup IO calls must call raise_barrier. Once that returns
863	* there is no normal IO happeing. It must arrange to call
864	* lower_barrier when the particular background IO completes.
865	*
866	* If resync/recovery is interrupted, returns -EINTR;
867	* Otherwise, returns 0.
868	*/
869	static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
870	{
871	int idx = sector_to_idx(sector: sector_nr);
872
873	spin_lock_irq(lock: &conf->resync_lock);
874
875	/ Wait until no block IO is waiting /
876	wait_event_lock_irq(conf->wait_barrier,
877	!atomic_read(&conf->nr_waiting[idx]),
878	conf->resync_lock);
879
880	/ block any new IO from starting /
881	atomic_inc(v: &conf->barrier[idx]);
882	/*
883	* In raise_barrier() we firstly increase conf->barrier[idx] then
884	* check conf->nr_pending[idx]. In _wait_barrier() we firstly
885	* increase conf->nr_pending[idx] then check conf->barrier[idx].
886	* A memory barrier here to make sure conf->nr_pending[idx] won't
887	* be fetched before conf->barrier[idx] is increased. Otherwise
888	* there will be a race between raise_barrier() and _wait_barrier().
889	*/
890	smp_mb__after_atomic();
891
892	/ For these conditions we must wait:*
893	* A: while the array is in frozen state
894	* B: while conf->nr_pending[idx] is not 0, meaning regular I/O
895	* existing in corresponding I/O barrier bucket.
896	* C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
897	* max resync count which allowed on current I/O barrier bucket.
898	*/
899	wait_event_lock_irq(conf->wait_barrier,
900	(!conf->array_frozen &&
901	!atomic_read(&conf->nr_pending[idx]) &&
902	atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) \|\|
903	test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
904	conf->resync_lock);
905
906	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
907	atomic_dec(v: &conf->barrier[idx]);
908	spin_unlock_irq(lock: &conf->resync_lock);
909	wake_up(&conf->wait_barrier);
910	return -EINTR;
911	}
912
913	atomic_inc(v: &conf->nr_sync_pending);
914	spin_unlock_irq(lock: &conf->resync_lock);
915
916	return `0`;
917	}
918
919	static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
920	{
921	int idx = sector_to_idx(sector: sector_nr);
922
923	BUG_ON(atomic_read(&conf->barrier[idx]) <= `0`);
924
925	atomic_dec(v: &conf->barrier[idx]);
926	atomic_dec(v: &conf->nr_sync_pending);
927	wake_up(&conf->wait_barrier);
928	}
929
930	static bool _wait_barrier(struct r1conf conf, int* idx, bool nowait)
931	{
932	bool ret = true;
933
934	/*
935	* We need to increase conf->nr_pending[idx] very early here,
936	* then raise_barrier() can be blocked when it waits for
937	* conf->nr_pending[idx] to be 0. Then we can avoid holding
938	* conf->resync_lock when there is no barrier raised in same
939	* barrier unit bucket. Also if the array is frozen, I/O
940	* should be blocked until array is unfrozen.
941	*/
942	atomic_inc(v: &conf->nr_pending[idx]);
943	/*
944	* In _wait_barrier() we firstly increase conf->nr_pending[idx], then
945	* check conf->barrier[idx]. In raise_barrier() we firstly increase
946	* conf->barrier[idx], then check conf->nr_pending[idx]. A memory
947	* barrier is necessary here to make sure conf->barrier[idx] won't be
948	* fetched before conf->nr_pending[idx] is increased. Otherwise there
949	* will be a race between _wait_barrier() and raise_barrier().
950	*/
951	smp_mb__after_atomic();
952
953	/*
954	* Don't worry about checking two atomic_t variables at same time
955	* here. If during we check conf->barrier[idx], the array is
956	* frozen (conf->array_frozen is 1), and chonf->barrier[idx] is
957	* 0, it is safe to return and make the I/O continue. Because the
958	* array is frozen, all I/O returned here will eventually complete
959	* or be queued, no race will happen. See code comment in
960	* frozen_array().
961	*/
962	if (!READ_ONCE(conf->array_frozen) &&
963	!atomic_read(v: &conf->barrier[idx]))
964	return ret;
965
966	/*
967	* After holding conf->resync_lock, conf->nr_pending[idx]
968	* should be decreased before waiting for barrier to drop.
969	* Otherwise, we may encounter a race condition because
970	* raise_barrer() might be waiting for conf->nr_pending[idx]
971	* to be 0 at same time.
972	*/
973	spin_lock_irq(lock: &conf->resync_lock);
974	atomic_inc(v: &conf->nr_waiting[idx]);
975	atomic_dec(v: &conf->nr_pending[idx]);
976	/*
977	* In case freeze_array() is waiting for
978	* get_unqueued_pending() == extra
979	*/
980	wake_up_barrier(conf);
981	/ Wait for the barrier in same barrier unit bucket to drop. /
982
983	/ Return false when nowait flag is set /
984	if (nowait) {
985	ret = false;
986	} else {
987	wait_event_lock_irq(conf->wait_barrier,
988	!conf->array_frozen &&
989	!atomic_read(&conf->barrier[idx]),
990	conf->resync_lock);
991	atomic_inc(v: &conf->nr_pending[idx]);
992	}
993
994	atomic_dec(v: &conf->nr_waiting[idx]);
995	spin_unlock_irq(lock: &conf->resync_lock);
996	return ret;
997	}
998
999	static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
1000	{
1001	int idx = sector_to_idx(sector: sector_nr);
1002	bool ret = true;
1003
1004	/*
1005	* Very similar to _wait_barrier(). The difference is, for read
1006	* I/O we don't need wait for sync I/O, but if the whole array
1007	* is frozen, the read I/O still has to wait until the array is
1008	* unfrozen. Since there is no ordering requirement with
1009	* conf->barrier[idx] here, memory barrier is unnecessary as well.
1010	*/
1011	atomic_inc(v: &conf->nr_pending[idx]);
1012
1013	if (!READ_ONCE(conf->array_frozen))
1014	return ret;
1015
1016	spin_lock_irq(lock: &conf->resync_lock);
1017	atomic_inc(v: &conf->nr_waiting[idx]);
1018	atomic_dec(v: &conf->nr_pending[idx]);
1019	/*
1020	* In case freeze_array() is waiting for
1021	* get_unqueued_pending() == extra
1022	*/
1023	wake_up_barrier(conf);
1024	/ Wait for array to be unfrozen /
1025
1026	/ Return false when nowait flag is set /
1027	if (nowait) {
1028	/ Return false when nowait flag is set /
1029	ret = false;
1030	} else {
1031	wait_event_lock_irq(conf->wait_barrier,
1032	!conf->array_frozen,
1033	conf->resync_lock);
1034	atomic_inc(v: &conf->nr_pending[idx]);
1035	}
1036
1037	atomic_dec(v: &conf->nr_waiting[idx]);
1038	spin_unlock_irq(lock: &conf->resync_lock);
1039	return ret;
1040	}
1041
1042	static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
1043	{
1044	int idx = sector_to_idx(sector: sector_nr);
1045
1046	return _wait_barrier(conf, idx, nowait);
1047	}
1048
1049	static void _allow_barrier(struct r1conf conf, int* idx)
1050	{
1051	atomic_dec(v: &conf->nr_pending[idx]);
1052	wake_up_barrier(conf);
1053	}
1054
1055	static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
1056	{
1057	int idx = sector_to_idx(sector: sector_nr);
1058
1059	_allow_barrier(conf, idx);
1060	}
1061
1062	/ conf->resync_lock should be held /
1063	static int get_unqueued_pending(struct r1conf *conf)
1064	{
1065	int idx, ret;
1066
1067	ret = atomic_read(v: &conf->nr_sync_pending);
1068	for (idx = `0`; idx < BARRIER_BUCKETS_NR; idx++)
1069	ret += atomic_read(v: &conf->nr_pending[idx]) -
1070	atomic_read(v: &conf->nr_queued[idx]);
1071
1072	return ret;
1073	}
1074
1075	static void freeze_array(struct r1conf conf, int* extra)
1076	{
1077	/ Stop sync I/O and normal I/O and wait for everything to*
1078	* go quiet.
1079	* This is called in two situations:
1080	* 1) management command handlers (reshape, remove disk, quiesce).
1081	* 2) one normal I/O request failed.
1082
1083	* After array_frozen is set to 1, new sync IO will be blocked at
1084	* raise_barrier(), and new normal I/O will blocked at _wait_barrier()
1085	* or wait_read_barrier(). The flying I/Os will either complete or be
1086	* queued. When everything goes quite, there are only queued I/Os left.
1087
1088	* Every flying I/O contributes to a conf->nr_pending[idx], idx is the
1089	* barrier bucket index which this I/O request hits. When all sync and
1090	* normal I/O are queued, sum of all conf->nr_pending[] will match sum
1091	* of all conf->nr_queued[]. But normal I/O failure is an exception,
1092	* in handle_read_error(), we may call freeze_array() before trying to
1093	* fix the read error. In this case, the error read I/O is not queued,
1094	* so get_unqueued_pending() == 1.
1095	*
1096	* Therefore before this function returns, we need to wait until
1097	* get_unqueued_pendings(conf) gets equal to extra. For
1098	* normal I/O context, extra is 1, in rested situations extra is 0.
1099	*/
1100	spin_lock_irq(lock: &conf->resync_lock);
1101	conf->array_frozen = `1`;
1102	raid1_log(conf->mddev, "wait freeze");
1103	wait_event_lock_irq_cmd(
1104	conf->wait_barrier,
1105	get_unqueued_pending(conf) == extra,
1106	conf->resync_lock,
1107	flush_pending_writes(conf));
1108	spin_unlock_irq(lock: &conf->resync_lock);
1109	}
1110	static void unfreeze_array(struct r1conf *conf)
1111	{
1112	/ reverse the effect of the freeze /
1113	spin_lock_irq(lock: &conf->resync_lock);
1114	conf->array_frozen = `0`;
1115	spin_unlock_irq(lock: &conf->resync_lock);
1116	wake_up(&conf->wait_barrier);
1117	}
1118
1119	static void alloc_behind_master_bio(struct r1bio *r1_bio,
1120	struct bio *bio)
1121	{
1122	int size = bio->bi_iter.bi_size;
1123	unsigned vcnt = (size + PAGE_SIZE - `1`) >> PAGE_SHIFT;
1124	int i = `0`;
1125	struct bio *behind_bio = NULL;
1126
1127	behind_bio = bio_alloc_bioset(NULL, nr_vecs: vcnt, opf: `0`, GFP_NOIO,
1128	bs: &r1_bio->mddev->bio_set);
1129	if (!behind_bio)
1130	return;
1131
1132	/ discard op, we don't support writezero/writesame yet /
1133	if (!bio_has_data(bio)) {
1134	behind_bio->bi_iter.bi_size = size;
1135	goto skip_copy;
1136	}
1137
1138	while (i < vcnt && size) {
1139	struct page *page;
1140	int len = min_t(int, PAGE_SIZE, size);
1141
1142	page = alloc_page(GFP_NOIO);
1143	if (unlikely(!page))
1144	goto free_pages;
1145
1146	if (!bio_add_page(bio: behind_bio, page, len, off: `0`)) {
1147	put_page(page);
1148	goto free_pages;
1149	}
1150
1151	size -= len;
1152	i++;
1153	}
1154
1155	bio_copy_data(dst: behind_bio, src: bio);
1156	skip_copy:
1157	r1_bio->behind_master_bio = behind_bio;
1158	set_bit(nr: R1BIO_BehindIO, addr: &r1_bio->state);
1159
1160	return;
1161
1162	free_pages:
1163	pr_debug("%dB behind alloc failed, doing sync I/O\n",
1164	bio->bi_iter.bi_size);
1165	bio_free_pages(bio: behind_bio);
1166	bio_put(behind_bio);
1167	}
1168
1169	static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
1170	{
1171	struct raid1_plug_cb plug = container_of(cb, struct* raid1_plug_cb,
1172	cb);
1173	struct mddev *mddev = plug->cb.data;
1174	struct r1conf *conf = mddev->private;
1175	struct bio *bio;
1176
1177	if (from_schedule) {
1178	spin_lock_irq(lock: &conf->device_lock);
1179	bio_list_merge(bl: &conf->pending_bio_list, bl2: &plug->pending);
1180	spin_unlock_irq(lock: &conf->device_lock);
1181	wake_up_barrier(conf);
1182	md_wakeup_thread(thread: mddev->thread);
1183	kfree(objp: plug);
1184	return;
1185	}
1186
1187	/ we aren't scheduling, so we can do the write-out directly. /
1188	bio = bio_list_get(bl: &plug->pending);
1189	flush_bio_list(conf, bio);
1190	kfree(objp: plug);
1191	}
1192
1193	static void init_r1bio(struct r1bio r1_bio, struct* mddev mddev, struct* bio *bio)
1194	{
1195	r1_bio->master_bio = bio;
1196	r1_bio->sectors = bio_sectors(bio);
1197	r1_bio->state = `0`;
1198	r1_bio->mddev = mddev;
1199	r1_bio->sector = bio->bi_iter.bi_sector;
1200	}
1201
1202	static inline struct r1bio *
1203	alloc_r1bio(struct mddev mddev, struct* bio *bio)
1204	{
1205	struct r1conf *conf = mddev->private;
1206	struct r1bio *r1_bio;
1207
1208	r1_bio = mempool_alloc(pool: &conf->r1bio_pool, GFP_NOIO);
1209	/ Ensure no bio records IO_BLOCKED /
1210	memset(r1_bio->bios, `0`, conf->raid_disks * sizeof(r1_bio->bios[`0`]));
1211	init_r1bio(r1_bio, mddev, bio);
1212	return r1_bio;
1213	}
1214
1215	static void raid1_read_request(struct mddev mddev, struct* bio *bio,
1216	int max_read_sectors, struct r1bio *r1_bio)
1217	{
1218	struct r1conf *conf = mddev->private;
1219	struct raid1_info *mirror;
1220	struct bio *read_bio;
1221	struct bitmap *bitmap = mddev->bitmap;
1222	const enum req_op op = bio_op(bio);
1223	const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
1224	int max_sectors;
1225	int rdisk;
1226	bool r1bio_existed = !!r1_bio;
1227	char b[BDEVNAME_SIZE];
1228
1229	/*
1230	* If r1_bio is set, we are blocking the raid1d thread
1231	* so there is a tiny risk of deadlock. So ask for
1232	* emergency memory if needed.
1233	*/
1234	gfp_t gfp = r1_bio ? (GFP_NOIO \| __GFP_HIGH) : GFP_NOIO;
1235
1236	if (r1bio_existed) {
1237	/ Need to get the block device name carefully /
1238	struct md_rdev *rdev;
1239	rcu_read_lock();
1240	rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
1241	if (rdev)
1242	snprintf(buf: b, size: sizeof(b), fmt: "%pg", rdev->bdev);
1243	else
1244	strcpy(p: b, q: "???");
1245	rcu_read_unlock();
1246	}
1247
1248	/*
1249	* Still need barrier for READ in case that whole
1250	* array is frozen.
1251	*/
1252	if (!wait_read_barrier(conf, sector_nr: bio->bi_iter.bi_sector,
1253	nowait: bio->bi_opf & REQ_NOWAIT)) {
1254	bio_wouldblock_error(bio);
1255	return;
1256	}
1257
1258	if (!r1_bio)
1259	r1_bio = alloc_r1bio(mddev, bio);
1260	else
1261	init_r1bio(r1_bio, mddev, bio);
1262	r1_bio->sectors = max_read_sectors;
1263
1264	/*
1265	* make_request() can abort the operation when read-ahead is being
1266	* used and no empty request is available.
1267	*/
1268	rdisk = read_balance(conf, r1_bio, max_sectors: &max_sectors);
1269
1270	if (rdisk < `0`) {
1271	/ couldn't find anywhere to read from /
1272	if (r1bio_existed) {
1273	pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
1274	mdname(mddev),
1275	b,
1276	(unsigned long long)r1_bio->sector);
1277	}
1278	raid_end_bio_io(r1_bio);
1279	return;
1280	}
1281	mirror = conf->mirrors + rdisk;
1282
1283	if (r1bio_existed)
1284	pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %pg\n",
1285	mdname(mddev),
1286	(unsigned long long)r1_bio->sector,
1287	mirror->rdev->bdev);
1288
1289	if (test_bit(WriteMostly, &mirror->rdev->flags) &&
1290	bitmap) {
1291	/*
1292	* Reading from a write-mostly device must take care not to
1293	* over-take any writes that are 'behind'
1294	*/
1295	raid1_log(mddev, "wait behind writes");
1296	wait_event(bitmap->behind_wait,
1297	atomic_read(&bitmap->behind_writes) == `0`);
1298	}
1299
1300	if (max_sectors < bio_sectors(bio)) {
1301	struct bio *split = bio_split(bio, sectors: max_sectors,
1302	gfp, bs: &conf->bio_split);
1303	bio_chain(split, bio);
1304	submit_bio_noacct(bio);
1305	bio = split;
1306	r1_bio->master_bio = bio;
1307	r1_bio->sectors = max_sectors;
1308	}
1309
1310	r1_bio->read_disk = rdisk;
1311	if (!r1bio_existed) {
1312	md_account_bio(mddev, bio: &bio);
1313	r1_bio->master_bio = bio;
1314	}
1315	read_bio = bio_alloc_clone(bdev: mirror->rdev->bdev, bio_src: bio, gfp,
1316	bs: &mddev->bio_set);
1317
1318	r1_bio->bios[rdisk] = read_bio;
1319
1320	read_bio->bi_iter.bi_sector = r1_bio->sector +
1321	mirror->rdev->data_offset;
1322	read_bio->bi_end_io = raid1_end_read_request;
1323	read_bio->bi_opf = op \| do_sync;
1324	if (test_bit(FailFast, &mirror->rdev->flags) &&
1325	test_bit(R1BIO_FailFast, &r1_bio->state))
1326	read_bio->bi_opf \|= MD_FAILFAST;
1327	read_bio->bi_private = r1_bio;
1328
1329	if (mddev->gendisk)
1330	trace_block_bio_remap(bio: read_bio, dev: disk_devt(disk: mddev->gendisk),
1331	from: r1_bio->sector);
1332
1333	submit_bio_noacct(bio: read_bio);
1334	}
1335
1336	static void raid1_write_request(struct mddev mddev, struct* bio *bio,
1337	int max_write_sectors)
1338	{
1339	struct r1conf *conf = mddev->private;
1340	struct r1bio *r1_bio;
1341	int i, disks;
1342	struct bitmap *bitmap = mddev->bitmap;
1343	unsigned long flags;
1344	struct md_rdev *blocked_rdev;
1345	int first_clone;
1346	int max_sectors;
1347	bool write_behind = false;
1348	bool is_discard = (bio_op(bio) == REQ_OP_DISCARD);
1349
1350	if (mddev_is_clustered(mddev) &&
1351	md_cluster_ops->area_resyncing(mddev, WRITE,
1352	bio->bi_iter.bi_sector, bio_end_sector(bio))) {
1353
1354	DEFINE_WAIT(w);
1355	if (bio->bi_opf & REQ_NOWAIT) {
1356	bio_wouldblock_error(bio);
1357	return;
1358	}
1359	for (;;) {
1360	prepare_to_wait(wq_head: &conf->wait_barrier,
1361	wq_entry: &w, TASK_IDLE);
1362	if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1363	bio->bi_iter.bi_sector,
1364	bio_end_sector(bio)))
1365	break;
1366	schedule();
1367	}
1368	finish_wait(wq_head: &conf->wait_barrier, wq_entry: &w);
1369	}
1370
1371	/*
1372	* Register the new request and wait if the reconstruction
1373	* thread has put up a bar for new requests.
1374	* Continue immediately if no resync is active currently.
1375	*/
1376	if (!wait_barrier(conf, sector_nr: bio->bi_iter.bi_sector,
1377	nowait: bio->bi_opf & REQ_NOWAIT)) {
1378	bio_wouldblock_error(bio);
1379	return;
1380	}
1381
1382	retry_write:
1383	r1_bio = alloc_r1bio(mddev, bio);
1384	r1_bio->sectors = max_write_sectors;
1385
1386	/ first select target devices under rcu_lock and*
1387	* inc refcount on their rdev. Record them by setting
1388	* bios[x] to bio
1389	* If there are known/acknowledged bad blocks on any device on
1390	* which we have seen a write error, we want to avoid writing those
1391	* blocks.
1392	* This potentially requires several writes to write around
1393	* the bad blocks. Each set of writes gets it's own r1bio
1394	* with a set of bios attached.
1395	*/
1396
1397	disks = conf->raid_disks * `2`;
1398	blocked_rdev = NULL;
1399	rcu_read_lock();
1400	max_sectors = r1_bio->sectors;
1401	for (i = `0`; i < disks; i++) {
1402	struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1403
1404	/*
1405	* The write-behind io is only attempted on drives marked as
1406	* write-mostly, which means we could allocate write behind
1407	* bio later.
1408	*/
1409	if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags))
1410	write_behind = true;
1411
1412	if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1413	atomic_inc(v: &rdev->nr_pending);
1414	blocked_rdev = rdev;
1415	break;
1416	}
1417	r1_bio->bios[i] = NULL;
1418	if (!rdev \|\| test_bit(Faulty, &rdev->flags)) {
1419	if (i < conf->raid_disks)
1420	set_bit(nr: R1BIO_Degraded, addr: &r1_bio->state);
1421	continue;
1422	}
1423
1424	atomic_inc(v: &rdev->nr_pending);
1425	if (test_bit(WriteErrorSeen, &rdev->flags)) {
1426	sector_t first_bad;
1427	int bad_sectors;
1428	int is_bad;
1429
1430	is_bad = is_badblock(rdev, s: r1_bio->sector, sectors: max_sectors,
1431	first_bad: &first_bad, bad_sectors: &bad_sectors);
1432	if (is_bad < `0`) {
1433	/ mustn't write here until the bad block is*
1434	* acknowledged*/
1435	set_bit(nr: BlockedBadBlocks, addr: &rdev->flags);
1436	blocked_rdev = rdev;
1437	break;
1438	}
1439	if (is_bad && first_bad <= r1_bio->sector) {
1440	/ Cannot write here at all /
1441	bad_sectors -= (r1_bio->sector - first_bad);
1442	if (bad_sectors < max_sectors)
1443	/ mustn't write more than bad_sectors*
1444	* to other devices yet
1445	*/
1446	max_sectors = bad_sectors;
1447	rdev_dec_pending(rdev, mddev);
1448	/ We don't set R1BIO_Degraded as that*
1449	* only applies if the disk is
1450	* missing, so it might be re-added,
1451	* and we want to know to recover this
1452	* chunk.
1453	* In this case the device is here,
1454	* and the fact that this chunk is not
1455	* in-sync is recorded in the bad
1456	* block log
1457	*/
1458	continue;
1459	}
1460	if (is_bad) {
1461	int good_sectors = first_bad - r1_bio->sector;
1462	if (good_sectors < max_sectors)
1463	max_sectors = good_sectors;
1464	}
1465	}
1466	r1_bio->bios[i] = bio;
1467	}
1468	rcu_read_unlock();
1469
1470	if (unlikely(blocked_rdev)) {
1471	/ Wait for this device to become unblocked /
1472	int j;
1473
1474	for (j = `0`; j < i; j++)
1475	if (r1_bio->bios[j])
1476	rdev_dec_pending(rdev: conf->mirrors[j].rdev, mddev);
1477	free_r1bio(r1_bio);
1478	allow_barrier(conf, sector_nr: bio->bi_iter.bi_sector);
1479
1480	if (bio->bi_opf & REQ_NOWAIT) {
1481	bio_wouldblock_error(bio);
1482	return;
1483	}
1484	raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1485	md_wait_for_blocked_rdev(rdev: blocked_rdev, mddev);
1486	wait_barrier(conf, sector_nr: bio->bi_iter.bi_sector, nowait: false);
1487	goto retry_write;
1488	}
1489
1490	/*
1491	* When using a bitmap, we may call alloc_behind_master_bio below.
1492	* alloc_behind_master_bio allocates a copy of the data payload a page
1493	* at a time and thus needs a new bio that can fit the whole payload
1494	* this bio in page sized chunks.
1495	*/
1496	if (write_behind && bitmap)
1497	max_sectors = min_t(int, max_sectors,
1498	BIO_MAX_VECS * (PAGE_SIZE >> `9`));
1499	if (max_sectors < bio_sectors(bio)) {
1500	struct bio *split = bio_split(bio, sectors: max_sectors,
1501	GFP_NOIO, bs: &conf->bio_split);
1502	bio_chain(split, bio);
1503	submit_bio_noacct(bio);
1504	bio = split;
1505	r1_bio->master_bio = bio;
1506	r1_bio->sectors = max_sectors;
1507	}
1508
1509	md_account_bio(mddev, bio: &bio);
1510	r1_bio->master_bio = bio;
1511	atomic_set(v: &r1_bio->remaining, i: `1`);
1512	atomic_set(v: &r1_bio->behind_remaining, i: `0`);
1513
1514	first_clone = `1`;
1515
1516	for (i = `0`; i < disks; i++) {
1517	struct bio *mbio = NULL;
1518	struct md_rdev *rdev = conf->mirrors[i].rdev;
1519	if (!r1_bio->bios[i])
1520	continue;
1521
1522	if (first_clone) {
1523	/ do behind I/O ?*
1524	* Not if there are too many, or cannot
1525	* allocate memory, or a reader on WriteMostly
1526	* is waiting for behind writes to flush */
1527	if (bitmap && write_behind &&
1528	(atomic_read(v: &bitmap->behind_writes)
1529	< mddev->bitmap_info.max_write_behind) &&
1530	!waitqueue_active(wq_head: &bitmap->behind_wait)) {
1531	alloc_behind_master_bio(r1_bio, bio);
1532	}
1533
1534	md_bitmap_startwrite(bitmap, offset: r1_bio->sector, sectors: r1_bio->sectors,
1535	test_bit(R1BIO_BehindIO, &r1_bio->state));
1536	first_clone = `0`;
1537	}
1538
1539	if (r1_bio->behind_master_bio) {
1540	mbio = bio_alloc_clone(bdev: rdev->bdev,
1541	bio_src: r1_bio->behind_master_bio,
1542	GFP_NOIO, bs: &mddev->bio_set);
1543	if (test_bit(CollisionCheck, &rdev->flags))
1544	wait_for_serialization(rdev, r1_bio);
1545	if (test_bit(WriteMostly, &rdev->flags))
1546	atomic_inc(v: &r1_bio->behind_remaining);
1547	} else {
1548	mbio = bio_alloc_clone(bdev: rdev->bdev, bio_src: bio, GFP_NOIO,
1549	bs: &mddev->bio_set);
1550
1551	if (mddev->serialize_policy)
1552	wait_for_serialization(rdev, r1_bio);
1553	}
1554
1555	r1_bio->bios[i] = mbio;
1556
1557	mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset);
1558	mbio->bi_end_io = raid1_end_write_request;
1559	mbio->bi_opf = bio_op(bio) \| (bio->bi_opf & (REQ_SYNC \| REQ_FUA));
1560	if (test_bit(FailFast, &rdev->flags) &&
1561	!test_bit(WriteMostly, &rdev->flags) &&
1562	conf->raid_disks - mddev->degraded > `1`)
1563	mbio->bi_opf \|= MD_FAILFAST;
1564	mbio->bi_private = r1_bio;
1565
1566	atomic_inc(v: &r1_bio->remaining);
1567
1568	if (mddev->gendisk)
1569	trace_block_bio_remap(bio: mbio, dev: disk_devt(disk: mddev->gendisk),
1570	from: r1_bio->sector);
1571	/ flush_pending_writes() needs access to the rdev so.../
1572	mbio->bi_bdev = (void *)rdev;
1573	if (!raid1_add_bio_to_plug(mddev, bio: mbio, unplug: raid1_unplug, copies: disks)) {
1574	spin_lock_irqsave(&conf->device_lock, flags);
1575	bio_list_add(bl: &conf->pending_bio_list, bio: mbio);
1576	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
1577	md_wakeup_thread(thread: mddev->thread);
1578	}
1579	}
1580
1581	r1_bio_write_done(r1_bio);
1582
1583	/ In case raid1d snuck in to freeze_array /
1584	wake_up_barrier(conf);
1585	}
1586
1587	static bool raid1_make_request(struct mddev mddev, struct* bio *bio)
1588	{
1589	sector_t sectors;
1590
1591	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
1592	&& md_flush_request(mddev, bio))
1593	return true;
1594
1595	/*
1596	* There is a limit to the maximum size, but
1597	* the read/write handler might find a lower limit
1598	* due to bad blocks. To avoid multiple splits,
1599	* we pass the maximum number of sectors down
1600	* and let the lower level perform the split.
1601	*/
1602	sectors = align_to_barrier_unit_end(
1603	start_sector: bio->bi_iter.bi_sector, bio_sectors(bio));
1604
1605	if (bio_data_dir(bio) == READ)
1606	raid1_read_request(mddev, bio, max_read_sectors: sectors, NULL);
1607	else {
1608	if (!md_write_start(mddev,bi: bio))
1609	return false;
1610	raid1_write_request(mddev, bio, max_write_sectors: sectors);
1611	}
1612	return true;
1613	}
1614
1615	static void raid1_status(struct seq_file seq, struct* mddev *mddev)
1616	{
1617	struct r1conf *conf = mddev->private;
1618	int i;
1619
1620	seq_printf(m: seq, fmt: " [%d/%d] [", conf->raid_disks,
1621	conf->raid_disks - mddev->degraded);
1622	rcu_read_lock();
1623	for (i = `0`; i < conf->raid_disks; i++) {
1624	struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1625	seq_printf(m: seq, fmt: "%s",
1626	rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1627	}
1628	rcu_read_unlock();
1629	seq_printf(m: seq, fmt: "]");
1630	}
1631
1632	/**
1633	* raid1_error() - RAID1 error handler.
1634	* @mddev: affected md device.
1635	* @rdev: member device to fail.
1636	*
1637	* The routine acknowledges &rdev failure and determines new @mddev state.
1638	* If it failed, then:
1639	* - &MD_BROKEN flag is set in &mddev->flags.
1640	* - recovery is disabled.
1641	* Otherwise, it must be degraded:
1642	* - recovery is interrupted.
1643	* - &mddev->degraded is bumped.
1644	*
1645	* @rdev is marked as &Faulty excluding case when array is failed and
1646	* &mddev->fail_last_dev is off.
1647	*/
1648	static void raid1_error(struct mddev mddev, struct* md_rdev *rdev)
1649	{
1650	struct r1conf *conf = mddev->private;
1651	unsigned long flags;
1652
1653	spin_lock_irqsave(&conf->device_lock, flags);
1654
1655	if (test_bit(In_sync, &rdev->flags) &&
1656	(conf->raid_disks - mddev->degraded) == `1`) {
1657	set_bit(nr: MD_BROKEN, addr: &mddev->flags);
1658
1659	if (!mddev->fail_last_dev) {
1660	conf->recovery_disabled = mddev->recovery_disabled;
1661	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
1662	return;
1663	}
1664	}
1665	set_bit(nr: Blocked, addr: &rdev->flags);
1666	if (test_and_clear_bit(nr: In_sync, addr: &rdev->flags))
1667	mddev->degraded++;
1668	set_bit(nr: Faulty, addr: &rdev->flags);
1669	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
1670	/*
1671	* if recovery is running, make sure it aborts.
1672	*/
1673	set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery);
1674	set_mask_bits(&mddev->sb_flags, `0`,
1675	BIT(MD_SB_CHANGE_DEVS) \| BIT(MD_SB_CHANGE_PENDING));
1676	pr_crit("md/raid1:%s: Disk failure on %pg, disabling device.\n"
1677	"md/raid1:%s: Operation continuing on %d devices.\n",
1678	mdname(mddev), rdev->bdev,
1679	mdname(mddev), conf->raid_disks - mddev->degraded);
1680	}
1681
1682	static void print_conf(struct r1conf *conf)
1683	{
1684	int i;
1685
1686	pr_debug("RAID1 conf printout:\n");
1687	if (!conf) {
1688	pr_debug("(!conf)\n");
1689	return;
1690	}
1691	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1692	conf->raid_disks);
1693
1694	rcu_read_lock();
1695	for (i = `0`; i < conf->raid_disks; i++) {
1696	struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1697	if (rdev)
1698	pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
1699	i, !test_bit(In_sync, &rdev->flags),
1700	!test_bit(Faulty, &rdev->flags),
1701	rdev->bdev);
1702	}
1703	rcu_read_unlock();
1704	}
1705
1706	static void close_sync(struct r1conf *conf)
1707	{
1708	int idx;
1709
1710	for (idx = `0`; idx < BARRIER_BUCKETS_NR; idx++) {
1711	_wait_barrier(conf, idx, nowait: false);
1712	_allow_barrier(conf, idx);
1713	}
1714
1715	mempool_exit(pool: &conf->r1buf_pool);
1716	}
1717
1718	static int raid1_spare_active(struct mddev *mddev)
1719	{
1720	int i;
1721	struct r1conf *conf = mddev->private;
1722	int count = `0`;
1723	unsigned long flags;
1724
1725	/*
1726	* Find all failed disks within the RAID1 configuration
1727	* and mark them readable.
1728	* Called under mddev lock, so rcu protection not needed.
1729	* device_lock used to avoid races with raid1_end_read_request
1730	* which expects 'In_sync' flags and ->degraded to be consistent.
1731	*/
1732	spin_lock_irqsave(&conf->device_lock, flags);
1733	for (i = `0`; i < conf->raid_disks; i++) {
1734	struct md_rdev *rdev = conf->mirrors[i].rdev;
1735	struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
1736	if (repl
1737	&& !test_bit(Candidate, &repl->flags)
1738	&& repl->recovery_offset == MaxSector
1739	&& !test_bit(Faulty, &repl->flags)
1740	&& !test_and_set_bit(nr: In_sync, addr: &repl->flags)) {
1741	/ replacement has just become active /
1742	if (!rdev \|\|
1743	!test_and_clear_bit(nr: In_sync, addr: &rdev->flags))
1744	count++;
1745	if (rdev) {
1746	/ Replaced device not technically*
1747	* faulty, but we need to be sure
1748	* it gets removed and never re-added
1749	*/
1750	set_bit(nr: Faulty, addr: &rdev->flags);
1751	sysfs_notify_dirent_safe(
1752	sd: rdev->sysfs_state);
1753	}
1754	}
1755	if (rdev
1756	&& rdev->recovery_offset == MaxSector
1757	&& !test_bit(Faulty, &rdev->flags)
1758	&& !test_and_set_bit(nr: In_sync, addr: &rdev->flags)) {
1759	count++;
1760	sysfs_notify_dirent_safe(sd: rdev->sysfs_state);
1761	}
1762	}
1763	mddev->degraded -= count;
1764	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
1765
1766	print_conf(conf);
1767	return count;
1768	}
1769
1770	static int raid1_add_disk(struct mddev mddev, struct* md_rdev *rdev)
1771	{
1772	struct r1conf *conf = mddev->private;
1773	int err = -EEXIST;
1774	int mirror = `0`, repl_slot = -`1`;
1775	struct raid1_info *p;
1776	int first = `0`;
1777	int last = conf->raid_disks - `1`;
1778
1779	if (mddev->recovery_disabled == conf->recovery_disabled)
1780	return -EBUSY;
1781
1782	if (md_integrity_add_rdev(rdev, mddev))
1783	return -ENXIO;
1784
1785	if (rdev->raid_disk >= `0`)
1786	first = last = rdev->raid_disk;
1787
1788	/*
1789	* find the disk ... but prefer rdev->saved_raid_disk
1790	* if possible.
1791	*/
1792	if (rdev->saved_raid_disk >= `0` &&
1793	rdev->saved_raid_disk >= first &&
1794	rdev->saved_raid_disk < conf->raid_disks &&
1795	conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1796	first = last = rdev->saved_raid_disk;
1797
1798	for (mirror = first; mirror <= last; mirror++) {
1799	p = conf->mirrors + mirror;
1800	if (!p->rdev) {
1801	if (mddev->gendisk)
1802	disk_stack_limits(disk: mddev->gendisk, bdev: rdev->bdev,
1803	offset: rdev->data_offset << `9`);
1804
1805	p->head_position = `0`;
1806	rdev->raid_disk = mirror;
1807	err = `0`;
1808	/ As all devices are equivalent, we don't need a full recovery*
1809	* if this was recently any drive of the array
1810	*/
1811	if (rdev->saved_raid_disk < `0`)
1812	conf->fullsync = `1`;
1813	rcu_assign_pointer(p->rdev, rdev);
1814	break;
1815	}
1816	if (test_bit(WantReplacement, &p->rdev->flags) &&
1817	p[conf->raid_disks].rdev == NULL && repl_slot < `0`)
1818	repl_slot = mirror;
1819	}
1820
1821	if (err && repl_slot >= `0`) {
1822	/ Add this device as a replacement /
1823	p = conf->mirrors + repl_slot;
1824	clear_bit(nr: In_sync, addr: &rdev->flags);
1825	set_bit(nr: Replacement, addr: &rdev->flags);
1826	rdev->raid_disk = repl_slot;
1827	err = `0`;
1828	conf->fullsync = `1`;
1829	rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
1830	}
1831
1832	print_conf(conf);
1833	return err;
1834	}
1835
1836	static int raid1_remove_disk(struct mddev mddev, struct* md_rdev *rdev)
1837	{
1838	struct r1conf *conf = mddev->private;
1839	int err = `0`;
1840	int number = rdev->raid_disk;
1841	struct raid1_info *p = conf->mirrors + number;
1842
1843	if (unlikely(number >= conf->raid_disks))
1844	goto abort;
1845
1846	if (rdev != p->rdev)
1847	p = conf->mirrors + conf->raid_disks + number;
1848
1849	print_conf(conf);
1850	if (rdev == p->rdev) {
1851	if (test_bit(In_sync, &rdev->flags) \|\|
1852	atomic_read(v: &rdev->nr_pending)) {
1853	err = -EBUSY;
1854	goto abort;
1855	}
1856	/ Only remove non-faulty devices if recovery*
1857	* is not possible.
1858	*/
1859	if (!test_bit(Faulty, &rdev->flags) &&
1860	mddev->recovery_disabled != conf->recovery_disabled &&
1861	mddev->degraded < conf->raid_disks) {
1862	err = -EBUSY;
1863	goto abort;
1864	}
1865	p->rdev = NULL;
1866	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1867	synchronize_rcu();
1868	if (atomic_read(v: &rdev->nr_pending)) {
1869	/ lost the race, try later /
1870	err = -EBUSY;
1871	p->rdev = rdev;
1872	goto abort;
1873	}
1874	}
1875	if (conf->mirrors[conf->raid_disks + number].rdev) {
1876	/ We just removed a device that is being replaced.*
1877	* Move down the replacement. We drain all IO before
1878	* doing this to avoid confusion.
1879	*/
1880	struct md_rdev *repl =
1881	conf->mirrors[conf->raid_disks + number].rdev;
1882	freeze_array(conf, extra: `0`);
1883	if (atomic_read(v: &repl->nr_pending)) {
1884	/ It means that some queued IO of retry_list*
1885	* hold repl. Thus, we cannot set replacement
1886	* as NULL, avoiding rdev NULL pointer
1887	* dereference in sync_request_write and
1888	* handle_write_finished.
1889	*/
1890	err = -EBUSY;
1891	unfreeze_array(conf);
1892	goto abort;
1893	}
1894	clear_bit(nr: Replacement, addr: &repl->flags);
1895	p->rdev = repl;
1896	conf->mirrors[conf->raid_disks + number].rdev = NULL;
1897	unfreeze_array(conf);
1898	}
1899
1900	clear_bit(nr: WantReplacement, addr: &rdev->flags);
1901	err = md_integrity_register(mddev);
1902	}
1903	abort:
1904
1905	print_conf(conf);
1906	return err;
1907	}
1908
1909	static void end_sync_read(struct bio *bio)
1910	{
1911	struct r1bio *r1_bio = get_resync_r1bio(bio);
1912
1913	update_head_pos(disk: r1_bio->read_disk, r1_bio);
1914
1915	/*
1916	* we have read a block, now it needs to be re-written,
1917	* or re-read if the read failed.
1918	* We don't do much here, just schedule handling by raid1d
1919	*/
1920	if (!bio->bi_status)
1921	set_bit(nr: R1BIO_Uptodate, addr: &r1_bio->state);
1922
1923	if (atomic_dec_and_test(v: &r1_bio->remaining))
1924	reschedule_retry(r1_bio);
1925	}
1926
1927	static void abort_sync_write(struct mddev mddev, struct* r1bio *r1_bio)
1928	{
1929	sector_t sync_blocks = `0`;
1930	sector_t s = r1_bio->sector;
1931	long sectors_to_go = r1_bio->sectors;
1932
1933	/ make sure these bits don't get cleared. /
1934	do {
1935	md_bitmap_end_sync(bitmap: mddev->bitmap, offset: s, blocks: &sync_blocks, aborted: `1`);
1936	s += sync_blocks;
1937	sectors_to_go -= sync_blocks;
1938	} while (sectors_to_go > `0`);
1939	}
1940
1941	static void put_sync_write_buf(struct r1bio r1_bio, int* uptodate)
1942	{
1943	if (atomic_dec_and_test(v: &r1_bio->remaining)) {
1944	struct mddev *mddev = r1_bio->mddev;
1945	int s = r1_bio->sectors;
1946
1947	if (test_bit(R1BIO_MadeGood, &r1_bio->state) \|\|
1948	test_bit(R1BIO_WriteError, &r1_bio->state))
1949	reschedule_retry(r1_bio);
1950	else {
1951	put_buf(r1_bio);
1952	md_done_sync(mddev, blocks: s, ok: uptodate);
1953	}
1954	}
1955	}
1956
1957	static void end_sync_write(struct bio *bio)
1958	{
1959	int uptodate = !bio->bi_status;
1960	struct r1bio *r1_bio = get_resync_r1bio(bio);
1961	struct mddev *mddev = r1_bio->mddev;
1962	struct r1conf *conf = mddev->private;
1963	sector_t first_bad;
1964	int bad_sectors;
1965	struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
1966
1967	if (!uptodate) {
1968	abort_sync_write(mddev, r1_bio);
1969	set_bit(nr: WriteErrorSeen, addr: &rdev->flags);
1970	if (!test_and_set_bit(nr: WantReplacement, addr: &rdev->flags))
1971	set_bit(nr: MD_RECOVERY_NEEDED, addr: &
1972	mddev->recovery);
1973	set_bit(nr: R1BIO_WriteError, addr: &r1_bio->state);
1974	} else if (is_badblock(rdev, s: r1_bio->sector, sectors: r1_bio->sectors,
1975	first_bad: &first_bad, bad_sectors: &bad_sectors) &&
1976	!is_badblock(rdev: conf->mirrors[r1_bio->read_disk].rdev,
1977	s: r1_bio->sector,
1978	sectors: r1_bio->sectors,
1979	first_bad: &first_bad, bad_sectors: &bad_sectors)
1980	)
1981	set_bit(nr: R1BIO_MadeGood, addr: &r1_bio->state);
1982
1983	put_sync_write_buf(r1_bio, uptodate);
1984	}
1985
1986	static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
1987	int sectors, struct page page, int* rw)
1988	{
1989	if (sync_page_io(rdev, sector, size: sectors << `9`, page, opf: rw, metadata_op: false))
1990	/ success /
1991	return `1`;
1992	if (rw == WRITE) {
1993	set_bit(nr: WriteErrorSeen, addr: &rdev->flags);
1994	if (!test_and_set_bit(nr: WantReplacement,
1995	addr: &rdev->flags))
1996	set_bit(nr: MD_RECOVERY_NEEDED, addr: &
1997	rdev->mddev->recovery);
1998	}
1999	/ need to record an error - either for the block or the device /
2000	if (!rdev_set_badblocks(rdev, s: sector, sectors, is_new: `0`))
2001	md_error(mddev: rdev->mddev, rdev);
2002	return `0`;
2003	}
2004
2005	static int fix_sync_read_error(struct r1bio *r1_bio)
2006	{
2007	/ Try some synchronous reads of other devices to get*
2008	* good data, much like with normal read errors. Only
2009	* read into the pages we already have so we don't
2010	* need to re-issue the read request.
2011	* We don't need to freeze the array, because being in an
2012	* active sync request, there is no normal IO, and
2013	* no overlapping syncs.
2014	* We don't need to check is_badblock() again as we
2015	* made sure that anything with a bad block in range
2016	* will have bi_end_io clear.
2017	*/
2018	struct mddev *mddev = r1_bio->mddev;
2019	struct r1conf *conf = mddev->private;
2020	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
2021	struct page **pages = get_resync_pages(bio)->pages;
2022	sector_t sect = r1_bio->sector;
2023	int sectors = r1_bio->sectors;
2024	int idx = `0`;
2025	struct md_rdev *rdev;
2026
2027	rdev = conf->mirrors[r1_bio->read_disk].rdev;
2028	if (test_bit(FailFast, &rdev->flags)) {
2029	/ Don't try recovering from here - just fail it*
2030	* ... unless it is the last working device of course */
2031	md_error(mddev, rdev);
2032	if (test_bit(Faulty, &rdev->flags))
2033	/ Don't try to read from here, but make sure*
2034	* put_buf does it's thing
2035	*/
2036	bio->bi_end_io = end_sync_write;
2037	}
2038
2039	while(sectors) {
2040	int s = sectors;
2041	int d = r1_bio->read_disk;
2042	int success = `0`;
2043	int start;
2044
2045	if (s > (PAGE_SIZE>>`9`))
2046	s = PAGE_SIZE >> `9`;
2047	do {
2048	if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
2049	/ No rcu protection needed here devices*
2050	* can only be removed when no resync is
2051	* active, and resync is currently active
2052	*/
2053	rdev = conf->mirrors[d].rdev;
2054	if (sync_page_io(rdev, sector: sect, size: s<<`9`,
2055	page: pages[idx],
2056	opf: REQ_OP_READ, metadata_op: false)) {
2057	success = `1`;
2058	break;
2059	}
2060	}
2061	d++;
2062	if (d == conf->raid_disks * `2`)
2063	d = `0`;
2064	} while (!success && d != r1_bio->read_disk);
2065
2066	if (!success) {
2067	int abort = `0`;
2068	/ Cannot read from anywhere, this block is lost.*
2069	* Record a bad block on each device. If that doesn't
2070	* work just disable and interrupt the recovery.
2071	* Don't fail devices as that won't really help.
2072	*/
2073	pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
2074	mdname(mddev), bio->bi_bdev,
2075	(unsigned long long)r1_bio->sector);
2076	for (d = `0`; d < conf->raid_disks * `2`; d++) {
2077	rdev = conf->mirrors[d].rdev;
2078	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
2079	continue;
2080	if (!rdev_set_badblocks(rdev, s: sect, sectors: s, is_new: `0`))
2081	abort = `1`;
2082	}
2083	if (abort) {
2084	conf->recovery_disabled =
2085	mddev->recovery_disabled;
2086	set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery);
2087	md_done_sync(mddev, blocks: r1_bio->sectors, ok: `0`);
2088	put_buf(r1_bio);
2089	return `0`;
2090	}
2091	/ Try next page /
2092	sectors -= s;
2093	sect += s;
2094	idx++;
2095	continue;
2096	}
2097
2098	start = d;
2099	/ write it back and re-read /
2100	while (d != r1_bio->read_disk) {
2101	if (d == `0`)
2102	d = conf->raid_disks * `2`;
2103	d--;
2104	if (r1_bio->bios[d]->bi_end_io != end_sync_read)
2105	continue;
2106	rdev = conf->mirrors[d].rdev;
2107	if (r1_sync_page_io(rdev, sector: sect, sectors: s,
2108	page: pages[idx],
2109	WRITE) == `0`) {
2110	r1_bio->bios[d]->bi_end_io = NULL;
2111	rdev_dec_pending(rdev, mddev);
2112	}
2113	}
2114	d = start;
2115	while (d != r1_bio->read_disk) {
2116	if (d == `0`)
2117	d = conf->raid_disks * `2`;
2118	d--;
2119	if (r1_bio->bios[d]->bi_end_io != end_sync_read)
2120	continue;
2121	rdev = conf->mirrors[d].rdev;
2122	if (r1_sync_page_io(rdev, sector: sect, sectors: s,
2123	page: pages[idx],
2124	READ) != `0`)
2125	atomic_add(i: s, v: &rdev->corrected_errors);
2126	}
2127	sectors -= s;
2128	sect += s;
2129	idx ++;
2130	}
2131	set_bit(nr: R1BIO_Uptodate, addr: &r1_bio->state);
2132	bio->bi_status = `0`;
2133	return `1`;
2134	}
2135
2136	static void process_checks(struct r1bio *r1_bio)
2137	{
2138	/ We have read all readable devices. If we haven't*
2139	* got the block, then there is no hope left.
2140	* If we have, then we want to do a comparison
2141	* and skip the write if everything is the same.
2142	* If any blocks failed to read, then we need to
2143	* attempt an over-write
2144	*/
2145	struct mddev *mddev = r1_bio->mddev;
2146	struct r1conf *conf = mddev->private;
2147	int primary;
2148	int i;
2149	int vcnt;
2150
2151	/ Fix variable parts of all bios /
2152	vcnt = (r1_bio->sectors + PAGE_SIZE / `512` - `1`) >> (PAGE_SHIFT - `9`);
2153	for (i = `0`; i < conf->raid_disks * `2`; i++) {
2154	blk_status_t status;
2155	struct bio *b = r1_bio->bios[i];
2156	struct resync_pages *rp = get_resync_pages(bio: b);
2157	if (b->bi_end_io != end_sync_read)
2158	continue;
2159	/ fixup the bio for reuse, but preserve errno /
2160	status = b->bi_status;
2161	bio_reset(bio: b, bdev: conf->mirrors[i].rdev->bdev, opf: REQ_OP_READ);
2162	b->bi_status = status;
2163	b->bi_iter.bi_sector = r1_bio->sector +
2164	conf->mirrors[i].rdev->data_offset;
2165	b->bi_end_io = end_sync_read;
2166	rp->raid_bio = r1_bio;
2167	b->bi_private = rp;
2168
2169	/ initialize bvec table again /
2170	md_bio_reset_resync_pages(bio: b, rp, size: r1_bio->sectors << `9`);
2171	}
2172	for (primary = `0`; primary < conf->raid_disks * `2`; primary++)
2173	if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
2174	!r1_bio->bios[primary]->bi_status) {
2175	r1_bio->bios[primary]->bi_end_io = NULL;
2176	rdev_dec_pending(rdev: conf->mirrors[primary].rdev, mddev);
2177	break;
2178	}
2179	r1_bio->read_disk = primary;
2180	for (i = `0`; i < conf->raid_disks * `2`; i++) {
2181	int j = `0`;
2182	struct bio *pbio = r1_bio->bios[primary];
2183	struct bio *sbio = r1_bio->bios[i];
2184	blk_status_t status = sbio->bi_status;
2185	struct page **ppages = get_resync_pages(bio: pbio)->pages;
2186	struct page **spages = get_resync_pages(bio: sbio)->pages;
2187	struct bio_vec *bi;
2188	int page_len[RESYNC_PAGES] = { `0` };
2189	struct bvec_iter_all iter_all;
2190
2191	if (sbio->bi_end_io != end_sync_read)
2192	continue;
2193	/ Now we can 'fixup' the error value /
2194	sbio->bi_status = `0`;
2195
2196	bio_for_each_segment_all(bi, sbio, iter_all)
2197	page_len[j++] = bi->bv_len;
2198
2199	if (!status) {
2200	for (j = vcnt; j-- ; ) {
2201	if (memcmp(page_address(ppages[j]),
2202	page_address(spages[j]),
2203	size: page_len[j]))
2204	break;
2205	}
2206	} else
2207	j = `0`;
2208	if (j >= `0`)
2209	atomic64_add(i: r1_bio->sectors, v: &mddev->resync_mismatches);
2210	if (j < `0` \|\| (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
2211	&& !status)) {
2212	/ No need to write to this device. /
2213	sbio->bi_end_io = NULL;
2214	rdev_dec_pending(rdev: conf->mirrors[i].rdev, mddev);
2215	continue;
2216	}
2217
2218	bio_copy_data(dst: sbio, src: pbio);
2219	}
2220	}
2221
2222	static void sync_request_write(struct mddev mddev, struct* r1bio *r1_bio)
2223	{
2224	struct r1conf *conf = mddev->private;
2225	int i;
2226	int disks = conf->raid_disks * `2`;
2227	struct bio *wbio;
2228
2229	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
2230	/ ouch - failed to read all of that. /
2231	if (!fix_sync_read_error(r1_bio))
2232	return;
2233
2234	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2235	process_checks(r1_bio);
2236
2237	/*
2238	* schedule writes
2239	*/
2240	atomic_set(v: &r1_bio->remaining, i: `1`);
2241	for (i = `0`; i < disks ; i++) {
2242	wbio = r1_bio->bios[i];
2243	if (wbio->bi_end_io == NULL \|\|
2244	(wbio->bi_end_io == end_sync_read &&
2245	(i == r1_bio->read_disk \|\|
2246	!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
2247	continue;
2248	if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
2249	abort_sync_write(mddev, r1_bio);
2250	continue;
2251	}
2252
2253	wbio->bi_opf = REQ_OP_WRITE;
2254	if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
2255	wbio->bi_opf \|= MD_FAILFAST;
2256
2257	wbio->bi_end_io = end_sync_write;
2258	atomic_inc(v: &r1_bio->remaining);
2259	md_sync_acct(bdev: conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
2260
2261	submit_bio_noacct(bio: wbio);
2262	}
2263
2264	put_sync_write_buf(r1_bio, uptodate: `1`);
2265	}
2266
/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */
2274
2275	static void fix_read_error(struct r1conf conf, int* read_disk,
2276	sector_t sect, int sectors)
2277	{
2278	struct mddev *mddev = conf->mddev;
2279	while(sectors) {
2280	int s = sectors;
2281	int d = read_disk;
2282	int success = `0`;
2283	int start;
2284	struct md_rdev *rdev;
2285
2286	if (s > (PAGE_SIZE>>`9`))
2287	s = PAGE_SIZE >> `9`;
2288
2289	do {
2290	sector_t first_bad;
2291	int bad_sectors;
2292
2293	rcu_read_lock();
2294	rdev = rcu_dereference(conf->mirrors[d].rdev);
2295	if (rdev &&
2296	(test_bit(In_sync, &rdev->flags) \|\|
2297	(!test_bit(Faulty, &rdev->flags) &&
2298	rdev->recovery_offset >= sect + s)) &&
2299	is_badblock(rdev, s: sect, sectors: s,
2300	first_bad: &first_bad, bad_sectors: &bad_sectors) == `0`) {
2301	atomic_inc(v: &rdev->nr_pending);
2302	rcu_read_unlock();
2303	if (sync_page_io(rdev, sector: sect, size: s<<`9`,
2304	page: conf->tmppage, opf: REQ_OP_READ, metadata_op: false))
2305	success = `1`;
2306	rdev_dec_pending(rdev, mddev);
2307	if (success)
2308	break;
2309	} else
2310	rcu_read_unlock();
2311	d++;
2312	if (d == conf->raid_disks * `2`)
2313	d = `0`;
2314	} while (d != read_disk);
2315
2316	if (!success) {
2317	/ Cannot read from anywhere - mark it bad /
2318	struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
2319	if (!rdev_set_badblocks(rdev, s: sect, sectors: s, is_new: `0`))
2320	md_error(mddev, rdev);
2321	break;
2322	}
2323	/ write it back and re-read /
2324	start = d;
2325	while (d != read_disk) {
2326	if (d==`0`)
2327	d = conf->raid_disks * `2`;
2328	d--;
2329	rcu_read_lock();
2330	rdev = rcu_dereference(conf->mirrors[d].rdev);
2331	if (rdev &&
2332	!test_bit(Faulty, &rdev->flags)) {
2333	atomic_inc(v: &rdev->nr_pending);
2334	rcu_read_unlock();
2335	r1_sync_page_io(rdev, sector: sect, sectors: s,
2336	page: conf->tmppage, WRITE);
2337	rdev_dec_pending(rdev, mddev);
2338	} else
2339	rcu_read_unlock();
2340	}
2341	d = start;
2342	while (d != read_disk) {
2343	if (d==`0`)
2344	d = conf->raid_disks * `2`;
2345	d--;
2346	rcu_read_lock();
2347	rdev = rcu_dereference(conf->mirrors[d].rdev);
2348	if (rdev &&
2349	!test_bit(Faulty, &rdev->flags)) {
2350	atomic_inc(v: &rdev->nr_pending);
2351	rcu_read_unlock();
2352	if (r1_sync_page_io(rdev, sector: sect, sectors: s,
2353	page: conf->tmppage, READ)) {
2354	atomic_add(i: s, v: &rdev->corrected_errors);
2355	pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n",
2356	mdname(mddev), s,
2357	(unsigned long long)(sect +
2358	rdev->data_offset),
2359	rdev->bdev);
2360	}
2361	rdev_dec_pending(rdev, mddev);
2362	} else
2363	rcu_read_unlock();
2364	}
2365	sectors -= s;
2366	sect += s;
2367	}
2368	}
2369
2370	static int narrow_write_error(struct r1bio r1_bio, int* i)
2371	{
2372	struct mddev *mddev = r1_bio->mddev;
2373	struct r1conf *conf = mddev->private;
2374	struct md_rdev *rdev = conf->mirrors[i].rdev;
2375
2376	/ bio has the data to be written to device 'i' where*
2377	* we just recently had a write error.
2378	* We repeatedly clone the bio and trim down to one block,
2379	* then try the write. Where the write fails we record
2380	* a bad block.
2381	* It is conceivable that the bio doesn't exactly align with
2382	* blocks. We must handle this somehow.
2383	*
2384	* We currently own a reference on the rdev.
2385	*/
2386
2387	int block_sectors;
2388	sector_t sector;
2389	int sectors;
2390	int sect_to_write = r1_bio->sectors;
2391	int ok = `1`;
2392
2393	if (rdev->badblocks.shift < `0`)
2394	return `0`;
2395
2396	block_sectors = roundup(`1` << rdev->badblocks.shift,
2397	bdev_logical_block_size(rdev->bdev) >> `9`);
2398	sector = r1_bio->sector;
2399	sectors = ((sector + block_sectors)
2400	& ~(sector_t)(block_sectors - `1`))
2401	- sector;
2402
2403	while (sect_to_write) {
2404	struct bio *wbio;
2405	if (sectors > sect_to_write)
2406	sectors = sect_to_write;
2407	/ Write at 'sector' for 'sectors'/
2408
2409	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
2410	wbio = bio_alloc_clone(bdev: rdev->bdev,
2411	bio_src: r1_bio->behind_master_bio,
2412	GFP_NOIO, bs: &mddev->bio_set);
2413	} else {
2414	wbio = bio_alloc_clone(bdev: rdev->bdev, bio_src: r1_bio->master_bio,
2415	GFP_NOIO, bs: &mddev->bio_set);
2416	}
2417
2418	wbio->bi_opf = REQ_OP_WRITE;
2419	wbio->bi_iter.bi_sector = r1_bio->sector;
2420	wbio->bi_iter.bi_size = r1_bio->sectors << `9`;
2421
2422	bio_trim(bio: wbio, offset: sector - r1_bio->sector, size: sectors);
2423	wbio->bi_iter.bi_sector += rdev->data_offset;
2424
2425	if (submit_bio_wait(bio: wbio) < `0`)
2426	/ failure! /
2427	ok = rdev_set_badblocks(rdev, s: sector,
2428	sectors, is_new: `0`)
2429	&& ok;
2430
2431	bio_put(wbio);
2432	sect_to_write -= sectors;
2433	sector += sectors;
2434	sectors = block_sectors;
2435	}
2436	return ok;
2437	}
2438
2439	static void handle_sync_write_finished(struct r1conf conf, struct* r1bio *r1_bio)
2440	{
2441	int m;
2442	int s = r1_bio->sectors;
2443	for (m = `0`; m < conf->raid_disks * `2` ; m++) {
2444	struct md_rdev *rdev = conf->mirrors[m].rdev;
2445	struct bio *bio = r1_bio->bios[m];
2446	if (bio->bi_end_io == NULL)
2447	continue;
2448	if (!bio->bi_status &&
2449	test_bit(R1BIO_MadeGood, &r1_bio->state)) {
2450	rdev_clear_badblocks(rdev, s: r1_bio->sector, sectors: s, is_new: `0`);
2451	}
2452	if (bio->bi_status &&
2453	test_bit(R1BIO_WriteError, &r1_bio->state)) {
2454	if (!rdev_set_badblocks(rdev, s: r1_bio->sector, sectors: s, is_new: `0`))
2455	md_error(mddev: conf->mddev, rdev);
2456	}
2457	}
2458	put_buf(r1_bio);
2459	md_done_sync(mddev: conf->mddev, blocks: s, ok: `1`);
2460	}
2461
2462	static void handle_write_finished(struct r1conf conf, struct* r1bio *r1_bio)
2463	{
2464	int m, idx;
2465	bool fail = false;
2466
2467	for (m = `0`; m < conf->raid_disks * `2` ; m++)
2468	if (r1_bio->bios[m] == IO_MADE_GOOD) {
2469	struct md_rdev *rdev = conf->mirrors[m].rdev;
2470	rdev_clear_badblocks(rdev,
2471	s: r1_bio->sector,
2472	sectors: r1_bio->sectors, is_new: `0`);
2473	rdev_dec_pending(rdev, mddev: conf->mddev);
2474	} else if (r1_bio->bios[m] != NULL) {
2475	/ This drive got a write error. We need to*
2476	* narrow down and record precise write
2477	* errors.
2478	*/
2479	fail = true;
2480	if (!narrow_write_error(r1_bio, i: m)) {
2481	md_error(mddev: conf->mddev,
2482	rdev: conf->mirrors[m].rdev);
2483	/ an I/O failed, we can't clear the bitmap /
2484	set_bit(nr: R1BIO_Degraded, addr: &r1_bio->state);
2485	}
2486	rdev_dec_pending(rdev: conf->mirrors[m].rdev,
2487	mddev: conf->mddev);
2488	}
2489	if (fail) {
2490	spin_lock_irq(lock: &conf->device_lock);
2491	list_add(new: &r1_bio->retry_list, head: &conf->bio_end_io_list);
2492	idx = sector_to_idx(sector: r1_bio->sector);
2493	atomic_inc(v: &conf->nr_queued[idx]);
2494	spin_unlock_irq(lock: &conf->device_lock);
2495	/*
2496	* In case freeze_array() is waiting for condition
2497	* get_unqueued_pending() == extra to be true.
2498	*/
2499	wake_up(&conf->wait_barrier);
2500	md_wakeup_thread(thread: conf->mddev->thread);
2501	} else {
2502	if (test_bit(R1BIO_WriteError, &r1_bio->state))
2503	close_write(r1_bio);
2504	raid_end_bio_io(r1_bio);
2505	}
2506	}
2507
2508	static void handle_read_error(struct r1conf conf, struct* r1bio *r1_bio)
2509	{
2510	struct mddev *mddev = conf->mddev;
2511	struct bio *bio;
2512	struct md_rdev *rdev;
2513	sector_t sector;
2514
2515	clear_bit(nr: R1BIO_ReadError, addr: &r1_bio->state);
2516	/ we got a read error. Maybe the drive is bad. Maybe just*
2517	* the block and we can fix it.
2518	* We freeze all other IO, and try reading the block from
2519	* other devices. When we find one, we re-write
2520	* and check it that fixes the read error.
2521	* This is all done synchronously while the array is
2522	* frozen
2523	*/
2524
2525	bio = r1_bio->bios[r1_bio->read_disk];
2526	bio_put(bio);
2527	r1_bio->bios[r1_bio->read_disk] = NULL;
2528
2529	rdev = conf->mirrors[r1_bio->read_disk].rdev;
2530	if (mddev->ro == `0`
2531	&& !test_bit(FailFast, &rdev->flags)) {
2532	freeze_array(conf, extra: `1`);
2533	fix_read_error(conf, read_disk: r1_bio->read_disk,
2534	sect: r1_bio->sector, sectors: r1_bio->sectors);
2535	unfreeze_array(conf);
2536	} else if (mddev->ro == `0` && test_bit(FailFast, &rdev->flags)) {
2537	md_error(mddev, rdev);
2538	} else {
2539	r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
2540	}
2541
2542	rdev_dec_pending(rdev, mddev: conf->mddev);
2543	sector = r1_bio->sector;
2544	bio = r1_bio->master_bio;
2545
2546	/ Reuse the old r1_bio so that the IO_BLOCKED settings are preserved /
2547	r1_bio->state = `0`;
2548	raid1_read_request(mddev, bio, max_read_sectors: r1_bio->sectors, r1_bio);
2549	allow_barrier(conf, sector_nr: sector);
2550	}
2551
2552	static void raid1d(struct md_thread *thread)
2553	{
2554	struct mddev *mddev = thread->mddev;
2555	struct r1bio *r1_bio;
2556	unsigned long flags;
2557	struct r1conf *conf = mddev->private;
2558	struct list_head *head = &conf->retry_list;
2559	struct blk_plug plug;
2560	int idx;
2561
2562	md_check_recovery(mddev);
2563
2564	if (!list_empty_careful(head: &conf->bio_end_io_list) &&
2565	!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2566	LIST_HEAD(tmp);
2567	spin_lock_irqsave(&conf->device_lock, flags);
2568	if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
2569	list_splice_init(list: &conf->bio_end_io_list, head: &tmp);
2570	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
2571	while (!list_empty(head: &tmp)) {
2572	r1_bio = list_first_entry(&tmp, struct r1bio,
2573	retry_list);
2574	list_del(entry: &r1_bio->retry_list);
2575	idx = sector_to_idx(sector: r1_bio->sector);
2576	atomic_dec(v: &conf->nr_queued[idx]);
2577	if (mddev->degraded)
2578	set_bit(nr: R1BIO_Degraded, addr: &r1_bio->state);
2579	if (test_bit(R1BIO_WriteError, &r1_bio->state))
2580	close_write(r1_bio);
2581	raid_end_bio_io(r1_bio);
2582	}
2583	}
2584
2585	blk_start_plug(&plug);
2586	for (;;) {
2587
2588	flush_pending_writes(conf);
2589
2590	spin_lock_irqsave(&conf->device_lock, flags);
2591	if (list_empty(head)) {
2592	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
2593	break;
2594	}
2595	r1_bio = list_entry(head->prev, struct r1bio, retry_list);
2596	list_del(entry: head->prev);
2597	idx = sector_to_idx(sector: r1_bio->sector);
2598	atomic_dec(v: &conf->nr_queued[idx]);
2599	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
2600
2601	mddev = r1_bio->mddev;
2602	conf = mddev->private;
2603	if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
2604	if (test_bit(R1BIO_MadeGood, &r1_bio->state) \|\|
2605	test_bit(R1BIO_WriteError, &r1_bio->state))
2606	handle_sync_write_finished(conf, r1_bio);
2607	else
2608	sync_request_write(mddev, r1_bio);
2609	} else if (test_bit(R1BIO_MadeGood, &r1_bio->state) \|\|
2610	test_bit(R1BIO_WriteError, &r1_bio->state))
2611	handle_write_finished(conf, r1_bio);
2612	else if (test_bit(R1BIO_ReadError, &r1_bio->state))
2613	handle_read_error(conf, r1_bio);
2614	else
2615	WARN_ON_ONCE(`1`);
2616
2617	cond_resched();
2618	if (mddev->sb_flags & ~(`1`<<MD_SB_CHANGE_PENDING))
2619	md_check_recovery(mddev);
2620	}
2621	blk_finish_plug(&plug);
2622	}
2623
2624	static int init_resync(struct r1conf *conf)
2625	{
2626	int buffs;
2627
2628	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2629	BUG_ON(mempool_initialized(&conf->r1buf_pool));
2630
2631	return mempool_init(pool: &conf->r1buf_pool, min_nr: buffs, alloc_fn: r1buf_pool_alloc,
2632	free_fn: r1buf_pool_free, pool_data: conf->poolinfo);
2633	}
2634
2635	static struct r1bio raid1_alloc_init_r1buf(struct* r1conf *conf)
2636	{
2637	struct r1bio *r1bio = mempool_alloc(pool: &conf->r1buf_pool, GFP_NOIO);
2638	struct resync_pages *rps;
2639	struct bio *bio;
2640	int i;
2641
2642	for (i = conf->poolinfo->raid_disks; i--; ) {
2643	bio = r1bio->bios[i];
2644	rps = bio->bi_private;
2645	bio_reset(bio, NULL, opf: `0`);
2646	bio->bi_private = rps;
2647	}
2648	r1bio->master_bio = NULL;
2649	return r1bio;
2650	}
2651
/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly a write
 * request - conflicts with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 */
2661
2662	static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2663	int *skipped)
2664	{
2665	struct r1conf *conf = mddev->private;
2666	struct r1bio *r1_bio;
2667	struct bio *bio;
2668	sector_t max_sector, nr_sectors;
2669	int disk = -`1`;
2670	int i;
2671	int wonly = -`1`;
2672	int write_targets = `0`, read_targets = `0`;
2673	sector_t sync_blocks;
2674	int still_degraded = `0`;
2675	int good_sectors = RESYNC_SECTORS;
2676	int min_bad = `0`; / number of sectors that are bad in all devices /
2677	int idx = sector_to_idx(sector: sector_nr);
2678	int page_idx = `0`;
2679
2680	if (!mempool_initialized(pool: &conf->r1buf_pool))
2681	if (init_resync(conf))
2682	return `0`;
2683
2684	max_sector = mddev->dev_sectors;
2685	if (sector_nr >= max_sector) {
2686	/ If we aborted, we need to abort the*
2687	* sync on the 'current' bitmap chunk (there will
2688	* only be one in raid1 resync.
2689	* We can find the current addess in mddev->curr_resync
2690	*/
2691	if (mddev->curr_resync < max_sector) / aborted /
2692	md_bitmap_end_sync(bitmap: mddev->bitmap, offset: mddev->curr_resync,
2693	blocks: &sync_blocks, aborted: `1`);
2694	else / completed sync /
2695	conf->fullsync = `0`;
2696
2697	md_bitmap_close_sync(bitmap: mddev->bitmap);
2698	close_sync(conf);
2699
2700	if (mddev_is_clustered(mddev)) {
2701	conf->cluster_sync_low = `0`;
2702	conf->cluster_sync_high = `0`;
2703	}
2704	return `0`;
2705	}
2706
2707	if (mddev->bitmap == NULL &&
2708	mddev->recovery_cp == MaxSector &&
2709	!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2710	conf->fullsync == `0`) {
2711	*skipped = `1`;
2712	return max_sector - sector_nr;
2713	}
2714	/ before building a request, check if we can skip these blocks..*
2715	* This call the bitmap_start_sync doesn't actually record anything
2716	*/
2717	if (!md_bitmap_start_sync(bitmap: mddev->bitmap, offset: sector_nr, blocks: &sync_blocks, degraded: `1`) &&
2718	!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2719	/ We can skip this block, and probably several more /
2720	*skipped = `1`;
2721	return sync_blocks;
2722	}
2723
2724	/*
2725	* If there is non-resync activity waiting for a turn, then let it
2726	* though before starting on this new sync request.
2727	*/
2728	if (atomic_read(v: &conf->nr_waiting[idx]))
2729	schedule_timeout_uninterruptible(timeout: `1`);
2730
2731	/ we are incrementing sector_nr below. To be safe, we check against*
2732	* sector_nr + two times RESYNC_SECTORS
2733	*/
2734
2735	md_bitmap_cond_end_sync(bitmap: mddev->bitmap, sector: sector_nr,
2736	force: mddev_is_clustered(mddev) && (sector_nr + `2` * RESYNC_SECTORS > conf->cluster_sync_high));
2737
2738
2739	if (raise_barrier(conf, sector_nr))
2740	return `0`;
2741
2742	r1_bio = raid1_alloc_init_r1buf(conf);
2743
2744	rcu_read_lock();
2745	/*
2746	* If we get a correctably read error during resync or recovery,
2747	* we might want to read from a different device. So we
2748	* flag all drives that could conceivably be read from for READ,
2749	* and any others (which will be non-In_sync devices) for WRITE.
2750	* If a read fails, we try reading from something else for which READ
2751	* is OK.
2752	*/
2753
2754	r1_bio->mddev = mddev;
2755	r1_bio->sector = sector_nr;
2756	r1_bio->state = `0`;
2757	set_bit(nr: R1BIO_IsSync, addr: &r1_bio->state);
2758	/ make sure good_sectors won't go across barrier unit boundary /
2759	good_sectors = align_to_barrier_unit_end(start_sector: sector_nr, sectors: good_sectors);
2760
2761	for (i = `0`; i < conf->raid_disks * `2`; i++) {
2762	struct md_rdev *rdev;
2763	bio = r1_bio->bios[i];
2764
2765	rdev = rcu_dereference(conf->mirrors[i].rdev);
2766	if (rdev == NULL \|\|
2767	test_bit(Faulty, &rdev->flags)) {
2768	if (i < conf->raid_disks)
2769	still_degraded = `1`;
2770	} else if (!test_bit(In_sync, &rdev->flags)) {
2771	bio->bi_opf = REQ_OP_WRITE;
2772	bio->bi_end_io = end_sync_write;
2773	write_targets ++;
2774	} else {
2775	/ may need to read from here /
2776	sector_t first_bad = MaxSector;
2777	int bad_sectors;
2778
2779	if (is_badblock(rdev, s: sector_nr, sectors: good_sectors,
2780	first_bad: &first_bad, bad_sectors: &bad_sectors)) {
2781	if (first_bad > sector_nr)
2782	good_sectors = first_bad - sector_nr;
2783	else {
2784	bad_sectors -= (sector_nr - first_bad);
2785	if (min_bad == `0` \|\|
2786	min_bad > bad_sectors)
2787	min_bad = bad_sectors;
2788	}
2789	}
2790	if (sector_nr < first_bad) {
2791	if (test_bit(WriteMostly, &rdev->flags)) {
2792	if (wonly < `0`)
2793	wonly = i;
2794	} else {
2795	if (disk < `0`)
2796	disk = i;
2797	}
2798	bio->bi_opf = REQ_OP_READ;
2799	bio->bi_end_io = end_sync_read;
2800	read_targets++;
2801	} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
2802	test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2803	!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
2804	/*
2805	* The device is suitable for reading (InSync),
2806	* but has bad block(s) here. Let's try to correct them,
2807	* if we are doing resync or repair. Otherwise, leave
2808	* this device alone for this sync request.
2809	*/
2810	bio->bi_opf = REQ_OP_WRITE;
2811	bio->bi_end_io = end_sync_write;
2812	write_targets++;
2813	}
2814	}
2815	if (rdev && bio->bi_end_io) {
2816	atomic_inc(v: &rdev->nr_pending);
2817	bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
2818	bio_set_dev(bio, bdev: rdev->bdev);
2819	if (test_bit(FailFast, &rdev->flags))
2820	bio->bi_opf \|= MD_FAILFAST;
2821	}
2822	}
2823	rcu_read_unlock();
2824	if (disk < `0`)
2825	disk = wonly;
2826	r1_bio->read_disk = disk;
2827
2828	if (read_targets == `0` && min_bad > `0`) {
2829	/ These sectors are bad on all InSync devices, so we*
2830	* need to mark them bad on all write targets
2831	*/
2832	int ok = `1`;
2833	for (i = `0` ; i < conf->raid_disks * `2` ; i++)
2834	if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
2835	struct md_rdev *rdev = conf->mirrors[i].rdev;
2836	ok = rdev_set_badblocks(rdev, s: sector_nr,
2837	sectors: min_bad, is_new: `0`
2838	) && ok;
2839	}
2840	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
2841	*skipped = `1`;
2842	put_buf(r1_bio);
2843
2844	if (!ok) {
2845	/ Cannot record the badblocks, so need to*
2846	* abort the resync.
2847	* If there are multiple read targets, could just
2848	* fail the really bad ones ???
2849	*/
2850	conf->recovery_disabled = mddev->recovery_disabled;
2851	set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery);
2852	return `0`;
2853	} else
2854	return min_bad;
2855
2856	}
2857	if (min_bad > `0` && min_bad < good_sectors) {
2858	/ only resync enough to reach the next bad->good*
2859	* transition */
2860	good_sectors = min_bad;
2861	}
2862
2863	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > `0`)
2864	/ extra read targets are also write targets /
2865	write_targets += read_targets-`1`;
2866
2867	if (write_targets == `0` \|\| read_targets == `0`) {
2868	/ There is nowhere to write, so all non-sync*
2869	* drives must be failed - so we are finished
2870	*/
2871	sector_t rv;
2872	if (min_bad > `0`)
2873	max_sector = sector_nr + min_bad;
2874	rv = max_sector - sector_nr;
2875	*skipped = `1`;
2876	put_buf(r1_bio);
2877	return rv;
2878	}
2879
2880	if (max_sector > mddev->resync_max)
2881	max_sector = mddev->resync_max; / Don't do IO beyond here /
2882	if (max_sector > sector_nr + good_sectors)
2883	max_sector = sector_nr + good_sectors;
2884	nr_sectors = `0`;
2885	sync_blocks = `0`;
2886	do {
2887	struct page *page;
2888	int len = PAGE_SIZE;
2889	if (sector_nr + (len>>`9`) > max_sector)
2890	len = (max_sector - sector_nr) << `9`;
2891	if (len == `0`)
2892	break;
2893	if (sync_blocks == `0`) {
2894	if (!md_bitmap_start_sync(bitmap: mddev->bitmap, offset: sector_nr,
2895	blocks: &sync_blocks, degraded: still_degraded) &&
2896	!conf->fullsync &&
2897	!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2898	break;
2899	if ((len >> `9`) > sync_blocks)
2900	len = sync_blocks<<`9`;
2901	}
2902
2903	for (i = `0` ; i < conf->raid_disks * `2`; i++) {
2904	struct resync_pages *rp;
2905
2906	bio = r1_bio->bios[i];
2907	rp = get_resync_pages(bio);
2908	if (bio->bi_end_io) {
2909	page = resync_fetch_page(rp, idx: page_idx);
2910
2911	/*
2912	* won't fail because the vec table is big
2913	* enough to hold all these pages
2914	*/
2915	__bio_add_page(bio, page, len, off: `0`);
2916	}
2917	}
2918	nr_sectors += len>>`9`;
2919	sector_nr += len>>`9`;
2920	sync_blocks -= (len>>`9`);
2921	} while (++page_idx < RESYNC_PAGES);
2922
2923	r1_bio->sectors = nr_sectors;
2924
2925	if (mddev_is_clustered(mddev) &&
2926	conf->cluster_sync_high < sector_nr + nr_sectors) {
2927	conf->cluster_sync_low = mddev->curr_resync_completed;
2928	conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
2929	/ Send resync message /
2930	md_cluster_ops->resync_info_update(mddev,
2931	conf->cluster_sync_low,
2932	conf->cluster_sync_high);
2933	}
2934
2935	/ For a user-requested sync, we read all readable devices and do a*
2936	* compare
2937	*/
2938	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2939	atomic_set(v: &r1_bio->remaining, i: read_targets);
2940	for (i = `0`; i < conf->raid_disks * `2` && read_targets; i++) {
2941	bio = r1_bio->bios[i];
2942	if (bio->bi_end_io == end_sync_read) {
2943	read_targets--;
2944	md_sync_acct_bio(bio, nr_sectors);
2945	if (read_targets == `1`)
2946	bio->bi_opf &= ~MD_FAILFAST;
2947	submit_bio_noacct(bio);
2948	}
2949	}
2950	} else {
2951	atomic_set(v: &r1_bio->remaining, i: `1`);
2952	bio = r1_bio->bios[r1_bio->read_disk];
2953	md_sync_acct_bio(bio, nr_sectors);
2954	if (read_targets == `1`)
2955	bio->bi_opf &= ~MD_FAILFAST;
2956	submit_bio_noacct(bio);
2957	}
2958	return nr_sectors;
2959	}
2960
2961	static sector_t raid1_size(struct mddev mddev, sector_t sectors, int* raid_disks)
2962	{
2963	if (sectors)
2964	return sectors;
2965
2966	return mddev->dev_sectors;
2967	}
2968
2969	static struct r1conf setup_conf(struct* mddev *mddev)
2970	{
2971	struct r1conf *conf;
2972	int i;
2973	struct raid1_info *disk;
2974	struct md_rdev *rdev;
2975	int err = -ENOMEM;
2976
2977	conf = kzalloc(size: sizeof(struct r1conf), GFP_KERNEL);
2978	if (!conf)
2979	goto abort;
2980
2981	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
2982	size: sizeof(atomic_t), GFP_KERNEL);
2983	if (!conf->nr_pending)
2984	goto abort;
2985
2986	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
2987	size: sizeof(atomic_t), GFP_KERNEL);
2988	if (!conf->nr_waiting)
2989	goto abort;
2990
2991	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
2992	size: sizeof(atomic_t), GFP_KERNEL);
2993	if (!conf->nr_queued)
2994	goto abort;
2995
2996	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
2997	size: sizeof(atomic_t), GFP_KERNEL);
2998	if (!conf->barrier)
2999	goto abort;
3000
3001	conf->mirrors = kzalloc(array3_size(sizeof(struct raid1_info),
3002	mddev->raid_disks, `2`),
3003	GFP_KERNEL);
3004	if (!conf->mirrors)
3005	goto abort;
3006
3007	conf->tmppage = alloc_page(GFP_KERNEL);
3008	if (!conf->tmppage)
3009	goto abort;
3010
3011	conf->poolinfo = kzalloc(size: sizeof(*conf->poolinfo), GFP_KERNEL);
3012	if (!conf->poolinfo)
3013	goto abort;
3014	conf->poolinfo->raid_disks = mddev->raid_disks * `2`;
3015	err = mempool_init(pool: &conf->r1bio_pool, NR_RAID_BIOS, alloc_fn: r1bio_pool_alloc,
3016	free_fn: rbio_pool_free, pool_data: conf->poolinfo);
3017	if (err)
3018	goto abort;
3019
3020	err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, `0`, flags: `0`);
3021	if (err)
3022	goto abort;
3023
3024	conf->poolinfo->mddev = mddev;
3025
3026	err = -EINVAL;
3027	spin_lock_init(&conf->device_lock);
3028	rdev_for_each(rdev, mddev) {
3029	int disk_idx = rdev->raid_disk;
3030	if (disk_idx >= mddev->raid_disks
3031	\|\| disk_idx < `0`)
3032	continue;
3033	if (test_bit(Replacement, &rdev->flags))
3034	disk = conf->mirrors + mddev->raid_disks + disk_idx;
3035	else
3036	disk = conf->mirrors + disk_idx;
3037
3038	if (disk->rdev)
3039	goto abort;
3040	disk->rdev = rdev;
3041	disk->head_position = `0`;
3042	disk->seq_start = MaxSector;
3043	}
3044	conf->raid_disks = mddev->raid_disks;
3045	conf->mddev = mddev;
3046	INIT_LIST_HEAD(list: &conf->retry_list);
3047	INIT_LIST_HEAD(list: &conf->bio_end_io_list);
3048
3049	spin_lock_init(&conf->resync_lock);
3050	init_waitqueue_head(&conf->wait_barrier);
3051
3052	bio_list_init(bl: &conf->pending_bio_list);
3053	conf->recovery_disabled = mddev->recovery_disabled - `1`;
3054
3055	err = -EIO;
3056	for (i = `0`; i < conf->raid_disks * `2`; i++) {
3057
3058	disk = conf->mirrors + i;
3059
3060	if (i < conf->raid_disks &&
3061	disk[conf->raid_disks].rdev) {
3062	/ This slot has a replacement. /
3063	if (!disk->rdev) {
3064	/ No original, just make the replacement*
3065	* a recovering spare
3066	*/
3067	disk->rdev =
3068	disk[conf->raid_disks].rdev;
3069	disk[conf->raid_disks].rdev = NULL;
3070	} else if (!test_bit(In_sync, &disk->rdev->flags))
3071	/ Original is not in_sync - bad /
3072	goto abort;
3073	}
3074
3075	if (!disk->rdev \|\|
3076	!test_bit(In_sync, &disk->rdev->flags)) {
3077	disk->head_position = `0`;
3078	if (disk->rdev &&
3079	(disk->rdev->saved_raid_disk < `0`))
3080	conf->fullsync = `1`;
3081	}
3082	}
3083
3084	err = -ENOMEM;
3085	rcu_assign_pointer(conf->thread,
3086	md_register_thread(raid1d, mddev, "raid1"));
3087	if (!conf->thread)
3088	goto abort;
3089
3090	return conf;
3091
3092	abort:
3093	if (conf) {
3094	mempool_exit(pool: &conf->r1bio_pool);
3095	kfree(objp: conf->mirrors);
3096	safe_put_page(p: conf->tmppage);
3097	kfree(objp: conf->poolinfo);
3098	kfree(objp: conf->nr_pending);
3099	kfree(objp: conf->nr_waiting);
3100	kfree(objp: conf->nr_queued);
3101	kfree(objp: conf->barrier);
3102	bioset_exit(&conf->bio_split);
3103	kfree(objp: conf);
3104	}
3105	return ERR_PTR(error: err);
3106	}
3107
3108	static void raid1_free(struct mddev mddev, void* *priv);
3109	static int raid1_run(struct mddev *mddev)
3110	{
3111	struct r1conf *conf;
3112	int i;
3113	struct md_rdev *rdev;
3114	int ret;
3115
3116	if (mddev->level != `1`) {
3117	pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n",
3118	mdname(mddev), mddev->level);
3119	return -EIO;
3120	}
3121	if (mddev->reshape_position != MaxSector) {
3122	pr_warn("md/raid1:%s: reshape_position set but not supported\n",
3123	mdname(mddev));
3124	return -EIO;
3125	}
3126
3127	/*
3128	* copy the already verified devices into our private RAID1
3129	* bookkeeping area. [whatever we allocate in run(),
3130	* should be freed in raid1_free()]
3131	*/
3132	if (mddev->private == NULL)
3133	conf = setup_conf(mddev);
3134	else
3135	conf = mddev->private;
3136
3137	if (IS_ERR(ptr: conf))
3138	return PTR_ERR(ptr: conf);
3139
3140	if (mddev->queue)
3141	blk_queue_max_write_zeroes_sectors(q: mddev->queue, max_write_same_sectors: `0`);
3142
3143	rdev_for_each(rdev, mddev) {
3144	if (!mddev->gendisk)
3145	continue;
3146	disk_stack_limits(disk: mddev->gendisk, bdev: rdev->bdev,
3147	offset: rdev->data_offset << `9`);
3148	}
3149
3150	mddev->degraded = `0`;
3151	for (i = `0`; i < conf->raid_disks; i++)
3152	if (conf->mirrors[i].rdev == NULL \|\|
3153	!test_bit(In_sync, &conf->mirrors[i].rdev->flags) \|\|
3154	test_bit(Faulty, &conf->mirrors[i].rdev->flags))
3155	mddev->degraded++;
3156	/*
3157	* RAID1 needs at least one disk in active
3158	*/
3159	if (conf->raid_disks - mddev->degraded < `1`) {
3160	md_unregister_thread(mddev, threadp: &conf->thread);
3161	ret = -EINVAL;
3162	goto abort;
3163	}
3164
3165	if (conf->raid_disks - mddev->degraded == `1`)
3166	mddev->recovery_cp = MaxSector;
3167
3168	if (mddev->recovery_cp != MaxSector)
3169	pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
3170	mdname(mddev));
3171	pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
3172	mdname(mddev), mddev->raid_disks - mddev->degraded,
3173	mddev->raid_disks);
3174
3175	/*
3176	* Ok, everything is just fine now
3177	*/
3178	rcu_assign_pointer(mddev->thread, conf->thread);
3179	rcu_assign_pointer(conf->thread, NULL);
3180	mddev->private = conf;
3181	set_bit(nr: MD_FAILFAST_SUPPORTED, addr: &mddev->flags);
3182
3183	md_set_array_sectors(mddev, array_sectors: raid1_size(mddev, sectors: `0`, raid_disks: `0`));
3184
3185	ret = md_integrity_register(mddev);
3186	if (ret) {
3187	md_unregister_thread(mddev, threadp: &mddev->thread);
3188	goto abort;
3189	}
3190	return `0`;
3191
3192	abort:
3193	raid1_free(mddev, priv: conf);
3194	return ret;
3195	}
3196
3197	static void raid1_free(struct mddev mddev, void* *priv)
3198	{
3199	struct r1conf *conf = priv;
3200
3201	mempool_exit(pool: &conf->r1bio_pool);
3202	kfree(objp: conf->mirrors);
3203	safe_put_page(p: conf->tmppage);
3204	kfree(objp: conf->poolinfo);
3205	kfree(objp: conf->nr_pending);
3206	kfree(objp: conf->nr_waiting);
3207	kfree(objp: conf->nr_queued);
3208	kfree(objp: conf->barrier);
3209	bioset_exit(&conf->bio_split);
3210	kfree(objp: conf);
3211	}
3212
3213	static int raid1_resize(struct mddev *mddev, sector_t sectors)
3214	{
3215	/ no resync is happening, and there is enough space*
3216	* on all devices, so we can resize.
3217	* We need to make sure resync covers any new space.
3218	* If the array is shrinking we should possibly wait until
3219	* any io in the removed space completes, but it hardly seems
3220	* worth it.
3221	*/
3222	sector_t newsize = raid1_size(mddev, sectors, raid_disks: `0`);
3223	if (mddev->external_size &&
3224	mddev->array_sectors > newsize)
3225	return -EINVAL;
3226	if (mddev->bitmap) {
3227	int ret = md_bitmap_resize(bitmap: mddev->bitmap, blocks: newsize, chunksize: `0`, init: `0`);
3228	if (ret)
3229	return ret;
3230	}
3231	md_set_array_sectors(mddev, array_sectors: newsize);
3232	if (sectors > mddev->dev_sectors &&
3233	mddev->recovery_cp > mddev->dev_sectors) {
3234	mddev->recovery_cp = mddev->dev_sectors;
3235	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
3236	}
3237	mddev->dev_sectors = sectors;
3238	mddev->resync_max_sectors = sectors;
3239	return `0`;
3240	}
3241
3242	static int raid1_reshape(struct mddev *mddev)
3243	{
3244	/ We need to:*
3245	* 1/ resize the r1bio_pool
3246	* 2/ resize conf->mirrors
3247	*
3248	* We allocate a new r1bio_pool if we can.
3249	* Then raise a device barrier and wait until all IO stops.
3250	* Then resize conf->mirrors and swap in the new r1bio pool.
3251	*
3252	* At the same time, we "pack" the devices so that all the missing
3253	* devices have the higher raid_disk numbers.
3254	*/
3255	mempool_t newpool, oldpool;
3256	struct pool_info *newpoolinfo;
3257	struct raid1_info *newmirrors;
3258	struct r1conf *conf = mddev->private;
3259	int cnt, raid_disks;
3260	unsigned long flags;
3261	int d, d2;
3262	int ret;
3263
3264	memset(&newpool, `0`, sizeof(newpool));
3265	memset(&oldpool, `0`, sizeof(oldpool));
3266
3267	/ Cannot change chunk_size, layout, or level /
3268	if (mddev->chunk_sectors != mddev->new_chunk_sectors \|\|
3269	mddev->layout != mddev->new_layout \|\|
3270	mddev->level != mddev->new_level) {
3271	mddev->new_chunk_sectors = mddev->chunk_sectors;
3272	mddev->new_layout = mddev->layout;
3273	mddev->new_level = mddev->level;
3274	return -EINVAL;
3275	}
3276
3277	if (!mddev_is_clustered(mddev))
3278	md_allow_write(mddev);
3279
3280	raid_disks = mddev->raid_disks + mddev->delta_disks;
3281
3282	if (raid_disks < conf->raid_disks) {
3283	cnt=`0`;
3284	for (d= `0`; d < conf->raid_disks; d++)
3285	if (conf->mirrors[d].rdev)
3286	cnt++;
3287	if (cnt > raid_disks)
3288	return -EBUSY;
3289	}
3290
3291	newpoolinfo = kmalloc(size: sizeof(*newpoolinfo), GFP_KERNEL);
3292	if (!newpoolinfo)
3293	return -ENOMEM;
3294	newpoolinfo->mddev = mddev;
3295	newpoolinfo->raid_disks = raid_disks * `2`;
3296
3297	ret = mempool_init(pool: &newpool, NR_RAID_BIOS, alloc_fn: r1bio_pool_alloc,
3298	free_fn: rbio_pool_free, pool_data: newpoolinfo);
3299	if (ret) {
3300	kfree(objp: newpoolinfo);
3301	return ret;
3302	}
3303	newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
3304	raid_disks, `2`),
3305	GFP_KERNEL);
3306	if (!newmirrors) {
3307	kfree(objp: newpoolinfo);
3308	mempool_exit(pool: &newpool);
3309	return -ENOMEM;
3310	}
3311
3312	freeze_array(conf, extra: `0`);
3313
3314	/ ok, everything is stopped /
3315	oldpool = conf->r1bio_pool;
3316	conf->r1bio_pool = newpool;
3317
3318	for (d = d2 = `0`; d < conf->raid_disks; d++) {
3319	struct md_rdev *rdev = conf->mirrors[d].rdev;
3320	if (rdev && rdev->raid_disk != d2) {
3321	sysfs_unlink_rdev(mddev, rdev);
3322	rdev->raid_disk = d2;
3323	sysfs_unlink_rdev(mddev, rdev);
3324	if (sysfs_link_rdev(mddev, rdev))
3325	pr_warn("md/raid1:%s: cannot register rd%d\n",
3326	mdname(mddev), rdev->raid_disk);
3327	}
3328	if (rdev)
3329	newmirrors[d2++].rdev = rdev;
3330	}
3331	kfree(objp: conf->mirrors);
3332	conf->mirrors = newmirrors;
3333	kfree(objp: conf->poolinfo);
3334	conf->poolinfo = newpoolinfo;
3335
3336	spin_lock_irqsave(&conf->device_lock, flags);
3337	mddev->degraded += (raid_disks - conf->raid_disks);
3338	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
3339	conf->raid_disks = mddev->raid_disks = raid_disks;
3340	mddev->delta_disks = `0`;
3341
3342	unfreeze_array(conf);
3343
3344	set_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery);
3345	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
3346	md_wakeup_thread(thread: mddev->thread);
3347
3348	mempool_exit(pool: &oldpool);
3349	return `0`;
3350	}
3351
3352	static void raid1_quiesce(struct mddev mddev, int* quiesce)
3353	{
3354	struct r1conf *conf = mddev->private;
3355
3356	if (quiesce)
3357	freeze_array(conf, extra: `0`);
3358	else
3359	unfreeze_array(conf);
3360	}
3361
3362	static void raid1_takeover(struct* mddev *mddev)
3363	{
3364	/ raid1 can take over:*
3365	* raid5 with 2 devices, any layout or chunk size
3366	*/
3367	if (mddev->level == `5` && mddev->raid_disks == `2`) {
3368	struct r1conf *conf;
3369	mddev->new_level = `1`;
3370	mddev->new_layout = `0`;
3371	mddev->new_chunk_sectors = `0`;
3372	conf = setup_conf(mddev);
3373	if (!IS_ERR(ptr: conf)) {
3374	/ Array must appear to be quiesced /
3375	conf->array_frozen = `1`;
3376	mddev_clear_unsupported_flags(mddev,
3377	UNSUPPORTED_MDDEV_FLAGS);
3378	}
3379	return conf;
3380	}
3381	return ERR_PTR(error: -EINVAL);
3382	}
3383
3384	static struct md_personality raid1_personality =
3385	{
3386	.name = "raid1",
3387	.level = `1`,
3388	.owner = THIS_MODULE,
3389	.make_request = raid1_make_request,
3390	.run = raid1_run,
3391	.free = raid1_free,
3392	.status = raid1_status,
3393	.error_handler = raid1_error,
3394	.hot_add_disk = raid1_add_disk,
3395	.hot_remove_disk= raid1_remove_disk,
3396	.spare_active = raid1_spare_active,
3397	.sync_request = raid1_sync_request,
3398	.resize = raid1_resize,
3399	.size = raid1_size,
3400	.check_reshape = raid1_reshape,
3401	.quiesce = raid1_quiesce,
3402	.takeover = raid1_takeover,
3403	};
3404
3405	static int __init raid_init(void)
3406	{
3407	return register_md_personality(p: &raid1_personality);
3408	}
3409
3410	static void raid_exit(void)
3411	{
3412	unregister_md_personality(p: &raid1_personality);
3413	}
3414
3415	module_init(raid_init);
3416	module_exit(raid_exit);
3417	MODULE_LICENSE("GPL");
3418	MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
3419	MODULE_ALIAS("md-personality-3"); / RAID1 /
3420	MODULE_ALIAS("md-raid1");
3421	MODULE_ALIAS("md-level-1");
3422

source code of linux/drivers/md/raid1.c