md-cluster.c source code [linux/drivers/md/md-cluster.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* Copyright (C) 2015, SUSE
4	*/
5
6
7	#include <linux/module.h>
8	#include <linux/kthread.h>
9	#include <linux/dlm.h>
10	#include <linux/sched.h>
11	#include <linux/raid/md_p.h>
12	#include "md.h"
13	#include "md-bitmap.h"
14	#include "md-cluster.h"
15
/* Size in bytes of the lock value block (LVB) buffer allocated per lock resource. */
#define LVB_SIZE	64
/*
 * Timeout used while waiting for a newly announced disk to be added.
 * NOTE(review): this value is handed unscaled to
 * wait_for_completion_timeout(), which counts in jiffies, so the wall-clock
 * duration depends on HZ -- confirm whether milliseconds were intended.
 */
#define NEW_DEV_TIMEOUT 5000
18
19	struct dlm_lock_resource {
20	dlm_lockspace_t *ls;
21	struct dlm_lksb lksb;
22	char name; /* lock name. /
23	uint32_t flags; / flags to pass to dlm_lock() /
24	wait_queue_head_t sync_locking; / wait queue for synchronized locking /
25	bool sync_locking_done;
26	void (bast)(void* arg, int* mode); / blocking AST function pointer/
27	struct mddev mddev; /* pointing back to mddev. /
28	int mode;
29	};
30
31	struct resync_info {
32	__le64 lo;
33	__le64 hi;
34	};
35
/* md_cluster_info flags (bit numbers within md_cluster_info.state) */
#define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
#define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
#define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3

/* Lock the send communication. This is done through
 * bit manipulation as opposed to a mutex in order to
 * accommodate lock and hold. See next comment.
 */
#define		MD_CLUSTER_SEND_LOCK			4
/* If cluster operations (such as adding a disk) must lock the
 * communication channel, so as to perform extra operations
 * (update metadata) and no other operation is allowed on the
 * MD. Token needs to be locked and held until the operation
 * completes with a md_update_sb(), which would eventually release
 * the lock.
 */
#define		MD_CLUSTER_SEND_LOCKED_ALREADY		5
/* We should receive message after node joined cluster and
 * set up all the related infos such as bitmap and personality */
#define		MD_CLUSTER_ALREADY_IN_CLUSTER		6
#define		MD_CLUSTER_PENDING_RECV_EVENT		7
#define		MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD	8
59
60	struct md_cluster_info {
61	struct mddev mddev; /* the md device which md_cluster_info belongs to /
62	/ dlm lock space and resources for clustered raid. /
63	dlm_lockspace_t *lockspace;
64	int slot_number;
65	struct completion completion;
66	struct mutex recv_mutex;
67	struct dlm_lock_resource *bitmap_lockres;
68	struct dlm_lock_resource **other_bitmap_lockres;
69	struct dlm_lock_resource *resync_lockres;
70	struct list_head suspend_list;
71
72	spinlock_t suspend_lock;
73	/ record the region which write should be suspended /
74	sector_t suspend_lo;
75	sector_t suspend_hi;
76	int suspend_from; / the slot which broadcast suspend_lo/hi /
77
78	struct md_thread __rcu *recovery_thread;
79	unsigned long recovery_map;
80	/ communication loc resources /
81	struct dlm_lock_resource *ack_lockres;
82	struct dlm_lock_resource *message_lockres;
83	struct dlm_lock_resource *token_lockres;
84	struct dlm_lock_resource *no_new_dev_lockres;
85	struct md_thread __rcu *recv_thread;
86	struct completion newdisk_completion;
87	wait_queue_head_t wait;
88	unsigned long state;
89	/ record the region in RESYNCING message /
90	sector_t sync_low;
91	sector_t sync_hi;
92	};
93
/* Message types carried in cluster_msg.type; dispatched by process_recvd_msg(). */
enum msg_type {
	METADATA_UPDATED = 0,
	RESYNCING,
	NEWDISK,
	REMOVE,
	RE_ADD,
	BITMAP_NEEDS_SYNC,
	CHANGE_CAPACITY,
	BITMAP_RESIZE,
};
104
105	struct cluster_msg {
106	__le32 type;
107	__le32 slot;
108	/ TODO: Unionize this for smaller footprint /
109	__le64 low;
110	__le64 high;
111	char uuid[`16`];
112	__le32 raid_slot;
113	};
114
115	static void sync_ast(void *arg)
116	{
117	struct dlm_lock_resource *res;
118
119	res = arg;
120	res->sync_locking_done = true;
121	wake_up(&res->sync_locking);
122	}
123
124	static int dlm_lock_sync(struct dlm_lock_resource res, int* mode)
125	{
126	int ret = `0`;
127
128	ret = dlm_lock(lockspace: res->ls, mode, lksb: &res->lksb,
129	flags: res->flags, name: res->name, strlen(res->name),
130	parent_lkid: `0`, lockast: sync_ast, astarg: res, bast: res->bast);
131	if (ret)
132	return ret;
133	wait_event(res->sync_locking, res->sync_locking_done);
134	res->sync_locking_done = false;
135	if (res->lksb.sb_status == `0`)
136	res->mode = mode;
137	return res->lksb.sb_status;
138	}
139
140	static int dlm_unlock_sync(struct dlm_lock_resource *res)
141	{
142	return dlm_lock_sync(res, DLM_LOCK_NL);
143	}
144
145	/*
146	* An variation of dlm_lock_sync, which make lock request could
147	* be interrupted
148	*/
149	static int dlm_lock_sync_interruptible(struct dlm_lock_resource res, int* mode,
150	struct mddev *mddev)
151	{
152	int ret = `0`;
153
154	ret = dlm_lock(lockspace: res->ls, mode, lksb: &res->lksb,
155	flags: res->flags, name: res->name, strlen(res->name),
156	parent_lkid: `0`, lockast: sync_ast, astarg: res, bast: res->bast);
157	if (ret)
158	return ret;
159
160	wait_event(res->sync_locking, res->sync_locking_done
161	\|\| kthread_should_stop()
162	\|\| test_bit(MD_CLOSING, &mddev->flags));
163	if (!res->sync_locking_done) {
164	/*
165	* the convert queue contains the lock request when request is
166	* interrupted, and sync_ast could still be run, so need to
167	* cancel the request and reset completion
168	*/
169	ret = dlm_unlock(lockspace: res->ls, lkid: res->lksb.sb_lkid, DLM_LKF_CANCEL,
170	lksb: &res->lksb, astarg: res);
171	res->sync_locking_done = false;
172	if (unlikely(ret != `0`))
173	pr_info("failed to cancel previous lock request "
174	"%s return %d\n", res->name, ret);
175	return -EPERM;
176	} else
177	res->sync_locking_done = false;
178	if (res->lksb.sb_status == `0`)
179	res->mode = mode;
180	return res->lksb.sb_status;
181	}
182
183	static struct dlm_lock_resource lockres_init(struct* mddev *mddev,
184	char name, void* (bastfn)(void* arg, int* mode), int with_lvb)
185	{
186	struct dlm_lock_resource *res = NULL;
187	int ret, namelen;
188	struct md_cluster_info *cinfo = mddev->cluster_info;
189
190	res = kzalloc(size: sizeof(struct dlm_lock_resource), GFP_KERNEL);
191	if (!res)
192	return NULL;
193	init_waitqueue_head(&res->sync_locking);
194	res->sync_locking_done = false;
195	res->ls = cinfo->lockspace;
196	res->mddev = mddev;
197	res->mode = DLM_LOCK_IV;
198	namelen = strlen(name);
199	res->name = kzalloc(size: namelen + `1`, GFP_KERNEL);
200	if (!res->name) {
201	pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
202	goto out_err;
203	}
204	strscpy(p: res->name, q: name, size: namelen + `1`);
205	if (with_lvb) {
206	res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
207	if (!res->lksb.sb_lvbptr) {
208	pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
209	goto out_err;
210	}
211	res->flags = DLM_LKF_VALBLK;
212	}
213
214	if (bastfn)
215	res->bast = bastfn;
216
217	res->flags \|= DLM_LKF_EXPEDITE;
218
219	ret = dlm_lock_sync(res, DLM_LOCK_NL);
220	if (ret) {
221	pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
222	goto out_err;
223	}
224	res->flags &= ~DLM_LKF_EXPEDITE;
225	res->flags \|= DLM_LKF_CONVERT;
226
227	return res;
228	out_err:
229	kfree(objp: res->lksb.sb_lvbptr);
230	kfree(objp: res->name);
231	kfree(objp: res);
232	return NULL;
233	}
234
235	static void lockres_free(struct dlm_lock_resource *res)
236	{
237	int ret = `0`;
238
239	if (!res)
240	return;
241
242	/*
243	* use FORCEUNLOCK flag, so we can unlock even the lock is on the
244	* waiting or convert queue
245	*/
246	ret = dlm_unlock(lockspace: res->ls, lkid: res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK,
247	lksb: &res->lksb, astarg: res);
248	if (unlikely(ret != `0`))
249	pr_err("failed to unlock %s return %d\n", res->name, ret);
250	else
251	wait_event(res->sync_locking, res->sync_locking_done);
252
253	kfree(objp: res->name);
254	kfree(objp: res->lksb.sb_lvbptr);
255	kfree(objp: res);
256	}
257
258	static void add_resync_info(struct dlm_lock_resource *lockres,
259	sector_t lo, sector_t hi)
260	{
261	struct resync_info *ri;
262
263	ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
264	ri->lo = cpu_to_le64(lo);
265	ri->hi = cpu_to_le64(hi);
266	}
267
268	static int read_resync_info(struct mddev *mddev,
269	struct dlm_lock_resource *lockres)
270	{
271	struct resync_info ri;
272	struct md_cluster_info *cinfo = mddev->cluster_info;
273	int ret = `0`;
274
275	dlm_lock_sync(res: lockres, DLM_LOCK_CR);
276	memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
277	if (le64_to_cpu(ri.hi) > `0`) {
278	cinfo->suspend_hi = le64_to_cpu(ri.hi);
279	cinfo->suspend_lo = le64_to_cpu(ri.lo);
280	ret = `1`;
281	}
282	dlm_unlock_sync(res: lockres);
283	return ret;
284	}
285
286	static void recover_bitmaps(struct md_thread *thread)
287	{
288	struct mddev *mddev = thread->mddev;
289	struct md_cluster_info *cinfo = mddev->cluster_info;
290	struct dlm_lock_resource *bm_lockres;
291	char str[`64`];
292	int slot, ret;
293	sector_t lo, hi;
294
295	while (cinfo->recovery_map) {
296	slot = fls64(x: (u64)cinfo->recovery_map) - `1`;
297
298	snprintf(buf: str, size: `64`, fmt: "bitmap%04d", slot);
299	bm_lockres = lockres_init(mddev, name: str, NULL, with_lvb: `1`);
300	if (!bm_lockres) {
301	pr_err("md-cluster: Cannot initialize bitmaps\n");
302	goto clear_bit;
303	}
304
305	ret = dlm_lock_sync_interruptible(res: bm_lockres, DLM_LOCK_PW, mddev);
306	if (ret) {
307	pr_err("md-cluster: Could not DLM lock %s: %d\n",
308	str, ret);
309	goto clear_bit;
310	}
311	ret = md_bitmap_copy_from_slot(mddev, slot, lo: &lo, hi: &hi, clear_bits: true);
312	if (ret) {
313	pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
314	goto clear_bit;
315	}
316
317	/ Clear suspend_area associated with the bitmap /
318	spin_lock_irq(lock: &cinfo->suspend_lock);
319	cinfo->suspend_hi = `0`;
320	cinfo->suspend_lo = `0`;
321	cinfo->suspend_from = -`1`;
322	spin_unlock_irq(lock: &cinfo->suspend_lock);
323
324	/ Kick off a reshape if needed /
325	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
326	test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
327	mddev->reshape_position != MaxSector)
328	md_wakeup_thread(thread: mddev->sync_thread);
329
330	if (hi > `0`) {
331	if (lo < mddev->recovery_cp)
332	mddev->recovery_cp = lo;
333	/ wake up thread to continue resync in case resync*
334	* is not finished */
335	if (mddev->recovery_cp != MaxSector) {
336	/*
337	* clear the REMOTE flag since we will launch
338	* resync thread in current node.
339	*/
340	clear_bit(nr: MD_RESYNCING_REMOTE,
341	addr: &mddev->recovery);
342	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
343	md_wakeup_thread(thread: mddev->thread);
344	}
345	}
346	clear_bit:
347	lockres_free(res: bm_lockres);
348	clear_bit(nr: slot, addr: &cinfo->recovery_map);
349	}
350	}
351
352	static void recover_prep(void *arg)
353	{
354	struct mddev *mddev = arg;
355	struct md_cluster_info *cinfo = mddev->cluster_info;
356	set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, addr: &cinfo->state);
357	}
358
359	static void __recover_slot(struct mddev mddev, int* slot)
360	{
361	struct md_cluster_info *cinfo = mddev->cluster_info;
362
363	set_bit(nr: slot, addr: &cinfo->recovery_map);
364	if (!cinfo->recovery_thread) {
365	rcu_assign_pointer(cinfo->recovery_thread,
366	md_register_thread(recover_bitmaps, mddev, "recover"));
367	if (!cinfo->recovery_thread) {
368	pr_warn("md-cluster: Could not create recovery thread\n");
369	return;
370	}
371	}
372	md_wakeup_thread(thread: cinfo->recovery_thread);
373	}
374
375	static void recover_slot(void arg, struct* dlm_slot *slot)
376	{
377	struct mddev *mddev = arg;
378	struct md_cluster_info *cinfo = mddev->cluster_info;
379
380	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
381	mddev->bitmap_info.cluster_name,
382	slot->nodeid, slot->slot,
383	cinfo->slot_number);
384	/ deduct one since dlm slot starts from one while the num of*
385	* cluster-md begins with 0 */
386	__recover_slot(mddev, slot: slot->slot - `1`);
387	}
388
389	static void recover_done(void arg, struct* dlm_slot *slots,
390	int num_slots, int our_slot,
391	uint32_t generation)
392	{
393	struct mddev *mddev = arg;
394	struct md_cluster_info *cinfo = mddev->cluster_info;
395
396	cinfo->slot_number = our_slot;
397	/ completion is only need to be complete when node join cluster,*
398	* it doesn't need to run during another node's failure */
399	if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
400	complete(&cinfo->completion);
401	clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, addr: &cinfo->state);
402	}
403	clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, addr: &cinfo->state);
404	}
405
406	/ the ops is called when node join the cluster, and do lock recovery*
407	* if node failure occurs */
408	static const struct dlm_lockspace_ops md_ls_ops = {
409	.recover_prep = recover_prep,
410	.recover_slot = recover_slot,
411	.recover_done = recover_done,
412	};
413
414	/*
415	* The BAST function for the ack lock resource
416	* This function wakes up the receive thread in
417	* order to receive and process the message.
418	*/
419	static void ack_bast(void arg, int* mode)
420	{
421	struct dlm_lock_resource *res = arg;
422	struct md_cluster_info *cinfo = res->mddev->cluster_info;
423
424	if (mode == DLM_LOCK_EX) {
425	if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state))
426	md_wakeup_thread(thread: cinfo->recv_thread);
427	else
428	set_bit(MD_CLUSTER_PENDING_RECV_EVENT, addr: &cinfo->state);
429	}
430	}
431
432	static void remove_suspend_info(struct mddev mddev, int* slot)
433	{
434	struct md_cluster_info *cinfo = mddev->cluster_info;
435	mddev->pers->quiesce(mddev, `1`);
436	spin_lock_irq(lock: &cinfo->suspend_lock);
437	cinfo->suspend_hi = `0`;
438	cinfo->suspend_lo = `0`;
439	spin_unlock_irq(lock: &cinfo->suspend_lock);
440	mddev->pers->quiesce(mddev, `0`);
441	}
442
443	static void process_suspend_info(struct mddev *mddev,
444	int slot, sector_t lo, sector_t hi)
445	{
446	struct md_cluster_info *cinfo = mddev->cluster_info;
447	struct mdp_superblock_1 *sb = NULL;
448	struct md_rdev *rdev;
449
450	if (!hi) {
451	/*
452	* clear the REMOTE flag since resync or recovery is finished
453	* in remote node.
454	*/
455	clear_bit(nr: MD_RESYNCING_REMOTE, addr: &mddev->recovery);
456	remove_suspend_info(mddev, slot);
457	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
458	md_wakeup_thread(thread: mddev->thread);
459	return;
460	}
461
462	rdev_for_each(rdev, mddev)
463	if (rdev->raid_disk > -`1` && !test_bit(Faulty, &rdev->flags)) {
464	sb = page_address(rdev->sb_page);
465	break;
466	}
467
468	/*
469	* The bitmaps are not same for different nodes
470	* if RESYNCING is happening in one node, then
471	* the node which received the RESYNCING message
472	* probably will perform resync with the region
473	* [lo, hi] again, so we could reduce resync time
474	* a lot if we can ensure that the bitmaps among
475	* different nodes are match up well.
476	*
477	* sync_low/hi is used to record the region which
478	* arrived in the previous RESYNCING message,
479	*
480	* Call md_bitmap_sync_with_cluster to clear NEEDED_MASK
481	* and set RESYNC_MASK since resync thread is running
482	* in another node, so we don't need to do the resync
483	* again with the same section.
484	*
485	* Skip md_bitmap_sync_with_cluster in case reshape
486	* happening, because reshaping region is small and
487	* we don't want to trigger lots of WARN.
488	*/
489	if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE))
490	md_bitmap_sync_with_cluster(mddev, old_lo: cinfo->sync_low,
491	old_hi: cinfo->sync_hi, new_lo: lo, new_hi: hi);
492	cinfo->sync_low = lo;
493	cinfo->sync_hi = hi;
494
495	mddev->pers->quiesce(mddev, `1`);
496	spin_lock_irq(lock: &cinfo->suspend_lock);
497	cinfo->suspend_from = slot;
498	cinfo->suspend_lo = lo;
499	cinfo->suspend_hi = hi;
500	spin_unlock_irq(lock: &cinfo->suspend_lock);
501	mddev->pers->quiesce(mddev, `0`);
502	}
503
504	static int process_add_new_disk(struct mddev mddev, struct* cluster_msg *cmsg)
505	{
506	char disk_uuid[`64`];
507	struct md_cluster_info *cinfo = mddev->cluster_info;
508	char event_name[] = "EVENT=ADD_DEVICE";
509	char raid_slot[`16`];
510	char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
511	int len;
512	int res = `0`;
513
514	len = snprintf(buf: disk_uuid, size: `64`, fmt: "DEVICE_UUID=");
515	sprintf(buf: disk_uuid + len, fmt: "%pU", cmsg->uuid);
516	snprintf(buf: raid_slot, size: `16`, fmt: "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
517	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
518	init_completion(x: &cinfo->newdisk_completion);
519	set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, addr: &cinfo->state);
520	kobject_uevent_env(kobj: &disk_to_dev(mddev->gendisk)->kobj, action: KOBJ_CHANGE, envp);
521	if (!wait_for_completion_timeout(x: &cinfo->newdisk_completion,
522	NEW_DEV_TIMEOUT)) {
523	pr_err("md-cluster(%s:%d): timeout on a new disk adding\n",
524	__func__, __LINE__);
525	res = -`1`;
526	}
527	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, addr: &cinfo->state);
528	return res;
529	}
530
531
532	static void process_metadata_update(struct mddev mddev, struct* cluster_msg *msg)
533	{
534	int got_lock = `0`;
535	struct md_thread *thread;
536	struct md_cluster_info *cinfo = mddev->cluster_info;
537	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
538
539	dlm_lock_sync(res: cinfo->no_new_dev_lockres, DLM_LOCK_CR);
540
541	/ daemaon thread must exist /
542	thread = rcu_dereference_protected(mddev->thread, true);
543	wait_event(thread->wqueue,
544	(got_lock = mddev_trylock(mddev)) \|\|
545	test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
546	md_reload_sb(mddev, raid_disk: mddev->good_device_nr);
547	if (got_lock)
548	mddev_unlock(mddev);
549	}
550
551	static void process_remove_disk(struct mddev mddev, struct* cluster_msg *msg)
552	{
553	struct md_rdev *rdev;
554
555	rcu_read_lock();
556	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
557	if (rdev) {
558	set_bit(nr: ClusterRemove, addr: &rdev->flags);
559	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
560	md_wakeup_thread(thread: mddev->thread);
561	}
562	else
563	pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
564	__func__, __LINE__, le32_to_cpu(msg->raid_slot));
565	rcu_read_unlock();
566	}
567
568	static void process_readd_disk(struct mddev mddev, struct* cluster_msg *msg)
569	{
570	struct md_rdev *rdev;
571
572	rcu_read_lock();
573	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
574	if (rdev && test_bit(Faulty, &rdev->flags))
575	clear_bit(nr: Faulty, addr: &rdev->flags);
576	else
577	pr_warn("%s: %d Could not find disk(%d) which is faulty",
578	__func__, __LINE__, le32_to_cpu(msg->raid_slot));
579	rcu_read_unlock();
580	}
581
582	static int process_recvd_msg(struct mddev mddev, struct* cluster_msg *msg)
583	{
584	int ret = `0`;
585
586	if (WARN(mddev->cluster_info->slot_number - `1` == le32_to_cpu(msg->slot),
587	"node %d received its own msg\n", le32_to_cpu(msg->slot)))
588	return -`1`;
589	switch (le32_to_cpu(msg->type)) {
590	case METADATA_UPDATED:
591	process_metadata_update(mddev, msg);
592	break;
593	case CHANGE_CAPACITY:
594	set_capacity_and_notify(disk: mddev->gendisk, size: mddev->array_sectors);
595	break;
596	case RESYNCING:
597	set_bit(nr: MD_RESYNCING_REMOTE, addr: &mddev->recovery);
598	process_suspend_info(mddev, le32_to_cpu(msg->slot),
599	le64_to_cpu(msg->low),
600	le64_to_cpu(msg->high));
601	break;
602	case NEWDISK:
603	if (process_add_new_disk(mddev, cmsg: msg))
604	ret = -`1`;
605	break;
606	case REMOVE:
607	process_remove_disk(mddev, msg);
608	break;
609	case RE_ADD:
610	process_readd_disk(mddev, msg);
611	break;
612	case BITMAP_NEEDS_SYNC:
613	__recover_slot(mddev, le32_to_cpu(msg->slot));
614	break;
615	case BITMAP_RESIZE:
616	if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, `0`, `0`))
617	ret = md_bitmap_resize(bitmap: mddev->bitmap,
618	le64_to_cpu(msg->high), chunksize: `0`, init: `0`);
619	break;
620	default:
621	ret = -`1`;
622	pr_warn("%s:%d Received unknown message from %d\n",
623	__func__, __LINE__, msg->slot);
624	}
625	return ret;
626	}
627
628	/*
629	* thread for receiving message
630	*/
631	static void recv_daemon(struct md_thread *thread)
632	{
633	struct md_cluster_info *cinfo = thread->mddev->cluster_info;
634	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
635	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
636	struct cluster_msg msg;
637	int ret;
638
639	mutex_lock(&cinfo->recv_mutex);
640	/get CR on Message/
641	if (dlm_lock_sync(res: message_lockres, DLM_LOCK_CR)) {
642	pr_err("md/raid1:failed to get CR on MESSAGE\n");
643	mutex_unlock(lock: &cinfo->recv_mutex);
644	return;
645	}
646
647	/ read lvb and wake up thread to process this message_lockres /
648	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
649	ret = process_recvd_msg(mddev: thread->mddev, msg: &msg);
650	if (ret)
651	goto out;
652
653	/release CR on ack_lockres/
654	ret = dlm_unlock_sync(res: ack_lockres);
655	if (unlikely(ret != `0`))
656	pr_info("unlock ack failed return %d\n", ret);
657	/up-convert to PR on message_lockres/
658	ret = dlm_lock_sync(res: message_lockres, DLM_LOCK_PR);
659	if (unlikely(ret != `0`))
660	pr_info("lock PR on msg failed return %d\n", ret);
661	/get CR on ack_lockres again/
662	ret = dlm_lock_sync(res: ack_lockres, DLM_LOCK_CR);
663	if (unlikely(ret != `0`))
664	pr_info("lock CR on ack failed return %d\n", ret);
665	out:
666	/release CR on message_lockres/
667	ret = dlm_unlock_sync(res: message_lockres);
668	if (unlikely(ret != `0`))
669	pr_info("unlock msg failed return %d\n", ret);
670	mutex_unlock(lock: &cinfo->recv_mutex);
671	}
672
673	/ lock_token()*
674	* Takes the lock on the TOKEN lock resource so no other
675	* node can communicate while the operation is underway.
676	*/
677	static int lock_token(struct md_cluster_info *cinfo)
678	{
679	int error;
680
681	error = dlm_lock_sync(res: cinfo->token_lockres, DLM_LOCK_EX);
682	if (error) {
683	pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
684	__func__, __LINE__, error);
685	} else {
686	/ Lock the receive sequence /
687	mutex_lock(&cinfo->recv_mutex);
688	}
689	return error;
690	}
691
692	/ lock_comm()*
693	* Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
694	*/
695	static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
696	{
697	int rv, set_bit = `0`;
698	struct mddev *mddev = cinfo->mddev;
699
700	/*
701	* If resync thread run after raid1d thread, then process_metadata_update
702	* could not continue if raid1d held reconfig_mutex (and raid1d is blocked
703	* since another node already got EX on Token and waiting the EX of Ack),
704	* so let resync wake up thread in case flag is set.
705	*/
706	if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
707	&cinfo->state)) {
708	rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
709	addr: &cinfo->state);
710	WARN_ON_ONCE(rv);
711	md_wakeup_thread(thread: mddev->thread);
712	set_bit = `1`;
713	}
714
715	wait_event(cinfo->wait,
716	!test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
717	rv = lock_token(cinfo);
718	if (set_bit)
719	clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, addr: &cinfo->state);
720	return rv;
721	}
722
723	static void unlock_comm(struct md_cluster_info *cinfo)
724	{
725	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
726	mutex_unlock(lock: &cinfo->recv_mutex);
727	dlm_unlock_sync(res: cinfo->token_lockres);
728	clear_bit(MD_CLUSTER_SEND_LOCK, addr: &cinfo->state);
729	wake_up(&cinfo->wait);
730	}
731
732	/ __sendmsg()*
733	* This function performs the actual sending of the message. This function is
734	* usually called after performing the encompassing operation
735	* The function:
736	* 1. Grabs the message lockresource in EX mode
737	* 2. Copies the message to the message LVB
738	* 3. Downconverts message lockresource to CW
739	* 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
740	* and the other nodes read the message. The thread will wait here until all other
741	* nodes have released ack lock resource.
742	* 5. Downconvert ack lockresource to CR
743	*/
744	static int __sendmsg(struct md_cluster_info cinfo, struct* cluster_msg *cmsg)
745	{
746	int error;
747	int slot = cinfo->slot_number - `1`;
748
749	cmsg->slot = cpu_to_le32(slot);
750	/get EX on Message/
751	error = dlm_lock_sync(res: cinfo->message_lockres, DLM_LOCK_EX);
752	if (error) {
753	pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
754	goto failed_message;
755	}
756
757	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
758	sizeof(struct cluster_msg));
759	/down-convert EX to CW on Message/
760	error = dlm_lock_sync(res: cinfo->message_lockres, DLM_LOCK_CW);
761	if (error) {
762	pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
763	error);
764	goto failed_ack;
765	}
766
767	/up-convert CR to EX on Ack/
768	error = dlm_lock_sync(res: cinfo->ack_lockres, DLM_LOCK_EX);
769	if (error) {
770	pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
771	error);
772	goto failed_ack;
773	}
774
775	/down-convert EX to CR on Ack/
776	error = dlm_lock_sync(res: cinfo->ack_lockres, DLM_LOCK_CR);
777	if (error) {
778	pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
779	error);
780	goto failed_ack;
781	}
782
783	failed_ack:
784	error = dlm_unlock_sync(res: cinfo->message_lockres);
785	if (unlikely(error != `0`)) {
786	pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
787	error);
788	/ in case the message can't be released due to some reason /
789	goto failed_ack;
790	}
791	failed_message:
792	return error;
793	}
794
795	static int sendmsg(struct md_cluster_info cinfo, struct* cluster_msg *cmsg,
796	bool mddev_locked)
797	{
798	int ret;
799
800	ret = lock_comm(cinfo, mddev_locked);
801	if (!ret) {
802	ret = __sendmsg(cinfo, cmsg);
803	unlock_comm(cinfo);
804	}
805	return ret;
806	}
807
808	static int gather_all_resync_info(struct mddev mddev, int* total_slots)
809	{
810	struct md_cluster_info *cinfo = mddev->cluster_info;
811	int i, ret = `0`;
812	struct dlm_lock_resource *bm_lockres;
813	char str[`64`];
814	sector_t lo, hi;
815
816
817	for (i = `0`; i < total_slots; i++) {
818	memset(str, `'\0'`, `64`);
819	snprintf(buf: str, size: `64`, fmt: "bitmap%04d", i);
820	bm_lockres = lockres_init(mddev, name: str, NULL, with_lvb: `1`);
821	if (!bm_lockres)
822	return -ENOMEM;
823	if (i == (cinfo->slot_number - `1`)) {
824	lockres_free(res: bm_lockres);
825	continue;
826	}
827
828	bm_lockres->flags \|= DLM_LKF_NOQUEUE;
829	ret = dlm_lock_sync(res: bm_lockres, DLM_LOCK_PW);
830	if (ret == -EAGAIN) {
831	if (read_resync_info(mddev, lockres: bm_lockres)) {
832	pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
833	__func__, __LINE__,
834	(unsigned long long) cinfo->suspend_lo,
835	(unsigned long long) cinfo->suspend_hi,
836	i);
837	cinfo->suspend_from = i;
838	}
839	ret = `0`;
840	lockres_free(res: bm_lockres);
841	continue;
842	}
843	if (ret) {
844	lockres_free(res: bm_lockres);
845	goto out;
846	}
847
848	/ Read the disk bitmap sb and check if it needs recovery /
849	ret = md_bitmap_copy_from_slot(mddev, slot: i, lo: &lo, hi: &hi, clear_bits: false);
850	if (ret) {
851	pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
852	lockres_free(res: bm_lockres);
853	continue;
854	}
855	if ((hi > `0`) && (lo < mddev->recovery_cp)) {
856	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
857	mddev->recovery_cp = lo;
858	md_check_recovery(mddev);
859	}
860
861	lockres_free(res: bm_lockres);
862	}
863	out:
864	return ret;
865	}
866
867	static int join(struct mddev mddev, int* nodes)
868	{
869	struct md_cluster_info *cinfo;
870	int ret, ops_rv;
871	char str[`64`];
872
873	cinfo = kzalloc(size: sizeof(struct md_cluster_info), GFP_KERNEL);
874	if (!cinfo)
875	return -ENOMEM;
876
877	INIT_LIST_HEAD(list: &cinfo->suspend_list);
878	spin_lock_init(&cinfo->suspend_lock);
879	init_completion(x: &cinfo->completion);
880	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, addr: &cinfo->state);
881	init_waitqueue_head(&cinfo->wait);
882	mutex_init(&cinfo->recv_mutex);
883
884	mddev->cluster_info = cinfo;
885	cinfo->mddev = mddev;
886
887	memset(str, `0`, `64`);
888	sprintf(buf: str, fmt: "%pU", mddev->uuid);
889	ret = dlm_new_lockspace(name: str, cluster: mddev->bitmap_info.cluster_name,
890	flags: `0`, LVB_SIZE, ops: &md_ls_ops, ops_arg: mddev,
891	ops_result: &ops_rv, lockspace: &cinfo->lockspace);
892	if (ret)
893	goto err;
894	wait_for_completion(&cinfo->completion);
895	if (nodes < cinfo->slot_number) {
896	pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).",
897	cinfo->slot_number, nodes);
898	ret = -ERANGE;
899	goto err;
900	}
901	/ Initiate the communication resources /
902	ret = -ENOMEM;
903	rcu_assign_pointer(cinfo->recv_thread,
904	md_register_thread(recv_daemon, mddev, "cluster_recv"));
905	if (!cinfo->recv_thread) {
906	pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
907	goto err;
908	}
909	cinfo->message_lockres = lockres_init(mddev, name: "message", NULL, with_lvb: `1`);
910	if (!cinfo->message_lockres)
911	goto err;
912	cinfo->token_lockres = lockres_init(mddev, name: "token", NULL, with_lvb: `0`);
913	if (!cinfo->token_lockres)
914	goto err;
915	cinfo->no_new_dev_lockres = lockres_init(mddev, name: "no-new-dev", NULL, with_lvb: `0`);
916	if (!cinfo->no_new_dev_lockres)
917	goto err;
918
919	ret = dlm_lock_sync(res: cinfo->token_lockres, DLM_LOCK_EX);
920	if (ret) {
921	ret = -EAGAIN;
922	pr_err("md-cluster: can't join cluster to avoid lock issue\n");
923	goto err;
924	}
925	cinfo->ack_lockres = lockres_init(mddev, name: "ack", bastfn: ack_bast, with_lvb: `0`);
926	if (!cinfo->ack_lockres) {
927	ret = -ENOMEM;
928	goto err;
929	}
930	/ get sync CR lock on ACK. /
931	if (dlm_lock_sync(res: cinfo->ack_lockres, DLM_LOCK_CR))
932	pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
933	ret);
934	dlm_unlock_sync(res: cinfo->token_lockres);
935	/ get sync CR lock on no-new-dev. /
936	if (dlm_lock_sync(res: cinfo->no_new_dev_lockres, DLM_LOCK_CR))
937	pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);
938
939
940	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
941	snprintf(buf: str, size: `64`, fmt: "bitmap%04d", cinfo->slot_number - `1`);
942	cinfo->bitmap_lockres = lockres_init(mddev, name: str, NULL, with_lvb: `1`);
943	if (!cinfo->bitmap_lockres) {
944	ret = -ENOMEM;
945	goto err;
946	}
947	if (dlm_lock_sync(res: cinfo->bitmap_lockres, DLM_LOCK_PW)) {
948	pr_err("Failed to get bitmap lock\n");
949	ret = -EINVAL;
950	goto err;
951	}
952
953	cinfo->resync_lockres = lockres_init(mddev, name: "resync", NULL, with_lvb: `0`);
954	if (!cinfo->resync_lockres) {
955	ret = -ENOMEM;
956	goto err;
957	}
958
959	return `0`;
960	err:
961	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, addr: &cinfo->state);
962	md_unregister_thread(mddev, threadp: &cinfo->recovery_thread);
963	md_unregister_thread(mddev, threadp: &cinfo->recv_thread);
964	lockres_free(res: cinfo->message_lockres);
965	lockres_free(res: cinfo->token_lockres);
966	lockres_free(res: cinfo->ack_lockres);
967	lockres_free(res: cinfo->no_new_dev_lockres);
968	lockres_free(res: cinfo->resync_lockres);
969	lockres_free(res: cinfo->bitmap_lockres);
970	if (cinfo->lockspace)
971	dlm_release_lockspace(lockspace: cinfo->lockspace, force: `2`);
972	mddev->cluster_info = NULL;
973	kfree(objp: cinfo);
974	return ret;
975	}
976
977	static void load_bitmaps(struct mddev mddev, int* total_slots)
978	{
979	struct md_cluster_info *cinfo = mddev->cluster_info;
980
981	/ load all the node's bitmap info for resync /
982	if (gather_all_resync_info(mddev, total_slots))
983	pr_err("md-cluster: failed to gather all resyn infos\n");
984	set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, addr: &cinfo->state);
985	/ wake up recv thread in case something need to be handled /
986	if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, addr: &cinfo->state))
987	md_wakeup_thread(thread: cinfo->recv_thread);
988	}
989
990	static void resync_bitmap(struct mddev *mddev)
991	{
992	struct md_cluster_info *cinfo = mddev->cluster_info;
993	struct cluster_msg cmsg = {`0`};
994	int err;
995
996	cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
997	err = sendmsg(cinfo, cmsg: &cmsg, mddev_locked: `1`);
998	if (err)
999	pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
1000	__func__, __LINE__, err);
1001	}
1002
1003	static void unlock_all_bitmaps(struct mddev *mddev);
1004	static int leave(struct mddev *mddev)
1005	{
1006	struct md_cluster_info *cinfo = mddev->cluster_info;
1007
1008	if (!cinfo)
1009	return `0`;
1010
1011	/*
1012	* BITMAP_NEEDS_SYNC message should be sent when node
1013	* is leaving the cluster with dirty bitmap, also we
1014	* can only deliver it when dlm connection is available.
1015	*
1016	* Also, we should send BITMAP_NEEDS_SYNC message in
1017	* case reshaping is interrupted.
1018	*/
1019	if ((cinfo->slot_number > `0` && mddev->recovery_cp != MaxSector) \|\|
1020	(mddev->reshape_position != MaxSector &&
1021	test_bit(MD_CLOSING, &mddev->flags)))
1022	resync_bitmap(mddev);
1023
1024	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, addr: &cinfo->state);
1025	md_unregister_thread(mddev, threadp: &cinfo->recovery_thread);
1026	md_unregister_thread(mddev, threadp: &cinfo->recv_thread);
1027	lockres_free(res: cinfo->message_lockres);
1028	lockres_free(res: cinfo->token_lockres);
1029	lockres_free(res: cinfo->ack_lockres);
1030	lockres_free(res: cinfo->no_new_dev_lockres);
1031	lockres_free(res: cinfo->resync_lockres);
1032	lockres_free(res: cinfo->bitmap_lockres);
1033	unlock_all_bitmaps(mddev);
1034	dlm_release_lockspace(lockspace: cinfo->lockspace, force: `2`);
1035	kfree(objp: cinfo);
1036	return `0`;
1037	}
1038
1039	/ slot_number(): Returns the MD slot number to use*
1040	* DLM starts the slot numbers from 1, wheras cluster-md
1041	* wants the number to be from zero, so we deduct one
1042	*/
1043	static int slot_number(struct mddev *mddev)
1044	{
1045	struct md_cluster_info *cinfo = mddev->cluster_info;
1046
1047	return cinfo->slot_number - `1`;
1048	}
1049
1050	/*
1051	* Check if the communication is already locked, else lock the communication
1052	* channel.
1053	* If it is already locked, token is in EX mode, and hence lock_token()
1054	* should not be called.
1055	*/
1056	static int metadata_update_start(struct mddev *mddev)
1057	{
1058	struct md_cluster_info *cinfo = mddev->cluster_info;
1059	int ret;
1060
1061	/*
1062	* metadata_update_start is always called with the protection of
1063	* reconfig_mutex, so set WAITING_FOR_TOKEN here.
1064	*/
1065	ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
1066	addr: &cinfo->state);
1067	WARN_ON_ONCE(ret);
1068	md_wakeup_thread(thread: mddev->thread);
1069
1070	wait_event(cinfo->wait,
1071	!test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) \|\|
1072	test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));
1073
1074	/ If token is already locked, return 0 /
1075	if (cinfo->token_lockres->mode == DLM_LOCK_EX) {
1076	clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, addr: &cinfo->state);
1077	return `0`;
1078	}
1079
1080	ret = lock_token(cinfo);
1081	clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, addr: &cinfo->state);
1082	return ret;
1083	}
1084
1085	static int metadata_update_finish(struct mddev *mddev)
1086	{
1087	struct md_cluster_info *cinfo = mddev->cluster_info;
1088	struct cluster_msg cmsg;
1089	struct md_rdev *rdev;
1090	int ret = `0`;
1091	int raid_slot = -`1`;
1092
1093	memset(&cmsg, `0`, sizeof(cmsg));
1094	cmsg.type = cpu_to_le32(METADATA_UPDATED);
1095	/ Pick up a good active device number to send.*
1096	*/
1097	rdev_for_each(rdev, mddev)
1098	if (rdev->raid_disk > -`1` && !test_bit(Faulty, &rdev->flags)) {
1099	raid_slot = rdev->desc_nr;
1100	break;
1101	}
1102	if (raid_slot >= `0`) {
1103	cmsg.raid_slot = cpu_to_le32(raid_slot);
1104	ret = __sendmsg(cinfo, cmsg: &cmsg);
1105	} else
1106	pr_warn("md-cluster: No good device id found to send\n");
1107	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, addr: &cinfo->state);
1108	unlock_comm(cinfo);
1109	return ret;
1110	}
1111
1112	static void metadata_update_cancel(struct mddev *mddev)
1113	{
1114	struct md_cluster_info *cinfo = mddev->cluster_info;
1115	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, addr: &cinfo->state);
1116	unlock_comm(cinfo);
1117	}
1118
1119	static int update_bitmap_size(struct mddev *mddev, sector_t size)
1120	{
1121	struct md_cluster_info *cinfo = mddev->cluster_info;
1122	struct cluster_msg cmsg = {`0`};
1123	int ret;
1124
1125	cmsg.type = cpu_to_le32(BITMAP_RESIZE);
1126	cmsg.high = cpu_to_le64(size);
1127	ret = sendmsg(cinfo, cmsg: &cmsg, mddev_locked: `0`);
1128	if (ret)
1129	pr_err("%s:%d: failed to send BITMAP_RESIZE message (%d)\n",
1130	__func__, __LINE__, ret);
1131	return ret;
1132	}
1133
1134	static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
1135	{
1136	struct bitmap_counts *counts;
1137	char str[`64`];
1138	struct dlm_lock_resource *bm_lockres;
1139	struct bitmap *bitmap = mddev->bitmap;
1140	unsigned long my_pages = bitmap->counts.pages;
1141	int i, rv;
1142
1143	/*
1144	* We need to ensure all the nodes can grow to a larger
1145	* bitmap size before make the reshaping.
1146	*/
1147	rv = update_bitmap_size(mddev, size: newsize);
1148	if (rv)
1149	return rv;
1150
1151	for (i = `0`; i < mddev->bitmap_info.nodes; i++) {
1152	if (i == md_cluster_ops->slot_number(mddev))
1153	continue;
1154
1155	bitmap = get_bitmap_from_slot(mddev, slot: i);
1156	if (IS_ERR(ptr: bitmap)) {
1157	pr_err("can't get bitmap from slot %d\n", i);
1158	bitmap = NULL;
1159	goto out;
1160	}
1161	counts = &bitmap->counts;
1162
1163	/*
1164	* If we can hold the bitmap lock of one node then
1165	* the slot is not occupied, update the pages.
1166	*/
1167	snprintf(buf: str, size: `64`, fmt: "bitmap%04d", i);
1168	bm_lockres = lockres_init(mddev, name: str, NULL, with_lvb: `1`);
1169	if (!bm_lockres) {
1170	pr_err("Cannot initialize %s lock\n", str);
1171	goto out;
1172	}
1173	bm_lockres->flags \|= DLM_LKF_NOQUEUE;
1174	rv = dlm_lock_sync(res: bm_lockres, DLM_LOCK_PW);
1175	if (!rv)
1176	counts->pages = my_pages;
1177	lockres_free(res: bm_lockres);
1178
1179	if (my_pages != counts->pages)
1180	/*
1181	* Let's revert the bitmap size if one node
1182	* can't resize bitmap
1183	*/
1184	goto out;
1185	md_bitmap_free(bitmap);
1186	}
1187
1188	return `0`;
1189	out:
1190	md_bitmap_free(bitmap);
1191	update_bitmap_size(mddev, size: oldsize);
1192	return -`1`;
1193	}
1194
1195	/*
1196	* return 0 if all the bitmaps have the same sync_size
1197	*/
1198	static int cluster_check_sync_size(struct mddev *mddev)
1199	{
1200	int i, rv;
1201	bitmap_super_t *sb;
1202	unsigned long my_sync_size, sync_size = `0`;
1203	int node_num = mddev->bitmap_info.nodes;
1204	int current_slot = md_cluster_ops->slot_number(mddev);
1205	struct bitmap *bitmap = mddev->bitmap;
1206	char str[`64`];
1207	struct dlm_lock_resource *bm_lockres;
1208
1209	sb = kmap_atomic(page: bitmap->storage.sb_page);
1210	my_sync_size = sb->sync_size;
1211	kunmap_atomic(sb);
1212
1213	for (i = `0`; i < node_num; i++) {
1214	if (i == current_slot)
1215	continue;
1216
1217	bitmap = get_bitmap_from_slot(mddev, slot: i);
1218	if (IS_ERR(ptr: bitmap)) {
1219	pr_err("can't get bitmap from slot %d\n", i);
1220	return -`1`;
1221	}
1222
1223	/*
1224	* If we can hold the bitmap lock of one node then
1225	* the slot is not occupied, update the sb.
1226	*/
1227	snprintf(buf: str, size: `64`, fmt: "bitmap%04d", i);
1228	bm_lockres = lockres_init(mddev, name: str, NULL, with_lvb: `1`);
1229	if (!bm_lockres) {
1230	pr_err("md-cluster: Cannot initialize %s\n", str);
1231	md_bitmap_free(bitmap);
1232	return -`1`;
1233	}
1234	bm_lockres->flags \|= DLM_LKF_NOQUEUE;
1235	rv = dlm_lock_sync(res: bm_lockres, DLM_LOCK_PW);
1236	if (!rv)
1237	md_bitmap_update_sb(bitmap);
1238	lockres_free(res: bm_lockres);
1239
1240	sb = kmap_atomic(page: bitmap->storage.sb_page);
1241	if (sync_size == `0`)
1242	sync_size = sb->sync_size;
1243	else if (sync_size != sb->sync_size) {
1244	kunmap_atomic(sb);
1245	md_bitmap_free(bitmap);
1246	return -`1`;
1247	}
1248	kunmap_atomic(sb);
1249	md_bitmap_free(bitmap);
1250	}
1251
1252	return (my_sync_size == sync_size) ? `0` : -`1`;
1253	}
1254
1255	/*
1256	* Update the size for cluster raid is a little more complex, we perform it
1257	* by the steps:
1258	* 1. hold token lock and update superblock in initiator node.
1259	* 2. send METADATA_UPDATED msg to other nodes.
1260	* 3. The initiator node continues to check each bitmap's sync_size, if all
1261	* bitmaps have the same value of sync_size, then we can set capacity and
1262	* let other nodes to perform it. If one node can't update sync_size
1263	* accordingly, we need to revert to previous value.
1264	*/
1265	static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
1266	{
1267	struct md_cluster_info *cinfo = mddev->cluster_info;
1268	struct cluster_msg cmsg;
1269	struct md_rdev *rdev;
1270	int ret = `0`;
1271	int raid_slot = -`1`;
1272
1273	md_update_sb(mddev, force: `1`);
1274	if (lock_comm(cinfo, mddev_locked: `1`)) {
1275	pr_err("%s: lock_comm failed\n", __func__);
1276	return;
1277	}
1278
1279	memset(&cmsg, `0`, sizeof(cmsg));
1280	cmsg.type = cpu_to_le32(METADATA_UPDATED);
1281	rdev_for_each(rdev, mddev)
1282	if (rdev->raid_disk >= `0` && !test_bit(Faulty, &rdev->flags)) {
1283	raid_slot = rdev->desc_nr;
1284	break;
1285	}
1286	if (raid_slot >= `0`) {
1287	cmsg.raid_slot = cpu_to_le32(raid_slot);
1288	/*
1289	* We can only change capiticy after all the nodes can do it,
1290	* so need to wait after other nodes already received the msg
1291	* and handled the change
1292	*/
1293	ret = __sendmsg(cinfo, cmsg: &cmsg);
1294	if (ret) {
1295	pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
1296	__func__, __LINE__);
1297	unlock_comm(cinfo);
1298	return;
1299	}
1300	} else {
1301	pr_err("md-cluster: No good device id found to send\n");
1302	unlock_comm(cinfo);
1303	return;
1304	}
1305
1306	/*
1307	* check the sync_size from other node's bitmap, if sync_size
1308	* have already updated in other nodes as expected, send an
1309	* empty metadata msg to permit the change of capacity
1310	*/
1311	if (cluster_check_sync_size(mddev) == `0`) {
1312	memset(&cmsg, `0`, sizeof(cmsg));
1313	cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
1314	ret = __sendmsg(cinfo, cmsg: &cmsg);
1315	if (ret)
1316	pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
1317	__func__, __LINE__);
1318	set_capacity_and_notify(disk: mddev->gendisk, size: mddev->array_sectors);
1319	} else {
1320	/ revert to previous sectors /
1321	ret = mddev->pers->resize(mddev, old_dev_sectors);
1322	ret = __sendmsg(cinfo, cmsg: &cmsg);
1323	if (ret)
1324	pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
1325	__func__, __LINE__);
1326	}
1327	unlock_comm(cinfo);
1328	}
1329
1330	static int resync_start(struct mddev *mddev)
1331	{
1332	struct md_cluster_info *cinfo = mddev->cluster_info;
1333	return dlm_lock_sync_interruptible(res: cinfo->resync_lockres, DLM_LOCK_EX, mddev);
1334	}
1335
1336	static void resync_info_get(struct mddev mddev, sector_t lo, sector_t *hi)
1337	{
1338	struct md_cluster_info *cinfo = mddev->cluster_info;
1339
1340	spin_lock_irq(lock: &cinfo->suspend_lock);
1341	*lo = cinfo->suspend_lo;
1342	*hi = cinfo->suspend_hi;
1343	spin_unlock_irq(lock: &cinfo->suspend_lock);
1344	}
1345
1346	static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
1347	{
1348	struct md_cluster_info *cinfo = mddev->cluster_info;
1349	struct resync_info ri;
1350	struct cluster_msg cmsg = {`0`};
1351
1352	/ do not send zero again, if we have sent before /
1353	if (hi == `0`) {
1354	memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
1355	if (le64_to_cpu(ri.hi) == `0`)
1356	return `0`;
1357	}
1358
1359	add_resync_info(lockres: cinfo->bitmap_lockres, lo, hi);
1360	/ Re-acquire the lock to refresh LVB /
1361	dlm_lock_sync(res: cinfo->bitmap_lockres, DLM_LOCK_PW);
1362	cmsg.type = cpu_to_le32(RESYNCING);
1363	cmsg.low = cpu_to_le64(lo);
1364	cmsg.high = cpu_to_le64(hi);
1365
1366	/*
1367	* mddev_lock is held if resync_info_update is called from
1368	* resync_finish (md_reap_sync_thread -> resync_finish)
1369	*/
1370	if (lo == `0` && hi == `0`)
1371	return sendmsg(cinfo, cmsg: &cmsg, mddev_locked: `1`);
1372	else
1373	return sendmsg(cinfo, cmsg: &cmsg, mddev_locked: `0`);
1374	}
1375
1376	static int resync_finish(struct mddev *mddev)
1377	{
1378	struct md_cluster_info *cinfo = mddev->cluster_info;
1379	int ret = `0`;
1380
1381	clear_bit(nr: MD_RESYNCING_REMOTE, addr: &mddev->recovery);
1382
1383	/*
1384	* If resync thread is interrupted so we can't say resync is finished,
1385	* another node will launch resync thread to continue.
1386	*/
1387	if (!test_bit(MD_CLOSING, &mddev->flags))
1388	ret = resync_info_update(mddev, lo: `0`, hi: `0`);
1389	dlm_unlock_sync(res: cinfo->resync_lockres);
1390	return ret;
1391	}
1392
1393	static int area_resyncing(struct mddev mddev, int* direction,
1394	sector_t lo, sector_t hi)
1395	{
1396	struct md_cluster_info *cinfo = mddev->cluster_info;
1397	int ret = `0`;
1398
1399	if ((direction == READ) &&
1400	test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
1401	return `1`;
1402
1403	spin_lock_irq(lock: &cinfo->suspend_lock);
1404	if (hi > cinfo->suspend_lo && lo < cinfo->suspend_hi)
1405	ret = `1`;
1406	spin_unlock_irq(lock: &cinfo->suspend_lock);
1407	return ret;
1408	}
1409
1410	/ add_new_disk() - initiates a disk add*
1411	* However, if this fails before writing md_update_sb(),
1412	* add_new_disk_cancel() must be called to release token lock
1413	*/
1414	static int add_new_disk(struct mddev mddev, struct* md_rdev *rdev)
1415	{
1416	struct md_cluster_info *cinfo = mddev->cluster_info;
1417	struct cluster_msg cmsg;
1418	int ret = `0`;
1419	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1420	char *uuid = sb->device_uuid;
1421
1422	memset(&cmsg, `0`, sizeof(cmsg));
1423	cmsg.type = cpu_to_le32(NEWDISK);
1424	memcpy(cmsg.uuid, uuid, `16`);
1425	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
1426	if (lock_comm(cinfo, mddev_locked: `1`))
1427	return -EAGAIN;
1428	ret = __sendmsg(cinfo, cmsg: &cmsg);
1429	if (ret) {
1430	unlock_comm(cinfo);
1431	return ret;
1432	}
1433	cinfo->no_new_dev_lockres->flags \|= DLM_LKF_NOQUEUE;
1434	ret = dlm_lock_sync(res: cinfo->no_new_dev_lockres, DLM_LOCK_EX);
1435	cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
1436	/ Some node does not "see" the device /
1437	if (ret == -EAGAIN)
1438	ret = -ENOENT;
1439	if (ret)
1440	unlock_comm(cinfo);
1441	else {
1442	dlm_lock_sync(res: cinfo->no_new_dev_lockres, DLM_LOCK_CR);
1443	/ Since MD_CHANGE_DEVS will be set in add_bound_rdev which*
1444	* will run soon after add_new_disk, the below path will be
1445	* invoked:
1446	* md_wakeup_thread(mddev->thread)
1447	* -> conf->thread (raid1d)
1448	* -> md_check_recovery -> md_update_sb
1449	* -> metadata_update_start/finish
1450	* MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually.
1451	*
1452	* For other failure cases, metadata_update_cancel and
1453	* add_new_disk_cancel also clear below bit as well.
1454	* */
1455	set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, addr: &cinfo->state);
1456	wake_up(&cinfo->wait);
1457	}
1458	return ret;
1459	}
1460
1461	static void add_new_disk_cancel(struct mddev *mddev)
1462	{
1463	struct md_cluster_info *cinfo = mddev->cluster_info;
1464	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, addr: &cinfo->state);
1465	unlock_comm(cinfo);
1466	}
1467
1468	static int new_disk_ack(struct mddev *mddev, bool ack)
1469	{
1470	struct md_cluster_info *cinfo = mddev->cluster_info;
1471
1472	if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
1473	pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
1474	return -EINVAL;
1475	}
1476
1477	if (ack)
1478	dlm_unlock_sync(res: cinfo->no_new_dev_lockres);
1479	complete(&cinfo->newdisk_completion);
1480	return `0`;
1481	}
1482
1483	static int remove_disk(struct mddev mddev, struct* md_rdev *rdev)
1484	{
1485	struct cluster_msg cmsg = {`0`};
1486	struct md_cluster_info *cinfo = mddev->cluster_info;
1487	cmsg.type = cpu_to_le32(REMOVE);
1488	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
1489	return sendmsg(cinfo, cmsg: &cmsg, mddev_locked: `1`);
1490	}
1491
1492	static int lock_all_bitmaps(struct mddev *mddev)
1493	{
1494	int slot, my_slot, ret, held = `1`, i = `0`;
1495	char str[`64`];
1496	struct md_cluster_info *cinfo = mddev->cluster_info;
1497
1498	cinfo->other_bitmap_lockres =
1499	kcalloc(n: mddev->bitmap_info.nodes - `1`,
1500	size: sizeof(struct dlm_lock_resource *), GFP_KERNEL);
1501	if (!cinfo->other_bitmap_lockres) {
1502	pr_err("md: can't alloc mem for other bitmap locks\n");
1503	return `0`;
1504	}
1505
1506	my_slot = slot_number(mddev);
1507	for (slot = `0`; slot < mddev->bitmap_info.nodes; slot++) {
1508	if (slot == my_slot)
1509	continue;
1510
1511	memset(str, `'\0'`, `64`);
1512	snprintf(buf: str, size: `64`, fmt: "bitmap%04d", slot);
1513	cinfo->other_bitmap_lockres[i] = lockres_init(mddev, name: str, NULL, with_lvb: `1`);
1514	if (!cinfo->other_bitmap_lockres[i])
1515	return -ENOMEM;
1516
1517	cinfo->other_bitmap_lockres[i]->flags \|= DLM_LKF_NOQUEUE;
1518	ret = dlm_lock_sync(res: cinfo->other_bitmap_lockres[i], DLM_LOCK_PW);
1519	if (ret)
1520	held = -`1`;
1521	i++;
1522	}
1523
1524	return held;
1525	}
1526
1527	static void unlock_all_bitmaps(struct mddev *mddev)
1528	{
1529	struct md_cluster_info *cinfo = mddev->cluster_info;
1530	int i;
1531
1532	/ release other node's bitmap lock if they are existed /
1533	if (cinfo->other_bitmap_lockres) {
1534	for (i = `0`; i < mddev->bitmap_info.nodes - `1`; i++) {
1535	if (cinfo->other_bitmap_lockres[i]) {
1536	lockres_free(res: cinfo->other_bitmap_lockres[i]);
1537	}
1538	}
1539	kfree(objp: cinfo->other_bitmap_lockres);
1540	cinfo->other_bitmap_lockres = NULL;
1541	}
1542	}
1543
1544	static int gather_bitmaps(struct md_rdev *rdev)
1545	{
1546	int sn, err;
1547	sector_t lo, hi;
1548	struct cluster_msg cmsg = {`0`};
1549	struct mddev *mddev = rdev->mddev;
1550	struct md_cluster_info *cinfo = mddev->cluster_info;
1551
1552	cmsg.type = cpu_to_le32(RE_ADD);
1553	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
1554	err = sendmsg(cinfo, cmsg: &cmsg, mddev_locked: `1`);
1555	if (err)
1556	goto out;
1557
1558	for (sn = `0`; sn < mddev->bitmap_info.nodes; sn++) {
1559	if (sn == (cinfo->slot_number - `1`))
1560	continue;
1561	err = md_bitmap_copy_from_slot(mddev, slot: sn, lo: &lo, hi: &hi, clear_bits: false);
1562	if (err) {
1563	pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
1564	goto out;
1565	}
1566	if ((hi > `0`) && (lo < mddev->recovery_cp))
1567	mddev->recovery_cp = lo;
1568	}
1569	out:
1570	return err;
1571	}
1572
1573	static struct md_cluster_operations cluster_ops = {
1574	.join = join,
1575	.leave = leave,
1576	.slot_number = slot_number,
1577	.resync_start = resync_start,
1578	.resync_finish = resync_finish,
1579	.resync_info_update = resync_info_update,
1580	.resync_info_get = resync_info_get,
1581	.metadata_update_start = metadata_update_start,
1582	.metadata_update_finish = metadata_update_finish,
1583	.metadata_update_cancel = metadata_update_cancel,
1584	.area_resyncing = area_resyncing,
1585	.add_new_disk = add_new_disk,
1586	.add_new_disk_cancel = add_new_disk_cancel,
1587	.new_disk_ack = new_disk_ack,
1588	.remove_disk = remove_disk,
1589	.load_bitmaps = load_bitmaps,
1590	.gather_bitmaps = gather_bitmaps,
1591	.resize_bitmaps = resize_bitmaps,
1592	.lock_all_bitmaps = lock_all_bitmaps,
1593	.unlock_all_bitmaps = unlock_all_bitmaps,
1594	.update_size = update_size,
1595	};
1596
1597	static int __init cluster_init(void)
1598	{
1599	pr_warn("md-cluster: support raid1 and raid10 (limited support)\n");
1600	pr_info("Registering Cluster MD functions\n");
1601	register_md_cluster_operations(ops: &cluster_ops, THIS_MODULE);
1602	return `0`;
1603	}
1604
1605	static void cluster_exit(void)
1606	{
1607	unregister_md_cluster_operations();
1608	}
1609
1610	module_init(cluster_init);
1611	module_exit(cluster_exit);
1612	MODULE_AUTHOR("SUSE");
1613	MODULE_LICENSE("GPL");
1614	MODULE_DESCRIPTION("Clustering support for MD");
1615

source code of linux/drivers/md/md-cluster.c