drbd_main.c source code [linux/drivers/block/drbd/drbd_main.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	drbd.c
4
5	This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6
7	Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8	Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9	Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10
11	Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
12	from Logicworks, Inc. for making SDP replication support possible.
13
14
15	*/
16
17	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
19	#include <linux/module.h>
20	#include <linux/jiffies.h>
21	#include <linux/drbd.h>
22	#include <linux/uaccess.h>
23	#include <asm/types.h>
24	#include <net/sock.h>
25	#include <linux/ctype.h>
26	#include <linux/mutex.h>
27	#include <linux/fs.h>
28	#include <linux/file.h>
29	#include <linux/proc_fs.h>
30	#include <linux/init.h>
31	#include <linux/mm.h>
32	#include <linux/memcontrol.h>
33	#include <linux/mm_inline.h>
34	#include <linux/slab.h>
35	#include <linux/random.h>
36	#include <linux/reboot.h>
37	#include <linux/notifier.h>
38	#include <linux/kthread.h>
39	#include <linux/workqueue.h>
40	#include <linux/unistd.h>
41	#include <linux/vmalloc.h>
42	#include <linux/sched/signal.h>
43
44	#include <linux/drbd_limits.h>
45	#include "drbd_int.h"
46	#include "drbd_protocol.h"
47	#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
48	#include "drbd_vli.h"
49	#include "drbd_debugfs.h"
50
51	static DEFINE_MUTEX(drbd_main_mutex);
52	static int drbd_open(struct gendisk *disk, blk_mode_t mode);
53	static void drbd_release(struct gendisk *gd);
54	static void md_sync_timer_fn(struct timer_list *t);
55	static int w_bitmap_io(struct drbd_work w, int* unused);
56
57	MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
58	"Lars Ellenberg <lars@linbit.com>");
59	MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
60	MODULE_VERSION(REL_VERSION);
61	MODULE_LICENSE("GPL");
62	MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
63	__stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
64	MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
65
66	#include <linux/moduleparam.h>
67	/ thanks to these macros, if compiled into the kernel (not-module),*
68	* these become boot parameters (e.g., drbd.minor_count) */
69
70	#ifdef CONFIG_DRBD_FAULT_INJECTION
71	int drbd_enable_faults;
72	int drbd_fault_rate;
73	static int drbd_fault_count;
74	static int drbd_fault_devs;
75	/ bitmap of enabled faults /
76	module_param_named(enable_faults, drbd_enable_faults, int, `0664`);
77	/ fault rate % value - applies to all enabled faults /
78	module_param_named(fault_rate, drbd_fault_rate, int, `0664`);
79	/ count of faults inserted /
80	module_param_named(fault_count, drbd_fault_count, int, `0664`);
81	/ bitmap of devices to insert faults on /
82	module_param_named(fault_devs, drbd_fault_devs, int, `0644`);
83	#endif
84
85	/ module parameters we can keep static /
86	static bool drbd_allow_oos; / allow_open_on_secondary /
87	static bool drbd_disable_sendpage;
88	MODULE_PARM_DESC(allow_oos, "DONT USE!");
89	module_param_named(allow_oos, drbd_allow_oos, bool, `0`);
90	module_param_named(disable_sendpage, drbd_disable_sendpage, bool, `0644`);
91
92	/ module parameters we share /
93	int drbd_proc_details; / Detail level in proc drbd/
94	module_param_named(proc_details, drbd_proc_details, int, `0644`);
95	/ module parameters shared with defaults /
96	unsigned int drbd_minor_count = DRBD_MINOR_COUNT_DEF;
97	/ Module parameter for setting the user mode helper program*
98	* to run. Default is /sbin/drbdadm */
99	char drbd_usermode_helper[`80`] = "/sbin/drbdadm";
100	module_param_named(minor_count, drbd_minor_count, uint, `0444`);
101	module_param_string(usermode_helper, drbd_usermode_helper, sizeof(drbd_usermode_helper), `0644`);
102
103	/ in 2.6.x, our device mapping and config info contains our virtual gendisks*
104	* as member "struct gendisk *vdisk;"
105	*/
106	struct idr drbd_devices;
107	struct list_head drbd_resources;
108	struct mutex resources_mutex;
109
110	struct kmem_cache *drbd_request_cache;
111	struct kmem_cache drbd_ee_cache; /* peer requests /
112	struct kmem_cache drbd_bm_ext_cache; /* bitmap extents /
113	struct kmem_cache drbd_al_ext_cache; /* activity log extents /
114	mempool_t drbd_request_mempool;
115	mempool_t drbd_ee_mempool;
116	mempool_t drbd_md_io_page_pool;
117	struct bio_set drbd_md_io_bio_set;
118	struct bio_set drbd_io_bio_set;
119
120	/ I do not use a standard mempool, because:*
121	1) I want to hand out the pre-allocated objects first.
122	2) I want to be able to interrupt sleeping allocation with a signal.
123	Note: This is a single linked list, the next pointer is the private
124	member of struct page.
125	*/
126	struct page *drbd_pp_pool;
127	DEFINE_SPINLOCK(drbd_pp_lock);
128	int drbd_pp_vacant;
129	wait_queue_head_t drbd_pp_wait;
130
131	DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, `5` * HZ, `5`);
132
133	static const struct block_device_operations drbd_ops = {
134	.owner = THIS_MODULE,
135	.submit_bio = drbd_submit_bio,
136	.open = drbd_open,
137	.release = drbd_release,
138	};
139
140	#ifdef __CHECKER__
141	/ When checking with sparse, and this is an inline function, sparse will*
142	give tons of false positives. When this is a real functions sparse works.
143	*/
144	int _get_ldev_if_state(struct drbd_device device, enum* drbd_disk_state mins)
145	{
146	int io_allowed;
147
148	atomic_inc(&device->local_cnt);
149	io_allowed = (device->state.disk >= mins);
150	if (!io_allowed) {
151	if (atomic_dec_and_test(&device->local_cnt))
152	wake_up(&device->misc_wait);
153	}
154	return io_allowed;
155	}
156
157	#endif
158
159	/**
160	* tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
161	* @connection: DRBD connection.
162	* @barrier_nr: Expected identifier of the DRBD write barrier packet.
163	* @set_size: Expected number of requests before that barrier.
164	*
165	* In case the passed barrier_nr or set_size does not match the oldest
166	* epoch of not yet barrier-acked requests, this function will cause a
167	* termination of the connection.
168	*/
169	void tl_release(struct drbd_connection connection, unsigned* int barrier_nr,
170	unsigned int set_size)
171	{
172	struct drbd_request *r;
173	struct drbd_request req = NULL, tmp = NULL;
174	int expect_epoch = `0`;
175	int expect_size = `0`;
176
177	spin_lock_irq(lock: &connection->resource->req_lock);
178
179	/ find oldest not yet barrier-acked write request,*
180	* count writes in its epoch. */
181	list_for_each_entry(r, &connection->transfer_log, tl_requests) {
182	const unsigned s = r->rq_state;
183	if (!req) {
184	if (!(s & RQ_WRITE))
185	continue;
186	if (!(s & RQ_NET_MASK))
187	continue;
188	if (s & RQ_NET_DONE)
189	continue;
190	req = r;
191	expect_epoch = req->epoch;
192	expect_size ++;
193	} else {
194	if (r->epoch != expect_epoch)
195	break;
196	if (!(s & RQ_WRITE))
197	continue;
198	/ if (s & RQ_DONE): not expected /
199	/ if (!(s & RQ_NET_MASK)): not expected /
200	expect_size++;
201	}
202	}
203
204	/ first some paranoia code /
205	if (req == NULL) {
206	drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
207	barrier_nr);
208	goto bail;
209	}
210	if (expect_epoch != barrier_nr) {
211	drbd_err(connection, "BAD! BarrierAck #%u received, expected #%u!\n",
212	barrier_nr, expect_epoch);
213	goto bail;
214	}
215
216	if (expect_size != set_size) {
217	drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
218	barrier_nr, set_size, expect_size);
219	goto bail;
220	}
221
222	/ Clean up list of requests processed during current epoch. /
223	/ this extra list walk restart is paranoia,*
224	* to catch requests being barrier-acked "unexpectedly".
225	* It usually should find the same req again, or some READ preceding it. */
226	list_for_each_entry(req, &connection->transfer_log, tl_requests)
227	if (req->epoch == expect_epoch) {
228	tmp = req;
229	break;
230	}
231	req = list_prepare_entry(tmp, &connection->transfer_log, tl_requests);
232	list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) {
233	struct drbd_peer_device *peer_device;
234	if (req->epoch != expect_epoch)
235	break;
236	peer_device = conn_peer_device(connection, volume_number: req->device->vnr);
237	_req_mod(req, what: BARRIER_ACKED, peer_device);
238	}
239	spin_unlock_irq(lock: &connection->resource->req_lock);
240
241	return;
242
243	bail:
244	spin_unlock_irq(lock: &connection->resource->req_lock);
245	conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), flags: CS_HARD);
246	}
247
248
249	/**
250	* _tl_restart() - Walks the transfer log, and applies an action to all requests
251	* @connection: DRBD connection to operate on.
252	* @what: The action/event to perform with all request objects
253	*
254	* @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
255	* RESTART_FROZEN_DISK_IO.
256	*/
257	/ must hold resource->req_lock /
258	void _tl_restart(struct drbd_connection connection, enum* drbd_req_event what)
259	{
260	struct drbd_peer_device *peer_device;
261	struct drbd_request req, r;
262
263	list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) {
264	peer_device = conn_peer_device(connection, volume_number: req->device->vnr);
265	_req_mod(req, what, peer_device);
266	}
267	}
268
269	void tl_restart(struct drbd_connection connection, enum* drbd_req_event what)
270	{
271	spin_lock_irq(lock: &connection->resource->req_lock);
272	_tl_restart(connection, what);
273	spin_unlock_irq(lock: &connection->resource->req_lock);
274	}
275
276	/**
277	* tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
278	* @connection: DRBD connection.
279	*
280	* This is called after the connection to the peer was lost. The storage covered
281	* by the requests on the transfer gets marked as our of sync. Called from the
282	* receiver thread and the worker thread.
283	*/
284	void tl_clear(struct drbd_connection *connection)
285	{
286	tl_restart(connection, what: CONNECTION_LOST_WHILE_PENDING);
287	}
288
289	/**
290	* tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL
291	* @device: DRBD device.
292	*/
293	void tl_abort_disk_io(struct drbd_device *device)
294	{
295	struct drbd_connection *connection = first_peer_device(device)->connection;
296	struct drbd_request req, r;
297
298	spin_lock_irq(lock: &connection->resource->req_lock);
299	list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) {
300	if (!(req->rq_state & RQ_LOCAL_PENDING))
301	continue;
302	if (req->device != device)
303	continue;
304	_req_mod(req, what: ABORT_DISK_IO, NULL);
305	}
306	spin_unlock_irq(lock: &connection->resource->req_lock);
307	}
308
309	static int drbd_thread_setup(void *arg)
310	{
311	struct drbd_thread thi = (struct* drbd_thread *) arg;
312	struct drbd_resource *resource = thi->resource;
313	unsigned long flags;
314	int retval;
315
316	snprintf(current->comm, size: sizeof(current->comm), fmt: "drbd_%c_%s",
317	thi->name[`0`],
318	resource->name);
319
320	allow_kernel_signal(DRBD_SIGKILL);
321	allow_kernel_signal(SIGXCPU);
322	restart:
323	retval = thi->function(thi);
324
325	spin_lock_irqsave(&thi->t_lock, flags);
326
327	/ if the receiver has been "EXITING", the last thing it did*
328	* was set the conn state to "StandAlone",
329	* if now a re-connect request comes in, conn state goes C_UNCONNECTED,
330	* and receiver thread will be "started".
331	* drbd_thread_start needs to set "RESTARTING" in that case.
332	* t_state check and assignment needs to be within the same spinlock,
333	* so either thread_start sees EXITING, and can remap to RESTARTING,
334	* or thread_start see NONE, and can proceed as normal.
335	*/
336
337	if (thi->t_state == RESTARTING) {
338	drbd_info(resource, "Restarting %s thread\n", thi->name);
339	thi->t_state = RUNNING;
340	spin_unlock_irqrestore(lock: &thi->t_lock, flags);
341	goto restart;
342	}
343
344	thi->task = NULL;
345	thi->t_state = NONE;
346	smp_mb();
347	complete_all(&thi->stop);
348	spin_unlock_irqrestore(lock: &thi->t_lock, flags);
349
350	drbd_info(resource, "Terminating %s\n", current->comm);
351
352	/ Release mod reference taken when thread was started /
353
354	if (thi->connection)
355	kref_put(kref: &thi->connection->kref, release: drbd_destroy_connection);
356	kref_put(kref: &resource->kref, release: drbd_destroy_resource);
357	module_put(THIS_MODULE);
358	return retval;
359	}
360
361	static void drbd_thread_init(struct drbd_resource resource, struct* drbd_thread *thi,
362	int (func) (struct* drbd_thread ), const* char *name)
363	{
364	spin_lock_init(&thi->t_lock);
365	thi->task = NULL;
366	thi->t_state = NONE;
367	thi->function = func;
368	thi->resource = resource;
369	thi->connection = NULL;
370	thi->name = name;
371	}
372
373	int drbd_thread_start(struct drbd_thread *thi)
374	{
375	struct drbd_resource *resource = thi->resource;
376	struct task_struct *nt;
377	unsigned long flags;
378
379	/ is used from state engine doing drbd_thread_stop_nowait,*
380	* while holding the req lock irqsave */
381	spin_lock_irqsave(&thi->t_lock, flags);
382
383	switch (thi->t_state) {
384	case NONE:
385	drbd_info(resource, "Starting %s thread (from %s [%d])\n",
386	thi->name, current->comm, current->pid);
387
388	/ Get ref on module for thread - this is released when thread exits /
389	if (!try_module_get(THIS_MODULE)) {
390	drbd_err(resource, "Failed to get module reference in drbd_thread_start\n");
391	spin_unlock_irqrestore(lock: &thi->t_lock, flags);
392	return false;
393	}
394
395	kref_get(kref: &resource->kref);
396	if (thi->connection)
397	kref_get(kref: &thi->connection->kref);
398
399	init_completion(x: &thi->stop);
400	thi->reset_cpu_mask = `1`;
401	thi->t_state = RUNNING;
402	spin_unlock_irqrestore(lock: &thi->t_lock, flags);
403	flush_signals(current); / otherw. may get -ERESTARTNOINTR /
404
405	nt = kthread_create(drbd_thread_setup, (void *) thi,
406	"drbd_%c_%s", thi->name[`0`], thi->resource->name);
407
408	if (IS_ERR(ptr: nt)) {
409	drbd_err(resource, "Couldn't start thread\n");
410
411	if (thi->connection)
412	kref_put(kref: &thi->connection->kref, release: drbd_destroy_connection);
413	kref_put(kref: &resource->kref, release: drbd_destroy_resource);
414	module_put(THIS_MODULE);
415	return false;
416	}
417	spin_lock_irqsave(&thi->t_lock, flags);
418	thi->task = nt;
419	thi->t_state = RUNNING;
420	spin_unlock_irqrestore(lock: &thi->t_lock, flags);
421	wake_up_process(tsk: nt);
422	break;
423	case EXITING:
424	thi->t_state = RESTARTING;
425	drbd_info(resource, "Restarting %s thread (from %s [%d])\n",
426	thi->name, current->comm, current->pid);
427	fallthrough;
428	case RUNNING:
429	case RESTARTING:
430	default:
431	spin_unlock_irqrestore(lock: &thi->t_lock, flags);
432	break;
433	}
434
435	return true;
436	}
437
438
439	void _drbd_thread_stop(struct drbd_thread thi, int* restart, int wait)
440	{
441	unsigned long flags;
442
443	enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
444
445	/ may be called from state engine, holding the req lock irqsave /
446	spin_lock_irqsave(&thi->t_lock, flags);
447
448	if (thi->t_state == NONE) {
449	spin_unlock_irqrestore(lock: &thi->t_lock, flags);
450	if (restart)
451	drbd_thread_start(thi);
452	return;
453	}
454
455	if (thi->t_state != ns) {
456	if (thi->task == NULL) {
457	spin_unlock_irqrestore(lock: &thi->t_lock, flags);
458	return;
459	}
460
461	thi->t_state = ns;
462	smp_mb();
463	init_completion(x: &thi->stop);
464	if (thi->task != current)
465	send_sig(DRBD_SIGKILL, thi->task, `1`);
466	}
467
468	spin_unlock_irqrestore(lock: &thi->t_lock, flags);
469
470	if (wait)
471	wait_for_completion(&thi->stop);
472	}
473
474	int conn_lowest_minor(struct drbd_connection *connection)
475	{
476	struct drbd_peer_device *peer_device;
477	int vnr = `0`, minor = -`1`;
478
479	rcu_read_lock();
480	peer_device = idr_get_next(&connection->peer_devices, nextid: &vnr);
481	if (peer_device)
482	minor = device_to_minor(device: peer_device->device);
483	rcu_read_unlock();
484
485	return minor;
486	}
487
488	#ifdef CONFIG_SMP
489	/*
490	* drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
491	*
492	* Forces all threads of a resource onto the same CPU. This is beneficial for
493	* DRBD's performance. May be overwritten by user's configuration.
494	*/
495	static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask)
496	{
497	unsigned int *resources_per_cpu, min_index = ~`0`;
498
499	resources_per_cpu = kcalloc(n: nr_cpu_ids, size: sizeof(*resources_per_cpu),
500	GFP_KERNEL);
501	if (resources_per_cpu) {
502	struct drbd_resource *resource;
503	unsigned int cpu, min = ~`0`;
504
505	rcu_read_lock();
506	for_each_resource_rcu(resource, &drbd_resources) {
507	for_each_cpu(cpu, resource->cpu_mask)
508	resources_per_cpu[cpu]++;
509	}
510	rcu_read_unlock();
511	for_each_online_cpu(cpu) {
512	if (resources_per_cpu[cpu] < min) {
513	min = resources_per_cpu[cpu];
514	min_index = cpu;
515	}
516	}
517	kfree(objp: resources_per_cpu);
518	}
519	if (min_index == ~`0`) {
520	cpumask_setall(dstp: *cpu_mask);
521	return;
522	}
523	cpumask_set_cpu(cpu: min_index, dstp: *cpu_mask);
524	}
525
526	/**
527	* drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
528	* @thi: drbd_thread object
529	*
530	* call in the "main loop" of _all_ threads, no need for any mutex, current won't die
531	* prematurely.
532	*/
533	void drbd_thread_current_set_cpu(struct drbd_thread *thi)
534	{
535	struct drbd_resource *resource = thi->resource;
536	struct task_struct *p = current;
537
538	if (!thi->reset_cpu_mask)
539	return;
540	thi->reset_cpu_mask = `0`;
541	set_cpus_allowed_ptr(p, new_mask: resource->cpu_mask);
542	}
543	#else
544	#define drbd_calc_cpu_mask(A) ({})
545	#endif
546
547	/*
548	* drbd_header_size - size of a packet header
549	*
550	* The header size is a multiple of 8, so any payload following the header is
551	* word aligned on 64-bit architectures. (The bitmap send and receive code
552	* relies on this.)
553	*/
554	unsigned int drbd_header_size(struct drbd_connection *connection)
555	{
556	if (connection->agreed_pro_version >= `100`) {
557	BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), `8`));
558	return sizeof(struct p_header100);
559	} else {
560	BUILD_BUG_ON(sizeof(struct p_header80) !=
561	sizeof(struct p_header95));
562	BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), `8`));
563	return sizeof(struct p_header80);
564	}
565	}
566
567	static unsigned int prepare_header80(struct p_header80 h, enum* drbd_packet cmd, int size)
568	{
569	h->magic = cpu_to_be32(DRBD_MAGIC);
570	h->command = cpu_to_be16(cmd);
571	h->length = cpu_to_be16(size);
572	return sizeof(struct p_header80);
573	}
574
575	static unsigned int prepare_header95(struct p_header95 h, enum* drbd_packet cmd, int size)
576	{
577	h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
578	h->command = cpu_to_be16(cmd);
579	h->length = cpu_to_be32(size);
580	return sizeof(struct p_header95);
581	}
582
583	static unsigned int prepare_header100(struct p_header100 h, enum* drbd_packet cmd,
584	int size, int vnr)
585	{
586	h->magic = cpu_to_be32(DRBD_MAGIC_100);
587	h->volume = cpu_to_be16(vnr);
588	h->command = cpu_to_be16(cmd);
589	h->length = cpu_to_be32(size);
590	h->pad = `0`;
591	return sizeof(struct p_header100);
592	}
593
594	static unsigned int prepare_header(struct drbd_connection connection, int* vnr,
595	void buffer, enum* drbd_packet cmd, int size)
596	{
597	if (connection->agreed_pro_version >= `100`)
598	return prepare_header100(h: buffer, cmd, size, vnr);
599	else if (connection->agreed_pro_version >= `95` &&
600	size > DRBD_MAX_SIZE_H80_PACKET)
601	return prepare_header95(h: buffer, cmd, size);
602	else
603	return prepare_header80(h: buffer, cmd, size);
604	}
605
606	static void __conn_prepare_command(struct* drbd_connection *connection,
607	struct drbd_socket *sock)
608	{
609	if (!sock->socket)
610	return NULL;
611	return sock->sbuf + drbd_header_size(connection);
612	}
613
614	void conn_prepare_command(struct* drbd_connection connection, struct* drbd_socket *sock)
615	{
616	void *p;
617
618	mutex_lock(&sock->mutex);
619	p = __conn_prepare_command(connection, sock);
620	if (!p)
621	mutex_unlock(lock: &sock->mutex);
622
623	return p;
624	}
625
626	void drbd_prepare_command(struct* drbd_peer_device peer_device, struct* drbd_socket *sock)
627	{
628	return conn_prepare_command(connection: peer_device->connection, sock);
629	}
630
631	static int __send_command(struct drbd_connection connection, int* vnr,
632	struct drbd_socket sock, enum* drbd_packet cmd,
633	unsigned int header_size, void *data,
634	unsigned int size)
635	{
636	int msg_flags;
637	int err;
638
639	/*
640	* Called with @data == NULL and the size of the data blocks in @size
641	* for commands that send data blocks. For those commands, omit the
642	* MSG_MORE flag: this will increase the likelihood that data blocks
643	* which are page aligned on the sender will end up page aligned on the
644	* receiver.
645	*/
646	msg_flags = data ? MSG_MORE : `0`;
647
648	header_size += prepare_header(connection, vnr, buffer: sock->sbuf, cmd,
649	size: header_size + size);
650	err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size,
651	msg_flags);
652	if (data && !err)
653	err = drbd_send_all(connection, sock->socket, data, size, `0`);
654	/ DRBD protocol "pings" are latency critical.*
655	* This is supposed to trigger tcp_push_pending_frames() */
656	if (!err && (cmd == P_PING \|\| cmd == P_PING_ACK))
657	tcp_sock_set_nodelay(sk: sock->socket->sk);
658
659	return err;
660	}
661
662	static int __conn_send_command(struct drbd_connection connection, struct* drbd_socket *sock,
663	enum drbd_packet cmd, unsigned int header_size,
664	void data, unsigned* int size)
665	{
666	return __send_command(connection, vnr: `0`, sock, cmd, header_size, data, size);
667	}
668
669	int conn_send_command(struct drbd_connection connection, struct* drbd_socket *sock,
670	enum drbd_packet cmd, unsigned int header_size,
671	void data, unsigned* int size)
672	{
673	int err;
674
675	err = __conn_send_command(connection, sock, cmd, header_size, data, size);
676	mutex_unlock(lock: &sock->mutex);
677	return err;
678	}
679
680	int drbd_send_command(struct drbd_peer_device peer_device, struct* drbd_socket *sock,
681	enum drbd_packet cmd, unsigned int header_size,
682	void data, unsigned* int size)
683	{
684	int err;
685
686	err = __send_command(connection: peer_device->connection, vnr: peer_device->device->vnr,
687	sock, cmd, header_size, data, size);
688	mutex_unlock(lock: &sock->mutex);
689	return err;
690	}
691
692	int drbd_send_ping(struct drbd_connection *connection)
693	{
694	struct drbd_socket *sock;
695
696	sock = &connection->meta;
697	if (!conn_prepare_command(connection, sock))
698	return -EIO;
699	return conn_send_command(connection, sock, cmd: P_PING, header_size: `0`, NULL, size: `0`);
700	}
701
702	int drbd_send_ping_ack(struct drbd_connection *connection)
703	{
704	struct drbd_socket *sock;
705
706	sock = &connection->meta;
707	if (!conn_prepare_command(connection, sock))
708	return -EIO;
709	return conn_send_command(connection, sock, cmd: P_PING_ACK, header_size: `0`, NULL, size: `0`);
710	}
711
712	int drbd_send_sync_param(struct drbd_peer_device *peer_device)
713	{
714	struct drbd_socket *sock;
715	struct p_rs_param_95 *p;
716	int size;
717	const int apv = peer_device->connection->agreed_pro_version;
718	enum drbd_packet cmd;
719	struct net_conf *nc;
720	struct disk_conf *dc;
721
722	sock = &peer_device->connection->data;
723	p = drbd_prepare_command(peer_device, sock);
724	if (!p)
725	return -EIO;
726
727	rcu_read_lock();
728	nc = rcu_dereference(peer_device->connection->net_conf);
729
730	size = apv <= `87` ? sizeof(struct p_rs_param)
731	: apv == `88` ? sizeof(struct p_rs_param)
732	+ strlen(nc->verify_alg) + `1`
733	: apv <= `94` ? sizeof(struct p_rs_param_89)
734	: / apv >= 95 / sizeof(struct p_rs_param_95);
735
736	cmd = apv >= `89` ? P_SYNC_PARAM89 : P_SYNC_PARAM;
737
738	/ initialize verify_alg and csums_alg /
739	BUILD_BUG_ON(sizeof(p->algs) != `2` * SHARED_SECRET_MAX);
740	memset(&p->algs, `0`, sizeof(p->algs));
741
742	if (get_ldev(peer_device->device)) {
743	dc = rcu_dereference(peer_device->device->ldev->disk_conf);
744	p->resync_rate = cpu_to_be32(dc->resync_rate);
745	p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
746	p->c_delay_target = cpu_to_be32(dc->c_delay_target);
747	p->c_fill_target = cpu_to_be32(dc->c_fill_target);
748	p->c_max_rate = cpu_to_be32(dc->c_max_rate);
749	put_ldev(device: peer_device->device);
750	} else {
751	p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
752	p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
753	p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
754	p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
755	p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
756	}
757
758	if (apv >= `88`)
759	strcpy(p: p->verify_alg, q: nc->verify_alg);
760	if (apv >= `89`)
761	strcpy(p: p->csums_alg, q: nc->csums_alg);
762	rcu_read_unlock();
763
764	return drbd_send_command(peer_device, sock, cmd, header_size: size, NULL, size: `0`);
765	}
766
767	int __drbd_send_protocol(struct drbd_connection connection, enum* drbd_packet cmd)
768	{
769	struct drbd_socket *sock;
770	struct p_protocol *p;
771	struct net_conf *nc;
772	int size, cf;
773
774	sock = &connection->data;
775	p = __conn_prepare_command(connection, sock);
776	if (!p)
777	return -EIO;
778
779	rcu_read_lock();
780	nc = rcu_dereference(connection->net_conf);
781
782	if (nc->tentative && connection->agreed_pro_version < `92`) {
783	rcu_read_unlock();
784	drbd_err(connection, "--dry-run is not supported by peer");
785	return -EOPNOTSUPP;
786	}
787
788	size = sizeof(*p);
789	if (connection->agreed_pro_version >= `87`)
790	size += strlen(nc->integrity_alg) + `1`;
791
792	p->protocol = cpu_to_be32(nc->wire_protocol);
793	p->after_sb_0p = cpu_to_be32(nc->after_sb_0p);
794	p->after_sb_1p = cpu_to_be32(nc->after_sb_1p);
795	p->after_sb_2p = cpu_to_be32(nc->after_sb_2p);
796	p->two_primaries = cpu_to_be32(nc->two_primaries);
797	cf = `0`;
798	if (nc->discard_my_data)
799	cf \|= CF_DISCARD_MY_DATA;
800	if (nc->tentative)
801	cf \|= CF_DRY_RUN;
802	p->conn_flags = cpu_to_be32(cf);
803
804	if (connection->agreed_pro_version >= `87`)
805	strcpy(p: p->integrity_alg, q: nc->integrity_alg);
806	rcu_read_unlock();
807
808	return __conn_send_command(connection, sock, cmd, header_size: size, NULL, size: `0`);
809	}
810
811	int drbd_send_protocol(struct drbd_connection *connection)
812	{
813	int err;
814
815	mutex_lock(&connection->data.mutex);
816	err = __drbd_send_protocol(connection, cmd: P_PROTOCOL);
817	mutex_unlock(lock: &connection->data.mutex);
818
819	return err;
820	}
821
822	static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags)
823	{
824	struct drbd_device *device = peer_device->device;
825	struct drbd_socket *sock;
826	struct p_uuids *p;
827	int i;
828
829	if (!get_ldev_if_state(device, D_NEGOTIATING))
830	return `0`;
831
832	sock = &peer_device->connection->data;
833	p = drbd_prepare_command(peer_device, sock);
834	if (!p) {
835	put_ldev(device);
836	return -EIO;
837	}
838	spin_lock_irq(lock: &device->ldev->md.uuid_lock);
839	for (i = UI_CURRENT; i < UI_SIZE; i++)
840	p->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
841	spin_unlock_irq(lock: &device->ldev->md.uuid_lock);
842
843	device->comm_bm_set = drbd_bm_total_weight(device);
844	p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set);
845	rcu_read_lock();
846	uuid_flags \|= rcu_dereference(peer_device->connection->net_conf)->discard_my_data ? `1` : `0`;
847	rcu_read_unlock();
848	uuid_flags \|= test_bit(CRASHED_PRIMARY, &device->flags) ? `2` : `0`;
849	uuid_flags \|= device->new_state_tmp.disk == D_INCONSISTENT ? `4` : `0`;
850	p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
851
852	put_ldev(device);
853	return drbd_send_command(peer_device, sock, cmd: P_UUIDS, header_size: sizeof(*p), NULL, size: `0`);
854	}
855
856	int drbd_send_uuids(struct drbd_peer_device *peer_device)
857	{
858	return _drbd_send_uuids(peer_device, uuid_flags: `0`);
859	}
860
861	int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device)
862	{
863	return _drbd_send_uuids(peer_device, uuid_flags: `8`);
864	}
865
866	void drbd_print_uuids(struct drbd_device device, const* char *text)
867	{
868	if (get_ldev_if_state(device, D_NEGOTIATING)) {
869	u64 *uuid = device->ldev->md.uuid;
870	drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n",
871	text,
872	(unsigned long long)uuid[UI_CURRENT],
873	(unsigned long long)uuid[UI_BITMAP],
874	(unsigned long long)uuid[UI_HISTORY_START],
875	(unsigned long long)uuid[UI_HISTORY_END]);
876	put_ldev(device);
877	} else {
878	drbd_info(device, "%s effective data uuid: %016llX\n",
879	text,
880	(unsigned long long)device->ed_uuid);
881	}
882	}
883
884	void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
885	{
886	struct drbd_device *device = peer_device->device;
887	struct drbd_socket *sock;
888	struct p_rs_uuid *p;
889	u64 uuid;
890
891	D_ASSERT(device, device->state.disk == D_UP_TO_DATE);
892
893	uuid = device->ldev->md.uuid[UI_BITMAP];
894	if (uuid && uuid != UUID_JUST_CREATED)
895	uuid = uuid + UUID_NEW_BM_OFFSET;
896	else
897	get_random_bytes(buf: &uuid, len: sizeof(u64));
898	drbd_uuid_set(device, idx: UI_BITMAP, val: uuid);
899	drbd_print_uuids(device, text: "updated sync UUID");
900	drbd_md_sync(device);
901
902	sock = &peer_device->connection->data;
903	p = drbd_prepare_command(peer_device, sock);
904	if (p) {
905	p->uuid = cpu_to_be64(uuid);
906	drbd_send_command(peer_device, sock, cmd: P_SYNC_UUID, header_size: sizeof(*p), NULL, size: `0`);
907	}
908	}
909
910	int drbd_send_sizes(struct drbd_peer_device peer_device, int* trigger_reply, enum dds_flags flags)
911	{
912	struct drbd_device *device = peer_device->device;
913	struct drbd_socket *sock;
914	struct p_sizes *p;
915	sector_t d_size, u_size;
916	int q_order_type;
917	unsigned int max_bio_size;
918	unsigned int packet_size;
919
920	sock = &peer_device->connection->data;
921	p = drbd_prepare_command(peer_device, sock);
922	if (!p)
923	return -EIO;
924
925	packet_size = sizeof(*p);
926	if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
927	packet_size += sizeof(p->qlim[`0`]);
928
929	memset(p, `0`, packet_size);
930	if (get_ldev_if_state(device, D_NEGOTIATING)) {
931	struct block_device *bdev = device->ldev->backing_bdev;
932	struct request_queue *q = bdev_get_queue(bdev);
933
934	d_size = drbd_get_max_capacity(bdev: device->ldev);
935	rcu_read_lock();
936	u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
937	rcu_read_unlock();
938	q_order_type = drbd_queue_order_type(device);
939	max_bio_size = queue_max_hw_sectors(q) << `9`;
940	max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
941	p->qlim->physical_block_size =
942	cpu_to_be32(bdev_physical_block_size(bdev));
943	p->qlim->logical_block_size =
944	cpu_to_be32(bdev_logical_block_size(bdev));
945	p->qlim->alignment_offset =
946	cpu_to_be32(bdev_alignment_offset(bdev));
947	p->qlim->io_min = cpu_to_be32(bdev_io_min(bdev));
948	p->qlim->io_opt = cpu_to_be32(bdev_io_opt(bdev));
949	p->qlim->discard_enabled = !!bdev_max_discard_sectors(bdev);
950	put_ldev(device);
951	} else {
952	struct request_queue *q = device->rq_queue;
953
954	p->qlim->physical_block_size =
955	cpu_to_be32(queue_physical_block_size(q));
956	p->qlim->logical_block_size =
957	cpu_to_be32(queue_logical_block_size(q));
958	p->qlim->alignment_offset = `0`;
959	p->qlim->io_min = cpu_to_be32(queue_io_min(q));
960	p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
961	p->qlim->discard_enabled = `0`;
962
963	d_size = `0`;
964	u_size = `0`;
965	q_order_type = QUEUE_ORDERED_NONE;
966	max_bio_size = DRBD_MAX_BIO_SIZE; / ... multiple BIOs per peer_request /
967	}
968
969	if (peer_device->connection->agreed_pro_version <= `94`)
970	max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
971	else if (peer_device->connection->agreed_pro_version < `100`)
972	max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
973
974	p->d_size = cpu_to_be64(d_size);
975	p->u_size = cpu_to_be64(u_size);
976	if (trigger_reply)
977	p->c_size = `0`;
978	else
979	p->c_size = cpu_to_be64(get_capacity(device->vdisk));
980	p->max_bio_size = cpu_to_be32(max_bio_size);
981	p->queue_order_type = cpu_to_be16(q_order_type);
982	p->dds_flags = cpu_to_be16(flags);
983
984	return drbd_send_command(peer_device, sock, cmd: P_SIZES, header_size: packet_size, NULL, size: `0`);
985	}
986
987	/**
988	* drbd_send_current_state() - Sends the drbd state to the peer
989	* @peer_device: DRBD peer device.
990	*/
991	int drbd_send_current_state(struct drbd_peer_device *peer_device)
992	{
993	struct drbd_socket *sock;
994	struct p_state *p;
995
996	sock = &peer_device->connection->data;
997	p = drbd_prepare_command(peer_device, sock);
998	if (!p)
999	return -EIO;
1000	p->state = cpu_to_be32(peer_device->device->state.i); / Within the send mutex /
1001	return drbd_send_command(peer_device, sock, cmd: P_STATE, header_size: sizeof(*p), NULL, size: `0`);
1002	}
1003
1004	/**
1005	* drbd_send_state() - After a state change, sends the new state to the peer
1006	* @peer_device: DRBD peer device.
1007	* @state: the state to send, not necessarily the current state.
1008	*
1009	* Each state change queues an "after_state_ch" work, which will eventually
1010	* send the resulting new state to the peer. If more state changes happen
1011	* between queuing and processing of the after_state_ch work, we still
1012	* want to send each intermediary state in the order it occurred.
1013	*/
1014	int drbd_send_state(struct drbd_peer_device peer_device, union* drbd_state state)
1015	{
1016	struct drbd_socket *sock;
1017	struct p_state *p;
1018
1019	sock = &peer_device->connection->data;
1020	p = drbd_prepare_command(peer_device, sock);
1021	if (!p)
1022	return -EIO;
1023	p->state = cpu_to_be32(state.i); / Within the send mutex /
1024	return drbd_send_command(peer_device, sock, cmd: P_STATE, header_size: sizeof(*p), NULL, size: `0`);
1025	}
1026
1027	int drbd_send_state_req(struct drbd_peer_device peer_device, union* drbd_state mask, union drbd_state val)
1028	{
1029	struct drbd_socket *sock;
1030	struct p_req_state *p;
1031
1032	sock = &peer_device->connection->data;
1033	p = drbd_prepare_command(peer_device, sock);
1034	if (!p)
1035	return -EIO;
1036	p->mask = cpu_to_be32(mask.i);
1037	p->val = cpu_to_be32(val.i);
1038	return drbd_send_command(peer_device, sock, cmd: P_STATE_CHG_REQ, header_size: sizeof(*p), NULL, size: `0`);
1039	}
1040
1041	int conn_send_state_req(struct drbd_connection connection, union* drbd_state mask, union drbd_state val)
1042	{
1043	enum drbd_packet cmd;
1044	struct drbd_socket *sock;
1045	struct p_req_state *p;
1046
1047	cmd = connection->agreed_pro_version < `100` ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
1048	sock = &connection->data;
1049	p = conn_prepare_command(connection, sock);
1050	if (!p)
1051	return -EIO;
1052	p->mask = cpu_to_be32(mask.i);
1053	p->val = cpu_to_be32(val.i);
1054	return conn_send_command(connection, sock, cmd, header_size: sizeof(*p), NULL, size: `0`);
1055	}
1056
1057	void drbd_send_sr_reply(struct drbd_peer_device peer_device, enum* drbd_state_rv retcode)
1058	{
1059	struct drbd_socket *sock;
1060	struct p_req_state_reply *p;
1061
1062	sock = &peer_device->connection->meta;
1063	p = drbd_prepare_command(peer_device, sock);
1064	if (p) {
1065	p->retcode = cpu_to_be32(retcode);
1066	drbd_send_command(peer_device, sock, cmd: P_STATE_CHG_REPLY, header_size: sizeof(*p), NULL, size: `0`);
1067	}
1068	}
1069
1070	void conn_send_sr_reply(struct drbd_connection connection, enum* drbd_state_rv retcode)
1071	{
1072	struct drbd_socket *sock;
1073	struct p_req_state_reply *p;
1074	enum drbd_packet cmd = connection->agreed_pro_version < `100` ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
1075
1076	sock = &connection->meta;
1077	p = conn_prepare_command(connection, sock);
1078	if (p) {
1079	p->retcode = cpu_to_be32(retcode);
1080	conn_send_command(connection, sock, cmd, header_size: sizeof(*p), NULL, size: `0`);
1081	}
1082	}
1083
1084	static void dcbp_set_code(struct p_compressed_bm p, enum* drbd_bitmap_code code)
1085	{
1086	BUG_ON(code & ~`0xf`);
1087	p->encoding = (p->encoding & ~`0xf`) \| code;
1088	}
1089
1090	static void dcbp_set_start(struct p_compressed_bm p, int* set)
1091	{
1092	p->encoding = (p->encoding & ~`0x80`) \| (set ? `0x80` : `0`);
1093	}
1094
1095	static void dcbp_set_pad_bits(struct p_compressed_bm p, int* n)
1096	{
1097	BUG_ON(n & ~`0x7`);
1098	p->encoding = (p->encoding & (~`0x7` << `4`)) \| (n << `4`);
1099	}
1100
1101	static int fill_bitmap_rle_bits(struct drbd_device *device,
1102	struct p_compressed_bm *p,
1103	unsigned int size,
1104	struct bm_xfer_ctx *c)
1105	{
1106	struct bitstream bs;
1107	unsigned long plain_bits;
1108	unsigned long tmp;
1109	unsigned long rl;
1110	unsigned len;
1111	unsigned toggle;
1112	int bits, use_rle;
1113
1114	/ may we use this feature? /
1115	rcu_read_lock();
1116	use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle;
1117	rcu_read_unlock();
1118	if (!use_rle \|\| first_peer_device(device)->connection->agreed_pro_version < `90`)
1119	return `0`;
1120
1121	if (c->bit_offset >= c->bm_bits)
1122	return `0`; / nothing to do. /
1123
1124	/ use at most thus many bytes /
1125	bitstream_init(bs: &bs, s: p->code, len: size, pad_bits: `0`);
1126	memset(p->code, `0`, size);
1127	/ plain bits covered in this code string /
1128	plain_bits = `0`;
1129
1130	/ p->encoding & 0x80 stores whether the first run length is set.*
1131	* bit offset is implicit.
1132	* start with toggle == 2 to be able to tell the first iteration */
1133	toggle = `2`;
1134
1135	/ see how much plain bits we can stuff into one packet*
1136	* using RLE and VLI. */
1137	do {
1138	tmp = (toggle == `0`) ? _drbd_bm_find_next_zero(device, bm_fo: c->bit_offset)
1139	: _drbd_bm_find_next(device, bm_fo: c->bit_offset);
1140	if (tmp == -`1UL`)
1141	tmp = c->bm_bits;
1142	rl = tmp - c->bit_offset;
1143
1144	if (toggle == `2`) { / first iteration /
1145	if (rl == `0`) {
1146	/ the first checked bit was set,*
1147	* store start value, */
1148	dcbp_set_start(p, set: `1`);
1149	/ but skip encoding of zero run length /
1150	toggle = !toggle;
1151	continue;
1152	}
1153	dcbp_set_start(p, set: `0`);
1154	}
1155
1156	/ paranoia: catch zero runlength.*
1157	* can only happen if bitmap is modified while we scan it. */
1158	if (rl == `0`) {
1159	drbd_err(device, "unexpected zero runlength while encoding bitmap "
1160	"t:%u bo:%lu\n", toggle, c->bit_offset);
1161	return -`1`;
1162	}
1163
1164	bits = vli_encode_bits(bs: &bs, in: rl);
1165	if (bits == -ENOBUFS) / buffer full /
1166	break;
1167	if (bits <= `0`) {
1168	drbd_err(device, "error while encoding bitmap: %d\n", bits);
1169	return `0`;
1170	}
1171
1172	toggle = !toggle;
1173	plain_bits += rl;
1174	c->bit_offset = tmp;
1175	} while (c->bit_offset < c->bm_bits);
1176
1177	len = bs.cur.b - p->code + !!bs.cur.bit;
1178
1179	if (plain_bits < (len << `3`)) {
1180	/ incompressible with this method.*
1181	* we need to rewind both word and bit position. */
1182	c->bit_offset -= plain_bits;
1183	bm_xfer_ctx_bit_to_word_offset(c);
1184	c->bit_offset = c->word_offset * BITS_PER_LONG;
1185	return `0`;
1186	}
1187
1188	/ RLE + VLI was able to compress it just fine.*
1189	* update c->word_offset. */
1190	bm_xfer_ctx_bit_to_word_offset(c);
1191
1192	/ store pad_bits /
1193	dcbp_set_pad_bits(p, n: (`8` - bs.cur.bit) & `0x7`);
1194
1195	return len;
1196	}
1197
1198	/*
1199	* send_bitmap_rle_or_plain
1200	*
1201	* Return 0 when done, 1 when another iteration is needed, and a negative error
1202	* code upon failure.
1203	*/
1204	static int
1205	send_bitmap_rle_or_plain(struct drbd_peer_device peer_device, struct* bm_xfer_ctx *c)
1206	{
1207	struct drbd_device *device = peer_device->device;
1208	struct drbd_socket *sock = &peer_device->connection->data;
1209	unsigned int header_size = drbd_header_size(connection: peer_device->connection);
1210	struct p_compressed_bm *p = sock->sbuf + header_size;
1211	int len, err;
1212
1213	len = fill_bitmap_rle_bits(device, p,
1214	DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
1215	if (len < `0`)
1216	return -EIO;
1217
1218	if (len) {
1219	dcbp_set_code(p, code: RLE_VLI_Bits);
1220	err = __send_command(connection: peer_device->connection, vnr: device->vnr, sock,
1221	cmd: P_COMPRESSED_BITMAP, header_size: sizeof(*p) + len,
1222	NULL, size: `0`);
1223	c->packets[`0`]++;
1224	c->bytes[`0`] += header_size + sizeof(*p) + len;
1225
1226	if (c->bit_offset >= c->bm_bits)
1227	len = `0`; / DONE /
1228	} else {
1229	/ was not compressible.*
1230	* send a buffer full of plain text bits instead. */
1231	unsigned int data_size;
1232	unsigned long num_words;
1233	unsigned long *p = sock->sbuf + header_size;
1234
1235	data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
1236	num_words = min_t(size_t, data_size / sizeof(*p),
1237	c->bm_words - c->word_offset);
1238	len = num_words * sizeof(*p);
1239	if (len)
1240	drbd_bm_get_lel(device, offset: c->word_offset, number: num_words, buffer: p);
1241	err = __send_command(connection: peer_device->connection, vnr: device->vnr, sock, cmd: P_BITMAP,
1242	header_size: len, NULL, size: `0`);
1243	c->word_offset += num_words;
1244	c->bit_offset = c->word_offset * BITS_PER_LONG;
1245
1246	c->packets[`1`]++;
1247	c->bytes[`1`] += header_size + len;
1248
1249	if (c->bit_offset > c->bm_bits)
1250	c->bit_offset = c->bm_bits;
1251	}
1252	if (!err) {
1253	if (len == `0`) {
1254	INFO_bm_xfer_stats(peer_device, direction: "send", c);
1255	return `0`;
1256	} else
1257	return `1`;
1258	}
1259	return -EIO;
1260	}
1261
1262	/ See the comment at receive_bitmap() /
1263	static int _drbd_send_bitmap(struct drbd_device *device,
1264	struct drbd_peer_device *peer_device)
1265	{
1266	struct bm_xfer_ctx c;
1267	int err;
1268
1269	if (!expect(device, device->bitmap))
1270	return false;
1271
1272	if (get_ldev(device)) {
1273	if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) {
1274	drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n");
1275	drbd_bm_set_all(device);
1276	if (drbd_bm_write(device, peer_device)) {
1277	/ write_bm did fail! Leave full sync flag set in Meta P_DATA*
1278	* but otherwise process as per normal - need to tell other
1279	* side that a full resync is required! */
1280	drbd_err(device, "Failed to write bitmap to disk!\n");
1281	} else {
1282	drbd_md_clear_flag(device, MDF_FULL_SYNC);
1283	drbd_md_sync(device);
1284	}
1285	}
1286	put_ldev(device);
1287	}
1288
1289	c = (struct bm_xfer_ctx) {
1290	.bm_bits = drbd_bm_bits(device),
1291	.bm_words = drbd_bm_words(device),
1292	};
1293
1294	do {
1295	err = send_bitmap_rle_or_plain(peer_device, c: &c);
1296	} while (err > `0`);
1297
1298	return err == `0`;
1299	}
1300
1301	int drbd_send_bitmap(struct drbd_device device, struct* drbd_peer_device *peer_device)
1302	{
1303	struct drbd_socket *sock = &peer_device->connection->data;
1304	int err = -`1`;
1305
1306	mutex_lock(&sock->mutex);
1307	if (sock->socket)
1308	err = !_drbd_send_bitmap(device, peer_device);
1309	mutex_unlock(lock: &sock->mutex);
1310	return err;
1311	}
1312
1313	void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size)
1314	{
1315	struct drbd_socket *sock;
1316	struct p_barrier_ack *p;
1317
1318	if (connection->cstate < C_WF_REPORT_PARAMS)
1319	return;
1320
1321	sock = &connection->meta;
1322	p = conn_prepare_command(connection, sock);
1323	if (!p)
1324	return;
1325	p->barrier = barrier_nr;
1326	p->set_size = cpu_to_be32(set_size);
1327	conn_send_command(connection, sock, cmd: P_BARRIER_ACK, header_size: sizeof(*p), NULL, size: `0`);
1328	}
1329
1330	/**
1331	* _drbd_send_ack() - Sends an ack packet
1332	* @peer_device: DRBD peer device.
1333	* @cmd: Packet command code.
1334	* @sector: sector, needs to be in big endian byte order
1335	* @blksize: size in byte, needs to be in big endian byte order
1336	* @block_id: Id, big endian byte order
1337	*/
1338	static int _drbd_send_ack(struct drbd_peer_device peer_device, enum* drbd_packet cmd,
1339	u64 sector, u32 blksize, u64 block_id)
1340	{
1341	struct drbd_socket *sock;
1342	struct p_block_ack *p;
1343
1344	if (peer_device->device->state.conn < C_CONNECTED)
1345	return -EIO;
1346
1347	sock = &peer_device->connection->meta;
1348	p = drbd_prepare_command(peer_device, sock);
1349	if (!p)
1350	return -EIO;
1351	p->sector = sector;
1352	p->block_id = block_id;
1353	p->blksize = blksize;
1354	p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->device->packet_seq));
1355	return drbd_send_command(peer_device, sock, cmd, header_size: sizeof(*p), NULL, size: `0`);
1356	}
1357
1358	/ dp->sector and dp->block_id already/still in network byte order,*
1359	* data_size is payload size according to dp->head,
1360	* and may need to be corrected for digest size. */
1361	void drbd_send_ack_dp(struct drbd_peer_device peer_device, enum* drbd_packet cmd,
1362	struct p_data dp, int* data_size)
1363	{
1364	if (peer_device->connection->peer_integrity_tfm)
1365	data_size -= crypto_shash_digestsize(tfm: peer_device->connection->peer_integrity_tfm);
1366	_drbd_send_ack(peer_device, cmd, sector: dp->sector, cpu_to_be32(data_size),
1367	block_id: dp->block_id);
1368	}
1369
1370	void drbd_send_ack_rp(struct drbd_peer_device peer_device, enum* drbd_packet cmd,
1371	struct p_block_req *rp)
1372	{
1373	_drbd_send_ack(peer_device, cmd, sector: rp->sector, blksize: rp->blksize, block_id: rp->block_id);
1374	}
1375
1376	/**
1377	* drbd_send_ack() - Sends an ack packet
1378	* @peer_device: DRBD peer device
1379	* @cmd: packet command code
1380	* @peer_req: peer request
1381	*/
1382	int drbd_send_ack(struct drbd_peer_device peer_device, enum* drbd_packet cmd,
1383	struct drbd_peer_request *peer_req)
1384	{
1385	return _drbd_send_ack(peer_device, cmd,
1386	cpu_to_be64(peer_req->i.sector),
1387	cpu_to_be32(peer_req->i.size),
1388	block_id: peer_req->block_id);
1389	}
1390
1391	/ This function misuses the block_id field to signal if the blocks*
1392	* are is sync or not. */
1393	int drbd_send_ack_ex(struct drbd_peer_device peer_device, enum* drbd_packet cmd,
1394	sector_t sector, int blksize, u64 block_id)
1395	{
1396	return _drbd_send_ack(peer_device, cmd,
1397	cpu_to_be64(sector),
1398	cpu_to_be32(blksize),
1399	cpu_to_be64(block_id));
1400	}
1401
1402	int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
1403	struct drbd_peer_request *peer_req)
1404	{
1405	struct drbd_socket *sock;
1406	struct p_block_desc *p;
1407
1408	sock = &peer_device->connection->data;
1409	p = drbd_prepare_command(peer_device, sock);
1410	if (!p)
1411	return -EIO;
1412	p->sector = cpu_to_be64(peer_req->i.sector);
1413	p->blksize = cpu_to_be32(peer_req->i.size);
1414	p->pad = `0`;
1415	return drbd_send_command(peer_device, sock, cmd: P_RS_DEALLOCATED, header_size: sizeof(*p), NULL, size: `0`);
1416	}
1417
1418	int drbd_send_drequest(struct drbd_peer_device peer_device, int* cmd,
1419	sector_t sector, int size, u64 block_id)
1420	{
1421	struct drbd_socket *sock;
1422	struct p_block_req *p;
1423
1424	sock = &peer_device->connection->data;
1425	p = drbd_prepare_command(peer_device, sock);
1426	if (!p)
1427	return -EIO;
1428	p->sector = cpu_to_be64(sector);
1429	p->block_id = block_id;
1430	p->blksize = cpu_to_be32(size);
1431	return drbd_send_command(peer_device, sock, cmd, header_size: sizeof(*p), NULL, size: `0`);
1432	}
1433
1434	int drbd_send_drequest_csum(struct drbd_peer_device peer_device, sector_t sector, int* size,
1435	void digest, int* digest_size, enum drbd_packet cmd)
1436	{
1437	struct drbd_socket *sock;
1438	struct p_block_req *p;
1439
1440	/ FIXME: Put the digest into the preallocated socket buffer. /
1441
1442	sock = &peer_device->connection->data;
1443	p = drbd_prepare_command(peer_device, sock);
1444	if (!p)
1445	return -EIO;
1446	p->sector = cpu_to_be64(sector);
1447	p->block_id = ID_SYNCER / unused /;
1448	p->blksize = cpu_to_be32(size);
1449	return drbd_send_command(peer_device, sock, cmd, header_size: sizeof(*p), data: digest, size: digest_size);
1450	}
1451
1452	int drbd_send_ov_request(struct drbd_peer_device peer_device, sector_t sector, int* size)
1453	{
1454	struct drbd_socket *sock;
1455	struct p_block_req *p;
1456
1457	sock = &peer_device->connection->data;
1458	p = drbd_prepare_command(peer_device, sock);
1459	if (!p)
1460	return -EIO;
1461	p->sector = cpu_to_be64(sector);
1462	p->block_id = ID_SYNCER / unused /;
1463	p->blksize = cpu_to_be32(size);
1464	return drbd_send_command(peer_device, sock, cmd: P_OV_REQUEST, header_size: sizeof(*p), NULL, size: `0`);
1465	}
1466
1467	/ called on sndtimeo*
1468	* returns false if we should retry,
1469	* true if we think connection is dead
1470	*/
1471	static int we_should_drop_the_connection(struct drbd_connection connection, struct* socket *sock)
1472	{
1473	int drop_it;
1474	/ long elapsed = (long)(jiffies - device->last_received); /
1475
1476	drop_it = connection->meta.socket == sock
1477	\|\| !connection->ack_receiver.task
1478	\|\| get_t_state(thi: &connection->ack_receiver) != RUNNING
1479	\|\| connection->cstate < C_WF_REPORT_PARAMS;
1480
1481	if (drop_it)
1482	return true;
1483
1484	drop_it = !--connection->ko_count;
1485	if (!drop_it) {
1486	drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
1487	current->comm, current->pid, connection->ko_count);
1488	request_ping(connection);
1489	}
1490
1491	return drop_it; / && (device->state == R_PRIMARY) /;
1492	}
1493
1494	static void drbd_update_congested(struct drbd_connection *connection)
1495	{
1496	struct sock *sk = connection->data.socket->sk;
1497	if (sk->sk_wmem_queued > sk->sk_sndbuf * `4` / `5`)
1498	set_bit(nr: NET_CONGESTED, addr: &connection->flags);
1499	}
1500
1501	/ The idea of sendpage seems to be to put some kind of reference*
1502	* to the page into the skb, and to hand it over to the NIC. In
1503	* this process get_page() gets called.
1504	*
1505	* As soon as the page was really sent over the network put_page()
1506	* gets called by some part of the network layer. [ NIC driver? ]
1507	*
1508	* [ get_page() / put_page() increment/decrement the count. If count
1509	* reaches 0 the page will be freed. ]
1510	*
1511	* This works nicely with pages from FSs.
1512	* But this means that in protocol A we might signal IO completion too early!
1513	*
1514	* In order not to corrupt data during a resync we must make sure
1515	* that we do not reuse our own buffer pages (EEs) to early, therefore
1516	* we have the net_ee list.
1517	*
1518	* XFS seems to have problems, still, it submits pages with page_count == 0!
1519	* As a workaround, we disable sendpage on pages
1520	* with page_count == 0 or PageSlab.
1521	*/
1522	static int _drbd_no_send_page(struct drbd_peer_device peer_device, struct* page *page,
1523	int offset, size_t size, unsigned msg_flags)
1524	{
1525	struct socket *socket;
1526	void *addr;
1527	int err;
1528
1529	socket = peer_device->connection->data.socket;
1530	addr = kmap(page) + offset;
1531	err = drbd_send_all(peer_device->connection, socket, addr, size, msg_flags);
1532	kunmap(page);
1533	if (!err)
1534	peer_device->device->send_cnt += size >> `9`;
1535	return err;
1536	}
1537
1538	static int _drbd_send_page(struct drbd_peer_device peer_device, struct* page *page,
1539	int offset, size_t size, unsigned msg_flags)
1540	{
1541	struct socket *socket = peer_device->connection->data.socket;
1542	struct msghdr msg = { .msg_flags = msg_flags, };
1543	struct bio_vec bvec;
1544	int len = size;
1545	int err = -EIO;
1546
1547	/ e.g. XFS meta- & log-data is in slab pages, which have a*
1548	* page_count of 0 and/or have PageSlab() set.
1549	* we cannot use send_page for those, as that does get_page();
1550	* put_page(); and would cause either a VM_BUG directly, or
1551	* __page_cache_release a page that would actually still be referenced
1552	* by someone, leading to some obscure delayed Oops somewhere else. */
1553	if (!drbd_disable_sendpage && sendpage_ok(page))
1554	msg.msg_flags \|= MSG_NOSIGNAL \| MSG_SPLICE_PAGES;
1555
1556	drbd_update_congested(connection: peer_device->connection);
1557	do {
1558	int sent;
1559
1560	bvec_set_page(bv: &bvec, page, len, offset);
1561	iov_iter_bvec(i: &msg.msg_iter, ITER_SOURCE, bvec: &bvec, nr_segs: `1`, count: len);
1562
1563	sent = sock_sendmsg(sock: socket, msg: &msg);
1564	if (sent <= `0`) {
1565	if (sent == -EAGAIN) {
1566	if (we_should_drop_the_connection(connection: peer_device->connection, sock: socket))
1567	break;
1568	continue;
1569	}
1570	drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n",
1571	__func__, (int)size, len, sent);
1572	if (sent < `0`)
1573	err = sent;
1574	break;
1575	}
1576	len -= sent;
1577	offset += sent;
1578	} while (len > `0` / THINK && device->cstate >= C_CONNECTED/);
1579	clear_bit(nr: NET_CONGESTED, addr: &peer_device->connection->flags);
1580
1581	if (len == `0`) {
1582	err = `0`;
1583	peer_device->device->send_cnt += size >> `9`;
1584	}
1585	return err;
1586	}
1587
1588	static int _drbd_send_bio(struct drbd_peer_device peer_device, struct* bio *bio)
1589	{
1590	struct bio_vec bvec;
1591	struct bvec_iter iter;
1592
1593	/ hint all but last page with MSG_MORE /
1594	bio_for_each_segment(bvec, bio, iter) {
1595	int err;
1596
1597	err = _drbd_no_send_page(peer_device, page: bvec.bv_page,
1598	offset: bvec.bv_offset, size: bvec.bv_len,
1599	bio_iter_last(bvec, iter)
1600	? `0` : MSG_MORE);
1601	if (err)
1602	return err;
1603	}
1604	return `0`;
1605	}
1606
1607	static int _drbd_send_zc_bio(struct drbd_peer_device peer_device, struct* bio *bio)
1608	{
1609	struct bio_vec bvec;
1610	struct bvec_iter iter;
1611
1612	/ hint all but last page with MSG_MORE /
1613	bio_for_each_segment(bvec, bio, iter) {
1614	int err;
1615
1616	err = _drbd_send_page(peer_device, page: bvec.bv_page,
1617	offset: bvec.bv_offset, size: bvec.bv_len,
1618	bio_iter_last(bvec, iter) ? `0` : MSG_MORE);
1619	if (err)
1620	return err;
1621	}
1622	return `0`;
1623	}
1624
1625	static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
1626	struct drbd_peer_request *peer_req)
1627	{
1628	struct page *page = peer_req->pages;
1629	unsigned len = peer_req->i.size;
1630	int err;
1631
1632	/ hint all but last page with MSG_MORE /
1633	page_chain_for_each(page) {
1634	unsigned l = min_t(unsigned, len, PAGE_SIZE);
1635
1636	err = _drbd_send_page(peer_device, page, offset: `0`, size: l,
1637	msg_flags: page_chain_next(page) ? MSG_MORE : `0`);
1638	if (err)
1639	return err;
1640	len -= l;
1641	}
1642	return `0`;
1643	}
1644
1645	static u32 bio_flags_to_wire(struct drbd_connection *connection,
1646	struct bio *bio)
1647	{
1648	if (connection->agreed_pro_version >= `95`)
1649	return (bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : `0`) \|
1650	(bio->bi_opf & REQ_FUA ? DP_FUA : `0`) \|
1651	(bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : `0`) \|
1652	(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : `0`) \|
1653	(bio_op(bio) == REQ_OP_WRITE_ZEROES ?
1654	((connection->agreed_features & DRBD_FF_WZEROES) ?
1655	(DP_ZEROES \|(!(bio->bi_opf & REQ_NOUNMAP) ? DP_DISCARD : `0`))
1656	: DP_DISCARD)
1657	: `0`);
1658	else
1659	return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : `0`;
1660	}
1661
1662	/ Used to send write or TRIM aka REQ_OP_DISCARD requests*
1663	* R_PRIMARY -> Peer (P_DATA, P_TRIM)
1664	*/
1665	int drbd_send_dblock(struct drbd_peer_device peer_device, struct* drbd_request *req)
1666	{
1667	struct drbd_device *device = peer_device->device;
1668	struct drbd_socket *sock;
1669	struct p_data *p;
1670	void *digest_out;
1671	unsigned int dp_flags = `0`;
1672	int digest_size;
1673	int err;
1674
1675	sock = &peer_device->connection->data;
1676	p = drbd_prepare_command(peer_device, sock);
1677	digest_size = peer_device->connection->integrity_tfm ?
1678	crypto_shash_digestsize(tfm: peer_device->connection->integrity_tfm) : `0`;
1679
1680	if (!p)
1681	return -EIO;
1682	p->sector = cpu_to_be64(req->i.sector);
1683	p->block_id = (unsigned long)req;
1684	p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq));
1685	dp_flags = bio_flags_to_wire(connection: peer_device->connection, bio: req->master_bio);
1686	if (device->state.conn >= C_SYNC_SOURCE &&
1687	device->state.conn <= C_PAUSED_SYNC_T)
1688	dp_flags \|= DP_MAY_SET_IN_SYNC;
1689	if (peer_device->connection->agreed_pro_version >= `100`) {
1690	if (req->rq_state & RQ_EXP_RECEIVE_ACK)
1691	dp_flags \|= DP_SEND_RECEIVE_ACK;
1692	/ During resync, request an explicit write ack,*
1693	* even in protocol != C */
1694	if (req->rq_state & RQ_EXP_WRITE_ACK
1695	\|\| (dp_flags & DP_MAY_SET_IN_SYNC))
1696	dp_flags \|= DP_SEND_WRITE_ACK;
1697	}
1698	p->dp_flags = cpu_to_be32(dp_flags);
1699
1700	if (dp_flags & (DP_DISCARD\|DP_ZEROES)) {
1701	enum drbd_packet cmd = (dp_flags & DP_ZEROES) ? P_ZEROES : P_TRIM;
1702	struct p_trim t = (struct* p_trim*)p;
1703	t->size = cpu_to_be32(req->i.size);
1704	err = __send_command(connection: peer_device->connection, vnr: device->vnr, sock, cmd, header_size: sizeof(*t), NULL, size: `0`);
1705	goto out;
1706	}
1707	digest_out = p + `1`;
1708
1709	/ our digest is still only over the payload.*
1710	* TRIM does not carry any payload. */
1711	if (digest_size)
1712	drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
1713	err = __send_command(connection: peer_device->connection, vnr: device->vnr, sock, cmd: P_DATA,
1714	header_size: sizeof(*p) + digest_size, NULL, size: req->i.size);
1715	if (!err) {
1716	/ For protocol A, we have to memcpy the payload into*
1717	* socket buffers, as we may complete right away
1718	* as soon as we handed it over to tcp, at which point the data
1719	* pages may become invalid.
1720	*
1721	* For data-integrity enabled, we copy it as well, so we can be
1722	* sure that even if the bio pages may still be modified, it
1723	* won't change the data on the wire, thus if the digest checks
1724	* out ok after sending on this side, but does not fit on the
1725	* receiving side, we sure have detected corruption elsewhere.
1726	*/
1727	if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK \| RQ_EXP_WRITE_ACK)) \|\| digest_size)
1728	err = _drbd_send_bio(peer_device, bio: req->master_bio);
1729	else
1730	err = _drbd_send_zc_bio(peer_device, bio: req->master_bio);
1731
1732	/ double check digest, sometimes buffers have been modified in flight. /
1733	if (digest_size > `0` && digest_size <= `64`) {
1734	/ 64 byte, 512 bit, is the largest digest size*
1735	* currently supported in kernel crypto. */
1736	unsigned char digest[`64`];
1737	drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest);
1738	if (memcmp(p: p + `1`, q: digest, size: digest_size)) {
1739	drbd_warn(device,
1740	"Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
1741	(unsigned long long)req->i.sector, req->i.size);
1742	}
1743	} / else if (digest_size > 64) {*
1744	... Be noisy about digest too large ...
1745	} /*
1746	}
1747	out:
1748	mutex_unlock(lock: &sock->mutex); / locked by drbd_prepare_command() /
1749
1750	return err;
1751	}
1752
1753	/ answer packet, used to send data back for read requests:*
1754	* Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
1755	* C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
1756	*/
1757	int drbd_send_block(struct drbd_peer_device peer_device, enum* drbd_packet cmd,
1758	struct drbd_peer_request *peer_req)
1759	{
1760	struct drbd_device *device = peer_device->device;
1761	struct drbd_socket *sock;
1762	struct p_data *p;
1763	int err;
1764	int digest_size;
1765
1766	sock = &peer_device->connection->data;
1767	p = drbd_prepare_command(peer_device, sock);
1768
1769	digest_size = peer_device->connection->integrity_tfm ?
1770	crypto_shash_digestsize(tfm: peer_device->connection->integrity_tfm) : `0`;
1771
1772	if (!p)
1773	return -EIO;
1774	p->sector = cpu_to_be64(peer_req->i.sector);
1775	p->block_id = peer_req->block_id;
1776	p->seq_num = `0`; / unused /
1777	p->dp_flags = `0`;
1778	if (digest_size)
1779	drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + `1`);
1780	err = __send_command(connection: peer_device->connection, vnr: device->vnr, sock, cmd, header_size: sizeof(*p) + digest_size, NULL, size: peer_req->i.size);
1781	if (!err)
1782	err = _drbd_send_zc_ee(peer_device, peer_req);
1783	mutex_unlock(lock: &sock->mutex); / locked by drbd_prepare_command() /
1784
1785	return err;
1786	}
1787
1788	int drbd_send_out_of_sync(struct drbd_peer_device peer_device, struct* drbd_request *req)
1789	{
1790	struct drbd_socket *sock;
1791	struct p_block_desc *p;
1792
1793	sock = &peer_device->connection->data;
1794	p = drbd_prepare_command(peer_device, sock);
1795	if (!p)
1796	return -EIO;
1797	p->sector = cpu_to_be64(req->i.sector);
1798	p->blksize = cpu_to_be32(req->i.size);
1799	return drbd_send_command(peer_device, sock, cmd: P_OUT_OF_SYNC, header_size: sizeof(*p), NULL, size: `0`);
1800	}
1801
1802	/*
1803	drbd_send distinguishes two cases:
1804
1805	Packets sent via the data socket "sock"
1806	and packets sent via the meta data socket "msock"
1807
1808	sock msock
1809	-----------------+-------------------------+------------------------------
1810	timeout conf.timeout / 2 conf.timeout / 2
1811	timeout action send a ping via msock Abort communication
1812	and close all sockets
1813	*/
1814
1815	/*
1816	* you must have down()ed the appropriate [m]sock_mutex elsewhere!
1817	*/
1818	int drbd_send(struct drbd_connection connection, struct* socket *sock,
1819	void buf, size_t size, unsigned* msg_flags)
1820	{
1821	struct kvec iov = {.iov_base = buf, .iov_len = size};
1822	struct msghdr msg = {.msg_flags = msg_flags \| MSG_NOSIGNAL};
1823	int rv, sent = `0`;
1824
1825	if (!sock)
1826	return -EBADR;
1827
1828	/ THINK if (signal_pending) return ... ? /
1829
1830	iov_iter_kvec(i: &msg.msg_iter, ITER_SOURCE, kvec: &iov, nr_segs: `1`, count: size);
1831
1832	if (sock == connection->data.socket) {
1833	rcu_read_lock();
1834	connection->ko_count = rcu_dereference(connection->net_conf)->ko_count;
1835	rcu_read_unlock();
1836	drbd_update_congested(connection);
1837	}
1838	do {
1839	rv = sock_sendmsg(sock, msg: &msg);
1840	if (rv == -EAGAIN) {
1841	if (we_should_drop_the_connection(connection, sock))
1842	break;
1843	else
1844	continue;
1845	}
1846	if (rv == -EINTR) {
1847	flush_signals(current);
1848	rv = `0`;
1849	}
1850	if (rv < `0`)
1851	break;
1852	sent += rv;
1853	} while (sent < size);
1854
1855	if (sock == connection->data.socket)
1856	clear_bit(nr: NET_CONGESTED, addr: &connection->flags);
1857
1858	if (rv <= `0`) {
1859	if (rv != -EAGAIN) {
1860	drbd_err(connection, "%s_sendmsg returned %d\n",
1861	sock == connection->meta.socket ? "msock" : "sock",
1862	rv);
1863	conn_request_state(connection, NS(conn, C_BROKEN_PIPE), flags: CS_HARD);
1864	} else
1865	conn_request_state(connection, NS(conn, C_TIMEOUT), flags: CS_HARD);
1866	}
1867
1868	return sent;
1869	}
1870
1871	/*
1872	* drbd_send_all - Send an entire buffer
1873	*
1874	* Returns 0 upon success and a negative error value otherwise.
1875	*/
1876	int drbd_send_all(struct drbd_connection connection, struct* socket sock, void* *buffer,
1877	size_t size, unsigned msg_flags)
1878	{
1879	int err;
1880
1881	err = drbd_send(connection, sock, buf: buffer, size, msg_flags);
1882	if (err < `0`)
1883	return err;
1884	if (err != size)
1885	return -EIO;
1886	return `0`;
1887	}
1888
1889	static int drbd_open(struct gendisk *disk, blk_mode_t mode)
1890	{
1891	struct drbd_device *device = disk->private_data;
1892	unsigned long flags;
1893	int rv = `0`;
1894
1895	mutex_lock(&drbd_main_mutex);
1896	spin_lock_irqsave(&device->resource->req_lock, flags);
1897	/ to have a stable device->state.role*
1898	* and no race with updating open_cnt */
1899
1900	if (device->state.role != R_PRIMARY) {
1901	if (mode & BLK_OPEN_WRITE)
1902	rv = -EROFS;
1903	else if (!drbd_allow_oos)
1904	rv = -EMEDIUMTYPE;
1905	}
1906
1907	if (!rv)
1908	device->open_cnt++;
1909	spin_unlock_irqrestore(lock: &device->resource->req_lock, flags);
1910	mutex_unlock(lock: &drbd_main_mutex);
1911
1912	return rv;
1913	}
1914
1915	static void drbd_release(struct gendisk *gd)
1916	{
1917	struct drbd_device *device = gd->private_data;
1918
1919	mutex_lock(&drbd_main_mutex);
1920	device->open_cnt--;
1921	mutex_unlock(lock: &drbd_main_mutex);
1922	}
1923
1924	/ need to hold resource->req_lock /
1925	void drbd_queue_unplug(struct drbd_device *device)
1926	{
1927	if (device->state.pdsk >= D_INCONSISTENT && device->state.conn >= C_CONNECTED) {
1928	D_ASSERT(device, device->state.role == R_PRIMARY);
1929	if (test_and_clear_bit(nr: UNPLUG_REMOTE, addr: &device->flags)) {
1930	drbd_queue_work_if_unqueued(
1931	q: &first_peer_device(device)->connection->sender_work,
1932	w: &device->unplug_work);
1933	}
1934	}
1935	}
1936
1937	static void drbd_set_defaults(struct drbd_device *device)
1938	{
1939	/ Beware! The actual layout differs*
1940	* between big endian and little endian */
1941	device->state = (union drbd_dev_state) {
1942	{ .role = R_SECONDARY,
1943	.peer = R_UNKNOWN,
1944	.conn = C_STANDALONE,
1945	.disk = D_DISKLESS,
1946	.pdsk = D_UNKNOWN,
1947	} };
1948	}
1949
1950	void drbd_init_set_defaults(struct drbd_device *device)
1951	{
1952	/ the memset(,0,) did most of this.*
1953	* note: only assignments, no allocation in here */
1954
1955	drbd_set_defaults(device);
1956
1957	atomic_set(v: &device->ap_bio_cnt, i: `0`);
1958	atomic_set(v: &device->ap_actlog_cnt, i: `0`);
1959	atomic_set(v: &device->ap_pending_cnt, i: `0`);
1960	atomic_set(v: &device->rs_pending_cnt, i: `0`);
1961	atomic_set(v: &device->unacked_cnt, i: `0`);
1962	atomic_set(v: &device->local_cnt, i: `0`);
1963	atomic_set(v: &device->pp_in_use_by_net, i: `0`);
1964	atomic_set(v: &device->rs_sect_in, i: `0`);
1965	atomic_set(v: &device->rs_sect_ev, i: `0`);
1966	atomic_set(v: &device->ap_in_flight, i: `0`);
1967	atomic_set(v: &device->md_io.in_use, i: `0`);
1968
1969	mutex_init(&device->own_state_mutex);
1970	device->state_mutex = &device->own_state_mutex;
1971
1972	spin_lock_init(&device->al_lock);
1973	spin_lock_init(&device->peer_seq_lock);
1974
1975	INIT_LIST_HEAD(list: &device->active_ee);
1976	INIT_LIST_HEAD(list: &device->sync_ee);
1977	INIT_LIST_HEAD(list: &device->done_ee);
1978	INIT_LIST_HEAD(list: &device->read_ee);
1979	INIT_LIST_HEAD(list: &device->net_ee);
1980	INIT_LIST_HEAD(list: &device->resync_reads);
1981	INIT_LIST_HEAD(list: &device->resync_work.list);
1982	INIT_LIST_HEAD(list: &device->unplug_work.list);
1983	INIT_LIST_HEAD(list: &device->bm_io_work.w.list);
1984	INIT_LIST_HEAD(list: &device->pending_master_completion[`0`]);
1985	INIT_LIST_HEAD(list: &device->pending_master_completion[`1`]);
1986	INIT_LIST_HEAD(list: &device->pending_completion[`0`]);
1987	INIT_LIST_HEAD(list: &device->pending_completion[`1`]);
1988
1989	device->resync_work.cb = w_resync_timer;
1990	device->unplug_work.cb = w_send_write_hint;
1991	device->bm_io_work.w.cb = w_bitmap_io;
1992
1993	timer_setup(&device->resync_timer, resync_timer_fn, `0`);
1994	timer_setup(&device->md_sync_timer, md_sync_timer_fn, `0`);
1995	timer_setup(&device->start_resync_timer, start_resync_timer_fn, `0`);
1996	timer_setup(&device->request_timer, request_timer_fn, `0`);
1997
1998	init_waitqueue_head(&device->misc_wait);
1999	init_waitqueue_head(&device->state_wait);
2000	init_waitqueue_head(&device->ee_wait);
2001	init_waitqueue_head(&device->al_wait);
2002	init_waitqueue_head(&device->seq_wait);
2003
2004	device->resync_wenr = LC_FREE;
2005	device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
2006	device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
2007	}
2008
2009	void drbd_set_my_capacity(struct drbd_device *device, sector_t size)
2010	{
2011	char ppb[`10`];
2012
2013	set_capacity_and_notify(disk: device->vdisk, size);
2014
2015	drbd_info(device, "size = %s (%llu KB)\n",
2016	ppsize(ppb, size>>`1`), (unsigned long long)size>>`1`);
2017	}
2018
2019	void drbd_device_cleanup(struct drbd_device *device)
2020	{
2021	int i;
2022	if (first_peer_device(device)->connection->receiver.t_state != NONE)
2023	drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2024	first_peer_device(device)->connection->receiver.t_state);
2025
2026	device->al_writ_cnt =
2027	device->bm_writ_cnt =
2028	device->read_cnt =
2029	device->recv_cnt =
2030	device->send_cnt =
2031	device->writ_cnt =
2032	device->p_size =
2033	device->rs_start =
2034	device->rs_total =
2035	device->rs_failed = `0`;
2036	device->rs_last_events = `0`;
2037	device->rs_last_sect_ev = `0`;
2038	for (i = `0`; i < DRBD_SYNC_MARKS; i++) {
2039	device->rs_mark_left[i] = `0`;
2040	device->rs_mark_time[i] = `0`;
2041	}
2042	D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL);
2043
2044	set_capacity_and_notify(disk: device->vdisk, size: `0`);
2045	if (device->bitmap) {
2046	/ maybe never allocated. /
2047	drbd_bm_resize(device, sectors: `0`, set_new_bits: `1`);
2048	drbd_bm_cleanup(device);
2049	}
2050
2051	drbd_backing_dev_free(device, ldev: device->ldev);
2052	device->ldev = NULL;
2053
2054	clear_bit(nr: AL_SUSPENDED, addr: &device->flags);
2055
2056	D_ASSERT(device, list_empty(&device->active_ee));
2057	D_ASSERT(device, list_empty(&device->sync_ee));
2058	D_ASSERT(device, list_empty(&device->done_ee));
2059	D_ASSERT(device, list_empty(&device->read_ee));
2060	D_ASSERT(device, list_empty(&device->net_ee));
2061	D_ASSERT(device, list_empty(&device->resync_reads));
2062	D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
2063	D_ASSERT(device, list_empty(&device->resync_work.list));
2064	D_ASSERT(device, list_empty(&device->unplug_work.list));
2065
2066	drbd_set_defaults(device);
2067	}
2068
2069
2070	static void drbd_destroy_mempools(void)
2071	{
2072	struct page *page;
2073
2074	while (drbd_pp_pool) {
2075	page = drbd_pp_pool;
2076	drbd_pp_pool = (struct page *)page_private(page);
2077	__free_page(page);
2078	drbd_pp_vacant--;
2079	}
2080
2081	/ D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); /
2082
2083	bioset_exit(&drbd_io_bio_set);
2084	bioset_exit(&drbd_md_io_bio_set);
2085	mempool_exit(pool: &drbd_md_io_page_pool);
2086	mempool_exit(pool: &drbd_ee_mempool);
2087	mempool_exit(pool: &drbd_request_mempool);
2088	kmem_cache_destroy(s: drbd_ee_cache);
2089	kmem_cache_destroy(s: drbd_request_cache);
2090	kmem_cache_destroy(s: drbd_bm_ext_cache);
2091	kmem_cache_destroy(s: drbd_al_ext_cache);
2092
2093	drbd_ee_cache = NULL;
2094	drbd_request_cache = NULL;
2095	drbd_bm_ext_cache = NULL;
2096	drbd_al_ext_cache = NULL;
2097
2098	return;
2099	}
2100
2101	static int drbd_create_mempools(void)
2102	{
2103	struct page *page;
2104	const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
2105	int i, ret;
2106
2107	/ caches /
2108	drbd_request_cache = kmem_cache_create(
2109	name: "drbd_req", size: sizeof(struct drbd_request), align: `0`, flags: `0`, NULL);
2110	if (drbd_request_cache == NULL)
2111	goto Enomem;
2112
2113	drbd_ee_cache = kmem_cache_create(
2114	name: "drbd_ee", size: sizeof(struct drbd_peer_request), align: `0`, flags: `0`, NULL);
2115	if (drbd_ee_cache == NULL)
2116	goto Enomem;
2117
2118	drbd_bm_ext_cache = kmem_cache_create(
2119	name: "drbd_bm", size: sizeof(struct bm_extent), align: `0`, flags: `0`, NULL);
2120	if (drbd_bm_ext_cache == NULL)
2121	goto Enomem;
2122
2123	drbd_al_ext_cache = kmem_cache_create(
2124	name: "drbd_al", size: sizeof(struct lc_element), align: `0`, flags: `0`, NULL);
2125	if (drbd_al_ext_cache == NULL)
2126	goto Enomem;
2127
2128	/ mempools /
2129	ret = bioset_init(&drbd_io_bio_set, BIO_POOL_SIZE, `0`, flags: `0`);
2130	if (ret)
2131	goto Enomem;
2132
2133	ret = bioset_init(&drbd_md_io_bio_set, DRBD_MIN_POOL_PAGES, `0`,
2134	flags: BIOSET_NEED_BVECS);
2135	if (ret)
2136	goto Enomem;
2137
2138	ret = mempool_init_page_pool(pool: &drbd_md_io_page_pool, DRBD_MIN_POOL_PAGES, order: `0`);
2139	if (ret)
2140	goto Enomem;
2141
2142	ret = mempool_init_slab_pool(pool: &drbd_request_mempool, min_nr: number,
2143	kc: drbd_request_cache);
2144	if (ret)
2145	goto Enomem;
2146
2147	ret = mempool_init_slab_pool(pool: &drbd_ee_mempool, min_nr: number, kc: drbd_ee_cache);
2148	if (ret)
2149	goto Enomem;
2150
2151	for (i = `0`; i < number; i++) {
2152	page = alloc_page(GFP_HIGHUSER);
2153	if (!page)
2154	goto Enomem;
2155	set_page_private(page, private: (unsigned long)drbd_pp_pool);
2156	drbd_pp_pool = page;
2157	}
2158	drbd_pp_vacant = number;
2159
2160	return `0`;
2161
2162	Enomem:
2163	drbd_destroy_mempools(); / in case we allocated some /
2164	return -ENOMEM;
2165	}
2166
2167	static void drbd_release_all_peer_reqs(struct drbd_device *device)
2168	{
2169	int rr;
2170
2171	rr = drbd_free_peer_reqs(device, &device->active_ee);
2172	if (rr)
2173	drbd_err(device, "%d EEs in active list found!\n", rr);
2174
2175	rr = drbd_free_peer_reqs(device, &device->sync_ee);
2176	if (rr)
2177	drbd_err(device, "%d EEs in sync list found!\n", rr);
2178
2179	rr = drbd_free_peer_reqs(device, &device->read_ee);
2180	if (rr)
2181	drbd_err(device, "%d EEs in read list found!\n", rr);
2182
2183	rr = drbd_free_peer_reqs(device, &device->done_ee);
2184	if (rr)
2185	drbd_err(device, "%d EEs in done list found!\n", rr);
2186
2187	rr = drbd_free_peer_reqs(device, &device->net_ee);
2188	if (rr)
2189	drbd_err(device, "%d EEs in net list found!\n", rr);
2190	}
2191
2192	/ caution. no locking. /
2193	void drbd_destroy_device(struct kref *kref)
2194	{
2195	struct drbd_device device = container_of(kref, struct* drbd_device, kref);
2196	struct drbd_resource *resource = device->resource;
2197	struct drbd_peer_device peer_device, tmp_peer_device;
2198
2199	timer_shutdown_sync(timer: &device->request_timer);
2200
2201	/ paranoia asserts /
2202	D_ASSERT(device, device->open_cnt == `0`);
2203	/ end paranoia asserts /
2204
2205	/ cleanup stuff that may have been allocated during*
2206	* device (re-)configuration or state changes */
2207
2208	drbd_backing_dev_free(device, ldev: device->ldev);
2209	device->ldev = NULL;
2210
2211	drbd_release_all_peer_reqs(device);
2212
2213	lc_destroy(lc: device->act_log);
2214	lc_destroy(lc: device->resync);
2215
2216	kfree(objp: device->p_uuid);
2217	/ device->p_uuid = NULL; /
2218
2219	if (device->bitmap) / should no longer be there. /
2220	drbd_bm_cleanup(device);
2221	__free_page(device->md_io.page);
2222	put_disk(disk: device->vdisk);
2223	kfree(objp: device->rs_plan_s);
2224
2225	/ not for_each_connection(connection, resource):*
2226	* those may have been cleaned up and disassociated already.
2227	*/
2228	for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
2229	kref_put(kref: &peer_device->connection->kref, release: drbd_destroy_connection);
2230	kfree(objp: peer_device);
2231	}
2232	if (device->submit.wq)
2233	destroy_workqueue(wq: device->submit.wq);
2234	kfree(objp: device);
2235	kref_put(kref: &resource->kref, release: drbd_destroy_resource);
2236	}
2237
2238	/ One global retry thread, if we need to push back some bio and have it*
2239	* reinserted through our make request function.
2240	*/
2241	static struct retry_worker {
2242	struct workqueue_struct *wq;
2243	struct work_struct worker;
2244
2245	spinlock_t lock;
2246	struct list_head writes;
2247	} retry;
2248
2249	static void do_retry(struct work_struct *ws)
2250	{
2251	struct retry_worker retry = container_of(ws, struct* retry_worker, worker);
2252	LIST_HEAD(writes);
2253	struct drbd_request req, tmp;
2254
2255	spin_lock_irq(lock: &retry->lock);
2256	list_splice_init(list: &retry->writes, head: &writes);
2257	spin_unlock_irq(lock: &retry->lock);
2258
2259	list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
2260	struct drbd_device *device = req->device;
2261	struct bio *bio = req->master_bio;
2262	bool expected;
2263
2264	expected =
2265	expect(device, atomic_read(&req->completion_ref) == `0`) &&
2266	expect(device, req->rq_state & RQ_POSTPONED) &&
2267	expect(device, (req->rq_state & RQ_LOCAL_PENDING) == `0` \|\|
2268	(req->rq_state & RQ_LOCAL_ABORTED) != `0`);
2269
2270	if (!expected)
2271	drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n",
2272	req, atomic_read(&req->completion_ref),
2273	req->rq_state);
2274
2275	/ We still need to put one kref associated with the*
2276	* "completion_ref" going zero in the code path that queued it
2277	* here. The request object may still be referenced by a
2278	* frozen local req->private_bio, in case we force-detached.
2279	*/
2280	kref_put(kref: &req->kref, release: drbd_req_destroy);
2281
2282	/ A single suspended or otherwise blocking device may stall*
2283	* all others as well. Fortunately, this code path is to
2284	* recover from a situation that "should not happen":
2285	* concurrent writes in multi-primary setup.
2286	* In a "normal" lifecycle, this workqueue is supposed to be
2287	* destroyed without ever doing anything.
2288	* If it turns out to be an issue anyways, we can do per
2289	* resource (replication group) or per device (minor) retry
2290	* workqueues instead.
2291	*/
2292
2293	/ We are not just doing submit_bio_noacct(),*
2294	* as we want to keep the start_time information. */
2295	inc_ap_bio(device);
2296	__drbd_make_request(device, bio);
2297	}
2298	}
2299
2300	/ called via drbd_req_put_completion_ref(),*
2301	* holds resource->req_lock */
2302	void drbd_restart_request(struct drbd_request *req)
2303	{
2304	unsigned long flags;
2305	spin_lock_irqsave(&retry.lock, flags);
2306	list_move_tail(list: &req->tl_requests, head: &retry.writes);
2307	spin_unlock_irqrestore(lock: &retry.lock, flags);
2308
2309	/ Drop the extra reference that would otherwise*
2310	* have been dropped by complete_master_bio.
2311	* do_retry() needs to grab a new one. */
2312	dec_ap_bio(device: req->device);
2313
2314	queue_work(wq: retry.wq, work: &retry.worker);
2315	}
2316
2317	void drbd_destroy_resource(struct kref *kref)
2318	{
2319	struct drbd_resource *resource =
2320	container_of(kref, struct drbd_resource, kref);
2321
2322	idr_destroy(&resource->devices);
2323	free_cpumask_var(mask: resource->cpu_mask);
2324	kfree(objp: resource->name);
2325	kfree(objp: resource);
2326	}
2327
2328	void drbd_free_resource(struct drbd_resource *resource)
2329	{
2330	struct drbd_connection connection, tmp;
2331
2332	for_each_connection_safe(connection, tmp, resource) {
2333	list_del(entry: &connection->connections);
2334	drbd_debugfs_connection_cleanup(connection);
2335	kref_put(kref: &connection->kref, release: drbd_destroy_connection);
2336	}
2337	drbd_debugfs_resource_cleanup(resource);
2338	kref_put(kref: &resource->kref, release: drbd_destroy_resource);
2339	}
2340
2341	static void drbd_cleanup(void)
2342	{
2343	unsigned int i;
2344	struct drbd_device *device;
2345	struct drbd_resource resource, tmp;
2346
2347	/ first remove proc,*
2348	* drbdsetup uses it's presence to detect
2349	* whether DRBD is loaded.
2350	* If we would get stuck in proc removal,
2351	* but have netlink already deregistered,
2352	* some drbdsetup commands may wait forever
2353	* for an answer.
2354	*/
2355	if (drbd_proc)
2356	remove_proc_entry("drbd", NULL);
2357
2358	if (retry.wq)
2359	destroy_workqueue(wq: retry.wq);
2360
2361	drbd_genl_unregister();
2362
2363	idr_for_each_entry(&drbd_devices, device, i)
2364	drbd_delete_device(device);
2365
2366	/ not _rcu since, no other updater anymore. Genl already unregistered /
2367	for_each_resource_safe(resource, tmp, &drbd_resources) {
2368	list_del(entry: &resource->resources);
2369	drbd_free_resource(resource);
2370	}
2371
2372	drbd_debugfs_cleanup();
2373
2374	drbd_destroy_mempools();
2375	unregister_blkdev(DRBD_MAJOR, name: "drbd");
2376
2377	idr_destroy(&drbd_devices);
2378
2379	pr_info("module cleanup done.\n");
2380	}
2381
2382	static void drbd_init_workqueue(struct drbd_work_queue* wq)
2383	{
2384	spin_lock_init(&wq->q_lock);
2385	INIT_LIST_HEAD(list: &wq->q);
2386	init_waitqueue_head(&wq->q_wait);
2387	}
2388
2389	struct completion_work {
2390	struct drbd_work w;
2391	struct completion done;
2392	};
2393
2394	static int w_complete(struct drbd_work w, int* cancel)
2395	{
2396	struct completion_work *completion_work =
2397	container_of(w, struct completion_work, w);
2398
2399	complete(&completion_work->done);
2400	return `0`;
2401	}
2402
2403	void drbd_flush_workqueue(struct drbd_work_queue *work_queue)
2404	{
2405	struct completion_work completion_work;
2406
2407	completion_work.w.cb = w_complete;
2408	init_completion(x: &completion_work.done);
2409	drbd_queue_work(q: work_queue, w: &completion_work.w);
2410	wait_for_completion(&completion_work.done);
2411	}
2412
2413	struct drbd_resource drbd_find_resource(const* char *name)
2414	{
2415	struct drbd_resource *resource;
2416
2417	if (!name \|\| !name[`0`])
2418	return NULL;
2419
2420	rcu_read_lock();
2421	for_each_resource_rcu(resource, &drbd_resources) {
2422	if (!strcmp(resource->name, name)) {
2423	kref_get(kref: &resource->kref);
2424	goto found;
2425	}
2426	}
2427	resource = NULL;
2428	found:
2429	rcu_read_unlock();
2430	return resource;
2431	}
2432
2433	struct drbd_connection conn_get_by_addrs(void* my_addr, int* my_addr_len,
2434	void peer_addr, int* peer_addr_len)
2435	{
2436	struct drbd_resource *resource;
2437	struct drbd_connection *connection;
2438
2439	rcu_read_lock();
2440	for_each_resource_rcu(resource, &drbd_resources) {
2441	for_each_connection_rcu(connection, resource) {
2442	if (connection->my_addr_len == my_addr_len &&
2443	connection->peer_addr_len == peer_addr_len &&
2444	!memcmp(p: &connection->my_addr, q: my_addr, size: my_addr_len) &&
2445	!memcmp(p: &connection->peer_addr, q: peer_addr, size: peer_addr_len)) {
2446	kref_get(kref: &connection->kref);
2447	goto found;
2448	}
2449	}
2450	}
2451	connection = NULL;
2452	found:
2453	rcu_read_unlock();
2454	return connection;
2455	}
2456
2457	static int drbd_alloc_socket(struct drbd_socket *socket)
2458	{
2459	socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
2460	if (!socket->rbuf)
2461	return -ENOMEM;
2462	socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
2463	if (!socket->sbuf)
2464	return -ENOMEM;
2465	return `0`;
2466	}
2467
2468	static void drbd_free_socket(struct drbd_socket *socket)
2469	{
2470	free_page((unsigned long) socket->sbuf);
2471	free_page((unsigned long) socket->rbuf);
2472	}
2473
2474	void conn_free_crypto(struct drbd_connection *connection)
2475	{
2476	drbd_free_sock(connection);
2477
2478	crypto_free_shash(tfm: connection->csums_tfm);
2479	crypto_free_shash(tfm: connection->verify_tfm);
2480	crypto_free_shash(tfm: connection->cram_hmac_tfm);
2481	crypto_free_shash(tfm: connection->integrity_tfm);
2482	crypto_free_shash(tfm: connection->peer_integrity_tfm);
2483	kfree(objp: connection->int_dig_in);
2484	kfree(objp: connection->int_dig_vv);
2485
2486	connection->csums_tfm = NULL;
2487	connection->verify_tfm = NULL;
2488	connection->cram_hmac_tfm = NULL;
2489	connection->integrity_tfm = NULL;
2490	connection->peer_integrity_tfm = NULL;
2491	connection->int_dig_in = NULL;
2492	connection->int_dig_vv = NULL;
2493	}
2494
2495	int set_resource_options(struct drbd_resource resource, struct* res_opts *res_opts)
2496	{
2497	struct drbd_connection *connection;
2498	cpumask_var_t new_cpu_mask;
2499	int err;
2500
2501	if (!zalloc_cpumask_var(mask: &new_cpu_mask, GFP_KERNEL))
2502	return -ENOMEM;
2503
2504	/ silently ignore cpu mask on UP kernel /
2505	if (nr_cpu_ids > `1` && res_opts->cpu_mask[`0`] != `0`) {
2506	err = bitmap_parse(buf: res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
2507	cpumask_bits(new_cpu_mask), nbits: nr_cpu_ids);
2508	if (err == -EOVERFLOW) {
2509	/ So what. mask it out. /
2510	cpumask_var_t tmp_cpu_mask;
2511	if (zalloc_cpumask_var(mask: &tmp_cpu_mask, GFP_KERNEL)) {
2512	cpumask_setall(dstp: tmp_cpu_mask);
2513	cpumask_and(dstp: new_cpu_mask, src1p: new_cpu_mask, src2p: tmp_cpu_mask);
2514	drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
2515	res_opts->cpu_mask,
2516	strlen(res_opts->cpu_mask) > `12` ? "..." : "",
2517	nr_cpu_ids);
2518	free_cpumask_var(mask: tmp_cpu_mask);
2519	err = `0`;
2520	}
2521	}
2522	if (err) {
2523	drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
2524	/ retcode = ERR_CPU_MASK_PARSE; /
2525	goto fail;
2526	}
2527	}
2528	resource->res_opts = *res_opts;
2529	if (cpumask_empty(srcp: new_cpu_mask))
2530	drbd_calc_cpu_mask(cpu_mask: &new_cpu_mask);
2531	if (!cpumask_equal(src1p: resource->cpu_mask, src2p: new_cpu_mask)) {
2532	cpumask_copy(dstp: resource->cpu_mask, srcp: new_cpu_mask);
2533	for_each_connection_rcu(connection, resource) {
2534	connection->receiver.reset_cpu_mask = `1`;
2535	connection->ack_receiver.reset_cpu_mask = `1`;
2536	connection->worker.reset_cpu_mask = `1`;
2537	}
2538	}
2539	err = `0`;
2540
2541	fail:
2542	free_cpumask_var(mask: new_cpu_mask);
2543	return err;
2544
2545	}
2546
2547	struct drbd_resource drbd_create_resource(const* char *name)
2548	{
2549	struct drbd_resource *resource;
2550
2551	resource = kzalloc(size: sizeof(struct drbd_resource), GFP_KERNEL);
2552	if (!resource)
2553	goto fail;
2554	resource->name = kstrdup(s: name, GFP_KERNEL);
2555	if (!resource->name)
2556	goto fail_free_resource;
2557	if (!zalloc_cpumask_var(mask: &resource->cpu_mask, GFP_KERNEL))
2558	goto fail_free_name;
2559	kref_init(kref: &resource->kref);
2560	idr_init(idr: &resource->devices);
2561	INIT_LIST_HEAD(list: &resource->connections);
2562	resource->write_ordering = WO_BDEV_FLUSH;
2563	list_add_tail_rcu(new: &resource->resources, head: &drbd_resources);
2564	mutex_init(&resource->conf_update);
2565	mutex_init(&resource->adm_mutex);
2566	spin_lock_init(&resource->req_lock);
2567	drbd_debugfs_resource_add(resource);
2568	return resource;
2569
2570	fail_free_name:
2571	kfree(objp: resource->name);
2572	fail_free_resource:
2573	kfree(objp: resource);
2574	fail:
2575	return NULL;
2576	}
2577
2578	/ caller must be under adm_mutex /
2579	struct drbd_connection conn_create(const* char name, struct* res_opts *res_opts)
2580	{
2581	struct drbd_resource *resource;
2582	struct drbd_connection *connection;
2583
2584	connection = kzalloc(size: sizeof(struct drbd_connection), GFP_KERNEL);
2585	if (!connection)
2586	return NULL;
2587
2588	if (drbd_alloc_socket(socket: &connection->data))
2589	goto fail;
2590	if (drbd_alloc_socket(socket: &connection->meta))
2591	goto fail;
2592
2593	connection->current_epoch = kzalloc(size: sizeof(struct drbd_epoch), GFP_KERNEL);
2594	if (!connection->current_epoch)
2595	goto fail;
2596
2597	INIT_LIST_HEAD(list: &connection->transfer_log);
2598
2599	INIT_LIST_HEAD(list: &connection->current_epoch->list);
2600	connection->epochs = `1`;
2601	spin_lock_init(&connection->epoch_lock);
2602
2603	connection->send.seen_any_write_yet = false;
2604	connection->send.current_epoch_nr = `0`;
2605	connection->send.current_epoch_writes = `0`;
2606
2607	resource = drbd_create_resource(name);
2608	if (!resource)
2609	goto fail;
2610
2611	connection->cstate = C_STANDALONE;
2612	mutex_init(&connection->cstate_mutex);
2613	init_waitqueue_head(&connection->ping_wait);
2614	idr_init(idr: &connection->peer_devices);
2615
2616	drbd_init_workqueue(wq: &connection->sender_work);
2617	mutex_init(&connection->data.mutex);
2618	mutex_init(&connection->meta.mutex);
2619
2620	drbd_thread_init(resource, thi: &connection->receiver, func: drbd_receiver, name: "receiver");
2621	connection->receiver.connection = connection;
2622	drbd_thread_init(resource, thi: &connection->worker, func: drbd_worker, name: "worker");
2623	connection->worker.connection = connection;
2624	drbd_thread_init(resource, thi: &connection->ack_receiver, func: drbd_ack_receiver, name: "ack_recv");
2625	connection->ack_receiver.connection = connection;
2626
2627	kref_init(kref: &connection->kref);
2628
2629	connection->resource = resource;
2630
2631	if (set_resource_options(resource, res_opts))
2632	goto fail_resource;
2633
2634	kref_get(kref: &resource->kref);
2635	list_add_tail_rcu(new: &connection->connections, head: &resource->connections);
2636	drbd_debugfs_connection_add(connection);
2637	return connection;
2638
2639	fail_resource:
2640	list_del(entry: &resource->resources);
2641	drbd_free_resource(resource);
2642	fail:
2643	kfree(objp: connection->current_epoch);
2644	drbd_free_socket(socket: &connection->meta);
2645	drbd_free_socket(socket: &connection->data);
2646	kfree(objp: connection);
2647	return NULL;
2648	}
2649
2650	void drbd_destroy_connection(struct kref *kref)
2651	{
2652	struct drbd_connection connection = container_of(kref, struct* drbd_connection, kref);
2653	struct drbd_resource *resource = connection->resource;
2654
2655	if (atomic_read(v: &connection->current_epoch->epoch_size) != `0`)
2656	drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size));
2657	kfree(objp: connection->current_epoch);
2658
2659	idr_destroy(&connection->peer_devices);
2660
2661	drbd_free_socket(socket: &connection->meta);
2662	drbd_free_socket(socket: &connection->data);
2663	kfree(objp: connection->int_dig_in);
2664	kfree(objp: connection->int_dig_vv);
2665	kfree(objp: connection);
2666	kref_put(kref: &resource->kref, release: drbd_destroy_resource);
2667	}
2668
2669	static int init_submitter(struct drbd_device *device)
2670	{
2671	/ opencoded create_singlethread_workqueue(),*
2672	* to be able to say "drbd%d", ..., minor */
2673	device->submit.wq =
2674	alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor);
2675	if (!device->submit.wq)
2676	return -ENOMEM;
2677
2678	INIT_WORK(&device->submit.worker, do_submit);
2679	INIT_LIST_HEAD(list: &device->submit.writes);
2680	return `0`;
2681	}
2682
2683	enum drbd_ret_code drbd_create_device(struct drbd_config_context adm_ctx, unsigned* int minor)
2684	{
2685	struct drbd_resource *resource = adm_ctx->resource;
2686	struct drbd_connection connection, n;
2687	struct drbd_device *device;
2688	struct drbd_peer_device peer_device, tmp_peer_device;
2689	struct gendisk *disk;
2690	int id;
2691	int vnr = adm_ctx->volume;
2692	enum drbd_ret_code err = ERR_NOMEM;
2693	struct queue_limits lim = {
2694	/*
2695	* Setting the max_hw_sectors to an odd value of 8kibyte here.
2696	* This triggers a max_bio_size message upon first attach or
2697	* connect.
2698	*/
2699	.max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> `8`,
2700	};
2701
2702	device = minor_to_device(minor);
2703	if (device)
2704	return ERR_MINOR_OR_VOLUME_EXISTS;
2705
2706	/ GFP_KERNEL, we are outside of all write-out paths /
2707	device = kzalloc(size: sizeof(struct drbd_device), GFP_KERNEL);
2708	if (!device)
2709	return ERR_NOMEM;
2710	kref_init(kref: &device->kref);
2711
2712	kref_get(kref: &resource->kref);
2713	device->resource = resource;
2714	device->minor = minor;
2715	device->vnr = vnr;
2716
2717	drbd_init_set_defaults(device);
2718
2719	disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
2720	if (IS_ERR(ptr: disk)) {
2721	err = PTR_ERR(ptr: disk);
2722	goto out_no_disk;
2723	}
2724
2725	device->vdisk = disk;
2726	device->rq_queue = disk->queue;
2727
2728	set_disk_ro(disk, read_only: true);
2729
2730	disk->major = DRBD_MAJOR;
2731	disk->first_minor = minor;
2732	disk->minors = `1`;
2733	disk->fops = &drbd_ops;
2734	disk->flags \|= GENHD_FL_NO_PART;
2735	sprintf(buf: disk->disk_name, fmt: "drbd%d", minor);
2736	disk->private_data = device;
2737
2738	blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q: disk->queue);
2739	blk_queue_write_cache(q: disk->queue, enabled: true, fua: true);
2740
2741	device->md_io.page = alloc_page(GFP_KERNEL);
2742	if (!device->md_io.page)
2743	goto out_no_io_page;
2744
2745	if (drbd_bm_init(device))
2746	goto out_no_bitmap;
2747	device->read_requests = RB_ROOT;
2748	device->write_requests = RB_ROOT;
2749
2750	id = idr_alloc(&drbd_devices, ptr: device, start: minor, end: minor + `1`, GFP_KERNEL);
2751	if (id < `0`) {
2752	if (id == -ENOSPC)
2753	err = ERR_MINOR_OR_VOLUME_EXISTS;
2754	goto out_no_minor_idr;
2755	}
2756	kref_get(kref: &device->kref);
2757
2758	id = idr_alloc(&resource->devices, ptr: device, start: vnr, end: vnr + `1`, GFP_KERNEL);
2759	if (id < `0`) {
2760	if (id == -ENOSPC)
2761	err = ERR_MINOR_OR_VOLUME_EXISTS;
2762	goto out_idr_remove_minor;
2763	}
2764	kref_get(kref: &device->kref);
2765
2766	INIT_LIST_HEAD(list: &device->peer_devices);
2767	INIT_LIST_HEAD(list: &device->pending_bitmap_io);
2768	for_each_connection(connection, resource) {
2769	peer_device = kzalloc(size: sizeof(struct drbd_peer_device), GFP_KERNEL);
2770	if (!peer_device)
2771	goto out_idr_remove_from_resource;
2772	peer_device->connection = connection;
2773	peer_device->device = device;
2774
2775	list_add(new: &peer_device->peer_devices, head: &device->peer_devices);
2776	kref_get(kref: &device->kref);
2777
2778	id = idr_alloc(&connection->peer_devices, ptr: peer_device, start: vnr, end: vnr + `1`, GFP_KERNEL);
2779	if (id < `0`) {
2780	if (id == -ENOSPC)
2781	err = ERR_INVALID_REQUEST;
2782	goto out_idr_remove_from_resource;
2783	}
2784	kref_get(kref: &connection->kref);
2785	INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf);
2786	}
2787
2788	if (init_submitter(device)) {
2789	err = ERR_NOMEM;
2790	goto out_idr_remove_from_resource;
2791	}
2792
2793	err = add_disk(disk);
2794	if (err)
2795	goto out_destroy_workqueue;
2796
2797	/ inherit the connection state /
2798	device->state.conn = first_connection(resource)->cstate;
2799	if (device->state.conn == C_WF_REPORT_PARAMS) {
2800	for_each_peer_device(peer_device, device)
2801	drbd_connected(peer_device);
2802	}
2803	/ move to create_peer_device() /
2804	for_each_peer_device(peer_device, device)
2805	drbd_debugfs_peer_device_add(peer_device);
2806	drbd_debugfs_device_add(device);
2807	return NO_ERROR;
2808
2809	out_destroy_workqueue:
2810	destroy_workqueue(wq: device->submit.wq);
2811	out_idr_remove_from_resource:
2812	for_each_connection_safe(connection, n, resource) {
2813	peer_device = idr_remove(&connection->peer_devices, id: vnr);
2814	if (peer_device)
2815	kref_put(kref: &connection->kref, release: drbd_destroy_connection);
2816	}
2817	for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
2818	list_del(entry: &peer_device->peer_devices);
2819	kfree(objp: peer_device);
2820	}
2821	idr_remove(&resource->devices, id: vnr);
2822	out_idr_remove_minor:
2823	idr_remove(&drbd_devices, id: minor);
2824	synchronize_rcu();
2825	out_no_minor_idr:
2826	drbd_bm_cleanup(device);
2827	out_no_bitmap:
2828	__free_page(device->md_io.page);
2829	out_no_io_page:
2830	put_disk(disk);
2831	out_no_disk:
2832	kref_put(kref: &resource->kref, release: drbd_destroy_resource);
2833	kfree(objp: device);
2834	return err;
2835	}
2836
2837	void drbd_delete_device(struct drbd_device *device)
2838	{
2839	struct drbd_resource *resource = device->resource;
2840	struct drbd_connection *connection;
2841	struct drbd_peer_device *peer_device;
2842
2843	/ move to free_peer_device() /
2844	for_each_peer_device(peer_device, device)
2845	drbd_debugfs_peer_device_cleanup(peer_device);
2846	drbd_debugfs_device_cleanup(device);
2847	for_each_connection(connection, resource) {
2848	idr_remove(&connection->peer_devices, id: device->vnr);
2849	kref_put(kref: &device->kref, release: drbd_destroy_device);
2850	}
2851	idr_remove(&resource->devices, id: device->vnr);
2852	kref_put(kref: &device->kref, release: drbd_destroy_device);
2853	idr_remove(&drbd_devices, id: device_to_minor(device));
2854	kref_put(kref: &device->kref, release: drbd_destroy_device);
2855	del_gendisk(gp: device->vdisk);
2856	synchronize_rcu();
2857	kref_put(kref: &device->kref, release: drbd_destroy_device);
2858	}
2859
2860	static int __init drbd_init(void)
2861	{
2862	int err;
2863
2864	if (drbd_minor_count < DRBD_MINOR_COUNT_MIN \|\| drbd_minor_count > DRBD_MINOR_COUNT_MAX) {
2865	pr_err("invalid minor_count (%d)\n", drbd_minor_count);
2866	#ifdef MODULE
2867	return -EINVAL;
2868	#else
2869	drbd_minor_count = DRBD_MINOR_COUNT_DEF;
2870	#endif
2871	}
2872
2873	err = register_blkdev(DRBD_MAJOR, "drbd");
2874	if (err) {
2875	pr_err("unable to register block device major %d\n",
2876	DRBD_MAJOR);
2877	return err;
2878	}
2879
2880	/*
2881	* allocate all necessary structs
2882	*/
2883	init_waitqueue_head(&drbd_pp_wait);
2884
2885	drbd_proc = NULL; / play safe for drbd_cleanup /
2886	idr_init(idr: &drbd_devices);
2887
2888	mutex_init(&resources_mutex);
2889	INIT_LIST_HEAD(list: &drbd_resources);
2890
2891	err = drbd_genl_register();
2892	if (err) {
2893	pr_err("unable to register generic netlink family\n");
2894	goto fail;
2895	}
2896
2897	err = drbd_create_mempools();
2898	if (err)
2899	goto fail;
2900
2901	err = -ENOMEM;
2902	drbd_proc = proc_create_single("drbd", S_IFREG \| `0444` , NULL, drbd_seq_show);
2903	if (!drbd_proc) {
2904	pr_err("unable to register proc file\n");
2905	goto fail;
2906	}
2907
2908	retry.wq = create_singlethread_workqueue("drbd-reissue");
2909	if (!retry.wq) {
2910	pr_err("unable to create retry workqueue\n");
2911	goto fail;
2912	}
2913	INIT_WORK(&retry.worker, do_retry);
2914	spin_lock_init(&retry.lock);
2915	INIT_LIST_HEAD(list: &retry.writes);
2916
2917	drbd_debugfs_init();
2918
2919	pr_info("initialized. "
2920	"Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
2921	GENL_MAGIC_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
2922	pr_info("%s\n", drbd_buildtag());
2923	pr_info("registered as block device major %d\n", DRBD_MAJOR);
2924	return `0`; / Success! /
2925
2926	fail:
2927	drbd_cleanup();
2928	if (err == -ENOMEM)
2929	pr_err("ran out of memory\n");
2930	else
2931	pr_err("initialization failure\n");
2932	return err;
2933	}
2934
2935	static void drbd_free_one_sock(struct drbd_socket *ds)
2936	{
2937	struct socket *s;
2938	mutex_lock(&ds->mutex);
2939	s = ds->socket;
2940	ds->socket = NULL;
2941	mutex_unlock(lock: &ds->mutex);
2942	if (s) {
2943	/ so debugfs does not need to mutex_lock() /
2944	synchronize_rcu();
2945	kernel_sock_shutdown(sock: s, how: SHUT_RDWR);
2946	sock_release(sock: s);
2947	}
2948	}
2949
2950	void drbd_free_sock(struct drbd_connection *connection)
2951	{
2952	if (connection->data.socket)
2953	drbd_free_one_sock(ds: &connection->data);
2954	if (connection->meta.socket)
2955	drbd_free_one_sock(ds: &connection->meta);
2956	}
2957
2958	/ meta data management /
2959
2960	void conn_md_sync(struct drbd_connection *connection)
2961	{
2962	struct drbd_peer_device *peer_device;
2963	int vnr;
2964
2965	rcu_read_lock();
2966	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
2967	struct drbd_device *device = peer_device->device;
2968
2969	kref_get(kref: &device->kref);
2970	rcu_read_unlock();
2971	drbd_md_sync(device);
2972	kref_put(kref: &device->kref, release: drbd_destroy_device);
2973	rcu_read_lock();
2974	}
2975	rcu_read_unlock();
2976	}
2977
2978	/ aligned 4kByte /
2979	struct meta_data_on_disk {
2980	u64 la_size_sect; / last agreed size. /
2981	u64 uuid[UI_SIZE]; / UUIDs. /
2982	u64 device_uuid;
2983	u64 reserved_u64_1;
2984	u32 flags; / MDF /
2985	u32 magic;
2986	u32 md_size_sect;
2987	u32 al_offset; / offset to this block /
2988	u32 al_nr_extents; / important for restoring the AL (userspace) /
2989	/ `-- act_log->nr_elements <-- ldev->dc.al_extents /
2990	u32 bm_offset; / offset to the bitmap, from here /
2991	u32 bm_bytes_per_bit; / BM_BLOCK_SIZE /
2992	u32 la_peer_max_bio_size; / last peer max_bio_size /
2993
2994	/ see al_tr_number_to_on_disk_sector() /
2995	u32 al_stripes;
2996	u32 al_stripe_size_4k;
2997
2998	u8 reserved_u8[`4096` - (`7``8` + `10``4`)];
2999	} __packed;
3000
3001
3002
3003	void drbd_md_write(struct drbd_device device, void* *b)
3004	{
3005	struct meta_data_on_disk *buffer = b;
3006	sector_t sector;
3007	int i;
3008
3009	memset(buffer, `0`, sizeof(*buffer));
3010
3011	buffer->la_size_sect = cpu_to_be64(get_capacity(device->vdisk));
3012	for (i = UI_CURRENT; i < UI_SIZE; i++)
3013	buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
3014	buffer->flags = cpu_to_be32(device->ldev->md.flags);
3015	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
3016
3017	buffer->md_size_sect = cpu_to_be32(device->ldev->md.md_size_sect);
3018	buffer->al_offset = cpu_to_be32(device->ldev->md.al_offset);
3019	buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements);
3020	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3021	buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid);
3022
3023	buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset);
3024	buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size);
3025
3026	buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes);
3027	buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k);
3028
3029	D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset);
3030	sector = device->ldev->md.md_offset;
3031
3032	if (drbd_md_sync_page_io(device, bdev: device->ldev, sector, op: REQ_OP_WRITE)) {
3033	/ this was a try anyways ... /
3034	drbd_err(device, "meta data update failed!\n");
3035	drbd_chk_io_error(device, `1`, DRBD_META_IO_ERROR);
3036	}
3037	}
3038
3039	/**
3040	* drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3041	* @device: DRBD device.
3042	*/
3043	void drbd_md_sync(struct drbd_device *device)
3044	{
3045	struct meta_data_on_disk *buffer;
3046
3047	/ Don't accidentally change the DRBD meta data layout. /
3048	BUILD_BUG_ON(UI_SIZE != `4`);
3049	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != `4096`);
3050
3051	del_timer(timer: &device->md_sync_timer);
3052	/ timer may be rearmed by drbd_md_mark_dirty() now. /
3053	if (!test_and_clear_bit(nr: MD_DIRTY, addr: &device->flags))
3054	return;
3055
3056	/ We use here D_FAILED and not D_ATTACHING because we try to write*
3057	* metadata even if we detach due to a disk failure! */
3058	if (!get_ldev_if_state(device, D_FAILED))
3059	return;
3060
3061	buffer = drbd_md_get_buffer(device, intent: __func__);
3062	if (!buffer)
3063	goto out;
3064
3065	drbd_md_write(device, b: buffer);
3066
3067	/ Update device->ldev->md.la_size_sect,*
3068	* since we updated it on metadata. */
3069	device->ldev->md.la_size_sect = get_capacity(disk: device->vdisk);
3070
3071	drbd_md_put_buffer(device);
3072	out:
3073	put_ldev(device);
3074	}
3075
3076	static int check_activity_log_stripe_size(struct drbd_device *device,
3077	struct meta_data_on_disk *on_disk,
3078	struct drbd_md *in_core)
3079	{
3080	u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
3081	u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
3082	u64 al_size_4k;
3083
3084	/ both not set: default to old fixed size activity log /
3085	if (al_stripes == `0` && al_stripe_size_4k == `0`) {
3086	al_stripes = `1`;
3087	al_stripe_size_4k = MD_32kB_SECT/`8`;
3088	}
3089
3090	/ some paranoia plausibility checks /
3091
3092	/ we need both values to be set /
3093	if (al_stripes == `0` \|\| al_stripe_size_4k == `0`)
3094	goto err;
3095
3096	al_size_4k = (u64)al_stripes * al_stripe_size_4k;
3097
3098	/ Upper limit of activity log area, to avoid potential overflow*
3099	* problems in al_tr_number_to_on_disk_sector(). As right now, more
3100	* than 72 * 4k blocks total only increases the amount of history,
3101	* limiting this arbitrarily to 16 GB is not a real limitation ;-) */
3102	if (al_size_4k > (`16` * `1024` * `1024`/`4`))
3103	goto err;
3104
3105	/ Lower limit: we need at least 8 transaction slots (32kB)*
3106	* to not break existing setups */
3107	if (al_size_4k < MD_32kB_SECT/`8`)
3108	goto err;
3109
3110	in_core->al_stripe_size_4k = al_stripe_size_4k;
3111	in_core->al_stripes = al_stripes;
3112	in_core->al_size_4k = al_size_4k;
3113
3114	return `0`;
3115	err:
3116	drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
3117	al_stripes, al_stripe_size_4k);
3118	return -EINVAL;
3119	}
3120
3121	static int check_offsets_and_sizes(struct drbd_device device, struct* drbd_backing_dev *bdev)
3122	{
3123	sector_t capacity = drbd_get_capacity(bdev: bdev->md_bdev);
3124	struct drbd_md *in_core = &bdev->md;
3125	s32 on_disk_al_sect;
3126	s32 on_disk_bm_sect;
3127
3128	/ The on-disk size of the activity log, calculated from offsets, and*
3129	* the size of the activity log calculated from the stripe settings,
3130	* should match.
3131	* Though we could relax this a bit: it is ok, if the striped activity log
3132	* fits in the available on-disk activity log size.
3133	* Right now, that would break how resize is implemented.
3134	* TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
3135	* of possible unused padding space in the on disk layout. */
3136	if (in_core->al_offset < `0`) {
3137	if (in_core->bm_offset > in_core->al_offset)
3138	goto err;
3139	on_disk_al_sect = -in_core->al_offset;
3140	on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
3141	} else {
3142	if (in_core->al_offset != MD_4kB_SECT)
3143	goto err;
3144	if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
3145	goto err;
3146
3147	on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
3148	on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
3149	}
3150
3151	/ old fixed size meta data is exactly that: fixed. /
3152	if (in_core->meta_dev_idx >= `0`) {
3153	if (in_core->md_size_sect != MD_128MB_SECT
3154	\|\| in_core->al_offset != MD_4kB_SECT
3155	\|\| in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
3156	\|\| in_core->al_stripes != `1`
3157	\|\| in_core->al_stripe_size_4k != MD_32kB_SECT/`8`)
3158	goto err;
3159	}
3160
3161	if (capacity < in_core->md_size_sect)
3162	goto err;
3163	if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
3164	goto err;
3165
3166	/ should be aligned, and at least 32k /
3167	if ((on_disk_al_sect & `7`) \|\| (on_disk_al_sect < MD_32kB_SECT))
3168	goto err;
3169
3170	/ should fit (for now: exactly) into the available on-disk space;*
3171	* overflow prevention is in check_activity_log_stripe_size() above. */
3172	if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
3173	goto err;
3174
3175	/ again, should be aligned /
3176	if (in_core->bm_offset & `7`)
3177	goto err;
3178
3179	/ FIXME check for device grow with flex external meta data? /
3180
3181	/ can the available bitmap space cover the last agreed device size? /
3182	if (on_disk_bm_sect < (in_core->la_size_sect+`7`)/MD_4kB_SECT/`8`/`512`)
3183	goto err;
3184
3185	return `0`;
3186
3187	err:
3188	drbd_err(device, "meta data offsets don't make sense: idx=%d "
3189	"al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
3190	"md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
3191	in_core->meta_dev_idx,
3192	in_core->al_stripes, in_core->al_stripe_size_4k,
3193	in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
3194	(unsigned long long)in_core->la_size_sect,
3195	(unsigned long long)capacity);
3196
3197	return -EINVAL;
3198	}
3199
3200
3201	/**
3202	* drbd_md_read() - Reads in the meta data super block
3203	* @device: DRBD device.
3204	* @bdev: Device from which the meta data should be read in.
3205	*
3206	* Return NO_ERROR on success, and an enum drbd_ret_code in case
3207	* something goes wrong.
3208	*
3209	* Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
3210	* even before @bdev is assigned to @device->ldev.
3211	*/
3212	int drbd_md_read(struct drbd_device device, struct* drbd_backing_dev *bdev)
3213	{
3214	struct meta_data_on_disk *buffer;
3215	u32 magic, flags;
3216	int i, rv = NO_ERROR;
3217
3218	if (device->state.disk != D_DISKLESS)
3219	return ERR_DISK_CONFIGURED;
3220
3221	buffer = drbd_md_get_buffer(device, intent: __func__);
3222	if (!buffer)
3223	return ERR_NOMEM;
3224
3225	/ First, figure out where our meta data superblock is located,*
3226	* and read it. */
3227	bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
3228	bdev->md.md_offset = drbd_md_ss(bdev);
3229	/ Even for (flexible or indexed) external meta data,*
3230	* initially restrict us to the 4k superblock for now.
3231	* Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */
3232	bdev->md.md_size_sect = `8`;
3233
3234	if (drbd_md_sync_page_io(device, bdev, sector: bdev->md.md_offset,
3235	op: REQ_OP_READ)) {
3236	/ NOTE: can't do normal error processing here as this is*
3237	called BEFORE disk is attached /*
3238	drbd_err(device, "Error while reading metadata.\n");
3239	rv = ERR_IO_MD_DISK;
3240	goto err;
3241	}
3242
3243	magic = be32_to_cpu(buffer->magic);
3244	flags = be32_to_cpu(buffer->flags);
3245	if (magic == DRBD_MD_MAGIC_84_UNCLEAN \|\|
3246	(magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
3247	/ btw: that's Activity Log clean, not "all" clean. /
3248	drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
3249	rv = ERR_MD_UNCLEAN;
3250	goto err;
3251	}
3252
3253	rv = ERR_MD_INVALID;
3254	if (magic != DRBD_MD_MAGIC_08) {
3255	if (magic == DRBD_MD_MAGIC_07)
3256	drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
3257	else
3258	drbd_err(device, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
3259	goto err;
3260	}
3261
3262	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3263	drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3264	be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3265	goto err;
3266	}
3267
3268
3269	/ convert to in_core endian /
3270	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
3271	for (i = UI_CURRENT; i < UI_SIZE; i++)
3272	bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3273	bdev->md.flags = be32_to_cpu(buffer->flags);
3274	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3275
3276	bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
3277	bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
3278	bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
3279
3280	if (check_activity_log_stripe_size(device, on_disk: buffer, in_core: &bdev->md))
3281	goto err;
3282	if (check_offsets_and_sizes(device, bdev))
3283	goto err;
3284
3285	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3286	drbd_err(device, "unexpected bm_offset: %d (expected %d)\n",
3287	be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3288	goto err;
3289	}
3290	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3291	drbd_err(device, "unexpected md_size: %u (expected %u)\n",
3292	be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3293	goto err;
3294	}
3295
3296	rv = NO_ERROR;
3297
3298	spin_lock_irq(lock: &device->resource->req_lock);
3299	if (device->state.conn < C_CONNECTED) {
3300	unsigned int peer;
3301	peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3302	peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
3303	device->peer_max_bio_size = peer;
3304	}
3305	spin_unlock_irq(lock: &device->resource->req_lock);
3306
3307	err:
3308	drbd_md_put_buffer(device);
3309
3310	return rv;
3311	}
3312
3313	/**
3314	* drbd_md_mark_dirty() - Mark meta data super block as dirty
3315	* @device: DRBD device.
3316	*
3317	* Call this function if you change anything that should be written to
3318	* the meta-data super block. This function sets MD_DIRTY, and starts a
3319	* timer that ensures that within five seconds you have to call drbd_md_sync().
3320	*/
3321	void drbd_md_mark_dirty(struct drbd_device *device)
3322	{
3323	if (!test_and_set_bit(nr: MD_DIRTY, addr: &device->flags))
3324	mod_timer(timer: &device->md_sync_timer, expires: jiffies + `5`*HZ);
3325	}
3326
3327	void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local)
3328	{
3329	int i;
3330
3331	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3332	device->ldev->md.uuid[i+`1`] = device->ldev->md.uuid[i];
3333	}
3334
3335	void __drbd_uuid_set(struct drbd_device device, int* idx, u64 val) __must_hold(local)
3336	{
3337	if (idx == UI_CURRENT) {
3338	if (device->state.role == R_PRIMARY)
3339	val \|= `1`;
3340	else
3341	val &= ~((u64)`1`);
3342
3343	drbd_set_ed_uuid(device, val);
3344	}
3345
3346	device->ldev->md.uuid[idx] = val;
3347	drbd_md_mark_dirty(device);
3348	}
3349
3350	void _drbd_uuid_set(struct drbd_device device, int* idx, u64 val) __must_hold(local)
3351	{
3352	unsigned long flags;
3353	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
3354	__drbd_uuid_set(device, idx, val);
3355	spin_unlock_irqrestore(lock: &device->ldev->md.uuid_lock, flags);
3356	}
3357
3358	void drbd_uuid_set(struct drbd_device device, int* idx, u64 val) __must_hold(local)
3359	{
3360	unsigned long flags;
3361	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
3362	if (device->ldev->md.uuid[idx]) {
3363	drbd_uuid_move_history(device);
3364	device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx];
3365	}
3366	__drbd_uuid_set(device, idx, val);
3367	spin_unlock_irqrestore(lock: &device->ldev->md.uuid_lock, flags);
3368	}
3369
3370	/**
3371	* drbd_uuid_new_current() - Creates a new current UUID
3372	* @device: DRBD device.
3373	*
3374	* Creates a new current UUID, and rotates the old current UUID into
3375	* the bitmap slot. Causes an incremental resync upon next connect.
3376	*/
3377	void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local)
3378	{
3379	u64 val;
3380	unsigned long long bm_uuid;
3381
3382	get_random_bytes(buf: &val, len: sizeof(u64));
3383
3384	spin_lock_irq(lock: &device->ldev->md.uuid_lock);
3385	bm_uuid = device->ldev->md.uuid[UI_BITMAP];
3386
3387	if (bm_uuid)
3388	drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
3389
3390	device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT];
3391	__drbd_uuid_set(device, idx: UI_CURRENT, val);
3392	spin_unlock_irq(lock: &device->ldev->md.uuid_lock);
3393
3394	drbd_print_uuids(device, text: "new current UUID");
3395	/ get it to stable storage _now_ /
3396	drbd_md_sync(device);
3397	}
3398
3399	void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
3400	{
3401	unsigned long flags;
3402	if (device->ldev->md.uuid[UI_BITMAP] == `0` && val == `0`)
3403	return;
3404
3405	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
3406	if (val == `0`) {
3407	drbd_uuid_move_history(device);
3408	device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
3409	device->ldev->md.uuid[UI_BITMAP] = `0`;
3410	} else {
3411	unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP];
3412	if (bm_uuid)
3413	drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
3414
3415	device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)`1`);
3416	}
3417	spin_unlock_irqrestore(lock: &device->ldev->md.uuid_lock, flags);
3418
3419	drbd_md_mark_dirty(device);
3420	}
3421
3422	/**
3423	* drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3424	* @device: DRBD device.
3425	*
3426	* Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3427	*/
3428	int drbd_bmio_set_n_write(struct drbd_device *device,
3429	struct drbd_peer_device *peer_device) __must_hold(local)
3430
3431	{
3432	int rv = -EIO;
3433
3434	drbd_md_set_flag(device, MDF_FULL_SYNC);
3435	drbd_md_sync(device);
3436	drbd_bm_set_all(device);
3437
3438	rv = drbd_bm_write(device, peer_device);
3439
3440	if (!rv) {
3441	drbd_md_clear_flag(device, MDF_FULL_SYNC);
3442	drbd_md_sync(device);
3443	}
3444
3445	return rv;
3446	}
3447
3448	/**
3449	* drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3450	* @device: DRBD device.
3451	*
3452	* Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3453	*/
3454	int drbd_bmio_clear_n_write(struct drbd_device *device,
3455	struct drbd_peer_device *peer_device) __must_hold(local)
3456
3457	{
3458	drbd_resume_al(device);
3459	drbd_bm_clear_all(device);
3460	return drbd_bm_write(device, peer_device);
3461	}
3462
3463	static int w_bitmap_io(struct drbd_work w, int* unused)
3464	{
3465	struct drbd_device *device =
3466	container_of(w, struct drbd_device, bm_io_work.w);
3467	struct bm_io_work *work = &device->bm_io_work;
3468	int rv = -EIO;
3469
3470	if (work->flags != BM_LOCKED_CHANGE_ALLOWED) {
3471	int cnt = atomic_read(v: &device->ap_bio_cnt);
3472	if (cnt)
3473	drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n",
3474	cnt, work->why);
3475	}
3476
3477	if (get_ldev(device)) {
3478	drbd_bm_lock(device, why: work->why, flags: work->flags);
3479	rv = work->io_fn(device, work->peer_device);
3480	drbd_bm_unlock(device);
3481	put_ldev(device);
3482	}
3483
3484	clear_bit_unlock(nr: BITMAP_IO, addr: &device->flags);
3485	wake_up(&device->misc_wait);
3486
3487	if (work->done)
3488	work->done(device, rv);
3489
3490	clear_bit(nr: BITMAP_IO_QUEUED, addr: &device->flags);
3491	work->why = NULL;
3492	work->flags = `0`;
3493
3494	return `0`;
3495	}
3496
3497	/**
3498	* drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3499	* @device: DRBD device.
3500	* @io_fn: IO callback to be called when bitmap IO is possible
3501	* @done: callback to be called after the bitmap IO was performed
3502	* @why: Descriptive text of the reason for doing the IO
3503	* @flags: Bitmap flags
3504	*
3505	* While IO on the bitmap happens we freeze application IO thus we ensure
3506	* that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3507	* called from worker context. It MUST NOT be used while a previous such
3508	* work is still pending!
3509	*
3510	* Its worker function encloses the call of io_fn() by get_ldev() and
3511	* put_ldev().
3512	*/
3513	void drbd_queue_bitmap_io(struct drbd_device *device,
3514	int (io_fn)(struct* drbd_device , struct* drbd_peer_device *),
3515	void (done)(struct* drbd_device , int*),
3516	char why, enum* bm_flag flags,
3517	struct drbd_peer_device *peer_device)
3518	{
3519	D_ASSERT(device, current == peer_device->connection->worker.task);
3520
3521	D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags));
3522	D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags));
3523	D_ASSERT(device, list_empty(&device->bm_io_work.w.list));
3524	if (device->bm_io_work.why)
3525	drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n",
3526	why, device->bm_io_work.why);
3527
3528	device->bm_io_work.peer_device = peer_device;
3529	device->bm_io_work.io_fn = io_fn;
3530	device->bm_io_work.done = done;
3531	device->bm_io_work.why = why;
3532	device->bm_io_work.flags = flags;
3533
3534	spin_lock_irq(lock: &device->resource->req_lock);
3535	set_bit(nr: BITMAP_IO, addr: &device->flags);
3536	/ don't wait for pending application IO if the caller indicates that*
3537	* application IO does not conflict anyways. */
3538	if (flags == BM_LOCKED_CHANGE_ALLOWED \|\| atomic_read(v: &device->ap_bio_cnt) == `0`) {
3539	if (!test_and_set_bit(nr: BITMAP_IO_QUEUED, addr: &device->flags))
3540	drbd_queue_work(q: &peer_device->connection->sender_work,
3541	w: &device->bm_io_work.w);
3542	}
3543	spin_unlock_irq(lock: &device->resource->req_lock);
3544	}
3545
3546	/**
3547	* drbd_bitmap_io() - Does an IO operation on the whole bitmap
3548	* @device: DRBD device.
3549	* @io_fn: IO callback to be called when bitmap IO is possible
3550	* @why: Descriptive text of the reason for doing the IO
3551	* @flags: Bitmap flags
3552	*
3553	* freezes application IO while that the actual IO operations runs. This
3554	* functions MAY NOT be called from worker context.
3555	*/
3556	int drbd_bitmap_io(struct drbd_device *device,
3557	int (io_fn)(struct* drbd_device , struct* drbd_peer_device *),
3558	char why, enum* bm_flag flags,
3559	struct drbd_peer_device *peer_device)
3560	{
3561	/ Only suspend io, if some operation is supposed to be locked out /
3562	const bool do_suspend_io = flags & (BM_DONT_CLEAR\|BM_DONT_SET\|BM_DONT_TEST);
3563	int rv;
3564
3565	D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
3566
3567	if (do_suspend_io)
3568	drbd_suspend_io(device);
3569
3570	drbd_bm_lock(device, why, flags);
3571	rv = io_fn(device, peer_device);
3572	drbd_bm_unlock(device);
3573
3574	if (do_suspend_io)
3575	drbd_resume_io(device);
3576
3577	return rv;
3578	}
3579
3580	void drbd_md_set_flag(struct drbd_device device, int* flag) __must_hold(local)
3581	{
3582	if ((device->ldev->md.flags & flag) != flag) {
3583	drbd_md_mark_dirty(device);
3584	device->ldev->md.flags \|= flag;
3585	}
3586	}
3587
3588	void drbd_md_clear_flag(struct drbd_device device, int* flag) __must_hold(local)
3589	{
3590	if ((device->ldev->md.flags & flag) != `0`) {
3591	drbd_md_mark_dirty(device);
3592	device->ldev->md.flags &= ~flag;
3593	}
3594	}
3595	int drbd_md_test_flag(struct drbd_backing_dev bdev, int* flag)
3596	{
3597	return (bdev->md.flags & flag) != `0`;
3598	}
3599
3600	static void md_sync_timer_fn(struct timer_list *t)
3601	{
3602	struct drbd_device *device = from_timer(device, t, md_sync_timer);
3603	drbd_device_post_work(device, work_bit: MD_SYNC);
3604	}
3605
3606	const char cmdname(enum* drbd_packet cmd)
3607	{
3608	/ THINK may need to become several global tables*
3609	* when we want to support more than
3610	* one PRO_VERSION */
3611	static const char *cmdnames[] = {
3612
3613	[P_DATA] = "Data",
3614	[P_DATA_REPLY] = "DataReply",
3615	[P_RS_DATA_REPLY] = "RSDataReply",
3616	[P_BARRIER] = "Barrier",
3617	[P_BITMAP] = "ReportBitMap",
3618	[P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
3619	[P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
3620	[P_UNPLUG_REMOTE] = "UnplugRemote",
3621	[P_DATA_REQUEST] = "DataRequest",
3622	[P_RS_DATA_REQUEST] = "RSDataRequest",
3623	[P_SYNC_PARAM] = "SyncParam",
3624	[P_PROTOCOL] = "ReportProtocol",
3625	[P_UUIDS] = "ReportUUIDs",
3626	[P_SIZES] = "ReportSizes",
3627	[P_STATE] = "ReportState",
3628	[P_SYNC_UUID] = "ReportSyncUUID",
3629	[P_AUTH_CHALLENGE] = "AuthChallenge",
3630	[P_AUTH_RESPONSE] = "AuthResponse",
3631	[P_STATE_CHG_REQ] = "StateChgRequest",
3632	[P_PING] = "Ping",
3633	[P_PING_ACK] = "PingAck",
3634	[P_RECV_ACK] = "RecvAck",
3635	[P_WRITE_ACK] = "WriteAck",
3636	[P_RS_WRITE_ACK] = "RSWriteAck",
3637	[P_SUPERSEDED] = "Superseded",
3638	[P_NEG_ACK] = "NegAck",
3639	[P_NEG_DREPLY] = "NegDReply",
3640	[P_NEG_RS_DREPLY] = "NegRSDReply",
3641	[P_BARRIER_ACK] = "BarrierAck",
3642	[P_STATE_CHG_REPLY] = "StateChgReply",
3643	[P_OV_REQUEST] = "OVRequest",
3644	[P_OV_REPLY] = "OVReply",
3645	[P_OV_RESULT] = "OVResult",
3646	[P_CSUM_RS_REQUEST] = "CsumRSRequest",
3647	[P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
3648	[P_SYNC_PARAM89] = "SyncParam89",
3649	[P_COMPRESSED_BITMAP] = "CBitmap",
3650	[P_DELAY_PROBE] = "DelayProbe",
3651	[P_OUT_OF_SYNC] = "OutOfSync",
3652	[P_RS_CANCEL] = "RSCancel",
3653	[P_CONN_ST_CHG_REQ] = "conn_st_chg_req",
3654	[P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
3655	[P_PROTOCOL_UPDATE] = "protocol_update",
3656	[P_TRIM] = "Trim",
3657	[P_RS_THIN_REQ] = "rs_thin_req",
3658	[P_RS_DEALLOCATED] = "rs_deallocated",
3659	[P_WSAME] = "WriteSame",
3660	[P_ZEROES] = "Zeroes",
3661
3662	/ enum drbd_packet, but not commands - obsoleted flags:*
3663	* P_MAY_IGNORE
3664	* P_MAX_OPT_CMD
3665	*/
3666	};
3667
3668	/ too big for the array: 0xfffX /
3669	if (cmd == P_INITIAL_META)
3670	return "InitialMeta";
3671	if (cmd == P_INITIAL_DATA)
3672	return "InitialData";
3673	if (cmd == P_CONNECTION_FEATURES)
3674	return "ConnectionFeatures";
3675	if (cmd >= ARRAY_SIZE(cmdnames))
3676	return "Unknown";
3677	return cmdnames[cmd];
3678	}
3679
3680	/**
3681	* drbd_wait_misc - wait for a request to make progress
3682	* @device: device associated with the request
3683	* @i: the struct drbd_interval embedded in struct drbd_request or
3684	* struct drbd_peer_request
3685	*/
3686	int drbd_wait_misc(struct drbd_device device, struct* drbd_interval *i)
3687	{
3688	struct net_conf *nc;
3689	DEFINE_WAIT(wait);
3690	long timeout;
3691
3692	rcu_read_lock();
3693	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
3694	if (!nc) {
3695	rcu_read_unlock();
3696	return -ETIMEDOUT;
3697	}
3698	timeout = nc->ko_count ? nc->timeout * HZ / `10` * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
3699	rcu_read_unlock();
3700
3701	/ Indicate to wake up device->misc_wait on progress. /
3702	i->waiting = true;
3703	prepare_to_wait(wq_head: &device->misc_wait, wq_entry: &wait, TASK_INTERRUPTIBLE);
3704	spin_unlock_irq(lock: &device->resource->req_lock);
3705	timeout = schedule_timeout(timeout);
3706	finish_wait(wq_head: &device->misc_wait, wq_entry: &wait);
3707	spin_lock_irq(lock: &device->resource->req_lock);
3708	if (!timeout \|\| device->state.conn < C_CONNECTED)
3709	return -ETIMEDOUT;
3710	if (signal_pending(current))
3711	return -ERESTARTSYS;
3712	return `0`;
3713	}
3714
3715	void lock_all_resources(void)
3716	{
3717	struct drbd_resource *resource;
3718	int __maybe_unused i = `0`;
3719
3720	mutex_lock(&resources_mutex);
3721	local_irq_disable();
3722	for_each_resource(resource, &drbd_resources)
3723	spin_lock_nested(&resource->req_lock, i++);
3724	}
3725
3726	void unlock_all_resources(void)
3727	{
3728	struct drbd_resource *resource;
3729
3730	for_each_resource(resource, &drbd_resources)
3731	spin_unlock(lock: &resource->req_lock);
3732	local_irq_enable();
3733	mutex_unlock(lock: &resources_mutex);
3734	}
3735
3736	#ifdef CONFIG_DRBD_FAULT_INJECTION
3737	/ Fault insertion support including random number generator shamelessly*
3738	* stolen from kernel/rcutorture.c */
3739	struct fault_random_state {
3740	unsigned long state;
3741	unsigned long count;
3742	};
3743
3744	#define FAULT_RANDOM_MULT 39916801 /* prime */
3745	#define FAULT_RANDOM_ADD 479001701 /* prime */
3746	#define FAULT_RANDOM_REFRESH 10000
3747
3748	/*
3749	* Crude but fast random-number generator. Uses a linear congruential
3750	* generator, with occasional help from get_random_bytes().
3751	*/
3752	static unsigned long
3753	_drbd_fault_random(struct fault_random_state *rsp)
3754	{
3755	long refresh;
3756
3757	if (!rsp->count--) {
3758	get_random_bytes(buf: &refresh, len: sizeof(refresh));
3759	rsp->state += refresh;
3760	rsp->count = FAULT_RANDOM_REFRESH;
3761	}
3762	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3763	return swahw32(rsp->state);
3764	}
3765
3766	static char *
3767	_drbd_fault_str(unsigned int type) {
3768	static char *_faults[] = {
3769	[DRBD_FAULT_MD_WR] = "Meta-data write",
3770	[DRBD_FAULT_MD_RD] = "Meta-data read",
3771	[DRBD_FAULT_RS_WR] = "Resync write",
3772	[DRBD_FAULT_RS_RD] = "Resync read",
3773	[DRBD_FAULT_DT_WR] = "Data write",
3774	[DRBD_FAULT_DT_RD] = "Data read",
3775	[DRBD_FAULT_DT_RA] = "Data read ahead",
3776	[DRBD_FAULT_BM_ALLOC] = "BM allocation",
3777	[DRBD_FAULT_AL_EE] = "EE allocation",
3778	[DRBD_FAULT_RECEIVE] = "receive data corruption",
3779	};
3780
3781	return (type < DRBD_FAULT_MAX) ? _faults[type] : "Unknown";
3782	}
3783
3784	unsigned int
3785	_drbd_insert_fault(struct drbd_device device, unsigned* int type)
3786	{
3787	static struct fault_random_state rrs = {`0`, `0`};
3788
3789	unsigned int ret = (
3790	(drbd_fault_devs == `0` \|\|
3791	((`1` << device_to_minor(device)) & drbd_fault_devs) != `0`) &&
3792	(((_drbd_fault_random(rsp: &rrs) % `100`) + `1`) <= drbd_fault_rate));
3793
3794	if (ret) {
3795	drbd_fault_count++;
3796
3797	if (drbd_ratelimit())
3798	drbd_warn(device, "***Simulating %s failure\n",
3799	_drbd_fault_str(type));
3800	}
3801
3802	return ret;
3803	}
3804	#endif
3805
3806	module_init(drbd_init)
3807	module_exit(drbd_cleanup)
3808
3809	EXPORT_SYMBOL(drbd_conn_str);
3810	EXPORT_SYMBOL(drbd_role_str);
3811	EXPORT_SYMBOL(drbd_disk_str);
3812	EXPORT_SYMBOL(drbd_set_st_err_str);
3813

source code of linux/drivers/block/drbd/drbd_main.c