multipath.c source code [linux/drivers/nvme/host/multipath.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2017-2018 Christoph Hellwig.
4	*/
5
6	#include <linux/backing-dev.h>
7	#include <linux/moduleparam.h>
8	#include <linux/vmalloc.h>
9	#include <trace/events/block.h>
10	#include "nvme.h"
11
12	bool multipath = true;
13	module_param(multipath, bool, `0444`);
14	MODULE_PARM_DESC(multipath,
15	"turn on native support for multiple controllers per subsystem");
16
17	static const char *nvme_iopolicy_names[] = {
18	[NVME_IOPOLICY_NUMA] = "numa",
19	[NVME_IOPOLICY_RR] = "round-robin",
20	};
21
22	static int iopolicy = NVME_IOPOLICY_NUMA;
23
24	static int nvme_set_iopolicy(const char val, const* struct kernel_param *kp)
25	{
26	if (!val)
27	return -EINVAL;
28	if (!strncmp(val, "numa", `4`))
29	iopolicy = NVME_IOPOLICY_NUMA;
30	else if (!strncmp(val, "round-robin", `11`))
31	iopolicy = NVME_IOPOLICY_RR;
32	else
33	return -EINVAL;
34
35	return `0`;
36	}
37
38	static int nvme_get_iopolicy(char buf, const* struct kernel_param *kp)
39	{
40	return sprintf(buf, fmt: "%s\n", nvme_iopolicy_names[iopolicy]);
41	}
42
43	module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
44	&iopolicy, `0644`);
45	MODULE_PARM_DESC(iopolicy,
46	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
47
48	void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
49	{
50	subsys->iopolicy = iopolicy;
51	}
52
53	void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
54	{
55	struct nvme_ns_head *h;
56
57	lockdep_assert_held(&subsys->lock);
58	list_for_each_entry(h, &subsys->nsheads, entry)
59	if (h->disk)
60	blk_mq_unfreeze_queue(q: h->disk->queue);
61	}
62
63	void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
64	{
65	struct nvme_ns_head *h;
66
67	lockdep_assert_held(&subsys->lock);
68	list_for_each_entry(h, &subsys->nsheads, entry)
69	if (h->disk)
70	blk_mq_freeze_queue_wait(q: h->disk->queue);
71	}
72
73	void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
74	{
75	struct nvme_ns_head *h;
76
77	lockdep_assert_held(&subsys->lock);
78	list_for_each_entry(h, &subsys->nsheads, entry)
79	if (h->disk)
80	blk_freeze_queue_start(q: h->disk->queue);
81	}
82
83	void nvme_failover_req(struct request *req)
84	{
85	struct nvme_ns *ns = req->q->queuedata;
86	u16 status = nvme_req(req)->status & `0x7ff`;
87	unsigned long flags;
88	struct bio *bio;
89
90	nvme_mpath_clear_current_path(ns);
91
92	/*
93	* If we got back an ANA error, we know the controller is alive but not
94	* ready to serve this namespace. Kick of a re-read of the ANA
95	* information page, and just try any other available path for now.
96	*/
97	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
98	set_bit(NVME_NS_ANA_PENDING, addr: &ns->flags);
99	queue_work(wq: nvme_wq, work: &ns->ctrl->ana_work);
100	}
101
102	spin_lock_irqsave(&ns->head->requeue_lock, flags);
103	for (bio = req->bio; bio; bio = bio->bi_next) {
104	bio_set_dev(bio, bdev: ns->head->disk->part0);
105	if (bio->bi_opf & REQ_POLLED) {
106	bio->bi_opf &= ~REQ_POLLED;
107	bio->bi_cookie = BLK_QC_T_NONE;
108	}
109	/*
110	* The alternate request queue that we may end up submitting
111	* the bio to may be frozen temporarily, in this case REQ_NOWAIT
112	* will fail the I/O immediately with EAGAIN to the issuer.
113	* We are not in the issuer context which cannot block. Clear
114	* the flag to avoid spurious EAGAIN I/O failures.
115	*/
116	bio->bi_opf &= ~REQ_NOWAIT;
117	}
118	blk_steal_bios(list: &ns->head->requeue_list, rq: req);
119	spin_unlock_irqrestore(lock: &ns->head->requeue_lock, flags);
120
121	blk_mq_end_request(rq: req, error: `0`);
122	kblockd_schedule_work(work: &ns->head->requeue_work);
123	}
124
125	void nvme_mpath_start_request(struct request *rq)
126	{
127	struct nvme_ns *ns = rq->q->queuedata;
128	struct gendisk *disk = ns->head->disk;
129
130	if (!blk_queue_io_stat(disk->queue) \|\| blk_rq_is_passthrough(rq))
131	return;
132
133	nvme_req(req: rq)->flags \|= NVME_MPATH_IO_STATS;
134	nvme_req(req: rq)->start_time = bdev_start_io_acct(bdev: disk->part0, op: req_op(req: rq),
135	start_time: jiffies);
136	}
137	EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
138
139	void nvme_mpath_end_request(struct request *rq)
140	{
141	struct nvme_ns *ns = rq->q->queuedata;
142
143	if (!(nvme_req(req: rq)->flags & NVME_MPATH_IO_STATS))
144	return;
145	bdev_end_io_acct(bdev: ns->head->disk->part0, op: req_op(req: rq),
146	sectors: blk_rq_bytes(rq) >> SECTOR_SHIFT,
147	start_time: nvme_req(req: rq)->start_time);
148	}
149
150	void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
151	{
152	struct nvme_ns *ns;
153
154	down_read(sem: &ctrl->namespaces_rwsem);
155	list_for_each_entry(ns, &ctrl->namespaces, list) {
156	if (!ns->head->disk)
157	continue;
158	kblockd_schedule_work(work: &ns->head->requeue_work);
159	if (nvme_ctrl_state(ctrl: ns->ctrl) == NVME_CTRL_LIVE)
160	disk_uevent(disk: ns->head->disk, action: KOBJ_CHANGE);
161	}
162	up_read(sem: &ctrl->namespaces_rwsem);
163	}
164
165	static const char *nvme_ana_state_names[] = {
166	[`0`] = "invalid state",
167	[NVME_ANA_OPTIMIZED] = "optimized",
168	[NVME_ANA_NONOPTIMIZED] = "non-optimized",
169	[NVME_ANA_INACCESSIBLE] = "inaccessible",
170	[NVME_ANA_PERSISTENT_LOSS] = "persistent-loss",
171	[NVME_ANA_CHANGE] = "change",
172	};
173
174	bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
175	{
176	struct nvme_ns_head *head = ns->head;
177	bool changed = false;
178	int node;
179
180	if (!head)
181	goto out;
182
183	for_each_node(node) {
184	if (ns == rcu_access_pointer(head->current_path[node])) {
185	rcu_assign_pointer(head->current_path[node], NULL);
186	changed = true;
187	}
188	}
189	out:
190	return changed;
191	}
192
193	void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
194	{
195	struct nvme_ns *ns;
196
197	down_read(sem: &ctrl->namespaces_rwsem);
198	list_for_each_entry(ns, &ctrl->namespaces, list) {
199	nvme_mpath_clear_current_path(ns);
200	kblockd_schedule_work(work: &ns->head->requeue_work);
201	}
202	up_read(sem: &ctrl->namespaces_rwsem);
203	}
204
205	void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
206	{
207	struct nvme_ns_head *head = ns->head;
208	sector_t capacity = get_capacity(disk: head->disk);
209	int node;
210	int srcu_idx;
211
212	srcu_idx = srcu_read_lock(ssp: &head->srcu);
213	list_for_each_entry_rcu(ns, &head->list, siblings) {
214	if (capacity != get_capacity(disk: ns->disk))
215	clear_bit(NVME_NS_READY, addr: &ns->flags);
216	}
217	srcu_read_unlock(ssp: &head->srcu, idx: srcu_idx);
218
219	for_each_node(node)
220	rcu_assign_pointer(head->current_path[node], NULL);
221	kblockd_schedule_work(work: &head->requeue_work);
222	}
223
224	static bool nvme_path_is_disabled(struct nvme_ns *ns)
225	{
226	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl: ns->ctrl);
227
228	/*
229	* We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
230	* still be able to complete assuming that the controller is connected.
231	* Otherwise it will fail immediately and return to the requeue list.
232	*/
233	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
234	return true;
235	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) \|\|
236	!test_bit(NVME_NS_READY, &ns->flags))
237	return true;
238	return false;
239	}
240
241	static struct nvme_ns __nvme_find_path(struct* nvme_ns_head head, int* node)
242	{
243	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
244	struct nvme_ns found = NULL, fallback = NULL, *ns;
245
246	list_for_each_entry_rcu(ns, &head->list, siblings) {
247	if (nvme_path_is_disabled(ns))
248	continue;
249
250	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
251	distance = node_distance(node, ns->ctrl->numa_node);
252	else
253	distance = LOCAL_DISTANCE;
254
255	switch (ns->ana_state) {
256	case NVME_ANA_OPTIMIZED:
257	if (distance < found_distance) {
258	found_distance = distance;
259	found = ns;
260	}
261	break;
262	case NVME_ANA_NONOPTIMIZED:
263	if (distance < fallback_distance) {
264	fallback_distance = distance;
265	fallback = ns;
266	}
267	break;
268	default:
269	break;
270	}
271	}
272
273	if (!found)
274	found = fallback;
275	if (found)
276	rcu_assign_pointer(head->current_path[node], found);
277	return found;
278	}
279
280	static struct nvme_ns nvme_next_ns(struct* nvme_ns_head *head,
281	struct nvme_ns *ns)
282	{
283	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
284	siblings);
285	if (ns)
286	return ns;
287	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
288	}
289
290	static struct nvme_ns nvme_round_robin_path(struct* nvme_ns_head *head,
291	int node, struct nvme_ns *old)
292	{
293	struct nvme_ns ns, found = NULL;
294
295	if (list_is_singular(head: &head->list)) {
296	if (nvme_path_is_disabled(ns: old))
297	return NULL;
298	return old;
299	}
300
301	for (ns = nvme_next_ns(head, ns: old);
302	ns && ns != old;
303	ns = nvme_next_ns(head, ns)) {
304	if (nvme_path_is_disabled(ns))
305	continue;
306
307	if (ns->ana_state == NVME_ANA_OPTIMIZED) {
308	found = ns;
309	goto out;
310	}
311	if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
312	found = ns;
313	}
314
315	/*
316	* The loop above skips the current path for round-robin semantics.
317	* Fall back to the current path if either:
318	* - no other optimized path found and current is optimized,
319	* - no other usable path found and current is usable.
320	*/
321	if (!nvme_path_is_disabled(ns: old) &&
322	(old->ana_state == NVME_ANA_OPTIMIZED \|\|
323	(!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
324	return old;
325
326	if (!found)
327	return NULL;
328	out:
329	rcu_assign_pointer(head->current_path[node], found);
330	return found;
331	}
332
333	static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
334	{
335	return nvme_ctrl_state(ctrl: ns->ctrl) == NVME_CTRL_LIVE &&
336	ns->ana_state == NVME_ANA_OPTIMIZED;
337	}
338
339	inline struct nvme_ns nvme_find_path(struct* nvme_ns_head *head)
340	{
341	int node = numa_node_id();
342	struct nvme_ns *ns;
343
344	ns = srcu_dereference(head->current_path[node], &head->srcu);
345	if (unlikely(!ns))
346	return __nvme_find_path(head, node);
347
348	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
349	return nvme_round_robin_path(head, node, old: ns);
350	if (unlikely(!nvme_path_is_optimized(ns)))
351	return __nvme_find_path(head, node);
352	return ns;
353	}
354
355	static bool nvme_available_path(struct nvme_ns_head *head)
356	{
357	struct nvme_ns *ns;
358
359	list_for_each_entry_rcu(ns, &head->list, siblings) {
360	if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
361	continue;
362	switch (nvme_ctrl_state(ctrl: ns->ctrl)) {
363	case NVME_CTRL_LIVE:
364	case NVME_CTRL_RESETTING:
365	case NVME_CTRL_CONNECTING:
366	/ fallthru /
367	return true;
368	default:
369	break;
370	}
371	}
372	return false;
373	}
374
375	static void nvme_ns_head_submit_bio(struct bio *bio)
376	{
377	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
378	struct device *dev = disk_to_dev(head->disk);
379	struct nvme_ns *ns;
380	int srcu_idx;
381
382	/*
383	* The namespace might be going away and the bio might be moved to a
384	* different queue via blk_steal_bios(), so we need to use the bio_split
385	* pool from the original queue to allocate the bvecs from.
386	*/
387	bio = bio_split_to_limits(bio);
388	if (!bio)
389	return;
390
391	srcu_idx = srcu_read_lock(ssp: &head->srcu);
392	ns = nvme_find_path(head);
393	if (likely(ns)) {
394	bio_set_dev(bio, bdev: ns->disk->part0);
395	bio->bi_opf \|= REQ_NVME_MPATH;
396	trace_block_bio_remap(bio, dev: disk_devt(disk: ns->head->disk),
397	from: bio->bi_iter.bi_sector);
398	submit_bio_noacct(bio);
399	} else if (nvme_available_path(head)) {
400	dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
401
402	spin_lock_irq(lock: &head->requeue_lock);
403	bio_list_add(bl: &head->requeue_list, bio);
404	spin_unlock_irq(lock: &head->requeue_lock);
405	} else {
406	dev_warn_ratelimited(dev, "no available path - failing I/O\n");
407
408	bio_io_error(bio);
409	}
410
411	srcu_read_unlock(ssp: &head->srcu, idx: srcu_idx);
412	}
413
414	static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
415	{
416	if (!nvme_tryget_ns_head(head: disk->private_data))
417	return -ENXIO;
418	return `0`;
419	}
420
421	static void nvme_ns_head_release(struct gendisk *disk)
422	{
423	nvme_put_ns_head(head: disk->private_data);
424	}
425
#ifdef CONFIG_BLK_DEV_ZONED
/*
 * report_zones handler of the multipath disk: forward the request to the
 * currently selected path.
 *
 * Return: the result of nvme_ns_report_zones(), or -EWOULDBLOCK when no
 * path is available.
 */
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *path;
	int rc = -EWOULDBLOCK;
	int idx;

	idx = srcu_read_lock(&head->srcu);
	path = nvme_find_path(head);
	if (path)
		rc = nvme_ns_report_zones(path, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, idx);

	return rc;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */
444
445	const struct block_device_operations nvme_ns_head_ops = {
446	.owner = THIS_MODULE,
447	.submit_bio = nvme_ns_head_submit_bio,
448	.open = nvme_ns_head_open,
449	.release = nvme_ns_head_release,
450	.ioctl = nvme_ns_head_ioctl,
451	.compat_ioctl = blkdev_compat_ptr_ioctl,
452	.getgeo = nvme_getgeo,
453	.report_zones = nvme_ns_head_report_zones,
454	.pr_ops = &nvme_pr_ops,
455	};
456
457	static inline struct nvme_ns_head cdev_to_ns_head(struct* cdev *cdev)
458	{
459	return container_of(cdev, struct nvme_ns_head, cdev);
460	}
461
462	static int nvme_ns_head_chr_open(struct inode inode, struct* file *file)
463	{
464	if (!nvme_tryget_ns_head(head: cdev_to_ns_head(cdev: inode->i_cdev)))
465	return -ENXIO;
466	return `0`;
467	}
468
469	static int nvme_ns_head_chr_release(struct inode inode, struct* file *file)
470	{
471	nvme_put_ns_head(head: cdev_to_ns_head(cdev: inode->i_cdev));
472	return `0`;
473	}
474
475	static const struct file_operations nvme_ns_head_chr_fops = {
476	.owner = THIS_MODULE,
477	.open = nvme_ns_head_chr_open,
478	.release = nvme_ns_head_chr_release,
479	.unlocked_ioctl = nvme_ns_head_chr_ioctl,
480	.compat_ioctl = compat_ptr_ioctl,
481	.uring_cmd = nvme_ns_head_chr_uring_cmd,
482	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
483	};
484
485	static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
486	{
487	int ret;
488
489	head->cdev_device.parent = &head->subsys->dev;
490	ret = dev_set_name(dev: &head->cdev_device, name: "ng%dn%d",
491	head->subsys->instance, head->instance);
492	if (ret)
493	return ret;
494	ret = nvme_cdev_add(cdev: &head->cdev, cdev_device: &head->cdev_device,
495	fops: &nvme_ns_head_chr_fops, THIS_MODULE);
496	return ret;
497	}
498
499	static void nvme_requeue_work(struct work_struct *work)
500	{
501	struct nvme_ns_head *head =
502	container_of(work, struct nvme_ns_head, requeue_work);
503	struct bio bio, next;
504
505	spin_lock_irq(lock: &head->requeue_lock);
506	next = bio_list_get(bl: &head->requeue_list);
507	spin_unlock_irq(lock: &head->requeue_lock);
508
509	while ((bio = next) != NULL) {
510	next = bio->bi_next;
511	bio->bi_next = NULL;
512
513	submit_bio_noacct(bio);
514	}
515	}
516
517	int nvme_mpath_alloc_disk(struct nvme_ctrl ctrl, struct* nvme_ns_head *head)
518	{
519	struct queue_limits lim;
520	bool vwc = false;
521
522	mutex_init(&head->lock);
523	bio_list_init(bl: &head->requeue_list);
524	spin_lock_init(&head->requeue_lock);
525	INIT_WORK(&head->requeue_work, nvme_requeue_work);
526
527	/*
528	* Add a multipath node if the subsystems supports multiple controllers.
529	* We also do this for private namespaces as the namespace sharing flag
530	* could change after a rescan.
531	*/
532	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) \|\|
533	!nvme_is_unique_nsid(ctrl, head) \|\| !multipath)
534	return `0`;
535
536	blk_set_stacking_limits(lim: &lim);
537	lim.dma_alignment = `3`;
538	if (head->ids.csi != NVME_CSI_ZNS)
539	lim.max_zone_append_sectors = `0`;
540
541	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
542	if (IS_ERR(ptr: head->disk))
543	return PTR_ERR(ptr: head->disk);
544	head->disk->fops = &nvme_ns_head_ops;
545	head->disk->private_data = head;
546	sprintf(buf: head->disk->disk_name, fmt: "nvme%dn%d",
547	ctrl->subsys->instance, head->instance);
548
549	blk_queue_flag_set(QUEUE_FLAG_NONROT, q: head->disk->queue);
550	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q: head->disk->queue);
551	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q: head->disk->queue);
552	/*
553	* This assumes all controllers that refer to a namespace either
554	* support poll queues or not. That is not a strict guarantee,
555	* but if the assumption is wrong the effect is only suboptimal
556	* performance but not correctness problem.
557	*/
558	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
559	ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
560	blk_queue_flag_set(QUEUE_FLAG_POLL, q: head->disk->queue);
561
562	/ we need to propagate up the VMC settings /
563	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
564	vwc = true;
565	blk_queue_write_cache(q: head->disk->queue, enabled: vwc, fua: vwc);
566	return `0`;
567	}
568
569	static void nvme_mpath_set_live(struct nvme_ns *ns)
570	{
571	struct nvme_ns_head *head = ns->head;
572	int rc;
573
574	if (!head->disk)
575	return;
576
577	/*
578	* test_and_set_bit() is used because it is protecting against two nvme
579	* paths simultaneously calling device_add_disk() on the same namespace
580	* head.
581	*/
582	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, addr: &head->flags)) {
583	rc = device_add_disk(parent: &head->subsys->dev, disk: head->disk,
584	groups: nvme_ns_attr_groups);
585	if (rc) {
586	clear_bit(NVME_NSHEAD_DISK_LIVE, addr: &ns->flags);
587	return;
588	}
589	nvme_add_ns_head_cdev(head);
590	}
591
592	mutex_lock(&head->lock);
593	if (nvme_path_is_optimized(ns)) {
594	int node, srcu_idx;
595
596	srcu_idx = srcu_read_lock(ssp: &head->srcu);
597	for_each_node(node)
598	__nvme_find_path(head, node);
599	srcu_read_unlock(ssp: &head->srcu, idx: srcu_idx);
600	}
601	mutex_unlock(lock: &head->lock);
602
603	synchronize_srcu(ssp: &head->srcu);
604	kblockd_schedule_work(work: &head->requeue_work);
605	}
606
607	static int nvme_parse_ana_log(struct nvme_ctrl ctrl, void* *data,
608	int (cb)(struct* nvme_ctrl ctrl, struct* nvme_ana_group_desc *,
609	void *))
610	{
611	void *base = ctrl->ana_log_buf;
612	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
613	int error, i;
614
615	lockdep_assert_held(&ctrl->ana_lock);
616
617	for (i = `0`; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
618	struct nvme_ana_group_desc *desc = base + offset;
619	u32 nr_nsids;
620	size_t nsid_buf_size;
621
622	if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
623	return -EINVAL;
624
625	nr_nsids = le32_to_cpu(desc->nnsids);
626	nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);
627
628	if (WARN_ON_ONCE(desc->grpid == `0`))
629	return -EINVAL;
630	if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
631	return -EINVAL;
632	if (WARN_ON_ONCE(desc->state == `0`))
633	return -EINVAL;
634	if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
635	return -EINVAL;
636
637	offset += sizeof(*desc);
638	if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
639	return -EINVAL;
640
641	error = cb(ctrl, desc, data);
642	if (error)
643	return error;
644
645	offset += nsid_buf_size;
646	}
647
648	return `0`;
649	}
650
651	static inline bool nvme_state_is_live(enum nvme_ana_state state)
652	{
653	return state == NVME_ANA_OPTIMIZED \|\| state == NVME_ANA_NONOPTIMIZED;
654	}
655
656	static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
657	struct nvme_ns *ns)
658	{
659	ns->ana_grpid = le32_to_cpu(desc->grpid);
660	ns->ana_state = desc->state;
661	clear_bit(NVME_NS_ANA_PENDING, addr: &ns->flags);
662	/*
663	* nvme_mpath_set_live() will trigger I/O to the multipath path device
664	* and in turn to this path device. However we cannot accept this I/O
665	* if the controller is not live. This may deadlock if called from
666	* nvme_mpath_init_identify() and the ctrl will never complete
667	* initialization, preventing I/O from completing. For this case we
668	* will reprocess the ANA log page in nvme_mpath_update() once the
669	* controller is ready.
670	*/
671	if (nvme_state_is_live(state: ns->ana_state) &&
672	nvme_ctrl_state(ctrl: ns->ctrl) == NVME_CTRL_LIVE)
673	nvme_mpath_set_live(ns);
674	}
675
676	static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
677	struct nvme_ana_group_desc desc, void* *data)
678	{
679	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = `0`;
680	unsigned *nr_change_groups = data;
681	struct nvme_ns *ns;
682
683	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
684	le32_to_cpu(desc->grpid),
685	nvme_ana_state_names[desc->state]);
686
687	if (desc->state == NVME_ANA_CHANGE)
688	(*nr_change_groups)++;
689
690	if (!nr_nsids)
691	return `0`;
692
693	down_read(sem: &ctrl->namespaces_rwsem);
694	list_for_each_entry(ns, &ctrl->namespaces, list) {
695	unsigned nsid;
696	again:
697	nsid = le32_to_cpu(desc->nsids[n]);
698	if (ns->head->ns_id < nsid)
699	continue;
700	if (ns->head->ns_id == nsid)
701	nvme_update_ns_ana_state(desc, ns);
702	if (++n == nr_nsids)
703	break;
704	if (ns->head->ns_id > nsid)
705	goto again;
706	}
707	up_read(sem: &ctrl->namespaces_rwsem);
708	return `0`;
709	}
710
711	static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
712	{
713	u32 nr_change_groups = `0`;
714	int error;
715
716	mutex_lock(&ctrl->ana_lock);
717	error = nvme_get_log(ctrl, NVME_NSID_ALL, log_page: NVME_LOG_ANA, lsp: `0`, csi: NVME_CSI_NVM,
718	log: ctrl->ana_log_buf, size: ctrl->ana_log_size, offset: `0`);
719	if (error) {
720	dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
721	goto out_unlock;
722	}
723
724	error = nvme_parse_ana_log(ctrl, data: &nr_change_groups,
725	cb: nvme_update_ana_state);
726	if (error)
727	goto out_unlock;
728
729	/*
730	* In theory we should have an ANATT timer per group as they might enter
731	* the change state at different times. But that is a lot of overhead
732	* just to protect against a target that keeps entering new changes
733	* states while never finishing previous ones. But we'll still
734	* eventually time out once all groups are in change state, so this
735	* isn't a big deal.
736	*
737	* We also double the ANATT value to provide some slack for transports
738	* or AEN processing overhead.
739	*/
740	if (nr_change_groups)
741	mod_timer(timer: &ctrl->anatt_timer, expires: ctrl->anatt * HZ * `2` + jiffies);
742	else
743	del_timer_sync(timer: &ctrl->anatt_timer);
744	out_unlock:
745	mutex_unlock(lock: &ctrl->ana_lock);
746	return error;
747	}
748
749	static void nvme_ana_work(struct work_struct *work)
750	{
751	struct nvme_ctrl ctrl = container_of(work, struct* nvme_ctrl, ana_work);
752
753	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
754	return;
755
756	nvme_read_ana_log(ctrl);
757	}
758
759	void nvme_mpath_update(struct nvme_ctrl *ctrl)
760	{
761	u32 nr_change_groups = `0`;
762
763	if (!ctrl->ana_log_buf)
764	return;
765
766	mutex_lock(&ctrl->ana_lock);
767	nvme_parse_ana_log(ctrl, data: &nr_change_groups, cb: nvme_update_ana_state);
768	mutex_unlock(lock: &ctrl->ana_lock);
769	}
770
771	static void nvme_anatt_timeout(struct timer_list *t)
772	{
773	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
774
775	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
776	nvme_reset_ctrl(ctrl);
777	}
778
779	void nvme_mpath_stop(struct nvme_ctrl *ctrl)
780	{
781	if (!nvme_ctrl_use_ana(ctrl))
782	return;
783	del_timer_sync(timer: &ctrl->anatt_timer);
784	cancel_work_sync(work: &ctrl->ana_work);
785	}
786
/* Define a subsystem device attribute named subsys_attr_<_name>. */
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)		\
	struct device_attribute subsys_attr_##_name =		\
		__ATTR(_name, _mode, _show, _store)
790
791	static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
792	struct device_attribute attr, char* *buf)
793	{
794	struct nvme_subsystem *subsys =
795	container_of(dev, struct nvme_subsystem, dev);
796
797	return sysfs_emit(buf, fmt: "%s\n",
798	nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
799	}
800
801	static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
802	struct device_attribute attr, const* char *buf, size_t count)
803	{
804	struct nvme_subsystem *subsys =
805	container_of(dev, struct nvme_subsystem, dev);
806	int i;
807
808	for (i = `0`; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
809	if (sysfs_streq(s1: buf, s2: nvme_iopolicy_names[i])) {
810	WRITE_ONCE(subsys->iopolicy, i);
811	return count;
812	}
813	}
814
815	return -EINVAL;
816	}
817	SUBSYS_ATTR_RW(iopolicy, S_IRUGO \| S_IWUSR,
818	nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
819
820	static ssize_t ana_grpid_show(struct device dev, struct* device_attribute *attr,
821	char *buf)
822	{
823	return sysfs_emit(buf, fmt: "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
824	}
825	DEVICE_ATTR_RO(ana_grpid);
826
827	static ssize_t ana_state_show(struct device dev, struct* device_attribute *attr,
828	char *buf)
829	{
830	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
831
832	return sysfs_emit(buf, fmt: "%s\n", nvme_ana_state_names[ns->ana_state]);
833	}
834	DEVICE_ATTR_RO(ana_state);
835
836	static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
837	struct nvme_ana_group_desc desc, void* *data)
838	{
839	struct nvme_ana_group_desc *dst = data;
840
841	if (desc->grpid != dst->grpid)
842	return `0`;
843
844	dst = desc;
845	return -ENXIO; / just break out of the loop /
846	}
847
848	void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
849	{
850	if (nvme_ctrl_use_ana(ctrl: ns->ctrl)) {
851	struct nvme_ana_group_desc desc = {
852	.grpid = anagrpid,
853	.state = `0`,
854	};
855
856	mutex_lock(&ns->ctrl->ana_lock);
857	ns->ana_grpid = le32_to_cpu(anagrpid);
858	nvme_parse_ana_log(ctrl: ns->ctrl, data: &desc, cb: nvme_lookup_ana_group_desc);
859	mutex_unlock(lock: &ns->ctrl->ana_lock);
860	if (desc.state) {
861	/ found the group desc: update /
862	nvme_update_ns_ana_state(desc: &desc, ns);
863	} else {
864	/ group desc not found: trigger a re-read /
865	set_bit(NVME_NS_ANA_PENDING, addr: &ns->flags);
866	queue_work(wq: nvme_wq, work: &ns->ctrl->ana_work);
867	}
868	} else {
869	ns->ana_state = NVME_ANA_OPTIMIZED;
870	nvme_mpath_set_live(ns);
871	}
872
873	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
874	blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
875	q: ns->head->disk->queue);
876	#ifdef CONFIG_BLK_DEV_ZONED
877	if (blk_queue_is_zoned(q: ns->queue) && ns->head->disk)
878	ns->head->disk->nr_zones = ns->disk->nr_zones;
879	#endif
880	}
881
882	void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
883	{
884	if (!head->disk)
885	return;
886	kblockd_schedule_work(work: &head->requeue_work);
887	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
888	nvme_cdev_del(cdev: &head->cdev, cdev_device: &head->cdev_device);
889	del_gendisk(gp: head->disk);
890	}
891	}
892
893	void nvme_mpath_remove_disk(struct nvme_ns_head *head)
894	{
895	if (!head->disk)
896	return;
897	/ make sure all pending bios are cleaned up /
898	kblockd_schedule_work(work: &head->requeue_work);
899	flush_work(work: &head->requeue_work);
900	put_disk(disk: head->disk);
901	}
902
903	void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
904	{
905	mutex_init(&ctrl->ana_lock);
906	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, `0`);
907	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
908	}
909
910	int nvme_mpath_init_identify(struct nvme_ctrl ctrl, struct* nvme_id_ctrl *id)
911	{
912	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
913	size_t ana_log_size;
914	int error = `0`;
915
916	/ check if multipath is enabled and we have the capability /
917	if (!multipath \|\| !ctrl->subsys \|\|
918	!(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
919	return `0`;
920
921	if (!ctrl->max_namespaces \|\|
922	ctrl->max_namespaces > le32_to_cpu(id->nn)) {
923	dev_err(ctrl->device,
924	"Invalid MNAN value %u\n", ctrl->max_namespaces);
925	return -EINVAL;
926	}
927
928	ctrl->anacap = id->anacap;
929	ctrl->anatt = id->anatt;
930	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
931	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
932
933	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
934	ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
935	ctrl->max_namespaces * sizeof(__le32);
936	if (ana_log_size > max_transfer_size) {
937	dev_err(ctrl->device,
938	"ANA log page size (%zd) larger than MDTS (%zd).\n",
939	ana_log_size, max_transfer_size);
940	dev_err(ctrl->device, "disabling ANA support.\n");
941	goto out_uninit;
942	}
943	if (ana_log_size > ctrl->ana_log_size) {
944	nvme_mpath_stop(ctrl);
945	nvme_mpath_uninit(ctrl);
946	ctrl->ana_log_buf = kvmalloc(size: ana_log_size, GFP_KERNEL);
947	if (!ctrl->ana_log_buf)
948	return -ENOMEM;
949	}
950	ctrl->ana_log_size = ana_log_size;
951	error = nvme_read_ana_log(ctrl);
952	if (error)
953	goto out_uninit;
954	return `0`;
955
956	out_uninit:
957	nvme_mpath_uninit(ctrl);
958	return error;
959	}
960
961	void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
962	{
963	kvfree(addr: ctrl->ana_log_buf);
964	ctrl->ana_log_buf = NULL;
965	ctrl->ana_log_size = `0`;
966	}
967

source code of linux/drivers/nvme/host/multipath.c