// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <linux/vmalloc.h>
#include <trace/events/block.h>
#include "nvme.h"

bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};

static int iopolicy = NVME_IOPOLICY_NUMA;

static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
{
	if (!val)
		return -EINVAL;
	if (!strncmp(val, "numa", 4))
		iopolicy = NVME_IOPOLICY_NUMA;
	else if (!strncmp(val, "round-robin", 11))
		iopolicy = NVME_IOPOLICY_RR;
	else
		return -EINVAL;

	return 0;
}

static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
{
	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}

module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
	&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");

void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
	subsys->iopolicy = iopolicy;
}

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

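/*
 * Fail over a request that completed with a path error: drop the cached
 * current path, steal the bios back onto the ns_head requeue list, and
 * let the requeue worker resubmit them through the ns_head gendisk so
 * another path can be picked.
 */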
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & 0x7ff;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		bio_set_dev(bio, ns->head->disk->part0);
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
		/*
		 * The alternate request queue that we may end up submitting
		 * the bio to may be frozen temporarily, in which case
		 * REQ_NOWAIT would fail the I/O immediately with -EAGAIN.
		 * We are no longer in the issuer's context (which is what
		 * could not block), so clear the flag to avoid spurious
		 * EAGAIN failures during requeue.
		 */
		bio->bi_opf &= ~REQ_NOWAIT;
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	blk_mq_end_request(req, 0);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_mpath_start_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;
	struct gendisk *disk = ns->head->disk;

	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
		return;

	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
						      jiffies);
}
EXPORT_SYMBOL_GPL(nvme_mpath_start_request);

void nvme_mpath_end_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;

	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
		return;
	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
			 nvme_req(rq)->start_time);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

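/*
 * Drop @ns from the per-node current_path cache.  Returns true if it
 * was cached on any node, so callers know the requeue lists need a
 * kick to force path reselection.
 */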
bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	sector_t capacity = get_capacity(head->disk);
	int node;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	for_each_node(node)
		rcu_assign_pointer(head->current_path[node], NULL);
	kblockd_schedule_work(&head->requeue_work);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);

	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    !test_bit(NVME_NS_READY, &ns->flags))
		return true;
	return false;
}

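/*
 * Slow path of the NUMA policy: scan all sibling paths and pick the
 * enabled ANA-optimized path with the smallest NUMA distance to @node,
 * falling back to the closest non-optimized path.  The winner is
 * cached in head->current_path[node].
 */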
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

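/*
 * Round-robin policy: starting from the previously used path @old, take
 * the next enabled ANA-optimized sibling, or failing that the next
 * non-optimized one, wrapping around the sibling list as needed.
 */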
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 * - no other optimized path found and current is optimized,
	 * - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

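/*
 * Fast path of path selection: reuse the cached path for the local node
 * while it is still an optimized live path, otherwise defer to the
 * policy-specific selection above.  Callers must hold head->srcu.
 */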
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);

	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
		return nvme_round_robin_path(head, node, ns);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

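/*
 * A path is "available" if its controller may become usable again
 * (live, resetting or reconnecting).  This decides whether a bio with
 * no current path is requeued for a later retry or failed outright.
 */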
static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (nvme_ctrl_state(ns->ctrl)) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			/* fallthru */
			return true;
		default:
			break;
		}
	}
	return false;
}

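/*
 * submit_bio entry point of the multipath gendisk: pick a path under
 * SRCU and remap the bio to the bottom namespace device, or park it on
 * the requeue list while only temporarily unavailable paths remain.
 */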
static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio_io_error(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
{
	if (!nvme_tryget_ns_head(disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk)
{
	nvme_put_ns_head(disk->private_data);
}

#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */

const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};

static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}

static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	return ret;
}

static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}

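/*
 * Allocate the subsystem-wide multipath gendisk ("nvme%dn%d") that
 * stacks on top of the per-controller path devices.  Returns 0 without
 * allocating anything when native multipath does not apply.
 */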
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct queue_limits lim;
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing flag
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
		return 0;

	blk_set_stacking_limits(&lim);
	lim.dma_alignment = 3;
	if (head->ids.csi != NVME_CSI_ZNS)
		lim.max_zone_append_sectors = 0;

	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
	if (IS_ERR(head->disk))
		return PTR_ERR(head->disk);
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);

	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
	/*
	 * This assumes all controllers that refer to a namespace either
	 * support poll queues or not.  That is not a strict guarantee,
	 * but if the assumption is wrong the effect is only suboptimal
	 * performance but not correctness problem.
	 */
	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
	    ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
		blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(head->disk->queue, vwc, vwc);
	return 0;
}

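/*
 * Publish the multipath gendisk once the first usable path shows up,
 * and prime the per-node path cache when @ns is an optimized path.
 */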
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int rc;

	if (!head->disk)
		return;

	/*
	 * test_and_set_bit() is used because it is protecting against two nvme
	 * paths simultaneously calling device_add_disk() on the same namespace
	 * head.
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		rc = device_add_disk(&head->subsys->dev, head->disk,
				     nvme_ns_attr_groups);
		if (rc) {
			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
			return;
		}
		nvme_add_ns_head_cdev(head);
	}

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

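/*
 * Walk the ANA log page in ctrl->ana_log_buf and call @cb for every
 * group descriptor.  The page is laid out (per the NVMe specification;
 * see it for the authoritative definition) as a header followed by one
 * variable-size descriptor per group:
 *
 *	struct nvme_ana_rsp_hdr
 *	struct nvme_ana_group_desc	(first group)
 *		__le32 nsids[nnsids]
 *	struct nvme_ana_group_desc	(second group)
 *		...
 *
 * The bounds checks below make sure neither a descriptor nor its NSID
 * array can run past the buffer sized from the identify data.
 */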
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			  void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device.  However we cannot accept this I/O
	 * if the controller is not live.  This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing.  For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
}

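/*
 * Apply one group descriptor to every namespace it covers.  Both
 * ctrl->namespaces and desc->nsids are sorted by namespace ID, so a
 * single merge-style walk over the two lists suffices.
 */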
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		if (ns->head->ns_id > nsid)
			goto again;
	}
	up_read(&ctrl->namespaces_rwsem);
	return 0;
}

static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones.  We'll still eventually
	 * time out once all groups are in change state, so this isn't a big
	 * deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

void nvme_mpath_update(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;

	if (!ctrl->ana_log_buf)
		return;

	mutex_lock(&ctrl->ana_lock);
	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
	mutex_unlock(&ctrl->ana_lock);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
	       nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

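/*
 * Set the initial ANA state of a newly scanned path.  If the group
 * descriptor is already in the cached log page, apply it directly;
 * otherwise mark the namespace pending and kick ana_work to re-read
 * the log.
 */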
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
				   ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->nr_zones = ns->disk->nr_zones;
#endif
}

void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	kblockd_schedule_work(&head->requeue_work);
	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
		del_gendisk(head->disk);
	}
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	put_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

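/*
 * Size and (re)allocate the ANA log buffer from the identify data and
 * do the initial log read.  Worst case size: the response header, one
 * descriptor per ANA group (NANAGRPID) and one NSID entry per
 * namespace (MNAN), since a namespace belongs to at most one group.
 */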
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		nvme_mpath_uninit(ctrl);
		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kvfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
	ctrl->ana_log_size = 0;
}
967 | |