core.c source code [linux/drivers/nvme/host/core.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* NVM Express device driver
4	* Copyright (c) 2011-2014, Intel Corporation.
5	*/
6
7	#include <linux/async.h>
8	#include <linux/blkdev.h>
9	#include <linux/blk-mq.h>
10	#include <linux/blk-integrity.h>
11	#include <linux/compat.h>
12	#include <linux/delay.h>
13	#include <linux/errno.h>
14	#include <linux/hdreg.h>
15	#include <linux/kernel.h>
16	#include <linux/module.h>
17	#include <linux/backing-dev.h>
18	#include <linux/slab.h>
19	#include <linux/types.h>
20	#include <linux/pr.h>
21	#include <linux/ptrace.h>
22	#include <linux/nvme_ioctl.h>
23	#include <linux/pm_qos.h>
24	#include <linux/ratelimit.h>
25	#include <linux/unaligned.h>
26
27	#include "nvme.h"
28	#include "fabrics.h"
29	#include <linux/nvme-auth.h>
30
31	#define CREATE_TRACE_POINTS
32	#include "trace.h"
33
34	#define NVME_MINORS (1U << MINORBITS)
35
36	struct nvme_ns_info {
37	struct nvme_ns_ids ids;
38	u32 nsid;
39	__le32 anagrpid;
40	u8 pi_offset;
41	u16 endgid;
42	u64 runs;
43	bool is_shared;
44	bool is_readonly;
45	bool is_ready;
46	bool is_removed;
47	bool is_rotational;
48	bool no_vwc;
49	};
50
51	unsigned int admin_timeout = `60`;
52	module_param(admin_timeout, uint, `0644`);
53	MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
54	EXPORT_SYMBOL_GPL(admin_timeout);
55
56	unsigned int nvme_io_timeout = `30`;
57	module_param_named(io_timeout, nvme_io_timeout, uint, `0644`);
58	MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
59	EXPORT_SYMBOL_GPL(nvme_io_timeout);
60
61	static unsigned char shutdown_timeout = `5`;
62	module_param(shutdown_timeout, byte, `0644`);
63	MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
64
65	static u8 nvme_max_retries = `5`;
66	module_param_named(max_retries, nvme_max_retries, byte, `0644`);
67	MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
68
69	static unsigned long default_ps_max_latency_us = `100000`;
70	module_param(default_ps_max_latency_us, ulong, `0644`);
71	MODULE_PARM_DESC(default_ps_max_latency_us,
72	"max power saving latency for new devices; use PM QOS to change per device");
73
74	static bool force_apst;
75	module_param(force_apst, bool, `0644`);
76	MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
77
78	static unsigned long apst_primary_timeout_ms = `100`;
79	module_param(apst_primary_timeout_ms, ulong, `0644`);
80	MODULE_PARM_DESC(apst_primary_timeout_ms,
81	"primary APST timeout in ms");
82
83	static unsigned long apst_secondary_timeout_ms = `2000`;
84	module_param(apst_secondary_timeout_ms, ulong, `0644`);
85	MODULE_PARM_DESC(apst_secondary_timeout_ms,
86	"secondary APST timeout in ms");
87
88	static unsigned long apst_primary_latency_tol_us = `15000`;
89	module_param(apst_primary_latency_tol_us, ulong, `0644`);
90	MODULE_PARM_DESC(apst_primary_latency_tol_us,
91	"primary APST latency tolerance in us");
92
93	static unsigned long apst_secondary_latency_tol_us = `100000`;
94	module_param(apst_secondary_latency_tol_us, ulong, `0644`);
95	MODULE_PARM_DESC(apst_secondary_latency_tol_us,
96	"secondary APST latency tolerance in us");
97
98	/*
99	* Older kernels didn't enable protection information if it was at an offset.
100	* Newer kernels do, so it breaks reads on the upgrade if such formats were
101	* used in prior kernels since the metadata written did not contain a valid
102	* checksum.
103	*/
104	static bool disable_pi_offsets = false;
105	module_param(disable_pi_offsets, bool, `0444`);
106	MODULE_PARM_DESC(disable_pi_offsets,
107	"disable protection information if it has an offset");
108
109	/*
110	* nvme_wq - hosts nvme related works that are not reset or delete
111	* nvme_reset_wq - hosts nvme reset works
112	* nvme_delete_wq - hosts nvme delete works
113	*
114	* nvme_wq will host works such as scan, aen handling, fw activation,
115	* keep-alive, periodic reconnects etc. nvme_reset_wq
116	* runs reset works which also flush works hosted on nvme_wq for
117	* serialization purposes. nvme_delete_wq host controller deletion
118	* works which flush reset works for serialization.
119	*/
120	struct workqueue_struct *nvme_wq;
121	EXPORT_SYMBOL_GPL(nvme_wq);
122
123	struct workqueue_struct *nvme_reset_wq;
124	EXPORT_SYMBOL_GPL(nvme_reset_wq);
125
126	struct workqueue_struct *nvme_delete_wq;
127	EXPORT_SYMBOL_GPL(nvme_delete_wq);
128
129	static LIST_HEAD(nvme_subsystems);
130	DEFINE_MUTEX(nvme_subsystems_lock);
131
132	static DEFINE_IDA(nvme_instance_ida);
133	static dev_t nvme_ctrl_base_chr_devt;
134	static int nvme_class_uevent(const struct device dev, struct* kobj_uevent_env *env);
135	static const struct class nvme_class = {
136	.name = "nvme",
137	.dev_uevent = nvme_class_uevent,
138	};
139
140	static const struct class nvme_subsys_class = {
141	.name = "nvme-subsystem",
142	};
143
144	static DEFINE_IDA(nvme_ns_chr_minor_ida);
145	static dev_t nvme_ns_chr_devt;
146	static const struct class nvme_ns_chr_class = {
147	.name = "nvme-generic",
148	};
149
150	static void nvme_put_subsystem(struct nvme_subsystem *subsys);
151	static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
152	unsigned nsid);
153	static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
154	struct nvme_command *cmd);
155	static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
156	u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
157
158	void nvme_queue_scan(struct nvme_ctrl *ctrl)
159	{
160	/*
161	* Only new queue scan work when admin and IO queues are both alive
162	*/
163	if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
164	queue_work(wq: nvme_wq, work: &ctrl->scan_work);
165	}
166
167	/*
168	* Use this function to proceed with scheduling reset_work for a controller
169	* that had previously been set to the resetting state. This is intended for
170	* code paths that can't be interrupted by other reset attempts. A hot removal
171	* may prevent this from succeeding.
172	*/
173	int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
174	{
175	if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
176	return -EBUSY;
177	if (!queue_work(wq: nvme_reset_wq, work: &ctrl->reset_work))
178	return -EBUSY;
179	return `0`;
180	}
181	EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
182
183	static void nvme_failfast_work(struct work_struct *work)
184	{
185	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
186	struct nvme_ctrl, failfast_work);
187
188	if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
189	return;
190
191	set_bit(nr: NVME_CTRL_FAILFAST_EXPIRED, addr: &ctrl->flags);
192	dev_info(ctrl->device, "failfast expired\n");
193	nvme_kick_requeue_lists(ctrl);
194	}
195
196	static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
197	{
198	if (!ctrl->opts \|\| ctrl->opts->fast_io_fail_tmo == -`1`)
199	return;
200
201	schedule_delayed_work(dwork: &ctrl->failfast_work,
202	delay: ctrl->opts->fast_io_fail_tmo * HZ);
203	}
204
205	static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
206	{
207	if (!ctrl->opts)
208	return;
209
210	cancel_delayed_work_sync(dwork: &ctrl->failfast_work);
211	clear_bit(nr: NVME_CTRL_FAILFAST_EXPIRED, addr: &ctrl->flags);
212	}
213
214
215	int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
216	{
217	if (!nvme_change_ctrl_state(ctrl, new_state: NVME_CTRL_RESETTING))
218	return -EBUSY;
219	if (!queue_work(wq: nvme_reset_wq, work: &ctrl->reset_work))
220	return -EBUSY;
221	return `0`;
222	}
223	EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
224
225	int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
226	{
227	int ret;
228
229	ret = nvme_reset_ctrl(ctrl);
230	if (!ret) {
231	flush_work(work: &ctrl->reset_work);
232	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
233	ret = -ENETRESET;
234	}
235
236	return ret;
237	}
238
239	static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
240	{
241	dev_info(ctrl->device,
242	"Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
243
244	flush_work(work: &ctrl->reset_work);
245	nvme_stop_ctrl(ctrl);
246	nvme_remove_namespaces(ctrl);
247	ctrl->ops->delete_ctrl(ctrl);
248	nvme_uninit_ctrl(ctrl);
249	}
250
251	static void nvme_delete_ctrl_work(struct work_struct *work)
252	{
253	struct nvme_ctrl *ctrl =
254	container_of(work, struct nvme_ctrl, delete_work);
255
256	nvme_do_delete_ctrl(ctrl);
257	}
258
259	int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
260	{
261	if (!nvme_change_ctrl_state(ctrl, new_state: NVME_CTRL_DELETING))
262	return -EBUSY;
263	if (!queue_work(wq: nvme_delete_wq, work: &ctrl->delete_work))
264	return -EBUSY;
265	return `0`;
266	}
267	EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
268
269	void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
270	{
271	/*
272	* Keep a reference until nvme_do_delete_ctrl() complete,
273	* since ->delete_ctrl can free the controller.
274	*/
275	nvme_get_ctrl(ctrl);
276	if (nvme_change_ctrl_state(ctrl, new_state: NVME_CTRL_DELETING))
277	nvme_do_delete_ctrl(ctrl);
278	nvme_put_ctrl(ctrl);
279	}
280
281	static blk_status_t nvme_error_status(u16 status)
282	{
283	switch (status & NVME_SCT_SC_MASK) {
284	case NVME_SC_SUCCESS:
285	return BLK_STS_OK;
286	case NVME_SC_CAP_EXCEEDED:
287	return BLK_STS_NOSPC;
288	case NVME_SC_LBA_RANGE:
289	case NVME_SC_CMD_INTERRUPTED:
290	case NVME_SC_NS_NOT_READY:
291	return BLK_STS_TARGET;
292	case NVME_SC_BAD_ATTRIBUTES:
293	case NVME_SC_INVALID_OPCODE:
294	case NVME_SC_INVALID_FIELD:
295	case NVME_SC_INVALID_NS:
296	return BLK_STS_NOTSUPP;
297	case NVME_SC_WRITE_FAULT:
298	case NVME_SC_READ_ERROR:
299	case NVME_SC_UNWRITTEN_BLOCK:
300	case NVME_SC_ACCESS_DENIED:
301	case NVME_SC_READ_ONLY:
302	case NVME_SC_COMPARE_FAILED:
303	return BLK_STS_MEDIUM;
304	case NVME_SC_GUARD_CHECK:
305	case NVME_SC_APPTAG_CHECK:
306	case NVME_SC_REFTAG_CHECK:
307	case NVME_SC_INVALID_PI:
308	return BLK_STS_PROTECTION;
309	case NVME_SC_RESERVATION_CONFLICT:
310	return BLK_STS_RESV_CONFLICT;
311	case NVME_SC_HOST_PATH_ERROR:
312	return BLK_STS_TRANSPORT;
313	case NVME_SC_ZONE_TOO_MANY_ACTIVE:
314	return BLK_STS_ZONE_ACTIVE_RESOURCE;
315	case NVME_SC_ZONE_TOO_MANY_OPEN:
316	return BLK_STS_ZONE_OPEN_RESOURCE;
317	default:
318	return BLK_STS_IOERR;
319	}
320	}
321
322	static void nvme_retry_req(struct request *req)
323	{
324	unsigned long delay = `0`;
325	u16 crd;
326
327	/ The mask and shift result must be <= 3 /
328	crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> `11`;
329	if (crd)
330	delay = nvme_req(req)->ctrl->crdt[crd - `1`] * `100`;
331
332	nvme_req(req)->retries++;
333	blk_mq_requeue_request(rq: req, kick_requeue_list: false);
334	blk_mq_delay_kick_requeue_list(q: req->q, msecs: delay);
335	}
336
337	static void nvme_log_error(struct request *req)
338	{
339	struct nvme_ns *ns = req->q->queuedata;
340	struct nvme_request *nr = nvme_req(req);
341
342	if (ns) {
343	pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
344	ns->disk ? ns->disk->disk_name : "?",
345	nvme_get_opcode_str(nr->cmd->common.opcode),
346	nr->cmd->common.opcode,
347	nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
348	blk_rq_bytes(req) >> ns->head->lba_shift,
349	nvme_get_error_status_str(nr->status),
350	NVME_SCT(nr->status), / Status Code Type /
351	nr->status & NVME_SC_MASK, / Status Code /
352	nr->status & NVME_STATUS_MORE ? "MORE " : "",
353	nr->status & NVME_STATUS_DNR ? "DNR " : "");
354	return;
355	}
356
357	pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
358	dev_name(nr->ctrl->device),
359	nvme_get_admin_opcode_str(nr->cmd->common.opcode),
360	nr->cmd->common.opcode,
361	nvme_get_error_status_str(nr->status),
362	NVME_SCT(nr->status), / Status Code Type /
363	nr->status & NVME_SC_MASK, / Status Code /
364	nr->status & NVME_STATUS_MORE ? "MORE " : "",
365	nr->status & NVME_STATUS_DNR ? "DNR " : "");
366	}
367
368	static void nvme_log_err_passthru(struct request *req)
369	{
370	struct nvme_ns *ns = req->q->queuedata;
371	struct nvme_request *nr = nvme_req(req);
372
373	pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s"
374	"cdw10=0x%x cdw11=0x%x cdw12=0x%x cdw13=0x%x cdw14=0x%x cdw15=0x%x\n",
375	ns ? ns->disk->disk_name : dev_name(nr->ctrl->device),
376	ns ? nvme_get_opcode_str(nr->cmd->common.opcode) :
377	nvme_get_admin_opcode_str(nr->cmd->common.opcode),
378	nr->cmd->common.opcode,
379	nvme_get_error_status_str(nr->status),
380	NVME_SCT(nr->status), / Status Code Type /
381	nr->status & NVME_SC_MASK, / Status Code /
382	nr->status & NVME_STATUS_MORE ? "MORE " : "",
383	nr->status & NVME_STATUS_DNR ? "DNR " : "",
384	le32_to_cpu(nr->cmd->common.cdw10),
385	le32_to_cpu(nr->cmd->common.cdw11),
386	le32_to_cpu(nr->cmd->common.cdw12),
387	le32_to_cpu(nr->cmd->common.cdw13),
388	le32_to_cpu(nr->cmd->common.cdw14),
389	le32_to_cpu(nr->cmd->common.cdw15));
390	}
391
/* What to do with a request once its completion status is known. */
enum nvme_disposition {
	COMPLETE,	/* end the request */
	RETRY,		/* requeue it */
	FAILOVER,	/* hand it to the multipath failover code */
	AUTHENTICATE,	/* status was NVME_SC_AUTH_REQUIRED */
};
398
399	static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
400	{
401	if (likely(nvme_req(req)->status == `0`))
402	return COMPLETE;
403
404	if (blk_noretry_request(req) \|\|
405	(nvme_req(req)->status & NVME_STATUS_DNR) \|\|
406	nvme_req(req)->retries >= nvme_max_retries)
407	return COMPLETE;
408
409	if ((nvme_req(req)->status & NVME_SCT_SC_MASK) == NVME_SC_AUTH_REQUIRED)
410	return AUTHENTICATE;
411
412	if (req->cmd_flags & REQ_NVME_MPATH) {
413	if (nvme_is_path_error(status: nvme_req(req)->status) \|\|
414	blk_queue_dying(req->q))
415	return FAILOVER;
416	} else {
417	if (blk_queue_dying(req->q))
418	return COMPLETE;
419	}
420
421	return RETRY;
422	}
423
424	static inline void nvme_end_req_zoned(struct request *req)
425	{
426	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
427	req_op(req) == REQ_OP_ZONE_APPEND) {
428	struct nvme_ns *ns = req->q->queuedata;
429
430	req->__sector = nvme_lba_to_sect(head: ns->head,
431	le64_to_cpu(nvme_req(req)->result.u64));
432	}
433	}
434
435	static inline void __nvme_end_req(struct request *req)
436	{
437	if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
438	if (blk_rq_is_passthrough(rq: req))
439	nvme_log_err_passthru(req);
440	else
441	nvme_log_error(req);
442	}
443	nvme_end_req_zoned(req);
444	nvme_trace_bio_complete(req);
445	if (req->cmd_flags & REQ_NVME_MPATH)
446	nvme_mpath_end_request(rq: req);
447	}
448
449	void nvme_end_req(struct request *req)
450	{
451	blk_status_t status = nvme_error_status(status: nvme_req(req)->status);
452
453	__nvme_end_req(req);
454	blk_mq_end_request(rq: req, error: status);
455	}
456
457	void nvme_complete_rq(struct request *req)
458	{
459	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
460
461	trace_nvme_complete_rq(req);
462	nvme_cleanup_cmd(req);
463
464	/*
465	* Completions of long-running commands should not be able to
466	* defer sending of periodic keep alives, since the controller
467	* may have completed processing such commands a long time ago
468	* (arbitrarily close to command submission time).
469	* req->deadline - req->timeout is the command submission time
470	* in jiffies.
471	*/
472	if (ctrl->kas &&
473	req->deadline - req->timeout >= ctrl->ka_last_check_time)
474	ctrl->comp_seen = true;
475
476	switch (nvme_decide_disposition(req)) {
477	case COMPLETE:
478	nvme_end_req(req);
479	return;
480	case RETRY:
481	nvme_retry_req(req);
482	return;
483	case FAILOVER:
484	nvme_failover_req(req);
485	return;
486	case AUTHENTICATE:
487	#ifdef CONFIG_NVME_HOST_AUTH
488	queue_work(wq: nvme_wq, work: &ctrl->dhchap_auth_work);
489	nvme_retry_req(req);
490	#else
491	nvme_end_req(req);
492	#endif
493	return;
494	}
495	}
496	EXPORT_SYMBOL_GPL(nvme_complete_rq);
497
/*
 * Per-request part of a batched completion: trace, release command resources
 * and run the common end-of-request processing. The request is not ended
 * here.
 */
void nvme_complete_batch_req(struct request *req)
{
	trace_nvme_complete_rq(req);
	nvme_cleanup_cmd(req);
	__nvme_end_req(req);
}
EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
505
506	/*
507	* Called to unwind from ->queue_rq on a failed command submission so that the
508	* multipathing code gets called to potentially failover to another path.
509	* The caller needs to unwind all transport specific resource allocations and
510	* must return propagate the return value.
511	*/
512	blk_status_t nvme_host_path_error(struct request *req)
513	{
514	nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
515	blk_mq_set_request_complete(rq: req);
516	nvme_complete_rq(req);
517	return BLK_STS_OK;
518	}
519	EXPORT_SYMBOL_GPL(nvme_host_path_error);
520
521	bool nvme_cancel_request(struct request req, void* *data)
522	{
523	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
524	"Cancelling I/O %d", req->tag);
525
526	/ don't abort one completed or idle request /
527	if (blk_mq_rq_state(rq: req) != MQ_RQ_IN_FLIGHT)
528	return true;
529
530	nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
531	nvme_req(req)->flags \|= NVME_REQ_CANCELLED;
532	blk_mq_complete_request(rq: req);
533	return true;
534	}
535	EXPORT_SYMBOL_GPL(nvme_cancel_request);
536
537	void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
538	{
539	if (ctrl->tagset) {
540	blk_mq_tagset_busy_iter(tagset: ctrl->tagset,
541	fn: nvme_cancel_request, priv: ctrl);
542	blk_mq_tagset_wait_completed_request(tagset: ctrl->tagset);
543	}
544	}
545	EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
546
547	void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
548	{
549	if (ctrl->admin_tagset) {
550	blk_mq_tagset_busy_iter(tagset: ctrl->admin_tagset,
551	fn: nvme_cancel_request, priv: ctrl);
552	blk_mq_tagset_wait_completed_request(tagset: ctrl->admin_tagset);
553	}
554	}
555	EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
556
557	bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
558	enum nvme_ctrl_state new_state)
559	{
560	enum nvme_ctrl_state old_state;
561	unsigned long flags;
562	bool changed = false;
563
564	spin_lock_irqsave(&ctrl->lock, flags);
565
566	old_state = nvme_ctrl_state(ctrl);
567	switch (new_state) {
568	case NVME_CTRL_LIVE:
569	switch (old_state) {
570	case NVME_CTRL_CONNECTING:
571	changed = true;
572	fallthrough;
573	default:
574	break;
575	}
576	break;
577	case NVME_CTRL_RESETTING:
578	switch (old_state) {
579	case NVME_CTRL_NEW:
580	case NVME_CTRL_LIVE:
581	changed = true;
582	fallthrough;
583	default:
584	break;
585	}
586	break;
587	case NVME_CTRL_CONNECTING:
588	switch (old_state) {
589	case NVME_CTRL_NEW:
590	case NVME_CTRL_RESETTING:
591	changed = true;
592	fallthrough;
593	default:
594	break;
595	}
596	break;
597	case NVME_CTRL_DELETING:
598	switch (old_state) {
599	case NVME_CTRL_LIVE:
600	case NVME_CTRL_RESETTING:
601	case NVME_CTRL_CONNECTING:
602	changed = true;
603	fallthrough;
604	default:
605	break;
606	}
607	break;
608	case NVME_CTRL_DELETING_NOIO:
609	switch (old_state) {
610	case NVME_CTRL_DELETING:
611	case NVME_CTRL_DEAD:
612	changed = true;
613	fallthrough;
614	default:
615	break;
616	}
617	break;
618	case NVME_CTRL_DEAD:
619	switch (old_state) {
620	case NVME_CTRL_DELETING:
621	changed = true;
622	fallthrough;
623	default:
624	break;
625	}
626	break;
627	default:
628	break;
629	}
630
631	if (changed) {
632	WRITE_ONCE(ctrl->state, new_state);
633	wake_up_all(&ctrl->state_wq);
634	}
635
636	spin_unlock_irqrestore(lock: &ctrl->lock, flags);
637	if (!changed)
638	return false;
639
640	if (new_state == NVME_CTRL_LIVE) {
641	if (old_state == NVME_CTRL_CONNECTING)
642	nvme_stop_failfast_work(ctrl);
643	nvme_kick_requeue_lists(ctrl);
644	} else if (new_state == NVME_CTRL_CONNECTING &&
645	old_state == NVME_CTRL_RESETTING) {
646	nvme_start_failfast_work(ctrl);
647	}
648	return changed;
649	}
650	EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
651
652	/*
653	* Waits for the controller state to be resetting, or returns false if it is
654	* not possible to ever transition to that state.
655	*/
656	bool nvme_wait_reset(struct nvme_ctrl *ctrl)
657	{
658	wait_event(ctrl->state_wq,
659	nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) \|\|
660	nvme_state_terminal(ctrl));
661	return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
662	}
663	EXPORT_SYMBOL_GPL(nvme_wait_reset);
664
665	static void nvme_free_ns_head(struct kref *ref)
666	{
667	struct nvme_ns_head *head =
668	container_of(ref, struct nvme_ns_head, ref);
669
670	nvme_mpath_put_disk(head);
671	ida_free(&head->subsys->ns_ida, id: head->instance);
672	cleanup_srcu_struct(ssp: &head->srcu);
673	nvme_put_subsystem(subsys: head->subsys);
674	kfree(objp: head->plids);
675	kfree(objp: head);
676	}
677
678	bool nvme_tryget_ns_head(struct nvme_ns_head *head)
679	{
680	return kref_get_unless_zero(kref: &head->ref);
681	}
682
683	void nvme_put_ns_head(struct nvme_ns_head *head)
684	{
685	kref_put(kref: &head->ref, release: nvme_free_ns_head);
686	}
687
688	static void nvme_free_ns(struct kref *kref)
689	{
690	struct nvme_ns ns = container_of(kref, struct* nvme_ns, kref);
691
692	put_disk(disk: ns->disk);
693	nvme_put_ns_head(head: ns->head);
694	nvme_put_ctrl(ctrl: ns->ctrl);
695	kfree(objp: ns);
696	}
697
698	bool nvme_get_ns(struct nvme_ns *ns)
699	{
700	return kref_get_unless_zero(kref: &ns->kref);
701	}
702
703	void nvme_put_ns(struct nvme_ns *ns)
704	{
705	kref_put(kref: &ns->kref, release: nvme_free_ns);
706	}
707	EXPORT_SYMBOL_NS_GPL(nvme_put_ns, "NVME_TARGET_PASSTHRU");
708
709	static inline void nvme_clear_nvme_request(struct request *req)
710	{
711	nvme_req(req)->status = `0`;
712	nvme_req(req)->retries = `0`;
713	nvme_req(req)->flags = `0`;
714	req->rq_flags \|= RQF_DONTPREP;
715	}
716
717	/ initialize a passthrough request /
718	void nvme_init_request(struct request req, struct* nvme_command *cmd)
719	{
720	struct nvme_request *nr = nvme_req(req);
721	bool logging_enabled;
722
723	if (req->q->queuedata) {
724	struct nvme_ns *ns = req->q->disk->private_data;
725
726	logging_enabled = ns->head->passthru_err_log_enabled;
727	req->timeout = NVME_IO_TIMEOUT;
728	} else { / no queuedata implies admin queue /
729	logging_enabled = nr->ctrl->passthru_err_log_enabled;
730	req->timeout = NVME_ADMIN_TIMEOUT;
731	}
732
733	if (!logging_enabled)
734	req->rq_flags \|= RQF_QUIET;
735
736	/ passthru commands should let the driver set the SGL flags /
737	cmd->common.flags &= ~NVME_CMD_SGL_ALL;
738
739	req->cmd_flags \|= REQ_FAILFAST_DRIVER;
740	if (req->mq_hctx->type == HCTX_TYPE_POLL)
741	req->cmd_flags \|= REQ_POLLED;
742	nvme_clear_nvme_request(req);
743	memcpy(nr->cmd, cmd, sizeof(*cmd));
744	}
745	EXPORT_SYMBOL_GPL(nvme_init_request);
746
747	/*
748	* For something we're not in a state to send to the device the default action
749	* is to busy it and retry it after the controller state is recovered. However,
750	* if the controller is deleting or if anything is marked for failfast or
751	* nvme multipath it is immediately failed.
752	*
753	* Note: commands used to initialize the controller will be marked for failfast.
754	* Note: nvme cli/ioctl commands are marked for failfast.
755	*/
756	blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
757	struct request *rq)
758	{
759	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
760
761	if (state != NVME_CTRL_DELETING_NOIO &&
762	state != NVME_CTRL_DELETING &&
763	state != NVME_CTRL_DEAD &&
764	!test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
765	!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
766	return BLK_STS_RESOURCE;
767
768	if (!(rq->rq_flags & RQF_DONTPREP))
769	nvme_clear_nvme_request(req: rq);
770
771	return nvme_host_path_error(rq);
772	}
773	EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
774
775	bool __nvme_check_ready(struct nvme_ctrl ctrl, struct* request *rq,
776	bool queue_live, enum nvme_ctrl_state state)
777	{
778	struct nvme_request *req = nvme_req(req: rq);
779
780	/*
781	* currently we have a problem sending passthru commands
782	* on the admin_q if the controller is not LIVE because we can't
783	* make sure that they are going out after the admin connect,
784	* controller enable and/or other commands in the initialization
785	* sequence. until the controller will be LIVE, fail with
786	* BLK_STS_RESOURCE so that they will be rescheduled.
787	*/
788	if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
789	return false;
790
791	if (ctrl->ops->flags & NVME_F_FABRICS) {
792	/*
793	* Only allow commands on a live queue, except for the connect
794	* command, which is require to set the queue live in the
795	* appropinquate states.
796	*/
797	switch (state) {
798	case NVME_CTRL_CONNECTING:
799	if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(cmd: req->cmd) &&
800	(req->cmd->fabrics.fctype == nvme_fabrics_type_connect \|\|
801	req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send \|\|
802	req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
803	return true;
804	break;
805	default:
806	break;
807	case NVME_CTRL_DEAD:
808	return false;
809	}
810	}
811
812	return queue_live;
813	}
814	EXPORT_SYMBOL_GPL(__nvme_check_ready);
815
816	static inline void nvme_setup_flush(struct nvme_ns *ns,
817	struct nvme_command *cmnd)
818	{
819	memset(cmnd, `0`, sizeof(*cmnd));
820	cmnd->common.opcode = nvme_cmd_flush;
821	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
822	}
823
824	static blk_status_t nvme_setup_discard(struct nvme_ns ns, struct* request *req,
825	struct nvme_command *cmnd)
826	{
827	unsigned short segments = blk_rq_nr_discard_segments(rq: req), n = `0`;
828	struct nvme_dsm_range *range;
829	struct bio *bio;
830
831	/*
832	* Some devices do not consider the DSM 'Number of Ranges' field when
833	* determining how much data to DMA. Always allocate memory for maximum
834	* number of segments to prevent device reading beyond end of buffer.
835	*/
836	static const size_t alloc_size = sizeof(range) NVME_DSM_MAX_RANGES;
837
838	range = kzalloc(alloc_size, GFP_ATOMIC \| __GFP_NOWARN);
839	if (!range) {
840	/*
841	* If we fail allocation our range, fallback to the controller
842	* discard page. If that's also busy, it's safe to return
843	* busy, as we know we can make progress once that's freed.
844	*/
845	if (test_and_set_bit_lock(nr: `0`, addr: &ns->ctrl->discard_page_busy))
846	return BLK_STS_RESOURCE;
847
848	range = page_address(ns->ctrl->discard_page);
849	}
850
851	if (queue_max_discard_segments(q: req->q) == `1`) {
852	u64 slba = nvme_sect_to_lba(head: ns->head, sector: blk_rq_pos(rq: req));
853	u32 nlb = blk_rq_sectors(rq: req) >> (ns->head->lba_shift - `9`);
854
855	range[`0`].cattr = cpu_to_le32(`0`);
856	range[`0`].nlb = cpu_to_le32(nlb);
857	range[`0`].slba = cpu_to_le64(slba);
858	n = `1`;
859	} else {
860	__rq_for_each_bio(bio, req) {
861	u64 slba = nvme_sect_to_lba(head: ns->head,
862	sector: bio->bi_iter.bi_sector);
863	u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
864
865	if (n < segments) {
866	range[n].cattr = cpu_to_le32(`0`);
867	range[n].nlb = cpu_to_le32(nlb);
868	range[n].slba = cpu_to_le64(slba);
869	}
870	n++;
871	}
872	}
873
874	if (WARN_ON_ONCE(n != segments)) {
875	if (virt_to_page(range) == ns->ctrl->discard_page)
876	clear_bit_unlock(nr: `0`, addr: &ns->ctrl->discard_page_busy);
877	else
878	kfree(objp: range);
879	return BLK_STS_IOERR;
880	}
881
882	memset(cmnd, `0`, sizeof(*cmnd));
883	cmnd->dsm.opcode = nvme_cmd_dsm;
884	cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
885	cmnd->dsm.nr = cpu_to_le32(segments - `1`);
886	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
887
888	bvec_set_virt(bv: &req->special_vec, vaddr: range, len: alloc_size);
889	req->rq_flags \|= RQF_SPECIAL_PAYLOAD;
890
891	return BLK_STS_OK;
892	}
893
894	static void nvme_set_app_tag(struct request req, struct* nvme_command *cmnd)
895	{
896	cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag);
897	cmnd->rw.lbatm = cpu_to_le16(`0xffff`);
898	}
899
900	static void nvme_set_ref_tag(struct nvme_ns ns, struct* nvme_command *cmnd,
901	struct request *req)
902	{
903	u32 upper, lower;
904	u64 ref48;
905
906	/ only type1 and type 2 PI formats have a reftag /
907	switch (ns->head->pi_type) {
908	case NVME_NS_DPS_PI_TYPE1:
909	case NVME_NS_DPS_PI_TYPE2:
910	break;
911	default:
912	return;
913	}
914
915	/ both rw and write zeroes share the same reftag format /
916	switch (ns->head->guard_type) {
917	case NVME_NVM_NS_16B_GUARD:
918	cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
919	break;
920	case NVME_NVM_NS_64B_GUARD:
921	ref48 = ext_pi_ref_tag(rq: req);
922	lower = lower_32_bits(ref48);
923	upper = upper_32_bits(ref48);
924
925	cmnd->rw.reftag = cpu_to_le32(lower);
926	cmnd->rw.cdw3 = cpu_to_le32(upper);
927	break;
928	default:
929	break;
930	}
931	}
932
933	static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
934	struct request req, struct* nvme_command *cmnd)
935	{
936	memset(cmnd, `0`, sizeof(*cmnd));
937
938	if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
939	return nvme_setup_discard(ns, req, cmnd);
940
941	cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
942	cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
943	cmnd->write_zeroes.slba =
944	cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
945	cmnd->write_zeroes.length =
946	cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - `1`);
947
948	if (!(req->cmd_flags & REQ_NOUNMAP) &&
949	(ns->head->features & NVME_NS_DEAC))
950	cmnd->write_zeroes.control \|= cpu_to_le16(NVME_WZ_DEAC);
951
952	if (nvme_ns_has_pi(head: ns->head)) {
953	cmnd->write_zeroes.control \|= cpu_to_le16(NVME_RW_PRINFO_PRACT);
954	nvme_set_ref_tag(ns, cmnd, req);
955	}
956
957	return BLK_STS_OK;
958	}
959
960	/*
961	* NVMe does not support a dedicated command to issue an atomic write. A write
962	* which does adhere to the device atomic limits will silently be executed
963	* non-atomically. The request issuer should ensure that the write is within
964	* the queue atomic writes limits, but just validate this in case it is not.
965	*/
966	static bool nvme_valid_atomic_write(struct request *req)
967	{
968	struct request_queue *q = req->q;
969	u32 boundary_bytes = queue_atomic_write_boundary_bytes(q);
970
971	if (blk_rq_bytes(rq: req) > queue_atomic_write_unit_max_bytes(q))
972	return false;
973
974	if (boundary_bytes) {
975	u64 mask = boundary_bytes - `1`, imask = ~mask;
976	u64 start = blk_rq_pos(rq: req) << SECTOR_SHIFT;
977	u64 end = start + blk_rq_bytes(rq: req) - `1`;
978
979	/ If greater then must be crossing a boundary /
980	if (blk_rq_bytes(rq: req) > boundary_bytes)
981	return false;
982
983	if ((start & imask) != (end & imask))
984	return false;
985	}
986
987	return true;
988	}
989
990	static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
991	struct request req, struct* nvme_command *cmnd,
992	enum nvme_opcode op)
993	{
994	u16 control = `0`;
995	u32 dsmgmt = `0`;
996
997	if (req->cmd_flags & REQ_FUA)
998	control \|= NVME_RW_FUA;
999	if (req->cmd_flags & (REQ_FAILFAST_DEV \| REQ_RAHEAD))
1000	control \|= NVME_RW_LR;
1001
1002	if (req->cmd_flags & REQ_RAHEAD)
1003	dsmgmt \|= NVME_RW_DSM_FREQ_PREFETCH;
1004
1005	if (op == nvme_cmd_write && ns->head->nr_plids) {
1006	u16 write_stream = req->bio->bi_write_stream;
1007
1008	if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
1009	return BLK_STS_INVAL;
1010
1011	if (write_stream) {
1012	dsmgmt \|= ns->head->plids[write_stream - `1`] << `16`;
1013	control \|= NVME_RW_DTYPE_DPLCMT;
1014	}
1015	}
1016
1017	if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
1018	return BLK_STS_INVAL;
1019
1020	cmnd->rw.opcode = op;
1021	cmnd->rw.flags = `0`;
1022	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
1023	cmnd->rw.cdw2 = `0`;
1024	cmnd->rw.cdw3 = `0`;
1025	cmnd->rw.metadata = `0`;
1026	cmnd->rw.slba =
1027	cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
1028	cmnd->rw.length =
1029	cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - `1`);
1030	cmnd->rw.reftag = `0`;
1031	cmnd->rw.lbat = `0`;
1032	cmnd->rw.lbatm = `0`;
1033
1034	if (ns->head->ms) {
1035	/*
1036	* If formatted with metadata, the block layer always provides a
1037	* metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
1038	* we enable the PRACT bit for protection information or set the
1039	* namespace capacity to zero to prevent any I/O.
1040	*/
1041	if (!blk_integrity_rq(rq: req)) {
1042	if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
1043	return BLK_STS_NOTSUPP;
1044	control \|= NVME_RW_PRINFO_PRACT;
1045	nvme_set_ref_tag(ns, cmnd, req);
1046	}
1047
1048	if (bio_integrity_flagged(bio: req->bio, flag: BIP_CHECK_GUARD))
1049	control \|= NVME_RW_PRINFO_PRCHK_GUARD;
1050	if (bio_integrity_flagged(bio: req->bio, flag: BIP_CHECK_REFTAG)) {
1051	control \|= NVME_RW_PRINFO_PRCHK_REF;
1052	if (op == nvme_cmd_zone_append)
1053	control \|= NVME_RW_APPEND_PIREMAP;
1054	nvme_set_ref_tag(ns, cmnd, req);
1055	}
1056	if (bio_integrity_flagged(bio: req->bio, flag: BIP_CHECK_APPTAG)) {
1057	control \|= NVME_RW_PRINFO_PRCHK_APP;
1058	nvme_set_app_tag(req, cmnd);
1059	}
1060	}
1061
1062	cmnd->rw.control = cpu_to_le16(control);
1063	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
1064	return `0`;
1065	}
1066
1067	void nvme_cleanup_cmd(struct request *req)
1068	{
1069	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
1070	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
1071
1072	if (req->special_vec.bv_page == ctrl->discard_page)
1073	clear_bit_unlock(nr: `0`, addr: &ctrl->discard_page_busy);
1074	else
1075	kfree(objp: bvec_virt(bvec: &req->special_vec));
1076	req->rq_flags &= ~RQF_SPECIAL_PAYLOAD;
1077	}
1078	}
1079	EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
1080
1081	blk_status_t nvme_setup_cmd(struct nvme_ns ns, struct* request *req)
1082	{
1083	struct nvme_command *cmd = nvme_req(req)->cmd;
1084	blk_status_t ret = BLK_STS_OK;
1085
1086	if (!(req->rq_flags & RQF_DONTPREP))
1087	nvme_clear_nvme_request(req);
1088
1089	switch (req_op(req)) {
1090	case REQ_OP_DRV_IN:
1091	case REQ_OP_DRV_OUT:
1092	/ these are setup prior to execution in nvme_init_request() /
1093	break;
1094	case REQ_OP_FLUSH:
1095	nvme_setup_flush(ns, cmnd: cmd);
1096	break;
1097	case REQ_OP_ZONE_RESET_ALL:
1098	case REQ_OP_ZONE_RESET:
1099	ret = nvme_setup_zone_mgmt_send(ns, req, cmnd: cmd, action: NVME_ZONE_RESET);
1100	break;
1101	case REQ_OP_ZONE_OPEN:
1102	ret = nvme_setup_zone_mgmt_send(ns, req, cmnd: cmd, action: NVME_ZONE_OPEN);
1103	break;
1104	case REQ_OP_ZONE_CLOSE:
1105	ret = nvme_setup_zone_mgmt_send(ns, req, cmnd: cmd, action: NVME_ZONE_CLOSE);
1106	break;
1107	case REQ_OP_ZONE_FINISH:
1108	ret = nvme_setup_zone_mgmt_send(ns, req, cmnd: cmd, action: NVME_ZONE_FINISH);
1109	break;
1110	case REQ_OP_WRITE_ZEROES:
1111	ret = nvme_setup_write_zeroes(ns, req, cmnd: cmd);
1112	break;
1113	case REQ_OP_DISCARD:
1114	ret = nvme_setup_discard(ns, req, cmnd: cmd);
1115	break;
1116	case REQ_OP_READ:
1117	ret = nvme_setup_rw(ns, req, cmnd: cmd, op: nvme_cmd_read);
1118	break;
1119	case REQ_OP_WRITE:
1120	ret = nvme_setup_rw(ns, req, cmnd: cmd, op: nvme_cmd_write);
1121	break;
1122	case REQ_OP_ZONE_APPEND:
1123	ret = nvme_setup_rw(ns, req, cmnd: cmd, op: nvme_cmd_zone_append);
1124	break;
1125	default:
1126	WARN_ON_ONCE(`1`);
1127	return BLK_STS_IOERR;
1128	}
1129
1130	cmd->common.command_id = nvme_cid(rq: req);
1131	trace_nvme_setup_cmd(req, cmd);
1132	return ret;
1133	}
1134	EXPORT_SYMBOL_GPL(nvme_setup_cmd);
1135
1136	/*
1137	* Return values:
1138	* 0: success
1139	* >0: nvme controller's cqe status response
1140	* <0: kernel error in lieu of controller response
1141	*/
1142	int nvme_execute_rq(struct request *rq, bool at_head)
1143	{
1144	blk_status_t status;
1145
1146	status = blk_execute_rq(rq, at_head);
1147	if (nvme_req(req: rq)->flags & NVME_REQ_CANCELLED)
1148	return -EINTR;
1149	if (nvme_req(req: rq)->status)
1150	return nvme_req(req: rq)->status;
1151	return blk_status_to_errno(status);
1152	}
1153	EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, "NVME_TARGET_PASSTHRU");
1154
1155	/*
1156	* Returns 0 on success. If the result is negative, it's a Linux error code;
1157	* if the result is positive, it's an NVM Express status code
1158	*/
1159	int __nvme_submit_sync_cmd(struct request_queue q, struct* nvme_command *cmd,
1160	union nvme_result result, void* buffer, unsigned* bufflen,
1161	int qid, nvme_submit_flags_t flags)
1162	{
1163	struct request *req;
1164	int ret;
1165	blk_mq_req_flags_t blk_flags = `0`;
1166
1167	if (flags & NVME_SUBMIT_NOWAIT)
1168	blk_flags \|= BLK_MQ_REQ_NOWAIT;
1169	if (flags & NVME_SUBMIT_RESERVED)
1170	blk_flags \|= BLK_MQ_REQ_RESERVED;
1171	if (qid == NVME_QID_ANY)
1172	req = blk_mq_alloc_request(q, opf: nvme_req_op(cmd), flags: blk_flags);
1173	else
1174	req = blk_mq_alloc_request_hctx(q, opf: nvme_req_op(cmd), flags: blk_flags,
1175	hctx_idx: qid - `1`);
1176
1177	if (IS_ERR(ptr: req))
1178	return PTR_ERR(ptr: req);
1179	nvme_init_request(req, cmd);
1180	if (flags & NVME_SUBMIT_RETRY)
1181	req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
1182
1183	if (buffer && bufflen) {
1184	ret = blk_rq_map_kern(rq: req, kbuf: buffer, len: bufflen, GFP_KERNEL);
1185	if (ret)
1186	goto out;
1187	}
1188
1189	ret = nvme_execute_rq(req, flags & NVME_SUBMIT_AT_HEAD);
1190	if (result && ret >= `0`)
1191	*result = nvme_req(req)->result;
1192	out:
1193	blk_mq_free_request(rq: req);
1194	return ret;
1195	}
1196	EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
1197
1198	int nvme_submit_sync_cmd(struct request_queue q, struct* nvme_command *cmd,
1199	void buffer, unsigned* bufflen)
1200	{
1201	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
1202	NVME_QID_ANY, `0`);
1203	}
1204	EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
1205
1206	u32 nvme_command_effects(struct nvme_ctrl ctrl, struct* nvme_ns *ns, u8 opcode)
1207	{
1208	u32 effects = `0`;
1209
1210	if (ns) {
1211	effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1212	if (effects & ~(NVME_CMD_EFFECTS_CSUPP \| NVME_CMD_EFFECTS_LBCC))
1213	dev_warn_once(ctrl->device,
1214	"IO command:%02x has unusual effects:%08x\n",
1215	opcode, effects);
1216
1217	/*
1218	* NVME_CMD_EFFECTS_CSE_MASK causes a freeze all I/O queues,
1219	* which would deadlock when done on an I/O command. Note that
1220	* We already warn about an unusual effect above.
1221	*/
1222	effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1223	} else {
1224	effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1225
1226	/ Ignore execution restrictions if any relaxation bits are set /
1227	if (effects & NVME_CMD_EFFECTS_CSER_MASK)
1228	effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1229	}
1230
1231	return effects;
1232	}
1233	EXPORT_SYMBOL_NS_GPL(nvme_command_effects, "NVME_TARGET_PASSTHRU");
1234
1235	u32 nvme_passthru_start(struct nvme_ctrl ctrl, struct* nvme_ns *ns, u8 opcode)
1236	{
1237	u32 effects = nvme_command_effects(ctrl, ns, opcode);
1238
1239	/*
1240	* For simplicity, IO to all namespaces is quiesced even if the command
1241	* effects say only one namespace is affected.
1242	*/
1243	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1244	mutex_lock(&ctrl->scan_lock);
1245	mutex_lock(&ctrl->subsys->lock);
1246	nvme_mpath_start_freeze(subsys: ctrl->subsys);
1247	nvme_mpath_wait_freeze(subsys: ctrl->subsys);
1248	nvme_start_freeze(ctrl);
1249	nvme_wait_freeze(ctrl);
1250	}
1251	return effects;
1252	}
1253	EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, "NVME_TARGET_PASSTHRU");
1254
1255	void nvme_passthru_end(struct nvme_ctrl ctrl, struct* nvme_ns *ns, u32 effects,
1256	struct nvme_command cmd, int* status)
1257	{
1258	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1259	nvme_unfreeze(ctrl);
1260	nvme_mpath_unfreeze(subsys: ctrl->subsys);
1261	mutex_unlock(lock: &ctrl->subsys->lock);
1262	mutex_unlock(lock: &ctrl->scan_lock);
1263	}
1264	if (effects & NVME_CMD_EFFECTS_CCC) {
1265	if (!test_and_set_bit(nr: NVME_CTRL_DIRTY_CAPABILITY,
1266	addr: &ctrl->flags)) {
1267	dev_info(ctrl->device,
1268	"controller capabilities changed, reset may be required to take effect.\n");
1269	}
1270	}
1271	if (effects & (NVME_CMD_EFFECTS_NIC \| NVME_CMD_EFFECTS_NCC)) {
1272	nvme_queue_scan(ctrl);
1273	flush_work(work: &ctrl->scan_work);
1274	}
1275	if (ns)
1276	return;
1277
1278	switch (cmd->common.opcode) {
1279	case nvme_admin_set_features:
1280	switch (le32_to_cpu(cmd->common.cdw10) & `0xFF`) {
1281	case NVME_FEAT_KATO:
1282	/*
1283	* Keep alive commands interval on the host should be
1284	* updated when KATO is modified by Set Features
1285	* commands.
1286	*/
1287	if (!status)
1288	nvme_update_keep_alive(ctrl, cmd);
1289	break;
1290	default:
1291	break;
1292	}
1293	break;
1294	default:
1295	break;
1296	}
1297	}
1298	EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, "NVME_TARGET_PASSTHRU");
1299
1300	/*
1301	* Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
1302	*
1303	* The host should send Keep Alive commands at half of the Keep Alive Timeout
1304	* accounting for transport roundtrip times [..].
1305	*/
1306	static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
1307	{
1308	unsigned long delay = ctrl->kato * HZ / `2`;
1309
1310	/*
1311	* When using Traffic Based Keep Alive, we need to run
1312	* nvme_keep_alive_work at twice the normal frequency, as one
1313	* command completion can postpone sending a keep alive command
1314	* by up to twice the delay between runs.
1315	*/
1316	if (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS)
1317	delay /= `2`;
1318	return delay;
1319	}
1320
1321	static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
1322	{
1323	unsigned long now = jiffies;
1324	unsigned long delay = nvme_keep_alive_work_period(ctrl);
1325	unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
1326
1327	if (time_after(now, ka_next_check_tm))
1328	delay = `0`;
1329	else
1330	delay = ka_next_check_tm - now;
1331
1332	queue_delayed_work(wq: nvme_wq, dwork: &ctrl->ka_work, delay);
1333	}
1334
1335	static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
1336	blk_status_t status)
1337	{
1338	struct nvme_ctrl *ctrl = rq->end_io_data;
1339	unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
1340	unsigned long delay = nvme_keep_alive_work_period(ctrl);
1341	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
1342
1343	/*
1344	* Subtract off the keepalive RTT so nvme_keep_alive_work runs
1345	* at the desired frequency.
1346	*/
1347	if (rtt <= delay) {
1348	delay -= rtt;
1349	} else {
1350	dev_warn(ctrl->device, "long keepalive RTT (%u ms)\n",
1351	jiffies_to_msecs(rtt));
1352	delay = `0`;
1353	}
1354
1355	blk_mq_free_request(rq);
1356
1357	if (status) {
1358	dev_err(ctrl->device,
1359	"failed nvme_keep_alive_end_io error=%d\n",
1360	status);
1361	return RQ_END_IO_NONE;
1362	}
1363
1364	ctrl->ka_last_check_time = jiffies;
1365	ctrl->comp_seen = false;
1366	if (state == NVME_CTRL_LIVE \|\| state == NVME_CTRL_CONNECTING)
1367	queue_delayed_work(wq: nvme_wq, dwork: &ctrl->ka_work, delay);
1368	return RQ_END_IO_NONE;
1369	}
1370
1371	static void nvme_keep_alive_work(struct work_struct *work)
1372	{
1373	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1374	struct nvme_ctrl, ka_work);
1375	bool comp_seen = ctrl->comp_seen;
1376	struct request *rq;
1377
1378	ctrl->ka_last_check_time = jiffies;
1379
1380	if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1381	dev_dbg(ctrl->device,
1382	"reschedule traffic based keep-alive timer\n");
1383	ctrl->comp_seen = false;
1384	nvme_queue_keep_alive_work(ctrl);
1385	return;
1386	}
1387
1388	rq = blk_mq_alloc_request(q: ctrl->admin_q, opf: nvme_req_op(cmd: &ctrl->ka_cmd),
1389	flags: BLK_MQ_REQ_RESERVED \| BLK_MQ_REQ_NOWAIT);
1390	if (IS_ERR(ptr: rq)) {
1391	/ allocation failure, reset the controller /
1392	dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
1393	nvme_reset_ctrl(ctrl);
1394	return;
1395	}
1396	nvme_init_request(rq, &ctrl->ka_cmd);
1397
1398	rq->timeout = ctrl->kato * HZ;
1399	rq->end_io = nvme_keep_alive_end_io;
1400	rq->end_io_data = ctrl;
1401	blk_execute_rq_nowait(rq, at_head: false);
1402	}
1403
1404	static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1405	{
1406	if (unlikely(ctrl->kato == `0`))
1407	return;
1408
1409	nvme_queue_keep_alive_work(ctrl);
1410	}
1411
1412	void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1413	{
1414	if (unlikely(ctrl->kato == `0`))
1415	return;
1416
1417	cancel_delayed_work_sync(dwork: &ctrl->ka_work);
1418	}
1419	EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1420
1421	static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
1422	struct nvme_command *cmd)
1423	{
1424	unsigned int new_kato =
1425	DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), `1000`);
1426
1427	dev_info(ctrl->device,
1428	"keep alive interval updated from %u ms to %u ms\n",
1429	ctrl->kato * `1000` / `2`, new_kato * `1000` / `2`);
1430
1431	nvme_stop_keep_alive(ctrl);
1432	ctrl->kato = new_kato;
1433	nvme_start_keep_alive(ctrl);
1434	}
1435
1436	static bool nvme_id_cns_ok(struct nvme_ctrl *ctrl, u8 cns)
1437	{
1438	/*
1439	* The CNS field occupies a full byte starting with NVMe 1.2
1440	*/
1441	if (ctrl->vs >= NVME_VS(`1`, `2`, `0`))
1442	return true;
1443
1444	/*
1445	* NVMe 1.1 expanded the CNS value to two bits, which means values
1446	* larger than that could get truncated and treated as an incorrect
1447	* value.
1448	*
1449	* Qemu implemented 1.0 behavior for controllers claiming 1.1
1450	* compliance, so they need to be quirked here.
1451	*/
1452	if (ctrl->vs >= NVME_VS(`1`, `1`, `0`) &&
1453	!(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS))
1454	return cns <= `3`;
1455
1456	/*
1457	* NVMe 1.0 used a single bit for the CNS value.
1458	*/
1459	return cns <= `1`;
1460	}
1461
1462	static int nvme_identify_ctrl(struct nvme_ctrl dev, struct* nvme_id_ctrl **id)
1463	{
1464	struct nvme_command c = { };
1465	int error;
1466
1467	/ gcc-4.4.4 (at least) has issues with initializers and anon unions /
1468	c.identify.opcode = nvme_admin_identify;
1469	c.identify.cns = NVME_ID_CNS_CTRL;
1470
1471	id = kmalloc(sizeof(struct* nvme_id_ctrl), GFP_KERNEL);
1472	if (!*id)
1473	return -ENOMEM;
1474
1475	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1476	sizeof(struct nvme_id_ctrl));
1477	if (error) {
1478	kfree(objp: *id);
1479	*id = NULL;
1480	}
1481	return error;
1482	}
1483
1484	static int nvme_process_ns_desc(struct nvme_ctrl ctrl, struct* nvme_ns_ids *ids,
1485	struct nvme_ns_id_desc cur, bool csi_seen)
1486	{
1487	const char *warn_str = "ctrl returned bogus length:";
1488	void *data = cur;
1489
1490	switch (cur->nidt) {
1491	case NVME_NIDT_EUI64:
1492	if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1493	dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1494	warn_str, cur->nidl);
1495	return -`1`;
1496	}
1497	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1498	return NVME_NIDT_EUI64_LEN;
1499	memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1500	return NVME_NIDT_EUI64_LEN;
1501	case NVME_NIDT_NGUID:
1502	if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1503	dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1504	warn_str, cur->nidl);
1505	return -`1`;
1506	}
1507	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1508	return NVME_NIDT_NGUID_LEN;
1509	memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1510	return NVME_NIDT_NGUID_LEN;
1511	case NVME_NIDT_UUID:
1512	if (cur->nidl != NVME_NIDT_UUID_LEN) {
1513	dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1514	warn_str, cur->nidl);
1515	return -`1`;
1516	}
1517	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1518	return NVME_NIDT_UUID_LEN;
1519	uuid_copy(dst: &ids->uuid, src: data + sizeof(*cur));
1520	return NVME_NIDT_UUID_LEN;
1521	case NVME_NIDT_CSI:
1522	if (cur->nidl != NVME_NIDT_CSI_LEN) {
1523	dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1524	warn_str, cur->nidl);
1525	return -`1`;
1526	}
1527	memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1528	*csi_seen = true;
1529	return NVME_NIDT_CSI_LEN;
1530	default:
1531	/ Skip unknown types /
1532	return cur->nidl;
1533	}
1534	}
1535
1536	static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
1537	struct nvme_ns_info *info)
1538	{
1539	struct nvme_command c = { };
1540	bool csi_seen = false;
1541	int status, pos, len;
1542	void *data;
1543
1544	if (ctrl->vs < NVME_VS(`1`, `3`, `0`) && !nvme_multi_css(ctrl))
1545	return `0`;
1546	if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1547	return `0`;
1548
1549	c.identify.opcode = nvme_admin_identify;
1550	c.identify.nsid = cpu_to_le32(info->nsid);
1551	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1552
1553	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1554	if (!data)
1555	return -ENOMEM;
1556
1557	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1558	NVME_IDENTIFY_DATA_SIZE);
1559	if (status) {
1560	dev_warn(ctrl->device,
1561	"Identify Descriptors failed (nsid=%u, status=0x%x)\n",
1562	info->nsid, status);
1563	goto free_data;
1564	}
1565
1566	for (pos = `0`; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1567	struct nvme_ns_id_desc *cur = data + pos;
1568
1569	if (cur->nidl == `0`)
1570	break;
1571
1572	len = nvme_process_ns_desc(ctrl, ids: &info->ids, cur, csi_seen: &csi_seen);
1573	if (len < `0`)
1574	break;
1575
1576	len += sizeof(*cur);
1577	}
1578
1579	if (nvme_multi_css(ctrl) && !csi_seen) {
1580	dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1581	info->nsid);
1582	status = -EINVAL;
1583	}
1584
1585	free_data:
1586	kfree(objp: data);
1587	return status;
1588	}
1589
1590	int nvme_identify_ns(struct nvme_ctrl ctrl, unsigned* nsid,
1591	struct nvme_id_ns **id)
1592	{
1593	struct nvme_command c = { };
1594	int error;
1595
1596	/ gcc-4.4.4 (at least) has issues with initializers and anon unions /
1597	c.identify.opcode = nvme_admin_identify;
1598	c.identify.nsid = cpu_to_le32(nsid);
1599	c.identify.cns = NVME_ID_CNS_NS;
1600
1601	id = kmalloc(sizeof(*id), GFP_KERNEL);
1602	if (!*id)
1603	return -ENOMEM;
1604
1605	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
1606	if (error) {
1607	dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1608	kfree(objp: *id);
1609	*id = NULL;
1610	}
1611	return error;
1612	}
1613
1614	static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
1615	struct nvme_ns_info *info)
1616	{
1617	struct nvme_ns_ids *ids = &info->ids;
1618	struct nvme_id_ns *id;
1619	int ret;
1620
1621	ret = nvme_identify_ns(ctrl, nsid: info->nsid, id: &id);
1622	if (ret)
1623	return ret;
1624
1625	if (id->ncap == `0`) {
1626	/ namespace not allocated or attached /
1627	info->is_removed = true;
1628	ret = -ENODEV;
1629	goto error;
1630	}
1631
1632	info->anagrpid = id->anagrpid;
1633	info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1634	info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1635	info->is_ready = true;
1636	info->endgid = le16_to_cpu(id->endgid);
1637	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
1638	dev_info(ctrl->device,
1639	"Ignoring bogus Namespace Identifiers\n");
1640	} else {
1641	if (ctrl->vs >= NVME_VS(`1`, `1`, `0`) &&
1642	!memchr_inv(p: ids->eui64, c: `0`, size: sizeof(ids->eui64)))
1643	memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
1644	if (ctrl->vs >= NVME_VS(`1`, `2`, `0`) &&
1645	!memchr_inv(p: ids->nguid, c: `0`, size: sizeof(ids->nguid)))
1646	memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
1647	}
1648
1649	error:
1650	kfree(objp: id);
1651	return ret;
1652	}
1653
1654	static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
1655	struct nvme_ns_info *info)
1656	{
1657	struct nvme_id_ns_cs_indep *id;
1658	struct nvme_command c = {
1659	.identify.opcode = nvme_admin_identify,
1660	.identify.nsid = cpu_to_le32(info->nsid),
1661	.identify.cns = NVME_ID_CNS_NS_CS_INDEP,
1662	};
1663	int ret;
1664
1665	id = kmalloc(sizeof(*id), GFP_KERNEL);
1666	if (!id)
1667	return -ENOMEM;
1668
1669	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
1670	if (!ret) {
1671	info->anagrpid = id->anagrpid;
1672	info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1673	info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1674	info->is_ready = id->nstat & NVME_NSTAT_NRDY;
1675	info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
1676	info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
1677	info->endgid = le16_to_cpu(id->endgid);
1678	}
1679	kfree(objp: id);
1680	return ret;
1681	}
1682
1683	static int nvme_features(struct nvme_ctrl dev, u8 op, unsigned* int fid,
1684	unsigned int dword11, void buffer, size_t buflen, u32 result)
1685	{
1686	union nvme_result res = { `0` };
1687	struct nvme_command c = { };
1688	int ret;
1689
1690	c.features.opcode = op;
1691	c.features.fid = cpu_to_le32(fid);
1692	c.features.dword11 = cpu_to_le32(dword11);
1693
1694	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1695	buffer, buflen, NVME_QID_ANY, `0`);
1696	if (ret >= `0` && result)
1697	*result = le32_to_cpu(res.u32);
1698	return ret;
1699	}
1700
1701	int nvme_set_features(struct nvme_ctrl dev, unsigned* int fid,
1702	unsigned int dword11, void *buffer, size_t buflen,
1703	void *result)
1704	{
1705	return nvme_features(dev, op: nvme_admin_set_features, fid, dword11, buffer,
1706	buflen, result);
1707	}
1708	EXPORT_SYMBOL_GPL(nvme_set_features);
1709
1710	int nvme_get_features(struct nvme_ctrl dev, unsigned* int fid,
1711	unsigned int dword11, void *buffer, size_t buflen,
1712	void *result)
1713	{
1714	return nvme_features(dev, op: nvme_admin_get_features, fid, dword11, buffer,
1715	buflen, result);
1716	}
1717	EXPORT_SYMBOL_GPL(nvme_get_features);
1718
1719	int nvme_set_queue_count(struct nvme_ctrl ctrl, int* *count)
1720	{
1721	u32 q_count = (count - `1`) \| ((count - `1`) << `16`);
1722	u32 result;
1723	int status, nr_io_queues;
1724
1725	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, `0`,
1726	&result);
1727
1728	/*
1729	* It's either a kernel error or the host observed a connection
1730	* lost. In either case it's not possible communicate with the
1731	* controller and thus enter the error code path.
1732	*/
1733	if (status < `0` \|\| status == NVME_SC_HOST_PATH_ERROR)
1734	return status;
1735
1736	/*
1737	* Degraded controllers might return an error when setting the queue
1738	* count. We still want to be able to bring them online and offer
1739	* access to the admin queue, as that might be only way to fix them up.
1740	*/
1741	if (status > `0`) {
1742	dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1743	*count = `0`;
1744	} else {
1745	nr_io_queues = min(result & `0xffff`, result >> `16`) + `1`;
1746	count = min(count, nr_io_queues);
1747	}
1748
1749	return `0`;
1750	}
1751	EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1752
/* Asynchronous event configuration bits that nvme_enable_aen() may enable. */
#define NVME_AEN_SUPPORTED \
	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
	 NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1756
1757	static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1758	{
1759	u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1760	int status;
1761
1762	if (!supported_aens)
1763	return;
1764
1765	status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1766	NULL, `0`, &result);
1767	if (status)
1768	dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1769	supported_aens);
1770
1771	queue_work(wq: nvme_wq, work: &ctrl->async_event_work);
1772	}
1773
1774	static int nvme_ns_open(struct nvme_ns *ns)
1775	{
1776
1777	/ should never be called due to GENHD_FL_HIDDEN /
1778	if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
1779	goto fail;
1780	if (!nvme_get_ns(ns))
1781	goto fail;
1782	if (!try_module_get(module: ns->ctrl->ops->module))
1783	goto fail_put_ns;
1784
1785	return `0`;
1786
1787	fail_put_ns:
1788	nvme_put_ns(ns);
1789	fail:
1790	return -ENXIO;
1791	}
1792
1793	static void nvme_ns_release(struct nvme_ns *ns)
1794	{
1795
1796	module_put(module: ns->ctrl->ops->module);
1797	nvme_put_ns(ns);
1798	}
1799
1800	static int nvme_open(struct gendisk *disk, blk_mode_t mode)
1801	{
1802	return nvme_ns_open(ns: disk->private_data);
1803	}
1804
1805	static void nvme_release(struct gendisk *disk)
1806	{
1807	nvme_ns_release(ns: disk->private_data);
1808	}
1809
1810	int nvme_getgeo(struct gendisk disk, struct* hd_geometry *geo)
1811	{
1812	/ some standard values /
1813	geo->heads = `1` << `6`;
1814	geo->sectors = `1` << `5`;
1815	geo->cylinders = get_capacity(disk) >> `11`;
1816	return `0`;
1817	}
1818
1819	static bool nvme_init_integrity(struct nvme_ns_head *head,
1820	struct queue_limits lim, struct* nvme_ns_info *info)
1821	{
1822	struct blk_integrity *bi = &lim->integrity;
1823
1824	memset(bi, `0`, sizeof(*bi));
1825
1826	if (!head->ms)
1827	return true;
1828
1829	/*
1830	* PI can always be supported as we can ask the controller to simply
1831	* insert/strip it, which is not possible for other kinds of metadata.
1832	*/
1833	if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) \|\|
1834	!(head->features & NVME_NS_METADATA_SUPPORTED))
1835	return nvme_ns_has_pi(head);
1836
1837	switch (head->pi_type) {
1838	case NVME_NS_DPS_PI_TYPE3:
1839	switch (head->guard_type) {
1840	case NVME_NVM_NS_16B_GUARD:
1841	bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
1842	bi->tag_size = sizeof(u16) + sizeof(u32);
1843	bi->flags \|= BLK_INTEGRITY_DEVICE_CAPABLE;
1844	break;
1845	case NVME_NVM_NS_64B_GUARD:
1846	bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
1847	bi->tag_size = sizeof(u16) + `6`;
1848	bi->flags \|= BLK_INTEGRITY_DEVICE_CAPABLE;
1849	break;
1850	default:
1851	break;
1852	}
1853	break;
1854	case NVME_NS_DPS_PI_TYPE1:
1855	case NVME_NS_DPS_PI_TYPE2:
1856	switch (head->guard_type) {
1857	case NVME_NVM_NS_16B_GUARD:
1858	bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
1859	bi->tag_size = sizeof(u16);
1860	bi->flags \|= BLK_INTEGRITY_DEVICE_CAPABLE \|
1861	BLK_INTEGRITY_REF_TAG;
1862	break;
1863	case NVME_NVM_NS_64B_GUARD:
1864	bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
1865	bi->tag_size = sizeof(u16);
1866	bi->flags \|= BLK_INTEGRITY_DEVICE_CAPABLE \|
1867	BLK_INTEGRITY_REF_TAG;
1868	break;
1869	default:
1870	break;
1871	}
1872	break;
1873	default:
1874	break;
1875	}
1876
1877	bi->metadata_size = head->ms;
1878	if (bi->csum_type) {
1879	bi->pi_tuple_size = head->pi_size;
1880	bi->pi_offset = info->pi_offset;
1881	}
1882	return true;
1883	}
1884
1885	static void nvme_config_discard(struct nvme_ns ns, struct* queue_limits *lim)
1886	{
1887	struct nvme_ctrl *ctrl = ns->ctrl;
1888
1889	if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head: ns->head, UINT_MAX))
1890	lim->max_hw_discard_sectors =
1891	nvme_lba_to_sect(head: ns->head, lba: ctrl->dmrsl);
1892	else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
1893	lim->max_hw_discard_sectors = UINT_MAX;
1894	else
1895	lim->max_hw_discard_sectors = `0`;
1896
1897	lim->discard_granularity = lim->logical_block_size;
1898
1899	if (ctrl->dmrl)
1900	lim->max_discard_segments = ctrl->dmrl;
1901	else
1902	lim->max_discard_segments = NVME_DSM_MAX_RANGES;
1903	}
1904
1905	static bool nvme_ns_ids_equal(struct nvme_ns_ids a, struct* nvme_ns_ids *b)
1906	{
1907	return uuid_equal(u1: &a->uuid, u2: &b->uuid) &&
1908	memcmp(p: &a->nguid, q: &b->nguid, size: sizeof(a->nguid)) == `0` &&
1909	memcmp(p: &a->eui64, q: &b->eui64, size: sizeof(a->eui64)) == `0` &&
1910	a->csi == b->csi;
1911	}
1912
1913	static int nvme_identify_ns_nvm(struct nvme_ctrl ctrl, unsigned* int nsid,
1914	struct nvme_id_ns_nvm **nvmp)
1915	{
1916	struct nvme_command c = {
1917	.identify.opcode = nvme_admin_identify,
1918	.identify.nsid = cpu_to_le32(nsid),
1919	.identify.cns = NVME_ID_CNS_CS_NS,
1920	.identify.csi = NVME_CSI_NVM,
1921	};
1922	struct nvme_id_ns_nvm *nvm;
1923	int ret;
1924
1925	nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
1926	if (!nvm)
1927	return -ENOMEM;
1928
1929	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
1930	if (ret)
1931	kfree(objp: nvm);
1932	else
1933	*nvmp = nvm;
1934	return ret;
1935	}
1936
1937	static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
1938	struct nvme_id_ns id, struct* nvme_id_ns_nvm *nvm)
1939	{
1940	u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
1941	u8 guard_type;
1942
1943	/ no support for storage tag formats right now /
1944	if (nvme_elbaf_sts(elbaf))
1945	return;
1946
1947	guard_type = nvme_elbaf_guard_type(elbaf);
1948	if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) &&
1949	guard_type == NVME_NVM_NS_QTYPE_GUARD)
1950	guard_type = nvme_elbaf_qualified_guard_type(elbaf);
1951
1952	head->guard_type = guard_type;
1953	switch (head->guard_type) {
1954	case NVME_NVM_NS_64B_GUARD:
1955	head->pi_size = sizeof(struct crc64_pi_tuple);
1956	break;
1957	case NVME_NVM_NS_16B_GUARD:
1958	head->pi_size = sizeof(struct t10_pi_tuple);
1959	break;
1960	default:
1961	break;
1962	}
1963	}
1964
1965	static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
1966	struct nvme_ns_head head, struct* nvme_id_ns *id,
1967	struct nvme_id_ns_nvm nvm, struct* nvme_ns_info *info)
1968	{
1969	head->features &= ~(NVME_NS_METADATA_SUPPORTED \| NVME_NS_EXT_LBAS);
1970	head->pi_type = `0`;
1971	head->pi_size = `0`;
1972	head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
1973	if (!head->ms \|\| !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1974	return;
1975
1976	if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
1977	nvme_configure_pi_elbas(head, id, nvm);
1978	} else {
1979	head->pi_size = sizeof(struct t10_pi_tuple);
1980	head->guard_type = NVME_NVM_NS_16B_GUARD;
1981	}
1982
1983	if (head->pi_size && head->ms >= head->pi_size)
1984	head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1985	if (!(id->dps & NVME_NS_DPS_PI_FIRST)) {
1986	if (disable_pi_offsets)
1987	head->pi_type = `0`;
1988	else
1989	info->pi_offset = head->ms - head->pi_size;
1990	}
1991
1992	if (ctrl->ops->flags & NVME_F_FABRICS) {
1993	/*
1994	* The NVMe over Fabrics specification only supports metadata as
1995	* part of the extended data LBA. We rely on HCA/HBA support to
1996	* remap the separate metadata buffer from the block layer.
1997	*/
1998	if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
1999	return;
2000
2001	head->features \|= NVME_NS_EXT_LBAS;
2002
2003	/*
2004	* The current fabrics transport drivers support namespace
2005	* metadata formats only if nvme_ns_has_pi() returns true.
2006	* Suppress support for all other formats so the namespace will
2007	* have a 0 capacity and not be usable through the block stack.
2008	*
2009	* Note, this check will need to be modified if any drivers
2010	* gain the ability to use other metadata formats.
2011	*/
2012	if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
2013	head->features \|= NVME_NS_METADATA_SUPPORTED;
2014	} else {
2015	/*
2016	* For PCIe controllers, we can't easily remap the separate
2017	* metadata buffer from the block layer and thus require a
2018	* separate metadata buffer for block layer metadata/PI support.
2019	* We allow extended LBAs for the passthrough interface, though.
2020	*/
2021	if (id->flbas & NVME_NS_FLBAS_META_EXT)
2022	head->features \|= NVME_NS_EXT_LBAS;
2023	else
2024	head->features \|= NVME_NS_METADATA_SUPPORTED;
2025	}
2026	}
2027
2028
2029	static u32 nvme_configure_atomic_write(struct nvme_ns *ns,
2030	struct nvme_id_ns id, struct* queue_limits *lim, u32 bs)
2031	{
2032	u32 atomic_bs, boundary = `0`;
2033
2034	/*
2035	* We do not support an offset for the atomic boundaries.
2036	*/
2037	if (id->nabo)
2038	return bs;
2039
2040	if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) {
2041	/*
2042	* Use the per-namespace atomic write unit when available.
2043	*/
2044	atomic_bs = (`1` + le16_to_cpu(id->nawupf)) * bs;
2045	if (id->nabspf)
2046	boundary = (le16_to_cpu(id->nabspf) + `1`) * bs;
2047	} else {
2048	/*
2049	* Use the controller wide atomic write unit. This sucks
2050	* because the limit is defined in terms of logical blocks while
2051	* namespaces can have different formats, and because there is
2052	* no clear language in the specification prohibiting different
2053	* values for different controllers in the subsystem.
2054	*/
2055	atomic_bs = (`1` + ns->ctrl->subsys->awupf) * bs;
2056	}
2057
2058	lim->atomic_write_hw_max = atomic_bs;
2059	lim->atomic_write_hw_boundary = boundary;
2060	lim->atomic_write_hw_unit_min = bs;
2061	lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
2062	lim->features \|= BLK_FEAT_ATOMIC_WRITES;
2063	return atomic_bs;
2064	}
2065
2066	static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
2067	{
2068	return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + `1`;
2069	}
2070
2071	static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
2072	struct queue_limits *lim, bool is_admin)
2073	{
2074	lim->max_hw_sectors = ctrl->max_hw_sectors;
2075	lim->max_segments = min_t(u32, USHRT_MAX,
2076	min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
2077	lim->max_integrity_segments = ctrl->max_integrity_segments;
2078	lim->virt_boundary_mask = ctrl->ops->get_virt_boundary(ctrl, is_admin);
2079	lim->max_segment_size = UINT_MAX;
2080	lim->dma_alignment = `3`;
2081	}
2082
2083	static bool nvme_update_disk_info(struct nvme_ns ns, struct* nvme_id_ns *id,
2084	struct queue_limits *lim)
2085	{
2086	struct nvme_ns_head *head = ns->head;
2087	u32 bs = `1U` << head->lba_shift;
2088	u32 atomic_bs, phys_bs, io_opt = `0`;
2089	bool valid = true;
2090
2091	/*
2092	* The block layer can't support LBA sizes larger than the page size
2093	* or smaller than a sector size yet, so catch this early and don't
2094	* allow block I/O.
2095	*/
2096	if (blk_validate_block_size(bsize: bs)) {
2097	bs = (`1` << `9`);
2098	valid = false;
2099	}
2100
2101	phys_bs = bs;
2102	atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs);
2103
2104	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
2105	/ NPWG = Namespace Preferred Write Granularity /
2106	phys_bs = bs * (`1` + le16_to_cpu(id->npwg));
2107	/ NOWS = Namespace Optimal Write Size /
2108	if (id->nows)
2109	io_opt = bs * (`1` + le16_to_cpu(id->nows));
2110	}
2111
2112	/*
2113	* Linux filesystems assume writing a single physical block is
2114	* an atomic operation. Hence limit the physical block size to the
2115	* value of the Atomic Write Unit Power Fail parameter.
2116	*/
2117	lim->logical_block_size = bs;
2118	lim->physical_block_size = min(phys_bs, atomic_bs);
2119	lim->io_min = phys_bs;
2120	lim->io_opt = io_opt;
2121	if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
2122	(ns->ctrl->oncs & NVME_CTRL_ONCS_DSM))
2123	lim->max_write_zeroes_sectors = UINT_MAX;
2124	else
2125	lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
2126	return valid;
2127	}
2128
2129	static bool nvme_ns_is_readonly(struct nvme_ns ns, struct* nvme_ns_info *info)
2130	{
2131	return info->is_readonly \|\| test_bit(NVME_NS_FORCE_RO, &ns->flags);
2132	}
2133
2134	static inline bool nvme_first_scan(struct gendisk *disk)
2135	{
2136	/ nvme_alloc_ns() scans the disk prior to adding it /
2137	return !disk_live(disk);
2138	}
2139
2140	static void nvme_set_chunk_sectors(struct nvme_ns ns, struct* nvme_id_ns *id,
2141	struct queue_limits *lim)
2142	{
2143	struct nvme_ctrl *ctrl = ns->ctrl;
2144	u32 iob;
2145
2146	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
2147	is_power_of_2(n: ctrl->max_hw_sectors))
2148	iob = ctrl->max_hw_sectors;
2149	else
2150	iob = nvme_lba_to_sect(head: ns->head, le16_to_cpu(id->noiob));
2151
2152	if (!iob)
2153	return;
2154
2155	if (!is_power_of_2(n: iob)) {
2156	if (nvme_first_scan(disk: ns->disk))
2157	pr_warn("%s: ignoring unaligned IO boundary:%u\n",
2158	ns->disk->disk_name, iob);
2159	return;
2160	}
2161
2162	if (blk_queue_is_zoned(q: ns->disk->queue)) {
2163	if (nvme_first_scan(disk: ns->disk))
2164	pr_warn("%s: ignoring zoned namespace IO boundary\n",
2165	ns->disk->disk_name);
2166	return;
2167	}
2168
2169	lim->chunk_sectors = iob;
2170	}
2171
2172	static int nvme_update_ns_info_generic(struct nvme_ns *ns,
2173	struct nvme_ns_info *info)
2174	{
2175	struct queue_limits lim;
2176	unsigned int memflags;
2177	int ret;
2178
2179	lim = queue_limits_start_update(q: ns->disk->queue);
2180	nvme_set_ctrl_limits(ctrl: ns->ctrl, lim: &lim, is_admin: false);
2181
2182	memflags = blk_mq_freeze_queue(q: ns->disk->queue);
2183	ret = queue_limits_commit_update(q: ns->disk->queue, lim: &lim);
2184	set_disk_ro(disk: ns->disk, read_only: nvme_ns_is_readonly(ns, info));
2185	blk_mq_unfreeze_queue(q: ns->disk->queue, memflags);
2186
2187	/ Hide the block-interface for these devices /
2188	if (!ret)
2189	ret = -ENODEV;
2190	return ret;
2191	}
2192
2193	static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
2194	struct nvme_ns_info *info, u8 fdp_idx)
2195	{
2196	struct nvme_fdp_config_log hdr, *h;
2197	struct nvme_fdp_config_desc *desc;
2198	size_t size = sizeof(hdr);
2199	void log, end;
2200	int i, n, ret;
2201
2202	ret = nvme_get_log_lsi(ctrl, nsid: `0`, log_page: NVME_LOG_FDP_CONFIGS, lsp: `0`,
2203	csi: NVME_CSI_NVM, log: &hdr, size, offset: `0`, lsi: info->endgid);
2204	if (ret) {
2205	dev_warn(ctrl->device,
2206	"FDP configs log header status:0x%x endgid:%d\n", ret,
2207	info->endgid);
2208	return ret;
2209	}
2210
2211	size = le32_to_cpu(hdr.sze);
2212	if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) {
2213	dev_warn(ctrl->device, "FDP config size too large:%zu\n",
2214	size);
2215	return `0`;
2216	}
2217
2218	h = kvmalloc(size, GFP_KERNEL);
2219	if (!h)
2220	return -ENOMEM;
2221
2222	ret = nvme_get_log_lsi(ctrl, nsid: `0`, log_page: NVME_LOG_FDP_CONFIGS, lsp: `0`,
2223	csi: NVME_CSI_NVM, log: h, size, offset: `0`, lsi: info->endgid);
2224	if (ret) {
2225	dev_warn(ctrl->device,
2226	"FDP configs log status:0x%x endgid:%d\n", ret,
2227	info->endgid);
2228	goto out;
2229	}
2230
2231	n = le16_to_cpu(h->numfdpc) + `1`;
2232	if (fdp_idx > n) {
2233	dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
2234	fdp_idx, n);
2235	/ Proceed without registering FDP streams /
2236	ret = `0`;
2237	goto out;
2238	}
2239
2240	log = h + `1`;
2241	desc = log;
2242	end = log + size - sizeof(*h);
2243	for (i = `0`; i < fdp_idx; i++) {
2244	log += le16_to_cpu(desc->dsze);
2245	desc = log;
2246	if (log >= end) {
2247	dev_warn(ctrl->device,
2248	"FDP invalid config descriptor list\n");
2249	ret = `0`;
2250	goto out;
2251	}
2252	}
2253
2254	if (le32_to_cpu(desc->nrg) > `1`) {
2255	dev_warn(ctrl->device, "FDP NRG > 1 not supported\n");
2256	ret = `0`;
2257	goto out;
2258	}
2259
2260	info->runs = le64_to_cpu(desc->runs);
2261	out:
2262	kvfree(addr: h);
2263	return ret;
2264	}
2265
2266	static int nvme_query_fdp_info(struct nvme_ns ns, struct* nvme_ns_info *info)
2267	{
2268	struct nvme_ns_head *head = ns->head;
2269	struct nvme_ctrl *ctrl = ns->ctrl;
2270	struct nvme_fdp_ruh_status *ruhs;
2271	struct nvme_fdp_config fdp;
2272	struct nvme_command c = {};
2273	size_t size;
2274	int i, ret;
2275
2276	/*
2277	* The FDP configuration is static for the lifetime of the namespace,
2278	* so return immediately if we've already registered this namespace's
2279	* streams.
2280	*/
2281	if (head->nr_plids)
2282	return `0`;
2283
2284	ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, `0`,
2285	&fdp);
2286	if (ret) {
2287	dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret);
2288	return ret;
2289	}
2290
2291	if (!(fdp.flags & FDPCFG_FDPE))
2292	return `0`;
2293
2294	ret = nvme_query_fdp_granularity(ctrl, info, fdp_idx: fdp.fdpcidx);
2295	if (!info->runs)
2296	return ret;
2297
2298	size = struct_size(ruhs, ruhsd, S8_MAX - `1`);
2299	ruhs = kzalloc(size, GFP_KERNEL);
2300	if (!ruhs)
2301	return -ENOMEM;
2302
2303	c.imr.opcode = nvme_cmd_io_mgmt_recv;
2304	c.imr.nsid = cpu_to_le32(head->ns_id);
2305	c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS;
2306	c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size));
2307	ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
2308	if (ret) {
2309	dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret);
2310	goto free;
2311	}
2312
2313	head->nr_plids = le16_to_cpu(ruhs->nruhsd);
2314	if (!head->nr_plids)
2315	goto free;
2316
2317	head->plids = kcalloc(head->nr_plids, sizeof(*head->plids),
2318	GFP_KERNEL);
2319	if (!head->plids) {
2320	dev_warn(ctrl->device,
2321	"failed to allocate %u FDP placement IDs\n",
2322	head->nr_plids);
2323	head->nr_plids = `0`;
2324	ret = -ENOMEM;
2325	goto free;
2326	}
2327
2328	for (i = `0`; i < head->nr_plids; i++)
2329	head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
2330	free:
2331	kfree(objp: ruhs);
2332	return ret;
2333	}
2334
2335	static int nvme_update_ns_info_block(struct nvme_ns *ns,
2336	struct nvme_ns_info *info)
2337	{
2338	struct queue_limits lim;
2339	struct nvme_id_ns_nvm *nvm = NULL;
2340	struct nvme_zone_info zi = {};
2341	struct nvme_id_ns *id;
2342	unsigned int memflags;
2343	sector_t capacity;
2344	unsigned lbaf;
2345	int ret;
2346
2347	ret = nvme_identify_ns(ctrl: ns->ctrl, nsid: info->nsid, id: &id);
2348	if (ret)
2349	return ret;
2350
2351	if (id->ncap == `0`) {
2352	/ namespace not allocated or attached /
2353	info->is_removed = true;
2354	ret = -ENXIO;
2355	goto out;
2356	}
2357	lbaf = nvme_lbaf_index(flbas: id->flbas);
2358
2359	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
2360	ret = nvme_identify_ns_nvm(ctrl: ns->ctrl, nsid: info->nsid, nvmp: &nvm);
2361	if (ret < `0`)
2362	goto out;
2363	}
2364
2365	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
2366	ns->head->ids.csi == NVME_CSI_ZNS) {
2367	ret = nvme_query_zone_info(ns, lbaf, zi: &zi);
2368	if (ret < `0`)
2369	goto out;
2370	}
2371
2372	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
2373	ret = nvme_query_fdp_info(ns, info);
2374	if (ret < `0`)
2375	goto out;
2376	}
2377
2378	lim = queue_limits_start_update(q: ns->disk->queue);
2379
2380	memflags = blk_mq_freeze_queue(q: ns->disk->queue);
2381	ns->head->lba_shift = id->lbaf[lbaf].ds;
2382	ns->head->nuse = le64_to_cpu(id->nuse);
2383	capacity = nvme_lba_to_sect(head: ns->head, le64_to_cpu(id->nsze));
2384	nvme_set_ctrl_limits(ctrl: ns->ctrl, lim: &lim, is_admin: false);
2385	nvme_configure_metadata(ctrl: ns->ctrl, head: ns->head, id, nvm, info);
2386	nvme_set_chunk_sectors(ns, id, lim: &lim);
2387	if (!nvme_update_disk_info(ns, id, lim: &lim))
2388	capacity = `0`;
2389
2390	nvme_config_discard(ns, lim: &lim);
2391	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
2392	ns->head->ids.csi == NVME_CSI_ZNS)
2393	nvme_update_zone_info(ns, lim: &lim, zi: &zi);
2394
2395	if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc)
2396	lim.features \|= BLK_FEAT_WRITE_CACHE \| BLK_FEAT_FUA;
2397	else
2398	lim.features &= ~(BLK_FEAT_WRITE_CACHE \| BLK_FEAT_FUA);
2399
2400	if (info->is_rotational)
2401	lim.features \|= BLK_FEAT_ROTATIONAL;
2402
2403	/*
2404	* Register a metadata profile for PI, or the plain non-integrity NVMe
2405	* metadata masquerading as Type 0 if supported, otherwise reject block
2406	* I/O to namespaces with metadata except when the namespace supports
2407	* PI, as it can strip/insert in that case.
2408	*/
2409	if (!nvme_init_integrity(head: ns->head, lim: &lim, info))
2410	capacity = `0`;
2411
2412	lim.max_write_streams = ns->head->nr_plids;
2413	if (lim.max_write_streams)
2414	lim.write_stream_granularity = min(info->runs, U32_MAX);
2415	else
2416	lim.write_stream_granularity = `0`;
2417
2418	/*
2419	* Only set the DEAC bit if the device guarantees that reads from
2420	* deallocated data return zeroes. While the DEAC bit does not
2421	* require that, it must be a no-op if reads from deallocated data
2422	* do not return zeroes.
2423	*/
2424	if ((id->dlfeat & `0x7`) == `0x1` && (id->dlfeat & (`1` << `3`))) {
2425	ns->head->features \|= NVME_NS_DEAC;
2426	lim.max_hw_wzeroes_unmap_sectors = lim.max_write_zeroes_sectors;
2427	}
2428
2429	ret = queue_limits_commit_update(q: ns->disk->queue, lim: &lim);
2430	if (ret) {
2431	blk_mq_unfreeze_queue(q: ns->disk->queue, memflags);
2432	goto out;
2433	}
2434
2435	set_capacity_and_notify(disk: ns->disk, size: capacity);
2436	set_disk_ro(disk: ns->disk, read_only: nvme_ns_is_readonly(ns, info));
2437	set_bit(NVME_NS_READY, addr: &ns->flags);
2438	blk_mq_unfreeze_queue(q: ns->disk->queue, memflags);
2439
2440	if (blk_queue_is_zoned(q: ns->queue)) {
2441	ret = blk_revalidate_disk_zones(disk: ns->disk);
2442	if (ret && !nvme_first_scan(disk: ns->disk))
2443	goto out;
2444	}
2445
2446	ret = `0`;
2447	out:
2448	kfree(objp: nvm);
2449	kfree(objp: id);
2450	return ret;
2451	}
2452
2453	static int nvme_update_ns_info(struct nvme_ns ns, struct* nvme_ns_info *info)
2454	{
2455	bool unsupported = false;
2456	int ret;
2457
2458	switch (info->ids.csi) {
2459	case NVME_CSI_ZNS:
2460	if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
2461	dev_info(ns->ctrl->device,
2462	"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
2463	info->nsid);
2464	ret = nvme_update_ns_info_generic(ns, info);
2465	break;
2466	}
2467	ret = nvme_update_ns_info_block(ns, info);
2468	break;
2469	case NVME_CSI_NVM:
2470	ret = nvme_update_ns_info_block(ns, info);
2471	break;
2472	default:
2473	dev_info(ns->ctrl->device,
2474	"block device for nsid %u not supported (csi %u)\n",
2475	info->nsid, info->ids.csi);
2476	ret = nvme_update_ns_info_generic(ns, info);
2477	break;
2478	}
2479
2480	/*
2481	* If probing fails due an unsupported feature, hide the block device,
2482	* but still allow other access.
2483	*/
2484	if (ret == -ENODEV) {
2485	ns->disk->flags \|= GENHD_FL_HIDDEN;
2486	set_bit(NVME_NS_READY, addr: &ns->flags);
2487	unsupported = true;
2488	ret = `0`;
2489	}
2490
2491	if (!ret && nvme_ns_head_multipath(head: ns->head)) {
2492	struct queue_limits *ns_lim = &ns->disk->queue->limits;
2493	struct queue_limits lim;
2494	unsigned int memflags;
2495
2496	lim = queue_limits_start_update(q: ns->head->disk->queue);
2497	memflags = blk_mq_freeze_queue(q: ns->head->disk->queue);
2498	/*
2499	* queue_limits mixes values that are the hardware limitations
2500	* for bio splitting with what is the device configuration.
2501	*
2502	* For NVMe the device configuration can change after e.g. a
2503	* Format command, and we really want to pick up the new format
2504	* value here. But we must still stack the queue limits to the
2505	* least common denominator for multipathing to split the bios
2506	* properly.
2507	*
2508	* To work around this, we explicitly set the device
2509	* configuration to those that we just queried, but only stack
2510	* the splitting limits in to make sure we still obey possibly
2511	* lower limitations of other controllers.
2512	*/
2513	lim.logical_block_size = ns_lim->logical_block_size;
2514	lim.physical_block_size = ns_lim->physical_block_size;
2515	lim.io_min = ns_lim->io_min;
2516	lim.io_opt = ns_lim->io_opt;
2517	queue_limits_stack_bdev(t: &lim, bdev: ns->disk->part0, offset: `0`,
2518	pfx: ns->head->disk->disk_name);
2519	if (unsupported)
2520	ns->head->disk->flags \|= GENHD_FL_HIDDEN;
2521	else
2522	nvme_init_integrity(head: ns->head, lim: &lim, info);
2523	lim.max_write_streams = ns_lim->max_write_streams;
2524	lim.write_stream_granularity = ns_lim->write_stream_granularity;
2525	ret = queue_limits_commit_update(q: ns->head->disk->queue, lim: &lim);
2526
2527	set_capacity_and_notify(disk: ns->head->disk, size: get_capacity(disk: ns->disk));
2528	set_disk_ro(disk: ns->head->disk, read_only: nvme_ns_is_readonly(ns, info));
2529	nvme_mpath_revalidate_paths(ns);
2530
2531	blk_mq_unfreeze_queue(q: ns->head->disk->queue, memflags);
2532	}
2533
2534	return ret;
2535	}
2536
2537	int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[`16`],
2538	enum blk_unique_id type)
2539	{
2540	struct nvme_ns_ids *ids = &ns->head->ids;
2541
2542	if (type != BLK_UID_EUI64)
2543	return -EINVAL;
2544
2545	if (memchr_inv(p: ids->nguid, c: `0`, size: sizeof(ids->nguid))) {
2546	memcpy(id, &ids->nguid, sizeof(ids->nguid));
2547	return sizeof(ids->nguid);
2548	}
2549	if (memchr_inv(p: ids->eui64, c: `0`, size: sizeof(ids->eui64))) {
2550	memcpy(id, &ids->eui64, sizeof(ids->eui64));
2551	return sizeof(ids->eui64);
2552	}
2553
2554	return -EINVAL;
2555	}
2556
2557	static int nvme_get_unique_id(struct gendisk *disk, u8 id[`16`],
2558	enum blk_unique_id type)
2559	{
2560	return nvme_ns_get_unique_id(ns: disk->private_data, id, type);
2561	}
2562
2563	#ifdef CONFIG_BLK_SED_OPAL
2564	static int nvme_sec_submit(void data, u16 spsp, u8 secp, void* *buffer, size_t len,
2565	bool send)
2566	{
2567	struct nvme_ctrl *ctrl = data;
2568	struct nvme_command cmd = { };
2569
2570	if (send)
2571	cmd.common.opcode = nvme_admin_security_send;
2572	else
2573	cmd.common.opcode = nvme_admin_security_recv;
2574	cmd.common.nsid = `0`;
2575	cmd.common.cdw10 = cpu_to_le32(((u32)secp) << `24` \| ((u32)spsp) << `8`);
2576	cmd.common.cdw11 = cpu_to_le32(len);
2577
2578	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
2579	NVME_QID_ANY, NVME_SUBMIT_AT_HEAD);
2580	}
2581
2582	static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2583	{
2584	if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
2585	if (!ctrl->opal_dev)
2586	ctrl->opal_dev = init_opal_dev(data: ctrl, send_recv: &nvme_sec_submit);
2587	else if (was_suspended)
2588	opal_unlock_from_suspend(dev: ctrl->opal_dev);
2589	} else {
2590	free_opal_dev(dev: ctrl->opal_dev);
2591	ctrl->opal_dev = NULL;
2592	}
2593	}
2594	#else
2595	static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2596	{
2597	}
2598	#endif /* CONFIG_BLK_SED_OPAL */
2599
#ifdef CONFIG_BLK_DEV_ZONED
/* Block-layer report_zones hook: forward to the namespace helper. */
static int nvme_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct nvme_ns *ns = disk->private_data;

	return nvme_ns_report_zones(ns, sector, nr_zones, args);
}
#else
/* No zone reporting without CONFIG_BLK_DEV_ZONED. */
#define nvme_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */
2609
2610	const struct block_device_operations nvme_bdev_ops = {
2611	.owner = THIS_MODULE,
2612	.ioctl = nvme_ioctl,
2613	.compat_ioctl = blkdev_compat_ptr_ioctl,
2614	.open = nvme_open,
2615	.release = nvme_release,
2616	.getgeo = nvme_getgeo,
2617	.get_unique_id = nvme_get_unique_id,
2618	.report_zones = nvme_report_zones,
2619	.pr_ops = &nvme_pr_ops,
2620	};
2621
2622	static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
2623	u32 timeout, const char *op)
2624	{
2625	unsigned long timeout_jiffies = jiffies + timeout * HZ;
2626	u32 csts;
2627	int ret;
2628
2629	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == `0`) {
2630	if (csts == ~`0`)
2631	return -ENODEV;
2632	if ((csts & mask) == val)
2633	break;
2634
2635	usleep_range(min: `1000`, max: `2000`);
2636	if (fatal_signal_pending(current))
2637	return -EINTR;
2638	if (time_after(jiffies, timeout_jiffies)) {
2639	dev_err(ctrl->device,
2640	"Device not ready; aborting %s, CSTS=0x%x\n",
2641	op, csts);
2642	return -ENODEV;
2643	}
2644	}
2645
2646	return ret;
2647	}
2648
2649	int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2650	{
2651	int ret;
2652
2653	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2654	if (shutdown)
2655	ctrl->ctrl_config \|= NVME_CC_SHN_NORMAL;
2656	else
2657	ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2658
2659	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2660	if (ret)
2661	return ret;
2662
2663	if (shutdown) {
2664	return nvme_wait_ready(ctrl, mask: NVME_CSTS_SHST_MASK,
2665	val: NVME_CSTS_SHST_CMPLT,
2666	timeout: ctrl->shutdown_timeout, op: "shutdown");
2667	}
2668	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2669	msleep(NVME_QUIRK_DELAY_AMOUNT);
2670	return nvme_wait_ready(ctrl, mask: NVME_CSTS_RDY, val: `0`,
2671	timeout: (NVME_CAP_TIMEOUT(ctrl->cap) + `1`) / `2`, op: "reset");
2672	}
2673	EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2674
2675	int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2676	{
2677	unsigned dev_page_min;
2678	u32 timeout;
2679	int ret;
2680
2681	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2682	if (ret) {
2683	dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2684	return ret;
2685	}
2686	dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + `12`;
2687
2688	if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2689	dev_err(ctrl->device,
2690	"Minimum device page size %u too large for host (%u)\n",
2691	`1` << dev_page_min, `1` << NVME_CTRL_PAGE_SHIFT);
2692	return -ENODEV;
2693	}
2694
2695	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2696	ctrl->ctrl_config = NVME_CC_CSS_CSI;
2697	else
2698	ctrl->ctrl_config = NVME_CC_CSS_NVM;
2699
2700	/*
2701	* Setting CRIME results in CSTS.RDY before the media is ready. This
2702	* makes it possible for media related commands to return the error
2703	* NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY. Until the driver is
2704	* restructured to handle retries, disable CC.CRIME.
2705	*/
2706	ctrl->ctrl_config &= ~NVME_CC_CRIME;
2707
2708	ctrl->ctrl_config \|= (NVME_CTRL_PAGE_SHIFT - `12`) << NVME_CC_MPS_SHIFT;
2709	ctrl->ctrl_config \|= NVME_CC_AMS_RR \| NVME_CC_SHN_NONE;
2710	ctrl->ctrl_config \|= NVME_CC_IOSQES \| NVME_CC_IOCQES;
2711	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2712	if (ret)
2713	return ret;
2714
2715	/ CAP value may change after initial CC write /
2716	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2717	if (ret)
2718	return ret;
2719
2720	timeout = NVME_CAP_TIMEOUT(ctrl->cap);
2721	if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
2722	u32 crto, ready_timeout;
2723
2724	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
2725	if (ret) {
2726	dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
2727	ret);
2728	return ret;
2729	}
2730
2731	/*
2732	* CRTO should always be greater or equal to CAP.TO, but some
2733	* devices are known to get this wrong. Use the larger of the
2734	* two values.
2735	*/
2736	ready_timeout = NVME_CRTO_CRWMT(crto);
2737
2738	if (ready_timeout < timeout)
2739	dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n",
2740	crto, ctrl->cap);
2741	else
2742	timeout = ready_timeout;
2743	}
2744
2745	ctrl->ctrl_config \|= NVME_CC_ENABLE;
2746	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2747	if (ret)
2748	return ret;
2749	return nvme_wait_ready(ctrl, mask: NVME_CSTS_RDY, val: NVME_CSTS_RDY,
2750	timeout: (timeout + `1`) / `2`, op: "initialisation");
2751	}
2752	EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
2753
2754	static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2755	{
2756	__le64 ts;
2757	int ret;
2758
2759	if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2760	return `0`;
2761
2762	ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2763	ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, `0`, &ts, sizeof(ts),
2764	NULL);
2765	if (ret)
2766	dev_warn_once(ctrl->device,
2767	"could not set timestamp (%d)\n", ret);
2768	return ret;
2769	}
2770
2771	static int nvme_configure_host_options(struct nvme_ctrl *ctrl)
2772	{
2773	struct nvme_feat_host_behavior *host;
2774	u8 acre = `0`, lbafee = `0`;
2775	int ret;
2776
2777	/ Don't bother enabling the feature if retry delay is not reported /
2778	if (ctrl->crdt[`0`])
2779	acre = NVME_ENABLE_ACRE;
2780	if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
2781	lbafee = NVME_ENABLE_LBAFEE;
2782
2783	if (!acre && !lbafee)
2784	return `0`;
2785
2786	host = kzalloc(sizeof(*host), GFP_KERNEL);
2787	if (!host)
2788	return `0`;
2789
2790	host->acre = acre;
2791	host->lbafee = lbafee;
2792	ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, `0`,
2793	host, sizeof(*host), NULL);
2794	kfree(objp: host);
2795	return ret;
2796	}
2797
2798	/*
2799	* The function checks whether the given total (exlat + enlat) latency of
2800	* a power state allows the latter to be used as an APST transition target.
2801	* It does so by comparing the latency to the primary and secondary latency
2802	* tolerances defined by module params. If there's a match, the corresponding
2803	* timeout value is returned and the matching tolerance index (1 or 2) is
2804	* reported.
2805	*/
2806	static bool nvme_apst_get_transition_time(u64 total_latency,
2807	u64 transition_time, unsigned* *last_index)
2808	{
2809	if (total_latency <= apst_primary_latency_tol_us) {
2810	if (*last_index == `1`)
2811	return false;
2812	*last_index = `1`;
2813	*transition_time = apst_primary_timeout_ms;
2814	return true;
2815	}
2816	if (apst_secondary_timeout_ms &&
2817	total_latency <= apst_secondary_latency_tol_us) {
2818	if (*last_index <= `2`)
2819	return false;
2820	*last_index = `2`;
2821	*transition_time = apst_secondary_timeout_ms;
2822	return true;
2823	}
2824	return false;
2825	}
2826
2827	/*
2828	* APST (Autonomous Power State Transition) lets us program a table of power
2829	* state transitions that the controller will perform automatically.
2830	*
2831	* Depending on module params, one of the two supported techniques will be used:
2832	*
2833	* - If the parameters provide explicit timeouts and tolerances, they will be
2834	* used to build a table with up to 2 non-operational states to transition to.
2835	* The default parameter values were selected based on the values used by
2836	* Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
2837	* regeneration of the APST table in the event of switching between external
2838	* and battery power, the timeouts and tolerances reflect a compromise
2839	* between values used by Microsoft for AC and battery scenarios.
2840	* - If not, we'll configure the table with a simple heuristic: we are willing
2841	* to spend at most 2% of the time transitioning between power states.
2842	* Therefore, when running in any given state, we will enter the next
2843	* lower-power non-operational state after waiting 50 * (enlat + exlat)
2844	* microseconds, as long as that state's exit latency is under the requested
2845	* maximum latency.
2846	*
2847	* We will not autonomously enter any non-operational state for which the total
2848	* latency exceeds ps_max_latency_us.
2849	*
2850	* Users can set ps_max_latency_us to zero to turn off APST.
2851	*/
2852	static int nvme_configure_apst(struct nvme_ctrl *ctrl)
2853	{
2854	struct nvme_feat_auto_pst *table;
2855	unsigned apste = `0`;
2856	u64 max_lat_us = `0`;
2857	__le64 target = `0`;
2858	int max_ps = -`1`;
2859	int state;
2860	int ret;
2861	unsigned last_lt_index = UINT_MAX;
2862
2863	/*
2864	* If APST isn't supported or if we haven't been initialized yet,
2865	* then don't do anything.
2866	*/
2867	if (!ctrl->apsta)
2868	return `0`;
2869
2870	if (ctrl->npss > `31`) {
2871	dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
2872	return `0`;
2873	}
2874
2875	table = kzalloc(sizeof(*table), GFP_KERNEL);
2876	if (!table)
2877	return `0`;
2878
2879	if (!ctrl->apst_enabled \|\| ctrl->ps_max_latency_us == `0`) {
2880	/ Turn off APST. /
2881	dev_dbg(ctrl->device, "APST disabled\n");
2882	goto done;
2883	}
2884
2885	/*
2886	* Walk through all states from lowest- to highest-power.
2887	* According to the spec, lower-numbered states use more power. NPSS,
2888	* despite the name, is the index of the lowest-power state, not the
2889	* number of states.
2890	*/
2891	for (state = (int)ctrl->npss; state >= `0`; state--) {
2892	u64 total_latency_us, exit_latency_us, transition_ms;
2893
2894	if (target)
2895	table->entries[state] = target;
2896
2897	/*
2898	* Don't allow transitions to the deepest state if it's quirked
2899	* off.
2900	*/
2901	if (state == ctrl->npss &&
2902	(ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
2903	continue;
2904
2905	/*
2906	* Is this state a useful non-operational state for higher-power
2907	* states to autonomously transition to?
2908	*/
2909	if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
2910	continue;
2911
2912	exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
2913	if (exit_latency_us > ctrl->ps_max_latency_us)
2914	continue;
2915
2916	total_latency_us = exit_latency_us +
2917	le32_to_cpu(ctrl->psd[state].entry_lat);
2918
2919	/*
2920	* This state is good. It can be used as the APST idle target
2921	* for higher power states.
2922	*/
2923	if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
2924	if (!nvme_apst_get_transition_time(total_latency: total_latency_us,
2925	transition_time: &transition_ms, last_index: &last_lt_index))
2926	continue;
2927	} else {
2928	transition_ms = total_latency_us + `19`;
2929	do_div(transition_ms, `20`);
2930	if (transition_ms > (`1` << `24`) - `1`)
2931	transition_ms = (`1` << `24`) - `1`;
2932	}
2933
2934	target = cpu_to_le64((state << `3`) \| (transition_ms << `8`));
2935	if (max_ps == -`1`)
2936	max_ps = state;
2937	if (total_latency_us > max_lat_us)
2938	max_lat_us = total_latency_us;
2939	}
2940
2941	if (max_ps == -`1`)
2942	dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2943	else
2944	dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2945	max_ps, max_lat_us, (int)sizeof(*table), table);
2946	apste = `1`;
2947
2948	done:
2949	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2950	table, sizeof(*table), NULL);
2951	if (ret)
2952	dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2953	kfree(objp: table);
2954	return ret;
2955	}
2956
2957	static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2958	{
2959	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2960	u64 latency;
2961
2962	switch (val) {
2963	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2964	case PM_QOS_LATENCY_ANY:
2965	latency = U64_MAX;
2966	break;
2967
2968	default:
2969	latency = val;
2970	}
2971
2972	if (ctrl->ps_max_latency_us != latency) {
2973	ctrl->ps_max_latency_us = latency;
2974	if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
2975	nvme_configure_apst(ctrl);
2976	}
2977	}
2978
2979	struct nvme_core_quirk_entry {
2980	/*
2981	* NVMe model and firmware strings are padded with spaces. For
2982	* simplicity, strings in the quirk table are padded with NULLs
2983	* instead.
2984	*/
2985	u16 vid;
2986	const char *mn;
2987	const char *fr;
2988	unsigned long quirks;
2989	};
2990
2991	static const struct nvme_core_quirk_entry core_quirks[] = {
2992	{
2993	/*
2994	* This Toshiba device seems to die using any APST states. See:
2995	* https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2996	*/
2997	.vid = `0x1179`,
2998	.mn = "THNSF5256GPUK TOSHIBA",
2999	.quirks = NVME_QUIRK_NO_APST,
3000	},
3001	{
3002	/*
3003	* This LiteON CL1-3D*-Q11 firmware version has a race
3004	* condition associated with actions related to suspend to idle
3005	* LiteON has resolved the problem in future firmware
3006	*/
3007	.vid = `0x14a4`,
3008	.fr = "22301111",
3009	.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
3010	},
3011	{
3012	/*
3013	* This Kioxia CD6-V Series / HPE PE8030 device times out and
3014	* aborts I/O during any load, but more easily reproducible
3015	* with discards (fstrim).
3016	*
3017	* The device is left in a state where it is also not possible
3018	* to use "nvme set-feature" to disable APST, but booting with
3019	* nvme_core.default_ps_max_latency=0 works.
3020	*/
3021	.vid = `0x1e0f`,
3022	.mn = "KCD6XVUL6T40",
3023	.quirks = NVME_QUIRK_NO_APST,
3024	},
3025	{
3026	/*
3027	* The external Samsung X5 SSD fails initialization without a
3028	* delay before checking if it is ready and has a whole set of
3029	* other problems. To make this even more interesting, it
3030	* shares the PCI ID with internal Samsung 970 Evo Plus that
3031	* does not need or want these quirks.
3032	*/
3033	.vid = `0x144d`,
3034	.mn = "Samsung Portable SSD X5",
3035	.quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY \|
3036	NVME_QUIRK_NO_DEEPEST_PS \|
3037	NVME_QUIRK_IGNORE_DEV_SUBNQN,
3038	}
3039	};
3040
/*
 * Compare a space-padded identify string with a quirk table string.
 *
 * @idstr: field from the identify data, @len bytes, space-padded and not
 *         NUL-terminated.
 * @match: NUL-terminated string from the quirk table, or NULL to match
 *         anything.
 * @len:   size of the @idstr field.
 *
 * Returns true if @match is NULL, or if @idstr starts with @match and the
 * rest of the field consists of spaces only.
 */
static bool string_matches(const char *idstr, const char *match, size_t len)
{
	size_t matchlen;

	if (!match)
		return true;

	matchlen = strlen(match);
	/*
	 * A table string longer than the field can never match; bail out
	 * instead of comparing past the end of the field.
	 */
	if (WARN_ON_ONCE(matchlen > len))
		return false;

	if (memcmp(idstr, match, matchlen))
		return false;

	for (; matchlen < len; matchlen++)
		if (idstr[matchlen] != ' ')
			return false;

	return true;
}
3061
3062	static bool quirk_matches(const struct nvme_id_ctrl *id,
3063	const struct nvme_core_quirk_entry *q)
3064	{
3065	return q->vid == le16_to_cpu(id->vid) &&
3066	string_matches(idstr: id->mn, match: q->mn, len: sizeof(id->mn)) &&
3067	string_matches(idstr: id->fr, match: q->fr, len: sizeof(id->fr));
3068	}
3069
3070	static void nvme_init_subnqn(struct nvme_subsystem subsys, struct* nvme_ctrl *ctrl,
3071	struct nvme_id_ctrl *id)
3072	{
3073	size_t nqnlen;
3074	int off;
3075
3076	if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
3077	nqnlen = strnlen(p: id->subnqn, NVMF_NQN_SIZE);
3078	if (nqnlen > `0` && nqnlen < NVMF_NQN_SIZE) {
3079	strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
3080	return;
3081	}
3082
3083	if (ctrl->vs >= NVME_VS(`1`, `2`, `1`))
3084	dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
3085	}
3086
3087	/*
3088	* Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe
3089	* Base Specification 2.0. It is slightly different from the format
3090	* specified there due to historic reasons, and we can't change it now.
3091	*/
3092	off = snprintf(buf: subsys->subnqn, NVMF_NQN_SIZE,
3093	fmt: "nqn.2014.08.org.nvmexpress:%04x%04x",
3094	le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
3095	memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
3096	off += sizeof(id->sn);
3097	memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
3098	off += sizeof(id->mn);
3099	memset(subsys->subnqn + off, `0`, sizeof(subsys->subnqn) - off);
3100	}
3101
3102	static void nvme_release_subsystem(struct device *dev)
3103	{
3104	struct nvme_subsystem *subsys =
3105	container_of(dev, struct nvme_subsystem, dev);
3106
3107	if (subsys->instance >= `0`)
3108	ida_free(&nvme_instance_ida, id: subsys->instance);
3109	kfree(objp: subsys);
3110	}
3111
3112	static void nvme_destroy_subsystem(struct kref *ref)
3113	{
3114	struct nvme_subsystem *subsys =
3115	container_of(ref, struct nvme_subsystem, ref);
3116
3117	mutex_lock(&nvme_subsystems_lock);
3118	list_del(entry: &subsys->entry);
3119	mutex_unlock(lock: &nvme_subsystems_lock);
3120
3121	ida_destroy(ida: &subsys->ns_ida);
3122	device_del(dev: &subsys->dev);
3123	put_device(dev: &subsys->dev);
3124	}
3125
3126	static void nvme_put_subsystem(struct nvme_subsystem *subsys)
3127	{
3128	kref_put(kref: &subsys->ref, release: nvme_destroy_subsystem);
3129	}
3130
3131	static struct nvme_subsystem __nvme_find_get_subsystem(const* char *subsysnqn)
3132	{
3133	struct nvme_subsystem *subsys;
3134
3135	lockdep_assert_held(&nvme_subsystems_lock);
3136
3137	/*
3138	* Fail matches for discovery subsystems. This results
3139	* in each discovery controller bound to a unique subsystem.
3140	* This avoids issues with validating controller values
3141	* that can only be true when there is a single unique subsystem.
3142	* There may be multiple and completely independent entities
3143	* that provide discovery controllers.
3144	*/
3145	if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
3146	return NULL;
3147
3148	list_for_each_entry(subsys, &nvme_subsystems, entry) {
3149	if (strcmp(subsys->subnqn, subsysnqn))
3150	continue;
3151	if (!kref_get_unless_zero(kref: &subsys->ref))
3152	continue;
3153	return subsys;
3154	}
3155
3156	return NULL;
3157	}
3158
3159	static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
3160	{
3161	return ctrl->opts && ctrl->opts->discovery_nqn;
3162	}
3163
3164	static inline bool nvme_admin_ctrl(struct nvme_ctrl *ctrl)
3165	{
3166	return ctrl->cntrltype == NVME_CTRL_ADMIN;
3167	}
3168
3169	static inline bool nvme_is_io_ctrl(struct nvme_ctrl *ctrl)
3170	{
3171	return !nvme_discovery_ctrl(ctrl) && !nvme_admin_ctrl(ctrl);
3172	}
3173
3174	static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
3175	struct nvme_ctrl ctrl, struct* nvme_id_ctrl *id)
3176	{
3177	struct nvme_ctrl *tmp;
3178
3179	lockdep_assert_held(&nvme_subsystems_lock);
3180
3181	list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
3182	if (nvme_state_terminal(ctrl: tmp))
3183	continue;
3184
3185	if (tmp->cntlid == ctrl->cntlid) {
3186	dev_err(ctrl->device,
3187	"Duplicate cntlid %u with %s, subsys %s, rejecting\n",
3188	ctrl->cntlid, dev_name(tmp->device),
3189	subsys->subnqn);
3190	return false;
3191	}
3192
3193	if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) \|\|
3194	nvme_discovery_ctrl(ctrl))
3195	continue;
3196
3197	dev_err(ctrl->device,
3198	"Subsystem does not support multiple controllers\n");
3199	return false;
3200	}
3201
3202	return true;
3203	}
3204
3205	static int nvme_init_subsystem(struct nvme_ctrl ctrl, struct* nvme_id_ctrl *id)
3206	{
3207	struct nvme_subsystem subsys, found;
3208	int ret;
3209
3210	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
3211	if (!subsys)
3212	return -ENOMEM;
3213
3214	subsys->instance = -`1`;
3215	mutex_init(&subsys->lock);
3216	kref_init(kref: &subsys->ref);
3217	INIT_LIST_HEAD(list: &subsys->ctrls);
3218	INIT_LIST_HEAD(list: &subsys->nsheads);
3219	nvme_init_subnqn(subsys, ctrl, id);
3220	memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
3221	memcpy(subsys->model, id->mn, sizeof(subsys->model));
3222	subsys->vendor_id = le16_to_cpu(id->vid);
3223	subsys->cmic = id->cmic;
3224	subsys->awupf = le16_to_cpu(id->awupf);
3225
3226	/ Versions prior to 1.4 don't necessarily report a valid type /
3227	if (id->cntrltype == NVME_CTRL_DISC \|\|
3228	!strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
3229	subsys->subtype = NVME_NQN_DISC;
3230	else
3231	subsys->subtype = NVME_NQN_NVME;
3232
3233	if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
3234	dev_err(ctrl->device,
3235	"Subsystem %s is not a discovery controller",
3236	subsys->subnqn);
3237	kfree(objp: subsys);
3238	return -EINVAL;
3239	}
3240	nvme_mpath_default_iopolicy(subsys);
3241
3242	subsys->dev.class = &nvme_subsys_class;
3243	subsys->dev.release = nvme_release_subsystem;
3244	subsys->dev.groups = nvme_subsys_attrs_groups;
3245	dev_set_name(dev: &subsys->dev, name: "nvme-subsys%d", ctrl->instance);
3246	device_initialize(dev: &subsys->dev);
3247
3248	mutex_lock(&nvme_subsystems_lock);
3249	found = __nvme_find_get_subsystem(subsysnqn: subsys->subnqn);
3250	if (found) {
3251	put_device(dev: &subsys->dev);
3252	subsys = found;
3253
3254	if (!nvme_validate_cntlid(subsys, ctrl, id)) {
3255	ret = -EINVAL;
3256	goto out_put_subsystem;
3257	}
3258	} else {
3259	ret = device_add(dev: &subsys->dev);
3260	if (ret) {
3261	dev_err(ctrl->device,
3262	"failed to register subsystem device.\n");
3263	put_device(dev: &subsys->dev);
3264	goto out_unlock;
3265	}
3266	ida_init(ida: &subsys->ns_ida);
3267	list_add_tail(new: &subsys->entry, head: &nvme_subsystems);
3268	}
3269
3270	ret = sysfs_create_link(kobj: &subsys->dev.kobj, target: &ctrl->device->kobj,
3271	name: dev_name(dev: ctrl->device));
3272	if (ret) {
3273	dev_err(ctrl->device,
3274	"failed to create sysfs link from subsystem.\n");
3275	goto out_put_subsystem;
3276	}
3277
3278	if (!found)
3279	subsys->instance = ctrl->instance;
3280	ctrl->subsys = subsys;
3281	list_add_tail(new: &ctrl->subsys_entry, head: &subsys->ctrls);
3282	mutex_unlock(lock: &nvme_subsystems_lock);
3283	return `0`;
3284
3285	out_put_subsystem:
3286	nvme_put_subsystem(subsys);
3287	out_unlock:
3288	mutex_unlock(lock: &nvme_subsystems_lock);
3289	return ret;
3290	}
3291
3292	static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
3293	u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi)
3294	{
3295	struct nvme_command c = { };
3296	u32 dwlen = nvme_bytes_to_numd(len: size);
3297
3298	c.get_log_page.opcode = nvme_admin_get_log_page;
3299	c.get_log_page.nsid = cpu_to_le32(nsid);
3300	c.get_log_page.lid = log_page;
3301	c.get_log_page.lsp = lsp;
3302	c.get_log_page.numdl = cpu_to_le16(dwlen & ((`1` << `16`) - `1`));
3303	c.get_log_page.numdu = cpu_to_le16(dwlen >> `16`);
3304	c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
3305	c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
3306	c.get_log_page.csi = csi;
3307	c.get_log_page.lsi = cpu_to_le16(lsi);
3308
3309	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
3310	}
3311
3312	int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
3313	void *log, size_t size, u64 offset)
3314	{
3315	return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size,
3316	offset, lsi: `0`);
3317	}
3318
3319	static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
3320	struct nvme_effects_log **log)
3321	{
3322	struct nvme_effects_log old, cel = xa_load(&ctrl->cels, index: csi);
3323	int ret;
3324
3325	if (cel)
3326	goto out;
3327
3328	cel = kzalloc(sizeof(*cel), GFP_KERNEL);
3329	if (!cel)
3330	return -ENOMEM;
3331
3332	ret = nvme_get_log(ctrl, nsid: `0x00`, log_page: NVME_LOG_CMD_EFFECTS, lsp: `0`, csi,
3333	log: cel, size: sizeof(*cel), offset: `0`);
3334	if (ret) {
3335	kfree(objp: cel);
3336	return ret;
3337	}
3338
3339	old = xa_store(&ctrl->cels, index: csi, entry: cel, GFP_KERNEL);
3340	if (xa_is_err(entry: old)) {
3341	kfree(objp: cel);
3342	return xa_err(entry: old);
3343	}
3344	out:
3345	*log = cel;
3346	return `0`;
3347	}
3348
3349	static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
3350	{
3351	u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + `12`, val;
3352
3353	if (check_shl_overflow(`1U`, units + page_shift - `9`, &val))
3354	return UINT_MAX;
3355	return val;
3356	}
3357
3358	static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
3359	{
3360	struct nvme_command c = { };
3361	struct nvme_id_ctrl_nvm *id;
3362	int ret;
3363
3364	/*
3365	* Even though NVMe spec explicitly states that MDTS is not applicable
3366	* to the write-zeroes, we are cautious and limit the size to the
3367	* controllers max_hw_sectors value, which is based on the MDTS field
3368	* and possibly other limiting factors.
3369	*/
3370	if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
3371	!(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
3372	ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
3373	else
3374	ctrl->max_zeroes_sectors = `0`;
3375
3376	if (!nvme_is_io_ctrl(ctrl) \|\|
3377	!nvme_id_cns_ok(ctrl, cns: NVME_ID_CNS_CS_CTRL) \|\|
3378	test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags))
3379	return `0`;
3380
3381	id = kzalloc(sizeof(*id), GFP_KERNEL);
3382	if (!id)
3383	return -ENOMEM;
3384
3385	c.identify.opcode = nvme_admin_identify;
3386	c.identify.cns = NVME_ID_CNS_CS_CTRL;
3387	c.identify.csi = NVME_CSI_NVM;
3388
3389	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
3390	if (ret)
3391	goto free_data;
3392
3393	ctrl->dmrl = id->dmrl;
3394	ctrl->dmrsl = le32_to_cpu(id->dmrsl);
3395	if (id->wzsl)
3396	ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, units: id->wzsl);
3397
3398	free_data:
3399	if (ret > `0`)
3400	set_bit(nr: NVME_CTRL_SKIP_ID_CNS_CS, addr: &ctrl->flags);
3401	kfree(objp: id);
3402	return ret;
3403	}
3404
3405	static int nvme_init_effects_log(struct nvme_ctrl *ctrl,
3406	u8 csi, struct nvme_effects_log **log)
3407	{
3408	struct nvme_effects_log effects, old;
3409
3410	effects = kzalloc(sizeof(*effects), GFP_KERNEL);
3411	if (!effects)
3412	return -ENOMEM;
3413
3414	old = xa_store(&ctrl->cels, index: csi, entry: effects, GFP_KERNEL);
3415	if (xa_is_err(entry: old)) {
3416	kfree(objp: effects);
3417	return xa_err(entry: old);
3418	}
3419
3420	*log = effects;
3421	return `0`;
3422	}
3423
3424	static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
3425	{
3426	struct nvme_effects_log *log = ctrl->effects;
3427
3428	log->acs[nvme_admin_format_nvm] \|= cpu_to_le32(NVME_CMD_EFFECTS_LBCC \|
3429	NVME_CMD_EFFECTS_NCC \|
3430	NVME_CMD_EFFECTS_CSE_MASK);
3431	log->acs[nvme_admin_sanitize_nvm] \|= cpu_to_le32(NVME_CMD_EFFECTS_LBCC \|
3432	NVME_CMD_EFFECTS_CSE_MASK);
3433
3434	/*
3435	* The spec says the result of a security receive command depends on
3436	* the previous security send command. As such, many vendors log this
3437	* command as one to submitted only when no other commands to the same
3438	* namespace are outstanding. The intention is to tell the host to
3439	* prevent mixing security send and receive.
3440	*
3441	* This driver can only enforce such exclusive access against IO
3442	* queues, though. We are not readily able to enforce such a rule for
3443	* two commands to the admin queue, which is the only queue that
3444	* matters for this command.
3445	*
3446	* Rather than blindly freezing the IO queues for this effect that
3447	* doesn't even apply to IO, mask it off.
3448	*/
3449	log->acs[nvme_admin_security_recv] &= cpu_to_le32(~NVME_CMD_EFFECTS_CSE_MASK);
3450
3451	log->iocs[nvme_cmd_write] \|= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3452	log->iocs[nvme_cmd_write_zeroes] \|= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3453	log->iocs[nvme_cmd_write_uncor] \|= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3454	}
3455
3456	static int nvme_init_effects(struct nvme_ctrl ctrl, struct* nvme_id_ctrl *id)
3457	{
3458	int ret = `0`;
3459
3460	if (ctrl->effects)
3461	return `0`;
3462
3463	if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
3464	ret = nvme_get_effects_log(ctrl, csi: NVME_CSI_NVM, log: &ctrl->effects);
3465	if (ret < `0`)
3466	return ret;
3467	}
3468
3469	if (!ctrl->effects) {
3470	ret = nvme_init_effects_log(ctrl, csi: NVME_CSI_NVM, log: &ctrl->effects);
3471	if (ret < `0`)
3472	return ret;
3473	}
3474
3475	nvme_init_known_nvm_effects(ctrl);
3476	return `0`;
3477	}
3478
3479	static int nvme_check_ctrl_fabric_info(struct nvme_ctrl ctrl, struct* nvme_id_ctrl *id)
3480	{
3481	/*
3482	* In fabrics we need to verify the cntlid matches the
3483	* admin connect
3484	*/
3485	if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3486	dev_err(ctrl->device,
3487	"Mismatching cntlid: Connect %u vs Identify %u, rejecting\n",
3488	ctrl->cntlid, le16_to_cpu(id->cntlid));
3489	return -EINVAL;
3490	}
3491
3492	if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
3493	dev_err(ctrl->device,
3494	"keep-alive support is mandatory for fabrics\n");
3495	return -EINVAL;
3496	}
3497
3498	if (nvme_is_io_ctrl(ctrl) && ctrl->ioccsz < `4`) {
3499	dev_err(ctrl->device,
3500	"I/O queue command capsule supported size %d < 4\n",
3501	ctrl->ioccsz);
3502	return -EINVAL;
3503	}
3504
3505	if (nvme_is_io_ctrl(ctrl) && ctrl->iorcsz < `1`) {
3506	dev_err(ctrl->device,
3507	"I/O queue response capsule supported size %d < 1\n",
3508	ctrl->iorcsz);
3509	return -EINVAL;
3510	}
3511
3512	if (!ctrl->maxcmd) {
3513	dev_warn(ctrl->device,
3514	"Firmware bug: maximum outstanding commands is 0\n");
3515	ctrl->maxcmd = ctrl->sqsize + `1`;
3516	}
3517
3518	return `0`;
3519	}
3520
3521	static int nvme_init_identify(struct nvme_ctrl *ctrl)
3522	{
3523	struct queue_limits lim;
3524	struct nvme_id_ctrl *id;
3525	u32 max_hw_sectors;
3526	bool prev_apst_enabled;
3527	int ret;
3528
3529	ret = nvme_identify_ctrl(dev: ctrl, id: &id);
3530	if (ret) {
3531	dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
3532	return -EIO;
3533	}
3534
3535	if (!(ctrl->ops->flags & NVME_F_FABRICS))
3536	ctrl->cntlid = le16_to_cpu(id->cntlid);
3537
3538	if (!ctrl->identified) {
3539	unsigned int i;
3540
3541	/*
3542	* Check for quirks. Quirk can depend on firmware version,
3543	* so, in principle, the set of quirks present can change
3544	* across a reset. As a possible future enhancement, we
3545	* could re-scan for quirks every time we reinitialize
3546	* the device, but we'd have to make sure that the driver
3547	* behaves intelligently if the quirks change.
3548	*/
3549	for (i = `0`; i < ARRAY_SIZE(core_quirks); i++) {
3550	if (quirk_matches(id, q: &core_quirks[i]))
3551	ctrl->quirks \|= core_quirks[i].quirks;
3552	}
3553
3554	ret = nvme_init_subsystem(ctrl, id);
3555	if (ret)
3556	goto out_free;
3557
3558	ret = nvme_init_effects(ctrl, id);
3559	if (ret)
3560	goto out_free;
3561	}
3562	memcpy(ctrl->subsys->firmware_rev, id->fr,
3563	sizeof(ctrl->subsys->firmware_rev));
3564
3565	if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
3566	dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
3567	ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
3568	}
3569
3570	ctrl->crdt[`0`] = le16_to_cpu(id->crdt1);
3571	ctrl->crdt[`1`] = le16_to_cpu(id->crdt2);
3572	ctrl->crdt[`2`] = le16_to_cpu(id->crdt3);
3573
3574	ctrl->oacs = le16_to_cpu(id->oacs);
3575	ctrl->oncs = le16_to_cpu(id->oncs);
3576	ctrl->mtfa = le16_to_cpu(id->mtfa);
3577	ctrl->oaes = le32_to_cpu(id->oaes);
3578	ctrl->wctemp = le16_to_cpu(id->wctemp);
3579	ctrl->cctemp = le16_to_cpu(id->cctemp);
3580
3581	atomic_set(v: &ctrl->abort_limit, i: id->acl + `1`);
3582	ctrl->vwc = id->vwc;
3583	if (id->mdts)
3584	max_hw_sectors = nvme_mps_to_sectors(ctrl, units: id->mdts);
3585	else
3586	max_hw_sectors = UINT_MAX;
3587	ctrl->max_hw_sectors =
3588	min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
3589
3590	lim = queue_limits_start_update(q: ctrl->admin_q);
3591	nvme_set_ctrl_limits(ctrl, lim: &lim, is_admin: true);
3592	ret = queue_limits_commit_update(q: ctrl->admin_q, lim: &lim);
3593	if (ret)
3594	goto out_free;
3595
3596	ctrl->sgls = le32_to_cpu(id->sgls);
3597	ctrl->kas = le16_to_cpu(id->kas);
3598	ctrl->max_namespaces = le32_to_cpu(id->mnan);
3599	ctrl->ctratt = le32_to_cpu(id->ctratt);
3600
3601	ctrl->cntrltype = id->cntrltype;
3602	ctrl->dctype = id->dctype;
3603
3604	if (id->rtd3e) {
3605	/ us -> s /
3606	u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
3607
3608	ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
3609	shutdown_timeout, `60`);
3610
3611	if (ctrl->shutdown_timeout != shutdown_timeout)
3612	dev_info(ctrl->device,
3613	"D3 entry latency set to %u seconds\n",
3614	ctrl->shutdown_timeout);
3615	} else
3616	ctrl->shutdown_timeout = shutdown_timeout;
3617
3618	ctrl->npss = id->npss;
3619	ctrl->apsta = id->apsta;
3620	prev_apst_enabled = ctrl->apst_enabled;
3621	if (ctrl->quirks & NVME_QUIRK_NO_APST) {
3622	if (force_apst && id->apsta) {
3623	dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
3624	ctrl->apst_enabled = true;
3625	} else {
3626	ctrl->apst_enabled = false;
3627	}
3628	} else {
3629	ctrl->apst_enabled = id->apsta;
3630	}
3631	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
3632
3633	if (ctrl->ops->flags & NVME_F_FABRICS) {
3634	ctrl->icdoff = le16_to_cpu(id->icdoff);
3635	ctrl->ioccsz = le32_to_cpu(id->ioccsz);
3636	ctrl->iorcsz = le32_to_cpu(id->iorcsz);
3637	ctrl->maxcmd = le16_to_cpu(id->maxcmd);
3638
3639	ret = nvme_check_ctrl_fabric_info(ctrl, id);
3640	if (ret)
3641	goto out_free;
3642	} else {
3643	ctrl->hmpre = le32_to_cpu(id->hmpre);
3644	ctrl->hmmin = le32_to_cpu(id->hmmin);
3645	ctrl->hmminds = le32_to_cpu(id->hmminds);
3646	ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
3647	}
3648
3649	ret = nvme_mpath_init_identify(ctrl, id);
3650	if (ret < `0`)
3651	goto out_free;
3652
3653	if (ctrl->apst_enabled && !prev_apst_enabled)
3654	dev_pm_qos_expose_latency_tolerance(dev: ctrl->device);
3655	else if (!ctrl->apst_enabled && prev_apst_enabled)
3656	dev_pm_qos_hide_latency_tolerance(dev: ctrl->device);
3657	out_free:
3658	kfree(objp: id);
3659	return ret;
3660	}
3661
3662	/*
3663	* Initialize the cached copies of the Identify data and various controller
3664	* register in our nvme_ctrl structure. This should be called as soon as
3665	* the admin queue is fully up and running.
3666	*/
3667	int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
3668	{
3669	int ret;
3670
3671	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
3672	if (ret) {
3673	dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
3674	return ret;
3675	}
3676
3677	ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
3678
3679	if (ctrl->vs >= NVME_VS(`1`, `1`, `0`))
3680	ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
3681
3682	ret = nvme_init_identify(ctrl);
3683	if (ret)
3684	return ret;
3685
3686	if (nvme_admin_ctrl(ctrl)) {
3687	/*
3688	* An admin controller has one admin queue, but no I/O queues.
3689	* Override queue_count so it only creates an admin queue.
3690	*/
3691	dev_dbg(ctrl->device,
3692	"Subsystem %s is an administrative controller",
3693	ctrl->subsys->subnqn);
3694	ctrl->queue_count = `1`;
3695	}
3696
3697	ret = nvme_configure_apst(ctrl);
3698	if (ret < `0`)
3699	return ret;
3700
3701	ret = nvme_configure_timestamp(ctrl);
3702	if (ret < `0`)
3703	return ret;
3704
3705	ret = nvme_configure_host_options(ctrl);
3706	if (ret < `0`)
3707	return ret;
3708
3709	nvme_configure_opal(ctrl, was_suspended);
3710
3711	if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
3712	/*
3713	* Do not return errors unless we are in a controller reset,
3714	* the controller works perfectly fine without hwmon.
3715	*/
3716	ret = nvme_hwmon_init(ctrl);
3717	if (ret == -EINTR)
3718	return ret;
3719	}
3720
3721	clear_bit(nr: NVME_CTRL_DIRTY_CAPABILITY, addr: &ctrl->flags);
3722	ctrl->identified = true;
3723
3724	nvme_start_keep_alive(ctrl);
3725
3726	return `0`;
3727	}
3728	EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
3729
3730	static int nvme_dev_open(struct inode inode, struct* file *file)
3731	{
3732	struct nvme_ctrl *ctrl =
3733	container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3734
3735	switch (nvme_ctrl_state(ctrl)) {
3736	case NVME_CTRL_LIVE:
3737	break;
3738	default:
3739	return -EWOULDBLOCK;
3740	}
3741
3742	nvme_get_ctrl(ctrl);
3743	if (!try_module_get(module: ctrl->ops->module)) {
3744	nvme_put_ctrl(ctrl);
3745	return -EINVAL;
3746	}
3747
3748	file->private_data = ctrl;
3749	return `0`;
3750	}
3751
3752	static int nvme_dev_release(struct inode inode, struct* file *file)
3753	{
3754	struct nvme_ctrl *ctrl =
3755	container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3756
3757	module_put(module: ctrl->ops->module);
3758	nvme_put_ctrl(ctrl);
3759	return `0`;
3760	}
3761
3762	static const struct file_operations nvme_dev_fops = {
3763	.owner = THIS_MODULE,
3764	.open = nvme_dev_open,
3765	.release = nvme_dev_release,
3766	.unlocked_ioctl = nvme_dev_ioctl,
3767	.compat_ioctl = compat_ptr_ioctl,
3768	.uring_cmd = nvme_dev_uring_cmd,
3769	};
3770
3771	static struct nvme_ns_head nvme_find_ns_head(struct* nvme_ctrl *ctrl,
3772	unsigned nsid)
3773	{
3774	struct nvme_ns_head *h;
3775
3776	lockdep_assert_held(&ctrl->subsys->lock);
3777
3778	list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
3779	/*
3780	* Private namespaces can share NSIDs under some conditions.
3781	* In that case we can't use the same ns_head for namespaces
3782	* with the same NSID.
3783	*/
3784	if (h->ns_id != nsid \|\| !nvme_is_unique_nsid(ctrl, head: h))
3785	continue;
3786	if (nvme_tryget_ns_head(head: h))
3787	return h;
3788	}
3789
3790	return NULL;
3791	}
3792
3793	static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3794	struct nvme_ns_ids *ids)
3795	{
3796	bool has_uuid = !uuid_is_null(uuid: &ids->uuid);
3797	bool has_nguid = memchr_inv(p: ids->nguid, c: `0`, size: sizeof(ids->nguid));
3798	bool has_eui64 = memchr_inv(p: ids->eui64, c: `0`, size: sizeof(ids->eui64));
3799	struct nvme_ns_head *h;
3800
3801	lockdep_assert_held(&subsys->lock);
3802
3803	list_for_each_entry(h, &subsys->nsheads, entry) {
3804	if (has_uuid && uuid_equal(u1: &ids->uuid, u2: &h->ids.uuid))
3805	return -EINVAL;
3806	if (has_nguid &&
3807	memcmp(p: &ids->nguid, q: &h->ids.nguid, size: sizeof(ids->nguid)) == `0`)
3808	return -EINVAL;
3809	if (has_eui64 &&
3810	memcmp(p: &ids->eui64, q: &h->ids.eui64, size: sizeof(ids->eui64)) == `0`)
3811	return -EINVAL;
3812	}
3813
3814	return `0`;
3815	}
3816
3817	static void nvme_cdev_rel(struct device *dev)
3818	{
3819	ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
3820	}
3821
/*
 * Remove a character device added with nvme_cdev_add() and drop the
 * reference on its struct device.
 */
void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
{
	cdev_device_del(cdev, cdev_device);

	put_device(cdev_device);
}
3827
3828	int nvme_cdev_add(struct cdev cdev, struct* device *cdev_device,
3829	const struct file_operations fops, struct* module *owner)
3830	{
3831	int minor, ret;
3832
3833	minor = ida_alloc(ida: &nvme_ns_chr_minor_ida, GFP_KERNEL);
3834	if (minor < `0`)
3835	return minor;
3836	cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
3837	cdev_device->class = &nvme_ns_chr_class;
3838	cdev_device->release = nvme_cdev_rel;
3839	device_initialize(dev: cdev_device);
3840	cdev_init(cdev, fops);
3841	cdev->owner = owner;
3842	ret = cdev_device_add(cdev, dev: cdev_device);
3843	if (ret)
3844	put_device(dev: cdev_device);
3845
3846	return ret;
3847	}
3848
3849	static int nvme_ns_chr_open(struct inode inode, struct* file *file)
3850	{
3851	return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
3852	}
3853
3854	static int nvme_ns_chr_release(struct inode inode, struct* file *file)
3855	{
3856	nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
3857	return `0`;
3858	}
3859
3860	static const struct file_operations nvme_ns_chr_fops = {
3861	.owner = THIS_MODULE,
3862	.open = nvme_ns_chr_open,
3863	.release = nvme_ns_chr_release,
3864	.unlocked_ioctl = nvme_ns_chr_ioctl,
3865	.compat_ioctl = compat_ptr_ioctl,
3866	.uring_cmd = nvme_ns_chr_uring_cmd,
3867	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
3868	};
3869
3870	static int nvme_add_ns_cdev(struct nvme_ns *ns)
3871	{
3872	int ret;
3873
3874	ns->cdev_device.parent = ns->ctrl->device;
3875	ret = dev_set_name(dev: &ns->cdev_device, name: "ng%dn%d",
3876	ns->ctrl->instance, ns->head->instance);
3877	if (ret)
3878	return ret;
3879
3880	return nvme_cdev_add(cdev: &ns->cdev, cdev_device: &ns->cdev_device, fops: &nvme_ns_chr_fops,
3881	owner: ns->ctrl->ops->module);
3882	}
3883
3884	static struct nvme_ns_head nvme_alloc_ns_head(struct* nvme_ctrl *ctrl,
3885	struct nvme_ns_info *info)
3886	{
3887	struct nvme_ns_head *head;
3888	size_t size = sizeof(*head);
3889	int ret = -ENOMEM;
3890
3891	#ifdef CONFIG_NVME_MULTIPATH
3892	size += num_possible_nodes() * sizeof(struct nvme_ns *);
3893	#endif
3894
3895	head = kzalloc(size, GFP_KERNEL);
3896	if (!head)
3897	goto out;
3898	ret = ida_alloc_min(ida: &ctrl->subsys->ns_ida, min: `1`, GFP_KERNEL);
3899	if (ret < `0`)
3900	goto out_free_head;
3901	head->instance = ret;
3902	INIT_LIST_HEAD(list: &head->list);
3903	ret = init_srcu_struct(&head->srcu);
3904	if (ret)
3905	goto out_ida_remove;
3906	head->subsys = ctrl->subsys;
3907	head->ns_id = info->nsid;
3908	head->ids = info->ids;
3909	head->shared = info->is_shared;
3910	head->rotational = info->is_rotational;
3911	ratelimit_state_init(rs: &head->rs_nuse, interval: `5` * HZ, burst: `1`);
3912	ratelimit_set_flags(rs: &head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
3913	kref_init(kref: &head->ref);
3914
3915	if (head->ids.csi) {
3916	ret = nvme_get_effects_log(ctrl, csi: head->ids.csi, log: &head->effects);
3917	if (ret)
3918	goto out_cleanup_srcu;
3919	} else
3920	head->effects = ctrl->effects;
3921
3922	ret = nvme_mpath_alloc_disk(ctrl, head);
3923	if (ret)
3924	goto out_cleanup_srcu;
3925
3926	list_add_tail(new: &head->entry, head: &ctrl->subsys->nsheads);
3927
3928	kref_get(kref: &ctrl->subsys->ref);
3929
3930	return head;
3931	out_cleanup_srcu:
3932	cleanup_srcu_struct(ssp: &head->srcu);
3933	out_ida_remove:
3934	ida_free(&ctrl->subsys->ns_ida, id: head->instance);
3935	out_free_head:
3936	kfree(objp: head);
3937	out:
3938	if (ret > `0`)
3939	ret = blk_status_to_errno(status: nvme_error_status(status: ret));
3940	return ERR_PTR(error: ret);
3941	}
3942
3943	static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
3944	struct nvme_ns_ids *ids)
3945	{
3946	struct nvme_subsystem *s;
3947	int ret = `0`;
3948
3949	/*
3950	* Note that this check is racy as we try to avoid holding the global
3951	* lock over the whole ns_head creation. But it is only intended as
3952	* a sanity check anyway.
3953	*/
3954	mutex_lock(&nvme_subsystems_lock);
3955	list_for_each_entry(s, &nvme_subsystems, entry) {
3956	if (s == this)
3957	continue;
3958	mutex_lock(&s->lock);
3959	ret = nvme_subsys_check_duplicate_ids(subsys: s, ids);
3960	mutex_unlock(lock: &s->lock);
3961	if (ret)
3962	break;
3963	}
3964	mutex_unlock(lock: &nvme_subsystems_lock);
3965
3966	return ret;
3967	}
3968
3969	static int nvme_init_ns_head(struct nvme_ns ns, struct* nvme_ns_info *info)
3970	{
3971	struct nvme_ctrl *ctrl = ns->ctrl;
3972	struct nvme_ns_head *head = NULL;
3973	int ret;
3974
3975	ret = nvme_global_check_duplicate_ids(this: ctrl->subsys, ids: &info->ids);
3976	if (ret) {
3977	/*
3978	* We've found two different namespaces on two different
3979	* subsystems that report the same ID. This is pretty nasty
3980	* for anything that actually requires unique device
3981	* identification. In the kernel we need this for multipathing,
3982	* and in user space the /dev/disk/by-id/ links rely on it.
3983	*
3984	* If the device also claims to be multi-path capable back off
3985	* here now and refuse the probe the second device as this is a
3986	* recipe for data corruption. If not this is probably a
3987	* cheap consumer device if on the PCIe bus, so let the user
3988	* proceed and use the shiny toy, but warn that with changing
3989	* probing order (which due to our async probing could just be
3990	* device taking longer to startup) the other device could show
3991	* up at any time.
3992	*/
3993	nvme_print_device_info(ctrl);
3994	if ((ns->ctrl->ops->flags & NVME_F_FABRICS) \|\| / !PCIe /
3995	((ns->ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) &&
3996	info->is_shared)) {
3997	dev_err(ctrl->device,
3998	"ignoring nsid %d because of duplicate IDs\n",
3999	info->nsid);
4000	return ret;
4001	}
4002
4003	dev_err(ctrl->device,
4004	"clearing duplicate IDs for nsid %d\n", info->nsid);
4005	dev_err(ctrl->device,
4006	"use of /dev/disk/by-id/ may cause data corruption\n");
4007	memset(&info->ids.nguid, `0`, sizeof(info->ids.nguid));
4008	memset(&info->ids.uuid, `0`, sizeof(info->ids.uuid));
4009	memset(&info->ids.eui64, `0`, sizeof(info->ids.eui64));
4010	ctrl->quirks \|= NVME_QUIRK_BOGUS_NID;
4011	}
4012
4013	mutex_lock(&ctrl->subsys->lock);
4014	head = nvme_find_ns_head(ctrl, nsid: info->nsid);
4015	if (!head) {
4016	ret = nvme_subsys_check_duplicate_ids(subsys: ctrl->subsys, ids: &info->ids);
4017	if (ret) {
4018	dev_err(ctrl->device,
4019	"duplicate IDs in subsystem for nsid %d\n",
4020	info->nsid);
4021	goto out_unlock;
4022	}
4023	head = nvme_alloc_ns_head(ctrl, info);
4024	if (IS_ERR(ptr: head)) {
4025	ret = PTR_ERR(ptr: head);
4026	goto out_unlock;
4027	}
4028	} else {
4029	ret = -EINVAL;
4030	if ((!info->is_shared \|\| !head->shared) &&
4031	!list_empty(head: &head->list)) {
4032	dev_err(ctrl->device,
4033	"Duplicate unshared namespace %d\n",
4034	info->nsid);
4035	goto out_put_ns_head;
4036	}
4037	if (!nvme_ns_ids_equal(a: &head->ids, b: &info->ids)) {
4038	dev_err(ctrl->device,
4039	"IDs don't match for shared namespace %d\n",
4040	info->nsid);
4041	goto out_put_ns_head;
4042	}
4043
4044	if (!multipath) {
4045	dev_warn(ctrl->device,
4046	"Found shared namespace %d, but multipathing not supported.\n",
4047	info->nsid);
4048	dev_warn_once(ctrl->device,
4049	"Shared namespace support requires core_nvme.multipath=Y.\n");
4050	}
4051	}
4052
4053	list_add_tail_rcu(new: &ns->siblings, head: &head->list);
4054	ns->head = head;
4055	mutex_unlock(lock: &ctrl->subsys->lock);
4056
4057	#ifdef CONFIG_NVME_MULTIPATH
4058	cancel_delayed_work(dwork: &head->remove_work);
4059	#endif
4060	return `0`;
4061
4062	out_put_ns_head:
4063	nvme_put_ns_head(head);
4064	out_unlock:
4065	mutex_unlock(lock: &ctrl->subsys->lock);
4066	return ret;
4067	}
4068
4069	struct nvme_ns nvme_find_get_ns(struct* nvme_ctrl ctrl, unsigned* nsid)
4070	{
4071	struct nvme_ns ns, ret = NULL;
4072	int srcu_idx;
4073
4074	srcu_idx = srcu_read_lock(ssp: &ctrl->srcu);
4075	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4076	srcu_read_lock_held(&ctrl->srcu)) {
4077	if (ns->head->ns_id == nsid) {
4078	if (!nvme_get_ns(ns))
4079	continue;
4080	ret = ns;
4081	break;
4082	}
4083	if (ns->head->ns_id > nsid)
4084	break;
4085	}
4086	srcu_read_unlock(ssp: &ctrl->srcu, idx: srcu_idx);
4087	return ret;
4088	}
4089	EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, "NVME_TARGET_PASSTHRU");
4090
4091	/*
4092	* Add the namespace to the controller list while keeping the list ordered.
4093	*/
4094	static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
4095	{
4096	struct nvme_ns *tmp;
4097
4098	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
4099	if (tmp->head->ns_id < ns->head->ns_id) {
4100	list_add_rcu(new: &ns->list, head: &tmp->list);
4101	return;
4102	}
4103	}
4104	list_add_rcu(new: &ns->list, head: &ns->ctrl->namespaces);
4105	}
4106
4107	static void nvme_alloc_ns(struct nvme_ctrl ctrl, struct* nvme_ns_info *info)
4108	{
4109	struct queue_limits lim = { };
4110	struct nvme_ns *ns;
4111	struct gendisk *disk;
4112	int node = ctrl->numa_node;
4113	bool last_path = false;
4114
4115	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
4116	if (!ns)
4117	return;
4118
4119	if (ctrl->opts && ctrl->opts->data_digest)
4120	lim.features \|= BLK_FEAT_STABLE_WRITES;
4121	if (ctrl->ops->supports_pci_p2pdma &&
4122	ctrl->ops->supports_pci_p2pdma(ctrl))
4123	lim.features \|= BLK_FEAT_PCI_P2PDMA;
4124
4125	disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns);
4126	if (IS_ERR(ptr: disk))
4127	goto out_free_ns;
4128	disk->fops = &nvme_bdev_ops;
4129	disk->private_data = ns;
4130
4131	ns->disk = disk;
4132	ns->queue = disk->queue;
4133	ns->ctrl = ctrl;
4134	kref_init(kref: &ns->kref);
4135
4136	if (nvme_init_ns_head(ns, info))
4137	goto out_cleanup_disk;
4138
4139	/*
4140	* If multipathing is enabled, the device name for all disks and not
4141	* just those that represent shared namespaces needs to be based on the
4142	* subsystem instance. Using the controller instance for private
4143	* namespaces could lead to naming collisions between shared and private
4144	* namespaces if they don't use a common numbering scheme.
4145	*
4146	* If multipathing is not enabled, disk names must use the controller
4147	* instance as shared namespaces will show up as multiple block
4148	* devices.
4149	*/
4150	if (nvme_ns_head_multipath(head: ns->head)) {
4151	sprintf(buf: disk->disk_name, fmt: "nvme%dc%dn%d", ctrl->subsys->instance,
4152	ctrl->instance, ns->head->instance);
4153	disk->flags \|= GENHD_FL_HIDDEN;
4154	} else if (multipath) {
4155	sprintf(buf: disk->disk_name, fmt: "nvme%dn%d", ctrl->subsys->instance,
4156	ns->head->instance);
4157	} else {
4158	sprintf(buf: disk->disk_name, fmt: "nvme%dn%d", ctrl->instance,
4159	ns->head->instance);
4160	}
4161
4162	if (nvme_update_ns_info(ns, info))
4163	goto out_unlink_ns;
4164
4165	mutex_lock(&ctrl->namespaces_lock);
4166	/*
4167	* Ensure that no namespaces are added to the ctrl list after the queues
4168	* are frozen, thereby avoiding a deadlock between scan and reset.
4169	*/
4170	if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
4171	mutex_unlock(lock: &ctrl->namespaces_lock);
4172	goto out_unlink_ns;
4173	}
4174	nvme_ns_add_to_ctrl_list(ns);
4175	mutex_unlock(lock: &ctrl->namespaces_lock);
4176	synchronize_srcu(ssp: &ctrl->srcu);
4177	nvme_get_ctrl(ctrl);
4178
4179	if (device_add_disk(parent: ctrl->device, disk: ns->disk, groups: nvme_ns_attr_groups))
4180	goto out_cleanup_ns_from_list;
4181
4182	if (!nvme_ns_head_multipath(head: ns->head))
4183	nvme_add_ns_cdev(ns);
4184
4185	nvme_mpath_add_disk(ns, anagrpid: info->anagrpid);
4186	nvme_fault_inject_init(fault_inj: &ns->fault_inject, dev_name: ns->disk->disk_name);
4187
4188	/*
4189	* Set ns->disk->device->driver_data to ns so we can access
4190	* ns->head->passthru_err_log_enabled in
4191	* nvme_io_passthru_err_log_enabled_[store \| show]().
4192	*/
4193	dev_set_drvdata(disk_to_dev(ns->disk), data: ns);
4194
4195	return;
4196
4197	out_cleanup_ns_from_list:
4198	nvme_put_ctrl(ctrl);
4199	mutex_lock(&ctrl->namespaces_lock);
4200	list_del_rcu(entry: &ns->list);
4201	mutex_unlock(lock: &ctrl->namespaces_lock);
4202	synchronize_srcu(ssp: &ctrl->srcu);
4203	out_unlink_ns:
4204	mutex_lock(&ctrl->subsys->lock);
4205	list_del_rcu(entry: &ns->siblings);
4206	if (list_empty(head: &ns->head->list)) {
4207	list_del_init(entry: &ns->head->entry);
4208	/*
4209	* If multipath is not configured, we still create a namespace
4210	* head (nshead), but head->disk is not initialized in that
4211	* case. As a result, only a single reference to nshead is held
4212	* (via kref_init()) when it is created. Therefore, ensure that
4213	* we do not release the reference to nshead twice if head->disk
4214	* is not present.
4215	*/
4216	if (ns->head->disk)
4217	last_path = true;
4218	}
4219	mutex_unlock(lock: &ctrl->subsys->lock);
4220	if (last_path)
4221	nvme_put_ns_head(head: ns->head);
4222	nvme_put_ns_head(head: ns->head);
4223	out_cleanup_disk:
4224	put_disk(disk);
4225	out_free_ns:
4226	kfree(objp: ns);
4227	}
4228
4229	static void nvme_ns_remove(struct nvme_ns *ns)
4230	{
4231	bool last_path = false;
4232
4233	if (test_and_set_bit(NVME_NS_REMOVING, addr: &ns->flags))
4234	return;
4235
4236	clear_bit(NVME_NS_READY, addr: &ns->flags);
4237	set_capacity(disk: ns->disk, size: `0`);
4238	nvme_fault_inject_fini(fault_inject: &ns->fault_inject);
4239
4240	/*
4241	* Ensure that !NVME_NS_READY is seen by other threads to prevent
4242	* this ns going back into current_path.
4243	*/
4244	synchronize_srcu(ssp: &ns->head->srcu);
4245
4246	/ wait for concurrent submissions /
4247	if (nvme_mpath_clear_current_path(ns))
4248	synchronize_srcu(ssp: &ns->head->srcu);
4249
4250	mutex_lock(&ns->ctrl->subsys->lock);
4251	list_del_rcu(entry: &ns->siblings);
4252	if (list_empty(head: &ns->head->list)) {
4253	if (!nvme_mpath_queue_if_no_path(head: ns->head))
4254	list_del_init(entry: &ns->head->entry);
4255	last_path = true;
4256	}
4257	mutex_unlock(lock: &ns->ctrl->subsys->lock);
4258
4259	/ guarantee not available in head->list /
4260	synchronize_srcu(ssp: &ns->head->srcu);
4261
4262	if (!nvme_ns_head_multipath(head: ns->head))
4263	nvme_cdev_del(cdev: &ns->cdev, cdev_device: &ns->cdev_device);
4264
4265	nvme_mpath_remove_sysfs_link(ns);
4266
4267	del_gendisk(gp: ns->disk);
4268
4269	mutex_lock(&ns->ctrl->namespaces_lock);
4270	list_del_rcu(entry: &ns->list);
4271	mutex_unlock(lock: &ns->ctrl->namespaces_lock);
4272	synchronize_srcu(ssp: &ns->ctrl->srcu);
4273
4274	if (last_path)
4275	nvme_mpath_remove_disk(head: ns->head);
4276	nvme_put_ns(ns);
4277	}
4278
4279	static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
4280	{
4281	struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
4282
4283	if (ns) {
4284	nvme_ns_remove(ns);
4285	nvme_put_ns(ns);
4286	}
4287	}
4288
4289	static void nvme_validate_ns(struct nvme_ns ns, struct* nvme_ns_info *info)
4290	{
4291	int ret = NVME_SC_INVALID_NS \| NVME_STATUS_DNR;
4292
4293	if (!nvme_ns_ids_equal(a: &ns->head->ids, b: &info->ids)) {
4294	dev_err(ns->ctrl->device,
4295	"identifiers changed for nsid %d\n", ns->head->ns_id);
4296	goto out;
4297	}
4298
4299	ret = nvme_update_ns_info(ns, info);
4300	out:
4301	/*
4302	* Only remove the namespace if we got a fatal error back from the
4303	* device, otherwise ignore the error and just move on.
4304	*
4305	* TODO: we should probably schedule a delayed retry here.
4306	*/
4307	if (ret > `0` && (ret & NVME_STATUS_DNR))
4308	nvme_ns_remove(ns);
4309	}
4310
4311	static void nvme_scan_ns(struct nvme_ctrl ctrl, unsigned* nsid)
4312	{
4313	struct nvme_ns_info info = { .nsid = nsid };
4314	struct nvme_ns *ns;
4315	int ret = `1`;
4316
4317	if (nvme_identify_ns_descs(ctrl, info: &info))
4318	return;
4319
4320	if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
4321	dev_warn(ctrl->device,
4322	"command set not reported for nsid: %d\n", nsid);
4323	return;
4324	}
4325
4326	/*
4327	* If available try to use the Command Set Independent Identify Namespace
4328	* data structure to find all the generic information that is needed to
4329	* set up a namespace. If not fall back to the legacy version.
4330	*/
4331	if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) \|\|
4332	(info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS) \|\|
4333	ctrl->vs >= NVME_VS(`2`, `0`, `0`))
4334	ret = nvme_ns_info_from_id_cs_indep(ctrl, info: &info);
4335	if (ret > `0`)
4336	ret = nvme_ns_info_from_identify(ctrl, info: &info);
4337
4338	if (info.is_removed)
4339	nvme_ns_remove_by_nsid(ctrl, nsid);
4340
4341	/*
4342	* Ignore the namespace if it is not ready. We will get an AEN once it
4343	* becomes ready and restart the scan.
4344	*/
4345	if (ret \|\| !info.is_ready)
4346	return;
4347
4348	ns = nvme_find_get_ns(ctrl, nsid);
4349	if (ns) {
4350	nvme_validate_ns(ns, info: &info);
4351	nvme_put_ns(ns);
4352	} else {
4353	nvme_alloc_ns(ctrl, info: &info);
4354	}
4355	}
4356
/**
 * struct async_scan_info - keeps track of controller & NSIDs to scan
 * @ctrl:	Controller on which namespaces are being scanned
 * @next_nsid:	Index of next NSID to scan in ns_list
 * @ns_list:	Pointer to list of NSIDs to scan (little-endian entries)
 *
 * Note: There is a single async_scan_info structure shared by all instances
 * of nvme_scan_ns_async() scanning a given controller, so the atomic
 * operations on next_nsid are critical to ensure each instance scans a unique
 * NSID.
 */
struct async_scan_info {
	struct nvme_ctrl *ctrl;
	atomic_t next_nsid;
	__le32 *ns_list;
};
4373
4374	static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
4375	{
4376	struct async_scan_info *scan_info = data;
4377	int idx;
4378	u32 nsid;
4379
4380	idx = (u32)atomic_fetch_inc(v: &scan_info->next_nsid);
4381	nsid = le32_to_cpu(scan_info->ns_list[idx]);
4382
4383	nvme_scan_ns(ctrl: scan_info->ctrl, nsid);
4384	}
4385
4386	static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
4387	unsigned nsid)
4388	{
4389	struct nvme_ns ns, next;
4390	LIST_HEAD(rm_list);
4391
4392	mutex_lock(&ctrl->namespaces_lock);
4393	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4394	if (ns->head->ns_id > nsid) {
4395	list_del_rcu(entry: &ns->list);
4396	synchronize_srcu(ssp: &ctrl->srcu);
4397	list_add_tail_rcu(new: &ns->list, head: &rm_list);
4398	}
4399	}
4400	mutex_unlock(lock: &ctrl->namespaces_lock);
4401
4402	list_for_each_entry_safe(ns, next, &rm_list, list)
4403	nvme_ns_remove(ns);
4404	}
4405
4406	static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
4407	{
4408	const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
4409	__le32 *ns_list;
4410	u32 prev = `0`;
4411	int ret = `0`, i;
4412	ASYNC_DOMAIN(domain);
4413	struct async_scan_info scan_info;
4414
4415	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
4416	if (!ns_list)
4417	return -ENOMEM;
4418
4419	scan_info.ctrl = ctrl;
4420	scan_info.ns_list = ns_list;
4421	for (;;) {
4422	struct nvme_command cmd = {
4423	.identify.opcode = nvme_admin_identify,
4424	.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST,
4425	.identify.nsid = cpu_to_le32(prev),
4426	};
4427
4428	ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
4429	NVME_IDENTIFY_DATA_SIZE);
4430	if (ret) {
4431	dev_warn(ctrl->device,
4432	"Identify NS List failed (status=0x%x)\n", ret);
4433	goto free;
4434	}
4435
4436	atomic_set(v: &scan_info.next_nsid, i: `0`);
4437	for (i = `0`; i < nr_entries; i++) {
4438	u32 nsid = le32_to_cpu(ns_list[i]);
4439
4440	if (!nsid) / end of the list? /
4441	goto out;
4442	async_schedule_domain(func: nvme_scan_ns_async, data: &scan_info,
4443	domain: &domain);
4444	while (++prev < nsid)
4445	nvme_ns_remove_by_nsid(ctrl, nsid: prev);
4446	}
4447	async_synchronize_full_domain(domain: &domain);
4448	}
4449	out:
4450	nvme_remove_invalid_namespaces(ctrl, nsid: prev);
4451	free:
4452	async_synchronize_full_domain(domain: &domain);
4453	kfree(objp: ns_list);
4454	return ret;
4455	}
4456
4457	static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4458	{
4459	struct nvme_id_ctrl *id;
4460	u32 nn, i;
4461
4462	if (nvme_identify_ctrl(dev: ctrl, id: &id))
4463	return;
4464	nn = le32_to_cpu(id->nn);
4465	kfree(objp: id);
4466
4467	for (i = `1`; i <= nn; i++)
4468	nvme_scan_ns(ctrl, nsid: i);
4469
4470	nvme_remove_invalid_namespaces(ctrl, nsid: nn);
4471	}
4472
4473	static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4474	{
4475	size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4476	__le32 *log;
4477	int error;
4478
4479	log = kzalloc(log_size, GFP_KERNEL);
4480	if (!log)
4481	return;
4482
4483	/*
4484	* We need to read the log to clear the AEN, but we don't want to rely
4485	* on it for the changed namespace information as userspace could have
4486	* raced with us in reading the log page, which could cause us to miss
4487	* updates.
4488	*/
4489	error = nvme_get_log(ctrl, NVME_NSID_ALL, log_page: NVME_LOG_CHANGED_NS, lsp: `0`,
4490	csi: NVME_CSI_NVM, log, size: log_size, offset: `0`);
4491	if (error)
4492	dev_warn(ctrl->device,
4493	"reading changed ns log failed: %d\n", error);
4494
4495	kfree(objp: log);
4496	}
4497
4498	static void nvme_scan_work(struct work_struct *work)
4499	{
4500	struct nvme_ctrl *ctrl =
4501	container_of(work, struct nvme_ctrl, scan_work);
4502	int ret;
4503
4504	/ No tagset on a live ctrl means IO queues could not created /
4505	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE \|\| !ctrl->tagset)
4506	return;
4507
4508	/*
4509	* Identify controller limits can change at controller reset due to
4510	* new firmware download, even though it is not common we cannot ignore
4511	* such scenario. Controller's non-mdts limits are reported in the unit
4512	* of logical blocks that is dependent on the format of attached
4513	* namespace. Hence re-read the limits at the time of ns allocation.
4514	*/
4515	ret = nvme_init_non_mdts_limits(ctrl);
4516	if (ret < `0`) {
4517	dev_warn(ctrl->device,
4518	"reading non-mdts-limits failed: %d\n", ret);
4519	return;
4520	}
4521
4522	if (test_and_clear_bit(nr: NVME_AER_NOTICE_NS_CHANGED, addr: &ctrl->events)) {
4523	dev_info(ctrl->device, "rescanning namespaces.\n");
4524	nvme_clear_changed_ns_log(ctrl);
4525	}
4526
4527	mutex_lock(&ctrl->scan_lock);
4528	if (!nvme_id_cns_ok(ctrl, cns: NVME_ID_CNS_NS_ACTIVE_LIST)) {
4529	nvme_scan_ns_sequential(ctrl);
4530	} else {
4531	/*
4532	* Fall back to sequential scan if DNR is set to handle broken
4533	* devices which should support Identify NS List (as per the VS
4534	* they report) but don't actually support it.
4535	*/
4536	ret = nvme_scan_ns_list(ctrl);
4537	if (ret > `0` && ret & NVME_STATUS_DNR)
4538	nvme_scan_ns_sequential(ctrl);
4539	}
4540	mutex_unlock(lock: &ctrl->scan_lock);
4541
4542	/ Requeue if we have missed AENs /
4543	if (test_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events))
4544	nvme_queue_scan(ctrl);
4545	#ifdef CONFIG_NVME_MULTIPATH
4546	else if (ctrl->ana_log_buf)
4547	/ Re-read the ANA log page to not miss updates /
4548	queue_work(wq: nvme_wq, work: &ctrl->ana_work);
4549	#endif
4550	}
4551
4552	/*
4553	* This function iterates the namespace list unlocked to allow recovery from
4554	* controller failure. It is up to the caller to ensure the namespace list is
4555	* not modified by scan work while this function is executing.
4556	*/
4557	void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
4558	{
4559	struct nvme_ns ns, next;
4560	LIST_HEAD(ns_list);
4561
4562	/*
4563	* make sure to requeue I/O to all namespaces as these
4564	* might result from the scan itself and must complete
4565	* for the scan_work to make progress
4566	*/
4567	nvme_mpath_clear_ctrl_paths(ctrl);
4568
4569	/*
4570	* Unquiesce io queues so any pending IO won't hang, especially
4571	* those submitted from scan work
4572	*/
4573	nvme_unquiesce_io_queues(ctrl);
4574
4575	/ prevent racing with ns scanning /
4576	flush_work(work: &ctrl->scan_work);
4577
4578	/*
4579	* The dead states indicates the controller was not gracefully
4580	* disconnected. In that case, we won't be able to flush any data while
4581	* removing the namespaces' disks; fail all the queues now to avoid
4582	* potentially having to clean up the failed sync later.
4583	*/
4584	if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
4585	nvme_mark_namespaces_dead(ctrl);
4586
4587	/ this is a no-op when called from the controller reset handler /
4588	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4589
4590	mutex_lock(&ctrl->namespaces_lock);
4591	list_splice_init_rcu(list: &ctrl->namespaces, head: &ns_list, sync: synchronize_rcu);
4592	mutex_unlock(lock: &ctrl->namespaces_lock);
4593	synchronize_srcu(ssp: &ctrl->srcu);
4594
4595	list_for_each_entry_safe(ns, next, &ns_list, list)
4596	nvme_ns_remove(ns);
4597	}
4598	EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4599
4600	static int nvme_class_uevent(const struct device dev, struct* kobj_uevent_env *env)
4601	{
4602	const struct nvme_ctrl *ctrl =
4603	container_of(dev, struct nvme_ctrl, ctrl_device);
4604	struct nvmf_ctrl_options *opts = ctrl->opts;
4605	int ret;
4606
4607	ret = add_uevent_var(env, format: "NVME_TRTYPE=%s", ctrl->ops->name);
4608	if (ret)
4609	return ret;
4610
4611	if (opts) {
4612	ret = add_uevent_var(env, format: "NVME_TRADDR=%s", opts->traddr);
4613	if (ret)
4614	return ret;
4615
4616	ret = add_uevent_var(env, format: "NVME_TRSVCID=%s",
4617	opts->trsvcid ?: "none");
4618	if (ret)
4619	return ret;
4620
4621	ret = add_uevent_var(env, format: "NVME_HOST_TRADDR=%s",
4622	opts->host_traddr ?: "none");
4623	if (ret)
4624	return ret;
4625
4626	ret = add_uevent_var(env, format: "NVME_HOST_IFACE=%s",
4627	opts->host_iface ?: "none");
4628	}
4629	return ret;
4630	}
4631
4632	static void nvme_change_uevent(struct nvme_ctrl ctrl, char* *envdata)
4633	{
4634	char *envp[`2`] = { envdata, NULL };
4635
4636	kobject_uevent_env(kobj: &ctrl->device->kobj, action: KOBJ_CHANGE, envp);
4637	}
4638
4639	static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4640	{
4641	char *envp[`2`] = { NULL, NULL };
4642	u32 aen_result = ctrl->aen_result;
4643
4644	ctrl->aen_result = `0`;
4645	if (!aen_result)
4646	return;
4647
4648	envp[`0`] = kasprintf(GFP_KERNEL, fmt: "NVME_AEN=%#08x", aen_result);
4649	if (!envp[`0`])
4650	return;
4651	kobject_uevent_env(kobj: &ctrl->device->kobj, action: KOBJ_CHANGE, envp);
4652	kfree(objp: envp[`0`]);
4653	}
4654
4655	static void nvme_async_event_work(struct work_struct *work)
4656	{
4657	struct nvme_ctrl *ctrl =
4658	container_of(work, struct nvme_ctrl, async_event_work);
4659
4660	nvme_aen_uevent(ctrl);
4661
4662	/*
4663	* The transport drivers must guarantee AER submission here is safe by
4664	* flushing ctrl async_event_work after changing the controller state
4665	* from LIVE and before freeing the admin queue.
4666	*/
4667	if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
4668	ctrl->ops->submit_async_event(ctrl);
4669	}
4670
4671	static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4672	{
4673
4674	u32 csts;
4675
4676	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4677	return false;
4678
4679	if (csts == ~`0`)
4680	return false;
4681
4682	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4683	}
4684
4685	static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4686	{
4687	struct nvme_fw_slot_info_log *log;
4688	u8 next_fw_slot, cur_fw_slot;
4689
4690	log = kmalloc(sizeof(*log), GFP_KERNEL);
4691	if (!log)
4692	return;
4693
4694	if (nvme_get_log(ctrl, NVME_NSID_ALL, log_page: NVME_LOG_FW_SLOT, lsp: `0`, csi: NVME_CSI_NVM,
4695	log, size: sizeof(*log), offset: `0`)) {
4696	dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4697	goto out_free_log;
4698	}
4699
4700	cur_fw_slot = log->afi & `0x7`;
4701	next_fw_slot = (log->afi & `0x70`) >> `4`;
4702	if (!cur_fw_slot \|\| (next_fw_slot && (cur_fw_slot != next_fw_slot))) {
4703	dev_info(ctrl->device,
4704	"Firmware is activated after next Controller Level Reset\n");
4705	goto out_free_log;
4706	}
4707
4708	memcpy(ctrl->subsys->firmware_rev, &log->frs[cur_fw_slot - `1`],
4709	sizeof(ctrl->subsys->firmware_rev));
4710
4711	out_free_log:
4712	kfree(objp: log);
4713	}
4714
4715	static void nvme_fw_act_work(struct work_struct *work)
4716	{
4717	struct nvme_ctrl *ctrl = container_of(work,
4718	struct nvme_ctrl, fw_act_work);
4719	unsigned long fw_act_timeout;
4720
4721	nvme_auth_stop(ctrl);
4722
4723	if (ctrl->mtfa)
4724	fw_act_timeout = jiffies + msecs_to_jiffies(m: ctrl->mtfa * `100`);
4725	else
4726	fw_act_timeout = jiffies + secs_to_jiffies(admin_timeout);
4727
4728	nvme_quiesce_io_queues(ctrl);
4729	while (nvme_ctrl_pp_status(ctrl)) {
4730	if (time_after(jiffies, fw_act_timeout)) {
4731	dev_warn(ctrl->device,
4732	"Fw activation timeout, reset controller\n");
4733	nvme_try_sched_reset(ctrl);
4734	return;
4735	}
4736	msleep(msecs: `100`);
4737	}
4738
4739	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING) \|\|
4740	!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4741	return;
4742
4743	nvme_unquiesce_io_queues(ctrl);
4744	/ read FW slot information to clear the AER /
4745	nvme_get_fw_slot_info(ctrl);
4746
4747	queue_work(wq: nvme_wq, work: &ctrl->async_event_work);
4748	}
4749
/* Event type of an asynchronous event result: bits 2:0 of @result. */
static u32 nvme_aer_type(u32 result)
{
	return result & 0x7;
}
4754
/* Event subtype of an asynchronous event result: bits 15:8 of @result. */
static u32 nvme_aer_subtype(u32 result)
{
	return (result & 0xff00) >> 8;
}
4759
4760	static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4761	{
4762	u32 aer_notice_type = nvme_aer_subtype(result);
4763	bool requeue = true;
4764
4765	switch (aer_notice_type) {
4766	case NVME_AER_NOTICE_NS_CHANGED:
4767	set_bit(nr: NVME_AER_NOTICE_NS_CHANGED, addr: &ctrl->events);
4768	nvme_queue_scan(ctrl);
4769	break;
4770	case NVME_AER_NOTICE_FW_ACT_STARTING:
4771	/*
4772	* We are (ab)using the RESETTING state to prevent subsequent
4773	* recovery actions from interfering with the controller's
4774	* firmware activation.
4775	*/
4776	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
4777	requeue = false;
4778	queue_work(wq: nvme_wq, work: &ctrl->fw_act_work);
4779	}
4780	break;
4781	#ifdef CONFIG_NVME_MULTIPATH
4782	case NVME_AER_NOTICE_ANA:
4783	if (!ctrl->ana_log_buf)
4784	break;
4785	queue_work(wq: nvme_wq, work: &ctrl->ana_work);
4786	break;
4787	#endif
4788	case NVME_AER_NOTICE_DISC_CHANGED:
4789	ctrl->aen_result = result;
4790	break;
4791	default:
4792	dev_warn(ctrl->device, "async event result %08x\n", result);
4793	}
4794	return requeue;
4795	}
4796
4797	static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
4798	{
4799	dev_warn(ctrl->device,
4800	"resetting controller due to persistent internal error\n");
4801	nvme_reset_ctrl(ctrl);
4802	}
4803
4804	void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4805	volatile union nvme_result *res)
4806	{
4807	u32 result = le32_to_cpu(res->u32);
4808	u32 aer_type = nvme_aer_type(result);
4809	u32 aer_subtype = nvme_aer_subtype(result);
4810	bool requeue = true;
4811
4812	if (le16_to_cpu(status) >> `1` != NVME_SC_SUCCESS)
4813	return;
4814
4815	trace_nvme_async_event(ctrl, result);
4816	switch (aer_type) {
4817	case NVME_AER_NOTICE:
4818	requeue = nvme_handle_aen_notice(ctrl, result);
4819	break;
4820	case NVME_AER_ERROR:
4821	/*
4822	* For a persistent internal error, don't run async_event_work
4823	* to submit a new AER. The controller reset will do it.
4824	*/
4825	if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
4826	nvme_handle_aer_persistent_error(ctrl);
4827	return;
4828	}
4829	fallthrough;
4830	case NVME_AER_SMART:
4831	case NVME_AER_CSS:
4832	case NVME_AER_VS:
4833	ctrl->aen_result = result;
4834	break;
4835	default:
4836	break;
4837	}
4838
4839	if (requeue)
4840	queue_work(wq: nvme_wq, work: &ctrl->async_event_work);
4841	}
4842	EXPORT_SYMBOL_GPL(nvme_complete_async_event);
4843
4844	int nvme_alloc_admin_tag_set(struct nvme_ctrl ctrl, struct* blk_mq_tag_set *set,
4845	const struct blk_mq_ops ops, unsigned* int cmd_size)
4846	{
4847	struct queue_limits lim = {};
4848	int ret;
4849
4850	memset(set, `0`, sizeof(*set));
4851	set->ops = ops;
4852	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
4853	if (ctrl->ops->flags & NVME_F_FABRICS)
4854	/ Reserved for fabric connect and keep alive /
4855	set->reserved_tags = `2`;
4856	set->numa_node = ctrl->numa_node;
4857	if (ctrl->ops->flags & NVME_F_BLOCKING)
4858	set->flags \|= BLK_MQ_F_BLOCKING;
4859	set->cmd_size = cmd_size;
4860	set->driver_data = ctrl;
4861	set->nr_hw_queues = `1`;
4862	set->timeout = NVME_ADMIN_TIMEOUT;
4863	ret = blk_mq_alloc_tag_set(set);
4864	if (ret)
4865	return ret;
4866
4867	ctrl->admin_q = blk_mq_alloc_queue(set, lim: &lim, NULL);
4868	if (IS_ERR(ptr: ctrl->admin_q)) {
4869	ret = PTR_ERR(ptr: ctrl->admin_q);
4870	goto out_free_tagset;
4871	}
4872
4873	if (ctrl->ops->flags & NVME_F_FABRICS) {
4874	ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
4875	if (IS_ERR(ptr: ctrl->fabrics_q)) {
4876	ret = PTR_ERR(ptr: ctrl->fabrics_q);
4877	goto out_cleanup_admin_q;
4878	}
4879	}
4880
4881	ctrl->admin_tagset = set;
4882	return `0`;
4883
4884	out_cleanup_admin_q:
4885	blk_mq_destroy_queue(ctrl->admin_q);
4886	blk_put_queue(ctrl->admin_q);
4887	out_free_tagset:
4888	blk_mq_free_tag_set(set);
4889	ctrl->admin_q = NULL;
4890	ctrl->fabrics_q = NULL;
4891	return ret;
4892	}
4893	EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
4894
4895	void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
4896	{
4897	/*
4898	* As we're about to destroy the queue and free tagset
4899	* we can not have keep-alive work running.
4900	*/
4901	nvme_stop_keep_alive(ctrl);
4902	blk_mq_destroy_queue(ctrl->admin_q);
4903	if (ctrl->ops->flags & NVME_F_FABRICS) {
4904	blk_mq_destroy_queue(ctrl->fabrics_q);
4905	blk_put_queue(ctrl->fabrics_q);
4906	}
4907	blk_mq_free_tag_set(set: ctrl->admin_tagset);
4908	}
4909	EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
4910
4911	int nvme_alloc_io_tag_set(struct nvme_ctrl ctrl, struct* blk_mq_tag_set *set,
4912	const struct blk_mq_ops ops, unsigned* int nr_maps,
4913	unsigned int cmd_size)
4914	{
4915	int ret;
4916
4917	memset(set, `0`, sizeof(*set));
4918	set->ops = ops;
4919	set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - `1`);
4920	/*
4921	* Some Apple controllers requires tags to be unique across admin and
4922	* the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
4923	*/
4924	if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
4925	set->reserved_tags = NVME_AQ_DEPTH;
4926	else if (ctrl->ops->flags & NVME_F_FABRICS)
4927	/ Reserved for fabric connect /
4928	set->reserved_tags = `1`;
4929	set->numa_node = ctrl->numa_node;
4930	if (ctrl->ops->flags & NVME_F_BLOCKING)
4931	set->flags \|= BLK_MQ_F_BLOCKING;
4932	set->cmd_size = cmd_size;
4933	set->driver_data = ctrl;
4934	set->nr_hw_queues = ctrl->queue_count - `1`;
4935	set->timeout = NVME_IO_TIMEOUT;
4936	set->nr_maps = nr_maps;
4937	ret = blk_mq_alloc_tag_set(set);
4938	if (ret)
4939	return ret;
4940
4941	if (ctrl->ops->flags & NVME_F_FABRICS) {
4942	struct queue_limits lim = {
4943	.features = BLK_FEAT_SKIP_TAGSET_QUIESCE,
4944	};
4945
4946	ctrl->connect_q = blk_mq_alloc_queue(set, lim: &lim, NULL);
4947	if (IS_ERR(ptr: ctrl->connect_q)) {
4948	ret = PTR_ERR(ptr: ctrl->connect_q);
4949	goto out_free_tag_set;
4950	}
4951	}
4952
4953	ctrl->tagset = set;
4954	return `0`;
4955
4956	out_free_tag_set:
4957	blk_mq_free_tag_set(set);
4958	ctrl->connect_q = NULL;
4959	return ret;
4960	}
4961	EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);
4962
4963	void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
4964	{
4965	if (ctrl->ops->flags & NVME_F_FABRICS) {
4966	blk_mq_destroy_queue(ctrl->connect_q);
4967	blk_put_queue(ctrl->connect_q);
4968	}
4969	blk_mq_free_tag_set(set: ctrl->tagset);
4970	}
4971	EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
4972
4973	void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
4974	{
4975	nvme_mpath_stop(ctrl);
4976	nvme_auth_stop(ctrl);
4977	nvme_stop_failfast_work(ctrl);
4978	flush_work(work: &ctrl->async_event_work);
4979	cancel_work_sync(work: &ctrl->fw_act_work);
4980	if (ctrl->ops->stop_ctrl)
4981	ctrl->ops->stop_ctrl(ctrl);
4982	}
4983	EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
4984
4985	void nvme_start_ctrl(struct nvme_ctrl *ctrl)
4986	{
4987	nvme_enable_aen(ctrl);
4988
4989	/*
4990	* persistent discovery controllers need to send indication to userspace
4991	* to re-read the discovery log page to learn about possible changes
4992	* that were missed. We identify persistent discovery controllers by
4993	* checking that they started once before, hence are reconnecting back.
4994	*/
4995	if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
4996	nvme_discovery_ctrl(ctrl)) {
4997	if (!ctrl->kato) {
4998	nvme_stop_keep_alive(ctrl);
4999	ctrl->kato = NVME_DEFAULT_KATO;
5000	nvme_start_keep_alive(ctrl);
5001	}
5002	nvme_change_uevent(ctrl, envdata: "NVME_EVENT=rediscover");
5003	}
5004
5005	if (ctrl->queue_count > `1`) {
5006	nvme_queue_scan(ctrl);
5007	nvme_unquiesce_io_queues(ctrl);
5008	nvme_mpath_update(ctrl);
5009	}
5010
5011	nvme_change_uevent(ctrl, envdata: "NVME_EVENT=connected");
5012	set_bit(nr: NVME_CTRL_STARTED_ONCE, addr: &ctrl->flags);
5013	}
5014	EXPORT_SYMBOL_GPL(nvme_start_ctrl);
5015
5016	void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
5017	{
5018	nvme_stop_keep_alive(ctrl);
5019	nvme_hwmon_exit(ctrl);
5020	nvme_fault_inject_fini(fault_inject: &ctrl->fault_inject);
5021	dev_pm_qos_hide_latency_tolerance(dev: ctrl->device);
5022	cdev_device_del(cdev: &ctrl->cdev, dev: ctrl->device);
5023	nvme_put_ctrl(ctrl);
5024	}
5025	EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
5026
5027	static void nvme_free_cels(struct nvme_ctrl *ctrl)
5028	{
5029	struct nvme_effects_log *cel;
5030	unsigned long i;
5031
5032	xa_for_each(&ctrl->cels, i, cel) {
5033	xa_erase(&ctrl->cels, index: i);
5034	kfree(objp: cel);
5035	}
5036
5037	xa_destroy(&ctrl->cels);
5038	}
5039
5040	static void nvme_free_ctrl(struct device *dev)
5041	{
5042	struct nvme_ctrl *ctrl =
5043	container_of(dev, struct nvme_ctrl, ctrl_device);
5044	struct nvme_subsystem *subsys = ctrl->subsys;
5045
5046	if (ctrl->admin_q)
5047	blk_put_queue(ctrl->admin_q);
5048	if (!subsys \|\| ctrl->instance != subsys->instance)
5049	ida_free(&nvme_instance_ida, id: ctrl->instance);
5050	nvme_free_cels(ctrl);
5051	nvme_mpath_uninit(ctrl);
5052	cleanup_srcu_struct(ssp: &ctrl->srcu);
5053	nvme_auth_stop(ctrl);
5054	nvme_auth_free(ctrl);
5055	__free_page(ctrl->discard_page);
5056	free_opal_dev(dev: ctrl->opal_dev);
5057
5058	if (subsys) {
5059	mutex_lock(&nvme_subsystems_lock);
5060	list_del(entry: &ctrl->subsys_entry);
5061	sysfs_remove_link(kobj: &subsys->dev.kobj, name: dev_name(dev: ctrl->device));
5062	mutex_unlock(lock: &nvme_subsystems_lock);
5063	}
5064
5065	ctrl->ops->free_ctrl(ctrl);
5066
5067	if (subsys)
5068	nvme_put_subsystem(subsys);
5069	}
5070
5071	/*
5072	* Initialize a NVMe controller structures. This needs to be called during
5073	* earliest initialization so that we have the initialized structured around
5074	* during probing.
5075	*
5076	* On success, the caller must use the nvme_put_ctrl() to release this when
5077	* needed, which also invokes the ops->free_ctrl() callback.
5078	*/
5079	int nvme_init_ctrl(struct nvme_ctrl ctrl, struct* device *dev,
5080	const struct nvme_ctrl_ops ops, unsigned* long quirks)
5081	{
5082	int ret;
5083
5084	WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
5085	ctrl->passthru_err_log_enabled = false;
5086	clear_bit(nr: NVME_CTRL_FAILFAST_EXPIRED, addr: &ctrl->flags);
5087	spin_lock_init(&ctrl->lock);
5088	mutex_init(&ctrl->namespaces_lock);
5089
5090	ret = init_srcu_struct(&ctrl->srcu);
5091	if (ret)
5092	return ret;
5093
5094	mutex_init(&ctrl->scan_lock);
5095	INIT_LIST_HEAD(list: &ctrl->namespaces);
5096	xa_init(xa: &ctrl->cels);
5097	ctrl->dev = dev;
5098	ctrl->ops = ops;
5099	ctrl->quirks = quirks;
5100	ctrl->numa_node = NUMA_NO_NODE;
5101	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
5102	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
5103	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
5104	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
5105	init_waitqueue_head(&ctrl->state_wq);
5106
5107	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
5108	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
5109	memset(&ctrl->ka_cmd, `0`, sizeof(ctrl->ka_cmd));
5110	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
5111	ctrl->ka_last_check_time = jiffies;
5112
5113	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
5114	PAGE_SIZE);
5115	ctrl->discard_page = alloc_page(GFP_KERNEL);
5116	if (!ctrl->discard_page) {
5117	ret = -ENOMEM;
5118	goto out;
5119	}
5120
5121	ret = ida_alloc(ida: &nvme_instance_ida, GFP_KERNEL);
5122	if (ret < `0`)
5123	goto out;
5124	ctrl->instance = ret;
5125
5126	ret = nvme_auth_init_ctrl(ctrl);
5127	if (ret)
5128	goto out_release_instance;
5129
5130	nvme_mpath_init_ctrl(ctrl);
5131
5132	device_initialize(dev: &ctrl->ctrl_device);
5133	ctrl->device = &ctrl->ctrl_device;
5134	ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
5135	ctrl->instance);
5136	ctrl->device->class = &nvme_class;
5137	ctrl->device->parent = ctrl->dev;
5138	if (ops->dev_attr_groups)
5139	ctrl->device->groups = ops->dev_attr_groups;
5140	else
5141	ctrl->device->groups = nvme_dev_attr_groups;
5142	ctrl->device->release = nvme_free_ctrl;
5143	dev_set_drvdata(dev: ctrl->device, data: ctrl);
5144
5145	return ret;
5146
5147	out_release_instance:
5148	ida_free(&nvme_instance_ida, id: ctrl->instance);
5149	out:
5150	if (ctrl->discard_page)
5151	__free_page(ctrl->discard_page);
5152	cleanup_srcu_struct(ssp: &ctrl->srcu);
5153	return ret;
5154	}
5155	EXPORT_SYMBOL_GPL(nvme_init_ctrl);
5156
5157	/*
5158	* On success, returns with an elevated controller reference and caller must
5159	* use nvme_uninit_ctrl() to properly free resources associated with the ctrl.
5160	*/
5161	int nvme_add_ctrl(struct nvme_ctrl *ctrl)
5162	{
5163	int ret;
5164
5165	ret = dev_set_name(dev: ctrl->device, name: "nvme%d", ctrl->instance);
5166	if (ret)
5167	return ret;
5168
5169	cdev_init(&ctrl->cdev, &nvme_dev_fops);
5170	ctrl->cdev.owner = ctrl->ops->module;
5171	ret = cdev_device_add(cdev: &ctrl->cdev, dev: ctrl->device);
5172	if (ret)
5173	return ret;
5174
5175	/*
5176	* Initialize latency tolerance controls. The sysfs files won't
5177	* be visible to userspace unless the device actually supports APST.
5178	*/
5179	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
5180	dev_pm_qos_update_user_latency_tolerance(dev: ctrl->device,
5181	min(default_ps_max_latency_us, (unsigned long)S32_MAX));
5182
5183	nvme_fault_inject_init(fault_inj: &ctrl->fault_inject, dev_name: dev_name(dev: ctrl->device));
5184	nvme_get_ctrl(ctrl);
5185
5186	return `0`;
5187	}
5188	EXPORT_SYMBOL_GPL(nvme_add_ctrl);
5189
5190	/ let I/O to all namespaces fail in preparation for surprise removal /
5191	void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
5192	{
5193	struct nvme_ns *ns;
5194	int srcu_idx;
5195
5196	srcu_idx = srcu_read_lock(ssp: &ctrl->srcu);
5197	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5198	srcu_read_lock_held(&ctrl->srcu))
5199	blk_mark_disk_dead(disk: ns->disk);
5200	srcu_read_unlock(ssp: &ctrl->srcu, idx: srcu_idx);
5201	}
5202	EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
5203
5204	void nvme_unfreeze(struct nvme_ctrl *ctrl)
5205	{
5206	struct nvme_ns *ns;
5207	int srcu_idx;
5208
5209	srcu_idx = srcu_read_lock(ssp: &ctrl->srcu);
5210	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5211	srcu_read_lock_held(&ctrl->srcu))
5212	blk_mq_unfreeze_queue_non_owner(q: ns->queue);
5213	srcu_read_unlock(ssp: &ctrl->srcu, idx: srcu_idx);
5214	clear_bit(nr: NVME_CTRL_FROZEN, addr: &ctrl->flags);
5215	}
5216	EXPORT_SYMBOL_GPL(nvme_unfreeze);
5217
5218	int nvme_wait_freeze_timeout(struct nvme_ctrl ctrl, long* timeout)
5219	{
5220	struct nvme_ns *ns;
5221	int srcu_idx;
5222
5223	srcu_idx = srcu_read_lock(ssp: &ctrl->srcu);
5224	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5225	srcu_read_lock_held(&ctrl->srcu)) {
5226	timeout = blk_mq_freeze_queue_wait_timeout(q: ns->queue, timeout);
5227	if (timeout <= `0`)
5228	break;
5229	}
5230	srcu_read_unlock(ssp: &ctrl->srcu, idx: srcu_idx);
5231	return timeout;
5232	}
5233	EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
5234
5235	void nvme_wait_freeze(struct nvme_ctrl *ctrl)
5236	{
5237	struct nvme_ns *ns;
5238	int srcu_idx;
5239
5240	srcu_idx = srcu_read_lock(ssp: &ctrl->srcu);
5241	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5242	srcu_read_lock_held(&ctrl->srcu))
5243	blk_mq_freeze_queue_wait(q: ns->queue);
5244	srcu_read_unlock(ssp: &ctrl->srcu, idx: srcu_idx);
5245	}
5246	EXPORT_SYMBOL_GPL(nvme_wait_freeze);
5247
5248	void nvme_start_freeze(struct nvme_ctrl *ctrl)
5249	{
5250	struct nvme_ns *ns;
5251	int srcu_idx;
5252
5253	set_bit(nr: NVME_CTRL_FROZEN, addr: &ctrl->flags);
5254	srcu_idx = srcu_read_lock(ssp: &ctrl->srcu);
5255	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5256	srcu_read_lock_held(&ctrl->srcu))
5257	/*
5258	* Typical non_owner use case is from pci driver, in which
5259	* start_freeze is called from timeout work function, but
5260	* unfreeze is done in reset work context
5261	*/
5262	blk_freeze_queue_start_non_owner(q: ns->queue);
5263	srcu_read_unlock(ssp: &ctrl->srcu, idx: srcu_idx);
5264	}
5265	EXPORT_SYMBOL_GPL(nvme_start_freeze);
5266
5267	void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
5268	{
5269	if (!ctrl->tagset)
5270	return;
5271	if (!test_and_set_bit(nr: NVME_CTRL_STOPPED, addr: &ctrl->flags))
5272	blk_mq_quiesce_tagset(set: ctrl->tagset);
5273	else
5274	blk_mq_wait_quiesce_done(set: ctrl->tagset);
5275	}
5276	EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);
5277
5278	void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
5279	{
5280	if (!ctrl->tagset)
5281	return;
5282	if (test_and_clear_bit(nr: NVME_CTRL_STOPPED, addr: &ctrl->flags))
5283	blk_mq_unquiesce_tagset(set: ctrl->tagset);
5284	}
5285	EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);
5286
5287	void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
5288	{
5289	if (!test_and_set_bit(nr: NVME_CTRL_ADMIN_Q_STOPPED, addr: &ctrl->flags))
5290	blk_mq_quiesce_queue(q: ctrl->admin_q);
5291	else
5292	blk_mq_wait_quiesce_done(set: ctrl->admin_q->tag_set);
5293	}
5294	EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);
5295
5296	void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
5297	{
5298	if (test_and_clear_bit(nr: NVME_CTRL_ADMIN_Q_STOPPED, addr: &ctrl->flags))
5299	blk_mq_unquiesce_queue(q: ctrl->admin_q);
5300	}
5301	EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
5302
5303	void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
5304	{
5305	struct nvme_ns *ns;
5306	int srcu_idx;
5307
5308	srcu_idx = srcu_read_lock(ssp: &ctrl->srcu);
5309	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5310	srcu_read_lock_held(&ctrl->srcu))
5311	blk_sync_queue(q: ns->queue);
5312	srcu_read_unlock(ssp: &ctrl->srcu, idx: srcu_idx);
5313	}
5314	EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
5315
5316	void nvme_sync_queues(struct nvme_ctrl *ctrl)
5317	{
5318	nvme_sync_io_queues(ctrl);
5319	if (ctrl->admin_q)
5320	blk_sync_queue(q: ctrl->admin_q);
5321	}
5322	EXPORT_SYMBOL_GPL(nvme_sync_queues);
5323
5324	struct nvme_ctrl nvme_ctrl_from_file(struct* file *file)
5325	{
5326	if (file->f_op != &nvme_dev_fops)
5327	return NULL;
5328	return file->private_data;
5329	}
5330	EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, "NVME_TARGET_PASSTHRU");
5331
5332	/*
5333	* Check we didn't inadvertently grow the command structure sizes:
5334	*/
5335	static inline void _nvme_check_size(void)
5336	{
5337	BUILD_BUG_ON(sizeof(struct nvme_common_command) != `64`);
5338	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != `64`);
5339	BUILD_BUG_ON(sizeof(struct nvme_identify) != `64`);
5340	BUILD_BUG_ON(sizeof(struct nvme_features) != `64`);
5341	BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != `64`);
5342	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != `64`);
5343	BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != `64`);
5344	BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != `64`);
5345	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != `64`);
5346	BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != `64`);
5347	BUILD_BUG_ON(sizeof(struct nvme_command) != `64`);
5348	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
5349	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
5350	BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
5351	NVME_IDENTIFY_DATA_SIZE);
5352	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
5353	BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
5354	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
5355	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
5356	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != `64`);
5357	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != `512`);
5358	BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != `512`);
5359	BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != `512`);
5360	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != `64`);
5361	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != `64`);
5362	BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != `512`);
5363	}
5364
5365
5366	static int __init nvme_core_init(void)
5367	{
5368	unsigned int wq_flags = WQ_UNBOUND \| WQ_MEM_RECLAIM \| WQ_SYSFS;
5369	int result = -ENOMEM;
5370
5371	_nvme_check_size();
5372
5373	nvme_wq = alloc_workqueue("nvme-wq", wq_flags, `0`);
5374	if (!nvme_wq)
5375	goto out;
5376
5377	nvme_reset_wq = alloc_workqueue("nvme-reset-wq", wq_flags, `0`);
5378	if (!nvme_reset_wq)
5379	goto destroy_wq;
5380
5381	nvme_delete_wq = alloc_workqueue("nvme-delete-wq", wq_flags, `0`);
5382	if (!nvme_delete_wq)
5383	goto destroy_reset_wq;
5384
5385	result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, `0`,
5386	NVME_MINORS, "nvme");
5387	if (result < `0`)
5388	goto destroy_delete_wq;
5389
5390	result = class_register(class: &nvme_class);
5391	if (result)
5392	goto unregister_chrdev;
5393
5394	result = class_register(class: &nvme_subsys_class);
5395	if (result)
5396	goto destroy_class;
5397
5398	result = alloc_chrdev_region(&nvme_ns_chr_devt, `0`, NVME_MINORS,
5399	"nvme-generic");
5400	if (result < `0`)
5401	goto destroy_subsys_class;
5402
5403	result = class_register(class: &nvme_ns_chr_class);
5404	if (result)
5405	goto unregister_generic_ns;
5406
5407	result = nvme_init_auth();
5408	if (result)
5409	goto destroy_ns_chr;
5410	return `0`;
5411
5412	destroy_ns_chr:
5413	class_unregister(class: &nvme_ns_chr_class);
5414	unregister_generic_ns:
5415	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5416	destroy_subsys_class:
5417	class_unregister(class: &nvme_subsys_class);
5418	destroy_class:
5419	class_unregister(class: &nvme_class);
5420	unregister_chrdev:
5421	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5422	destroy_delete_wq:
5423	destroy_workqueue(wq: nvme_delete_wq);
5424	destroy_reset_wq:
5425	destroy_workqueue(wq: nvme_reset_wq);
5426	destroy_wq:
5427	destroy_workqueue(wq: nvme_wq);
5428	out:
5429	return result;
5430	}
5431
5432	static void __exit nvme_core_exit(void)
5433	{
5434	nvme_exit_auth();
5435	class_unregister(class: &nvme_ns_chr_class);
5436	class_unregister(class: &nvme_subsys_class);
5437	class_unregister(class: &nvme_class);
5438	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5439	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5440	destroy_workqueue(wq: nvme_delete_wq);
5441	destroy_workqueue(wq: nvme_reset_wq);
5442	destroy_workqueue(wq: nvme_wq);
5443	ida_destroy(ida: &nvme_ns_chr_minor_ida);
5444	ida_destroy(ida: &nvme_instance_ida);
5445	}
5446
5447	MODULE_LICENSE("GPL");
5448	MODULE_VERSION("1.0");
5449	MODULE_DESCRIPTION("NVMe host core framework");
5450	module_init(nvme_core_init);
5451	module_exit(nvme_core_exit);
5452

/* source code of linux/drivers/nvme/host/core.c */