// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2011-2014, Intel Corporation.
 * Copyright (c) 2017-2021 Christoph Hellwig.
 */
#include <linux/ptrace.h>	/* for force_successful_syscall_return */
#include <linux/nvme_ioctl.h>
#include <linux/io_uring/cmd.h>
#include "nvme.h"

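/*
 * Flags describing how a passthrough command was issued, passed down to
 * nvme_cmd_allowed() and the buffer mapping helpers.
 */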
enum {
	NVME_IOCTL_VEC		= (1 << 0),
	NVME_IOCTL_PARTITION	= (1 << 1),
};

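/*
 * Decide whether an unprivileged caller may issue a given passthrough
 * command.  Everything that is not explicitly allowed below falls through
 * to the admin label and requires CAP_SYS_ADMIN.
 */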
static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
		unsigned int flags, bool open_for_write)
{
	u32 effects;

	/*
	 * Do not allow unprivileged passthrough on partitions, as that allows an
	 * escape from the containment of the partition.
	 */
	if (flags & NVME_IOCTL_PARTITION)
		goto admin;

	/*
	 * Do not allow unprivileged processes to send vendor specific or fabrics
	 * commands as we can't be sure about their effects.
	 */
	if (c->common.opcode >= nvme_cmd_vendor_start ||
			c->common.opcode == nvme_fabrics_command)
		goto admin;

	/*
	 * Do not allow unprivileged passthrough of admin commands except
	 * for a subset of identify commands that contain information required
	 * to form proper I/O commands in userspace and do not expose any
	 * potentially sensitive information.
	 */
	if (!ns) {
		if (c->common.opcode == nvme_admin_identify) {
			switch (c->identify.cns) {
			case NVME_ID_CNS_NS:
			case NVME_ID_CNS_CS_NS:
			case NVME_ID_CNS_NS_CS_INDEP:
			case NVME_ID_CNS_CS_CTRL:
			case NVME_ID_CNS_CTRL:
				return true;
			}
		}
		goto admin;
	}

	/*
	 * Check if the controller provides a Commands Supported and Effects log
	 * and marks this command as supported.  If not, reject unprivileged
	 * passthrough.
	 */
	effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode);
	if (!(effects & NVME_CMD_EFFECTS_CSUPP))
		goto admin;

	/*
	 * Don't allow passthrough for commands that have intrusive (or unknown)
	 * effects.
	 */
	if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
			NVME_CMD_EFFECTS_UUID_SEL |
			NVME_CMD_EFFECTS_SCOPE_MASK))
		goto admin;

	/*
	 * Only allow I/O commands that transfer data to the controller or that
	 * change the logical block contents if the file descriptor is open for
	 * writing.
	 */
	if ((nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) &&
	    !open_for_write)
		goto admin;

	return true;
admin:
	return capable(CAP_SYS_ADMIN);
}

/*
 * Convert integer values from ioctl structures to user pointers, silently
 * ignoring the upper bits in the compat case to match behaviour of 32-bit
 * kernels.
 */
static void __user *nvme_to_user_ptr(uintptr_t ptrval)
{
	if (in_compat_syscall())
		ptrval = (compat_uptr_t)ptrval;
	return (void __user *)ptrval;
}

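/*
 * Allocate a passthrough request and flag it with NVME_REQ_USERCMD so the
 * rest of the driver can treat user-submitted commands specially.
 */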
static struct request *nvme_alloc_user_request(struct request_queue *q,
		struct nvme_command *cmd, blk_opf_t rq_flags,
		blk_mq_req_flags_t blk_flags)
{
	struct request *req;

	req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags);
	if (IS_ERR(req))
		return req;
	nvme_init_request(req, cmd);
	nvme_req(req)->flags |= NVME_REQ_USERCMD;
	return req;
}

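/*
 * Map the user data buffer (and optional metadata buffer) into the request.
 * On failure the request itself is freed, so callers must not free it again.
 */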
static int nvme_map_user_request(struct request *req, u64 ubuffer,
		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
		u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags)
{
	struct request_queue *q = req->q;
	struct nvme_ns *ns = q->queuedata;
	struct block_device *bdev = ns ? ns->disk->part0 : NULL;
	struct bio *bio = NULL;
	int ret;

	if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
		struct iov_iter iter;

		/* fixedbufs is only for non-vectored io */
		if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC))
			return -EINVAL;
		ret = io_uring_cmd_import_fixed(ubuffer, bufflen,
				rq_data_dir(req), &iter, ioucmd);
		if (ret < 0)
			goto out;
		ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL);
	} else {
		ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer),
				bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0,
				0, rq_data_dir(req));
	}

	if (ret)
		goto out;

	bio = req->bio;
	if (bdev) {
		bio_set_dev(bio, bdev);
		if (meta_buffer && meta_len) {
			ret = bio_integrity_map_user(bio, meta_buffer, meta_len,
						     meta_seed);
			if (ret)
				goto out_unmap;
			req->cmd_flags |= REQ_INTEGRITY;
		}
	}

	return ret;

out_unmap:
	if (bio)
		blk_rq_unmap_user(bio);
out:
	blk_mq_free_request(req);
	return ret;
}

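/*
 * Synchronous passthrough path used by the ioctl handlers: allocate, map,
 * execute and clean up the request, returning the NVMe status (or negative
 * errno) and the 64-bit command-specific result.
 */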
static int nvme_submit_user_cmd(struct request_queue *q,
		struct nvme_command *cmd, u64 ubuffer, unsigned bufflen,
		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
		u64 *result, unsigned timeout, unsigned int flags)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_ctrl *ctrl;
	struct request *req;
	struct bio *bio;
	u32 effects;
	int ret;

	req = nvme_alloc_user_request(q, cmd, 0, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout;
	if (ubuffer && bufflen) {
		ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
				meta_len, meta_seed, NULL, flags);
		if (ret)
			return ret;
	}

	bio = req->bio;
	ctrl = nvme_req(req)->ctrl;

	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
	ret = nvme_execute_rq(req, false);
	if (result)
		*result = le64_to_cpu(nvme_req(req)->result.u64);
	if (bio)
		blk_rq_unmap_user(bio);
	blk_mq_free_request(req);

	if (effects)
		nvme_passthru_end(ctrl, ns, effects, cmd, ret);

	return ret;
}

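/* Handler for the legacy NVME_IOCTL_SUBMIT_IO read/write/compare interface. */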
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	void __user *metadata;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	if (io.flags)
		return -EINVAL;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		break;
	default:
		return -EINVAL;
	}

	length = (io.nblocks + 1) << ns->head->lba_shift;

	if ((io.control & NVME_RW_PRINFO_PRACT) &&
	    (ns->head->ms == ns->head->pi_size)) {
		/*
		 * Protection information is stripped/inserted by the
		 * controller.
		 */
		if (nvme_to_user_ptr(io.metadata))
			return -EINVAL;
		meta_len = 0;
		metadata = NULL;
	} else {
		meta_len = (io.nblocks + 1) * ns->head->ms;
		metadata = nvme_to_user_ptr(io.metadata);
	}

	if (ns->head->features & NVME_NS_EXT_LBAS) {
		length += meta_len;
		meta_len = 0;
	} else if (meta_len) {
		if ((io.metadata & 3) || !io.metadata)
			return -EINVAL;
	}

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->head->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);

	return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata,
			meta_len, lower_32_bits(io.slba), NULL, 0, 0);
}

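/*
 * Commands issued through a namespace node must address that namespace;
 * reject anything with a mismatching NSID.
 */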
static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
		struct nvme_ns *ns, __u32 nsid)
{
	if (ns && nsid != ns->head->ns_id) {
		dev_err(ctrl->device,
			"%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n",
			current->comm, nsid, ns->head->ns_id);
		return false;
	}

	return true;
}

static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
		struct nvme_passthru_cmd __user *ucmd, unsigned int flags,
		bool open_for_write)
{
	struct nvme_passthru_cmd cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	u64 result;
	int status;

	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;
	if (cmd.flags)
		return -EINVAL;
	if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid))
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
	c.common.cdw15 = cpu_to_le32(cmd.cdw15);

	if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
		return -EACCES;

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
			cmd.metadata_len, 0, &result, timeout, 0);

	if (status >= 0) {
		if (put_user(result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}

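/*
 * Same as nvme_user_cmd(), but for the 64-bit result variant of the
 * passthrough ABI, which is also used by the vectored I/O ioctls.
 */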
static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
		struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags,
		bool open_for_write)
{
	struct nvme_passthru_cmd64 cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	int status;

	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;
	if (cmd.flags)
		return -EINVAL;
	if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid))
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
	c.common.cdw15 = cpu_to_le32(cmd.cdw15);

	if (!nvme_cmd_allowed(ns, &c, flags, open_for_write))
		return -EACCES;

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
			cmd.metadata_len, 0, &cmd.result, timeout, flags);

	if (status >= 0) {
		if (put_user(cmd.result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}

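/* Fields decoded from the big SQE payload (struct nvme_uring_cmd). */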
struct nvme_uring_data {
	__u64	metadata;
	__u64	addr;
	__u32	data_len;
	__u32	metadata_len;
	__u32	timeout_ms;
};

/*
 * This overlays struct io_uring_cmd pdu.
 * Expect build errors if this grows larger than that.
 */
struct nvme_uring_cmd_pdu {
	struct request *req;
	struct bio *bio;
	u64 result;
	int status;
};

static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
		struct io_uring_cmd *ioucmd)
{
	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
}

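/*
 * Final completion work for a uring command: unmap the data buffer and post
 * the CQE.  Runs from task work, or directly from the end_io handler for
 * polled requests.
 */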
static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
		unsigned issue_flags)
{
	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);

	if (pdu->bio)
		blk_rq_unmap_user(pdu->bio);
	io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags);
}

static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
		blk_status_t err)
{
	struct io_uring_cmd *ioucmd = req->end_io_data;
	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);

	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
		pdu->status = -EINTR;
	else
		pdu->status = nvme_req(req)->status;
	pdu->result = le64_to_cpu(nvme_req(req)->result.u64);

	/*
	 * For iopoll, complete it directly.
	 * Otherwise, move the completion to task work.
	 */
	if (blk_rq_is_poll(req))
		nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED);
	else
		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);

	return RQ_END_IO_FREE;
}

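/*
 * Build and submit an asynchronous passthrough command from an io_uring SQE.
 * The SQE fields are read with READ_ONCE() since the submission queue entry
 * lives in memory shared with userspace.
 */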
static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
		struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
{
	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
	const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);
	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
	struct nvme_uring_data d;
	struct nvme_command c;
	struct request *req;
	blk_opf_t rq_flags = REQ_ALLOC_CACHE;
	blk_mq_req_flags_t blk_flags = 0;
	int ret;

	c.common.opcode = READ_ONCE(cmd->opcode);
	c.common.flags = READ_ONCE(cmd->flags);
	if (c.common.flags)
		return -EINVAL;

	c.common.command_id = 0;
	c.common.nsid = cpu_to_le32(cmd->nsid);
	if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid)))
		return -EINVAL;

	c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2));
	c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3));
	c.common.metadata = 0;
	c.common.dptr.prp1 = c.common.dptr.prp2 = 0;
	c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10));
	c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11));
	c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12));
	c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13));
	c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
	c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));

	if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE))
		return -EACCES;

	d.metadata = READ_ONCE(cmd->metadata);
	d.addr = READ_ONCE(cmd->addr);
	d.data_len = READ_ONCE(cmd->data_len);
	d.metadata_len = READ_ONCE(cmd->metadata_len);
	d.timeout_ms = READ_ONCE(cmd->timeout_ms);

	if (issue_flags & IO_URING_F_NONBLOCK) {
		rq_flags |= REQ_NOWAIT;
		blk_flags = BLK_MQ_REQ_NOWAIT;
	}
	if (issue_flags & IO_URING_F_IOPOLL)
		rq_flags |= REQ_POLLED;

	req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0;

	if (d.addr && d.data_len) {
		ret = nvme_map_user_request(req, d.addr,
			d.data_len, nvme_to_user_ptr(d.metadata),
			d.metadata_len, 0, ioucmd, vec);
		if (ret)
			return ret;
	}

	/* to free bio on completion, as req->bio will be null at that time */
	pdu->bio = req->bio;
	pdu->req = req;
	req->end_io_data = ioucmd;
	req->end_io = nvme_uring_cmd_end_io;
	blk_execute_rq_nowait(req, false);
	return -EIOCBQUEUED;
}

static bool is_ctrl_ioctl(unsigned int cmd)
{
	if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
		return true;
	if (is_sed_ioctl(cmd))
		return true;
	return false;
}

static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd,
		void __user *argp, bool open_for_write)
{
	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write);
	case NVME_IOCTL_ADMIN64_CMD:
		return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write);
	default:
		return sed_ioctl(ctrl->opal_dev, cmd, argp);
	}
}

#ifdef COMPAT_FOR_U64_ALIGNMENT
struct nvme_user_io32 {
	__u8	opcode;
	__u8	flags;
	__u16	control;
	__u16	nblocks;
	__u16	rsvd;
	__u64	metadata;
	__u64	addr;
	__u64	slba;
	__u32	dsmgmt;
	__u32	reftag;
	__u16	apptag;
	__u16	appmask;
} __attribute__((__packed__));
#define NVME_IOCTL_SUBMIT_IO32	_IOW('N', 0x42, struct nvme_user_io32)
#endif /* COMPAT_FOR_U64_ALIGNMENT */

static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
		void __user *argp, unsigned int flags, bool open_for_write)
{
	switch (cmd) {
	case NVME_IOCTL_ID:
		force_successful_syscall_return();
		return ns->head->ns_id;
	case NVME_IOCTL_IO_CMD:
		return nvme_user_cmd(ns->ctrl, ns, argp, flags, open_for_write);
	/*
	 * struct nvme_user_io can have different padding on some 32-bit ABIs.
	 * Just accept the compat version as all fields that are used are the
	 * same size and at the same offset.
	 */
#ifdef COMPAT_FOR_U64_ALIGNMENT
	case NVME_IOCTL_SUBMIT_IO32:
#endif
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, argp);
	case NVME_IOCTL_IO64_CMD_VEC:
		flags |= NVME_IOCTL_VEC;
		fallthrough;
	case NVME_IOCTL_IO64_CMD:
		return nvme_user_cmd64(ns->ctrl, ns, argp, flags,
				       open_for_write);
	default:
		return -ENOTTY;
	}
}

int nvme_ioctl(struct block_device *bdev, blk_mode_t mode,
		unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;
	bool open_for_write = mode & BLK_OPEN_WRITE;
	void __user *argp = (void __user *)arg;
	unsigned int flags = 0;

	if (bdev_is_partition(bdev))
		flags |= NVME_IOCTL_PARTITION;

	if (is_ctrl_ioctl(cmd))
		return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write);
	return nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write);
}

long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns =
		container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev);
	bool open_for_write = file->f_mode & FMODE_WRITE;
	void __user *argp = (void __user *)arg;

	if (is_ctrl_ioctl(cmd))
		return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write);
	return nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write);
}

static int nvme_uring_cmd_checks(unsigned int issue_flags)
{
	/* NVMe passthrough requires big SQE/CQE support */
	if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) !=
	    (IO_URING_F_SQE128|IO_URING_F_CQE32))
		return -EOPNOTSUPP;
	return 0;
}

static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
		unsigned int issue_flags)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	int ret;

	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));

	ret = nvme_uring_cmd_checks(issue_flags);
	if (ret)
		return ret;

	switch (ioucmd->cmd_op) {
	case NVME_URING_CMD_IO:
		ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false);
		break;
	case NVME_URING_CMD_IO_VEC:
		ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
			struct nvme_ns, cdev);

	return nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
}

int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
				 struct io_comp_batch *iob,
				 unsigned int poll_flags)
{
	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
	struct request *req = pdu->req;

	if (req && blk_rq_is_poll(req))
		return blk_rq_poll(req, iob, poll_flags);
	return 0;
}
#ifdef CONFIG_NVME_MULTIPATH
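/*
 * Forward a controller ioctl issued on a namespace head node.  A controller
 * reference is taken and the SRCU lock dropped before the ioctl runs; see
 * the comment in nvme_ns_head_ioctl() for why.
 */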
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
		void __user *argp, struct nvme_ns_head *head, int srcu_idx,
		bool open_for_write)
	__releases(&head->srcu)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	int ret;

	nvme_get_ctrl(ns->ctrl);
	srcu_read_unlock(&head->srcu, srcu_idx);
	ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write);

	nvme_put_ctrl(ctrl);
	return ret;
}

int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode,
		unsigned int cmd, unsigned long arg)
{
	struct nvme_ns_head *head = bdev->bd_disk->private_data;
	bool open_for_write = mode & BLK_OPEN_WRITE;
	void __user *argp = (void __user *)arg;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;
	unsigned int flags = 0;

	if (bdev_is_partition(bdev))
		flags |= NVME_IOCTL_PARTITION;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (!ns)
		goto out_unlock;

	/*
	 * Handle ioctls that apply to the controller instead of the namespace
	 * separately and drop the ns SRCU reference early.  This avoids a
	 * deadlock when deleting namespaces using the passthrough interface.
	 */
	if (is_ctrl_ioctl(cmd))
		return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
					       open_for_write);

	ret = nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write);
out_unlock:
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg)
{
	bool open_for_write = file->f_mode & FMODE_WRITE;
	struct cdev *cdev = file_inode(file)->i_cdev;
	struct nvme_ns_head *head =
		container_of(cdev, struct nvme_ns_head, cdev);
	void __user *argp = (void __user *)arg;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (!ns)
		goto out_unlock;

	if (is_ctrl_ioctl(cmd))
		return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
				open_for_write);

	ret = nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write);
out_unlock:
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
		unsigned int issue_flags)
{
	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
	int srcu_idx = srcu_read_lock(&head->srcu);
	struct nvme_ns *ns = nvme_find_path(head);
	int ret = -EINVAL;

	if (ns)
		ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#endif /* CONFIG_NVME_MULTIPATH */

int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	struct nvme_ctrl *ctrl = ioucmd->file->private_data;
	int ret;

	/* IOPOLL not supported yet */
	if (issue_flags & IO_URING_F_IOPOLL)
		return -EOPNOTSUPP;

	ret = nvme_uring_cmd_checks(issue_flags);
	if (ret)
		return ret;

	switch (ioucmd->cmd_op) {
	case NVME_URING_CMD_ADMIN:
		ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false);
		break;
	case NVME_URING_CMD_ADMIN_VEC:
		ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

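/*
 * Deprecated path for issuing I/O commands through the controller character
 * device; it only works while the controller has exactly one namespace.
 */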
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
		bool open_for_write)
{
	struct nvme_ns *ns;
	int ret;

	down_read(&ctrl->namespaces_rwsem);
	if (list_empty(&ctrl->namespaces)) {
		ret = -ENOTTY;
		goto out_unlock;
	}

	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
		dev_warn(ctrl->device,
			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
		ret = -EINVAL;
		goto out_unlock;
	}

	dev_warn(ctrl->device,
		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
	kref_get(&ns->kref);
	up_read(&ctrl->namespaces_rwsem);

	ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
	nvme_put_ns(ns);
	return ret;

out_unlock:
	up_read(&ctrl->namespaces_rwsem);
	return ret;
}

long nvme_dev_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg)
{
	bool open_for_write = file->f_mode & FMODE_WRITE;
	struct nvme_ctrl *ctrl = file->private_data;
	void __user *argp = (void __user *)arg;

	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write);
	case NVME_IOCTL_ADMIN64_CMD:
		return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write);
	case NVME_IOCTL_IO_CMD:
		return nvme_dev_user_cmd(ctrl, argp, open_for_write);
	case NVME_IOCTL_RESET:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
		dev_warn(ctrl->device, "resetting controller\n");
		return nvme_reset_ctrl_sync(ctrl);
	case NVME_IOCTL_SUBSYS_RESET:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
		return nvme_reset_subsystem(ctrl);
	case NVME_IOCTL_RESCAN:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
		nvme_queue_scan(ctrl);
		return 0;
	default:
		return -ENOTTY;
	}
}