// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Userspace block device - block device whose IO is handled from userspace
 *
 * Make full use of the io_uring passthrough command for communicating with
 * the ublk userspace daemon (ublksrvd) for handling basic IO requests.
 *
 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
 *
 * (part of code stolen from loop.c)
 */
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/sched.h>
15#include <linux/fs.h>
16#include <linux/pagemap.h>
17#include <linux/file.h>
18#include <linux/stat.h>
19#include <linux/errno.h>
20#include <linux/major.h>
21#include <linux/wait.h>
22#include <linux/blkdev.h>
23#include <linux/init.h>
24#include <linux/swap.h>
25#include <linux/slab.h>
26#include <linux/compat.h>
27#include <linux/mutex.h>
28#include <linux/writeback.h>
29#include <linux/completion.h>
30#include <linux/highmem.h>
31#include <linux/sysfs.h>
32#include <linux/miscdevice.h>
33#include <linux/falloc.h>
34#include <linux/uio.h>
35#include <linux/ioprio.h>
36#include <linux/sched/mm.h>
37#include <linux/uaccess.h>
38#include <linux/cdev.h>
39#include <linux/io_uring.h>
40#include <linux/blk-mq.h>
41#include <linux/delay.h>
42#include <linux/mm.h>
43#include <asm/page.h>
44#include <linux/task_work.h>
45#include <linux/namei.h>
46#include <linux/kref.h>
47#include <uapi/linux/ublk_cmd.h>
48
49#define UBLK_MINORS (1U << MINORBITS)
50
51/* All UBLK_F_* have to be included into UBLK_F_ALL */
52#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
53 | UBLK_F_URING_CMD_COMP_IN_TASK \
54 | UBLK_F_NEED_GET_DATA \
55 | UBLK_F_USER_RECOVERY \
56 | UBLK_F_USER_RECOVERY_REISSUE \
57 | UBLK_F_UNPRIVILEGED_DEV \
58 | UBLK_F_CMD_IOCTL_ENCODE \
59 | UBLK_F_USER_COPY \
60 | UBLK_F_ZONED)
61
62/* All UBLK_PARAM_TYPE_* should be included here */
63#define UBLK_PARAM_TYPE_ALL \
64 (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
65 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
66
67struct ublk_rq_data {
68 struct llist_node node;
69
70 struct kref ref;
71 __u64 sector;
72 __u32 operation;
73 __u32 nr_zones;
74};
75
76struct ublk_uring_cmd_pdu {
77 struct ublk_queue *ubq;
78 u16 tag;
79};
80
/*
 * io command is active: sqe cmd is received, and its cqe isn't done
 *
 * If the flag is set, the io command is owned by the ublk driver, and is
 * waiting for an incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command has been completed and is owned
 * by the ublk server.
 */
#define UBLK_IO_FLAG_ACTIVE	0x01

/*
 * IO command is completed via cqe, and it is being handled by ublksrv, and
 * not committed yet
 *
 * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
 * for cross verification
 */
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02

/*
 * IO command is aborted, so this flag is set in case of
 * !UBLK_IO_FLAG_ACTIVE.
 *
 * After this flag is observed, any pending or new incoming request
 * associated with this io command will be failed immediately
 */
#define UBLK_IO_FLAG_ABORTED 0x04

/*
 * UBLK_IO_FLAG_NEED_GET_DATA is set when the IO command needs to fetch the
 * data buffer address from ublksrv.
 *
 * Then, bio data can be copied into this data buffer for a WRITE request
 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
 */
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08

/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED 0x80000000
121
122struct ublk_io {
123 /* userspace buffer address from io cmd */
124 __u64 addr;
125 unsigned int flags;
126 int res;
127
128 struct io_uring_cmd *cmd;
129};
130
131struct ublk_queue {
132 int q_id;
133 int q_depth;
134
135 unsigned long flags;
136 struct task_struct *ubq_daemon;
137 char *io_cmd_buf;
138
139 struct llist_head io_cmds;
140
141 unsigned long io_addr; /* mapped vm address */
142 unsigned int max_io_sz;
143 bool force_abort;
144 bool timeout;
145 bool canceling;
146 unsigned short nr_io_ready; /* how many ios setup */
147 spinlock_t cancel_lock;
148 struct ublk_device *dev;
149 struct ublk_io ios[];
150};
151
152struct ublk_device {
153 struct gendisk *ub_disk;
154
155 char *__queues;
156
157 unsigned int queue_size;
158 struct ublksrv_ctrl_dev_info dev_info;
159
160 struct blk_mq_tag_set tag_set;
161
162 struct cdev cdev;
163 struct device cdev_dev;
164
165#define UB_STATE_OPEN 0
166#define UB_STATE_USED 1
167#define UB_STATE_DELETED 2
168 unsigned long state;
169 int ub_number;
170
171 struct mutex mutex;
172
173 spinlock_t lock;
174 struct mm_struct *mm;
175
176 struct ublk_params params;
177
178 struct completion completion;
179 unsigned int nr_queues_ready;
180 unsigned int nr_privileged_daemon;
181
182 struct work_struct quiesce_work;
183 struct work_struct stop_work;
184};
185
186/* header of ublk_params */
187struct ublk_params_header {
188 __u32 len;
189 __u32 types;
190};
191
192static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
193
194static inline unsigned int ublk_req_build_flags(struct request *req);
195static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
196 int tag);
197static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
198{
199 return ub->dev_info.flags & UBLK_F_USER_COPY;
200}
201
202static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
203{
204 return ub->dev_info.flags & UBLK_F_ZONED;
205}
206
207static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
208{
209 return ubq->flags & UBLK_F_ZONED;
210}
211
212#ifdef CONFIG_BLK_DEV_ZONED
213
214static int ublk_get_nr_zones(const struct ublk_device *ub)
215{
216 const struct ublk_param_basic *p = &ub->params.basic;
217
218 /* Zone size is a power of 2 */
219 return p->dev_sectors >> ilog2(p->chunk_sectors);
220}
221
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return blk_revalidate_disk_zones(ub->ub_disk, NULL);
}
226
227static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
228{
229 const struct ublk_param_zoned *p = &ub->params.zoned;
230 int nr_zones;
231
232 if (!ublk_dev_is_zoned(ub))
233 return -EINVAL;
234
235 if (!p->max_zone_append_sectors)
236 return -EINVAL;
237
238 nr_zones = ublk_get_nr_zones(ub);
239
240 if (p->max_active_zones > nr_zones)
241 return -EINVAL;
242
243 if (p->max_open_zones > nr_zones)
244 return -EINVAL;
245
246 return 0;
247}
248
static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
	const struct ublk_param_zoned *p = &ub->params.zoned;

	disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
	blk_queue_required_elevator_features(ub->ub_disk->queue,
					     ELEVATOR_F_ZBD_SEQ_WRITE);
	disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
	disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
	blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);

	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);

	return 0;
}
265
266/* Based on virtblk_alloc_report_buffer */
267static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
268 unsigned int nr_zones, size_t *buflen)
269{
270 struct request_queue *q = ublk->ub_disk->queue;
271 size_t bufsize;
272 void *buf;
273
274 nr_zones = min_t(unsigned int, nr_zones,
275 ublk->ub_disk->nr_zones);
276
277 bufsize = nr_zones * sizeof(struct blk_zone);
278 bufsize =
279 min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
280
281 while (bufsize >= sizeof(struct blk_zone)) {
282 buf = kvmalloc(size: bufsize, GFP_KERNEL | __GFP_NORETRY);
283 if (buf) {
284 *buflen = bufsize;
285 return buf;
286 }
287 bufsize >>= 1;
288 }
289
290 *buflen = 0;
291 return NULL;
292}
293
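/*
 * Report zones by issuing UBLK_IO_OP_REPORT_ZONES to the ublk server via
 * passthrough REQ_OP_DRV_IN requests; the zone descriptors are bounced
 * through a kernel buffer whose size is capped by the queue's max hw sectors.
 */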
294static int ublk_report_zones(struct gendisk *disk, sector_t sector,
295 unsigned int nr_zones, report_zones_cb cb, void *data)
296{
297 struct ublk_device *ub = disk->private_data;
298 unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
299 unsigned int first_zone = sector >> ilog2(zone_size_sectors);
300 unsigned int done_zones = 0;
301 unsigned int max_zones_per_request;
302 int ret;
303 struct blk_zone *buffer;
304 size_t buffer_length;
305
306 nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
307 nr_zones);
308
309 buffer = ublk_alloc_report_buffer(ublk: ub, nr_zones, buflen: &buffer_length);
310 if (!buffer)
311 return -ENOMEM;
312
313 max_zones_per_request = buffer_length / sizeof(struct blk_zone);
314
315 while (done_zones < nr_zones) {
316 unsigned int remaining_zones = nr_zones - done_zones;
317 unsigned int zones_in_request =
318 min_t(unsigned int, remaining_zones, max_zones_per_request);
319 struct request *req;
320 struct ublk_rq_data *pdu;
321 blk_status_t status;
322
323 memset(buffer, 0, buffer_length);
324
325 req = blk_mq_alloc_request(q: disk->queue, opf: REQ_OP_DRV_IN, flags: 0);
326 if (IS_ERR(ptr: req)) {
327 ret = PTR_ERR(ptr: req);
328 goto out;
329 }
330
331 pdu = blk_mq_rq_to_pdu(rq: req);
332 pdu->operation = UBLK_IO_OP_REPORT_ZONES;
333 pdu->sector = sector;
334 pdu->nr_zones = zones_in_request;
335
336 ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
337 GFP_KERNEL);
338 if (ret) {
339 blk_mq_free_request(rq: req);
340 goto out;
341 }
342
343 status = blk_execute_rq(rq: req, at_head: 0);
344 ret = blk_status_to_errno(status);
345 blk_mq_free_request(rq: req);
346 if (ret)
347 goto out;
348
349 for (unsigned int i = 0; i < zones_in_request; i++) {
350 struct blk_zone *zone = buffer + i;
351
352 /* A zero length zone means no more zones in this response */
353 if (!zone->len)
354 break;
355
356 ret = cb(zone, i, data);
357 if (ret)
358 goto out;
359
360 done_zones++;
361 sector += zone_size_sectors;
362
363 }
364 }
365
366 ret = done_zones;
367
368out:
369 kvfree(addr: buffer);
370 return ret;
371}
372
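/*
 * Translate zone management and zone append requests, plus the driver's own
 * REPORT_ZONES passthrough requests, into ublk IO descriptors.
 */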
373static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
374 struct request *req)
375{
376 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag: req->tag);
377 struct ublk_io *io = &ubq->ios[req->tag];
378 struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(rq: req);
379 u32 ublk_op;
380
381 switch (req_op(req)) {
382 case REQ_OP_ZONE_OPEN:
383 ublk_op = UBLK_IO_OP_ZONE_OPEN;
384 break;
385 case REQ_OP_ZONE_CLOSE:
386 ublk_op = UBLK_IO_OP_ZONE_CLOSE;
387 break;
388 case REQ_OP_ZONE_FINISH:
389 ublk_op = UBLK_IO_OP_ZONE_FINISH;
390 break;
391 case REQ_OP_ZONE_RESET:
392 ublk_op = UBLK_IO_OP_ZONE_RESET;
393 break;
394 case REQ_OP_ZONE_APPEND:
395 ublk_op = UBLK_IO_OP_ZONE_APPEND;
396 break;
397 case REQ_OP_ZONE_RESET_ALL:
398 ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
399 break;
400 case REQ_OP_DRV_IN:
401 ublk_op = pdu->operation;
402 switch (ublk_op) {
403 case UBLK_IO_OP_REPORT_ZONES:
404 iod->op_flags = ublk_op | ublk_req_build_flags(req);
405 iod->nr_zones = pdu->nr_zones;
406 iod->start_sector = pdu->sector;
407 return BLK_STS_OK;
408 default:
409 return BLK_STS_IOERR;
410 }
411 case REQ_OP_DRV_OUT:
412 /* We do not support drv_out */
413 return BLK_STS_NOTSUPP;
414 default:
415 return BLK_STS_IOERR;
416 }
417
418 iod->op_flags = ublk_op | ublk_req_build_flags(req);
419 iod->nr_sectors = blk_rq_sectors(rq: req);
420 iod->start_sector = blk_rq_pos(rq: req);
421 iod->addr = io->addr;
422
423 return BLK_STS_OK;
424}
425
426#else
427
428#define ublk_report_zones (NULL)
429
430static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
431{
432 return -EOPNOTSUPP;
433}
434
435static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
436{
437 return -EOPNOTSUPP;
438}
439
440static int ublk_revalidate_disk_zones(struct ublk_device *ub)
441{
442 return 0;
443}
444
445static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
446 struct request *req)
447{
448 return BLK_STS_NOTSUPP;
449}
450
451#endif
452
453static inline void __ublk_complete_rq(struct request *req);
454static void ublk_complete_rq(struct kref *ref);
455
456static dev_t ublk_chr_devt;
457static const struct class ublk_chr_class = {
458 .name = "ublk-char",
459};
460
461static DEFINE_IDR(ublk_index_idr);
462static DEFINE_SPINLOCK(ublk_idr_lock);
463static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
464
465static DEFINE_MUTEX(ublk_ctl_mutex);
466
/*
 * Max number of ublk devices allowed to be added
 *
 * It can be extended to a per-user limit in the future, or even be
 * controlled by cgroup.
 */
473#define UBLK_MAX_UBLKS UBLK_MINORS
474static unsigned int ublks_max = 64;
475static unsigned int ublks_added; /* protected by ublk_ctl_mutex */
476
477static struct miscdevice ublk_misc;
478
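/*
 * With UBLK_F_USER_COPY, the pread()/pwrite() offset on the ublk char device
 * addresses one IO buffer: the hardware queue id, the tag and the byte offset
 * inside that request's buffer are packed into the file position starting at
 * UBLKSRV_IO_BUF_OFFSET. The helpers below decode those fields.
 */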
479static inline unsigned ublk_pos_to_hwq(loff_t pos)
480{
481 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
482 UBLK_QID_BITS_MASK;
483}
484
485static inline unsigned ublk_pos_to_buf_off(loff_t pos)
486{
487 return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
488}
489
490static inline unsigned ublk_pos_to_tag(loff_t pos)
491{
492 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
493 UBLK_TAG_BITS_MASK;
494}
495
496static void ublk_dev_param_basic_apply(struct ublk_device *ub)
497{
498 struct request_queue *q = ub->ub_disk->queue;
499 const struct ublk_param_basic *p = &ub->params.basic;
500
501 blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
502 blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
503 blk_queue_io_min(q, min: 1 << p->io_min_shift);
504 blk_queue_io_opt(q, opt: 1 << p->io_opt_shift);
505
506 blk_queue_write_cache(q, enabled: p->attrs & UBLK_ATTR_VOLATILE_CACHE,
507 fua: p->attrs & UBLK_ATTR_FUA);
508 if (p->attrs & UBLK_ATTR_ROTATIONAL)
509 blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
510 else
511 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
512
513 blk_queue_max_hw_sectors(q, p->max_sectors);
514 blk_queue_chunk_sectors(q, p->chunk_sectors);
515 blk_queue_virt_boundary(q, p->virt_boundary_mask);
516
517 if (p->attrs & UBLK_ATTR_READ_ONLY)
518 set_disk_ro(disk: ub->ub_disk, read_only: true);
519
520 set_capacity(disk: ub->ub_disk, size: p->dev_sectors);
521}
522
523static void ublk_dev_param_discard_apply(struct ublk_device *ub)
524{
525 struct request_queue *q = ub->ub_disk->queue;
526 const struct ublk_param_discard *p = &ub->params.discard;
527
528 q->limits.discard_alignment = p->discard_alignment;
529 q->limits.discard_granularity = p->discard_granularity;
530 blk_queue_max_discard_sectors(q, max_discard_sectors: p->max_discard_sectors);
531 blk_queue_max_write_zeroes_sectors(q,
532 max_write_same_sectors: p->max_write_zeroes_sectors);
533 blk_queue_max_discard_segments(q, p->max_discard_segments);
534}
535
536static int ublk_validate_params(const struct ublk_device *ub)
537{
538 /* basic param is the only one which must be set */
539 if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
540 const struct ublk_param_basic *p = &ub->params.basic;
541
542 if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
543 return -EINVAL;
544
545 if (p->logical_bs_shift > p->physical_bs_shift)
546 return -EINVAL;
547
548 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
549 return -EINVAL;
550
551 if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
552 return -EINVAL;
553 } else
554 return -EINVAL;
555
556 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
557 const struct ublk_param_discard *p = &ub->params.discard;
558
559 /* So far, only support single segment discard */
560 if (p->max_discard_sectors && p->max_discard_segments != 1)
561 return -EINVAL;
562
563 if (!p->discard_granularity)
564 return -EINVAL;
565 }
566
567 /* dev_t is read-only */
568 if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
569 return -EINVAL;
570
571 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
572 return ublk_dev_param_zoned_validate(ub);
573 else if (ublk_dev_is_zoned(ub))
574 return -EINVAL;
575
576 return 0;
577}
578
579static int ublk_apply_params(struct ublk_device *ub)
580{
581 if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
582 return -EINVAL;
583
584 ublk_dev_param_basic_apply(ub);
585
586 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
587 ublk_dev_param_discard_apply(ub);
588
589 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
590 return ublk_dev_param_zoned_apply(ub);
591
592 return 0;
593}
594
595static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
596{
597 return ubq->flags & UBLK_F_USER_COPY;
598}
599
600static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
601{
602 /*
603 * read()/write() is involved in user copy, so request reference
604 * has to be grabbed
605 */
606 return ublk_support_user_copy(ubq);
607}
608
609static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
610 struct request *req)
611{
612 if (ublk_need_req_ref(ubq)) {
613 struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq: req);
614
615 kref_init(kref: &data->ref);
616 }
617}
618
619static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
620 struct request *req)
621{
622 if (ublk_need_req_ref(ubq)) {
623 struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq: req);
624
625 return kref_get_unless_zero(kref: &data->ref);
626 }
627
628 return true;
629}
630
631static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
632 struct request *req)
633{
634 if (ublk_need_req_ref(ubq)) {
635 struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq: req);
636
637 kref_put(kref: &data->ref, release: ublk_complete_rq);
638 } else {
639 __ublk_complete_rq(req);
640 }
641}
642
643static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
644{
645 return ubq->flags & UBLK_F_NEED_GET_DATA;
646}
647
648static struct ublk_device *ublk_get_device(struct ublk_device *ub)
649{
650 if (kobject_get_unless_zero(kobj: &ub->cdev_dev.kobj))
651 return ub;
652 return NULL;
653}
654
655static void ublk_put_device(struct ublk_device *ub)
656{
657 put_device(dev: &ub->cdev_dev);
658}
659
660static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
661 int qid)
662{
663 return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
664}
665
666static inline bool ublk_rq_has_data(const struct request *rq)
667{
668 return bio_has_data(bio: rq->bio);
669}
670
671static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
672 int tag)
673{
674 return (struct ublksrv_io_desc *)
675 &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
676}
677
678static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
679{
680 return ublk_get_queue(dev: ub, qid: q_id)->io_cmd_buf;
681}
682
683static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
684{
685 struct ublk_queue *ubq = ublk_get_queue(dev: ub, qid: q_id);
686
687 return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
688 PAGE_SIZE);
689}
690
691static inline bool ublk_queue_can_use_recovery_reissue(
692 struct ublk_queue *ubq)
693{
694 return (ubq->flags & UBLK_F_USER_RECOVERY) &&
695 (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE);
696}
697
698static inline bool ublk_queue_can_use_recovery(
699 struct ublk_queue *ubq)
700{
701 return ubq->flags & UBLK_F_USER_RECOVERY;
702}
703
704static inline bool ublk_can_use_recovery(struct ublk_device *ub)
705{
706 return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
707}
708
709static void ublk_free_disk(struct gendisk *disk)
710{
711 struct ublk_device *ub = disk->private_data;
712
713 clear_bit(UB_STATE_USED, addr: &ub->state);
714 put_device(dev: &ub->cdev_dev);
715}
716
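/* Record the current task's uid/gid (in init_user_ns) as the device owner */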
717static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
718 unsigned int *owner_gid)
719{
720 kuid_t uid;
721 kgid_t gid;
722
723 current_uid_gid(&uid, &gid);
724
725 *owner_uid = from_kuid(to: &init_user_ns, uid);
726 *owner_gid = from_kgid(to: &init_user_ns, gid);
727}
728
729static int ublk_open(struct gendisk *disk, blk_mode_t mode)
730{
731 struct ublk_device *ub = disk->private_data;
732
733 if (capable(CAP_SYS_ADMIN))
734 return 0;
735
	/*
	 * If it is an unprivileged device, only the owner can open
	 * the disk. Otherwise it could be a trap set up by an
	 * evil user who deliberately grants this disk's privileges
	 * to other users.
	 *
	 * This approach is reasonable too, given that anyone can
	 * create an unprivileged device without needing anyone
	 * else's grant.
	 */
745 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
746 unsigned int curr_uid, curr_gid;
747
748 ublk_store_owner_uid_gid(owner_uid: &curr_uid, owner_gid: &curr_gid);
749
750 if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
751 ub->dev_info.owner_gid)
752 return -EPERM;
753 }
754
755 return 0;
756}
757
758static const struct block_device_operations ub_fops = {
759 .owner = THIS_MODULE,
760 .open = ublk_open,
761 .free_disk = ublk_free_disk,
762 .report_zones = ublk_report_zones,
763};
764
765#define UBLK_MAX_PIN_PAGES 32
766
767struct ublk_io_iter {
768 struct page *pages[UBLK_MAX_PIN_PAGES];
769 struct bio *bio;
770 struct bvec_iter iter;
771};
772
/* copy data between the pinned user pages and the request's bio vectors */
774static void ublk_copy_io_pages(struct ublk_io_iter *data,
775 size_t total, size_t pg_off, int dir)
776{
777 unsigned done = 0;
778 unsigned pg_idx = 0;
779
780 while (done < total) {
781 struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
782 unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
783 (unsigned)(PAGE_SIZE - pg_off));
784 void *bv_buf = bvec_kmap_local(bvec: &bv);
785 void *pg_buf = kmap_local_page(page: data->pages[pg_idx]);
786
787 if (dir == ITER_DEST)
788 memcpy(pg_buf + pg_off, bv_buf, bytes);
789 else
790 memcpy(bv_buf, pg_buf + pg_off, bytes);
791
792 kunmap_local(pg_buf);
793 kunmap_local(bv_buf);
794
795 /* advance page array */
796 pg_off += bytes;
797 if (pg_off == PAGE_SIZE) {
798 pg_idx += 1;
799 pg_off = 0;
800 }
801
802 done += bytes;
803
804 /* advance bio */
805 bio_advance_iter_single(bio: data->bio, iter: &data->iter, bytes);
806 if (!data->iter.bi_size) {
807 data->bio = data->bio->bi_next;
808 if (data->bio == NULL)
809 break;
810 data->iter = data->bio->bi_iter;
811 }
812 }
813}
814
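/* Position the io iterator at 'offset' bytes into the request's bio chain */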
815static bool ublk_advance_io_iter(const struct request *req,
816 struct ublk_io_iter *iter, unsigned int offset)
817{
818 struct bio *bio = req->bio;
819
820 for_each_bio(bio) {
821 if (bio->bi_iter.bi_size > offset) {
822 iter->bio = bio;
823 iter->iter = bio->bi_iter;
824 bio_advance_iter(bio: iter->bio, iter: &iter->iter, bytes: offset);
825 return true;
826 }
827 offset -= bio->bi_iter.bi_size;
828 }
829 return false;
830}
831
/*
 * Copy data between request pages and io_iter, where 'offset' is the
 * linear offset inside the request at which the copy starts.
 */
836static size_t ublk_copy_user_pages(const struct request *req,
837 unsigned offset, struct iov_iter *uiter, int dir)
838{
839 struct ublk_io_iter iter;
840 size_t done = 0;
841
842 if (!ublk_advance_io_iter(req, iter: &iter, offset))
843 return 0;
844
845 while (iov_iter_count(i: uiter) && iter.bio) {
846 unsigned nr_pages;
847 ssize_t len;
848 size_t off;
849 int i;
850
851 len = iov_iter_get_pages2(i: uiter, pages: iter.pages,
852 maxsize: iov_iter_count(i: uiter),
853 UBLK_MAX_PIN_PAGES, start: &off);
854 if (len <= 0)
855 return done;
856
857 ublk_copy_io_pages(data: &iter, total: len, pg_off: off, dir);
858 nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
859 for (i = 0; i < nr_pages; i++) {
860 if (dir == ITER_DEST)
861 set_page_dirty(iter.pages[i]);
862 put_page(page: iter.pages[i]);
863 }
864 done += len;
865 }
866
867 return done;
868}
869
870static inline bool ublk_need_map_req(const struct request *req)
871{
872 return ublk_rq_has_data(rq: req) && req_op(req) == REQ_OP_WRITE;
873}
874
875static inline bool ublk_need_unmap_req(const struct request *req)
876{
877 return ublk_rq_has_data(rq: req) &&
878 (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
879}
880
static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
		struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);

	if (ublk_support_user_copy(ubq))
		return rq_bytes;

	/*
	 * No zero copy: copying WRITE request data is delayed into the
	 * ublksrv context, and the big benefit is that pinning pages in
	 * the current context is pretty fast, see ublk_copy_user_pages().
	 */
	if (ublk_need_map_req(req)) {
		struct iov_iter iter;
		struct iovec iov;
		const int dir = ITER_DEST;

		import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes,
				&iov, &iter);

		return ublk_copy_user_pages(req, 0, &iter, dir);
	}
	return rq_bytes;
}
906
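/*
 * For READ/DRV_IN requests without user copy, copy the completed data from the
 * request's bio pages back into the ublksrv buffer at io->addr.
 */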
static int ublk_unmap_io(const struct ublk_queue *ubq,
		const struct request *req,
		struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);

	if (ublk_support_user_copy(ubq))
		return rq_bytes;

	if (ublk_need_unmap_req(req)) {
		struct iov_iter iter;
		struct iovec iov;
		const int dir = ITER_SOURCE;

		WARN_ON_ONCE(io->res > rq_bytes);

		import_single_range(dir, u64_to_user_ptr(io->addr), io->res,
				&iov, &iter);
		return ublk_copy_user_pages(req, 0, &iter, dir);
	}
	return rq_bytes;
}
929
930static inline unsigned int ublk_req_build_flags(struct request *req)
931{
932 unsigned flags = 0;
933
934 if (req->cmd_flags & REQ_FAILFAST_DEV)
935 flags |= UBLK_IO_F_FAILFAST_DEV;
936
937 if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
938 flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
939
940 if (req->cmd_flags & REQ_FAILFAST_DRIVER)
941 flags |= UBLK_IO_F_FAILFAST_DRIVER;
942
943 if (req->cmd_flags & REQ_META)
944 flags |= UBLK_IO_F_META;
945
946 if (req->cmd_flags & REQ_FUA)
947 flags |= UBLK_IO_F_FUA;
948
949 if (req->cmd_flags & REQ_NOUNMAP)
950 flags |= UBLK_IO_F_NOUNMAP;
951
952 if (req->cmd_flags & REQ_SWAP)
953 flags |= UBLK_IO_F_SWAP;
954
955 return flags;
956}
957
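/*
 * Fill the per-tag ublksrv_io_desc slot in the shared command buffer so that
 * the ublk server can see the request's opcode, flags, sectors and buffer
 * address.
 */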
958static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
959{
960 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag: req->tag);
961 struct ublk_io *io = &ubq->ios[req->tag];
962 enum req_op op = req_op(req);
963 u32 ublk_op;
964
965 if (!ublk_queue_is_zoned(ubq) &&
966 (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
967 return BLK_STS_IOERR;
968
969 switch (req_op(req)) {
970 case REQ_OP_READ:
971 ublk_op = UBLK_IO_OP_READ;
972 break;
973 case REQ_OP_WRITE:
974 ublk_op = UBLK_IO_OP_WRITE;
975 break;
976 case REQ_OP_FLUSH:
977 ublk_op = UBLK_IO_OP_FLUSH;
978 break;
979 case REQ_OP_DISCARD:
980 ublk_op = UBLK_IO_OP_DISCARD;
981 break;
982 case REQ_OP_WRITE_ZEROES:
983 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
984 break;
985 default:
986 if (ublk_queue_is_zoned(ubq))
987 return ublk_setup_iod_zoned(ubq, req);
988 return BLK_STS_IOERR;
989 }
990
991 /* need to translate since kernel may change */
992 iod->op_flags = ublk_op | ublk_req_build_flags(req);
993 iod->nr_sectors = blk_rq_sectors(rq: req);
994 iod->start_sector = blk_rq_pos(rq: req);
995 iod->addr = io->addr;
996
997 return BLK_STS_OK;
998}
999
1000static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1001 struct io_uring_cmd *ioucmd)
1002{
1003 return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
1004}
1005
1006static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
1007{
1008 return ubq->ubq_daemon->flags & PF_EXITING;
1009}
1010
1011/* todo: handle partial completion */
1012static inline void __ublk_complete_rq(struct request *req)
1013{
1014 struct ublk_queue *ubq = req->mq_hctx->driver_data;
1015 struct ublk_io *io = &ubq->ios[req->tag];
1016 unsigned int unmapped_bytes;
1017 blk_status_t res = BLK_STS_OK;
1018
1019 /* called from ublk_abort_queue() code path */
1020 if (io->flags & UBLK_IO_FLAG_ABORTED) {
1021 res = BLK_STS_IOERR;
1022 goto exit;
1023 }
1024
1025 /* failed read IO if nothing is read */
1026 if (!io->res && req_op(req) == REQ_OP_READ)
1027 io->res = -EIO;
1028
1029 if (io->res < 0) {
1030 res = errno_to_blk_status(errno: io->res);
1031 goto exit;
1032 }
1033
	/*
	 * FLUSH, DISCARD and WRITE_ZEROES usually won't return a byte
	 * count, so end them directly.
	 *
	 * None of them needs unmapping, either.
	 */
1040 if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1041 req_op(req) != REQ_OP_DRV_IN)
1042 goto exit;
1043
1044 /* for READ request, writing data in iod->addr to rq buffers */
1045 unmapped_bytes = ublk_unmap_io(ubq, req, io);
1046
	/*
	 * This is extremely unlikely since the data was filled in just
	 * before.
	 *
	 * Simply truncate io->res for this unlikely case.
	 */
1052 if (unlikely(unmapped_bytes < io->res))
1053 io->res = unmapped_bytes;
1054
1055 if (blk_update_request(rq: req, BLK_STS_OK, nr_bytes: io->res))
1056 blk_mq_requeue_request(rq: req, kick_requeue_list: true);
1057 else
1058 __blk_mq_end_request(rq: req, BLK_STS_OK);
1059
1060 return;
1061exit:
1062 blk_mq_end_request(rq: req, error: res);
1063}
1064
1065static void ublk_complete_rq(struct kref *ref)
1066{
1067 struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
1068 ref);
1069 struct request *req = blk_mq_rq_from_pdu(pdu: data);
1070
1071 __ublk_complete_rq(req);
1072}
1073
/*
 * Since __ublk_rq_task_work always fails requests immediately during
 * exiting, __ublk_fail_req() is only called from abort context during
 * exiting. So the lock is unnecessary.
 *
 * Also aborting may not have started yet, so keep in mind that a failed
 * request may be issued by the block layer again.
 */
1082static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
1083 struct request *req)
1084{
1085 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
1086
1087 if (ublk_queue_can_use_recovery_reissue(ubq))
1088 blk_mq_requeue_request(rq: req, kick_requeue_list: false);
1089 else
1090 ublk_put_req_ref(ubq, req);
1091}
1092
1093static void ubq_complete_io_cmd(struct ublk_io *io, int res,
1094 unsigned issue_flags)
1095{
1096 /* mark this cmd owned by ublksrv */
1097 io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1098
	/*
	 * Clear ACTIVE since we are done with this sqe/cmd slot.
	 * A new io cmd can only be accepted while the slot is not active.
	 */
1103 io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1104
1105 /* tell ublksrv one io request is coming */
1106 io_uring_cmd_done(cmd: io->cmd, ret: res, res2: 0, issue_flags);
1107}
1108
1109#define UBLK_REQUEUE_DELAY_MS 3
1110
1111static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1112 struct request *rq)
1113{
1114 /* We cannot process this rq so just requeue it. */
1115 if (ublk_queue_can_use_recovery(ubq))
1116 blk_mq_requeue_request(rq, kick_requeue_list: false);
1117 else
1118 blk_mq_end_request(rq, BLK_STS_IOERR);
1119}
1120
1121static inline void __ublk_rq_task_work(struct request *req,
1122 unsigned issue_flags)
1123{
1124 struct ublk_queue *ubq = req->mq_hctx->driver_data;
1125 int tag = req->tag;
1126 struct ublk_io *io = &ubq->ios[tag];
1127 unsigned int mapped_bytes;
1128
1129 pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
1130 __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
1131 ublk_get_iod(ubq, req->tag)->addr);
1132
1133 /*
1134 * Task is exiting if either:
1135 *
1136 * (1) current != ubq_daemon.
1137 * io_uring_cmd_complete_in_task() tries to run task_work
1138 * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
1139 *
1140 * (2) current->flags & PF_EXITING.
1141 */
1142 if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
1143 __ublk_abort_rq(ubq, rq: req);
1144 return;
1145 }
1146
1147 if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
		/*
		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
		 * and notify it.
		 */
1153 if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
1154 io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1155 pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
1156 __func__, io->cmd->cmd_op, ubq->q_id,
1157 req->tag, io->flags);
1158 ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
1159 return;
1160 }
1161 /*
1162 * We have handled UBLK_IO_NEED_GET_DATA command,
1163 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
1164 * do the copy work.
1165 */
1166 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
1167 /* update iod->addr because ublksrv may have passed a new io buffer */
1168 ublk_get_iod(ubq, tag: req->tag)->addr = io->addr;
1169 pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
1170 __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
1171 ublk_get_iod(ubq, req->tag)->addr);
1172 }
1173
1174 mapped_bytes = ublk_map_io(ubq, req, io);
1175
1176 /* partially mapped, update io descriptor */
1177 if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1178 /*
1179 * Nothing mapped, retry until we succeed.
1180 *
1181 * We may never succeed in mapping any bytes here because
1182 * of OOM. TODO: reserve one buffer with single page pinned
1183 * for providing forward progress guarantee.
1184 */
1185 if (unlikely(!mapped_bytes)) {
1186 blk_mq_requeue_request(rq: req, kick_requeue_list: false);
1187 blk_mq_delay_kick_requeue_list(q: req->q,
1188 UBLK_REQUEUE_DELAY_MS);
1189 return;
1190 }
1191
1192 ublk_get_iod(ubq, tag: req->tag)->nr_sectors =
1193 mapped_bytes >> 9;
1194 }
1195
1196 ublk_init_req_ref(ubq, req);
1197 ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
1198}
1199
1200static inline void ublk_forward_io_cmds(struct ublk_queue *ubq,
1201 unsigned issue_flags)
1202{
1203 struct llist_node *io_cmds = llist_del_all(head: &ubq->io_cmds);
1204 struct ublk_rq_data *data, *tmp;
1205
1206 io_cmds = llist_reverse_order(head: io_cmds);
1207 llist_for_each_entry_safe(data, tmp, io_cmds, node)
1208 __ublk_rq_task_work(req: blk_mq_rq_from_pdu(pdu: data), issue_flags);
1209}
1210
1211static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
1212{
1213 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(ioucmd: cmd);
1214 struct ublk_queue *ubq = pdu->ubq;
1215
1216 ublk_forward_io_cmds(ubq, issue_flags);
1217}
1218
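/*
 * Add the request to the queue's lockless io_cmds list; only the caller that
 * turns the list from empty to non-empty schedules task work in the ubq_daemon
 * context, which then forwards the whole batch.
 */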
1219static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
1220{
1221 struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
1222
1223 if (llist_add(new: &data->node, head: &ubq->io_cmds)) {
1224 struct ublk_io *io = &ubq->ios[rq->tag];
1225
1226 io_uring_cmd_complete_in_task(ioucmd: io->cmd, task_work_cb: ublk_rq_task_work_cb);
1227 }
1228}
1229
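/*
 * Request timeout handler: for unprivileged devices the daemon is killed, so
 * a stuck or malicious server cannot hang requests forever; otherwise requests
 * are aborted only when the daemon is dying and every io command is in flight.
 */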
1230static enum blk_eh_timer_return ublk_timeout(struct request *rq)
1231{
1232 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
1233 unsigned int nr_inflight = 0;
1234 int i;
1235
1236 if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
1237 if (!ubq->timeout) {
1238 send_sig(SIGKILL, ubq->ubq_daemon, 0);
1239 ubq->timeout = true;
1240 }
1241
1242 return BLK_EH_DONE;
1243 }
1244
1245 if (!ubq_daemon_is_dying(ubq))
1246 return BLK_EH_RESET_TIMER;
1247
1248 for (i = 0; i < ubq->q_depth; i++) {
1249 struct ublk_io *io = &ubq->ios[i];
1250
1251 if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
1252 nr_inflight++;
1253 }
1254
1255 /* cancelable uring_cmd can't help us if all commands are in-flight */
1256 if (nr_inflight == ubq->q_depth) {
1257 struct ublk_device *ub = ubq->dev;
1258
1259 if (ublk_abort_requests(ub, ubq)) {
1260 if (ublk_can_use_recovery(ub))
1261 schedule_work(work: &ub->quiesce_work);
1262 else
1263 schedule_work(work: &ub->stop_work);
1264 }
1265 return BLK_EH_DONE;
1266 }
1267
1268 return BLK_EH_RESET_TIMER;
1269}
1270
1271static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
1272 const struct blk_mq_queue_data *bd)
1273{
1274 struct ublk_queue *ubq = hctx->driver_data;
1275 struct request *rq = bd->rq;
1276 blk_status_t res;
1277
1278 /* fill iod to slot in io cmd buffer */
1279 res = ublk_setup_iod(ubq, req: rq);
1280 if (unlikely(res != BLK_STS_OK))
1281 return BLK_STS_IOERR;
1282
	/* With the recovery feature enabled, force_abort is set in
	 * ublk_stop_dev() before calling del_gendisk(). We have to
	 * abort all requeued and new rqs here to let del_gendisk()
	 * move on. Besides, we cannot call io_uring_cmd_complete_in_task()
	 * here, in order to avoid a UAF on the io_uring ctx.
	 *
	 * Note: force_abort is guaranteed to be seen because it is set
	 * before the request queue is unquiesced.
	 */
1292 if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
1293 return BLK_STS_IOERR;
1294
1295 if (unlikely(ubq->canceling)) {
1296 __ublk_abort_rq(ubq, rq);
1297 return BLK_STS_OK;
1298 }
1299
1300 blk_mq_start_request(rq: bd->rq);
1301 ublk_queue_cmd(ubq, rq);
1302
1303 return BLK_STS_OK;
1304}
1305
1306static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
1307 unsigned int hctx_idx)
1308{
1309 struct ublk_device *ub = driver_data;
1310 struct ublk_queue *ubq = ublk_get_queue(dev: ub, qid: hctx->queue_num);
1311
1312 hctx->driver_data = ubq;
1313 return 0;
1314}
1315
1316static const struct blk_mq_ops ublk_mq_ops = {
1317 .queue_rq = ublk_queue_rq,
1318 .init_hctx = ublk_init_hctx,
1319 .timeout = ublk_timeout,
1320};
1321
1322static int ublk_ch_open(struct inode *inode, struct file *filp)
1323{
1324 struct ublk_device *ub = container_of(inode->i_cdev,
1325 struct ublk_device, cdev);
1326
1327 if (test_and_set_bit(UB_STATE_OPEN, addr: &ub->state))
1328 return -EBUSY;
1329 filp->private_data = ub;
1330 return 0;
1331}
1332
1333static int ublk_ch_release(struct inode *inode, struct file *filp)
1334{
1335 struct ublk_device *ub = filp->private_data;
1336
1337 clear_bit(UB_STATE_OPEN, addr: &ub->state);
1338 return 0;
1339}
1340
1341/* map pre-allocated per-queue cmd buffer to ublksrv daemon */
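/*
 * The mmap offset selects the queue: the byte offset (vm_pgoff << PAGE_SHIFT)
 * equals UBLKSRV_CMD_BUF_OFFSET + q_id * max_sz, which is how q_id is
 * recovered below.
 */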
1342static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
1343{
1344 struct ublk_device *ub = filp->private_data;
1345 size_t sz = vma->vm_end - vma->vm_start;
1346 unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
1347 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
1348 int q_id, ret = 0;
1349
1350 spin_lock(lock: &ub->lock);
1351 if (!ub->mm)
1352 ub->mm = current->mm;
1353 if (current->mm != ub->mm)
1354 ret = -EINVAL;
1355 spin_unlock(lock: &ub->lock);
1356
1357 if (ret)
1358 return ret;
1359
1360 if (vma->vm_flags & VM_WRITE)
1361 return -EPERM;
1362
1363 end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
1364 if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
1365 return -EINVAL;
1366
1367 q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
1368 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
1369 __func__, q_id, current->pid, vma->vm_start,
1370 phys_off, (unsigned long)sz);
1371
1372 if (sz != ublk_queue_cmd_buf_size(ub, q_id))
1373 return -EINVAL;
1374
1375 pfn = virt_to_phys(address: ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
1376 return remap_pfn_range(vma, addr: vma->vm_start, pfn, size: sz, vma->vm_page_prot);
1377}
1378
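/*
 * Handle the commit part of COMMIT_AND_FETCH: record the server's result and
 * drop the request reference, completing the blk-mq request once the last
 * reference is gone.
 */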
1379static void ublk_commit_completion(struct ublk_device *ub,
1380 const struct ublksrv_io_cmd *ub_cmd)
1381{
1382 u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
1383 struct ublk_queue *ubq = ublk_get_queue(dev: ub, qid);
1384 struct ublk_io *io = &ubq->ios[tag];
1385 struct request *req;
1386
	/* now this cmd slot is owned by the ublk driver */
1388 io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
1389 io->res = ub_cmd->result;
1390
1391 /* find the io request and complete */
1392 req = blk_mq_tag_to_rq(tags: ub->tag_set.tags[qid], tag);
1393 if (WARN_ON_ONCE(unlikely(!req)))
1394 return;
1395
1396 if (req_op(req) == REQ_OP_ZONE_APPEND)
1397 req->__sector = ub_cmd->zone_append_lba;
1398
1399 if (likely(!blk_should_fake_timeout(req->q)))
1400 ublk_put_req_ref(ubq, req);
1401}
1402
/*
 * Called from ubq_daemon context via the cancel fn, and meanwhile the ublk
 * blk-mq queue is quiesced, so we run exclusively with respect to both the
 * blk-mq and ubq_daemon contexts, which means everything is serialized.
 */
1408static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
1409{
1410 int i;
1411
1412 for (i = 0; i < ubq->q_depth; i++) {
1413 struct ublk_io *io = &ubq->ios[i];
1414
1415 if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
1416 struct request *rq;
1417
1418 /*
1419 * Either we fail the request or ublk_rq_task_work_fn
1420 * will do it
1421 */
1422 rq = blk_mq_tag_to_rq(tags: ub->tag_set.tags[ubq->q_id], tag: i);
1423 if (rq && blk_mq_request_started(rq)) {
1424 io->flags |= UBLK_IO_FLAG_ABORTED;
1425 __ublk_fail_req(ubq, io, req: rq);
1426 }
1427 }
1428 }
1429}
1430
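/*
 * Mark the queue as canceling and abort its started requests. Returns false
 * if the queue was already being canceled or the disk is gone.
 */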
1431static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
1432{
1433 struct gendisk *disk;
1434
1435 spin_lock(lock: &ubq->cancel_lock);
1436 if (ubq->canceling) {
1437 spin_unlock(lock: &ubq->cancel_lock);
1438 return false;
1439 }
1440 ubq->canceling = true;
1441 spin_unlock(lock: &ubq->cancel_lock);
1442
1443 spin_lock(lock: &ub->lock);
1444 disk = ub->ub_disk;
1445 if (disk)
1446 get_device(disk_to_dev(disk));
1447 spin_unlock(lock: &ub->lock);
1448
1449 /* Our disk has been dead */
1450 if (!disk)
1451 return false;
1452
1453 /* Now we are serialized with ublk_queue_rq() */
1454 blk_mq_quiesce_queue(q: disk->queue);
1455 /* abort queue is for making forward progress */
1456 ublk_abort_queue(ub, ubq);
1457 blk_mq_unquiesce_queue(q: disk->queue);
1458 put_device(disk_to_dev(disk));
1459
1460 return true;
1461}
1462
1463static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
1464 unsigned int issue_flags)
1465{
1466 bool done;
1467
1468 if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
1469 return;
1470
1471 spin_lock(lock: &ubq->cancel_lock);
1472 done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
1473 if (!done)
1474 io->flags |= UBLK_IO_FLAG_CANCELED;
1475 spin_unlock(lock: &ubq->cancel_lock);
1476
1477 if (!done)
1478 io_uring_cmd_done(cmd: io->cmd, UBLK_IO_RES_ABORT, res2: 0, issue_flags);
1479}
1480
1481/*
1482 * The ublk char device won't be closed when calling cancel fn, so both
1483 * ublk device and queue are guaranteed to be live
1484 */
1485static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
1486 unsigned int issue_flags)
1487{
1488 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(ioucmd: cmd);
1489 struct ublk_queue *ubq = pdu->ubq;
1490 struct task_struct *task;
1491 struct ublk_device *ub;
1492 bool need_schedule;
1493 struct ublk_io *io;
1494
1495 if (WARN_ON_ONCE(!ubq))
1496 return;
1497
1498 if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
1499 return;
1500
1501 task = io_uring_cmd_get_task(cmd);
1502 if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
1503 return;
1504
1505 ub = ubq->dev;
1506 need_schedule = ublk_abort_requests(ub, ubq);
1507
1508 io = &ubq->ios[pdu->tag];
1509 WARN_ON_ONCE(io->cmd != cmd);
1510 ublk_cancel_cmd(ubq, io, issue_flags);
1511
1512 if (need_schedule) {
1513 if (ublk_can_use_recovery(ub))
1514 schedule_work(work: &ub->quiesce_work);
1515 else
1516 schedule_work(work: &ub->stop_work);
1517 }
1518}
1519
1520static inline bool ublk_queue_ready(struct ublk_queue *ubq)
1521{
1522 return ubq->nr_io_ready == ubq->q_depth;
1523}
1524
1525static void ublk_cancel_queue(struct ublk_queue *ubq)
1526{
1527 int i;
1528
1529 for (i = 0; i < ubq->q_depth; i++)
1530 ublk_cancel_cmd(ubq, io: &ubq->ios[i], issue_flags: IO_URING_F_UNLOCKED);
1531}
1532
1533/* Cancel all pending commands, must be called after del_gendisk() returns */
1534static void ublk_cancel_dev(struct ublk_device *ub)
1535{
1536 int i;
1537
1538 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1539 ublk_cancel_queue(ubq: ublk_get_queue(dev: ub, qid: i));
1540}
1541
1542static bool ublk_check_inflight_rq(struct request *rq, void *data)
1543{
1544 bool *idle = data;
1545
1546 if (blk_mq_request_started(rq)) {
1547 *idle = false;
1548 return false;
1549 }
1550 return true;
1551}
1552
1553static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
1554{
1555 bool idle;
1556
1557 WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
1558 while (true) {
1559 idle = true;
1560 blk_mq_tagset_busy_iter(tagset: &ub->tag_set,
1561 fn: ublk_check_inflight_rq, priv: &idle);
1562 if (idle)
1563 break;
1564 msleep(UBLK_REQUEUE_DELAY_MS);
1565 }
1566}
1567
1568static void __ublk_quiesce_dev(struct ublk_device *ub)
1569{
1570 pr_devel("%s: quiesce ub: dev_id %d state %s\n",
1571 __func__, ub->dev_info.dev_id,
1572 ub->dev_info.state == UBLK_S_DEV_LIVE ?
1573 "LIVE" : "QUIESCED");
1574 blk_mq_quiesce_queue(q: ub->ub_disk->queue);
1575 ublk_wait_tagset_rqs_idle(ub);
1576 ub->dev_info.state = UBLK_S_DEV_QUIESCED;
1577}
1578
1579static void ublk_quiesce_work_fn(struct work_struct *work)
1580{
1581 struct ublk_device *ub =
1582 container_of(work, struct ublk_device, quiesce_work);
1583
1584 mutex_lock(&ub->mutex);
1585 if (ub->dev_info.state != UBLK_S_DEV_LIVE)
1586 goto unlock;
1587 __ublk_quiesce_dev(ub);
1588 unlock:
1589 mutex_unlock(lock: &ub->mutex);
1590 ublk_cancel_dev(ub);
1591}
1592
1593static void ublk_unquiesce_dev(struct ublk_device *ub)
1594{
1595 int i;
1596
1597 pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
1598 __func__, ub->dev_info.dev_id,
1599 ub->dev_info.state == UBLK_S_DEV_LIVE ?
1600 "LIVE" : "QUIESCED");
	/* quiesce_work has run. We let requeued rqs be aborted
	 * before running fallback_wq. "force_abort" must be seen
	 * after the request queue is unquiesced. Then del_gendisk()
	 * can move on.
	 */
1606 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1607 ublk_get_queue(dev: ub, qid: i)->force_abort = true;
1608
1609 blk_mq_unquiesce_queue(q: ub->ub_disk->queue);
1610 /* We may have requeued some rqs in ublk_quiesce_queue() */
1611 blk_mq_kick_requeue_list(q: ub->ub_disk->queue);
1612}
1613
1614static void ublk_stop_dev(struct ublk_device *ub)
1615{
1616 struct gendisk *disk;
1617
1618 mutex_lock(&ub->mutex);
1619 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
1620 goto unlock;
1621 if (ublk_can_use_recovery(ub)) {
1622 if (ub->dev_info.state == UBLK_S_DEV_LIVE)
1623 __ublk_quiesce_dev(ub);
1624 ublk_unquiesce_dev(ub);
1625 }
1626 del_gendisk(gp: ub->ub_disk);
1627
1628 /* Sync with ublk_abort_queue() by holding the lock */
1629 spin_lock(lock: &ub->lock);
1630 disk = ub->ub_disk;
1631 ub->dev_info.state = UBLK_S_DEV_DEAD;
1632 ub->dev_info.ublksrv_pid = -1;
1633 ub->ub_disk = NULL;
1634 spin_unlock(lock: &ub->lock);
1635 put_disk(disk);
1636 unlock:
1637 mutex_unlock(lock: &ub->mutex);
1638 ublk_cancel_dev(ub);
1639}
1640
1641/* device can only be started after all IOs are ready */
1642static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
1643{
1644 mutex_lock(&ub->mutex);
1645 ubq->nr_io_ready++;
1646 if (ublk_queue_ready(ubq)) {
1647 ubq->ubq_daemon = current;
1648 get_task_struct(t: ubq->ubq_daemon);
1649 ub->nr_queues_ready++;
1650
1651 if (capable(CAP_SYS_ADMIN))
1652 ub->nr_privileged_daemon++;
1653 }
1654 if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
1655 complete_all(&ub->completion);
1656 mutex_unlock(lock: &ub->mutex);
1657}
1658
1659static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
1660 int tag)
1661{
1662 struct ublk_queue *ubq = ublk_get_queue(dev: ub, qid: q_id);
1663 struct request *req = blk_mq_tag_to_rq(tags: ub->tag_set.tags[q_id], tag);
1664
1665 ublk_queue_cmd(ubq, rq: req);
1666}
1667
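/*
 * Command opcodes are ioctl-encoded with type 'u'; the legacy plain opcodes
 * (type 0) are only accepted when CONFIG_BLKDEV_UBLK_LEGACY_OPCODES is
 * enabled.
 */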
1668static inline int ublk_check_cmd_op(u32 cmd_op)
1669{
1670 u32 ioc_type = _IOC_TYPE(cmd_op);
1671
1672 if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
1673 return -EOPNOTSUPP;
1674
1675 if (ioc_type != 'u' && ioc_type != 0)
1676 return -EOPNOTSUPP;
1677
1678 return 0;
1679}
1680
1681static inline void ublk_fill_io_cmd(struct ublk_io *io,
1682 struct io_uring_cmd *cmd, unsigned long buf_addr)
1683{
1684 io->cmd = cmd;
1685 io->flags |= UBLK_IO_FLAG_ACTIVE;
1686 io->addr = buf_addr;
1687}
1688
1689static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
1690 unsigned int issue_flags,
1691 struct ublk_queue *ubq, unsigned int tag)
1692{
1693 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(ioucmd: cmd);
1694
	/*
	 * Safe to refer to @ubq since the ublk_queue won't be freed until
	 * its commands are completed
	 */
1699 pdu->ubq = ubq;
1700 pdu->tag = tag;
1701 io_uring_cmd_mark_cancelable(cmd, issue_flags);
1702}
1703
1704static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
1705 unsigned int issue_flags,
1706 const struct ublksrv_io_cmd *ub_cmd)
1707{
1708 struct ublk_device *ub = cmd->file->private_data;
1709 struct ublk_queue *ubq;
1710 struct ublk_io *io;
1711 u32 cmd_op = cmd->cmd_op;
1712 unsigned tag = ub_cmd->tag;
1713 int ret = -EINVAL;
1714 struct request *req;
1715
1716 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
1717 __func__, cmd->cmd_op, ub_cmd->q_id, tag,
1718 ub_cmd->result);
1719
1720 if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
1721 goto out;
1722
1723 ubq = ublk_get_queue(dev: ub, qid: ub_cmd->q_id);
1724 if (!ubq || ub_cmd->q_id != ubq->q_id)
1725 goto out;
1726
1727 if (ubq->ubq_daemon && ubq->ubq_daemon != current)
1728 goto out;
1729
1730 if (tag >= ubq->q_depth)
1731 goto out;
1732
1733 io = &ubq->ios[tag];
1734
1735 /* there is pending io cmd, something must be wrong */
1736 if (io->flags & UBLK_IO_FLAG_ACTIVE) {
1737 ret = -EBUSY;
1738 goto out;
1739 }
1740
	/*
	 * Ensure that the user issues UBLK_IO_NEED_GET_DATA
	 * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
	 */
1745 if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
1746 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
1747 goto out;
1748
1749 ret = ublk_check_cmd_op(cmd_op);
1750 if (ret)
1751 goto out;
1752
1753 ret = -EINVAL;
1754 switch (_IOC_NR(cmd_op)) {
1755 case UBLK_IO_FETCH_REQ:
1756 /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
1757 if (ublk_queue_ready(ubq)) {
1758 ret = -EBUSY;
1759 goto out;
1760 }
1761 /*
1762 * The io is being handled by server, so COMMIT_RQ is expected
1763 * instead of FETCH_REQ
1764 */
1765 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
1766 goto out;
1767
1768 if (!ublk_support_user_copy(ubq)) {
1769 /*
1770 * FETCH_RQ has to provide IO buffer if NEED GET
1771 * DATA is not enabled
1772 */
1773 if (!ub_cmd->addr && !ublk_need_get_data(ubq))
1774 goto out;
1775 } else if (ub_cmd->addr) {
1776 /* User copy requires addr to be unset */
1777 ret = -EINVAL;
1778 goto out;
1779 }
1780
1781 ublk_fill_io_cmd(io, cmd, buf_addr: ub_cmd->addr);
1782 ublk_mark_io_ready(ub, ubq);
1783 break;
1784 case UBLK_IO_COMMIT_AND_FETCH_REQ:
1785 req = blk_mq_tag_to_rq(tags: ub->tag_set.tags[ub_cmd->q_id], tag);
1786
1787 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1788 goto out;
1789
1790 if (!ublk_support_user_copy(ubq)) {
1791 /*
1792 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
1793 * NEED GET DATA is not enabled or it is Read IO.
1794 */
1795 if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
1796 req_op(req) == REQ_OP_READ))
1797 goto out;
1798 } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
1799 /*
1800 * User copy requires addr to be unset when command is
1801 * not zone append
1802 */
1803 ret = -EINVAL;
1804 goto out;
1805 }
1806
1807 ublk_fill_io_cmd(io, cmd, buf_addr: ub_cmd->addr);
1808 ublk_commit_completion(ub, ub_cmd);
1809 break;
1810 case UBLK_IO_NEED_GET_DATA:
1811 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1812 goto out;
1813 ublk_fill_io_cmd(io, cmd, buf_addr: ub_cmd->addr);
1814 ublk_handle_need_get_data(ub, q_id: ub_cmd->q_id, tag: ub_cmd->tag);
1815 break;
1816 default:
1817 goto out;
1818 }
1819 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
1820 return -EIOCBQUEUED;
1821
1822 out:
1823 io_uring_cmd_done(cmd, ret, res2: 0, issue_flags);
1824 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
1825 __func__, cmd_op, tag, ret, io->flags);
1826 return -EIOCBQUEUED;
1827}
1828
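/*
 * Look up the request addressed by a user-copy read()/write() and grab a
 * reference on it; NULL is returned unless the request has been started, has
 * data and covers the requested offset.
 */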
1829static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
1830 struct ublk_queue *ubq, int tag, size_t offset)
1831{
1832 struct request *req;
1833
1834 if (!ublk_need_req_ref(ubq))
1835 return NULL;
1836
1837 req = blk_mq_tag_to_rq(tags: ub->tag_set.tags[ubq->q_id], tag);
1838 if (!req)
1839 return NULL;
1840
1841 if (!ublk_get_req_ref(ubq, req))
1842 return NULL;
1843
1844 if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
1845 goto fail_put;
1846
1847 if (!ublk_rq_has_data(rq: req))
1848 goto fail_put;
1849
1850 if (offset > blk_rq_bytes(rq: req))
1851 goto fail_put;
1852
1853 return req;
1854fail_put:
1855 ublk_put_req_ref(ubq, req);
1856 return NULL;
1857}
1858
1859static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
1860 unsigned int issue_flags)
1861{
1862 /*
1863 * Not necessary for async retry, but let's keep it simple and always
1864 * copy the values to avoid any potential reuse.
1865 */
1866 const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(sqe: cmd->sqe);
1867 const struct ublksrv_io_cmd ub_cmd = {
1868 .q_id = READ_ONCE(ub_src->q_id),
1869 .tag = READ_ONCE(ub_src->tag),
1870 .result = READ_ONCE(ub_src->result),
1871 .addr = READ_ONCE(ub_src->addr)
1872 };
1873
1874 WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
1875
1876 return __ublk_ch_uring_cmd(cmd, issue_flags, ub_cmd: &ub_cmd);
1877}
1878
1879static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
1880 unsigned int issue_flags)
1881{
1882 ublk_ch_uring_cmd_local(cmd, issue_flags);
1883}
1884
1885static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
1886{
1887 if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
1888 ublk_uring_cmd_cancel_fn(cmd, issue_flags);
1889 return 0;
1890 }
1891
	/* a well-implemented server won't run into the unlocked path */
1893 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
1894 io_uring_cmd_complete_in_task(ioucmd: cmd, task_work_cb: ublk_ch_uring_cmd_cb);
1895 return -EIOCBQUEUED;
1896 }
1897
1898 return ublk_ch_uring_cmd_local(cmd, issue_flags);
1899}
1900
1901static inline bool ublk_check_ubuf_dir(const struct request *req,
1902 int ubuf_dir)
1903{
1904 /* copy ubuf to request pages */
1905 if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
1906 ubuf_dir == ITER_SOURCE)
1907 return true;
1908
1909 /* copy request pages to ubuf */
1910 if ((req_op(req) == REQ_OP_WRITE ||
1911 req_op(req) == REQ_OP_ZONE_APPEND) &&
1912 ubuf_dir == ITER_DEST)
1913 return true;
1914
1915 return false;
1916}
1917
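/*
 * Decode the file position of a user-copy read()/write() into queue id, tag
 * and buffer offset, then validate the copy direction against the request.
 */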
1918static struct request *ublk_check_and_get_req(struct kiocb *iocb,
1919 struct iov_iter *iter, size_t *off, int dir)
1920{
1921 struct ublk_device *ub = iocb->ki_filp->private_data;
1922 struct ublk_queue *ubq;
1923 struct request *req;
1924 size_t buf_off;
1925 u16 tag, q_id;
1926
1927 if (!ub)
1928 return ERR_PTR(error: -EACCES);
1929
1930 if (!user_backed_iter(i: iter))
1931 return ERR_PTR(error: -EACCES);
1932
1933 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
1934 return ERR_PTR(error: -EACCES);
1935
1936 tag = ublk_pos_to_tag(pos: iocb->ki_pos);
1937 q_id = ublk_pos_to_hwq(pos: iocb->ki_pos);
1938 buf_off = ublk_pos_to_buf_off(pos: iocb->ki_pos);
1939
1940 if (q_id >= ub->dev_info.nr_hw_queues)
1941 return ERR_PTR(error: -EINVAL);
1942
1943 ubq = ublk_get_queue(dev: ub, qid: q_id);
1944 if (!ubq)
1945 return ERR_PTR(error: -EINVAL);
1946
1947 if (tag >= ubq->q_depth)
1948 return ERR_PTR(error: -EINVAL);
1949
1950 req = __ublk_check_and_get_req(ub, ubq, tag, offset: buf_off);
1951 if (!req)
1952 return ERR_PTR(error: -EINVAL);
1953
1954 if (!req->mq_hctx || !req->mq_hctx->driver_data)
1955 goto fail;
1956
1957 if (!ublk_check_ubuf_dir(req, ubuf_dir: dir))
1958 goto fail;
1959
1960 *off = buf_off;
1961 return req;
1962fail:
1963 ublk_put_req_ref(ubq, req);
1964 return ERR_PTR(error: -EACCES);
1965}
1966
1967static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
1968{
1969 struct ublk_queue *ubq;
1970 struct request *req;
1971 size_t buf_off;
1972 size_t ret;
1973
1974 req = ublk_check_and_get_req(iocb, iter: to, off: &buf_off, ITER_DEST);
1975 if (IS_ERR(ptr: req))
1976 return PTR_ERR(ptr: req);
1977
1978 ret = ublk_copy_user_pages(req, offset: buf_off, uiter: to, ITER_DEST);
1979 ubq = req->mq_hctx->driver_data;
1980 ublk_put_req_ref(ubq, req);
1981
1982 return ret;
1983}
1984
1985static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
1986{
1987 struct ublk_queue *ubq;
1988 struct request *req;
1989 size_t buf_off;
1990 size_t ret;
1991
1992 req = ublk_check_and_get_req(iocb, iter: from, off: &buf_off, ITER_SOURCE);
1993 if (IS_ERR(ptr: req))
1994 return PTR_ERR(ptr: req);
1995
1996 ret = ublk_copy_user_pages(req, offset: buf_off, uiter: from, ITER_SOURCE);
1997 ubq = req->mq_hctx->driver_data;
1998 ublk_put_req_ref(ubq, req);
1999
2000 return ret;
2001}

static const struct file_operations ublk_ch_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.llseek = no_llseek,
	.read_iter = ublk_ch_read_iter,
	.write_iter = ublk_ch_write_iter,
	.uring_cmd = ublk_ch_uring_cmd,
	.mmap = ublk_ch_mmap,
};

static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
	int size = ublk_queue_cmd_buf_size(ub, q_id);
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

	if (ubq->ubq_daemon)
		put_task_struct(ubq->ubq_daemon);
	if (ubq->io_cmd_buf)
		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
}

static int ublk_init_queue(struct ublk_device *ub, int q_id)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
	void *ptr;
	int size;

	spin_lock_init(&ubq->cancel_lock);
	ubq->flags = ub->dev_info.flags;
	ubq->q_id = q_id;
	ubq->q_depth = ub->dev_info.queue_depth;
	size = ublk_queue_cmd_buf_size(ub, q_id);

	ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
	if (!ptr)
		return -ENOMEM;

	ubq->io_cmd_buf = ptr;
	ubq->dev = ub;
	return 0;
}

static void ublk_deinit_queues(struct ublk_device *ub)
{
	int nr_queues = ub->dev_info.nr_hw_queues;
	int i;

	if (!ub->__queues)
		return;

	for (i = 0; i < nr_queues; i++)
		ublk_deinit_queue(ub, i);
	kfree(ub->__queues);
}

static int ublk_init_queues(struct ublk_device *ub)
{
	int nr_queues = ub->dev_info.nr_hw_queues;
	int depth = ub->dev_info.queue_depth;
	int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
	int i, ret = -ENOMEM;

	ub->queue_size = ubq_size;
	ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
	if (!ub->__queues)
		return ret;

	for (i = 0; i < nr_queues; i++) {
		if (ublk_init_queue(ub, i))
			goto fail;
	}

	init_completion(&ub->completion);
	return 0;

 fail:
	ublk_deinit_queues(ub);
	return ret;
}

static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
{
	int i = idx;
	int err;

	spin_lock(&ublk_idr_lock);
	/* allocate id, if @id >= 0, we're requesting that specific id */
	if (i >= 0) {
		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
		if (err == -ENOSPC)
			err = -EEXIST;
	} else {
		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
				GFP_NOWAIT);
	}
	spin_unlock(&ublk_idr_lock);

	if (err >= 0)
		ub->ub_number = err;

	return err;
}

static void ublk_free_dev_number(struct ublk_device *ub)
{
	spin_lock(&ublk_idr_lock);
	idr_remove(&ublk_index_idr, ub->ub_number);
	wake_up_all(&ublk_idr_wq);
	spin_unlock(&ublk_idr_lock);
}

static void ublk_cdev_rel(struct device *dev)
{
	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);

	blk_mq_free_tag_set(&ub->tag_set);
	ublk_deinit_queues(ub);
	ublk_free_dev_number(ub);
	mutex_destroy(&ub->mutex);
	kfree(ub);
}

static int ublk_add_chdev(struct ublk_device *ub)
{
	struct device *dev = &ub->cdev_dev;
	int minor = ub->ub_number;
	int ret;

	dev->parent = ublk_misc.this_device;
	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
	dev->class = &ublk_chr_class;
	dev->release = ublk_cdev_rel;
	device_initialize(dev);

	ret = dev_set_name(dev, "ublkc%d", minor);
	if (ret)
		goto fail;

	cdev_init(&ub->cdev, &ublk_ch_fops);
	ret = cdev_device_add(&ub->cdev, dev);
	if (ret)
		goto fail;

	ublks_added++;
	return 0;
 fail:
	put_device(dev);
	return ret;
}

static void ublk_stop_work_fn(struct work_struct *work)
{
	struct ublk_device *ub =
		container_of(work, struct ublk_device, stop_work);

	ublk_stop_dev(ub);
}

/* align max io buffer size with PAGE_SIZE */
static void ublk_align_max_io_size(struct ublk_device *ub)
{
	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;

	ub->dev_info.max_io_buf_bytes =
		round_down(max_io_bytes, PAGE_SIZE);
}
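
/*
 * For example, with 4 KiB pages a requested max_io_buf_bytes of 1049600
 * (1 MiB + 1 KiB) is rounded down to 1048576 (1 MiB); values that are
 * already page aligned pass through unchanged.
 */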

static int ublk_add_tag_set(struct ublk_device *ub)
{
	ub->tag_set.ops = &ublk_mq_ops;
	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
	ub->tag_set.numa_node = NUMA_NO_NODE;
	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	ub->tag_set.driver_data = ub;
	return blk_mq_alloc_tag_set(&ub->tag_set);
}

static void ublk_remove(struct ublk_device *ub)
{
	ublk_stop_dev(ub);
	cancel_work_sync(&ub->stop_work);
	cancel_work_sync(&ub->quiesce_work);
	cdev_device_del(&ub->cdev, &ub->cdev_dev);
	put_device(&ub->cdev_dev);
	ublks_added--;
}

static struct ublk_device *ublk_get_device_from_id(int idx)
{
	struct ublk_device *ub = NULL;

	if (idx < 0)
		return NULL;

	spin_lock(&ublk_idr_lock);
	ub = idr_find(&ublk_index_idr, idx);
	if (ub)
		ub = ublk_get_device(ub);
	spin_unlock(&ublk_idr_lock);

	return ub;
}

static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
	int ublksrv_pid = (int)header->data[0];
	struct gendisk *disk;
	int ret = -EINVAL;

	if (ublksrv_pid <= 0)
		return -EINVAL;

	if (wait_for_completion_interruptible(&ub->completion) != 0)
		return -EINTR;

	mutex_lock(&ub->mutex);
	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
	    test_bit(UB_STATE_USED, &ub->state)) {
		ret = -EEXIST;
		goto out_unlock;
	}

	disk = blk_mq_alloc_disk(&ub->tag_set, NULL);
	if (IS_ERR(disk)) {
		ret = PTR_ERR(disk);
		goto out_unlock;
	}
	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
	disk->fops = &ub_fops;
	disk->private_data = ub;

	ub->dev_info.ublksrv_pid = ublksrv_pid;
	ub->ub_disk = disk;

	ret = ublk_apply_params(ub);
	if (ret)
		goto out_put_disk;

	/* don't probe partitions if any one ubq daemon is un-trusted */
	if (ub->nr_privileged_daemon != ub->nr_queues_ready)
		set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);

	get_device(&ub->cdev_dev);
	ub->dev_info.state = UBLK_S_DEV_LIVE;

	if (ublk_dev_is_zoned(ub)) {
		ret = ublk_revalidate_disk_zones(ub);
		if (ret)
			goto out_put_cdev;
	}

	ret = add_disk(disk);
	if (ret)
		goto out_put_cdev;

	set_bit(UB_STATE_USED, &ub->state);

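	/*
	 * Success path: fall through the labels below; every cleanup step
	 * there is guarded by "if (ret)", so nothing is undone when the
	 * disk was added successfully.
	 */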
out_put_cdev:
	if (ret) {
		ub->dev_info.state = UBLK_S_DEV_DEAD;
		ublk_put_device(ub);
	}
out_put_disk:
	if (ret)
		put_disk(disk);
out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}

static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
		struct io_uring_cmd *cmd)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
	void __user *argp = (void __user *)(unsigned long)header->addr;
	cpumask_var_t cpumask;
	unsigned long queue;
	unsigned int retlen;
	unsigned int i;
	int ret;

	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
		return -EINVAL;
	if (header->len & (sizeof(unsigned long)-1))
		return -EINVAL;
	if (!header->addr)
		return -EINVAL;

	queue = header->data[0];
	if (queue >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
		return -ENOMEM;

	for_each_possible_cpu(i) {
		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
			cpumask_set_cpu(i, cpumask);
	}

	ret = -EFAULT;
	retlen = min_t(unsigned short, header->len, cpumask_size());
	if (copy_to_user(argp, cpumask, retlen))
		goto out_free_cpumask;
	if (retlen != header->len &&
	    clear_user(argp + retlen, header->len - retlen))
		goto out_free_cpumask;

	ret = 0;
out_free_cpumask:
	free_cpumask_var(cpumask);
	return ret;
}
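
/*
 * The returned bitmap tells the ublk server which CPUs the blk-mq layer
 * maps to the requested hardware queue, so the per-queue daemon thread can
 * be pinned accordingly (e.g. with pthread_setaffinity_np() in userspace).
 * There is no state check above, so GET_QUEUE_AFFINITY is a read-only query
 * that can be issued any time after ADD_DEV.
 */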

static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
{
	pr_devel("%s: dev id %d flags %llx\n", __func__,
			info->dev_id, info->flags);
	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
			info->nr_hw_queues, info->queue_depth);
}

static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublksrv_ctrl_dev_info info;
	struct ublk_device *ub;
	int ret = -EINVAL;

	if (header->len < sizeof(info) || !header->addr)
		return -EINVAL;
	if (header->queue_id != (u16)-1) {
		pr_warn("%s: queue_id is wrong %x\n",
			__func__, header->queue_id);
		return -EINVAL;
	}

	if (copy_from_user(&info, argp, sizeof(info)))
		return -EFAULT;

	if (capable(CAP_SYS_ADMIN))
		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
		return -EPERM;

	/*
	 * An unprivileged device can't be trusted, and RECOVERY and
	 * RECOVERY_REISSUE may still hang error handling, so the recovery
	 * features can't be supported for unprivileged ublk now.
	 *
	 * TODO: provide forward progress for RECOVERY handler, so that
	 * unprivileged device can benefit from it
	 */
	if (info.flags & UBLK_F_UNPRIVILEGED_DEV)
		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
				UBLK_F_USER_RECOVERY);

	/* the created device is always owned by current user */
	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);

	if (header->dev_id != info.dev_id) {
		pr_warn("%s: dev id not match %u %u\n",
			__func__, header->dev_id, info.dev_id);
		return -EINVAL;
	}

	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
		pr_warn("%s: dev id is too large. Max supported is %d\n",
			__func__, UBLK_MAX_UBLKS - 1);
		return -EINVAL;
	}

	ublk_dump_dev_info(&info);

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	ret = -EACCES;
	if (ublks_added >= ublks_max)
		goto out_unlock;

	ret = -ENOMEM;
	ub = kzalloc(sizeof(*ub), GFP_KERNEL);
	if (!ub)
		goto out_unlock;
	mutex_init(&ub->mutex);
	spin_lock_init(&ub->lock);
	INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);

	ret = ublk_alloc_dev_number(ub, header->dev_id);
	if (ret < 0)
		goto out_free_ub;

	memcpy(&ub->dev_info, &info, sizeof(info));

	/* update device id */
	ub->dev_info.dev_id = ub->ub_number;

	/*
	 * 64bit flags will be copied back to userspace as feature
	 * negotiation result, so have to clear flags which driver
	 * doesn't support yet, then userspace can get correct flags
	 * (features) to handle.
	 */
	ub->dev_info.flags &= UBLK_F_ALL;

	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
		UBLK_F_URING_CMD_COMP_IN_TASK;

	/* GET_DATA isn't needed any more with USER_COPY */
	if (ublk_dev_is_user_copy(ub))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/* Zoned storage support requires user copy feature */
	if (ublk_dev_is_zoned(ub) &&
	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
		ret = -EINVAL;
		goto out_free_dev_number;
	}

	/* We are not ready to support zero copy */
	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;

	ub->dev_info.nr_hw_queues = min_t(unsigned int,
			ub->dev_info.nr_hw_queues, nr_cpu_ids);
	ublk_align_max_io_size(ub);

	ret = ublk_init_queues(ub);
	if (ret)
		goto out_free_dev_number;

	ret = ublk_add_tag_set(ub);
	if (ret)
		goto out_deinit_queues;

	ret = -EFAULT;
	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
		goto out_free_tag_set;

	/*
	 * Add the char dev so that the ublksrv daemon can be set up.
	 * ublk_add_chdev() will clean up everything if it fails.
	 */
	ret = ublk_add_chdev(ub);
	goto out_unlock;

out_free_tag_set:
	blk_mq_free_tag_set(&ub->tag_set);
out_deinit_queues:
	ublk_deinit_queues(ub);
out_free_dev_number:
	ublk_free_dev_number(ub);
out_free_ub:
	mutex_destroy(&ub->mutex);
	kfree(ub);
out_unlock:
	mutex_unlock(&ublk_ctl_mutex);
	return ret;
}
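
/*
 * Illustrative sketch (not part of the driver): a ublk server typically
 * issues ADD_DEV as an io_uring passthrough command on /dev/ublk-control
 * from a ring created with IORING_SETUP_SQE128, roughly:
 *
 *	struct ublksrv_ctrl_dev_info info = {
 *		.dev_id = -1,			// let the driver pick an id
 *		.nr_hw_queues = 1,
 *		.queue_depth = 128,
 *		.max_io_buf_bytes = 512 << 10,
 *	};
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	struct ublksrv_ctrl_cmd *ctrl = (void *)&sqe->cmd;
 *
 *	sqe->opcode = IORING_OP_URING_CMD;
 *	sqe->fd = ctrl_fd;			// open("/dev/ublk-control")
 *	sqe->cmd_op = UBLK_U_CMD_ADD_DEV;
 *	ctrl->dev_id = -1;
 *	ctrl->queue_id = (__u16)-1;		// required by the check above
 *	ctrl->addr = (__u64)(uintptr_t)&info;
 *	ctrl->len = sizeof(info);
 *
 * On completion the driver copies the negotiated dev_info (allocated dev_id
 * plus the effective feature flags) back to @info.  This is only a sketch
 * of the UAPI flow; field and helper names outside this file are
 * assumptions to be checked against liburing and ublk_cmd.h.
 */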

static inline bool ublk_idr_freed(int id)
{
	void *ptr;

	spin_lock(&ublk_idr_lock);
	ptr = idr_find(&ublk_index_idr, id);
	spin_unlock(&ublk_idr_lock);

	return ptr == NULL;
}

static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
{
	struct ublk_device *ub = *p_ub;
	int idx = ub->ub_number;
	int ret;

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
		ublk_remove(ub);
		set_bit(UB_STATE_DELETED, &ub->state);
	}

	/* Mark the reference as consumed */
	*p_ub = NULL;
	ublk_put_device(ub);
	mutex_unlock(&ublk_ctl_mutex);

	/*
	 * Wait until the idr entry is removed, so the device number can be
	 * reused once the DEL_DEV command returns.
	 *
	 * If we return because of user interrupt, a future delete command
	 * may come:
	 *
	 * - the device number isn't freed, this device won't or needn't
	 *   be deleted again, since UB_STATE_DELETED is set, and device
	 *   will be released after the last reference is dropped
	 *
	 * - the device number is freed already, we will not find this
	 *   device via ublk_get_device_from_id()
	 */
	if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
		return -EINTR;
	return 0;
}

static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);

	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
			__func__, cmd->cmd_op, header->dev_id, header->queue_id,
			header->data[0], header->addr, header->len);
}

static int ublk_ctrl_stop_dev(struct ublk_device *ub)
{
	ublk_stop_dev(ub);
	cancel_work_sync(&ub->stop_work);
	cancel_work_sync(&ub->quiesce_work);

	return 0;
}

static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
		struct io_uring_cmd *cmd)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
	void __user *argp = (void __user *)(unsigned long)header->addr;

	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
		return -EINVAL;

	if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
		return -EFAULT;

	return 0;
}

/* TYPE_DEVT is readonly, so fill it up before returning to userspace */
static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
{
	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);

	if (ub->ub_disk) {
		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
	} else {
		ub->params.devt.disk_major = 0;
		ub->params.devt.disk_minor = 0;
	}
	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
}

static int ublk_ctrl_get_params(struct ublk_device *ub,
		struct io_uring_cmd *cmd)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_params_header ph;
	int ret;

	if (header->len <= sizeof(ph) || !header->addr)
		return -EINVAL;

	if (copy_from_user(&ph, argp, sizeof(ph)))
		return -EFAULT;

	if (ph.len > header->len || !ph.len)
		return -EINVAL;

	if (ph.len > sizeof(struct ublk_params))
		ph.len = sizeof(struct ublk_params);

	mutex_lock(&ub->mutex);
	ublk_ctrl_fill_params_devt(ub);
	if (copy_to_user(argp, &ub->params, ph.len))
		ret = -EFAULT;
	else
		ret = 0;
	mutex_unlock(&ub->mutex);

	return ret;
}

static int ublk_ctrl_set_params(struct ublk_device *ub,
		struct io_uring_cmd *cmd)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_params_header ph;
	int ret = -EFAULT;

	if (header->len <= sizeof(ph) || !header->addr)
		return -EINVAL;

	if (copy_from_user(&ph, argp, sizeof(ph)))
		return -EFAULT;

	if (ph.len > header->len || !ph.len || !ph.types)
		return -EINVAL;

	if (ph.len > sizeof(struct ublk_params))
		ph.len = sizeof(struct ublk_params);

	/* parameters can only be changed when device isn't live */
	mutex_lock(&ub->mutex);
	if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
		ret = -EACCES;
	} else if (copy_from_user(&ub->params, argp, ph.len)) {
		ret = -EFAULT;
	} else {
		/* clear all we don't support yet */
		ub->params.types &= UBLK_PARAM_TYPE_ALL;
		ret = ublk_validate_params(ub);
		if (ret)
			ub->params.types = 0;
	}
	mutex_unlock(&ub->mutex);

	return ret;
}
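
/*
 * Both GET_PARAMS and SET_PARAMS transfer a struct ublk_params whose
 * leading ublk_params_header carries @len (how many bytes the caller's
 * buffer holds) and @types (which UBLK_PARAM_TYPE_* entries are valid).
 * As the code above shows, at most min(ph.len, sizeof(struct ublk_params))
 * bytes are copied, so a server built against a smaller ublk_params can
 * still talk to a newer driver, and vice versa.
 */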

static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
{
	int i;

	WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));

	/* All old ioucmds have to be completed */
	ubq->nr_io_ready = 0;
	/* old daemon is PF_EXITING, put it now */
	put_task_struct(ubq->ubq_daemon);
	/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
	ubq->ubq_daemon = NULL;
	ubq->timeout = false;
	ubq->canceling = false;

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];

		/* forget everything now and be ready for new FETCH_REQ */
		io->flags = 0;
		io->cmd = NULL;
		io->addr = 0;
	}
}

static int ublk_ctrl_start_recovery(struct ublk_device *ub,
		struct io_uring_cmd *cmd)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
	int ret = -EINVAL;
	int i;

	mutex_lock(&ub->mutex);
	if (!ublk_can_use_recovery(ub))
		goto out_unlock;
	/*
	 * START_RECOVERY is only allowed after:
	 *
	 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
	 *     released.
	 *
	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
	 *     (a) has quiesced request queue
	 *     (b) has requeued every inflight rq whose io_flags is ACTIVE
	 *     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
	 *     (d) has completed/canceled all ioucmds owned by the dying process
	 */
	if (test_bit(UB_STATE_OPEN, &ub->state) ||
	    ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
		ret = -EBUSY;
		goto out_unlock;
	}
	pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
	/* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
	ub->mm = NULL;
	ub->nr_queues_ready = 0;
	ub->nr_privileged_daemon = 0;
	init_completion(&ub->completion);
	ret = 0;
 out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}

static int ublk_ctrl_end_recovery(struct ublk_device *ub,
		struct io_uring_cmd *cmd)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
	int ublksrv_pid = (int)header->data[0];
	int ret = -EINVAL;

	pr_devel("%s: Waiting for new ubq_daemons(nr: %d) to be ready, dev id %d...\n",
			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
	/* wait until the new ubq_daemons have sent all their FETCH_REQ commands */
	if (wait_for_completion_interruptible(&ub->completion))
		return -EINTR;

	pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
			__func__, ub->dev_info.nr_hw_queues, header->dev_id);

	mutex_lock(&ub->mutex);
	if (!ublk_can_use_recovery(ub))
		goto out_unlock;

	if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
		ret = -EBUSY;
		goto out_unlock;
	}
	ub->dev_info.ublksrv_pid = ublksrv_pid;
	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
			__func__, ublksrv_pid, header->dev_id);
	blk_mq_unquiesce_queue(ub->ub_disk->queue);
	pr_devel("%s: queue unquiesced, dev id %d.\n",
			__func__, header->dev_id);
	blk_mq_kick_requeue_list(ub->ub_disk->queue);
	ub->dev_info.state = UBLK_S_DEV_LIVE;
	ret = 0;
 out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}

static int ublk_ctrl_get_features(struct io_uring_cmd *cmd)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
	void __user *argp = (void __user *)(unsigned long)header->addr;
	u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY;

	if (header->len != UBLK_FEATURES_LEN || !header->addr)
		return -EINVAL;

	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
		return -EFAULT;

	return 0;
}
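
/*
 * GET_FEATURES lets userspace probe driver capabilities before creating
 * any device; as the dispatcher below shows, it needs neither a dev_id nor
 * a char device path.  An illustrative check (the helper name is a
 * placeholder for the server's own data-path choice):
 *
 *	__u64 features = 0;
 *	// issue UBLK_U_CMD_GET_FEATURES with addr = &features,
 *	// len = UBLK_FEATURES_LEN
 *	if (features & UBLK_F_USER_COPY)
 *		use_pread_pwrite_data_path();
 */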

/*
 * All control commands are sent via /dev/ublk-control, so we have to check
 * the destination device's permission
 */
static int ublk_char_dev_permission(struct ublk_device *ub,
		const char *dev_path, int mask)
{
	int err;
	struct path path;
	struct kstat stat;

	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
	if (err)
		return err;

	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
	if (err)
		goto exit;

	err = -EPERM;
	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
		goto exit;

	err = inode_permission(&nop_mnt_idmap,
			d_backing_inode(path.dentry), mask);
exit:
	path_put(&path);
	return err;
}

static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
		struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
	void __user *argp = (void __user *)(unsigned long)header->addr;
	char *dev_path = NULL;
	int ret = 0;
	int mask;

	if (!unprivileged) {
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;
		/*
		 * The newly added UBLK_CMD_GET_DEV_INFO2 command carries
		 * char_dev_path in its payload too, since userspace may not
		 * know whether the specified device was created in
		 * unprivileged mode.
		 */
		if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
			return 0;
	}

	/*
	 * User has to provide the char device path for unprivileged ublk
	 *
	 * header->addr always points to the dev path buffer, and
	 * header->dev_path_len records length of dev path buffer.
	 */
	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
		return -EINVAL;

	if (header->len < header->dev_path_len)
		return -EINVAL;

	dev_path = memdup_user_nul(argp, header->dev_path_len);
	if (IS_ERR(dev_path))
		return PTR_ERR(dev_path);

	ret = -EINVAL;
	switch (_IOC_NR(cmd->cmd_op)) {
	case UBLK_CMD_GET_DEV_INFO:
	case UBLK_CMD_GET_DEV_INFO2:
	case UBLK_CMD_GET_QUEUE_AFFINITY:
	case UBLK_CMD_GET_PARAMS:
	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
		mask = MAY_READ;
		break;
	case UBLK_CMD_START_DEV:
	case UBLK_CMD_STOP_DEV:
	case UBLK_CMD_ADD_DEV:
	case UBLK_CMD_DEL_DEV:
	case UBLK_CMD_SET_PARAMS:
	case UBLK_CMD_START_USER_RECOVERY:
	case UBLK_CMD_END_USER_RECOVERY:
		mask = MAY_READ | MAY_WRITE;
		break;
	default:
		goto exit;
	}

	ret = ublk_char_dev_permission(ub, dev_path, mask);
	if (!ret) {
		header->len -= header->dev_path_len;
		header->addr += header->dev_path_len;
	}
	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
			__func__, ub->ub_number, cmd->cmd_op,
			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
			dev_path, ret);
exit:
	kfree(dev_path);
	return ret;
}

static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
	struct ublk_device *ub = NULL;
	u32 cmd_op = cmd->cmd_op;
	int ret = -EINVAL;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ublk_ctrl_cmd_dump(cmd);

	if (!(issue_flags & IO_URING_F_SQE128))
		goto out;

	ret = ublk_check_cmd_op(cmd_op);
	if (ret)
		goto out;

	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
		ret = ublk_ctrl_get_features(cmd);
		goto out;
	}

	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
		ret = -ENODEV;
		ub = ublk_get_device_from_id(header->dev_id);
		if (!ub)
			goto out;

		ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
		if (ret)
			goto put_dev;
	}

	switch (_IOC_NR(cmd_op)) {
	case UBLK_CMD_START_DEV:
		ret = ublk_ctrl_start_dev(ub, cmd);
		break;
	case UBLK_CMD_STOP_DEV:
		ret = ublk_ctrl_stop_dev(ub);
		break;
	case UBLK_CMD_GET_DEV_INFO:
	case UBLK_CMD_GET_DEV_INFO2:
		ret = ublk_ctrl_get_dev_info(ub, cmd);
		break;
	case UBLK_CMD_ADD_DEV:
		ret = ublk_ctrl_add_dev(cmd);
		break;
	case UBLK_CMD_DEL_DEV:
		ret = ublk_ctrl_del_dev(&ub);
		break;
	case UBLK_CMD_GET_QUEUE_AFFINITY:
		ret = ublk_ctrl_get_queue_affinity(ub, cmd);
		break;
	case UBLK_CMD_GET_PARAMS:
		ret = ublk_ctrl_get_params(ub, cmd);
		break;
	case UBLK_CMD_SET_PARAMS:
		ret = ublk_ctrl_set_params(ub, cmd);
		break;
	case UBLK_CMD_START_USER_RECOVERY:
		ret = ublk_ctrl_start_recovery(ub, cmd);
		break;
	case UBLK_CMD_END_USER_RECOVERY:
		ret = ublk_ctrl_end_recovery(ub, cmd);
		break;
	default:
		ret = -ENOTSUPP;
		break;
	}

 put_dev:
	if (ub)
		ublk_put_device(ub);
 out:
	io_uring_cmd_done(cmd, ret, 0, issue_flags);
	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
			__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
	return -EIOCBQUEUED;
}

static const struct file_operations ublk_ctl_fops = {
	.open = nonseekable_open,
	.uring_cmd = ublk_ctrl_uring_cmd,
	.owner = THIS_MODULE,
	.llseek = noop_llseek,
};

static struct miscdevice ublk_misc = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "ublk-control",
	.fops = &ublk_ctl_fops,
};

static int __init ublk_init(void)
{
	int ret;

	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);

	init_waitqueue_head(&ublk_idr_wq);

	ret = misc_register(&ublk_misc);
	if (ret)
		return ret;

	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
	if (ret)
		goto unregister_mis;

	ret = class_register(&ublk_chr_class);
	if (ret)
		goto free_chrdev_region;

	return 0;

free_chrdev_region:
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
unregister_mis:
	misc_deregister(&ublk_misc);
	return ret;
}

static void __exit ublk_exit(void)
{
	struct ublk_device *ub;
	int id;

	idr_for_each_entry(&ublk_index_idr, ub, id)
		ublk_remove(ub);

	class_unregister(&ublk_chr_class);
	misc_deregister(&ublk_misc);

	idr_destroy(&ublk_index_idr);
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
}

module_init(ublk_init);
module_exit(ublk_exit);

static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp)
{
	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
}

static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp)
{
	return sysfs_emit(buf, "%u\n", ublks_max);
}

static const struct kernel_param_ops ublk_max_ublks_ops = {
	.set = ublk_set_max_ublks,
	.get = ublk_get_max_ublks,
};

module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644);
MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add (default: 64)");
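
/*
 * ublks_max is writable at runtime (mode 0644), so an administrator can
 * raise or lower the limit without reloading the module, typically via
 * /sys/module/ublk_drv/parameters/ublks_max (path assumes the usual
 * ublk_drv module name); param_set_uint_minmax() above rejects values
 * outside the 0..UBLK_MAX_UBLKS range.
 */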

MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
MODULE_LICENSE("GPL");

