// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * RDMA Network Block Driver
 *
 * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
 * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
 * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
 */

#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
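/*
 * With the override above every pr_*() call in this file is prefixed
 * with the module name and the source line it was issued from, e.g.
 * (module name and line number purely illustrative):
 *
 *	rnbd_client L716: Unknown session event received (5), session: bla
 */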

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/scatterlist.h>
#include <linux/idr.h>

#include "rnbd-clt.h"

MODULE_DESCRIPTION("RDMA Network Block Device Client");
MODULE_LICENSE("GPL");

static int rnbd_client_major;
static DEFINE_IDA(index_ida);
static DEFINE_MUTEX(sess_lock);
static LIST_HEAD(sess_list);
static struct workqueue_struct *rnbd_clt_wq;

/*
 * Maximum number of partitions an instance can have.
 * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself)
 */
#define RNBD_PART_BITS		6
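
/*
 * Example (illustrative): with RNBD_PART_BITS == 6 the device with
 * clt_device_id 2 gets first_minor 2 << 6 == 128 and a range of 64
 * minors, i.e. the whole disk plus up to 63 partitions (see
 * rnbd_clt_setup_gen_disk()).
 */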

static inline bool rnbd_clt_get_sess(struct rnbd_clt_session *sess)
{
	return refcount_inc_not_zero(&sess->refcount);
}

static void free_sess(struct rnbd_clt_session *sess);

static void rnbd_clt_put_sess(struct rnbd_clt_session *sess)
{
	might_sleep();

	if (refcount_dec_and_test(&sess->refcount))
		free_sess(sess);
}

static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev)
{
	might_sleep();

	if (!refcount_dec_and_test(&dev->refcount))
		return;

	ida_free(&index_ida, dev->clt_device_id);
	kfree(dev->hw_queues);
	kfree(dev->pathname);
	rnbd_clt_put_sess(dev->sess);
	mutex_destroy(&dev->lock);
	kfree(dev);
}

static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev)
{
	return refcount_inc_not_zero(&dev->refcount);
}

static void rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
				     sector_t new_nsectors)
{
	if (get_capacity(dev->gd) == new_nsectors)
		return;

	/*
	 * If the size changed, we need to revalidate it
	 */
	rnbd_clt_info(dev, "Device size changed from %llu to %llu sectors\n",
		      get_capacity(dev->gd), new_nsectors);
	set_capacity_and_notify(dev->gd, new_nsectors);
}

static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
				struct rnbd_msg_open_rsp *rsp)
{
	struct kobject *gd_kobj;
	int err = 0;

	mutex_lock(&dev->lock);
	if (dev->dev_state == DEV_STATE_UNMAPPED) {
		rnbd_clt_info(dev,
			      "Ignoring Open-Response message from server for unmapped device\n");
		err = -ENOENT;
		goto out;
	}
	if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) {
		u64 nsectors = le64_to_cpu(rsp->nsectors);

		rnbd_clt_change_capacity(dev, nsectors);
		gd_kobj = &disk_to_dev(dev->gd)->kobj;
		kobject_uevent(gd_kobj, KOBJ_ONLINE);
		rnbd_clt_info(dev, "Device online, device remapped successfully\n");
	}
	if (!rsp->logical_block_size) {
		err = -EINVAL;
		goto out;
	}
	dev->device_id = le32_to_cpu(rsp->device_id);
	dev->dev_state = DEV_STATE_MAPPED;

out:
	mutex_unlock(&dev->lock);

	return err;
}

int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize)
{
	int ret = 0;

	mutex_lock(&dev->lock);
	if (dev->dev_state != DEV_STATE_MAPPED) {
		pr_err("Failed to set new size of the device, device is not opened\n");
		ret = -ENOENT;
		goto out;
	}
	rnbd_clt_change_capacity(dev, newsize);

out:
	mutex_unlock(&dev->lock);

	return ret;
}

static inline void rnbd_clt_dev_requeue(struct rnbd_queue *q)
{
	if (WARN_ON(!q->hctx))
		return;

	/* We can come here from interrupt, thus async=true */
	blk_mq_run_hw_queue(q->hctx, true);
}

enum {
	RNBD_DELAY_IFBUSY = -1,
};
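
/*
 * RNBD_DELAY_IFBUSY is a sentinel for the @delay argument of
 * rnbd_clt_dev_kick_mq_queue(): instead of rerunning the hardware queue
 * after a fixed delay, park it on the per-CPU requeue list and let the
 * next rnbd_put_permit() caller kick it (see
 * rnbd_clt_dev_add_to_requeue()).
 */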

/**
 * rnbd_get_cpu_qlist() - finds a list with HW queues to be rerun
 * @sess:	Session to find a queue for
 * @cpu:	Cpu to start the search from
 *
 * Description:
 *     Each CPU has a list of HW queues, which needs to be rerun.  If a list
 *     is not empty - it is marked with a bit.  This function finds the first
 *     set bit in a bitmap and returns the corresponding CPU list.
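 *
 *     For example (illustrative): with nr_cpu_ids == 4 and @cpu == 2,
 *     the bits are examined in the order 2, 3, 0, 1.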
 */
static struct rnbd_cpu_qlist *
rnbd_get_cpu_qlist(struct rnbd_clt_session *sess, int cpu)
{
	int bit;

	/* Search from cpu to nr_cpu_ids */
	bit = find_next_bit(sess->cpu_queues_bm, nr_cpu_ids, cpu);
	if (bit < nr_cpu_ids) {
		return per_cpu_ptr(sess->cpu_queues, bit);
	} else if (cpu != 0) {
		/* Search from 0 to cpu */
		bit = find_first_bit(sess->cpu_queues_bm, cpu);
		if (bit < cpu)
			return per_cpu_ptr(sess->cpu_queues, bit);
	}

	return NULL;
}

static inline int nxt_cpu(int cpu)
{
	return (cpu + 1) % nr_cpu_ids;
}

/**
 * rnbd_rerun_if_needed() - rerun next queue marked as stopped
 * @sess:	Session to rerun a queue on
 *
 * Description:
 *     Each CPU has its own list of HW queues, which should be rerun.
 *     The function finds such a list, takes its lock, picks up the first
 *     HW queue from the list and requeues it.
 *
 * Return:
 *     True if the queue was requeued, false otherwise.
 *
 * Context:
 *     Does not matter.
 */
static bool rnbd_rerun_if_needed(struct rnbd_clt_session *sess)
{
	struct rnbd_queue *q = NULL;
	struct rnbd_cpu_qlist *cpu_q;
	unsigned long flags;
	int *cpup;

	/*
	 * To keep fairness and not to let other queues starve we always
	 * try to wake up someone else in round-robin manner. That of course
	 * increases latency but queues always have a chance to be executed.
	 */
	cpup = get_cpu_ptr(sess->cpu_rr);
	for (cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(*cpup)); cpu_q;
	     cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(cpu_q->cpu))) {
		if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags))
			continue;
		if (!test_bit(cpu_q->cpu, sess->cpu_queues_bm))
			goto unlock;
		q = list_first_entry_or_null(&cpu_q->requeue_list,
					     typeof(*q), requeue_list);
		if (WARN_ON(!q))
			goto clear_bit;
		list_del_init(&q->requeue_list);
		clear_bit_unlock(0, &q->in_list);

		if (list_empty(&cpu_q->requeue_list)) {
			/* Clear bit if nothing is left */
clear_bit:
			clear_bit(cpu_q->cpu, sess->cpu_queues_bm);
		}
unlock:
		spin_unlock_irqrestore(&cpu_q->requeue_lock, flags);

		if (q)
			break;
	}

	/*
	 * Saves the CPU that is going to be requeued on the per-cpu var. Just
	 * incrementing it doesn't work because rnbd_get_cpu_qlist() will
	 * always return the first CPU with something on the queue list when the
	 * value stored on the var is greater than the last CPU with something
	 * on the list.
	 */
	if (cpu_q)
		*cpup = cpu_q->cpu;
	put_cpu_ptr(sess->cpu_rr);

	if (q)
		rnbd_clt_dev_requeue(q);

	return q;
}

/**
 * rnbd_rerun_all_if_idle() - rerun all queues left in the list if
 *				 session is idling (there are no requests
 *				 in-flight).
 * @sess:	Session to rerun the queues on
 *
 * Description:
 *     This function tries to rerun all stopped queues if there are no
 *     requests in-flight anymore.  This function tries to solve an obvious
 *     problem, when number of tags < than number of queues (hctx), which
 *     are stopped and put to sleep.  If last permit, which has been just put,
 *     does not wake up all left queues (hctxs), IO requests hang forever.
 *
 *     That can happen when all number of permits, say N, have been exhausted
 *     from one CPU, and we have many block devices per session, say M.
 *     Each block device has its own queue (hctx) for each CPU, so eventually
 *     we can put that number of queues (hctxs) to sleep: M x nr_cpu_ids.
 *     If number of permits N < M x nr_cpu_ids finally we will get an IO hang.
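 *
 *     For example (numbers purely illustrative): with N == 128 permits,
 *     M == 4 devices and nr_cpu_ids == 64, up to 256 hctxs can be asleep
 *     while only 128 permits exist, so permits returned one by one cannot
 *     be relied upon to wake every queue.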
 *
 *     To avoid this hang last caller of rnbd_put_permit() (last caller is the
 *     one who observes sess->busy == 0) must wake up all remaining queues.
 *
 * Context:
 *     Does not matter.
 */
static void rnbd_rerun_all_if_idle(struct rnbd_clt_session *sess)
{
	bool requeued;

	do {
		requeued = rnbd_rerun_if_needed(sess);
	} while (atomic_read(&sess->busy) == 0 && requeued);
}

static struct rtrs_permit *rnbd_get_permit(struct rnbd_clt_session *sess,
					   enum rtrs_clt_con_type con_type,
					   enum wait_type wait)
{
	struct rtrs_permit *permit;

	permit = rtrs_clt_get_permit(sess->rtrs, con_type, wait);
	if (permit)
		/* We have a subtle rare case here, when all permits can be
		 * consumed before busy counter increased.  This is safe,
		 * because loser will get NULL as a permit, observe 0 busy
		 * counter and immediately restart the queue himself.
		 */
		atomic_inc(&sess->busy);

	return permit;
}

static void rnbd_put_permit(struct rnbd_clt_session *sess,
			    struct rtrs_permit *permit)
{
	rtrs_clt_put_permit(sess->rtrs, permit);
	atomic_dec(&sess->busy);
	/* Paired with rnbd_clt_dev_add_to_requeue().  Decrement first
	 * and then check queue bits.
	 */
	smp_mb__after_atomic();
	rnbd_rerun_all_if_idle(sess);
}

static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess,
				   enum rtrs_clt_con_type con_type,
				   enum wait_type wait)
{
	struct rnbd_iu *iu;
	struct rtrs_permit *permit;

	iu = kzalloc(sizeof(*iu), GFP_KERNEL);
	if (!iu)
		return NULL;

	permit = rnbd_get_permit(sess, con_type, wait);
	if (!permit) {
		kfree(iu);
		return NULL;
	}

	iu->permit = permit;
	/*
	 * 1st reference is dropped after finishing sending a "user" message,
	 * 2nd reference is dropped after confirmation with the response is
	 * returned.
	 * 1st and 2nd can happen in any order, so the rnbd_iu should be
	 * released (rtrs_permit returned to rtrs) only after both
	 * are finished.
	 */
	atomic_set(&iu->refcount, 2);
	init_waitqueue_head(&iu->comp.wait);
	iu->comp.errno = INT_MAX;

	if (sg_alloc_table(&iu->sgt, 1, GFP_KERNEL)) {
		rnbd_put_permit(sess, permit);
		kfree(iu);
		return NULL;
	}

	return iu;
}

static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu)
{
	if (atomic_dec_and_test(&iu->refcount)) {
		sg_free_table(&iu->sgt);
		rnbd_put_permit(sess, iu->permit);
		kfree(iu);
	}
}

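/*
 * Completion half of the IO path: invoked via the ->complete hook of
 * rnbd_mq_ops after msg_io_conf() has called blk_mq_complete_request().
 * Frees the chained sg table and the RTRS permit, then ends the request
 * with the status stored in iu->errno.
 */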
static void rnbd_softirq_done_fn(struct request *rq)
{
	struct rnbd_clt_dev *dev = rq->q->disk->private_data;
	struct rnbd_clt_session *sess = dev->sess;
	struct rnbd_iu *iu;

	iu = blk_mq_rq_to_pdu(rq);
	sg_free_table_chained(&iu->sgt, RNBD_INLINE_SG_CNT);
	rnbd_put_permit(sess, iu->permit);
	blk_mq_end_request(rq, errno_to_blk_status(iu->errno));
}

static void msg_io_conf(void *priv, int errno)
{
	struct rnbd_iu *iu = priv;
	struct rnbd_clt_dev *dev = iu->dev;
	struct request *rq = iu->rq;
	int rw = rq_data_dir(rq);

	iu->errno = errno;

	blk_mq_complete_request(rq);

	if (errno)
		rnbd_clt_info_rl(dev, "%s I/O failed with err: %d\n",
				 rw == READ ? "read" : "write", errno);
}

static void wake_up_iu_comp(struct rnbd_iu *iu, int errno)
{
	iu->comp.errno = errno;
	wake_up(&iu->comp.wait);
}

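/*
 * Confirmation callback for "user" (admin) messages, typically invoked
 * from an RTRS completion path where we must not sleep.  The actual
 * processing is therefore deferred to a workqueue: the per-message
 * handlers (msg_open_conf() and friends) drop references that may sleep.
 */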
static void msg_conf(void *priv, int errno)
{
	struct rnbd_iu *iu = priv;

	iu->errno = errno;
	schedule_work(&iu->work);
}

static int send_usr_msg(struct rtrs_clt_sess *rtrs, int dir,
			struct rnbd_iu *iu, struct kvec *vec,
			size_t len, struct scatterlist *sg, unsigned int sg_len,
			void (*conf)(struct work_struct *work),
			int *errno, int wait)
{
	int err;
	struct rtrs_clt_req_ops req_ops;

	INIT_WORK(&iu->work, conf);
	req_ops = (struct rtrs_clt_req_ops) {
		.priv = iu,
		.conf_fn = msg_conf,
	};
	err = rtrs_clt_request(dir, &req_ops, rtrs, iu->permit,
			       vec, 1, len, sg, sg_len);
	if (!err && wait) {
		wait_event(iu->comp.wait, iu->comp.errno != INT_MAX);
		*errno = iu->comp.errno;
	} else {
		*errno = 0;
	}

	return err;
}

static void msg_close_conf(struct work_struct *work)
{
	struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work);
	struct rnbd_clt_dev *dev = iu->dev;

	wake_up_iu_comp(iu, iu->errno);
	rnbd_put_iu(dev->sess, iu);
	rnbd_clt_put_dev(dev);
}

static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id,
			  enum wait_type wait)
{
	struct rnbd_clt_session *sess = dev->sess;
	struct rnbd_msg_close msg;
	struct rnbd_iu *iu;
	struct kvec vec = {
		.iov_base = &msg,
		.iov_len  = sizeof(msg)
	};
	int err, errno;

	iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
	if (!iu)
		return -ENOMEM;

	iu->buf = NULL;
	iu->dev = dev;

	msg.hdr.type	= cpu_to_le16(RNBD_MSG_CLOSE);
	msg.device_id	= cpu_to_le32(device_id);

	WARN_ON(!rnbd_clt_get_dev(dev));
	err = send_usr_msg(sess->rtrs, WRITE, iu, &vec, 0, NULL, 0,
			   msg_close_conf, &errno, wait);
	if (err) {
		rnbd_clt_put_dev(dev);
		rnbd_put_iu(sess, iu);
	} else {
		err = errno;
	}

	rnbd_put_iu(sess, iu);
	return err;
}

static void msg_open_conf(struct work_struct *work)
{
	struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work);
	struct rnbd_msg_open_rsp *rsp = iu->buf;
	struct rnbd_clt_dev *dev = iu->dev;
	int errno = iu->errno;
	bool from_map = false;

	/* INIT state is only triggered from rnbd_clt_map_device */
	if (dev->dev_state == DEV_STATE_INIT)
		from_map = true;

	if (errno) {
		rnbd_clt_err(dev,
			     "Opening failed, server responded: %d\n",
			     errno);
	} else {
		errno = process_msg_open_rsp(dev, rsp);
		if (errno) {
			u32 device_id = le32_to_cpu(rsp->device_id);
			/*
			 * If server thinks it's fine, but we fail to process
			 * then be nice and send a close to server.
			 */
			send_msg_close(dev, device_id, RTRS_PERMIT_NOWAIT);
		}
	}
	/* We free rsp in rnbd_clt_map_device for map scenario */
	if (!from_map)
		kfree(rsp);
	wake_up_iu_comp(iu, errno);
	rnbd_put_iu(dev->sess, iu);
	rnbd_clt_put_dev(dev);
}

static void msg_sess_info_conf(struct work_struct *work)
{
	struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work);
	struct rnbd_msg_sess_info_rsp *rsp = iu->buf;
	struct rnbd_clt_session *sess = iu->sess;

	if (!iu->errno)
		sess->ver = min_t(u8, rsp->ver, RNBD_PROTO_VER_MAJOR);

	kfree(rsp);
	wake_up_iu_comp(iu, iu->errno);
	rnbd_put_iu(sess, iu);
	rnbd_clt_put_sess(sess);
}

static int send_msg_open(struct rnbd_clt_dev *dev, enum wait_type wait)
{
	struct rnbd_clt_session *sess = dev->sess;
	struct rnbd_msg_open_rsp *rsp;
	struct rnbd_msg_open msg;
	struct rnbd_iu *iu;
	struct kvec vec = {
		.iov_base = &msg,
		.iov_len  = sizeof(msg)
	};
	int err, errno;

	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
	if (!rsp)
		return -ENOMEM;

	iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
	if (!iu) {
		kfree(rsp);
		return -ENOMEM;
	}

	iu->buf = rsp;
	iu->dev = dev;

	sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));

	msg.hdr.type	= cpu_to_le16(RNBD_MSG_OPEN);
	msg.access_mode	= dev->access_mode;
	strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name));

	WARN_ON(!rnbd_clt_get_dev(dev));
	err = send_usr_msg(sess->rtrs, READ, iu,
			   &vec, sizeof(*rsp), iu->sgt.sgl, 1,
			   msg_open_conf, &errno, wait);
	if (err) {
		rnbd_clt_put_dev(dev);
		rnbd_put_iu(sess, iu);
		kfree(rsp);
	} else {
		err = errno;
	}

	rnbd_put_iu(sess, iu);
	return err;
}

static int send_msg_sess_info(struct rnbd_clt_session *sess, enum wait_type wait)
{
	struct rnbd_msg_sess_info_rsp *rsp;
	struct rnbd_msg_sess_info msg;
	struct rnbd_iu *iu;
	struct kvec vec = {
		.iov_base = &msg,
		.iov_len  = sizeof(msg)
	};
	int err, errno;

	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
	if (!rsp)
		return -ENOMEM;

	iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
	if (!iu) {
		kfree(rsp);
		return -ENOMEM;
	}

	iu->buf = rsp;
	iu->sess = sess;
	sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));

	msg.hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO);
	msg.ver      = RNBD_PROTO_VER_MAJOR;

	if (!rnbd_clt_get_sess(sess)) {
		/*
		 * That can happen only in one case, when RTRS has re-established
		 * the connection and link_ev() is called, but session is almost
		 * dead, last reference on session is put and caller is waiting
		 * for RTRS to close everything.
		 */
		err = -ENODEV;
		goto put_iu;
	}
	err = send_usr_msg(sess->rtrs, READ, iu,
			   &vec, sizeof(*rsp), iu->sgt.sgl, 1,
			   msg_sess_info_conf, &errno, wait);
	if (err) {
		rnbd_clt_put_sess(sess);
put_iu:
		rnbd_put_iu(sess, iu);
		kfree(rsp);
	} else {
		err = errno;
	}
	rnbd_put_iu(sess, iu);
	return err;
}

static void set_dev_states_to_disconnected(struct rnbd_clt_session *sess)
{
	struct rnbd_clt_dev *dev;
	struct kobject *gd_kobj;

	mutex_lock(&sess->lock);
	list_for_each_entry(dev, &sess->devs_list, list) {
		rnbd_clt_err(dev, "Device disconnected.\n");

		mutex_lock(&dev->lock);
		if (dev->dev_state == DEV_STATE_MAPPED) {
			dev->dev_state = DEV_STATE_MAPPED_DISCONNECTED;
			gd_kobj = &disk_to_dev(dev->gd)->kobj;
			kobject_uevent(gd_kobj, KOBJ_OFFLINE);
		}
		mutex_unlock(&dev->lock);
	}
	mutex_unlock(&sess->lock);
}

static void remap_devs(struct rnbd_clt_session *sess)
{
	struct rnbd_clt_dev *dev;
	struct rtrs_attrs attrs;
	int err;

	/*
	 * Careful here: we are called from RTRS link event directly,
	 * thus we can't send any RTRS request and wait for response
	 * or RTRS will not be able to complete request with failure
	 * if something goes wrong (failing of outstanding requests
	 * happens exactly from the context where we are blocking now).
	 *
	 * So to avoid deadlocks each usr message sent from here must
	 * be asynchronous.
	 */

	err = send_msg_sess_info(sess, RTRS_PERMIT_NOWAIT);
	if (err) {
		pr_err("send_msg_sess_info(\"%s\"): %d\n", sess->sessname, err);
		return;
	}

	err = rtrs_clt_query(sess->rtrs, &attrs);
	if (err) {
		pr_err("rtrs_clt_query(\"%s\"): %d\n", sess->sessname, err);
		return;
	}
	mutex_lock(&sess->lock);
	sess->max_io_size = attrs.max_io_size;

	list_for_each_entry(dev, &sess->devs_list, list) {
		bool skip;

		mutex_lock(&dev->lock);
		skip = (dev->dev_state == DEV_STATE_INIT);
		mutex_unlock(&dev->lock);
		if (skip)
			/*
			 * When device is establishing connection for the first
			 * time - do not remap, it will be closed soon.
			 */
			continue;

		rnbd_clt_info(dev, "session reconnected, remapping device\n");
		err = send_msg_open(dev, RTRS_PERMIT_NOWAIT);
		if (err) {
			rnbd_clt_err(dev, "send_msg_open(): %d\n", err);
			break;
		}
	}
	mutex_unlock(&sess->lock);
}

static void rnbd_clt_link_ev(void *priv, enum rtrs_clt_link_ev ev)
{
	struct rnbd_clt_session *sess = priv;

	switch (ev) {
	case RTRS_CLT_LINK_EV_DISCONNECTED:
		set_dev_states_to_disconnected(sess);
		break;
	case RTRS_CLT_LINK_EV_RECONNECTED:
		remap_devs(sess);
		break;
	default:
		pr_err("Unknown session event received (%d), session: %s\n",
		       ev, sess->sessname);
	}
}

static void rnbd_init_cpu_qlists(struct rnbd_cpu_qlist __percpu *cpu_queues)
{
	unsigned int cpu;
	struct rnbd_cpu_qlist *cpu_q;

	for_each_possible_cpu(cpu) {
		cpu_q = per_cpu_ptr(cpu_queues, cpu);

		cpu_q->cpu = cpu;
		INIT_LIST_HEAD(&cpu_q->requeue_list);
		spin_lock_init(&cpu_q->requeue_lock);
	}
}

static void destroy_mq_tags(struct rnbd_clt_session *sess)
{
	if (sess->tag_set.tags)
		blk_mq_free_tag_set(&sess->tag_set);
}

static inline void wake_up_rtrs_waiters(struct rnbd_clt_session *sess)
{
	sess->rtrs_ready = true;
	wake_up_all(&sess->rtrs_waitq);
}

static void close_rtrs(struct rnbd_clt_session *sess)
{
	might_sleep();

	if (!IS_ERR_OR_NULL(sess->rtrs)) {
		rtrs_clt_close(sess->rtrs);
		sess->rtrs = NULL;
		wake_up_rtrs_waiters(sess);
	}
}

static void free_sess(struct rnbd_clt_session *sess)
{
	WARN_ON(!list_empty(&sess->devs_list));

	might_sleep();

	close_rtrs(sess);
	destroy_mq_tags(sess);
	if (!list_empty(&sess->list)) {
		mutex_lock(&sess_lock);
		list_del(&sess->list);
		mutex_unlock(&sess_lock);
	}
	free_percpu(sess->cpu_queues);
	free_percpu(sess->cpu_rr);
	mutex_destroy(&sess->lock);
	kfree(sess);
}

static struct rnbd_clt_session *alloc_sess(const char *sessname)
{
	struct rnbd_clt_session *sess;
	int err, cpu;

	sess = kzalloc_node(sizeof(*sess), GFP_KERNEL, NUMA_NO_NODE);
	if (!sess)
		return ERR_PTR(-ENOMEM);
	strscpy(sess->sessname, sessname, sizeof(sess->sessname));
	atomic_set(&sess->busy, 0);
	mutex_init(&sess->lock);
	INIT_LIST_HEAD(&sess->devs_list);
	INIT_LIST_HEAD(&sess->list);
	bitmap_zero(sess->cpu_queues_bm, num_possible_cpus());
	init_waitqueue_head(&sess->rtrs_waitq);
	refcount_set(&sess->refcount, 1);

	sess->cpu_queues = alloc_percpu(struct rnbd_cpu_qlist);
	if (!sess->cpu_queues) {
		err = -ENOMEM;
		goto err;
	}
	rnbd_init_cpu_qlists(sess->cpu_queues);

	/*
	 * That is simple percpu variable which stores cpu indices, which are
	 * incremented on each access.  We need that for the sake of fairness
	 * to wake up queues in a round-robin manner.
	 */
	sess->cpu_rr = alloc_percpu(int);
	if (!sess->cpu_rr) {
		err = -ENOMEM;
		goto err;
	}
	for_each_possible_cpu(cpu)
		*per_cpu_ptr(sess->cpu_rr, cpu) = cpu;

	return sess;

err:
	free_sess(sess);

	return ERR_PTR(err);
}

static int wait_for_rtrs_connection(struct rnbd_clt_session *sess)
{
	wait_event(sess->rtrs_waitq, sess->rtrs_ready);
	if (IS_ERR_OR_NULL(sess->rtrs))
		return -ECONNRESET;

	return 0;
}

static void wait_for_rtrs_disconnection(struct rnbd_clt_session *sess)
	__releases(&sess_lock)
	__acquires(&sess_lock)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&sess->rtrs_waitq, &wait, TASK_UNINTERRUPTIBLE);
	if (IS_ERR_OR_NULL(sess->rtrs)) {
		finish_wait(&sess->rtrs_waitq, &wait);
		return;
	}
	mutex_unlock(&sess_lock);
	/* loop in caller, see __find_and_get_sess().
	 * You can't leave mutex locked and call schedule(), you will catch a
	 * deadlock with a caller of free_sess(), which has just put the last
	 * reference and is about to take the sess_lock in order to delete
	 * the session from the list.
	 */
	schedule();
	mutex_lock(&sess_lock);
}

static struct rnbd_clt_session *__find_and_get_sess(const char *sessname)
	__releases(&sess_lock)
	__acquires(&sess_lock)
{
	struct rnbd_clt_session *sess, *sn;
	int err;

again:
	list_for_each_entry_safe(sess, sn, &sess_list, list) {
		if (strcmp(sessname, sess->sessname))
			continue;

		if (sess->rtrs_ready && IS_ERR_OR_NULL(sess->rtrs))
			/*
			 * No RTRS connection, session is dying.
			 */
			continue;

		if (rnbd_clt_get_sess(sess)) {
			/*
			 * Alive session is found, wait for RTRS connection.
			 */
			mutex_unlock(&sess_lock);
			err = wait_for_rtrs_connection(sess);
			if (err)
				rnbd_clt_put_sess(sess);
			mutex_lock(&sess_lock);

			if (err)
				/* Session is dying, repeat the loop */
				goto again;

			return sess;
		}
		/*
		 * Ref is 0, session is dying, wait for RTRS disconnect
		 * in order to avoid session names clashes.
		 */
		wait_for_rtrs_disconnection(sess);
		/*
		 * RTRS is disconnected and soon session will be freed,
		 * so repeat a loop.
		 */
		goto again;
	}

	return NULL;
}

/* caller is responsible for initializing 'first' to false */
static struct
rnbd_clt_session *find_or_create_sess(const char *sessname, bool *first)
{
	struct rnbd_clt_session *sess = NULL;

	mutex_lock(&sess_lock);
	sess = __find_and_get_sess(sessname);
	if (!sess) {
		sess = alloc_sess(sessname);
		if (IS_ERR(sess)) {
			mutex_unlock(&sess_lock);
			return sess;
		}
		list_add(&sess->list, &sess_list);
		*first = true;
	}
	mutex_unlock(&sess_lock);

	return sess;
}

static int rnbd_client_open(struct gendisk *disk, blk_mode_t mode)
{
	struct rnbd_clt_dev *dev = disk->private_data;

	if (get_disk_ro(dev->gd) && (mode & BLK_OPEN_WRITE))
		return -EPERM;

	if (dev->dev_state == DEV_STATE_UNMAPPED ||
	    !rnbd_clt_get_dev(dev))
		return -EIO;

	return 0;
}

static void rnbd_client_release(struct gendisk *gen)
{
	struct rnbd_clt_dev *dev = gen->private_data;

	rnbd_clt_put_dev(dev);
}

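/*
 * Report a synthetic CHS geometry for tools that still ask for one:
 * with 4 heads and 16 sectors per track there are 64 sectors per
 * cylinder, hence the size >> 6 below.
 */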
static int rnbd_client_getgeo(struct gendisk *disk,
			      struct hd_geometry *geo)
{
	u64 size;
	struct rnbd_clt_dev *dev = disk->private_data;
	struct queue_limits *limit = &dev->queue->limits;

	size = dev->size * (limit->logical_block_size / SECTOR_SIZE);
	geo->cylinders = size >> 6;	/* size/64 */
	geo->heads = 4;
	geo->sectors = 16;
	geo->start = 0;

	return 0;
}

static const struct block_device_operations rnbd_client_ops = {
	.owner		= THIS_MODULE,
	.open		= rnbd_client_open,
	.release	= rnbd_client_release,
	.getgeo		= rnbd_client_getgeo
};

/* The amount of data that belongs to an I/O and the amount of data that
 * should be read or written to the disk (bi_size) can differ.
 *
 * E.g. When WRITE_SAME is used, only a small amount of data is
 * transferred that is then written repeatedly over a lot of sectors.
 *
 * Get the size of data to be transferred via RTRS by summing up the size
 * of the scatter-gather list entries.
 */
static size_t rnbd_clt_get_sg_size(struct scatterlist *sglist, u32 len)
{
	struct scatterlist *sg;
	size_t tsize = 0;
	int i;

	for_each_sg(sglist, sg, len, i)
		tsize += sg->length;
	return tsize;
}

static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev,
				    struct request *rq,
				    struct rnbd_iu *iu)
{
	struct rtrs_clt_sess *rtrs = dev->sess->rtrs;
	struct rtrs_permit *permit = iu->permit;
	struct rnbd_msg_io msg;
	struct rtrs_clt_req_ops req_ops;
	unsigned int sg_cnt = 0;
	struct kvec vec;
	size_t size;
	int err;

	iu->rq		= rq;
	iu->dev		= dev;
	msg.sector	= cpu_to_le64(blk_rq_pos(rq));
	msg.bi_size	= cpu_to_le32(blk_rq_bytes(rq));
	msg.rw		= cpu_to_le32(rq_to_rnbd_flags(rq));
	msg.prio	= cpu_to_le16(req_get_ioprio(rq));

	/*
	 * We only support discards/WRITE_ZEROES with single segment for now.
	 * See queue limits.
	 */
	if ((req_op(rq) != REQ_OP_DISCARD) && (req_op(rq) != REQ_OP_WRITE_ZEROES))
		sg_cnt = blk_rq_map_sg(rq, iu->sgt.sgl);

	if (sg_cnt == 0)
		sg_mark_end(&iu->sgt.sgl[0]);

	msg.hdr.type	= cpu_to_le16(RNBD_MSG_IO);
	msg.device_id	= cpu_to_le32(dev->device_id);

	vec = (struct kvec) {
		.iov_base = &msg,
		.iov_len  = sizeof(msg)
	};
	size = rnbd_clt_get_sg_size(iu->sgt.sgl, sg_cnt);
	req_ops = (struct rtrs_clt_req_ops) {
		.priv = iu,
		.conf_fn = msg_io_conf,
	};
	err = rtrs_clt_request(rq_data_dir(rq), &req_ops, rtrs, permit,
			       &vec, 1, size, iu->sgt.sgl, sg_cnt);
	if (err) {
		rnbd_clt_err_rl(dev, "RTRS failed to transfer IO, err: %d\n",
				err);
		return err;
	}

	return 0;
}

/**
 * rnbd_clt_dev_add_to_requeue() - add device to requeue if session is busy
 * @dev:	Device to be checked
 * @q:		Queue to be added to the requeue list if required
 *
 * Description:
 *     If session is busy, that means someone will requeue us when resources
 *     are freed.  If session is not doing anything - device is not added to
 *     the list and @false is returned.
 */
static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev,
					struct rnbd_queue *q)
{
	struct rnbd_clt_session *sess = dev->sess;
	struct rnbd_cpu_qlist *cpu_q;
	unsigned long flags;
	bool added = true;
	bool need_set;

	cpu_q = get_cpu_ptr(sess->cpu_queues);
	spin_lock_irqsave(&cpu_q->requeue_lock, flags);

	if (!test_and_set_bit_lock(0, &q->in_list)) {
		if (WARN_ON(!list_empty(&q->requeue_list)))
			goto unlock;

		need_set = !test_bit(cpu_q->cpu, sess->cpu_queues_bm);
		if (need_set) {
			set_bit(cpu_q->cpu, sess->cpu_queues_bm);
			/* Paired with rnbd_put_permit().  Set a bit first
			 * and then observe the busy counter.
			 */
			smp_mb__before_atomic();
		}
		if (atomic_read(&sess->busy)) {
			list_add_tail(&q->requeue_list, &cpu_q->requeue_list);
		} else {
			/* Very unlikely, but possible: busy counter was
			 * observed as zero.  Drop all bits and return
			 * false to restart the queue by ourselves.
			 */
			if (need_set)
				clear_bit(cpu_q->cpu, sess->cpu_queues_bm);
			clear_bit_unlock(0, &q->in_list);
			added = false;
		}
	}
unlock:
	spin_unlock_irqrestore(&cpu_q->requeue_lock, flags);
	put_cpu_ptr(sess->cpu_queues);

	return added;
}

static void rnbd_clt_dev_kick_mq_queue(struct rnbd_clt_dev *dev,
				       struct blk_mq_hw_ctx *hctx,
				       int delay)
{
	struct rnbd_queue *q = hctx->driver_data;

	if (delay != RNBD_DELAY_IFBUSY)
		blk_mq_delay_run_hw_queue(hctx, delay);
	else if (!rnbd_clt_dev_add_to_requeue(dev, q))
		/*
		 * If session is not busy we have to restart
		 * the queue ourselves.
		 */
		blk_mq_delay_run_hw_queue(hctx, 10/*ms*/);
}

static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
				  const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct rnbd_clt_dev *dev = rq->q->disk->private_data;
	struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq);
	int err;
	blk_status_t ret = BLK_STS_IOERR;

	if (dev->dev_state != DEV_STATE_MAPPED)
		return BLK_STS_IOERR;

	iu->permit = rnbd_get_permit(dev->sess, RTRS_IO_CON,
				     RTRS_PERMIT_NOWAIT);
	if (!iu->permit) {
		rnbd_clt_dev_kick_mq_queue(dev, hctx, RNBD_DELAY_IFBUSY);
		return BLK_STS_RESOURCE;
	}

	iu->sgt.sgl = iu->first_sgl;
	err = sg_alloc_table_chained(&iu->sgt,
				     /* Even-if the request has no segment,
				      * sglist must have one entry at least.
				      */
				     blk_rq_nr_phys_segments(rq) ?: 1,
				     iu->sgt.sgl,
				     RNBD_INLINE_SG_CNT);
	if (err) {
		rnbd_clt_err_rl(dev, "sg_alloc_table_chained ret=%d\n", err);
		rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/);
		rnbd_put_permit(dev->sess, iu->permit);
		return BLK_STS_RESOURCE;
	}

	blk_mq_start_request(rq);
	err = rnbd_client_xfer_request(dev, rq, iu);
	if (err == 0)
		return BLK_STS_OK;
	if (err == -EAGAIN || err == -ENOMEM) {
		rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/);
		ret = BLK_STS_RESOURCE;
	}
	sg_free_table_chained(&iu->sgt, RNBD_INLINE_SG_CNT);
	rnbd_put_permit(dev->sess, iu->permit);
	return ret;
}

static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct rnbd_queue *q = hctx->driver_data;
	struct rnbd_clt_dev *dev = q->dev;

	return rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num);
}

static void rnbd_rdma_map_queues(struct blk_mq_tag_set *set)
{
	struct rnbd_clt_session *sess = set->driver_data;

	/* shared read/write queues */
	set->map[HCTX_TYPE_DEFAULT].nr_queues = num_online_cpus();
	set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
	set->map[HCTX_TYPE_READ].nr_queues = num_online_cpus();
	set->map[HCTX_TYPE_READ].queue_offset = 0;
	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);

	if (sess->nr_poll_queues) {
		/* dedicated queue for poll */
		set->map[HCTX_TYPE_POLL].nr_queues = sess->nr_poll_queues;
		set->map[HCTX_TYPE_POLL].queue_offset = set->map[HCTX_TYPE_READ].queue_offset +
			set->map[HCTX_TYPE_READ].nr_queues;
		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
		pr_info("[session=%s] mapped %d/%d/%d default/read/poll queues.\n",
			sess->sessname,
			set->map[HCTX_TYPE_DEFAULT].nr_queues,
			set->map[HCTX_TYPE_READ].nr_queues,
			set->map[HCTX_TYPE_POLL].nr_queues);
	} else {
		pr_info("[session=%s] mapped %d/%d default/read queues.\n",
			sess->sessname,
			set->map[HCTX_TYPE_DEFAULT].nr_queues,
			set->map[HCTX_TYPE_READ].nr_queues);
	}
}

static struct blk_mq_ops rnbd_mq_ops = {
	.queue_rq	= rnbd_queue_rq,
	.complete	= rnbd_softirq_done_fn,
	.map_queues	= rnbd_rdma_map_queues,
	.poll		= rnbd_rdma_poll,
};

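/*
 * One tag set per session, shared by every device mapped over it (note
 * BLK_MQ_F_TAG_QUEUE_SHARED below): each gendisk is allocated from
 * sess->tag_set in rnbd_client_setup_device().
 */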
static int setup_mq_tags(struct rnbd_clt_session *sess)
{
	struct blk_mq_tag_set *tag_set = &sess->tag_set;

	memset(tag_set, 0, sizeof(*tag_set));
	tag_set->ops		= &rnbd_mq_ops;
	tag_set->queue_depth	= sess->queue_depth;
	tag_set->numa_node	= NUMA_NO_NODE;
	tag_set->flags		= BLK_MQ_F_TAG_QUEUE_SHARED;
	tag_set->cmd_size	= sizeof(struct rnbd_iu) + RNBD_RDMA_SGL_SIZE;

	/* for HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL */
	tag_set->nr_maps	= sess->nr_poll_queues ? HCTX_MAX_TYPES : 2;
	/*
	 * HCTX_TYPE_DEFAULT and HCTX_TYPE_READ share one set of queues,
	 * others are for HCTX_TYPE_POLL
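	 * (e.g., illustratively: 8 online CPUs and 2 poll queues give
	 * nr_hw_queues == 10).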
	 */
	tag_set->nr_hw_queues	= num_online_cpus() + sess->nr_poll_queues;
	tag_set->driver_data	= sess;

	return blk_mq_alloc_tag_set(tag_set);
}

static struct rnbd_clt_session *
find_and_get_or_create_sess(const char *sessname,
			    const struct rtrs_addr *paths,
			    size_t path_cnt, u16 port_nr, u32 nr_poll_queues)
{
	struct rnbd_clt_session *sess;
	struct rtrs_attrs attrs;
	int err;
	bool first = false;
	struct rtrs_clt_ops rtrs_ops;

	sess = find_or_create_sess(sessname, &first);
	if (sess == ERR_PTR(-ENOMEM)) {
		return ERR_PTR(-ENOMEM);
	} else if ((nr_poll_queues && !first) || (!nr_poll_queues && sess->nr_poll_queues)) {
		/*
		 * A device MUST have its own session to use polling mode,
		 * so mapping a new device with a mismatching polling setting
		 * onto an existing session must fail.
		 */
		err = -EINVAL;
		goto put_sess;
	}

	if (!first)
		return sess;

	if (!path_cnt) {
		pr_err("Session %s not found, and path parameter not given", sessname);
		err = -ENXIO;
		goto put_sess;
	}

	rtrs_ops = (struct rtrs_clt_ops) {
		.priv = sess,
		.link_ev = rnbd_clt_link_ev,
	};
	/*
	 * Nothing was found, establish rtrs connection and proceed further.
	 */
	sess->rtrs = rtrs_clt_open(&rtrs_ops, sessname,
				   paths, path_cnt, port_nr,
				   0, /* Do not use pdu of rtrs */
				   RECONNECT_DELAY,
				   MAX_RECONNECTS, nr_poll_queues);
	if (IS_ERR(sess->rtrs)) {
		err = PTR_ERR(sess->rtrs);
		goto wake_up_and_put;
	}

	err = rtrs_clt_query(sess->rtrs, &attrs);
	if (err)
		goto close_rtrs;

	sess->max_io_size = attrs.max_io_size;
	sess->queue_depth = attrs.queue_depth;
	sess->nr_poll_queues = nr_poll_queues;
	sess->max_segments = attrs.max_segments;

	err = setup_mq_tags(sess);
	if (err)
		goto close_rtrs;

	err = send_msg_sess_info(sess, RTRS_PERMIT_WAIT);
	if (err)
		goto close_rtrs;

	wake_up_rtrs_waiters(sess);

	return sess;

close_rtrs:
	close_rtrs(sess);
put_sess:
	rnbd_clt_put_sess(sess);

	return ERR_PTR(err);

wake_up_and_put:
	wake_up_rtrs_waiters(sess);
	goto put_sess;
}

static inline void rnbd_init_hw_queue(struct rnbd_clt_dev *dev,
				      struct rnbd_queue *q,
				      struct blk_mq_hw_ctx *hctx)
{
	INIT_LIST_HEAD(&q->requeue_list);
	q->dev  = dev;
	q->hctx = hctx;
}

static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
{
	unsigned long i;
	struct blk_mq_hw_ctx *hctx;
	struct rnbd_queue *q;

	queue_for_each_hw_ctx(dev->queue, hctx, i) {
		q = &dev->hw_queues[i];
		rnbd_init_hw_queue(dev, q, hctx);
		hctx->driver_data = q;
	}
}

static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
				   struct rnbd_msg_open_rsp *rsp, int idx)
{
	int err;

	dev->gd->major		= rnbd_client_major;
	dev->gd->first_minor	= idx << RNBD_PART_BITS;
	dev->gd->minors		= 1 << RNBD_PART_BITS;
	dev->gd->fops		= &rnbd_client_ops;
	dev->gd->queue		= dev->queue;
	dev->gd->private_data	= dev;
	snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d",
		 idx);
	pr_debug("disk_name=%s, capacity=%llu\n",
		 dev->gd->disk_name,
		 le64_to_cpu(rsp->nsectors) *
		 (le16_to_cpu(rsp->logical_block_size) / SECTOR_SIZE));

	set_capacity(dev->gd, le64_to_cpu(rsp->nsectors));

	if (dev->access_mode == RNBD_ACCESS_RO)
		set_disk_ro(dev->gd, true);

	err = add_disk(dev->gd);
	if (err)
		put_disk(dev->gd);

	return err;
}

static int rnbd_client_setup_device(struct rnbd_clt_dev *dev,
				    struct rnbd_msg_open_rsp *rsp)
{
	struct queue_limits lim = {
		.logical_block_size	= le16_to_cpu(rsp->logical_block_size),
		.physical_block_size	= le16_to_cpu(rsp->physical_block_size),
		.io_opt			= dev->sess->max_io_size,
		.max_hw_sectors		= dev->sess->max_io_size / SECTOR_SIZE,
		.max_hw_discard_sectors	= le32_to_cpu(rsp->max_discard_sectors),
		.discard_granularity	= le32_to_cpu(rsp->discard_granularity),
		.discard_alignment	= le32_to_cpu(rsp->discard_alignment),
		.max_segments		= dev->sess->max_segments,
		.virt_boundary_mask	= SZ_4K - 1,
		.max_write_zeroes_sectors =
			le32_to_cpu(rsp->max_write_zeroes_sectors),
	};
	int idx = dev->clt_device_id;

	dev->size = le64_to_cpu(rsp->nsectors) *
			le16_to_cpu(rsp->logical_block_size);

	if (rsp->secure_discard) {
		lim.max_secure_erase_sectors =
			le32_to_cpu(rsp->max_discard_sectors);
	}

	if (rsp->cache_policy & RNBD_WRITEBACK) {
		lim.features |= BLK_FEAT_WRITE_CACHE;
		if (rsp->cache_policy & RNBD_FUA)
			lim.features |= BLK_FEAT_FUA;
	}

	dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, &lim, dev);
	if (IS_ERR(dev->gd))
		return PTR_ERR(dev->gd);
	dev->queue = dev->gd->queue;
	rnbd_init_mq_hw_queues(dev);

	return rnbd_clt_setup_gen_disk(dev, rsp, idx);
}

static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
				     enum rnbd_access_mode access_mode,
				     const char *pathname,
				     u32 nr_poll_queues)
{
	struct rnbd_clt_dev *dev;
	int ret;

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, NUMA_NO_NODE);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * nr_cpu_ids: the number of softirq queues
	 * nr_poll_queues: the number of polling queues
	 */
	dev->hw_queues = kcalloc(nr_cpu_ids + nr_poll_queues,
				 sizeof(*dev->hw_queues),
				 GFP_KERNEL);
	if (!dev->hw_queues) {
		ret = -ENOMEM;
		goto out_alloc;
	}

	dev->clt_device_id = ida_alloc_max(&index_ida,
					   (1 << (MINORBITS - RNBD_PART_BITS)) - 1,
					   GFP_KERNEL);
	if (dev->clt_device_id < 0) {
		ret = dev->clt_device_id;
		pr_err("Failed to initialize device '%s' from session %s, allocating idr failed, err: %d\n",
		       pathname, sess->sessname, ret);
		goto out_queues;
	}

	dev->pathname = kstrdup(pathname, GFP_KERNEL);
	if (!dev->pathname) {
		ret = -ENOMEM;
		goto out_ida;
	}

	dev->sess = sess;
	dev->access_mode = access_mode;
	dev->nr_poll_queues = nr_poll_queues;
	mutex_init(&dev->lock);
	refcount_set(&dev->refcount, 1);
	dev->dev_state = DEV_STATE_INIT;

	/*
	 * Here we called from sysfs entry, thus clt-sysfs is
	 * responsible that session will not disappear.
	 */
	WARN_ON(!rnbd_clt_get_sess(sess));

	return dev;

out_ida:
	ida_free(&index_ida, dev->clt_device_id);
out_queues:
	kfree(dev->hw_queues);
out_alloc:
	kfree(dev);
	return ERR_PTR(ret);
}

static bool __exists_dev(const char *pathname, const char *sessname)
{
	struct rnbd_clt_session *sess;
	struct rnbd_clt_dev *dev;
	bool found = false;

	list_for_each_entry(sess, &sess_list, list) {
		if (sessname && strncmp(sess->sessname, sessname,
					sizeof(sess->sessname)))
			continue;
		mutex_lock(&sess->lock);
		list_for_each_entry(dev, &sess->devs_list, list) {
			if (strlen(dev->pathname) == strlen(pathname) &&
			    !strcmp(dev->pathname, pathname)) {
				found = true;
				break;
			}
		}
		mutex_unlock(&sess->lock);
		if (found)
			break;
	}

	return found;
}

static bool exists_devpath(const char *pathname, const char *sessname)
{
	bool found;

	mutex_lock(&sess_lock);
	found = __exists_dev(pathname, sessname);
	mutex_unlock(&sess_lock);

	return found;
}

static bool insert_dev_if_not_exists_devpath(struct rnbd_clt_dev *dev)
{
	bool found;
	struct rnbd_clt_session *sess = dev->sess;

	mutex_lock(&sess_lock);
	found = __exists_dev(dev->pathname, sess->sessname);
	if (!found) {
		mutex_lock(&sess->lock);
		list_add_tail(&dev->list, &sess->devs_list);
		mutex_unlock(&sess->lock);
	}
	mutex_unlock(&sess_lock);

	return found;
}

static void delete_dev(struct rnbd_clt_dev *dev)
{
	struct rnbd_clt_session *sess = dev->sess;

	mutex_lock(&sess->lock);
	list_del(&dev->list);
	mutex_unlock(&sess->lock);
}
| 1528 | |
| 1529 | struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, |
| 1530 | struct rtrs_addr *paths, |
| 1531 | size_t path_cnt, u16 port_nr, |
| 1532 | const char *pathname, |
| 1533 | enum rnbd_access_mode access_mode, |
| 1534 | u32 nr_poll_queues) |
| 1535 | { |
| 1536 | struct rnbd_clt_session *sess; |
| 1537 | struct rnbd_clt_dev *dev; |
| 1538 | int ret, errno; |
| 1539 | struct rnbd_msg_open_rsp *rsp; |
| 1540 | struct rnbd_msg_open msg; |
| 1541 | struct rnbd_iu *iu; |
| 1542 | struct kvec vec = { |
| 1543 | .iov_base = &msg, |
| 1544 | .iov_len = sizeof(msg) |
| 1545 | }; |
| 1546 | |
| 1547 | if (exists_devpath(pathname, sessname)) |
| 1548 | return ERR_PTR(error: -EEXIST); |
| 1549 | |
| 1550 | sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr, nr_poll_queues); |
| 1551 | if (IS_ERR(ptr: sess)) |
| 1552 | return ERR_CAST(ptr: sess); |
| 1553 | |
| 1554 | dev = init_dev(sess, access_mode, pathname, nr_poll_queues); |
| 1555 | if (IS_ERR(ptr: dev)) { |
| 1556 | pr_err("map_device: failed to map device '%s' from session %s, can't initialize device, err: %pe\n" , |
| 1557 | pathname, sess->sessname, dev); |
| 1558 | ret = PTR_ERR(ptr: dev); |
| 1559 | goto put_sess; |
| 1560 | } |
| 1561 | if (insert_dev_if_not_exists_devpath(dev)) { |
| 1562 | ret = -EEXIST; |
| 1563 | goto put_dev; |
| 1564 | } |
| 1565 | |
| 1566 | rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); |
| 1567 | if (!rsp) { |
| 1568 | ret = -ENOMEM; |
| 1569 | goto del_dev; |
| 1570 | } |
| 1571 | |
| 1572 | iu = rnbd_get_iu(sess, con_type: RTRS_ADMIN_CON, wait: RTRS_PERMIT_WAIT); |
| 1573 | if (!iu) { |
| 1574 | ret = -ENOMEM; |
| 1575 | kfree(objp: rsp); |
| 1576 | goto del_dev; |
| 1577 | } |
| 1578 | iu->buf = rsp; |
| 1579 | iu->dev = dev; |
| 1580 | sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp)); |
| 1581 | |
| 1582 | msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN); |
| 1583 | msg.access_mode = dev->access_mode; |
| 1584 | strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name)); |
| 1585 | |
| 1586 | WARN_ON(!rnbd_clt_get_dev(dev)); |
| 1587 | ret = send_usr_msg(rtrs: sess->rtrs, READ, iu, |
| 1588 | vec: &vec, len: sizeof(*rsp), sg: iu->sgt.sgl, sg_len: 1, |
| 1589 | conf: msg_open_conf, errno: &errno, wait: RTRS_PERMIT_WAIT); |
| 1590 | if (ret) { |
| 1591 | rnbd_clt_put_dev(dev); |
| 1592 | rnbd_put_iu(sess, iu); |
| 1593 | } else { |
| 1594 | ret = errno; |
| 1595 | } |
| 1596 | if (ret) { |
| 1597 | rnbd_clt_err(dev, |
| 1598 | "map_device: failed, can't open remote device, err: %d\n" , |
| 1599 | ret); |
| 1600 | goto put_iu; |
| 1601 | } |
	mutex_lock(&dev->lock);
	pr_debug("Opened remote device: session=%s, path='%s'\n",
		 sess->sessname, pathname);
	ret = rnbd_client_setup_device(dev, rsp);
	if (ret) {
		rnbd_clt_err(dev,
			     "map_device: Failed to configure device, err: %d\n",
			     ret);
		mutex_unlock(&dev->lock);
		goto send_close;
	}

	rnbd_clt_info(dev,
		      "map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_write_zeroes_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
		      dev->gd->disk_name, le64_to_cpu(rsp->nsectors),
		      le16_to_cpu(rsp->logical_block_size),
		      le16_to_cpu(rsp->physical_block_size),
		      le32_to_cpu(rsp->max_write_zeroes_sectors),
		      le32_to_cpu(rsp->max_discard_sectors),
		      le32_to_cpu(rsp->discard_granularity),
		      le32_to_cpu(rsp->discard_alignment),
		      le16_to_cpu(rsp->secure_discard),
		      sess->max_segments, sess->max_io_size / SECTOR_SIZE,
		      !!(rsp->cache_policy & RNBD_WRITEBACK),
		      !!(rsp->cache_policy & RNBD_FUA));

	mutex_unlock(&dev->lock);
	kfree(rsp);
	rnbd_put_iu(sess, iu);
	rnbd_clt_put_sess(sess);

	return dev;

send_close:
	send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);
put_iu:
	kfree(rsp);
	rnbd_put_iu(sess, iu);
del_dev:
	delete_dev(dev);
put_dev:
	rnbd_clt_put_dev(dev);
put_sess:
	rnbd_clt_put_sess(sess);

	return ERR_PTR(ret);
}
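
/*
 * A rough user-space sketch of how a device ends up being mapped (the
 * attribute path follows the rnbd client sysfs ABI; session name,
 * address and device path below are placeholders):
 *
 *   echo "sessname=mysess path=ip:192.168.0.2 device_path=/dev/vol0" > \
 *       /sys/devices/virtual/rnbd-client/ctl/map_device
 *
 * On success a new /dev/rnbd<N> block device appears on the client.
 */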

static void destroy_gen_disk(struct rnbd_clt_dev *dev)
{
	del_gendisk(dev->gd);
	put_disk(dev->gd);
}

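/*
 * Tear down the per-device sysfs entries. When invoked from the
 * device's own "unmap" attribute, @sysfs_self names that attribute so
 * it can remove itself without deadlocking (see the comment below).
 */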
static void destroy_sysfs(struct rnbd_clt_dev *dev,
			  const struct attribute *sysfs_self)
{
	rnbd_clt_remove_dev_symlink(dev);
	if (dev->kobj.state_initialized) {
		if (sysfs_self)
			/* To avoid a deadlock, remove the calling attribute first */
			sysfs_remove_file_self(&dev->kobj, sysfs_self);
		kobject_del(&dev->kobj);
		kobject_put(&dev->kobj);
	}
}

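/**
 * rnbd_clt_unmap_device() - unmap a device previously mapped from a server
 * @dev:	the device to unmap
 * @force:	tear the device down even while it still has users
 * @sysfs_self:	the sysfs attribute the call originates from, or NULL
 *
 * Semantics as implemented below: without @force the call fails with
 * -EBUSY while extra references (openers) exist; -EALREADY is returned
 * if the device is already unmapped.
 */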
int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force,
			  const struct attribute *sysfs_self)
{
	struct rnbd_clt_session *sess = dev->sess;
	int refcount, ret = 0;
	bool was_mapped;

	mutex_lock(&dev->lock);
	if (dev->dev_state == DEV_STATE_UNMAPPED) {
		rnbd_clt_info(dev, "Device is already being unmapped\n");
		ret = -EALREADY;
		goto err;
	}
	refcount = refcount_read(&dev->refcount);
	if (!force && refcount > 1) {
		rnbd_clt_err(dev,
			     "Closing device failed, device is in use, (%d device users)\n",
			     refcount - 1);
		ret = -EBUSY;
		goto err;
	}
	was_mapped = (dev->dev_state == DEV_STATE_MAPPED);
	dev->dev_state = DEV_STATE_UNMAPPED;
	mutex_unlock(&dev->lock);

	delete_dev(dev);
	destroy_sysfs(dev, sysfs_self);
	destroy_gen_disk(dev);
	if (was_mapped && sess->rtrs)
		send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);

	rnbd_clt_info(dev, "Device is unmapped\n");

	/* Likely the last reference is put here */
	rnbd_clt_put_dev(dev);

	/*
	 * From this point on both the device and the session may be gone.
	 */

	return 0;
err:
	mutex_unlock(&dev->lock);

	return ret;
}

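/*
 * Ask the server to re-open a device that lost its transport
 * (DEV_STATE_MAPPED_DISCONNECTED). Any other state is refused: the
 * device is gone (-ENODEV), already mapped (-EALREADY) or in a
 * transitional state (-EBUSY).
 */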
int rnbd_clt_remap_device(struct rnbd_clt_dev *dev)
{
	int err;

	mutex_lock(&dev->lock);
	if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED)
		err = 0;
	else if (dev->dev_state == DEV_STATE_UNMAPPED)
		err = -ENODEV;
	else if (dev->dev_state == DEV_STATE_MAPPED)
		err = -EALREADY;
	else
		err = -EBUSY;
	mutex_unlock(&dev->lock);
	if (!err) {
		rnbd_clt_info(dev, "Remapping device.\n");
		err = send_msg_open(dev, RTRS_PERMIT_WAIT);
		if (err)
			rnbd_clt_err(dev, "remap_device: %d\n", err);
	}

	return err;
}

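/*
 * Work item used on module unload so that many devices can be unmapped
 * in parallel; see rnbd_destroy_sessions() below.
 */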
static void unmap_device_work(struct work_struct *work)
{
	struct rnbd_clt_dev *dev;

	dev = container_of(work, typeof(*dev), unmap_on_rmmod_work);
	rnbd_clt_unmap_device(dev, true, NULL);
}

static void rnbd_destroy_sessions(void)
{
	struct rnbd_clt_session *sess, *sn;
	struct rnbd_clt_dev *dev, *tn;

	/* First, forbid any further access through the sysfs interface */
	rnbd_clt_destroy_sysfs_files();

	/*
	 * At this point there is no concurrent access to the session and
	 * device lists:
	 * 1. No new session or device can be created - the session sysfs
	 *    files have been removed.
	 * 2. No device or session can be removed - the module reference
	 *    is taken in the unmap-device sysfs callback.
	 * 3. No I/O requests are in flight - each open of a block device
	 *    takes a module reference in get_disk().
	 *
	 * However, user requests sent by the asynchronous send_msg_*()
	 * functions can still be in flight, so the RTRS session must be
	 * closed explicitly before the devices are unmapped.
	 */

	list_for_each_entry_safe(sess, sn, &sess_list, list) {
		if (!rnbd_clt_get_sess(sess))
			continue;
		close_rtrs(sess);
		list_for_each_entry_safe(dev, tn, &sess->devs_list, list) {
			/*
			 * Unmapping happens in parallel for one reason only:
			 * del_gendisk() takes around half a second, so with a
			 * huge number of devices a sequential module unload
			 * would take minutes.
			 */
			INIT_WORK(&dev->unmap_on_rmmod_work, unmap_device_work);
			queue_work(rnbd_clt_wq, &dev->unmap_on_rmmod_work);
		}
		rnbd_clt_put_sess(sess);
	}
	/* Wait for all scheduled unmap works to finish */
	flush_workqueue(rnbd_clt_wq);
	WARN_ON(!list_empty(&sess_list));
}

static int __init rnbd_client_init(void)
{
	int err = 0;

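	/*
	 * These structs are part of the wire protocol; the size checks
	 * catch client/server ABI breakage at compile time.
	 */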
	BUILD_BUG_ON(sizeof(struct rnbd_msg_hdr) != 4);
	BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info) != 36);
	BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info_rsp) != 36);
	BUILD_BUG_ON(sizeof(struct rnbd_msg_open) != 264);
	BUILD_BUG_ON(sizeof(struct rnbd_msg_close) != 8);
	BUILD_BUG_ON(sizeof(struct rnbd_msg_open_rsp) != 56);
	rnbd_client_major = register_blkdev(rnbd_client_major, "rnbd");
	if (rnbd_client_major <= 0) {
		pr_err("Failed to load module, block device registration failed\n");
		return -EBUSY;
	}

	err = rnbd_clt_create_sysfs_files();
	if (err) {
		pr_err("Failed to load module, creating sysfs device files failed, err: %d\n",
		       err);
		unregister_blkdev(rnbd_client_major, "rnbd");
		return err;
	}
	rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", WQ_PERCPU, 0);
	if (!rnbd_clt_wq) {
		pr_err("Failed to load module, alloc_workqueue failed.\n");
		rnbd_clt_destroy_sysfs_files();
		unregister_blkdev(rnbd_client_major, "rnbd");
		err = -ENOMEM;
	}

	return err;
}

static void __exit rnbd_client_exit(void)
{
	rnbd_destroy_sessions();
	unregister_blkdev(rnbd_client_major, "rnbd");
	ida_destroy(&index_ida);
	destroy_workqueue(rnbd_clt_wq);
}

module_init(rnbd_client_init);
module_exit(rnbd_client_exit);