1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * VDPA simulator for block device. |
4 | * |
5 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. |
6 | * Copyright (c) 2021, Red Hat Inc. All rights reserved. |
7 | * |
8 | */ |
9 | |
10 | #include <linux/init.h> |
11 | #include <linux/module.h> |
12 | #include <linux/device.h> |
13 | #include <linux/kernel.h> |
14 | #include <linux/blkdev.h> |
15 | #include <linux/vringh.h> |
16 | #include <linux/vdpa.h> |
17 | #include <uapi/linux/virtio_blk.h> |
18 | |
19 | #include "vdpa_sim.h" |
20 | |
/* Module metadata, passed to the MODULE_*() macros at the end of this file */
#define DRV_VERSION  "0.1"
#define DRV_AUTHOR   "Max Gurtovoy <mgurtovoy@nvidia.com>"
#define DRV_DESC     "vDPA Device Simulator for block device"
#define DRV_LICENSE  "GPL v2"
25 | |
/* Feature bits advertised by the simulated block device: the common
 * VDPASIM_FEATURES plus the virtio-blk specific bits listed below.
 * The mask is handed to the simulator core as 'supported_features'.
 */
#define VDPASIM_BLK_FEATURES	(VDPASIM_FEATURES | \
				 (1ULL << VIRTIO_BLK_F_FLUSH)    | \
				 (1ULL << VIRTIO_BLK_F_SIZE_MAX) | \
				 (1ULL << VIRTIO_BLK_F_SEG_MAX)  | \
				 (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
				 (1ULL << VIRTIO_BLK_F_TOPOLOGY) | \
				 (1ULL << VIRTIO_BLK_F_MQ)       | \
				 (1ULL << VIRTIO_BLK_F_DISCARD)  | \
				 (1ULL << VIRTIO_BLK_F_WRITE_ZEROES))
35 | |
/* Disk capacity in sectors; the backing buffer is this value << SECTOR_SHIFT bytes */
#define VDPASIM_BLK_CAPACITY	0x40000
/* Value reported in the 'size_max' field of the virtio-blk config space */
#define VDPASIM_BLK_SIZE_MAX	0x1000
/* Value reported in the 'seg_max' field of the virtio-blk config space */
#define VDPASIM_BLK_SEG_MAX	32
/* Limit reported (and enforced) for discard and write-zeroes requests, in sectors */
#define VDPASIM_BLK_DWZ_MAX_SECTORS UINT_MAX

/* 1 virtqueue, 1 address space, 1 virtqueue group */
#define VDPASIM_BLK_VQ_NUM	1
#define VDPASIM_BLK_AS_NUM	1
#define VDPASIM_BLK_GROUP_NUM	1
45 | |
/* Per-device state of the block simulator.
 *
 * The generic simulator state is embedded so that sim_to_blk() can recover
 * this structure from a 'struct vdpasim' pointer with container_of().
 */
struct vdpasim_blk {
	/* generic simulator state */
	struct vdpasim vdpasim;
	/* backing storage: private allocation or the module-wide shared_buffer */
	void *buffer;
	/* true when 'buffer' points to shared_buffer and must not be freed here */
	bool shared_backend;
};
51 | |
52 | static struct vdpasim_blk *sim_to_blk(struct vdpasim *vdpasim) |
53 | { |
54 | return container_of(vdpasim, struct vdpasim_blk, vdpasim); |
55 | } |
56 | |
/* Device ID string returned for VIRTIO_BLK_T_GET_ID requests */
static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim";

/* Read-only (0444) module parameter: when set, every device created by this
 * module uses shared_buffer as its backing storage instead of a private one.
 */
static bool shared_backend;
module_param(shared_backend, bool, 0444);
MODULE_PARM_DESC(shared_backend, "Enable the shared backend between virtio-blk devices");

/* Backing storage used when shared_backend is set; allocated in
 * vdpasim_blk_init() and released in vdpasim_blk_exit().
 */
static void *shared_buffer;
/* mutex to synchronize shared_buffer access */
static DEFINE_MUTEX(shared_buffer_mutex);
66 | |
67 | static void vdpasim_blk_buffer_lock(struct vdpasim_blk *blk) |
68 | { |
69 | if (blk->shared_backend) |
70 | mutex_lock(&shared_buffer_mutex); |
71 | } |
72 | |
73 | static void vdpasim_blk_buffer_unlock(struct vdpasim_blk *blk) |
74 | { |
75 | if (blk->shared_backend) |
76 | mutex_unlock(lock: &shared_buffer_mutex); |
77 | } |
78 | |
79 | static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector, |
80 | u64 num_sectors, u64 max_sectors) |
81 | { |
82 | if (start_sector > VDPASIM_BLK_CAPACITY) { |
83 | dev_dbg(&vdpasim->vdpa.dev, |
84 | "starting sector exceeds the capacity - start: 0x%llx capacity: 0x%x\n" , |
85 | start_sector, VDPASIM_BLK_CAPACITY); |
86 | } |
87 | |
88 | if (num_sectors > max_sectors) { |
89 | dev_dbg(&vdpasim->vdpa.dev, |
90 | "number of sectors exceeds the max allowed in a request - num: 0x%llx max: 0x%llx\n" , |
91 | num_sectors, max_sectors); |
92 | return false; |
93 | } |
94 | |
95 | if (num_sectors > VDPASIM_BLK_CAPACITY - start_sector) { |
96 | dev_dbg(&vdpasim->vdpa.dev, |
97 | "request exceeds the capacity - start: 0x%llx num: 0x%llx capacity: 0x%x\n" , |
98 | start_sector, num_sectors, VDPASIM_BLK_CAPACITY); |
99 | return false; |
100 | } |
101 | |
102 | return true; |
103 | } |
104 | |
105 | /* Returns 'true' if the request is handled (with or without an I/O error) |
106 | * and the status is correctly written in the last byte of the 'in iov', |
107 | * 'false' otherwise. |
108 | */ |
109 | static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, |
110 | struct vdpasim_virtqueue *vq) |
111 | { |
112 | struct vdpasim_blk *blk = sim_to_blk(vdpasim); |
113 | size_t pushed = 0, to_pull, to_push; |
114 | struct virtio_blk_outhdr hdr; |
115 | bool handled = false; |
116 | ssize_t bytes; |
117 | loff_t offset; |
118 | u64 sector; |
119 | u8 status; |
120 | u32 type; |
121 | int ret; |
122 | |
123 | ret = vringh_getdesc_iotlb(vrh: &vq->vring, riov: &vq->out_iov, wiov: &vq->in_iov, |
124 | head: &vq->head, GFP_ATOMIC); |
125 | if (ret != 1) |
126 | return false; |
127 | |
128 | if (vq->out_iov.used < 1 || vq->in_iov.used < 1) { |
129 | dev_dbg(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n" , |
130 | vq->out_iov.used, vq->in_iov.used); |
131 | goto err; |
132 | } |
133 | |
134 | if (vq->in_iov.iov[vq->in_iov.used - 1].iov_len < 1) { |
135 | dev_dbg(&vdpasim->vdpa.dev, "request in header too short\n" ); |
136 | goto err; |
137 | } |
138 | |
139 | /* The last byte is the status and we checked if the last iov has |
140 | * enough room for it. |
141 | */ |
142 | to_push = vringh_kiov_length(kiov: &vq->in_iov) - 1; |
143 | |
144 | to_pull = vringh_kiov_length(kiov: &vq->out_iov); |
145 | |
146 | bytes = vringh_iov_pull_iotlb(vrh: &vq->vring, riov: &vq->out_iov, dst: &hdr, |
147 | len: sizeof(hdr)); |
148 | if (bytes != sizeof(hdr)) { |
149 | dev_dbg(&vdpasim->vdpa.dev, "request out header too short\n" ); |
150 | goto err; |
151 | } |
152 | |
153 | to_pull -= bytes; |
154 | |
155 | type = vdpasim32_to_cpu(vdpasim, val: hdr.type); |
156 | sector = vdpasim64_to_cpu(vdpasim, val: hdr.sector); |
157 | offset = sector << SECTOR_SHIFT; |
158 | status = VIRTIO_BLK_S_OK; |
159 | |
160 | if (type != VIRTIO_BLK_T_IN && type != VIRTIO_BLK_T_OUT && |
161 | sector != 0) { |
162 | dev_dbg(&vdpasim->vdpa.dev, |
163 | "sector must be 0 for %u request - sector: 0x%llx\n" , |
164 | type, sector); |
165 | status = VIRTIO_BLK_S_IOERR; |
166 | goto err_status; |
167 | } |
168 | |
169 | switch (type) { |
170 | case VIRTIO_BLK_T_IN: |
171 | if (!vdpasim_blk_check_range(vdpasim, start_sector: sector, |
172 | num_sectors: to_push >> SECTOR_SHIFT, |
173 | VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) { |
174 | status = VIRTIO_BLK_S_IOERR; |
175 | break; |
176 | } |
177 | |
178 | vdpasim_blk_buffer_lock(blk); |
179 | bytes = vringh_iov_push_iotlb(vrh: &vq->vring, wiov: &vq->in_iov, |
180 | src: blk->buffer + offset, len: to_push); |
181 | vdpasim_blk_buffer_unlock(blk); |
182 | if (bytes < 0) { |
183 | dev_dbg(&vdpasim->vdpa.dev, |
184 | "vringh_iov_push_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n" , |
185 | bytes, offset, to_push); |
186 | status = VIRTIO_BLK_S_IOERR; |
187 | break; |
188 | } |
189 | |
190 | pushed += bytes; |
191 | break; |
192 | |
193 | case VIRTIO_BLK_T_OUT: |
194 | if (!vdpasim_blk_check_range(vdpasim, start_sector: sector, |
195 | num_sectors: to_pull >> SECTOR_SHIFT, |
196 | VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) { |
197 | status = VIRTIO_BLK_S_IOERR; |
198 | break; |
199 | } |
200 | |
201 | vdpasim_blk_buffer_lock(blk); |
202 | bytes = vringh_iov_pull_iotlb(vrh: &vq->vring, riov: &vq->out_iov, |
203 | dst: blk->buffer + offset, len: to_pull); |
204 | vdpasim_blk_buffer_unlock(blk); |
205 | if (bytes < 0) { |
206 | dev_dbg(&vdpasim->vdpa.dev, |
207 | "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n" , |
208 | bytes, offset, to_pull); |
209 | status = VIRTIO_BLK_S_IOERR; |
210 | break; |
211 | } |
212 | break; |
213 | |
214 | case VIRTIO_BLK_T_GET_ID: |
215 | bytes = vringh_iov_push_iotlb(vrh: &vq->vring, wiov: &vq->in_iov, |
216 | src: vdpasim_blk_id, |
217 | VIRTIO_BLK_ID_BYTES); |
218 | if (bytes < 0) { |
219 | dev_dbg(&vdpasim->vdpa.dev, |
220 | "vringh_iov_push_iotlb() error: %zd\n" , bytes); |
221 | status = VIRTIO_BLK_S_IOERR; |
222 | break; |
223 | } |
224 | |
225 | pushed += bytes; |
226 | break; |
227 | |
228 | case VIRTIO_BLK_T_FLUSH: |
229 | /* nothing to do */ |
230 | break; |
231 | |
232 | case VIRTIO_BLK_T_DISCARD: |
233 | case VIRTIO_BLK_T_WRITE_ZEROES: { |
234 | struct virtio_blk_discard_write_zeroes range; |
235 | u32 num_sectors, flags; |
236 | |
237 | if (to_pull != sizeof(range)) { |
238 | dev_dbg(&vdpasim->vdpa.dev, |
239 | "discard/write_zeroes header len: 0x%zx [expected: 0x%zx]\n" , |
240 | to_pull, sizeof(range)); |
241 | status = VIRTIO_BLK_S_IOERR; |
242 | break; |
243 | } |
244 | |
245 | bytes = vringh_iov_pull_iotlb(vrh: &vq->vring, riov: &vq->out_iov, dst: &range, |
246 | len: to_pull); |
247 | if (bytes < 0) { |
248 | dev_dbg(&vdpasim->vdpa.dev, |
249 | "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n" , |
250 | bytes, offset, to_pull); |
251 | status = VIRTIO_BLK_S_IOERR; |
252 | break; |
253 | } |
254 | |
255 | sector = le64_to_cpu(range.sector); |
256 | offset = sector << SECTOR_SHIFT; |
257 | num_sectors = le32_to_cpu(range.num_sectors); |
258 | flags = le32_to_cpu(range.flags); |
259 | |
260 | if (type == VIRTIO_BLK_T_DISCARD && flags != 0) { |
261 | dev_dbg(&vdpasim->vdpa.dev, |
262 | "discard unexpected flags set - flags: 0x%x\n" , |
263 | flags); |
264 | status = VIRTIO_BLK_S_UNSUPP; |
265 | break; |
266 | } |
267 | |
268 | if (type == VIRTIO_BLK_T_WRITE_ZEROES && |
269 | flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { |
270 | dev_dbg(&vdpasim->vdpa.dev, |
271 | "write_zeroes unexpected flags set - flags: 0x%x\n" , |
272 | flags); |
273 | status = VIRTIO_BLK_S_UNSUPP; |
274 | break; |
275 | } |
276 | |
277 | if (!vdpasim_blk_check_range(vdpasim, start_sector: sector, num_sectors, |
278 | VDPASIM_BLK_DWZ_MAX_SECTORS)) { |
279 | status = VIRTIO_BLK_S_IOERR; |
280 | break; |
281 | } |
282 | |
283 | if (type == VIRTIO_BLK_T_WRITE_ZEROES) { |
284 | vdpasim_blk_buffer_lock(blk); |
285 | memset(blk->buffer + offset, 0, |
286 | num_sectors << SECTOR_SHIFT); |
287 | vdpasim_blk_buffer_unlock(blk); |
288 | } |
289 | |
290 | break; |
291 | } |
292 | default: |
293 | dev_dbg(&vdpasim->vdpa.dev, |
294 | "Unsupported request type %d\n" , type); |
295 | status = VIRTIO_BLK_S_IOERR; |
296 | break; |
297 | } |
298 | |
299 | err_status: |
300 | /* If some operations fail, we need to skip the remaining bytes |
301 | * to put the status in the last byte |
302 | */ |
303 | if (to_push - pushed > 0) |
304 | vringh_kiov_advance(kiov: &vq->in_iov, len: to_push - pushed); |
305 | |
306 | /* Last byte is the status */ |
307 | bytes = vringh_iov_push_iotlb(vrh: &vq->vring, wiov: &vq->in_iov, src: &status, len: 1); |
308 | if (bytes != 1) |
309 | goto err; |
310 | |
311 | pushed += bytes; |
312 | |
313 | /* Make sure data is wrote before advancing index */ |
314 | smp_wmb(); |
315 | |
316 | handled = true; |
317 | |
318 | err: |
319 | vringh_complete_iotlb(vrh: &vq->vring, head: vq->head, len: pushed); |
320 | |
321 | return handled; |
322 | } |
323 | |
324 | static void vdpasim_blk_work(struct vdpasim *vdpasim) |
325 | { |
326 | bool reschedule = false; |
327 | int i; |
328 | |
329 | mutex_lock(&vdpasim->mutex); |
330 | |
331 | if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) |
332 | goto out; |
333 | |
334 | if (!vdpasim->running) |
335 | goto out; |
336 | |
337 | for (i = 0; i < VDPASIM_BLK_VQ_NUM; i++) { |
338 | struct vdpasim_virtqueue *vq = &vdpasim->vqs[i]; |
339 | int reqs = 0; |
340 | |
341 | if (!vq->ready) |
342 | continue; |
343 | |
344 | while (vdpasim_blk_handle_req(vdpasim, vq)) { |
345 | /* Make sure used is visible before rasing the interrupt. */ |
346 | smp_wmb(); |
347 | |
348 | local_bh_disable(); |
349 | if (vringh_need_notify_iotlb(vrh: &vq->vring) > 0) |
350 | vringh_notify(vrh: &vq->vring); |
351 | local_bh_enable(); |
352 | |
353 | if (++reqs > 4) { |
354 | reschedule = true; |
355 | break; |
356 | } |
357 | } |
358 | } |
359 | out: |
360 | mutex_unlock(lock: &vdpasim->mutex); |
361 | |
362 | if (reschedule) |
363 | vdpasim_schedule_work(vdpasim); |
364 | } |
365 | |
366 | static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) |
367 | { |
368 | struct virtio_blk_config *blk_config = config; |
369 | |
370 | memset(config, 0, sizeof(struct virtio_blk_config)); |
371 | |
372 | blk_config->capacity = cpu_to_vdpasim64(vdpasim, VDPASIM_BLK_CAPACITY); |
373 | blk_config->size_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SIZE_MAX); |
374 | blk_config->seg_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SEG_MAX); |
375 | blk_config->num_queues = cpu_to_vdpasim16(vdpasim, VDPASIM_BLK_VQ_NUM); |
376 | blk_config->min_io_size = cpu_to_vdpasim16(vdpasim, val: 1); |
377 | blk_config->opt_io_size = cpu_to_vdpasim32(vdpasim, val: 1); |
378 | blk_config->blk_size = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE); |
379 | /* VIRTIO_BLK_F_DISCARD */ |
380 | blk_config->discard_sector_alignment = |
381 | cpu_to_vdpasim32(vdpasim, SECTOR_SIZE); |
382 | blk_config->max_discard_sectors = |
383 | cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS); |
384 | blk_config->max_discard_seg = cpu_to_vdpasim32(vdpasim, val: 1); |
385 | /* VIRTIO_BLK_F_WRITE_ZEROES */ |
386 | blk_config->max_write_zeroes_sectors = |
387 | cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS); |
388 | blk_config->max_write_zeroes_seg = cpu_to_vdpasim32(vdpasim, val: 1); |
389 | |
390 | } |
391 | |
392 | static void vdpasim_blk_free(struct vdpasim *vdpasim) |
393 | { |
394 | struct vdpasim_blk *blk = sim_to_blk(vdpasim); |
395 | |
396 | if (!blk->shared_backend) |
397 | kvfree(addr: blk->buffer); |
398 | } |
399 | |
/* Release callback of the management parent device.
 *
 * Intentionally empty: vdpasim_blk_mgmtdev is a static object, so there is
 * nothing to free here.
 */
static void vdpasim_blk_mgmtdev_release(struct device *dev)
{
}

/* Parent device of the vDPA management device, registered at module init */
static struct device vdpasim_blk_mgmtdev = {
	.init_name = "vdpasim_blk",
	.release = vdpasim_blk_mgmtdev_release,
};
408 | |
409 | static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, |
410 | const struct vdpa_dev_set_config *config) |
411 | { |
412 | struct vdpasim_dev_attr dev_attr = {}; |
413 | struct vdpasim_blk *blk; |
414 | struct vdpasim *simdev; |
415 | int ret; |
416 | |
417 | dev_attr.mgmt_dev = mdev; |
418 | dev_attr.name = name; |
419 | dev_attr.id = VIRTIO_ID_BLOCK; |
420 | dev_attr.supported_features = VDPASIM_BLK_FEATURES; |
421 | dev_attr.nvqs = VDPASIM_BLK_VQ_NUM; |
422 | dev_attr.ngroups = VDPASIM_BLK_GROUP_NUM; |
423 | dev_attr.nas = VDPASIM_BLK_AS_NUM; |
424 | dev_attr.alloc_size = sizeof(struct vdpasim_blk); |
425 | dev_attr.config_size = sizeof(struct virtio_blk_config); |
426 | dev_attr.get_config = vdpasim_blk_get_config; |
427 | dev_attr.work_fn = vdpasim_blk_work; |
428 | dev_attr.free = vdpasim_blk_free; |
429 | |
430 | simdev = vdpasim_create(attr: &dev_attr, config); |
431 | if (IS_ERR(ptr: simdev)) |
432 | return PTR_ERR(ptr: simdev); |
433 | |
434 | blk = sim_to_blk(vdpasim: simdev); |
435 | blk->shared_backend = shared_backend; |
436 | |
437 | if (blk->shared_backend) { |
438 | blk->buffer = shared_buffer; |
439 | } else { |
440 | blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, |
441 | GFP_KERNEL); |
442 | if (!blk->buffer) { |
443 | ret = -ENOMEM; |
444 | goto put_dev; |
445 | } |
446 | } |
447 | |
448 | ret = _vdpa_register_device(vdev: &simdev->vdpa, VDPASIM_BLK_VQ_NUM); |
449 | if (ret) |
450 | goto put_dev; |
451 | |
452 | return 0; |
453 | |
454 | put_dev: |
455 | put_device(dev: &simdev->vdpa.dev); |
456 | return ret; |
457 | } |
458 | |
459 | static void vdpasim_blk_dev_del(struct vdpa_mgmt_dev *mdev, |
460 | struct vdpa_device *dev) |
461 | { |
462 | struct vdpasim *simdev = container_of(dev, struct vdpasim, vdpa); |
463 | |
464 | _vdpa_unregister_device(vdev: &simdev->vdpa); |
465 | } |
466 | |
/* Management operations: add and delete simulated block devices */
static const struct vdpa_mgmtdev_ops vdpasim_blk_mgmtdev_ops = {
	.dev_add = vdpasim_blk_dev_add,
	.dev_del = vdpasim_blk_dev_del
};

/* Device IDs handled by this management device: virtio block, any vendor.
 * The all-zero entry terminates the table.
 */
static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

/* Management device registered with the vDPA core at module init */
static struct vdpa_mgmt_dev mgmt_dev = {
	.device = &vdpasim_blk_mgmtdev,
	.id_table = id_table,
	.ops = &vdpasim_blk_mgmtdev_ops,
};
482 | |
483 | static int __init vdpasim_blk_init(void) |
484 | { |
485 | int ret; |
486 | |
487 | ret = device_register(dev: &vdpasim_blk_mgmtdev); |
488 | if (ret) { |
489 | put_device(dev: &vdpasim_blk_mgmtdev); |
490 | return ret; |
491 | } |
492 | |
493 | ret = vdpa_mgmtdev_register(mdev: &mgmt_dev); |
494 | if (ret) |
495 | goto parent_err; |
496 | |
497 | if (shared_backend) { |
498 | shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, |
499 | GFP_KERNEL); |
500 | if (!shared_buffer) { |
501 | ret = -ENOMEM; |
502 | goto mgmt_dev_err; |
503 | } |
504 | } |
505 | |
506 | return 0; |
507 | mgmt_dev_err: |
508 | vdpa_mgmtdev_unregister(mdev: &mgmt_dev); |
509 | parent_err: |
510 | device_unregister(dev: &vdpasim_blk_mgmtdev); |
511 | return ret; |
512 | } |
513 | |
514 | static void __exit vdpasim_blk_exit(void) |
515 | { |
516 | kvfree(addr: shared_buffer); |
517 | vdpa_mgmtdev_unregister(mdev: &mgmt_dev); |
518 | device_unregister(dev: &vdpasim_blk_mgmtdev); |
519 | } |
520 | |
/* Module entry and exit points */
module_init(vdpasim_blk_init)
module_exit(vdpasim_blk_exit)

/* Module metadata, see the DRV_* definitions at the top of this file */
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE(DRV_LICENSE);
MODULE_AUTHOR(DRV_AUTHOR);
MODULE_DESCRIPTION(DRV_DESC);
528 | |