// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020 Red Hat GmbH
 *
 * This file is released under the GPL.
 *
 * Device-mapper target to emulate smaller logical block
 * size on backing devices exposing (natively) larger ones.
 *
 * E.g. 512 byte sector emulation on 4K native disks.
 */

#include "dm.h"
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/dm-bufio.h>

#define DM_MSG_PREFIX "ebs"

static void ebs_dtr(struct dm_target *ti);

/* Emulated block size context. */
struct ebs_c {
	struct dm_dev *dev;		/* Underlying device to emulate block size on. */
	struct dm_bufio_client *bufio;	/* Use dm-bufio for read and read-modify-write processing. */
	struct workqueue_struct *wq;	/* Workqueue for ^ processing of bios. */
	struct work_struct ws;		/* Work item used for ^. */
	struct bio_list bios_in;	/* Worker bios input list. */
	spinlock_t lock;		/* Guard bios input list above. */
	sector_t start;			/* <start> table line argument, see ebs_ctr below. */
	unsigned int e_bs;		/* Emulated block size in sectors exposed to upper layer. */
	unsigned int u_bs;		/* Underlying block size in sectors retrieved from/set on lower layer device. */
	unsigned char block_shift;	/* bitshift sectors -> blocks used in dm-bufio API. */
	bool u_bs_set:1;		/* Flag to indicate underlying block size is set on table line. */
};

static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector)
{
	return sector >> ec->block_shift;
}

static inline sector_t __block_mod(sector_t sector, unsigned int bs)
{
	return sector & (bs - 1);
}
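
/*
 * Block sizes are validated as powers of two (see __ebs_check_bs() below),
 * so the mask above is equivalent to sector % bs. For example, with an
 * 8-sector (4 KiB) underlying block size, __block_mod(11, 8) == 3: sector
 * 11 sits 3 sectors into its underlying block.
 */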

/* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */
static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio)
{
	sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio);

	return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0);
}
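
/*
 * Worked example: with ec->u_bs == 8, a 16-sector bio starting at sector 6
 * touches sectors 6..21, i.e. underlying blocks 0..2. The math above gives
 * end_sector = 6 + 16 = 22, 22 >> 3 = 2 whole blocks, plus one for the
 * partial tail block, so __nr_blocks() returns 3.
 */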

static inline bool __ebs_check_bs(unsigned int bs)
{
	return bs && is_power_of_2(bs);
}

/*
 * READ/WRITE:
 *
 * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages.
 */
static int __ebs_rw_bvec(struct ebs_c *ec, enum req_op op, struct bio_vec *bv,
			 struct bvec_iter *iter)
{
	int r = 0;
	unsigned char *ba, *pa;
	unsigned int cur_len;
	unsigned int bv_len = bv->bv_len;
	unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs));
	sector_t block = __sector_to_block(ec, iter->bi_sector);
	struct dm_buffer *b;

	if (unlikely(!bv->bv_page || !bv_len))
		return -EIO;

	pa = bvec_virt(bv);

	/* Handle overlapping page <-> blocks */
	while (bv_len) {
		cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len);

		/* Avoid reading for writes in case bio vector's page overwrites block completely. */
		if (op == REQ_OP_READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio))
			ba = dm_bufio_read(ec->bufio, block, &b);
		else
			ba = dm_bufio_new(ec->bufio, block, &b);

		if (IS_ERR(ba)) {
			/*
			 * Carry on with next buffer, if any, to issue all possible
			 * data but return error.
			 */
			r = PTR_ERR(ba);
		} else {
			/* Copy data to/from bio to buffer if read/new was successful above. */
			ba += buf_off;
			if (op == REQ_OP_READ) {
				memcpy(pa, ba, cur_len);
				flush_dcache_page(bv->bv_page);
			} else {
				flush_dcache_page(bv->bv_page);
				memcpy(ba, pa, cur_len);
				dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len);
			}

			dm_bufio_release(b);
		}

		pa += cur_len;
		bv_len -= cur_len;
		buf_off = 0;
		block++;
	}

	return r;
}

/* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */
static int __ebs_rw_bio(struct ebs_c *ec, enum req_op op, struct bio *bio)
{
	int r = 0, rr;
	struct bio_vec bv;
	struct bvec_iter iter;

	bio_for_each_bvec(bv, bio, iter) {
		rr = __ebs_rw_bvec(ec, op, &bv, &iter);
		if (rr)
			r = rr;
	}

	return r;
}

/*
 * Discard bio's blocks, i.e. pass discards down.
 *
 * Avoid discarding partial blocks at beginning and end;
 * return 0 in case no blocks can be discarded as a result.
 */
static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio)
{
	sector_t block, blocks, sector = bio->bi_iter.bi_sector;

	block = __sector_to_block(ec, sector);
	blocks = __nr_blocks(ec, bio);

	/*
	 * Partial first underlying block (__nr_blocks() may have
	 * resulted in one block).
	 */
	if (__block_mod(sector, ec->u_bs)) {
		block++;
		blocks--;
	}

	/* Partial last underlying block if any. */
	if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs))
		blocks--;

	return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0;
}
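
/*
 * Worked example: with ec->u_bs == 8, a discard of sectors 2..17 spans
 * underlying blocks 0..2, but only block 1 (sectors 8..15) is fully
 * covered; the partial head and tail blocks are preserved and only block
 * 1 is passed down to dm_bufio_issue_discard().
 */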

/* Release the bio's blocks from the bufio cache. */
static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio)
{
	sector_t blocks, sector = bio->bi_iter.bi_sector;

	blocks = __nr_blocks(ec, bio);

	dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks);
}

/* Worker function to process incoming bios. */
static void __ebs_process_bios(struct work_struct *ws)
{
	int r;
	bool write = false;
	sector_t block1, block2;
	struct ebs_c *ec = container_of(ws, struct ebs_c, ws);
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irq(&ec->lock);
	bios = ec->bios_in;
	bio_list_init(&ec->bios_in);
	spin_unlock_irq(&ec->lock);

	/* Prefetch all read and any mis-aligned write buffers */
	bio_list_for_each(bio, &bios) {
		block1 = __sector_to_block(ec, bio->bi_iter.bi_sector);
		if (bio_op(bio) == REQ_OP_READ)
			dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio));
		else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) {
			block2 = __sector_to_block(ec, bio_end_sector(bio));
			if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs))
				dm_bufio_prefetch(ec->bufio, block1, 1);
			if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1)
				dm_bufio_prefetch(ec->bufio, block2, 1);
		}
	}

	bio_list_for_each(bio, &bios) {
		r = -EIO;
		if (bio_op(bio) == REQ_OP_READ)
			r = __ebs_rw_bio(ec, REQ_OP_READ, bio);
		else if (bio_op(bio) == REQ_OP_WRITE) {
			write = true;
			r = __ebs_rw_bio(ec, REQ_OP_WRITE, bio);
		} else if (bio_op(bio) == REQ_OP_DISCARD) {
			__ebs_forget_bio(ec, bio);
			r = __ebs_discard_bio(ec, bio);
		}

		if (r < 0)
			bio->bi_status = errno_to_blk_status(r);
	}

	/*
	 * We write dirty buffers after processing I/O on them
	 * but before we endio, thus addressing REQ_FUA/REQ_SYNC.
	 */
	r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0;

	while ((bio = bio_list_pop(&bios))) {
		/* Any other request is endioed. */
		if (unlikely(r && bio_op(bio) == REQ_OP_WRITE))
			bio_io_error(bio);
		else
			bio_endio(bio);
	}
}

/*
 * Construct an emulated block size mapping: <dev_path> <offset> <ebs> [<ubs>]
 *
 * <dev_path>: path of the underlying device
 * <offset>: offset in 512-byte sectors into <dev_path>
 * <ebs>: emulated block size in units of 512 bytes exposed to the upper layer
 * [<ubs>]: underlying block size in units of 512 bytes imposed on the lower layer;
 *	    optional, if not supplied, retrieve logical block size from underlying device
 */
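/*
 * Illustrative usage (device name and size below are placeholders): expose
 * a 512-byte logical block size on a 4 KiB-native device with
 *
 *   dmsetup create ebs0 --table "0 $(blockdev --getsz /dev/sdX) ebs /dev/sdX 0 1 8"
 *
 * where <ebs> = 1 sector (512 bytes) and <ubs> = 8 sectors (4 KiB).
 */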
static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	unsigned short tmp1;
	unsigned long long tmp;
	char dummy;
	struct ebs_c *ec;

	if (argc < 3 || argc > 4) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL);
	if (!ec) {
		ti->error = "Cannot allocate ebs context";
		return -ENOMEM;
	}

	r = -EINVAL;
	if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 ||
	    tmp != (sector_t)tmp ||
	    (sector_t)tmp >= ti->len) {
		ti->error = "Invalid device offset sector";
		goto bad;
	}
	ec->start = tmp;

	if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 ||
	    !__ebs_check_bs(tmp1) ||
	    to_bytes(tmp1) > PAGE_SIZE) {
		ti->error = "Invalid emulated block size";
		goto bad;
	}
	ec->e_bs = tmp1;

	if (argc > 3) {
		if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) {
			ti->error = "Invalid underlying block size";
			goto bad;
		}
		ec->u_bs = tmp1;
		ec->u_bs_set = true;
	} else
		ec->u_bs_set = false;

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev);
	if (r) {
		ti->error = "Device lookup failed";
		ec->dev = NULL;
		goto bad;
	}

	r = -EINVAL;
	if (!ec->u_bs_set) {
		ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev));
		if (!__ebs_check_bs(ec->u_bs)) {
			ti->error = "Invalid retrieved underlying block size";
			goto bad;
		}
	}

	if (!ec->u_bs_set && ec->e_bs == ec->u_bs)
		DMINFO("Emulation superfluous: emulated equal to underlying block size");

	if (__block_mod(ec->start, ec->u_bs)) {
		ti->error = "Device offset must be multiple of underlying block size";
		goto bad;
	}

	ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1,
					   0, NULL, NULL, 0);
	if (IS_ERR(ec->bufio)) {
		ti->error = "Cannot create dm bufio client";
		r = PTR_ERR(ec->bufio);
		ec->bufio = NULL;
		goto bad;
	}

	ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!ec->wq) {
		ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue";
		r = -ENOMEM;
		goto bad;
	}

	ec->block_shift = __ffs(ec->u_bs);
	INIT_WORK(&ec->ws, &__ebs_process_bios);
	bio_list_init(&ec->bios_in);
	spin_lock_init(&ec->lock);

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_secure_erase_bios = 0;
	ti->num_write_zeroes_bios = 0;
	return 0;
bad:
	ebs_dtr(ti);
	return r;
}

static void ebs_dtr(struct dm_target *ti)
{
	struct ebs_c *ec = ti->private;

	if (ec->wq)
		destroy_workqueue(ec->wq);
	if (ec->bufio)
		dm_bufio_client_destroy(ec->bufio);
	if (ec->dev)
		dm_put_device(ti, ec->dev);
	kfree(ec);
}

static int ebs_map(struct dm_target *ti, struct bio *bio)
{
	struct ebs_c *ec = ti->private;

	bio_set_dev(bio, ec->dev->bdev);
	bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector);

	if (unlikely(bio_op(bio) == REQ_OP_FLUSH))
		return DM_MAPIO_REMAPPED;
	/*
	 * Only queue for bufio processing in case of partial or overlapping buffers
	 * -or-
	 * emulation with ebs == ubs aiming for tests of dm-bufio overhead.
	 */
	if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) ||
		   __block_mod(bio_end_sector(bio), ec->u_bs) ||
		   ec->e_bs == ec->u_bs)) {
		spin_lock_irq(&ec->lock);
		bio_list_add(&ec->bios_in, bio);
		spin_unlock_irq(&ec->lock);

		queue_work(ec->wq, &ec->ws);

		return DM_MAPIO_SUBMITTED;
	}

	/* Forget any buffer content relative to this direct backing device I/O. */
	__ebs_forget_bio(ec, bio);

	return DM_MAPIO_REMAPPED;
}
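
/*
 * E.g. with e_bs == 1 and u_bs == 8, a 512-byte write at sector 3 is
 * misaligned to the 4 KiB underlying blocks and is queued above for
 * read-modify-write via dm-bufio, whereas a block-aligned 4 KiB I/O
 * takes the direct remap path.
 */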

static void ebs_status(struct dm_target *ti, status_type_t type,
		       unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct ebs_c *ec = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		*result = '\0';
		break;
	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u",
			 ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs);
		break;
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}

static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct ebs_c *ec = ti->private;
	struct dm_dev *dev = ec->dev;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	*bdev = dev->bdev;
	return !!(ec->start || ti->len != bdev_nr_sectors(dev->bdev));
}

static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct ebs_c *ec = ti->private;

	limits->logical_block_size = to_bytes(ec->e_bs);
	limits->physical_block_size = to_bytes(ec->u_bs);
	limits->alignment_offset = limits->physical_block_size;
	blk_limits_io_min(limits, limits->logical_block_size);
}
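
/*
 * E.g. with e_bs == 1 and u_bs == 8, the hints above advertise a 512-byte
 * logical and a 4096-byte physical block size to the upper layers.
 */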

static int ebs_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct ebs_c *ec = ti->private;

	return fn(ti, ec->dev, ec->start, ti->len, data);
}

static struct target_type ebs_target = {
	.name = "ebs",
	.version = {1, 0, 1},
	.features = DM_TARGET_PASSES_INTEGRITY,
	.module = THIS_MODULE,
	.ctr = ebs_ctr,
	.dtr = ebs_dtr,
	.map = ebs_map,
	.status = ebs_status,
	.io_hints = ebs_io_hints,
	.prepare_ioctl = ebs_prepare_ioctl,
	.iterate_devices = ebs_iterate_devices,
};
module_dm(ebs);

MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " emulated block size target");
MODULE_LICENSE("GPL");