// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);
static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = bdev->bd_inode;
	int ret;

	spin_lock(&inode->i_lock);
	while (inode->i_state & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret)
			pr_warn_ratelimited(
				"VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
				bdev, ret);
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not. */
static void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping_empty(mapping))
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages) {
		invalidate_bh_lrus();
		lru_add_drain_all();	/* make sure all lru add caches are flushed */
		invalidate_mapping_pages(mapping, 0, -1);
	}
}
EXPORT_SYMBOL(invalidate_bdev);
/*
 * Drop all buffers & page cache for given bdev range. This function bails
 * out with error if bdev has other exclusive owner (such as filesystem).
 */
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
			loff_t lstart, loff_t lend)
{
	/*
	 * If we don't hold exclusive handle for the device, upgrade to it
	 * while we discard the buffer cache to avoid discarding buffers
	 * under live filesystem.
	 */
	if (!(mode & BLK_OPEN_EXCL)) {
		int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL);
		if (err)
			goto invalidate;
	}

	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
	if (!(mode & BLK_OPEN_EXCL))
		bd_abort_claiming(bdev, truncate_bdev_range);
	return 0;

invalidate:
	/*
	 * Someone else holds an exclusive open of the device. Try
	 * invalidating instead. The 'end' argument is inclusive so the
	 * rounding is safe.
	 */
	return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
					     lstart >> PAGE_SHIFT,
					     lend >> PAGE_SHIFT);
}
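
/*
 * Illustrative usage sketch (not part of the original source): a
 * BLKDISCARD-style path dropping the page cache for the affected range
 * before issuing the discard. blk_ioctl_discard_sketch() and its bounds
 * handling are hypothetical; truncate_bdev_range() takes an inclusive
 * 'lend', hence the -1.
 *
 *	static int blk_ioctl_discard_sketch(struct block_device *bdev,
 *			blk_mode_t mode, u64 start, u64 len)
 *	{
 *		int err;
 *
 *		err = truncate_bdev_range(bdev, mode, start, start + len - 1);
 *		if (err)
 *			return err;
 *		return blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
 *					    len >> SECTOR_SHIFT, GFP_KERNEL);
 *	}
 */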

static void set_init_blocksize(struct block_device *bdev)
{
	unsigned int bsize = bdev_logical_block_size(bdev);
	loff_t size = i_size_read(bdev->bd_inode);

	while (bsize < PAGE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
		sync_blockdev(bdev);
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);
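
/*
 * Illustrative usage sketch (not part of the original source): switching a
 * device to 4KiB blocks. This fails with -EINVAL if 4096 exceeds PAGE_SIZE
 * or is below the device's logical block size.
 *
 *	int err = set_blocksize(bdev, 4096);
 *
 *	if (err)
 *		return err;
 */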

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is a power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);
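
/*
 * Illustrative usage sketch (not part of the original source): a
 * filesystem's fill_super callback picking the smallest usable block size
 * before reading its on-disk superblock. myfs_fill_super() is hypothetical;
 * note that sb_min_blocksize() returns 0 on failure.
 *
 *	static int myfs_fill_super(struct super_block *sb, void *data, int silent)
 *	{
 *		int blocksize = sb_min_blocksize(sb, 1024);
 *
 *		if (!blocksize)
 *			return -EINVAL;
 *		... read the on-disk superblock at 'blocksize' granularity ...
 *		return 0;
 *	}
 */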

int sync_blockdev_nowait(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_flush(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL(sync_blockdev);

int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
	return filemap_write_and_wait_range(bdev->bd_inode->i_mapping,
			lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);
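
/*
 * Illustrative usage sketch (not part of the original source): flushing only
 * the byte range a caller is about to operate on instead of the whole
 * device; the 1MiB range is arbitrary and the range is inclusive.
 *
 *	int err = sync_blockdev_range(bdev, 0, SZ_1M - 1);
 *
 *	if (err)
 *		return err;
 */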

/**
 * freeze_bdev - lock a filesystem and force it into a consistent state
 * @bdev: blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 * counts down in thaw_bdev(). When it becomes 0, thaw_bdev() will actually
 * unfreeze.
 */
int freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (++bdev->bd_fsfreeze_count > 1)
		goto done;

	sb = get_active_super(bdev);
	if (!sb)
		goto sync;
	if (sb->s_op->freeze_super)
		error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
	else
		error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
	deactivate_super(sb);

	if (error) {
		bdev->bd_fsfreeze_count--;
		goto done;
	}
	bdev->bd_fsfreeze_sb = sb;

sync:
	sync_blockdev(bdev);
done:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev - unlock filesystem
 * @bdev: blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
int thaw_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = -EINVAL;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (!bdev->bd_fsfreeze_count)
		goto out;

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
		goto out;

	sb = bdev->bd_fsfreeze_sb;
	if (!sb)
		goto out;

	if (sb->s_op->thaw_super)
		error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
	else
		error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
	if (error)
		bdev->bd_fsfreeze_count++;
	else
		bdev->bd_fsfreeze_sb = NULL;
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(thaw_bdev);
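
/*
 * Illustrative usage sketch (not part of the original source): the
 * freeze/thaw pairing as a snapshot implementation might use it. Calls nest
 * via bd_fsfreeze_count, so only the last thaw_bdev() actually unfreezes.
 *
 *	int err = freeze_bdev(bdev);
 *
 *	if (err)
 *		return err;
 *	... create the snapshot while the filesystem is quiesced ...
 *	err = thaw_bdev(bdev);
 */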

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
static struct kmem_cache *bdev_cachep __ro_after_init;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);

	if (!ei)
		return NULL;
	memset(&ei->bdev, 0, sizeof(ei->bdev));
	return &ei->vfs_inode;
}

static void bdev_free_inode(struct inode *inode)
{
	struct block_device *bdev = I_BDEV(inode);

	free_percpu(bdev->bd_stats);
	kfree(bdev->bd_meta_info);

	if (!bdev_is_partition(bdev)) {
		if (bdev->bd_disk && bdev->bd_disk->bdi)
			bdi_put(bdev->bd_disk->bdi);
		kfree(bdev->bd_disk);
	}

	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(bdev->bd_dev));

	kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

static void init_once(void *data)
{
	struct bdev_inode *ei = data;

	inode_init_once(&ei->vfs_inode);
}

static void bdev_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.free_inode = bdev_free_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static int bd_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	fc->s_iflags |= SB_I_CGROUPWB;
	ctx->ops = &bdev_sops;
	return 0;
}

static struct file_system_type bd_type = {
	.name		= "bdev",
	.init_fs_context = bd_init_fs_context,
	.kill_sb	= kill_anon_super,
};

struct super_block *blockdev_superblock __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);

void __init bdev_cache_init(void)
{
	int err;
	static struct vfsmount *bd_mnt __ro_after_init;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = new_inode(blockdev_superblock);
	if (!inode)
		return NULL;
	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);

	bdev = I_BDEV(inode);
	mutex_init(&bdev->bd_fsfreeze_mutex);
	spin_lock_init(&bdev->bd_size_lock);
	mutex_init(&bdev->bd_holder_lock);
	bdev->bd_partno = partno;
	bdev->bd_inode = inode;
	bdev->bd_queue = disk->queue;
	if (partno)
		bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio;
	else
		bdev->bd_has_submit_bio = false;
	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (!bdev->bd_stats) {
		iput(inode);
		return NULL;
	}
	bdev->bd_disk = disk;
	return bdev;
}

void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
	spin_lock(&bdev->bd_size_lock);
	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
	bdev->bd_nr_sectors = sectors;
	spin_unlock(&bdev->bd_size_lock);
}

void bdev_add(struct block_device *bdev, dev_t dev)
{
	bdev->bd_dev = dev;
	bdev->bd_inode->i_rdev = dev;
	bdev->bd_inode->i_ino = dev;
	insert_inode_hash(bdev->bd_inode);
}

long nr_blockdev_pages(void)
{
	struct inode *inode;
	long ret = 0;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
		ret += inode->i_mapping->nrpages;
	spin_unlock(&blockdev_superblock->s_inode_list_lock);

	return ret;
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	lockdep_assert_held(&bdev_lock);

	if (bdev->bd_holder) {
		/*
		 * The same holder can always re-claim.
		 */
		if (bdev->bd_holder == holder) {
			if (WARN_ON_ONCE(bdev->bd_holder_ops != hops))
				return false;
			return true;
		}
		return false;
	}

	/*
	 * If the whole device's holder is set to bd_may_claim, a partition on
	 * the device is claimed, but not the whole device.
	 */
	if (whole != bdev &&
	    whole->bd_holder && whole->bd_holder != bd_may_claim)
		return false;
	return true;
}

/**
 * bd_prepare_to_claim - claim a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops.
 *
 * Claim @bdev.  This function fails if @bdev is already claimed by another
 * holder and waits if another claiming is in progress. On successful return,
 * the caller has ownership of bd_claiming and bd_holder[s].
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	if (WARN_ON_ONCE(!holder))
		return -EINVAL;
retry:
	mutex_lock(&bdev_lock);
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, holder, hops)) {
		mutex_unlock(&bdev_lock);
		return -EBUSY;
	}

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		goto retry;
	}

	/* yay, all mine */
	whole->bd_claiming = holder;
	mutex_unlock(&bdev_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
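
/*
 * Illustrative usage sketch (not part of the original source): blocking
 * other exclusive openers temporarily without finishing an exclusive open,
 * the same pattern truncate_bdev_range() uses above. 'some_holder' is a
 * hypothetical holder cookie.
 *
 *	int err = bd_prepare_to_claim(bdev, some_holder, NULL);
 *
 *	if (err)
 *		return err;
 *	... work that must not race with exclusive openers ...
 *	bd_abort_claiming(bdev, some_holder);
 */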

static void bd_clear_claiming(struct block_device *whole, void *holder)
{
	lockdep_assert_held(&bdev_lock);
	/* tell others that we're done */
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_bit(&whole->bd_claiming, 0);
}

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 * @hops: block device holder operations
 *
 * Finish exclusive open of a block device. Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 */
static void bd_finish_claiming(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	mutex_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, holder, hops));
	/*
	 * Note that for a whole device bd_holders will be incremented twice,
	 * and bd_holder will be set to bd_may_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_may_claim;
	bdev->bd_holders++;
	mutex_lock(&bdev->bd_holder_lock);
	bdev->bd_holder = holder;
	bdev->bd_holder_ops = hops;
	mutex_unlock(&bdev->bd_holder_lock);
	bd_clear_claiming(whole, holder);
	mutex_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed. This can
 * also be used when exclusive open is not actually desired and we just needed
 * to block other exclusive openers for a while.
 */
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
	mutex_lock(&bdev_lock);
	bd_clear_claiming(bdev_whole(bdev), holder);
	mutex_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);

static void bd_end_claim(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev_whole(bdev);
	bool unblock = false;

	/*
	 * Release a claim on the device.  The holder fields are protected with
	 * bdev_lock.  open_mutex is used to synchronize disk_holder unlinking.
	 */
	mutex_lock(&bdev_lock);
	WARN_ON_ONCE(bdev->bd_holder != holder);
	WARN_ON_ONCE(--bdev->bd_holders < 0);
	WARN_ON_ONCE(--whole->bd_holders < 0);
	if (!bdev->bd_holders) {
		mutex_lock(&bdev->bd_holder_lock);
		bdev->bd_holder = NULL;
		bdev->bd_holder_ops = NULL;
		mutex_unlock(&bdev->bd_holder_lock);
		if (bdev->bd_write_holder)
			unblock = true;
	}
	if (!whole->bd_holders)
		whole->bd_holder = NULL;
	mutex_unlock(&bdev_lock);

	/*
	 * If this was the last claim, remove the holder link and unblock
	 * event polling if it was a write holder.
	 */
	if (unblock) {
		disk_unblock_events(bdev->bd_disk);
		bdev->bd_write_holder = false;
	}
}

static void blkdev_flush_mapping(struct block_device *bdev)
{
	WARN_ON_ONCE(bdev->bd_holders);
	sync_blockdev(bdev);
	kill_bdev(bdev);
	bdev_write_inode(bdev);
}

static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (disk->fops->open) {
		ret = disk->fops->open(disk, mode);
		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			    test_bit(GD_NEED_PART_SCAN, &disk->state))
				bdev_disk_changed(disk, true);
			return ret;
		}
	}

	if (!atomic_read(&bdev->bd_openers))
		set_init_blocksize(bdev);
	if (test_bit(GD_NEED_PART_SCAN, &disk->state))
		bdev_disk_changed(disk, false);
	atomic_inc(&bdev->bd_openers);
	return 0;
}

static void blkdev_put_whole(struct block_device *bdev)
{
	if (atomic_dec_and_test(&bdev->bd_openers))
		blkdev_flush_mapping(bdev);
	if (bdev->bd_disk->fops->release)
		bdev->bd_disk->fops->release(bdev->bd_disk);
}

static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
	struct gendisk *disk = part->bd_disk;
	int ret;

	ret = blkdev_get_whole(bdev_whole(part), mode);
	if (ret)
		return ret;

	ret = -ENXIO;
	if (!bdev_nr_sectors(part))
		goto out_blkdev_put;

	if (!atomic_read(&part->bd_openers)) {
		disk->open_partitions++;
		set_init_blocksize(part);
	}
	atomic_inc(&part->bd_openers);
	return 0;

out_blkdev_put:
	blkdev_put_whole(bdev_whole(part));
	return ret;
}

static void blkdev_put_part(struct block_device *part)
{
	struct block_device *whole = bdev_whole(part);

	if (atomic_dec_and_test(&part->bd_openers)) {
		blkdev_flush_mapping(part);
		whole->bd_disk->open_partitions--;
	}
	blkdev_put_whole(whole);
}

struct block_device *blkdev_get_no_open(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (inode)
			pr_warn_ratelimited(
"block device autoloading is deprecated and will be removed.\n");
	}
	if (!inode)
		return NULL;

	/* switch from the inode reference to a device model one: */
	bdev = &BDEV_I(inode)->bdev;
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);
	return bdev;
}

void blkdev_put_no_open(struct block_device *bdev)
{
	put_device(&bdev->bd_device);
}

/**
 * blkdev_get_by_dev - open a block device by device number
 * @dev: device number of block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 *
 * Open the block device described by device number @dev. If @holder is not
 * %NULL, the block device is opened with exclusive access. Exclusive opens may
 * nest for the same @holder.
 *
 * Use this interface ONLY if you really do not have anything better - i.e. when
 * you are behind a truly sucky interface and all you are given is a device
 * number. Everything else should use blkdev_get_by_path().
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Reference to the block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
		const struct blk_holder_ops *hops)
{
	bool unblock_events = true;
	struct block_device *bdev;
	struct gendisk *disk;
	int ret;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
			((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
	if (ret)
		return ERR_PTR(ret);

	bdev = blkdev_get_no_open(dev);
	if (!bdev)
		return ERR_PTR(-ENXIO);
	disk = bdev->bd_disk;

	if (holder) {
		mode |= BLK_OPEN_EXCL;
		ret = bd_prepare_to_claim(bdev, holder, hops);
		if (ret)
			goto put_blkdev;
	} else {
		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) {
			ret = -EIO;
			goto put_blkdev;
		}
	}

	disk_block_events(disk);

	mutex_lock(&disk->open_mutex);
	ret = -ENXIO;
	if (!disk_live(disk))
		goto abort_claiming;
	if (!try_module_get(disk->fops->owner))
		goto abort_claiming;
	if (bdev_is_partition(bdev))
		ret = blkdev_get_part(bdev, mode);
	else
		ret = blkdev_get_whole(bdev, mode);
	if (ret)
		goto put_module;
	if (holder) {
		bd_finish_claiming(bdev, holder, hops);

		/*
		 * Block event polling for write claims if requested. Any write
		 * holder makes the write_holder state stick until all are
		 * released. This is good enough and tracking individual
		 * writeable reference is too fragile given the way @mode is
		 * used in blkdev_get/put().
		 */
		if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder &&
		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
			bdev->bd_write_holder = true;
			unblock_events = false;
		}
	}
	mutex_unlock(&disk->open_mutex);

	if (unblock_events)
		disk_unblock_events(disk);
	return bdev;
put_module:
	module_put(disk->fops->owner);
abort_claiming:
	if (holder)
		bd_abort_claiming(bdev, holder);
	mutex_unlock(&disk->open_mutex);
	disk_unblock_events(disk);
put_blkdev:
	blkdev_put_no_open(bdev);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(blkdev_get_by_dev);
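
/*
 * Illustrative usage sketch (not part of the original source): an exclusive
 * read/write open by device number with the matching blkdev_put().
 * 'my_holder' is a hypothetical holder cookie and MKDEV(8, 0) an arbitrary
 * device number; the same cookie must be passed to blkdev_put().
 *
 *	struct block_device *bdev;
 *
 *	bdev = blkdev_get_by_dev(MKDEV(8, 0), BLK_OPEN_READ | BLK_OPEN_WRITE,
 *				 my_holder, NULL);
 *	if (IS_ERR(bdev))
 *		return PTR_ERR(bdev);
 *	... use the device ...
 *	blkdev_put(bdev, my_holder);
 */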

struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
				     const struct blk_holder_ops *hops)
{
	struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
	struct block_device *bdev;

	if (!handle)
		return ERR_PTR(-ENOMEM);
	bdev = blkdev_get_by_dev(dev, mode, holder, hops);
	if (IS_ERR(bdev)) {
		kfree(handle);
		return ERR_CAST(bdev);
	}
	handle->bdev = bdev;
	handle->holder = holder;
	if (holder)
		mode |= BLK_OPEN_EXCL;
	handle->mode = mode;
	return handle;
}
EXPORT_SYMBOL(bdev_open_by_dev);
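
/*
 * Illustrative usage sketch (not part of the original source): the
 * handle-based variant records holder and mode in the bdev_handle, so
 * release only needs the handle.
 *
 *	struct bdev_handle *handle = bdev_open_by_dev(dev, BLK_OPEN_READ,
 *						      NULL, NULL);
 *
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	... read from handle->bdev ...
 *	bdev_release(handle);
 */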

/**
 * blkdev_get_by_path - open a block device by name
 * @path: path to the block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 *
 * Open the block device described by the device file at @path. If @holder is
 * not %NULL, the block device is opened with exclusive access. Exclusive opens
 * may nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Reference to the block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
		void *holder, const struct blk_holder_ops *hops)
{
	struct block_device *bdev;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	bdev = blkdev_get_by_dev(dev, mode, holder, hops);
	if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
		blkdev_put(bdev, holder);
		return ERR_PTR(-EACCES);
	}

	return bdev;
}
EXPORT_SYMBOL(blkdev_get_by_path);

struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
		void *holder, const struct blk_holder_ops *hops)
{
	struct bdev_handle *handle;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	handle = bdev_open_by_dev(dev, mode, holder, hops);
	if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
	    bdev_read_only(handle->bdev)) {
		bdev_release(handle);
		return ERR_PTR(-EACCES);
	}

	return handle;
}
EXPORT_SYMBOL(bdev_open_by_path);
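
/*
 * Illustrative usage sketch (not part of the original source): opening a
 * device by path with exclusive access, as a filesystem might do for an
 * external journal. The path is arbitrary, 'sb' serves as the holder cookie
 * here, and the open fails with -EACCES if a write open is requested on a
 * read-only device.
 *
 *	struct bdev_handle *handle;
 *
 *	handle = bdev_open_by_path("/dev/sdb1",
 *				   BLK_OPEN_READ | BLK_OPEN_WRITE, sb, NULL);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	... use handle->bdev as the journal device ...
 *	bdev_release(handle);
 */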

void blkdev_put(struct block_device *bdev, void *holder)
{
	struct gendisk *disk = bdev->bd_disk;

	/*
	 * Sync early if it looks like we're the last one. If someone else
	 * opens the block device between now and the decrement of bd_openers
	 * then we did a sync that we didn't need to, but that's not the end
	 * of the world and we want to avoid long (could be several minutes)
	 * syncs while holding the mutex.
	 */
	if (atomic_read(&bdev->bd_openers) == 1)
		sync_blockdev(bdev);

	mutex_lock(&disk->open_mutex);
	if (holder)
		bd_end_claim(bdev, holder);

	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
	 * event. This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);

	if (bdev_is_partition(bdev))
		blkdev_put_part(bdev);
	else
		blkdev_put_whole(bdev);
	mutex_unlock(&disk->open_mutex);

	module_put(disk->fops->owner);
	blkdev_put_no_open(bdev);
}
EXPORT_SYMBOL(blkdev_put);

void bdev_release(struct bdev_handle *handle)
{
	blkdev_put(handle->bdev, handle->holder);
	kfree(handle);
}
EXPORT_SYMBOL(bdev_release);

/**
 * lookup_bdev() - Look up a struct block_device by name.
 * @pathname: Name of the block device in the filesystem.
 * @dev: Pointer to the block device's dev_t, if found.
 *
 * Lookup the block device's dev_t at @pathname in the current
 * namespace if possible and return it in @dev.
 *
 * Context: May sleep.
 * Return: 0 if succeeded, negative errno otherwise.
 */
int lookup_bdev(const char *pathname, dev_t *dev)
{
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return -EINVAL;

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return error;

	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto out_path_put;
	error = -EACCES;
	if (!may_open_dev(&path))
		goto out_path_put;

	*dev = inode->i_rdev;
	error = 0;
out_path_put:
	path_put(&path);
	return error;
}
EXPORT_SYMBOL(lookup_bdev);
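
/*
 * Illustrative usage sketch (not part of the original source): resolving a
 * user-supplied path to a device number without opening the device; fails
 * with -ENOTBLK if the path does not point to a block device node.
 *
 *	dev_t dev;
 *	int err = lookup_bdev("/dev/sda", &dev);
 *
 *	if (err)
 *		return err;
 *	pr_info("resolved to %u:%u\n", MAJOR(dev), MINOR(dev));
 */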

/**
 * bdev_mark_dead - mark a block device as dead
 * @bdev: block device to operate on
 * @surprise: indicate a surprise removal
 *
 * Tell the file system that this device or media is dead. If @surprise is set
 * to %true the device or media is already gone, if not we are preparing for an
 * orderly removal.
 *
 * This calls into the file system, which then typically syncs out all dirty
 * data and writes back inodes and then invalidates any cached data in the
 * inodes on the file system. In addition we also invalidate the block device
 * mapping.
 */
void bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
		bdev->bd_holder_ops->mark_dead(bdev, surprise);
	else {
		mutex_unlock(&bdev->bd_holder_lock);
		sync_blockdev(bdev);
	}

	invalidate_bdev(bdev);
}
/*
 * New drivers should not use this directly. There are some drivers however
 * that need this for historical reasons. For example, the DASD driver has
 * historically had a shutdown to offline mode that doesn't actually remove the
 * gendisk that otherwise looks a lot like a safe device removal.
 */
EXPORT_SYMBOL_GPL(bdev_mark_dead);

void sync_bdevs(bool wait)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * s_inode_list_lock. We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
		bdev = I_BDEV(inode);

		mutex_lock(&bdev->bd_disk->open_mutex);
		if (!atomic_read(&bdev->bd_openers)) {
			; /* skip */
		} else if (wait) {
			/*
			 * We keep the error status of individual mapping so
			 * that applications can catch the writeback error using
			 * fsync(2). See filemap_fdatawait_keep_errors() for
			 * details.
			 */
			filemap_fdatawait_keep_errors(inode->i_mapping);
		} else {
			filemap_fdatawrite(inode->i_mapping);
		}
		mutex_unlock(&bdev->bd_disk->open_mutex);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}

/*
 * Handle STATX_DIOALIGN for block devices.
 *
 * Note that the inode passed to this is the inode of a block device node file,
 * not the block device's internal inode. Therefore it is *not* valid to use
 * I_BDEV() here; the block device has to be looked up by i_rdev instead.
 */
void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
{
	struct block_device *bdev;

	bdev = blkdev_get_no_open(inode->i_rdev);
	if (!bdev)
		return;

	stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
	stat->dio_offset_align = bdev_logical_block_size(bdev);
	stat->result_mask |= STATX_DIOALIGN;

	blkdev_put_no_open(bdev);
}