// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/block_validity.c
 *
 * Copyright (C) 2009
 * Theodore Ts'o (tytso@mit.edu)
 *
 * Track which blocks in the filesystem are metadata blocks that
 * should never be used as data blocks by files or directories.
 */
11 | |
12 | #include <linux/time.h> |
13 | #include <linux/fs.h> |
14 | #include <linux/namei.h> |
15 | #include <linux/quotaops.h> |
16 | #include <linux/buffer_head.h> |
17 | #include <linux/swap.h> |
18 | #include <linux/pagemap.h> |
19 | #include <linux/blkdev.h> |
20 | #include <linux/slab.h> |
21 | #include "ext4.h" |
22 | |
23 | struct ext4_system_zone { |
24 | struct rb_node node; |
25 | ext4_fsblk_t start_blk; |
26 | unsigned int count; |
27 | u32 ino; |
28 | }; |
29 | |
/* Slab cache from which every struct ext4_system_zone is allocated. */
static struct kmem_cache *ext4_system_zone_cachep;
31 | |
32 | int __init ext4_init_system_zone(void) |
33 | { |
34 | ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); |
35 | if (ext4_system_zone_cachep == NULL) |
36 | return -ENOMEM; |
37 | return 0; |
38 | } |
39 | |
40 | void ext4_exit_system_zone(void) |
41 | { |
42 | rcu_barrier(); |
43 | kmem_cache_destroy(s: ext4_system_zone_cachep); |
44 | } |
45 | |
46 | static inline int can_merge(struct ext4_system_zone *entry1, |
47 | struct ext4_system_zone *entry2) |
48 | { |
49 | if ((entry1->start_blk + entry1->count) == entry2->start_blk && |
50 | entry1->ino == entry2->ino) |
51 | return 1; |
52 | return 0; |
53 | } |
54 | |
55 | static void release_system_zone(struct ext4_system_blocks *system_blks) |
56 | { |
57 | struct ext4_system_zone *entry, *n; |
58 | |
59 | rbtree_postorder_for_each_entry_safe(entry, n, |
60 | &system_blks->root, node) |
61 | kmem_cache_free(s: ext4_system_zone_cachep, objp: entry); |
62 | } |
63 | |
64 | /* |
65 | * Mark a range of blocks as belonging to the "system zone" --- that |
66 | * is, filesystem metadata blocks which should never be used by |
67 | * inodes. |
68 | */ |
69 | static int add_system_zone(struct ext4_system_blocks *system_blks, |
70 | ext4_fsblk_t start_blk, |
71 | unsigned int count, u32 ino) |
72 | { |
73 | struct ext4_system_zone *new_entry, *entry; |
74 | struct rb_node **n = &system_blks->root.rb_node, *node; |
75 | struct rb_node *parent = NULL, *new_node = NULL; |
76 | |
77 | while (*n) { |
78 | parent = *n; |
79 | entry = rb_entry(parent, struct ext4_system_zone, node); |
80 | if (start_blk < entry->start_blk) |
81 | n = &(*n)->rb_left; |
82 | else if (start_blk >= (entry->start_blk + entry->count)) |
83 | n = &(*n)->rb_right; |
84 | else /* Unexpected overlap of system zones. */ |
85 | return -EFSCORRUPTED; |
86 | } |
87 | |
88 | new_entry = kmem_cache_alloc(cachep: ext4_system_zone_cachep, |
89 | GFP_KERNEL); |
90 | if (!new_entry) |
91 | return -ENOMEM; |
92 | new_entry->start_blk = start_blk; |
93 | new_entry->count = count; |
94 | new_entry->ino = ino; |
95 | new_node = &new_entry->node; |
96 | |
97 | rb_link_node(node: new_node, parent, rb_link: n); |
98 | rb_insert_color(new_node, &system_blks->root); |
99 | |
100 | /* Can we merge to the left? */ |
101 | node = rb_prev(new_node); |
102 | if (node) { |
103 | entry = rb_entry(node, struct ext4_system_zone, node); |
104 | if (can_merge(entry1: entry, entry2: new_entry)) { |
105 | new_entry->start_blk = entry->start_blk; |
106 | new_entry->count += entry->count; |
107 | rb_erase(node, &system_blks->root); |
108 | kmem_cache_free(s: ext4_system_zone_cachep, objp: entry); |
109 | } |
110 | } |
111 | |
112 | /* Can we merge to the right? */ |
113 | node = rb_next(new_node); |
114 | if (node) { |
115 | entry = rb_entry(node, struct ext4_system_zone, node); |
116 | if (can_merge(entry1: new_entry, entry2: entry)) { |
117 | new_entry->count += entry->count; |
118 | rb_erase(node, &system_blks->root); |
119 | kmem_cache_free(s: ext4_system_zone_cachep, objp: entry); |
120 | } |
121 | } |
122 | return 0; |
123 | } |
124 | |
125 | static void debug_print_tree(struct ext4_sb_info *sbi) |
126 | { |
127 | struct rb_node *node; |
128 | struct ext4_system_zone *entry; |
129 | struct ext4_system_blocks *system_blks; |
130 | int first = 1; |
131 | |
132 | printk(KERN_INFO "System zones: " ); |
133 | rcu_read_lock(); |
134 | system_blks = rcu_dereference(sbi->s_system_blks); |
135 | node = rb_first(&system_blks->root); |
136 | while (node) { |
137 | entry = rb_entry(node, struct ext4_system_zone, node); |
138 | printk(KERN_CONT "%s%llu-%llu" , first ? "" : ", " , |
139 | entry->start_blk, entry->start_blk + entry->count - 1); |
140 | first = 0; |
141 | node = rb_next(node); |
142 | } |
143 | rcu_read_unlock(); |
144 | printk(KERN_CONT "\n" ); |
145 | } |
146 | |
147 | static int ext4_protect_reserved_inode(struct super_block *sb, |
148 | struct ext4_system_blocks *system_blks, |
149 | u32 ino) |
150 | { |
151 | struct inode *inode; |
152 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
153 | struct ext4_map_blocks map; |
154 | u32 i = 0, num; |
155 | int err = 0, n; |
156 | |
157 | if ((ino < EXT4_ROOT_INO) || |
158 | (ino > le32_to_cpu(sbi->s_es->s_inodes_count))) |
159 | return -EINVAL; |
160 | inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL); |
161 | if (IS_ERR(ptr: inode)) |
162 | return PTR_ERR(ptr: inode); |
163 | num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; |
164 | while (i < num) { |
165 | cond_resched(); |
166 | map.m_lblk = i; |
167 | map.m_len = num - i; |
168 | n = ext4_map_blocks(NULL, inode, map: &map, flags: 0); |
169 | if (n < 0) { |
170 | err = n; |
171 | break; |
172 | } |
173 | if (n == 0) { |
174 | i++; |
175 | } else { |
176 | err = add_system_zone(system_blks, start_blk: map.m_pblk, count: n, ino); |
177 | if (err < 0) { |
178 | if (err == -EFSCORRUPTED) { |
179 | EXT4_ERROR_INODE_ERR(inode, -err, |
180 | "blocks %llu-%llu from inode overlap system zone" , |
181 | map.m_pblk, |
182 | map.m_pblk + map.m_len - 1); |
183 | } |
184 | break; |
185 | } |
186 | i += n; |
187 | } |
188 | } |
189 | iput(inode); |
190 | return err; |
191 | } |
192 | |
193 | static void ext4_destroy_system_zone(struct rcu_head *rcu) |
194 | { |
195 | struct ext4_system_blocks *system_blks; |
196 | |
197 | system_blks = container_of(rcu, struct ext4_system_blocks, rcu); |
198 | release_system_zone(system_blks); |
199 | kfree(objp: system_blks); |
200 | } |
201 | |
202 | /* |
203 | * Build system zone rbtree which is used for block validity checking. |
204 | * |
205 | * The update of system_blks pointer in this function is protected by |
206 | * sb->s_umount semaphore. However we have to be careful as we can be |
207 | * racing with ext4_inode_block_valid() calls reading system_blks rbtree |
208 | * protected only by RCU. That's why we first build the rbtree and then |
209 | * swap it in place. |
210 | */ |
211 | int ext4_setup_system_zone(struct super_block *sb) |
212 | { |
213 | ext4_group_t ngroups = ext4_get_groups_count(sb); |
214 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
215 | struct ext4_system_blocks *system_blks; |
216 | struct ext4_group_desc *gdp; |
217 | ext4_group_t i; |
218 | int ret; |
219 | |
220 | system_blks = kzalloc(size: sizeof(*system_blks), GFP_KERNEL); |
221 | if (!system_blks) |
222 | return -ENOMEM; |
223 | |
224 | for (i=0; i < ngroups; i++) { |
225 | unsigned int meta_blks = ext4_num_base_meta_blocks(sb, block_group: i); |
226 | |
227 | cond_resched(); |
228 | if (meta_blks != 0) { |
229 | ret = add_system_zone(system_blks, |
230 | start_blk: ext4_group_first_block_no(sb, group_no: i), |
231 | count: meta_blks, ino: 0); |
232 | if (ret) |
233 | goto err; |
234 | } |
235 | gdp = ext4_get_group_desc(sb, block_group: i, NULL); |
236 | ret = add_system_zone(system_blks, |
237 | start_blk: ext4_block_bitmap(sb, bg: gdp), count: 1, ino: 0); |
238 | if (ret) |
239 | goto err; |
240 | ret = add_system_zone(system_blks, |
241 | start_blk: ext4_inode_bitmap(sb, bg: gdp), count: 1, ino: 0); |
242 | if (ret) |
243 | goto err; |
244 | ret = add_system_zone(system_blks, |
245 | start_blk: ext4_inode_table(sb, bg: gdp), |
246 | count: sbi->s_itb_per_group, ino: 0); |
247 | if (ret) |
248 | goto err; |
249 | } |
250 | if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { |
251 | ret = ext4_protect_reserved_inode(sb, system_blks, |
252 | le32_to_cpu(sbi->s_es->s_journal_inum)); |
253 | if (ret) |
254 | goto err; |
255 | } |
256 | |
257 | /* |
258 | * System blks rbtree complete, announce it once to prevent racing |
259 | * with ext4_inode_block_valid() accessing the rbtree at the same |
260 | * time. |
261 | */ |
262 | rcu_assign_pointer(sbi->s_system_blks, system_blks); |
263 | |
264 | if (test_opt(sb, DEBUG)) |
265 | debug_print_tree(sbi); |
266 | return 0; |
267 | err: |
268 | release_system_zone(system_blks); |
269 | kfree(objp: system_blks); |
270 | return ret; |
271 | } |
272 | |
273 | /* |
274 | * Called when the filesystem is unmounted or when remounting it with |
275 | * noblock_validity specified. |
276 | * |
277 | * The update of system_blks pointer in this function is protected by |
278 | * sb->s_umount semaphore. However we have to be careful as we can be |
279 | * racing with ext4_inode_block_valid() calls reading system_blks rbtree |
280 | * protected only by RCU. So we first clear the system_blks pointer and |
281 | * then free the rbtree only after RCU grace period expires. |
282 | */ |
283 | void ext4_release_system_zone(struct super_block *sb) |
284 | { |
285 | struct ext4_system_blocks *system_blks; |
286 | |
287 | system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks, |
288 | lockdep_is_held(&sb->s_umount)); |
289 | rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL); |
290 | |
291 | if (system_blks) |
292 | call_rcu(head: &system_blks->rcu, func: ext4_destroy_system_zone); |
293 | } |
294 | |
295 | int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, |
296 | ext4_fsblk_t start_blk, unsigned int count) |
297 | { |
298 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
299 | struct ext4_system_blocks *system_blks; |
300 | struct ext4_system_zone *entry; |
301 | struct rb_node *n; |
302 | int ret = 1; |
303 | |
304 | if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || |
305 | (start_blk + count < start_blk) || |
306 | (start_blk + count > ext4_blocks_count(es: sbi->s_es))) |
307 | return 0; |
308 | |
309 | /* |
310 | * Lock the system zone to prevent it being released concurrently |
311 | * when doing a remount which inverse current "[no]block_validity" |
312 | * mount option. |
313 | */ |
314 | rcu_read_lock(); |
315 | system_blks = rcu_dereference(sbi->s_system_blks); |
316 | if (system_blks == NULL) |
317 | goto out_rcu; |
318 | |
319 | n = system_blks->root.rb_node; |
320 | while (n) { |
321 | entry = rb_entry(n, struct ext4_system_zone, node); |
322 | if (start_blk + count - 1 < entry->start_blk) |
323 | n = n->rb_left; |
324 | else if (start_blk >= (entry->start_blk + entry->count)) |
325 | n = n->rb_right; |
326 | else { |
327 | ret = 0; |
328 | if (inode) |
329 | ret = (entry->ino == inode->i_ino); |
330 | break; |
331 | } |
332 | } |
333 | out_rcu: |
334 | rcu_read_unlock(); |
335 | return ret; |
336 | } |
337 | |
338 | /* |
339 | * Returns 1 if the passed-in block region (start_blk, |
340 | * start_blk+count) is valid; 0 if some part of the block region |
341 | * overlaps with some other filesystem metadata blocks. |
342 | */ |
343 | int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, |
344 | unsigned int count) |
345 | { |
346 | return ext4_sb_block_valid(sb: inode->i_sb, inode, start_blk, count); |
347 | } |
348 | |
349 | int ext4_check_blockref(const char *function, unsigned int line, |
350 | struct inode *inode, __le32 *p, unsigned int max) |
351 | { |
352 | __le32 *bref = p; |
353 | unsigned int blk; |
354 | |
355 | if (ext4_has_feature_journal(sb: inode->i_sb) && |
356 | (inode->i_ino == |
357 | le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) |
358 | return 0; |
359 | |
360 | while (bref < p+max) { |
361 | blk = le32_to_cpu(*bref++); |
362 | if (blk && |
363 | unlikely(!ext4_inode_block_valid(inode, blk, 1))) { |
364 | ext4_error_inode(inode, function, line, blk, |
365 | "invalid block" ); |
366 | return -EFSCORRUPTED; |
367 | } |
368 | } |
369 | return 0; |
370 | } |
371 | |
372 | |