1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Simple file system for zoned block devices exposing zones as files. |
4 | * |
5 | * Copyright (C) 2019 Western Digital Corporation or its affiliates. |
6 | */ |
7 | #include <linux/module.h> |
8 | #include <linux/pagemap.h> |
9 | #include <linux/magic.h> |
10 | #include <linux/iomap.h> |
11 | #include <linux/init.h> |
12 | #include <linux/slab.h> |
13 | #include <linux/blkdev.h> |
14 | #include <linux/statfs.h> |
15 | #include <linux/writeback.h> |
16 | #include <linux/quotaops.h> |
17 | #include <linux/seq_file.h> |
18 | #include <linux/uio.h> |
19 | #include <linux/mman.h> |
20 | #include <linux/sched/mm.h> |
21 | #include <linux/crc32.h> |
22 | #include <linux/task_io_accounting_ops.h> |
23 | #include <linux/fs_parser.h> |
24 | #include <linux/fs_context.h> |
25 | |
26 | #include "zonefs.h" |
27 | |
28 | #define CREATE_TRACE_POINTS |
29 | #include "trace.h" |
30 | |
31 | /* |
32 | * Get the name of a zone group directory. |
33 | */ |
34 | static const char *zonefs_zgroup_name(enum zonefs_ztype ztype) |
35 | { |
36 | switch (ztype) { |
37 | case ZONEFS_ZTYPE_CNV: |
38 | return "cnv" ; |
39 | case ZONEFS_ZTYPE_SEQ: |
40 | return "seq" ; |
41 | default: |
42 | WARN_ON_ONCE(1); |
43 | return "???" ; |
44 | } |
45 | } |
46 | |
47 | /* |
48 | * Manage the active zone count. |
49 | */ |
50 | static void zonefs_account_active(struct super_block *sb, |
51 | struct zonefs_zone *z) |
52 | { |
53 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
54 | |
55 | if (zonefs_zone_is_cnv(z)) |
56 | return; |
57 | |
58 | /* |
59 | * For zones that transitioned to the offline or readonly condition, |
60 | * we only need to clear the active state. |
61 | */ |
62 | if (z->z_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) |
63 | goto out; |
64 | |
65 | /* |
66 | * If the zone is active, that is, if it is explicitly open or |
67 | * partially written, check if it was already accounted as active. |
68 | */ |
69 | if ((z->z_flags & ZONEFS_ZONE_OPEN) || |
70 | (z->z_wpoffset > 0 && z->z_wpoffset < z->z_capacity)) { |
71 | if (!(z->z_flags & ZONEFS_ZONE_ACTIVE)) { |
72 | z->z_flags |= ZONEFS_ZONE_ACTIVE; |
73 | atomic_inc(v: &sbi->s_active_seq_files); |
74 | } |
75 | return; |
76 | } |
77 | |
78 | out: |
79 | /* The zone is not active. If it was, update the active count */ |
80 | if (z->z_flags & ZONEFS_ZONE_ACTIVE) { |
81 | z->z_flags &= ~ZONEFS_ZONE_ACTIVE; |
82 | atomic_dec(v: &sbi->s_active_seq_files); |
83 | } |
84 | } |
85 | |
86 | /* |
87 | * Manage the active zone count. Called with zi->i_truncate_mutex held. |
88 | */ |
89 | void zonefs_inode_account_active(struct inode *inode) |
90 | { |
91 | lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); |
92 | |
93 | return zonefs_account_active(sb: inode->i_sb, z: zonefs_inode_zone(inode)); |
94 | } |
95 | |
96 | /* |
97 | * Execute a zone management operation. |
98 | */ |
99 | static int zonefs_zone_mgmt(struct super_block *sb, |
100 | struct zonefs_zone *z, enum req_op op) |
101 | { |
102 | int ret; |
103 | |
104 | /* |
105 | * With ZNS drives, closing an explicitly open zone that has not been |
106 | * written will change the zone state to "closed", that is, the zone |
107 | * will remain active. Since this can then cause failure of explicit |
108 | * open operation on other zones if the drive active zone resources |
109 | * are exceeded, make sure that the zone does not remain active by |
110 | * resetting it. |
111 | */ |
112 | if (op == REQ_OP_ZONE_CLOSE && !z->z_wpoffset) |
113 | op = REQ_OP_ZONE_RESET; |
114 | |
115 | trace_zonefs_zone_mgmt(sb, z, op); |
116 | ret = blkdev_zone_mgmt(bdev: sb->s_bdev, op, sectors: z->z_sector, |
117 | nr_sectors: z->z_size >> SECTOR_SHIFT); |
118 | if (ret) { |
119 | zonefs_err(sb, |
120 | "Zone management operation %s at %llu failed %d\n" , |
121 | blk_op_str(op), z->z_sector, ret); |
122 | return ret; |
123 | } |
124 | |
125 | return 0; |
126 | } |
127 | |
128 | int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op) |
129 | { |
130 | lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); |
131 | |
132 | return zonefs_zone_mgmt(sb: inode->i_sb, z: zonefs_inode_zone(inode), op); |
133 | } |
134 | |
135 | void zonefs_i_size_write(struct inode *inode, loff_t isize) |
136 | { |
137 | struct zonefs_zone *z = zonefs_inode_zone(inode); |
138 | |
139 | i_size_write(inode, i_size: isize); |
140 | |
141 | /* |
142 | * A full zone is no longer open/active and does not need |
143 | * explicit closing. |
144 | */ |
145 | if (isize >= z->z_capacity) { |
146 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb: inode->i_sb); |
147 | |
148 | if (z->z_flags & ZONEFS_ZONE_ACTIVE) |
149 | atomic_dec(v: &sbi->s_active_seq_files); |
150 | z->z_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); |
151 | } |
152 | } |
153 | |
154 | void zonefs_update_stats(struct inode *inode, loff_t new_isize) |
155 | { |
156 | struct super_block *sb = inode->i_sb; |
157 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
158 | loff_t old_isize = i_size_read(inode); |
159 | loff_t nr_blocks; |
160 | |
161 | if (new_isize == old_isize) |
162 | return; |
163 | |
164 | spin_lock(lock: &sbi->s_lock); |
165 | |
166 | /* |
167 | * This may be called for an update after an IO error. |
168 | * So beware of the values seen. |
169 | */ |
170 | if (new_isize < old_isize) { |
171 | nr_blocks = (old_isize - new_isize) >> sb->s_blocksize_bits; |
172 | if (sbi->s_used_blocks > nr_blocks) |
173 | sbi->s_used_blocks -= nr_blocks; |
174 | else |
175 | sbi->s_used_blocks = 0; |
176 | } else { |
177 | sbi->s_used_blocks += |
178 | (new_isize - old_isize) >> sb->s_blocksize_bits; |
179 | if (sbi->s_used_blocks > sbi->s_blocks) |
180 | sbi->s_used_blocks = sbi->s_blocks; |
181 | } |
182 | |
183 | spin_unlock(lock: &sbi->s_lock); |
184 | } |
185 | |
186 | /* |
187 | * Check a zone condition. Return the amount of written (and still readable) |
188 | * data in the zone. |
189 | */ |
190 | static loff_t zonefs_check_zone_condition(struct super_block *sb, |
191 | struct zonefs_zone *z, |
192 | struct blk_zone *zone) |
193 | { |
194 | switch (zone->cond) { |
195 | case BLK_ZONE_COND_OFFLINE: |
196 | zonefs_warn(sb, "Zone %llu: offline zone\n" , |
197 | z->z_sector); |
198 | z->z_flags |= ZONEFS_ZONE_OFFLINE; |
199 | return 0; |
200 | case BLK_ZONE_COND_READONLY: |
201 | /* |
202 | * The write pointer of read-only zones is invalid, so we cannot |
203 | * determine the zone wpoffset (inode size). We thus keep the |
204 | * zone wpoffset as is, which leads to an empty file |
205 | * (wpoffset == 0) on mount. For a runtime error, this keeps |
206 | * the inode size as it was when last updated so that the user |
207 | * can recover data. |
208 | */ |
209 | zonefs_warn(sb, "Zone %llu: read-only zone\n" , |
210 | z->z_sector); |
211 | z->z_flags |= ZONEFS_ZONE_READONLY; |
212 | if (zonefs_zone_is_cnv(z)) |
213 | return z->z_capacity; |
214 | return z->z_wpoffset; |
215 | case BLK_ZONE_COND_FULL: |
216 | /* The write pointer of full zones is invalid. */ |
217 | return z->z_capacity; |
218 | default: |
219 | if (zonefs_zone_is_cnv(z)) |
220 | return z->z_capacity; |
221 | return (zone->wp - zone->start) << SECTOR_SHIFT; |
222 | } |
223 | } |
224 | |
225 | /* |
226 | * Check a zone condition and adjust its inode access permissions for |
227 | * offline and readonly zones. |
228 | */ |
229 | static void zonefs_inode_update_mode(struct inode *inode) |
230 | { |
231 | struct zonefs_zone *z = zonefs_inode_zone(inode); |
232 | |
233 | if (z->z_flags & ZONEFS_ZONE_OFFLINE) { |
234 | /* Offline zones cannot be read nor written */ |
235 | inode->i_flags |= S_IMMUTABLE; |
236 | inode->i_mode &= ~0777; |
237 | } else if (z->z_flags & ZONEFS_ZONE_READONLY) { |
238 | /* Readonly zones cannot be written */ |
239 | inode->i_flags |= S_IMMUTABLE; |
240 | if (z->z_flags & ZONEFS_ZONE_INIT_MODE) |
241 | inode->i_mode &= ~0777; |
242 | else |
243 | inode->i_mode &= ~0222; |
244 | } |
245 | |
246 | z->z_flags &= ~ZONEFS_ZONE_INIT_MODE; |
247 | z->z_mode = inode->i_mode; |
248 | } |
249 | |
250 | static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, |
251 | void *data) |
252 | { |
253 | struct blk_zone *z = data; |
254 | |
255 | *z = *zone; |
256 | return 0; |
257 | } |
258 | |
259 | static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, |
260 | bool write) |
261 | { |
262 | struct zonefs_zone *z = zonefs_inode_zone(inode); |
263 | struct super_block *sb = inode->i_sb; |
264 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
265 | loff_t isize, data_size; |
266 | |
267 | /* |
268 | * Check the zone condition: if the zone is not "bad" (offline or |
269 | * read-only), read errors are simply signaled to the IO issuer as long |
270 | * as there is no inconsistency between the inode size and the amount of |
271 | * data writen in the zone (data_size). |
272 | */ |
273 | data_size = zonefs_check_zone_condition(sb, z, zone); |
274 | isize = i_size_read(inode); |
275 | if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && |
276 | !write && isize == data_size) |
277 | return; |
278 | |
279 | /* |
280 | * At this point, we detected either a bad zone or an inconsistency |
281 | * between the inode size and the amount of data written in the zone. |
282 | * For the latter case, the cause may be a write IO error or an external |
283 | * action on the device. Two error patterns exist: |
284 | * 1) The inode size is lower than the amount of data in the zone: |
285 | * a write operation partially failed and data was writen at the end |
286 | * of the file. This can happen in the case of a large direct IO |
287 | * needing several BIOs and/or write requests to be processed. |
288 | * 2) The inode size is larger than the amount of data in the zone: |
289 | * this can happen with a deferred write error with the use of the |
290 | * device side write cache after getting successful write IO |
291 | * completions. Other possibilities are (a) an external corruption, |
292 | * e.g. an application reset the zone directly, or (b) the device |
293 | * has a serious problem (e.g. firmware bug). |
294 | * |
295 | * In all cases, warn about inode size inconsistency and handle the |
296 | * IO error according to the zone condition and to the mount options. |
297 | */ |
298 | if (isize != data_size) |
299 | zonefs_warn(sb, |
300 | "inode %lu: invalid size %lld (should be %lld)\n" , |
301 | inode->i_ino, isize, data_size); |
302 | |
303 | /* |
304 | * First handle bad zones signaled by hardware. The mount options |
305 | * errors=zone-ro and errors=zone-offline result in changing the |
306 | * zone condition to read-only and offline respectively, as if the |
307 | * condition was signaled by the hardware. |
308 | */ |
309 | if ((z->z_flags & ZONEFS_ZONE_OFFLINE) || |
310 | (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { |
311 | zonefs_warn(sb, "inode %lu: read/write access disabled\n" , |
312 | inode->i_ino); |
313 | if (!(z->z_flags & ZONEFS_ZONE_OFFLINE)) |
314 | z->z_flags |= ZONEFS_ZONE_OFFLINE; |
315 | zonefs_inode_update_mode(inode); |
316 | data_size = 0; |
317 | } else if ((z->z_flags & ZONEFS_ZONE_READONLY) || |
318 | (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { |
319 | zonefs_warn(sb, "inode %lu: write access disabled\n" , |
320 | inode->i_ino); |
321 | if (!(z->z_flags & ZONEFS_ZONE_READONLY)) |
322 | z->z_flags |= ZONEFS_ZONE_READONLY; |
323 | zonefs_inode_update_mode(inode); |
324 | data_size = isize; |
325 | } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && |
326 | data_size > isize) { |
327 | /* Do not expose garbage data */ |
328 | data_size = isize; |
329 | } |
330 | |
331 | /* |
332 | * If the filesystem is mounted with the explicit-open mount option, we |
333 | * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to |
334 | * the read-only or offline condition, to avoid attempting an explicit |
335 | * close of the zone when the inode file is closed. |
336 | */ |
337 | if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && |
338 | (z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) |
339 | z->z_flags &= ~ZONEFS_ZONE_OPEN; |
340 | |
341 | /* |
342 | * If error=remount-ro was specified, any error result in remounting |
343 | * the volume as read-only. |
344 | */ |
345 | if ((sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) && !sb_rdonly(sb)) { |
346 | zonefs_warn(sb, "remounting filesystem read-only\n" ); |
347 | sb->s_flags |= SB_RDONLY; |
348 | } |
349 | |
350 | /* |
351 | * Update block usage stats and the inode size to prevent access to |
352 | * invalid data. |
353 | */ |
354 | zonefs_update_stats(inode, new_isize: data_size); |
355 | zonefs_i_size_write(inode, isize: data_size); |
356 | z->z_wpoffset = data_size; |
357 | zonefs_inode_account_active(inode); |
358 | } |
359 | |
360 | /* |
361 | * When an file IO error occurs, check the file zone to see if there is a change |
362 | * in the zone condition (e.g. offline or read-only). For a failed write to a |
363 | * sequential zone, the zone write pointer position must also be checked to |
364 | * eventually correct the file size and zonefs inode write pointer offset |
365 | * (which can be out of sync with the drive due to partial write failures). |
366 | */ |
367 | void __zonefs_io_error(struct inode *inode, bool write) |
368 | { |
369 | struct zonefs_zone *z = zonefs_inode_zone(inode); |
370 | struct super_block *sb = inode->i_sb; |
371 | unsigned int noio_flag; |
372 | struct blk_zone zone; |
373 | int ret; |
374 | |
375 | /* |
376 | * Conventional zone have no write pointer and cannot become read-only |
377 | * or offline. So simply fake a report for a single or aggregated zone |
378 | * and let zonefs_handle_io_error() correct the zone inode information |
379 | * according to the mount options. |
380 | */ |
381 | if (!zonefs_zone_is_seq(z)) { |
382 | zone.start = z->z_sector; |
383 | zone.len = z->z_size >> SECTOR_SHIFT; |
384 | zone.wp = zone.start + zone.len; |
385 | zone.type = BLK_ZONE_TYPE_CONVENTIONAL; |
386 | zone.cond = BLK_ZONE_COND_NOT_WP; |
387 | zone.capacity = zone.len; |
388 | goto handle_io_error; |
389 | } |
390 | |
391 | /* |
392 | * Memory allocations in blkdev_report_zones() can trigger a memory |
393 | * reclaim which may in turn cause a recursion into zonefs as well as |
394 | * struct request allocations for the same device. The former case may |
395 | * end up in a deadlock on the inode truncate mutex, while the latter |
396 | * may prevent IO forward progress. Executing the report zones under |
397 | * the GFP_NOIO context avoids both problems. |
398 | */ |
399 | noio_flag = memalloc_noio_save(); |
400 | ret = blkdev_report_zones(bdev: sb->s_bdev, sector: z->z_sector, nr_zones: 1, |
401 | cb: zonefs_io_error_cb, data: &zone); |
402 | memalloc_noio_restore(flags: noio_flag); |
403 | |
404 | if (ret != 1) { |
405 | zonefs_err(sb, "Get inode %lu zone information failed %d\n" , |
406 | inode->i_ino, ret); |
407 | zonefs_warn(sb, "remounting filesystem read-only\n" ); |
408 | sb->s_flags |= SB_RDONLY; |
409 | return; |
410 | } |
411 | |
412 | handle_io_error: |
413 | zonefs_handle_io_error(inode, zone: &zone, write); |
414 | } |
415 | |
416 | static struct kmem_cache *zonefs_inode_cachep; |
417 | |
418 | static struct inode *zonefs_alloc_inode(struct super_block *sb) |
419 | { |
420 | struct zonefs_inode_info *zi; |
421 | |
422 | zi = alloc_inode_sb(sb, cache: zonefs_inode_cachep, GFP_KERNEL); |
423 | if (!zi) |
424 | return NULL; |
425 | |
426 | inode_init_once(&zi->i_vnode); |
427 | mutex_init(&zi->i_truncate_mutex); |
428 | zi->i_wr_refcnt = 0; |
429 | |
430 | return &zi->i_vnode; |
431 | } |
432 | |
433 | static void zonefs_free_inode(struct inode *inode) |
434 | { |
435 | kmem_cache_free(s: zonefs_inode_cachep, objp: ZONEFS_I(inode)); |
436 | } |
437 | |
438 | /* |
439 | * File system stat. |
440 | */ |
441 | static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) |
442 | { |
443 | struct super_block *sb = dentry->d_sb; |
444 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
445 | enum zonefs_ztype t; |
446 | |
447 | buf->f_type = ZONEFS_MAGIC; |
448 | buf->f_bsize = sb->s_blocksize; |
449 | buf->f_namelen = ZONEFS_NAME_MAX; |
450 | |
451 | spin_lock(lock: &sbi->s_lock); |
452 | |
453 | buf->f_blocks = sbi->s_blocks; |
454 | if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks)) |
455 | buf->f_bfree = 0; |
456 | else |
457 | buf->f_bfree = buf->f_blocks - sbi->s_used_blocks; |
458 | buf->f_bavail = buf->f_bfree; |
459 | |
460 | for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { |
461 | if (sbi->s_zgroup[t].g_nr_zones) |
462 | buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1; |
463 | } |
464 | buf->f_ffree = 0; |
465 | |
466 | spin_unlock(lock: &sbi->s_lock); |
467 | |
468 | buf->f_fsid = uuid_to_fsid(uuid: sbi->s_uuid.b); |
469 | |
470 | return 0; |
471 | } |
472 | |
/* Identifiers of the mount parameters described by zonefs_param_spec. */
enum {
	Opt_errors,
	Opt_explicit_open,
};
476 | |
/* Private data of the fs_context: options gathered by zonefs_parse_param(). */
struct zonefs_context {
	/* Bitmask of ZONEFS_MNTOPT_* mount options */
	unsigned long s_mount_opts;
};
480 | |
481 | static const struct constant_table zonefs_param_errors[] = { |
482 | {"remount-ro" , ZONEFS_MNTOPT_ERRORS_RO}, |
483 | {"zone-ro" , ZONEFS_MNTOPT_ERRORS_ZRO}, |
484 | {"zone-offline" , ZONEFS_MNTOPT_ERRORS_ZOL}, |
485 | {"repair" , ZONEFS_MNTOPT_ERRORS_REPAIR}, |
486 | {} |
487 | }; |
488 | |
489 | static const struct fs_parameter_spec zonefs_param_spec[] = { |
490 | fsparam_enum ("errors" , Opt_errors, zonefs_param_errors), |
491 | fsparam_flag ("explicit-open" , Opt_explicit_open), |
492 | {} |
493 | }; |
494 | |
495 | static int zonefs_parse_param(struct fs_context *fc, struct fs_parameter *param) |
496 | { |
497 | struct zonefs_context *ctx = fc->fs_private; |
498 | struct fs_parse_result result; |
499 | int opt; |
500 | |
501 | opt = fs_parse(fc, desc: zonefs_param_spec, param, result: &result); |
502 | if (opt < 0) |
503 | return opt; |
504 | |
505 | switch (opt) { |
506 | case Opt_errors: |
507 | ctx->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; |
508 | ctx->s_mount_opts |= result.uint_32; |
509 | break; |
510 | case Opt_explicit_open: |
511 | ctx->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; |
512 | break; |
513 | default: |
514 | return -EINVAL; |
515 | } |
516 | |
517 | return 0; |
518 | } |
519 | |
520 | static int zonefs_show_options(struct seq_file *seq, struct dentry *root) |
521 | { |
522 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb: root->d_sb); |
523 | |
524 | if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) |
525 | seq_puts(m: seq, s: ",errors=remount-ro" ); |
526 | if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) |
527 | seq_puts(m: seq, s: ",errors=zone-ro" ); |
528 | if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) |
529 | seq_puts(m: seq, s: ",errors=zone-offline" ); |
530 | if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR) |
531 | seq_puts(m: seq, s: ",errors=repair" ); |
532 | |
533 | return 0; |
534 | } |
535 | |
536 | static int zonefs_inode_setattr(struct mnt_idmap *idmap, |
537 | struct dentry *dentry, struct iattr *iattr) |
538 | { |
539 | struct inode *inode = d_inode(dentry); |
540 | int ret; |
541 | |
542 | if (unlikely(IS_IMMUTABLE(inode))) |
543 | return -EPERM; |
544 | |
545 | ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); |
546 | if (ret) |
547 | return ret; |
548 | |
549 | /* |
550 | * Since files and directories cannot be created nor deleted, do not |
551 | * allow setting any write attributes on the sub-directories grouping |
552 | * files by zone type. |
553 | */ |
554 | if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && |
555 | (iattr->ia_mode & 0222)) |
556 | return -EPERM; |
557 | |
558 | if (((iattr->ia_valid & ATTR_UID) && |
559 | !uid_eq(left: iattr->ia_uid, right: inode->i_uid)) || |
560 | ((iattr->ia_valid & ATTR_GID) && |
561 | !gid_eq(left: iattr->ia_gid, right: inode->i_gid))) { |
562 | ret = dquot_transfer(idmap: &nop_mnt_idmap, inode, iattr); |
563 | if (ret) |
564 | return ret; |
565 | } |
566 | |
567 | if (iattr->ia_valid & ATTR_SIZE) { |
568 | ret = zonefs_file_truncate(inode, isize: iattr->ia_size); |
569 | if (ret) |
570 | return ret; |
571 | } |
572 | |
573 | setattr_copy(&nop_mnt_idmap, inode, attr: iattr); |
574 | |
575 | if (S_ISREG(inode->i_mode)) { |
576 | struct zonefs_zone *z = zonefs_inode_zone(inode); |
577 | |
578 | z->z_mode = inode->i_mode; |
579 | z->z_uid = inode->i_uid; |
580 | z->z_gid = inode->i_gid; |
581 | } |
582 | |
583 | return 0; |
584 | } |
585 | |
586 | static const struct inode_operations zonefs_file_inode_operations = { |
587 | .setattr = zonefs_inode_setattr, |
588 | }; |
589 | |
590 | static long zonefs_fname_to_fno(const struct qstr *fname) |
591 | { |
592 | const char *name = fname->name; |
593 | unsigned int len = fname->len; |
594 | long fno = 0, shift = 1; |
595 | const char *rname; |
596 | char c = *name; |
597 | unsigned int i; |
598 | |
599 | /* |
600 | * File names are always a base-10 number string without any |
601 | * leading 0s. |
602 | */ |
603 | if (!isdigit(c)) |
604 | return -ENOENT; |
605 | |
606 | if (len > 1 && c == '0') |
607 | return -ENOENT; |
608 | |
609 | if (len == 1) |
610 | return c - '0'; |
611 | |
612 | for (i = 0, rname = name + len - 1; i < len; i++, rname--) { |
613 | c = *rname; |
614 | if (!isdigit(c)) |
615 | return -ENOENT; |
616 | fno += (c - '0') * shift; |
617 | shift *= 10; |
618 | } |
619 | |
620 | return fno; |
621 | } |
622 | |
623 | static struct inode *zonefs_get_file_inode(struct inode *dir, |
624 | struct dentry *dentry) |
625 | { |
626 | struct zonefs_zone_group *zgroup = dir->i_private; |
627 | struct super_block *sb = dir->i_sb; |
628 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
629 | struct zonefs_zone *z; |
630 | struct inode *inode; |
631 | ino_t ino; |
632 | long fno; |
633 | |
634 | /* Get the file number from the file name */ |
635 | fno = zonefs_fname_to_fno(fname: &dentry->d_name); |
636 | if (fno < 0) |
637 | return ERR_PTR(error: fno); |
638 | |
639 | if (!zgroup->g_nr_zones || fno >= zgroup->g_nr_zones) |
640 | return ERR_PTR(error: -ENOENT); |
641 | |
642 | z = &zgroup->g_zones[fno]; |
643 | ino = z->z_sector >> sbi->s_zone_sectors_shift; |
644 | inode = iget_locked(sb, ino); |
645 | if (!inode) |
646 | return ERR_PTR(error: -ENOMEM); |
647 | if (!(inode->i_state & I_NEW)) { |
648 | WARN_ON_ONCE(inode->i_private != z); |
649 | return inode; |
650 | } |
651 | |
652 | inode->i_ino = ino; |
653 | inode->i_mode = z->z_mode; |
654 | inode_set_mtime_to_ts(inode, |
655 | ts: inode_set_atime_to_ts(inode, ts: inode_set_ctime_to_ts(inode, ts: inode_get_ctime(inode: dir)))); |
656 | inode->i_uid = z->z_uid; |
657 | inode->i_gid = z->z_gid; |
658 | inode->i_size = z->z_wpoffset; |
659 | inode->i_blocks = z->z_capacity >> SECTOR_SHIFT; |
660 | inode->i_private = z; |
661 | |
662 | inode->i_op = &zonefs_file_inode_operations; |
663 | inode->i_fop = &zonefs_file_operations; |
664 | inode->i_mapping->a_ops = &zonefs_file_aops; |
665 | |
666 | /* Update the inode access rights depending on the zone condition */ |
667 | zonefs_inode_update_mode(inode); |
668 | |
669 | unlock_new_inode(inode); |
670 | |
671 | return inode; |
672 | } |
673 | |
674 | static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, |
675 | enum zonefs_ztype ztype) |
676 | { |
677 | struct inode *root = d_inode(dentry: sb->s_root); |
678 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
679 | struct inode *inode; |
680 | ino_t ino = bdev_nr_zones(bdev: sb->s_bdev) + ztype + 1; |
681 | |
682 | inode = iget_locked(sb, ino); |
683 | if (!inode) |
684 | return ERR_PTR(error: -ENOMEM); |
685 | if (!(inode->i_state & I_NEW)) |
686 | return inode; |
687 | |
688 | inode->i_ino = ino; |
689 | inode_init_owner(idmap: &nop_mnt_idmap, inode, dir: root, S_IFDIR | 0555); |
690 | inode->i_size = sbi->s_zgroup[ztype].g_nr_zones; |
691 | inode_set_mtime_to_ts(inode, |
692 | ts: inode_set_atime_to_ts(inode, ts: inode_set_ctime_to_ts(inode, ts: inode_get_ctime(inode: root)))); |
693 | inode->i_private = &sbi->s_zgroup[ztype]; |
694 | set_nlink(inode, nlink: 2); |
695 | |
696 | inode->i_op = &zonefs_dir_inode_operations; |
697 | inode->i_fop = &zonefs_dir_operations; |
698 | |
699 | unlock_new_inode(inode); |
700 | |
701 | return inode; |
702 | } |
703 | |
704 | |
705 | static struct inode *zonefs_get_dir_inode(struct inode *dir, |
706 | struct dentry *dentry) |
707 | { |
708 | struct super_block *sb = dir->i_sb; |
709 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
710 | const char *name = dentry->d_name.name; |
711 | enum zonefs_ztype ztype; |
712 | |
713 | /* |
714 | * We only need to check for the "seq" directory and |
715 | * the "cnv" directory if we have conventional zones. |
716 | */ |
717 | if (dentry->d_name.len != 3) |
718 | return ERR_PTR(error: -ENOENT); |
719 | |
720 | for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { |
721 | if (sbi->s_zgroup[ztype].g_nr_zones && |
722 | memcmp(p: name, q: zonefs_zgroup_name(ztype), size: 3) == 0) |
723 | break; |
724 | } |
725 | if (ztype == ZONEFS_ZTYPE_MAX) |
726 | return ERR_PTR(error: -ENOENT); |
727 | |
728 | return zonefs_get_zgroup_inode(sb, ztype); |
729 | } |
730 | |
731 | static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry, |
732 | unsigned int flags) |
733 | { |
734 | struct inode *inode; |
735 | |
736 | if (dentry->d_name.len > ZONEFS_NAME_MAX) |
737 | return ERR_PTR(error: -ENAMETOOLONG); |
738 | |
739 | if (dir == d_inode(dentry: dir->i_sb->s_root)) |
740 | inode = zonefs_get_dir_inode(dir, dentry); |
741 | else |
742 | inode = zonefs_get_file_inode(dir, dentry); |
743 | |
744 | return d_splice_alias(inode, dentry); |
745 | } |
746 | |
747 | static int zonefs_readdir_root(struct file *file, struct dir_context *ctx) |
748 | { |
749 | struct inode *inode = file_inode(f: file); |
750 | struct super_block *sb = inode->i_sb; |
751 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
752 | enum zonefs_ztype ztype = ZONEFS_ZTYPE_CNV; |
753 | ino_t base_ino = bdev_nr_zones(bdev: sb->s_bdev) + 1; |
754 | |
755 | if (ctx->pos >= inode->i_size) |
756 | return 0; |
757 | |
758 | if (!dir_emit_dots(file, ctx)) |
759 | return 0; |
760 | |
761 | if (ctx->pos == 2) { |
762 | if (!sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) |
763 | ztype = ZONEFS_ZTYPE_SEQ; |
764 | |
765 | if (!dir_emit(ctx, name: zonefs_zgroup_name(ztype), namelen: 3, |
766 | ino: base_ino + ztype, DT_DIR)) |
767 | return 0; |
768 | ctx->pos++; |
769 | } |
770 | |
771 | if (ctx->pos == 3 && ztype != ZONEFS_ZTYPE_SEQ) { |
772 | ztype = ZONEFS_ZTYPE_SEQ; |
773 | if (!dir_emit(ctx, name: zonefs_zgroup_name(ztype), namelen: 3, |
774 | ino: base_ino + ztype, DT_DIR)) |
775 | return 0; |
776 | ctx->pos++; |
777 | } |
778 | |
779 | return 0; |
780 | } |
781 | |
782 | static int zonefs_readdir_zgroup(struct file *file, |
783 | struct dir_context *ctx) |
784 | { |
785 | struct inode *inode = file_inode(f: file); |
786 | struct zonefs_zone_group *zgroup = inode->i_private; |
787 | struct super_block *sb = inode->i_sb; |
788 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
789 | struct zonefs_zone *z; |
790 | int fname_len; |
791 | char *fname; |
792 | ino_t ino; |
793 | int f; |
794 | |
795 | /* |
796 | * The size of zone group directories is equal to the number |
797 | * of zone files in the group and does note include the "." and |
798 | * ".." entries. Hence the "+ 2" here. |
799 | */ |
800 | if (ctx->pos >= inode->i_size + 2) |
801 | return 0; |
802 | |
803 | if (!dir_emit_dots(file, ctx)) |
804 | return 0; |
805 | |
806 | fname = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL); |
807 | if (!fname) |
808 | return -ENOMEM; |
809 | |
810 | for (f = ctx->pos - 2; f < zgroup->g_nr_zones; f++) { |
811 | z = &zgroup->g_zones[f]; |
812 | ino = z->z_sector >> sbi->s_zone_sectors_shift; |
813 | fname_len = snprintf(buf: fname, ZONEFS_NAME_MAX - 1, fmt: "%u" , f); |
814 | if (!dir_emit(ctx, name: fname, namelen: fname_len, ino, DT_REG)) |
815 | break; |
816 | ctx->pos++; |
817 | } |
818 | |
819 | kfree(objp: fname); |
820 | |
821 | return 0; |
822 | } |
823 | |
824 | static int zonefs_readdir(struct file *file, struct dir_context *ctx) |
825 | { |
826 | struct inode *inode = file_inode(f: file); |
827 | |
828 | if (inode == d_inode(dentry: inode->i_sb->s_root)) |
829 | return zonefs_readdir_root(file, ctx); |
830 | |
831 | return zonefs_readdir_zgroup(file, ctx); |
832 | } |
833 | |
834 | const struct inode_operations zonefs_dir_inode_operations = { |
835 | .lookup = zonefs_lookup, |
836 | .setattr = zonefs_inode_setattr, |
837 | }; |
838 | |
839 | const struct file_operations zonefs_dir_operations = { |
840 | .llseek = generic_file_llseek, |
841 | .read = generic_read_dir, |
842 | .iterate_shared = zonefs_readdir, |
843 | }; |
844 | |
845 | struct zonefs_zone_data { |
846 | struct super_block *sb; |
847 | unsigned int nr_zones[ZONEFS_ZTYPE_MAX]; |
848 | sector_t cnv_zone_start; |
849 | struct blk_zone *zones; |
850 | }; |
851 | |
852 | static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx, |
853 | void *data) |
854 | { |
855 | struct zonefs_zone_data *zd = data; |
856 | struct super_block *sb = zd->sb; |
857 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
858 | |
859 | /* |
860 | * We do not care about the first zone: it contains the super block |
861 | * and not exposed as a file. |
862 | */ |
863 | if (!idx) |
864 | return 0; |
865 | |
866 | /* |
867 | * Count the number of zones that will be exposed as files. |
868 | * For sequential zones, we always have as many files as zones. |
869 | * FOr conventional zones, the number of files depends on if we have |
870 | * conventional zones aggregation enabled. |
871 | */ |
872 | switch (zone->type) { |
873 | case BLK_ZONE_TYPE_CONVENTIONAL: |
874 | if (sbi->s_features & ZONEFS_F_AGGRCNV) { |
875 | /* One file per set of contiguous conventional zones */ |
876 | if (!(sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) || |
877 | zone->start != zd->cnv_zone_start) |
878 | sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; |
879 | zd->cnv_zone_start = zone->start + zone->len; |
880 | } else { |
881 | /* One file per zone */ |
882 | sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; |
883 | } |
884 | break; |
885 | case BLK_ZONE_TYPE_SEQWRITE_REQ: |
886 | case BLK_ZONE_TYPE_SEQWRITE_PREF: |
887 | sbi->s_zgroup[ZONEFS_ZTYPE_SEQ].g_nr_zones++; |
888 | break; |
889 | default: |
890 | zonefs_err(zd->sb, "Unsupported zone type 0x%x\n" , |
891 | zone->type); |
892 | return -EIO; |
893 | } |
894 | |
895 | memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone)); |
896 | |
897 | return 0; |
898 | } |
899 | |
900 | static int zonefs_get_zone_info(struct zonefs_zone_data *zd) |
901 | { |
902 | struct block_device *bdev = zd->sb->s_bdev; |
903 | int ret; |
904 | |
905 | zd->zones = kvcalloc(n: bdev_nr_zones(bdev), size: sizeof(struct blk_zone), |
906 | GFP_KERNEL); |
907 | if (!zd->zones) |
908 | return -ENOMEM; |
909 | |
910 | /* Get zones information from the device */ |
911 | ret = blkdev_report_zones(bdev, sector: 0, BLK_ALL_ZONES, |
912 | cb: zonefs_get_zone_info_cb, data: zd); |
913 | if (ret < 0) { |
914 | zonefs_err(zd->sb, "Zone report failed %d\n" , ret); |
915 | return ret; |
916 | } |
917 | |
918 | if (ret != bdev_nr_zones(bdev)) { |
919 | zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n" , |
920 | ret, bdev_nr_zones(bdev)); |
921 | return -EIO; |
922 | } |
923 | |
924 | return 0; |
925 | } |
926 | |
927 | static inline void zonefs_free_zone_info(struct zonefs_zone_data *zd) |
928 | { |
929 | kvfree(addr: zd->zones); |
930 | } |
931 | |
932 | /* |
933 | * Create a zone group and populate it with zone files. |
934 | */ |
935 | static int zonefs_init_zgroup(struct super_block *sb, |
936 | struct zonefs_zone_data *zd, |
937 | enum zonefs_ztype ztype) |
938 | { |
939 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
940 | struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype]; |
941 | struct blk_zone *zone, *next, *end; |
942 | struct zonefs_zone *z; |
943 | unsigned int n = 0; |
944 | int ret; |
945 | |
946 | /* Allocate the zone group. If it is empty, we have nothing to do. */ |
947 | if (!zgroup->g_nr_zones) |
948 | return 0; |
949 | |
950 | zgroup->g_zones = kvcalloc(n: zgroup->g_nr_zones, |
951 | size: sizeof(struct zonefs_zone), GFP_KERNEL); |
952 | if (!zgroup->g_zones) |
953 | return -ENOMEM; |
954 | |
955 | /* |
956 | * Initialize the zone groups using the device zone information. |
957 | * We always skip the first zone as it contains the super block |
958 | * and is not use to back a file. |
959 | */ |
960 | end = zd->zones + bdev_nr_zones(bdev: sb->s_bdev); |
961 | for (zone = &zd->zones[1]; zone < end; zone = next) { |
962 | |
963 | next = zone + 1; |
964 | if (zonefs_zone_type(zone) != ztype) |
965 | continue; |
966 | |
967 | if (WARN_ON_ONCE(n >= zgroup->g_nr_zones)) |
968 | return -EINVAL; |
969 | |
970 | /* |
971 | * For conventional zones, contiguous zones can be aggregated |
972 | * together to form larger files. Note that this overwrites the |
973 | * length of the first zone of the set of contiguous zones |
974 | * aggregated together. If one offline or read-only zone is |
975 | * found, assume that all zones aggregated have the same |
976 | * condition. |
977 | */ |
978 | if (ztype == ZONEFS_ZTYPE_CNV && |
979 | (sbi->s_features & ZONEFS_F_AGGRCNV)) { |
980 | for (; next < end; next++) { |
981 | if (zonefs_zone_type(zone: next) != ztype) |
982 | break; |
983 | zone->len += next->len; |
984 | zone->capacity += next->capacity; |
985 | if (next->cond == BLK_ZONE_COND_READONLY && |
986 | zone->cond != BLK_ZONE_COND_OFFLINE) |
987 | zone->cond = BLK_ZONE_COND_READONLY; |
988 | else if (next->cond == BLK_ZONE_COND_OFFLINE) |
989 | zone->cond = BLK_ZONE_COND_OFFLINE; |
990 | } |
991 | } |
992 | |
993 | z = &zgroup->g_zones[n]; |
994 | if (ztype == ZONEFS_ZTYPE_CNV) |
995 | z->z_flags |= ZONEFS_ZONE_CNV; |
996 | z->z_sector = zone->start; |
997 | z->z_size = zone->len << SECTOR_SHIFT; |
998 | if (z->z_size > bdev_zone_sectors(bdev: sb->s_bdev) << SECTOR_SHIFT && |
999 | !(sbi->s_features & ZONEFS_F_AGGRCNV)) { |
1000 | zonefs_err(sb, |
1001 | "Invalid zone size %llu (device zone sectors %llu)\n" , |
1002 | z->z_size, |
1003 | bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); |
1004 | return -EINVAL; |
1005 | } |
1006 | |
1007 | z->z_capacity = min_t(loff_t, MAX_LFS_FILESIZE, |
1008 | zone->capacity << SECTOR_SHIFT); |
1009 | z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone); |
1010 | |
1011 | z->z_mode = S_IFREG | sbi->s_perm; |
1012 | z->z_uid = sbi->s_uid; |
1013 | z->z_gid = sbi->s_gid; |
1014 | |
1015 | /* |
1016 | * Let zonefs_inode_update_mode() know that we will need |
1017 | * special initialization of the inode mode the first time |
1018 | * it is accessed. |
1019 | */ |
1020 | z->z_flags |= ZONEFS_ZONE_INIT_MODE; |
1021 | |
1022 | sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes); |
1023 | sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits; |
1024 | sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits; |
1025 | |
1026 | /* |
1027 | * For sequential zones, make sure that any open zone is closed |
1028 | * first to ensure that the initial number of open zones is 0, |
1029 | * in sync with the open zone accounting done when the mount |
1030 | * option ZONEFS_MNTOPT_EXPLICIT_OPEN is used. |
1031 | */ |
1032 | if (ztype == ZONEFS_ZTYPE_SEQ && |
1033 | (zone->cond == BLK_ZONE_COND_IMP_OPEN || |
1034 | zone->cond == BLK_ZONE_COND_EXP_OPEN)) { |
1035 | ret = zonefs_zone_mgmt(sb, z, op: REQ_OP_ZONE_CLOSE); |
1036 | if (ret) |
1037 | return ret; |
1038 | } |
1039 | |
1040 | zonefs_account_active(sb, z); |
1041 | |
1042 | n++; |
1043 | } |
1044 | |
1045 | if (WARN_ON_ONCE(n != zgroup->g_nr_zones)) |
1046 | return -EINVAL; |
1047 | |
1048 | zonefs_info(sb, "Zone group \"%s\" has %u file%s\n" , |
1049 | zonefs_zgroup_name(ztype), |
1050 | zgroup->g_nr_zones, |
1051 | str_plural(zgroup->g_nr_zones)); |
1052 | |
1053 | return 0; |
1054 | } |
1055 | |
1056 | static void zonefs_free_zgroups(struct super_block *sb) |
1057 | { |
1058 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
1059 | enum zonefs_ztype ztype; |
1060 | |
1061 | if (!sbi) |
1062 | return; |
1063 | |
1064 | for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { |
1065 | kvfree(addr: sbi->s_zgroup[ztype].g_zones); |
1066 | sbi->s_zgroup[ztype].g_zones = NULL; |
1067 | } |
1068 | } |
1069 | |
1070 | /* |
1071 | * Create a zone group and populate it with zone files. |
1072 | */ |
1073 | static int zonefs_init_zgroups(struct super_block *sb) |
1074 | { |
1075 | struct zonefs_zone_data zd; |
1076 | enum zonefs_ztype ztype; |
1077 | int ret; |
1078 | |
1079 | /* First get the device zone information */ |
1080 | memset(&zd, 0, sizeof(struct zonefs_zone_data)); |
1081 | zd.sb = sb; |
1082 | ret = zonefs_get_zone_info(zd: &zd); |
1083 | if (ret) |
1084 | goto cleanup; |
1085 | |
1086 | /* Allocate and initialize the zone groups */ |
1087 | for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { |
1088 | ret = zonefs_init_zgroup(sb, zd: &zd, ztype); |
1089 | if (ret) { |
1090 | zonefs_info(sb, |
1091 | "Zone group \"%s\" initialization failed\n" , |
1092 | zonefs_zgroup_name(ztype)); |
1093 | break; |
1094 | } |
1095 | } |
1096 | |
1097 | cleanup: |
1098 | zonefs_free_zone_info(zd: &zd); |
1099 | if (ret) |
1100 | zonefs_free_zgroups(sb); |
1101 | |
1102 | return ret; |
1103 | } |
1104 | |
1105 | /* |
1106 | * Read super block information from the device. |
1107 | */ |
1108 | static int zonefs_read_super(struct super_block *sb) |
1109 | { |
1110 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
1111 | struct zonefs_super *super; |
1112 | u32 crc, stored_crc; |
1113 | struct page *page; |
1114 | struct bio_vec bio_vec; |
1115 | struct bio bio; |
1116 | int ret; |
1117 | |
1118 | page = alloc_page(GFP_KERNEL); |
1119 | if (!page) |
1120 | return -ENOMEM; |
1121 | |
1122 | bio_init(bio: &bio, bdev: sb->s_bdev, table: &bio_vec, max_vecs: 1, opf: REQ_OP_READ); |
1123 | bio.bi_iter.bi_sector = 0; |
1124 | __bio_add_page(bio: &bio, page, PAGE_SIZE, off: 0); |
1125 | |
1126 | ret = submit_bio_wait(bio: &bio); |
1127 | if (ret) |
1128 | goto free_page; |
1129 | |
1130 | super = page_address(page); |
1131 | |
1132 | ret = -EINVAL; |
1133 | if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC) |
1134 | goto free_page; |
1135 | |
1136 | stored_crc = le32_to_cpu(super->s_crc); |
1137 | super->s_crc = 0; |
1138 | crc = crc32(~0U, (unsigned char *)super, sizeof(struct zonefs_super)); |
1139 | if (crc != stored_crc) { |
1140 | zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)" , |
1141 | crc, stored_crc); |
1142 | goto free_page; |
1143 | } |
1144 | |
1145 | sbi->s_features = le64_to_cpu(super->s_features); |
1146 | if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) { |
1147 | zonefs_err(sb, "Unknown features set 0x%llx\n" , |
1148 | sbi->s_features); |
1149 | goto free_page; |
1150 | } |
1151 | |
1152 | if (sbi->s_features & ZONEFS_F_UID) { |
1153 | sbi->s_uid = make_kuid(current_user_ns(), |
1154 | le32_to_cpu(super->s_uid)); |
1155 | if (!uid_valid(uid: sbi->s_uid)) { |
1156 | zonefs_err(sb, "Invalid UID feature\n" ); |
1157 | goto free_page; |
1158 | } |
1159 | } |
1160 | |
1161 | if (sbi->s_features & ZONEFS_F_GID) { |
1162 | sbi->s_gid = make_kgid(current_user_ns(), |
1163 | le32_to_cpu(super->s_gid)); |
1164 | if (!gid_valid(gid: sbi->s_gid)) { |
1165 | zonefs_err(sb, "Invalid GID feature\n" ); |
1166 | goto free_page; |
1167 | } |
1168 | } |
1169 | |
1170 | if (sbi->s_features & ZONEFS_F_PERM) |
1171 | sbi->s_perm = le32_to_cpu(super->s_perm); |
1172 | |
1173 | if (memchr_inv(p: super->s_reserved, c: 0, size: sizeof(super->s_reserved))) { |
1174 | zonefs_err(sb, "Reserved area is being used\n" ); |
1175 | goto free_page; |
1176 | } |
1177 | |
1178 | import_uuid(dst: &sbi->s_uuid, src: super->s_uuid); |
1179 | ret = 0; |
1180 | |
1181 | free_page: |
1182 | __free_page(page); |
1183 | |
1184 | return ret; |
1185 | } |
1186 | |
1187 | static const struct super_operations zonefs_sops = { |
1188 | .alloc_inode = zonefs_alloc_inode, |
1189 | .free_inode = zonefs_free_inode, |
1190 | .statfs = zonefs_statfs, |
1191 | .show_options = zonefs_show_options, |
1192 | }; |
1193 | |
1194 | static int zonefs_get_zgroup_inodes(struct super_block *sb) |
1195 | { |
1196 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
1197 | struct inode *dir_inode; |
1198 | enum zonefs_ztype ztype; |
1199 | |
1200 | for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { |
1201 | if (!sbi->s_zgroup[ztype].g_nr_zones) |
1202 | continue; |
1203 | |
1204 | dir_inode = zonefs_get_zgroup_inode(sb, ztype); |
1205 | if (IS_ERR(ptr: dir_inode)) |
1206 | return PTR_ERR(ptr: dir_inode); |
1207 | |
1208 | sbi->s_zgroup[ztype].g_inode = dir_inode; |
1209 | } |
1210 | |
1211 | return 0; |
1212 | } |
1213 | |
1214 | static void zonefs_release_zgroup_inodes(struct super_block *sb) |
1215 | { |
1216 | struct zonefs_sb_info *sbi = ZONEFS_SB(sb); |
1217 | enum zonefs_ztype ztype; |
1218 | |
1219 | if (!sbi) |
1220 | return; |
1221 | |
1222 | for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { |
1223 | if (sbi->s_zgroup[ztype].g_inode) { |
1224 | iput(sbi->s_zgroup[ztype].g_inode); |
1225 | sbi->s_zgroup[ztype].g_inode = NULL; |
1226 | } |
1227 | } |
1228 | } |
1229 | |
1230 | /* |
1231 | * Check that the device is zoned. If it is, get the list of zones and create |
1232 | * sub-directories and files according to the device zone configuration and |
1233 | * format options. |
1234 | */ |
1235 | static int zonefs_fill_super(struct super_block *sb, struct fs_context *fc) |
1236 | { |
1237 | struct zonefs_sb_info *sbi; |
1238 | struct zonefs_context *ctx = fc->fs_private; |
1239 | struct inode *inode; |
1240 | enum zonefs_ztype ztype; |
1241 | int ret; |
1242 | |
1243 | if (!bdev_is_zoned(bdev: sb->s_bdev)) { |
1244 | zonefs_err(sb, "Not a zoned block device\n" ); |
1245 | return -EINVAL; |
1246 | } |
1247 | |
1248 | /* |
1249 | * Initialize super block information: the maximum file size is updated |
1250 | * when the zone files are created so that the format option |
1251 | * ZONEFS_F_AGGRCNV which increases the maximum file size of a file |
1252 | * beyond the zone size is taken into account. |
1253 | */ |
1254 | sbi = kzalloc(size: sizeof(*sbi), GFP_KERNEL); |
1255 | if (!sbi) |
1256 | return -ENOMEM; |
1257 | |
1258 | spin_lock_init(&sbi->s_lock); |
1259 | sb->s_fs_info = sbi; |
1260 | sb->s_magic = ZONEFS_MAGIC; |
1261 | sb->s_maxbytes = 0; |
1262 | sb->s_op = &zonefs_sops; |
1263 | sb->s_time_gran = 1; |
1264 | |
1265 | /* |
1266 | * The block size is set to the device zone write granularity to ensure |
1267 | * that write operations are always aligned according to the device |
1268 | * interface constraints. |
1269 | */ |
1270 | sb_set_blocksize(sb, bdev_zone_write_granularity(bdev: sb->s_bdev)); |
1271 | sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev)); |
1272 | sbi->s_uid = GLOBAL_ROOT_UID; |
1273 | sbi->s_gid = GLOBAL_ROOT_GID; |
1274 | sbi->s_perm = 0640; |
1275 | sbi->s_mount_opts = ctx->s_mount_opts; |
1276 | |
1277 | atomic_set(&sbi->s_wro_seq_files, 0); |
1278 | sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev); |
1279 | atomic_set(&sbi->s_active_seq_files, 0); |
1280 | sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev); |
1281 | |
1282 | ret = zonefs_read_super(sb); |
1283 | if (ret) |
1284 | return ret; |
1285 | |
1286 | zonefs_info(sb, "Mounting %u zones" , bdev_nr_zones(sb->s_bdev)); |
1287 | |
1288 | if (!sbi->s_max_wro_seq_files && |
1289 | !sbi->s_max_active_seq_files && |
1290 | sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { |
1291 | zonefs_info(sb, |
1292 | "No open and active zone limits. Ignoring explicit_open mount option\n" ); |
1293 | sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; |
1294 | } |
1295 | |
1296 | /* Initialize the zone groups */ |
1297 | ret = zonefs_init_zgroups(sb); |
1298 | if (ret) |
1299 | goto cleanup; |
1300 | |
1301 | /* Create the root directory inode */ |
1302 | ret = -ENOMEM; |
1303 | inode = new_inode(sb); |
1304 | if (!inode) |
1305 | goto cleanup; |
1306 | |
1307 | inode->i_ino = bdev_nr_zones(sb->s_bdev); |
1308 | inode->i_mode = S_IFDIR | 0555; |
1309 | simple_inode_init_ts(inode); |
1310 | inode->i_op = &zonefs_dir_inode_operations; |
1311 | inode->i_fop = &zonefs_dir_operations; |
1312 | inode->i_size = 2; |
1313 | set_nlink(inode, 2); |
1314 | for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { |
1315 | if (sbi->s_zgroup[ztype].g_nr_zones) { |
1316 | inc_nlink(inode); |
1317 | inode->i_size++; |
1318 | } |
1319 | } |
1320 | |
1321 | sb->s_root = d_make_root(inode); |
1322 | if (!sb->s_root) |
1323 | goto cleanup; |
1324 | |
1325 | /* |
1326 | * Take a reference on the zone groups directory inodes |
1327 | * to keep them in the inode cache. |
1328 | */ |
1329 | ret = zonefs_get_zgroup_inodes(sb); |
1330 | if (ret) |
1331 | goto cleanup; |
1332 | |
1333 | ret = zonefs_sysfs_register(sb); |
1334 | if (ret) |
1335 | goto cleanup; |
1336 | |
1337 | return 0; |
1338 | |
1339 | cleanup: |
1340 | zonefs_release_zgroup_inodes(sb); |
1341 | zonefs_free_zgroups(sb); |
1342 | |
1343 | return ret; |
1344 | } |
1345 | |
/*
 * Tear down a zonefs super block: drop the zone group directory inode
 * references, kill the block device super block, then unregister the sysfs
 * entries and free the zone groups and the zonefs private information.
 *
 * sbi is sampled before kill_block_super() and may be NULL; the helpers
 * called here check for that and kfree(NULL) is a no-op.
 */
static void zonefs_kill_super(struct super_block *sb)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);

	/* Release the reference on the zone group directory inodes */
	zonefs_release_zgroup_inodes(sb);

	kill_block_super(sb);

	zonefs_sysfs_unregister(sb);
	zonefs_free_zgroups(sb);
	kfree(sbi);
}
1359 | |
1360 | static void zonefs_free_fc(struct fs_context *fc) |
1361 | { |
1362 | struct zonefs_context *ctx = fc->fs_private; |
1363 | |
1364 | kfree(objp: ctx); |
1365 | } |
1366 | |
1367 | static int zonefs_get_tree(struct fs_context *fc) |
1368 | { |
1369 | return get_tree_bdev(fc, fill_super: zonefs_fill_super); |
1370 | } |
1371 | |
1372 | static int zonefs_reconfigure(struct fs_context *fc) |
1373 | { |
1374 | struct zonefs_context *ctx = fc->fs_private; |
1375 | struct super_block *sb = fc->root->d_sb; |
1376 | struct zonefs_sb_info *sbi = sb->s_fs_info; |
1377 | |
1378 | sync_filesystem(fc->root->d_sb); |
1379 | /* Copy new options from ctx into sbi. */ |
1380 | sbi->s_mount_opts = ctx->s_mount_opts; |
1381 | |
1382 | return 0; |
1383 | } |
1384 | |
1385 | static const struct fs_context_operations zonefs_context_ops = { |
1386 | .parse_param = zonefs_parse_param, |
1387 | .get_tree = zonefs_get_tree, |
1388 | .reconfigure = zonefs_reconfigure, |
1389 | .free = zonefs_free_fc, |
1390 | }; |
1391 | |
1392 | /* |
1393 | * Set up the filesystem mount context. |
1394 | */ |
1395 | static int zonefs_init_fs_context(struct fs_context *fc) |
1396 | { |
1397 | struct zonefs_context *ctx; |
1398 | |
1399 | ctx = kzalloc(size: sizeof(struct zonefs_context), GFP_KERNEL); |
1400 | if (!ctx) |
1401 | return -ENOMEM; |
1402 | ctx->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; |
1403 | fc->ops = &zonefs_context_ops; |
1404 | fc->fs_private = ctx; |
1405 | |
1406 | return 0; |
1407 | } |
1408 | |
1409 | /* |
1410 | * File system definition and registration. |
1411 | */ |
1412 | static struct file_system_type zonefs_type = { |
1413 | .owner = THIS_MODULE, |
1414 | .name = "zonefs" , |
1415 | .kill_sb = zonefs_kill_super, |
1416 | .fs_flags = FS_REQUIRES_DEV, |
1417 | .init_fs_context = zonefs_init_fs_context, |
1418 | .parameters = zonefs_param_spec, |
1419 | }; |
1420 | |
1421 | static int __init zonefs_init_inodecache(void) |
1422 | { |
1423 | zonefs_inode_cachep = kmem_cache_create(name: "zonefs_inode_cache" , |
1424 | size: sizeof(struct zonefs_inode_info), align: 0, |
1425 | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, |
1426 | NULL); |
1427 | if (zonefs_inode_cachep == NULL) |
1428 | return -ENOMEM; |
1429 | return 0; |
1430 | } |
1431 | |
1432 | static void zonefs_destroy_inodecache(void) |
1433 | { |
1434 | /* |
1435 | * Make sure all delayed rcu free inodes are flushed before we |
1436 | * destroy the inode cache. |
1437 | */ |
1438 | rcu_barrier(); |
1439 | kmem_cache_destroy(s: zonefs_inode_cachep); |
1440 | } |
1441 | |
1442 | static int __init zonefs_init(void) |
1443 | { |
1444 | int ret; |
1445 | |
1446 | BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE); |
1447 | |
1448 | ret = zonefs_init_inodecache(); |
1449 | if (ret) |
1450 | return ret; |
1451 | |
1452 | ret = zonefs_sysfs_init(); |
1453 | if (ret) |
1454 | goto destroy_inodecache; |
1455 | |
1456 | ret = register_filesystem(&zonefs_type); |
1457 | if (ret) |
1458 | goto sysfs_exit; |
1459 | |
1460 | return 0; |
1461 | |
1462 | sysfs_exit: |
1463 | zonefs_sysfs_exit(); |
1464 | destroy_inodecache: |
1465 | zonefs_destroy_inodecache(); |
1466 | |
1467 | return ret; |
1468 | } |
1469 | |
1470 | static void __exit zonefs_exit(void) |
1471 | { |
1472 | unregister_filesystem(&zonefs_type); |
1473 | zonefs_sysfs_exit(); |
1474 | zonefs_destroy_inodecache(); |
1475 | } |
1476 | |
1477 | MODULE_AUTHOR("Damien Le Moal" ); |
1478 | MODULE_DESCRIPTION("Zone file system for zoned block devices" ); |
1479 | MODULE_LICENSE("GPL" ); |
1480 | MODULE_ALIAS_FS("zonefs" ); |
1481 | module_init(zonefs_init); |
1482 | module_exit(zonefs_exit); |
1483 | |