1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * dlmfs.c |
4 | * |
5 | * Code which implements the kernel side of a minimal userspace |
6 | * interface to our DLM. This file handles the virtual file system |
7 | * used for communication with userspace. Credit should go to ramfs, |
8 | * which was a template for the fs side of this module. |
9 | * |
10 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. |
11 | */ |
12 | |
13 | /* Simple VFS hooks based on: */ |
14 | /* |
15 | * Resizable simple ram filesystem for Linux. |
16 | * |
17 | * Copyright (C) 2000 Linus Torvalds. |
18 | * 2000 Transmeta Corp. |
19 | */ |
20 | |
21 | #include <linux/module.h> |
22 | #include <linux/fs.h> |
23 | #include <linux/pagemap.h> |
24 | #include <linux/types.h> |
25 | #include <linux/slab.h> |
26 | #include <linux/highmem.h> |
27 | #include <linux/init.h> |
28 | #include <linux/string.h> |
29 | #include <linux/backing-dev.h> |
30 | #include <linux/poll.h> |
31 | |
32 | #include <linux/uaccess.h> |
33 | |
34 | #include "../stackglue.h" |
35 | #include "userdlm.h" |
36 | |
37 | #define MLOG_MASK_PREFIX ML_DLMFS |
38 | #include "../cluster/masklog.h" |
39 | |
40 | |
41 | static const struct super_operations dlmfs_ops; |
42 | static const struct file_operations dlmfs_file_operations; |
43 | static const struct inode_operations dlmfs_dir_inode_operations; |
44 | static const struct inode_operations dlmfs_root_inode_operations; |
45 | static const struct inode_operations dlmfs_file_inode_operations; |
46 | static struct kmem_cache *dlmfs_inode_cache; |
47 | |
48 | struct workqueue_struct *user_dlm_worker; |
49 | |
50 | |
51 | |
52 | /* |
53 | * These are the ABI capabilities of dlmfs. |
54 | * |
55 | * Over time, dlmfs has added some features that were not part of the |
56 | * initial ABI. Unfortunately, some of these features are not detectable |
57 | * via standard usage. For example, Linux's default poll always returns |
58 | * EPOLLIN, so there is no way for a caller of poll(2) to know when dlmfs |
59 | * added poll support. Instead, we provide this list of new capabilities. |
60 | * |
61 | * Capabilities is a read-only attribute. We do it as a module parameter |
62 | * so we can discover it whether dlmfs is built in, loaded, or even not |
63 | * loaded. |
64 | * |
65 | * The ABI features are local to this machine's dlmfs mount. This is |
66 | * distinct from the locking protocol, which is concerned with inter-node |
67 | * interaction. |
68 | * |
69 | * Capabilities: |
70 | * - bast : EPOLLIN against the file descriptor of a held lock |
71 | * signifies a bast fired on the lock. |
72 | */ |
/* Capability list reported by the read-only "capabilities" parameter. */
#define DLMFS_CAPABILITIES	"bast stackglue"
74 | static int param_set_dlmfs_capabilities(const char *val, |
75 | const struct kernel_param *kp) |
76 | { |
77 | printk(KERN_ERR "%s: readonly parameter\n" , kp->name); |
78 | return -EINVAL; |
79 | } |
80 | static int param_get_dlmfs_capabilities(char *buffer, |
81 | const struct kernel_param *kp) |
82 | { |
83 | return sysfs_emit(buf: buffer, DLMFS_CAPABILITIES); |
84 | } |
85 | module_param_call(capabilities, param_set_dlmfs_capabilities, |
86 | param_get_dlmfs_capabilities, NULL, 0444); |
87 | MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES); |
88 | |
89 | |
90 | /* |
91 | * decodes a set of open flags into a valid lock level and a set of flags. |
92 | * returns < 0 if we have invalid flags |
93 | * flags which mean something to us: |
94 | * O_RDONLY -> PRMODE level |
95 | * O_WRONLY -> EXMODE level |
96 | * |
97 | * O_NONBLOCK -> NOQUEUE |
98 | */ |
99 | static int dlmfs_decode_open_flags(int open_flags, |
100 | int *level, |
101 | int *flags) |
102 | { |
103 | if (open_flags & (O_WRONLY|O_RDWR)) |
104 | *level = DLM_LOCK_EX; |
105 | else |
106 | *level = DLM_LOCK_PR; |
107 | |
108 | *flags = 0; |
109 | if (open_flags & O_NONBLOCK) |
110 | *flags |= DLM_LKF_NOQUEUE; |
111 | |
112 | return 0; |
113 | } |
114 | |
115 | static int dlmfs_file_open(struct inode *inode, |
116 | struct file *file) |
117 | { |
118 | int status, level, flags; |
119 | struct dlmfs_filp_private *fp = NULL; |
120 | struct dlmfs_inode_private *ip; |
121 | |
122 | if (S_ISDIR(inode->i_mode)) |
123 | BUG(); |
124 | |
125 | mlog(0, "open called on inode %lu, flags 0x%x\n" , inode->i_ino, |
126 | file->f_flags); |
127 | |
128 | status = dlmfs_decode_open_flags(open_flags: file->f_flags, level: &level, flags: &flags); |
129 | if (status < 0) |
130 | goto bail; |
131 | |
132 | /* We don't want to honor O_APPEND at read/write time as it |
133 | * doesn't make sense for LVB writes. */ |
134 | file->f_flags &= ~O_APPEND; |
135 | |
136 | fp = kmalloc(size: sizeof(*fp), GFP_NOFS); |
137 | if (!fp) { |
138 | status = -ENOMEM; |
139 | goto bail; |
140 | } |
141 | fp->fp_lock_level = level; |
142 | |
143 | ip = DLMFS_I(inode); |
144 | |
145 | status = user_dlm_cluster_lock(lockres: &ip->ip_lockres, level, lkm_flags: flags); |
146 | if (status < 0) { |
147 | /* this is a strange error to return here but I want |
148 | * to be able userspace to be able to distinguish a |
149 | * valid lock request from one that simply couldn't be |
150 | * granted. */ |
151 | if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN) |
152 | status = -ETXTBSY; |
153 | kfree(objp: fp); |
154 | goto bail; |
155 | } |
156 | |
157 | file->private_data = fp; |
158 | bail: |
159 | return status; |
160 | } |
161 | |
162 | static int dlmfs_file_release(struct inode *inode, |
163 | struct file *file) |
164 | { |
165 | int level; |
166 | struct dlmfs_inode_private *ip = DLMFS_I(inode); |
167 | struct dlmfs_filp_private *fp = file->private_data; |
168 | |
169 | if (S_ISDIR(inode->i_mode)) |
170 | BUG(); |
171 | |
172 | mlog(0, "close called on inode %lu\n" , inode->i_ino); |
173 | |
174 | if (fp) { |
175 | level = fp->fp_lock_level; |
176 | if (level != DLM_LOCK_IV) |
177 | user_dlm_cluster_unlock(lockres: &ip->ip_lockres, level); |
178 | |
179 | kfree(objp: fp); |
180 | file->private_data = NULL; |
181 | } |
182 | |
183 | return 0; |
184 | } |
185 | |
186 | /* |
187 | * We do ->setattr() just to override size changes. Our size is the size |
188 | * of the LVB and nothing else. |
189 | */ |
190 | static int dlmfs_file_setattr(struct mnt_idmap *idmap, |
191 | struct dentry *dentry, struct iattr *attr) |
192 | { |
193 | int error; |
194 | struct inode *inode = d_inode(dentry); |
195 | |
196 | attr->ia_valid &= ~ATTR_SIZE; |
197 | error = setattr_prepare(&nop_mnt_idmap, dentry, attr); |
198 | if (error) |
199 | return error; |
200 | |
201 | setattr_copy(&nop_mnt_idmap, inode, attr); |
202 | mark_inode_dirty(inode); |
203 | return 0; |
204 | } |
205 | |
206 | static __poll_t dlmfs_file_poll(struct file *file, poll_table *wait) |
207 | { |
208 | __poll_t event = 0; |
209 | struct inode *inode = file_inode(f: file); |
210 | struct dlmfs_inode_private *ip = DLMFS_I(inode); |
211 | |
212 | poll_wait(filp: file, wait_address: &ip->ip_lockres.l_event, p: wait); |
213 | |
214 | spin_lock(lock: &ip->ip_lockres.l_lock); |
215 | if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED) |
216 | event = EPOLLIN | EPOLLRDNORM; |
217 | spin_unlock(lock: &ip->ip_lockres.l_lock); |
218 | |
219 | return event; |
220 | } |
221 | |
222 | static ssize_t dlmfs_file_read(struct file *file, |
223 | char __user *buf, |
224 | size_t count, |
225 | loff_t *ppos) |
226 | { |
227 | char lvb[DLM_LVB_LEN]; |
228 | |
229 | if (!user_dlm_read_lvb(inode: file_inode(f: file), val: lvb)) |
230 | return 0; |
231 | |
232 | return simple_read_from_buffer(to: buf, count, ppos, from: lvb, available: sizeof(lvb)); |
233 | } |
234 | |
235 | static ssize_t dlmfs_file_write(struct file *filp, |
236 | const char __user *buf, |
237 | size_t count, |
238 | loff_t *ppos) |
239 | { |
240 | char lvb_buf[DLM_LVB_LEN]; |
241 | int bytes_left; |
242 | struct inode *inode = file_inode(f: filp); |
243 | |
244 | mlog(0, "inode %lu, count = %zu, *ppos = %llu\n" , |
245 | inode->i_ino, count, *ppos); |
246 | |
247 | if (*ppos >= DLM_LVB_LEN) |
248 | return -ENOSPC; |
249 | |
250 | /* don't write past the lvb */ |
251 | if (count > DLM_LVB_LEN - *ppos) |
252 | count = DLM_LVB_LEN - *ppos; |
253 | |
254 | if (!count) |
255 | return 0; |
256 | |
257 | bytes_left = copy_from_user(to: lvb_buf, from: buf, n: count); |
258 | count -= bytes_left; |
259 | if (count) |
260 | user_dlm_write_lvb(inode, val: lvb_buf, len: count); |
261 | |
262 | *ppos = *ppos + count; |
263 | mlog(0, "wrote %zu bytes\n" , count); |
264 | return count; |
265 | } |
266 | |
267 | static void dlmfs_init_once(void *foo) |
268 | { |
269 | struct dlmfs_inode_private *ip = |
270 | (struct dlmfs_inode_private *) foo; |
271 | |
272 | ip->ip_conn = NULL; |
273 | ip->ip_parent = NULL; |
274 | |
275 | inode_init_once(&ip->ip_vfs_inode); |
276 | } |
277 | |
278 | static struct inode *dlmfs_alloc_inode(struct super_block *sb) |
279 | { |
280 | struct dlmfs_inode_private *ip; |
281 | |
282 | ip = alloc_inode_sb(sb, cache: dlmfs_inode_cache, GFP_NOFS); |
283 | if (!ip) |
284 | return NULL; |
285 | |
286 | return &ip->ip_vfs_inode; |
287 | } |
288 | |
289 | static void dlmfs_free_inode(struct inode *inode) |
290 | { |
291 | kmem_cache_free(s: dlmfs_inode_cache, objp: DLMFS_I(inode)); |
292 | } |
293 | |
294 | static void dlmfs_evict_inode(struct inode *inode) |
295 | { |
296 | int status; |
297 | struct dlmfs_inode_private *ip; |
298 | struct user_lock_res *lockres; |
299 | int teardown; |
300 | |
301 | clear_inode(inode); |
302 | |
303 | mlog(0, "inode %lu\n" , inode->i_ino); |
304 | |
305 | ip = DLMFS_I(inode); |
306 | lockres = &ip->ip_lockres; |
307 | |
308 | if (S_ISREG(inode->i_mode)) { |
309 | spin_lock(lock: &lockres->l_lock); |
310 | teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN); |
311 | spin_unlock(lock: &lockres->l_lock); |
312 | if (!teardown) { |
313 | status = user_dlm_destroy_lock(lockres); |
314 | if (status < 0) |
315 | mlog_errno(status); |
316 | } |
317 | iput(ip->ip_parent); |
318 | goto clear_fields; |
319 | } |
320 | |
321 | mlog(0, "we're a directory, ip->ip_conn = 0x%p\n" , ip->ip_conn); |
322 | /* we must be a directory. If required, lets unregister the |
323 | * dlm context now. */ |
324 | if (ip->ip_conn) |
325 | user_dlm_unregister(conn: ip->ip_conn); |
326 | clear_fields: |
327 | ip->ip_parent = NULL; |
328 | ip->ip_conn = NULL; |
329 | } |
330 | |
331 | static struct inode *dlmfs_get_root_inode(struct super_block *sb) |
332 | { |
333 | struct inode *inode = new_inode(sb); |
334 | umode_t mode = S_IFDIR | 0755; |
335 | |
336 | if (inode) { |
337 | inode->i_ino = get_next_ino(); |
338 | inode_init_owner(idmap: &nop_mnt_idmap, inode, NULL, mode); |
339 | simple_inode_init_ts(inode); |
340 | inc_nlink(inode); |
341 | |
342 | inode->i_fop = &simple_dir_operations; |
343 | inode->i_op = &dlmfs_root_inode_operations; |
344 | } |
345 | |
346 | return inode; |
347 | } |
348 | |
349 | static struct inode *dlmfs_get_inode(struct inode *parent, |
350 | struct dentry *dentry, |
351 | umode_t mode) |
352 | { |
353 | struct super_block *sb = parent->i_sb; |
354 | struct inode * inode = new_inode(sb); |
355 | struct dlmfs_inode_private *ip; |
356 | |
357 | if (!inode) |
358 | return NULL; |
359 | |
360 | inode->i_ino = get_next_ino(); |
361 | inode_init_owner(idmap: &nop_mnt_idmap, inode, dir: parent, mode); |
362 | simple_inode_init_ts(inode); |
363 | |
364 | ip = DLMFS_I(inode); |
365 | ip->ip_conn = DLMFS_I(inode: parent)->ip_conn; |
366 | |
367 | switch (mode & S_IFMT) { |
368 | default: |
369 | /* for now we don't support anything other than |
370 | * directories and regular files. */ |
371 | BUG(); |
372 | break; |
373 | case S_IFREG: |
374 | inode->i_op = &dlmfs_file_inode_operations; |
375 | inode->i_fop = &dlmfs_file_operations; |
376 | |
377 | i_size_write(inode, DLM_LVB_LEN); |
378 | |
379 | user_dlm_lock_res_init(lockres: &ip->ip_lockres, dentry); |
380 | |
381 | /* released at clear_inode time, this insures that we |
382 | * get to drop the dlm reference on each lock *before* |
383 | * we call the unregister code for releasing parent |
384 | * directories. */ |
385 | ip->ip_parent = igrab(parent); |
386 | BUG_ON(!ip->ip_parent); |
387 | break; |
388 | case S_IFDIR: |
389 | inode->i_op = &dlmfs_dir_inode_operations; |
390 | inode->i_fop = &simple_dir_operations; |
391 | |
392 | /* directory inodes start off with i_nlink == |
393 | * 2 (for "." entry) */ |
394 | inc_nlink(inode); |
395 | break; |
396 | } |
397 | return inode; |
398 | } |
399 | |
400 | /* |
401 | * File creation. Allocate an inode, and we're done.. |
402 | */ |
403 | /* SMP-safe */ |
404 | static int dlmfs_mkdir(struct mnt_idmap * idmap, |
405 | struct inode * dir, |
406 | struct dentry * dentry, |
407 | umode_t mode) |
408 | { |
409 | int status; |
410 | struct inode *inode = NULL; |
411 | const struct qstr *domain = &dentry->d_name; |
412 | struct dlmfs_inode_private *ip; |
413 | struct ocfs2_cluster_connection *conn; |
414 | |
415 | mlog(0, "mkdir %.*s\n" , domain->len, domain->name); |
416 | |
417 | /* verify that we have a proper domain */ |
418 | if (domain->len >= GROUP_NAME_MAX) { |
419 | status = -EINVAL; |
420 | mlog(ML_ERROR, "invalid domain name for directory.\n" ); |
421 | goto bail; |
422 | } |
423 | |
424 | inode = dlmfs_get_inode(parent: dir, dentry, mode: mode | S_IFDIR); |
425 | if (!inode) { |
426 | status = -ENOMEM; |
427 | mlog_errno(status); |
428 | goto bail; |
429 | } |
430 | |
431 | ip = DLMFS_I(inode); |
432 | |
433 | conn = user_dlm_register(name: domain); |
434 | if (IS_ERR(ptr: conn)) { |
435 | status = PTR_ERR(ptr: conn); |
436 | mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n" , |
437 | status, domain->len, domain->name); |
438 | goto bail; |
439 | } |
440 | ip->ip_conn = conn; |
441 | |
442 | inc_nlink(inode: dir); |
443 | d_instantiate(dentry, inode); |
444 | dget(dentry); /* Extra count - pin the dentry in core */ |
445 | |
446 | status = 0; |
447 | bail: |
448 | if (status < 0) |
449 | iput(inode); |
450 | return status; |
451 | } |
452 | |
453 | static int dlmfs_create(struct mnt_idmap *idmap, |
454 | struct inode *dir, |
455 | struct dentry *dentry, |
456 | umode_t mode, |
457 | bool excl) |
458 | { |
459 | int status = 0; |
460 | struct inode *inode; |
461 | const struct qstr *name = &dentry->d_name; |
462 | |
463 | mlog(0, "create %.*s\n" , name->len, name->name); |
464 | |
465 | /* verify name is valid and doesn't contain any dlm reserved |
466 | * characters */ |
467 | if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || |
468 | name->name[0] == '$') { |
469 | status = -EINVAL; |
470 | mlog(ML_ERROR, "invalid lock name, %.*s\n" , name->len, |
471 | name->name); |
472 | goto bail; |
473 | } |
474 | |
475 | inode = dlmfs_get_inode(parent: dir, dentry, mode: mode | S_IFREG); |
476 | if (!inode) { |
477 | status = -ENOMEM; |
478 | mlog_errno(status); |
479 | goto bail; |
480 | } |
481 | |
482 | d_instantiate(dentry, inode); |
483 | dget(dentry); /* Extra count - pin the dentry in core */ |
484 | bail: |
485 | return status; |
486 | } |
487 | |
488 | static int dlmfs_unlink(struct inode *dir, |
489 | struct dentry *dentry) |
490 | { |
491 | int status; |
492 | struct inode *inode = d_inode(dentry); |
493 | |
494 | mlog(0, "unlink inode %lu\n" , inode->i_ino); |
495 | |
496 | /* if there are no current holders, or none that are waiting |
497 | * to acquire a lock, this basically destroys our lockres. */ |
498 | status = user_dlm_destroy_lock(lockres: &DLMFS_I(inode)->ip_lockres); |
499 | if (status < 0) { |
500 | mlog(ML_ERROR, "unlink %pd, error %d from destroy\n" , |
501 | dentry, status); |
502 | goto bail; |
503 | } |
504 | status = simple_unlink(dir, dentry); |
505 | bail: |
506 | return status; |
507 | } |
508 | |
509 | static int dlmfs_fill_super(struct super_block * sb, |
510 | void * data, |
511 | int silent) |
512 | { |
513 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
514 | sb->s_blocksize = PAGE_SIZE; |
515 | sb->s_blocksize_bits = PAGE_SHIFT; |
516 | sb->s_magic = DLMFS_MAGIC; |
517 | sb->s_op = &dlmfs_ops; |
518 | sb->s_root = d_make_root(dlmfs_get_root_inode(sb)); |
519 | if (!sb->s_root) |
520 | return -ENOMEM; |
521 | return 0; |
522 | } |
523 | |
524 | static const struct file_operations dlmfs_file_operations = { |
525 | .open = dlmfs_file_open, |
526 | .release = dlmfs_file_release, |
527 | .poll = dlmfs_file_poll, |
528 | .read = dlmfs_file_read, |
529 | .write = dlmfs_file_write, |
530 | .llseek = default_llseek, |
531 | }; |
532 | |
533 | static const struct inode_operations dlmfs_dir_inode_operations = { |
534 | .create = dlmfs_create, |
535 | .lookup = simple_lookup, |
536 | .unlink = dlmfs_unlink, |
537 | }; |
538 | |
539 | /* this way we can restrict mkdir to only the toplevel of the fs. */ |
540 | static const struct inode_operations dlmfs_root_inode_operations = { |
541 | .lookup = simple_lookup, |
542 | .mkdir = dlmfs_mkdir, |
543 | .rmdir = simple_rmdir, |
544 | }; |
545 | |
546 | static const struct super_operations dlmfs_ops = { |
547 | .statfs = simple_statfs, |
548 | .alloc_inode = dlmfs_alloc_inode, |
549 | .free_inode = dlmfs_free_inode, |
550 | .evict_inode = dlmfs_evict_inode, |
551 | .drop_inode = generic_delete_inode, |
552 | }; |
553 | |
554 | static const struct inode_operations dlmfs_file_inode_operations = { |
555 | .getattr = simple_getattr, |
556 | .setattr = dlmfs_file_setattr, |
557 | }; |
558 | |
559 | static struct dentry *dlmfs_mount(struct file_system_type *fs_type, |
560 | int flags, const char *dev_name, void *data) |
561 | { |
562 | return mount_nodev(fs_type, flags, data, fill_super: dlmfs_fill_super); |
563 | } |
564 | |
565 | static struct file_system_type dlmfs_fs_type = { |
566 | .owner = THIS_MODULE, |
567 | .name = "ocfs2_dlmfs" , |
568 | .mount = dlmfs_mount, |
569 | .kill_sb = kill_litter_super, |
570 | }; |
571 | MODULE_ALIAS_FS("ocfs2_dlmfs" ); |
572 | |
573 | static int __init init_dlmfs_fs(void) |
574 | { |
575 | int status; |
576 | int cleanup_inode = 0, cleanup_worker = 0; |
577 | |
578 | dlmfs_inode_cache = kmem_cache_create(name: "dlmfs_inode_cache" , |
579 | size: sizeof(struct dlmfs_inode_private), |
580 | align: 0, flags: (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| |
581 | SLAB_ACCOUNT), |
582 | ctor: dlmfs_init_once); |
583 | if (!dlmfs_inode_cache) { |
584 | status = -ENOMEM; |
585 | goto bail; |
586 | } |
587 | cleanup_inode = 1; |
588 | |
589 | user_dlm_worker = alloc_workqueue(fmt: "user_dlm" , flags: WQ_MEM_RECLAIM, max_active: 0); |
590 | if (!user_dlm_worker) { |
591 | status = -ENOMEM; |
592 | goto bail; |
593 | } |
594 | cleanup_worker = 1; |
595 | |
596 | user_dlm_set_locking_protocol(); |
597 | status = register_filesystem(&dlmfs_fs_type); |
598 | bail: |
599 | if (status) { |
600 | if (cleanup_inode) |
601 | kmem_cache_destroy(s: dlmfs_inode_cache); |
602 | if (cleanup_worker) |
603 | destroy_workqueue(wq: user_dlm_worker); |
604 | } else |
605 | printk("OCFS2 User DLM kernel interface loaded\n" ); |
606 | return status; |
607 | } |
608 | |
609 | static void __exit exit_dlmfs_fs(void) |
610 | { |
611 | unregister_filesystem(&dlmfs_fs_type); |
612 | |
613 | destroy_workqueue(wq: user_dlm_worker); |
614 | |
615 | /* |
616 | * Make sure all delayed rcu free inodes are flushed before we |
617 | * destroy cache. |
618 | */ |
619 | rcu_barrier(); |
620 | kmem_cache_destroy(s: dlmfs_inode_cache); |
621 | |
622 | } |
623 | |
624 | MODULE_AUTHOR("Oracle" ); |
625 | MODULE_LICENSE("GPL" ); |
626 | MODULE_DESCRIPTION("OCFS2 DLM-Filesystem" ); |
627 | |
628 | module_init(init_dlmfs_fs) |
629 | module_exit(exit_dlmfs_fs) |
630 | |