1 | // SPDX-License-Identifier: GPL-2.0 |
---|---|
2 | #include <linux/fanotify.h> |
3 | #include <linux/fcntl.h> |
4 | #include <linux/file.h> |
5 | #include <linux/fs.h> |
6 | #include <linux/anon_inodes.h> |
7 | #include <linux/fsnotify_backend.h> |
8 | #include <linux/init.h> |
9 | #include <linux/mount.h> |
10 | #include <linux/namei.h> |
11 | #include <linux/poll.h> |
12 | #include <linux/security.h> |
13 | #include <linux/syscalls.h> |
14 | #include <linux/slab.h> |
15 | #include <linux/types.h> |
16 | #include <linux/uaccess.h> |
17 | #include <linux/compat.h> |
18 | #include <linux/sched/signal.h> |
19 | #include <linux/memcontrol.h> |
20 | #include <linux/statfs.h> |
21 | #include <linux/exportfs.h> |
22 | |
23 | #include <asm/ioctls.h> |
24 | |
25 | #include "../fsnotify.h" |
26 | #include "../fdinfo.h" |
27 | #include "fanotify.h" |
28 | |
#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
#define FANOTIFY_DEFAULT_MAX_GROUPS	128
#define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32

/*
 * The legacy fanotify marks limit (8192) applied per group. A tunable limit
 * of marks per user was introduced later, similar to inotify. Effectively,
 * the legacy per-user limit is
 * <max marks per group> * <max groups per user> = 8192 * 128 = 1M.
 * This default also happens to match the increased limit of inotify
 * max_user_watches since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Pinning the marked inode accounts for most of the memory cost of adding an
 * inode mark. Filesystem inode structs differ in size, so twice the size of
 * a VFS inode is used as a conservative approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))
50 | |
51 | /* configurable via /proc/sys/fs/fanotify/ */ |
52 | static int fanotify_max_queued_events __read_mostly; |
53 | |
54 | #ifdef CONFIG_SYSCTL |
55 | |
56 | #include <linux/sysctl.h> |
57 | |
58 | static long ft_zero = 0; |
59 | static long ft_int_max = INT_MAX; |
60 | |
61 | static const struct ctl_table fanotify_table[] = { |
62 | { |
63 | .procname = "max_user_groups", |
64 | .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], |
65 | .maxlen = sizeof(long), |
66 | .mode = 0644, |
67 | .proc_handler = proc_doulongvec_minmax, |
68 | .extra1 = &ft_zero, |
69 | .extra2 = &ft_int_max, |
70 | }, |
71 | { |
72 | .procname = "max_user_marks", |
73 | .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], |
74 | .maxlen = sizeof(long), |
75 | .mode = 0644, |
76 | .proc_handler = proc_doulongvec_minmax, |
77 | .extra1 = &ft_zero, |
78 | .extra2 = &ft_int_max, |
79 | }, |
80 | { |
81 | .procname = "max_queued_events", |
82 | .data = &fanotify_max_queued_events, |
83 | .maxlen = sizeof(int), |
84 | .mode = 0644, |
85 | .proc_handler = proc_dointvec_minmax, |
86 | .extra1 = SYSCTL_ZERO |
87 | }, |
88 | }; |
89 | |
90 | static void __init fanotify_sysctls_init(void) |
91 | { |
92 | register_sysctl("fs/fanotify", fanotify_table); |
93 | } |
94 | #else |
95 | #define fanotify_sysctls_init() do { } while (0) |
96 | #endif /* CONFIG_SYSCTL */ |
97 | |
/*
 * The complete set of flags accepted in the event_f_flags parameter of
 * fanotify_init.
 *
 * struct file keeps internal and external open flags together in f_flags.
 * Only external open flags are allowed in event_f_flags; internal flags
 * such as FMODE_EXEC are excluded.
 */
#define FANOTIFY_INIT_ALL_EVENT_F_BITS	( \
		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
		__O_SYNC	| O_DSYNC	| O_CLOEXEC	| \
		O_LARGEFILE	| O_NOATIME	)
109 | |
110 | extern const struct fsnotify_ops fanotify_fsnotify_ops; |
111 | |
112 | struct kmem_cache *fanotify_mark_cache __ro_after_init; |
113 | struct kmem_cache *fanotify_fid_event_cachep __ro_after_init; |
114 | struct kmem_cache *fanotify_path_event_cachep __ro_after_init; |
115 | struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; |
116 | struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init; |
117 | |
118 | #define FANOTIFY_EVENT_ALIGN 4 |
119 | #define FANOTIFY_FID_INFO_HDR_LEN \ |
120 | (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) |
121 | #define FANOTIFY_PIDFD_INFO_LEN \ |
122 | sizeof(struct fanotify_event_info_pidfd) |
123 | #define FANOTIFY_ERROR_INFO_LEN \ |
124 | (sizeof(struct fanotify_event_info_error)) |
125 | #define FANOTIFY_RANGE_INFO_LEN \ |
126 | (sizeof(struct fanotify_event_info_range)) |
127 | #define FANOTIFY_MNT_INFO_LEN \ |
128 | (sizeof(struct fanotify_event_info_mnt)) |
129 | |
130 | static int fanotify_fid_info_len(int fh_len, int name_len) |
131 | { |
132 | int info_len = fh_len; |
133 | |
134 | if (name_len) |
135 | info_len += name_len + 1; |
136 | |
137 | return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len, |
138 | FANOTIFY_EVENT_ALIGN); |
139 | } |
140 | |
141 | /* FAN_RENAME may have one or two dir+name info records */ |
142 | static int fanotify_dir_name_info_len(struct fanotify_event *event) |
143 | { |
144 | struct fanotify_info *info = fanotify_event_info(event); |
145 | int dir_fh_len = fanotify_event_dir_fh_len(event); |
146 | int dir2_fh_len = fanotify_event_dir2_fh_len(event); |
147 | int info_len = 0; |
148 | |
149 | if (dir_fh_len) |
150 | info_len += fanotify_fid_info_len(fh_len: dir_fh_len, |
151 | name_len: info->name_len); |
152 | if (dir2_fh_len) |
153 | info_len += fanotify_fid_info_len(fh_len: dir2_fh_len, |
154 | name_len: info->name2_len); |
155 | |
156 | return info_len; |
157 | } |
158 | |
159 | static size_t fanotify_event_len(unsigned int info_mode, |
160 | struct fanotify_event *event) |
161 | { |
162 | size_t event_len = FAN_EVENT_METADATA_LEN; |
163 | int fh_len; |
164 | int dot_len = 0; |
165 | |
166 | if (fanotify_is_error_event(mask: event->mask)) |
167 | event_len += FANOTIFY_ERROR_INFO_LEN; |
168 | |
169 | if (fanotify_event_has_any_dir_fh(event)) { |
170 | event_len += fanotify_dir_name_info_len(event); |
171 | } else if ((info_mode & FAN_REPORT_NAME) && |
172 | (event->mask & FAN_ONDIR)) { |
173 | /* |
174 | * With group flag FAN_REPORT_NAME, if name was not recorded in |
175 | * event on a directory, we will report the name ".". |
176 | */ |
177 | dot_len = 1; |
178 | } |
179 | |
180 | if (fanotify_event_has_object_fh(event)) { |
181 | fh_len = fanotify_event_object_fh_len(event); |
182 | event_len += fanotify_fid_info_len(fh_len, name_len: dot_len); |
183 | } |
184 | if (fanotify_is_mnt_event(mask: event->mask)) |
185 | event_len += FANOTIFY_MNT_INFO_LEN; |
186 | |
187 | if (info_mode & FAN_REPORT_PIDFD) |
188 | event_len += FANOTIFY_PIDFD_INFO_LEN; |
189 | |
190 | if (fanotify_event_has_access_range(event)) |
191 | event_len += FANOTIFY_RANGE_INFO_LEN; |
192 | |
193 | return event_len; |
194 | } |
195 | |
196 | /* |
197 | * Remove an hashed event from merge hash table. |
198 | */ |
199 | static void fanotify_unhash_event(struct fsnotify_group *group, |
200 | struct fanotify_event *event) |
201 | { |
202 | assert_spin_locked(&group->notification_lock); |
203 | |
204 | pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, |
205 | group, event, fanotify_event_hash_bucket(group, event)); |
206 | |
207 | if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list))) |
208 | return; |
209 | |
210 | hlist_del_init(n: &event->merge_list); |
211 | } |
212 | |
213 | /* |
214 | * Get an fanotify notification event if one exists and is small |
215 | * enough to fit in "count". Return an error pointer if the count |
216 | * is not large enough. When permission event is dequeued, its state is |
217 | * updated accordingly. |
218 | */ |
219 | static struct fanotify_event *get_one_event(struct fsnotify_group *group, |
220 | size_t count) |
221 | { |
222 | size_t event_size; |
223 | struct fanotify_event *event = NULL; |
224 | struct fsnotify_event *fsn_event; |
225 | unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); |
226 | |
227 | pr_debug("%s: group=%p count=%zd\n", __func__, group, count); |
228 | |
229 | spin_lock(lock: &group->notification_lock); |
230 | fsn_event = fsnotify_peek_first_event(group); |
231 | if (!fsn_event) |
232 | goto out; |
233 | |
234 | event = FANOTIFY_E(fse: fsn_event); |
235 | event_size = fanotify_event_len(info_mode, event); |
236 | |
237 | if (event_size > count) { |
238 | event = ERR_PTR(error: -EINVAL); |
239 | goto out; |
240 | } |
241 | |
242 | /* |
243 | * Held the notification_lock the whole time, so this is the |
244 | * same event we peeked above. |
245 | */ |
246 | fsnotify_remove_first_event(group); |
247 | if (fanotify_is_perm_event(mask: event->mask)) |
248 | FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; |
249 | if (fanotify_is_hashed_event(mask: event->mask)) |
250 | fanotify_unhash_event(group, event); |
251 | out: |
252 | spin_unlock(lock: &group->notification_lock); |
253 | return event; |
254 | } |
255 | |
256 | static int create_fd(struct fsnotify_group *group, const struct path *path, |
257 | struct file **file) |
258 | { |
259 | int client_fd; |
260 | struct file *new_file; |
261 | |
262 | client_fd = get_unused_fd_flags(flags: group->fanotify_data.f_flags); |
263 | if (client_fd < 0) |
264 | return client_fd; |
265 | |
266 | /* |
267 | * We provide an fd for the userspace program, so it could access the |
268 | * file without generating fanotify events itself. |
269 | */ |
270 | new_file = dentry_open_nonotify(path, flags: group->fanotify_data.f_flags, |
271 | current_cred()); |
272 | if (IS_ERR(ptr: new_file)) { |
273 | put_unused_fd(fd: client_fd); |
274 | client_fd = PTR_ERR(ptr: new_file); |
275 | } else { |
276 | *file = new_file; |
277 | } |
278 | |
279 | return client_fd; |
280 | } |
281 | |
282 | static int process_access_response_info(const char __user *info, |
283 | size_t info_len, |
284 | struct fanotify_response_info_audit_rule *friar) |
285 | { |
286 | if (info_len != sizeof(*friar)) |
287 | return -EINVAL; |
288 | |
289 | if (copy_from_user(to: friar, from: info, n: sizeof(*friar))) |
290 | return -EFAULT; |
291 | |
292 | if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE) |
293 | return -EINVAL; |
294 | if (friar->hdr.pad != 0) |
295 | return -EINVAL; |
296 | if (friar->hdr.len != sizeof(*friar)) |
297 | return -EINVAL; |
298 | |
299 | return info_len; |
300 | } |
301 | |
302 | /* |
303 | * Finish processing of permission event by setting it to ANSWERED state and |
304 | * drop group->notification_lock. |
305 | */ |
306 | static void finish_permission_event(struct fsnotify_group *group, |
307 | struct fanotify_perm_event *event, u32 response, |
308 | struct fanotify_response_info_audit_rule *friar) |
309 | __releases(&group->notification_lock) |
310 | { |
311 | bool destroy = false; |
312 | |
313 | assert_spin_locked(&group->notification_lock); |
314 | event->response = response & ~FAN_INFO; |
315 | if (response & FAN_INFO) |
316 | memcpy(&event->audit_rule, friar, sizeof(*friar)); |
317 | |
318 | if (event->state == FAN_EVENT_CANCELED) |
319 | destroy = true; |
320 | else |
321 | event->state = FAN_EVENT_ANSWERED; |
322 | spin_unlock(lock: &group->notification_lock); |
323 | if (destroy) |
324 | fsnotify_destroy_event(group, event: &event->fae.fse); |
325 | } |
326 | |
327 | static int process_access_response(struct fsnotify_group *group, |
328 | struct fanotify_response *response_struct, |
329 | const char __user *info, |
330 | size_t info_len) |
331 | { |
332 | struct fanotify_perm_event *event; |
333 | int fd = response_struct->fd; |
334 | u32 response = response_struct->response; |
335 | int errno = fanotify_get_response_errno(res: response); |
336 | int ret = info_len; |
337 | struct fanotify_response_info_audit_rule friar; |
338 | |
339 | pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n", |
340 | __func__, group, fd, response, errno, info, info_len); |
341 | /* |
342 | * make sure the response is valid, if invalid we do nothing and either |
343 | * userspace can send a valid response or we will clean it up after the |
344 | * timeout |
345 | */ |
346 | if (response & ~FANOTIFY_RESPONSE_VALID_MASK) |
347 | return -EINVAL; |
348 | |
349 | switch (response & FANOTIFY_RESPONSE_ACCESS) { |
350 | case FAN_ALLOW: |
351 | if (errno) |
352 | return -EINVAL; |
353 | break; |
354 | case FAN_DENY: |
355 | /* Custom errno is supported only for pre-content groups */ |
356 | if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT) |
357 | return -EINVAL; |
358 | |
359 | /* |
360 | * Limit errno to values expected on open(2)/read(2)/write(2) |
361 | * of regular files. |
362 | */ |
363 | switch (errno) { |
364 | case 0: |
365 | case EIO: |
366 | case EPERM: |
367 | case EBUSY: |
368 | case ETXTBSY: |
369 | case EAGAIN: |
370 | case ENOSPC: |
371 | case EDQUOT: |
372 | break; |
373 | default: |
374 | return -EINVAL; |
375 | } |
376 | break; |
377 | default: |
378 | return -EINVAL; |
379 | } |
380 | |
381 | if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) |
382 | return -EINVAL; |
383 | |
384 | if (response & FAN_INFO) { |
385 | ret = process_access_response_info(info, info_len, friar: &friar); |
386 | if (ret < 0) |
387 | return ret; |
388 | if (fd == FAN_NOFD) |
389 | return ret; |
390 | } else { |
391 | ret = 0; |
392 | } |
393 | |
394 | if (fd < 0) |
395 | return -EINVAL; |
396 | |
397 | spin_lock(lock: &group->notification_lock); |
398 | list_for_each_entry(event, &group->fanotify_data.access_list, |
399 | fae.fse.list) { |
400 | if (event->fd != fd) |
401 | continue; |
402 | |
403 | list_del_init(entry: &event->fae.fse.list); |
404 | finish_permission_event(group, event, response, friar: &friar); |
405 | wake_up(&group->fanotify_data.access_waitq); |
406 | return ret; |
407 | } |
408 | spin_unlock(lock: &group->notification_lock); |
409 | |
410 | return -ENOENT; |
411 | } |
412 | |
413 | static size_t copy_mnt_info_to_user(struct fanotify_event *event, |
414 | char __user *buf, int count) |
415 | { |
416 | struct fanotify_event_info_mnt info = { }; |
417 | |
418 | info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT; |
419 | info.hdr.len = FANOTIFY_MNT_INFO_LEN; |
420 | |
421 | if (WARN_ON(count < info.hdr.len)) |
422 | return -EFAULT; |
423 | |
424 | info.mnt_id = FANOTIFY_ME(event)->mnt_id; |
425 | |
426 | if (copy_to_user(to: buf, from: &info, n: sizeof(info))) |
427 | return -EFAULT; |
428 | |
429 | return info.hdr.len; |
430 | } |
431 | |
432 | static size_t copy_error_info_to_user(struct fanotify_event *event, |
433 | char __user *buf, int count) |
434 | { |
435 | struct fanotify_event_info_error info = { }; |
436 | struct fanotify_error_event *fee = FANOTIFY_EE(event); |
437 | |
438 | info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; |
439 | info.hdr.len = FANOTIFY_ERROR_INFO_LEN; |
440 | |
441 | if (WARN_ON(count < info.hdr.len)) |
442 | return -EFAULT; |
443 | |
444 | info.error = fee->error; |
445 | info.error_count = fee->err_count; |
446 | |
447 | if (copy_to_user(to: buf, from: &info, n: sizeof(info))) |
448 | return -EFAULT; |
449 | |
450 | return info.hdr.len; |
451 | } |
452 | |
453 | static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, |
454 | int info_type, const char *name, |
455 | size_t name_len, |
456 | char __user *buf, size_t count) |
457 | { |
458 | struct fanotify_event_info_fid info = { }; |
459 | struct file_handle handle = { }; |
460 | unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf; |
461 | size_t fh_len = fh ? fh->len : 0; |
462 | size_t info_len = fanotify_fid_info_len(fh_len, name_len); |
463 | size_t len = info_len; |
464 | |
465 | pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", |
466 | __func__, fh_len, name_len, info_len, count); |
467 | |
468 | if (WARN_ON_ONCE(len < sizeof(info) || len > count)) |
469 | return -EFAULT; |
470 | |
471 | /* |
472 | * Copy event info fid header followed by variable sized file handle |
473 | * and optionally followed by variable sized filename. |
474 | */ |
475 | switch (info_type) { |
476 | case FAN_EVENT_INFO_TYPE_FID: |
477 | case FAN_EVENT_INFO_TYPE_DFID: |
478 | if (WARN_ON_ONCE(name_len)) |
479 | return -EFAULT; |
480 | break; |
481 | case FAN_EVENT_INFO_TYPE_DFID_NAME: |
482 | case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME: |
483 | case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME: |
484 | if (WARN_ON_ONCE(!name || !name_len)) |
485 | return -EFAULT; |
486 | break; |
487 | default: |
488 | return -EFAULT; |
489 | } |
490 | |
491 | info.hdr.info_type = info_type; |
492 | info.hdr.len = len; |
493 | info.fsid = *fsid; |
494 | if (copy_to_user(to: buf, from: &info, n: sizeof(info))) |
495 | return -EFAULT; |
496 | |
497 | buf += sizeof(info); |
498 | len -= sizeof(info); |
499 | if (WARN_ON_ONCE(len < sizeof(handle))) |
500 | return -EFAULT; |
501 | |
502 | handle.handle_type = fh->type; |
503 | handle.handle_bytes = fh_len; |
504 | |
505 | /* Mangle handle_type for bad file_handle */ |
506 | if (!fh_len) |
507 | handle.handle_type = FILEID_INVALID; |
508 | |
509 | if (copy_to_user(to: buf, from: &handle, n: sizeof(handle))) |
510 | return -EFAULT; |
511 | |
512 | buf += sizeof(handle); |
513 | len -= sizeof(handle); |
514 | if (WARN_ON_ONCE(len < fh_len)) |
515 | return -EFAULT; |
516 | |
517 | /* |
518 | * For an inline fh and inline file name, copy through stack to exclude |
519 | * the copy from usercopy hardening protections. |
520 | */ |
521 | fh_buf = fanotify_fh_buf(fh); |
522 | if (fh_len <= FANOTIFY_INLINE_FH_LEN) { |
523 | memcpy(bounce, fh_buf, fh_len); |
524 | fh_buf = bounce; |
525 | } |
526 | if (copy_to_user(to: buf, from: fh_buf, n: fh_len)) |
527 | return -EFAULT; |
528 | |
529 | buf += fh_len; |
530 | len -= fh_len; |
531 | |
532 | if (name_len) { |
533 | /* Copy the filename with terminating null */ |
534 | name_len++; |
535 | if (WARN_ON_ONCE(len < name_len)) |
536 | return -EFAULT; |
537 | |
538 | if (copy_to_user(to: buf, from: name, n: name_len)) |
539 | return -EFAULT; |
540 | |
541 | buf += name_len; |
542 | len -= name_len; |
543 | } |
544 | |
545 | /* Pad with 0's */ |
546 | WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); |
547 | if (len > 0 && clear_user(to: buf, n: len)) |
548 | return -EFAULT; |
549 | |
550 | return info_len; |
551 | } |
552 | |
553 | static int copy_pidfd_info_to_user(int pidfd, |
554 | char __user *buf, |
555 | size_t count) |
556 | { |
557 | struct fanotify_event_info_pidfd info = { }; |
558 | size_t info_len = FANOTIFY_PIDFD_INFO_LEN; |
559 | |
560 | if (WARN_ON_ONCE(info_len > count)) |
561 | return -EFAULT; |
562 | |
563 | info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; |
564 | info.hdr.len = info_len; |
565 | info.pidfd = pidfd; |
566 | |
567 | if (copy_to_user(to: buf, from: &info, n: info_len)) |
568 | return -EFAULT; |
569 | |
570 | return info_len; |
571 | } |
572 | |
573 | static size_t copy_range_info_to_user(struct fanotify_event *event, |
574 | char __user *buf, int count) |
575 | { |
576 | struct fanotify_perm_event *pevent = FANOTIFY_PERM(event); |
577 | struct fanotify_event_info_range info = { }; |
578 | size_t info_len = FANOTIFY_RANGE_INFO_LEN; |
579 | |
580 | if (WARN_ON_ONCE(info_len > count)) |
581 | return -EFAULT; |
582 | |
583 | if (WARN_ON_ONCE(!pevent->ppos)) |
584 | return -EINVAL; |
585 | |
586 | info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE; |
587 | info.hdr.len = info_len; |
588 | info.offset = *(pevent->ppos); |
589 | info.count = pevent->count; |
590 | |
591 | if (copy_to_user(to: buf, from: &info, n: info_len)) |
592 | return -EFAULT; |
593 | |
594 | return info_len; |
595 | } |
596 | |
597 | static int copy_info_records_to_user(struct fanotify_event *event, |
598 | struct fanotify_info *info, |
599 | unsigned int info_mode, int pidfd, |
600 | char __user *buf, size_t count) |
601 | { |
602 | int ret, total_bytes = 0, info_type = 0; |
603 | unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; |
604 | unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; |
605 | |
606 | /* |
607 | * Event info records order is as follows: |
608 | * 1. dir fid + name |
609 | * 2. (optional) new dir fid + new name |
610 | * 3. (optional) child fid |
611 | */ |
612 | if (fanotify_event_has_dir_fh(event)) { |
613 | info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : |
614 | FAN_EVENT_INFO_TYPE_DFID; |
615 | |
616 | /* FAN_RENAME uses special info types */ |
617 | if (event->mask & FAN_RENAME) |
618 | info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME; |
619 | |
620 | ret = copy_fid_info_to_user(fsid: fanotify_event_fsid(event), |
621 | fh: fanotify_info_dir_fh(info), |
622 | info_type, |
623 | name: fanotify_info_name(info), |
624 | name_len: info->name_len, buf, count); |
625 | if (ret < 0) |
626 | return ret; |
627 | |
628 | buf += ret; |
629 | count -= ret; |
630 | total_bytes += ret; |
631 | } |
632 | |
633 | /* New dir fid+name may be reported in addition to old dir fid+name */ |
634 | if (fanotify_event_has_dir2_fh(event)) { |
635 | info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME; |
636 | ret = copy_fid_info_to_user(fsid: fanotify_event_fsid(event), |
637 | fh: fanotify_info_dir2_fh(info), |
638 | info_type, |
639 | name: fanotify_info_name2(info), |
640 | name_len: info->name2_len, buf, count); |
641 | if (ret < 0) |
642 | return ret; |
643 | |
644 | buf += ret; |
645 | count -= ret; |
646 | total_bytes += ret; |
647 | } |
648 | |
649 | if (fanotify_event_has_object_fh(event)) { |
650 | const char *dot = NULL; |
651 | int dot_len = 0; |
652 | |
653 | if (fid_mode == FAN_REPORT_FID || info_type) { |
654 | /* |
655 | * With only group flag FAN_REPORT_FID only type FID is |
656 | * reported. Second info record type is always FID. |
657 | */ |
658 | info_type = FAN_EVENT_INFO_TYPE_FID; |
659 | } else if ((fid_mode & FAN_REPORT_NAME) && |
660 | (event->mask & FAN_ONDIR)) { |
661 | /* |
662 | * With group flag FAN_REPORT_NAME, if name was not |
663 | * recorded in an event on a directory, report the name |
664 | * "." with info type DFID_NAME. |
665 | */ |
666 | info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; |
667 | dot = "."; |
668 | dot_len = 1; |
669 | } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || |
670 | (event->mask & FAN_ONDIR)) { |
671 | /* |
672 | * With group flag FAN_REPORT_DIR_FID, a single info |
673 | * record has type DFID for directory entry modification |
674 | * event and for event on a directory. |
675 | */ |
676 | info_type = FAN_EVENT_INFO_TYPE_DFID; |
677 | } else { |
678 | /* |
679 | * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, |
680 | * a single info record has type FID for event on a |
681 | * non-directory, when there is no directory to report. |
682 | * For example, on FAN_DELETE_SELF event. |
683 | */ |
684 | info_type = FAN_EVENT_INFO_TYPE_FID; |
685 | } |
686 | |
687 | ret = copy_fid_info_to_user(fsid: fanotify_event_fsid(event), |
688 | fh: fanotify_event_object_fh(event), |
689 | info_type, name: dot, name_len: dot_len, |
690 | buf, count); |
691 | if (ret < 0) |
692 | return ret; |
693 | |
694 | buf += ret; |
695 | count -= ret; |
696 | total_bytes += ret; |
697 | } |
698 | |
699 | if (pidfd_mode) { |
700 | ret = copy_pidfd_info_to_user(pidfd, buf, count); |
701 | if (ret < 0) |
702 | return ret; |
703 | |
704 | buf += ret; |
705 | count -= ret; |
706 | total_bytes += ret; |
707 | } |
708 | |
709 | if (fanotify_is_error_event(mask: event->mask)) { |
710 | ret = copy_error_info_to_user(event, buf, count); |
711 | if (ret < 0) |
712 | return ret; |
713 | buf += ret; |
714 | count -= ret; |
715 | total_bytes += ret; |
716 | } |
717 | |
718 | if (fanotify_event_has_access_range(event)) { |
719 | ret = copy_range_info_to_user(event, buf, count); |
720 | if (ret < 0) |
721 | return ret; |
722 | buf += ret; |
723 | count -= ret; |
724 | total_bytes += ret; |
725 | } |
726 | |
727 | if (fanotify_is_mnt_event(mask: event->mask)) { |
728 | ret = copy_mnt_info_to_user(event, buf, count); |
729 | if (ret < 0) |
730 | return ret; |
731 | buf += ret; |
732 | count -= ret; |
733 | total_bytes += ret; |
734 | } |
735 | |
736 | return total_bytes; |
737 | } |
738 | |
739 | static ssize_t copy_event_to_user(struct fsnotify_group *group, |
740 | struct fanotify_event *event, |
741 | char __user *buf, size_t count) |
742 | { |
743 | struct fanotify_event_metadata metadata; |
744 | const struct path *path = fanotify_event_path(event); |
745 | struct fanotify_info *info = fanotify_event_info(event); |
746 | unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); |
747 | unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; |
748 | struct file *f = NULL, *pidfd_file = NULL; |
749 | int ret, pidfd = -ESRCH, fd = -EBADF; |
750 | |
751 | pr_debug("%s: group=%p event=%p\n", __func__, group, event); |
752 | |
753 | metadata.event_len = fanotify_event_len(info_mode, event); |
754 | metadata.metadata_len = FAN_EVENT_METADATA_LEN; |
755 | metadata.vers = FANOTIFY_METADATA_VERSION; |
756 | metadata.reserved = 0; |
757 | metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; |
758 | metadata.pid = pid_vnr(pid: event->pid); |
759 | /* |
760 | * For an unprivileged listener, event->pid can be used to identify the |
761 | * events generated by the listener process itself, without disclosing |
762 | * the pids of other processes. |
763 | */ |
764 | if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && |
765 | task_tgid(current) != event->pid) |
766 | metadata.pid = 0; |
767 | |
768 | /* |
769 | * For now, fid mode is required for an unprivileged listener and |
770 | * fid mode does not report fd in events. Keep this check anyway |
771 | * for safety in case fid mode requirement is relaxed in the future |
772 | * to allow unprivileged listener to get events with no fd and no fid. |
773 | */ |
774 | if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && |
775 | path && path->mnt && path->dentry) { |
776 | fd = create_fd(group, path, file: &f); |
777 | /* |
778 | * Opening an fd from dentry can fail for several reasons. |
779 | * For example, when tasks are gone and we try to open their |
780 | * /proc files or we try to open a WRONLY file like in sysfs |
781 | * or when trying to open a file that was deleted on the |
782 | * remote network server. |
783 | * |
784 | * For a group with FAN_REPORT_FD_ERROR, we will send the |
785 | * event with the error instead of the open fd, otherwise |
786 | * Userspace may not get the error at all. |
787 | * In any case, userspace will not know which file failed to |
788 | * open, so add a debug print for further investigation. |
789 | */ |
790 | if (fd < 0) { |
791 | pr_debug("fanotify: create_fd(%pd2) failed err=%d\n", |
792 | path->dentry, fd); |
793 | if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) { |
794 | /* |
795 | * Historically, we've handled EOPENSTALE in a |
796 | * special way and silently dropped such |
797 | * events. Now we have to keep it to maintain |
798 | * backward compatibility... |
799 | */ |
800 | if (fd == -EOPENSTALE) |
801 | fd = 0; |
802 | return fd; |
803 | } |
804 | } |
805 | } |
806 | if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) |
807 | metadata.fd = fd; |
808 | else |
809 | metadata.fd = fd >= 0 ? fd : FAN_NOFD; |
810 | |
811 | if (pidfd_mode) { |
812 | /* |
813 | * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual |
814 | * exclusion is ever lifted. At the time of incoporating pidfd |
815 | * support within fanotify, the pidfd API only supported the |
816 | * creation of pidfds for thread-group leaders. |
817 | */ |
818 | WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); |
819 | |
820 | /* |
821 | * The PIDTYPE_TGID check for an event->pid is performed |
822 | * preemptively in an attempt to catch out cases where the event |
823 | * listener reads events after the event generating process has |
824 | * already terminated. Depending on flag FAN_REPORT_FD_ERROR, |
825 | * report either -ESRCH or FAN_NOPIDFD to the event listener in |
826 | * those cases with all other pidfd creation errors reported as |
827 | * the error code itself or as FAN_EPIDFD. |
828 | */ |
829 | if (metadata.pid && pid_has_task(pid: event->pid, type: PIDTYPE_TGID)) |
830 | pidfd = pidfd_prepare(pid: event->pid, flags: 0, ret_file: &pidfd_file); |
831 | |
832 | if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0) |
833 | pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD; |
834 | } |
835 | |
836 | ret = -EFAULT; |
837 | /* |
838 | * Sanity check copy size in case get_one_event() and |
839 | * event_len sizes ever get out of sync. |
840 | */ |
841 | if (WARN_ON_ONCE(metadata.event_len > count)) |
842 | goto out_close_fd; |
843 | |
844 | if (copy_to_user(to: buf, from: &metadata, FAN_EVENT_METADATA_LEN)) |
845 | goto out_close_fd; |
846 | |
847 | buf += FAN_EVENT_METADATA_LEN; |
848 | count -= FAN_EVENT_METADATA_LEN; |
849 | |
850 | ret = copy_info_records_to_user(event, info, info_mode, pidfd, |
851 | buf, count); |
852 | if (ret < 0) |
853 | goto out_close_fd; |
854 | |
855 | if (f) |
856 | fd_install(fd, file: f); |
857 | |
858 | if (pidfd_file) |
859 | fd_install(fd: pidfd, file: pidfd_file); |
860 | |
861 | if (fanotify_is_perm_event(mask: event->mask)) |
862 | FANOTIFY_PERM(event)->fd = fd; |
863 | |
864 | return metadata.event_len; |
865 | |
866 | out_close_fd: |
867 | if (f) { |
868 | put_unused_fd(fd); |
869 | fput(f); |
870 | } |
871 | |
872 | if (pidfd_file) { |
873 | put_unused_fd(fd: pidfd); |
874 | fput(pidfd_file); |
875 | } |
876 | |
877 | return ret; |
878 | } |
879 | |
880 | /* fanotify userspace file descriptor functions */ |
881 | static __poll_t fanotify_poll(struct file *file, poll_table *wait) |
882 | { |
883 | struct fsnotify_group *group = file->private_data; |
884 | __poll_t ret = 0; |
885 | |
886 | poll_wait(filp: file, wait_address: &group->notification_waitq, p: wait); |
887 | spin_lock(lock: &group->notification_lock); |
888 | if (!fsnotify_notify_queue_is_empty(group)) |
889 | ret = EPOLLIN | EPOLLRDNORM; |
890 | spin_unlock(lock: &group->notification_lock); |
891 | |
892 | return ret; |
893 | } |
894 | |
895 | static ssize_t fanotify_read(struct file *file, char __user *buf, |
896 | size_t count, loff_t *pos) |
897 | { |
898 | struct fsnotify_group *group; |
899 | struct fanotify_event *event; |
900 | char __user *start; |
901 | int ret; |
902 | DEFINE_WAIT_FUNC(wait, woken_wake_function); |
903 | |
904 | start = buf; |
905 | group = file->private_data; |
906 | |
907 | pr_debug("%s: group=%p\n", __func__, group); |
908 | |
909 | add_wait_queue(wq_head: &group->notification_waitq, wq_entry: &wait); |
910 | while (1) { |
911 | /* |
912 | * User can supply arbitrarily large buffer. Avoid softlockups |
913 | * in case there are lots of available events. |
914 | */ |
915 | cond_resched(); |
916 | event = get_one_event(group, count); |
917 | if (IS_ERR(ptr: event)) { |
918 | ret = PTR_ERR(ptr: event); |
919 | break; |
920 | } |
921 | |
922 | if (!event) { |
923 | ret = -EAGAIN; |
924 | if (file->f_flags & O_NONBLOCK) |
925 | break; |
926 | |
927 | ret = -ERESTARTSYS; |
928 | if (signal_pending(current)) |
929 | break; |
930 | |
931 | if (start != buf) |
932 | break; |
933 | |
934 | wait_woken(wq_entry: &wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
935 | continue; |
936 | } |
937 | |
938 | ret = copy_event_to_user(group, event, buf, count); |
939 | |
940 | /* |
941 | * Permission events get queued to wait for response. Other |
942 | * events can be destroyed now. |
943 | */ |
944 | if (!fanotify_is_perm_event(mask: event->mask)) { |
945 | fsnotify_destroy_event(group, event: &event->fse); |
946 | } else { |
947 | if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) { |
948 | spin_lock(lock: &group->notification_lock); |
949 | finish_permission_event(group, |
950 | event: FANOTIFY_PERM(event), FAN_DENY, NULL); |
951 | wake_up(&group->fanotify_data.access_waitq); |
952 | } else { |
953 | spin_lock(lock: &group->notification_lock); |
954 | list_add_tail(new: &event->fse.list, |
955 | head: &group->fanotify_data.access_list); |
956 | spin_unlock(lock: &group->notification_lock); |
957 | } |
958 | } |
959 | if (ret < 0) |
960 | break; |
961 | buf += ret; |
962 | count -= ret; |
963 | } |
964 | remove_wait_queue(wq_head: &group->notification_waitq, wq_entry: &wait); |
965 | |
966 | if (start != buf && ret != -EFAULT) |
967 | ret = buf - start; |
968 | return ret; |
969 | } |
970 | |
971 | static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) |
972 | { |
973 | struct fanotify_response response; |
974 | struct fsnotify_group *group; |
975 | int ret; |
976 | const char __user *info_buf = buf + sizeof(struct fanotify_response); |
977 | size_t info_len; |
978 | |
979 | if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) |
980 | return -EINVAL; |
981 | |
982 | group = file->private_data; |
983 | |
984 | pr_debug("%s: group=%p count=%zu\n", __func__, group, count); |
985 | |
986 | if (count < sizeof(response)) |
987 | return -EINVAL; |
988 | |
989 | if (copy_from_user(to: &response, from: buf, n: sizeof(response))) |
990 | return -EFAULT; |
991 | |
992 | info_len = count - sizeof(response); |
993 | |
994 | ret = process_access_response(group, response_struct: &response, info: info_buf, info_len); |
995 | if (ret < 0) |
996 | count = ret; |
997 | else |
998 | count = sizeof(response) + ret; |
999 | |
1000 | return count; |
1001 | } |
1002 | |
1003 | static int fanotify_release(struct inode *ignored, struct file *file) |
1004 | { |
1005 | struct fsnotify_group *group = file->private_data; |
1006 | struct fsnotify_event *fsn_event; |
1007 | |
1008 | /* |
1009 | * Stop new events from arriving in the notification queue. since |
1010 | * userspace cannot use fanotify fd anymore, no event can enter or |
1011 | * leave access_list by now either. |
1012 | */ |
1013 | fsnotify_group_stop_queueing(group); |
1014 | |
1015 | /* |
1016 | * Process all permission events on access_list and notification queue |
1017 | * and simulate reply from userspace. |
1018 | */ |
1019 | spin_lock(lock: &group->notification_lock); |
1020 | while (!list_empty(head: &group->fanotify_data.access_list)) { |
1021 | struct fanotify_perm_event *event; |
1022 | |
1023 | event = list_first_entry(&group->fanotify_data.access_list, |
1024 | struct fanotify_perm_event, fae.fse.list); |
1025 | list_del_init(entry: &event->fae.fse.list); |
1026 | finish_permission_event(group, event, FAN_ALLOW, NULL); |
1027 | spin_lock(lock: &group->notification_lock); |
1028 | } |
1029 | |
1030 | /* |
1031 | * Destroy all non-permission events. For permission events just |
1032 | * dequeue them and set the response. They will be freed once the |
1033 | * response is consumed and fanotify_get_response() returns. |
1034 | */ |
1035 | while ((fsn_event = fsnotify_remove_first_event(group))) { |
1036 | struct fanotify_event *event = FANOTIFY_E(fse: fsn_event); |
1037 | |
1038 | if (!(event->mask & FANOTIFY_PERM_EVENTS)) { |
1039 | spin_unlock(lock: &group->notification_lock); |
1040 | fsnotify_destroy_event(group, event: fsn_event); |
1041 | } else { |
1042 | finish_permission_event(group, event: FANOTIFY_PERM(event), |
1043 | FAN_ALLOW, NULL); |
1044 | } |
1045 | spin_lock(lock: &group->notification_lock); |
1046 | } |
1047 | spin_unlock(lock: &group->notification_lock); |
1048 | |
1049 | /* Response for all permission events it set, wakeup waiters */ |
1050 | wake_up(&group->fanotify_data.access_waitq); |
1051 | |
1052 | /* matches the fanotify_init->fsnotify_alloc_group */ |
1053 | fsnotify_destroy_group(group); |
1054 | |
1055 | return 0; |
1056 | } |
1057 | |
1058 | static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
1059 | { |
1060 | struct fsnotify_group *group; |
1061 | struct fsnotify_event *fsn_event; |
1062 | void __user *p; |
1063 | int ret = -ENOTTY; |
1064 | size_t send_len = 0; |
1065 | |
1066 | group = file->private_data; |
1067 | |
1068 | p = (void __user *) arg; |
1069 | |
1070 | switch (cmd) { |
1071 | case FIONREAD: |
1072 | spin_lock(lock: &group->notification_lock); |
1073 | list_for_each_entry(fsn_event, &group->notification_list, list) |
1074 | send_len += FAN_EVENT_METADATA_LEN; |
1075 | spin_unlock(lock: &group->notification_lock); |
1076 | ret = put_user(send_len, (int __user *) p); |
1077 | break; |
1078 | } |
1079 | |
1080 | return ret; |
1081 | } |
1082 | |
1083 | static const struct file_operations fanotify_fops = { |
1084 | .show_fdinfo = fanotify_show_fdinfo, |
1085 | .poll = fanotify_poll, |
1086 | .read = fanotify_read, |
1087 | .write = fanotify_write, |
1088 | .fasync = NULL, |
1089 | .release = fanotify_release, |
1090 | .unlocked_ioctl = fanotify_ioctl, |
1091 | .compat_ioctl = compat_ptr_ioctl, |
1092 | .llseek = noop_llseek, |
1093 | }; |
1094 | |
1095 | static int fanotify_find_path(int dfd, const char __user *filename, |
1096 | struct path *path, unsigned int flags, __u64 mask, |
1097 | unsigned int obj_type) |
1098 | { |
1099 | int ret; |
1100 | |
1101 | pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__, |
1102 | dfd, filename, flags); |
1103 | |
1104 | if (filename == NULL) { |
1105 | CLASS(fd, f)(fd: dfd); |
1106 | |
1107 | if (fd_empty(f)) |
1108 | return -EBADF; |
1109 | |
1110 | if ((flags & FAN_MARK_ONLYDIR) && |
1111 | !(S_ISDIR(file_inode(fd_file(f))->i_mode))) |
1112 | return -ENOTDIR; |
1113 | |
1114 | *path = fd_file(f)->f_path; |
1115 | path_get(path); |
1116 | } else { |
1117 | unsigned int lookup_flags = 0; |
1118 | |
1119 | if (!(flags & FAN_MARK_DONT_FOLLOW)) |
1120 | lookup_flags |= LOOKUP_FOLLOW; |
1121 | if (flags & FAN_MARK_ONLYDIR) |
1122 | lookup_flags |= LOOKUP_DIRECTORY; |
1123 | |
1124 | ret = user_path_at(dfd, filename, lookup_flags, path); |
1125 | if (ret) |
1126 | goto out; |
1127 | } |
1128 | |
1129 | /* you can only watch an inode if you have read permissions on it */ |
1130 | ret = path_permission(path, MAY_READ); |
1131 | if (ret) { |
1132 | path_put(path); |
1133 | goto out; |
1134 | } |
1135 | |
1136 | ret = security_path_notify(path, mask, obj_type); |
1137 | if (ret) |
1138 | path_put(path); |
1139 | |
1140 | out: |
1141 | return ret; |
1142 | } |
1143 | |
1144 | static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, |
1145 | __u32 mask, unsigned int flags, |
1146 | __u32 umask, int *destroy) |
1147 | { |
1148 | __u32 oldmask, newmask; |
1149 | |
1150 | /* umask bits cannot be removed by user */ |
1151 | mask &= ~umask; |
1152 | spin_lock(lock: &fsn_mark->lock); |
1153 | oldmask = fsnotify_calc_mask(mark: fsn_mark); |
1154 | if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) { |
1155 | fsn_mark->mask &= ~mask; |
1156 | } else { |
1157 | fsn_mark->ignore_mask &= ~mask; |
1158 | } |
1159 | newmask = fsnotify_calc_mask(mark: fsn_mark); |
1160 | /* |
1161 | * We need to keep the mark around even if remaining mask cannot |
1162 | * result in any events (e.g. mask == FAN_ONDIR) to support incremenal |
1163 | * changes to the mask. |
1164 | * Destroy mark when only umask bits remain. |
1165 | */ |
1166 | *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask); |
1167 | spin_unlock(lock: &fsn_mark->lock); |
1168 | |
1169 | return oldmask & ~newmask; |
1170 | } |
1171 | |
1172 | static int fanotify_remove_mark(struct fsnotify_group *group, |
1173 | void *obj, unsigned int obj_type, __u32 mask, |
1174 | unsigned int flags, __u32 umask) |
1175 | { |
1176 | struct fsnotify_mark *fsn_mark = NULL; |
1177 | __u32 removed; |
1178 | int destroy_mark; |
1179 | |
1180 | fsnotify_group_lock(group); |
1181 | fsn_mark = fsnotify_find_mark(obj, obj_type, group); |
1182 | if (!fsn_mark) { |
1183 | fsnotify_group_unlock(group); |
1184 | return -ENOENT; |
1185 | } |
1186 | |
1187 | removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, |
1188 | umask, destroy: &destroy_mark); |
1189 | if (removed & fsnotify_conn_mask(conn: fsn_mark->connector)) |
1190 | fsnotify_recalc_mask(conn: fsn_mark->connector); |
1191 | if (destroy_mark) |
1192 | fsnotify_detach_mark(mark: fsn_mark); |
1193 | fsnotify_group_unlock(group); |
1194 | if (destroy_mark) |
1195 | fsnotify_free_mark(mark: fsn_mark); |
1196 | |
1197 | /* matches the fsnotify_find_mark() */ |
1198 | fsnotify_put_mark(mark: fsn_mark); |
1199 | return 0; |
1200 | } |
1201 | |
1202 | static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, |
1203 | unsigned int fan_flags) |
1204 | { |
1205 | bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); |
1206 | unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS; |
1207 | bool recalc = false; |
1208 | |
1209 | /* |
1210 | * When using FAN_MARK_IGNORE for the first time, mark starts using |
1211 | * independent event flags in ignore mask. After that, trying to |
1212 | * update the ignore mask with the old FAN_MARK_IGNORED_MASK API |
1213 | * will result in EEXIST error. |
1214 | */ |
1215 | if (ignore == FAN_MARK_IGNORE) |
1216 | fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS; |
1217 | |
1218 | /* |
1219 | * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to |
1220 | * the removal of the FS_MODIFY bit in calculated mask if it was set |
1221 | * because of an ignore mask that is now going to survive FS_MODIFY. |
1222 | */ |
1223 | if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && |
1224 | !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { |
1225 | fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; |
1226 | if (!(fsn_mark->mask & FS_MODIFY)) |
1227 | recalc = true; |
1228 | } |
1229 | |
1230 | if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE || |
1231 | want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) |
1232 | return recalc; |
1233 | |
1234 | /* |
1235 | * NO_IREF may be removed from a mark, but not added. |
1236 | * When removed, fsnotify_recalc_mask() will take the inode ref. |
1237 | */ |
1238 | WARN_ON_ONCE(!want_iref); |
1239 | fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF; |
1240 | |
1241 | return true; |
1242 | } |
1243 | |
1244 | static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, |
1245 | __u32 mask, unsigned int fan_flags) |
1246 | { |
1247 | bool recalc; |
1248 | |
1249 | spin_lock(lock: &fsn_mark->lock); |
1250 | if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS)) |
1251 | fsn_mark->mask |= mask; |
1252 | else |
1253 | fsn_mark->ignore_mask |= mask; |
1254 | |
1255 | recalc = fsnotify_calc_mask(mark: fsn_mark) & |
1256 | ~fsnotify_conn_mask(conn: fsn_mark->connector); |
1257 | |
1258 | recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags); |
1259 | spin_unlock(lock: &fsn_mark->lock); |
1260 | |
1261 | return recalc; |
1262 | } |
1263 | |
1264 | struct fan_fsid { |
1265 | struct super_block *sb; |
1266 | __kernel_fsid_t id; |
1267 | bool weak; |
1268 | }; |
1269 | |
1270 | static int fanotify_set_mark_fsid(struct fsnotify_group *group, |
1271 | struct fsnotify_mark *mark, |
1272 | struct fan_fsid *fsid) |
1273 | { |
1274 | struct fsnotify_mark_connector *conn; |
1275 | struct fsnotify_mark *old; |
1276 | struct super_block *old_sb = NULL; |
1277 | |
1278 | FANOTIFY_MARK(mark)->fsid = fsid->id; |
1279 | mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; |
1280 | if (fsid->weak) |
1281 | mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID; |
1282 | |
1283 | /* First mark added will determine if group is single or multi fsid */ |
1284 | if (list_empty(head: &group->marks_list)) |
1285 | return 0; |
1286 | |
1287 | /* Find sb of an existing mark */ |
1288 | list_for_each_entry(old, &group->marks_list, g_list) { |
1289 | conn = READ_ONCE(old->connector); |
1290 | if (!conn) |
1291 | continue; |
1292 | old_sb = fsnotify_connector_sb(conn); |
1293 | if (old_sb) |
1294 | break; |
1295 | } |
1296 | |
1297 | /* Only detached marks left? */ |
1298 | if (!old_sb) |
1299 | return 0; |
1300 | |
1301 | /* Do not allow mixing of marks with weak and strong fsid */ |
1302 | if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID) |
1303 | return -EXDEV; |
1304 | |
1305 | /* Allow mixing of marks with strong fsid from different fs */ |
1306 | if (!fsid->weak) |
1307 | return 0; |
1308 | |
1309 | /* Do not allow mixing marks with weak fsid from different fs */ |
1310 | if (old_sb != fsid->sb) |
1311 | return -EXDEV; |
1312 | |
1313 | /* Do not allow mixing marks from different btrfs sub-volumes */ |
1314 | if (!fanotify_fsid_equal(fsid1: &FANOTIFY_MARK(mark: old)->fsid, |
1315 | fsid2: &FANOTIFY_MARK(mark)->fsid)) |
1316 | return -EXDEV; |
1317 | |
1318 | return 0; |
1319 | } |
1320 | |
1321 | static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, |
1322 | void *obj, |
1323 | unsigned int obj_type, |
1324 | unsigned int fan_flags, |
1325 | struct fan_fsid *fsid) |
1326 | { |
1327 | struct ucounts *ucounts = group->fanotify_data.ucounts; |
1328 | struct fanotify_mark *fan_mark; |
1329 | struct fsnotify_mark *mark; |
1330 | int ret; |
1331 | |
1332 | /* |
1333 | * Enforce per user marks limits per user in all containing user ns. |
1334 | * A group with FAN_UNLIMITED_MARKS does not contribute to mark count |
1335 | * in the limited groups account. |
1336 | */ |
1337 | BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_MARKS)); |
1338 | if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && |
1339 | !inc_ucount(ns: ucounts->ns, uid: ucounts->uid, type: UCOUNT_FANOTIFY_MARKS)) |
1340 | return ERR_PTR(error: -ENOSPC); |
1341 | |
1342 | fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); |
1343 | if (!fan_mark) { |
1344 | ret = -ENOMEM; |
1345 | goto out_dec_ucounts; |
1346 | } |
1347 | |
1348 | mark = &fan_mark->fsn_mark; |
1349 | fsnotify_init_mark(mark, group); |
1350 | if (fan_flags & FAN_MARK_EVICTABLE) |
1351 | mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; |
1352 | |
1353 | /* Cache fsid of filesystem containing the marked object */ |
1354 | if (fsid) { |
1355 | ret = fanotify_set_mark_fsid(group, mark, fsid); |
1356 | if (ret) |
1357 | goto out_put_mark; |
1358 | } else { |
1359 | fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; |
1360 | } |
1361 | |
1362 | ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags: 0); |
1363 | if (ret) |
1364 | goto out_put_mark; |
1365 | |
1366 | return mark; |
1367 | |
1368 | out_put_mark: |
1369 | fsnotify_put_mark(mark); |
1370 | out_dec_ucounts: |
1371 | if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) |
1372 | dec_ucount(ucounts, type: UCOUNT_FANOTIFY_MARKS); |
1373 | return ERR_PTR(error: ret); |
1374 | } |
1375 | |
1376 | static int fanotify_group_init_error_pool(struct fsnotify_group *group) |
1377 | { |
1378 | if (mempool_initialized(pool: &group->fanotify_data.error_events_pool)) |
1379 | return 0; |
1380 | |
1381 | return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool, |
1382 | FANOTIFY_DEFAULT_FEE_POOL_SIZE, |
1383 | sizeof(struct fanotify_error_event)); |
1384 | } |
1385 | |
1386 | static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, |
1387 | __u32 mask, unsigned int fan_flags) |
1388 | { |
1389 | /* |
1390 | * Non evictable mark cannot be downgraded to evictable mark. |
1391 | */ |
1392 | if (fan_flags & FAN_MARK_EVICTABLE && |
1393 | !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) |
1394 | return -EEXIST; |
1395 | |
1396 | /* |
1397 | * New ignore mask semantics cannot be downgraded to old semantics. |
1398 | */ |
1399 | if (fan_flags & FAN_MARK_IGNORED_MASK && |
1400 | fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) |
1401 | return -EEXIST; |
1402 | |
1403 | /* |
1404 | * An ignore mask that survives modify could never be downgraded to not |
1405 | * survive modify. With new FAN_MARK_IGNORE semantics we make that rule |
1406 | * explicit and return an error when trying to update the ignore mask |
1407 | * without the original FAN_MARK_IGNORED_SURV_MODIFY value. |
1408 | */ |
1409 | if (fan_flags & FAN_MARK_IGNORE && |
1410 | !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && |
1411 | fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) |
1412 | return -EEXIST; |
1413 | |
1414 | /* For now pre-content events are not generated for directories */ |
1415 | mask |= fsn_mark->mask; |
1416 | if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) |
1417 | return -EEXIST; |
1418 | |
1419 | return 0; |
1420 | } |
1421 | |
1422 | static int fanotify_add_mark(struct fsnotify_group *group, |
1423 | void *obj, unsigned int obj_type, |
1424 | __u32 mask, unsigned int fan_flags, |
1425 | struct fan_fsid *fsid) |
1426 | { |
1427 | struct fsnotify_mark *fsn_mark; |
1428 | bool recalc; |
1429 | int ret = 0; |
1430 | |
1431 | fsnotify_group_lock(group); |
1432 | fsn_mark = fsnotify_find_mark(obj, obj_type, group); |
1433 | if (!fsn_mark) { |
1434 | fsn_mark = fanotify_add_new_mark(group, obj, obj_type, |
1435 | fan_flags, fsid); |
1436 | if (IS_ERR(ptr: fsn_mark)) { |
1437 | fsnotify_group_unlock(group); |
1438 | return PTR_ERR(ptr: fsn_mark); |
1439 | } |
1440 | } |
1441 | |
1442 | /* |
1443 | * Check if requested mark flags conflict with an existing mark flags. |
1444 | */ |
1445 | ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags); |
1446 | if (ret) |
1447 | goto out; |
1448 | |
1449 | /* |
1450 | * Error events are pre-allocated per group, only if strictly |
1451 | * needed (i.e. FAN_FS_ERROR was requested). |
1452 | */ |
1453 | if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) && |
1454 | (mask & FAN_FS_ERROR)) { |
1455 | ret = fanotify_group_init_error_pool(group); |
1456 | if (ret) |
1457 | goto out; |
1458 | } |
1459 | |
1460 | recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags); |
1461 | if (recalc) |
1462 | fsnotify_recalc_mask(conn: fsn_mark->connector); |
1463 | |
1464 | out: |
1465 | fsnotify_group_unlock(group); |
1466 | |
1467 | fsnotify_put_mark(mark: fsn_mark); |
1468 | return ret; |
1469 | } |
1470 | |
1471 | static struct fsnotify_event *fanotify_alloc_overflow_event(void) |
1472 | { |
1473 | struct fanotify_event *oevent; |
1474 | |
1475 | oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT); |
1476 | if (!oevent) |
1477 | return NULL; |
1478 | |
1479 | fanotify_init_event(event: oevent, hash: 0, FS_Q_OVERFLOW); |
1480 | oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW; |
1481 | |
1482 | return &oevent->fse; |
1483 | } |
1484 | |
1485 | static struct hlist_head *fanotify_alloc_merge_hash(void) |
1486 | { |
1487 | struct hlist_head *hash; |
1488 | |
1489 | hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS, |
1490 | GFP_KERNEL_ACCOUNT); |
1491 | if (!hash) |
1492 | return NULL; |
1493 | |
1494 | __hash_init(ht: hash, FANOTIFY_HTABLE_SIZE); |
1495 | |
1496 | return hash; |
1497 | } |
1498 | |
1499 | /* fanotify syscalls */ |
1500 | SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) |
1501 | { |
1502 | struct user_namespace *user_ns = current_user_ns(); |
1503 | struct fsnotify_group *group; |
1504 | int f_flags, fd; |
1505 | unsigned int fid_mode = flags & FANOTIFY_FID_BITS; |
1506 | unsigned int class = flags & FANOTIFY_CLASS_BITS; |
1507 | unsigned int internal_flags = 0; |
1508 | struct file *file; |
1509 | |
1510 | pr_debug("%s: flags=%x event_f_flags=%x\n", |
1511 | __func__, flags, event_f_flags); |
1512 | |
1513 | if (!capable(CAP_SYS_ADMIN)) { |
1514 | /* |
1515 | * An unprivileged user can setup an fanotify group with |
1516 | * limited functionality - an unprivileged group is limited to |
1517 | * notification events with file handles or mount ids and it |
1518 | * cannot use unlimited queue/marks. |
1519 | */ |
1520 | if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || |
1521 | !(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT))) |
1522 | return -EPERM; |
1523 | |
1524 | /* |
1525 | * Setting the internal flag FANOTIFY_UNPRIV on the group |
1526 | * prevents setting mount/filesystem marks on this group and |
1527 | * prevents reporting pid and open fd in events. |
1528 | */ |
1529 | internal_flags |= FANOTIFY_UNPRIV; |
1530 | } |
1531 | |
1532 | #ifdef CONFIG_AUDITSYSCALL |
1533 | if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) |
1534 | #else |
1535 | if (flags & ~FANOTIFY_INIT_FLAGS) |
1536 | #endif |
1537 | return -EINVAL; |
1538 | |
1539 | /* |
1540 | * A pidfd can only be returned for a thread-group leader; thus |
1541 | * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually |
1542 | * exclusive. |
1543 | */ |
1544 | if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) |
1545 | return -EINVAL; |
1546 | |
1547 | /* Don't allow mixing mnt events with inode events for now */ |
1548 | if (flags & FAN_REPORT_MNT) { |
1549 | if (class != FAN_CLASS_NOTIF) |
1550 | return -EINVAL; |
1551 | if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR)) |
1552 | return -EINVAL; |
1553 | } |
1554 | |
1555 | if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) |
1556 | return -EINVAL; |
1557 | |
1558 | switch (event_f_flags & O_ACCMODE) { |
1559 | case O_RDONLY: |
1560 | case O_RDWR: |
1561 | case O_WRONLY: |
1562 | break; |
1563 | default: |
1564 | return -EINVAL; |
1565 | } |
1566 | |
1567 | if (fid_mode && class != FAN_CLASS_NOTIF) |
1568 | return -EINVAL; |
1569 | |
1570 | /* |
1571 | * Child name is reported with parent fid so requires dir fid. |
1572 | * We can report both child fid and dir fid with or without name. |
1573 | */ |
1574 | if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) |
1575 | return -EINVAL; |
1576 | |
1577 | /* |
1578 | * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID |
1579 | * and is used as an indication to report both dir and child fid on all |
1580 | * dirent events. |
1581 | */ |
1582 | if ((fid_mode & FAN_REPORT_TARGET_FID) && |
1583 | (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) |
1584 | return -EINVAL; |
1585 | |
1586 | f_flags = O_RDWR; |
1587 | if (flags & FAN_CLOEXEC) |
1588 | f_flags |= O_CLOEXEC; |
1589 | if (flags & FAN_NONBLOCK) |
1590 | f_flags |= O_NONBLOCK; |
1591 | |
1592 | /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ |
1593 | group = fsnotify_alloc_group(ops: &fanotify_fsnotify_ops, |
1594 | FSNOTIFY_GROUP_USER); |
1595 | if (IS_ERR(ptr: group)) { |
1596 | return PTR_ERR(ptr: group); |
1597 | } |
1598 | |
1599 | /* Enforce groups limits per user in all containing user ns */ |
1600 | group->fanotify_data.ucounts = inc_ucount(ns: user_ns, current_euid(), |
1601 | type: UCOUNT_FANOTIFY_GROUPS); |
1602 | if (!group->fanotify_data.ucounts) { |
1603 | fd = -EMFILE; |
1604 | goto out_destroy_group; |
1605 | } |
1606 | |
1607 | group->fanotify_data.flags = flags | internal_flags; |
1608 | group->memcg = get_mem_cgroup_from_mm(current->mm); |
1609 | group->user_ns = get_user_ns(ns: user_ns); |
1610 | |
1611 | group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); |
1612 | if (!group->fanotify_data.merge_hash) { |
1613 | fd = -ENOMEM; |
1614 | goto out_destroy_group; |
1615 | } |
1616 | |
1617 | group->overflow_event = fanotify_alloc_overflow_event(); |
1618 | if (unlikely(!group->overflow_event)) { |
1619 | fd = -ENOMEM; |
1620 | goto out_destroy_group; |
1621 | } |
1622 | |
1623 | if (force_o_largefile()) |
1624 | event_f_flags |= O_LARGEFILE; |
1625 | group->fanotify_data.f_flags = event_f_flags; |
1626 | init_waitqueue_head(&group->fanotify_data.access_waitq); |
1627 | INIT_LIST_HEAD(list: &group->fanotify_data.access_list); |
1628 | switch (class) { |
1629 | case FAN_CLASS_NOTIF: |
1630 | group->priority = FSNOTIFY_PRIO_NORMAL; |
1631 | break; |
1632 | case FAN_CLASS_CONTENT: |
1633 | group->priority = FSNOTIFY_PRIO_CONTENT; |
1634 | break; |
1635 | case FAN_CLASS_PRE_CONTENT: |
1636 | group->priority = FSNOTIFY_PRIO_PRE_CONTENT; |
1637 | break; |
1638 | default: |
1639 | fd = -EINVAL; |
1640 | goto out_destroy_group; |
1641 | } |
1642 | |
1643 | BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE)); |
1644 | if (flags & FAN_UNLIMITED_QUEUE) { |
1645 | group->max_events = UINT_MAX; |
1646 | } else { |
1647 | group->max_events = fanotify_max_queued_events; |
1648 | } |
1649 | |
1650 | if (flags & FAN_ENABLE_AUDIT) { |
1651 | fd = -EPERM; |
1652 | if (!capable(CAP_AUDIT_WRITE)) |
1653 | goto out_destroy_group; |
1654 | } |
1655 | |
1656 | fd = get_unused_fd_flags(flags: f_flags); |
1657 | if (fd < 0) |
1658 | goto out_destroy_group; |
1659 | |
1660 | file = anon_inode_getfile_fmode(name: "[fanotify]", fops: &fanotify_fops, priv: group, |
1661 | flags: f_flags, FMODE_NONOTIFY); |
1662 | if (IS_ERR(ptr: file)) { |
1663 | put_unused_fd(fd); |
1664 | fd = PTR_ERR(ptr: file); |
1665 | goto out_destroy_group; |
1666 | } |
1667 | fd_install(fd, file); |
1668 | return fd; |
1669 | |
1670 | out_destroy_group: |
1671 | fsnotify_destroy_group(group); |
1672 | return fd; |
1673 | } |
1674 | |
1675 | static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags, |
1676 | struct fan_fsid *fsid) |
1677 | { |
1678 | unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; |
1679 | __kernel_fsid_t root_fsid; |
1680 | int err; |
1681 | |
1682 | /* |
1683 | * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). |
1684 | */ |
1685 | err = vfs_get_fsid(dentry, fsid: &fsid->id); |
1686 | if (err) |
1687 | return err; |
1688 | |
1689 | fsid->sb = dentry->d_sb; |
1690 | if (!fsid->id.val[0] && !fsid->id.val[1]) { |
1691 | err = -ENODEV; |
1692 | goto weak; |
1693 | } |
1694 | |
1695 | /* |
1696 | * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) |
1697 | * which uses a different fsid than sb root. |
1698 | */ |
1699 | err = vfs_get_fsid(dentry: dentry->d_sb->s_root, fsid: &root_fsid); |
1700 | if (err) |
1701 | return err; |
1702 | |
1703 | if (!fanotify_fsid_equal(fsid1: &root_fsid, fsid2: &fsid->id)) { |
1704 | err = -EXDEV; |
1705 | goto weak; |
1706 | } |
1707 | |
1708 | fsid->weak = false; |
1709 | return 0; |
1710 | |
1711 | weak: |
1712 | /* Allow weak fsid when marking inodes */ |
1713 | fsid->weak = true; |
1714 | return (mark_type == FAN_MARK_INODE) ? 0 : err; |
1715 | } |
1716 | |
1717 | /* Check if filesystem can encode a unique fid */ |
1718 | static int fanotify_test_fid(struct dentry *dentry, unsigned int flags) |
1719 | { |
1720 | unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; |
1721 | const struct export_operations *nop = dentry->d_sb->s_export_op; |
1722 | |
1723 | /* |
1724 | * We need to make sure that the filesystem supports encoding of |
1725 | * file handles so user can use name_to_handle_at() to compare fids |
1726 | * reported with events to the file handle of watched objects. |
1727 | */ |
1728 | if (!exportfs_can_encode_fid(nop)) |
1729 | return -EOPNOTSUPP; |
1730 | |
1731 | /* |
1732 | * For sb/mount mark, we also need to make sure that the filesystem |
1733 | * supports decoding file handles, so user has a way to map back the |
1734 | * reported fids to filesystem objects. |
1735 | */ |
1736 | if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop)) |
1737 | return -EOPNOTSUPP; |
1738 | |
1739 | return 0; |
1740 | } |
1741 | |
1742 | static int fanotify_events_supported(struct fsnotify_group *group, |
1743 | const struct path *path, __u64 mask, |
1744 | unsigned int flags) |
1745 | { |
1746 | unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; |
1747 | bool is_dir = d_is_dir(dentry: path->dentry); |
1748 | /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ |
1749 | bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || |
1750 | (mask & FAN_RENAME) || |
1751 | (flags & FAN_MARK_IGNORE); |
1752 | |
1753 | /* |
1754 | * Filesystems need to opt-into pre-content evnets (a.k.a HSM) |
1755 | * and they are only supported on regular files and directories. |
1756 | */ |
1757 | if (mask & FANOTIFY_PRE_CONTENT_EVENTS) { |
1758 | if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM)) |
1759 | return -EOPNOTSUPP; |
1760 | if (!is_dir && !d_is_reg(dentry: path->dentry)) |
1761 | return -EINVAL; |
1762 | } |
1763 | |
1764 | /* |
1765 | * Some filesystems such as 'proc' acquire unusual locks when opening |
1766 | * files. For them fanotify permission events have high chances of |
1767 | * deadlocking the system - open done when reporting fanotify event |
1768 | * blocks on this "unusual" lock while another process holding the lock |
1769 | * waits for fanotify permission event to be answered. Just disallow |
1770 | * permission events for such filesystems. |
1771 | */ |
1772 | if (mask & FANOTIFY_PERM_EVENTS && |
1773 | path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM) |
1774 | return -EINVAL; |
1775 | |
1776 | /* |
1777 | * mount and sb marks are not allowed on kernel internal pseudo fs, |
1778 | * like pipe_mnt, because that would subscribe to events on all the |
1779 | * anonynous pipes in the system. |
1780 | * |
1781 | * SB_NOUSER covers all of the internal pseudo fs whose objects are not |
1782 | * exposed to user's mount namespace, but there are other SB_KERNMOUNT |
1783 | * fs, like nsfs, debugfs, for which the value of allowing sb and mount |
1784 | * mark is questionable. For now we leave them alone. |
1785 | */ |
1786 | if (mark_type != FAN_MARK_INODE && |
1787 | path->mnt->mnt_sb->s_flags & SB_NOUSER) |
1788 | return -EINVAL; |
1789 | |
1790 | /* |
1791 | * We shouldn't have allowed setting dirent events and the directory |
1792 | * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, |
1793 | * but because we always allowed it, error only when using new APIs. |
1794 | */ |
1795 | if (strict_dir_events && mark_type == FAN_MARK_INODE && |
1796 | !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) |
1797 | return -ENOTDIR; |
1798 | |
1799 | return 0; |
1800 | } |
1801 | |
1802 | static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, |
1803 | int dfd, const char __user *pathname) |
1804 | { |
1805 | struct inode *inode = NULL; |
1806 | struct fsnotify_group *group; |
1807 | struct path path; |
1808 | struct fan_fsid __fsid, *fsid = NULL; |
1809 | struct user_namespace *user_ns = NULL; |
1810 | struct mnt_namespace *mntns; |
1811 | u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; |
1812 | unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; |
1813 | unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; |
1814 | unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; |
1815 | unsigned int obj_type, fid_mode; |
1816 | void *obj = NULL; |
1817 | u32 umask = 0; |
1818 | int ret; |
1819 | |
1820 | pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", |
1821 | __func__, fanotify_fd, flags, dfd, pathname, mask); |
1822 | |
1823 | /* we only use the lower 32 bits as of right now. */ |
1824 | if (upper_32_bits(mask)) |
1825 | return -EINVAL; |
1826 | |
1827 | if (flags & ~FANOTIFY_MARK_FLAGS) |
1828 | return -EINVAL; |
1829 | |
1830 | switch (mark_type) { |
1831 | case FAN_MARK_INODE: |
1832 | obj_type = FSNOTIFY_OBJ_TYPE_INODE; |
1833 | break; |
1834 | case FAN_MARK_MOUNT: |
1835 | obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT; |
1836 | break; |
1837 | case FAN_MARK_FILESYSTEM: |
1838 | obj_type = FSNOTIFY_OBJ_TYPE_SB; |
1839 | break; |
1840 | case FAN_MARK_MNTNS: |
1841 | obj_type = FSNOTIFY_OBJ_TYPE_MNTNS; |
1842 | break; |
1843 | default: |
1844 | return -EINVAL; |
1845 | } |
1846 | |
1847 | switch (mark_cmd) { |
1848 | case FAN_MARK_ADD: |
1849 | case FAN_MARK_REMOVE: |
1850 | if (!mask) |
1851 | return -EINVAL; |
1852 | break; |
1853 | case FAN_MARK_FLUSH: |
1854 | if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH)) |
1855 | return -EINVAL; |
1856 | break; |
1857 | default: |
1858 | return -EINVAL; |
1859 | } |
1860 | |
1861 | if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) |
1862 | valid_mask |= FANOTIFY_PERM_EVENTS; |
1863 | |
1864 | if (mask & ~valid_mask) |
1865 | return -EINVAL; |
1866 | |
1867 | |
1868 | /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ |
1869 | if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) |
1870 | return -EINVAL; |
1871 | |
1872 | /* |
1873 | * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with |
1874 | * FAN_MARK_IGNORED_MASK. |
1875 | */ |
1876 | if (ignore == FAN_MARK_IGNORED_MASK) { |
1877 | mask &= ~FANOTIFY_EVENT_FLAGS; |
1878 | umask = FANOTIFY_EVENT_FLAGS; |
1879 | } |
1880 | |
1881 | CLASS(fd, f)(fd: fanotify_fd); |
1882 | if (fd_empty(f)) |
1883 | return -EBADF; |
1884 | |
1885 | /* verify that this is indeed an fanotify instance */ |
1886 | if (unlikely(fd_file(f)->f_op != &fanotify_fops)) |
1887 | return -EINVAL; |
1888 | group = fd_file(f)->private_data; |
1889 | |
1890 | /* Only report mount events on mnt namespace */ |
1891 | if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { |
1892 | if (mask & ~FANOTIFY_MOUNT_EVENTS) |
1893 | return -EINVAL; |
1894 | if (mark_type != FAN_MARK_MNTNS) |
1895 | return -EINVAL; |
1896 | } else { |
1897 | if (mask & FANOTIFY_MOUNT_EVENTS) |
1898 | return -EINVAL; |
1899 | if (mark_type == FAN_MARK_MNTNS) |
1900 | return -EINVAL; |
1901 | } |
1902 | |
1903 | /* |
1904 | * A user is allowed to setup sb/mount/mntns marks only if it is |
1905 | * capable in the user ns where the group was created. |
1906 | */ |
1907 | if (!ns_capable(ns: group->user_ns, CAP_SYS_ADMIN) && |
1908 | mark_type != FAN_MARK_INODE) |
1909 | return -EPERM; |
1910 | |
1911 | /* |
1912 | * Permission events are not allowed for FAN_CLASS_NOTIF. |
1913 | * Pre-content permission events are not allowed for FAN_CLASS_CONTENT. |
1914 | */ |
1915 | if (mask & FANOTIFY_PERM_EVENTS && |
1916 | group->priority == FSNOTIFY_PRIO_NORMAL) |
1917 | return -EINVAL; |
1918 | else if (mask & FANOTIFY_PRE_CONTENT_EVENTS && |
1919 | group->priority == FSNOTIFY_PRIO_CONTENT) |
1920 | return -EINVAL; |
1921 | |
1922 | if (mask & FAN_FS_ERROR && |
1923 | mark_type != FAN_MARK_FILESYSTEM) |
1924 | return -EINVAL; |
1925 | |
1926 | /* |
1927 | * Evictable is only relevant for inode marks, because only inode object |
1928 | * can be evicted on memory pressure. |
1929 | */ |
1930 | if (flags & FAN_MARK_EVICTABLE && |
1931 | mark_type != FAN_MARK_INODE) |
1932 | return -EINVAL; |
1933 | |
1934 | /* |
1935 | * Events that do not carry enough information to report |
1936 | * event->fd require a group that supports reporting fid. Those |
1937 | * events are not supported on a mount mark, because they do not |
1938 | * carry enough information (i.e. path) to be filtered by mount |
1939 | * point. |
1940 | */ |
1941 | fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); |
1942 | if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) && |
1943 | (!fid_mode || mark_type == FAN_MARK_MOUNT)) |
1944 | return -EINVAL; |
1945 | |
1946 | /* |
1947 | * FAN_RENAME uses special info type records to report the old and |
1948 | * new parent+name. Reporting only old and new parent id is less |
1949 | * useful and was not implemented. |
1950 | */ |
1951 | if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) |
1952 | return -EINVAL; |
1953 | |
1954 | /* Pre-content events are not currently generated for directories. */ |
1955 | if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) |
1956 | return -EINVAL; |
1957 | |
1958 | if (mark_cmd == FAN_MARK_FLUSH) { |
1959 | fsnotify_clear_marks_by_group(group, obj_type); |
1960 | return 0; |
1961 | } |
1962 | |
1963 | ret = fanotify_find_path(dfd, filename: pathname, path: &path, flags, |
1964 | mask: (mask & ALL_FSNOTIFY_EVENTS), obj_type); |
1965 | if (ret) |
1966 | return ret; |
1967 | |
1968 | if (mark_cmd == FAN_MARK_ADD) { |
1969 | ret = fanotify_events_supported(group, path: &path, mask, flags); |
1970 | if (ret) |
1971 | goto path_put_and_out; |
1972 | } |
1973 | |
1974 | if (fid_mode) { |
1975 | ret = fanotify_test_fsid(dentry: path.dentry, flags, fsid: &__fsid); |
1976 | if (ret) |
1977 | goto path_put_and_out; |
1978 | |
1979 | ret = fanotify_test_fid(dentry: path.dentry, flags); |
1980 | if (ret) |
1981 | goto path_put_and_out; |
1982 | |
1983 | fsid = &__fsid; |
1984 | } |
1985 | |
1986 | /* |
1987 | * In addition to being capable in the user ns where group was created, |
1988 | * the user also needs to be capable in the user ns associated with |
1989 | * the filesystem or in the user ns associated with the mntns |
1990 | * (when marking mntns). |
1991 | */ |
1992 | if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) { |
1993 | inode = path.dentry->d_inode; |
1994 | obj = inode; |
1995 | } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { |
1996 | user_ns = path.mnt->mnt_sb->s_user_ns; |
1997 | obj = path.mnt; |
1998 | } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) { |
1999 | user_ns = path.mnt->mnt_sb->s_user_ns; |
2000 | obj = path.mnt->mnt_sb; |
2001 | } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { |
2002 | mntns = mnt_ns_from_dentry(dentry: path.dentry); |
2003 | user_ns = mntns->user_ns; |
2004 | obj = mntns; |
2005 | } |
2006 | |
2007 | ret = -EPERM; |
2008 | if (user_ns && !ns_capable(ns: user_ns, CAP_SYS_ADMIN)) |
2009 | goto path_put_and_out; |
2010 | |
2011 | ret = -EINVAL; |
2012 | if (!obj) |
2013 | goto path_put_and_out; |
2014 | |
2015 | /* |
2016 | * If some other task has this inode open for write we should not add |
2017 | * an ignore mask, unless that ignore mask is supposed to survive |
2018 | * modification changes anyway. |
2019 | */ |
2020 | if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) && |
2021 | !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) { |
2022 | ret = !inode ? -EINVAL : -EISDIR; |
2023 | /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ |
2024 | if (ignore == FAN_MARK_IGNORE && |
2025 | (!inode || S_ISDIR(inode->i_mode))) |
2026 | goto path_put_and_out; |
2027 | |
2028 | ret = 0; |
2029 | if (inode && inode_is_open_for_write(inode)) |
2030 | goto path_put_and_out; |
2031 | } |
2032 | |
2033 | /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ |
2034 | if (!inode || !S_ISDIR(inode->i_mode)) { |
2035 | mask &= ~FAN_EVENT_ON_CHILD; |
2036 | umask = FAN_EVENT_ON_CHILD; |
2037 | /* |
2038 | * If group needs to report parent fid, register for getting |
2039 | * events with parent/name info for non-directory. |
2040 | */ |
2041 | if ((fid_mode & FAN_REPORT_DIR_FID) && |
2042 | (flags & FAN_MARK_ADD) && !ignore) |
2043 | mask |= FAN_EVENT_ON_CHILD; |
2044 | } |
2045 | |
2046 | /* create/update an inode mark */ |
2047 | switch (mark_cmd) { |
2048 | case FAN_MARK_ADD: |
2049 | ret = fanotify_add_mark(group, obj, obj_type, mask, fan_flags: flags, |
2050 | fsid); |
2051 | break; |
2052 | case FAN_MARK_REMOVE: |
2053 | ret = fanotify_remove_mark(group, obj, obj_type, mask, flags, |
2054 | umask); |
2055 | break; |
2056 | default: |
2057 | ret = -EINVAL; |
2058 | } |
2059 | |
2060 | path_put_and_out: |
2061 | path_put(&path); |
2062 | return ret; |
2063 | } |
2064 | |
2065 | #ifndef CONFIG_ARCH_SPLIT_ARG64 |
2066 | SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, |
2067 | __u64, mask, int, dfd, |
2068 | const char __user *, pathname) |
2069 | { |
2070 | return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname); |
2071 | } |
2072 | #endif |
2073 | |
#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
/*
 * fanotify_mark() system call entry point built when CONFIG_ARCH_SPLIT_ARG64
 * or CONFIG_COMPAT is set.  Here @mask is declared through SC_ARG64() and has
 * to be rebuilt into a single __u64 with SC_VAL64() before the common
 * do_fanotify_mark() implementation can be called.
 */
SYSCALL32_DEFINE6(fanotify_mark,
		  int, fanotify_fd,
		  unsigned int, flags,
		  SC_ARG64(mask),
		  int, dfd,
		  const char __user *, pathname)
{
	__u64 full_mask = SC_VAL64(__u64, mask);

	return do_fanotify_mark(fanotify_fd, flags, full_mask, dfd, pathname);
}
#endif /* CONFIG_ARCH_SPLIT_ARG64 || CONFIG_COMPAT */
2084 | |
2085 | /* |
2086 | * fanotify_user_setup - Our initialization function. Note that we cannot return |
2087 | * error because we have compiled-in VFS hooks. So an (unlikely) failure here |
2088 | * must result in panic(). |
2089 | */ |
2090 | static int __init fanotify_user_setup(void) |
2091 | { |
2092 | struct sysinfo si; |
2093 | int max_marks; |
2094 | |
2095 | si_meminfo(val: &si); |
2096 | /* |
2097 | * Allow up to 1% of addressable memory to be accounted for per user |
2098 | * marks limited to the range [8192, 1048576]. mount and sb marks are |
2099 | * a lot cheaper than inode marks, but there is no reason for a user |
2100 | * to have many of those, so calculate by the cost of inode marks. |
2101 | */ |
2102 | max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / |
2103 | INODE_MARK_COST; |
2104 | max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, |
2105 | FANOTIFY_DEFAULT_MAX_USER_MARKS); |
2106 | |
2107 | BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); |
2108 | BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14); |
2109 | BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); |
2110 | |
2111 | fanotify_mark_cache = KMEM_CACHE(fanotify_mark, |
2112 | SLAB_PANIC|SLAB_ACCOUNT); |
2113 | fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, |
2114 | SLAB_PANIC); |
2115 | fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event, |
2116 | SLAB_PANIC); |
2117 | if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { |
2118 | fanotify_perm_event_cachep = |
2119 | KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); |
2120 | } |
2121 | fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC); |
2122 | |
2123 | fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; |
2124 | init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = |
2125 | FANOTIFY_DEFAULT_MAX_GROUPS; |
2126 | init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; |
2127 | fanotify_sysctls_init(); |
2128 | |
2129 | return 0; |
2130 | } |
2131 | device_initcall(fanotify_user_setup); |
2132 |
Definitions
- fanotify_max_queued_events
- ft_zero
- ft_int_max
- fanotify_table
- fanotify_sysctls_init
- fanotify_mark_cache
- fanotify_fid_event_cachep
- fanotify_path_event_cachep
- fanotify_perm_event_cachep
- fanotify_mnt_event_cachep
- fanotify_fid_info_len
- fanotify_dir_name_info_len
- fanotify_event_len
- fanotify_unhash_event
- get_one_event
- create_fd
- process_access_response_info
- finish_permission_event
- process_access_response
- copy_mnt_info_to_user
- copy_error_info_to_user
- copy_fid_info_to_user
- copy_pidfd_info_to_user
- copy_range_info_to_user
- copy_info_records_to_user
- copy_event_to_user
- fanotify_poll
- fanotify_read
- fanotify_write
- fanotify_release
- fanotify_ioctl
- fanotify_fops
- fanotify_find_path
- fanotify_mark_remove_from_mask
- fanotify_remove_mark
- fanotify_mark_update_flags
- fanotify_mark_add_to_mask
- fan_fsid
- fanotify_set_mark_fsid
- fanotify_add_new_mark
- fanotify_group_init_error_pool
- fanotify_may_update_existing_mark
- fanotify_add_mark
- fanotify_alloc_overflow_event
- fanotify_alloc_merge_hash
- fanotify_test_fsid
- fanotify_test_fid
- fanotify_events_supported
- do_fanotify_mark
Improve your Profiling and Debugging skills
Find out more