1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _FS_CEPH_MDS_CLIENT_H
3#define _FS_CEPH_MDS_CLIENT_H
4
5#include <linux/completion.h>
6#include <linux/kref.h>
7#include <linux/list.h>
8#include <linux/mutex.h>
9#include <linux/rbtree.h>
10#include <linux/spinlock.h>
11#include <linux/refcount.h>
12#include <linux/utsname.h>
13#include <linux/ktime.h>
14
15#include <linux/ceph/types.h>
16#include <linux/ceph/messenger.h>
17#include <linux/ceph/auth.h>
18
19#include "mdsmap.h"
20#include "metric.h"
21#include "super.h"
22
/*
 * CephFS feature bit numbers.
 *
 * The first 8 bits (0-7) are reserved for old ceph releases, so the
 * numbering here starts at 8.  Every value is written out explicitly so
 * a bit number can be read off without counting enumerators.
 */
enum ceph_feature_type {
	CEPHFS_FEATURE_MIMIC			= 8,
	CEPHFS_FEATURE_REPLY_ENCODING		= 9,
	CEPHFS_FEATURE_RECLAIM_CLIENT		= 10,
	CEPHFS_FEATURE_LAZY_CAP_WANTED		= 11,
	CEPHFS_FEATURE_MULTI_RECONNECT		= 12,
	CEPHFS_FEATURE_DELEG_INO		= 13,
	CEPHFS_FEATURE_METRIC_COLLECT		= 14,
	CEPHFS_FEATURE_ALTERNATE_NAME		= 15,
	CEPHFS_FEATURE_NOTIFY_SESSION_STATE	= 16,
	CEPHFS_FEATURE_OP_GETVXATTR		= 17,
	CEPHFS_FEATURE_32BITS_RETRY_FWD		= 18,
	CEPHFS_FEATURE_NEW_SNAPREALM_INFO	= 19,
	CEPHFS_FEATURE_HAS_OWNER_UIDGID		= 20,

	/* alias for the highest feature bit defined above */
	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
};
41
/*
 * Brace-enclosed initializer list of the feature bits this client
 * supports: the eight reserved legacy bits (0-7) followed by the newer
 * features from enum ceph_feature_type.
 *
 * CEPHFS_FEATURE_RECLAIM_CLIENT and CEPHFS_FEATURE_NEW_SNAPREALM_INFO are
 * declared in the enum but are not part of this list.
 */
#define CEPHFS_FEATURES_CLIENT_SUPPORTED {				\
	0, 1, 2, 3,							\
	4, 5, 6, 7,							\
	CEPHFS_FEATURE_MIMIC, CEPHFS_FEATURE_REPLY_ENCODING,		\
	CEPHFS_FEATURE_LAZY_CAP_WANTED, CEPHFS_FEATURE_MULTI_RECONNECT,	\
	CEPHFS_FEATURE_DELEG_INO, CEPHFS_FEATURE_METRIC_COLLECT,	\
	CEPHFS_FEATURE_ALTERNATE_NAME,					\
	CEPHFS_FEATURE_NOTIFY_SESSION_STATE,				\
	CEPHFS_FEATURE_OP_GETVXATTR, CEPHFS_FEATURE_32BITS_RETRY_FWD,	\
	CEPHFS_FEATURE_HAS_OWNER_UIDGID,				\
}
56
57/*
58 * Some lock dependencies:
59 *
60 * session->s_mutex
61 * mdsc->mutex
62 *
63 * mdsc->snap_rwsem
64 *
65 * ci->i_ceph_lock
66 * mdsc->snap_flush_lock
67 * mdsc->cap_delay_lock
68 *
69 */
70
/* Opaque forward declarations; only pointers to these are used below. */
struct ceph_cap;
struct ceph_fs_client;
73
74/*
75 * parsed info about a single inode. pointers are into the encoded
76 * on-wire structures within the mds reply message payload.
77 */
78struct ceph_mds_reply_info_in {
79 struct ceph_mds_reply_inode *in;
80 struct ceph_dir_layout dir_layout;
81 u32 symlink_len;
82 char *symlink;
83 u32 xattr_len;
84 char *xattr_data;
85 u64 inline_version;
86 u32 inline_len;
87 char *inline_data;
88 u32 pool_ns_len;
89 char *pool_ns_data;
90 u64 max_bytes;
91 u64 max_files;
92 s32 dir_pin;
93 struct ceph_timespec btime;
94 struct ceph_timespec snap_btime;
95 u8 *fscrypt_auth;
96 u8 *fscrypt_file;
97 u32 fscrypt_auth_len;
98 u32 fscrypt_file_len;
99 u64 rsnaps;
100 u64 change_attr;
101};
102
103struct ceph_mds_reply_dir_entry {
104 bool is_nokey;
105 char *name;
106 u32 name_len;
107 u32 raw_hash;
108 struct ceph_mds_reply_lease *lease;
109 struct ceph_mds_reply_info_in inode;
110 loff_t offset;
111};
112
/* An xattr value carried in a reply: a data pointer and its length. */
struct ceph_mds_reply_xattr {
	char	*xattr_value;
	size_t	xattr_value_len;
};
117
118/*
119 * parsed info about an mds reply, including information about
120 * either: 1) the target inode and/or its parent directory and dentry,
121 * and directory contents (for readdir results), or
122 * 2) the file range lock info (for fcntl F_GETLK results).
123 */
124struct ceph_mds_reply_info_parsed {
125 struct ceph_mds_reply_head *head;
126
127 /* trace */
128 struct ceph_mds_reply_info_in diri, targeti;
129 struct ceph_mds_reply_dirfrag *dirfrag;
130 char *dname;
131 u8 *altname;
132 u32 dname_len;
133 u32 altname_len;
134 struct ceph_mds_reply_lease *dlease;
135 struct ceph_mds_reply_xattr xattr_info;
136
137 /* extra */
138 union {
139 /* for fcntl F_GETLK results */
140 struct ceph_filelock *filelock_reply;
141
142 /* for readdir results */
143 struct {
144 struct ceph_mds_reply_dirfrag *dir_dir;
145 size_t dir_buf_size;
146 int dir_nr;
147 bool dir_end;
148 bool dir_complete;
149 bool hash_order;
150 bool offset_hash;
151 struct ceph_mds_reply_dir_entry *dir_entries;
152 };
153
154 /* for create results */
155 struct {
156 bool has_create_ino;
157 u64 ino;
158 };
159 };
160
161 /* encoded blob describing snapshot contexts for certain
162 operations (e.g., open) */
163 void *snapblob;
164 int snapblob_len;
165};
166
167
/*
 * Cap releases are batched and sent to the MDS en masse.
 *
 * This is the number of cap items that fit in one page after the
 * per-message overhead is taken off: the mds_cap_release header and the
 * 32-bit trailing field for the osd epoch barrier.
 */
#define CEPH_CAPS_PER_RELEASE						\
	((PAGE_SIZE - sizeof(struct ceph_mds_cap_release) - sizeof(u32)) / \
	 sizeof(struct ceph_mds_cap_item))
177
178
/*
 * States of an MDS<->client session.
 */
enum {
	CEPH_MDS_SESSION_NEW		= 1,
	CEPH_MDS_SESSION_OPENING	= 2,
	CEPH_MDS_SESSION_OPEN		= 3,
	CEPH_MDS_SESSION_HUNG		= 4,
	CEPH_MDS_SESSION_RESTARTING	= 5,
	CEPH_MDS_SESSION_RECONNECTING	= 6,
	CEPH_MDS_SESSION_CLOSING	= 7,
	CEPH_MDS_SESSION_CLOSED		= 8,
	CEPH_MDS_SESSION_REJECTED	= 9,
};
193
194struct ceph_mds_session {
195 struct ceph_mds_client *s_mdsc;
196 int s_mds;
197 int s_state;
198 unsigned long s_ttl; /* time until mds kills us */
199 unsigned long s_features;
200 u64 s_seq; /* incoming msg seq # */
201 struct mutex s_mutex; /* serialize session messages */
202
203 struct ceph_connection s_con;
204
205 struct ceph_auth_handshake s_auth;
206
207 atomic_t s_cap_gen; /* inc each time we get mds stale msg */
208 unsigned long s_cap_ttl; /* when session caps expire. protected by s_mutex */
209
210 /* protected by s_cap_lock */
211 spinlock_t s_cap_lock;
212 refcount_t s_ref;
213 struct list_head s_caps; /* all caps issued by this session */
214 struct ceph_cap *s_cap_iterator;
215 int s_nr_caps;
216 int s_num_cap_releases;
217 int s_cap_reconnect;
218 int s_readonly;
219 struct list_head s_cap_releases; /* waiting cap_release messages */
220 struct work_struct s_cap_release_work;
221
222 /* See ceph_inode_info->i_dirty_item. */
223 struct list_head s_cap_dirty; /* inodes w/ dirty caps */
224
225 /* See ceph_inode_info->i_flushing_item. */
226 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
227
228 unsigned long s_renew_requested; /* last time we sent a renew req */
229 u64 s_renew_seq;
230
231 struct list_head s_waiting; /* waiting requests */
232 struct list_head s_unsafe; /* unsafe requests */
233 struct xarray s_delegated_inos;
234};
235
/*
 * How to choose the MDS a request is sent to.
 */
enum {
	USE_ANY_MDS	= 0,
	USE_RANDOM_MDS	= 1,
	/* prefer authoritative mds for this metadata item */
	USE_AUTH_MDS	= 2,
};
244
struct ceph_mds_client;
struct ceph_mds_request;

/*
 * Callback type for request completion.
 */
typedef void (*ceph_mds_request_callback_t)(struct ceph_mds_client *mdsc,
					    struct ceph_mds_request *req);

/*
 * Callback type used to wait for request completion; returns an int.
 */
typedef int (*ceph_mds_request_wait_callback_t)(struct ceph_mds_client *mdsc,
						struct ceph_mds_request *req);
258
259/*
260 * an in-flight mds request
261 */
262struct ceph_mds_request {
263 u64 r_tid; /* transaction id */
264 struct rb_node r_node;
265 struct ceph_mds_client *r_mdsc;
266
267 struct kref r_kref;
268 int r_op; /* mds op code */
269
270 /* operation on what? */
271 struct inode *r_inode; /* arg1 */
272 struct dentry *r_dentry; /* arg1 */
273 struct dentry *r_old_dentry; /* arg2: rename from or link from */
274 struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */
275 char *r_path1, *r_path2;
276 struct ceph_vino r_ino1, r_ino2;
277
278 struct inode *r_parent; /* parent dir inode */
279 struct inode *r_target_inode; /* resulting inode */
280 struct inode *r_new_inode; /* new inode (for creates) */
281
282#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */
283#define CEPH_MDS_R_ABORTED (2) /* call was aborted */
284#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */
285#define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */
286#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */
287#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
288#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
289#define CEPH_MDS_R_ASYNC (8) /* async request */
290#define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */
291 unsigned long r_req_flags;
292
293 struct mutex r_fill_mutex;
294
295 union ceph_mds_request_args r_args;
296
297 struct ceph_fscrypt_auth *r_fscrypt_auth;
298 u64 r_fscrypt_file;
299
300 u8 *r_altname; /* fscrypt binary crypttext for long filenames */
301 u32 r_altname_len; /* length of r_altname */
302
303 int r_fmode; /* file mode, if expecting cap */
304 int r_request_release_offset;
305 const struct cred *r_cred;
306 struct mnt_idmap *r_mnt_idmap;
307 struct timespec64 r_stamp;
308
309 /* for choosing which mds to send this request to */
310 int r_direct_mode;
311 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
312
313 /* data payload is used for xattr ops */
314 struct ceph_pagelist *r_pagelist;
315
316 /* what caps shall we drop? */
317 int r_inode_drop, r_inode_unless;
318 int r_dentry_drop, r_dentry_unless;
319 int r_old_dentry_drop, r_old_dentry_unless;
320 struct inode *r_old_inode;
321 int r_old_inode_drop, r_old_inode_unless;
322
323 struct ceph_msg *r_request; /* original request */
324 struct ceph_msg *r_reply;
325 struct ceph_mds_reply_info_parsed r_reply_info;
326 int r_err;
327 u32 r_readdir_offset;
328
329 struct page *r_locked_page;
330 int r_dir_caps;
331 int r_num_caps;
332
333 unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
334 unsigned long r_started; /* start time to measure timeout against */
335 unsigned long r_start_latency; /* start time to measure latency */
336 unsigned long r_end_latency; /* finish time to measure latency */
337 unsigned long r_request_started; /* start time for mds request only,
338 used to measure lease durations */
339
340 /* link unsafe requests to parent directory, for fsync */
341 struct inode *r_unsafe_dir;
342 struct list_head r_unsafe_dir_item;
343
344 /* unsafe requests that modify the target inode */
345 struct list_head r_unsafe_target_item;
346
347 struct ceph_mds_session *r_session;
348
349 int r_attempts; /* resend attempts */
350 int r_num_fwd; /* number of forward attempts */
351 int r_resend_mds; /* mds to resend to next, if any*/
352 u32 r_sent_on_mseq; /* cap mseq request was sent at*/
353 u64 r_deleg_ino;
354
355 struct list_head r_wait;
356 struct completion r_completion;
357 struct completion r_safe_completion;
358 ceph_mds_request_callback_t r_callback;
359 struct list_head r_unsafe_item; /* per-session unsafe list item */
360
361 long long r_dir_release_cnt;
362 long long r_dir_ordered_cnt;
363 int r_readdir_cache_idx;
364
365 int r_feature_needed;
366
367 struct ceph_cap_reservation r_caps_reservation;
368};
369
370struct ceph_pool_perm {
371 struct rb_node node;
372 int perm;
373 s64 pool;
374 size_t pool_ns_len;
375 char pool_ns[];
376};
377
378struct ceph_snapid_map {
379 struct rb_node node;
380 struct list_head lru;
381 atomic_t ref;
382 dev_t dev;
383 u64 snap;
384 unsigned long last_used;
385};
386
387/*
388 * node for list of quotarealm inodes that are not visible from the filesystem
389 * mountpoint, but required to handle, e.g. quotas.
390 */
391struct ceph_quotarealm_inode {
392 struct rb_node node;
393 u64 ino;
394 unsigned long timeout; /* last time a lookup failed for this inode */
395 struct mutex mutex;
396 struct inode *inode;
397};
398
399struct cap_wait {
400 struct list_head list;
401 u64 ino;
402 pid_t tgid;
403 int need;
404 int want;
405};
406
/*
 * Stages of stopping the mds client (presumably the values stored in
 * ceph_mds_client->stopping -- confirm against the users).
 */
enum {
	CEPH_MDSC_STOPPING_BEGIN	= 1,
	CEPH_MDSC_STOPPING_FLUSHING	= 2,
	CEPH_MDSC_STOPPING_FLUSHED	= 3,
};
412
413/*
414 * mds client state
415 */
416struct ceph_mds_client {
417 struct ceph_fs_client *fsc;
418 struct mutex mutex; /* all nested structures */
419
420 struct ceph_mdsmap *mdsmap;
421 struct completion safe_umount_waiters;
422 wait_queue_head_t session_close_wq;
423 struct list_head waiting_for_map;
424 int mdsmap_err;
425
426 struct ceph_mds_session **sessions; /* NULL for mds if no session */
427 atomic_t num_sessions;
428 int max_sessions; /* len of sessions array */
429
430 spinlock_t stopping_lock; /* protect snap_empty */
431 int stopping; /* the stage of shutting down */
432 atomic_t stopping_blockers;
433 struct completion stopping_waiter;
434
435 atomic64_t quotarealms_count; /* # realms with quota */
436 /*
437 * We keep a list of inodes we don't see in the mountpoint but that we
438 * need to track quota realms.
439 */
440 struct rb_root quotarealms_inodes;
441 struct mutex quotarealms_inodes_mutex;
442
443 /*
444 * snap_rwsem will cover cap linkage into snaprealms, and
445 * realm snap contexts. (later, we can do per-realm snap
446 * contexts locks..) the empty list contains realms with no
447 * references (implying they contain no inodes with caps) that
448 * should be destroyed.
449 */
450 u64 last_snap_seq;
451 struct rw_semaphore snap_rwsem;
452 struct rb_root snap_realms;
453 struct list_head snap_empty;
454 int num_snap_realms;
455 spinlock_t snap_empty_lock; /* protect snap_empty */
456
457 u64 last_tid; /* most recent mds request */
458 u64 oldest_tid; /* oldest incomplete mds request,
459 excluding setfilelock requests */
460 struct rb_root request_tree; /* pending mds requests */
461 struct delayed_work delayed_work; /* delayed work */
462 unsigned long last_renew_caps; /* last time we renewed our caps */
463 struct list_head cap_delay_list; /* caps with delayed release */
464 struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */
465 spinlock_t cap_delay_lock; /* protects cap_delay_list and cap_unlink_delay_list */
466 struct list_head snap_flush_list; /* cap_snaps ready to flush */
467 spinlock_t snap_flush_lock;
468
469 u64 last_cap_flush_tid;
470 struct list_head cap_flush_list;
471 struct list_head cap_dirty_migrating; /* ...that are migration... */
472 int num_cap_flushing; /* # caps we are flushing */
473 spinlock_t cap_dirty_lock; /* protects above items */
474 wait_queue_head_t cap_flushing_wq;
475
476 struct work_struct cap_reclaim_work;
477 atomic_t cap_reclaim_pending;
478
479 struct work_struct cap_unlink_work;
480
481 /*
482 * Cap reservations
483 *
484 * Maintain a global pool of preallocated struct ceph_caps, referenced
485 * by struct ceph_caps_reservations. This ensures that we preallocate
486 * memory needed to successfully process an MDS response. (If an MDS
487 * sends us cap information and we fail to process it, we will have
488 * problems due to the client and MDS being out of sync.)
489 *
490 * Reservations are 'owned' by a ceph_cap_reservation context.
491 */
492 spinlock_t caps_list_lock;
493 struct list_head caps_list; /* unused (reserved or
494 unreserved) */
495 struct list_head cap_wait_list;
496 int caps_total_count; /* total caps allocated */
497 int caps_use_count; /* in use */
498 int caps_use_max; /* max used caps */
499 int caps_reserve_count; /* unused, reserved */
500 int caps_avail_count; /* unused, unreserved */
501 int caps_min_count; /* keep at least this many
502 (unreserved) */
503 spinlock_t dentry_list_lock;
504 struct list_head dentry_leases; /* fifo list */
505 struct list_head dentry_dir_leases; /* lru list */
506
507 struct ceph_client_metric metric;
508
509 spinlock_t snapid_map_lock;
510 struct rb_root snapid_map_tree;
511 struct list_head snapid_map_lru;
512
513 struct rw_semaphore pool_perm_rwsem;
514 struct rb_root pool_perm_tree;
515
516 char nodename[__NEW_UTS_LEN + 1];
517};
518
519extern const char *ceph_mds_op_name(int op);
520
521extern bool check_session_state(struct ceph_mds_session *s);
522void inc_session_sequence(struct ceph_mds_session *s);
523
524extern struct ceph_mds_session *
525__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
526
527extern const char *ceph_session_state_name(int s);
528
529extern struct ceph_mds_session *
530ceph_get_mds_session(struct ceph_mds_session *s);
531extern void ceph_put_mds_session(struct ceph_mds_session *s);
532
533extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
534 struct ceph_msg *msg, int mds);
535
536extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
537extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
538extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
539extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
540
541extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
542
543extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
544extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
545 struct inode *dir);
546extern struct ceph_mds_request *
547ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
548extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
549 struct inode *dir,
550 struct ceph_mds_request *req);
551int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
552 struct ceph_mds_request *req,
553 ceph_mds_request_wait_callback_t wait_func);
554extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
555 struct inode *dir,
556 struct ceph_mds_request *req);
557extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
558extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req);
559static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
560{
561 kref_get(kref: &req->r_kref);
562}
563extern void ceph_mdsc_release_request(struct kref *kref);
564static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
565{
566 kref_put(kref: &req->r_kref, release: ceph_mdsc_release_request);
567}
568
569extern void send_flush_mdlog(struct ceph_mds_session *s);
570extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
571 void (*cb)(struct ceph_mds_session *),
572 bool check_state);
573extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
574extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
575 struct ceph_cap *cap);
576extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
577 struct ceph_mds_session *session);
578extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
579extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
580extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc);
581extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
582 int (*cb)(struct inode *, int mds, void *),
583 void *arg);
584extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
585
586static inline void ceph_mdsc_free_path(char *path, int len)
587{
588 if (!IS_ERR_OR_NULL(ptr: path))
589 __putname(path - (PATH_MAX - 1 - len));
590}
591
592extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
593 struct dentry *dentry, int *plen, u64 *base,
594 int for_wire);
595
596extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
597extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
598 struct dentry *dentry, char action,
599 u32 seq);
600
601extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc,
602 struct ceph_msg *msg);
603extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc,
604 struct ceph_msg *msg);
605
606extern struct ceph_mds_session *
607ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
608extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
609 struct ceph_mds_session *session);
610
611extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
612 struct ceph_mds_session *session,
613 int max_caps);
614
615static inline int ceph_wait_on_async_create(struct inode *inode)
616{
617 struct ceph_inode_info *ci = ceph_inode(inode);
618
619 return wait_on_bit(word: &ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
620 TASK_KILLABLE);
621}
622
623extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
624extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
625extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
626
627extern bool enable_unsafe_idmap;
628#endif
629

/* source code of linux/fs/ceph/mds_client.h */