caps.c source code [linux/fs/ceph/caps.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/ceph/ceph_debug.h>
3
4	#include <linux/fs.h>
5	#include <linux/kernel.h>
6	#include <linux/sched/signal.h>
7	#include <linux/slab.h>
8	#include <linux/vmalloc.h>
9	#include <linux/wait.h>
10	#include <linux/writeback.h>
11	#include <linux/iversion.h>
12	#include <linux/filelock.h>
13
14	#include "super.h"
15	#include "mds_client.h"
16	#include "cache.h"
17	#include "crypto.h"
18	#include <linux/ceph/decode.h>
19	#include <linux/ceph/messenger.h>
20
21	/*
22	* Capability management
23	*
24	* The Ceph metadata servers control client access to inode metadata
25	* and file data by issuing capabilities, granting clients permission
26	* to read and/or write both inode field and file data to OSDs
27	* (storage nodes). Each capability consists of a set of bits
28	* indicating which operations are allowed.
29	*
30	* If the client holds a *_SHARED cap, the client has a coherent value
31	* that can be safely read from the cached inode.
32	*
33	* In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
34	* client is allowed to change inode attributes (e.g., file size,
35	* mtime), note its dirty state in the ceph_cap, and asynchronously
36	* flush that metadata change to the MDS.
37	*
38	* In the event of a conflicting operation (perhaps by another
39	* client), the MDS will revoke the conflicting client capabilities.
40	*
41	* In order for a client to cache an inode, it must hold a capability
42	* with at least one MDS server. When inodes are released, release
43	* notifications are batched and periodically sent en masse to the MDS
44	* cluster to release server state.
45	*/
46
47	static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
48	static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
49	struct ceph_mds_session *session,
50	struct ceph_inode_info *ci,
51	u64 oldest_flush_tid);
52
53	/*
54	* Generate readable cap strings for debugging output.
55	*/
56	#define MAX_CAP_STR 20
57	static char cap_str[MAX_CAP_STR][`40`];
58	static DEFINE_SPINLOCK(cap_str_lock);
59	static int last_cap_str;
60
61	static char gcap_string(char* s, int* c)
62	{
63	if (c & CEPH_CAP_GSHARED)
64	*s++ = `'s'`;
65	if (c & CEPH_CAP_GEXCL)
66	*s++ = `'x'`;
67	if (c & CEPH_CAP_GCACHE)
68	*s++ = `'c'`;
69	if (c & CEPH_CAP_GRD)
70	*s++ = `'r'`;
71	if (c & CEPH_CAP_GWR)
72	*s++ = `'w'`;
73	if (c & CEPH_CAP_GBUFFER)
74	*s++ = `'b'`;
75	if (c & CEPH_CAP_GWREXTEND)
76	*s++ = `'a'`;
77	if (c & CEPH_CAP_GLAZYIO)
78	*s++ = `'l'`;
79	return s;
80	}
81
82	const char ceph_cap_string(int* caps)
83	{
84	int i;
85	char *s;
86	int c;
87
88	spin_lock(lock: &cap_str_lock);
89	i = last_cap_str++;
90	if (last_cap_str == MAX_CAP_STR)
91	last_cap_str = `0`;
92	spin_unlock(lock: &cap_str_lock);
93
94	s = cap_str[i];
95
96	if (caps & CEPH_CAP_PIN)
97	*s++ = `'p'`;
98
99	c = (caps >> CEPH_CAP_SAUTH) & `3`;
100	if (c) {
101	*s++ = `'A'`;
102	s = gcap_string(s, c);
103	}
104
105	c = (caps >> CEPH_CAP_SLINK) & `3`;
106	if (c) {
107	*s++ = `'L'`;
108	s = gcap_string(s, c);
109	}
110
111	c = (caps >> CEPH_CAP_SXATTR) & `3`;
112	if (c) {
113	*s++ = `'X'`;
114	s = gcap_string(s, c);
115	}
116
117	c = caps >> CEPH_CAP_SFILE;
118	if (c) {
119	*s++ = `'F'`;
120	s = gcap_string(s, c);
121	}
122
123	if (s == cap_str[i])
124	*s++ = `'-'`;
125	*s = `0`;
126	return cap_str[i];
127	}
128
129	void ceph_caps_init(struct ceph_mds_client *mdsc)
130	{
131	INIT_LIST_HEAD(list: &mdsc->caps_list);
132	spin_lock_init(&mdsc->caps_list_lock);
133	}
134
135	void ceph_caps_finalize(struct ceph_mds_client *mdsc)
136	{
137	struct ceph_cap *cap;
138
139	spin_lock(lock: &mdsc->caps_list_lock);
140	while (!list_empty(head: &mdsc->caps_list)) {
141	cap = list_first_entry(&mdsc->caps_list,
142	struct ceph_cap, caps_item);
143	list_del(entry: &cap->caps_item);
144	kmem_cache_free(s: ceph_cap_cachep, objp: cap);
145	}
146	mdsc->caps_total_count = `0`;
147	mdsc->caps_avail_count = `0`;
148	mdsc->caps_use_count = `0`;
149	mdsc->caps_reserve_count = `0`;
150	mdsc->caps_min_count = `0`;
151	spin_unlock(lock: &mdsc->caps_list_lock);
152	}
153
154	void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
155	struct ceph_mount_options *fsopt)
156	{
157	spin_lock(lock: &mdsc->caps_list_lock);
158	mdsc->caps_min_count = fsopt->max_readdir;
159	if (mdsc->caps_min_count < `1024`)
160	mdsc->caps_min_count = `1024`;
161	mdsc->caps_use_max = fsopt->caps_max;
162	if (mdsc->caps_use_max > `0` &&
163	mdsc->caps_use_max < mdsc->caps_min_count)
164	mdsc->caps_use_max = mdsc->caps_min_count;
165	spin_unlock(lock: &mdsc->caps_list_lock);
166	}
167
168	static void __ceph_unreserve_caps(struct ceph_mds_client mdsc, int* nr_caps)
169	{
170	struct ceph_cap *cap;
171	int i;
172
173	if (nr_caps) {
174	BUG_ON(mdsc->caps_reserve_count < nr_caps);
175	mdsc->caps_reserve_count -= nr_caps;
176	if (mdsc->caps_avail_count >=
177	mdsc->caps_reserve_count + mdsc->caps_min_count) {
178	mdsc->caps_total_count -= nr_caps;
179	for (i = `0`; i < nr_caps; i++) {
180	cap = list_first_entry(&mdsc->caps_list,
181	struct ceph_cap, caps_item);
182	list_del(entry: &cap->caps_item);
183	kmem_cache_free(s: ceph_cap_cachep, objp: cap);
184	}
185	} else {
186	mdsc->caps_avail_count += nr_caps;
187	}
188
189	doutc(mdsc->fsc->client,
190	"caps %d = %d used + %d resv + %d avail\n",
191	mdsc->caps_total_count, mdsc->caps_use_count,
192	mdsc->caps_reserve_count, mdsc->caps_avail_count);
193	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
194	mdsc->caps_reserve_count +
195	mdsc->caps_avail_count);
196	}
197	}
198
199	/*
200	* Called under mdsc->mutex.
201	*/
202	int ceph_reserve_caps(struct ceph_mds_client *mdsc,
203	struct ceph_cap_reservation ctx, int* need)
204	{
205	struct ceph_client *cl = mdsc->fsc->client;
206	int i, j;
207	struct ceph_cap *cap;
208	int have;
209	int alloc = `0`;
210	int max_caps;
211	int err = `0`;
212	bool trimmed = false;
213	struct ceph_mds_session *s;
214	LIST_HEAD(newcaps);
215
216	doutc(cl, "ctx=%p need=%d\n", ctx, need);
217
218	/ first reserve any caps that are already allocated /
219	spin_lock(lock: &mdsc->caps_list_lock);
220	if (mdsc->caps_avail_count >= need)
221	have = need;
222	else
223	have = mdsc->caps_avail_count;
224	mdsc->caps_avail_count -= have;
225	mdsc->caps_reserve_count += have;
226	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
227	mdsc->caps_reserve_count +
228	mdsc->caps_avail_count);
229	spin_unlock(lock: &mdsc->caps_list_lock);
230
231	for (i = have; i < need; ) {
232	cap = kmem_cache_alloc(cachep: ceph_cap_cachep, GFP_NOFS);
233	if (cap) {
234	list_add(new: &cap->caps_item, head: &newcaps);
235	alloc++;
236	i++;
237	continue;
238	}
239
240	if (!trimmed) {
241	for (j = `0`; j < mdsc->max_sessions; j++) {
242	s = __ceph_lookup_mds_session(mdsc, mds: j);
243	if (!s)
244	continue;
245	mutex_unlock(lock: &mdsc->mutex);
246
247	mutex_lock(&s->s_mutex);
248	max_caps = s->s_nr_caps - (need - i);
249	ceph_trim_caps(mdsc, session: s, max_caps);
250	mutex_unlock(lock: &s->s_mutex);
251
252	ceph_put_mds_session(s);
253	mutex_lock(&mdsc->mutex);
254	}
255	trimmed = true;
256
257	spin_lock(lock: &mdsc->caps_list_lock);
258	if (mdsc->caps_avail_count) {
259	int more_have;
260	if (mdsc->caps_avail_count >= need - i)
261	more_have = need - i;
262	else
263	more_have = mdsc->caps_avail_count;
264
265	i += more_have;
266	have += more_have;
267	mdsc->caps_avail_count -= more_have;
268	mdsc->caps_reserve_count += more_have;
269
270	}
271	spin_unlock(lock: &mdsc->caps_list_lock);
272
273	continue;
274	}
275
276	pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need,
277	have + alloc);
278	err = -ENOMEM;
279	break;
280	}
281
282	if (!err) {
283	BUG_ON(have + alloc != need);
284	ctx->count = need;
285	ctx->used = `0`;
286	}
287
288	spin_lock(lock: &mdsc->caps_list_lock);
289	mdsc->caps_total_count += alloc;
290	mdsc->caps_reserve_count += alloc;
291	list_splice(list: &newcaps, head: &mdsc->caps_list);
292
293	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
294	mdsc->caps_reserve_count +
295	mdsc->caps_avail_count);
296
297	if (err)
298	__ceph_unreserve_caps(mdsc, nr_caps: have + alloc);
299
300	spin_unlock(lock: &mdsc->caps_list_lock);
301
302	doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx,
303	mdsc->caps_total_count, mdsc->caps_use_count,
304	mdsc->caps_reserve_count, mdsc->caps_avail_count);
305	return err;
306	}
307
308	void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
309	struct ceph_cap_reservation *ctx)
310	{
311	struct ceph_client *cl = mdsc->fsc->client;
312	bool reclaim = false;
313	if (!ctx->count)
314	return;
315
316	doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count);
317	spin_lock(lock: &mdsc->caps_list_lock);
318	__ceph_unreserve_caps(mdsc, nr_caps: ctx->count);
319	ctx->count = `0`;
320
321	if (mdsc->caps_use_max > `0` &&
322	mdsc->caps_use_count > mdsc->caps_use_max)
323	reclaim = true;
324	spin_unlock(lock: &mdsc->caps_list_lock);
325
326	if (reclaim)
327	ceph_reclaim_caps_nr(mdsc, nr: ctx->used);
328	}
329
330	struct ceph_cap ceph_get_cap(struct* ceph_mds_client *mdsc,
331	struct ceph_cap_reservation *ctx)
332	{
333	struct ceph_client *cl = mdsc->fsc->client;
334	struct ceph_cap *cap = NULL;
335
336	/ temporary, until we do something about cap import/export /
337	if (!ctx) {
338	cap = kmem_cache_alloc(cachep: ceph_cap_cachep, GFP_NOFS);
339	if (cap) {
340	spin_lock(lock: &mdsc->caps_list_lock);
341	mdsc->caps_use_count++;
342	mdsc->caps_total_count++;
343	spin_unlock(lock: &mdsc->caps_list_lock);
344	} else {
345	spin_lock(lock: &mdsc->caps_list_lock);
346	if (mdsc->caps_avail_count) {
347	BUG_ON(list_empty(&mdsc->caps_list));
348
349	mdsc->caps_avail_count--;
350	mdsc->caps_use_count++;
351	cap = list_first_entry(&mdsc->caps_list,
352	struct ceph_cap, caps_item);
353	list_del(entry: &cap->caps_item);
354
355	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
356	mdsc->caps_reserve_count + mdsc->caps_avail_count);
357	}
358	spin_unlock(lock: &mdsc->caps_list_lock);
359	}
360
361	return cap;
362	}
363
364	spin_lock(lock: &mdsc->caps_list_lock);
365	doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx,
366	ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
367	mdsc->caps_reserve_count, mdsc->caps_avail_count);
368	BUG_ON(!ctx->count);
369	BUG_ON(ctx->count > mdsc->caps_reserve_count);
370	BUG_ON(list_empty(&mdsc->caps_list));
371
372	ctx->count--;
373	ctx->used++;
374	mdsc->caps_reserve_count--;
375	mdsc->caps_use_count++;
376
377	cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
378	list_del(entry: &cap->caps_item);
379
380	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
381	mdsc->caps_reserve_count + mdsc->caps_avail_count);
382	spin_unlock(lock: &mdsc->caps_list_lock);
383	return cap;
384	}
385
386	void ceph_put_cap(struct ceph_mds_client mdsc, struct* ceph_cap *cap)
387	{
388	struct ceph_client *cl = mdsc->fsc->client;
389
390	spin_lock(lock: &mdsc->caps_list_lock);
391	doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap,
392	mdsc->caps_total_count, mdsc->caps_use_count,
393	mdsc->caps_reserve_count, mdsc->caps_avail_count);
394	mdsc->caps_use_count--;
395	/*
396	* Keep some preallocated caps around (ceph_min_count), to
397	* avoid lots of free/alloc churn.
398	*/
399	if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
400	mdsc->caps_min_count) {
401	mdsc->caps_total_count--;
402	kmem_cache_free(s: ceph_cap_cachep, objp: cap);
403	} else {
404	mdsc->caps_avail_count++;
405	list_add(new: &cap->caps_item, head: &mdsc->caps_list);
406	}
407
408	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
409	mdsc->caps_reserve_count + mdsc->caps_avail_count);
410	spin_unlock(lock: &mdsc->caps_list_lock);
411	}
412
413	void ceph_reservation_status(struct ceph_fs_client *fsc,
414	int total, int* avail, int* used, int* *reserved,
415	int *min)
416	{
417	struct ceph_mds_client *mdsc = fsc->mdsc;
418
419	spin_lock(lock: &mdsc->caps_list_lock);
420
421	if (total)
422	*total = mdsc->caps_total_count;
423	if (avail)
424	*avail = mdsc->caps_avail_count;
425	if (used)
426	*used = mdsc->caps_use_count;
427	if (reserved)
428	*reserved = mdsc->caps_reserve_count;
429	if (min)
430	*min = mdsc->caps_min_count;
431
432	spin_unlock(lock: &mdsc->caps_list_lock);
433	}
434
435	/*
436	* Find ceph_cap for given mds, if any.
437	*
438	* Called with i_ceph_lock held.
439	*/
440	struct ceph_cap __get_cap_for_mds(struct* ceph_inode_info ci, int* mds)
441	{
442	struct ceph_cap *cap;
443	struct rb_node *n = ci->i_caps.rb_node;
444
445	while (n) {
446	cap = rb_entry(n, struct ceph_cap, ci_node);
447	if (mds < cap->mds)
448	n = n->rb_left;
449	else if (mds > cap->mds)
450	n = n->rb_right;
451	else
452	return cap;
453	}
454	return NULL;
455	}
456
457	struct ceph_cap ceph_get_cap_for_mds(struct* ceph_inode_info ci, int* mds)
458	{
459	struct ceph_cap *cap;
460
461	spin_lock(lock: &ci->i_ceph_lock);
462	cap = __get_cap_for_mds(ci, mds);
463	spin_unlock(lock: &ci->i_ceph_lock);
464	return cap;
465	}
466
467	/*
468	* Called under i_ceph_lock.
469	*/
470	static void __insert_cap_node(struct ceph_inode_info *ci,
471	struct ceph_cap *new)
472	{
473	struct rb_node **p = &ci->i_caps.rb_node;
474	struct rb_node *parent = NULL;
475	struct ceph_cap *cap = NULL;
476
477	while (*p) {
478	parent = *p;
479	cap = rb_entry(parent, struct ceph_cap, ci_node);
480	if (new->mds < cap->mds)
481	p = &(*p)->rb_left;
482	else if (new->mds > cap->mds)
483	p = &(*p)->rb_right;
484	else
485	BUG();
486	}
487
488	rb_link_node(node: &new->ci_node, parent, rb_link: p);
489	rb_insert_color(&new->ci_node, &ci->i_caps);
490	}
491
492	/*
493	* (re)set cap hold timeouts, which control the delayed release
494	* of unused caps back to the MDS. Should be called on cap use.
495	*/
496	static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
497	struct ceph_inode_info *ci)
498	{
499	struct inode *inode = &ci->netfs.inode;
500	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
501
502	ci->i_hold_caps_max = round_jiffies(j: jiffies +
503	opt->caps_wanted_delay_max * HZ);
504	doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode,
505	ceph_vinop(inode), ci->i_hold_caps_max - jiffies);
506	}
507
508	/*
509	* (Re)queue cap at the end of the delayed cap release list.
510	*
511	* If I_FLUSH is set, leave the inode at the front of the list.
512	*
513	* Caller holds i_ceph_lock
514	* -> we take mdsc->cap_delay_lock
515	*/
516	static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
517	struct ceph_inode_info *ci)
518	{
519	struct inode *inode = &ci->netfs.inode;
520
521	doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n",
522	inode, ceph_vinop(inode), ci->i_ceph_flags,
523	ci->i_hold_caps_max);
524	if (!mdsc->stopping) {
525	spin_lock(lock: &mdsc->cap_delay_lock);
526	if (!list_empty(head: &ci->i_cap_delay_list)) {
527	if (ci->i_ceph_flags & CEPH_I_FLUSH)
528	goto no_change;
529	list_del_init(entry: &ci->i_cap_delay_list);
530	}
531	__cap_set_timeouts(mdsc, ci);
532	list_add_tail(new: &ci->i_cap_delay_list, head: &mdsc->cap_delay_list);
533	no_change:
534	spin_unlock(lock: &mdsc->cap_delay_lock);
535	}
536	}
537
538	/*
539	* Queue an inode for immediate writeback. Mark inode with I_FLUSH,
540	* indicating we should send a cap message to flush dirty metadata
541	* asap, and move to the front of the delayed cap list.
542	*/
543	static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
544	struct ceph_inode_info *ci)
545	{
546	struct inode *inode = &ci->netfs.inode;
547
548	doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
549	spin_lock(lock: &mdsc->cap_delay_lock);
550	ci->i_ceph_flags \|= CEPH_I_FLUSH;
551	if (!list_empty(head: &ci->i_cap_delay_list))
552	list_del_init(entry: &ci->i_cap_delay_list);
553	list_add(new: &ci->i_cap_delay_list, head: &mdsc->cap_delay_list);
554	spin_unlock(lock: &mdsc->cap_delay_lock);
555	}
556
557	/*
558	* Cancel delayed work on cap.
559	*
560	* Caller must hold i_ceph_lock.
561	*/
562	static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
563	struct ceph_inode_info *ci)
564	{
565	struct inode *inode = &ci->netfs.inode;
566
567	doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
568	if (list_empty(head: &ci->i_cap_delay_list))
569	return;
570	spin_lock(lock: &mdsc->cap_delay_lock);
571	list_del_init(entry: &ci->i_cap_delay_list);
572	spin_unlock(lock: &mdsc->cap_delay_lock);
573	}
574
575	/ Common issue checks for add_cap, handle_cap_grant. /
576	static void __check_cap_issue(struct ceph_inode_info ci, struct* ceph_cap *cap,
577	unsigned issued)
578	{
579	struct inode *inode = &ci->netfs.inode;
580	struct ceph_client *cl = ceph_inode_to_client(inode);
581
582	unsigned had = __ceph_caps_issued(ci, NULL);
583
584	lockdep_assert_held(&ci->i_ceph_lock);
585
586	/*
587	* Each time we receive FILE_CACHE anew, we increment
588	* i_rdcache_gen.
589	*/
590	if (S_ISREG(ci->netfs.inode.i_mode) &&
591	(issued & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) &&
592	(had & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) == `0`) {
593	ci->i_rdcache_gen++;
594	}
595
596	/*
597	* If FILE_SHARED is newly issued, mark dir not complete. We don't
598	* know what happened to this directory while we didn't have the cap.
599	* If FILE_SHARED is being revoked, also mark dir not complete. It
600	* stops on-going cached readdir.
601	*/
602	if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
603	if (issued & CEPH_CAP_FILE_SHARED)
604	atomic_inc(v: &ci->i_shared_gen);
605	if (S_ISDIR(ci->netfs.inode.i_mode)) {
606	doutc(cl, " marking %p NOT complete\n", inode);
607	__ceph_dir_clear_complete(ci);
608	}
609	}
610
611	/ Wipe saved layout if we're losing DIR_CREATE caps /
612	if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
613	!(issued & CEPH_CAP_DIR_CREATE)) {
614	ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
615	memset(&ci->i_cached_layout, `0`, sizeof(ci->i_cached_layout));
616	}
617	}
618
619	/**
620	* change_auth_cap_ses - move inode to appropriate lists when auth caps change
621	* @ci: inode to be moved
622	* @session: new auth caps session
623	*/
624	void change_auth_cap_ses(struct ceph_inode_info *ci,
625	struct ceph_mds_session *session)
626	{
627	lockdep_assert_held(&ci->i_ceph_lock);
628
629	if (list_empty(head: &ci->i_dirty_item) && list_empty(head: &ci->i_flushing_item))
630	return;
631
632	spin_lock(lock: &session->s_mdsc->cap_dirty_lock);
633	if (!list_empty(head: &ci->i_dirty_item))
634	list_move(list: &ci->i_dirty_item, head: &session->s_cap_dirty);
635	if (!list_empty(head: &ci->i_flushing_item))
636	list_move_tail(list: &ci->i_flushing_item, head: &session->s_cap_flushing);
637	spin_unlock(lock: &session->s_mdsc->cap_dirty_lock);
638	}
639
640	/*
641	* Add a capability under the given MDS session.
642	*
643	* Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
644	*
645	* @fmode is the open file mode, if we are opening a file, otherwise
646	* it is < 0. (This is so we can atomically add the cap and add an
647	* open file reference to it.)
648	*/
649	void ceph_add_cap(struct inode *inode,
650	struct ceph_mds_session *session, u64 cap_id,
651	unsigned issued, unsigned wanted,
652	unsigned seq, unsigned mseq, u64 realmino, int flags,
653	struct ceph_cap **new_cap)
654	{
655	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
656	struct ceph_client *cl = ceph_inode_to_client(inode);
657	struct ceph_inode_info *ci = ceph_inode(inode);
658	struct ceph_cap *cap;
659	int mds = session->s_mds;
660	int actual_wanted;
661	u32 gen;
662
663	lockdep_assert_held(&ci->i_ceph_lock);
664
665	doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode,
666	ceph_vinop(inode), session->s_mds, cap_id,
667	ceph_cap_string(issued), seq);
668
669	gen = atomic_read(v: &session->s_cap_gen);
670
671	cap = __get_cap_for_mds(ci, mds);
672	if (!cap) {
673	cap = *new_cap;
674	*new_cap = NULL;
675
676	cap->issued = `0`;
677	cap->implemented = `0`;
678	cap->mds = mds;
679	cap->mds_wanted = `0`;
680	cap->mseq = `0`;
681
682	cap->ci = ci;
683	__insert_cap_node(ci, new: cap);
684
685	/ add to session cap list /
686	cap->session = session;
687	spin_lock(lock: &session->s_cap_lock);
688	list_add_tail(new: &cap->session_caps, head: &session->s_caps);
689	session->s_nr_caps++;
690	atomic64_inc(v: &mdsc->metric.total_caps);
691	spin_unlock(lock: &session->s_cap_lock);
692	} else {
693	spin_lock(lock: &session->s_cap_lock);
694	list_move_tail(list: &cap->session_caps, head: &session->s_caps);
695	spin_unlock(lock: &session->s_cap_lock);
696
697	if (cap->cap_gen < gen)
698	cap->issued = cap->implemented = CEPH_CAP_PIN;
699
700	/*
701	* auth mds of the inode changed. we received the cap export
702	* message, but still haven't received the cap import message.
703	* handle_cap_export() updated the new auth MDS' cap.
704	*
705	* "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
706	* a message that was send before the cap import message. So
707	* don't remove caps.
708	*/
709	if (ceph_seq_cmp(a: seq, b: cap->seq) <= `0`) {
710	WARN_ON(cap != ci->i_auth_cap);
711	WARN_ON(cap->cap_id != cap_id);
712	seq = cap->seq;
713	mseq = cap->mseq;
714	issued \|= cap->issued;
715	flags \|= CEPH_CAP_FLAG_AUTH;
716	}
717	}
718
719	if (!ci->i_snap_realm \|\|
720	((flags & CEPH_CAP_FLAG_AUTH) &&
721	realmino != (u64)-`1` && ci->i_snap_realm->ino != realmino)) {
722	/*
723	* add this inode to the appropriate snap realm
724	*/
725	struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
726	ino: realmino);
727	if (realm)
728	ceph_change_snap_realm(inode, realm);
729	else
730	WARN(`1`, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
731	__func__, realmino, ci->i_vino.ino,
732	ci->i_snap_realm ? ci->i_snap_realm->ino : `0`);
733	}
734
735	__check_cap_issue(ci, cap, issued);
736
737	/*
738	* If we are issued caps we don't want, or the mds' wanted
739	* value appears to be off, queue a check so we'll release
740	* later and/or update the mds wanted value.
741	*/
742	actual_wanted = __ceph_caps_wanted(ci);
743	if ((wanted & ~actual_wanted) \|\|
744	(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
745	doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n",
746	ceph_cap_string(issued), ceph_cap_string(wanted),
747	ceph_cap_string(actual_wanted));
748	__cap_delay_requeue(mdsc, ci);
749	}
750
751	if (flags & CEPH_CAP_FLAG_AUTH) {
752	if (!ci->i_auth_cap \|\|
753	ceph_seq_cmp(a: ci->i_auth_cap->mseq, b: mseq) < `0`) {
754	if (ci->i_auth_cap &&
755	ci->i_auth_cap->session != cap->session)
756	change_auth_cap_ses(ci, session: cap->session);
757	ci->i_auth_cap = cap;
758	cap->mds_wanted = wanted;
759	}
760	} else {
761	WARN_ON(ci->i_auth_cap == cap);
762	}
763
764	doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n",
765	inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
766	ceph_cap_string(issued\|cap->issued), seq, mds);
767	cap->cap_id = cap_id;
768	cap->issued = issued;
769	cap->implemented \|= issued;
770	if (ceph_seq_cmp(a: mseq, b: cap->mseq) > `0`)
771	cap->mds_wanted = wanted;
772	else
773	cap->mds_wanted \|= wanted;
774	cap->seq = seq;
775	cap->issue_seq = seq;
776	cap->mseq = mseq;
777	cap->cap_gen = gen;
778	wake_up_all(&ci->i_cap_wq);
779	}
780
781	/*
782	* Return true if cap has not timed out and belongs to the current
783	* generation of the MDS session (i.e. has not gone 'stale' due to
784	* us losing touch with the mds).
785	*/
786	static int __cap_is_valid(struct ceph_cap *cap)
787	{
788	struct inode *inode = &cap->ci->netfs.inode;
789	struct ceph_client *cl = cap->session->s_mdsc->fsc->client;
790	unsigned long ttl;
791	u32 gen;
792
793	gen = atomic_read(v: &cap->session->s_cap_gen);
794	ttl = cap->session->s_cap_ttl;
795
796	if (cap->cap_gen < gen \|\| time_after_eq(jiffies, ttl)) {
797	doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n",
798	inode, ceph_vinop(inode), cap,
799	ceph_cap_string(cap->issued), cap->cap_gen, gen);
800	return `0`;
801	}
802
803	return `1`;
804	}
805
806	/*
807	* Return set of valid cap bits issued to us. Note that caps time
808	* out, and may be invalidated in bulk if the client session times out
809	* and session->s_cap_gen is bumped.
810	*/
811	int __ceph_caps_issued(struct ceph_inode_info ci, int* *implemented)
812	{
813	struct inode *inode = &ci->netfs.inode;
814	struct ceph_client *cl = ceph_inode_to_client(inode);
815	int have = ci->i_snap_caps;
816	struct ceph_cap *cap;
817	struct rb_node *p;
818
819	if (implemented)
820	*implemented = `0`;
821	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
822	cap = rb_entry(p, struct ceph_cap, ci_node);
823	if (!__cap_is_valid(cap))
824	continue;
825	doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode,
826	ceph_vinop(inode), cap, ceph_cap_string(cap->issued));
827	have \|= cap->issued;
828	if (implemented)
829	*implemented \|= cap->implemented;
830	}
831	/*
832	* exclude caps issued by non-auth MDS, but are been revoking
833	* by the auth MDS. The non-auth MDS should be revoking/exporting
834	* these caps, but the message is delayed.
835	*/
836	if (ci->i_auth_cap) {
837	cap = ci->i_auth_cap;
838	have &= ~cap->implemented \| cap->issued;
839	}
840	return have;
841	}
842
843	/*
844	* Get cap bits issued by caps other than @ocap
845	*/
846	int __ceph_caps_issued_other(struct ceph_inode_info ci, struct* ceph_cap *ocap)
847	{
848	int have = ci->i_snap_caps;
849	struct ceph_cap *cap;
850	struct rb_node *p;
851
852	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
853	cap = rb_entry(p, struct ceph_cap, ci_node);
854	if (cap == ocap)
855	continue;
856	if (!__cap_is_valid(cap))
857	continue;
858	have \|= cap->issued;
859	}
860	return have;
861	}
862
863	/*
864	* Move a cap to the end of the LRU (oldest caps at list head, newest
865	* at list tail).
866	*/
867	static void __touch_cap(struct ceph_cap *cap)
868	{
869	struct inode *inode = &cap->ci->netfs.inode;
870	struct ceph_mds_session *s = cap->session;
871	struct ceph_client *cl = s->s_mdsc->fsc->client;
872
873	spin_lock(lock: &s->s_cap_lock);
874	if (!s->s_cap_iterator) {
875	doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode,
876	ceph_vinop(inode), cap, s->s_mds);
877	list_move_tail(list: &cap->session_caps, head: &s->s_caps);
878	} else {
879	doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n",
880	inode, ceph_vinop(inode), cap, s->s_mds);
881	}
882	spin_unlock(lock: &s->s_cap_lock);
883	}
884
885	/*
886	* Check if we hold the given mask. If so, move the cap(s) to the
887	* front of their respective LRUs. (This is the preferred way for
888	* callers to check for caps they want.)
889	*/
890	int __ceph_caps_issued_mask(struct ceph_inode_info ci, int* mask, int touch)
891	{
892	struct inode *inode = &ci->netfs.inode;
893	struct ceph_client *cl = ceph_inode_to_client(inode);
894	struct ceph_cap *cap;
895	struct rb_node *p;
896	int have = ci->i_snap_caps;
897
898	if ((have & mask) == mask) {
899	doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n",
900	inode, ceph_vinop(inode), ceph_cap_string(have),
901	ceph_cap_string(mask));
902	return `1`;
903	}
904
905	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
906	cap = rb_entry(p, struct ceph_cap, ci_node);
907	if (!__cap_is_valid(cap))
908	continue;
909	if ((cap->issued & mask) == mask) {
910	doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n",
911	inode, ceph_vinop(inode), cap,
912	ceph_cap_string(cap->issued),
913	ceph_cap_string(mask));
914	if (touch)
915	__touch_cap(cap);
916	return `1`;
917	}
918
919	/ does a combination of caps satisfy mask? /
920	have \|= cap->issued;
921	if ((have & mask) == mask) {
922	doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n",
923	inode, ceph_vinop(inode),
924	ceph_cap_string(cap->issued),
925	ceph_cap_string(mask));
926	if (touch) {
927	struct rb_node *q;
928
929	/ touch this + preceding caps /
930	__touch_cap(cap);
931	for (q = rb_first(&ci->i_caps); q != p;
932	q = rb_next(q)) {
933	cap = rb_entry(q, struct ceph_cap,
934	ci_node);
935	if (!__cap_is_valid(cap))
936	continue;
937	if (cap->issued & mask)
938	__touch_cap(cap);
939	}
940	}
941	return `1`;
942	}
943	}
944
945	return `0`;
946	}
947
948	int __ceph_caps_issued_mask_metric(struct ceph_inode_info ci, int* mask,
949	int touch)
950	{
951	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb: ci->netfs.inode.i_sb);
952	int r;
953
954	r = __ceph_caps_issued_mask(ci, mask, touch);
955	if (r)
956	ceph_update_cap_hit(m: &fsc->mdsc->metric);
957	else
958	ceph_update_cap_mis(m: &fsc->mdsc->metric);
959	return r;
960	}
961
962	/*
963	* Return true if mask caps are currently being revoked by an MDS.
964	*/
965	int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
966	struct ceph_cap ocap, int* mask)
967	{
968	struct ceph_cap *cap;
969	struct rb_node *p;
970
971	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
972	cap = rb_entry(p, struct ceph_cap, ci_node);
973	if (cap != ocap &&
974	(cap->implemented & ~cap->issued & mask))
975	return `1`;
976	}
977	return `0`;
978	}
979
980	int ceph_caps_revoking(struct ceph_inode_info ci, int* mask)
981	{
982	struct inode *inode = &ci->netfs.inode;
983	struct ceph_client *cl = ceph_inode_to_client(inode);
984	int ret;
985
986	spin_lock(lock: &ci->i_ceph_lock);
987	ret = __ceph_caps_revoking_other(ci, NULL, mask);
988	spin_unlock(lock: &ci->i_ceph_lock);
989	doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode),
990	ceph_cap_string(mask), ret);
991	return ret;
992	}
993
994	int __ceph_caps_used(struct ceph_inode_info *ci)
995	{
996	int used = `0`;
997	if (ci->i_pin_ref)
998	used \|= CEPH_CAP_PIN;
999	if (ci->i_rd_ref)
1000	used \|= CEPH_CAP_FILE_RD;
1001	if (ci->i_rdcache_ref \|\|
1002	(S_ISREG(ci->netfs.inode.i_mode) &&
1003	ci->netfs.inode.i_data.nrpages))
1004	used \|= CEPH_CAP_FILE_CACHE;
1005	if (ci->i_wr_ref)
1006	used \|= CEPH_CAP_FILE_WR;
1007	if (ci->i_wb_ref \|\| ci->i_wrbuffer_ref)
1008	used \|= CEPH_CAP_FILE_BUFFER;
1009	if (ci->i_fx_ref)
1010	used \|= CEPH_CAP_FILE_EXCL;
1011	return used;
1012	}
1013
1014	#define FMODE_WAIT_BIAS 1000
1015
1016	/*
1017	* wanted, by virtue of open file modes
1018	*/
1019	int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
1020	{
1021	const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
1022	const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
1023	const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
1024	const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
1025	struct ceph_mount_options *opt =
1026	ceph_inode_to_fs_client(inode: &ci->netfs.inode)->mount_options;
1027	unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
1028	unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
1029
1030	if (S_ISDIR(ci->netfs.inode.i_mode)) {
1031	int want = `0`;
1032
1033	/ use used_cutoff here, to keep dir's wanted caps longer /
1034	if (ci->i_nr_by_mode[RD_SHIFT] > `0` \|\|
1035	time_after(ci->i_last_rd, used_cutoff))
1036	want \|= CEPH_CAP_ANY_SHARED;
1037
1038	if (ci->i_nr_by_mode[WR_SHIFT] > `0` \|\|
1039	time_after(ci->i_last_wr, used_cutoff)) {
1040	want \|= CEPH_CAP_ANY_SHARED \| CEPH_CAP_FILE_EXCL;
1041	if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
1042	want \|= CEPH_CAP_ANY_DIR_OPS;
1043	}
1044
1045	if (want \|\| ci->i_nr_by_mode[PIN_SHIFT] > `0`)
1046	want \|= CEPH_CAP_PIN;
1047
1048	return want;
1049	} else {
1050	int bits = `0`;
1051
1052	if (ci->i_nr_by_mode[RD_SHIFT] > `0`) {
1053	if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS \|\|
1054	time_after(ci->i_last_rd, used_cutoff))
1055	bits \|= `1` << RD_SHIFT;
1056	} else if (time_after(ci->i_last_rd, idle_cutoff)) {
1057	bits \|= `1` << RD_SHIFT;
1058	}
1059
1060	if (ci->i_nr_by_mode[WR_SHIFT] > `0`) {
1061	if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS \|\|
1062	time_after(ci->i_last_wr, used_cutoff))
1063	bits \|= `1` << WR_SHIFT;
1064	} else if (time_after(ci->i_last_wr, idle_cutoff)) {
1065	bits \|= `1` << WR_SHIFT;
1066	}
1067
1068	/ check lazyio only when read/write is wanted /
1069	if ((bits & (CEPH_FILE_MODE_RDWR << `1`)) &&
1070	ci->i_nr_by_mode[LAZY_SHIFT] > `0`)
1071	bits \|= `1` << LAZY_SHIFT;
1072
1073	return bits ? ceph_caps_for_mode(mode: bits >> `1`) : `0`;
1074	}
1075	}
1076
1077	/*
1078	* wanted, by virtue of open file modes AND cap refs (buffered/cached data)
1079	*/
1080	int __ceph_caps_wanted(struct ceph_inode_info *ci)
1081	{
1082	int w = __ceph_caps_file_wanted(ci) \| __ceph_caps_used(ci);
1083	if (S_ISDIR(ci->netfs.inode.i_mode)) {
1084	/ we want EXCL if holding caps of dir ops /
1085	if (w & CEPH_CAP_ANY_DIR_OPS)
1086	w \|= CEPH_CAP_FILE_EXCL;
1087	} else {
1088	/ we want EXCL if dirty data /
1089	if (w & CEPH_CAP_FILE_BUFFER)
1090	w \|= CEPH_CAP_FILE_EXCL;
1091	}
1092	return w;
1093	}
1094
1095	/*
1096	* Return caps we have registered with the MDS(s) as 'wanted'.
1097	*/
1098	int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
1099	{
1100	struct ceph_cap *cap;
1101	struct rb_node *p;
1102	int mds_wanted = `0`;
1103
1104	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1105	cap = rb_entry(p, struct ceph_cap, ci_node);
1106	if (check && !__cap_is_valid(cap))
1107	continue;
1108	if (cap == ci->i_auth_cap)
1109	mds_wanted \|= cap->mds_wanted;
1110	else
1111	mds_wanted \|= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
1112	}
1113	return mds_wanted;
1114	}
1115
1116	int ceph_is_any_caps(struct inode *inode)
1117	{
1118	struct ceph_inode_info *ci = ceph_inode(inode);
1119	int ret;
1120
1121	spin_lock(lock: &ci->i_ceph_lock);
1122	ret = __ceph_is_any_real_caps(ci);
1123	spin_unlock(lock: &ci->i_ceph_lock);
1124
1125	return ret;
1126	}
1127
1128	/*
1129	* Remove a cap. Take steps to deal with a racing iterate_session_caps.
1130	*
1131	* caller should hold i_ceph_lock.
1132	* caller will not hold session s_mutex if called from destroy_inode.
1133	*/
1134	void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
1135	{
1136	struct ceph_mds_session *session = cap->session;
1137	struct ceph_client *cl = session->s_mdsc->fsc->client;
1138	struct ceph_inode_info *ci = cap->ci;
1139	struct inode *inode = &ci->netfs.inode;
1140	struct ceph_mds_client *mdsc;
1141	int removed = `0`;
1142
1143	/ 'ci' being NULL means the remove have already occurred /
1144	if (!ci) {
1145	doutc(cl, "inode is NULL\n");
1146	return;
1147	}
1148
1149	lockdep_assert_held(&ci->i_ceph_lock);
1150
1151	doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode));
1152
1153	mdsc = ceph_inode_to_fs_client(inode: &ci->netfs.inode)->mdsc;
1154
1155	/ remove from inode's cap rbtree, and clear auth cap /
1156	rb_erase(&cap->ci_node, &ci->i_caps);
1157	if (ci->i_auth_cap == cap)
1158	ci->i_auth_cap = NULL;
1159
1160	/ remove from session list /
1161	spin_lock(lock: &session->s_cap_lock);
1162	if (session->s_cap_iterator == cap) {
1163	/ not yet, we are iterating over this very cap /
1164	doutc(cl, "delaying %p removal from session %p\n", cap,
1165	cap->session);
1166	} else {
1167	list_del_init(entry: &cap->session_caps);
1168	session->s_nr_caps--;
1169	atomic64_dec(v: &mdsc->metric.total_caps);
1170	cap->session = NULL;
1171	removed = `1`;
1172	}
1173	/ protect backpointer with s_cap_lock: see iterate_session_caps /
1174	cap->ci = NULL;
1175
1176	/*
1177	* s_cap_reconnect is protected by s_cap_lock. no one changes
1178	* s_cap_gen while session is in the reconnect state.
1179	*/
1180	if (queue_release &&
1181	(!session->s_cap_reconnect \|\|
1182	cap->cap_gen == atomic_read(v: &session->s_cap_gen))) {
1183	cap->queue_release = `1`;
1184	if (removed) {
1185	__ceph_queue_cap_release(session, cap);
1186	removed = `0`;
1187	}
1188	} else {
1189	cap->queue_release = `0`;
1190	}
1191	cap->cap_ino = ci->i_vino.ino;
1192
1193	spin_unlock(lock: &session->s_cap_lock);
1194
1195	if (removed)
1196	ceph_put_cap(mdsc, cap);
1197
1198	if (!__ceph_is_any_real_caps(ci)) {
1199	/ when reconnect denied, we remove session caps forcibly,*
1200	* i_wr_ref can be non-zero. If there are ongoing write,
1201	* keep i_snap_realm.
1202	*/
1203	if (ci->i_wr_ref == `0` && ci->i_snap_realm)
1204	ceph_change_snap_realm(inode: &ci->netfs.inode, NULL);
1205
1206	__cap_delay_cancel(mdsc, ci);
1207	}
1208	}
1209
1210	void ceph_remove_cap(struct ceph_mds_client mdsc, struct* ceph_cap *cap,
1211	bool queue_release)
1212	{
1213	struct ceph_inode_info *ci = cap->ci;
1214	struct ceph_fs_client *fsc;
1215
1216	/ 'ci' being NULL means the remove have already occurred /
1217	if (!ci) {
1218	doutc(mdsc->fsc->client, "inode is NULL\n");
1219	return;
1220	}
1221
1222	lockdep_assert_held(&ci->i_ceph_lock);
1223
1224	fsc = ceph_inode_to_fs_client(inode: &ci->netfs.inode);
1225	WARN_ON_ONCE(ci->i_auth_cap == cap &&
1226	!list_empty(&ci->i_dirty_item) &&
1227	!fsc->blocklisted &&
1228	!ceph_inode_is_shutdown(&ci->netfs.inode));
1229
1230	__ceph_remove_cap(cap, queue_release);
1231	}
1232
1233	struct cap_msg_args {
1234	struct ceph_mds_session *session;
1235	u64 ino, cid, follows;
1236	u64 flush_tid, oldest_flush_tid, size, max_size;
1237	u64 xattr_version;
1238	u64 change_attr;
1239	struct ceph_buffer *xattr_buf;
1240	struct ceph_buffer *old_xattr_buf;
1241	struct timespec64 atime, mtime, ctime, btime;
1242	int op, caps, wanted, dirty;
1243	u32 seq, issue_seq, mseq, time_warp_seq;
1244	u32 flags;
1245	kuid_t uid;
1246	kgid_t gid;
1247	umode_t mode;
1248	bool inline_data;
1249	bool wake;
1250	bool encrypted;
1251	u32 fscrypt_auth_len;
1252	u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context
1253	};
1254
1255	/ Marshal up the cap msg to the MDS /
1256	static void encode_cap_msg(struct ceph_msg msg, struct* cap_msg_args *arg)
1257	{
1258	struct ceph_mds_caps *fc;
1259	void *p;
1260	struct ceph_mds_client *mdsc = arg->session->s_mdsc;
1261	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
1262
1263	doutc(mdsc->fsc->client,
1264	"%s %llx %llx caps %s wanted %s dirty %s seq %u/%u"
1265	" tid %llu/%llu mseq %u follows %lld size %llu/%llu"
1266	" xattr_ver %llu xattr_len %d\n",
1267	ceph_cap_op_name(arg->op), arg->cid, arg->ino,
1268	ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
1269	ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
1270	arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
1271	arg->size, arg->max_size, arg->xattr_version,
1272	arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : `0`);
1273
1274	msg->hdr.version = cpu_to_le16(`12`);
1275	msg->hdr.tid = cpu_to_le64(arg->flush_tid);
1276
1277	fc = msg->front.iov_base;
1278	memset(fc, `0`, sizeof(*fc));
1279
1280	fc->cap_id = cpu_to_le64(arg->cid);
1281	fc->op = cpu_to_le32(arg->op);
1282	fc->seq = cpu_to_le32(arg->seq);
1283	fc->issue_seq = cpu_to_le32(arg->issue_seq);
1284	fc->migrate_seq = cpu_to_le32(arg->mseq);
1285	fc->caps = cpu_to_le32(arg->caps);
1286	fc->wanted = cpu_to_le32(arg->wanted);
1287	fc->dirty = cpu_to_le32(arg->dirty);
1288	fc->ino = cpu_to_le64(arg->ino);
1289	fc->snap_follows = cpu_to_le64(arg->follows);
1290
1291	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
1292	if (arg->encrypted)
1293	fc->size = cpu_to_le64(round_up(arg->size,
1294	CEPH_FSCRYPT_BLOCK_SIZE));
1295	else
1296	#endif
1297	fc->size = cpu_to_le64(arg->size);
1298	fc->max_size = cpu_to_le64(arg->max_size);
1299	ceph_encode_timespec64(tv: &fc->mtime, ts: &arg->mtime);
1300	ceph_encode_timespec64(tv: &fc->atime, ts: &arg->atime);
1301	ceph_encode_timespec64(tv: &fc->ctime, ts: &arg->ctime);
1302	fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
1303
1304	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
1305	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
1306	fc->mode = cpu_to_le32(arg->mode);
1307
1308	fc->xattr_version = cpu_to_le64(arg->xattr_version);
1309	if (arg->xattr_buf) {
1310	msg->middle = ceph_buffer_get(b: arg->xattr_buf);
1311	fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
1312	msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
1313	}
1314
1315	p = fc + `1`;
1316	/ flock buffer size (version 2) /
1317	ceph_encode_32(p: &p, v: `0`);
1318	/ inline version (version 4) /
1319	ceph_encode_64(p: &p, v: arg->inline_data ? `0` : CEPH_INLINE_NONE);
1320	/ inline data size /
1321	ceph_encode_32(p: &p, v: `0`);
1322	/*
1323	* osd_epoch_barrier (version 5)
1324	* The epoch_barrier is protected osdc->lock, so READ_ONCE here in
1325	* case it was recently changed
1326	*/
1327	ceph_encode_32(p: &p, READ_ONCE(osdc->epoch_barrier));
1328	/ oldest_flush_tid (version 6) /
1329	ceph_encode_64(p: &p, v: arg->oldest_flush_tid);
1330
1331	/*
1332	* caller_uid/caller_gid (version 7)
1333	*
1334	* Currently, we don't properly track which caller dirtied the caps
1335	* last, and force a flush of them when there is a conflict. For now,
1336	* just set this to 0:0, to emulate how the MDS has worked up to now.
1337	*/
1338	ceph_encode_32(p: &p, v: `0`);
1339	ceph_encode_32(p: &p, v: `0`);
1340
1341	/ pool namespace (version 8) (mds always ignores this) /
1342	ceph_encode_32(p: &p, v: `0`);
1343
1344	/ btime and change_attr (version 9) /
1345	ceph_encode_timespec64(tv: p, ts: &arg->btime);
1346	p += sizeof(struct ceph_timespec);
1347	ceph_encode_64(p: &p, v: arg->change_attr);
1348
1349	/ Advisory flags (version 10) /
1350	ceph_encode_32(p: &p, v: arg->flags);
1351
1352	/ dirstats (version 11) - these are r/o on the client /
1353	ceph_encode_64(p: &p, v: `0`);
1354	ceph_encode_64(p: &p, v: `0`);
1355
1356	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
1357	/*
1358	* fscrypt_auth and fscrypt_file (version 12)
1359	*
1360	* fscrypt_auth holds the crypto context (if any). fscrypt_file
1361	* tracks the real i_size as an __le64 field (and we use a rounded-up
1362	* i_size in the traditional size field).
1363	*/
1364	ceph_encode_32(p: &p, v: arg->fscrypt_auth_len);
1365	ceph_encode_copy(p: &p, s: arg->fscrypt_auth, len: arg->fscrypt_auth_len);
1366	ceph_encode_32(p: &p, v: sizeof(__le64));
1367	ceph_encode_64(p: &p, v: arg->size);
1368	#else /* CONFIG_FS_ENCRYPTION */
1369	ceph_encode_32(&p, `0`);
1370	ceph_encode_32(&p, `0`);
1371	#endif /* CONFIG_FS_ENCRYPTION */
1372	}
1373
1374	/*
1375	* Queue cap releases when an inode is dropped from our cache.
1376	*/
1377	void __ceph_remove_caps(struct ceph_inode_info *ci)
1378	{
1379	struct inode *inode = &ci->netfs.inode;
1380	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
1381	struct rb_node *p;
1382
1383	/ lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)*
1384	* may call __ceph_caps_issued_mask() on a freeing inode. */
1385	spin_lock(lock: &ci->i_ceph_lock);
1386	p = rb_first(&ci->i_caps);
1387	while (p) {
1388	struct ceph_cap cap = rb_entry(p, struct* ceph_cap, ci_node);
1389	p = rb_next(p);
1390	ceph_remove_cap(mdsc, cap, queue_release: true);
1391	}
1392	spin_unlock(lock: &ci->i_ceph_lock);
1393	}
1394
1395	/*
1396	* Prepare to send a cap message to an MDS. Update the cap state, and populate
1397	* the arg struct with the parameters that will need to be sent. This should
1398	* be done under the i_ceph_lock to guard against changes to cap state.
1399	*
1400	* Make note of max_size reported/requested from mds, revoked caps
1401	* that have now been implemented.
1402	*/
1403	static void __prep_cap(struct cap_msg_args arg, struct* ceph_cap *cap,
1404	int op, int flags, int used, int want, int retain,
1405	int flushing, u64 flush_tid, u64 oldest_flush_tid)
1406	{
1407	struct ceph_inode_info *ci = cap->ci;
1408	struct inode *inode = &ci->netfs.inode;
1409	struct ceph_client *cl = ceph_inode_to_client(inode);
1410	int held, revoking;
1411
1412	lockdep_assert_held(&ci->i_ceph_lock);
1413
1414	held = cap->issued \| cap->implemented;
1415	revoking = cap->implemented & ~cap->issued;
1416	retain &= ~revoking;
1417
1418	doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n",
1419	inode, ceph_vinop(inode), cap, cap->session,
1420	ceph_cap_string(held), ceph_cap_string(held & retain),
1421	ceph_cap_string(revoking));
1422	BUG_ON((retain & CEPH_CAP_PIN) == `0`);
1423
1424	ci->i_ceph_flags &= ~CEPH_I_FLUSH;
1425
1426	cap->issued &= retain; / drop bits we don't want /
1427	/*
1428	* Wake up any waiters on wanted -> needed transition. This is due to
1429	* the weird transition from buffered to sync IO... we need to flush
1430	* dirty pages _before_ allowing sync writes to avoid reordering.
1431	*/
1432	arg->wake = cap->implemented & ~cap->issued;
1433	cap->implemented &= cap->issued \| used;
1434	cap->mds_wanted = want;
1435
1436	arg->session = cap->session;
1437	arg->ino = ceph_vino(inode).ino;
1438	arg->cid = cap->cap_id;
1439	arg->follows = flushing ? ci->i_head_snapc->seq : `0`;
1440	arg->flush_tid = flush_tid;
1441	arg->oldest_flush_tid = oldest_flush_tid;
1442	arg->size = i_size_read(inode);
1443	ci->i_reported_size = arg->size;
1444	arg->max_size = ci->i_wanted_max_size;
1445	if (cap == ci->i_auth_cap) {
1446	if (want & CEPH_CAP_ANY_FILE_WR)
1447	ci->i_requested_max_size = arg->max_size;
1448	else
1449	ci->i_requested_max_size = `0`;
1450	}
1451
1452	if (flushing & CEPH_CAP_XATTR_EXCL) {
1453	arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
1454	arg->xattr_version = ci->i_xattrs.version;
1455	arg->xattr_buf = ceph_buffer_get(b: ci->i_xattrs.blob);
1456	} else {
1457	arg->xattr_buf = NULL;
1458	arg->old_xattr_buf = NULL;
1459	}
1460
1461	arg->mtime = inode_get_mtime(inode);
1462	arg->atime = inode_get_atime(inode);
1463	arg->ctime = inode_get_ctime(inode);
1464	arg->btime = ci->i_btime;
1465	arg->change_attr = inode_peek_iversion_raw(inode);
1466
1467	arg->op = op;
1468	arg->caps = cap->implemented;
1469	arg->wanted = want;
1470	arg->dirty = flushing;
1471
1472	arg->seq = cap->seq;
1473	arg->issue_seq = cap->issue_seq;
1474	arg->mseq = cap->mseq;
1475	arg->time_warp_seq = ci->i_time_warp_seq;
1476
1477	arg->uid = inode->i_uid;
1478	arg->gid = inode->i_gid;
1479	arg->mode = inode->i_mode;
1480
1481	arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
1482	if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
1483	!list_empty(head: &ci->i_cap_snaps)) {
1484	struct ceph_cap_snap *capsnap;
1485	list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
1486	if (capsnap->cap_flush.tid)
1487	break;
1488	if (capsnap->need_flush) {
1489	flags \|= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
1490	break;
1491	}
1492	}
1493	}
1494	arg->flags = flags;
1495	arg->encrypted = IS_ENCRYPTED(inode);
1496	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
1497	if (ci->fscrypt_auth_len &&
1498	WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) {
1499	/ Don't set this if it's too big /
1500	arg->fscrypt_auth_len = `0`;
1501	} else {
1502	arg->fscrypt_auth_len = ci->fscrypt_auth_len;
1503	memcpy(arg->fscrypt_auth, ci->fscrypt_auth,
1504	min_t(size_t, ci->fscrypt_auth_len,
1505	sizeof(arg->fscrypt_auth)));
1506	}
1507	#endif /* CONFIG_FS_ENCRYPTION */
1508	}
1509
1510	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
1511	#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
1512	4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8)
1513
1514	static inline int cap_msg_size(struct cap_msg_args *arg)
1515	{
1516	return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len;
1517	}
1518	#else
1519	#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
1520	4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4)
1521
1522	static inline int cap_msg_size(struct cap_msg_args *arg)
1523	{
1524	return CAP_MSG_FIXED_FIELDS;
1525	}
1526	#endif /* CONFIG_FS_ENCRYPTION */
1527
1528	/*
1529	* Send a cap msg on the given inode.
1530	*
1531	* Caller should hold snap_rwsem (read), s_mutex.
1532	*/
1533	static void __send_cap(struct cap_msg_args arg, struct* ceph_inode_info *ci)
1534	{
1535	struct ceph_msg *msg;
1536	struct inode *inode = &ci->netfs.inode;
1537	struct ceph_client *cl = ceph_inode_to_client(inode);
1538
1539	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, front_len: cap_msg_size(arg), GFP_NOFS,
1540	can_fail: false);
1541	if (!msg) {
1542	pr_err_client(cl,
1543	"error allocating cap msg: ino (%llx.%llx)"
1544	" flushing %s tid %llu, requeuing cap.\n",
1545	ceph_vinop(inode), ceph_cap_string(arg->dirty),
1546	arg->flush_tid);
1547	spin_lock(lock: &ci->i_ceph_lock);
1548	__cap_delay_requeue(mdsc: arg->session->s_mdsc, ci);
1549	spin_unlock(lock: &ci->i_ceph_lock);
1550	return;
1551	}
1552
1553	encode_cap_msg(msg, arg);
1554	ceph_con_send(con: &arg->session->s_con, msg);
1555	ceph_buffer_put(b: arg->old_xattr_buf);
1556	ceph_buffer_put(b: arg->xattr_buf);
1557	if (arg->wake)
1558	wake_up_all(&ci->i_cap_wq);
1559	}
1560
1561	static inline int __send_flush_snap(struct inode *inode,
1562	struct ceph_mds_session *session,
1563	struct ceph_cap_snap *capsnap,
1564	u32 mseq, u64 oldest_flush_tid)
1565	{
1566	struct cap_msg_args arg;
1567	struct ceph_msg *msg;
1568
1569	arg.session = session;
1570	arg.ino = ceph_vino(inode).ino;
1571	arg.cid = `0`;
1572	arg.follows = capsnap->follows;
1573	arg.flush_tid = capsnap->cap_flush.tid;
1574	arg.oldest_flush_tid = oldest_flush_tid;
1575
1576	arg.size = capsnap->size;
1577	arg.max_size = `0`;
1578	arg.xattr_version = capsnap->xattr_version;
1579	arg.xattr_buf = capsnap->xattr_blob;
1580	arg.old_xattr_buf = NULL;
1581
1582	arg.atime = capsnap->atime;
1583	arg.mtime = capsnap->mtime;
1584	arg.ctime = capsnap->ctime;
1585	arg.btime = capsnap->btime;
1586	arg.change_attr = capsnap->change_attr;
1587
1588	arg.op = CEPH_CAP_OP_FLUSHSNAP;
1589	arg.caps = capsnap->issued;
1590	arg.wanted = `0`;
1591	arg.dirty = capsnap->dirty;
1592
1593	arg.seq = `0`;
1594	arg.issue_seq = `0`;
1595	arg.mseq = mseq;
1596	arg.time_warp_seq = capsnap->time_warp_seq;
1597
1598	arg.uid = capsnap->uid;
1599	arg.gid = capsnap->gid;
1600	arg.mode = capsnap->mode;
1601
1602	arg.inline_data = capsnap->inline_data;
1603	arg.flags = `0`;
1604	arg.wake = false;
1605	arg.encrypted = IS_ENCRYPTED(inode);
1606
1607	/ No fscrypt_auth changes from a capsnap./
1608	arg.fscrypt_auth_len = `0`;
1609
1610	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, front_len: cap_msg_size(arg: &arg),
1611	GFP_NOFS, can_fail: false);
1612	if (!msg)
1613	return -ENOMEM;
1614
1615	encode_cap_msg(msg, arg: &arg);
1616	ceph_con_send(con: &arg.session->s_con, msg);
1617	return `0`;
1618	}
1619
1620	/*
1621	* When a snapshot is taken, clients accumulate dirty metadata on
1622	* inodes with capabilities in ceph_cap_snaps to describe the file
1623	* state at the time the snapshot was taken. This must be flushed
1624	* asynchronously back to the MDS once sync writes complete and dirty
1625	* data is written out.
1626	*
1627	* Called under i_ceph_lock.
1628	*/
1629	static void __ceph_flush_snaps(struct ceph_inode_info *ci,
1630	struct ceph_mds_session *session)
1631	__releases(ci->i_ceph_lock)
1632	__acquires(ci->i_ceph_lock)
1633	{
1634	struct inode *inode = &ci->netfs.inode;
1635	struct ceph_mds_client *mdsc = session->s_mdsc;
1636	struct ceph_client *cl = mdsc->fsc->client;
1637	struct ceph_cap_snap *capsnap;
1638	u64 oldest_flush_tid = `0`;
1639	u64 first_tid = `1`, last_tid = `0`;
1640
1641	doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode),
1642	session);
1643
1644	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1645	/*
1646	* we need to wait for sync writes to complete and for dirty
1647	* pages to be written out.
1648	*/
1649	if (capsnap->dirty_pages \|\| capsnap->writing)
1650	break;
1651
1652	/ should be removed by ceph_try_drop_cap_snap() /
1653	BUG_ON(!capsnap->need_flush);
1654
1655	/ only flush each capsnap once /
1656	if (capsnap->cap_flush.tid > `0`) {
1657	doutc(cl, "already flushed %p, skipping\n", capsnap);
1658	continue;
1659	}
1660
1661	spin_lock(lock: &mdsc->cap_dirty_lock);
1662	capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
1663	list_add_tail(new: &capsnap->cap_flush.g_list,
1664	head: &mdsc->cap_flush_list);
1665	if (oldest_flush_tid == `0`)
1666	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1667	if (list_empty(head: &ci->i_flushing_item)) {
1668	list_add_tail(new: &ci->i_flushing_item,
1669	head: &session->s_cap_flushing);
1670	}
1671	spin_unlock(lock: &mdsc->cap_dirty_lock);
1672
1673	list_add_tail(new: &capsnap->cap_flush.i_list,
1674	head: &ci->i_cap_flush_list);
1675
1676	if (first_tid == `1`)
1677	first_tid = capsnap->cap_flush.tid;
1678	last_tid = capsnap->cap_flush.tid;
1679	}
1680
1681	ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
1682
1683	while (first_tid <= last_tid) {
1684	struct ceph_cap *cap = ci->i_auth_cap;
1685	struct ceph_cap_flush cf = NULL, iter;
1686	int ret;
1687
1688	if (!(cap && cap->session == session)) {
1689	doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n",
1690	inode, ceph_vinop(inode), cap, session->s_mds);
1691	break;
1692	}
1693
1694	ret = -ENOENT;
1695	list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
1696	if (iter->tid >= first_tid) {
1697	cf = iter;
1698	ret = `0`;
1699	break;
1700	}
1701	}
1702	if (ret < `0`)
1703	break;
1704
1705	first_tid = cf->tid + `1`;
1706
1707	capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
1708	refcount_inc(r: &capsnap->nref);
1709	spin_unlock(lock: &ci->i_ceph_lock);
1710
1711	doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode,
1712	ceph_vinop(inode), capsnap, cf->tid,
1713	ceph_cap_string(capsnap->dirty));
1714
1715	ret = __send_flush_snap(inode, session, capsnap, mseq: cap->mseq,
1716	oldest_flush_tid);
1717	if (ret < `0`) {
1718	pr_err_client(cl, "error sending cap flushsnap, "
1719	"ino (%llx.%llx) tid %llu follows %llu\n",
1720	ceph_vinop(inode), cf->tid,
1721	capsnap->follows);
1722	}
1723
1724	ceph_put_cap_snap(capsnap);
1725	spin_lock(lock: &ci->i_ceph_lock);
1726	}
1727	}
1728
1729	void ceph_flush_snaps(struct ceph_inode_info *ci,
1730	struct ceph_mds_session **psession)
1731	{
1732	struct inode *inode = &ci->netfs.inode;
1733	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
1734	struct ceph_client *cl = ceph_inode_to_client(inode);
1735	struct ceph_mds_session *session = NULL;
1736	bool need_put = false;
1737	int mds;
1738
1739	doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
1740	if (psession)
1741	session = *psession;
1742	retry:
1743	spin_lock(lock: &ci->i_ceph_lock);
1744	if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
1745	doutc(cl, " no capsnap needs flush, doing nothing\n");
1746	goto out;
1747	}
1748	if (!ci->i_auth_cap) {
1749	doutc(cl, " no auth cap (migrating?), doing nothing\n");
1750	goto out;
1751	}
1752
1753	mds = ci->i_auth_cap->session->s_mds;
1754	if (session && session->s_mds != mds) {
1755	doutc(cl, " oops, wrong session %p mutex\n", session);
1756	ceph_put_mds_session(s: session);
1757	session = NULL;
1758	}
1759	if (!session) {
1760	spin_unlock(lock: &ci->i_ceph_lock);
1761	mutex_lock(&mdsc->mutex);
1762	session = __ceph_lookup_mds_session(mdsc, mds);
1763	mutex_unlock(lock: &mdsc->mutex);
1764	goto retry;
1765	}
1766
1767	// make sure flushsnap messages are sent in proper order.
1768	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
1769	__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid: `0`);
1770
1771	__ceph_flush_snaps(ci, session);
1772	out:
1773	spin_unlock(lock: &ci->i_ceph_lock);
1774
1775	if (psession)
1776	*psession = session;
1777	else
1778	ceph_put_mds_session(s: session);
1779	/ we flushed them all; remove this inode from the queue /
1780	spin_lock(lock: &mdsc->snap_flush_lock);
1781	if (!list_empty(head: &ci->i_snap_flush_item))
1782	need_put = true;
1783	list_del_init(entry: &ci->i_snap_flush_item);
1784	spin_unlock(lock: &mdsc->snap_flush_lock);
1785
1786	if (need_put)
1787	iput(inode);
1788	}
1789
1790	/*
1791	* Mark caps dirty. If inode is newly dirty, return the dirty flags.
1792	* Caller is then responsible for calling __mark_inode_dirty with the
1793	* returned flags value.
1794	*/
1795	int __ceph_mark_dirty_caps(struct ceph_inode_info ci, int* mask,
1796	struct ceph_cap_flush **pcf)
1797	{
1798	struct ceph_mds_client *mdsc =
1799	ceph_sb_to_fs_client(sb: ci->netfs.inode.i_sb)->mdsc;
1800	struct inode *inode = &ci->netfs.inode;
1801	struct ceph_client *cl = ceph_inode_to_client(inode);
1802	int was = ci->i_dirty_caps;
1803	int dirty = `0`;
1804
1805	lockdep_assert_held(&ci->i_ceph_lock);
1806
1807	if (!ci->i_auth_cap) {
1808	pr_warn_client(cl, "%p %llx.%llx mask %s, "
1809	"but no auth cap (session was closed?)\n",
1810	inode, ceph_vinop(inode),
1811	ceph_cap_string(mask));
1812	return `0`;
1813	}
1814
1815	doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode,
1816	ceph_vinop(inode), ceph_cap_string(mask),
1817	ceph_cap_string(was), ceph_cap_string(was \| mask));
1818	ci->i_dirty_caps \|= mask;
1819	if (was == `0`) {
1820	struct ceph_mds_session *session = ci->i_auth_cap->session;
1821
1822	WARN_ON_ONCE(ci->i_prealloc_cap_flush);
1823	swap(ci->i_prealloc_cap_flush, *pcf);
1824
1825	if (!ci->i_head_snapc) {
1826	WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
1827	ci->i_head_snapc = ceph_get_snap_context(
1828	sc: ci->i_snap_realm->cached_context);
1829	}
1830	doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n",
1831	inode, ceph_vinop(inode), ci->i_head_snapc,
1832	ci->i_auth_cap);
1833	BUG_ON(!list_empty(&ci->i_dirty_item));
1834	spin_lock(lock: &mdsc->cap_dirty_lock);
1835	list_add(new: &ci->i_dirty_item, head: &session->s_cap_dirty);
1836	spin_unlock(lock: &mdsc->cap_dirty_lock);
1837	if (ci->i_flushing_caps == `0`) {
1838	ihold(inode);
1839	dirty \|= I_DIRTY_SYNC;
1840	}
1841	} else {
1842	WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
1843	}
1844	BUG_ON(list_empty(&ci->i_dirty_item));
1845	if (((was \| ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1846	(mask & CEPH_CAP_FILE_BUFFER))
1847	dirty \|= I_DIRTY_DATASYNC;
1848	__cap_delay_requeue(mdsc, ci);
1849	return dirty;
1850	}
1851
1852	struct ceph_cap_flush ceph_alloc_cap_flush(void*)
1853	{
1854	struct ceph_cap_flush *cf;
1855
1856	cf = kmem_cache_alloc(cachep: ceph_cap_flush_cachep, GFP_KERNEL);
1857	if (!cf)
1858	return NULL;
1859
1860	cf->is_capsnap = false;
1861	return cf;
1862	}
1863
1864	void ceph_free_cap_flush(struct ceph_cap_flush *cf)
1865	{
1866	if (cf)
1867	kmem_cache_free(s: ceph_cap_flush_cachep, objp: cf);
1868	}
1869
1870	static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
1871	{
1872	if (!list_empty(head: &mdsc->cap_flush_list)) {
1873	struct ceph_cap_flush *cf =
1874	list_first_entry(&mdsc->cap_flush_list,
1875	struct ceph_cap_flush, g_list);
1876	return cf->tid;
1877	}
1878	return `0`;
1879	}
1880
1881	/*
1882	* Remove cap_flush from the mdsc's or inode's flushing cap list.
1883	* Return true if caller needs to wake up flush waiters.
1884	*/
1885	static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
1886	struct ceph_cap_flush *cf)
1887	{
1888	struct ceph_cap_flush *prev;
1889	bool wake = cf->wake;
1890
1891	if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
1892	prev = list_prev_entry(cf, g_list);
1893	prev->wake = true;
1894	wake = false;
1895	}
1896	list_del_init(entry: &cf->g_list);
1897	return wake;
1898	}
1899
1900	static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
1901	struct ceph_cap_flush *cf)
1902	{
1903	struct ceph_cap_flush *prev;
1904	bool wake = cf->wake;
1905
1906	if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
1907	prev = list_prev_entry(cf, i_list);
1908	prev->wake = true;
1909	wake = false;
1910	}
1911	list_del_init(entry: &cf->i_list);
1912	return wake;
1913	}
1914
1915	/*
1916	* Add dirty inode to the flushing list. Assigned a seq number so we
1917	* can wait for caps to flush without starving.
1918	*
1919	* Called under i_ceph_lock. Returns the flush tid.
1920	*/
1921	static u64 __mark_caps_flushing(struct inode *inode,
1922	struct ceph_mds_session *session, bool wake,
1923	u64 *oldest_flush_tid)
1924	{
1925	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb: inode->i_sb)->mdsc;
1926	struct ceph_client *cl = ceph_inode_to_client(inode);
1927	struct ceph_inode_info *ci = ceph_inode(inode);
1928	struct ceph_cap_flush *cf = NULL;
1929	int flushing;
1930
1931	lockdep_assert_held(&ci->i_ceph_lock);
1932	BUG_ON(ci->i_dirty_caps == `0`);
1933	BUG_ON(list_empty(&ci->i_dirty_item));
1934	BUG_ON(!ci->i_prealloc_cap_flush);
1935
1936	flushing = ci->i_dirty_caps;
1937	doutc(cl, "flushing %s, flushing_caps %s -> %s\n",
1938	ceph_cap_string(flushing),
1939	ceph_cap_string(ci->i_flushing_caps),
1940	ceph_cap_string(ci->i_flushing_caps \| flushing));
1941	ci->i_flushing_caps \|= flushing;
1942	ci->i_dirty_caps = `0`;
1943	doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode));
1944
1945	swap(cf, ci->i_prealloc_cap_flush);
1946	cf->caps = flushing;
1947	cf->wake = wake;
1948
1949	spin_lock(lock: &mdsc->cap_dirty_lock);
1950	list_del_init(entry: &ci->i_dirty_item);
1951
1952	cf->tid = ++mdsc->last_cap_flush_tid;
1953	list_add_tail(new: &cf->g_list, head: &mdsc->cap_flush_list);
1954	*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1955
1956	if (list_empty(head: &ci->i_flushing_item)) {
1957	list_add_tail(new: &ci->i_flushing_item, head: &session->s_cap_flushing);
1958	mdsc->num_cap_flushing++;
1959	}
1960	spin_unlock(lock: &mdsc->cap_dirty_lock);
1961
1962	list_add_tail(new: &cf->i_list, head: &ci->i_cap_flush_list);
1963
1964	return cf->tid;
1965	}
1966
1967	/*
1968	* try to invalidate mapping pages without blocking.
1969	*/
1970	static int try_nonblocking_invalidate(struct inode *inode)
1971	__releases(ci->i_ceph_lock)
1972	__acquires(ci->i_ceph_lock)
1973	{
1974	struct ceph_client *cl = ceph_inode_to_client(inode);
1975	struct ceph_inode_info *ci = ceph_inode(inode);
1976	u32 invalidating_gen = ci->i_rdcache_gen;
1977
1978	spin_unlock(lock: &ci->i_ceph_lock);
1979	ceph_fscache_invalidate(inode, dio_write: false);
1980	invalidate_mapping_pages(mapping: &inode->i_data, start: `0`, end: -`1`);
1981	spin_lock(lock: &ci->i_ceph_lock);
1982
1983	if (inode->i_data.nrpages == `0` &&
1984	invalidating_gen == ci->i_rdcache_gen) {
1985	/ success. /
1986	doutc(cl, "%p %llx.%llx success\n", inode,
1987	ceph_vinop(inode));
1988	/ save any racing async invalidate some trouble /
1989	ci->i_rdcache_revoking = ci->i_rdcache_gen - `1`;
1990	return `0`;
1991	}
1992	doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode));
1993	return -`1`;
1994	}
1995
1996	bool __ceph_should_report_size(struct ceph_inode_info *ci)
1997	{
1998	loff_t size = i_size_read(inode: &ci->netfs.inode);
1999	/ mds will adjust max size according to the reported size /
2000	if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
2001	return false;
2002	if (size >= ci->i_max_size)
2003	return true;
2004	/ half of previous max_size increment has been used /
2005	if (ci->i_max_size > ci->i_reported_size &&
2006	(size << `1`) >= ci->i_max_size + ci->i_reported_size)
2007	return true;
2008	return false;
2009	}
2010
2011	/*
2012	* Swiss army knife function to examine currently used and wanted
2013	* versus held caps. Release, flush, ack revoked caps to mds as
2014	* appropriate.
2015	*
2016	* CHECK_CAPS_AUTHONLY - we should only check the auth cap
2017	* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
2018	* further delay.
2019	*/
2020	void ceph_check_caps(struct ceph_inode_info ci, int* flags)
2021	{
2022	struct inode *inode = &ci->netfs.inode;
2023	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
2024	struct ceph_client *cl = ceph_inode_to_client(inode);
2025	struct ceph_cap *cap;
2026	u64 flush_tid, oldest_flush_tid;
2027	int file_wanted, used, cap_used;
2028	int issued, implemented, want, retain, revoking, flushing = `0`;
2029	int mds = -`1`; / keep track of how far we've gone through i_caps list*
2030	to avoid an infinite loop on retry /*
2031	struct rb_node *p;
2032	bool queue_invalidate = false;
2033	bool tried_invalidate = false;
2034	bool queue_writeback = false;
2035	struct ceph_mds_session *session = NULL;
2036
2037	spin_lock(lock: &ci->i_ceph_lock);
2038	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
2039	ci->i_ceph_flags \|= CEPH_I_ASYNC_CHECK_CAPS;
2040
2041	/ Don't send messages until we get async create reply /
2042	spin_unlock(lock: &ci->i_ceph_lock);
2043	return;
2044	}
2045
2046	if (ci->i_ceph_flags & CEPH_I_FLUSH)
2047	flags \|= CHECK_CAPS_FLUSH;
2048	retry:
2049	/ Caps wanted by virtue of active open files. /
2050	file_wanted = __ceph_caps_file_wanted(ci);
2051
2052	/ Caps which have active references against them /
2053	used = __ceph_caps_used(ci);
2054
2055	/*
2056	* "issued" represents the current caps that the MDS wants us to have.
2057	* "implemented" is the set that we have been granted, and includes the
2058	* ones that have not yet been returned to the MDS (the "revoking" set,
2059	* usually because they have outstanding references).
2060	*/
2061	issued = __ceph_caps_issued(ci, implemented: &implemented);
2062	revoking = implemented & ~issued;
2063
2064	want = file_wanted;
2065
2066	/ The ones we currently want to retain (may be adjusted below) /
2067	retain = file_wanted \| used \| CEPH_CAP_PIN;
2068	if (!mdsc->stopping && inode->i_nlink > `0`) {
2069	if (file_wanted) {
2070	retain \|= CEPH_CAP_ANY; / be greedy /
2071	} else if (S_ISDIR(inode->i_mode) &&
2072	(issued & CEPH_CAP_FILE_SHARED) &&
2073	__ceph_dir_is_complete(ci)) {
2074	/*
2075	* If a directory is complete, we want to keep
2076	* the exclusive cap. So that MDS does not end up
2077	* revoking the shared cap on every create/unlink
2078	* operation.
2079	*/
2080	if (IS_RDONLY(inode)) {
2081	want = CEPH_CAP_ANY_SHARED;
2082	} else {
2083	want \|= CEPH_CAP_ANY_SHARED \| CEPH_CAP_FILE_EXCL;
2084	}
2085	retain \|= want;
2086	} else {
2087
2088	retain \|= CEPH_CAP_ANY_SHARED;
2089	/*
2090	* keep RD only if we didn't have the file open RW,
2091	* because then the mds would revoke it anyway to
2092	* journal max_size=0.
2093	*/
2094	if (ci->i_max_size == `0`)
2095	retain \|= CEPH_CAP_ANY_RD;
2096	}
2097	}
2098
2099	doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s "
2100	"flushing %s issued %s revoking %s retain %s %s%s%s\n",
2101	inode, ceph_vinop(inode), ceph_cap_string(file_wanted),
2102	ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
2103	ceph_cap_string(ci->i_flushing_caps),
2104	ceph_cap_string(issued), ceph_cap_string(revoking),
2105	ceph_cap_string(retain),
2106	(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
2107	(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
2108	(flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "");
2109
2110	/*
2111	* If we no longer need to hold onto old our caps, and we may
2112	* have cached pages, but don't want them, then try to invalidate.
2113	* If we fail, it's because pages are locked.... try again later.
2114	*/
2115	if ((!(flags & CHECK_CAPS_NOINVAL) \|\| mdsc->stopping) &&
2116	S_ISREG(inode->i_mode) &&
2117	!(ci->i_wb_ref \|\| ci->i_wrbuffer_ref) && / no dirty pages... /
2118	inode->i_data.nrpages && / have cached pages /
2119	(revoking & (CEPH_CAP_FILE_CACHE\|
2120	CEPH_CAP_FILE_LAZYIO)) && / or revoking cache /
2121	!tried_invalidate) {
2122	doutc(cl, "trying to invalidate on %p %llx.%llx\n",
2123	inode, ceph_vinop(inode));
2124	if (try_nonblocking_invalidate(inode) < `0`) {
2125	doutc(cl, "queuing invalidate\n");
2126	queue_invalidate = true;
2127	ci->i_rdcache_revoking = ci->i_rdcache_gen;
2128	}
2129	tried_invalidate = true;
2130	goto retry;
2131	}
2132
2133	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2134	int mflags = `0`;
2135	struct cap_msg_args arg;
2136
2137	cap = rb_entry(p, struct ceph_cap, ci_node);
2138
2139	/ avoid looping forever /
2140	if (mds >= cap->mds \|\|
2141	((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
2142	continue;
2143
2144	/*
2145	* If we have an auth cap, we don't need to consider any
2146	* overlapping caps as used.
2147	*/
2148	cap_used = used;
2149	if (ci->i_auth_cap && cap != ci->i_auth_cap)
2150	cap_used &= ~ci->i_auth_cap->issued;
2151
2152	revoking = cap->implemented & ~cap->issued;
2153	doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n",
2154	cap->mds, cap, ceph_cap_string(cap_used),
2155	ceph_cap_string(cap->issued),
2156	ceph_cap_string(cap->implemented),
2157	ceph_cap_string(revoking));
2158
2159	/ completed revocation? going down and there are no caps? /
2160	if (revoking) {
2161	if ((revoking & cap_used) == `0`) {
2162	doutc(cl, "completed revocation of %s\n",
2163	ceph_cap_string(cap->implemented & ~cap->issued));
2164	goto ack;
2165	}
2166
2167	/*
2168	* If the "i_wrbuffer_ref" was increased by mmap or generic
2169	* cache write just before the ceph_check_caps() is called,
2170	* the Fb capability revoking will fail this time. Then we
2171	* must wait for the BDI's delayed work to flush the dirty
2172	* pages and to release the "i_wrbuffer_ref", which will cost
2173	* at most 5 seconds. That means the MDS needs to wait at
2174	* most 5 seconds to finished the Fb capability's revocation.
2175	*
2176	* Let's queue a writeback for it.
2177	*/
2178	if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
2179	(revoking & CEPH_CAP_FILE_BUFFER))
2180	queue_writeback = true;
2181	}
2182
2183	if (cap == ci->i_auth_cap &&
2184	(cap->issued & CEPH_CAP_FILE_WR)) {
2185	/ request larger max_size from MDS? /
2186	if (ci->i_wanted_max_size > ci->i_max_size &&
2187	ci->i_wanted_max_size > ci->i_requested_max_size) {
2188	doutc(cl, "requesting new max_size\n");
2189	goto ack;
2190	}
2191
2192	/ approaching file_max? /
2193	if (__ceph_should_report_size(ci)) {
2194	doutc(cl, "i_size approaching max_size\n");
2195	goto ack;
2196	}
2197	}
2198	/ flush anything dirty? /
2199	if (cap == ci->i_auth_cap) {
2200	if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
2201	doutc(cl, "flushing dirty caps\n");
2202	goto ack;
2203	}
2204	if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
2205	doutc(cl, "flushing snap caps\n");
2206	goto ack;
2207	}
2208	}
2209
2210	/ want more caps from mds? /
2211	if (want & ~cap->mds_wanted) {
2212	if (want & ~(cap->mds_wanted \| cap->issued))
2213	goto ack;
2214	if (!__cap_is_valid(cap))
2215	goto ack;
2216	}
2217
2218	/ things we might delay /
2219	if ((cap->issued & ~retain) == `0`)
2220	continue; / nope, all good /
2221
2222	ack:
2223	ceph_put_mds_session(s: session);
2224	session = ceph_get_mds_session(s: cap->session);
2225
2226	/ kick flushing and flush snaps before sending normal*
2227	* cap message */
2228	if (cap == ci->i_auth_cap &&
2229	(ci->i_ceph_flags &
2230	(CEPH_I_KICK_FLUSH \| CEPH_I_FLUSH_SNAPS))) {
2231	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2232	__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid: `0`);
2233	if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2234	__ceph_flush_snaps(ci, session);
2235
2236	goto retry;
2237	}
2238
2239	if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
2240	flushing = ci->i_dirty_caps;
2241	flush_tid = __mark_caps_flushing(inode, session, wake: false,
2242	oldest_flush_tid: &oldest_flush_tid);
2243	if (flags & CHECK_CAPS_FLUSH &&
2244	list_empty(head: &session->s_cap_dirty))
2245	mflags \|= CEPH_CLIENT_CAPS_SYNC;
2246	} else {
2247	flushing = `0`;
2248	flush_tid = `0`;
2249	spin_lock(lock: &mdsc->cap_dirty_lock);
2250	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2251	spin_unlock(lock: &mdsc->cap_dirty_lock);
2252	}
2253
2254	mds = cap->mds; / remember mds, so we don't repeat /
2255
2256	__prep_cap(arg: &arg, cap, op: CEPH_CAP_OP_UPDATE, flags: mflags, used: cap_used,
2257	want, retain, flushing, flush_tid, oldest_flush_tid);
2258
2259	spin_unlock(lock: &ci->i_ceph_lock);
2260	__send_cap(arg: &arg, ci);
2261	spin_lock(lock: &ci->i_ceph_lock);
2262
2263	goto retry; / retake i_ceph_lock and restart our cap scan. /
2264	}
2265
2266	/ periodically re-calculate caps wanted by open files /
2267	if (__ceph_is_any_real_caps(ci) &&
2268	list_empty(head: &ci->i_cap_delay_list) &&
2269	(file_wanted & ~CEPH_CAP_PIN) &&
2270	!(used & (CEPH_CAP_FILE_RD \| CEPH_CAP_ANY_FILE_WR))) {
2271	__cap_delay_requeue(mdsc, ci);
2272	}
2273
2274	spin_unlock(lock: &ci->i_ceph_lock);
2275
2276	ceph_put_mds_session(s: session);
2277	if (queue_writeback)
2278	ceph_queue_writeback(inode);
2279	if (queue_invalidate)
2280	ceph_queue_invalidate(inode);
2281	}
2282
2283	/*
2284	* Try to flush dirty caps back to the auth mds.
2285	*/
2286	static int try_flush_caps(struct inode inode, u64 ptid)
2287	{
2288	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb: inode->i_sb)->mdsc;
2289	struct ceph_inode_info *ci = ceph_inode(inode);
2290	int flushing = `0`;
2291	u64 flush_tid = `0`, oldest_flush_tid = `0`;
2292
2293	spin_lock(lock: &ci->i_ceph_lock);
2294	retry_locked:
2295	if (ci->i_dirty_caps && ci->i_auth_cap) {
2296	struct ceph_cap *cap = ci->i_auth_cap;
2297	struct cap_msg_args arg;
2298	struct ceph_mds_session *session = cap->session;
2299
2300	if (session->s_state < CEPH_MDS_SESSION_OPEN) {
2301	spin_unlock(lock: &ci->i_ceph_lock);
2302	goto out;
2303	}
2304
2305	if (ci->i_ceph_flags &
2306	(CEPH_I_KICK_FLUSH \| CEPH_I_FLUSH_SNAPS)) {
2307	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2308	__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid: `0`);
2309	if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2310	__ceph_flush_snaps(ci, session);
2311	goto retry_locked;
2312	}
2313
2314	flushing = ci->i_dirty_caps;
2315	flush_tid = __mark_caps_flushing(inode, session, wake: true,
2316	oldest_flush_tid: &oldest_flush_tid);
2317
2318	__prep_cap(arg: &arg, cap, op: CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
2319	used: __ceph_caps_used(ci), want: __ceph_caps_wanted(ci),
2320	retain: (cap->issued \| cap->implemented),
2321	flushing, flush_tid, oldest_flush_tid);
2322	spin_unlock(lock: &ci->i_ceph_lock);
2323
2324	__send_cap(arg: &arg, ci);
2325	} else {
2326	if (!list_empty(head: &ci->i_cap_flush_list)) {
2327	struct ceph_cap_flush *cf =
2328	list_last_entry(&ci->i_cap_flush_list,
2329	struct ceph_cap_flush, i_list);
2330	cf->wake = true;
2331	flush_tid = cf->tid;
2332	}
2333	flushing = ci->i_flushing_caps;
2334	spin_unlock(lock: &ci->i_ceph_lock);
2335	}
2336	out:
2337	*ptid = flush_tid;
2338	return flushing;
2339	}
2340
2341	/*
2342	* Return true if we've flushed caps through the given flush_tid.
2343	*/
2344	static int caps_are_flushed(struct inode *inode, u64 flush_tid)
2345	{
2346	struct ceph_inode_info *ci = ceph_inode(inode);
2347	int ret = `1`;
2348
2349	spin_lock(lock: &ci->i_ceph_lock);
2350	if (!list_empty(head: &ci->i_cap_flush_list)) {
2351	struct ceph_cap_flush * cf =
2352	list_first_entry(&ci->i_cap_flush_list,
2353	struct ceph_cap_flush, i_list);
2354	if (cf->tid <= flush_tid)
2355	ret = `0`;
2356	}
2357	spin_unlock(lock: &ci->i_ceph_lock);
2358	return ret;
2359	}
2360
2361	/*
2362	* flush the mdlog and wait for any unsafe requests to complete.
2363	*/
2364	static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
2365	{
2366	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb: inode->i_sb)->mdsc;
2367	struct ceph_client *cl = ceph_inode_to_client(inode);
2368	struct ceph_inode_info *ci = ceph_inode(inode);
2369	struct ceph_mds_request req1 = NULL, req2 = NULL;
2370	int ret, err = `0`;
2371
2372	spin_lock(lock: &ci->i_unsafe_lock);
2373	if (S_ISDIR(inode->i_mode) && !list_empty(head: &ci->i_unsafe_dirops)) {
2374	req1 = list_last_entry(&ci->i_unsafe_dirops,
2375	struct ceph_mds_request,
2376	r_unsafe_dir_item);
2377	ceph_mdsc_get_request(req: req1);
2378	}
2379	if (!list_empty(head: &ci->i_unsafe_iops)) {
2380	req2 = list_last_entry(&ci->i_unsafe_iops,
2381	struct ceph_mds_request,
2382	r_unsafe_target_item);
2383	ceph_mdsc_get_request(req: req2);
2384	}
2385	spin_unlock(lock: &ci->i_unsafe_lock);
2386
2387	/*
2388	* Trigger to flush the journal logs in all the relevant MDSes
2389	* manually, or in the worst case we must wait at most 5 seconds
2390	* to wait the journal logs to be flushed by the MDSes periodically.
2391	*/
2392	if (req1 \|\| req2) {
2393	struct ceph_mds_request *req;
2394	struct ceph_mds_session **sessions;
2395	struct ceph_mds_session *s;
2396	unsigned int max_sessions;
2397	int i;
2398
2399	mutex_lock(&mdsc->mutex);
2400	max_sessions = mdsc->max_sessions;
2401
2402	sessions = kcalloc(n: max_sessions, size: sizeof(s), GFP_KERNEL);
2403	if (!sessions) {
2404	mutex_unlock(lock: &mdsc->mutex);
2405	err = -ENOMEM;
2406	goto out;
2407	}
2408
2409	spin_lock(lock: &ci->i_unsafe_lock);
2410	if (req1) {
2411	list_for_each_entry(req, &ci->i_unsafe_dirops,
2412	r_unsafe_dir_item) {
2413	s = req->r_session;
2414	if (!s)
2415	continue;
2416	if (!sessions[s->s_mds]) {
2417	s = ceph_get_mds_session(s);
2418	sessions[s->s_mds] = s;
2419	}
2420	}
2421	}
2422	if (req2) {
2423	list_for_each_entry(req, &ci->i_unsafe_iops,
2424	r_unsafe_target_item) {
2425	s = req->r_session;
2426	if (!s)
2427	continue;
2428	if (!sessions[s->s_mds]) {
2429	s = ceph_get_mds_session(s);
2430	sessions[s->s_mds] = s;
2431	}
2432	}
2433	}
2434	spin_unlock(lock: &ci->i_unsafe_lock);
2435
2436	/ the auth MDS /
2437	spin_lock(lock: &ci->i_ceph_lock);
2438	if (ci->i_auth_cap) {
2439	s = ci->i_auth_cap->session;
2440	if (!sessions[s->s_mds])
2441	sessions[s->s_mds] = ceph_get_mds_session(s);
2442	}
2443	spin_unlock(lock: &ci->i_ceph_lock);
2444	mutex_unlock(lock: &mdsc->mutex);
2445
2446	/ send flush mdlog request to MDSes /
2447	for (i = `0`; i < max_sessions; i++) {
2448	s = sessions[i];
2449	if (s) {
2450	send_flush_mdlog(s);
2451	ceph_put_mds_session(s);
2452	}
2453	}
2454	kfree(objp: sessions);
2455	}
2456
2457	doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode,
2458	ceph_vinop(inode), req1 ? req1->r_tid : `0ULL`,
2459	req2 ? req2->r_tid : `0ULL`);
2460	if (req1) {
2461	ret = !wait_for_completion_timeout(x: &req1->r_safe_completion,
2462	timeout: ceph_timeout_jiffies(timeout: req1->r_timeout));
2463	if (ret)
2464	err = -EIO;
2465	}
2466	if (req2) {
2467	ret = !wait_for_completion_timeout(x: &req2->r_safe_completion,
2468	timeout: ceph_timeout_jiffies(timeout: req2->r_timeout));
2469	if (ret)
2470	err = -EIO;
2471	}
2472
2473	out:
2474	if (req1)
2475	ceph_mdsc_put_request(req: req1);
2476	if (req2)
2477	ceph_mdsc_put_request(req: req2);
2478	return err;
2479	}
2480
2481	int ceph_fsync(struct file file, loff_t start, loff_t end, int* datasync)
2482	{
2483	struct inode *inode = file->f_mapping->host;
2484	struct ceph_inode_info *ci = ceph_inode(inode);
2485	struct ceph_client *cl = ceph_inode_to_client(inode);
2486	u64 flush_tid;
2487	int ret, err;
2488	int dirty;
2489
2490	doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode),
2491	datasync ? " datasync" : "");
2492
2493	ret = file_write_and_wait_range(file, start, end);
2494	if (datasync)
2495	goto out;
2496
2497	ret = ceph_wait_on_async_create(inode);
2498	if (ret)
2499	goto out;
2500
2501	dirty = try_flush_caps(inode, ptid: &flush_tid);
2502	doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty));
2503
2504	err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
2505
2506	/*
2507	* only wait on non-file metadata writeback (the mds
2508	* can recover size and mtime, so we don't need to
2509	* wait for that)
2510	*/
2511	if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2512	err = wait_event_interruptible(ci->i_cap_wq,
2513	caps_are_flushed(inode, flush_tid));
2514	}
2515
2516	if (err < `0`)
2517	ret = err;
2518
2519	err = file_check_and_advance_wb_err(file);
2520	if (err < `0`)
2521	ret = err;
2522	out:
2523	doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode),
2524	datasync ? " datasync" : "", ret);
2525	return ret;
2526	}
2527
2528	/*
2529	* Flush any dirty caps back to the mds. If we aren't asked to wait,
2530	* queue inode for flush but don't do so immediately, because we can
2531	* get by with fewer MDS messages if we wait for data writeback to
2532	* complete first.
2533	*/
2534	int ceph_write_inode(struct inode inode, struct* writeback_control *wbc)
2535	{
2536	struct ceph_inode_info *ci = ceph_inode(inode);
2537	struct ceph_client *cl = ceph_inode_to_client(inode);
2538	u64 flush_tid;
2539	int err = `0`;
2540	int dirty;
2541	int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
2542
2543	doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait);
2544	ceph_fscache_unpin_writeback(inode, wbc);
2545	if (wait) {
2546	err = ceph_wait_on_async_create(inode);
2547	if (err)
2548	return err;
2549	dirty = try_flush_caps(inode, ptid: &flush_tid);
2550	if (dirty)
2551	err = wait_event_interruptible(ci->i_cap_wq,
2552	caps_are_flushed(inode, flush_tid));
2553	} else {
2554	struct ceph_mds_client *mdsc =
2555	ceph_sb_to_fs_client(sb: inode->i_sb)->mdsc;
2556
2557	spin_lock(lock: &ci->i_ceph_lock);
2558	if (__ceph_caps_dirty(ci))
2559	__cap_delay_requeue_front(mdsc, ci);
2560	spin_unlock(lock: &ci->i_ceph_lock);
2561	}
2562	return err;
2563	}
2564
2565	static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
2566	struct ceph_mds_session *session,
2567	struct ceph_inode_info *ci,
2568	u64 oldest_flush_tid)
2569	__releases(ci->i_ceph_lock)
2570	__acquires(ci->i_ceph_lock)
2571	{
2572	struct inode *inode = &ci->netfs.inode;
2573	struct ceph_client *cl = mdsc->fsc->client;
2574	struct ceph_cap *cap;
2575	struct ceph_cap_flush *cf;
2576	int ret;
2577	u64 first_tid = `0`;
2578	u64 last_snap_flush = `0`;
2579
2580	/ Don't do anything until create reply comes in /
2581	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
2582	return;
2583
2584	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2585
2586	list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
2587	if (cf->is_capsnap) {
2588	last_snap_flush = cf->tid;
2589	break;
2590	}
2591	}
2592
2593	list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
2594	if (cf->tid < first_tid)
2595	continue;
2596
2597	cap = ci->i_auth_cap;
2598	if (!(cap && cap->session == session)) {
2599	pr_err_client(cl, "%p auth cap %p not mds%d ???\n",
2600	inode, cap, session->s_mds);
2601	break;
2602	}
2603
2604	first_tid = cf->tid + `1`;
2605
2606	if (!cf->is_capsnap) {
2607	struct cap_msg_args arg;
2608
2609	doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n",
2610	inode, ceph_vinop(inode), cap, cf->tid,
2611	ceph_cap_string(cf->caps));
2612	__prep_cap(arg: &arg, cap, op: CEPH_CAP_OP_FLUSH,
2613	flags: (cf->tid < last_snap_flush ?
2614	CEPH_CLIENT_CAPS_PENDING_CAPSNAP : `0`),
2615	used: __ceph_caps_used(ci),
2616	want: __ceph_caps_wanted(ci),
2617	retain: (cap->issued \| cap->implemented),
2618	flushing: cf->caps, flush_tid: cf->tid, oldest_flush_tid);
2619	spin_unlock(lock: &ci->i_ceph_lock);
2620	__send_cap(arg: &arg, ci);
2621	} else {
2622	struct ceph_cap_snap *capsnap =
2623	container_of(cf, struct ceph_cap_snap,
2624	cap_flush);
2625	doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n",
2626	inode, ceph_vinop(inode), capsnap, cf->tid,
2627	ceph_cap_string(capsnap->dirty));
2628
2629	refcount_inc(r: &capsnap->nref);
2630	spin_unlock(lock: &ci->i_ceph_lock);
2631
2632	ret = __send_flush_snap(inode, session, capsnap, mseq: cap->mseq,
2633	oldest_flush_tid);
2634	if (ret < `0`) {
2635	pr_err_client(cl, "error sending cap flushsnap,"
2636	" %p %llx.%llx tid %llu follows %llu\n",
2637	inode, ceph_vinop(inode), cf->tid,
2638	capsnap->follows);
2639	}
2640
2641	ceph_put_cap_snap(capsnap);
2642	}
2643
2644	spin_lock(lock: &ci->i_ceph_lock);
2645	}
2646	}
2647
2648	void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
2649	struct ceph_mds_session *session)
2650	{
2651	struct ceph_client *cl = mdsc->fsc->client;
2652	struct ceph_inode_info *ci;
2653	struct ceph_cap *cap;
2654	u64 oldest_flush_tid;
2655
2656	doutc(cl, "mds%d\n", session->s_mds);
2657
2658	spin_lock(lock: &mdsc->cap_dirty_lock);
2659	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2660	spin_unlock(lock: &mdsc->cap_dirty_lock);
2661
2662	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2663	struct inode *inode = &ci->netfs.inode;
2664
2665	spin_lock(lock: &ci->i_ceph_lock);
2666	cap = ci->i_auth_cap;
2667	if (!(cap && cap->session == session)) {
2668	pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
2669	inode, ceph_vinop(inode), cap,
2670	session->s_mds);
2671	spin_unlock(lock: &ci->i_ceph_lock);
2672	continue;
2673	}
2674
2675
2676	/*
2677	* if flushing caps were revoked, we re-send the cap flush
2678	* in client reconnect stage. This guarantees MDS * processes
2679	* the cap flush message before issuing the flushing caps to
2680	* other client.
2681	*/
2682	if ((cap->issued & ci->i_flushing_caps) !=
2683	ci->i_flushing_caps) {
2684	/ encode_caps_cb() also will reset these sequence*
2685	* numbers. make sure sequence numbers in cap flush
2686	* message match later reconnect message */
2687	cap->seq = `0`;
2688	cap->issue_seq = `0`;
2689	cap->mseq = `0`;
2690	__kick_flushing_caps(mdsc, session, ci,
2691	oldest_flush_tid);
2692	} else {
2693	ci->i_ceph_flags \|= CEPH_I_KICK_FLUSH;
2694	}
2695
2696	spin_unlock(lock: &ci->i_ceph_lock);
2697	}
2698	}
2699
2700	void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
2701	struct ceph_mds_session *session)
2702	{
2703	struct ceph_client *cl = mdsc->fsc->client;
2704	struct ceph_inode_info *ci;
2705	struct ceph_cap *cap;
2706	u64 oldest_flush_tid;
2707
2708	lockdep_assert_held(&session->s_mutex);
2709
2710	doutc(cl, "mds%d\n", session->s_mds);
2711
2712	spin_lock(lock: &mdsc->cap_dirty_lock);
2713	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2714	spin_unlock(lock: &mdsc->cap_dirty_lock);
2715
2716	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2717	struct inode *inode = &ci->netfs.inode;
2718
2719	spin_lock(lock: &ci->i_ceph_lock);
2720	cap = ci->i_auth_cap;
2721	if (!(cap && cap->session == session)) {
2722	pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
2723	inode, ceph_vinop(inode), cap,
2724	session->s_mds);
2725	spin_unlock(lock: &ci->i_ceph_lock);
2726	continue;
2727	}
2728	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
2729	__kick_flushing_caps(mdsc, session, ci,
2730	oldest_flush_tid);
2731	}
2732	spin_unlock(lock: &ci->i_ceph_lock);
2733	}
2734	}
2735
2736	void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
2737	struct ceph_inode_info *ci)
2738	{
2739	struct ceph_mds_client *mdsc = session->s_mdsc;
2740	struct ceph_cap *cap = ci->i_auth_cap;
2741	struct inode *inode = &ci->netfs.inode;
2742
2743	lockdep_assert_held(&ci->i_ceph_lock);
2744
2745	doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n",
2746	inode, ceph_vinop(inode),
2747	ceph_cap_string(ci->i_flushing_caps));
2748
2749	if (!list_empty(head: &ci->i_cap_flush_list)) {
2750	u64 oldest_flush_tid;
2751	spin_lock(lock: &mdsc->cap_dirty_lock);
2752	list_move_tail(list: &ci->i_flushing_item,
2753	head: &cap->session->s_cap_flushing);
2754	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2755	spin_unlock(lock: &mdsc->cap_dirty_lock);
2756
2757	__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
2758	}
2759	}
2760
2761
2762	/*
2763	* Take references to capabilities we hold, so that we don't release
2764	* them to the MDS prematurely.
2765	*/
2766	void ceph_take_cap_refs(struct ceph_inode_info ci, int* got,
2767	bool snap_rwsem_locked)
2768	{
2769	struct inode *inode = &ci->netfs.inode;
2770	struct ceph_client *cl = ceph_inode_to_client(inode);
2771
2772	lockdep_assert_held(&ci->i_ceph_lock);
2773
2774	if (got & CEPH_CAP_PIN)
2775	ci->i_pin_ref++;
2776	if (got & CEPH_CAP_FILE_RD)
2777	ci->i_rd_ref++;
2778	if (got & CEPH_CAP_FILE_CACHE)
2779	ci->i_rdcache_ref++;
2780	if (got & CEPH_CAP_FILE_EXCL)
2781	ci->i_fx_ref++;
2782	if (got & CEPH_CAP_FILE_WR) {
2783	if (ci->i_wr_ref == `0` && !ci->i_head_snapc) {
2784	BUG_ON(!snap_rwsem_locked);
2785	ci->i_head_snapc = ceph_get_snap_context(
2786	sc: ci->i_snap_realm->cached_context);
2787	}
2788	ci->i_wr_ref++;
2789	}
2790	if (got & CEPH_CAP_FILE_BUFFER) {
2791	if (ci->i_wb_ref == `0`)
2792	ihold(inode);
2793	ci->i_wb_ref++;
2794	doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
2795	ceph_vinop(inode), ci->i_wb_ref-`1`, ci->i_wb_ref);
2796	}
2797	}
2798
/*
 * Try to grab cap references.  Specify those refs we @want, and the
 * minimal set we @need.  Also include the larger offset we are writing
 * to (when applicable), and check against max_size here as well.
 * Note that caller is responsible for ensuring max_size increases are
 * requested from the MDS.
 *
 * Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
 * or a negative error code. There are 3 special error codes:
 *  -EAGAIN:  need to sleep but non-blocking is specified
 *  -EFBIG:   ask caller to call check_max_size() and try again.
 *  -EUCLEAN: ask caller to call ceph_renew_caps() and try again.
 *
 * The enum below defines extra flag bits that are passed alongside the
 * file mode in the "flags" argument.
 */
enum {
	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
	NON_BLOCKING	= (1 << 8),
	CHECK_FILELOCK	= (1 << 9),
};
2817
2818	static int try_get_cap_refs(struct inode inode, int* need, int want,
2819	loff_t endoff, int flags, int *got)
2820	{
2821	struct ceph_inode_info *ci = ceph_inode(inode);
2822	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
2823	struct ceph_client *cl = ceph_inode_to_client(inode);
2824	int ret = `0`;
2825	int have, implemented;
2826	bool snap_rwsem_locked = false;
2827
2828	doutc(cl, "%p %llx.%llx need %s want %s\n", inode,
2829	ceph_vinop(inode), ceph_cap_string(need),
2830	ceph_cap_string(want));
2831
2832	again:
2833	spin_lock(lock: &ci->i_ceph_lock);
2834
2835	if ((flags & CHECK_FILELOCK) &&
2836	(ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
2837	doutc(cl, "%p %llx.%llx error filelock\n", inode,
2838	ceph_vinop(inode));
2839	ret = -EIO;
2840	goto out_unlock;
2841	}
2842
2843	/ finish pending truncate /
2844	while (ci->i_truncate_pending) {
2845	spin_unlock(lock: &ci->i_ceph_lock);
2846	if (snap_rwsem_locked) {
2847	up_read(sem: &mdsc->snap_rwsem);
2848	snap_rwsem_locked = false;
2849	}
2850	__ceph_do_pending_vmtruncate(inode);
2851	spin_lock(lock: &ci->i_ceph_lock);
2852	}
2853
2854	have = __ceph_caps_issued(ci, implemented: &implemented);
2855
2856	if (have & need & CEPH_CAP_FILE_WR) {
2857	if (endoff >= `0` && endoff > (loff_t)ci->i_max_size) {
2858	doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n",
2859	inode, ceph_vinop(inode), endoff, ci->i_max_size);
2860	if (endoff > ci->i_requested_max_size)
2861	ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
2862	goto out_unlock;
2863	}
2864	/*
2865	* If a sync write is in progress, we must wait, so that we
2866	* can get a final snapshot value for size+mtime.
2867	*/
2868	if (__ceph_have_pending_cap_snap(ci)) {
2869	doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode,
2870	ceph_vinop(inode));
2871	goto out_unlock;
2872	}
2873	}
2874
2875	if ((have & need) == need) {
2876	/*
2877	* Look at (implemented & ~have & not) so that we keep waiting
2878	* on transition from wanted -> needed caps. This is needed
2879	* for WRBUFFER\|WR -> WR to avoid a new WR sync write from
2880	* going before a prior buffered writeback happens.
2881	*
2882	* For RDCACHE\|RD -> RD, there is not need to wait and we can
2883	* just exclude the revoking caps and force to sync read.
2884	*/
2885	int not = want & ~(have & need);
2886	int revoking = implemented & ~have;
2887	int exclude = revoking & not;
2888	doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n",
2889	inode, ceph_vinop(inode), ceph_cap_string(have),
2890	ceph_cap_string(not), ceph_cap_string(revoking));
2891	if (!exclude \|\| !(exclude & CEPH_CAP_FILE_BUFFER)) {
2892	if (!snap_rwsem_locked &&
2893	!ci->i_head_snapc &&
2894	(need & CEPH_CAP_FILE_WR)) {
2895	if (!down_read_trylock(sem: &mdsc->snap_rwsem)) {
2896	/*
2897	* we can not call down_read() when
2898	* task isn't in TASK_RUNNING state
2899	*/
2900	if (flags & NON_BLOCKING) {
2901	ret = -EAGAIN;
2902	goto out_unlock;
2903	}
2904
2905	spin_unlock(lock: &ci->i_ceph_lock);
2906	down_read(sem: &mdsc->snap_rwsem);
2907	snap_rwsem_locked = true;
2908	goto again;
2909	}
2910	snap_rwsem_locked = true;
2911	}
2912	if ((have & want) == want)
2913	*got = need \| (want & ~exclude);
2914	else
2915	*got = need;
2916	ceph_take_cap_refs(ci, got: *got, snap_rwsem_locked: true);
2917	ret = `1`;
2918	}
2919	} else {
2920	int session_readonly = false;
2921	int mds_wanted;
2922	if (ci->i_auth_cap &&
2923	(need & (CEPH_CAP_FILE_WR \| CEPH_CAP_FILE_EXCL))) {
2924	struct ceph_mds_session *s = ci->i_auth_cap->session;
2925	spin_lock(lock: &s->s_cap_lock);
2926	session_readonly = s->s_readonly;
2927	spin_unlock(lock: &s->s_cap_lock);
2928	}
2929	if (session_readonly) {
2930	doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n",
2931	inode, ceph_vinop(inode), ceph_cap_string(need),
2932	ci->i_auth_cap->mds);
2933	ret = -EROFS;
2934	goto out_unlock;
2935	}
2936
2937	if (ceph_inode_is_shutdown(inode)) {
2938	doutc(cl, "%p %llx.%llx inode is shutdown\n",
2939	inode, ceph_vinop(inode));
2940	ret = -ESTALE;
2941	goto out_unlock;
2942	}
2943	mds_wanted = __ceph_caps_mds_wanted(ci, check: false);
2944	if (need & ~mds_wanted) {
2945	doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n",
2946	inode, ceph_vinop(inode), ceph_cap_string(need),
2947	ceph_cap_string(mds_wanted));
2948	ret = -EUCLEAN;
2949	goto out_unlock;
2950	}
2951
2952	doutc(cl, "%p %llx.%llx have %s need %s\n", inode,
2953	ceph_vinop(inode), ceph_cap_string(have),
2954	ceph_cap_string(need));
2955	}
2956	out_unlock:
2957
2958	__ceph_touch_fmode(ci, mdsc, fmode: flags);
2959
2960	spin_unlock(lock: &ci->i_ceph_lock);
2961	if (snap_rwsem_locked)
2962	up_read(sem: &mdsc->snap_rwsem);
2963
2964	if (!ret)
2965	ceph_update_cap_mis(m: &mdsc->metric);
2966	else if (ret == `1`)
2967	ceph_update_cap_hit(m: &mdsc->metric);
2968
2969	doutc(cl, "%p %llx.%llx ret %d got %s\n", inode,
2970	ceph_vinop(inode), ret, ceph_cap_string(*got));
2971	return ret;
2972	}
2973
2974	/*
2975	* Check the offset we are writing up to against our current
2976	* max_size. If necessary, tell the MDS we want to write to
2977	* a larger offset.
2978	*/
2979	static void check_max_size(struct inode *inode, loff_t endoff)
2980	{
2981	struct ceph_inode_info *ci = ceph_inode(inode);
2982	struct ceph_client *cl = ceph_inode_to_client(inode);
2983	int check = `0`;
2984
2985	/ do we need to explicitly request a larger max_size? /
2986	spin_lock(lock: &ci->i_ceph_lock);
2987	if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
2988	doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n",
2989	inode, ceph_vinop(inode), endoff);
2990	ci->i_wanted_max_size = endoff;
2991	}
2992	/ duplicate ceph_check_caps()'s logic /
2993	if (ci->i_auth_cap &&
2994	(ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
2995	ci->i_wanted_max_size > ci->i_max_size &&
2996	ci->i_wanted_max_size > ci->i_requested_max_size)
2997	check = `1`;
2998	spin_unlock(lock: &ci->i_ceph_lock);
2999	if (check)
3000	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
3001	}
3002
3003	static inline int get_used_fmode(int caps)
3004	{
3005	int fmode = `0`;
3006	if (caps & CEPH_CAP_FILE_RD)
3007	fmode \|= CEPH_FILE_MODE_RD;
3008	if (caps & CEPH_CAP_FILE_WR)
3009	fmode \|= CEPH_FILE_MODE_WR;
3010	return fmode;
3011	}
3012
3013	int ceph_try_get_caps(struct inode inode, int* need, int want,
3014	bool nonblock, int *got)
3015	{
3016	int ret, flags;
3017
3018	BUG_ON(need & ~CEPH_CAP_FILE_RD);
3019	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO \|
3020	CEPH_CAP_FILE_SHARED \| CEPH_CAP_FILE_EXCL \|
3021	CEPH_CAP_ANY_DIR_OPS));
3022	if (need) {
3023	ret = ceph_pool_perm_check(inode, need);
3024	if (ret < `0`)
3025	return ret;
3026	}
3027
3028	flags = get_used_fmode(caps: need \| want);
3029	if (nonblock)
3030	flags \|= NON_BLOCKING;
3031
3032	ret = try_get_cap_refs(inode, need, want, endoff: `0`, flags, got);
3033	/ three special error codes /
3034	if (ret == -EAGAIN \|\| ret == -EFBIG \|\| ret == -EUCLEAN)
3035	ret = `0`;
3036	return ret;
3037	}
3038
3039	/*
3040	* Wait for caps, and take cap references. If we can't get a WR cap
3041	* due to a small max_size, make sure we check_max_size (and possibly
3042	* ask the mds) so we don't get hung up indefinitely.
3043	*/
3044	int __ceph_get_caps(struct inode inode, struct* ceph_file_info fi, int* need,
3045	int want, loff_t endoff, int *got)
3046	{
3047	struct ceph_inode_info *ci = ceph_inode(inode);
3048	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
3049	int ret, _got, flags;
3050
3051	ret = ceph_pool_perm_check(inode, need);
3052	if (ret < `0`)
3053	return ret;
3054
3055	if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
3056	fi->filp_gen != READ_ONCE(fsc->filp_gen))
3057	return -EBADF;
3058
3059	flags = get_used_fmode(caps: need \| want);
3060
3061	while (true) {
3062	flags &= CEPH_FILE_MODE_MASK;
3063	if (vfs_inode_has_locks(inode))
3064	flags \|= CHECK_FILELOCK;
3065	_got = `0`;
3066	ret = try_get_cap_refs(inode, need, want, endoff,
3067	flags, got: &_got);
3068	WARN_ON_ONCE(ret == -EAGAIN);
3069	if (!ret) {
3070	struct ceph_mds_client *mdsc = fsc->mdsc;
3071	struct cap_wait cw;
3072	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3073
3074	cw.ino = ceph_ino(inode);
3075	cw.tgid = current->tgid;
3076	cw.need = need;
3077	cw.want = want;
3078
3079	spin_lock(lock: &mdsc->caps_list_lock);
3080	list_add(new: &cw.list, head: &mdsc->cap_wait_list);
3081	spin_unlock(lock: &mdsc->caps_list_lock);
3082
3083	/ make sure used fmode not timeout /
3084	ceph_get_fmode(ci, mode: flags, FMODE_WAIT_BIAS);
3085	add_wait_queue(wq_head: &ci->i_cap_wq, wq_entry: &wait);
3086
3087	flags \|= NON_BLOCKING;
3088	while (!(ret = try_get_cap_refs(inode, need, want,
3089	endoff, flags, got: &_got))) {
3090	if (signal_pending(current)) {
3091	ret = -ERESTARTSYS;
3092	break;
3093	}
3094	wait_woken(wq_entry: &wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3095	}
3096
3097	remove_wait_queue(wq_head: &ci->i_cap_wq, wq_entry: &wait);
3098	ceph_put_fmode(ci, mode: flags, FMODE_WAIT_BIAS);
3099
3100	spin_lock(lock: &mdsc->caps_list_lock);
3101	list_del(entry: &cw.list);
3102	spin_unlock(lock: &mdsc->caps_list_lock);
3103
3104	if (ret == -EAGAIN)
3105	continue;
3106	}
3107
3108	if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
3109	fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
3110	if (ret >= `0` && _got)
3111	ceph_put_cap_refs(ci, had: _got);
3112	return -EBADF;
3113	}
3114
3115	if (ret < `0`) {
3116	if (ret == -EFBIG \|\| ret == -EUCLEAN) {
3117	int ret2 = ceph_wait_on_async_create(inode);
3118	if (ret2 < `0`)
3119	return ret2;
3120	}
3121	if (ret == -EFBIG) {
3122	check_max_size(inode, endoff);
3123	continue;
3124	}
3125	if (ret == -EUCLEAN) {
3126	/ session was killed, try renew caps /
3127	ret = ceph_renew_caps(inode, fmode: flags);
3128	if (ret == `0`)
3129	continue;
3130	}
3131	return ret;
3132	}
3133
3134	if (S_ISREG(ci->netfs.inode.i_mode) &&
3135	ceph_has_inline_data(ci) &&
3136	(_got & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) &&
3137	i_size_read(inode) > `0`) {
3138	struct page *page =
3139	find_get_page(mapping: inode->i_mapping, offset: `0`);
3140	if (page) {
3141	bool uptodate = PageUptodate(page);
3142
3143	put_page(page);
3144	if (uptodate)
3145	break;
3146	}
3147	/*
3148	* drop cap refs first because getattr while
3149	* holding * caps refs can cause deadlock.
3150	*/
3151	ceph_put_cap_refs(ci, had: _got);
3152	_got = `0`;
3153
3154	/*
3155	* getattr request will bring inline data into
3156	* page cache
3157	*/
3158	ret = __ceph_do_getattr(inode, NULL,
3159	CEPH_STAT_CAP_INLINE_DATA,
3160	force: true);
3161	if (ret < `0`)
3162	return ret;
3163	continue;
3164	}
3165	break;
3166	}
3167	*got = _got;
3168	return `0`;
3169	}
3170
3171	int ceph_get_caps(struct file filp, int* need, int want, loff_t endoff,
3172	int *got)
3173	{
3174	struct ceph_file_info *fi = filp->private_data;
3175	struct inode *inode = file_inode(f: filp);
3176
3177	return __ceph_get_caps(inode, fi, need, want, endoff, got);
3178	}
3179
3180	/*
3181	* Take cap refs. Caller must already know we hold at least one ref
3182	* on the caps in question or we don't know this is safe.
3183	*/
3184	void ceph_get_cap_refs(struct ceph_inode_info ci, int* caps)
3185	{
3186	spin_lock(lock: &ci->i_ceph_lock);
3187	ceph_take_cap_refs(ci, got: caps, snap_rwsem_locked: false);
3188	spin_unlock(lock: &ci->i_ceph_lock);
3189	}
3190
3191
3192	/*
3193	* drop cap_snap that is not associated with any snapshot.
3194	* we don't need to send FLUSHSNAP message for it.
3195	*/
3196	static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
3197	struct ceph_cap_snap *capsnap)
3198	{
3199	struct inode *inode = &ci->netfs.inode;
3200	struct ceph_client *cl = ceph_inode_to_client(inode);
3201
3202	if (!capsnap->need_flush &&
3203	!capsnap->writing && !capsnap->dirty_pages) {
3204	doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows);
3205	BUG_ON(capsnap->cap_flush.tid > `0`);
3206	ceph_put_snap_context(sc: capsnap->context);
3207	if (!list_is_last(list: &capsnap->ci_item, head: &ci->i_cap_snaps))
3208	ci->i_ceph_flags \|= CEPH_I_FLUSH_SNAPS;
3209
3210	list_del(entry: &capsnap->ci_item);
3211	ceph_put_cap_snap(capsnap);
3212	return `1`;
3213	}
3214	return `0`;
3215	}
3216
/* Selects how __ceph_put_cap_refs() follows up once refs are dropped. */
enum put_cap_refs_mode {
	/* call ceph_check_caps() / ceph_flush_snaps() directly */
	PUT_CAP_REFS_SYNC = 0,
	/* use ceph_queue_check_caps() / ceph_queue_flush_snaps() instead */
	PUT_CAP_REFS_ASYNC,
};
3221
3222	/*
3223	* Release cap refs.
3224	*
3225	* If we released the last ref on any given cap, call ceph_check_caps
3226	* to release (or schedule a release).
3227	*
3228	* If we are releasing a WR cap (from a sync write), finalize any affected
3229	* cap_snap, and wake up any waiters.
3230	*/
3231	static void __ceph_put_cap_refs(struct ceph_inode_info ci, int* had,
3232	enum put_cap_refs_mode mode)
3233	{
3234	struct inode *inode = &ci->netfs.inode;
3235	struct ceph_client *cl = ceph_inode_to_client(inode);
3236	int last = `0`, put = `0`, flushsnaps = `0`, wake = `0`;
3237	bool check_flushsnaps = false;
3238
3239	spin_lock(lock: &ci->i_ceph_lock);
3240	if (had & CEPH_CAP_PIN)
3241	--ci->i_pin_ref;
3242	if (had & CEPH_CAP_FILE_RD)
3243	if (--ci->i_rd_ref == `0`)
3244	last++;
3245	if (had & CEPH_CAP_FILE_CACHE)
3246	if (--ci->i_rdcache_ref == `0`)
3247	last++;
3248	if (had & CEPH_CAP_FILE_EXCL)
3249	if (--ci->i_fx_ref == `0`)
3250	last++;
3251	if (had & CEPH_CAP_FILE_BUFFER) {
3252	if (--ci->i_wb_ref == `0`) {
3253	last++;
3254	/ put the ref held by ceph_take_cap_refs() /
3255	put++;
3256	check_flushsnaps = true;
3257	}
3258	doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
3259	ceph_vinop(inode), ci->i_wb_ref+`1`, ci->i_wb_ref);
3260	}
3261	if (had & CEPH_CAP_FILE_WR) {
3262	if (--ci->i_wr_ref == `0`) {
3263	/*
3264	* The Fb caps will always be took and released
3265	* together with the Fw caps.
3266	*/
3267	WARN_ON_ONCE(ci->i_wb_ref);
3268
3269	last++;
3270	check_flushsnaps = true;
3271	if (ci->i_wrbuffer_ref_head == `0` &&
3272	ci->i_dirty_caps == `0` &&
3273	ci->i_flushing_caps == `0`) {
3274	BUG_ON(!ci->i_head_snapc);
3275	ceph_put_snap_context(sc: ci->i_head_snapc);
3276	ci->i_head_snapc = NULL;
3277	}
3278	/ see comment in __ceph_remove_cap() /
3279	if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
3280	ceph_change_snap_realm(inode, NULL);
3281	}
3282	}
3283	if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
3284	struct ceph_cap_snap *capsnap =
3285	list_last_entry(&ci->i_cap_snaps,
3286	struct ceph_cap_snap,
3287	ci_item);
3288
3289	capsnap->writing = `0`;
3290	if (ceph_try_drop_cap_snap(ci, capsnap))
3291	/ put the ref held by ceph_queue_cap_snap() /
3292	put++;
3293	else if (__ceph_finish_cap_snap(ci, capsnap))
3294	flushsnaps = `1`;
3295	wake = `1`;
3296	}
3297	spin_unlock(lock: &ci->i_ceph_lock);
3298
3299	doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode),
3300	ceph_cap_string(had), last ? " last" : "", put ? " put" : "");
3301
3302	switch (mode) {
3303	case PUT_CAP_REFS_SYNC:
3304	if (last)
3305	ceph_check_caps(ci, flags: `0`);
3306	else if (flushsnaps)
3307	ceph_flush_snaps(ci, NULL);
3308	break;
3309	case PUT_CAP_REFS_ASYNC:
3310	if (last)
3311	ceph_queue_check_caps(inode);
3312	else if (flushsnaps)
3313	ceph_queue_flush_snaps(inode);
3314	break;
3315	default:
3316	break;
3317	}
3318	if (wake)
3319	wake_up_all(&ci->i_cap_wq);
3320	while (put-- > `0`)
3321	iput(inode);
3322	}
3323
3324	void ceph_put_cap_refs(struct ceph_inode_info ci, int* had)
3325	{
3326	__ceph_put_cap_refs(ci, had, mode: PUT_CAP_REFS_SYNC);
3327	}
3328
3329	void ceph_put_cap_refs_async(struct ceph_inode_info ci, int* had)
3330	{
3331	__ceph_put_cap_refs(ci, had, mode: PUT_CAP_REFS_ASYNC);
3332	}
3333
3334	/*
3335	* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
3336	* context. Adjust per-snap dirty page accounting as appropriate.
3337	* Once all dirty data for a cap_snap is flushed, flush snapped file
3338	* metadata back to the MDS. If we dropped the last ref, call
3339	* ceph_check_caps.
3340	*/
3341	void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info ci, int* nr,
3342	struct ceph_snap_context *snapc)
3343	{
3344	struct inode *inode = &ci->netfs.inode;
3345	struct ceph_client *cl = ceph_inode_to_client(inode);
3346	struct ceph_cap_snap capsnap = NULL, iter;
3347	int put = `0`;
3348	bool last = false;
3349	bool flush_snaps = false;
3350	bool complete_capsnap = false;
3351
3352	spin_lock(lock: &ci->i_ceph_lock);
3353	ci->i_wrbuffer_ref -= nr;
3354	if (ci->i_wrbuffer_ref == `0`) {
3355	last = true;
3356	put++;
3357	}
3358
3359	if (ci->i_head_snapc == snapc) {
3360	ci->i_wrbuffer_ref_head -= nr;
3361	if (ci->i_wrbuffer_ref_head == `0` &&
3362	ci->i_wr_ref == `0` &&
3363	ci->i_dirty_caps == `0` &&
3364	ci->i_flushing_caps == `0`) {
3365	BUG_ON(!ci->i_head_snapc);
3366	ceph_put_snap_context(sc: ci->i_head_snapc);
3367	ci->i_head_snapc = NULL;
3368	}
3369	doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n",
3370	inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr,
3371	ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref,
3372	ci->i_wrbuffer_ref_head, last ? " LAST" : "");
3373	} else {
3374	list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
3375	if (iter->context == snapc) {
3376	capsnap = iter;
3377	break;
3378	}
3379	}
3380
3381	if (!capsnap) {
3382	/*
3383	* The capsnap should already be removed when removing
3384	* auth cap in the case of a forced unmount.
3385	*/
3386	WARN_ON_ONCE(ci->i_auth_cap);
3387	goto unlock;
3388	}
3389
3390	capsnap->dirty_pages -= nr;
3391	if (capsnap->dirty_pages == `0`) {
3392	complete_capsnap = true;
3393	if (!capsnap->writing) {
3394	if (ceph_try_drop_cap_snap(ci, capsnap)) {
3395	put++;
3396	} else {
3397	ci->i_ceph_flags \|= CEPH_I_FLUSH_SNAPS;
3398	flush_snaps = true;
3399	}
3400	}
3401	}
3402	doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n",
3403	inode, ceph_vinop(inode), capsnap, capsnap->context->seq,
3404	ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
3405	ci->i_wrbuffer_ref, capsnap->dirty_pages,
3406	last ? " (wrbuffer last)" : "",
3407	complete_capsnap ? " (complete capsnap)" : "");
3408	}
3409
3410	unlock:
3411	spin_unlock(lock: &ci->i_ceph_lock);
3412
3413	if (last) {
3414	ceph_check_caps(ci, flags: `0`);
3415	} else if (flush_snaps) {
3416	ceph_flush_snaps(ci, NULL);
3417	}
3418	if (complete_capsnap)
3419	wake_up_all(&ci->i_cap_wq);
3420	while (put-- > `0`) {
3421	iput(inode);
3422	}
3423	}
3424
3425	/*
3426	* Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
3427	*/
3428	static void invalidate_aliases(struct inode *inode)
3429	{
3430	struct ceph_client *cl = ceph_inode_to_client(inode);
3431	struct dentry dn, prev = NULL;
3432
3433	doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
3434	d_prune_aliases(inode);
3435	/*
3436	* For non-directory inode, d_find_alias() only returns
3437	* hashed dentry. After calling d_invalidate(), the
3438	* dentry becomes unhashed.
3439	*
3440	* For directory inode, d_find_alias() can return
3441	* unhashed dentry. But directory inode should have
3442	* one alias at most.
3443	*/
3444	while ((dn = d_find_alias(inode))) {
3445	if (dn == prev) {
3446	dput(dn);
3447	break;
3448	}
3449	d_invalidate(dn);
3450	if (prev)
3451	dput(prev);
3452	prev = dn;
3453	}
3454	if (prev)
3455	dput(prev);
3456	}
3457
3458	struct cap_extra_info {
3459	struct ceph_string *pool_ns;
3460	/ inline data /
3461	u64 inline_version;
3462	void *inline_data;
3463	u32 inline_len;
3464	/ dirstat /
3465	bool dirstat_valid;
3466	u64 nfiles;
3467	u64 nsubdirs;
3468	u64 change_attr;
3469	/ currently issued /
3470	int issued;
3471	struct timespec64 btime;
3472	u8 *fscrypt_auth;
3473	u32 fscrypt_auth_len;
3474	u64 fscrypt_file_size;
3475	};
3476
3477	/*
3478	* Handle a cap GRANT message from the MDS. (Note that a GRANT may
3479	* actually be a revocation if it specifies a smaller cap set.)
3480	*
3481	* caller holds s_mutex and i_ceph_lock, we drop both.
3482	*/
3483	static void handle_cap_grant(struct inode *inode,
3484	struct ceph_mds_session *session,
3485	struct ceph_cap *cap,
3486	struct ceph_mds_caps *grant,
3487	struct ceph_buffer *xattr_buf,
3488	struct cap_extra_info *extra_info)
3489	__releases(ci->i_ceph_lock)
3490	__releases(session->s_mdsc->snap_rwsem)
3491	{
3492	struct ceph_client *cl = ceph_inode_to_client(inode);
3493	struct ceph_inode_info *ci = ceph_inode(inode);
3494	int seq = le32_to_cpu(grant->seq);
3495	int newcaps = le32_to_cpu(grant->caps);
3496	int used, wanted, dirty;
3497	u64 size = le64_to_cpu(grant->size);
3498	u64 max_size = le64_to_cpu(grant->max_size);
3499	unsigned char check_caps = `0`;
3500	bool was_stale = cap->cap_gen < atomic_read(v: &session->s_cap_gen);
3501	bool wake = false;
3502	bool writeback = false;
3503	bool queue_trunc = false;
3504	bool queue_invalidate = false;
3505	bool deleted_inode = false;
3506	bool fill_inline = false;
3507
3508	/*
3509	* If there is at least one crypto block then we'll trust
3510	* fscrypt_file_size. If the real length of the file is 0, then
3511	* ignore it (it has probably been truncated down to 0 by the MDS).
3512	*/
3513	if (IS_ENCRYPTED(inode) && size)
3514	size = extra_info->fscrypt_file_size;
3515
3516	doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode,
3517	ceph_vinop(inode), cap, session->s_mds, seq,
3518	ceph_cap_string(newcaps));
3519	doutc(cl, " size %llu max_size %llu, i_size %llu\n", size,
3520	max_size, i_size_read(inode));
3521
3522
3523	/*
3524	* If CACHE is being revoked, and we have no dirty buffers,
3525	* try to invalidate (once). (If there are dirty buffers, we
3526	* will invalidate _after_ writeback.)
3527	*/
3528	if (S_ISREG(inode->i_mode) && / don't invalidate readdir cache /
3529	((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
3530	(newcaps & CEPH_CAP_FILE_LAZYIO) == `0` &&
3531	!(ci->i_wrbuffer_ref \|\| ci->i_wb_ref)) {
3532	if (try_nonblocking_invalidate(inode)) {
3533	/ there were locked pages.. invalidate later*
3534	in a separate thread. /*
3535	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
3536	queue_invalidate = true;
3537	ci->i_rdcache_revoking = ci->i_rdcache_gen;
3538	}
3539	}
3540	}
3541
3542	if (was_stale)
3543	cap->issued = cap->implemented = CEPH_CAP_PIN;
3544
3545	/*
3546	* auth mds of the inode changed. we received the cap export message,
3547	* but still haven't received the cap import message. handle_cap_export
3548	* updated the new auth MDS' cap.
3549	*
3550	* "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
3551	* that was sent before the cap import message. So don't remove caps.
3552	*/
3553	if (ceph_seq_cmp(a: seq, b: cap->seq) <= `0`) {
3554	WARN_ON(cap != ci->i_auth_cap);
3555	WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
3556	seq = cap->seq;
3557	newcaps \|= cap->issued;
3558	}
3559
3560	/ side effects now are allowed /
3561	cap->cap_gen = atomic_read(v: &session->s_cap_gen);
3562	cap->seq = seq;
3563
3564	__check_cap_issue(ci, cap, issued: newcaps);
3565
3566	inode_set_max_iversion_raw(inode, val: extra_info->change_attr);
3567
3568	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
3569	(extra_info->issued & CEPH_CAP_AUTH_EXCL) == `0`) {
3570	umode_t mode = le32_to_cpu(grant->mode);
3571
3572	if (inode_wrong_type(inode, mode))
3573	pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
3574	ceph_vinop(inode), inode->i_mode, mode);
3575	else
3576	inode->i_mode = mode;
3577	inode->i_uid = make_kuid(from: &init_user_ns, le32_to_cpu(grant->uid));
3578	inode->i_gid = make_kgid(from: &init_user_ns, le32_to_cpu(grant->gid));
3579	ci->i_btime = extra_info->btime;
3580	doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
3581	ceph_vinop(inode), inode->i_mode,
3582	from_kuid(&init_user_ns, inode->i_uid),
3583	from_kgid(&init_user_ns, inode->i_gid));
3584	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
3585	if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len \|\|
3586	memcmp(p: ci->fscrypt_auth, q: extra_info->fscrypt_auth,
3587	size: ci->fscrypt_auth_len))
3588	pr_warn_ratelimited_client(cl,
3589	"cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
3590	ci->fscrypt_auth_len,
3591	extra_info->fscrypt_auth_len);
3592	#endif
3593	}
3594
3595	if ((newcaps & CEPH_CAP_LINK_SHARED) &&
3596	(extra_info->issued & CEPH_CAP_LINK_EXCL) == `0`) {
3597	set_nlink(inode, le32_to_cpu(grant->nlink));
3598	if (inode->i_nlink == `0`)
3599	deleted_inode = true;
3600	}
3601
3602	if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == `0` &&
3603	grant->xattr_len) {
3604	int len = le32_to_cpu(grant->xattr_len);
3605	u64 version = le64_to_cpu(grant->xattr_version);
3606
3607	if (version > ci->i_xattrs.version) {
3608	doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n",
3609	version, inode, ceph_vinop(inode), len);
3610	if (ci->i_xattrs.blob)
3611	ceph_buffer_put(b: ci->i_xattrs.blob);
3612	ci->i_xattrs.blob = ceph_buffer_get(b: xattr_buf);
3613	ci->i_xattrs.version = version;
3614	ceph_forget_all_cached_acls(inode);
3615	ceph_security_invalidate_secctx(inode);
3616	}
3617	}
3618
3619	if (newcaps & CEPH_CAP_ANY_RD) {
3620	struct timespec64 mtime, atime, ctime;
3621	/ ctime/mtime/atime? /
3622	ceph_decode_timespec64(ts: &mtime, tv: &grant->mtime);
3623	ceph_decode_timespec64(ts: &atime, tv: &grant->atime);
3624	ceph_decode_timespec64(ts: &ctime, tv: &grant->ctime);
3625	ceph_fill_file_time(inode, issued: extra_info->issued,
3626	le32_to_cpu(grant->time_warp_seq),
3627	ctime: &ctime, mtime: &mtime, atime: &atime);
3628	}
3629
3630	if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
3631	ci->i_files = extra_info->nfiles;
3632	ci->i_subdirs = extra_info->nsubdirs;
3633	}
3634
3635	if (newcaps & (CEPH_CAP_ANY_FILE_RD \| CEPH_CAP_ANY_FILE_WR)) {
3636	/ file layout may have changed /
3637	s64 old_pool = ci->i_layout.pool_id;
3638	struct ceph_string *old_ns;
3639
3640	ceph_file_layout_from_legacy(fl: &ci->i_layout, legacy: &grant->layout);
3641	old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
3642	lockdep_is_held(&ci->i_ceph_lock));
3643	rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);
3644
3645	if (ci->i_layout.pool_id != old_pool \|\|
3646	extra_info->pool_ns != old_ns)
3647	ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
3648
3649	extra_info->pool_ns = old_ns;
3650
3651	/ size/truncate_seq? /
3652	queue_trunc = ceph_fill_file_size(inode, issued: extra_info->issued,
3653	le32_to_cpu(grant->truncate_seq),
3654	le64_to_cpu(grant->truncate_size),
3655	size);
3656	}
3657
3658	if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
3659	if (max_size != ci->i_max_size) {
3660	doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size,
3661	max_size);
3662	ci->i_max_size = max_size;
3663	if (max_size >= ci->i_wanted_max_size) {
3664	ci->i_wanted_max_size = `0`; / reset /
3665	ci->i_requested_max_size = `0`;
3666	}
3667	wake = true;
3668	}
3669	}
3670
3671	/ check cap bits /
3672	wanted = __ceph_caps_wanted(ci);
3673	used = __ceph_caps_used(ci);
3674	dirty = __ceph_caps_dirty(ci);
3675	doutc(cl, " my wanted = %s, used = %s, dirty %s\n",
3676	ceph_cap_string(wanted), ceph_cap_string(used),
3677	ceph_cap_string(dirty));
3678
3679	if ((was_stale \|\| le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
3680	(wanted & ~(cap->mds_wanted \| newcaps))) {
3681	/*
3682	* If mds is importing cap, prior cap messages that update
3683	* 'wanted' may get dropped by mds (migrate seq mismatch).
3684	*
3685	* We don't send cap message to update 'wanted' if what we
3686	* want are already issued. If mds revokes caps, cap message
3687	* that releases caps also tells mds what we want. But if
3688	* caps got revoked by mds forcedly (session stale). We may
3689	* haven't told mds what we want.
3690	*/
3691	check_caps = `1`;
3692	}
3693
3694	/ revocation, grant, or no-op? /
3695	if (cap->issued & ~newcaps) {
3696	int revoking = cap->issued & ~newcaps;
3697
3698	doutc(cl, "revocation: %s -> %s (revoking %s)\n",
3699	ceph_cap_string(cap->issued), ceph_cap_string(newcaps),
3700	ceph_cap_string(revoking));
3701	if (S_ISREG(inode->i_mode) &&
3702	(revoking & used & CEPH_CAP_FILE_BUFFER))
3703	writeback = true; / initiate writeback; will delay ack /
3704	else if (queue_invalidate &&
3705	revoking == CEPH_CAP_FILE_CACHE &&
3706	(newcaps & CEPH_CAP_FILE_LAZYIO) == `0`)
3707	; / do nothing yet, invalidation will be queued /
3708	else if (cap == ci->i_auth_cap)
3709	check_caps = `1`; / check auth cap only /
3710	else
3711	check_caps = `2`; / check all caps /
3712	/ If there is new caps, try to wake up the waiters /
3713	if (~cap->issued & newcaps)
3714	wake = true;
3715	cap->issued = newcaps;
3716	cap->implemented \|= newcaps;
3717	} else if (cap->issued == newcaps) {
3718	doutc(cl, "caps unchanged: %s -> %s\n",
3719	ceph_cap_string(cap->issued),
3720	ceph_cap_string(newcaps));
3721	} else {
3722	doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued),
3723	ceph_cap_string(newcaps));
3724	/ non-auth MDS is revoking the newly grant caps ? /
3725	if (cap == ci->i_auth_cap &&
3726	__ceph_caps_revoking_other(ci, ocap: cap, mask: newcaps))
3727	check_caps = `2`;
3728
3729	cap->issued = newcaps;
3730	cap->implemented \|= newcaps; / add bits only, to*
3731	* avoid stepping on a
3732	* pending revocation */
3733	wake = true;
3734	}
3735	BUG_ON(cap->issued & ~cap->implemented);
3736
3737	/ don't let check_caps skip sending a response to MDS for revoke msgs /
3738	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
3739	cap->mds_wanted = `0`;
3740	if (cap == ci->i_auth_cap)
3741	check_caps = `1`; / check auth cap only /
3742	else
3743	check_caps = `2`; / check all caps /
3744	}
3745
3746	if (extra_info->inline_version > `0` &&
3747	extra_info->inline_version >= ci->i_inline_version) {
3748	ci->i_inline_version = extra_info->inline_version;
3749	if (ci->i_inline_version != CEPH_INLINE_NONE &&
3750	(newcaps & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)))
3751	fill_inline = true;
3752	}
3753
3754	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
3755	if (ci->i_auth_cap == cap) {
3756	if (newcaps & ~extra_info->issued)
3757	wake = true;
3758
3759	if (ci->i_requested_max_size > max_size \|\|
3760	!(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
3761	/ re-request max_size if necessary /
3762	ci->i_requested_max_size = `0`;
3763	wake = true;
3764	}
3765
3766	ceph_kick_flushing_inode_caps(session, ci);
3767	}
3768	up_read(sem: &session->s_mdsc->snap_rwsem);
3769	}
3770	spin_unlock(lock: &ci->i_ceph_lock);
3771
3772	if (fill_inline)
3773	ceph_fill_inline_data(inode, NULL, data: extra_info->inline_data,
3774	len: extra_info->inline_len);
3775
3776	if (queue_trunc)
3777	ceph_queue_vmtruncate(inode);
3778
3779	if (writeback)
3780	/*
3781	* queue inode for writeback: we can't actually call
3782	* filemap_write_and_wait, etc. from message handler
3783	* context.
3784	*/
3785	ceph_queue_writeback(inode);
3786	if (queue_invalidate)
3787	ceph_queue_invalidate(inode);
3788	if (deleted_inode)
3789	invalidate_aliases(inode);
3790	if (wake)
3791	wake_up_all(&ci->i_cap_wq);
3792
3793	mutex_unlock(lock: &session->s_mutex);
3794	if (check_caps == `1`)
3795	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY \| CHECK_CAPS_NOINVAL);
3796	else if (check_caps == `2`)
3797	ceph_check_caps(ci, CHECK_CAPS_NOINVAL);
3798	}
3799
3800	/*
3801	* Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
3802	* MDS has been safely committed.
3803	*/
3804	static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
3805	struct ceph_mds_caps *m,
3806	struct ceph_mds_session *session,
3807	struct ceph_cap *cap)
3808	__releases(ci->i_ceph_lock)
3809	{
3810	struct ceph_inode_info *ci = ceph_inode(inode);
3811	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb: inode->i_sb)->mdsc;
3812	struct ceph_client *cl = mdsc->fsc->client;
3813	struct ceph_cap_flush cf, tmp_cf;
3814	LIST_HEAD(to_remove);
3815	unsigned seq = le32_to_cpu(m->seq);
3816	int dirty = le32_to_cpu(m->dirty);
3817	int cleaned = `0`;
3818	bool drop = false;
3819	bool wake_ci = false;
3820	bool wake_mdsc = false;
3821
3822	list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
3823	/ Is this the one that was flushed? /
3824	if (cf->tid == flush_tid)
3825	cleaned = cf->caps;
3826
3827	/ Is this a capsnap? /
3828	if (cf->is_capsnap)
3829	continue;
3830
3831	if (cf->tid <= flush_tid) {
3832	/*
3833	* An earlier or current tid. The FLUSH_ACK should
3834	* represent a superset of this flush's caps.
3835	*/
3836	wake_ci \|= __detach_cap_flush_from_ci(ci, cf);
3837	list_add_tail(new: &cf->i_list, head: &to_remove);
3838	} else {
3839	/*
3840	* This is a later one. Any caps in it are still dirty
3841	* so don't count them as cleaned.
3842	*/
3843	cleaned &= ~cf->caps;
3844	if (!cleaned)
3845	break;
3846	}
3847	}
3848
3849	doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n",
3850	inode, ceph_vinop(inode), session->s_mds, seq,
3851	ceph_cap_string(dirty), ceph_cap_string(cleaned),
3852	ceph_cap_string(ci->i_flushing_caps),
3853	ceph_cap_string(ci->i_flushing_caps & ~cleaned));
3854
3855	if (list_empty(head: &to_remove) && !cleaned)
3856	goto out;
3857
3858	ci->i_flushing_caps &= ~cleaned;
3859
3860	spin_lock(lock: &mdsc->cap_dirty_lock);
3861
3862	list_for_each_entry(cf, &to_remove, i_list)
3863	wake_mdsc \|= __detach_cap_flush_from_mdsc(mdsc, cf);
3864
3865	if (ci->i_flushing_caps == `0`) {
3866	if (list_empty(head: &ci->i_cap_flush_list)) {
3867	list_del_init(entry: &ci->i_flushing_item);
3868	if (!list_empty(head: &session->s_cap_flushing)) {
3869	struct inode *inode =
3870	&list_first_entry(&session->s_cap_flushing,
3871	struct ceph_inode_info,
3872	i_flushing_item)->netfs.inode;
3873	doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n",
3874	session->s_mds, inode, ceph_vinop(inode));
3875	}
3876	}
3877	mdsc->num_cap_flushing--;
3878	doutc(cl, " %p %llx.%llx now !flushing\n", inode,
3879	ceph_vinop(inode));
3880
3881	if (ci->i_dirty_caps == `0`) {
3882	doutc(cl, " %p %llx.%llx now clean\n", inode,
3883	ceph_vinop(inode));
3884	BUG_ON(!list_empty(&ci->i_dirty_item));
3885	drop = true;
3886	if (ci->i_wr_ref == `0` &&
3887	ci->i_wrbuffer_ref_head == `0`) {
3888	BUG_ON(!ci->i_head_snapc);
3889	ceph_put_snap_context(sc: ci->i_head_snapc);
3890	ci->i_head_snapc = NULL;
3891	}
3892	} else {
3893	BUG_ON(list_empty(&ci->i_dirty_item));
3894	}
3895	}
3896	spin_unlock(lock: &mdsc->cap_dirty_lock);
3897
3898	out:
3899	spin_unlock(lock: &ci->i_ceph_lock);
3900
3901	while (!list_empty(head: &to_remove)) {
3902	cf = list_first_entry(&to_remove,
3903	struct ceph_cap_flush, i_list);
3904	list_del_init(entry: &cf->i_list);
3905	if (!cf->is_capsnap)
3906	ceph_free_cap_flush(cf);
3907	}
3908
3909	if (wake_ci)
3910	wake_up_all(&ci->i_cap_wq);
3911	if (wake_mdsc)
3912	wake_up_all(&mdsc->cap_flushing_wq);
3913	if (drop)
3914	iput(inode);
3915	}
3916
3917	void __ceph_remove_capsnap(struct inode inode, struct* ceph_cap_snap *capsnap,
3918	bool wake_ci, bool wake_mdsc)
3919	{
3920	struct ceph_inode_info *ci = ceph_inode(inode);
3921	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb: inode->i_sb)->mdsc;
3922	struct ceph_client *cl = mdsc->fsc->client;
3923	bool ret;
3924
3925	lockdep_assert_held(&ci->i_ceph_lock);
3926
3927	doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap,
3928	inode, ceph_vinop(inode), ci);
3929
3930	list_del_init(entry: &capsnap->ci_item);
3931	ret = __detach_cap_flush_from_ci(ci, cf: &capsnap->cap_flush);
3932	if (wake_ci)
3933	*wake_ci = ret;
3934
3935	spin_lock(lock: &mdsc->cap_dirty_lock);
3936	if (list_empty(head: &ci->i_cap_flush_list))
3937	list_del_init(entry: &ci->i_flushing_item);
3938
3939	ret = __detach_cap_flush_from_mdsc(mdsc, cf: &capsnap->cap_flush);
3940	if (wake_mdsc)
3941	*wake_mdsc = ret;
3942	spin_unlock(lock: &mdsc->cap_dirty_lock);
3943	}
3944
3945	void ceph_remove_capsnap(struct inode inode, struct* ceph_cap_snap *capsnap,
3946	bool wake_ci, bool wake_mdsc)
3947	{
3948	struct ceph_inode_info *ci = ceph_inode(inode);
3949
3950	lockdep_assert_held(&ci->i_ceph_lock);
3951
3952	WARN_ON_ONCE(capsnap->dirty_pages \|\| capsnap->writing);
3953	__ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
3954	}
3955
3956	/*
3957	* Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
3958	* throw away our cap_snap.
3959	*
3960	* Caller hold s_mutex.
3961	*/
3962	static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
3963	struct ceph_mds_caps *m,
3964	struct ceph_mds_session *session)
3965	{
3966	struct ceph_inode_info *ci = ceph_inode(inode);
3967	struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb: inode->i_sb)->mdsc;
3968	struct ceph_client *cl = mdsc->fsc->client;
3969	u64 follows = le64_to_cpu(m->snap_follows);
3970	struct ceph_cap_snap capsnap = NULL, iter;
3971	bool wake_ci = false;
3972	bool wake_mdsc = false;
3973
3974	doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode,
3975	ceph_vinop(inode), ci, session->s_mds, follows);
3976
3977	spin_lock(lock: &ci->i_ceph_lock);
3978	list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
3979	if (iter->follows == follows) {
3980	if (iter->cap_flush.tid != flush_tid) {
3981	doutc(cl, " cap_snap %p follows %lld "
3982	"tid %lld != %lld\n", iter,
3983	follows, flush_tid,
3984	iter->cap_flush.tid);
3985	break;
3986	}
3987	capsnap = iter;
3988	break;
3989	} else {
3990	doutc(cl, " skipping cap_snap %p follows %lld\n",
3991	iter, iter->follows);
3992	}
3993	}
3994	if (capsnap)
3995	ceph_remove_capsnap(inode, capsnap, wake_ci: &wake_ci, wake_mdsc: &wake_mdsc);
3996	spin_unlock(lock: &ci->i_ceph_lock);
3997
3998	if (capsnap) {
3999	ceph_put_snap_context(sc: capsnap->context);
4000	ceph_put_cap_snap(capsnap);
4001	if (wake_ci)
4002	wake_up_all(&ci->i_cap_wq);
4003	if (wake_mdsc)
4004	wake_up_all(&mdsc->cap_flushing_wq);
4005	iput(inode);
4006	}
4007	}
4008
4009	/*
4010	* Handle TRUNC from MDS, indicating file truncation.
4011	*
4012	* caller hold s_mutex.
4013	*/
4014	static bool handle_cap_trunc(struct inode *inode,
4015	struct ceph_mds_caps *trunc,
4016	struct ceph_mds_session *session,
4017	struct cap_extra_info *extra_info)
4018	{
4019	struct ceph_inode_info *ci = ceph_inode(inode);
4020	struct ceph_client *cl = ceph_inode_to_client(inode);
4021	int mds = session->s_mds;
4022	int seq = le32_to_cpu(trunc->seq);
4023	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
4024	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
4025	u64 size = le64_to_cpu(trunc->size);
4026	int implemented = `0`;
4027	int dirty = __ceph_caps_dirty(ci);
4028	int issued = __ceph_caps_issued(ci: ceph_inode(inode), implemented: &implemented);
4029	bool queue_trunc = false;
4030
4031	lockdep_assert_held(&ci->i_ceph_lock);
4032
4033	issued \|= implemented \| dirty;
4034
4035	/*
4036	* If there is at least one crypto block then we'll trust
4037	* fscrypt_file_size. If the real length of the file is 0, then
4038	* ignore it (it has probably been truncated down to 0 by the MDS).
4039	*/
4040	if (IS_ENCRYPTED(inode) && size)
4041	size = extra_info->fscrypt_file_size;
4042
4043	doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n",
4044	inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq);
4045	queue_trunc = ceph_fill_file_size(inode, issued,
4046	truncate_seq, truncate_size, size);
4047	return queue_trunc;
4048	}
4049
4050	/*
4051	* Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
4052	* different one. If we are the most recent migration we've seen (as
4053	* indicated by mseq), make note of the migrating cap bits for the
4054	* duration (until we see the corresponding IMPORT).
4055	*
4056	* caller holds s_mutex
4057	*/
4058	static void handle_cap_export(struct inode inode, struct* ceph_mds_caps *ex,
4059	struct ceph_mds_cap_peer *ph,
4060	struct ceph_mds_session *session)
4061	{
4062	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
4063	struct ceph_client *cl = mdsc->fsc->client;
4064	struct ceph_mds_session *tsession = NULL;
4065	struct ceph_cap cap, tcap, *new_cap = NULL;
4066	struct ceph_inode_info *ci = ceph_inode(inode);
4067	u64 t_cap_id;
4068	unsigned mseq = le32_to_cpu(ex->migrate_seq);
4069	unsigned t_seq, t_mseq;
4070	int target, issued;
4071	int mds = session->s_mds;
4072
4073	if (ph) {
4074	t_cap_id = le64_to_cpu(ph->cap_id);
4075	t_seq = le32_to_cpu(ph->seq);
4076	t_mseq = le32_to_cpu(ph->mseq);
4077	target = le32_to_cpu(ph->mds);
4078	} else {
4079	t_cap_id = t_seq = t_mseq = `0`;
4080	target = -`1`;
4081	}
4082
4083	doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n",
4084	inode, ceph_vinop(inode), ci, mds, mseq, target);
4085	retry:
4086	down_read(sem: &mdsc->snap_rwsem);
4087	spin_lock(lock: &ci->i_ceph_lock);
4088	cap = __get_cap_for_mds(ci, mds);
4089	if (!cap \|\| cap->cap_id != le64_to_cpu(ex->cap_id))
4090	goto out_unlock;
4091
4092	if (target < `0`) {
4093	ceph_remove_cap(mdsc, cap, queue_release: false);
4094	goto out_unlock;
4095	}
4096
4097	/*
4098	* now we know we haven't received the cap import message yet
4099	* because the exported cap still exist.
4100	*/
4101
4102	issued = cap->issued;
4103	if (issued != cap->implemented)
4104	pr_err_ratelimited_client(cl, "issued != implemented: "
4105	"%p %llx.%llx mds%d seq %d mseq %d"
4106	" issued %s implemented %s\n",
4107	inode, ceph_vinop(inode), mds,
4108	cap->seq, cap->mseq,
4109	ceph_cap_string(issued),
4110	ceph_cap_string(cap->implemented));
4111
4112
4113	tcap = __get_cap_for_mds(ci, mds: target);
4114	if (tcap) {
4115	/ already have caps from the target /
4116	if (tcap->cap_id == t_cap_id &&
4117	ceph_seq_cmp(a: tcap->seq, b: t_seq) < `0`) {
4118	doutc(cl, " updating import cap %p mds%d\n", tcap,
4119	target);
4120	tcap->cap_id = t_cap_id;
4121	tcap->seq = t_seq - `1`;
4122	tcap->issue_seq = t_seq - `1`;
4123	tcap->issued \|= issued;
4124	tcap->implemented \|= issued;
4125	if (cap == ci->i_auth_cap) {
4126	ci->i_auth_cap = tcap;
4127	change_auth_cap_ses(ci, session: tcap->session);
4128	}
4129	}
4130	ceph_remove_cap(mdsc, cap, queue_release: false);
4131	goto out_unlock;
4132	} else if (tsession) {
4133	/ add placeholder for the export tagert /
4134	int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : `0`;
4135	tcap = new_cap;
4136	ceph_add_cap(inode, session: tsession, cap_id: t_cap_id, issued, wanted: `0`,
4137	seq: t_seq - `1`, mseq: t_mseq, realmino: (u64)-`1`, flags: flag, new_cap: &new_cap);
4138
4139	if (!list_empty(head: &ci->i_cap_flush_list) &&
4140	ci->i_auth_cap == tcap) {
4141	spin_lock(lock: &mdsc->cap_dirty_lock);
4142	list_move_tail(list: &ci->i_flushing_item,
4143	head: &tcap->session->s_cap_flushing);
4144	spin_unlock(lock: &mdsc->cap_dirty_lock);
4145	}
4146
4147	ceph_remove_cap(mdsc, cap, queue_release: false);
4148	goto out_unlock;
4149	}
4150
4151	spin_unlock(lock: &ci->i_ceph_lock);
4152	up_read(sem: &mdsc->snap_rwsem);
4153	mutex_unlock(lock: &session->s_mutex);
4154
4155	/ open target session /
4156	tsession = ceph_mdsc_open_export_target_session(mdsc, target);
4157	if (!IS_ERR(ptr: tsession)) {
4158	if (mds > target) {
4159	mutex_lock(&session->s_mutex);
4160	mutex_lock_nested(lock: &tsession->s_mutex,
4161	SINGLE_DEPTH_NESTING);
4162	} else {
4163	mutex_lock(&tsession->s_mutex);
4164	mutex_lock_nested(lock: &session->s_mutex,
4165	SINGLE_DEPTH_NESTING);
4166	}
4167	new_cap = ceph_get_cap(mdsc, NULL);
4168	} else {
4169	WARN_ON(`1`);
4170	tsession = NULL;
4171	target = -`1`;
4172	mutex_lock(&session->s_mutex);
4173	}
4174	goto retry;
4175
4176	out_unlock:
4177	spin_unlock(lock: &ci->i_ceph_lock);
4178	up_read(sem: &mdsc->snap_rwsem);
4179	mutex_unlock(lock: &session->s_mutex);
4180	if (tsession) {
4181	mutex_unlock(lock: &tsession->s_mutex);
4182	ceph_put_mds_session(s: tsession);
4183	}
4184	if (new_cap)
4185	ceph_put_cap(mdsc, cap: new_cap);
4186	}
4187
4188	/*
4189	* Handle cap IMPORT.
4190	*
4191	* caller holds s_mutex. acquires i_ceph_lock
4192	*/
4193	static void handle_cap_import(struct ceph_mds_client *mdsc,
4194	struct inode inode, struct* ceph_mds_caps *im,
4195	struct ceph_mds_cap_peer *ph,
4196	struct ceph_mds_session *session,
4197	struct ceph_cap *target_cap, int* *old_issued)
4198	{
4199	struct ceph_inode_info *ci = ceph_inode(inode);
4200	struct ceph_client *cl = mdsc->fsc->client;
4201	struct ceph_cap cap, ocap, *new_cap = NULL;
4202	int mds = session->s_mds;
4203	int issued;
4204	unsigned caps = le32_to_cpu(im->caps);
4205	unsigned wanted = le32_to_cpu(im->wanted);
4206	unsigned seq = le32_to_cpu(im->seq);
4207	unsigned mseq = le32_to_cpu(im->migrate_seq);
4208	u64 realmino = le64_to_cpu(im->realm);
4209	u64 cap_id = le64_to_cpu(im->cap_id);
4210	u64 p_cap_id;
4211	int peer;
4212
4213	if (ph) {
4214	p_cap_id = le64_to_cpu(ph->cap_id);
4215	peer = le32_to_cpu(ph->mds);
4216	} else {
4217	p_cap_id = `0`;
4218	peer = -`1`;
4219	}
4220
4221	doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n",
4222	inode, ceph_vinop(inode), ci, mds, mseq, peer);
4223	retry:
4224	cap = __get_cap_for_mds(ci, mds);
4225	if (!cap) {
4226	if (!new_cap) {
4227	spin_unlock(lock: &ci->i_ceph_lock);
4228	new_cap = ceph_get_cap(mdsc, NULL);
4229	spin_lock(lock: &ci->i_ceph_lock);
4230	goto retry;
4231	}
4232	cap = new_cap;
4233	} else {
4234	if (new_cap) {
4235	ceph_put_cap(mdsc, cap: new_cap);
4236	new_cap = NULL;
4237	}
4238	}
4239
4240	__ceph_caps_issued(ci, implemented: &issued);
4241	issued \|= __ceph_caps_dirty(ci);
4242
4243	ceph_add_cap(inode, session, cap_id, issued: caps, wanted, seq, mseq,
4244	realmino, CEPH_CAP_FLAG_AUTH, new_cap: &new_cap);
4245
4246	ocap = peer >= `0` ? __get_cap_for_mds(ci, mds: peer) : NULL;
4247	if (ocap && ocap->cap_id == p_cap_id) {
4248	doutc(cl, " remove export cap %p mds%d flags %d\n",
4249	ocap, peer, ph->flags);
4250	if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
4251	(ocap->seq != le32_to_cpu(ph->seq) \|\|
4252	ocap->mseq != le32_to_cpu(ph->mseq))) {
4253	pr_err_ratelimited_client(cl, "mismatched seq/mseq: "
4254	"%p %llx.%llx mds%d seq %d mseq %d"
4255	" importer mds%d has peer seq %d mseq %d\n",
4256	inode, ceph_vinop(inode), peer,
4257	ocap->seq, ocap->mseq, mds,
4258	le32_to_cpu(ph->seq),
4259	le32_to_cpu(ph->mseq));
4260	}
4261	ceph_remove_cap(mdsc, cap: ocap, queue_release: (ph->flags & CEPH_CAP_FLAG_RELEASE));
4262	}
4263
4264	*old_issued = issued;
4265	*target_cap = cap;
4266	}
4267
4268	#ifdef CONFIG_FS_ENCRYPTION
4269	static int parse_fscrypt_fields(void *p, void* *end,
4270	struct cap_extra_info *extra)
4271	{
4272	u32 len;
4273
4274	ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad);
4275	if (extra->fscrypt_auth_len) {
4276	ceph_decode_need(p, end, extra->fscrypt_auth_len, bad);
4277	extra->fscrypt_auth = kmalloc(size: extra->fscrypt_auth_len,
4278	GFP_KERNEL);
4279	if (!extra->fscrypt_auth)
4280	return -ENOMEM;
4281	ceph_decode_copy_safe(p, end, extra->fscrypt_auth,
4282	extra->fscrypt_auth_len, bad);
4283	}
4284
4285	ceph_decode_32_safe(p, end, len, bad);
4286	if (len >= sizeof(u64)) {
4287	ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad);
4288	len -= sizeof(u64);
4289	}
4290	ceph_decode_skip_n(p, end, len, bad);
4291	return `0`;
4292	bad:
4293	return -EIO;
4294	}
4295	#else
4296	static int parse_fscrypt_fields(void *p, void* *end,
4297	struct cap_extra_info *extra)
4298	{
4299	u32 len;
4300
4301	/ Don't care about these fields unless we're encryption-capable /
4302	ceph_decode_32_safe(p, end, len, bad);
4303	if (len)
4304	ceph_decode_skip_n(p, end, len, bad);
4305	ceph_decode_32_safe(p, end, len, bad);
4306	if (len)
4307	ceph_decode_skip_n(p, end, len, bad);
4308	return `0`;
4309	bad:
4310	return -EIO;
4311	}
4312	#endif
4313
4314	/*
4315	* Handle a caps message from the MDS.
4316	*
4317	* Identify the appropriate session, inode, and call the right handler
4318	* based on the cap op.
4319	*/
4320	void ceph_handle_caps(struct ceph_mds_session *session,
4321	struct ceph_msg *msg)
4322	{
4323	struct ceph_mds_client *mdsc = session->s_mdsc;
4324	struct ceph_client *cl = mdsc->fsc->client;
4325	struct inode *inode;
4326	struct ceph_inode_info *ci;
4327	struct ceph_cap *cap;
4328	struct ceph_mds_caps *h;
4329	struct ceph_mds_cap_peer *peer = NULL;
4330	struct ceph_snap_realm *realm = NULL;
4331	int op;
4332	int msg_version = le16_to_cpu(msg->hdr.version);
4333	u32 seq, mseq;
4334	struct ceph_vino vino;
4335	void *snaptrace;
4336	size_t snaptrace_len;
4337	void p, end;
4338	struct cap_extra_info extra_info = {};
4339	bool queue_trunc;
4340	bool close_sessions = false;
4341	bool do_cap_release = false;
4342
4343	doutc(cl, "from mds%d\n", session->s_mds);
4344
4345	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
4346	return;
4347
4348	/ decode /
4349	end = msg->front.iov_base + msg->front.iov_len;
4350	if (msg->front.iov_len < sizeof(*h))
4351	goto bad;
4352	h = msg->front.iov_base;
4353	op = le32_to_cpu(h->op);
4354	vino.ino = le64_to_cpu(h->ino);
4355	vino.snap = CEPH_NOSNAP;
4356	seq = le32_to_cpu(h->seq);
4357	mseq = le32_to_cpu(h->migrate_seq);
4358
4359	snaptrace = h + `1`;
4360	snaptrace_len = le32_to_cpu(h->snap_trace_len);
4361	p = snaptrace + snaptrace_len;
4362
4363	if (msg_version >= `2`) {
4364	u32 flock_len;
4365	ceph_decode_32_safe(&p, end, flock_len, bad);
4366	if (p + flock_len > end)
4367	goto bad;
4368	p += flock_len;
4369	}
4370
4371	if (msg_version >= `3`) {
4372	if (op == CEPH_CAP_OP_IMPORT) {
4373	if (p + sizeof(*peer) > end)
4374	goto bad;
4375	peer = p;
4376	p += sizeof(*peer);
4377	} else if (op == CEPH_CAP_OP_EXPORT) {
4378	/ recorded in unused fields /
4379	peer = (void *)&h->size;
4380	}
4381	}
4382
4383	if (msg_version >= `4`) {
4384	ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
4385	ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
4386	if (p + extra_info.inline_len > end)
4387	goto bad;
4388	extra_info.inline_data = p;
4389	p += extra_info.inline_len;
4390	}
4391
4392	if (msg_version >= `5`) {
4393	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
4394	u32 epoch_barrier;
4395
4396	ceph_decode_32_safe(&p, end, epoch_barrier, bad);
4397	ceph_osdc_update_epoch_barrier(osdc, eb: epoch_barrier);
4398	}
4399
4400	if (msg_version >= `8`) {
4401	u32 pool_ns_len;
4402
4403	/ version >= 6 /
4404	ceph_decode_skip_64(&p, end, bad); // flush_tid
4405	/ version >= 7 /
4406	ceph_decode_skip_32(&p, end, bad); // caller_uid
4407	ceph_decode_skip_32(&p, end, bad); // caller_gid
4408	/ version >= 8 /
4409	ceph_decode_32_safe(&p, end, pool_ns_len, bad);
4410	if (pool_ns_len > `0`) {
4411	ceph_decode_need(&p, end, pool_ns_len, bad);
4412	extra_info.pool_ns =
4413	ceph_find_or_create_string(str: p, len: pool_ns_len);
4414	p += pool_ns_len;
4415	}
4416	}
4417
4418	if (msg_version >= `9`) {
4419	struct ceph_timespec *btime;
4420
4421	if (p + sizeof(*btime) > end)
4422	goto bad;
4423	btime = p;
4424	ceph_decode_timespec64(ts: &extra_info.btime, tv: btime);
4425	p += sizeof(*btime);
4426	ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
4427	}
4428
4429	if (msg_version >= `11`) {
4430	/ version >= 10 /
4431	ceph_decode_skip_32(&p, end, bad); // flags
4432	/ version >= 11 /
4433	extra_info.dirstat_valid = true;
4434	ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
4435	ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
4436	}
4437
4438	if (msg_version >= `12`) {
4439	if (parse_fscrypt_fields(p: &p, end, extra: &extra_info))
4440	goto bad;
4441	}
4442
4443	/ lookup ino /
4444	inode = ceph_find_inode(sb: mdsc->fsc->sb, vino);
4445	doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op),
4446	vino.ino, vino.snap, inode);
4447
4448	mutex_lock(&session->s_mutex);
4449	doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds,
4450	session->s_seq, (unsigned)seq);
4451
4452	if (!inode) {
4453	doutc(cl, " i don't have ino %llx\n", vino.ino);
4454
4455	switch (op) {
4456	case CEPH_CAP_OP_IMPORT:
4457	case CEPH_CAP_OP_REVOKE:
4458	case CEPH_CAP_OP_GRANT:
4459	do_cap_release = true;
4460	break;
4461	default:
4462	break;
4463	}
4464	goto flush_cap_releases;
4465	}
4466	ci = ceph_inode(inode);
4467
4468	/ these will work even if we don't have a cap yet /
4469	switch (op) {
4470	case CEPH_CAP_OP_FLUSHSNAP_ACK:
4471	handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
4472	m: h, session);
4473	goto done;
4474
4475	case CEPH_CAP_OP_EXPORT:
4476	handle_cap_export(inode, ex: h, ph: peer, session);
4477	goto done_unlocked;
4478
4479	case CEPH_CAP_OP_IMPORT:
4480	realm = NULL;
4481	if (snaptrace_len) {
4482	down_write(sem: &mdsc->snap_rwsem);
4483	if (ceph_update_snap_trace(m: mdsc, p: snaptrace,
4484	e: snaptrace + snaptrace_len,
4485	deletion: false, realm_ret: &realm)) {
4486	up_write(sem: &mdsc->snap_rwsem);
4487	close_sessions = true;
4488	goto done;
4489	}
4490	downgrade_write(sem: &mdsc->snap_rwsem);
4491	} else {
4492	down_read(sem: &mdsc->snap_rwsem);
4493	}
4494	spin_lock(lock: &ci->i_ceph_lock);
4495	handle_cap_import(mdsc, inode, im: h, ph: peer, session,
4496	target_cap: &cap, old_issued: &extra_info.issued);
4497	handle_cap_grant(inode, session, cap,
4498	grant: h, xattr_buf: msg->middle, extra_info: &extra_info);
4499	if (realm)
4500	ceph_put_snap_realm(mdsc, realm);
4501	goto done_unlocked;
4502	}
4503
4504	/ the rest require a cap /
4505	spin_lock(lock: &ci->i_ceph_lock);
4506	cap = __get_cap_for_mds(ci: ceph_inode(inode), mds: session->s_mds);
4507	if (!cap) {
4508	doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n",
4509	inode, ceph_ino(inode), ceph_snap(inode),
4510	session->s_mds);
4511	spin_unlock(lock: &ci->i_ceph_lock);
4512	switch (op) {
4513	case CEPH_CAP_OP_REVOKE:
4514	case CEPH_CAP_OP_GRANT:
4515	do_cap_release = true;
4516	break;
4517	default:
4518	break;
4519	}
4520	goto flush_cap_releases;
4521	}
4522
4523	/ note that each of these drops i_ceph_lock for us /
4524	switch (op) {
4525	case CEPH_CAP_OP_REVOKE:
4526	case CEPH_CAP_OP_GRANT:
4527	__ceph_caps_issued(ci, implemented: &extra_info.issued);
4528	extra_info.issued \|= __ceph_caps_dirty(ci);
4529	handle_cap_grant(inode, session, cap,
4530	grant: h, xattr_buf: msg->middle, extra_info: &extra_info);
4531	goto done_unlocked;
4532
4533	case CEPH_CAP_OP_FLUSH_ACK:
4534	handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
4535	m: h, session, cap);
4536	break;
4537
4538	case CEPH_CAP_OP_TRUNC:
4539	queue_trunc = handle_cap_trunc(inode, trunc: h, session,
4540	extra_info: &extra_info);
4541	spin_unlock(lock: &ci->i_ceph_lock);
4542	if (queue_trunc)
4543	ceph_queue_vmtruncate(inode);
4544	break;
4545
4546	default:
4547	spin_unlock(lock: &ci->i_ceph_lock);
4548	pr_err_client(cl, "unknown cap op %d %s\n", op,
4549	ceph_cap_op_name(op));
4550	}
4551
4552	done:
4553	mutex_unlock(lock: &session->s_mutex);
4554	done_unlocked:
4555	iput(inode);
4556	out:
4557	ceph_dec_mds_stopping_blocker(mdsc);
4558
4559	ceph_put_string(str: extra_info.pool_ns);
4560
4561	/ Defer closing the sessions after s_mutex lock being released /
4562	if (close_sessions)
4563	ceph_mdsc_close_sessions(mdsc);
4564
4565	kfree(objp: extra_info.fscrypt_auth);
4566	return;
4567
4568	flush_cap_releases:
4569	/*
4570	* send any cap release message to try to move things
4571	* along for the mds (who clearly thinks we still have this
4572	* cap).
4573	*/
4574	if (do_cap_release) {
4575	cap = ceph_get_cap(mdsc, NULL);
4576	cap->cap_ino = vino.ino;
4577	cap->queue_release = `1`;
4578	cap->cap_id = le64_to_cpu(h->cap_id);
4579	cap->mseq = mseq;
4580	cap->seq = seq;
4581	cap->issue_seq = seq;
4582	spin_lock(lock: &session->s_cap_lock);
4583	__ceph_queue_cap_release(session, cap);
4584	spin_unlock(lock: &session->s_cap_lock);
4585	}
4586	ceph_flush_cap_releases(mdsc, session);
4587	goto done;
4588
4589	bad:
4590	pr_err_client(cl, "corrupt message\n");
4591	ceph_msg_dump(msg);
4592	goto out;
4593	}
4594
4595	/*
4596	* Delayed work handler to process end of delayed cap release LRU list.
4597	*
4598	* If new caps are added to the list while processing it, these won't get
4599	* processed in this run. In this case, the ci->i_hold_caps_max will be
4600	* returned so that the work can be scheduled accordingly.
4601	*/
4602	unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
4603	{
4604	struct ceph_client *cl = mdsc->fsc->client;
4605	struct inode *inode;
4606	struct ceph_inode_info *ci;
4607	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
4608	unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
4609	unsigned long loop_start = jiffies;
4610	unsigned long delay = `0`;
4611
4612	doutc(cl, "begin\n");
4613	spin_lock(lock: &mdsc->cap_delay_lock);
4614	while (!list_empty(head: &mdsc->cap_delay_list)) {
4615	ci = list_first_entry(&mdsc->cap_delay_list,
4616	struct ceph_inode_info,
4617	i_cap_delay_list);
4618	if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
4619	doutc(cl, "caps added recently. Exiting loop");
4620	delay = ci->i_hold_caps_max;
4621	break;
4622	}
4623	if ((ci->i_ceph_flags & CEPH_I_FLUSH) == `0` &&
4624	time_before(jiffies, ci->i_hold_caps_max))
4625	break;
4626	list_del_init(entry: &ci->i_cap_delay_list);
4627
4628	inode = igrab(&ci->netfs.inode);
4629	if (inode) {
4630	spin_unlock(lock: &mdsc->cap_delay_lock);
4631	doutc(cl, "on %p %llx.%llx\n", inode,
4632	ceph_vinop(inode));
4633	ceph_check_caps(ci, flags: `0`);
4634	iput(inode);
4635	spin_lock(lock: &mdsc->cap_delay_lock);
4636	}
4637
4638	/*
4639	* Make sure too many dirty caps or general
4640	* slowness doesn't block mdsc delayed work,
4641	* preventing send_renew_caps() from running.
4642	*/
4643	if (jiffies - loop_start >= `5` * HZ)
4644	break;
4645	}
4646	spin_unlock(lock: &mdsc->cap_delay_lock);
4647	doutc(cl, "done\n");
4648
4649	return delay;
4650	}
4651
4652	/*
4653	* Flush all dirty caps to the mds
4654	*/
4655	static void flush_dirty_session_caps(struct ceph_mds_session *s)
4656	{
4657	struct ceph_mds_client *mdsc = s->s_mdsc;
4658	struct ceph_client *cl = mdsc->fsc->client;
4659	struct ceph_inode_info *ci;
4660	struct inode *inode;
4661
4662	doutc(cl, "begin\n");
4663	spin_lock(lock: &mdsc->cap_dirty_lock);
4664	while (!list_empty(head: &s->s_cap_dirty)) {
4665	ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
4666	i_dirty_item);
4667	inode = &ci->netfs.inode;
4668	ihold(inode);
4669	doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
4670	spin_unlock(lock: &mdsc->cap_dirty_lock);
4671	ceph_wait_on_async_create(inode);
4672	ceph_check_caps(ci, CHECK_CAPS_FLUSH);
4673	iput(inode);
4674	spin_lock(lock: &mdsc->cap_dirty_lock);
4675	}
4676	spin_unlock(lock: &mdsc->cap_dirty_lock);
4677	doutc(cl, "done\n");
4678	}
4679
4680	void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
4681	{
4682	ceph_mdsc_iterate_sessions(mdsc, cb: flush_dirty_session_caps, check_state: true);
4683	}
4684
4685	void __ceph_touch_fmode(struct ceph_inode_info *ci,
4686	struct ceph_mds_client mdsc, int* fmode)
4687	{
4688	unsigned long now = jiffies;
4689	if (fmode & CEPH_FILE_MODE_RD)
4690	ci->i_last_rd = now;
4691	if (fmode & CEPH_FILE_MODE_WR)
4692	ci->i_last_wr = now;
4693	/ queue periodic check /
4694	if (fmode &&
4695	__ceph_is_any_real_caps(ci) &&
4696	list_empty(head: &ci->i_cap_delay_list))
4697	__cap_delay_requeue(mdsc, ci);
4698	}
4699
4700	void ceph_get_fmode(struct ceph_inode_info ci, int* fmode, int count)
4701	{
4702	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: ci->netfs.inode.i_sb);
4703	int bits = (fmode << `1`) \| `1`;
4704	bool already_opened = false;
4705	int i;
4706
4707	if (count == `1`)
4708	atomic64_inc(v: &mdsc->metric.opened_files);
4709
4710	spin_lock(lock: &ci->i_ceph_lock);
4711	for (i = `0`; i < CEPH_FILE_MODE_BITS; i++) {
4712	/*
4713	* If any of the mode ref is larger than 0,
4714	* that means it has been already opened by
4715	* others. Just skip checking the PIN ref.
4716	*/
4717	if (i && ci->i_nr_by_mode[i])
4718	already_opened = true;
4719
4720	if (bits & (`1` << i))
4721	ci->i_nr_by_mode[i] += count;
4722	}
4723
4724	if (!already_opened)
4725	percpu_counter_inc(fbc: &mdsc->metric.opened_inodes);
4726	spin_unlock(lock: &ci->i_ceph_lock);
4727	}
4728
4729	/*
4730	* Drop open file reference. If we were the last open file,
4731	* we may need to release capabilities to the MDS (or schedule
4732	* their delayed release).
4733	*/
4734	void ceph_put_fmode(struct ceph_inode_info ci, int* fmode, int count)
4735	{
4736	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: ci->netfs.inode.i_sb);
4737	int bits = (fmode << `1`) \| `1`;
4738	bool is_closed = true;
4739	int i;
4740
4741	if (count == `1`)
4742	atomic64_dec(v: &mdsc->metric.opened_files);
4743
4744	spin_lock(lock: &ci->i_ceph_lock);
4745	for (i = `0`; i < CEPH_FILE_MODE_BITS; i++) {
4746	if (bits & (`1` << i)) {
4747	BUG_ON(ci->i_nr_by_mode[i] < count);
4748	ci->i_nr_by_mode[i] -= count;
4749	}
4750
4751	/*
4752	* If any of the mode ref is not 0 after
4753	* decreased, that means it is still opened
4754	* by others. Just skip checking the PIN ref.
4755	*/
4756	if (i && ci->i_nr_by_mode[i])
4757	is_closed = false;
4758	}
4759
4760	if (is_closed)
4761	percpu_counter_dec(fbc: &mdsc->metric.opened_inodes);
4762	spin_unlock(lock: &ci->i_ceph_lock);
4763	}
4764
4765	/*
4766	* For a soon-to-be unlinked file, drop the LINK caps. If it
4767	* looks like the link count will hit 0, drop any other caps (other
4768	* than PIN) we don't specifically want (due to the file still being
4769	* open).
4770	*/
4771	int ceph_drop_caps_for_unlink(struct inode *inode)
4772	{
4773	struct ceph_inode_info *ci = ceph_inode(inode);
4774	int drop = CEPH_CAP_LINK_SHARED \| CEPH_CAP_LINK_EXCL;
4775
4776	spin_lock(lock: &ci->i_ceph_lock);
4777	if (inode->i_nlink == `1`) {
4778	drop \|= ~(__ceph_caps_wanted(ci) \| CEPH_CAP_PIN);
4779
4780	if (__ceph_caps_dirty(ci)) {
4781	struct ceph_mds_client *mdsc =
4782	ceph_inode_to_fs_client(inode)->mdsc;
4783
4784	doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
4785	ceph_vinop(inode));
4786	spin_lock(lock: &mdsc->cap_delay_lock);
4787	ci->i_ceph_flags \|= CEPH_I_FLUSH;
4788	if (!list_empty(head: &ci->i_cap_delay_list))
4789	list_del_init(entry: &ci->i_cap_delay_list);
4790	list_add_tail(new: &ci->i_cap_delay_list,
4791	head: &mdsc->cap_unlink_delay_list);
4792	spin_unlock(lock: &mdsc->cap_delay_lock);
4793
4794	/*
4795	* Fire the work immediately, because the MDS maybe
4796	* waiting for caps release.
4797	*/
4798	ceph_queue_cap_unlink_work(mdsc);
4799	}
4800	}
4801	spin_unlock(lock: &ci->i_ceph_lock);
4802	return drop;
4803	}
4804
4805	/*
4806	* Helpers for embedding cap and dentry lease releases into mds
4807	* requests.
4808	*
4809	* @force is used by dentry_release (below) to force inclusion of a
4810	* record for the directory inode, even when there aren't any caps to
4811	* drop.
4812	*/
4813	int ceph_encode_inode_release(void p, struct** inode *inode,
4814	int mds, int drop, int unless, int force)
4815	{
4816	struct ceph_inode_info *ci = ceph_inode(inode);
4817	struct ceph_client *cl = ceph_inode_to_client(inode);
4818	struct ceph_cap *cap;
4819	struct ceph_mds_request_release rel = p;
4820	int used, dirty;
4821	int ret = `0`;
4822
4823	spin_lock(lock: &ci->i_ceph_lock);
4824	used = __ceph_caps_used(ci);
4825	dirty = __ceph_caps_dirty(ci);
4826
4827	doutc(cl, "%p %llx.%llx mds%d used\|dirty %s drop %s unless %s\n",
4828	inode, ceph_vinop(inode), mds, ceph_cap_string(used\|dirty),
4829	ceph_cap_string(drop), ceph_cap_string(unless));
4830
4831	/ only drop unused, clean caps /
4832	drop &= ~(used \| dirty);
4833
4834	cap = __get_cap_for_mds(ci, mds);
4835	if (cap && __cap_is_valid(cap)) {
4836	unless &= cap->issued;
4837	if (unless) {
4838	if (unless & CEPH_CAP_AUTH_EXCL)
4839	drop &= ~CEPH_CAP_AUTH_SHARED;
4840	if (unless & CEPH_CAP_LINK_EXCL)
4841	drop &= ~CEPH_CAP_LINK_SHARED;
4842	if (unless & CEPH_CAP_XATTR_EXCL)
4843	drop &= ~CEPH_CAP_XATTR_SHARED;
4844	if (unless & CEPH_CAP_FILE_EXCL)
4845	drop &= ~CEPH_CAP_FILE_SHARED;
4846	}
4847
4848	if (force \|\| (cap->issued & drop)) {
4849	if (cap->issued & drop) {
4850	int wanted = __ceph_caps_wanted(ci);
4851	doutc(cl, "%p %llx.%llx cap %p %s -> %s, "
4852	"wanted %s -> %s\n", inode,
4853	ceph_vinop(inode), cap,
4854	ceph_cap_string(cap->issued),
4855	ceph_cap_string(cap->issued & ~drop),
4856	ceph_cap_string(cap->mds_wanted),
4857	ceph_cap_string(wanted));
4858
4859	cap->issued &= ~drop;
4860	cap->implemented &= ~drop;
4861	cap->mds_wanted = wanted;
4862	if (cap == ci->i_auth_cap &&
4863	!(wanted & CEPH_CAP_ANY_FILE_WR))
4864	ci->i_requested_max_size = `0`;
4865	} else {
4866	doutc(cl, "%p %llx.%llx cap %p %s (force)\n",
4867	inode, ceph_vinop(inode), cap,
4868	ceph_cap_string(cap->issued));
4869	}
4870
4871	rel->ino = cpu_to_le64(ceph_ino(inode));
4872	rel->cap_id = cpu_to_le64(cap->cap_id);
4873	rel->seq = cpu_to_le32(cap->seq);
4874	rel->issue_seq = cpu_to_le32(cap->issue_seq);
4875	rel->mseq = cpu_to_le32(cap->mseq);
4876	rel->caps = cpu_to_le32(cap->implemented);
4877	rel->wanted = cpu_to_le32(cap->mds_wanted);
4878	rel->dname_len = `0`;
4879	rel->dname_seq = `0`;
4880	p += sizeof(rel);
4881	ret = `1`;
4882	} else {
4883	doutc(cl, "%p %llx.%llx cap %p %s (noop)\n",
4884	inode, ceph_vinop(inode), cap,
4885	ceph_cap_string(cap->issued));
4886	}
4887	}
4888	spin_unlock(lock: &ci->i_ceph_lock);
4889	return ret;
4890	}
4891
4892	/**
4893	* ceph_encode_dentry_release - encode a dentry release into an outgoing request
4894	* @p: outgoing request buffer
4895	* @dentry: dentry to release
4896	* @dir: dir to release it from
4897	* @mds: mds that we're speaking to
4898	* @drop: caps being dropped
4899	* @unless: unless we have these caps
4900	*
4901	* Encode a dentry release into an outgoing request buffer. Returns 1 if the
4902	* thing was released, or a negative error code otherwise.
4903	*/
4904	int ceph_encode_dentry_release(void p, struct** dentry *dentry,
4905	struct inode *dir,
4906	int mds, int drop, int unless)
4907	{
4908	struct ceph_mds_request_release rel = p;
4909	struct ceph_dentry_info *di = ceph_dentry(dentry);
4910	struct ceph_client *cl;
4911	int force = `0`;
4912	int ret;
4913
4914	/ This shouldn't happen /
4915	BUG_ON(!dir);
4916
4917	/*
4918	* force an record for the directory caps if we have a dentry lease.
4919	* this is racy (can't take i_ceph_lock and d_lock together), but it
4920	* doesn't have to be perfect; the mds will revoke anything we don't
4921	* release.
4922	*/
4923	spin_lock(lock: &dentry->d_lock);
4924	if (di->lease_session && di->lease_session->s_mds == mds)
4925	force = `1`;
4926	spin_unlock(lock: &dentry->d_lock);
4927
4928	ret = ceph_encode_inode_release(p, inode: dir, mds, drop, unless, force);
4929
4930	cl = ceph_inode_to_client(inode: dir);
4931	spin_lock(lock: &dentry->d_lock);
4932	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
4933	doutc(cl, "%p mds%d seq %d\n", dentry, mds,
4934	(int)di->lease_seq);
4935	rel->dname_seq = cpu_to_le32(di->lease_seq);
4936	__ceph_mdsc_drop_dentry_lease(dentry);
4937	spin_unlock(lock: &dentry->d_lock);
4938	if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(inode: dir)) {
4939	int ret2 = ceph_encode_encrypted_fname(parent: dir, dentry, buf: *p);
4940
4941	if (ret2 < `0`)
4942	return ret2;
4943
4944	rel->dname_len = cpu_to_le32(ret2);
4945	*p += ret2;
4946	} else {
4947	rel->dname_len = cpu_to_le32(dentry->d_name.len);
4948	memcpy(*p, dentry->d_name.name, dentry->d_name.len);
4949	*p += dentry->d_name.len;
4950	}
4951	} else {
4952	spin_unlock(lock: &dentry->d_lock);
4953	}
4954	return ret;
4955	}
4956
4957	static int remove_capsnaps(struct ceph_mds_client mdsc, struct* inode *inode)
4958	{
4959	struct ceph_inode_info *ci = ceph_inode(inode);
4960	struct ceph_client *cl = mdsc->fsc->client;
4961	struct ceph_cap_snap *capsnap;
4962	int capsnap_release = `0`;
4963
4964	lockdep_assert_held(&ci->i_ceph_lock);
4965
4966	doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n",
4967	ci, inode, ceph_vinop(inode));
4968
4969	while (!list_empty(head: &ci->i_cap_snaps)) {
4970	capsnap = list_first_entry(&ci->i_cap_snaps,
4971	struct ceph_cap_snap, ci_item);
4972	__ceph_remove_capsnap(inode, capsnap, NULL, NULL);
4973	ceph_put_snap_context(sc: capsnap->context);
4974	ceph_put_cap_snap(capsnap);
4975	capsnap_release++;
4976	}
4977	wake_up_all(&ci->i_cap_wq);
4978	wake_up_all(&mdsc->cap_flushing_wq);
4979	return capsnap_release;
4980	}
4981
4982	int ceph_purge_inode_cap(struct inode inode, struct* ceph_cap cap, bool invalidate)
4983	{
4984	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
4985	struct ceph_mds_client *mdsc = fsc->mdsc;
4986	struct ceph_client *cl = fsc->client;
4987	struct ceph_inode_info *ci = ceph_inode(inode);
4988	bool is_auth;
4989	bool dirty_dropped = false;
4990	int iputs = `0`;
4991
4992	lockdep_assert_held(&ci->i_ceph_lock);
4993
4994	doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n",
4995	cap, ci, inode, ceph_vinop(inode));
4996
4997	is_auth = (cap == ci->i_auth_cap);
4998	__ceph_remove_cap(cap, queue_release: false);
4999	if (is_auth) {
5000	struct ceph_cap_flush *cf;
5001
5002	if (ceph_inode_is_shutdown(inode)) {
5003	if (inode->i_data.nrpages > `0`)
5004	*invalidate = true;
5005	if (ci->i_wrbuffer_ref > `0`)
5006	mapping_set_error(mapping: &inode->i_data, error: -EIO);
5007	}
5008
5009	spin_lock(lock: &mdsc->cap_dirty_lock);
5010
5011	/ trash all of the cap flushes for this inode /
5012	while (!list_empty(head: &ci->i_cap_flush_list)) {
5013	cf = list_first_entry(&ci->i_cap_flush_list,
5014	struct ceph_cap_flush, i_list);
5015	list_del_init(entry: &cf->g_list);
5016	list_del_init(entry: &cf->i_list);
5017	if (!cf->is_capsnap)
5018	ceph_free_cap_flush(cf);
5019	}
5020
5021	if (!list_empty(head: &ci->i_dirty_item)) {
5022	pr_warn_ratelimited_client(cl,
5023	" dropping dirty %s state for %p %llx.%llx\n",
5024	ceph_cap_string(ci->i_dirty_caps),
5025	inode, ceph_vinop(inode));
5026	ci->i_dirty_caps = `0`;
5027	list_del_init(entry: &ci->i_dirty_item);
5028	dirty_dropped = true;
5029	}
5030	if (!list_empty(head: &ci->i_flushing_item)) {
5031	pr_warn_ratelimited_client(cl,
5032	" dropping dirty+flushing %s state for %p %llx.%llx\n",
5033	ceph_cap_string(ci->i_flushing_caps),
5034	inode, ceph_vinop(inode));
5035	ci->i_flushing_caps = `0`;
5036	list_del_init(entry: &ci->i_flushing_item);
5037	mdsc->num_cap_flushing--;
5038	dirty_dropped = true;
5039	}
5040	spin_unlock(lock: &mdsc->cap_dirty_lock);
5041
5042	if (dirty_dropped) {
5043	mapping_set_error(mapping: inode->i_mapping, error: -EIO);
5044
5045	if (ci->i_wrbuffer_ref_head == `0` &&
5046	ci->i_wr_ref == `0` &&
5047	ci->i_dirty_caps == `0` &&
5048	ci->i_flushing_caps == `0`) {
5049	ceph_put_snap_context(sc: ci->i_head_snapc);
5050	ci->i_head_snapc = NULL;
5051	}
5052	}
5053
5054	if (atomic_read(v: &ci->i_filelock_ref) > `0`) {
5055	/ make further file lock syscall return -EIO /
5056	ci->i_ceph_flags \|= CEPH_I_ERROR_FILELOCK;
5057	pr_warn_ratelimited_client(cl,
5058	" dropping file locks for %p %llx.%llx\n",
5059	inode, ceph_vinop(inode));
5060	}
5061
5062	if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
5063	cf = ci->i_prealloc_cap_flush;
5064	ci->i_prealloc_cap_flush = NULL;
5065	if (!cf->is_capsnap)
5066	ceph_free_cap_flush(cf);
5067	}
5068
5069	if (!list_empty(head: &ci->i_cap_snaps))
5070	iputs = remove_capsnaps(mdsc, inode);
5071	}
5072	if (dirty_dropped)
5073	++iputs;
5074	return iputs;
5075	}
5076

source code of linux/fs/ceph/caps.c