snap.c source code [linux/fs/ceph/snap.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/ceph/ceph_debug.h>
3
4	#include <linux/fs.h>
5	#include <linux/sort.h>
6	#include <linux/slab.h>
7	#include <linux/iversion.h>
8	#include "super.h"
9	#include "mds_client.h"
10	#include <linux/ceph/decode.h>
11
12	/* unused map expires after 5 minutes */
13	#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ)
14
15	/*
16	* Snapshots in ceph are driven in large part by cooperation from the
17	* client. In contrast to local file systems or file servers that
18	* implement snapshots at a single point in the system, ceph's
19	* distributed access to storage requires clients to help decide
20	* whether a write logically occurs before or after a recently created
21	* snapshot.
22	*
23	* This provides a perfect instantaneous client-wide snapshot. Between
24	* clients, however, snapshots may appear to be applied at slightly
25	* different points in time, depending on delays in delivering the
26	* snapshot notification.
27	*
28	* Snapshots are _not_ file system-wide. Instead, each snapshot
29	* applies to the subdirectory nested beneath some directory. This
30	* effectively divides the hierarchy into multiple "realms," where all
31	* of the files contained by each realm share the same set of
32	* snapshots. An individual realm's snap set contains snapshots
33	* explicitly created on that realm, as well as any snaps in its
34	* parent's snap set _after_ the point at which the parent became its
35	* parent (due to, say, a rename). Similarly, snaps from prior parents
36	* during the time intervals during which they were the parent are included.
37	*
38	* The client is spared most of this detail, fortunately... it need only
39	* maintain a hierarchy of realms reflecting the current parent/child
40	* realm relationship, and for each realm has an explicit list of snaps
41	* inherited from prior parents.
42	*
43	* A snap_realm struct is maintained for realms containing every inode
44	* with an open cap in the system. (The needed snap realm information is
45	* provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
46	* version number is used to ensure that as realm parameters change (new
47	* snapshot, new parent, etc.) the client's realm hierarchy is updated.
48	*
49	* The realm hierarchy drives the generation of a 'snap context' for each
50	* realm, which simply lists the resulting set of snaps for the realm. This
51	* is attached to any writes sent to OSDs.
52	*/
53	/*
54	* Unfortunately error handling is a bit mixed here. If we get a snap
55	* update, but don't have enough memory to update our realm hierarchy,
56	* it's not clear what we can do about it (besides complaining to the
57	* console).
58	*/
59
60
61	/*
62	* increase ref count for the realm
63	*
64	* caller must hold snap_rwsem.
65	*/
66	void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
67	struct ceph_snap_realm *realm)
68	{
69	lockdep_assert_held(&mdsc->snap_rwsem);
70
71	/*
72	* The 0->1 and 1->0 transitions must take the snap_empty_lock
73	* atomically with the refcount change. Go ahead and bump the
74	* nref here, unless it's 0, in which case we take the spinlock
75	* and then do the increment and remove it from the list.
76	*/
77	if (atomic_inc_not_zero(v: &realm->nref))
78	return;
79
80	spin_lock(lock: &mdsc->snap_empty_lock);
81	if (atomic_inc_return(v: &realm->nref) == `1`)
82	list_del_init(entry: &realm->empty_item);
83	spin_unlock(lock: &mdsc->snap_empty_lock);
84	}
85
86	static void __insert_snap_realm(struct rb_root *root,
87	struct ceph_snap_realm *new)
88	{
89	struct rb_node **p = &root->rb_node;
90	struct rb_node *parent = NULL;
91	struct ceph_snap_realm *r = NULL;
92
93	while (*p) {
94	parent = *p;
95	r = rb_entry(parent, struct ceph_snap_realm, node);
96	if (new->ino < r->ino)
97	p = &(*p)->rb_left;
98	else if (new->ino > r->ino)
99	p = &(*p)->rb_right;
100	else
101	BUG();
102	}
103
104	rb_link_node(node: &new->node, parent, rb_link: p);
105	rb_insert_color(&new->node, root);
106	}
107
108	/*
109	* create and get the realm rooted at @ino and bump its ref count.
110	*
111	* caller must hold snap_rwsem for write.
112	*/
113	static struct ceph_snap_realm *ceph_create_snap_realm(
114	struct ceph_mds_client *mdsc,
115	u64 ino)
116	{
117	struct ceph_snap_realm *realm;
118
119	lockdep_assert_held_write(&mdsc->snap_rwsem);
120
121	realm = kzalloc(size: sizeof(*realm), GFP_NOFS);
122	if (!realm)
123	return ERR_PTR(error: -ENOMEM);
124
125	/ Do not release the global dummy snaprealm until unmouting /
126	if (ino == CEPH_INO_GLOBAL_SNAPREALM)
127	atomic_set(v: &realm->nref, i: `2`);
128	else
129	atomic_set(v: &realm->nref, i: `1`);
130	realm->ino = ino;
131	INIT_LIST_HEAD(list: &realm->children);
132	INIT_LIST_HEAD(list: &realm->child_item);
133	INIT_LIST_HEAD(list: &realm->empty_item);
134	INIT_LIST_HEAD(list: &realm->dirty_item);
135	INIT_LIST_HEAD(list: &realm->rebuild_item);
136	INIT_LIST_HEAD(list: &realm->inodes_with_caps);
137	spin_lock_init(&realm->inodes_with_caps_lock);
138	__insert_snap_realm(root: &mdsc->snap_realms, new: realm);
139	mdsc->num_snap_realms++;
140
141	doutc(mdsc->fsc->client, "%llx %p\n", realm->ino, realm);
142	return realm;
143	}
144
145	/*
146	* lookup the realm rooted at @ino.
147	*
148	* caller must hold snap_rwsem.
149	*/
150	static struct ceph_snap_realm __lookup_snap_realm(struct* ceph_mds_client *mdsc,
151	u64 ino)
152	{
153	struct ceph_client *cl = mdsc->fsc->client;
154	struct rb_node *n = mdsc->snap_realms.rb_node;
155	struct ceph_snap_realm *r;
156
157	lockdep_assert_held(&mdsc->snap_rwsem);
158
159	while (n) {
160	r = rb_entry(n, struct ceph_snap_realm, node);
161	if (ino < r->ino)
162	n = n->rb_left;
163	else if (ino > r->ino)
164	n = n->rb_right;
165	else {
166	doutc(cl, "%llx %p\n", r->ino, r);
167	return r;
168	}
169	}
170	return NULL;
171	}
172
173	struct ceph_snap_realm ceph_lookup_snap_realm(struct* ceph_mds_client *mdsc,
174	u64 ino)
175	{
176	struct ceph_snap_realm *r;
177	r = __lookup_snap_realm(mdsc, ino);
178	if (r)
179	ceph_get_snap_realm(mdsc, realm: r);
180	return r;
181	}
182
183	static void __put_snap_realm(struct ceph_mds_client *mdsc,
184	struct ceph_snap_realm *realm);
185
186	/*
187	* called with snap_rwsem (write)
188	*/
189	static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
190	struct ceph_snap_realm *realm)
191	{
192	struct ceph_client *cl = mdsc->fsc->client;
193	lockdep_assert_held_write(&mdsc->snap_rwsem);
194
195	doutc(cl, "%p %llx\n", realm, realm->ino);
196
197	rb_erase(&realm->node, &mdsc->snap_realms);
198	mdsc->num_snap_realms--;
199
200	if (realm->parent) {
201	list_del_init(entry: &realm->child_item);
202	__put_snap_realm(mdsc, realm: realm->parent);
203	}
204
205	kfree(objp: realm->prior_parent_snaps);
206	kfree(objp: realm->snaps);
207	ceph_put_snap_context(sc: realm->cached_context);
208	kfree(objp: realm);
209	}
210
211	/*
212	* caller holds snap_rwsem (write)
213	*/
214	static void __put_snap_realm(struct ceph_mds_client *mdsc,
215	struct ceph_snap_realm *realm)
216	{
217	lockdep_assert_held_write(&mdsc->snap_rwsem);
218
219	/*
220	* We do not require the snap_empty_lock here, as any caller that
221	* increments the value must hold the snap_rwsem.
222	*/
223	if (atomic_dec_and_test(v: &realm->nref))
224	__destroy_snap_realm(mdsc, realm);
225	}
226
227	/*
228	* See comments in ceph_get_snap_realm. Caller needn't hold any locks.
229	*/
230	void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
231	struct ceph_snap_realm *realm)
232	{
233	if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock))
234	return;
235
236	if (down_write_trylock(sem: &mdsc->snap_rwsem)) {
237	spin_unlock(lock: &mdsc->snap_empty_lock);
238	__destroy_snap_realm(mdsc, realm);
239	up_write(sem: &mdsc->snap_rwsem);
240	} else {
241	list_add(new: &realm->empty_item, head: &mdsc->snap_empty);
242	spin_unlock(lock: &mdsc->snap_empty_lock);
243	}
244	}
245
246	/*
247	* Clean up any realms whose ref counts have dropped to zero. Note
248	* that this does not include realms who were created but not yet
249	* used.
250	*
251	* Called under snap_rwsem (write)
252	*/
253	static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
254	{
255	struct ceph_snap_realm *realm;
256
257	lockdep_assert_held_write(&mdsc->snap_rwsem);
258
259	spin_lock(lock: &mdsc->snap_empty_lock);
260	while (!list_empty(head: &mdsc->snap_empty)) {
261	realm = list_first_entry(&mdsc->snap_empty,
262	struct ceph_snap_realm, empty_item);
263	list_del(entry: &realm->empty_item);
264	spin_unlock(lock: &mdsc->snap_empty_lock);
265	__destroy_snap_realm(mdsc, realm);
266	spin_lock(lock: &mdsc->snap_empty_lock);
267	}
268	spin_unlock(lock: &mdsc->snap_empty_lock);
269	}
270
271	void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc)
272	{
273	struct ceph_snap_realm *global_realm;
274
275	down_write(sem: &mdsc->snap_rwsem);
276	global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM);
277	if (global_realm)
278	ceph_put_snap_realm(mdsc, realm: global_realm);
279	__cleanup_empty_realms(mdsc);
280	up_write(sem: &mdsc->snap_rwsem);
281	}
282
283	/*
284	* adjust the parent realm of a given @realm. adjust child list, and parent
285	* pointers, and ref counts appropriately.
286	*
287	* return true if parent was changed, 0 if unchanged, <0 on error.
288	*
289	* caller must hold snap_rwsem for write.
290	*/
291	static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
292	struct ceph_snap_realm *realm,
293	u64 parentino)
294	{
295	struct ceph_client *cl = mdsc->fsc->client;
296	struct ceph_snap_realm *parent;
297
298	lockdep_assert_held_write(&mdsc->snap_rwsem);
299
300	if (realm->parent_ino == parentino)
301	return `0`;
302
303	parent = ceph_lookup_snap_realm(mdsc, ino: parentino);
304	if (!parent) {
305	parent = ceph_create_snap_realm(mdsc, ino: parentino);
306	if (IS_ERR(ptr: parent))
307	return PTR_ERR(ptr: parent);
308	}
309	doutc(cl, "%llx %p: %llx %p -> %llx %p\n", realm->ino, realm,
310	realm->parent_ino, realm->parent, parentino, parent);
311	if (realm->parent) {
312	list_del_init(entry: &realm->child_item);
313	ceph_put_snap_realm(mdsc, realm: realm->parent);
314	}
315	realm->parent_ino = parentino;
316	realm->parent = parent;
317	list_add(new: &realm->child_item, head: &parent->children);
318	return `1`;
319	}
320
321
/*
 * sort() comparison callback ordering u64 values in descending order.
 *
 * @a and @b point at the two u64 elements being compared.  Returns 1
 * when *a < *b, -1 when *a > *b and 0 when they are equal.
 */
static int cmpu64_rev(const void *a, const void *b)
{
	if (*(const u64 *)a < *(const u64 *)b)
		return 1;
	if (*(const u64 *)a > *(const u64 *)b)
		return -1;
	return 0;
}
330
331
332	/*
333	* build the snap context for a given realm.
334	*/
335	static int build_snap_context(struct ceph_mds_client *mdsc,
336	struct ceph_snap_realm *realm,
337	struct list_head *realm_queue,
338	struct list_head *dirty_realms)
339	{
340	struct ceph_client *cl = mdsc->fsc->client;
341	struct ceph_snap_realm *parent = realm->parent;
342	struct ceph_snap_context *snapc;
343	int err = `0`;
344	u32 num = realm->num_prior_parent_snaps + realm->num_snaps;
345
346	/*
347	* build parent context, if it hasn't been built.
348	* conservatively estimate that all parent snaps might be
349	* included by us.
350	*/
351	if (parent) {
352	if (!parent->cached_context) {
353	/ add to the queue head /
354	list_add(new: &parent->rebuild_item, head: realm_queue);
355	return `1`;
356	}
357	num += parent->cached_context->num_snaps;
358	}
359
360	/ do i actually need to update? not if my context seq*
361	matches realm seq, and my parents' does to. (this works
362	because we rebuild_snap_realms() works _downward_ in
363	hierarchy after each update.) /*
364	if (realm->cached_context &&
365	realm->cached_context->seq == realm->seq &&
366	(!parent \|\|
367	realm->cached_context->seq >= parent->cached_context->seq)) {
368	doutc(cl, "%llx %p: %p seq %lld (%u snaps) (unchanged)\n",
369	realm->ino, realm, realm->cached_context,
370	realm->cached_context->seq,
371	(unsigned int)realm->cached_context->num_snaps);
372	return `0`;
373	}
374
375	/ alloc new snap context /
376	err = -ENOMEM;
377	if (num > (SIZE_MAX - sizeof(snapc)) / sizeof*(u64))
378	goto fail;
379	snapc = ceph_create_snap_context(snap_count: num, GFP_NOFS);
380	if (!snapc)
381	goto fail;
382
383	/ build (reverse sorted) snap vector /
384	num = `0`;
385	snapc->seq = realm->seq;
386	if (parent) {
387	u32 i;
388
389	/ include any of parent's snaps occurring _after_ my*
390	parent became my parent /*
391	for (i = `0`; i < parent->cached_context->num_snaps; i++)
392	if (parent->cached_context->snaps[i] >=
393	realm->parent_since)
394	snapc->snaps[num++] =
395	parent->cached_context->snaps[i];
396	if (parent->cached_context->seq > snapc->seq)
397	snapc->seq = parent->cached_context->seq;
398	}
399	memcpy(snapc->snaps + num, realm->snaps,
400	sizeof(u64)*realm->num_snaps);
401	num += realm->num_snaps;
402	memcpy(snapc->snaps + num, realm->prior_parent_snaps,
403	sizeof(u64)*realm->num_prior_parent_snaps);
404	num += realm->num_prior_parent_snaps;
405
406	sort(base: snapc->snaps, num, size: sizeof(u64), cmp_func: cmpu64_rev, NULL);
407	snapc->num_snaps = num;
408	doutc(cl, "%llx %p: %p seq %lld (%u snaps)\n", realm->ino, realm,
409	snapc, snapc->seq, (unsigned int) snapc->num_snaps);
410
411	ceph_put_snap_context(sc: realm->cached_context);
412	realm->cached_context = snapc;
413	/ queue realm for cap_snap creation /
414	list_add_tail(new: &realm->dirty_item, head: dirty_realms);
415	return `0`;
416
417	fail:
418	/*
419	* if we fail, clear old (incorrect) cached_context... hopefully
420	* we'll have better luck building it later
421	*/
422	if (realm->cached_context) {
423	ceph_put_snap_context(sc: realm->cached_context);
424	realm->cached_context = NULL;
425	}
426	pr_err_client(cl, "%llx %p fail %d\n", realm->ino, realm, err);
427	return err;
428	}
429
430	/*
431	* rebuild snap context for the given realm and all of its children.
432	*/
433	static void rebuild_snap_realms(struct ceph_mds_client *mdsc,
434	struct ceph_snap_realm *realm,
435	struct list_head *dirty_realms)
436	{
437	struct ceph_client *cl = mdsc->fsc->client;
438	LIST_HEAD(realm_queue);
439	int last = `0`;
440	bool skip = false;
441
442	list_add_tail(new: &realm->rebuild_item, head: &realm_queue);
443
444	while (!list_empty(head: &realm_queue)) {
445	struct ceph_snap_realm _realm, child;
446
447	_realm = list_first_entry(&realm_queue,
448	struct ceph_snap_realm,
449	rebuild_item);
450
451	/*
452	* If the last building failed dues to memory
453	* issue, just empty the realm_queue and return
454	* to avoid infinite loop.
455	*/
456	if (last < `0`) {
457	list_del_init(entry: &_realm->rebuild_item);
458	continue;
459	}
460
461	last = build_snap_context(mdsc, realm: _realm, realm_queue: &realm_queue,
462	dirty_realms);
463	doutc(cl, "%llx %p, %s\n", realm->ino, realm,
464	last > `0` ? "is deferred" : !last ? "succeeded" : "failed");
465
466	/ is any child in the list ? /
467	list_for_each_entry(child, &_realm->children, child_item) {
468	if (!list_empty(head: &child->rebuild_item)) {
469	skip = true;
470	break;
471	}
472	}
473
474	if (!skip) {
475	list_for_each_entry(child, &_realm->children, child_item)
476	list_add_tail(new: &child->rebuild_item, head: &realm_queue);
477	}
478
479	/ last == 1 means need to build parent first /
480	if (last <= `0`)
481	list_del_init(entry: &_realm->rebuild_item);
482	}
483	}
484
485
486	/*
487	* helper to allocate and decode an array of snapids. free prior
488	* instance, if any.
489	*/
490	static int dup_array(u64 *dst, __le64 src, u32 num)
491	{
492	u32 i;
493
494	kfree(objp: *dst);
495	if (num) {
496	dst = kcalloc(n: num, size: sizeof*(u64), GFP_NOFS);
497	if (!*dst)
498	return -ENOMEM;
499	for (i = `0`; i < num; i++)
500	(*dst)[i] = get_unaligned_le64(p: src + i);
501	} else {
502	*dst = NULL;
503	}
504	return `0`;
505	}
506
507	static bool has_new_snaps(struct ceph_snap_context *o,
508	struct ceph_snap_context *n)
509	{
510	if (n->num_snaps == `0`)
511	return false;
512	/ snaps are in descending order /
513	return n->snaps[`0`] > o->seq;
514	}
515
516	/*
517	* When a snapshot is applied, the size/mtime inode metadata is queued
518	* in a ceph_cap_snap (one for each snapshot) until writeback
519	* completes and the metadata can be flushed back to the MDS.
520	*
521	* However, if a (sync) write is currently in-progress when we apply
522	* the snapshot, we have to wait until the write succeeds or fails
523	* (and a final size/mtime is known). In this case the
524	* cap_snap->writing = 1, and is said to be "pending." When the write
525	* finishes, we __ceph_finish_cap_snap().
526	*
527	* Caller must hold snap_rwsem for read (i.e., the realm topology won't
528	* change).
529	*/
530	static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
531	struct ceph_cap_snap **pcapsnap)
532	{
533	struct inode *inode = &ci->netfs.inode;
534	struct ceph_client *cl = ceph_inode_to_client(inode);
535	struct ceph_snap_context old_snapc, new_snapc;
536	struct ceph_cap_snap capsnap = pcapsnap;
537	struct ceph_buffer *old_blob = NULL;
538	int used, dirty;
539
540	spin_lock(lock: &ci->i_ceph_lock);
541	used = __ceph_caps_used(ci);
542	dirty = __ceph_caps_dirty(ci);
543
544	old_snapc = ci->i_head_snapc;
545	new_snapc = ci->i_snap_realm->cached_context;
546
547	/*
548	* If there is a write in progress, treat that as a dirty Fw,
549	* even though it hasn't completed yet; by the time we finish
550	* up this capsnap it will be.
551	*/
552	if (used & CEPH_CAP_FILE_WR)
553	dirty \|= CEPH_CAP_FILE_WR;
554
555	if (__ceph_have_pending_cap_snap(ci)) {
556	/ there is no point in queuing multiple "pending" cap_snaps,*
557	as no new writes are allowed to start when pending, so any
558	writes in progress now were started before the previous
559	cap_snap. lucky us. /*
560	doutc(cl, "%p %llx.%llx already pending\n", inode,
561	ceph_vinop(inode));
562	goto update_snapc;
563	}
564	if (ci->i_wrbuffer_ref_head == `0` &&
565	!(dirty & (CEPH_CAP_ANY_EXCL\|CEPH_CAP_FILE_WR))) {
566	doutc(cl, "%p %llx.%llx nothing dirty\|writing\n", inode,
567	ceph_vinop(inode));
568	goto update_snapc;
569	}
570
571	BUG_ON(!old_snapc);
572
573	/*
574	* There is no need to send FLUSHSNAP message to MDS if there is
575	* no new snapshot. But when there is dirty pages or on-going
576	* writes, we still need to create cap_snap. cap_snap is needed
577	* by the write path and page writeback path.
578	*
579	* also see ceph_try_drop_cap_snap()
580	*/
581	if (has_new_snaps(o: old_snapc, n: new_snapc)) {
582	if (dirty & (CEPH_CAP_ANY_EXCL\|CEPH_CAP_FILE_WR))
583	capsnap->need_flush = true;
584	} else {
585	if (!(used & CEPH_CAP_FILE_WR) &&
586	ci->i_wrbuffer_ref_head == `0`) {
587	doutc(cl, "%p %llx.%llx no new_snap\|dirty_page\|writing\n",
588	inode, ceph_vinop(inode));
589	goto update_snapc;
590	}
591	}
592
593	doutc(cl, "%p %llx.%llx cap_snap %p queuing under %p %s %s\n",
594	inode, ceph_vinop(inode), capsnap, old_snapc,
595	ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
596	ihold(inode);
597
598	capsnap->follows = old_snapc->seq;
599	capsnap->issued = __ceph_caps_issued(ci, NULL);
600	capsnap->dirty = dirty;
601
602	capsnap->mode = inode->i_mode;
603	capsnap->uid = inode->i_uid;
604	capsnap->gid = inode->i_gid;
605
606	if (dirty & CEPH_CAP_XATTR_EXCL) {
607	old_blob = __ceph_build_xattrs_blob(ci);
608	capsnap->xattr_blob =
609	ceph_buffer_get(b: ci->i_xattrs.blob);
610	capsnap->xattr_version = ci->i_xattrs.version;
611	} else {
612	capsnap->xattr_blob = NULL;
613	capsnap->xattr_version = `0`;
614	}
615
616	capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
617
618	/ dirty page count moved from _head to this cap_snap;*
619	all subsequent writes page dirties occur _after_ this
620	snapshot. /*
621	capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
622	ci->i_wrbuffer_ref_head = `0`;
623	capsnap->context = old_snapc;
624	list_add_tail(new: &capsnap->ci_item, head: &ci->i_cap_snaps);
625
626	if (used & CEPH_CAP_FILE_WR) {
627	doutc(cl, "%p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
628	" now pending\n", inode, ceph_vinop(inode), capsnap,
629	old_snapc, old_snapc->seq);
630	capsnap->writing = `1`;
631	} else {
632	/ note mtime, size NOW. /
633	__ceph_finish_cap_snap(ci, capsnap);
634	}
635	*pcapsnap = NULL;
636	old_snapc = NULL;
637
638	update_snapc:
639	if (ci->i_wrbuffer_ref_head == `0` &&
640	ci->i_wr_ref == `0` &&
641	ci->i_dirty_caps == `0` &&
642	ci->i_flushing_caps == `0`) {
643	ci->i_head_snapc = NULL;
644	} else {
645	ci->i_head_snapc = ceph_get_snap_context(sc: new_snapc);
646	doutc(cl, " new snapc is %p\n", new_snapc);
647	}
648	spin_unlock(lock: &ci->i_ceph_lock);
649
650	ceph_buffer_put(b: old_blob);
651	ceph_put_snap_context(sc: old_snapc);
652	}
653
654	/*
655	* Finalize the size, mtime for a cap_snap.. that is, settle on final values
656	* to be used for the snapshot, to be flushed back to the mds.
657	*
658	* If capsnap can now be flushed, add to snap_flush list, and return 1.
659	*
660	* Caller must hold i_ceph_lock.
661	*/
662	int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
663	struct ceph_cap_snap *capsnap)
664	{
665	struct inode *inode = &ci->netfs.inode;
666	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
667	struct ceph_client *cl = mdsc->fsc->client;
668
669	BUG_ON(capsnap->writing);
670	capsnap->size = i_size_read(inode);
671	capsnap->mtime = inode_get_mtime(inode);
672	capsnap->atime = inode_get_atime(inode);
673	capsnap->ctime = inode_get_ctime(inode);
674	capsnap->btime = ci->i_btime;
675	capsnap->change_attr = inode_peek_iversion_raw(inode);
676	capsnap->time_warp_seq = ci->i_time_warp_seq;
677	capsnap->truncate_size = ci->i_truncate_size;
678	capsnap->truncate_seq = ci->i_truncate_seq;
679	if (capsnap->dirty_pages) {
680	doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s "
681	"s=%llu still has %d dirty pages\n", inode,
682	ceph_vinop(inode), capsnap, capsnap->context,
683	capsnap->context->seq,
684	ceph_cap_string(capsnap->dirty),
685	capsnap->size, capsnap->dirty_pages);
686	return `0`;
687	}
688
689	/*
690	* Defer flushing the capsnap if the dirty buffer not flushed yet.
691	* And trigger to flush the buffer immediately.
692	*/
693	if (ci->i_wrbuffer_ref) {
694	doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s "
695	"s=%llu used WRBUFFER, delaying\n", inode,
696	ceph_vinop(inode), capsnap, capsnap->context,
697	capsnap->context->seq, ceph_cap_string(capsnap->dirty),
698	capsnap->size);
699	ceph_queue_writeback(inode);
700	return `0`;
701	}
702
703	ci->i_ceph_flags \|= CEPH_I_FLUSH_SNAPS;
704	doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
705	inode, ceph_vinop(inode), capsnap, capsnap->context,
706	capsnap->context->seq, ceph_cap_string(capsnap->dirty),
707	capsnap->size);
708
709	spin_lock(lock: &mdsc->snap_flush_lock);
710	if (list_empty(head: &ci->i_snap_flush_item)) {
711	ihold(inode);
712	list_add_tail(new: &ci->i_snap_flush_item, head: &mdsc->snap_flush_list);
713	}
714	spin_unlock(lock: &mdsc->snap_flush_lock);
715	return `1`; / caller may want to ceph_flush_snaps /
716	}
717
718	/*
719	* Queue cap_snaps for snap writeback for this realm and its children.
720	* Called under snap_rwsem, so realm topology won't change.
721	*/
722	static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc,
723	struct ceph_snap_realm *realm)
724	{
725	struct ceph_client *cl = mdsc->fsc->client;
726	struct ceph_inode_info *ci;
727	struct inode *lastinode = NULL;
728	struct ceph_cap_snap *capsnap = NULL;
729
730	doutc(cl, "%p %llx inode\n", realm, realm->ino);
731
732	spin_lock(lock: &realm->inodes_with_caps_lock);
733	list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
734	struct inode *inode = igrab(&ci->netfs.inode);
735	if (!inode)
736	continue;
737	spin_unlock(lock: &realm->inodes_with_caps_lock);
738	iput(lastinode);
739	lastinode = inode;
740
741	/*
742	* Allocate the capsnap memory outside of ceph_queue_cap_snap()
743	* to reduce very possible but unnecessary frequently memory
744	* allocate/free in this loop.
745	*/
746	if (!capsnap) {
747	capsnap = kmem_cache_zalloc(k: ceph_cap_snap_cachep, GFP_NOFS);
748	if (!capsnap) {
749	pr_err_client(cl,
750	"ENOMEM allocating ceph_cap_snap on %p\n",
751	inode);
752	return;
753	}
754	}
755	capsnap->cap_flush.is_capsnap = true;
756	refcount_set(r: &capsnap->nref, n: `1`);
757	INIT_LIST_HEAD(list: &capsnap->cap_flush.i_list);
758	INIT_LIST_HEAD(list: &capsnap->cap_flush.g_list);
759	INIT_LIST_HEAD(list: &capsnap->ci_item);
760
761	ceph_queue_cap_snap(ci, pcapsnap: &capsnap);
762	spin_lock(lock: &realm->inodes_with_caps_lock);
763	}
764	spin_unlock(lock: &realm->inodes_with_caps_lock);
765	iput(lastinode);
766
767	if (capsnap)
768	kmem_cache_free(s: ceph_cap_snap_cachep, objp: capsnap);
769	doutc(cl, "%p %llx done\n", realm, realm->ino);
770	}
771
772	/*
773	* Parse and apply a snapblob "snap trace" from the MDS. This specifies
774	* the snap realm parameters from a given realm and all of its ancestors,
775	* up to the root.
776	*
777	* Caller must hold snap_rwsem for write.
778	*/
779	int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
780	void p, void* *e, bool deletion,
781	struct ceph_snap_realm **realm_ret)
782	{
783	struct ceph_client *cl = mdsc->fsc->client;
784	struct ceph_mds_snap_realm ri; /* encoded /
785	__le64 snaps; /* encoded /
786	__le64 prior_parent_snaps; /* encoded /
787	struct ceph_snap_realm *realm;
788	struct ceph_snap_realm *first_realm = NULL;
789	struct ceph_snap_realm *realm_to_rebuild = NULL;
790	struct ceph_client *client = mdsc->fsc->client;
791	int rebuild_snapcs;
792	int err = -ENOMEM;
793	int ret;
794	LIST_HEAD(dirty_realms);
795
796	lockdep_assert_held_write(&mdsc->snap_rwsem);
797
798	doutc(cl, "deletion=%d\n", deletion);
799	more:
800	realm = NULL;
801	rebuild_snapcs = `0`;
802	ceph_decode_need(&p, e, sizeof(*ri), bad);
803	ri = p;
804	p += sizeof(*ri);
805	ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
806	le32_to_cpu(ri->num_prior_parent_snaps)), bad);
807	snaps = p;
808	p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
809	prior_parent_snaps = p;
810	p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
811
812	realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
813	if (!realm) {
814	realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
815	if (IS_ERR(ptr: realm)) {
816	err = PTR_ERR(ptr: realm);
817	goto fail;
818	}
819	}
820
821	/ ensure the parent is correct /
822	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
823	if (err < `0`)
824	goto fail;
825	rebuild_snapcs += err;
826
827	if (le64_to_cpu(ri->seq) > realm->seq) {
828	doutc(cl, "updating %llx %p %lld -> %lld\n", realm->ino,
829	realm, realm->seq, le64_to_cpu(ri->seq));
830	/ update realm parameters, snap lists /
831	realm->seq = le64_to_cpu(ri->seq);
832	realm->created = le64_to_cpu(ri->created);
833	realm->parent_since = le64_to_cpu(ri->parent_since);
834
835	realm->num_snaps = le32_to_cpu(ri->num_snaps);
836	err = dup_array(dst: &realm->snaps, src: snaps, num: realm->num_snaps);
837	if (err < `0`)
838	goto fail;
839
840	realm->num_prior_parent_snaps =
841	le32_to_cpu(ri->num_prior_parent_snaps);
842	err = dup_array(dst: &realm->prior_parent_snaps, src: prior_parent_snaps,
843	num: realm->num_prior_parent_snaps);
844	if (err < `0`)
845	goto fail;
846
847	if (realm->seq > mdsc->last_snap_seq)
848	mdsc->last_snap_seq = realm->seq;
849
850	rebuild_snapcs = `1`;
851	} else if (!realm->cached_context) {
852	doutc(cl, "%llx %p seq %lld new\n", realm->ino, realm,
853	realm->seq);
854	rebuild_snapcs = `1`;
855	} else {
856	doutc(cl, "%llx %p seq %lld unchanged\n", realm->ino, realm,
857	realm->seq);
858	}
859
860	doutc(cl, "done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
861	realm, rebuild_snapcs, p, e);
862
863	/*
864	* this will always track the uppest parent realm from which
865	* we need to rebuild the snapshot contexts _downward_ in
866	* hierarchy.
867	*/
868	if (rebuild_snapcs)
869	realm_to_rebuild = realm;
870
871	/ rebuild_snapcs when we reach the _end_ (root) of the trace /
872	if (realm_to_rebuild && p >= e)
873	rebuild_snap_realms(mdsc, realm: realm_to_rebuild, dirty_realms: &dirty_realms);
874
875	if (!first_realm)
876	first_realm = realm;
877	else
878	ceph_put_snap_realm(mdsc, realm);
879
880	if (p < e)
881	goto more;
882
883	/*
884	* queue cap snaps _after_ we've built the new snap contexts,
885	* so that i_head_snapc can be set appropriately.
886	*/
887	while (!list_empty(head: &dirty_realms)) {
888	realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
889	dirty_item);
890	list_del_init(entry: &realm->dirty_item);
891	queue_realm_cap_snaps(mdsc, realm);
892	}
893
894	if (realm_ret)
895	*realm_ret = first_realm;
896	else
897	ceph_put_snap_realm(mdsc, realm: first_realm);
898
899	__cleanup_empty_realms(mdsc);
900	return `0`;
901
902	bad:
903	err = -EIO;
904	fail:
905	if (realm && !IS_ERR(ptr: realm))
906	ceph_put_snap_realm(mdsc, realm);
907	if (first_realm)
908	ceph_put_snap_realm(mdsc, realm: first_realm);
909	pr_err_client(cl, "error %d\n", err);
910
911	/*
912	* When receiving a corrupted snap trace we don't know what
913	* exactly has happened in MDS side. And we shouldn't continue
914	* writing to OSD, which may corrupt the snapshot contents.
915	*
916	* Just try to blocklist this kclient and then this kclient
917	* must be remounted to continue after the corrupted metadata
918	* fixed in the MDS side.
919	*/
920	WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO);
921	ret = ceph_monc_blocklist_add(monc: &client->monc, client_addr: &client->msgr.inst.addr);
922	if (ret)
923	pr_err_client(cl, "failed to blocklist %s: %d\n",
924	ceph_pr_addr(&client->msgr.inst.addr), ret);
925
926	WARN(`1`, "[client.%lld] %s %s%sdo remount to continue%s",
927	client->monc.auth->global_id, __func__,
928	ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
929	ret ? "" : " was blocklisted, ",
930	err == -EIO ? " after corrupted snaptrace is fixed" : "");
931
932	return err;
933	}
934
935
936	/*
937	* Send any cap_snaps that are queued for flush. Try to carry
938	* s_mutex across multiple snap flushes to avoid locking overhead.
939	*
940	* Caller holds no locks.
941	*/
942	static void flush_snaps(struct ceph_mds_client *mdsc)
943	{
944	struct ceph_client *cl = mdsc->fsc->client;
945	struct ceph_inode_info *ci;
946	struct inode *inode;
947	struct ceph_mds_session *session = NULL;
948
949	doutc(cl, "begin\n");
950	spin_lock(lock: &mdsc->snap_flush_lock);
951	while (!list_empty(head: &mdsc->snap_flush_list)) {
952	ci = list_first_entry(&mdsc->snap_flush_list,
953	struct ceph_inode_info, i_snap_flush_item);
954	inode = &ci->netfs.inode;
955	ihold(inode);
956	spin_unlock(lock: &mdsc->snap_flush_lock);
957	ceph_flush_snaps(ci, psession: &session);
958	iput(inode);
959	spin_lock(lock: &mdsc->snap_flush_lock);
960	}
961	spin_unlock(lock: &mdsc->snap_flush_lock);
962
963	ceph_put_mds_session(s: session);
964	doutc(cl, "done\n");
965	}
966
967	/**
968	* ceph_change_snap_realm - change the snap_realm for an inode
969	* @inode: inode to move to new snap realm
970	* @realm: new realm to move inode into (may be NULL)
971	*
972	* Detach an inode from its old snaprealm (if any) and attach it to
973	* the new snaprealm (if any). The old snap realm reference held by
974	* the inode is put. If realm is non-NULL, then the caller's reference
975	* to it is taken over by the inode.
976	*/
977	void ceph_change_snap_realm(struct inode inode, struct* ceph_snap_realm *realm)
978	{
979	struct ceph_inode_info *ci = ceph_inode(inode);
980	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
981	struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
982
983	lockdep_assert_held(&ci->i_ceph_lock);
984
985	if (oldrealm) {
986	spin_lock(lock: &oldrealm->inodes_with_caps_lock);
987	list_del_init(entry: &ci->i_snap_realm_item);
988	if (oldrealm->ino == ci->i_vino.ino)
989	oldrealm->inode = NULL;
990	spin_unlock(lock: &oldrealm->inodes_with_caps_lock);
991	ceph_put_snap_realm(mdsc, realm: oldrealm);
992	}
993
994	ci->i_snap_realm = realm;
995
996	if (realm) {
997	spin_lock(lock: &realm->inodes_with_caps_lock);
998	list_add(new: &ci->i_snap_realm_item, head: &realm->inodes_with_caps);
999	if (realm->ino == ci->i_vino.ino)
1000	realm->inode = inode;
1001	spin_unlock(lock: &realm->inodes_with_caps_lock);
1002	}
1003	}
1004
1005	/*
1006	* Handle a snap notification from the MDS.
1007	*
1008	* This can take two basic forms: the simplest is just a snap creation
1009	* or deletion notification on an existing realm. This should update the
1010	* realm and its children.
1011	*
1012	* The more difficult case is realm creation, due to snap creation at a
1013	* new point in the file hierarchy, or due to a rename that moves a file or
1014	* directory into another realm.
1015	*/
1016	void ceph_handle_snap(struct ceph_mds_client *mdsc,
1017	struct ceph_mds_session *session,
1018	struct ceph_msg *msg)
1019	{
1020	struct ceph_client *cl = mdsc->fsc->client;
1021	struct super_block *sb = mdsc->fsc->sb;
1022	int mds = session->s_mds;
1023	u64 split;
1024	int op;
1025	int trace_len;
1026	struct ceph_snap_realm *realm = NULL;
1027	void *p = msg->front.iov_base;
1028	void *e = p + msg->front.iov_len;
1029	struct ceph_mds_snap_head *h;
1030	int num_split_inos, num_split_realms;
1031	__le64 split_inos = NULL, split_realms = NULL;
1032	int i;
1033	int locked_rwsem = `0`;
1034	bool close_sessions = false;
1035
1036	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
1037	return;
1038
1039	/ decode /
1040	if (msg->front.iov_len < sizeof(*h))
1041	goto bad;
1042	h = p;
1043	op = le32_to_cpu(h->op);
1044	split = le64_to_cpu(h->split); / non-zero if we are splitting an*
1045	* existing realm */
1046	num_split_inos = le32_to_cpu(h->num_split_inos);
1047	num_split_realms = le32_to_cpu(h->num_split_realms);
1048	trace_len = le32_to_cpu(h->trace_len);
1049	p += sizeof(*h);
1050
1051	doutc(cl, "from mds%d op %s split %llx tracelen %d\n", mds,
1052	ceph_snap_op_name(op), split, trace_len);
1053
1054	down_write(sem: &mdsc->snap_rwsem);
1055	locked_rwsem = `1`;
1056
1057	if (op == CEPH_SNAP_OP_SPLIT) {
1058	struct ceph_mds_snap_realm *ri;
1059
1060	/*
1061	* A "split" breaks part of an existing realm off into
1062	* a new realm. The MDS provides a list of inodes
1063	* (with caps) and child realms that belong to the new
1064	* child.
1065	*/
1066	split_inos = p;
1067	p += sizeof(u64) * num_split_inos;
1068	split_realms = p;
1069	p += sizeof(u64) * num_split_realms;
1070	ceph_decode_need(&p, e, sizeof(*ri), bad);
1071	/ we will peek at realm info here, but will _not_*
1072	* advance p, as the realm update will occur below in
1073	* ceph_update_snap_trace. */
1074	ri = p;
1075
1076	realm = ceph_lookup_snap_realm(mdsc, ino: split);
1077	if (!realm) {
1078	realm = ceph_create_snap_realm(mdsc, ino: split);
1079	if (IS_ERR(ptr: realm))
1080	goto out;
1081	}
1082
1083	doutc(cl, "splitting snap_realm %llx %p\n", realm->ino, realm);
1084	for (i = `0`; i < num_split_inos; i++) {
1085	struct ceph_vino vino = {
1086	.ino = le64_to_cpu(split_inos[i]),
1087	.snap = CEPH_NOSNAP,
1088	};
1089	struct inode *inode = ceph_find_inode(sb, vino);
1090	struct ceph_inode_info *ci;
1091
1092	if (!inode)
1093	continue;
1094	ci = ceph_inode(inode);
1095
1096	spin_lock(lock: &ci->i_ceph_lock);
1097	if (!ci->i_snap_realm)
1098	goto skip_inode;
1099	/*
1100	* If this inode belongs to a realm that was
1101	* created after our new realm, we experienced
1102	* a race (due to another split notifications
1103	* arriving from a different MDS). So skip
1104	* this inode.
1105	*/
1106	if (ci->i_snap_realm->created >
1107	le64_to_cpu(ri->created)) {
1108	doutc(cl, " leaving %p %llx.%llx in newer realm %llx %p\n",
1109	inode, ceph_vinop(inode), ci->i_snap_realm->ino,
1110	ci->i_snap_realm);
1111	goto skip_inode;
1112	}
1113	doutc(cl, " will move %p %llx.%llx to split realm %llx %p\n",
1114	inode, ceph_vinop(inode), realm->ino, realm);
1115
1116	ceph_get_snap_realm(mdsc, realm);
1117	ceph_change_snap_realm(inode, realm);
1118	spin_unlock(lock: &ci->i_ceph_lock);
1119	iput(inode);
1120	continue;
1121
1122	skip_inode:
1123	spin_unlock(lock: &ci->i_ceph_lock);
1124	iput(inode);
1125	}
1126
1127	/ we may have taken some of the old realm's children. /
1128	for (i = `0`; i < num_split_realms; i++) {
1129	struct ceph_snap_realm *child =
1130	__lookup_snap_realm(mdsc,
1131	le64_to_cpu(split_realms[i]));
1132	if (!child)
1133	continue;
1134	adjust_snap_realm_parent(mdsc, realm: child, parentino: realm->ino);
1135	}
1136	} else {
1137	/*
1138	* In the non-split case both 'num_split_inos' and
1139	* 'num_split_realms' should be 0, making this a no-op.
1140	* However the MDS happens to populate 'split_realms' list
1141	* in one of the UPDATE op cases by mistake.
1142	*
1143	* Skip both lists just in case to ensure that 'p' is
1144	* positioned at the start of realm info, as expected by
1145	* ceph_update_snap_trace().
1146	*/
1147	p += sizeof(u64) * num_split_inos;
1148	p += sizeof(u64) * num_split_realms;
1149	}
1150
1151	/*
1152	* update using the provided snap trace. if we are deleting a
1153	* snap, we can avoid queueing cap_snaps.
1154	*/
1155	if (ceph_update_snap_trace(mdsc, p, e,
1156	deletion: op == CEPH_SNAP_OP_DESTROY,
1157	NULL)) {
1158	close_sessions = true;
1159	goto bad;
1160	}
1161
1162	if (op == CEPH_SNAP_OP_SPLIT)
1163	/ we took a reference when we created the realm, above /
1164	ceph_put_snap_realm(mdsc, realm);
1165
1166	__cleanup_empty_realms(mdsc);
1167
1168	up_write(sem: &mdsc->snap_rwsem);
1169
1170	flush_snaps(mdsc);
1171	ceph_dec_mds_stopping_blocker(mdsc);
1172	return;
1173
1174	bad:
1175	pr_err_client(cl, "corrupt snap message from mds%d\n", mds);
1176	ceph_msg_dump(msg);
1177	out:
1178	if (locked_rwsem)
1179	up_write(sem: &mdsc->snap_rwsem);
1180
1181	ceph_dec_mds_stopping_blocker(mdsc);
1182
1183	if (close_sessions)
1184	ceph_mdsc_close_sessions(mdsc);
1185	return;
1186	}
1187
1188	struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
1189	u64 snap)
1190	{
1191	struct ceph_client *cl = mdsc->fsc->client;
1192	struct ceph_snapid_map sm, exist;
1193	struct rb_node *p, parent;
1194	int ret;
1195
1196	exist = NULL;
1197	spin_lock(lock: &mdsc->snapid_map_lock);
1198	p = &mdsc->snapid_map_tree.rb_node;
1199	while (*p) {
1200	exist = rb_entry(p, struct* ceph_snapid_map, node);
1201	if (snap > exist->snap) {
1202	p = &(*p)->rb_left;
1203	} else if (snap < exist->snap) {
1204	p = &(*p)->rb_right;
1205	} else {
1206	if (atomic_inc_return(v: &exist->ref) == `1`)
1207	list_del_init(entry: &exist->lru);
1208	break;
1209	}
1210	exist = NULL;
1211	}
1212	spin_unlock(lock: &mdsc->snapid_map_lock);
1213	if (exist) {
1214	doutc(cl, "found snapid map %llx -> %x\n", exist->snap,
1215	exist->dev);
1216	return exist;
1217	}
1218
1219	sm = kmalloc(size: sizeof(*sm), GFP_NOFS);
1220	if (!sm)
1221	return NULL;
1222
1223	ret = get_anon_bdev(&sm->dev);
1224	if (ret < `0`) {
1225	kfree(objp: sm);
1226	return NULL;
1227	}
1228
1229	INIT_LIST_HEAD(list: &sm->lru);
1230	atomic_set(v: &sm->ref, i: `1`);
1231	sm->snap = snap;
1232
1233	exist = NULL;
1234	parent = NULL;
1235	p = &mdsc->snapid_map_tree.rb_node;
1236	spin_lock(lock: &mdsc->snapid_map_lock);
1237	while (*p) {
1238	parent = *p;
1239	exist = rb_entry(p, struct* ceph_snapid_map, node);
1240	if (snap > exist->snap)
1241	p = &(*p)->rb_left;
1242	else if (snap < exist->snap)
1243	p = &(*p)->rb_right;
1244	else
1245	break;
1246	exist = NULL;
1247	}
1248	if (exist) {
1249	if (atomic_inc_return(v: &exist->ref) == `1`)
1250	list_del_init(entry: &exist->lru);
1251	} else {
1252	rb_link_node(node: &sm->node, parent, rb_link: p);
1253	rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
1254	}
1255	spin_unlock(lock: &mdsc->snapid_map_lock);
1256	if (exist) {
1257	free_anon_bdev(sm->dev);
1258	kfree(objp: sm);
1259	doutc(cl, "found snapid map %llx -> %x\n", exist->snap,
1260	exist->dev);
1261	return exist;
1262	}
1263
1264	doutc(cl, "create snapid map %llx -> %x\n", sm->snap, sm->dev);
1265	return sm;
1266	}
1267
1268	void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
1269	struct ceph_snapid_map *sm)
1270	{
1271	if (!sm)
1272	return;
1273	if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
1274	if (!RB_EMPTY_NODE(&sm->node)) {
1275	sm->last_used = jiffies;
1276	list_add_tail(new: &sm->lru, head: &mdsc->snapid_map_lru);
1277	spin_unlock(lock: &mdsc->snapid_map_lock);
1278	} else {
1279	/ already cleaned up by*
1280	* ceph_cleanup_snapid_map() */
1281	spin_unlock(lock: &mdsc->snapid_map_lock);
1282	kfree(objp: sm);
1283	}
1284	}
1285	}
1286
1287	void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
1288	{
1289	struct ceph_client *cl = mdsc->fsc->client;
1290	struct ceph_snapid_map *sm;
1291	unsigned long now;
1292	LIST_HEAD(to_free);
1293
1294	spin_lock(lock: &mdsc->snapid_map_lock);
1295	now = jiffies;
1296
1297	while (!list_empty(head: &mdsc->snapid_map_lru)) {
1298	sm = list_first_entry(&mdsc->snapid_map_lru,
1299	struct ceph_snapid_map, lru);
1300	if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
1301	break;
1302
1303	rb_erase(&sm->node, &mdsc->snapid_map_tree);
1304	list_move(list: &sm->lru, head: &to_free);
1305	}
1306	spin_unlock(lock: &mdsc->snapid_map_lock);
1307
1308	while (!list_empty(head: &to_free)) {
1309	sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
1310	list_del(entry: &sm->lru);
1311	doutc(cl, "trim snapid map %llx -> %x\n", sm->snap, sm->dev);
1312	free_anon_bdev(sm->dev);
1313	kfree(objp: sm);
1314	}
1315	}
1316
1317	void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
1318	{
1319	struct ceph_client *cl = mdsc->fsc->client;
1320	struct ceph_snapid_map *sm;
1321	struct rb_node *p;
1322	LIST_HEAD(to_free);
1323
1324	spin_lock(lock: &mdsc->snapid_map_lock);
1325	while ((p = rb_first(&mdsc->snapid_map_tree))) {
1326	sm = rb_entry(p, struct ceph_snapid_map, node);
1327	rb_erase(p, &mdsc->snapid_map_tree);
1328	RB_CLEAR_NODE(p);
1329	list_move(list: &sm->lru, head: &to_free);
1330	}
1331	spin_unlock(lock: &mdsc->snapid_map_lock);
1332
1333	while (!list_empty(head: &to_free)) {
1334	sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
1335	list_del(entry: &sm->lru);
1336	free_anon_bdev(sm->dev);
1337	if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
1338	pr_err_client(cl, "snapid map %llx -> %x still in use\n",
1339	sm->snap, sm->dev);
1340	}
1341	kfree(objp: sm);
1342	}
1343	}
1344

source code of linux/fs/ceph/snap.c