addr.c source code [linux/fs/ceph/addr.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/ceph/ceph_debug.h>
3
4	#include <linux/backing-dev.h>
5	#include <linux/fs.h>
6	#include <linux/mm.h>
7	#include <linux/swap.h>
8	#include <linux/pagemap.h>
9	#include <linux/slab.h>
10	#include <linux/pagevec.h>
11	#include <linux/task_io_accounting_ops.h>
12	#include <linux/signal.h>
13	#include <linux/iversion.h>
14	#include <linux/ktime.h>
15	#include <linux/netfs.h>
16
17	#include "super.h"
18	#include "mds_client.h"
19	#include "cache.h"
20	#include "metric.h"
21	#include "crypto.h"
22	#include <linux/ceph/osd_client.h>
23	#include <linux/ceph/striper.h>
24
25	/*
26	* Ceph address space ops.
27	*
28	* There are a few funny things going on here.
29	*
30	* The page->private field is used to reference a struct
31	* ceph_snap_context for _every_ dirty page. This indicates which
32	* snapshot the page was logically dirtied in, and thus which snap
33	* context needs to be associated with the osd write during writeback.
34	*
35	* Similarly, struct ceph_inode_info maintains a set of counters to
36	* count dirty pages on the inode. In the absence of snapshots,
37	* i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
38	*
39	* When a snapshot is taken (that is, when the client receives
40	* notification that a snapshot was taken), each inode with caps and
41	* with dirty pages (dirty pages implies there is a cap) gets a new
42	* ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
43	* order, new snaps go to the tail). The i_wrbuffer_ref_head count is
44	* moved to capsnap->dirty. (Unless a sync write is currently in
45	* progress. In that case, the capsnap is said to be "pending", new
46	* writes cannot start, and the capsnap isn't "finalized" until the
47	* write completes (or fails) and a final size/mtime for the inode for
48	* that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
49	*
50	* On writeback, we must submit writes to the osd IN SNAP ORDER. So,
51	* we look for the first capsnap in i_cap_snaps and write out pages in
52	* that snap context _only_. Then we move on to the next capsnap,
53	* eventually reaching the "live" or "head" context (i.e., pages that
54	* are not yet snapped) and are writing the most recently dirtied
55	* pages.
56	*
57	* Invalidate and so forth must take care to ensure the dirty page
58	* accounting is preserved.
59	*/
60
/*
 * Writeback congestion thresholds, expressed in pages.
 *
 * The argument is shifted right by (PAGE_SHIFT - 10); presumably it is a
 * size in KiB (as the name congestion_kb suggests), which this converts to
 * a page count.  The "off" threshold is the "on" threshold minus a quarter
 * of it, so the congested flag is cleared at a lower level than the one
 * that sets it.
 *
 * The argument is parenthesized so that expressions using operators with
 * lower precedence than '>>' (e.g. '|', '&', '?:') expand correctly.
 */
#define CONGESTION_ON_THRESH(congestion_kb) \
	((congestion_kb) >> (PAGE_SHIFT - 10))
#define CONGESTION_OFF_THRESH(congestion_kb) \
	(CONGESTION_ON_THRESH(congestion_kb) - \
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
65
66	static int ceph_netfs_check_write_begin(struct file file, loff_t pos, unsigned* int len,
67	struct folio *foliop, void* **_fsdata);
68
69	static inline struct ceph_snap_context page_snap_context(struct* page *page)
70	{
71	if (PagePrivate(page))
72	return (void *)page->private;
73	return NULL;
74	}
75
76	/*
77	* Dirty a page. Optimistically adjust accounting, on the assumption
78	* that we won't race with invalidate. If we do, readjust.
79	*/
80	static bool ceph_dirty_folio(struct address_space mapping, struct* folio *folio)
81	{
82	struct inode *inode = mapping->host;
83	struct ceph_client *cl = ceph_inode_to_client(inode);
84	struct ceph_inode_info *ci;
85	struct ceph_snap_context *snapc;
86
87	if (folio_test_dirty(folio)) {
88	doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n",
89	ceph_vinop(inode), folio, folio->index);
90	VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
91	return false;
92	}
93
94	ci = ceph_inode(inode);
95
96	/ dirty the head /
97	spin_lock(lock: &ci->i_ceph_lock);
98	BUG_ON(ci->i_wr_ref == `0`); // caller should hold Fw reference
99	if (__ceph_have_pending_cap_snap(ci)) {
100	struct ceph_cap_snap *capsnap =
101	list_last_entry(&ci->i_cap_snaps,
102	struct ceph_cap_snap,
103	ci_item);
104	snapc = ceph_get_snap_context(sc: capsnap->context);
105	capsnap->dirty_pages++;
106	} else {
107	BUG_ON(!ci->i_head_snapc);
108	snapc = ceph_get_snap_context(sc: ci->i_head_snapc);
109	++ci->i_wrbuffer_ref_head;
110	}
111	if (ci->i_wrbuffer_ref == `0`)
112	ihold(inode);
113	++ci->i_wrbuffer_ref;
114	doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d "
115	"snapc %p seq %lld (%d snaps)\n",
116	ceph_vinop(inode), folio, folio->index,
117	ci->i_wrbuffer_ref-`1`, ci->i_wrbuffer_ref_head-`1`,
118	ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
119	snapc, snapc->seq, snapc->num_snaps);
120	spin_unlock(lock: &ci->i_ceph_lock);
121
122	/*
123	* Reference snap context in folio->private. Also set
124	* PagePrivate so that we get invalidate_folio callback.
125	*/
126	VM_WARN_ON_FOLIO(folio->private, folio);
127	folio_attach_private(folio, data: snapc);
128
129	return ceph_fscache_dirty_folio(mapping, folio);
130	}
131
132	/*
133	* If we are truncating the full folio (i.e. offset == 0), adjust the
134	* dirty folio counters appropriately. Only called if there is private
135	* data on the folio.
136	*/
137	static void ceph_invalidate_folio(struct folio *folio, size_t offset,
138	size_t length)
139	{
140	struct inode *inode = folio->mapping->host;
141	struct ceph_client *cl = ceph_inode_to_client(inode);
142	struct ceph_inode_info *ci = ceph_inode(inode);
143	struct ceph_snap_context *snapc;
144
145
146	if (offset != `0` \|\| length != folio_size(folio)) {
147	doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n",
148	ceph_vinop(inode), folio->index, offset, length);
149	return;
150	}
151
152	WARN_ON(!folio_test_locked(folio));
153	if (folio_test_private(folio)) {
154	doutc(cl, "%llx.%llx idx %lu full dirty page\n",
155	ceph_vinop(inode), folio->index);
156
157	snapc = folio_detach_private(folio);
158	ceph_put_wrbuffer_cap_refs(ci, nr: `1`, snapc);
159	ceph_put_snap_context(sc: snapc);
160	}
161
162	netfs_invalidate_folio(folio, offset, length);
163	}
164
165	static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
166	{
167	struct inode *inode = rreq->inode;
168	struct ceph_inode_info *ci = ceph_inode(inode);
169	struct ceph_file_layout *lo = &ci->i_layout;
170	unsigned long max_pages = inode->i_sb->s_bdi->ra_pages;
171	loff_t end = rreq->start + rreq->len, new_end;
172	struct ceph_netfs_request_data *priv = rreq->netfs_priv;
173	unsigned long max_len;
174	u32 blockoff;
175
176	if (priv) {
177	/ Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM /
178	if (priv->file_ra_disabled)
179	max_pages = `0`;
180	else
181	max_pages = priv->file_ra_pages;
182
183	}
184
185	/ Readahead is disabled /
186	if (!max_pages)
187	return;
188
189	max_len = max_pages << PAGE_SHIFT;
190
191	/*
192	* Try to expand the length forward by rounding up it to the next
193	* block, but do not exceed the file size, unless the original
194	* request already exceeds it.
195	*/
196	new_end = min(round_up(end, lo->stripe_unit), rreq->i_size);
197	if (new_end > end && new_end <= rreq->start + max_len)
198	rreq->len = new_end - rreq->start;
199
200	/ Try to expand the start downward /
201	div_u64_rem(dividend: rreq->start, divisor: lo->stripe_unit, remainder: &blockoff);
202	if (rreq->len + blockoff <= max_len) {
203	rreq->start -= blockoff;
204	rreq->len += blockoff;
205	}
206	}
207
208	static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
209	{
210	struct inode *inode = subreq->rreq->inode;
211	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
212	struct ceph_inode_info *ci = ceph_inode(inode);
213	u64 objno, objoff;
214	u32 xlen;
215
216	/ Truncate the extent at the end of the current block /
217	ceph_calc_file_object_mapping(l: &ci->i_layout, off: subreq->start, len: subreq->len,
218	objno: &objno, objoff: &objoff, xlen: &xlen);
219	subreq->len = min(xlen, fsc->mount_options->rsize);
220	return true;
221	}
222
223	static void finish_netfs_read(struct ceph_osd_request *req)
224	{
225	struct inode *inode = req->r_inode;
226	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
227	struct ceph_client *cl = fsc->client;
228	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(osd_req: req, which: `0`);
229	struct netfs_io_subrequest *subreq = req->r_priv;
230	struct ceph_osd_req_op *op = &req->r_ops[`0`];
231	int err = req->r_result;
232	bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
233
234	ceph_update_read_metrics(m: &fsc->mdsc->metric, r_start: req->r_start_latency,
235	r_end: req->r_end_latency, size: osd_data->length, rc: err);
236
237	doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result,
238	subreq->len, i_size_read(req->r_inode));
239
240	/ no object means success but no data /
241	if (err == -ENOENT)
242	err = `0`;
243	else if (err == -EBLOCKLISTED)
244	fsc->blocklisted = true;
245
246	if (err >= `0`) {
247	if (sparse && err > `0`)
248	err = ceph_sparse_ext_map_end(op);
249	if (err < subreq->len)
250	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
251	if (IS_ENCRYPTED(inode) && err > `0`) {
252	err = ceph_fscrypt_decrypt_extents(inode,
253	page: osd_data->pages, off: subreq->start,
254	map: op->extent.sparse_ext,
255	ext_cnt: op->extent.sparse_ext_cnt);
256	if (err > subreq->len)
257	err = subreq->len;
258	}
259	}
260
261	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
262	ceph_put_page_vector(pages: osd_data->pages,
263	num_pages: calc_pages_for(off: osd_data->alignment,
264	len: osd_data->length), dirty: false);
265	}
266	netfs_subreq_terminated(subreq, err, false);
267	iput(req->r_inode);
268	ceph_dec_osd_stopping_blocker(mdsc: fsc->mdsc);
269	}
270
271	static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
272	{
273	struct netfs_io_request *rreq = subreq->rreq;
274	struct inode *inode = rreq->inode;
275	struct ceph_mds_reply_info_parsed *rinfo;
276	struct ceph_mds_reply_info_in *iinfo;
277	struct ceph_mds_request *req;
278	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
279	struct ceph_inode_info *ci = ceph_inode(inode);
280	struct iov_iter iter;
281	ssize_t err = `0`;
282	size_t len;
283	int mode;
284
285	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
286	__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
287
288	if (subreq->start >= inode->i_size)
289	goto out;
290
291	/ We need to fetch the inline data. /
292	mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
293	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_GETATTR, mode);
294	if (IS_ERR(ptr: req)) {
295	err = PTR_ERR(ptr: req);
296	goto out;
297	}
298	req->r_ino1 = ci->i_vino;
299	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
300	req->r_num_caps = `2`;
301
302	err = ceph_mdsc_do_request(mdsc, NULL, req);
303	if (err < `0`)
304	goto out;
305
306	rinfo = &req->r_reply_info;
307	iinfo = &rinfo->targeti;
308	if (iinfo->inline_version == CEPH_INLINE_NONE) {
309	/ The data got uninlined /
310	ceph_mdsc_put_request(req);
311	return false;
312	}
313
314	len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
315	iov_iter_xarray(i: &iter, ITER_DEST, xarray: &rreq->mapping->i_pages, start: subreq->start, count: len);
316	err = copy_to_iter(addr: iinfo->inline_data + subreq->start, bytes: len, i: &iter);
317	if (err == `0`)
318	err = -EFAULT;
319
320	ceph_mdsc_put_request(req);
321	out:
322	netfs_subreq_terminated(subreq, err, false);
323	return true;
324	}
325
326	static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
327	{
328	struct netfs_io_request *rreq = subreq->rreq;
329	struct inode *inode = rreq->inode;
330	struct ceph_inode_info *ci = ceph_inode(inode);
331	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
332	struct ceph_client *cl = fsc->client;
333	struct ceph_osd_request *req = NULL;
334	struct ceph_vino vino = ceph_vino(inode);
335	struct iov_iter iter;
336	int err = `0`;
337	u64 len = subreq->len;
338	bool sparse = IS_ENCRYPTED(inode) \|\| ceph_test_mount_opt(fsc, SPARSEREAD);
339	u64 off = subreq->start;
340	int extent_cnt;
341
342	if (ceph_inode_is_shutdown(inode)) {
343	err = -EIO;
344	goto out;
345	}
346
347	if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
348	return;
349
350	ceph_fscrypt_adjust_off_and_len(inode, off: &off, len: &len);
351
352	req = ceph_osdc_new_request(&fsc->client->osdc, layout: &ci->i_layout, vino,
353	offset: off, len: &len, which: `0`, num_ops: `1`, opcode: sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
354	flags: CEPH_OSD_FLAG_READ, NULL, truncate_seq: ci->i_truncate_seq,
355	truncate_size: ci->i_truncate_size, use_mempool: false);
356	if (IS_ERR(ptr: req)) {
357	err = PTR_ERR(ptr: req);
358	req = NULL;
359	goto out;
360	}
361
362	if (sparse) {
363	extent_cnt = __ceph_sparse_read_ext_count(inode, len);
364	err = ceph_alloc_sparse_ext_map(op: &req->r_ops[`0`], cnt: extent_cnt);
365	if (err)
366	goto out;
367	}
368
369	doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
370	ceph_vinop(inode), subreq->start, subreq->len, len);
371
372	iov_iter_xarray(i: &iter, ITER_DEST, xarray: &rreq->mapping->i_pages, start: subreq->start, count: len);
373
374	/*
375	* FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
376	* encrypted inodes. We'd need infrastructure that handles an iov_iter
377	* instead of page arrays, and we don't have that as of yet. Once the
378	* dust settles on the write helpers and encrypt/decrypt routines for
379	* netfs, we should be able to rework this.
380	*/
381	if (IS_ENCRYPTED(inode)) {
382	struct page **pages;
383	size_t page_off;
384
385	err = iov_iter_get_pages_alloc2(i: &iter, pages: &pages, maxsize: len, start: &page_off);
386	if (err < `0`) {
387	doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
388	ceph_vinop(inode), err);
389	goto out;
390	}
391
392	/ should always give us a page-aligned read /
393	WARN_ON_ONCE(page_off);
394	len = err;
395	err = `0`;
396
397	osd_req_op_extent_osd_data_pages(req, which: `0`, pages, length: len, alignment: `0`, pages_from_pool: false,
398	own_pages: false);
399	} else {
400	osd_req_op_extent_osd_iter(osd_req: req, which: `0`, iter: &iter);
401	}
402	if (!ceph_inc_osd_stopping_blocker(mdsc: fsc->mdsc)) {
403	err = -EIO;
404	goto out;
405	}
406	req->r_callback = finish_netfs_read;
407	req->r_priv = subreq;
408	req->r_inode = inode;
409	ihold(inode);
410
411	ceph_osdc_start_request(osdc: req->r_osdc, req);
412	out:
413	ceph_osdc_put_request(req);
414	if (err)
415	netfs_subreq_terminated(subreq, err, false);
416	doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
417	}
418
419	static int ceph_init_request(struct netfs_io_request rreq, struct* file *file)
420	{
421	struct inode *inode = rreq->inode;
422	struct ceph_client *cl = ceph_inode_to_client(inode);
423	int got = `0`, want = CEPH_CAP_FILE_CACHE;
424	struct ceph_netfs_request_data *priv;
425	int ret = `0`;
426
427	if (rreq->origin != NETFS_READAHEAD)
428	return `0`;
429
430	priv = kzalloc(size: sizeof(*priv), GFP_NOFS);
431	if (!priv)
432	return -ENOMEM;
433
434	if (file) {
435	struct ceph_rw_context *rw_ctx;
436	struct ceph_file_info *fi = file->private_data;
437
438	priv->file_ra_pages = file->f_ra.ra_pages;
439	priv->file_ra_disabled = file->f_mode & FMODE_RANDOM;
440
441	rw_ctx = ceph_find_rw_context(cf: fi);
442	if (rw_ctx) {
443	rreq->netfs_priv = priv;
444	return `0`;
445	}
446	}
447
448	/*
449	* readahead callers do not necessarily hold Fcb caps
450	* (e.g. fadvise, madvise).
451	*/
452	ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, nonblock: true, got: &got);
453	if (ret < `0`) {
454	doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode));
455	goto out;
456	}
457
458	if (!(got & want)) {
459	doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode));
460	ret = -EACCES;
461	goto out;
462	}
463	if (ret == `0`) {
464	ret = -EACCES;
465	goto out;
466	}
467
468	priv->caps = got;
469	rreq->netfs_priv = priv;
470
471	out:
472	if (ret < `0`)
473	kfree(objp: priv);
474
475	return ret;
476	}
477
478	static void ceph_netfs_free_request(struct netfs_io_request *rreq)
479	{
480	struct ceph_netfs_request_data *priv = rreq->netfs_priv;
481
482	if (!priv)
483	return;
484
485	if (priv->caps)
486	ceph_put_cap_refs(ci: ceph_inode(inode: rreq->inode), had: priv->caps);
487	kfree(objp: priv);
488	rreq->netfs_priv = NULL;
489	}
490
491	const struct netfs_request_ops ceph_netfs_ops = {
492	.init_request = ceph_init_request,
493	.free_request = ceph_netfs_free_request,
494	.issue_read = ceph_netfs_issue_read,
495	.expand_readahead = ceph_netfs_expand_readahead,
496	.clamp_length = ceph_netfs_clamp_length,
497	.check_write_begin = ceph_netfs_check_write_begin,
498	};
499
500	#ifdef CONFIG_CEPH_FSCACHE
501	static void ceph_set_page_fscache(struct page *page)
502	{
503	set_page_fscache(page);
504	}
505
506	static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
507	{
508	struct inode *inode = priv;
509
510	if (IS_ERR_VALUE(error) && error != -ENOBUFS)
511	ceph_fscache_invalidate(inode, dio_write: false);
512	}
513
514	static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
515	{
516	struct ceph_inode_info *ci = ceph_inode(inode);
517	struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
518
519	fscache_write_to_cache(cookie, mapping: inode->i_mapping, start: off, len, i_size: i_size_read(inode),
520	term_func: ceph_fscache_write_terminated, term_func_priv: inode, caching);
521	}
522	#else
523	static inline void ceph_set_page_fscache(struct page *page)
524	{
525	}
526
527	static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
528	{
529	}
530	#endif /* CONFIG_CEPH_FSCACHE */
531
532	struct ceph_writeback_ctl
533	{
534	loff_t i_size;
535	u64 truncate_size;
536	u32 truncate_seq;
537	bool size_stable;
538	bool head_snapc;
539	};
540
541	/*
542	* Get ref for the oldest snapc for an inode with dirty data... that is, the
543	* only snap context we are allowed to write back.
544	*/
545	static struct ceph_snap_context *
546	get_oldest_context(struct inode inode, struct* ceph_writeback_ctl *ctl,
547	struct ceph_snap_context *page_snapc)
548	{
549	struct ceph_inode_info *ci = ceph_inode(inode);
550	struct ceph_client *cl = ceph_inode_to_client(inode);
551	struct ceph_snap_context *snapc = NULL;
552	struct ceph_cap_snap *capsnap = NULL;
553
554	spin_lock(lock: &ci->i_ceph_lock);
555	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
556	doutc(cl, " capsnap %p snapc %p has %d dirty pages\n",
557	capsnap, capsnap->context, capsnap->dirty_pages);
558	if (!capsnap->dirty_pages)
559	continue;
560
561	/ get i_size, truncate_{seq,size} for page_snapc? /
562	if (snapc && capsnap->context != page_snapc)
563	continue;
564
565	if (ctl) {
566	if (capsnap->writing) {
567	ctl->i_size = i_size_read(inode);
568	ctl->size_stable = false;
569	} else {
570	ctl->i_size = capsnap->size;
571	ctl->size_stable = true;
572	}
573	ctl->truncate_size = capsnap->truncate_size;
574	ctl->truncate_seq = capsnap->truncate_seq;
575	ctl->head_snapc = false;
576	}
577
578	if (snapc)
579	break;
580
581	snapc = ceph_get_snap_context(sc: capsnap->context);
582	if (!page_snapc \|\|
583	page_snapc == snapc \|\|
584	page_snapc->seq > snapc->seq)
585	break;
586	}
587	if (!snapc && ci->i_wrbuffer_ref_head) {
588	snapc = ceph_get_snap_context(sc: ci->i_head_snapc);
589	doutc(cl, " head snapc %p has %d dirty pages\n", snapc,
590	ci->i_wrbuffer_ref_head);
591	if (ctl) {
592	ctl->i_size = i_size_read(inode);
593	ctl->truncate_size = ci->i_truncate_size;
594	ctl->truncate_seq = ci->i_truncate_seq;
595	ctl->size_stable = false;
596	ctl->head_snapc = true;
597	}
598	}
599	spin_unlock(lock: &ci->i_ceph_lock);
600	return snapc;
601	}
602
603	static u64 get_writepages_data_length(struct inode *inode,
604	struct page *page, u64 start)
605	{
606	struct ceph_inode_info *ci = ceph_inode(inode);
607	struct ceph_snap_context *snapc;
608	struct ceph_cap_snap *capsnap = NULL;
609	u64 end = i_size_read(inode);
610	u64 ret;
611
612	snapc = page_snap_context(page: ceph_fscrypt_pagecache_page(page));
613	if (snapc != ci->i_head_snapc) {
614	bool found = false;
615	spin_lock(lock: &ci->i_ceph_lock);
616	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
617	if (capsnap->context == snapc) {
618	if (!capsnap->writing)
619	end = capsnap->size;
620	found = true;
621	break;
622	}
623	}
624	spin_unlock(lock: &ci->i_ceph_lock);
625	WARN_ON(!found);
626	}
627	if (end > ceph_fscrypt_page_offset(page) + thp_size(page))
628	end = ceph_fscrypt_page_offset(page) + thp_size(page);
629	ret = end > start ? end - start : `0`;
630	if (ret && fscrypt_is_bounce_page(page))
631	ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE);
632	return ret;
633	}
634
635	/*
636	* Write a single page, but leave the page locked.
637	*
638	* If we get a write error, mark the mapping for error, but still adjust the
639	* dirty page accounting (i.e., page is no longer dirty).
640	*/
641	static int writepage_nounlock(struct page page, struct* writeback_control *wbc)
642	{
643	struct folio *folio = page_folio(page);
644	struct inode *inode = page->mapping->host;
645	struct ceph_inode_info *ci = ceph_inode(inode);
646	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
647	struct ceph_client *cl = fsc->client;
648	struct ceph_snap_context snapc, oldest;
649	loff_t page_off = page_offset(page);
650	int err;
651	loff_t len = thp_size(page);
652	loff_t wlen;
653	struct ceph_writeback_ctl ceph_wbc;
654	struct ceph_osd_client *osdc = &fsc->client->osdc;
655	struct ceph_osd_request *req;
656	bool caching = ceph_is_cache_enabled(inode);
657	struct page *bounce_page = NULL;
658
659	doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page,
660	page->index);
661
662	if (ceph_inode_is_shutdown(inode))
663	return -EIO;
664
665	/ verify this is a writeable snap context /
666	snapc = page_snap_context(page);
667	if (!snapc) {
668	doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode),
669	page);
670	return `0`;
671	}
672	oldest = get_oldest_context(inode, ctl: &ceph_wbc, page_snapc: snapc);
673	if (snapc->seq > oldest->seq) {
674	doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n",
675	ceph_vinop(inode), page, snapc);
676	/ we should only noop if called by kswapd /
677	WARN_ON(!(current->flags & PF_MEMALLOC));
678	ceph_put_snap_context(sc: oldest);
679	redirty_page_for_writepage(wbc, page);
680	return `0`;
681	}
682	ceph_put_snap_context(sc: oldest);
683
684	/ is this a partial page at end of file? /
685	if (page_off >= ceph_wbc.i_size) {
686	doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n",
687	ceph_vinop(inode), folio->index, ceph_wbc.i_size);
688	folio_invalidate(folio, offset: `0`, length: folio_size(folio));
689	return `0`;
690	}
691
692	if (ceph_wbc.i_size < page_off + len)
693	len = ceph_wbc.i_size - page_off;
694
695	wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
696	doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n",
697	ceph_vinop(inode), page, page->index, page_off, wlen, snapc,
698	snapc->seq);
699
700	if (atomic_long_inc_return(v: &fsc->writeback_count) >
701	CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
702	fsc->write_congested = true;
703
704	req = ceph_osdc_new_request(osdc, layout: &ci->i_layout, vino: ceph_vino(inode),
705	offset: page_off, len: &wlen, which: `0`, num_ops: `1`, opcode: CEPH_OSD_OP_WRITE,
706	flags: CEPH_OSD_FLAG_WRITE, snapc,
707	truncate_seq: ceph_wbc.truncate_seq,
708	truncate_size: ceph_wbc.truncate_size, use_mempool: true);
709	if (IS_ERR(ptr: req)) {
710	redirty_page_for_writepage(wbc, page);
711	return PTR_ERR(ptr: req);
712	}
713
714	if (wlen < len)
715	len = wlen;
716
717	set_page_writeback(page);
718	if (caching)
719	ceph_set_page_fscache(page);
720	ceph_fscache_write_to_cache(inode, off: page_off, len, caching);
721
722	if (IS_ENCRYPTED(inode)) {
723	bounce_page = fscrypt_encrypt_pagecache_blocks(page,
724	CEPH_FSCRYPT_BLOCK_SIZE, offs: `0`,
725	GFP_NOFS);
726	if (IS_ERR(ptr: bounce_page)) {
727	redirty_page_for_writepage(wbc, page);
728	end_page_writeback(page);
729	ceph_osdc_put_request(req);
730	return PTR_ERR(ptr: bounce_page);
731	}
732	}
733
734	/ it may be a short write due to an object boundary /
735	WARN_ON_ONCE(len > thp_size(page));
736	osd_req_op_extent_osd_data_pages(req, which: `0`,
737	pages: bounce_page ? &bounce_page : &page, length: wlen, alignment: `0`,
738	pages_from_pool: false, own_pages: false);
739	doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n",
740	ceph_vinop(inode), page_off, len, wlen,
741	IS_ENCRYPTED(inode) ? "" : "not ");
742
743	req->r_mtime = inode_get_mtime(inode);
744	ceph_osdc_start_request(osdc, req);
745	err = ceph_osdc_wait_request(osdc, req);
746
747	ceph_update_write_metrics(m: &fsc->mdsc->metric, r_start: req->r_start_latency,
748	r_end: req->r_end_latency, size: len, rc: err);
749	fscrypt_free_bounce_page(bounce_page);
750	ceph_osdc_put_request(req);
751	if (err == `0`)
752	err = len;
753
754	if (err < `0`) {
755	struct writeback_control tmp_wbc;
756	if (!wbc)
757	wbc = &tmp_wbc;
758	if (err == -ERESTARTSYS) {
759	/ killed by SIGKILL /
760	doutc(cl, "%llx.%llx interrupted page %p\n",
761	ceph_vinop(inode), page);
762	redirty_page_for_writepage(wbc, page);
763	end_page_writeback(page);
764	return err;
765	}
766	if (err == -EBLOCKLISTED)
767	fsc->blocklisted = true;
768	doutc(cl, "%llx.%llx setting page/mapping error %d %p\n",
769	ceph_vinop(inode), err, page);
770	mapping_set_error(mapping: &inode->i_data, error: err);
771	wbc->pages_skipped++;
772	} else {
773	doutc(cl, "%llx.%llx cleaned page %p\n",
774	ceph_vinop(inode), page);
775	err = `0`; / vfs expects us to return 0 /
776	}
777	oldest = detach_page_private(page);
778	WARN_ON_ONCE(oldest != snapc);
779	end_page_writeback(page);
780	ceph_put_wrbuffer_cap_refs(ci, nr: `1`, snapc);
781	ceph_put_snap_context(sc: snapc); / page's reference /
782
783	if (atomic_long_dec_return(v: &fsc->writeback_count) <
784	CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
785	fsc->write_congested = false;
786
787	return err;
788	}
789
790	static int ceph_writepage(struct page page, struct* writeback_control *wbc)
791	{
792	int err;
793	struct inode *inode = page->mapping->host;
794	BUG_ON(!inode);
795	ihold(inode);
796
797	if (wbc->sync_mode == WB_SYNC_NONE &&
798	ceph_inode_to_fs_client(inode)->write_congested) {
799	redirty_page_for_writepage(wbc, page);
800	return AOP_WRITEPAGE_ACTIVATE;
801	}
802
803	wait_on_page_fscache(page);
804
805	err = writepage_nounlock(page, wbc);
806	if (err == -ERESTARTSYS) {
807	/ direct memory reclaimer was killed by SIGKILL. return 0*
808	* to prevent caller from setting mapping/page error */
809	err = `0`;
810	}
811	unlock_page(page);
812	iput(inode);
813	return err;
814	}
815
816	/*
817	* async writeback completion handler.
818	*
819	* If we get an error, set the mapping error bit, but not the individual
820	* page error bits.
821	*/
822	static void writepages_finish(struct ceph_osd_request *req)
823	{
824	struct inode *inode = req->r_inode;
825	struct ceph_inode_info *ci = ceph_inode(inode);
826	struct ceph_client *cl = ceph_inode_to_client(inode);
827	struct ceph_osd_data *osd_data;
828	struct page *page;
829	int num_pages, total_pages = `0`;
830	int i, j;
831	int rc = req->r_result;
832	struct ceph_snap_context *snapc = req->r_snapc;
833	struct address_space *mapping = inode->i_mapping;
834	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
835	unsigned int len = `0`;
836	bool remove_page;
837
838	doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc);
839	if (rc < `0`) {
840	mapping_set_error(mapping, error: rc);
841	ceph_set_error_write(ci);
842	if (rc == -EBLOCKLISTED)
843	fsc->blocklisted = true;
844	} else {
845	ceph_clear_error_write(ci);
846	}
847
848	/*
849	* We lost the cache cap, need to truncate the page before
850	* it is unlocked, otherwise we'd truncate it later in the
851	* page truncation thread, possibly losing some data that
852	* raced its way in
853	*/
854	remove_page = !(ceph_caps_issued(ci) &
855	(CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO));
856
857	/ clean all pages /
858	for (i = `0`; i < req->r_num_ops; i++) {
859	if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
860	pr_warn_client(cl,
861	"%llx.%llx incorrect op %d req %p index %d tid %llu\n",
862	ceph_vinop(inode), req->r_ops[i].op, req, i,
863	req->r_tid);
864	break;
865	}
866
867	osd_data = osd_req_op_extent_osd_data(osd_req: req, which: i);
868	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
869	len += osd_data->length;
870	num_pages = calc_pages_for(off: (u64)osd_data->alignment,
871	len: (u64)osd_data->length);
872	total_pages += num_pages;
873	for (j = `0`; j < num_pages; j++) {
874	page = osd_data->pages[j];
875	if (fscrypt_is_bounce_page(page)) {
876	page = fscrypt_pagecache_page(bounce_page: page);
877	fscrypt_free_bounce_page(bounce_page: osd_data->pages[j]);
878	osd_data->pages[j] = page;
879	}
880	BUG_ON(!page);
881	WARN_ON(!PageUptodate(page));
882
883	if (atomic_long_dec_return(v: &fsc->writeback_count) <
884	CONGESTION_OFF_THRESH(
885	fsc->mount_options->congestion_kb))
886	fsc->write_congested = false;
887
888	ceph_put_snap_context(sc: detach_page_private(page));
889	end_page_writeback(page);
890	doutc(cl, "unlocking %p\n", page);
891
892	if (remove_page)
893	generic_error_remove_folio(mapping: inode->i_mapping,
894	page_folio(page));
895
896	unlock_page(page);
897	}
898	doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n",
899	ceph_vinop(inode), osd_data->length,
900	rc >= `0` ? num_pages : `0`);
901
902	release_pages(osd_data->pages, nr: num_pages);
903	}
904
905	ceph_update_write_metrics(m: &fsc->mdsc->metric, r_start: req->r_start_latency,
906	r_end: req->r_end_latency, size: len, rc);
907
908	ceph_put_wrbuffer_cap_refs(ci, nr: total_pages, snapc);
909
910	osd_data = osd_req_op_extent_osd_data(osd_req: req, which: `0`);
911	if (osd_data->pages_from_pool)
912	mempool_free(element: osd_data->pages, pool: ceph_wb_pagevec_pool);
913	else
914	kfree(objp: osd_data->pages);
915	ceph_osdc_put_request(req);
916	ceph_dec_osd_stopping_blocker(mdsc: fsc->mdsc);
917	}
918
919	/*
920	* initiate async writeback
921	*/
922	static int ceph_writepages_start(struct address_space *mapping,
923	struct writeback_control *wbc)
924	{
925	struct inode *inode = mapping->host;
926	struct ceph_inode_info *ci = ceph_inode(inode);
927	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
928	struct ceph_client *cl = fsc->client;
929	struct ceph_vino vino = ceph_vino(inode);
930	pgoff_t index, start_index, end = -`1`;
931	struct ceph_snap_context snapc = NULL, last_snapc = NULL, *pgsnapc;
932	struct folio_batch fbatch;
933	int rc = `0`;
934	unsigned int wsize = i_blocksize(node: inode);
935	struct ceph_osd_request *req = NULL;
936	struct ceph_writeback_ctl ceph_wbc;
937	bool should_loop, range_whole = false;
938	bool done = false;
939	bool caching = ceph_is_cache_enabled(inode);
940	xa_mark_t tag;
941
942	if (wbc->sync_mode == WB_SYNC_NONE &&
943	fsc->write_congested)
944	return `0`;
945
946	doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
947	wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
948	(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
949
950	if (ceph_inode_is_shutdown(inode)) {
951	if (ci->i_wrbuffer_ref > `0`) {
952	pr_warn_ratelimited_client(cl,
953	"%llx.%llx %lld forced umount\n",
954	ceph_vinop(inode), ceph_ino(inode));
955	}
956	mapping_set_error(mapping, error: -EIO);
957	return -EIO; / we're in a forced umount, don't write! /
958	}
959	if (fsc->mount_options->wsize < wsize)
960	wsize = fsc->mount_options->wsize;
961
962	folio_batch_init(fbatch: &fbatch);
963
964	start_index = wbc->range_cyclic ? mapping->writeback_index : `0`;
965	index = start_index;
966
967	if (wbc->sync_mode == WB_SYNC_ALL \|\| wbc->tagged_writepages) {
968	tag = PAGECACHE_TAG_TOWRITE;
969	} else {
970	tag = PAGECACHE_TAG_DIRTY;
971	}
972	retry:
973	/ find oldest snap context with dirty data /
974	snapc = get_oldest_context(inode, ctl: &ceph_wbc, NULL);
975	if (!snapc) {
976	/ hmm, why does writepages get called when there*
977	is no dirty data? /*
978	doutc(cl, " no snap context with dirty data?\n");
979	goto out;
980	}
981	doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc,
982	snapc->seq, snapc->num_snaps);
983
984	should_loop = false;
985	if (ceph_wbc.head_snapc && snapc != last_snapc) {
986	/ where to start/end? /
987	if (wbc->range_cyclic) {
988	index = start_index;
989	end = -`1`;
990	if (index > `0`)
991	should_loop = true;
992	doutc(cl, " cyclic, start at %lu\n", index);
993	} else {
994	index = wbc->range_start >> PAGE_SHIFT;
995	end = wbc->range_end >> PAGE_SHIFT;
996	if (wbc->range_start == `0` && wbc->range_end == LLONG_MAX)
997	range_whole = true;
998	doutc(cl, " not cyclic, %lu to %lu\n", index, end);
999	}
1000	} else if (!ceph_wbc.head_snapc) {
1001	/ Do not respect wbc->range_{start,end}. Dirty pages*
1002	* in that range can be associated with newer snapc.
1003	* They are not writeable until we write all dirty pages
1004	* associated with 'snapc' get written */
1005	if (index > `0`)
1006	should_loop = true;
1007	doutc(cl, " non-head snapc, range whole\n");
1008	}
1009
1010	if (wbc->sync_mode == WB_SYNC_ALL \|\| wbc->tagged_writepages)
1011	tag_pages_for_writeback(mapping, start: index, end);
1012
1013	ceph_put_snap_context(sc: last_snapc);
1014	last_snapc = snapc;
1015
1016	while (!done && index <= end) {
1017	int num_ops = `0`, op_idx;
1018	unsigned i, nr_folios, max_pages, locked_pages = `0`;
1019	struct page pages = NULL, data_pages;
1020	struct page *page;
1021	pgoff_t strip_unit_end = `0`;
1022	u64 offset = `0`, len = `0`;
1023	bool from_pool = false;
1024
1025	max_pages = wsize >> PAGE_SHIFT;
1026
1027	get_more_pages:
1028	nr_folios = filemap_get_folios_tag(mapping, start: &index,
1029	end, tag, fbatch: &fbatch);
1030	doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios);
1031	if (!nr_folios && !locked_pages)
1032	break;
1033	for (i = `0`; i < nr_folios && locked_pages < max_pages; i++) {
1034	page = &fbatch.folios[i]->page;
1035	doutc(cl, "? %p idx %lu\n", page, page->index);
1036	if (locked_pages == `0`)
1037	lock_page(page); / first page /
1038	else if (!trylock_page(page))
1039	break;
1040
1041	/ only dirty pages, or our accounting breaks /
1042	if (unlikely(!PageDirty(page)) \|\|
1043	unlikely(page->mapping != mapping)) {
1044	doutc(cl, "!dirty or !mapping %p\n", page);
1045	unlock_page(page);
1046	continue;
1047	}
1048	/ only if matching snap context /
1049	pgsnapc = page_snap_context(page);
1050	if (pgsnapc != snapc) {
1051	doutc(cl, "page snapc %p %lld != oldest %p %lld\n",
1052	pgsnapc, pgsnapc->seq, snapc, snapc->seq);
1053	if (!should_loop &&
1054	!ceph_wbc.head_snapc &&
1055	wbc->sync_mode != WB_SYNC_NONE)
1056	should_loop = true;
1057	unlock_page(page);
1058	continue;
1059	}
1060	if (page_offset(page) >= ceph_wbc.i_size) {
1061	struct folio *folio = page_folio(page);
1062
1063	doutc(cl, "folio at %lu beyond eof %llu\n",
1064	folio->index, ceph_wbc.i_size);
1065	if ((ceph_wbc.size_stable \|\|
1066	folio_pos(folio) >= i_size_read(inode)) &&
1067	folio_clear_dirty_for_io(folio))
1068	folio_invalidate(folio, offset: `0`,
1069	length: folio_size(folio));
1070	folio_unlock(folio);
1071	continue;
1072	}
1073	if (strip_unit_end && (page->index > strip_unit_end)) {
1074	doutc(cl, "end of strip unit %p\n", page);
1075	unlock_page(page);
1076	break;
1077	}
1078	if (PageWriteback(page) \|\| PageFsCache(page)) {
1079	if (wbc->sync_mode == WB_SYNC_NONE) {
1080	doutc(cl, "%p under writeback\n", page);
1081	unlock_page(page);
1082	continue;
1083	}
1084	doutc(cl, "waiting on writeback %p\n", page);
1085	wait_on_page_writeback(page);
1086	wait_on_page_fscache(page);
1087	}
1088
1089	if (!clear_page_dirty_for_io(page)) {
1090	doutc(cl, "%p !clear_page_dirty_for_io\n", page);
1091	unlock_page(page);
1092	continue;
1093	}
1094
1095	/*
1096	* We have something to write. If this is
1097	* the first locked page this time through,
1098	* calculate max possinle write size and
1099	* allocate a page array
1100	*/
1101	if (locked_pages == `0`) {
1102	u64 objnum;
1103	u64 objoff;
1104	u32 xlen;
1105
1106	/ prepare async write request /
1107	offset = (u64)page_offset(page);
1108	ceph_calc_file_object_mapping(l: &ci->i_layout,
1109	off: offset, len: wsize,
1110	objno: &objnum, objoff: &objoff,
1111	xlen: &xlen);
1112	len = xlen;
1113
1114	num_ops = `1`;
1115	strip_unit_end = page->index +
1116	((len - `1`) >> PAGE_SHIFT);
1117
1118	BUG_ON(pages);
1119	max_pages = calc_pages_for(off: `0`, len: (u64)len);
1120	pages = kmalloc_array(n: max_pages,
1121	size: sizeof(*pages),
1122	GFP_NOFS);
1123	if (!pages) {
1124	from_pool = true;
1125	pages = mempool_alloc(pool: ceph_wb_pagevec_pool, GFP_NOFS);
1126	BUG_ON(!pages);
1127	}
1128
1129	len = `0`;
1130	} else if (page->index !=
1131	(offset + len) >> PAGE_SHIFT) {
1132	if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS :
1133	CEPH_OSD_MAX_OPS)) {
1134	redirty_page_for_writepage(wbc, page);
1135	unlock_page(page);
1136	break;
1137	}
1138
1139	num_ops++;
1140	offset = (u64)page_offset(page);
1141	len = `0`;
1142	}
1143
1144	/ note position of first page in fbatch /
1145	doutc(cl, "%llx.%llx will write page %p idx %lu\n",
1146	ceph_vinop(inode), page, page->index);
1147
1148	if (atomic_long_inc_return(v: &fsc->writeback_count) >
1149	CONGESTION_ON_THRESH(
1150	fsc->mount_options->congestion_kb))
1151	fsc->write_congested = true;
1152
1153	if (IS_ENCRYPTED(inode)) {
1154	pages[locked_pages] =
1155	fscrypt_encrypt_pagecache_blocks(page,
1156	PAGE_SIZE, offs: `0`,
1157	gfp_flags: locked_pages ? GFP_NOWAIT : GFP_NOFS);
1158	if (IS_ERR(ptr: pages[locked_pages])) {
1159	if (PTR_ERR(ptr: pages[locked_pages]) == -EINVAL)
1160	pr_err_client(cl,
1161	"inode->i_blkbits=%hhu\n",
1162	inode->i_blkbits);
1163	/ better not fail on first page! /
1164	BUG_ON(locked_pages == `0`);
1165	pages[locked_pages] = NULL;
1166	redirty_page_for_writepage(wbc, page);
1167	unlock_page(page);
1168	break;
1169	}
1170	++locked_pages;
1171	} else {
1172	pages[locked_pages++] = page;
1173	}
1174
1175	fbatch.folios[i] = NULL;
1176	len += thp_size(page);
1177	}
1178
1179	/ did we get anything? /
1180	if (!locked_pages)
1181	goto release_folios;
1182	if (i) {
1183	unsigned j, n = `0`;
1184	/ shift unused page to beginning of fbatch /
1185	for (j = `0`; j < nr_folios; j++) {
1186	if (!fbatch.folios[j])
1187	continue;
1188	if (n < j)
1189	fbatch.folios[n] = fbatch.folios[j];
1190	n++;
1191	}
1192	fbatch.nr = n;
1193
1194	if (nr_folios && i == nr_folios &&
1195	locked_pages < max_pages) {
1196	doutc(cl, "reached end fbatch, trying for more\n");
1197	folio_batch_release(fbatch: &fbatch);
1198	goto get_more_pages;
1199	}
1200	}
1201
1202	new_request:
1203	offset = ceph_fscrypt_page_offset(page: pages[`0`]);
1204	len = wsize;
1205
1206	req = ceph_osdc_new_request(&fsc->client->osdc,
1207	layout: &ci->i_layout, vino,
1208	offset, len: &len, which: `0`, num_ops,
1209	opcode: CEPH_OSD_OP_WRITE, flags: CEPH_OSD_FLAG_WRITE,
1210	snapc, truncate_seq: ceph_wbc.truncate_seq,
1211	truncate_size: ceph_wbc.truncate_size, use_mempool: false);
1212	if (IS_ERR(ptr: req)) {
1213	req = ceph_osdc_new_request(&fsc->client->osdc,
1214	layout: &ci->i_layout, vino,
1215	offset, len: &len, which: `0`,
1216	min(num_ops,
1217	CEPH_OSD_SLAB_OPS),
1218	opcode: CEPH_OSD_OP_WRITE,
1219	flags: CEPH_OSD_FLAG_WRITE,
1220	snapc, truncate_seq: ceph_wbc.truncate_seq,
1221	truncate_size: ceph_wbc.truncate_size, use_mempool: true);
1222	BUG_ON(IS_ERR(req));
1223	}
1224	BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - `1`]) +
1225	thp_size(pages[locked_pages - `1`]) - offset);
1226
1227	if (!ceph_inc_osd_stopping_blocker(mdsc: fsc->mdsc)) {
1228	rc = -EIO;
1229	goto release_folios;
1230	}
1231	req->r_callback = writepages_finish;
1232	req->r_inode = inode;
1233
1234	/ Format the osd request message and submit the write /
1235	len = `0`;
1236	data_pages = pages;
1237	op_idx = `0`;
1238	for (i = `0`; i < locked_pages; i++) {
1239	struct page *page = ceph_fscrypt_pagecache_page(page: pages[i]);
1240
1241	u64 cur_offset = page_offset(page);
1242	/*
1243	* Discontinuity in page range? Ceph can handle that by just passing
1244	* multiple extents in the write op.
1245	*/
1246	if (offset + len != cur_offset) {
1247	/ If it's full, stop here /
1248	if (op_idx + `1` == req->r_num_ops)
1249	break;
1250
1251	/ Kick off an fscache write with what we have so far. /
1252	ceph_fscache_write_to_cache(inode, off: offset, len, caching);
1253
1254	/ Start a new extent /
1255	osd_req_op_extent_dup_last(osd_req: req, which: op_idx,
1256	offset_inc: cur_offset - offset);
1257	doutc(cl, "got pages at %llu~%llu\n", offset,
1258	len);
1259	osd_req_op_extent_osd_data_pages(req, which: op_idx,
1260	pages: data_pages, length: len, alignment: `0`,
1261	pages_from_pool: from_pool, own_pages: false);
1262	osd_req_op_extent_update(osd_req: req, which: op_idx, length: len);
1263
1264	len = `0`;
1265	offset = cur_offset;
1266	data_pages = pages + i;
1267	op_idx++;
1268	}
1269
1270	set_page_writeback(page);
1271	if (caching)
1272	ceph_set_page_fscache(page);
1273	len += thp_size(page);
1274	}
1275	ceph_fscache_write_to_cache(inode, off: offset, len, caching);
1276
1277	if (ceph_wbc.size_stable) {
1278	len = min(len, ceph_wbc.i_size - offset);
1279	} else if (i == locked_pages) {
1280	/ writepages_finish() clears writeback pages*
1281	* according to the data length, so make sure
1282	* data length covers all locked pages */
1283	u64 min_len = len + `1` - thp_size(page);
1284	len = get_writepages_data_length(inode, page: pages[i - `1`],
1285	start: offset);
1286	len = max(len, min_len);
1287	}
1288	if (IS_ENCRYPTED(inode))
1289	len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
1290
1291	doutc(cl, "got pages at %llu~%llu\n", offset, len);
1292
1293	if (IS_ENCRYPTED(inode) &&
1294	((offset \| len) & ~CEPH_FSCRYPT_BLOCK_MASK))
1295	pr_warn_client(cl,
1296	"bad encrypted write offset=%lld len=%llu\n",
1297	offset, len);
1298
1299	osd_req_op_extent_osd_data_pages(req, which: op_idx, pages: data_pages, length: len,
1300	alignment: `0`, pages_from_pool: from_pool, own_pages: false);
1301	osd_req_op_extent_update(osd_req: req, which: op_idx, length: len);
1302
1303	BUG_ON(op_idx + `1` != req->r_num_ops);
1304
1305	from_pool = false;
1306	if (i < locked_pages) {
1307	BUG_ON(num_ops <= req->r_num_ops);
1308	num_ops -= req->r_num_ops;
1309	locked_pages -= i;
1310
1311	/ allocate new pages array for next request /
1312	data_pages = pages;
1313	pages = kmalloc_array(n: locked_pages, size: sizeof(*pages),
1314	GFP_NOFS);
1315	if (!pages) {
1316	from_pool = true;
1317	pages = mempool_alloc(pool: ceph_wb_pagevec_pool, GFP_NOFS);
1318	BUG_ON(!pages);
1319	}
1320	memcpy(pages, data_pages + i,
1321	locked_pages * sizeof(*pages));
1322	memset(data_pages + i, `0`,
1323	locked_pages * sizeof(*pages));
1324	} else {
1325	BUG_ON(num_ops != req->r_num_ops);
1326	index = pages[i - `1`]->index + `1`;
1327	/ request message now owns the pages array /
1328	pages = NULL;
1329	}
1330
1331	req->r_mtime = inode_get_mtime(inode);
1332	ceph_osdc_start_request(osdc: &fsc->client->osdc, req);
1333	req = NULL;
1334
1335	wbc->nr_to_write -= i;
1336	if (pages)
1337	goto new_request;
1338
1339	/*
1340	* We stop writing back only if we are not doing
1341	* integrity sync. In case of integrity sync we have to
1342	* keep going until we have written all the pages
1343	* we tagged for writeback prior to entering this loop.
1344	*/
1345	if (wbc->nr_to_write <= `0` && wbc->sync_mode == WB_SYNC_NONE)
1346	done = true;
1347
1348	release_folios:
1349	doutc(cl, "folio_batch release on %d folios (%p)\n",
1350	(int)fbatch.nr, fbatch.nr ? fbatch.folios[`0`] : NULL);
1351	folio_batch_release(fbatch: &fbatch);
1352	}
1353
1354	if (should_loop && !done) {
1355	/ more to do; loop back to beginning of file /
1356	doutc(cl, "looping back to beginning of file\n");
1357	end = start_index - `1`; / OK even when start_index == 0 /
1358
1359	/ to write dirty pages associated with next snapc,*
1360	* we need to wait until current writes complete */
1361	if (wbc->sync_mode != WB_SYNC_NONE &&
1362	start_index == `0` && / all dirty pages were checked /
1363	!ceph_wbc.head_snapc) {
1364	struct page *page;
1365	unsigned i, nr;
1366	index = `0`;
1367	while ((index <= end) &&
1368	(nr = filemap_get_folios_tag(mapping, start: &index,
1369	end: (pgoff_t)-`1`,
1370	PAGECACHE_TAG_WRITEBACK,
1371	fbatch: &fbatch))) {
1372	for (i = `0`; i < nr; i++) {
1373	page = &fbatch.folios[i]->page;
1374	if (page_snap_context(page) != snapc)
1375	continue;
1376	wait_on_page_writeback(page);
1377	}
1378	folio_batch_release(fbatch: &fbatch);
1379	cond_resched();
1380	}
1381	}
1382
1383	start_index = `0`;
1384	index = `0`;
1385	goto retry;
1386	}
1387
1388	if (wbc->range_cyclic \|\| (range_whole && wbc->nr_to_write > `0`))
1389	mapping->writeback_index = index;
1390
1391	out:
1392	ceph_osdc_put_request(req);
1393	ceph_put_snap_context(sc: last_snapc);
1394	doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
1395	rc);
1396	return rc;
1397	}
1398
1399
1400
1401	/*
1402	* See if a given @snapc is either writeable, or already written.
1403	*/
1404	static int context_is_writeable_or_written(struct inode *inode,
1405	struct ceph_snap_context *snapc)
1406	{
1407	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL);
1408	int ret = !oldest \|\| snapc->seq <= oldest->seq;
1409
1410	ceph_put_snap_context(sc: oldest);
1411	return ret;
1412	}
1413
1414	/**
1415	* ceph_find_incompatible - find an incompatible context and return it
1416	* @page: page being dirtied
1417	*
1418	* We are only allowed to write into/dirty a page if the page is
1419	* clean, or already dirty within the same snap context. Returns a
1420	* conflicting context if there is one, NULL if there isn't, or a
1421	* negative error code on other errors.
1422	*
1423	* Must be called with page lock held.
1424	*/
1425	static struct ceph_snap_context *
1426	ceph_find_incompatible(struct page *page)
1427	{
1428	struct inode *inode = page->mapping->host;
1429	struct ceph_client *cl = ceph_inode_to_client(inode);
1430	struct ceph_inode_info *ci = ceph_inode(inode);
1431
1432	if (ceph_inode_is_shutdown(inode)) {
1433	doutc(cl, " %llx.%llx page %p is shutdown\n",
1434	ceph_vinop(inode), page);
1435	return ERR_PTR(error: -ESTALE);
1436	}
1437
1438	for (;;) {
1439	struct ceph_snap_context snapc, oldest;
1440
1441	wait_on_page_writeback(page);
1442
1443	snapc = page_snap_context(page);
1444	if (!snapc \|\| snapc == ci->i_head_snapc)
1445	break;
1446
1447	/*
1448	* this page is already dirty in another (older) snap
1449	* context! is it writeable now?
1450	*/
1451	oldest = get_oldest_context(inode, NULL, NULL);
1452	if (snapc->seq > oldest->seq) {
1453	/ not writeable -- return it for the caller to deal with /
1454	ceph_put_snap_context(sc: oldest);
1455	doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n",
1456	ceph_vinop(inode), page, snapc);
1457	return ceph_get_snap_context(sc: snapc);
1458	}
1459	ceph_put_snap_context(sc: oldest);
1460
1461	/ yay, writeable, do it now (without dropping page lock) /
1462	doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n",
1463	ceph_vinop(inode), page, snapc);
1464	if (clear_page_dirty_for_io(page)) {
1465	int r = writepage_nounlock(page, NULL);
1466	if (r < `0`)
1467	return ERR_PTR(error: r);
1468	}
1469	}
1470	return NULL;
1471	}
1472
1473	static int ceph_netfs_check_write_begin(struct file file, loff_t pos, unsigned* int len,
1474	struct folio *foliop, void* **_fsdata)
1475	{
1476	struct inode *inode = file_inode(f: file);
1477	struct ceph_inode_info *ci = ceph_inode(inode);
1478	struct ceph_snap_context *snapc;
1479
1480	snapc = ceph_find_incompatible(folio_page(*foliop, `0`));
1481	if (snapc) {
1482	int r;
1483
1484	folio_unlock(folio: *foliop);
1485	folio_put(folio: *foliop);
1486	*foliop = NULL;
1487	if (IS_ERR(ptr: snapc))
1488	return PTR_ERR(ptr: snapc);
1489
1490	ceph_queue_writeback(inode);
1491	r = wait_event_killable(ci->i_cap_wq,
1492	context_is_writeable_or_written(inode, snapc));
1493	ceph_put_snap_context(sc: snapc);
1494	return r == `0` ? -EAGAIN : r;
1495	}
1496	return `0`;
1497	}
1498
1499	/*
1500	* We are only allowed to write into/dirty the page if the page is
1501	* clean, or already dirty within the same snap context.
1502	*/
1503	static int ceph_write_begin(struct file file, struct* address_space *mapping,
1504	loff_t pos, unsigned len,
1505	struct page *pagep, void* **fsdata)
1506	{
1507	struct inode *inode = file_inode(f: file);
1508	struct ceph_inode_info *ci = ceph_inode(inode);
1509	struct folio *folio = NULL;
1510	int r;
1511
1512	r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL);
1513	if (r < `0`)
1514	return r;
1515
1516	folio_wait_fscache(folio);
1517	WARN_ON_ONCE(!folio_test_locked(folio));
1518	*pagep = &folio->page;
1519	return `0`;
1520	}
1521
1522	/*
1523	* we don't do anything in here that simple_write_end doesn't do
1524	* except adjust dirty page accounting
1525	*/
1526	static int ceph_write_end(struct file file, struct* address_space *mapping,
1527	loff_t pos, unsigned len, unsigned copied,
1528	struct page subpage, void* *fsdata)
1529	{
1530	struct folio *folio = page_folio(subpage);
1531	struct inode *inode = file_inode(f: file);
1532	struct ceph_client *cl = ceph_inode_to_client(inode);
1533	bool check_cap = false;
1534
1535	doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode),
1536	file, folio, (int)pos, (int)copied, (int)len);
1537
1538	if (!folio_test_uptodate(folio)) {
1539	/ just return that nothing was copied on a short copy /
1540	if (copied < len) {
1541	copied = `0`;
1542	goto out;
1543	}
1544	folio_mark_uptodate(folio);
1545	}
1546
1547	/ did file size increase? /
1548	if (pos+copied > i_size_read(inode))
1549	check_cap = ceph_inode_set_size(inode, size: pos+copied);
1550
1551	folio_mark_dirty(folio);
1552
1553	out:
1554	folio_unlock(folio);
1555	folio_put(folio);
1556
1557	if (check_cap)
1558	ceph_check_caps(ci: ceph_inode(inode), CHECK_CAPS_AUTHONLY);
1559
1560	return copied;
1561	}
1562
1563	const struct address_space_operations ceph_aops = {
1564	.read_folio = netfs_read_folio,
1565	.readahead = netfs_readahead,
1566	.writepage = ceph_writepage,
1567	.writepages = ceph_writepages_start,
1568	.write_begin = ceph_write_begin,
1569	.write_end = ceph_write_end,
1570	.dirty_folio = ceph_dirty_folio,
1571	.invalidate_folio = ceph_invalidate_folio,
1572	.release_folio = netfs_release_folio,
1573	.direct_IO = noop_direct_IO,
1574	};
1575
1576	static void ceph_block_sigs(sigset_t *oldset)
1577	{
1578	sigset_t mask;
1579	siginitsetinv(set: &mask, sigmask(SIGKILL));
1580	sigprocmask(SIG_BLOCK, &mask, oldset);
1581	}
1582
1583	static void ceph_restore_sigs(sigset_t *oldset)
1584	{
1585	sigprocmask(SIG_SETMASK, oldset, NULL);
1586	}
1587
1588	/*
1589	* vm ops
1590	*/
1591	static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
1592	{
1593	struct vm_area_struct *vma = vmf->vma;
1594	struct inode *inode = file_inode(f: vma->vm_file);
1595	struct ceph_inode_info *ci = ceph_inode(inode);
1596	struct ceph_client *cl = ceph_inode_to_client(inode);
1597	struct ceph_file_info *fi = vma->vm_file->private_data;
1598	loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
1599	int want, got, err;
1600	sigset_t oldset;
1601	vm_fault_t ret = VM_FAULT_SIGBUS;
1602
1603	if (ceph_inode_is_shutdown(inode))
1604	return ret;
1605
1606	ceph_block_sigs(oldset: &oldset);
1607
1608	doutc(cl, "%llx.%llx %llu trying to get caps\n",
1609	ceph_vinop(inode), off);
1610	if (fi->fmode & CEPH_FILE_MODE_LAZY)
1611	want = CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO;
1612	else
1613	want = CEPH_CAP_FILE_CACHE;
1614
1615	got = `0`;
1616	err = ceph_get_caps(filp: vma->vm_file, CEPH_CAP_FILE_RD, want, endoff: -`1`, got: &got);
1617	if (err < `0`)
1618	goto out_restore;
1619
1620	doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode),
1621	off, ceph_cap_string(got));
1622
1623	if ((got & (CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO)) \|\|
1624	!ceph_has_inline_data(ci)) {
1625	CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
1626	ceph_add_rw_context(cf: fi, ctx: &rw_ctx);
1627	ret = filemap_fault(vmf);
1628	ceph_del_rw_context(cf: fi, ctx: &rw_ctx);
1629	doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n",
1630	ceph_vinop(inode), off, ceph_cap_string(got), ret);
1631	} else
1632	err = -EAGAIN;
1633
1634	ceph_put_cap_refs(ci, had: got);
1635
1636	if (err != -EAGAIN)
1637	goto out_restore;
1638
1639	/ read inline data /
1640	if (off >= PAGE_SIZE) {
1641	/ does not support inline data > PAGE_SIZE /
1642	ret = VM_FAULT_SIGBUS;
1643	} else {
1644	struct address_space *mapping = inode->i_mapping;
1645	struct page *page;
1646
1647	filemap_invalidate_lock_shared(mapping);
1648	page = find_or_create_page(mapping, index: `0`,
1649	gfp_mask: mapping_gfp_constraint(mapping, gfp_mask: ~__GFP_FS));
1650	if (!page) {
1651	ret = VM_FAULT_OOM;
1652	goto out_inline;
1653	}
1654	err = __ceph_do_getattr(inode, locked_page: page,
1655	CEPH_STAT_CAP_INLINE_DATA, force: true);
1656	if (err < `0` \|\| off >= i_size_read(inode)) {
1657	unlock_page(page);
1658	put_page(page);
1659	ret = vmf_error(err);
1660	goto out_inline;
1661	}
1662	if (err < PAGE_SIZE)
1663	zero_user_segment(page, start: err, PAGE_SIZE);
1664	else
1665	flush_dcache_page(page);
1666	SetPageUptodate(page);
1667	vmf->page = page;
1668	ret = VM_FAULT_MAJOR \| VM_FAULT_LOCKED;
1669	out_inline:
1670	filemap_invalidate_unlock_shared(mapping);
1671	doutc(cl, "%llx.%llx %llu read inline data ret %x\n",
1672	ceph_vinop(inode), off, ret);
1673	}
1674	out_restore:
1675	ceph_restore_sigs(oldset: &oldset);
1676	if (err < `0`)
1677	ret = vmf_error(err);
1678
1679	return ret;
1680	}
1681
1682	static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
1683	{
1684	struct vm_area_struct *vma = vmf->vma;
1685	struct inode *inode = file_inode(f: vma->vm_file);
1686	struct ceph_client *cl = ceph_inode_to_client(inode);
1687	struct ceph_inode_info *ci = ceph_inode(inode);
1688	struct ceph_file_info *fi = vma->vm_file->private_data;
1689	struct ceph_cap_flush *prealloc_cf;
1690	struct page *page = vmf->page;
1691	loff_t off = page_offset(page);
1692	loff_t size = i_size_read(inode);
1693	size_t len;
1694	int want, got, err;
1695	sigset_t oldset;
1696	vm_fault_t ret = VM_FAULT_SIGBUS;
1697
1698	if (ceph_inode_is_shutdown(inode))
1699	return ret;
1700
1701	prealloc_cf = ceph_alloc_cap_flush();
1702	if (!prealloc_cf)
1703	return VM_FAULT_OOM;
1704
1705	sb_start_pagefault(sb: inode->i_sb);
1706	ceph_block_sigs(oldset: &oldset);
1707
1708	if (off + thp_size(page) <= size)
1709	len = thp_size(page);
1710	else
1711	len = offset_in_thp(page, size);
1712
1713	doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
1714	ceph_vinop(inode), off, len, size);
1715	if (fi->fmode & CEPH_FILE_MODE_LAZY)
1716	want = CEPH_CAP_FILE_BUFFER \| CEPH_CAP_FILE_LAZYIO;
1717	else
1718	want = CEPH_CAP_FILE_BUFFER;
1719
1720	got = `0`;
1721	err = ceph_get_caps(filp: vma->vm_file, CEPH_CAP_FILE_WR, want, endoff: off + len, got: &got);
1722	if (err < `0`)
1723	goto out_free;
1724
1725	doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
1726	off, len, ceph_cap_string(got));
1727
1728	/ Update time before taking page lock /
1729	file_update_time(file: vma->vm_file);
1730	inode_inc_iversion_raw(inode);
1731
1732	do {
1733	struct ceph_snap_context *snapc;
1734
1735	lock_page(page);
1736
1737	if (page_mkwrite_check_truncate(page, inode) < `0`) {
1738	unlock_page(page);
1739	ret = VM_FAULT_NOPAGE;
1740	break;
1741	}
1742
1743	snapc = ceph_find_incompatible(page);
1744	if (!snapc) {
1745	/ success. we'll keep the page locked. /
1746	set_page_dirty(page);
1747	ret = VM_FAULT_LOCKED;
1748	break;
1749	}
1750
1751	unlock_page(page);
1752
1753	if (IS_ERR(ptr: snapc)) {
1754	ret = VM_FAULT_SIGBUS;
1755	break;
1756	}
1757
1758	ceph_queue_writeback(inode);
1759	err = wait_event_killable(ci->i_cap_wq,
1760	context_is_writeable_or_written(inode, snapc));
1761	ceph_put_snap_context(sc: snapc);
1762	} while (err == `0`);
1763
1764	if (ret == VM_FAULT_LOCKED) {
1765	int dirty;
1766	spin_lock(lock: &ci->i_ceph_lock);
1767	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
1768	pcf: &prealloc_cf);
1769	spin_unlock(lock: &ci->i_ceph_lock);
1770	if (dirty)
1771	__mark_inode_dirty(inode, dirty);
1772	}
1773
1774	doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n",
1775	ceph_vinop(inode), off, len, ceph_cap_string(got), ret);
1776	ceph_put_cap_refs_async(ci, had: got);
1777	out_free:
1778	ceph_restore_sigs(oldset: &oldset);
1779	sb_end_pagefault(sb: inode->i_sb);
1780	ceph_free_cap_flush(cf: prealloc_cf);
1781	if (err < `0`)
1782	ret = vmf_error(err);
1783	return ret;
1784	}
1785
1786	void ceph_fill_inline_data(struct inode inode, struct* page *locked_page,
1787	char *data, size_t len)
1788	{
1789	struct ceph_client *cl = ceph_inode_to_client(inode);
1790	struct address_space *mapping = inode->i_mapping;
1791	struct page *page;
1792
1793	if (locked_page) {
1794	page = locked_page;
1795	} else {
1796	if (i_size_read(inode) == `0`)
1797	return;
1798	page = find_or_create_page(mapping, index: `0`,
1799	gfp_mask: mapping_gfp_constraint(mapping,
1800	gfp_mask: ~__GFP_FS));
1801	if (!page)
1802	return;
1803	if (PageUptodate(page)) {
1804	unlock_page(page);
1805	put_page(page);
1806	return;
1807	}
1808	}
1809
1810	doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode,
1811	ceph_vinop(inode), len, locked_page);
1812
1813	if (len > `0`) {
1814	void *kaddr = kmap_atomic(page);
1815	memcpy(kaddr, data, len);
1816	kunmap_atomic(kaddr);
1817	}
1818
1819	if (page != locked_page) {
1820	if (len < PAGE_SIZE)
1821	zero_user_segment(page, start: len, PAGE_SIZE);
1822	else
1823	flush_dcache_page(page);
1824
1825	SetPageUptodate(page);
1826	unlock_page(page);
1827	put_page(page);
1828	}
1829	}
1830
1831	int ceph_uninline_data(struct file *file)
1832	{
1833	struct inode *inode = file_inode(f: file);
1834	struct ceph_inode_info *ci = ceph_inode(inode);
1835	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
1836	struct ceph_client *cl = fsc->client;
1837	struct ceph_osd_request *req = NULL;
1838	struct ceph_cap_flush *prealloc_cf = NULL;
1839	struct folio *folio = NULL;
1840	u64 inline_version = CEPH_INLINE_NONE;
1841	struct page *pages[`1`];
1842	int err = `0`;
1843	u64 len;
1844
1845	spin_lock(lock: &ci->i_ceph_lock);
1846	inline_version = ci->i_inline_version;
1847	spin_unlock(lock: &ci->i_ceph_lock);
1848
1849	doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode),
1850	inline_version);
1851
1852	if (ceph_inode_is_shutdown(inode)) {
1853	err = -EIO;
1854	goto out;
1855	}
1856
1857	if (inline_version == CEPH_INLINE_NONE)
1858	return `0`;
1859
1860	prealloc_cf = ceph_alloc_cap_flush();
1861	if (!prealloc_cf)
1862	return -ENOMEM;
1863
1864	if (inline_version == `1`) / initial version, no data /
1865	goto out_uninline;
1866
1867	folio = read_mapping_folio(mapping: inode->i_mapping, index: `0`, file);
1868	if (IS_ERR(ptr: folio)) {
1869	err = PTR_ERR(ptr: folio);
1870	goto out;
1871	}
1872
1873	folio_lock(folio);
1874
1875	len = i_size_read(inode);
1876	if (len > folio_size(folio))
1877	len = folio_size(folio);
1878
1879	req = ceph_osdc_new_request(&fsc->client->osdc, layout: &ci->i_layout,
1880	vino: ceph_vino(inode), offset: `0`, len: &len, which: `0`, num_ops: `1`,
1881	opcode: CEPH_OSD_OP_CREATE, flags: CEPH_OSD_FLAG_WRITE,
1882	NULL, truncate_seq: `0`, truncate_size: `0`, use_mempool: false);
1883	if (IS_ERR(ptr: req)) {
1884	err = PTR_ERR(ptr: req);
1885	goto out_unlock;
1886	}
1887
1888	req->r_mtime = inode_get_mtime(inode);
1889	ceph_osdc_start_request(osdc: &fsc->client->osdc, req);
1890	err = ceph_osdc_wait_request(osdc: &fsc->client->osdc, req);
1891	ceph_osdc_put_request(req);
1892	if (err < `0`)
1893	goto out_unlock;
1894
1895	req = ceph_osdc_new_request(&fsc->client->osdc, layout: &ci->i_layout,
1896	vino: ceph_vino(inode), offset: `0`, len: &len, which: `1`, num_ops: `3`,
1897	opcode: CEPH_OSD_OP_WRITE, flags: CEPH_OSD_FLAG_WRITE,
1898	NULL, truncate_seq: ci->i_truncate_seq,
1899	truncate_size: ci->i_truncate_size, use_mempool: false);
1900	if (IS_ERR(ptr: req)) {
1901	err = PTR_ERR(ptr: req);
1902	goto out_unlock;
1903	}
1904
1905	pages[`0`] = folio_page(folio, `0`);
1906	osd_req_op_extent_osd_data_pages(req, which: `1`, pages, length: len, alignment: `0`, pages_from_pool: false, own_pages: false);
1907
1908	{
1909	__le64 xattr_buf = cpu_to_le64(inline_version);
1910	err = osd_req_op_xattr_init(osd_req: req, which: `0`, opcode: CEPH_OSD_OP_CMPXATTR,
1911	name: "inline_version", value: &xattr_buf,
1912	size: sizeof(xattr_buf),
1913	cmp_op: CEPH_OSD_CMPXATTR_OP_GT,
1914	cmp_mode: CEPH_OSD_CMPXATTR_MODE_U64);
1915	if (err)
1916	goto out_put_req;
1917	}
1918
1919	{
1920	char xattr_buf[`32`];
1921	int xattr_len = snprintf(buf: xattr_buf, size: sizeof(xattr_buf),
1922	fmt: "%llu", inline_version);
1923	err = osd_req_op_xattr_init(osd_req: req, which: `2`, opcode: CEPH_OSD_OP_SETXATTR,
1924	name: "inline_version",
1925	value: xattr_buf, size: xattr_len, cmp_op: `0`, cmp_mode: `0`);
1926	if (err)
1927	goto out_put_req;
1928	}
1929
1930	req->r_mtime = inode_get_mtime(inode);
1931	ceph_osdc_start_request(osdc: &fsc->client->osdc, req);
1932	err = ceph_osdc_wait_request(osdc: &fsc->client->osdc, req);
1933
1934	ceph_update_write_metrics(m: &fsc->mdsc->metric, r_start: req->r_start_latency,
1935	r_end: req->r_end_latency, size: len, rc: err);
1936
1937	out_uninline:
1938	if (!err) {
1939	int dirty;
1940
1941	/ Set to CAP_INLINE_NONE and dirty the caps /
1942	down_read(sem: &fsc->mdsc->snap_rwsem);
1943	spin_lock(lock: &ci->i_ceph_lock);
1944	ci->i_inline_version = CEPH_INLINE_NONE;
1945	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, pcf: &prealloc_cf);
1946	spin_unlock(lock: &ci->i_ceph_lock);
1947	up_read(sem: &fsc->mdsc->snap_rwsem);
1948	if (dirty)
1949	__mark_inode_dirty(inode, dirty);
1950	}
1951	out_put_req:
1952	ceph_osdc_put_request(req);
1953	if (err == -ECANCELED)
1954	err = `0`;
1955	out_unlock:
1956	if (folio) {
1957	folio_unlock(folio);
1958	folio_put(folio);
1959	}
1960	out:
1961	ceph_free_cap_flush(cf: prealloc_cf);
1962	doutc(cl, "%llx.%llx inline_version %llu = %d\n",
1963	ceph_vinop(inode), inline_version, err);
1964	return err;
1965	}
1966
1967	static const struct vm_operations_struct ceph_vmops = {
1968	.fault = ceph_filemap_fault,
1969	.page_mkwrite = ceph_page_mkwrite,
1970	};
1971
1972	int ceph_mmap(struct file file, struct* vm_area_struct *vma)
1973	{
1974	struct address_space *mapping = file->f_mapping;
1975
1976	if (!mapping->a_ops->read_folio)
1977	return -ENOEXEC;
1978	vma->vm_ops = &ceph_vmops;
1979	return `0`;
1980	}
1981
/* Pool permission bits; the values are distinct bits. */
enum {
	POOL_READ	= 1,
	POOL_WRITE	= 2,
};
1986
1987	static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
1988	s64 pool, struct ceph_string *pool_ns)
1989	{
1990	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode: &ci->netfs.inode);
1991	struct ceph_mds_client *mdsc = fsc->mdsc;
1992	struct ceph_client *cl = fsc->client;
1993	struct ceph_osd_request rd_req = NULL, wr_req = NULL;
1994	struct rb_node *p, parent;
1995	struct ceph_pool_perm *perm;
1996	struct page **pages;
1997	size_t pool_ns_len;
1998	int err = `0`, err2 = `0`, have = `0`;
1999
2000	down_read(sem: &mdsc->pool_perm_rwsem);
2001	p = &mdsc->pool_perm_tree.rb_node;
2002	while (*p) {
2003	perm = rb_entry(p, struct* ceph_pool_perm, node);
2004	if (pool < perm->pool)
2005	p = &(*p)->rb_left;
2006	else if (pool > perm->pool)
2007	p = &(*p)->rb_right;
2008	else {
2009	int ret = ceph_compare_string(cs: pool_ns,
2010	str: perm->pool_ns,
2011	len: perm->pool_ns_len);
2012	if (ret < `0`)
2013	p = &(*p)->rb_left;
2014	else if (ret > `0`)
2015	p = &(*p)->rb_right;
2016	else {
2017	have = perm->perm;
2018	break;
2019	}
2020	}
2021	}
2022	up_read(sem: &mdsc->pool_perm_rwsem);
2023	if (*p)
2024	goto out;
2025
2026	if (pool_ns)
2027	doutc(cl, "pool %lld ns %.*s no perm cached\n", pool,
2028	(int)pool_ns->len, pool_ns->str);
2029	else
2030	doutc(cl, "pool %lld no perm cached\n", pool);
2031
2032	down_write(sem: &mdsc->pool_perm_rwsem);
2033	p = &mdsc->pool_perm_tree.rb_node;
2034	parent = NULL;
2035	while (*p) {
2036	parent = *p;
2037	perm = rb_entry(parent, struct ceph_pool_perm, node);
2038	if (pool < perm->pool)
2039	p = &(*p)->rb_left;
2040	else if (pool > perm->pool)
2041	p = &(*p)->rb_right;
2042	else {
2043	int ret = ceph_compare_string(cs: pool_ns,
2044	str: perm->pool_ns,
2045	len: perm->pool_ns_len);
2046	if (ret < `0`)
2047	p = &(*p)->rb_left;
2048	else if (ret > `0`)
2049	p = &(*p)->rb_right;
2050	else {
2051	have = perm->perm;
2052	break;
2053	}
2054	}
2055	}
2056	if (*p) {
2057	up_write(sem: &mdsc->pool_perm_rwsem);
2058	goto out;
2059	}
2060
2061	rd_req = ceph_osdc_alloc_request(osdc: &fsc->client->osdc, NULL,
2062	num_ops: `1`, use_mempool: false, GFP_NOFS);
2063	if (!rd_req) {
2064	err = -ENOMEM;
2065	goto out_unlock;
2066	}
2067
2068	rd_req->r_flags = CEPH_OSD_FLAG_READ;
2069	osd_req_op_init(osd_req: rd_req, which: `0`, opcode: CEPH_OSD_OP_STAT, flags: `0`);
2070	rd_req->r_base_oloc.pool = pool;
2071	if (pool_ns)
2072	rd_req->r_base_oloc.pool_ns = ceph_get_string(str: pool_ns);
2073	ceph_oid_printf(oid: &rd_req->r_base_oid, fmt: "%llx.00000000", ci->i_vino.ino);
2074
2075	err = ceph_osdc_alloc_messages(req: rd_req, GFP_NOFS);
2076	if (err)
2077	goto out_unlock;
2078
2079	wr_req = ceph_osdc_alloc_request(osdc: &fsc->client->osdc, NULL,
2080	num_ops: `1`, use_mempool: false, GFP_NOFS);
2081	if (!wr_req) {
2082	err = -ENOMEM;
2083	goto out_unlock;
2084	}
2085
2086	wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
2087	osd_req_op_init(osd_req: wr_req, which: `0`, opcode: CEPH_OSD_OP_CREATE, flags: CEPH_OSD_OP_FLAG_EXCL);
2088	ceph_oloc_copy(dest: &wr_req->r_base_oloc, src: &rd_req->r_base_oloc);
2089	ceph_oid_copy(dest: &wr_req->r_base_oid, src: &rd_req->r_base_oid);
2090
2091	err = ceph_osdc_alloc_messages(req: wr_req, GFP_NOFS);
2092	if (err)
2093	goto out_unlock;
2094
2095	/ one page should be large enough for STAT data /
2096	pages = ceph_alloc_page_vector(num_pages: `1`, GFP_KERNEL);
2097	if (IS_ERR(ptr: pages)) {
2098	err = PTR_ERR(ptr: pages);
2099	goto out_unlock;
2100	}
2101
2102	osd_req_op_raw_data_in_pages(rd_req, which: `0`, pages, PAGE_SIZE,
2103	alignment: `0`, pages_from_pool: false, own_pages: true);
2104	ceph_osdc_start_request(osdc: &fsc->client->osdc, req: rd_req);
2105
2106	wr_req->r_mtime = inode_get_mtime(inode: &ci->netfs.inode);
2107	ceph_osdc_start_request(osdc: &fsc->client->osdc, req: wr_req);
2108
2109	err = ceph_osdc_wait_request(osdc: &fsc->client->osdc, req: rd_req);
2110	err2 = ceph_osdc_wait_request(osdc: &fsc->client->osdc, req: wr_req);
2111
2112	if (err >= `0` \|\| err == -ENOENT)
2113	have \|= POOL_READ;
2114	else if (err != -EPERM) {
2115	if (err == -EBLOCKLISTED)
2116	fsc->blocklisted = true;
2117	goto out_unlock;
2118	}
2119
2120	if (err2 == `0` \|\| err2 == -EEXIST)
2121	have \|= POOL_WRITE;
2122	else if (err2 != -EPERM) {
2123	if (err2 == -EBLOCKLISTED)
2124	fsc->blocklisted = true;
2125	err = err2;
2126	goto out_unlock;
2127	}
2128
2129	pool_ns_len = pool_ns ? pool_ns->len : `0`;
2130	perm = kmalloc(size: sizeof(*perm) + pool_ns_len + `1`, GFP_NOFS);
2131	if (!perm) {
2132	err = -ENOMEM;
2133	goto out_unlock;
2134	}
2135
2136	perm->pool = pool;
2137	perm->perm = have;
2138	perm->pool_ns_len = pool_ns_len;
2139	if (pool_ns_len > `0`)
2140	memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
2141	perm->pool_ns[pool_ns_len] = `0`;
2142
2143	rb_link_node(node: &perm->node, parent, rb_link: p);
2144	rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
2145	err = `0`;
2146	out_unlock:
2147	up_write(sem: &mdsc->pool_perm_rwsem);
2148
2149	ceph_osdc_put_request(req: rd_req);
2150	ceph_osdc_put_request(req: wr_req);
2151	out:
2152	if (!err)
2153	err = have;
2154	if (pool_ns)
2155	doutc(cl, "pool %lld ns %.*s result = %d\n", pool,
2156	(int)pool_ns->len, pool_ns->str, err);
2157	else
2158	doutc(cl, "pool %lld result = %d\n", pool, err);
2159	return err;
2160	}
2161
2162	int ceph_pool_perm_check(struct inode inode, int* need)
2163	{
2164	struct ceph_client *cl = ceph_inode_to_client(inode);
2165	struct ceph_inode_info *ci = ceph_inode(inode);
2166	struct ceph_string *pool_ns;
2167	s64 pool;
2168	int ret, flags;
2169
2170	/ Only need to do this for regular files /
2171	if (!S_ISREG(inode->i_mode))
2172	return `0`;
2173
2174	if (ci->i_vino.snap != CEPH_NOSNAP) {
2175	/*
2176	* Pool permission check needs to write to the first object.
2177	* But for snapshot, head of the first object may have alread
2178	* been deleted. Skip check to avoid creating orphan object.
2179	*/
2180	return `0`;
2181	}
2182
2183	if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode),
2184	NOPOOLPERM))
2185	return `0`;
2186
2187	spin_lock(lock: &ci->i_ceph_lock);
2188	flags = ci->i_ceph_flags;
2189	pool = ci->i_layout.pool_id;
2190	spin_unlock(lock: &ci->i_ceph_lock);
2191	check:
2192	if (flags & CEPH_I_POOL_PERM) {
2193	if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
2194	doutc(cl, "pool %lld no read perm\n", pool);
2195	return -EPERM;
2196	}
2197	if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
2198	doutc(cl, "pool %lld no write perm\n", pool);
2199	return -EPERM;
2200	}
2201	return `0`;
2202	}
2203
2204	pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
2205	ret = __ceph_pool_perm_get(ci, pool, pool_ns);
2206	ceph_put_string(str: pool_ns);
2207	if (ret < `0`)
2208	return ret;
2209
2210	flags = CEPH_I_POOL_PERM;
2211	if (ret & POOL_READ)
2212	flags \|= CEPH_I_POOL_RD;
2213	if (ret & POOL_WRITE)
2214	flags \|= CEPH_I_POOL_WR;
2215
2216	spin_lock(lock: &ci->i_ceph_lock);
2217	if (pool == ci->i_layout.pool_id &&
2218	pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
2219	ci->i_ceph_flags \|= flags;
2220	} else {
2221	pool = ci->i_layout.pool_id;
2222	flags = ci->i_ceph_flags;
2223	}
2224	spin_unlock(lock: &ci->i_ceph_lock);
2225	goto check;
2226	}
2227
2228	void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
2229	{
2230	struct ceph_pool_perm *perm;
2231	struct rb_node *n;
2232
2233	while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
2234	n = rb_first(&mdsc->pool_perm_tree);
2235	perm = rb_entry(n, struct ceph_pool_perm, node);
2236	rb_erase(n, &mdsc->pool_perm_tree);
2237	kfree(objp: perm);
2238	}
2239	}
2240

source code of linux/fs/ceph/addr.c