// SPDX-License-Identifier: GPL-2.0
/*
 * (C) 2001 Clemson University and The University of Chicago
 * Copyright 2018 Omnibond Systems, L.L.C.
 *
 * See COPYING in top-level directory.
 */

/*
 * Linux VFS inode operations.
 */

#include <linux/blkdev.h>
#include <linux/fileattr.h>
#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"

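/*
 * Write a single locked folio back to the server.  The folio's private
 * data, when present, is an orangefs_write_range recording which bytes
 * are dirty and under which uid/gid they were written.
 */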
static int orangefs_writepage_locked(struct folio *folio,
		struct writeback_control *wbc)
{
	struct inode *inode = folio->mapping->host;
	struct orangefs_write_range *wr = NULL;
	struct iov_iter iter;
	struct bio_vec bv;
	size_t wlen;
	ssize_t ret;
	loff_t len, off;

	folio_start_writeback(folio);

	len = i_size_read(inode);
	if (folio->private) {
		wr = folio->private;
		off = wr->pos;
		if ((off + wr->len > len) && (off <= len))
			wlen = len - off;
		else
			wlen = wr->len;
		if (wlen == 0)
			wlen = wr->len;
	} else {
		WARN_ON(1);
		off = folio_pos(folio);
		wlen = folio_size(folio);

		if (wlen > len - off)
			wlen = len - off;
	}

	WARN_ON(wlen == 0);
	bvec_set_folio(&bv, folio, wlen, offset_in_folio(folio, off));
	iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen);

	ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
	    len, wr, NULL, NULL);
	if (ret < 0) {
		mapping_set_error(folio->mapping, ret);
	} else {
		ret = 0;
	}
	kfree(folio_detach_private(folio));
	return ret;
}

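/*
 * State carried across orangefs_writepages_callback() invocations so
 * that contiguous dirty folios written under the same uid/gid can be
 * coalesced into a single server write.
 */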
struct orangefs_writepages {
	loff_t off;
	size_t len;
	kuid_t uid;
	kgid_t gid;
	int maxpages;
	int nfolios;
	struct address_space *mapping;
	struct folio **folios;
	struct bio_vec *bv;
};

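/*
 * Flush the folios accumulated in *ow as one ranged write, then end
 * writeback and unlock each folio.
 */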
static int orangefs_writepages_work(struct orangefs_writepages *ow,
		struct writeback_control *wbc)
{
	struct inode *inode = ow->mapping->host;
	struct orangefs_write_range *wrp, wr;
	struct iov_iter iter;
	ssize_t ret;
	size_t start;
	loff_t len, off;
	int i;

	len = i_size_read(inode);

	start = offset_in_folio(ow->folios[0], ow->off);
	for (i = 0; i < ow->nfolios; i++) {
		folio_start_writeback(ow->folios[i]);
		bvec_set_folio(&ow->bv[i], ow->folios[i],
		    folio_size(ow->folios[i]) - start, start);
		start = 0;
	}
	iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->nfolios, ow->len);

	WARN_ON(ow->off >= len);
	if (ow->off + ow->len > len)
		ow->len = len - ow->off;

	off = ow->off;
	wr.uid = ow->uid;
	wr.gid = ow->gid;
	ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len,
	    0, &wr, NULL, NULL);
	if (ret < 0)
		mapping_set_error(ow->mapping, ret);
	else
		ret = 0;

	for (i = 0; i < ow->nfolios; i++) {
		wrp = folio_detach_private(ow->folios[i]);
		kfree(wrp);
		folio_end_writeback(ow->folios[i]);
		folio_unlock(ow->folios[i]);
	}

	return ret;
}

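/*
 * Add a dirty folio to the batch in *ow.  A folio joins the batch only
 * when it is byte-contiguous with it and carries the same uid/gid;
 * otherwise the batch is flushed and the folio is written out on its
 * own via orangefs_writepage_locked().
 */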
static int orangefs_writepages_callback(struct folio *folio,
		struct writeback_control *wbc, struct orangefs_writepages *ow)
{
	struct orangefs_write_range *wr = folio->private;
	int ret;

	if (!wr) {
		folio_unlock(folio);
		/* It's not private so there's nothing to write, right? */
		printk("writepages_callback not private!\n");
		BUG();
		return 0;
	}

	ret = -1;
	if (ow->nfolios == 0) {
		ow->off = wr->pos;
		ow->len = wr->len;
		ow->uid = wr->uid;
		ow->gid = wr->gid;
		ow->folios[ow->nfolios++] = folio;
		ret = 0;
		goto done;
	}
	if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
		orangefs_writepages_work(ow, wbc);
		ow->nfolios = 0;
		ret = -1;
		goto done;
	}
	if (ow->off + ow->len == wr->pos) {
		ow->len += wr->len;
		ow->folios[ow->nfolios++] = folio;
		ret = 0;
		goto done;
	}
done:
	if (ret == -1) {
		if (ow->nfolios) {
			orangefs_writepages_work(ow, wbc);
			ow->nfolios = 0;
		}
		ret = orangefs_writepage_locked(folio, wbc);
		mapping_set_error(folio->mapping, ret);
		folio_unlock(folio);
		folio_end_writeback(folio);
	} else {
		if (ow->nfolios == ow->maxpages) {
			orangefs_writepages_work(ow, wbc);
			ow->nfolios = 0;
		}
	}
	return ret;
}

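/*
 * writepages implementation: walk the dirty folios with writeback_iter()
 * and batch them through orangefs_writepages_callback().  The batch is
 * capped at the shared-buffer size, presumably so that each flush fits
 * in a single client-core bufmap slot.
 */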
static int orangefs_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct orangefs_writepages *ow;
	struct blk_plug plug;
	int error;
	struct folio *folio = NULL;

	ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
	if (!ow)
		return -ENOMEM;
	ow->maxpages = orangefs_bufmap_size_query() / PAGE_SIZE;
	ow->folios = kcalloc(ow->maxpages, sizeof(struct folio *), GFP_KERNEL);
	if (!ow->folios) {
		kfree(ow);
		return -ENOMEM;
	}
	ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL);
	if (!ow->bv) {
		kfree(ow->folios);
		kfree(ow);
		return -ENOMEM;
	}
	ow->mapping = mapping;
	blk_start_plug(&plug);
	while ((folio = writeback_iter(mapping, wbc, folio, &error)))
		error = orangefs_writepages_callback(folio, wbc, ow);
	if (ow->nfolios)
		error = orangefs_writepages_work(ow, wbc);
	blk_finish_plug(&plug);
	kfree(ow->folios);
	kfree(ow->bv);
	kfree(ow);
	return error;
}

static int orangefs_launder_folio(struct folio *);

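/*
 * Fill the readahead window with a single server read.  When plenty of
 * the file remains, the window is first expanded (to at most 4 MiB, or
 * to the end of the file), presumably to amortize the round trip to the
 * server.
 */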
static void orangefs_readahead(struct readahead_control *rac)
{
	loff_t offset;
	struct iov_iter iter;
	struct inode *inode = rac->mapping->host;
	struct xarray *i_pages;
	struct folio *folio;
	loff_t new_start = readahead_pos(rac);
	int ret;
	size_t new_len = 0;

	loff_t bytes_remaining = inode->i_size - readahead_pos(rac);
	loff_t pages_remaining = bytes_remaining / PAGE_SIZE;

	if (pages_remaining >= 1024)
		new_len = 4194304;
	else if (pages_remaining > readahead_count(rac))
		new_len = bytes_remaining;

	if (new_len)
		readahead_expand(rac, new_start, new_len);

	offset = readahead_pos(rac);
	i_pages = &rac->mapping->i_pages;

	iov_iter_xarray(&iter, ITER_DEST, i_pages, offset, readahead_length(rac));

	/* read in the pages. */
	if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
			&offset, &iter, readahead_length(rac),
			inode->i_size, NULL, NULL, rac->file)) < 0)
		gossip_debug(GOSSIP_FILE_DEBUG,
			"%s: wait_for_direct_io failed.\n", __func__);
	else
		ret = 0;

	/* clean up. */
	while ((folio = readahead_folio(rac))) {
		if (!ret)
			folio_mark_uptodate(folio);
		folio_unlock(folio);
	}
}

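/*
 * Read one folio from the server.  A dirty folio is laundered first so
 * the read cannot clobber changes that have not reached the server yet.
 */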
static int orangefs_read_folio(struct file *file, struct folio *folio)
{
	struct inode *inode = folio->mapping->host;
	struct iov_iter iter;
	struct bio_vec bv;
	ssize_t ret;
	loff_t off; /* offset of this folio in the file */

	if (folio_test_dirty(folio))
		orangefs_launder_folio(folio);

	off = folio_pos(folio);
	bvec_set_folio(&bv, folio, folio_size(folio), 0);
	iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio));

	ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
	    folio_size(folio), inode->i_size, NULL, NULL, file);
	/* this will only zero remaining unread portions of the folio data */
	iov_iter_zero(~0U, &iter);
	/* takes care of potential aliasing */
	flush_dcache_folio(folio);
	if (ret > 0)
		ret = 0;
	folio_end_read(folio, ret == 0);
	return ret;
}

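/*
 * Prepare a folio for a buffered write.  Each folio tracks at most one
 * pending write range: a new write either extends that range (same
 * credentials, contiguous offset) or forces the folio to be laundered
 * before the range is replaced.
 */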
static int orangefs_write_begin(struct file *file,
		struct address_space *mapping, loff_t pos, unsigned len,
		struct folio **foliop, void **fsdata)
{
	struct orangefs_write_range *wr;
	struct folio *folio;
	int ret;

	folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN,
			mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	*foliop = folio;

	if (folio_test_dirty(folio) && !folio_test_private(folio)) {
		/*
		 * Should be impossible.  If it happens, launder the page
		 * since we don't know what's dirty.  This will WARN in
		 * orangefs_writepage_locked.
		 */
		ret = orangefs_launder_folio(folio);
		if (ret)
			return ret;
	}
	if (folio_test_private(folio)) {
		struct orangefs_write_range *wr;

		wr = folio_get_private(folio);
		if (wr->pos + wr->len == pos &&
		    uid_eq(wr->uid, current_fsuid()) &&
		    gid_eq(wr->gid, current_fsgid())) {
			wr->len += len;
			goto okay;
		} else {
			wr->pos = pos;
			wr->len = len;
			ret = orangefs_launder_folio(folio);
			if (ret)
				return ret;
		}
	}

	wr = kmalloc(sizeof *wr, GFP_KERNEL);
	if (!wr)
		return -ENOMEM;

	wr->pos = pos;
	wr->len = len;
	wr->uid = current_fsuid();
	wr->gid = current_fsgid();
	folio_attach_private(folio, wr);
okay:
	return 0;
}

static int orangefs_write_end(struct file *file, struct address_space *mapping,
	loff_t pos, unsigned len, unsigned copied, struct folio *folio,
	void *fsdata)
{
	struct inode *inode = folio->mapping->host;
	loff_t last_pos = pos + copied;

	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold the i_mutex.
	 */
	if (last_pos > inode->i_size)
		i_size_write(inode, last_pos);

	/* zero the stale part of the folio if we did a short copy */
	if (!folio_test_uptodate(folio)) {
		unsigned from = pos & (PAGE_SIZE - 1);
		if (copied < len) {
			folio_zero_range(folio, from + copied, len - copied);
		}
		/* Set fully written pages uptodate. */
		if (pos == folio_pos(folio) &&
		    (len == PAGE_SIZE || pos + len == inode->i_size)) {
			folio_zero_segment(folio, from + copied, PAGE_SIZE);
			folio_mark_uptodate(folio);
		}
	}

	folio_mark_dirty(folio);
	folio_unlock(folio);
	folio_put(folio);

	mark_inode_dirty_sync(file_inode(file));
	return copied;
}

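/*
 * Trim or drop the folio's pending write range when part of the folio
 * is invalidated.  Each branch below handles one way the invalidated
 * byte range can overlap the recorded write range.
 */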
static void orangefs_invalidate_folio(struct folio *folio,
		size_t offset, size_t length)
{
	struct orangefs_write_range *wr = folio_get_private(folio);

	if (offset == 0 && length == PAGE_SIZE) {
		kfree(folio_detach_private(folio));
		return;
	/* write range entirely within invalidate range (or equal) */
	} else if (folio_pos(folio) + offset <= wr->pos &&
	    wr->pos + wr->len <= folio_pos(folio) + offset + length) {
		kfree(folio_detach_private(folio));
		/* XXX is this right? only caller in fs */
		folio_cancel_dirty(folio);
		return;
	/* invalidate range chops off end of write range */
	} else if (wr->pos < folio_pos(folio) + offset &&
	    wr->pos + wr->len <= folio_pos(folio) + offset + length &&
	    folio_pos(folio) + offset < wr->pos + wr->len) {
		size_t x;
		x = wr->pos + wr->len - (folio_pos(folio) + offset);
		WARN_ON(x > wr->len);
		wr->len -= x;
		wr->uid = current_fsuid();
		wr->gid = current_fsgid();
	/* invalidate range chops off beginning of write range */
	} else if (folio_pos(folio) + offset <= wr->pos &&
	    folio_pos(folio) + offset + length < wr->pos + wr->len &&
	    wr->pos < folio_pos(folio) + offset + length) {
		size_t x;
		x = folio_pos(folio) + offset + length - wr->pos;
		WARN_ON(x > wr->len);
		wr->pos += x;
		wr->len -= x;
		wr->uid = current_fsuid();
		wr->gid = current_fsgid();
	/* invalidate range entirely within write range (punch hole) */
	} else if (wr->pos < folio_pos(folio) + offset &&
	    folio_pos(folio) + offset + length < wr->pos + wr->len) {
		/* XXX what do we do here... should not WARN_ON */
		WARN_ON(1);
		/* punch hole */
		/*
		 * should we just ignore this and write it out anyway?
		 * it hardly makes sense
		 */
		return;
	/* non-overlapping ranges */
	} else {
		/* WARN if they do overlap */
		if (!((folio_pos(folio) + offset + length <= wr->pos) ^
		    (wr->pos + wr->len <= folio_pos(folio) + offset))) {
			WARN_ON(1);
			printk("invalidate range offset %llu length %zu\n",
			    folio_pos(folio) + offset, length);
			printk("write range offset %llu length %zu\n",
			    wr->pos, wr->len);
		}
		return;
	}

	/*
	 * Above there are returns where wr is freed or where we WARN.
	 * Thus the following runs if wr was modified above.
	 */

	orangefs_launder_folio(folio);
}

static bool orangefs_release_folio(struct folio *folio, gfp_t foo)
{
	return !folio_test_private(folio);
}

static void orangefs_free_folio(struct folio *folio)
{
	kfree(folio_detach_private(folio));
}

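/*
 * Synchronously write a dirty folio back to the server while keeping it
 * locked, so the caller sees stable, clean contents on return.
 */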
static int orangefs_launder_folio(struct folio *folio)
{
	int r = 0;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 0,
	};
	folio_wait_writeback(folio);
	if (folio_clear_dirty_for_io(folio)) {
		r = orangefs_writepage_locked(folio, &wbc);
		folio_end_writeback(folio);
	}
	return r;
}

static ssize_t orangefs_direct_IO(struct kiocb *iocb,
		struct iov_iter *iter)
{
	/*
	 * Comment from original do_readv_writev:
	 * Common entry point for read/write/readv/writev
	 * This function will dispatch it to either the direct I/O
	 * or buffered I/O path depending on the mount options and/or
	 * augmented/extended metadata attached to the file.
	 * Note: File extended attributes override any mount options.
	 */
	struct file *file = iocb->ki_filp;
	loff_t pos = iocb->ki_pos;
	enum ORANGEFS_io_type type = iov_iter_rw(iter) == WRITE ?
	    ORANGEFS_IO_WRITE : ORANGEFS_IO_READ;
	loff_t *offset = &pos;
	struct inode *inode = file->f_mapping->host;
	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
	struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
	size_t count = iov_iter_count(iter);
	ssize_t total_count = 0;
	ssize_t ret = -EINVAL;

	gossip_debug(GOSSIP_FILE_DEBUG,
		"%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
		__func__,
		handle,
		(int)count);

	if (type == ORANGEFS_IO_WRITE) {
		gossip_debug(GOSSIP_FILE_DEBUG,
			"%s(%pU): proceeding with offset : %llu, "
			"size %d\n",
			__func__,
			handle,
			llu(*offset),
			(int)count);
	}

	if (count == 0) {
		ret = 0;
		goto out;
	}

	while (iov_iter_count(iter)) {
		size_t each_count = iov_iter_count(iter);
		size_t amt_complete;

		/* how much to transfer in this loop iteration */
		if (each_count > orangefs_bufmap_size_query())
			each_count = orangefs_bufmap_size_query();

		gossip_debug(GOSSIP_FILE_DEBUG,
			"%s(%pU): size of each_count(%d)\n",
			__func__,
			handle,
			(int)each_count);
		gossip_debug(GOSSIP_FILE_DEBUG,
			"%s(%pU): BEFORE wait_for_io: offset is %d\n",
			__func__,
			handle,
			(int)*offset);

		ret = wait_for_direct_io(type, inode, offset, iter,
			each_count, 0, NULL, NULL, file);
		gossip_debug(GOSSIP_FILE_DEBUG,
			"%s(%pU): return from wait_for_io:%d\n",
			__func__,
			handle,
			(int)ret);

		if (ret < 0)
			goto out;

		*offset += ret;
		total_count += ret;
		amt_complete = ret;

		gossip_debug(GOSSIP_FILE_DEBUG,
			"%s(%pU): AFTER wait_for_io: offset is %d\n",
			__func__,
			handle,
			(int)*offset);

		/*
		 * if we got a short I/O operation,
		 * fall out and return what we got so far
		 */
		if (amt_complete < each_count)
			break;
	} /* end while */

out:
	if (total_count > 0)
		ret = total_count;
	if (ret > 0) {
		if (type == ORANGEFS_IO_READ) {
			file_accessed(file);
		} else {
			file_update_time(file);
			if (*offset > i_size_read(inode))
				i_size_write(inode, *offset);
		}
	}

	gossip_debug(GOSSIP_FILE_DEBUG,
		"%s(%pU): Value(%d) returned.\n",
		__func__,
		handle,
		(int)ret);

	return ret;
}

/** ORANGEFS2 implementation of address space operations */
static const struct address_space_operations orangefs_address_operations = {
	.readahead = orangefs_readahead,
	.read_folio = orangefs_read_folio,
	.writepages = orangefs_writepages,
	.dirty_folio = filemap_dirty_folio,
	.write_begin = orangefs_write_begin,
	.write_end = orangefs_write_end,
	.invalidate_folio = orangefs_invalidate_folio,
	.release_folio = orangefs_release_folio,
	.free_folio = orangefs_free_folio,
	.migrate_folio = filemap_migrate_folio,
	.launder_folio = orangefs_launder_folio,
	.direct_IO = orangefs_direct_IO,
};

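/*
 * Make a folio writable on a write fault.  As in orangefs_write_begin,
 * the folio's pending write range is reused when the faulting task has
 * the same credentials; otherwise the folio is laundered first.
 */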
vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
	unsigned long *bitlock = &orangefs_inode->bitlock;
	vm_fault_t ret;
	struct orangefs_write_range *wr;

	sb_start_pagefault(inode->i_sb);

	if (wait_on_bit(bitlock, 1, TASK_KILLABLE)) {
		ret = VM_FAULT_RETRY;
		goto out;
	}

	folio_lock(folio);
	if (folio_test_dirty(folio) && !folio_test_private(folio)) {
		/*
		 * Should be impossible.  If it happens, launder the folio
		 * since we don't know what's dirty.  This will WARN in
		 * orangefs_writepage_locked.
		 */
		if (orangefs_launder_folio(folio)) {
			ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
			goto out;
		}
	}
	if (folio_test_private(folio)) {
		wr = folio_get_private(folio);
		if (uid_eq(wr->uid, current_fsuid()) &&
		    gid_eq(wr->gid, current_fsgid())) {
			wr->pos = page_offset(vmf->page);
			wr->len = PAGE_SIZE;
			goto okay;
		} else {
			if (orangefs_launder_folio(folio)) {
				ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
				goto out;
			}
		}
	}
	wr = kmalloc(sizeof *wr, GFP_KERNEL);
	if (!wr) {
		ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
		goto out;
	}
	wr->pos = page_offset(vmf->page);
	wr->len = PAGE_SIZE;
	wr->uid = current_fsuid();
	wr->gid = current_fsgid();
	folio_attach_private(folio, wr);
okay:

	file_update_time(vmf->vma->vm_file);
	if (folio->mapping != inode->i_mapping) {
		folio_unlock(folio);
		ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE;
		goto out;
	}

	/*
	 * We mark the folio dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty folio and writeprotect it again.
	 */
	folio_mark_dirty(folio);
	folio_wait_stable(folio);
	ret = VM_FAULT_LOCKED;
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
}

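/*
 * Truncate the file: adjust the page cache and i_size locally, then
 * send an ORANGEFS_VFS_OP_TRUNCATE request to the server.
 */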
static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
{
	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
	struct orangefs_kernel_op_s *new_op;
	loff_t orig_size;
	int ret = -EINVAL;

	gossip_debug(GOSSIP_INODE_DEBUG,
		"%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
		__func__,
		get_khandle_from_ino(inode),
		&orangefs_inode->refn.khandle,
		orangefs_inode->refn.fs_id,
		iattr->ia_size);

	/* Ensure that we have an up-to-date size, so we know if it changed. */
	ret = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_SIZE);
	if (ret == -ESTALE)
		ret = -EIO;
	if (ret) {
		gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
		    __func__, ret);
		return ret;
	}
	orig_size = i_size_read(inode);

	/* This is truncate_setsize in a different order. */
	truncate_pagecache(inode, iattr->ia_size);
	i_size_write(inode, iattr->ia_size);
	if (iattr->ia_size > orig_size)
		pagecache_isize_extended(inode, orig_size, iattr->ia_size);

	new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE);
	if (!new_op)
		return -ENOMEM;

	new_op->upcall.req.truncate.refn = orangefs_inode->refn;
	new_op->upcall.req.truncate.size = (__s64) iattr->ia_size;

	ret = service_operation(new_op,
		__func__,
		get_interruptible_flag(inode));

	/*
	 * the truncate has no downcall members to retrieve, but
	 * the status value tells us if it went through ok or not
	 */
	gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret);

	op_release(new_op);

	if (ret != 0)
		return ret;

	if (orig_size != i_size_read(inode))
		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;

	return ret;
}

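/*
 * Apply attribute changes to the in-core inode and record which
 * attributes are pending, so writeback can push them to the server.
 */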
736 | |
737 | int __orangefs_setattr(struct inode *inode, struct iattr *iattr) |
738 | { |
739 | int ret; |
740 | |
741 | if (iattr->ia_valid & ATTR_MODE) { |
742 | if (iattr->ia_mode & (S_ISVTX)) { |
743 | if (is_root_handle(inode)) { |
744 | /* |
745 | * allow sticky bit to be set on root (since |
746 | * it shows up that way by default anyhow), |
747 | * but don't show it to the server |
748 | */ |
749 | iattr->ia_mode -= S_ISVTX; |
750 | } else { |
751 | gossip_debug(GOSSIP_UTILS_DEBUG, |
752 | "User attempted to set sticky bit on non-root directory; returning EINVAL.\n"); |
753 | ret = -EINVAL; |
754 | goto out; |
755 | } |
756 | } |
757 | if (iattr->ia_mode & (S_ISUID)) { |
758 | gossip_debug(GOSSIP_UTILS_DEBUG, |
759 | "Attempting to set setuid bit (not supported); returning EINVAL.\n"); |
760 | ret = -EINVAL; |
761 | goto out; |
762 | } |
763 | } |
764 | |
765 | if (iattr->ia_valid & ATTR_SIZE) { |
766 | ret = orangefs_setattr_size(inode, iattr); |
767 | if (ret) |
768 | goto out; |
769 | } |
770 | |
771 | again: |
772 | spin_lock(lock: &inode->i_lock); |
773 | if (ORANGEFS_I(inode)->attr_valid) { |
774 | if (uid_eq(left: ORANGEFS_I(inode)->attr_uid, current_fsuid()) && |
775 | gid_eq(left: ORANGEFS_I(inode)->attr_gid, current_fsgid())) { |
776 | ORANGEFS_I(inode)->attr_valid = iattr->ia_valid; |
777 | } else { |
778 | spin_unlock(lock: &inode->i_lock); |
779 | write_inode_now(inode, sync: 1); |
780 | goto again; |
781 | } |
782 | } else { |
783 | ORANGEFS_I(inode)->attr_valid = iattr->ia_valid; |
784 | ORANGEFS_I(inode)->attr_uid = current_fsuid(); |
785 | ORANGEFS_I(inode)->attr_gid = current_fsgid(); |
786 | } |
787 | setattr_copy(&nop_mnt_idmap, inode, attr: iattr); |
788 | spin_unlock(lock: &inode->i_lock); |
789 | mark_inode_dirty(inode); |
790 | |
791 | ret = 0; |
792 | out: |
793 | return ret; |
794 | } |
795 | |
int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr)
{
	int ret;
	struct inode *inode = d_inode(dentry);

	ret = __orangefs_setattr(inode, iattr);
	/* change mode on a file that has ACLs */
	if (!ret && (iattr->ia_valid & ATTR_MODE))
		ret = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode);
	return ret;
}

/*
 * Change attributes of an object referenced by dentry.
 */
int orangefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
	struct iattr *iattr)
{
	int ret;

	gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: called on %pd\n",
	    dentry);
	ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
	if (ret)
		goto out;
	ret = __orangefs_setattr_mode(dentry, iattr);
	sync_inode_metadata(d_inode(dentry), 1);
out:
	gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n",
	    ret);
	return ret;
}

/*
 * Obtain attributes of an object given a dentry
 */
int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path,
	struct kstat *stat, u32 request_mask, unsigned int flags)
{
	int ret;
	struct inode *inode = path->dentry->d_inode;

	gossip_debug(GOSSIP_INODE_DEBUG,
		"orangefs_getattr: called on %pd mask %u\n",
		path->dentry, request_mask);

	ret = orangefs_inode_getattr(inode,
	    request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0);
	if (ret == 0) {
		generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);

		/* only claim a valid size if the caller asked for it */
		if (!(request_mask & STATX_SIZE))
			stat->result_mask &= ~STATX_SIZE;

		generic_fill_statx_attr(inode, stat);
	}
	return ret;
}

int orangefs_permission(struct mnt_idmap *idmap,
		struct inode *inode, int mask)
{
	int ret;

	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;

	gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);

	/* Make sure the permission (and other common attrs) are up to date. */
	ret = orangefs_inode_getattr(inode, 0);
	if (ret < 0)
		return ret;

	return generic_permission(&nop_mnt_idmap, inode, mask);
}

int orangefs_update_time(struct inode *inode, int flags)
{
	struct iattr iattr;

	gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n",
	    get_khandle_from_ino(inode));
	flags = generic_update_time(inode, flags);
	memset(&iattr, 0, sizeof iattr);
	if (flags & S_ATIME)
		iattr.ia_valid |= ATTR_ATIME;
	if (flags & S_CTIME)
		iattr.ia_valid |= ATTR_CTIME;
	if (flags & S_MTIME)
		iattr.ia_valid |= ATTR_MTIME;
	return __orangefs_setattr(inode, &iattr);
}

static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
	u64 val = 0;
	int ret;

	gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__,
	    dentry);

	ret = orangefs_inode_getxattr(d_inode(dentry),
		"user.pvfs2.meta_hint",
		&val, sizeof(val));
	if (ret < 0 && ret != -ENODATA)
		return ret;

	gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val);

	fileattr_fill_flags(fa, val);
	return 0;
}

static int orangefs_fileattr_set(struct mnt_idmap *idmap,
		struct dentry *dentry, struct fileattr *fa)
{
	u64 val = 0;

	gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__,
	    dentry);
	/*
	 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode is
	 * turned on for a file.  The user is not allowed to turn on this bit,
	 * but the bit is present if the user first gets the flags and then
	 * updates the flags with some new settings.  So, we ignore it in the
	 * following edit.  bligon.
	 */
	if (fileattr_has_fsx(fa) ||
	    (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | ORANGEFS_MIRROR_FL))) {
		gossip_err("%s: only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n",
		    __func__);
		return -EOPNOTSUPP;
	}
	val = fa->flags;
	gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val);
	return orangefs_inode_setxattr(d_inode(dentry),
		"user.pvfs2.meta_hint",
		&val, sizeof(val), 0);
}

/* ORANGEFS2 implementation of VFS inode operations for files */
static const struct inode_operations orangefs_file_inode_operations = {
	.get_inode_acl = orangefs_get_acl,
	.set_acl = orangefs_set_acl,
	.setattr = orangefs_setattr,
	.getattr = orangefs_getattr,
	.listxattr = orangefs_listxattr,
	.permission = orangefs_permission,
	.update_time = orangefs_update_time,
	.fileattr_get = orangefs_fileattr_get,
	.fileattr_set = orangefs_fileattr_set,
};

static int orangefs_init_iops(struct inode *inode)
{
	inode->i_mapping->a_ops = &orangefs_address_operations;

	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
		inode->i_op = &orangefs_file_inode_operations;
		inode->i_fop = &orangefs_file_operations;
		break;
	case S_IFLNK:
		inode->i_op = &orangefs_symlink_inode_operations;
		break;
	case S_IFDIR:
		inode->i_op = &orangefs_dir_inode_operations;
		inode->i_fop = &orangefs_dir_operations;
		break;
	default:
		gossip_debug(GOSSIP_INODE_DEBUG,
			"%s: unsupported mode\n",
			__func__);
		return -EINVAL;
	}

	return 0;
}

/*
 * Given an ORANGEFS object identifier (fsid, handle), convert it into
 * an ino_t type that will be used as a hash-index from where the handle will
 * be searched for in the VFS hash table of inodes.
 */
static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref)
{
	if (!ref)
		return 0;
	return orangefs_khandle_to_ino(&(ref->khandle));
}

/*
 * Called to set up an inode from iget5_locked.
 */
static int orangefs_set_inode(struct inode *inode, void *data)
{
	struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;

	ORANGEFS_I(inode)->refn.fs_id = ref->fs_id;
	ORANGEFS_I(inode)->refn.khandle = ref->khandle;
	ORANGEFS_I(inode)->attr_valid = 0;
	hash_init(ORANGEFS_I(inode)->xattr_cache);
	ORANGEFS_I(inode)->mapping_time = jiffies - 1;
	ORANGEFS_I(inode)->bitlock = 0;
	return 0;
}

/*
 * Called to determine if handles match.
 */
static int orangefs_test_inode(struct inode *inode, void *data)
{
	struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
	struct orangefs_inode_s *orangefs_inode = NULL;

	orangefs_inode = ORANGEFS_I(inode);
	/* test handles and fs_ids... */
	return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle),
		&(ref->khandle)) &&
		orangefs_inode->refn.fs_id == ref->fs_id);
}

/*
 * Front-end to lookup the inode-cache maintained by the VFS using the ORANGEFS
 * file handle.
 *
 * @sb: the file system super block instance.
 * @ref: The ORANGEFS object for which we are trying to locate an inode.
 */
struct inode *orangefs_iget(struct super_block *sb,
	struct orangefs_object_kref *ref)
{
	struct inode *inode = NULL;
	unsigned long hash;
	int error;

	hash = orangefs_handle_hash(ref);
	inode = iget5_locked(sb,
		hash,
		orangefs_test_inode,
		orangefs_set_inode,
		ref);

	if (!inode)
		return ERR_PTR(-ENOMEM);

	if (!(inode->i_state & I_NEW))
		return inode;

	error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
	if (error) {
		iget_failed(inode);
		return ERR_PTR(error);
	}

	inode->i_ino = hash; /* needed for stat etc */
	orangefs_init_iops(inode);
	unlock_new_inode(inode);

	gossip_debug(GOSSIP_INODE_DEBUG,
		"iget handle %pU, fsid %d hash %ld i_ino %lu\n",
		&ref->khandle,
		ref->fs_id,
		hash,
		inode->i_ino);

	return inode;
}

/*
 * Allocate an inode for a newly created file and insert it into the inode hash.
 */
struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
	umode_t mode, dev_t dev, struct orangefs_object_kref *ref)
{
	struct posix_acl *acl = NULL, *default_acl = NULL;
	unsigned long hash = orangefs_handle_hash(ref);
	struct inode *inode;
	int error;

	gossip_debug(GOSSIP_INODE_DEBUG,
		"%s:(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n",
		__func__,
		sb,
		MAJOR(dev),
		MINOR(dev),
		mode);

	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	error = posix_acl_create(dir, &mode, &default_acl, &acl);
	if (error)
		goto out_iput;

	orangefs_set_inode(inode, ref);
	inode->i_ino = hash; /* needed for stat etc */

	error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
	if (error)
		goto out_iput;

	orangefs_init_iops(inode);
	inode->i_rdev = dev;

	if (default_acl) {
		error = __orangefs_set_acl(inode, default_acl,
		    ACL_TYPE_DEFAULT);
		if (error)
			goto out_iput;
	}

	if (acl) {
		error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
		if (error)
			goto out_iput;
	}

	error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
	if (error < 0)
		goto out_iput;

	gossip_debug(GOSSIP_INODE_DEBUG,
		"Initializing ACLs for inode %pU\n",
		get_khandle_from_ino(inode));
	if (mode != inode->i_mode) {
		struct iattr iattr = {
			.ia_mode = mode,
			.ia_valid = ATTR_MODE,
		};
		inode->i_mode = mode;
		__orangefs_setattr(inode, &iattr);
		__posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
	}
	posix_acl_release(acl);
	posix_acl_release(default_acl);
	return inode;

out_iput:
	iput(inode);
	posix_acl_release(acl);
	posix_acl_release(default_acl);
	return ERR_PTR(error);
}