1 | // SPDX-License-Identifier: GPL-2.0 |
---|---|
2 | /* |
3 | * Copyright (c) 2000-2006 Silicon Graphics, Inc. |
4 | * All Rights Reserved. |
5 | */ |
6 | #include <linux/iversion.h> |
7 | |
8 | #include "xfs.h" |
9 | #include "xfs_fs.h" |
10 | #include "xfs_shared.h" |
11 | #include "xfs_format.h" |
12 | #include "xfs_log_format.h" |
13 | #include "xfs_trans_resv.h" |
14 | #include "xfs_mount.h" |
15 | #include "xfs_defer.h" |
16 | #include "xfs_inode.h" |
17 | #include "xfs_dir2.h" |
18 | #include "xfs_attr.h" |
19 | #include "xfs_trans_space.h" |
20 | #include "xfs_trans.h" |
21 | #include "xfs_buf_item.h" |
22 | #include "xfs_inode_item.h" |
23 | #include "xfs_iunlink_item.h" |
24 | #include "xfs_ialloc.h" |
25 | #include "xfs_bmap.h" |
26 | #include "xfs_bmap_util.h" |
27 | #include "xfs_errortag.h" |
28 | #include "xfs_error.h" |
29 | #include "xfs_quota.h" |
30 | #include "xfs_filestream.h" |
31 | #include "xfs_trace.h" |
32 | #include "xfs_icache.h" |
33 | #include "xfs_symlink.h" |
34 | #include "xfs_trans_priv.h" |
35 | #include "xfs_log.h" |
36 | #include "xfs_bmap_btree.h" |
37 | #include "xfs_reflink.h" |
38 | #include "xfs_ag.h" |
39 | #include "xfs_log_priv.h" |
40 | #include "xfs_health.h" |
41 | |
42 | struct kmem_cache *xfs_inode_cache; |
43 | |
44 | STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); |
45 | STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, |
46 | struct xfs_inode *); |
47 | |
48 | /* |
49 | * helper function to extract extent size hint from inode |
50 | */ |
51 | xfs_extlen_t |
52 | xfs_get_extsz_hint( |
53 | struct xfs_inode *ip) |
54 | { |
55 | /* |
56 | * No point in aligning allocations if we need to COW to actually |
57 | * write to them. |
58 | */ |
59 | if (xfs_is_always_cow_inode(ip)) |
60 | return 0; |
61 | if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) |
62 | return ip->i_extsize; |
63 | if (XFS_IS_REALTIME_INODE(ip)) |
64 | return ip->i_mount->m_sb.sb_rextsize; |
65 | return 0; |
66 | } |
67 | |
68 | /* |
69 | * Helper function to extract CoW extent size hint from inode. |
70 | * Between the extent size hint and the CoW extent size hint, we |
71 | * return the greater of the two. If the value is zero (automatic), |
72 | * use the default size. |
73 | */ |
74 | xfs_extlen_t |
75 | xfs_get_cowextsz_hint( |
76 | struct xfs_inode *ip) |
77 | { |
78 | xfs_extlen_t a, b; |
79 | |
80 | a = 0; |
81 | if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) |
82 | a = ip->i_cowextsize; |
83 | b = xfs_get_extsz_hint(ip); |
84 | |
85 | a = max(a, b); |
86 | if (a == 0) |
87 | return XFS_DEFAULT_COWEXTSZ_HINT; |
88 | return a; |
89 | } |
90 | |
91 | /* |
92 | * These two are wrapper routines around the xfs_ilock() routine used to |
93 | * centralize some grungy code. They are used in places that wish to lock the |
94 | * inode solely for reading the extents. The reason these places can't just |
95 | * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to |
96 | * bringing in of the extents from disk for a file in b-tree format. If the |
97 | * inode is in b-tree format, then we need to lock the inode exclusively until |
98 | * the extents are read in. Locking it exclusively all the time would limit |
99 | * our parallelism unnecessarily, though. What we do instead is check to see |
100 | * if the extents have been read in yet, and only lock the inode exclusively |
101 | * if they have not. |
102 | * |
103 | * The functions return a value which should be given to the corresponding |
104 | * xfs_iunlock() call. |
105 | */ |
106 | uint |
107 | xfs_ilock_data_map_shared( |
108 | struct xfs_inode *ip) |
109 | { |
110 | uint lock_mode = XFS_ILOCK_SHARED; |
111 | |
112 | if (xfs_need_iread_extents(&ip->i_df)) |
113 | lock_mode = XFS_ILOCK_EXCL; |
114 | xfs_ilock(ip, lock_mode); |
115 | return lock_mode; |
116 | } |
117 | |
118 | uint |
119 | xfs_ilock_attr_map_shared( |
120 | struct xfs_inode *ip) |
121 | { |
122 | uint lock_mode = XFS_ILOCK_SHARED; |
123 | |
124 | if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) |
125 | lock_mode = XFS_ILOCK_EXCL; |
126 | xfs_ilock(ip, lock_mode); |
127 | return lock_mode; |
128 | } |
129 | |
130 | /* |
131 | * You can't set both SHARED and EXCL for the same lock, |
132 | * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED, |
133 | * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values |
134 | * to set in lock_flags. |
135 | */ |
136 | static inline void |
137 | xfs_lock_flags_assert( |
138 | uint lock_flags) |
139 | { |
140 | ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != |
141 | (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); |
142 | ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != |
143 | (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); |
144 | ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != |
145 | (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); |
146 | ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); |
147 | ASSERT(lock_flags != 0); |
148 | } |
149 | |
150 | /* |
151 | * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 |
152 | * multi-reader locks: invalidate_lock and the i_lock. This routine allows |
153 | * various combinations of the locks to be obtained. |
154 | * |
155 | * The 3 locks should always be ordered so that the IO lock is obtained first, |
156 | * the mmap lock second and the ilock last in order to prevent deadlock. |
157 | * |
158 | * Basic locking order: |
159 | * |
160 | * i_rwsem -> invalidate_lock -> page_lock -> i_ilock |
161 | * |
162 | * mmap_lock locking order: |
163 | * |
164 | * i_rwsem -> page lock -> mmap_lock |
165 | * mmap_lock -> invalidate_lock -> page_lock |
166 | * |
167 | * The difference in mmap_lock locking order mean that we cannot hold the |
168 | * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths |
169 | * can fault in pages during copy in/out (for buffered IO) or require the |
170 | * mmap_lock in get_user_pages() to map the user pages into the kernel address |
171 | * space for direct IO. Similarly the i_rwsem cannot be taken inside a page |
172 | * fault because page faults already hold the mmap_lock. |
173 | * |
174 | * Hence to serialise fully against both syscall and mmap based IO, we need to |
175 | * take both the i_rwsem and the invalidate_lock. These locks should *only* be |
176 | * both taken in places where we need to invalidate the page cache in a race |
177 | * free manner (e.g. truncate, hole punch and other extent manipulation |
178 | * functions). |
179 | */ |
180 | void |
181 | xfs_ilock( |
182 | xfs_inode_t *ip, |
183 | uint lock_flags) |
184 | { |
185 | trace_xfs_ilock(ip, lock_flags, _RET_IP_); |
186 | |
187 | xfs_lock_flags_assert(lock_flags); |
188 | |
189 | if (lock_flags & XFS_IOLOCK_EXCL) { |
190 | down_write_nested(sem: &VFS_I(ip)->i_rwsem, |
191 | XFS_IOLOCK_DEP(lock_flags)); |
192 | } else if (lock_flags & XFS_IOLOCK_SHARED) { |
193 | down_read_nested(sem: &VFS_I(ip)->i_rwsem, |
194 | XFS_IOLOCK_DEP(lock_flags)); |
195 | } |
196 | |
197 | if (lock_flags & XFS_MMAPLOCK_EXCL) { |
198 | down_write_nested(sem: &VFS_I(ip)->i_mapping->invalidate_lock, |
199 | XFS_MMAPLOCK_DEP(lock_flags)); |
200 | } else if (lock_flags & XFS_MMAPLOCK_SHARED) { |
201 | down_read_nested(sem: &VFS_I(ip)->i_mapping->invalidate_lock, |
202 | XFS_MMAPLOCK_DEP(lock_flags)); |
203 | } |
204 | |
205 | if (lock_flags & XFS_ILOCK_EXCL) |
206 | down_write_nested(sem: &ip->i_lock, XFS_ILOCK_DEP(lock_flags)); |
207 | else if (lock_flags & XFS_ILOCK_SHARED) |
208 | down_read_nested(sem: &ip->i_lock, XFS_ILOCK_DEP(lock_flags)); |
209 | } |
210 | |
211 | /* |
212 | * This is just like xfs_ilock(), except that the caller |
213 | * is guaranteed not to sleep. It returns 1 if it gets |
214 | * the requested locks and 0 otherwise. If the IO lock is |
215 | * obtained but the inode lock cannot be, then the IO lock |
216 | * is dropped before returning. |
217 | * |
218 | * ip -- the inode being locked |
219 | * lock_flags -- this parameter indicates the inode's locks to be |
220 | * to be locked. See the comment for xfs_ilock() for a list |
221 | * of valid values. |
222 | */ |
223 | int |
224 | xfs_ilock_nowait( |
225 | xfs_inode_t *ip, |
226 | uint lock_flags) |
227 | { |
228 | trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); |
229 | |
230 | xfs_lock_flags_assert(lock_flags); |
231 | |
232 | if (lock_flags & XFS_IOLOCK_EXCL) { |
233 | if (!down_write_trylock(sem: &VFS_I(ip)->i_rwsem)) |
234 | goto out; |
235 | } else if (lock_flags & XFS_IOLOCK_SHARED) { |
236 | if (!down_read_trylock(sem: &VFS_I(ip)->i_rwsem)) |
237 | goto out; |
238 | } |
239 | |
240 | if (lock_flags & XFS_MMAPLOCK_EXCL) { |
241 | if (!down_write_trylock(sem: &VFS_I(ip)->i_mapping->invalidate_lock)) |
242 | goto out_undo_iolock; |
243 | } else if (lock_flags & XFS_MMAPLOCK_SHARED) { |
244 | if (!down_read_trylock(sem: &VFS_I(ip)->i_mapping->invalidate_lock)) |
245 | goto out_undo_iolock; |
246 | } |
247 | |
248 | if (lock_flags & XFS_ILOCK_EXCL) { |
249 | if (!down_write_trylock(sem: &ip->i_lock)) |
250 | goto out_undo_mmaplock; |
251 | } else if (lock_flags & XFS_ILOCK_SHARED) { |
252 | if (!down_read_trylock(sem: &ip->i_lock)) |
253 | goto out_undo_mmaplock; |
254 | } |
255 | return 1; |
256 | |
257 | out_undo_mmaplock: |
258 | if (lock_flags & XFS_MMAPLOCK_EXCL) |
259 | up_write(sem: &VFS_I(ip)->i_mapping->invalidate_lock); |
260 | else if (lock_flags & XFS_MMAPLOCK_SHARED) |
261 | up_read(sem: &VFS_I(ip)->i_mapping->invalidate_lock); |
262 | out_undo_iolock: |
263 | if (lock_flags & XFS_IOLOCK_EXCL) |
264 | up_write(sem: &VFS_I(ip)->i_rwsem); |
265 | else if (lock_flags & XFS_IOLOCK_SHARED) |
266 | up_read(sem: &VFS_I(ip)->i_rwsem); |
267 | out: |
268 | return 0; |
269 | } |
270 | |
271 | /* |
272 | * xfs_iunlock() is used to drop the inode locks acquired with |
273 | * xfs_ilock() and xfs_ilock_nowait(). The caller must pass |
274 | * in the flags given to xfs_ilock() or xfs_ilock_nowait() so |
275 | * that we know which locks to drop. |
276 | * |
277 | * ip -- the inode being unlocked |
278 | * lock_flags -- this parameter indicates the inode's locks to be |
279 | * to be unlocked. See the comment for xfs_ilock() for a list |
280 | * of valid values for this parameter. |
281 | * |
282 | */ |
283 | void |
284 | xfs_iunlock( |
285 | xfs_inode_t *ip, |
286 | uint lock_flags) |
287 | { |
288 | xfs_lock_flags_assert(lock_flags); |
289 | |
290 | if (lock_flags & XFS_IOLOCK_EXCL) |
291 | up_write(sem: &VFS_I(ip)->i_rwsem); |
292 | else if (lock_flags & XFS_IOLOCK_SHARED) |
293 | up_read(sem: &VFS_I(ip)->i_rwsem); |
294 | |
295 | if (lock_flags & XFS_MMAPLOCK_EXCL) |
296 | up_write(sem: &VFS_I(ip)->i_mapping->invalidate_lock); |
297 | else if (lock_flags & XFS_MMAPLOCK_SHARED) |
298 | up_read(sem: &VFS_I(ip)->i_mapping->invalidate_lock); |
299 | |
300 | if (lock_flags & XFS_ILOCK_EXCL) |
301 | up_write(sem: &ip->i_lock); |
302 | else if (lock_flags & XFS_ILOCK_SHARED) |
303 | up_read(sem: &ip->i_lock); |
304 | |
305 | trace_xfs_iunlock(ip, lock_flags, _RET_IP_); |
306 | } |
307 | |
308 | /* |
309 | * give up write locks. the i/o lock cannot be held nested |
310 | * if it is being demoted. |
311 | */ |
312 | void |
313 | xfs_ilock_demote( |
314 | xfs_inode_t *ip, |
315 | uint lock_flags) |
316 | { |
317 | ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)); |
318 | ASSERT((lock_flags & |
319 | ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); |
320 | |
321 | if (lock_flags & XFS_ILOCK_EXCL) |
322 | downgrade_write(sem: &ip->i_lock); |
323 | if (lock_flags & XFS_MMAPLOCK_EXCL) |
324 | downgrade_write(sem: &VFS_I(ip)->i_mapping->invalidate_lock); |
325 | if (lock_flags & XFS_IOLOCK_EXCL) |
326 | downgrade_write(sem: &VFS_I(ip)->i_rwsem); |
327 | |
328 | trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); |
329 | } |
330 | |
331 | void |
332 | xfs_assert_ilocked( |
333 | struct xfs_inode *ip, |
334 | uint lock_flags) |
335 | { |
336 | /* |
337 | * Sometimes we assert the ILOCK is held exclusively, but we're in |
338 | * a workqueue, so lockdep doesn't know we're the owner. |
339 | */ |
340 | if (lock_flags & XFS_ILOCK_SHARED) |
341 | rwsem_assert_held(sem: &ip->i_lock); |
342 | else if (lock_flags & XFS_ILOCK_EXCL) |
343 | rwsem_assert_held_write_nolockdep(sem: &ip->i_lock); |
344 | |
345 | if (lock_flags & XFS_MMAPLOCK_SHARED) |
346 | rwsem_assert_held(sem: &VFS_I(ip)->i_mapping->invalidate_lock); |
347 | else if (lock_flags & XFS_MMAPLOCK_EXCL) |
348 | rwsem_assert_held_write(sem: &VFS_I(ip)->i_mapping->invalidate_lock); |
349 | |
350 | if (lock_flags & XFS_IOLOCK_SHARED) |
351 | rwsem_assert_held(sem: &VFS_I(ip)->i_rwsem); |
352 | else if (lock_flags & XFS_IOLOCK_EXCL) |
353 | rwsem_assert_held_write(sem: &VFS_I(ip)->i_rwsem); |
354 | } |
355 | |
/*
 * xfs_lockdep_subclass_ok() is only ever used inside an ASSERT, so it is only
 * called when DEBUG or XFS_WARN is set, and MAX_LOCKDEP_SUBCLASSES is only
 * defined when CONFIG_LOCKDEP is set.  Unless both conditions hold, provide a
 * stub that accepts everything; this avoids build errors and warnings.
 */
#if !defined(CONFIG_LOCKDEP) || !(defined(DEBUG) || defined(XFS_WARN))
#define xfs_lockdep_subclass_ok(subclass)	(true)
#else
/* Return true if @subclass is below the lockdep subclass limit. */
static bool
xfs_lockdep_subclass_ok(
	int		subclass)
{
	if (subclass >= MAX_LOCKDEP_SUBCLASSES)
		return false;
	return true;
}
#endif
372 | |
373 | /* |
374 | * Bump the subclass so xfs_lock_inodes() acquires each lock with a different |
375 | * value. This can be called for any type of inode lock combination, including |
376 | * parent locking. Care must be taken to ensure we don't overrun the subclass |
377 | * storage fields in the class mask we build. |
378 | */ |
379 | static inline uint |
380 | xfs_lock_inumorder( |
381 | uint lock_mode, |
382 | uint subclass) |
383 | { |
384 | uint class = 0; |
385 | |
386 | ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | |
387 | XFS_ILOCK_RTSUM))); |
388 | ASSERT(xfs_lockdep_subclass_ok(subclass)); |
389 | |
390 | if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { |
391 | ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); |
392 | class += subclass << XFS_IOLOCK_SHIFT; |
393 | } |
394 | |
395 | if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { |
396 | ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS); |
397 | class += subclass << XFS_MMAPLOCK_SHIFT; |
398 | } |
399 | |
400 | if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) { |
401 | ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS); |
402 | class += subclass << XFS_ILOCK_SHIFT; |
403 | } |
404 | |
405 | return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class; |
406 | } |
407 | |
408 | /* |
409 | * The following routine will lock n inodes in exclusive mode. We assume the |
410 | * caller calls us with the inodes in i_ino order. |
411 | * |
412 | * We need to detect deadlock where an inode that we lock is in the AIL and we |
413 | * start waiting for another inode that is locked by a thread in a long running |
414 | * transaction (such as truncate). This can result in deadlock since the long |
415 | * running trans might need to wait for the inode we just locked in order to |
416 | * push the tail and free space in the log. |
417 | * |
418 | * xfs_lock_inodes() can only be used to lock one type of lock at a time - |
419 | * the iolock, the mmaplock or the ilock, but not more than one at a time. If we |
420 | * lock more than one at a time, lockdep will report false positives saying we |
421 | * have violated locking orders. |
422 | */ |
423 | static void |
424 | xfs_lock_inodes( |
425 | struct xfs_inode **ips, |
426 | int inodes, |
427 | uint lock_mode) |
428 | { |
429 | int attempts = 0; |
430 | uint i; |
431 | int j; |
432 | bool try_lock; |
433 | struct xfs_log_item *lp; |
434 | |
435 | /* |
436 | * Currently supports between 2 and 5 inodes with exclusive locking. We |
437 | * support an arbitrary depth of locking here, but absolute limits on |
438 | * inodes depend on the type of locking and the limits placed by |
439 | * lockdep annotations in xfs_lock_inumorder. These are all checked by |
440 | * the asserts. |
441 | */ |
442 | ASSERT(ips && inodes >= 2 && inodes <= 5); |
443 | ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | |
444 | XFS_ILOCK_EXCL)); |
445 | ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | |
446 | XFS_ILOCK_SHARED))); |
447 | ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || |
448 | inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); |
449 | ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || |
450 | inodes <= XFS_ILOCK_MAX_SUBCLASS + 1); |
451 | |
452 | if (lock_mode & XFS_IOLOCK_EXCL) { |
453 | ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL))); |
454 | } else if (lock_mode & XFS_MMAPLOCK_EXCL) |
455 | ASSERT(!(lock_mode & XFS_ILOCK_EXCL)); |
456 | |
457 | again: |
458 | try_lock = false; |
459 | i = 0; |
460 | for (; i < inodes; i++) { |
461 | ASSERT(ips[i]); |
462 | |
463 | if (i && (ips[i] == ips[i - 1])) /* Already locked */ |
464 | continue; |
465 | |
466 | /* |
467 | * If try_lock is not set yet, make sure all locked inodes are |
468 | * not in the AIL. If any are, set try_lock to be used later. |
469 | */ |
470 | if (!try_lock) { |
471 | for (j = (i - 1); j >= 0 && !try_lock; j--) { |
472 | lp = &ips[j]->i_itemp->ili_item; |
473 | if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) |
474 | try_lock = true; |
475 | } |
476 | } |
477 | |
478 | /* |
479 | * If any of the previous locks we have locked is in the AIL, |
480 | * we must TRY to get the second and subsequent locks. If |
481 | * we can't get any, we must release all we have |
482 | * and try again. |
483 | */ |
484 | if (!try_lock) { |
485 | xfs_ilock(ip: ips[i], lock_flags: xfs_lock_inumorder(lock_mode, subclass: i)); |
486 | continue; |
487 | } |
488 | |
489 | /* try_lock means we have an inode locked that is in the AIL. */ |
490 | ASSERT(i != 0); |
491 | if (xfs_ilock_nowait(ip: ips[i], lock_flags: xfs_lock_inumorder(lock_mode, subclass: i))) |
492 | continue; |
493 | |
494 | /* |
495 | * Unlock all previous guys and try again. xfs_iunlock will try |
496 | * to push the tail if the inode is in the AIL. |
497 | */ |
498 | attempts++; |
499 | for (j = i - 1; j >= 0; j--) { |
500 | /* |
501 | * Check to see if we've already unlocked this one. Not |
502 | * the first one going back, and the inode ptr is the |
503 | * same. |
504 | */ |
505 | if (j != (i - 1) && ips[j] == ips[j + 1]) |
506 | continue; |
507 | |
508 | xfs_iunlock(ip: ips[j], lock_flags: lock_mode); |
509 | } |
510 | |
511 | if ((attempts % 5) == 0) { |
512 | delay(ticks: 1); /* Don't just spin the CPU */ |
513 | } |
514 | goto again; |
515 | } |
516 | } |
517 | |
518 | /* |
519 | * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and |
520 | * mmaplock must be double-locked separately since we use i_rwsem and |
521 | * invalidate_lock for that. We now support taking one lock EXCL and the |
522 | * other SHARED. |
523 | */ |
524 | void |
525 | xfs_lock_two_inodes( |
526 | struct xfs_inode *ip0, |
527 | uint ip0_mode, |
528 | struct xfs_inode *ip1, |
529 | uint ip1_mode) |
530 | { |
531 | int attempts = 0; |
532 | struct xfs_log_item *lp; |
533 | |
534 | ASSERT(hweight32(ip0_mode) == 1); |
535 | ASSERT(hweight32(ip1_mode) == 1); |
536 | ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); |
537 | ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); |
538 | ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); |
539 | ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); |
540 | ASSERT(ip0->i_ino != ip1->i_ino); |
541 | |
542 | if (ip0->i_ino > ip1->i_ino) { |
543 | swap(ip0, ip1); |
544 | swap(ip0_mode, ip1_mode); |
545 | } |
546 | |
547 | again: |
548 | xfs_ilock(ip: ip0, lock_flags: xfs_lock_inumorder(lock_mode: ip0_mode, subclass: 0)); |
549 | |
550 | /* |
551 | * If the first lock we have locked is in the AIL, we must TRY to get |
552 | * the second lock. If we can't get it, we must release the first one |
553 | * and try again. |
554 | */ |
555 | lp = &ip0->i_itemp->ili_item; |
556 | if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) { |
557 | if (!xfs_ilock_nowait(ip: ip1, lock_flags: xfs_lock_inumorder(lock_mode: ip1_mode, subclass: 1))) { |
558 | xfs_iunlock(ip: ip0, lock_flags: ip0_mode); |
559 | if ((++attempts % 5) == 0) |
560 | delay(ticks: 1); /* Don't just spin the CPU */ |
561 | goto again; |
562 | } |
563 | } else { |
564 | xfs_ilock(ip: ip1, lock_flags: xfs_lock_inumorder(lock_mode: ip1_mode, subclass: 1)); |
565 | } |
566 | } |
567 | |
568 | uint |
569 | xfs_ip2xflags( |
570 | struct xfs_inode *ip) |
571 | { |
572 | uint flags = 0; |
573 | |
574 | if (ip->i_diflags & XFS_DIFLAG_ANY) { |
575 | if (ip->i_diflags & XFS_DIFLAG_REALTIME) |
576 | flags |= FS_XFLAG_REALTIME; |
577 | if (ip->i_diflags & XFS_DIFLAG_PREALLOC) |
578 | flags |= FS_XFLAG_PREALLOC; |
579 | if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE) |
580 | flags |= FS_XFLAG_IMMUTABLE; |
581 | if (ip->i_diflags & XFS_DIFLAG_APPEND) |
582 | flags |= FS_XFLAG_APPEND; |
583 | if (ip->i_diflags & XFS_DIFLAG_SYNC) |
584 | flags |= FS_XFLAG_SYNC; |
585 | if (ip->i_diflags & XFS_DIFLAG_NOATIME) |
586 | flags |= FS_XFLAG_NOATIME; |
587 | if (ip->i_diflags & XFS_DIFLAG_NODUMP) |
588 | flags |= FS_XFLAG_NODUMP; |
589 | if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) |
590 | flags |= FS_XFLAG_RTINHERIT; |
591 | if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT) |
592 | flags |= FS_XFLAG_PROJINHERIT; |
593 | if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS) |
594 | flags |= FS_XFLAG_NOSYMLINKS; |
595 | if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) |
596 | flags |= FS_XFLAG_EXTSIZE; |
597 | if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) |
598 | flags |= FS_XFLAG_EXTSZINHERIT; |
599 | if (ip->i_diflags & XFS_DIFLAG_NODEFRAG) |
600 | flags |= FS_XFLAG_NODEFRAG; |
601 | if (ip->i_diflags & XFS_DIFLAG_FILESTREAM) |
602 | flags |= FS_XFLAG_FILESTREAM; |
603 | } |
604 | |
605 | if (ip->i_diflags2 & XFS_DIFLAG2_ANY) { |
606 | if (ip->i_diflags2 & XFS_DIFLAG2_DAX) |
607 | flags |= FS_XFLAG_DAX; |
608 | if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) |
609 | flags |= FS_XFLAG_COWEXTSIZE; |
610 | } |
611 | |
612 | if (xfs_inode_has_attr_fork(ip)) |
613 | flags |= FS_XFLAG_HASATTR; |
614 | return flags; |
615 | } |
616 | |
617 | /* |
618 | * Lookups up an inode from "name". If ci_name is not NULL, then a CI match |
619 | * is allowed, otherwise it has to be an exact match. If a CI match is found, |
620 | * ci_name->name will point to a the actual name (caller must free) or |
621 | * will be set to NULL if an exact match is found. |
622 | */ |
623 | int |
624 | xfs_lookup( |
625 | struct xfs_inode *dp, |
626 | const struct xfs_name *name, |
627 | struct xfs_inode **ipp, |
628 | struct xfs_name *ci_name) |
629 | { |
630 | xfs_ino_t inum; |
631 | int error; |
632 | |
633 | trace_xfs_lookup(dp, xfs_lookup: name); |
634 | |
635 | if (xfs_is_shutdown(mp: dp->i_mount)) |
636 | return -EIO; |
637 | if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) |
638 | return -EIO; |
639 | |
640 | error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); |
641 | if (error) |
642 | goto out_unlock; |
643 | |
644 | error = xfs_iget(mp: dp->i_mount, NULL, ino: inum, flags: 0, lock_flags: 0, ipp); |
645 | if (error) |
646 | goto out_free_name; |
647 | |
648 | return 0; |
649 | |
650 | out_free_name: |
651 | if (ci_name) |
652 | kfree(objp: ci_name->name); |
653 | out_unlock: |
654 | *ipp = NULL; |
655 | return error; |
656 | } |
657 | |
658 | /* Propagate di_flags from a parent inode to a child inode. */ |
659 | static void |
660 | xfs_inode_inherit_flags( |
661 | struct xfs_inode *ip, |
662 | const struct xfs_inode *pip) |
663 | { |
664 | unsigned int di_flags = 0; |
665 | xfs_failaddr_t failaddr; |
666 | umode_t mode = VFS_I(ip)->i_mode; |
667 | |
668 | if (S_ISDIR(mode)) { |
669 | if (pip->i_diflags & XFS_DIFLAG_RTINHERIT) |
670 | di_flags |= XFS_DIFLAG_RTINHERIT; |
671 | if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { |
672 | di_flags |= XFS_DIFLAG_EXTSZINHERIT; |
673 | ip->i_extsize = pip->i_extsize; |
674 | } |
675 | if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT) |
676 | di_flags |= XFS_DIFLAG_PROJINHERIT; |
677 | } else if (S_ISREG(mode)) { |
678 | if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && |
679 | xfs_has_realtime(ip->i_mount)) |
680 | di_flags |= XFS_DIFLAG_REALTIME; |
681 | if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { |
682 | di_flags |= XFS_DIFLAG_EXTSIZE; |
683 | ip->i_extsize = pip->i_extsize; |
684 | } |
685 | } |
686 | if ((pip->i_diflags & XFS_DIFLAG_NOATIME) && |
687 | xfs_inherit_noatime) |
688 | di_flags |= XFS_DIFLAG_NOATIME; |
689 | if ((pip->i_diflags & XFS_DIFLAG_NODUMP) && |
690 | xfs_inherit_nodump) |
691 | di_flags |= XFS_DIFLAG_NODUMP; |
692 | if ((pip->i_diflags & XFS_DIFLAG_SYNC) && |
693 | xfs_inherit_sync) |
694 | di_flags |= XFS_DIFLAG_SYNC; |
695 | if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) && |
696 | xfs_inherit_nosymlinks) |
697 | di_flags |= XFS_DIFLAG_NOSYMLINKS; |
698 | if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) && |
699 | xfs_inherit_nodefrag) |
700 | di_flags |= XFS_DIFLAG_NODEFRAG; |
701 | if (pip->i_diflags & XFS_DIFLAG_FILESTREAM) |
702 | di_flags |= XFS_DIFLAG_FILESTREAM; |
703 | |
704 | ip->i_diflags |= di_flags; |
705 | |
706 | /* |
707 | * Inode verifiers on older kernels only check that the extent size |
708 | * hint is an integer multiple of the rt extent size on realtime files. |
709 | * They did not check the hint alignment on a directory with both |
710 | * rtinherit and extszinherit flags set. If the misaligned hint is |
711 | * propagated from a directory into a new realtime file, new file |
712 | * allocations will fail due to math errors in the rt allocator and/or |
713 | * trip the verifiers. Validate the hint settings in the new file so |
714 | * that we don't let broken hints propagate. |
715 | */ |
716 | failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize, |
717 | VFS_I(ip)->i_mode, ip->i_diflags); |
718 | if (failaddr) { |
719 | ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | |
720 | XFS_DIFLAG_EXTSZINHERIT); |
721 | ip->i_extsize = 0; |
722 | } |
723 | } |
724 | |
725 | /* Propagate di_flags2 from a parent inode to a child inode. */ |
726 | static void |
727 | xfs_inode_inherit_flags2( |
728 | struct xfs_inode *ip, |
729 | const struct xfs_inode *pip) |
730 | { |
731 | xfs_failaddr_t failaddr; |
732 | |
733 | if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { |
734 | ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; |
735 | ip->i_cowextsize = pip->i_cowextsize; |
736 | } |
737 | if (pip->i_diflags2 & XFS_DIFLAG2_DAX) |
738 | ip->i_diflags2 |= XFS_DIFLAG2_DAX; |
739 | |
740 | /* Don't let invalid cowextsize hints propagate. */ |
741 | failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, |
742 | VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2); |
743 | if (failaddr) { |
744 | ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; |
745 | ip->i_cowextsize = 0; |
746 | } |
747 | } |
748 | |
749 | /* |
750 | * Initialise a newly allocated inode and return the in-core inode to the |
751 | * caller locked exclusively. |
752 | */ |
753 | int |
754 | xfs_init_new_inode( |
755 | struct mnt_idmap *idmap, |
756 | struct xfs_trans *tp, |
757 | struct xfs_inode *pip, |
758 | xfs_ino_t ino, |
759 | umode_t mode, |
760 | xfs_nlink_t nlink, |
761 | dev_t rdev, |
762 | prid_t prid, |
763 | bool init_xattrs, |
764 | struct xfs_inode **ipp) |
765 | { |
766 | struct inode *dir = pip ? VFS_I(ip: pip) : NULL; |
767 | struct xfs_mount *mp = tp->t_mountp; |
768 | struct xfs_inode *ip; |
769 | unsigned int flags; |
770 | int error; |
771 | struct timespec64 tv; |
772 | struct inode *inode; |
773 | |
774 | /* |
775 | * Protect against obviously corrupt allocation btree records. Later |
776 | * xfs_iget checks will catch re-allocation of other active in-memory |
777 | * and on-disk inodes. If we don't catch reallocating the parent inode |
778 | * here we will deadlock in xfs_iget() so we have to do these checks |
779 | * first. |
780 | */ |
781 | if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) { |
782 | xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); |
783 | xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino), |
784 | XFS_SICK_AG_INOBT); |
785 | return -EFSCORRUPTED; |
786 | } |
787 | |
788 | /* |
789 | * Get the in-core inode with the lock held exclusively to prevent |
790 | * others from looking at until we're done. |
791 | */ |
792 | error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, ipp: &ip); |
793 | if (error) |
794 | return error; |
795 | |
796 | ASSERT(ip != NULL); |
797 | inode = VFS_I(ip); |
798 | set_nlink(inode, nlink); |
799 | inode->i_rdev = rdev; |
800 | ip->i_projid = prid; |
801 | |
802 | if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { |
803 | inode_fsuid_set(inode, idmap); |
804 | inode->i_gid = dir->i_gid; |
805 | inode->i_mode = mode; |
806 | } else { |
807 | inode_init_owner(idmap, inode, dir, mode); |
808 | } |
809 | |
810 | /* |
811 | * If the group ID of the new file does not match the effective group |
812 | * ID or one of the supplementary group IDs, the S_ISGID bit is cleared |
813 | * (and only if the irix_sgid_inherit compatibility variable is set). |
814 | */ |
815 | if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && |
816 | !vfsgid_in_group_p(vfsgid: i_gid_into_vfsgid(idmap, inode))) |
817 | inode->i_mode &= ~S_ISGID; |
818 | |
819 | ip->i_disk_size = 0; |
820 | ip->i_df.if_nextents = 0; |
821 | ASSERT(ip->i_nblocks == 0); |
822 | |
823 | tv = inode_set_ctime_current(inode); |
824 | inode_set_mtime_to_ts(inode, ts: tv); |
825 | inode_set_atime_to_ts(inode, ts: tv); |
826 | |
827 | ip->i_extsize = 0; |
828 | ip->i_diflags = 0; |
829 | |
830 | if (xfs_has_v3inodes(mp)) { |
831 | inode_set_iversion(inode, val: 1); |
832 | ip->i_cowextsize = 0; |
833 | ip->i_crtime = tv; |
834 | } |
835 | |
836 | flags = XFS_ILOG_CORE; |
837 | switch (mode & S_IFMT) { |
838 | case S_IFIFO: |
839 | case S_IFCHR: |
840 | case S_IFBLK: |
841 | case S_IFSOCK: |
842 | ip->i_df.if_format = XFS_DINODE_FMT_DEV; |
843 | flags |= XFS_ILOG_DEV; |
844 | break; |
845 | case S_IFREG: |
846 | case S_IFDIR: |
847 | if (pip && (pip->i_diflags & XFS_DIFLAG_ANY)) |
848 | xfs_inode_inherit_flags(ip, pip); |
849 | if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) |
850 | xfs_inode_inherit_flags2(ip, pip); |
851 | fallthrough; |
852 | case S_IFLNK: |
853 | ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; |
854 | ip->i_df.if_bytes = 0; |
855 | ip->i_df.if_data = NULL; |
856 | break; |
857 | default: |
858 | ASSERT(0); |
859 | } |
860 | |
861 | /* |
862 | * If we need to create attributes immediately after allocating the |
863 | * inode, initialise an empty attribute fork right now. We use the |
864 | * default fork offset for attributes here as we don't know exactly what |
865 | * size or how many attributes we might be adding. We can do this |
866 | * safely here because we know the data fork is completely empty and |
867 | * this saves us from needing to run a separate transaction to set the |
868 | * fork offset in the immediate future. |
869 | */ |
870 | if (init_xattrs && xfs_has_attr(mp)) { |
871 | ip->i_forkoff = xfs_default_attroffset(ip) >> 3; |
872 | xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); |
873 | } |
874 | |
875 | /* |
876 | * Log the new values stuffed into the inode. |
877 | */ |
878 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
879 | xfs_trans_log_inode(tp, ip, flags); |
880 | |
881 | /* now that we have an i_mode we can setup the inode structure */ |
882 | xfs_setup_inode(ip); |
883 | |
884 | *ipp = ip; |
885 | return 0; |
886 | } |
887 | |
888 | /* |
889 | * Decrement the link count on an inode & log the change. If this causes the |
890 | * link count to go to zero, move the inode to AGI unlinked list so that it can |
891 | * be freed when the last active reference goes away via xfs_inactive(). |
892 | */ |
893 | static int /* error */ |
894 | xfs_droplink( |
895 | xfs_trans_t *tp, |
896 | xfs_inode_t *ip) |
897 | { |
898 | if (VFS_I(ip)->i_nlink == 0) { |
899 | xfs_alert(ip->i_mount, |
900 | "%s: Attempt to drop inode (%llu) with nlink zero.", |
901 | __func__, ip->i_ino); |
902 | return -EFSCORRUPTED; |
903 | } |
904 | |
905 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); |
906 | |
907 | drop_nlink(inode: VFS_I(ip)); |
908 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
909 | |
910 | if (VFS_I(ip)->i_nlink) |
911 | return 0; |
912 | |
913 | return xfs_iunlink(tp, ip); |
914 | } |
915 | |
916 | /* |
917 | * Increment the link count on an inode & log the change. |
918 | */ |
919 | static void |
920 | xfs_bumplink( |
921 | xfs_trans_t *tp, |
922 | xfs_inode_t *ip) |
923 | { |
924 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); |
925 | |
926 | inc_nlink(inode: VFS_I(ip)); |
927 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
928 | } |
929 | |
930 | #ifdef CONFIG_XFS_LIVE_HOOKS |
931 | /* |
932 | * Use a static key here to reduce the overhead of directory live update hooks. |
933 | * If the compiler supports jump labels, the static branch will be replaced by |
934 | * a nop sled when there are no hook users. Online fsck is currently the only |
935 | * caller, so this is a reasonable tradeoff. |
936 | * |
937 | * Note: Patching the kernel code requires taking the cpu hotplug lock. Other |
938 | * parts of the kernel allocate memory with that lock held, which means that |
939 | * XFS callers cannot hold any locks that might be used by memory reclaim or |
940 | * writeback when calling the static_branch_{inc,dec} functions. |
941 | */ |
942 | DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); |
943 | |
944 | void |
945 | xfs_dir_hook_disable(void) |
946 | { |
947 | xfs_hooks_switch_off(&xfs_dir_hooks_switch); |
948 | } |
949 | |
950 | void |
951 | xfs_dir_hook_enable(void) |
952 | { |
953 | xfs_hooks_switch_on(&xfs_dir_hooks_switch); |
954 | } |
955 | |
956 | /* Call hooks for a directory update relating to a child dirent update. */ |
957 | inline void |
958 | xfs_dir_update_hook( |
959 | struct xfs_inode *dp, |
960 | struct xfs_inode *ip, |
961 | int delta, |
962 | const struct xfs_name *name) |
963 | { |
964 | if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { |
965 | struct xfs_dir_update_params p = { |
966 | .dp = dp, |
967 | .ip = ip, |
968 | .delta = delta, |
969 | .name = name, |
970 | }; |
971 | struct xfs_mount *mp = ip->i_mount; |
972 | |
973 | xfs_hooks_call(chain: &mp->m_dir_update_hooks, action: 0, priv: &p); |
974 | } |
975 | } |
976 | |
977 | /* Call the specified function during a directory update. */ |
978 | int |
979 | xfs_dir_hook_add( |
980 | struct xfs_mount *mp, |
981 | struct xfs_dir_hook *hook) |
982 | { |
983 | return xfs_hooks_add(chain: &mp->m_dir_update_hooks, hook: &hook->dirent_hook); |
984 | } |
985 | |
986 | /* Stop calling the specified function during a directory update. */ |
987 | void |
988 | xfs_dir_hook_del( |
989 | struct xfs_mount *mp, |
990 | struct xfs_dir_hook *hook) |
991 | { |
992 | xfs_hooks_del(chain: &mp->m_dir_update_hooks, hook: &hook->dirent_hook); |
993 | } |
994 | |
995 | /* Configure directory update hook functions. */ |
996 | void |
997 | xfs_dir_hook_setup( |
998 | struct xfs_dir_hook *hook, |
999 | notifier_fn_t mod_fn) |
1000 | { |
1001 | xfs_hook_setup(hook: &hook->dirent_hook, fn: mod_fn); |
1002 | } |
1003 | #endif /* CONFIG_XFS_LIVE_HOOKS */ |
1004 | |
1005 | int |
1006 | xfs_create( |
1007 | struct mnt_idmap *idmap, |
1008 | xfs_inode_t *dp, |
1009 | struct xfs_name *name, |
1010 | umode_t mode, |
1011 | dev_t rdev, |
1012 | bool init_xattrs, |
1013 | xfs_inode_t **ipp) |
1014 | { |
1015 | int is_dir = S_ISDIR(mode); |
1016 | struct xfs_mount *mp = dp->i_mount; |
1017 | struct xfs_inode *ip = NULL; |
1018 | struct xfs_trans *tp = NULL; |
1019 | int error; |
1020 | bool unlock_dp_on_error = false; |
1021 | prid_t prid; |
1022 | struct xfs_dquot *udqp = NULL; |
1023 | struct xfs_dquot *gdqp = NULL; |
1024 | struct xfs_dquot *pdqp = NULL; |
1025 | struct xfs_trans_res *tres; |
1026 | uint resblks; |
1027 | xfs_ino_t ino; |
1028 | |
1029 | trace_xfs_create(dp, xfs_create: name); |
1030 | |
1031 | if (xfs_is_shutdown(mp)) |
1032 | return -EIO; |
1033 | if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) |
1034 | return -EIO; |
1035 | |
1036 | prid = xfs_get_initial_prid(dp); |
1037 | |
1038 | /* |
1039 | * Make sure that we have allocated dquot(s) on disk. |
1040 | */ |
1041 | error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), |
1042 | mapped_fsgid(idmap, &init_user_ns), prid, |
1043 | XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, |
1044 | &udqp, &gdqp, &pdqp); |
1045 | if (error) |
1046 | return error; |
1047 | |
1048 | if (is_dir) { |
1049 | resblks = XFS_MKDIR_SPACE_RES(mp, name->len); |
1050 | tres = &M_RES(mp)->tr_mkdir; |
1051 | } else { |
1052 | resblks = XFS_CREATE_SPACE_RES(mp, name->len); |
1053 | tres = &M_RES(mp)->tr_create; |
1054 | } |
1055 | |
1056 | /* |
1057 | * Initially assume that the file does not exist and |
1058 | * reserve the resources for that case. If that is not |
1059 | * the case we'll drop the one we have and get a more |
1060 | * appropriate transaction later. |
1061 | */ |
1062 | error = xfs_trans_alloc_icreate(mp, resv: tres, udqp, gdqp, pdqp, dblocks: resblks, |
1063 | tpp: &tp); |
1064 | if (error == -ENOSPC) { |
1065 | /* flush outstanding delalloc blocks and retry */ |
1066 | xfs_flush_inodes(mp); |
1067 | error = xfs_trans_alloc_icreate(mp, resv: tres, udqp, gdqp, pdqp, |
1068 | dblocks: resblks, tpp: &tp); |
1069 | } |
1070 | if (error) |
1071 | goto out_release_dquots; |
1072 | |
1073 | xfs_ilock(ip: dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); |
1074 | unlock_dp_on_error = true; |
1075 | |
1076 | /* |
1077 | * A newly created regular or special file just has one directory |
1078 | * entry pointing to them, but a directory also the "." entry |
1079 | * pointing to itself. |
1080 | */ |
1081 | error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); |
1082 | if (!error) |
1083 | error = xfs_init_new_inode(idmap, tp, dp, ino, mode, |
1084 | is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip); |
1085 | if (error) |
1086 | goto out_trans_cancel; |
1087 | |
1088 | /* |
1089 | * Now we join the directory inode to the transaction. We do not do it |
1090 | * earlier because xfs_dialloc might commit the previous transaction |
1091 | * (and release all the locks). An error from here on will result in |
1092 | * the transaction cancel unlocking dp so don't do it explicitly in the |
1093 | * error path. |
1094 | */ |
1095 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); |
1096 | unlock_dp_on_error = false; |
1097 | |
1098 | error = xfs_dir_createname(tp, dp, name, ip->i_ino, |
1099 | resblks - XFS_IALLOC_SPACE_RES(mp)); |
1100 | if (error) { |
1101 | ASSERT(error != -ENOSPC); |
1102 | goto out_trans_cancel; |
1103 | } |
1104 | xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
1105 | xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); |
1106 | |
1107 | if (is_dir) { |
1108 | error = xfs_dir_init(tp, ip, dp); |
1109 | if (error) |
1110 | goto out_trans_cancel; |
1111 | |
1112 | xfs_bumplink(tp, ip: dp); |
1113 | } |
1114 | |
1115 | /* |
1116 | * Create ip with a reference from dp, and add '.' and '..' references |
1117 | * if it's a directory. |
1118 | */ |
1119 | xfs_dir_update_hook(dp, ip, delta: 1, name); |
1120 | |
1121 | /* |
1122 | * If this is a synchronous mount, make sure that the |
1123 | * create transaction goes to disk before returning to |
1124 | * the user. |
1125 | */ |
1126 | if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) |
1127 | xfs_trans_set_sync(tp); |
1128 | |
1129 | /* |
1130 | * Attach the dquot(s) to the inodes and modify them incore. |
1131 | * These ids of the inode couldn't have changed since the new |
1132 | * inode has been locked ever since it was created. |
1133 | */ |
1134 | xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); |
1135 | |
1136 | error = xfs_trans_commit(tp); |
1137 | if (error) |
1138 | goto out_release_inode; |
1139 | |
1140 | xfs_qm_dqrele(udqp); |
1141 | xfs_qm_dqrele(gdqp); |
1142 | xfs_qm_dqrele(pdqp); |
1143 | |
1144 | *ipp = ip; |
1145 | return 0; |
1146 | |
1147 | out_trans_cancel: |
1148 | xfs_trans_cancel(tp); |
1149 | out_release_inode: |
1150 | /* |
1151 | * Wait until after the current transaction is aborted to finish the |
1152 | * setup of the inode and release the inode. This prevents recursive |
1153 | * transactions and deadlocks from xfs_inactive. |
1154 | */ |
1155 | if (ip) { |
1156 | xfs_finish_inode_setup(ip); |
1157 | xfs_irele(ip); |
1158 | } |
1159 | out_release_dquots: |
1160 | xfs_qm_dqrele(udqp); |
1161 | xfs_qm_dqrele(gdqp); |
1162 | xfs_qm_dqrele(pdqp); |
1163 | |
1164 | if (unlock_dp_on_error) |
1165 | xfs_iunlock(ip: dp, XFS_ILOCK_EXCL); |
1166 | return error; |
1167 | } |
1168 | |
1169 | int |
1170 | xfs_create_tmpfile( |
1171 | struct mnt_idmap *idmap, |
1172 | struct xfs_inode *dp, |
1173 | umode_t mode, |
1174 | struct xfs_inode **ipp) |
1175 | { |
1176 | struct xfs_mount *mp = dp->i_mount; |
1177 | struct xfs_inode *ip = NULL; |
1178 | struct xfs_trans *tp = NULL; |
1179 | int error; |
1180 | prid_t prid; |
1181 | struct xfs_dquot *udqp = NULL; |
1182 | struct xfs_dquot *gdqp = NULL; |
1183 | struct xfs_dquot *pdqp = NULL; |
1184 | struct xfs_trans_res *tres; |
1185 | uint resblks; |
1186 | xfs_ino_t ino; |
1187 | |
1188 | if (xfs_is_shutdown(mp)) |
1189 | return -EIO; |
1190 | |
1191 | prid = xfs_get_initial_prid(dp); |
1192 | |
1193 | /* |
1194 | * Make sure that we have allocated dquot(s) on disk. |
1195 | */ |
1196 | error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), |
1197 | mapped_fsgid(idmap, &init_user_ns), prid, |
1198 | XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, |
1199 | &udqp, &gdqp, &pdqp); |
1200 | if (error) |
1201 | return error; |
1202 | |
1203 | resblks = XFS_IALLOC_SPACE_RES(mp); |
1204 | tres = &M_RES(mp)->tr_create_tmpfile; |
1205 | |
1206 | error = xfs_trans_alloc_icreate(mp, resv: tres, udqp, gdqp, pdqp, dblocks: resblks, |
1207 | tpp: &tp); |
1208 | if (error) |
1209 | goto out_release_dquots; |
1210 | |
1211 | error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); |
1212 | if (!error) |
1213 | error = xfs_init_new_inode(idmap, tp, dp, ino, mode, |
1214 | 0, 0, prid, false, &ip); |
1215 | if (error) |
1216 | goto out_trans_cancel; |
1217 | |
1218 | if (xfs_has_wsync(mp)) |
1219 | xfs_trans_set_sync(tp); |
1220 | |
1221 | /* |
1222 | * Attach the dquot(s) to the inodes and modify them incore. |
1223 | * These ids of the inode couldn't have changed since the new |
1224 | * inode has been locked ever since it was created. |
1225 | */ |
1226 | xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); |
1227 | |
1228 | error = xfs_iunlink(tp, ip); |
1229 | if (error) |
1230 | goto out_trans_cancel; |
1231 | |
1232 | error = xfs_trans_commit(tp); |
1233 | if (error) |
1234 | goto out_release_inode; |
1235 | |
1236 | xfs_qm_dqrele(udqp); |
1237 | xfs_qm_dqrele(gdqp); |
1238 | xfs_qm_dqrele(pdqp); |
1239 | |
1240 | *ipp = ip; |
1241 | return 0; |
1242 | |
1243 | out_trans_cancel: |
1244 | xfs_trans_cancel(tp); |
1245 | out_release_inode: |
1246 | /* |
1247 | * Wait until after the current transaction is aborted to finish the |
1248 | * setup of the inode and release the inode. This prevents recursive |
1249 | * transactions and deadlocks from xfs_inactive. |
1250 | */ |
1251 | if (ip) { |
1252 | xfs_finish_inode_setup(ip); |
1253 | xfs_irele(ip); |
1254 | } |
1255 | out_release_dquots: |
1256 | xfs_qm_dqrele(udqp); |
1257 | xfs_qm_dqrele(gdqp); |
1258 | xfs_qm_dqrele(pdqp); |
1259 | |
1260 | return error; |
1261 | } |
1262 | |
1263 | int |
1264 | xfs_link( |
1265 | xfs_inode_t *tdp, |
1266 | xfs_inode_t *sip, |
1267 | struct xfs_name *target_name) |
1268 | { |
1269 | xfs_mount_t *mp = tdp->i_mount; |
1270 | xfs_trans_t *tp; |
1271 | int error, nospace_error = 0; |
1272 | int resblks; |
1273 | |
1274 | trace_xfs_link(dp: tdp, xfs_link: target_name); |
1275 | |
1276 | ASSERT(!S_ISDIR(VFS_I(sip)->i_mode)); |
1277 | |
1278 | if (xfs_is_shutdown(mp)) |
1279 | return -EIO; |
1280 | if (xfs_ifork_zapped(tdp, XFS_DATA_FORK)) |
1281 | return -EIO; |
1282 | |
1283 | error = xfs_qm_dqattach(sip); |
1284 | if (error) |
1285 | goto std_return; |
1286 | |
1287 | error = xfs_qm_dqattach(tdp); |
1288 | if (error) |
1289 | goto std_return; |
1290 | |
1291 | resblks = XFS_LINK_SPACE_RES(mp, target_name->len); |
1292 | error = xfs_trans_alloc_dir(dp: tdp, resv: &M_RES(mp)->tr_link, ip: sip, dblocks: &resblks, |
1293 | tpp: &tp, nospace_error: &nospace_error); |
1294 | if (error) |
1295 | goto std_return; |
1296 | |
1297 | /* |
1298 | * If we are using project inheritance, we only allow hard link |
1299 | * creation in our tree when the project IDs are the same; else |
1300 | * the tree quota mechanism could be circumvented. |
1301 | */ |
1302 | if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && |
1303 | tdp->i_projid != sip->i_projid)) { |
1304 | /* |
1305 | * Project quota setup skips special files which can |
1306 | * leave inodes in a PROJINHERIT directory without a |
1307 | * project ID set. We need to allow links to be made |
1308 | * to these "project-less" inodes because userspace |
1309 | * expects them to succeed after project ID setup, |
1310 | * but everything else should be rejected. |
1311 | */ |
1312 | if (!special_file(VFS_I(sip)->i_mode) || |
1313 | sip->i_projid != 0) { |
1314 | error = -EXDEV; |
1315 | goto error_return; |
1316 | } |
1317 | } |
1318 | |
1319 | if (!resblks) { |
1320 | error = xfs_dir_canenter(tp, tdp, target_name); |
1321 | if (error) |
1322 | goto error_return; |
1323 | } |
1324 | |
1325 | /* |
1326 | * Handle initial link state of O_TMPFILE inode |
1327 | */ |
1328 | if (VFS_I(ip: sip)->i_nlink == 0) { |
1329 | struct xfs_perag *pag; |
1330 | |
1331 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino)); |
1332 | error = xfs_iunlink_remove(tp, pag, sip); |
1333 | xfs_perag_put(pag); |
1334 | if (error) |
1335 | goto error_return; |
1336 | } |
1337 | |
1338 | error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, |
1339 | resblks); |
1340 | if (error) |
1341 | goto error_return; |
1342 | xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
1343 | xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); |
1344 | |
1345 | xfs_bumplink(tp, ip: sip); |
1346 | xfs_dir_update_hook(dp: tdp, ip: sip, delta: 1, name: target_name); |
1347 | |
1348 | /* |
1349 | * If this is a synchronous mount, make sure that the |
1350 | * link transaction goes to disk before returning to |
1351 | * the user. |
1352 | */ |
1353 | if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) |
1354 | xfs_trans_set_sync(tp); |
1355 | |
1356 | return xfs_trans_commit(tp); |
1357 | |
1358 | error_return: |
1359 | xfs_trans_cancel(tp); |
1360 | std_return: |
1361 | if (error == -ENOSPC && nospace_error) |
1362 | error = nospace_error; |
1363 | return error; |
1364 | } |
1365 | |
1366 | /* Clear the reflink flag and the cowblocks tag if possible. */ |
1367 | static void |
1368 | xfs_itruncate_clear_reflink_flags( |
1369 | struct xfs_inode *ip) |
1370 | { |
1371 | struct xfs_ifork *dfork; |
1372 | struct xfs_ifork *cfork; |
1373 | |
1374 | if (!xfs_is_reflink_inode(ip)) |
1375 | return; |
1376 | dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK); |
1377 | cfork = xfs_ifork_ptr(ip, XFS_COW_FORK); |
1378 | if (dfork->if_bytes == 0 && cfork->if_bytes == 0) |
1379 | ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; |
1380 | if (cfork->if_bytes == 0) |
1381 | xfs_inode_clear_cowblocks_tag(ip); |
1382 | } |
1383 | |
1384 | /* |
1385 | * Free up the underlying blocks past new_size. The new size must be smaller |
1386 | * than the current size. This routine can be used both for the attribute and |
1387 | * data fork, and does not modify the inode size, which is left to the caller. |
1388 | * |
1389 | * The transaction passed to this routine must have made a permanent log |
1390 | * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the |
1391 | * given transaction and start new ones, so make sure everything involved in |
1392 | * the transaction is tidy before calling here. Some transaction will be |
1393 | * returned to the caller to be committed. The incoming transaction must |
1394 | * already include the inode, and both inode locks must be held exclusively. |
1395 | * The inode must also be "held" within the transaction. On return the inode |
1396 | * will be "held" within the returned transaction. This routine does NOT |
1397 | * require any disk space to be reserved for it within the transaction. |
1398 | * |
1399 | * If we get an error, we must return with the inode locked and linked into the |
1400 | * current transaction. This keeps things simple for the higher level code, |
1401 | * because it always knows that the inode is locked and held in the transaction |
1402 | * that returns to it whether errors occur or not. We don't mark the inode |
1403 | * dirty on error so that transactions can be easily aborted if possible. |
1404 | */ |
1405 | int |
1406 | xfs_itruncate_extents_flags( |
1407 | struct xfs_trans **tpp, |
1408 | struct xfs_inode *ip, |
1409 | int whichfork, |
1410 | xfs_fsize_t new_size, |
1411 | int flags) |
1412 | { |
1413 | struct xfs_mount *mp = ip->i_mount; |
1414 | struct xfs_trans *tp = *tpp; |
1415 | xfs_fileoff_t first_unmap_block; |
1416 | int error = 0; |
1417 | |
1418 | xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); |
1419 | if (atomic_read(v: &VFS_I(ip)->i_count)) |
1420 | xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); |
1421 | ASSERT(new_size <= XFS_ISIZE(ip)); |
1422 | ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); |
1423 | ASSERT(ip->i_itemp != NULL); |
1424 | ASSERT(ip->i_itemp->ili_lock_flags == 0); |
1425 | ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); |
1426 | |
1427 | trace_xfs_itruncate_extents_start(ip, new_size); |
1428 | |
1429 | flags |= xfs_bmapi_aflag(whichfork); |
1430 | |
1431 | /* |
1432 | * Since it is possible for space to become allocated beyond |
1433 | * the end of the file (in a crash where the space is allocated |
1434 | * but the inode size is not yet updated), simply remove any |
1435 | * blocks which show up between the new EOF and the maximum |
1436 | * possible file size. |
1437 | * |
1438 | * We have to free all the blocks to the bmbt maximum offset, even if |
1439 | * the page cache can't scale that far. |
1440 | */ |
1441 | first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); |
1442 | if (!xfs_verify_fileoff(mp, first_unmap_block)) { |
1443 | WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); |
1444 | return 0; |
1445 | } |
1446 | |
1447 | error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block, |
1448 | XFS_MAX_FILEOFF); |
1449 | if (error) |
1450 | goto out; |
1451 | |
1452 | if (whichfork == XFS_DATA_FORK) { |
1453 | /* Remove all pending CoW reservations. */ |
1454 | error = xfs_reflink_cancel_cow_blocks(ip, &tp, |
1455 | first_unmap_block, XFS_MAX_FILEOFF, true); |
1456 | if (error) |
1457 | goto out; |
1458 | |
1459 | xfs_itruncate_clear_reflink_flags(ip); |
1460 | } |
1461 | |
1462 | /* |
1463 | * Always re-log the inode so that our permanent transaction can keep |
1464 | * on rolling it forward in the log. |
1465 | */ |
1466 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
1467 | |
1468 | trace_xfs_itruncate_extents_end(ip, new_size); |
1469 | |
1470 | out: |
1471 | *tpp = tp; |
1472 | return error; |
1473 | } |
1474 | |
1475 | int |
1476 | xfs_release( |
1477 | xfs_inode_t *ip) |
1478 | { |
1479 | xfs_mount_t *mp = ip->i_mount; |
1480 | int error = 0; |
1481 | |
1482 | if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0)) |
1483 | return 0; |
1484 | |
1485 | /* If this is a read-only mount, don't do this (would generate I/O) */ |
1486 | if (xfs_is_readonly(mp)) |
1487 | return 0; |
1488 | |
1489 | if (!xfs_is_shutdown(mp)) { |
1490 | int truncated; |
1491 | |
1492 | /* |
1493 | * If we previously truncated this file and removed old data |
1494 | * in the process, we want to initiate "early" writeout on |
1495 | * the last close. This is an attempt to combat the notorious |
1496 | * NULL files problem which is particularly noticeable from a |
1497 | * truncate down, buffered (re-)write (delalloc), followed by |
1498 | * a crash. What we are effectively doing here is |
1499 | * significantly reducing the time window where we'd otherwise |
1500 | * be exposed to that problem. |
1501 | */ |
1502 | truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); |
1503 | if (truncated) { |
1504 | xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); |
1505 | if (ip->i_delayed_blks > 0) { |
1506 | error = filemap_flush(VFS_I(ip)->i_mapping); |
1507 | if (error) |
1508 | return error; |
1509 | } |
1510 | } |
1511 | } |
1512 | |
1513 | if (VFS_I(ip)->i_nlink == 0) |
1514 | return 0; |
1515 | |
1516 | /* |
1517 | * If we can't get the iolock just skip truncating the blocks past EOF |
1518 | * because we could deadlock with the mmap_lock otherwise. We'll get |
1519 | * another chance to drop them once the last reference to the inode is |
1520 | * dropped, so we'll never leak blocks permanently. |
1521 | */ |
1522 | if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) |
1523 | return 0; |
1524 | |
1525 | if (xfs_can_free_eofblocks(ip, force: false)) { |
1526 | /* |
1527 | * Check if the inode is being opened, written and closed |
1528 | * frequently and we have delayed allocation blocks outstanding |
1529 | * (e.g. streaming writes from the NFS server), truncating the |
1530 | * blocks past EOF will cause fragmentation to occur. |
1531 | * |
1532 | * In this case don't do the truncation, but we have to be |
1533 | * careful how we detect this case. Blocks beyond EOF show up as |
1534 | * i_delayed_blks even when the inode is clean, so we need to |
1535 | * truncate them away first before checking for a dirty release. |
1536 | * Hence on the first dirty close we will still remove the |
1537 | * speculative allocation, but after that we will leave it in |
1538 | * place. |
1539 | */ |
1540 | if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) |
1541 | goto out_unlock; |
1542 | |
1543 | error = xfs_free_eofblocks(ip); |
1544 | if (error) |
1545 | goto out_unlock; |
1546 | |
1547 | /* delalloc blocks after truncation means it really is dirty */ |
1548 | if (ip->i_delayed_blks) |
1549 | xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); |
1550 | } |
1551 | |
1552 | out_unlock: |
1553 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
1554 | return error; |
1555 | } |
1556 | |
1557 | /* |
1558 | * xfs_inactive_truncate |
1559 | * |
1560 | * Called to perform a truncate when an inode becomes unlinked. |
1561 | */ |
1562 | STATIC int |
1563 | xfs_inactive_truncate( |
1564 | struct xfs_inode *ip) |
1565 | { |
1566 | struct xfs_mount *mp = ip->i_mount; |
1567 | struct xfs_trans *tp; |
1568 | int error; |
1569 | |
1570 | error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_itruncate, blocks: 0, rtextents: 0, flags: 0, tpp: &tp); |
1571 | if (error) { |
1572 | ASSERT(xfs_is_shutdown(mp)); |
1573 | return error; |
1574 | } |
1575 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
1576 | xfs_trans_ijoin(tp, ip, 0); |
1577 | |
1578 | /* |
1579 | * Log the inode size first to prevent stale data exposure in the event |
1580 | * of a system crash before the truncate completes. See the related |
1581 | * comment in xfs_vn_setattr_size() for details. |
1582 | */ |
1583 | ip->i_disk_size = 0; |
1584 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
1585 | |
1586 | error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); |
1587 | if (error) |
1588 | goto error_trans_cancel; |
1589 | |
1590 | ASSERT(ip->i_df.if_nextents == 0); |
1591 | |
1592 | error = xfs_trans_commit(tp); |
1593 | if (error) |
1594 | goto error_unlock; |
1595 | |
1596 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
1597 | return 0; |
1598 | |
1599 | error_trans_cancel: |
1600 | xfs_trans_cancel(tp); |
1601 | error_unlock: |
1602 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
1603 | return error; |
1604 | } |
1605 | |
1606 | /* |
1607 | * xfs_inactive_ifree() |
1608 | * |
1609 | * Perform the inode free when an inode is unlinked. |
1610 | */ |
1611 | STATIC int |
1612 | xfs_inactive_ifree( |
1613 | struct xfs_inode *ip) |
1614 | { |
1615 | struct xfs_mount *mp = ip->i_mount; |
1616 | struct xfs_trans *tp; |
1617 | int error; |
1618 | |
1619 | /* |
1620 | * We try to use a per-AG reservation for any block needed by the finobt |
1621 | * tree, but as the finobt feature predates the per-AG reservation |
1622 | * support a degraded file system might not have enough space for the |
1623 | * reservation at mount time. In that case try to dip into the reserved |
1624 | * pool and pray. |
1625 | * |
1626 | * Send a warning if the reservation does happen to fail, as the inode |
1627 | * now remains allocated and sits on the unlinked list until the fs is |
1628 | * repaired. |
1629 | */ |
1630 | if (unlikely(mp->m_finobt_nores)) { |
1631 | error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, |
1632 | XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, |
1633 | &tp); |
1634 | } else { |
1635 | error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_ifree, blocks: 0, rtextents: 0, flags: 0, tpp: &tp); |
1636 | } |
1637 | if (error) { |
1638 | if (error == -ENOSPC) { |
1639 | xfs_warn_ratelimited(mp, |
1640 | "Failed to remove inode(s) from unlinked list. " |
1641 | "Please free space, unmount and run xfs_repair."); |
1642 | } else { |
1643 | ASSERT(xfs_is_shutdown(mp)); |
1644 | } |
1645 | return error; |
1646 | } |
1647 | |
1648 | /* |
1649 | * We do not hold the inode locked across the entire rolling transaction |
1650 | * here. We only need to hold it for the first transaction that |
1651 | * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the |
1652 | * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode |
1653 | * here breaks the relationship between cluster buffer invalidation and |
1654 | * stale inode invalidation on cluster buffer item journal commit |
1655 | * completion, and can result in leaving dirty stale inodes hanging |
1656 | * around in memory. |
1657 | * |
1658 | * We have no need for serialising this inode operation against other |
1659 | * operations - we freed the inode and hence reallocation is required |
1660 | * and that will serialise on reallocating the space the deferops need |
1661 | * to free. Hence we can unlock the inode on the first commit of |
1662 | * the transaction rather than roll it right through the deferops. This |
1663 | * avoids relogging the XFS_ISTALE inode. |
1664 | * |
1665 | * We check that xfs_ifree() hasn't grown an internal transaction roll |
1666 | * by asserting that the inode is still locked when it returns. |
1667 | */ |
1668 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
1669 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
1670 | |
1671 | error = xfs_ifree(tp, ip); |
1672 | xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); |
1673 | if (error) { |
1674 | /* |
1675 | * If we fail to free the inode, shut down. The cancel |
1676 | * might do that, we need to make sure. Otherwise the |
1677 | * inode might be lost for a long time or forever. |
1678 | */ |
1679 | if (!xfs_is_shutdown(mp)) { |
1680 | xfs_notice(mp, "%s: xfs_ifree returned error %d", |
1681 | __func__, error); |
1682 | xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); |
1683 | } |
1684 | xfs_trans_cancel(tp); |
1685 | return error; |
1686 | } |
1687 | |
1688 | /* |
1689 | * Credit the quota account(s). The inode is gone. |
1690 | */ |
1691 | xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); |
1692 | |
1693 | return xfs_trans_commit(tp); |
1694 | } |
1695 | |
1696 | /* |
1697 | * Returns true if we need to update the on-disk metadata before we can free |
1698 | * the memory used by this inode. Updates include freeing post-eof |
1699 | * preallocations; freeing COW staging extents; and marking the inode free in |
1700 | * the inobt if it is on the unlinked list. |
1701 | */ |
1702 | bool |
1703 | xfs_inode_needs_inactive( |
1704 | struct xfs_inode *ip) |
1705 | { |
1706 | struct xfs_mount *mp = ip->i_mount; |
1707 | struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); |
1708 | |
1709 | /* |
1710 | * If the inode is already free, then there can be nothing |
1711 | * to clean up here. |
1712 | */ |
1713 | if (VFS_I(ip)->i_mode == 0) |
1714 | return false; |
1715 | |
1716 | /* |
1717 | * If this is a read-only mount, don't do this (would generate I/O) |
1718 | * unless we're in log recovery and cleaning the iunlinked list. |
1719 | */ |
1720 | if (xfs_is_readonly(mp) && !xlog_recovery_needed(log: mp->m_log)) |
1721 | return false; |
1722 | |
1723 | /* If the log isn't running, push inodes straight to reclaim. */ |
1724 | if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp)) |
1725 | return false; |
1726 | |
1727 | /* Metadata inodes require explicit resource cleanup. */ |
1728 | if (xfs_is_metadata_inode(ip)) |
1729 | return false; |
1730 | |
1731 | /* Want to clean out the cow blocks if there are any. */ |
1732 | if (cow_ifp && cow_ifp->if_bytes > 0) |
1733 | return true; |
1734 | |
1735 | /* Unlinked files must be freed. */ |
1736 | if (VFS_I(ip)->i_nlink == 0) |
1737 | return true; |
1738 | |
1739 | /* |
1740 | * This file isn't being freed, so check if there are post-eof blocks |
1741 | * to free. @force is true because we are evicting an inode from the |
1742 | * cache. Post-eof blocks must be freed, lest we end up with broken |
1743 | * free space accounting. |
1744 | * |
1745 | * Note: don't bother with iolock here since lockdep complains about |
1746 | * acquiring it in reclaim context. We have the only reference to the |
1747 | * inode at this point anyways. |
1748 | */ |
1749 | return xfs_can_free_eofblocks(ip, force: true); |
1750 | } |
1751 | |
1752 | /* |
1753 | * Save health status somewhere, if we're dumping an inode with uncorrected |
1754 | * errors and online repair isn't running. |
1755 | */ |
1756 | static inline void |
1757 | xfs_inactive_health( |
1758 | struct xfs_inode *ip) |
1759 | { |
1760 | struct xfs_mount *mp = ip->i_mount; |
1761 | struct xfs_perag *pag; |
1762 | unsigned int sick; |
1763 | unsigned int checked; |
1764 | |
1765 | xfs_inode_measure_sickness(ip, &sick, &checked); |
1766 | if (!sick) |
1767 | return; |
1768 | |
1769 | trace_xfs_inode_unfixed_corruption(ip, flags: sick); |
1770 | |
1771 | if (sick & XFS_SICK_INO_FORGET) |
1772 | return; |
1773 | |
1774 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); |
1775 | if (!pag) { |
1776 | /* There had better still be a perag structure! */ |
1777 | ASSERT(0); |
1778 | return; |
1779 | } |
1780 | |
1781 | xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES); |
1782 | xfs_perag_put(pag); |
1783 | } |
1784 | |
1785 | /* |
1786 | * xfs_inactive |
1787 | * |
1788 | * This is called when the vnode reference count for the vnode |
1789 | * goes to zero. If the file has been unlinked, then it must |
1790 | * now be truncated. Also, we clear all of the read-ahead state |
1791 | * kept for the inode here since the file is now closed. |
1792 | */ |
1793 | int |
1794 | xfs_inactive( |
1795 | xfs_inode_t *ip) |
1796 | { |
1797 | struct xfs_mount *mp; |
1798 | int error = 0; |
1799 | int truncate = 0; |
1800 | |
1801 | /* |
1802 | * If the inode is already free, then there can be nothing |
1803 | * to clean up here. |
1804 | */ |
1805 | if (VFS_I(ip)->i_mode == 0) { |
1806 | ASSERT(ip->i_df.if_broot_bytes == 0); |
1807 | goto out; |
1808 | } |
1809 | |
1810 | mp = ip->i_mount; |
1811 | ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); |
1812 | |
1813 | xfs_inactive_health(ip); |
1814 | |
1815 | /* |
1816 | * If this is a read-only mount, don't do this (would generate I/O) |
1817 | * unless we're in log recovery and cleaning the iunlinked list. |
1818 | */ |
1819 | if (xfs_is_readonly(mp) && !xlog_recovery_needed(log: mp->m_log)) |
1820 | goto out; |
1821 | |
1822 | /* Metadata inodes require explicit resource cleanup. */ |
1823 | if (xfs_is_metadata_inode(ip)) |
1824 | goto out; |
1825 | |
1826 | /* Try to clean out the cow blocks if there are any. */ |
1827 | if (xfs_inode_has_cow_data(ip)) |
1828 | xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); |
1829 | |
1830 | if (VFS_I(ip)->i_nlink != 0) { |
1831 | /* |
1832 | * force is true because we are evicting an inode from the |
1833 | * cache. Post-eof blocks must be freed, lest we end up with |
1834 | * broken free space accounting. |
1835 | * |
1836 | * Note: don't bother with iolock here since lockdep complains |
1837 | * about acquiring it in reclaim context. We have the only |
1838 | * reference to the inode at this point anyways. |
1839 | */ |
1840 | if (xfs_can_free_eofblocks(ip, force: true)) |
1841 | error = xfs_free_eofblocks(ip); |
1842 | |
1843 | goto out; |
1844 | } |
1845 | |
1846 | if (S_ISREG(VFS_I(ip)->i_mode) && |
1847 | (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 || |
1848 | ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) |
1849 | truncate = 1; |
1850 | |
1851 | if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { |
1852 | /* |
1853 | * If this inode is being inactivated during a quotacheck and |
1854 | * has not yet been scanned by quotacheck, we /must/ remove |
1855 | * the dquots from the inode before inactivation changes the |
1856 | * block and inode counts. Most probably this is a result of |
1857 | * reloading the incore iunlinked list to purge unrecovered |
1858 | * unlinked inodes. |
1859 | */ |
1860 | xfs_qm_dqdetach(ip); |
1861 | } else { |
1862 | error = xfs_qm_dqattach(ip); |
1863 | if (error) |
1864 | goto out; |
1865 | } |
1866 | |
1867 | if (S_ISLNK(VFS_I(ip)->i_mode)) |
1868 | error = xfs_inactive_symlink(ip); |
1869 | else if (truncate) |
1870 | error = xfs_inactive_truncate(ip); |
1871 | if (error) |
1872 | goto out; |
1873 | |
1874 | /* |
1875 | * If there are attributes associated with the file then blow them away |
1876 | * now. The code calls a routine that recursively deconstructs the |
1877 | * attribute fork. If also blows away the in-core attribute fork. |
1878 | */ |
1879 | if (xfs_inode_has_attr_fork(ip)) { |
1880 | error = xfs_attr_inactive(ip); |
1881 | if (error) |
1882 | goto out; |
1883 | } |
1884 | |
1885 | ASSERT(ip->i_forkoff == 0); |
1886 | |
1887 | /* |
1888 | * Free the inode. |
1889 | */ |
1890 | error = xfs_inactive_ifree(ip); |
1891 | |
1892 | out: |
1893 | /* |
1894 | * We're done making metadata updates for this inode, so we can release |
1895 | * the attached dquots. |
1896 | */ |
1897 | xfs_qm_dqdetach(ip); |
1898 | return error; |
1899 | } |
1900 | |
1901 | /* |
1902 | * In-Core Unlinked List Lookups |
1903 | * ============================= |
1904 | * |
1905 | * Every inode is supposed to be reachable from some other piece of metadata |
1906 | * with the exception of the root directory. Inodes with a connection to a |
1907 | * file descriptor but not linked from anywhere in the on-disk directory tree |
1908 | * are collectively known as unlinked inodes, though the filesystem itself |
1909 | * maintains links to these inodes so that on-disk metadata are consistent. |
1910 | * |
1911 | * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI |
1912 | * header contains a number of buckets that point to an inode, and each inode |
1913 | * record has a pointer to the next inode in the hash chain. This |
1914 | * singly-linked list causes scaling problems in the iunlink remove function |
1915 | * because we must walk that list to find the inode that points to the inode |
1916 | * being removed from the unlinked hash bucket list. |
1917 | * |
1918 | * Hence we keep an in-memory double linked list to link each inode on an |
1919 | * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer |
1920 | * based lists would require having 64 list heads in the perag, one for each |
1921 | * list. This is expensive in terms of memory (think millions of AGs) and cache |
1922 | * misses on lookups. Instead, use the fact that inodes on the unlinked list |
1923 | * must be referenced at the VFS level to keep them on the list and hence we |
1924 | * have an existence guarantee for inodes on the unlinked list. |
1925 | * |
1926 | * Given we have an existence guarantee, we can use lockless inode cache lookups |
1927 | * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode |
1928 | * for the double linked unlinked list, and we don't need any extra locking to |
1929 | * keep the list safe as all manipulations are done under the AGI buffer lock. |
1930 | * Keeping the list up to date does not require memory allocation, just finding |
1931 | * the XFS inode and updating the next/prev unlinked list aginos. |
1932 | */ |
1933 | |
1934 | /* |
1935 | * Find an inode on the unlinked list. This does not take references to the |
1936 | * inode as we have existence guarantees by holding the AGI buffer lock and that |
1937 | * only unlinked, referenced inodes can be on the unlinked inode list. If we |
1938 | * don't find the inode in cache, then let the caller handle the situation. |
1939 | */ |
1940 | static struct xfs_inode * |
1941 | xfs_iunlink_lookup( |
1942 | struct xfs_perag *pag, |
1943 | xfs_agino_t agino) |
1944 | { |
1945 | struct xfs_inode *ip; |
1946 | |
1947 | rcu_read_lock(); |
1948 | ip = radix_tree_lookup(&pag->pag_ici_root, agino); |
1949 | if (!ip) { |
1950 | /* Caller can handle inode not being in memory. */ |
1951 | rcu_read_unlock(); |
1952 | return NULL; |
1953 | } |
1954 | |
1955 | /* |
1956 | * Inode in RCU freeing limbo should not happen. Warn about this and |
1957 | * let the caller handle the failure. |
1958 | */ |
1959 | if (WARN_ON_ONCE(!ip->i_ino)) { |
1960 | rcu_read_unlock(); |
1961 | return NULL; |
1962 | } |
1963 | ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)); |
1964 | rcu_read_unlock(); |
1965 | return ip; |
1966 | } |
1967 | |
1968 | /* |
1969 | * Update the prev pointer of the next agino. Returns -ENOLINK if the inode |
1970 | * is not in cache. |
1971 | */ |
1972 | static int |
1973 | xfs_iunlink_update_backref( |
1974 | struct xfs_perag *pag, |
1975 | xfs_agino_t prev_agino, |
1976 | xfs_agino_t next_agino) |
1977 | { |
1978 | struct xfs_inode *ip; |
1979 | |
1980 | /* No update necessary if we are at the end of the list. */ |
1981 | if (next_agino == NULLAGINO) |
1982 | return 0; |
1983 | |
1984 | ip = xfs_iunlink_lookup(pag, next_agino); |
1985 | if (!ip) |
1986 | return -ENOLINK; |
1987 | |
1988 | ip->i_prev_unlinked = prev_agino; |
1989 | return 0; |
1990 | } |
1991 | |
1992 | /* |
1993 | * Point the AGI unlinked bucket at an inode and log the results. The caller |
1994 | * is responsible for validating the old value. |
1995 | */ |
1996 | STATIC int |
1997 | xfs_iunlink_update_bucket( |
1998 | struct xfs_trans *tp, |
1999 | struct xfs_perag *pag, |
2000 | struct xfs_buf *agibp, |
2001 | unsigned int bucket_index, |
2002 | xfs_agino_t new_agino) |
2003 | { |
2004 | struct xfs_agi *agi = agibp->b_addr; |
2005 | xfs_agino_t old_value; |
2006 | int offset; |
2007 | |
2008 | ASSERT(xfs_verify_agino_or_null(pag, new_agino)); |
2009 | |
2010 | old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); |
2011 | trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, |
2012 | old_value, new_agino); |
2013 | |
2014 | /* |
2015 | * We should never find the head of the list already set to the value |
2016 | * passed in because either we're adding or removing ourselves from the |
2017 | * head of the list. |
2018 | */ |
2019 | if (old_value == new_agino) { |
2020 | xfs_buf_mark_corrupt(agibp); |
2021 | xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); |
2022 | return -EFSCORRUPTED; |
2023 | } |
2024 | |
2025 | agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); |
2026 | offset = offsetof(struct xfs_agi, agi_unlinked) + |
2027 | (sizeof(xfs_agino_t) * bucket_index); |
2028 | xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); |
2029 | return 0; |
2030 | } |
2031 | |
2032 | /* |
2033 | * Load the inode @next_agino into the cache and set its prev_unlinked pointer |
2034 | * to @prev_agino. Caller must hold the AGI to synchronize with other changes |
2035 | * to the unlinked list. |
2036 | */ |
2037 | STATIC int |
2038 | xfs_iunlink_reload_next( |
2039 | struct xfs_trans *tp, |
2040 | struct xfs_buf *agibp, |
2041 | xfs_agino_t prev_agino, |
2042 | xfs_agino_t next_agino) |
2043 | { |
2044 | struct xfs_perag *pag = agibp->b_pag; |
2045 | struct xfs_mount *mp = pag->pag_mount; |
2046 | struct xfs_inode *next_ip = NULL; |
2047 | xfs_ino_t ino; |
2048 | int error; |
2049 | |
2050 | ASSERT(next_agino != NULLAGINO); |
2051 | |
2052 | #ifdef DEBUG |
2053 | rcu_read_lock(); |
2054 | next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino); |
2055 | ASSERT(next_ip == NULL); |
2056 | rcu_read_unlock(); |
2057 | #endif |
2058 | |
2059 | xfs_info_ratelimited(mp, |
2060 | "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", |
2061 | next_agino, pag->pag_agno); |
2062 | |
2063 | /* |
2064 | * Use an untrusted lookup just to be cautious in case the AGI has been |
2065 | * corrupted and now points at a free inode. That shouldn't happen, |
2066 | * but we'd rather shut down now since we're already running in a weird |
2067 | * situation. |
2068 | */ |
2069 | ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); |
2070 | error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, lock_flags: 0, ipp: &next_ip); |
2071 | if (error) { |
2072 | xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); |
2073 | return error; |
2074 | } |
2075 | |
2076 | /* If this is not an unlinked inode, something is very wrong. */ |
2077 | if (VFS_I(ip: next_ip)->i_nlink != 0) { |
2078 | xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); |
2079 | error = -EFSCORRUPTED; |
2080 | goto rele; |
2081 | } |
2082 | |
2083 | next_ip->i_prev_unlinked = prev_agino; |
2084 | trace_xfs_iunlink_reload_next(ip: next_ip); |
2085 | rele: |
2086 | ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); |
2087 | if (xfs_is_quotacheck_running(mp) && next_ip) |
2088 | xfs_iflags_set(ip: next_ip, XFS_IQUOTAUNCHECKED); |
2089 | xfs_irele(ip: next_ip); |
2090 | return error; |
2091 | } |
2092 | |
2093 | static int |
2094 | xfs_iunlink_insert_inode( |
2095 | struct xfs_trans *tp, |
2096 | struct xfs_perag *pag, |
2097 | struct xfs_buf *agibp, |
2098 | struct xfs_inode *ip) |
2099 | { |
2100 | struct xfs_mount *mp = tp->t_mountp; |
2101 | struct xfs_agi *agi = agibp->b_addr; |
2102 | xfs_agino_t next_agino; |
2103 | xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); |
2104 | short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; |
2105 | int error; |
2106 | |
2107 | /* |
2108 | * Get the index into the agi hash table for the list this inode will |
2109 | * go on. Make sure the pointer isn't garbage and that this inode |
2110 | * isn't already on the list. |
2111 | */ |
2112 | next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); |
2113 | if (next_agino == agino || |
2114 | !xfs_verify_agino_or_null(pag, next_agino)) { |
2115 | xfs_buf_mark_corrupt(agibp); |
2116 | xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); |
2117 | return -EFSCORRUPTED; |
2118 | } |
2119 | |
2120 | /* |
2121 | * Update the prev pointer in the next inode to point back to this |
2122 | * inode. |
2123 | */ |
2124 | error = xfs_iunlink_update_backref(pag, agino, next_agino); |
2125 | if (error == -ENOLINK) |
2126 | error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); |
2127 | if (error) |
2128 | return error; |
2129 | |
2130 | if (next_agino != NULLAGINO) { |
2131 | /* |
2132 | * There is already another inode in the bucket, so point this |
2133 | * inode to the current head of the list. |
2134 | */ |
2135 | error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); |
2136 | if (error) |
2137 | return error; |
2138 | ip->i_next_unlinked = next_agino; |
2139 | } |
2140 | |
2141 | /* Point the head of the list to point to this inode. */ |
2142 | ip->i_prev_unlinked = NULLAGINO; |
2143 | return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); |
2144 | } |
2145 | |
2146 | /* |
2147 | * This is called when the inode's link count has gone to 0 or we are creating |
2148 | * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. |
2149 | * |
2150 | * We place the on-disk inode on a list in the AGI. It will be pulled from this |
2151 | * list when the inode is freed. |
2152 | */ |
2153 | STATIC int |
2154 | xfs_iunlink( |
2155 | struct xfs_trans *tp, |
2156 | struct xfs_inode *ip) |
2157 | { |
2158 | struct xfs_mount *mp = tp->t_mountp; |
2159 | struct xfs_perag *pag; |
2160 | struct xfs_buf *agibp; |
2161 | int error; |
2162 | |
2163 | ASSERT(VFS_I(ip)->i_nlink == 0); |
2164 | ASSERT(VFS_I(ip)->i_mode != 0); |
2165 | trace_xfs_iunlink(ip); |
2166 | |
2167 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); |
2168 | |
2169 | /* Get the agi buffer first. It ensures lock ordering on the list. */ |
2170 | error = xfs_read_agi(pag, tp, &agibp); |
2171 | if (error) |
2172 | goto out; |
2173 | |
2174 | error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); |
2175 | out: |
2176 | xfs_perag_put(pag); |
2177 | return error; |
2178 | } |
2179 | |
2180 | static int |
2181 | xfs_iunlink_remove_inode( |
2182 | struct xfs_trans *tp, |
2183 | struct xfs_perag *pag, |
2184 | struct xfs_buf *agibp, |
2185 | struct xfs_inode *ip) |
2186 | { |
2187 | struct xfs_mount *mp = tp->t_mountp; |
2188 | struct xfs_agi *agi = agibp->b_addr; |
2189 | xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); |
2190 | xfs_agino_t head_agino; |
2191 | short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; |
2192 | int error; |
2193 | |
2194 | trace_xfs_iunlink_remove(ip); |
2195 | |
2196 | /* |
2197 | * Get the index into the agi hash table for the list this inode will |
2198 | * go on. Make sure the head pointer isn't garbage. |
2199 | */ |
2200 | head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); |
2201 | if (!xfs_verify_agino(pag, head_agino)) { |
2202 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, |
2203 | agi, sizeof(*agi)); |
2204 | xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); |
2205 | return -EFSCORRUPTED; |
2206 | } |
2207 | |
2208 | /* |
2209 | * Set our inode's next_unlinked pointer to NULL and then return |
2210 | * the old pointer value so that we can update whatever was previous |
2211 | * to us in the list to point to whatever was next in the list. |
2212 | */ |
2213 | error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); |
2214 | if (error) |
2215 | return error; |
2216 | |
2217 | /* |
2218 | * Update the prev pointer in the next inode to point back to previous |
2219 | * inode in the chain. |
2220 | */ |
2221 | error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, |
2222 | ip->i_next_unlinked); |
2223 | if (error == -ENOLINK) |
2224 | error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, |
2225 | ip->i_next_unlinked); |
2226 | if (error) |
2227 | return error; |
2228 | |
2229 | if (head_agino != agino) { |
2230 | struct xfs_inode *prev_ip; |
2231 | |
2232 | prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); |
2233 | if (!prev_ip) { |
2234 | xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); |
2235 | return -EFSCORRUPTED; |
2236 | } |
2237 | |
2238 | error = xfs_iunlink_log_inode(tp, ip: prev_ip, pag, |
2239 | next_agino: ip->i_next_unlinked); |
2240 | prev_ip->i_next_unlinked = ip->i_next_unlinked; |
2241 | } else { |
2242 | /* Point the head of the list to the next unlinked inode. */ |
2243 | error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, |
2244 | ip->i_next_unlinked); |
2245 | } |
2246 | |
2247 | ip->i_next_unlinked = NULLAGINO; |
2248 | ip->i_prev_unlinked = 0; |
2249 | return error; |
2250 | } |
2251 | |
2252 | /* |
2253 | * Pull the on-disk inode from the AGI unlinked list. |
2254 | */ |
2255 | STATIC int |
2256 | xfs_iunlink_remove( |
2257 | struct xfs_trans *tp, |
2258 | struct xfs_perag *pag, |
2259 | struct xfs_inode *ip) |
2260 | { |
2261 | struct xfs_buf *agibp; |
2262 | int error; |
2263 | |
2264 | trace_xfs_iunlink_remove(ip); |
2265 | |
2266 | /* Get the agi buffer first. It ensures lock ordering on the list. */ |
2267 | error = xfs_read_agi(pag, tp, &agibp); |
2268 | if (error) |
2269 | return error; |
2270 | |
2271 | return xfs_iunlink_remove_inode(tp, pag, agibp, ip); |
2272 | } |
2273 | |
2274 | /* |
2275 | * Look up the inode number specified and if it is not already marked XFS_ISTALE |
2276 | * mark it stale. We should only find clean inodes in this lookup that aren't |
2277 | * already stale. |
2278 | */ |
2279 | static void |
2280 | xfs_ifree_mark_inode_stale( |
2281 | struct xfs_perag *pag, |
2282 | struct xfs_inode *free_ip, |
2283 | xfs_ino_t inum) |
2284 | { |
2285 | struct xfs_mount *mp = pag->pag_mount; |
2286 | struct xfs_inode_log_item *iip; |
2287 | struct xfs_inode *ip; |
2288 | |
2289 | retry: |
2290 | rcu_read_lock(); |
2291 | ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); |
2292 | |
2293 | /* Inode not in memory, nothing to do */ |
2294 | if (!ip) { |
2295 | rcu_read_unlock(); |
2296 | return; |
2297 | } |
2298 | |
2299 | /* |
2300 | * because this is an RCU protected lookup, we could find a recently |
2301 | * freed or even reallocated inode during the lookup. We need to check |
2302 | * under the i_flags_lock for a valid inode here. Skip it if it is not |
2303 | * valid, the wrong inode or stale. |
2304 | */ |
2305 | spin_lock(lock: &ip->i_flags_lock); |
2306 | if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) |
2307 | goto out_iflags_unlock; |
2308 | |
2309 | /* |
2310 | * Don't try to lock/unlock the current inode, but we _cannot_ skip the |
2311 | * other inodes that we did not find in the list attached to the buffer |
2312 | * and are not already marked stale. If we can't lock it, back off and |
2313 | * retry. |
2314 | */ |
2315 | if (ip != free_ip) { |
2316 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { |
2317 | spin_unlock(lock: &ip->i_flags_lock); |
2318 | rcu_read_unlock(); |
2319 | delay(ticks: 1); |
2320 | goto retry; |
2321 | } |
2322 | } |
2323 | ip->i_flags |= XFS_ISTALE; |
2324 | |
2325 | /* |
2326 | * If the inode is flushing, it is already attached to the buffer. All |
2327 | * we needed to do here is mark the inode stale so buffer IO completion |
2328 | * will remove it from the AIL. |
2329 | */ |
2330 | iip = ip->i_itemp; |
2331 | if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { |
2332 | ASSERT(!list_empty(&iip->ili_item.li_bio_list)); |
2333 | ASSERT(iip->ili_last_fields); |
2334 | goto out_iunlock; |
2335 | } |
2336 | |
2337 | /* |
2338 | * Inodes not attached to the buffer can be released immediately. |
2339 | * Everything else has to go through xfs_iflush_abort() on journal |
2340 | * commit as the flock synchronises removal of the inode from the |
2341 | * cluster buffer against inode reclaim. |
2342 | */ |
2343 | if (!iip || list_empty(head: &iip->ili_item.li_bio_list)) |
2344 | goto out_iunlock; |
2345 | |
2346 | __xfs_iflags_set(ip, XFS_IFLUSHING); |
2347 | spin_unlock(lock: &ip->i_flags_lock); |
2348 | rcu_read_unlock(); |
2349 | |
2350 | /* we have a dirty inode in memory that has not yet been flushed. */ |
2351 | spin_lock(lock: &iip->ili_lock); |
2352 | iip->ili_last_fields = iip->ili_fields; |
2353 | iip->ili_fields = 0; |
2354 | iip->ili_fsync_fields = 0; |
2355 | spin_unlock(lock: &iip->ili_lock); |
2356 | ASSERT(iip->ili_last_fields); |
2357 | |
2358 | if (ip != free_ip) |
2359 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
2360 | return; |
2361 | |
2362 | out_iunlock: |
2363 | if (ip != free_ip) |
2364 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
2365 | out_iflags_unlock: |
2366 | spin_unlock(lock: &ip->i_flags_lock); |
2367 | rcu_read_unlock(); |
2368 | } |
2369 | |
2370 | /* |
2371 | * A big issue when freeing the inode cluster is that we _cannot_ skip any |
2372 | * inodes that are in memory - they all must be marked stale and attached to |
2373 | * the cluster buffer. |
2374 | */ |
2375 | static int |
2376 | xfs_ifree_cluster( |
2377 | struct xfs_trans *tp, |
2378 | struct xfs_perag *pag, |
2379 | struct xfs_inode *free_ip, |
2380 | struct xfs_icluster *xic) |
2381 | { |
2382 | struct xfs_mount *mp = free_ip->i_mount; |
2383 | struct xfs_ino_geometry *igeo = M_IGEO(mp); |
2384 | struct xfs_buf *bp; |
2385 | xfs_daddr_t blkno; |
2386 | xfs_ino_t inum = xic->first_ino; |
2387 | int nbufs; |
2388 | int i, j; |
2389 | int ioffset; |
2390 | int error; |
2391 | |
2392 | nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; |
2393 | |
2394 | for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { |
2395 | /* |
2396 | * The allocation bitmap tells us which inodes of the chunk were |
2397 | * physically allocated. Skip the cluster if an inode falls into |
2398 | * a sparse region. |
2399 | */ |
2400 | ioffset = inum - xic->first_ino; |
2401 | if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { |
2402 | ASSERT(ioffset % igeo->inodes_per_cluster == 0); |
2403 | continue; |
2404 | } |
2405 | |
2406 | blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), |
2407 | XFS_INO_TO_AGBNO(mp, inum)); |
2408 | |
2409 | /* |
2410 | * We obtain and lock the backing buffer first in the process |
2411 | * here to ensure dirty inodes attached to the buffer remain in |
2412 | * the flushing state while we mark them stale. |
2413 | * |
2414 | * If we scan the in-memory inodes first, then buffer IO can |
2415 | * complete before we get a lock on it, and hence we may fail |
2416 | * to mark all the active inodes on the buffer stale. |
2417 | */ |
2418 | error = xfs_trans_get_buf(tp, target: mp->m_ddev_targp, blkno, |
2419 | numblks: mp->m_bsize * igeo->blocks_per_cluster, |
2420 | XBF_UNMAPPED, bpp: &bp); |
2421 | if (error) |
2422 | return error; |
2423 | |
2424 | /* |
2425 | * This buffer may not have been correctly initialised as we |
2426 | * didn't read it from disk. That's not important because we are |
2427 | * only using to mark the buffer as stale in the log, and to |
2428 | * attach stale cached inodes on it. That means it will never be |
2429 | * dispatched for IO. If it is, we want to know about it, and we |
2430 | * want it to fail. We can acheive this by adding a write |
2431 | * verifier to the buffer. |
2432 | */ |
2433 | bp->b_ops = &xfs_inode_buf_ops; |
2434 | |
2435 | /* |
2436 | * Now we need to set all the cached clean inodes as XFS_ISTALE, |
2437 | * too. This requires lookups, and will skip inodes that we've |
2438 | * already marked XFS_ISTALE. |
2439 | */ |
2440 | for (i = 0; i < igeo->inodes_per_cluster; i++) |
2441 | xfs_ifree_mark_inode_stale(pag, free_ip, inum: inum + i); |
2442 | |
2443 | xfs_trans_stale_inode_buf(tp, bp); |
2444 | xfs_trans_binval(tp, bp); |
2445 | } |
2446 | return 0; |
2447 | } |
2448 | |
2449 | /* |
2450 | * This is called to return an inode to the inode free list. The inode should |
2451 | * already be truncated to 0 length and have no pages associated with it. This |
2452 | * routine also assumes that the inode is already a part of the transaction. |
2453 | * |
2454 | * The on-disk copy of the inode will have been added to the list of unlinked |
2455 | * inodes in the AGI. We need to remove the inode from that list atomically with |
2456 | * respect to freeing it here. |
2457 | */ |
2458 | int |
2459 | xfs_ifree( |
2460 | struct xfs_trans *tp, |
2461 | struct xfs_inode *ip) |
2462 | { |
2463 | struct xfs_mount *mp = ip->i_mount; |
2464 | struct xfs_perag *pag; |
2465 | struct xfs_icluster xic = { 0 }; |
2466 | struct xfs_inode_log_item *iip = ip->i_itemp; |
2467 | int error; |
2468 | |
2469 | xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); |
2470 | ASSERT(VFS_I(ip)->i_nlink == 0); |
2471 | ASSERT(ip->i_df.if_nextents == 0); |
2472 | ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); |
2473 | ASSERT(ip->i_nblocks == 0); |
2474 | |
2475 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); |
2476 | |
2477 | /* |
2478 | * Free the inode first so that we guarantee that the AGI lock is going |
2479 | * to be taken before we remove the inode from the unlinked list. This |
2480 | * makes the AGI lock -> unlinked list modification order the same as |
2481 | * used in O_TMPFILE creation. |
2482 | */ |
2483 | error = xfs_difree(tp, pag, ip->i_ino, &xic); |
2484 | if (error) |
2485 | goto out; |
2486 | |
2487 | error = xfs_iunlink_remove(tp, pag, ip); |
2488 | if (error) |
2489 | goto out; |
2490 | |
2491 | /* |
2492 | * Free any local-format data sitting around before we reset the |
2493 | * data fork to extents format. Note that the attr fork data has |
2494 | * already been freed by xfs_attr_inactive. |
2495 | */ |
2496 | if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { |
2497 | kfree(objp: ip->i_df.if_data); |
2498 | ip->i_df.if_data = NULL; |
2499 | ip->i_df.if_bytes = 0; |
2500 | } |
2501 | |
2502 | VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ |
2503 | ip->i_diflags = 0; |
2504 | ip->i_diflags2 = mp->m_ino_geo.new_diflags2; |
2505 | ip->i_forkoff = 0; /* mark the attr fork not in use */ |
2506 | ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; |
2507 | if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) |
2508 | xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS); |
2509 | |
2510 | /* Don't attempt to replay owner changes for a deleted inode */ |
2511 | spin_lock(lock: &iip->ili_lock); |
2512 | iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); |
2513 | spin_unlock(lock: &iip->ili_lock); |
2514 | |
2515 | /* |
2516 | * Bump the generation count so no one will be confused |
2517 | * by reincarnations of this inode. |
2518 | */ |
2519 | VFS_I(ip)->i_generation++; |
2520 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
2521 | |
2522 | if (xic.deleted) |
2523 | error = xfs_ifree_cluster(tp, pag, free_ip: ip, xic: &xic); |
2524 | out: |
2525 | xfs_perag_put(pag); |
2526 | return error; |
2527 | } |
2528 | |
2529 | /* |
2530 | * This is called to unpin an inode. The caller must have the inode locked |
2531 | * in at least shared mode so that the buffer cannot be subsequently pinned |
2532 | * once someone is waiting for it to be unpinned. |
2533 | */ |
2534 | static void |
2535 | xfs_iunpin( |
2536 | struct xfs_inode *ip) |
2537 | { |
2538 | xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); |
2539 | |
2540 | trace_xfs_inode_unpin_nowait(ip, _RET_IP_); |
2541 | |
2542 | /* Give the log a push to start the unpinning I/O */ |
2543 | xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); |
2544 | |
2545 | } |
2546 | |
2547 | static void |
2548 | __xfs_iunpin_wait( |
2549 | struct xfs_inode *ip) |
2550 | { |
2551 | wait_queue_head_t *wq = bit_waitqueue(word: &ip->i_flags, __XFS_IPINNED_BIT); |
2552 | DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); |
2553 | |
2554 | xfs_iunpin(ip); |
2555 | |
2556 | do { |
2557 | prepare_to_wait(wq_head: wq, wq_entry: &wait.wq_entry, TASK_UNINTERRUPTIBLE); |
2558 | if (xfs_ipincount(ip)) |
2559 | io_schedule(); |
2560 | } while (xfs_ipincount(ip)); |
2561 | finish_wait(wq_head: wq, wq_entry: &wait.wq_entry); |
2562 | } |
2563 | |
/*
 * Wait for @ip to become unpinned.  Returns immediately when the pin count is
 * already zero.
 */
void
xfs_iunpin_wait(
	struct xfs_inode	*ip)
{
	if (!xfs_ipincount(ip))
		return;

	__xfs_iunpin_wait(ip);
}
2571 | |
2572 | /* |
2573 | * Removing an inode from the namespace involves removing the directory entry |
2574 | * and dropping the link count on the inode. Removing the directory entry can |
2575 | * result in locking an AGF (directory blocks were freed) and removing a link |
2576 | * count can result in placing the inode on an unlinked list which results in |
2577 | * locking an AGI. |
2578 | * |
2579 | * The big problem here is that we have an ordering constraint on AGF and AGI |
2580 | * locking - inode allocation locks the AGI, then can allocate a new extent for |
2581 | * new inodes, locking the AGF after the AGI. Similarly, freeing the inode |
2582 | * removes the inode from the unlinked list, requiring that we lock the AGI |
2583 | * first, and then freeing the inode can result in an inode chunk being freed |
2584 | * and hence freeing disk space requiring that we lock an AGF. |
2585 | * |
2586 | * Hence the ordering that is imposed by other parts of the code is AGI before |
2587 | * AGF. This means we cannot remove the directory entry before we drop the inode |
2588 | * reference count and put it on the unlinked list as this results in a lock |
2589 | * order of AGF then AGI, and this can deadlock against inode allocation and |
2590 | * freeing. Therefore we must drop the link counts before we remove the |
2591 | * directory entry. |
2592 | * |
2593 | * This is still safe from a transactional point of view - it is not until we |
2594 | * get to xfs_defer_finish() that we have the possibility of multiple |
2595 | * transactions in this operation. Hence as long as we remove the directory |
2596 | * entry and drop the link count in the first transaction of the remove |
2597 | * operation, there are no transactional constraints on the ordering here. |
2598 | */ |
2599 | int |
2600 | xfs_remove( |
2601 | xfs_inode_t *dp, |
2602 | struct xfs_name *name, |
2603 | xfs_inode_t *ip) |
2604 | { |
2605 | xfs_mount_t *mp = dp->i_mount; |
2606 | xfs_trans_t *tp = NULL; |
2607 | int is_dir = S_ISDIR(VFS_I(ip)->i_mode); |
2608 | int dontcare; |
2609 | int error = 0; |
2610 | uint resblks; |
2611 | |
2612 | trace_xfs_remove(dp, xfs_remove: name); |
2613 | |
2614 | if (xfs_is_shutdown(mp)) |
2615 | return -EIO; |
2616 | if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) |
2617 | return -EIO; |
2618 | |
2619 | error = xfs_qm_dqattach(dp); |
2620 | if (error) |
2621 | goto std_return; |
2622 | |
2623 | error = xfs_qm_dqattach(ip); |
2624 | if (error) |
2625 | goto std_return; |
2626 | |
2627 | /* |
2628 | * We try to get the real space reservation first, allowing for |
2629 | * directory btree deletion(s) implying possible bmap insert(s). If we |
2630 | * can't get the space reservation then we use 0 instead, and avoid the |
2631 | * bmap btree insert(s) in the directory code by, if the bmap insert |
2632 | * tries to happen, instead trimming the LAST block from the directory. |
2633 | * |
2634 | * Ignore EDQUOT and ENOSPC being returned via nospace_error because |
2635 | * the directory code can handle a reservationless update and we don't |
2636 | * want to prevent a user from trying to free space by deleting things. |
2637 | */ |
2638 | resblks = XFS_REMOVE_SPACE_RES(mp); |
2639 | error = xfs_trans_alloc_dir(dp, resv: &M_RES(mp)->tr_remove, ip, dblocks: &resblks, |
2640 | tpp: &tp, nospace_error: &dontcare); |
2641 | if (error) { |
2642 | ASSERT(error != -ENOSPC); |
2643 | goto std_return; |
2644 | } |
2645 | |
2646 | /* |
2647 | * If we're removing a directory perform some additional validation. |
2648 | */ |
2649 | if (is_dir) { |
2650 | ASSERT(VFS_I(ip)->i_nlink >= 2); |
2651 | if (VFS_I(ip)->i_nlink != 2) { |
2652 | error = -ENOTEMPTY; |
2653 | goto out_trans_cancel; |
2654 | } |
2655 | if (!xfs_dir_isempty(ip)) { |
2656 | error = -ENOTEMPTY; |
2657 | goto out_trans_cancel; |
2658 | } |
2659 | |
2660 | /* Drop the link from ip's "..". */ |
2661 | error = xfs_droplink(tp, ip: dp); |
2662 | if (error) |
2663 | goto out_trans_cancel; |
2664 | |
2665 | /* Drop the "." link from ip to self. */ |
2666 | error = xfs_droplink(tp, ip); |
2667 | if (error) |
2668 | goto out_trans_cancel; |
2669 | |
2670 | /* |
2671 | * Point the unlinked child directory's ".." entry to the root |
2672 | * directory to eliminate back-references to inodes that may |
2673 | * get freed before the child directory is closed. If the fs |
2674 | * gets shrunk, this can lead to dirent inode validation errors. |
2675 | */ |
2676 | if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { |
2677 | error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, |
2678 | tp->t_mountp->m_sb.sb_rootino, 0); |
2679 | if (error) |
2680 | goto out_trans_cancel; |
2681 | } |
2682 | } else { |
2683 | /* |
2684 | * When removing a non-directory we need to log the parent |
2685 | * inode here. For a directory this is done implicitly |
2686 | * by the xfs_droplink call for the ".." entry. |
2687 | */ |
2688 | xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); |
2689 | } |
2690 | xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
2691 | |
2692 | /* Drop the link from dp to ip. */ |
2693 | error = xfs_droplink(tp, ip); |
2694 | if (error) |
2695 | goto out_trans_cancel; |
2696 | |
2697 | error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); |
2698 | if (error) { |
2699 | ASSERT(error != -ENOENT); |
2700 | goto out_trans_cancel; |
2701 | } |
2702 | |
2703 | /* |
2704 | * Drop the link from dp to ip, and if ip was a directory, remove the |
2705 | * '.' and '..' references since we freed the directory. |
2706 | */ |
2707 | xfs_dir_update_hook(dp, ip, delta: -1, name); |
2708 | |
2709 | /* |
2710 | * If this is a synchronous mount, make sure that the |
2711 | * remove transaction goes to disk before returning to |
2712 | * the user. |
2713 | */ |
2714 | if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) |
2715 | xfs_trans_set_sync(tp); |
2716 | |
2717 | error = xfs_trans_commit(tp); |
2718 | if (error) |
2719 | goto std_return; |
2720 | |
2721 | if (is_dir && xfs_inode_is_filestream(ip)) |
2722 | xfs_filestream_deassociate(ip); |
2723 | |
2724 | return 0; |
2725 | |
2726 | out_trans_cancel: |
2727 | xfs_trans_cancel(tp); |
2728 | std_return: |
2729 | return error; |
2730 | } |
2731 | |
2732 | /* |
2733 | * Enter all inodes for a rename transaction into a sorted array. |
2734 | */ |
2735 | #define __XFS_SORT_INODES 5 |
2736 | STATIC void |
2737 | xfs_sort_for_rename( |
2738 | struct xfs_inode *dp1, /* in: old (source) directory inode */ |
2739 | struct xfs_inode *dp2, /* in: new (target) directory inode */ |
2740 | struct xfs_inode *ip1, /* in: inode of old entry */ |
2741 | struct xfs_inode *ip2, /* in: inode of new entry */ |
2742 | struct xfs_inode *wip, /* in: whiteout inode */ |
2743 | struct xfs_inode **i_tab,/* out: sorted array of inodes */ |
2744 | int *num_inodes) /* in/out: inodes in array */ |
2745 | { |
2746 | int i, j; |
2747 | |
2748 | ASSERT(*num_inodes == __XFS_SORT_INODES); |
2749 | memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); |
2750 | |
2751 | /* |
2752 | * i_tab contains a list of pointers to inodes. We initialize |
2753 | * the table here & we'll sort it. We will then use it to |
2754 | * order the acquisition of the inode locks. |
2755 | * |
2756 | * Note that the table may contain duplicates. e.g., dp1 == dp2. |
2757 | */ |
2758 | i = 0; |
2759 | i_tab[i++] = dp1; |
2760 | i_tab[i++] = dp2; |
2761 | i_tab[i++] = ip1; |
2762 | if (ip2) |
2763 | i_tab[i++] = ip2; |
2764 | if (wip) |
2765 | i_tab[i++] = wip; |
2766 | *num_inodes = i; |
2767 | |
2768 | /* |
2769 | * Sort the elements via bubble sort. (Remember, there are at |
2770 | * most 5 elements to sort, so this is adequate.) |
2771 | */ |
2772 | for (i = 0; i < *num_inodes; i++) { |
2773 | for (j = 1; j < *num_inodes; j++) { |
2774 | if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { |
2775 | struct xfs_inode *temp = i_tab[j]; |
2776 | i_tab[j] = i_tab[j-1]; |
2777 | i_tab[j-1] = temp; |
2778 | } |
2779 | } |
2780 | } |
2781 | } |
2782 | |
2783 | static int |
2784 | xfs_finish_rename( |
2785 | struct xfs_trans *tp) |
2786 | { |
2787 | /* |
2788 | * If this is a synchronous mount, make sure that the rename transaction |
2789 | * goes to disk before returning to the user. |
2790 | */ |
2791 | if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) |
2792 | xfs_trans_set_sync(tp); |
2793 | |
2794 | return xfs_trans_commit(tp); |
2795 | } |
2796 | |
2797 | /* |
2798 | * xfs_cross_rename() |
2799 | * |
2800 | * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall |
2801 | */ |
2802 | STATIC int |
2803 | xfs_cross_rename( |
2804 | struct xfs_trans *tp, |
2805 | struct xfs_inode *dp1, |
2806 | struct xfs_name *name1, |
2807 | struct xfs_inode *ip1, |
2808 | struct xfs_inode *dp2, |
2809 | struct xfs_name *name2, |
2810 | struct xfs_inode *ip2, |
2811 | int spaceres) |
2812 | { |
2813 | int error = 0; |
2814 | int ip1_flags = 0; |
2815 | int ip2_flags = 0; |
2816 | int dp2_flags = 0; |
2817 | |
2818 | /* Swap inode number for dirent in first parent */ |
2819 | error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); |
2820 | if (error) |
2821 | goto out_trans_abort; |
2822 | |
2823 | /* Swap inode number for dirent in second parent */ |
2824 | error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); |
2825 | if (error) |
2826 | goto out_trans_abort; |
2827 | |
2828 | /* |
2829 | * If we're renaming one or more directories across different parents, |
2830 | * update the respective ".." entries (and link counts) to match the new |
2831 | * parents. |
2832 | */ |
2833 | if (dp1 != dp2) { |
2834 | dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; |
2835 | |
2836 | if (S_ISDIR(VFS_I(ip2)->i_mode)) { |
2837 | error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, |
2838 | dp1->i_ino, spaceres); |
2839 | if (error) |
2840 | goto out_trans_abort; |
2841 | |
2842 | /* transfer ip2 ".." reference to dp1 */ |
2843 | if (!S_ISDIR(VFS_I(ip1)->i_mode)) { |
2844 | error = xfs_droplink(tp, ip: dp2); |
2845 | if (error) |
2846 | goto out_trans_abort; |
2847 | xfs_bumplink(tp, ip: dp1); |
2848 | } |
2849 | |
2850 | /* |
2851 | * Although ip1 isn't changed here, userspace needs |
2852 | * to be warned about the change, so that applications |
2853 | * relying on it (like backup ones), will properly |
2854 | * notify the change |
2855 | */ |
2856 | ip1_flags |= XFS_ICHGTIME_CHG; |
2857 | ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; |
2858 | } |
2859 | |
2860 | if (S_ISDIR(VFS_I(ip1)->i_mode)) { |
2861 | error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, |
2862 | dp2->i_ino, spaceres); |
2863 | if (error) |
2864 | goto out_trans_abort; |
2865 | |
2866 | /* transfer ip1 ".." reference to dp2 */ |
2867 | if (!S_ISDIR(VFS_I(ip2)->i_mode)) { |
2868 | error = xfs_droplink(tp, ip: dp1); |
2869 | if (error) |
2870 | goto out_trans_abort; |
2871 | xfs_bumplink(tp, ip: dp2); |
2872 | } |
2873 | |
2874 | /* |
2875 | * Although ip2 isn't changed here, userspace needs |
2876 | * to be warned about the change, so that applications |
2877 | * relying on it (like backup ones), will properly |
2878 | * notify the change |
2879 | */ |
2880 | ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; |
2881 | ip2_flags |= XFS_ICHGTIME_CHG; |
2882 | } |
2883 | } |
2884 | |
2885 | if (ip1_flags) { |
2886 | xfs_trans_ichgtime(tp, ip1, ip1_flags); |
2887 | xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); |
2888 | } |
2889 | if (ip2_flags) { |
2890 | xfs_trans_ichgtime(tp, ip2, ip2_flags); |
2891 | xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); |
2892 | } |
2893 | if (dp2_flags) { |
2894 | xfs_trans_ichgtime(tp, dp2, dp2_flags); |
2895 | xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); |
2896 | } |
2897 | xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
2898 | xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); |
2899 | |
2900 | /* |
2901 | * Inform our hook clients that we've finished an exchange operation as |
2902 | * follows: removed the source and target files from their directories; |
2903 | * added the target to the source directory; and added the source to |
2904 | * the target directory. All inodes are locked, so it's ok to model a |
2905 | * rename this way so long as we say we deleted entries before we add |
2906 | * new ones. |
2907 | */ |
2908 | xfs_dir_update_hook(dp: dp1, ip: ip1, delta: -1, name: name1); |
2909 | xfs_dir_update_hook(dp: dp2, ip: ip2, delta: -1, name: name2); |
2910 | xfs_dir_update_hook(dp: dp1, ip: ip2, delta: 1, name: name1); |
2911 | xfs_dir_update_hook(dp: dp2, ip: ip1, delta: 1, name: name2); |
2912 | |
2913 | return xfs_finish_rename(tp); |
2914 | |
2915 | out_trans_abort: |
2916 | xfs_trans_cancel(tp); |
2917 | return error; |
2918 | } |
2919 | |
2920 | /* |
2921 | * xfs_rename_alloc_whiteout() |
2922 | * |
2923 | * Return a referenced, unlinked, unlocked inode that can be used as a |
2924 | * whiteout in a rename transaction. We use a tmpfile inode here so that if we |
2925 | * crash between allocating the inode and linking it into the rename transaction |
2926 | * recovery will free the inode and we won't leak it. |
2927 | */ |
2928 | static int |
2929 | xfs_rename_alloc_whiteout( |
2930 | struct mnt_idmap *idmap, |
2931 | struct xfs_name *src_name, |
2932 | struct xfs_inode *dp, |
2933 | struct xfs_inode **wip) |
2934 | { |
2935 | struct xfs_inode *tmpfile; |
2936 | struct qstr name; |
2937 | int error; |
2938 | |
2939 | error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, |
2940 | ipp: &tmpfile); |
2941 | if (error) |
2942 | return error; |
2943 | |
2944 | name.name = src_name->name; |
2945 | name.len = src_name->len; |
2946 | error = xfs_inode_init_security(inode: VFS_I(ip: tmpfile), dir: VFS_I(ip: dp), qstr: &name); |
2947 | if (error) { |
2948 | xfs_finish_inode_setup(ip: tmpfile); |
2949 | xfs_irele(ip: tmpfile); |
2950 | return error; |
2951 | } |
2952 | |
2953 | /* |
2954 | * Prepare the tmpfile inode as if it were created through the VFS. |
2955 | * Complete the inode setup and flag it as linkable. nlink is already |
2956 | * zero, so we can skip the drop_nlink. |
2957 | */ |
2958 | xfs_setup_iops(ip: tmpfile); |
2959 | xfs_finish_inode_setup(ip: tmpfile); |
2960 | VFS_I(ip: tmpfile)->i_state |= I_LINKABLE; |
2961 | |
2962 | *wip = tmpfile; |
2963 | return 0; |
2964 | } |
2965 | |
2966 | /* |
2967 | * xfs_rename |
2968 | */ |
2969 | int |
2970 | xfs_rename( |
2971 | struct mnt_idmap *idmap, |
2972 | struct xfs_inode *src_dp, |
2973 | struct xfs_name *src_name, |
2974 | struct xfs_inode *src_ip, |
2975 | struct xfs_inode *target_dp, |
2976 | struct xfs_name *target_name, |
2977 | struct xfs_inode *target_ip, |
2978 | unsigned int flags) |
2979 | { |
2980 | struct xfs_mount *mp = src_dp->i_mount; |
2981 | struct xfs_trans *tp; |
2982 | struct xfs_inode *wip = NULL; /* whiteout inode */ |
2983 | struct xfs_inode *inodes[__XFS_SORT_INODES]; |
2984 | int i; |
2985 | int num_inodes = __XFS_SORT_INODES; |
2986 | bool new_parent = (src_dp != target_dp); |
2987 | bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); |
2988 | int spaceres; |
2989 | bool retried = false; |
2990 | int error, nospace_error = 0; |
2991 | |
2992 | trace_xfs_rename(src_dp, target_dp, src_name, target_name); |
2993 | |
2994 | if ((flags & RENAME_EXCHANGE) && !target_ip) |
2995 | return -EINVAL; |
2996 | |
2997 | /* |
2998 | * If we are doing a whiteout operation, allocate the whiteout inode |
2999 | * we will be placing at the target and ensure the type is set |
3000 | * appropriately. |
3001 | */ |
3002 | if (flags & RENAME_WHITEOUT) { |
3003 | error = xfs_rename_alloc_whiteout(idmap, src_name, |
3004 | dp: target_dp, wip: &wip); |
3005 | if (error) |
3006 | return error; |
3007 | |
3008 | /* setup target dirent info as whiteout */ |
3009 | src_name->type = XFS_DIR3_FT_CHRDEV; |
3010 | } |
3011 | |
3012 | xfs_sort_for_rename(dp1: src_dp, dp2: target_dp, ip1: src_ip, ip2: target_ip, wip, |
3013 | i_tab: inodes, num_inodes: &num_inodes); |
3014 | |
3015 | retry: |
3016 | nospace_error = 0; |
3017 | spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); |
3018 | error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_rename, blocks: spaceres, rtextents: 0, flags: 0, tpp: &tp); |
3019 | if (error == -ENOSPC) { |
3020 | nospace_error = error; |
3021 | spaceres = 0; |
3022 | error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_rename, blocks: 0, rtextents: 0, flags: 0, |
3023 | tpp: &tp); |
3024 | } |
3025 | if (error) |
3026 | goto out_release_wip; |
3027 | |
3028 | /* |
3029 | * Attach the dquots to the inodes |
3030 | */ |
3031 | error = xfs_qm_vop_rename_dqattach(inodes); |
3032 | if (error) |
3033 | goto out_trans_cancel; |
3034 | |
3035 | /* |
3036 | * Lock all the participating inodes. Depending upon whether |
3037 | * the target_name exists in the target directory, and |
3038 | * whether the target directory is the same as the source |
3039 | * directory, we can lock from 2 to 5 inodes. |
3040 | */ |
3041 | xfs_lock_inodes(ips: inodes, inodes: num_inodes, XFS_ILOCK_EXCL); |
3042 | |
3043 | /* |
3044 | * Join all the inodes to the transaction. From this point on, |
3045 | * we can rely on either trans_commit or trans_cancel to unlock |
3046 | * them. |
3047 | */ |
3048 | xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); |
3049 | if (new_parent) |
3050 | xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); |
3051 | xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); |
3052 | if (target_ip) |
3053 | xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); |
3054 | if (wip) |
3055 | xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); |
3056 | |
3057 | /* |
3058 | * If we are using project inheritance, we only allow renames |
3059 | * into our tree when the project IDs are the same; else the |
3060 | * tree quota mechanism would be circumvented. |
3061 | */ |
3062 | if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && |
3063 | target_dp->i_projid != src_ip->i_projid)) { |
3064 | error = -EXDEV; |
3065 | goto out_trans_cancel; |
3066 | } |
3067 | |
3068 | /* RENAME_EXCHANGE is unique from here on. */ |
3069 | if (flags & RENAME_EXCHANGE) |
3070 | return xfs_cross_rename(tp, dp1: src_dp, name1: src_name, ip1: src_ip, |
3071 | dp2: target_dp, name2: target_name, ip2: target_ip, |
3072 | spaceres); |
3073 | |
3074 | /* |
3075 | * Try to reserve quota to handle an expansion of the target directory. |
3076 | * We'll allow the rename to continue in reservationless mode if we hit |
3077 | * a space usage constraint. If we trigger reservationless mode, save |
3078 | * the errno if there isn't any free space in the target directory. |
3079 | */ |
3080 | if (spaceres != 0) { |
3081 | error = xfs_trans_reserve_quota_nblks(tp, ip: target_dp, dblocks: spaceres, |
3082 | rblocks: 0, force: false); |
3083 | if (error == -EDQUOT || error == -ENOSPC) { |
3084 | if (!retried) { |
3085 | xfs_trans_cancel(tp); |
3086 | xfs_blockgc_free_quota(ip: target_dp, iwalk_flags: 0); |
3087 | retried = true; |
3088 | goto retry; |
3089 | } |
3090 | |
3091 | nospace_error = error; |
3092 | spaceres = 0; |
3093 | error = 0; |
3094 | } |
3095 | if (error) |
3096 | goto out_trans_cancel; |
3097 | } |
3098 | |
3099 | /* |
3100 | * Check for expected errors before we dirty the transaction |
3101 | * so we can return an error without a transaction abort. |
3102 | */ |
3103 | if (target_ip == NULL) { |
3104 | /* |
3105 | * If there's no space reservation, check the entry will |
3106 | * fit before actually inserting it. |
3107 | */ |
3108 | if (!spaceres) { |
3109 | error = xfs_dir_canenter(tp, target_dp, target_name); |
3110 | if (error) |
3111 | goto out_trans_cancel; |
3112 | } |
3113 | } else { |
3114 | /* |
3115 | * If target exists and it's a directory, check that whether |
3116 | * it can be destroyed. |
3117 | */ |
3118 | if (S_ISDIR(VFS_I(target_ip)->i_mode) && |
3119 | (!xfs_dir_isempty(target_ip) || |
3120 | (VFS_I(ip: target_ip)->i_nlink > 2))) { |
3121 | error = -EEXIST; |
3122 | goto out_trans_cancel; |
3123 | } |
3124 | } |
3125 | |
3126 | /* |
3127 | * Lock the AGI buffers we need to handle bumping the nlink of the |
3128 | * whiteout inode off the unlinked list and to handle dropping the |
3129 | * nlink of the target inode. Per locking order rules, do this in |
3130 | * increasing AG order and before directory block allocation tries to |
3131 | * grab AGFs because we grab AGIs before AGFs. |
3132 | * |
3133 | * The (vfs) caller must ensure that if src is a directory then |
3134 | * target_ip is either null or an empty directory. |
3135 | */ |
3136 | for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { |
3137 | if (inodes[i] == wip || |
3138 | (inodes[i] == target_ip && |
3139 | (VFS_I(ip: target_ip)->i_nlink == 1 || src_is_directory))) { |
3140 | struct xfs_perag *pag; |
3141 | struct xfs_buf *bp; |
3142 | |
3143 | pag = xfs_perag_get(mp, |
3144 | XFS_INO_TO_AGNO(mp, inodes[i]->i_ino)); |
3145 | error = xfs_read_agi(pag, tp, &bp); |
3146 | xfs_perag_put(pag); |
3147 | if (error) |
3148 | goto out_trans_cancel; |
3149 | } |
3150 | } |
3151 | |
3152 | /* |
3153 | * Directory entry creation below may acquire the AGF. Remove |
3154 | * the whiteout from the unlinked list first to preserve correct |
3155 | * AGI/AGF locking order. This dirties the transaction so failures |
3156 | * after this point will abort and log recovery will clean up the |
3157 | * mess. |
3158 | * |
3159 | * For whiteouts, we need to bump the link count on the whiteout |
3160 | * inode. After this point, we have a real link, clear the tmpfile |
3161 | * state flag from the inode so it doesn't accidentally get misused |
3162 | * in future. |
3163 | */ |
3164 | if (wip) { |
3165 | struct xfs_perag *pag; |
3166 | |
3167 | ASSERT(VFS_I(wip)->i_nlink == 0); |
3168 | |
3169 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino)); |
3170 | error = xfs_iunlink_remove(tp, pag, ip: wip); |
3171 | xfs_perag_put(pag); |
3172 | if (error) |
3173 | goto out_trans_cancel; |
3174 | |
3175 | xfs_bumplink(tp, ip: wip); |
3176 | VFS_I(ip: wip)->i_state &= ~I_LINKABLE; |
3177 | } |
3178 | |
3179 | /* |
3180 | * Set up the target. |
3181 | */ |
3182 | if (target_ip == NULL) { |
3183 | /* |
3184 | * If target does not exist and the rename crosses |
3185 | * directories, adjust the target directory link count |
3186 | * to account for the ".." reference from the new entry. |
3187 | */ |
3188 | error = xfs_dir_createname(tp, target_dp, target_name, |
3189 | src_ip->i_ino, spaceres); |
3190 | if (error) |
3191 | goto out_trans_cancel; |
3192 | |
3193 | xfs_trans_ichgtime(tp, target_dp, |
3194 | XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
3195 | |
3196 | if (new_parent && src_is_directory) { |
3197 | xfs_bumplink(tp, ip: target_dp); |
3198 | } |
3199 | } else { /* target_ip != NULL */ |
3200 | /* |
3201 | * Link the source inode under the target name. |
3202 | * If the source inode is a directory and we are moving |
3203 | * it across directories, its ".." entry will be |
3204 | * inconsistent until we replace that down below. |
3205 | * |
3206 | * In case there is already an entry with the same |
3207 | * name at the destination directory, remove it first. |
3208 | */ |
3209 | error = xfs_dir_replace(tp, target_dp, target_name, |
3210 | src_ip->i_ino, spaceres); |
3211 | if (error) |
3212 | goto out_trans_cancel; |
3213 | |
3214 | xfs_trans_ichgtime(tp, target_dp, |
3215 | XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
3216 | |
3217 | /* |
3218 | * Decrement the link count on the target since the target |
3219 | * dir no longer points to it. |
3220 | */ |
3221 | error = xfs_droplink(tp, ip: target_ip); |
3222 | if (error) |
3223 | goto out_trans_cancel; |
3224 | |
3225 | if (src_is_directory) { |
3226 | /* |
3227 | * Drop the link from the old "." entry. |
3228 | */ |
3229 | error = xfs_droplink(tp, ip: target_ip); |
3230 | if (error) |
3231 | goto out_trans_cancel; |
3232 | } |
3233 | } /* target_ip != NULL */ |
3234 | |
3235 | /* |
3236 | * Remove the source. |
3237 | */ |
3238 | if (new_parent && src_is_directory) { |
3239 | /* |
3240 | * Rewrite the ".." entry to point to the new |
3241 | * directory. |
3242 | */ |
3243 | error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, |
3244 | target_dp->i_ino, spaceres); |
3245 | ASSERT(error != -EEXIST); |
3246 | if (error) |
3247 | goto out_trans_cancel; |
3248 | } |
3249 | |
3250 | /* |
3251 | * We always want to hit the ctime on the source inode. |
3252 | * |
3253 | * This isn't strictly required by the standards since the source |
3254 | * inode isn't really being changed, but old unix file systems did |
3255 | * it and some incremental backup programs won't work without it. |
3256 | */ |
3257 | xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); |
3258 | xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); |
3259 | |
3260 | /* |
3261 | * Adjust the link count on src_dp. This is necessary when |
3262 | * renaming a directory, either within one parent when |
3263 | * the target existed, or across two parent directories. |
3264 | */ |
3265 | if (src_is_directory && (new_parent || target_ip != NULL)) { |
3266 | |
3267 | /* |
3268 | * Decrement link count on src_directory since the |
3269 | * entry that's moved no longer points to it. |
3270 | */ |
3271 | error = xfs_droplink(tp, ip: src_dp); |
3272 | if (error) |
3273 | goto out_trans_cancel; |
3274 | } |
3275 | |
3276 | /* |
3277 | * For whiteouts, we only need to update the source dirent with the |
3278 | * inode number of the whiteout inode rather than removing it |
3279 | * altogether. |
3280 | */ |
3281 | if (wip) |
3282 | error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, |
3283 | spaceres); |
3284 | else |
3285 | error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, |
3286 | spaceres); |
3287 | |
3288 | if (error) |
3289 | goto out_trans_cancel; |
3290 | |
3291 | xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
3292 | xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); |
3293 | if (new_parent) |
3294 | xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); |
3295 | |
3296 | /* |
3297 | * Inform our hook clients that we've finished a rename operation as |
3298 | * follows: removed the source and target files from their directories; |
3299 | * that we've added the source to the target directory; and finally |
3300 | * that we've added the whiteout, if there was one. All inodes are |
3301 | * locked, so it's ok to model a rename this way so long as we say we |
3302 | * deleted entries before we add new ones. |
3303 | */ |
3304 | if (target_ip) |
3305 | xfs_dir_update_hook(dp: target_dp, ip: target_ip, delta: -1, name: target_name); |
3306 | xfs_dir_update_hook(dp: src_dp, ip: src_ip, delta: -1, name: src_name); |
3307 | xfs_dir_update_hook(dp: target_dp, ip: src_ip, delta: 1, name: target_name); |
3308 | if (wip) |
3309 | xfs_dir_update_hook(dp: src_dp, ip: wip, delta: 1, name: src_name); |
3310 | |
3311 | error = xfs_finish_rename(tp); |
3312 | if (wip) |
3313 | xfs_irele(ip: wip); |
3314 | return error; |
3315 | |
3316 | out_trans_cancel: |
3317 | xfs_trans_cancel(tp); |
3318 | out_release_wip: |
3319 | if (wip) |
3320 | xfs_irele(ip: wip); |
3321 | if (error == -ENOSPC && nospace_error) |
3322 | error = nospace_error; |
3323 | return error; |
3324 | } |
3325 | |
3326 | static int |
3327 | xfs_iflush( |
3328 | struct xfs_inode *ip, |
3329 | struct xfs_buf *bp) |
3330 | { |
3331 | struct xfs_inode_log_item *iip = ip->i_itemp; |
3332 | struct xfs_dinode *dip; |
3333 | struct xfs_mount *mp = ip->i_mount; |
3334 | int error; |
3335 | |
3336 | xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); |
3337 | ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); |
3338 | ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || |
3339 | ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); |
3340 | ASSERT(iip->ili_item.li_buf == bp); |
3341 | |
3342 | dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); |
3343 | |
3344 | /* |
3345 | * We don't flush the inode if any of the following checks fail, but we |
3346 | * do still update the log item and attach to the backing buffer as if |
3347 | * the flush happened. This is a formality to facilitate predictable |
3348 | * error handling as the caller will shutdown and fail the buffer. |
3349 | */ |
3350 | error = -EFSCORRUPTED; |
3351 | if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), |
3352 | mp, XFS_ERRTAG_IFLUSH_1)) { |
3353 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
3354 | "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, |
3355 | __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); |
3356 | goto flush_out; |
3357 | } |
3358 | if (S_ISREG(VFS_I(ip)->i_mode)) { |
3359 | if (XFS_TEST_ERROR( |
3360 | ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && |
3361 | ip->i_df.if_format != XFS_DINODE_FMT_BTREE, |
3362 | mp, XFS_ERRTAG_IFLUSH_3)) { |
3363 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
3364 | "%s: Bad regular inode %llu, ptr "PTR_FMT, |
3365 | __func__, ip->i_ino, ip); |
3366 | goto flush_out; |
3367 | } |
3368 | } else if (S_ISDIR(VFS_I(ip)->i_mode)) { |
3369 | if (XFS_TEST_ERROR( |
3370 | ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && |
3371 | ip->i_df.if_format != XFS_DINODE_FMT_BTREE && |
3372 | ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, |
3373 | mp, XFS_ERRTAG_IFLUSH_4)) { |
3374 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
3375 | "%s: Bad directory inode %llu, ptr "PTR_FMT, |
3376 | __func__, ip->i_ino, ip); |
3377 | goto flush_out; |
3378 | } |
3379 | } |
3380 | if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > |
3381 | ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { |
3382 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
3383 | "%s: detected corrupt incore inode %llu, " |
3384 | "total extents = %llu nblocks = %lld, ptr "PTR_FMT, |
3385 | __func__, ip->i_ino, |
3386 | ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af), |
3387 | ip->i_nblocks, ip); |
3388 | goto flush_out; |
3389 | } |
3390 | if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, |
3391 | mp, XFS_ERRTAG_IFLUSH_6)) { |
3392 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
3393 | "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, |
3394 | __func__, ip->i_ino, ip->i_forkoff, ip); |
3395 | goto flush_out; |
3396 | } |
3397 | |
3398 | /* |
3399 | * Inode item log recovery for v2 inodes are dependent on the flushiter |
3400 | * count for correct sequencing. We bump the flush iteration count so |
3401 | * we can detect flushes which postdate a log record during recovery. |
3402 | * This is redundant as we now log every change and hence this can't |
3403 | * happen but we need to still do it to ensure backwards compatibility |
3404 | * with old kernels that predate logging all inode changes. |
3405 | */ |
3406 | if (!xfs_has_v3inodes(mp)) |
3407 | ip->i_flushiter++; |
3408 | |
3409 | /* |
3410 | * If there are inline format data / attr forks attached to this inode, |
3411 | * make sure they are not corrupt. |
3412 | */ |
3413 | if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && |
3414 | xfs_ifork_verify_local_data(ip)) |
3415 | goto flush_out; |
3416 | if (xfs_inode_has_attr_fork(ip) && |
3417 | ip->i_af.if_format == XFS_DINODE_FMT_LOCAL && |
3418 | xfs_ifork_verify_local_attr(ip)) |
3419 | goto flush_out; |
3420 | |
3421 | /* |
3422 | * Copy the dirty parts of the inode into the on-disk inode. We always |
3423 | * copy out the core of the inode, because if the inode is dirty at all |
3424 | * the core must be. |
3425 | */ |
3426 | xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); |
3427 | |
3428 | /* Wrap, we never let the log put out DI_MAX_FLUSH */ |
3429 | if (!xfs_has_v3inodes(mp)) { |
3430 | if (ip->i_flushiter == DI_MAX_FLUSH) |
3431 | ip->i_flushiter = 0; |
3432 | } |
3433 | |
3434 | xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); |
3435 | if (xfs_inode_has_attr_fork(ip)) |
3436 | xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); |
3437 | |
3438 | /* |
3439 | * We've recorded everything logged in the inode, so we'd like to clear |
3440 | * the ili_fields bits so we don't log and flush things unnecessarily. |
3441 | * However, we can't stop logging all this information until the data |
3442 | * we've copied into the disk buffer is written to disk. If we did we |
3443 | * might overwrite the copy of the inode in the log with all the data |
3444 | * after re-logging only part of it, and in the face of a crash we |
3445 | * wouldn't have all the data we need to recover. |
3446 | * |
3447 | * What we do is move the bits to the ili_last_fields field. When |
3448 | * logging the inode, these bits are moved back to the ili_fields field. |
3449 | * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since |
3450 | * we know that the information those bits represent is permanently on |
3451 | * disk. As long as the flush completes before the inode is logged |
3452 | * again, then both ili_fields and ili_last_fields will be cleared. |
3453 | */ |
3454 | error = 0; |
3455 | flush_out: |
3456 | spin_lock(lock: &iip->ili_lock); |
3457 | iip->ili_last_fields = iip->ili_fields; |
3458 | iip->ili_fields = 0; |
3459 | iip->ili_fsync_fields = 0; |
3460 | spin_unlock(lock: &iip->ili_lock); |
3461 | |
3462 | /* |
3463 | * Store the current LSN of the inode so that we can tell whether the |
3464 | * item has moved in the AIL from xfs_buf_inode_iodone(). |
3465 | */ |
3466 | xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, |
3467 | &iip->ili_item.li_lsn); |
3468 | |
3469 | /* generate the checksum. */ |
3470 | xfs_dinode_calc_crc(mp, dip); |
3471 | if (error) |
3472 | xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); |
3473 | return error; |
3474 | } |
3475 | |
3476 | /* |
3477 | * Non-blocking flush of dirty inode metadata into the backing buffer. |
3478 | * |
3479 | * The caller must have a reference to the inode and hold the cluster buffer |
3480 | * locked. The function will walk across all the inodes on the cluster buffer it |
3481 | * can find and lock without blocking, and flush them to the cluster buffer. |
3482 | * |
3483 | * On successful flushing of at least one inode, the caller must write out the |
3484 | * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and |
3485 | * the caller needs to release the buffer. On failure, the filesystem will be |
3486 | * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED |
3487 | * will be returned. |
3488 | */ |
3489 | int |
3490 | xfs_iflush_cluster( |
3491 | struct xfs_buf *bp) |
3492 | { |
3493 | struct xfs_mount *mp = bp->b_mount; |
3494 | struct xfs_log_item *lip, *n; |
3495 | struct xfs_inode *ip; |
3496 | struct xfs_inode_log_item *iip; |
3497 | int clcount = 0; |
3498 | int error = 0; |
3499 | |
3500 | /* |
3501 | * We must use the safe variant here as on shutdown xfs_iflush_abort() |
3502 | * will remove itself from the list. |
3503 | */ |
3504 | list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { |
3505 | iip = (struct xfs_inode_log_item *)lip; |
3506 | ip = iip->ili_inode; |
3507 | |
3508 | /* |
3509 | * Quick and dirty check to avoid locks if possible. |
3510 | */ |
3511 | if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) |
3512 | continue; |
3513 | if (xfs_ipincount(ip)) |
3514 | continue; |
3515 | |
3516 | /* |
3517 | * The inode is still attached to the buffer, which means it is |
3518 | * dirty but reclaim might try to grab it. Check carefully for |
3519 | * that, and grab the ilock while still holding the i_flags_lock |
3520 | * to guarantee reclaim will not be able to reclaim this inode |
3521 | * once we drop the i_flags_lock. |
3522 | */ |
3523 | spin_lock(lock: &ip->i_flags_lock); |
3524 | ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); |
3525 | if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { |
3526 | spin_unlock(lock: &ip->i_flags_lock); |
3527 | continue; |
3528 | } |
3529 | |
3530 | /* |
3531 | * ILOCK will pin the inode against reclaim and prevent |
3532 | * concurrent transactions modifying the inode while we are |
3533 | * flushing the inode. If we get the lock, set the flushing |
3534 | * state before we drop the i_flags_lock. |
3535 | */ |
3536 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { |
3537 | spin_unlock(lock: &ip->i_flags_lock); |
3538 | continue; |
3539 | } |
3540 | __xfs_iflags_set(ip, XFS_IFLUSHING); |
3541 | spin_unlock(lock: &ip->i_flags_lock); |
3542 | |
3543 | /* |
3544 | * Abort flushing this inode if we are shut down because the |
3545 | * inode may not currently be in the AIL. This can occur when |
3546 | * log I/O failure unpins the inode without inserting into the |
3547 | * AIL, leaving a dirty/unpinned inode attached to the buffer |
3548 | * that otherwise looks like it should be flushed. |
3549 | */ |
3550 | if (xlog_is_shutdown(log: mp->m_log)) { |
3551 | xfs_iunpin_wait(ip); |
3552 | xfs_iflush_abort(ip); |
3553 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
3554 | error = -EIO; |
3555 | continue; |
3556 | } |
3557 | |
3558 | /* don't block waiting on a log force to unpin dirty inodes */ |
3559 | if (xfs_ipincount(ip)) { |
3560 | xfs_iflags_clear(ip, XFS_IFLUSHING); |
3561 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
3562 | continue; |
3563 | } |
3564 | |
3565 | if (!xfs_inode_clean(ip)) |
3566 | error = xfs_iflush(ip, bp); |
3567 | else |
3568 | xfs_iflags_clear(ip, XFS_IFLUSHING); |
3569 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
3570 | if (error) |
3571 | break; |
3572 | clcount++; |
3573 | } |
3574 | |
3575 | if (error) { |
3576 | /* |
3577 | * Shutdown first so we kill the log before we release this |
3578 | * buffer. If it is an INODE_ALLOC buffer and pins the tail |
3579 | * of the log, failing it before the _log_ is shut down can |
3580 | * result in the log tail being moved forward in the journal |
3581 | * on disk because log writes can still be taking place. Hence |
3582 | * unpinning the tail will allow the ICREATE intent to be |
3583 | * removed from the log an recovery will fail with uninitialised |
3584 | * inode cluster buffers. |
3585 | */ |
3586 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); |
3587 | bp->b_flags |= XBF_ASYNC; |
3588 | xfs_buf_ioend_fail(bp); |
3589 | return error; |
3590 | } |
3591 | |
3592 | if (!clcount) |
3593 | return -EAGAIN; |
3594 | |
3595 | XFS_STATS_INC(mp, xs_icluster_flushcnt); |
3596 | XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); |
3597 | return 0; |
3598 | |
3599 | } |
3600 | |
3601 | /* Release an inode. */ |
3602 | void |
3603 | xfs_irele( |
3604 | struct xfs_inode *ip) |
3605 | { |
3606 | trace_xfs_irele(ip, _RET_IP_); |
3607 | iput(VFS_I(ip)); |
3608 | } |
3609 | |
3610 | /* |
3611 | * Ensure all commited transactions touching the inode are written to the log. |
3612 | */ |
3613 | int |
3614 | xfs_log_force_inode( |
3615 | struct xfs_inode *ip) |
3616 | { |
3617 | xfs_csn_t seq = 0; |
3618 | |
3619 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
3620 | if (xfs_ipincount(ip)) |
3621 | seq = ip->i_itemp->ili_commit_seq; |
3622 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
3623 | |
3624 | if (!seq) |
3625 | return 0; |
3626 | return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL); |
3627 | } |
3628 | |
3629 | /* |
3630 | * Grab the exclusive iolock for a data copy from src to dest, making sure to |
3631 | * abide vfs locking order (lowest pointer value goes first) and breaking the |
3632 | * layout leases before proceeding. The loop is needed because we cannot call |
3633 | * the blocking break_layout() with the iolocks held, and therefore have to |
3634 | * back out both locks. |
3635 | */ |
3636 | static int |
3637 | xfs_iolock_two_inodes_and_break_layout( |
3638 | struct inode *src, |
3639 | struct inode *dest) |
3640 | { |
3641 | int error; |
3642 | |
3643 | if (src > dest) |
3644 | swap(src, dest); |
3645 | |
3646 | retry: |
3647 | /* Wait to break both inodes' layouts before we start locking. */ |
3648 | error = break_layout(inode: src, wait: true); |
3649 | if (error) |
3650 | return error; |
3651 | if (src != dest) { |
3652 | error = break_layout(inode: dest, wait: true); |
3653 | if (error) |
3654 | return error; |
3655 | } |
3656 | |
3657 | /* Lock one inode and make sure nobody got in and leased it. */ |
3658 | inode_lock(inode: src); |
3659 | error = break_layout(inode: src, wait: false); |
3660 | if (error) { |
3661 | inode_unlock(inode: src); |
3662 | if (error == -EWOULDBLOCK) |
3663 | goto retry; |
3664 | return error; |
3665 | } |
3666 | |
3667 | if (src == dest) |
3668 | return 0; |
3669 | |
3670 | /* Lock the other inode and make sure nobody got in and leased it. */ |
3671 | inode_lock_nested(inode: dest, subclass: I_MUTEX_NONDIR2); |
3672 | error = break_layout(inode: dest, wait: false); |
3673 | if (error) { |
3674 | inode_unlock(inode: src); |
3675 | inode_unlock(inode: dest); |
3676 | if (error == -EWOULDBLOCK) |
3677 | goto retry; |
3678 | return error; |
3679 | } |
3680 | |
3681 | return 0; |
3682 | } |
3683 | |
3684 | static int |
3685 | xfs_mmaplock_two_inodes_and_break_dax_layout( |
3686 | struct xfs_inode *ip1, |
3687 | struct xfs_inode *ip2) |
3688 | { |
3689 | int error; |
3690 | bool retry; |
3691 | struct page *page; |
3692 | |
3693 | if (ip1->i_ino > ip2->i_ino) |
3694 | swap(ip1, ip2); |
3695 | |
3696 | again: |
3697 | retry = false; |
3698 | /* Lock the first inode */ |
3699 | xfs_ilock(ip: ip1, XFS_MMAPLOCK_EXCL); |
3700 | error = xfs_break_dax_layouts(inode: VFS_I(ip: ip1), retry: &retry); |
3701 | if (error || retry) { |
3702 | xfs_iunlock(ip: ip1, XFS_MMAPLOCK_EXCL); |
3703 | if (error == 0 && retry) |
3704 | goto again; |
3705 | return error; |
3706 | } |
3707 | |
3708 | if (ip1 == ip2) |
3709 | return 0; |
3710 | |
3711 | /* Nested lock the second inode */ |
3712 | xfs_ilock(ip: ip2, lock_flags: xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, subclass: 1)); |
3713 | /* |
3714 | * We cannot use xfs_break_dax_layouts() directly here because it may |
3715 | * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable |
3716 | * for this nested lock case. |
3717 | */ |
3718 | page = dax_layout_busy_page(mapping: VFS_I(ip: ip2)->i_mapping); |
3719 | if (page && page_ref_count(page) != 1) { |
3720 | xfs_iunlock(ip: ip2, XFS_MMAPLOCK_EXCL); |
3721 | xfs_iunlock(ip: ip1, XFS_MMAPLOCK_EXCL); |
3722 | goto again; |
3723 | } |
3724 | |
3725 | return 0; |
3726 | } |
3727 | |
3728 | /* |
3729 | * Lock two inodes so that userspace cannot initiate I/O via file syscalls or |
3730 | * mmap activity. |
3731 | */ |
3732 | int |
3733 | xfs_ilock2_io_mmap( |
3734 | struct xfs_inode *ip1, |
3735 | struct xfs_inode *ip2) |
3736 | { |
3737 | int ret; |
3738 | |
3739 | ret = xfs_iolock_two_inodes_and_break_layout(src: VFS_I(ip: ip1), dest: VFS_I(ip: ip2)); |
3740 | if (ret) |
3741 | return ret; |
3742 | |
3743 | if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { |
3744 | ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2); |
3745 | if (ret) { |
3746 | inode_unlock(inode: VFS_I(ip: ip2)); |
3747 | if (ip1 != ip2) |
3748 | inode_unlock(inode: VFS_I(ip: ip1)); |
3749 | return ret; |
3750 | } |
3751 | } else |
3752 | filemap_invalidate_lock_two(mapping1: VFS_I(ip: ip1)->i_mapping, |
3753 | mapping2: VFS_I(ip: ip2)->i_mapping); |
3754 | |
3755 | return 0; |
3756 | } |
3757 | |
3758 | /* Unlock both inodes to allow IO and mmap activity. */ |
3759 | void |
3760 | xfs_iunlock2_io_mmap( |
3761 | struct xfs_inode *ip1, |
3762 | struct xfs_inode *ip2) |
3763 | { |
3764 | if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { |
3765 | xfs_iunlock(ip: ip2, XFS_MMAPLOCK_EXCL); |
3766 | if (ip1 != ip2) |
3767 | xfs_iunlock(ip: ip1, XFS_MMAPLOCK_EXCL); |
3768 | } else |
3769 | filemap_invalidate_unlock_two(mapping1: VFS_I(ip: ip1)->i_mapping, |
3770 | mapping2: VFS_I(ip: ip2)->i_mapping); |
3771 | |
3772 | inode_unlock(inode: VFS_I(ip: ip2)); |
3773 | if (ip1 != ip2) |
3774 | inode_unlock(inode: VFS_I(ip: ip1)); |
3775 | } |
3776 | |
3777 | /* Drop the MMAPLOCK and the IOLOCK after a remap completes. */ |
3778 | void |
3779 | xfs_iunlock2_remapping( |
3780 | struct xfs_inode *ip1, |
3781 | struct xfs_inode *ip2) |
3782 | { |
3783 | xfs_iflags_clear(ip: ip1, XFS_IREMAPPING); |
3784 | |
3785 | if (ip1 != ip2) |
3786 | xfs_iunlock(ip: ip1, XFS_MMAPLOCK_SHARED); |
3787 | xfs_iunlock(ip: ip2, XFS_MMAPLOCK_EXCL); |
3788 | |
3789 | if (ip1 != ip2) |
3790 | inode_unlock_shared(inode: VFS_I(ip: ip1)); |
3791 | inode_unlock(inode: VFS_I(ip: ip2)); |
3792 | } |
3793 | |
3794 | /* |
3795 | * Reload the incore inode list for this inode. Caller should ensure that |
3796 | * the link count cannot change, either by taking ILOCK_SHARED or otherwise |
3797 | * preventing other threads from executing. |
3798 | */ |
3799 | int |
3800 | xfs_inode_reload_unlinked_bucket( |
3801 | struct xfs_trans *tp, |
3802 | struct xfs_inode *ip) |
3803 | { |
3804 | struct xfs_mount *mp = tp->t_mountp; |
3805 | struct xfs_buf *agibp; |
3806 | struct xfs_agi *agi; |
3807 | struct xfs_perag *pag; |
3808 | xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); |
3809 | xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); |
3810 | xfs_agino_t prev_agino, next_agino; |
3811 | unsigned int bucket; |
3812 | bool foundit = false; |
3813 | int error; |
3814 | |
3815 | /* Grab the first inode in the list */ |
3816 | pag = xfs_perag_get(mp, agno); |
3817 | error = xfs_ialloc_read_agi(pag, tp, &agibp); |
3818 | xfs_perag_put(pag); |
3819 | if (error) |
3820 | return error; |
3821 | |
3822 | /* |
3823 | * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the |
3824 | * incore unlinked list pointers for this inode. Check once more to |
3825 | * see if we raced with anyone else to reload the unlinked list. |
3826 | */ |
3827 | if (!xfs_inode_unlinked_incomplete(ip)) { |
3828 | foundit = true; |
3829 | goto out_agibp; |
3830 | } |
3831 | |
3832 | bucket = agino % XFS_AGI_UNLINKED_BUCKETS; |
3833 | agi = agibp->b_addr; |
3834 | |
3835 | trace_xfs_inode_reload_unlinked_bucket(ip); |
3836 | |
3837 | xfs_info_ratelimited(mp, |
3838 | "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.", |
3839 | agino, agno); |
3840 | |
3841 | prev_agino = NULLAGINO; |
3842 | next_agino = be32_to_cpu(agi->agi_unlinked[bucket]); |
3843 | while (next_agino != NULLAGINO) { |
3844 | struct xfs_inode *next_ip = NULL; |
3845 | |
3846 | /* Found this caller's inode, set its backlink. */ |
3847 | if (next_agino == agino) { |
3848 | next_ip = ip; |
3849 | next_ip->i_prev_unlinked = prev_agino; |
3850 | foundit = true; |
3851 | goto next_inode; |
3852 | } |
3853 | |
3854 | /* Try in-memory lookup first. */ |
3855 | next_ip = xfs_iunlink_lookup(pag, next_agino); |
3856 | if (next_ip) |
3857 | goto next_inode; |
3858 | |
3859 | /* Inode not in memory, try reloading it. */ |
3860 | error = xfs_iunlink_reload_next(tp, agibp, prev_agino, |
3861 | next_agino); |
3862 | if (error) |
3863 | break; |
3864 | |
3865 | /* Grab the reloaded inode. */ |
3866 | next_ip = xfs_iunlink_lookup(pag, next_agino); |
3867 | if (!next_ip) { |
3868 | /* No incore inode at all? We reloaded it... */ |
3869 | ASSERT(next_ip != NULL); |
3870 | error = -EFSCORRUPTED; |
3871 | break; |
3872 | } |
3873 | |
3874 | next_inode: |
3875 | prev_agino = next_agino; |
3876 | next_agino = next_ip->i_next_unlinked; |
3877 | } |
3878 | |
3879 | out_agibp: |
3880 | xfs_trans_brelse(tp, agibp); |
3881 | /* Should have found this inode somewhere in the iunlinked bucket. */ |
3882 | if (!error && !foundit) |
3883 | error = -EFSCORRUPTED; |
3884 | return error; |
3885 | } |
3886 | |
3887 | /* Decide if this inode is missing its unlinked list and reload it. */ |
3888 | int |
3889 | xfs_inode_reload_unlinked( |
3890 | struct xfs_inode *ip) |
3891 | { |
3892 | struct xfs_trans *tp; |
3893 | int error; |
3894 | |
3895 | error = xfs_trans_alloc_empty(mp: ip->i_mount, tpp: &tp); |
3896 | if (error) |
3897 | return error; |
3898 | |
3899 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
3900 | if (xfs_inode_unlinked_incomplete(ip)) |
3901 | error = xfs_inode_reload_unlinked_bucket(tp, ip); |
3902 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
3903 | xfs_trans_cancel(tp); |
3904 | |
3905 | return error; |
3906 | } |
3907 | |
3908 | /* Has this inode fork been zapped by repair? */ |
3909 | bool |
3910 | xfs_ifork_zapped( |
3911 | const struct xfs_inode *ip, |
3912 | int whichfork) |
3913 | { |
3914 | unsigned int datamask = 0; |
3915 | |
3916 | switch (whichfork) { |
3917 | case XFS_DATA_FORK: |
3918 | switch (ip->i_vnode.i_mode & S_IFMT) { |
3919 | case S_IFDIR: |
3920 | datamask = XFS_SICK_INO_DIR_ZAPPED; |
3921 | break; |
3922 | case S_IFLNK: |
3923 | datamask = XFS_SICK_INO_SYMLINK_ZAPPED; |
3924 | break; |
3925 | } |
3926 | return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask); |
3927 | case XFS_ATTR_FORK: |
3928 | return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED; |
3929 | default: |
3930 | return false; |
3931 | } |
3932 | } |
3933 | |
3934 | /* Compute the number of data and realtime blocks used by a file. */ |
3935 | void |
3936 | xfs_inode_count_blocks( |
3937 | struct xfs_trans *tp, |
3938 | struct xfs_inode *ip, |
3939 | xfs_filblks_t *dblocks, |
3940 | xfs_filblks_t *rblocks) |
3941 | { |
3942 | struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); |
3943 | |
3944 | *rblocks = 0; |
3945 | if (XFS_IS_REALTIME_INODE(ip)) |
3946 | xfs_bmap_count_leaves(ifp, rblocks); |
3947 | *dblocks = ip->i_nblocks - *rblocks; |
3948 | } |
3949 |
Definitions
- xfs_inode_cache
- xfs_get_extsz_hint
- xfs_get_cowextsz_hint
- xfs_ilock_data_map_shared
- xfs_ilock_attr_map_shared
- xfs_lock_flags_assert
- xfs_ilock
- xfs_ilock_nowait
- xfs_iunlock
- xfs_ilock_demote
- xfs_assert_ilocked
- xfs_lockdep_subclass_ok
- xfs_lock_inumorder
- xfs_lock_inodes
- xfs_lock_two_inodes
- xfs_ip2xflags
- xfs_lookup
- xfs_inode_inherit_flags
- xfs_inode_inherit_flags2
- xfs_init_new_inode
- xfs_droplink
- xfs_bumplink
- xfs_dir_hooks_switch
- xfs_dir_hook_disable
- xfs_dir_hook_enable
- xfs_dir_update_hook
- xfs_dir_hook_add
- xfs_dir_hook_del
- xfs_dir_hook_setup
- xfs_create
- xfs_create_tmpfile
- xfs_link
- xfs_itruncate_clear_reflink_flags
- xfs_itruncate_extents_flags
- xfs_release
- xfs_inactive_truncate
- xfs_inactive_ifree
- xfs_inode_needs_inactive
- xfs_inactive_health
- xfs_inactive
- xfs_iunlink_lookup
- xfs_iunlink_update_backref
- xfs_iunlink_update_bucket
- xfs_iunlink_reload_next
- xfs_iunlink_insert_inode
- xfs_iunlink
- xfs_iunlink_remove_inode
- xfs_iunlink_remove
- xfs_ifree_mark_inode_stale
- xfs_ifree_cluster
- xfs_ifree
- xfs_iunpin
- __xfs_iunpin_wait
- xfs_iunpin_wait
- xfs_remove
- xfs_sort_for_rename
- xfs_finish_rename
- xfs_cross_rename
- xfs_rename_alloc_whiteout
- xfs_rename
- xfs_iflush
- xfs_iflush_cluster
- xfs_irele
- xfs_log_force_inode
- xfs_iolock_two_inodes_and_break_layout
- xfs_mmaplock_two_inodes_and_break_dax_layout
- xfs_ilock2_io_mmap
- xfs_iunlock2_io_mmap
- xfs_iunlock2_remapping
- xfs_inode_reload_unlinked_bucket
- xfs_inode_reload_unlinked
- xfs_ifork_zapped
Improve your Profiling and Debugging skills
Find out more