1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright (c) 2022 Fujitsu. All Rights Reserved. |
4 | */ |
5 | |
6 | #include "xfs.h" |
7 | #include "xfs_shared.h" |
8 | #include "xfs_format.h" |
9 | #include "xfs_log_format.h" |
10 | #include "xfs_trans_resv.h" |
11 | #include "xfs_mount.h" |
12 | #include "xfs_alloc.h" |
13 | #include "xfs_bit.h" |
14 | #include "xfs_btree.h" |
15 | #include "xfs_inode.h" |
16 | #include "xfs_icache.h" |
17 | #include "xfs_rmap.h" |
18 | #include "xfs_rmap_btree.h" |
19 | #include "xfs_rtalloc.h" |
20 | #include "xfs_trans.h" |
21 | #include "xfs_ag.h" |
22 | |
23 | #include <linux/mm.h> |
24 | #include <linux/dax.h> |
25 | #include <linux/fs.h> |
26 | |
27 | struct xfs_failure_info { |
28 | xfs_agblock_t startblock; |
29 | xfs_extlen_t blockcount; |
30 | int mf_flags; |
31 | bool want_shutdown; |
32 | }; |
33 | |
34 | static pgoff_t |
35 | xfs_failure_pgoff( |
36 | struct xfs_mount *mp, |
37 | const struct xfs_rmap_irec *rec, |
38 | const struct xfs_failure_info *notify) |
39 | { |
40 | loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); |
41 | |
42 | if (notify->startblock > rec->rm_startblock) |
43 | pos += XFS_FSB_TO_B(mp, |
44 | notify->startblock - rec->rm_startblock); |
45 | return pos >> PAGE_SHIFT; |
46 | } |
47 | |
48 | static unsigned long |
49 | xfs_failure_pgcnt( |
50 | struct xfs_mount *mp, |
51 | const struct xfs_rmap_irec *rec, |
52 | const struct xfs_failure_info *notify) |
53 | { |
54 | xfs_agblock_t end_rec; |
55 | xfs_agblock_t end_notify; |
56 | xfs_agblock_t start_cross; |
57 | xfs_agblock_t end_cross; |
58 | |
59 | start_cross = max(rec->rm_startblock, notify->startblock); |
60 | |
61 | end_rec = rec->rm_startblock + rec->rm_blockcount; |
62 | end_notify = notify->startblock + notify->blockcount; |
63 | end_cross = min(end_rec, end_notify); |
64 | |
65 | return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT; |
66 | } |
67 | |
68 | static int |
69 | xfs_dax_failure_fn( |
70 | struct xfs_btree_cur *cur, |
71 | const struct xfs_rmap_irec *rec, |
72 | void *data) |
73 | { |
74 | struct xfs_mount *mp = cur->bc_mp; |
75 | struct xfs_inode *ip; |
76 | struct xfs_failure_info *notify = data; |
77 | struct address_space *mapping; |
78 | pgoff_t pgoff; |
79 | unsigned long pgcnt; |
80 | int error = 0; |
81 | |
82 | if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || |
83 | (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { |
84 | /* Continue the query because this isn't a failure. */ |
85 | if (notify->mf_flags & MF_MEM_PRE_REMOVE) |
86 | return 0; |
87 | notify->want_shutdown = true; |
88 | return 0; |
89 | } |
90 | |
91 | /* Get files that incore, filter out others that are not in use. */ |
92 | error = xfs_iget(mp, tp: cur->bc_tp, ino: rec->rm_owner, XFS_IGET_INCORE, |
93 | lock_flags: 0, ipp: &ip); |
94 | /* Continue the rmap query if the inode isn't incore */ |
95 | if (error == -ENODATA) |
96 | return 0; |
97 | if (error) { |
98 | notify->want_shutdown = true; |
99 | return 0; |
100 | } |
101 | |
102 | mapping = VFS_I(ip)->i_mapping; |
103 | pgoff = xfs_failure_pgoff(mp, rec, notify); |
104 | pgcnt = xfs_failure_pgcnt(mp, rec, notify); |
105 | |
106 | /* Continue the rmap query if the inode isn't a dax file. */ |
107 | if (dax_mapping(mapping)) |
108 | error = mf_dax_kill_procs(mapping, index: pgoff, count: pgcnt, |
109 | mf_flags: notify->mf_flags); |
110 | |
111 | /* Invalidate the cache in dax pages. */ |
112 | if (notify->mf_flags & MF_MEM_PRE_REMOVE) |
113 | invalidate_inode_pages2_range(mapping, start: pgoff, |
114 | end: pgoff + pgcnt - 1); |
115 | |
116 | xfs_irele(ip); |
117 | return error; |
118 | } |
119 | |
120 | static int |
121 | xfs_dax_notify_failure_freeze( |
122 | struct xfs_mount *mp) |
123 | { |
124 | struct super_block *sb = mp->m_super; |
125 | int error; |
126 | |
127 | error = freeze_super(super: sb, who: FREEZE_HOLDER_KERNEL); |
128 | if (error) |
129 | xfs_emerg(mp, "already frozen by kernel, err=%d" , error); |
130 | |
131 | return error; |
132 | } |
133 | |
134 | static void |
135 | xfs_dax_notify_failure_thaw( |
136 | struct xfs_mount *mp, |
137 | bool kernel_frozen) |
138 | { |
139 | struct super_block *sb = mp->m_super; |
140 | int error; |
141 | |
142 | if (kernel_frozen) { |
143 | error = thaw_super(super: sb, who: FREEZE_HOLDER_KERNEL); |
144 | if (error) |
145 | xfs_emerg(mp, "still frozen after notify failure, err=%d" , |
146 | error); |
147 | } |
148 | |
149 | /* |
150 | * Also thaw userspace call anyway because the device is about to be |
151 | * removed immediately. |
152 | */ |
153 | thaw_super(super: sb, who: FREEZE_HOLDER_USERSPACE); |
154 | } |
155 | |
156 | static int |
157 | xfs_dax_notify_ddev_failure( |
158 | struct xfs_mount *mp, |
159 | xfs_daddr_t daddr, |
160 | xfs_daddr_t bblen, |
161 | int mf_flags) |
162 | { |
163 | struct xfs_failure_info notify = { .mf_flags = mf_flags }; |
164 | struct xfs_trans *tp = NULL; |
165 | struct xfs_btree_cur *cur = NULL; |
166 | struct xfs_buf *agf_bp = NULL; |
167 | int error = 0; |
168 | bool kernel_frozen = false; |
169 | xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); |
170 | xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); |
171 | xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, |
172 | daddr + bblen - 1); |
173 | xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); |
174 | |
175 | if (mf_flags & MF_MEM_PRE_REMOVE) { |
176 | xfs_info(mp, "Device is about to be removed!" ); |
177 | /* |
178 | * Freeze fs to prevent new mappings from being created. |
179 | * - Keep going on if others already hold the kernel forzen. |
180 | * - Keep going on if other errors too because this device is |
181 | * starting to fail. |
182 | * - If kernel frozen state is hold successfully here, thaw it |
183 | * here as well at the end. |
184 | */ |
185 | kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; |
186 | } |
187 | |
188 | error = xfs_trans_alloc_empty(mp, tpp: &tp); |
189 | if (error) |
190 | goto out; |
191 | |
192 | for (; agno <= end_agno; agno++) { |
193 | struct xfs_rmap_irec ri_low = { }; |
194 | struct xfs_rmap_irec ri_high; |
195 | struct xfs_agf *agf; |
196 | struct xfs_perag *pag; |
197 | xfs_agblock_t range_agend; |
198 | |
199 | pag = xfs_perag_get(mp, agno); |
200 | error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); |
201 | if (error) { |
202 | xfs_perag_put(pag); |
203 | break; |
204 | } |
205 | |
206 | cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); |
207 | |
208 | /* |
209 | * Set the rmap range from ri_low to ri_high, which represents |
210 | * a [start, end] where we looking for the files or metadata. |
211 | */ |
212 | memset(&ri_high, 0xFF, sizeof(ri_high)); |
213 | ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno); |
214 | if (agno == end_agno) |
215 | ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); |
216 | |
217 | agf = agf_bp->b_addr; |
218 | range_agend = min(be32_to_cpu(agf->agf_length) - 1, |
219 | ri_high.rm_startblock); |
220 | notify.startblock = ri_low.rm_startblock; |
221 | notify.blockcount = range_agend + 1 - ri_low.rm_startblock; |
222 | |
223 | error = xfs_rmap_query_range(cur, &ri_low, &ri_high, |
224 | xfs_dax_failure_fn, ¬ify); |
225 | xfs_btree_del_cursor(cur, error); |
226 | xfs_trans_brelse(tp, agf_bp); |
227 | xfs_perag_put(pag); |
228 | if (error) |
229 | break; |
230 | |
231 | fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0); |
232 | } |
233 | |
234 | xfs_trans_cancel(tp); |
235 | |
236 | /* |
237 | * Shutdown fs from a force umount in pre-remove case which won't fail, |
238 | * so errors can be ignored. Otherwise, shutdown the filesystem with |
239 | * CORRUPT flag if error occured or notify.want_shutdown was set during |
240 | * RMAP querying. |
241 | */ |
242 | if (mf_flags & MF_MEM_PRE_REMOVE) |
243 | xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); |
244 | else if (error || notify.want_shutdown) { |
245 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); |
246 | if (!error) |
247 | error = -EFSCORRUPTED; |
248 | } |
249 | |
250 | out: |
251 | /* Thaw the fs if it has been frozen before. */ |
252 | if (mf_flags & MF_MEM_PRE_REMOVE) |
253 | xfs_dax_notify_failure_thaw(mp, kernel_frozen); |
254 | |
255 | return error; |
256 | } |
257 | |
258 | static int |
259 | xfs_dax_notify_failure( |
260 | struct dax_device *dax_dev, |
261 | u64 offset, |
262 | u64 len, |
263 | int mf_flags) |
264 | { |
265 | struct xfs_mount *mp = dax_holder(dax_dev); |
266 | u64 ddev_start; |
267 | u64 ddev_end; |
268 | |
269 | if (!(mp->m_super->s_flags & SB_BORN)) { |
270 | xfs_warn(mp, "filesystem is not ready for notify_failure()!" ); |
271 | return -EIO; |
272 | } |
273 | |
274 | if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { |
275 | xfs_debug(mp, |
276 | "notify_failure() not supported on realtime device!" ); |
277 | return -EOPNOTSUPP; |
278 | } |
279 | |
280 | if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && |
281 | mp->m_logdev_targp != mp->m_ddev_targp) { |
282 | /* |
283 | * In the pre-remove case the failure notification is attempting |
284 | * to trigger a force unmount. The expectation is that the |
285 | * device is still present, but its removal is in progress and |
286 | * can not be cancelled, proceed with accessing the log device. |
287 | */ |
288 | if (mf_flags & MF_MEM_PRE_REMOVE) |
289 | return 0; |
290 | xfs_err(mp, "ondisk log corrupt, shutting down fs!" ); |
291 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); |
292 | return -EFSCORRUPTED; |
293 | } |
294 | |
295 | if (!xfs_has_rmapbt(mp)) { |
296 | xfs_debug(mp, "notify_failure() needs rmapbt enabled!" ); |
297 | return -EOPNOTSUPP; |
298 | } |
299 | |
300 | ddev_start = mp->m_ddev_targp->bt_dax_part_off; |
301 | ddev_end = ddev_start + bdev_nr_bytes(bdev: mp->m_ddev_targp->bt_bdev) - 1; |
302 | |
303 | /* Notify failure on the whole device. */ |
304 | if (offset == 0 && len == U64_MAX) { |
305 | offset = ddev_start; |
306 | len = bdev_nr_bytes(bdev: mp->m_ddev_targp->bt_bdev); |
307 | } |
308 | |
309 | /* Ignore the range out of filesystem area */ |
310 | if (offset + len - 1 < ddev_start) |
311 | return -ENXIO; |
312 | if (offset > ddev_end) |
313 | return -ENXIO; |
314 | |
315 | /* Calculate the real range when it touches the boundary */ |
316 | if (offset > ddev_start) |
317 | offset -= ddev_start; |
318 | else { |
319 | len -= ddev_start - offset; |
320 | offset = 0; |
321 | } |
322 | if (offset + len - 1 > ddev_end) |
323 | len = ddev_end - offset + 1; |
324 | |
325 | return xfs_dax_notify_ddev_failure(mp, daddr: BTOBB(offset), bblen: BTOBB(len), |
326 | mf_flags); |
327 | } |
328 | |
329 | const struct dax_holder_operations xfs_dax_holder_operations = { |
330 | .notify_failure = xfs_dax_notify_failure, |
331 | }; |
332 | |