1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/ceph/ceph_debug.h> |
3 | |
4 | #include <linux/bug.h> |
5 | #include <linux/err.h> |
6 | #include <linux/random.h> |
7 | #include <linux/slab.h> |
8 | #include <linux/types.h> |
9 | |
10 | #include <linux/ceph/messenger.h> |
11 | #include <linux/ceph/decode.h> |
12 | |
13 | #include "mdsmap.h" |
14 | #include "mds_client.h" |
15 | #include "super.h" |
16 | |
/*
 * CEPH_MDS_IS_READY - is rank @i of the mdsmap usable?
 * @i:            rank, used as an index into m->m_info
 * @ignore_laggy: when true, a laggy rank still counts as ready
 *
 * Expects a pointer named "m" to the mdsmap in the caller's scope.
 * A rank is ready when its state is > 0 and, unless @ignore_laggy is
 * set, it is not marked laggy.
 *
 * The laggy test is grouped explicitly: "&&" binds tighter than "?:",
 * so an ungrouped "state > 0 && ignore_laggy ? true : !laggy" would
 * report a rank that is down (state <= 0) but not laggy as ready.
 */
#define CEPH_MDS_IS_READY(i, ignore_laggy) \
	(m->m_info[i].state > 0 && ((ignore_laggy) || !m->m_info[i].laggy))
19 | |
20 | static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) |
21 | { |
22 | int n = 0; |
23 | int i, j; |
24 | |
25 | /* count */ |
26 | for (i = 0; i < m->possible_max_rank; i++) |
27 | if (CEPH_MDS_IS_READY(i, ignore_laggy)) |
28 | n++; |
29 | if (n == 0) |
30 | return -1; |
31 | |
32 | /* pick */ |
33 | n = get_random_u32_below(ceil: n); |
34 | for (j = 0, i = 0; i < m->possible_max_rank; i++) { |
35 | if (CEPH_MDS_IS_READY(i, ignore_laggy)) |
36 | j++; |
37 | if (j > n) |
38 | break; |
39 | } |
40 | |
41 | return i; |
42 | } |
43 | |
44 | /* |
45 | * choose a random mds that is "up" (i.e. has a state > 0), or -1. |
46 | */ |
47 | int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) |
48 | { |
49 | int mds; |
50 | |
51 | mds = __mdsmap_get_random_mds(m, ignore_laggy: false); |
52 | if (mds == m->possible_max_rank || mds == -1) |
53 | mds = __mdsmap_get_random_mds(m, ignore_laggy: true); |
54 | |
55 | return mds == m->possible_max_rank ? -1 : mds; |
56 | } |
57 | |
/*
 * Skip one fixed-size field of @type without decoding it.
 *
 * @p:    pointer to the decode cursor; advanced by sizeof(@type)
 * @end:  one past the last readable byte
 * @type: type whose size is skipped
 * @bad:  label jumped to when fewer than sizeof(@type) bytes remain
 */
#define __decode_and_drop_type(p, end, type, bad)		\
	do {							\
		if (*(p) + sizeof(type) > (end))		\
			goto bad;				\
		*(p) += sizeof(type);				\
	} while (0)
64 | |
/*
 * Skip a counted sequence: a u32 element count followed by that many
 * elements of @type.  Jumps to @bad when the count or the elements do
 * not fit before @end.
 */
#define __decode_and_drop_set(p, end, type, bad)		\
	do {							\
		u32 nr_elems;					\
		size_t skip_len;				\
		ceph_decode_32_safe(p, end, nr_elems, bad);	\
		skip_len = sizeof(type) * nr_elems;		\
		ceph_decode_need(p, end, skip_len, bad);	\
		*(p) += skip_len;				\
	} while (0)
74 | |
/*
 * Skip a counted map: a u32 entry count followed by that many
 * (@ktype, @vtype) pairs of fixed size.  Jumps to @bad when the count
 * or the pairs do not fit before @end.
 */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)	\
	do {							\
		u32 nr_pairs;					\
		size_t skip_len;				\
		ceph_decode_32_safe(p, end, nr_pairs, bad);	\
		skip_len = (sizeof(ktype) + sizeof(vtype)) * nr_pairs; \
		ceph_decode_need(p, end, skip_len, bad);	\
		*(p) += skip_len;				\
	} while (0)
84 | |
85 | |
86 | static int __decode_and_drop_compat_set(void **p, void* end) |
87 | { |
88 | int i; |
89 | /* compat, ro_compat, incompat*/ |
90 | for (i = 0; i < 3; i++) { |
91 | u32 n; |
92 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); |
93 | /* mask */ |
94 | *p += sizeof(u64); |
95 | /* names (map<u64, string>) */ |
96 | n = ceph_decode_32(p); |
97 | while (n-- > 0) { |
98 | u32 len; |
99 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), |
100 | bad); |
101 | *p += sizeof(u64); |
102 | len = ceph_decode_32(p); |
103 | ceph_decode_need(p, end, len, bad); |
104 | *p += len; |
105 | } |
106 | } |
107 | return 0; |
108 | bad: |
109 | return -1; |
110 | } |
111 | |
112 | /* |
113 | * Decode an MDS map |
114 | * |
115 | * Ignore any fields we don't care about (there are quite a few of |
116 | * them). |
117 | */ |
118 | struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p, |
119 | void *end, bool msgr2) |
120 | { |
121 | struct ceph_client *cl = mdsc->fsc->client; |
122 | struct ceph_mdsmap *m; |
123 | const void *start = *p; |
124 | int i, j, n; |
125 | int err; |
126 | u8 mdsmap_v; |
127 | u16 mdsmap_ev; |
128 | u32 target; |
129 | |
130 | m = kzalloc(size: sizeof(*m), GFP_NOFS); |
131 | if (!m) |
132 | return ERR_PTR(error: -ENOMEM); |
133 | |
134 | ceph_decode_need(p, end, 1 + 1, bad); |
135 | mdsmap_v = ceph_decode_8(p); |
136 | *p += sizeof(u8); /* mdsmap_cv */ |
137 | if (mdsmap_v >= 4) { |
138 | u32 mdsmap_len; |
139 | ceph_decode_32_safe(p, end, mdsmap_len, bad); |
140 | if (end < *p + mdsmap_len) |
141 | goto bad; |
142 | end = *p + mdsmap_len; |
143 | } |
144 | |
145 | ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); |
146 | m->m_epoch = ceph_decode_32(p); |
147 | m->m_client_epoch = ceph_decode_32(p); |
148 | m->m_last_failure = ceph_decode_32(p); |
149 | m->m_root = ceph_decode_32(p); |
150 | m->m_session_timeout = ceph_decode_32(p); |
151 | m->m_session_autoclose = ceph_decode_32(p); |
152 | m->m_max_file_size = ceph_decode_64(p); |
153 | m->m_max_mds = ceph_decode_32(p); |
154 | |
155 | /* |
156 | * pick out the active nodes as the m_num_active_mds, the |
157 | * m_num_active_mds maybe larger than m_max_mds when decreasing |
158 | * the max_mds in cluster side, in other case it should less |
159 | * than or equal to m_max_mds. |
160 | */ |
161 | m->m_num_active_mds = n = ceph_decode_32(p); |
162 | |
163 | /* |
164 | * the possible max rank, it maybe larger than the m_num_active_mds, |
165 | * for example if the mds_max == 2 in the cluster, when the MDS(0) |
166 | * was laggy and being replaced by a new MDS, we will temporarily |
167 | * receive a new mds map with n_num_mds == 1 and the active MDS(1), |
168 | * and the mds rank >= m_num_active_mds. |
169 | */ |
170 | m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds); |
171 | |
172 | m->m_info = kcalloc(n: m->possible_max_rank, size: sizeof(*m->m_info), GFP_NOFS); |
173 | if (!m->m_info) |
174 | goto nomem; |
175 | |
176 | /* pick out active nodes from mds_info (state > 0) */ |
177 | for (i = 0; i < n; i++) { |
178 | u64 global_id; |
179 | u32 namelen; |
180 | s32 mds, inc, state; |
181 | u8 info_v; |
182 | void *info_end = NULL; |
183 | struct ceph_entity_addr addr; |
184 | u32 num_export_targets; |
185 | void *pexport_targets = NULL; |
186 | struct ceph_timespec laggy_since; |
187 | struct ceph_mds_info *info; |
188 | bool laggy; |
189 | |
190 | ceph_decode_need(p, end, sizeof(u64) + 1, bad); |
191 | global_id = ceph_decode_64(p); |
192 | info_v= ceph_decode_8(p); |
193 | if (info_v >= 4) { |
194 | u32 info_len; |
195 | ceph_decode_need(p, end, 1 + sizeof(u32), bad); |
196 | *p += sizeof(u8); /* info_cv */ |
197 | info_len = ceph_decode_32(p); |
198 | info_end = *p + info_len; |
199 | if (info_end > end) |
200 | goto bad; |
201 | } |
202 | |
203 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); |
204 | *p += sizeof(u64); |
205 | namelen = ceph_decode_32(p); /* skip mds name */ |
206 | *p += namelen; |
207 | |
208 | ceph_decode_32_safe(p, end, mds, bad); |
209 | ceph_decode_32_safe(p, end, inc, bad); |
210 | ceph_decode_32_safe(p, end, state, bad); |
211 | *p += sizeof(u64); /* state_seq */ |
212 | if (info_v >= 8) |
213 | err = ceph_decode_entity_addrvec(p, end, msgr2, addr: &addr); |
214 | else |
215 | err = ceph_decode_entity_addr(p, end, addr: &addr); |
216 | if (err) |
217 | goto corrupt; |
218 | |
219 | ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since), |
220 | bad); |
221 | laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; |
222 | *p += sizeof(u32); |
223 | ceph_decode_32_safe(p, end, namelen, bad); |
224 | *p += namelen; |
225 | if (info_v >= 2) { |
226 | ceph_decode_32_safe(p, end, num_export_targets, bad); |
227 | pexport_targets = *p; |
228 | *p += num_export_targets * sizeof(u32); |
229 | } else { |
230 | num_export_targets = 0; |
231 | } |
232 | |
233 | if (info_end && *p != info_end) { |
234 | if (*p > info_end) |
235 | goto bad; |
236 | *p = info_end; |
237 | } |
238 | |
239 | doutc(cl, "%d/%d %lld mds%d.%d %s %s%s\n" , i+1, n, global_id, |
240 | mds, inc, ceph_pr_addr(&addr), |
241 | ceph_mds_state_name(state), laggy ? "(laggy)" : "" ); |
242 | |
243 | if (mds < 0 || mds >= m->possible_max_rank) { |
244 | pr_warn_client(cl, "got incorrect mds(%d)\n" , mds); |
245 | continue; |
246 | } |
247 | |
248 | if (state <= 0) { |
249 | doutc(cl, "got incorrect state(%s)\n" , |
250 | ceph_mds_state_name(state)); |
251 | continue; |
252 | } |
253 | |
254 | info = &m->m_info[mds]; |
255 | info->global_id = global_id; |
256 | info->state = state; |
257 | info->addr = addr; |
258 | info->laggy = laggy; |
259 | info->num_export_targets = num_export_targets; |
260 | if (num_export_targets) { |
261 | info->export_targets = kcalloc(n: num_export_targets, |
262 | size: sizeof(u32), GFP_NOFS); |
263 | if (!info->export_targets) |
264 | goto nomem; |
265 | for (j = 0; j < num_export_targets; j++) { |
266 | target = ceph_decode_32(p: &pexport_targets); |
267 | info->export_targets[j] = target; |
268 | } |
269 | } else { |
270 | info->export_targets = NULL; |
271 | } |
272 | } |
273 | |
274 | /* pg_pools */ |
275 | ceph_decode_32_safe(p, end, n, bad); |
276 | m->m_num_data_pg_pools = n; |
277 | m->m_data_pg_pools = kcalloc(n, size: sizeof(u64), GFP_NOFS); |
278 | if (!m->m_data_pg_pools) |
279 | goto nomem; |
280 | ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); |
281 | for (i = 0; i < n; i++) |
282 | m->m_data_pg_pools[i] = ceph_decode_64(p); |
283 | m->m_cas_pg_pool = ceph_decode_64(p); |
284 | m->m_enabled = m->m_epoch > 1; |
285 | |
286 | mdsmap_ev = 1; |
287 | if (mdsmap_v >= 2) { |
288 | ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext); |
289 | } |
290 | if (mdsmap_ev >= 3) { |
291 | if (__decode_and_drop_compat_set(p, end) < 0) |
292 | goto bad_ext; |
293 | } |
294 | /* metadata_pool */ |
295 | if (mdsmap_ev < 5) { |
296 | __decode_and_drop_type(p, end, u32, bad_ext); |
297 | } else { |
298 | __decode_and_drop_type(p, end, u64, bad_ext); |
299 | } |
300 | |
301 | /* created + modified + tableserver */ |
302 | __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); |
303 | __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); |
304 | __decode_and_drop_type(p, end, u32, bad_ext); |
305 | |
306 | /* in */ |
307 | { |
308 | int num_laggy = 0; |
309 | ceph_decode_32_safe(p, end, n, bad_ext); |
310 | ceph_decode_need(p, end, sizeof(u32) * n, bad_ext); |
311 | |
312 | for (i = 0; i < n; i++) { |
313 | s32 mds = ceph_decode_32(p); |
314 | if (mds >= 0 && mds < m->possible_max_rank) { |
315 | if (m->m_info[mds].laggy) |
316 | num_laggy++; |
317 | } |
318 | } |
319 | m->m_num_laggy = num_laggy; |
320 | |
321 | if (n > m->possible_max_rank) { |
322 | void *new_m_info = krealloc(objp: m->m_info, |
323 | new_size: n * sizeof(*m->m_info), |
324 | GFP_NOFS | __GFP_ZERO); |
325 | if (!new_m_info) |
326 | goto nomem; |
327 | m->m_info = new_m_info; |
328 | } |
329 | m->possible_max_rank = n; |
330 | } |
331 | |
332 | /* inc */ |
333 | __decode_and_drop_map(p, end, u32, u32, bad_ext); |
334 | /* up */ |
335 | __decode_and_drop_map(p, end, u32, u64, bad_ext); |
336 | /* failed */ |
337 | __decode_and_drop_set(p, end, u32, bad_ext); |
338 | /* stopped */ |
339 | __decode_and_drop_set(p, end, u32, bad_ext); |
340 | |
341 | if (mdsmap_ev >= 4) { |
342 | /* last_failure_osd_epoch */ |
343 | __decode_and_drop_type(p, end, u32, bad_ext); |
344 | } |
345 | if (mdsmap_ev >= 6) { |
346 | /* ever_allowed_snaps */ |
347 | __decode_and_drop_type(p, end, u8, bad_ext); |
348 | /* explicitly_allowed_snaps */ |
349 | __decode_and_drop_type(p, end, u8, bad_ext); |
350 | } |
351 | if (mdsmap_ev >= 7) { |
352 | /* inline_data_enabled */ |
353 | __decode_and_drop_type(p, end, u8, bad_ext); |
354 | } |
355 | if (mdsmap_ev >= 8) { |
356 | /* enabled */ |
357 | ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); |
358 | /* fs_name */ |
359 | ceph_decode_skip_string(p, end, bad_ext); |
360 | } |
361 | /* damaged */ |
362 | if (mdsmap_ev >= 9) { |
363 | size_t need; |
364 | ceph_decode_32_safe(p, end, n, bad_ext); |
365 | need = sizeof(u32) * n; |
366 | ceph_decode_need(p, end, need, bad_ext); |
367 | *p += need; |
368 | m->m_damaged = n > 0; |
369 | } else { |
370 | m->m_damaged = false; |
371 | } |
372 | if (mdsmap_ev >= 17) { |
373 | /* balancer */ |
374 | ceph_decode_skip_string(p, end, bad_ext); |
375 | /* standby_count_wanted */ |
376 | ceph_decode_skip_32(p, end, bad_ext); |
377 | /* old_max_mds */ |
378 | ceph_decode_skip_32(p, end, bad_ext); |
379 | /* min_compat_client */ |
380 | ceph_decode_skip_8(p, end, bad_ext); |
381 | /* required_client_features */ |
382 | ceph_decode_skip_set(p, end, 64, bad_ext); |
383 | /* bal_rank_mask */ |
384 | ceph_decode_skip_string(p, end, bad_ext); |
385 | } |
386 | if (mdsmap_ev >= 18) { |
387 | ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext); |
388 | } |
389 | bad_ext: |
390 | doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n" , |
391 | !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); |
392 | *p = end; |
393 | doutc(cl, "success epoch %u\n" , m->m_epoch); |
394 | return m; |
395 | nomem: |
396 | err = -ENOMEM; |
397 | goto out_err; |
398 | corrupt: |
399 | pr_err_client(cl, "corrupt mdsmap\n" ); |
400 | print_hex_dump(KERN_DEBUG, prefix_str: "mdsmap: " , |
401 | prefix_type: DUMP_PREFIX_OFFSET, rowsize: 16, groupsize: 1, |
402 | buf: start, len: end - start, ascii: true); |
403 | out_err: |
404 | ceph_mdsmap_destroy(m); |
405 | return ERR_PTR(error: err); |
406 | bad: |
407 | err = -EINVAL; |
408 | goto corrupt; |
409 | } |
410 | |
411 | void ceph_mdsmap_destroy(struct ceph_mdsmap *m) |
412 | { |
413 | int i; |
414 | |
415 | if (m->m_info) { |
416 | for (i = 0; i < m->possible_max_rank; i++) |
417 | kfree(objp: m->m_info[i].export_targets); |
418 | kfree(objp: m->m_info); |
419 | } |
420 | kfree(objp: m->m_data_pg_pools); |
421 | kfree(objp: m); |
422 | } |
423 | |
424 | bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) |
425 | { |
426 | int i, nr_active = 0; |
427 | if (!m->m_enabled) |
428 | return false; |
429 | if (m->m_damaged) |
430 | return false; |
431 | if (m->m_num_laggy == m->m_num_active_mds) |
432 | return false; |
433 | for (i = 0; i < m->possible_max_rank; i++) { |
434 | if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) |
435 | nr_active++; |
436 | } |
437 | return nr_active > 0; |
438 | } |
439 | |