1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* AFS fileserver probing |
3 | * |
4 | * Copyright (C) 2018, 2020 Red Hat, Inc. All Rights Reserved. |
5 | * Written by David Howells (dhowells@redhat.com) |
6 | */ |
7 | |
8 | #include <linux/sched.h> |
9 | #include <linux/slab.h> |
10 | #include "afs_fs.h" |
11 | #include "internal.h" |
12 | #include "protocol_afs.h" |
13 | #include "protocol_yfs.h" |
14 | |
15 | static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ; |
16 | static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ; |
17 | |
18 | struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, |
19 | enum afs_estate_trace where) |
20 | { |
21 | if (estate) { |
22 | int r; |
23 | |
24 | __refcount_inc(r: &estate->ref, oldp: &r); |
25 | trace_afs_estate(server_debug_id: estate->server_id, estate_debug_id: estate->probe_seq, ref: r, reason: where); |
26 | } |
27 | return estate; |
28 | } |
29 | |
30 | static void afs_endpoint_state_rcu(struct rcu_head *rcu) |
31 | { |
32 | struct afs_endpoint_state *estate = container_of(rcu, struct afs_endpoint_state, rcu); |
33 | |
34 | trace_afs_estate(server_debug_id: estate->server_id, estate_debug_id: estate->probe_seq, ref: refcount_read(r: &estate->ref), |
35 | reason: afs_estate_trace_free); |
36 | afs_put_addrlist(alist: estate->addresses, reason: afs_alist_trace_put_estate); |
37 | kfree(objp: estate); |
38 | } |
39 | |
40 | void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where) |
41 | { |
42 | if (estate) { |
43 | unsigned int server_id = estate->server_id, probe_seq = estate->probe_seq; |
44 | bool dead; |
45 | int r; |
46 | |
47 | dead = __refcount_dec_and_test(r: &estate->ref, oldp: &r); |
48 | trace_afs_estate(server_debug_id: server_id, estate_debug_id: probe_seq, ref: r, reason: where); |
49 | if (dead) |
50 | call_rcu(head: &estate->rcu, func: afs_endpoint_state_rcu); |
51 | } |
52 | } |
53 | |
54 | /* |
55 | * Start the probe polling timer. We have to supply it with an inc on the |
56 | * outstanding server count. |
57 | */ |
58 | static void afs_schedule_fs_probe(struct afs_net *net, |
59 | struct afs_server *server, bool fast) |
60 | { |
61 | unsigned long atj; |
62 | |
63 | if (!net->live) |
64 | return; |
65 | |
66 | atj = server->probed_at; |
67 | atj += fast ? afs_fs_probe_fast_poll_interval : afs_fs_probe_slow_poll_interval; |
68 | |
69 | afs_inc_servers_outstanding(net); |
70 | if (timer_reduce(timer: &net->fs_probe_timer, expires: atj)) |
71 | afs_dec_servers_outstanding(net); |
72 | } |
73 | |
74 | /* |
75 | * Handle the completion of a set of probes. |
76 | */ |
77 | static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server, |
78 | struct afs_endpoint_state *estate) |
79 | { |
80 | bool responded = test_bit(AFS_ESTATE_RESPONDED, &estate->flags); |
81 | |
82 | write_seqlock(sl: &net->fs_lock); |
83 | if (responded) { |
84 | list_add_tail(new: &server->probe_link, head: &net->fs_probe_slow); |
85 | } else { |
86 | server->rtt = UINT_MAX; |
87 | clear_bit(AFS_SERVER_FL_RESPONDING, addr: &server->flags); |
88 | list_add_tail(new: &server->probe_link, head: &net->fs_probe_fast); |
89 | } |
90 | |
91 | write_sequnlock(sl: &net->fs_lock); |
92 | |
93 | afs_schedule_fs_probe(net, server, fast: !responded); |
94 | } |
95 | |
96 | /* |
97 | * Handle the completion of a probe. |
98 | */ |
99 | static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server, |
100 | struct afs_endpoint_state *estate) |
101 | { |
102 | _enter("" ); |
103 | |
104 | if (atomic_dec_and_test(v: &estate->nr_probing)) |
105 | afs_finished_fs_probe(net, server, estate); |
106 | |
107 | wake_up_all(&server->probe_wq); |
108 | } |
109 | |
110 | /* |
111 | * Handle inability to send a probe due to ENOMEM when trying to allocate a |
112 | * call struct. |
113 | */ |
114 | static void afs_fs_probe_not_done(struct afs_net *net, |
115 | struct afs_server *server, |
116 | struct afs_endpoint_state *estate, |
117 | int index) |
118 | { |
119 | _enter("" ); |
120 | |
121 | trace_afs_io_error(call: 0, error: -ENOMEM, where: afs_io_error_fs_probe_fail); |
122 | spin_lock(lock: &server->probe_lock); |
123 | |
124 | set_bit(AFS_ESTATE_LOCAL_FAILURE, addr: &estate->flags); |
125 | if (estate->error == 0) |
126 | estate->error = -ENOMEM; |
127 | |
128 | set_bit(nr: index, addr: &estate->failed_set); |
129 | |
130 | spin_unlock(lock: &server->probe_lock); |
131 | return afs_done_one_fs_probe(net, server, estate); |
132 | } |
133 | |
/*
 * Process the result of probing a fileserver.  This is called after successful
 * or failed delivery of an FS.GetCapabilities operation.
 *
 * The call's error is stored against the probed address and then, under
 * server->probe_lock, folded into the endpoint state:
 *
 *  - Success and -ECONNABORTED both count as a response from the address.
 *  - -ENOMEM and -ENONET are flagged as local failures; the address is not
 *    added to the failed set.
 *  - Anything else marks the address as failed.
 *
 * Finally the probe is accounted as done.
 */
void afs_fileserver_probe_result(struct afs_call *call)
{
	struct afs_endpoint_state *estate = call->probe;
	struct afs_addr_list *alist = estate->addresses;
	struct afs_server *server = call->server;
	unsigned int index = call->probe_index;
	struct afs_address *addr = &alist->addrs[index];
	unsigned int rtt_us = -1, cap0;
	bool responded = false;
	int ret = call->error;

	_enter("%pU,%u", &server->uuid, index);

	WRITE_ONCE(addr->last_error, ret);

	spin_lock(&server->probe_lock);

	switch (ret) {
	case 0:
		estate->error = 0;
		responded = true;
		break;

	case -ECONNABORTED:
		/* Only keep the abort if nothing has responded yet. */
		if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) {
			estate->abort_code = call->abort_code;
			estate->error = ret;
		}
		responded = true;
		break;

	case -ENOMEM:
	case -ENONET:
		clear_bit(index, &estate->responsive_set);
		set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags);
		trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
		break;

	case -ECONNRESET: /* Responded, but call expired. */
	case -ERFKILL:
	case -EADDRNOTAVAIL:
	case -ENETUNREACH:
	case -EHOSTUNREACH:
	case -EHOSTDOWN:
	case -ECONNREFUSED:
	case -ETIMEDOUT:
	case -ETIME:
	default:
		clear_bit(index, &estate->responsive_set);
		set_bit(index, &estate->failed_set);
		/*
		 * Record this error only if nothing has responded and the
		 * error held so far is unset or just a timeout.
		 */
		if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags) &&
		    (estate->error == 0 ||
		     estate->error == -ETIMEDOUT ||
		     estate->error == -ETIME))
			estate->error = ret;
		trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
		break;
	}

	if (!responded)
		goto unlock;

	clear_bit(index, &estate->failed_set);

	if (call->service_id == YFS_FS_SERVICE) {
		set_bit(AFS_ESTATE_IS_YFS, &estate->flags);
		set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
		server->service_id = call->service_id;
	} else {
		set_bit(AFS_ESTATE_NOT_YFS, &estate->flags);
		/* Don't downgrade if this probe set already saw a YFS reply. */
		if (!test_bit(AFS_ESTATE_IS_YFS, &estate->flags)) {
			clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
			server->service_id = call->service_id;
		}

		cap0 = ntohl(call->tmp);
		if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
			set_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
		else
			clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
	}

	/* Prefer this address if it beats the best RTT seen in this set. */
	rtt_us = rxrpc_kernel_get_srtt(addr->peer);
	if (rtt_us < estate->rtt) {
		estate->rtt = rtt_us;
		server->rtt = rtt_us;
		alist->preferred = index;
	}

	smp_wmb(); /* Set rtt before responded. */
	set_bit(AFS_ESTATE_RESPONDED, &estate->flags);
	set_bit(index, &estate->responsive_set);
	set_bit(AFS_SERVER_FL_RESPONDING, &server->flags);

unlock:
	spin_unlock(&server->probe_lock);

	trace_afs_fs_probe(server, false, estate, index,
			   call->error, call->abort_code, rtt_us);
	_debug("probe[%x] %pU [%u] %pISpc rtt=%d ret=%d",
	       estate->probe_seq, &server->uuid, index,
	       rxrpc_kernel_remote_addr(addr->peer),
	       rtt_us, ret);

	afs_done_one_fs_probe(call->net, server, estate);
}
233 | |
234 | /* |
235 | * Probe all of a fileserver's addresses to find out the best route and to |
236 | * query its capabilities. |
237 | */ |
238 | void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, |
239 | struct afs_addr_list *new_alist, struct key *key) |
240 | { |
241 | struct afs_endpoint_state *estate, *old; |
242 | struct afs_addr_list *alist; |
243 | unsigned long unprobed; |
244 | |
245 | _enter("%pU" , &server->uuid); |
246 | |
247 | estate = kzalloc(size: sizeof(*estate), GFP_KERNEL); |
248 | if (!estate) |
249 | return; |
250 | |
251 | refcount_set(r: &estate->ref, n: 1); |
252 | estate->server_id = server->debug_id; |
253 | estate->rtt = UINT_MAX; |
254 | |
255 | write_lock(&server->fs_lock); |
256 | |
257 | old = rcu_dereference_protected(server->endpoint_state, |
258 | lockdep_is_held(&server->fs_lock)); |
259 | estate->responsive_set = old->responsive_set; |
260 | estate->addresses = afs_get_addrlist(alist: new_alist ?: old->addresses, |
261 | reason: afs_alist_trace_get_estate); |
262 | alist = estate->addresses; |
263 | estate->probe_seq = ++server->probe_counter; |
264 | atomic_set(v: &estate->nr_probing, i: alist->nr_addrs); |
265 | |
266 | rcu_assign_pointer(server->endpoint_state, estate); |
267 | set_bit(AFS_ESTATE_SUPERSEDED, addr: &old->flags); |
268 | write_unlock(&server->fs_lock); |
269 | |
270 | trace_afs_estate(server_debug_id: estate->server_id, estate_debug_id: estate->probe_seq, ref: refcount_read(r: &estate->ref), |
271 | reason: afs_estate_trace_alloc_probe); |
272 | |
273 | afs_get_address_preferences(net, alist); |
274 | |
275 | server->probed_at = jiffies; |
276 | unprobed = (1UL << alist->nr_addrs) - 1; |
277 | while (unprobed) { |
278 | unsigned int index = 0, i; |
279 | int best_prio = -1; |
280 | |
281 | for (i = 0; i < alist->nr_addrs; i++) { |
282 | if (test_bit(i, &unprobed) && |
283 | alist->addrs[i].prio > best_prio) { |
284 | index = i; |
285 | best_prio = alist->addrs[i].prio; |
286 | } |
287 | } |
288 | __clear_bit(index, &unprobed); |
289 | |
290 | trace_afs_fs_probe(server, tx: true, estate, addr_index: index, error: 0, abort_code: 0, rtt_us: 0); |
291 | if (!afs_fs_get_capabilities(net, server, estate, addr_index: index, key)) |
292 | afs_fs_probe_not_done(net, server, estate, index); |
293 | } |
294 | |
295 | afs_put_endpoint_state(estate: old, where: afs_estate_trace_put_probe); |
296 | } |
297 | |
298 | /* |
299 | * Wait for the first as-yet untried fileserver to respond, for the probe state |
300 | * to be superseded or for all probes to finish. |
301 | */ |
302 | int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr) |
303 | { |
304 | struct afs_endpoint_state *estate; |
305 | struct afs_server_list *slist = op->server_list; |
306 | bool still_probing = true; |
307 | int ret = 0, i; |
308 | |
309 | _enter("%u" , slist->nr_servers); |
310 | |
311 | for (i = 0; i < slist->nr_servers; i++) { |
312 | estate = states[i].endpoint_state; |
313 | if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) |
314 | return 2; |
315 | if (atomic_read(v: &estate->nr_probing)) |
316 | still_probing = true; |
317 | if (estate->responsive_set & states[i].untried_addrs) |
318 | return 1; |
319 | } |
320 | if (!still_probing) |
321 | return 0; |
322 | |
323 | for (i = 0; i < slist->nr_servers; i++) |
324 | add_wait_queue(wq_head: &slist->servers[i].server->probe_wq, wq_entry: &states[i].probe_waiter); |
325 | |
326 | for (;;) { |
327 | still_probing = false; |
328 | |
329 | set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); |
330 | for (i = 0; i < slist->nr_servers; i++) { |
331 | estate = states[i].endpoint_state; |
332 | if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) { |
333 | ret = 2; |
334 | goto stop; |
335 | } |
336 | if (atomic_read(v: &estate->nr_probing)) |
337 | still_probing = true; |
338 | if (estate->responsive_set & states[i].untried_addrs) { |
339 | ret = 1; |
340 | goto stop; |
341 | } |
342 | } |
343 | |
344 | if (!still_probing || signal_pending(current)) |
345 | goto stop; |
346 | schedule(); |
347 | } |
348 | |
349 | stop: |
350 | set_current_state(TASK_RUNNING); |
351 | |
352 | for (i = 0; i < slist->nr_servers; i++) |
353 | remove_wait_queue(wq_head: &slist->servers[i].server->probe_wq, wq_entry: &states[i].probe_waiter); |
354 | |
355 | if (!ret && signal_pending(current)) |
356 | ret = -ERESTARTSYS; |
357 | return ret; |
358 | } |
359 | |
360 | /* |
361 | * Probe timer. We have an increment on fs_outstanding that we need to pass |
362 | * along to the work item. |
363 | */ |
364 | void afs_fs_probe_timer(struct timer_list *timer) |
365 | { |
366 | struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer); |
367 | |
368 | if (!net->live || !queue_work(wq: afs_wq, work: &net->fs_prober)) |
369 | afs_dec_servers_outstanding(net); |
370 | } |
371 | |
372 | /* |
373 | * Dispatch a probe to a server. |
374 | */ |
375 | static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server) |
376 | __releases(&net->fs_lock) |
377 | { |
378 | struct key *key = NULL; |
379 | |
380 | /* We remove it from the queues here - it will be added back to |
381 | * one of the queues on the completion of the probe. |
382 | */ |
383 | list_del_init(entry: &server->probe_link); |
384 | |
385 | afs_get_server(server, afs_server_trace_get_probe); |
386 | write_sequnlock(sl: &net->fs_lock); |
387 | |
388 | afs_fs_probe_fileserver(net, server, NULL, key); |
389 | afs_put_server(net, server, afs_server_trace_put_probe); |
390 | } |
391 | |
392 | /* |
393 | * Probe a server immediately without waiting for its due time to come |
394 | * round. This is used when all of the addresses have been tried. |
395 | */ |
396 | void afs_probe_fileserver(struct afs_net *net, struct afs_server *server) |
397 | { |
398 | write_seqlock(sl: &net->fs_lock); |
399 | if (!list_empty(head: &server->probe_link)) |
400 | return afs_dispatch_fs_probe(net, server); |
401 | write_sequnlock(sl: &net->fs_lock); |
402 | } |
403 | |
404 | /* |
405 | * Probe dispatcher to regularly dispatch probes to keep NAT alive. |
406 | */ |
407 | void afs_fs_probe_dispatcher(struct work_struct *work) |
408 | { |
409 | struct afs_net *net = container_of(work, struct afs_net, fs_prober); |
410 | struct afs_server *fast, *slow, *server; |
411 | unsigned long nowj, timer_at, poll_at; |
412 | bool first_pass = true, set_timer = false; |
413 | |
414 | if (!net->live) { |
415 | afs_dec_servers_outstanding(net); |
416 | return; |
417 | } |
418 | |
419 | _enter("" ); |
420 | |
421 | if (list_empty(head: &net->fs_probe_fast) && list_empty(head: &net->fs_probe_slow)) { |
422 | afs_dec_servers_outstanding(net); |
423 | _leave(" [none]" ); |
424 | return; |
425 | } |
426 | |
427 | again: |
428 | write_seqlock(sl: &net->fs_lock); |
429 | |
430 | fast = slow = server = NULL; |
431 | nowj = jiffies; |
432 | timer_at = nowj + MAX_JIFFY_OFFSET; |
433 | |
434 | if (!list_empty(head: &net->fs_probe_fast)) { |
435 | fast = list_first_entry(&net->fs_probe_fast, struct afs_server, probe_link); |
436 | poll_at = fast->probed_at + afs_fs_probe_fast_poll_interval; |
437 | if (time_before(nowj, poll_at)) { |
438 | timer_at = poll_at; |
439 | set_timer = true; |
440 | fast = NULL; |
441 | } |
442 | } |
443 | |
444 | if (!list_empty(head: &net->fs_probe_slow)) { |
445 | slow = list_first_entry(&net->fs_probe_slow, struct afs_server, probe_link); |
446 | poll_at = slow->probed_at + afs_fs_probe_slow_poll_interval; |
447 | if (time_before(nowj, poll_at)) { |
448 | if (time_before(poll_at, timer_at)) |
449 | timer_at = poll_at; |
450 | set_timer = true; |
451 | slow = NULL; |
452 | } |
453 | } |
454 | |
455 | server = fast ?: slow; |
456 | if (server) |
457 | _debug("probe %pU" , &server->uuid); |
458 | |
459 | if (server && (first_pass || !need_resched())) { |
460 | afs_dispatch_fs_probe(net, server); |
461 | first_pass = false; |
462 | goto again; |
463 | } |
464 | |
465 | write_sequnlock(sl: &net->fs_lock); |
466 | |
467 | if (server) { |
468 | if (!queue_work(wq: afs_wq, work: &net->fs_prober)) |
469 | afs_dec_servers_outstanding(net); |
470 | _leave(" [requeue]" ); |
471 | } else if (set_timer) { |
472 | if (timer_reduce(timer: &net->fs_probe_timer, expires: timer_at)) |
473 | afs_dec_servers_outstanding(net); |
474 | _leave(" [timer]" ); |
475 | } else { |
476 | afs_dec_servers_outstanding(net); |
477 | _leave(" [quiesce]" ); |
478 | } |
479 | } |
480 | |
481 | /* |
482 | * Wait for a probe on a particular fileserver to complete for 2s. |
483 | */ |
484 | int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, |
485 | unsigned long exclude, bool is_intr) |
486 | { |
487 | struct wait_queue_entry wait; |
488 | unsigned long timo = 2 * HZ; |
489 | |
490 | if (atomic_read(v: &estate->nr_probing) == 0) |
491 | goto dont_wait; |
492 | |
493 | init_wait_entry(wq_entry: &wait, flags: 0); |
494 | for (;;) { |
495 | prepare_to_wait_event(wq_head: &server->probe_wq, wq_entry: &wait, |
496 | state: is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); |
497 | if (timo == 0 || |
498 | test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags) || |
499 | (estate->responsive_set & ~exclude) || |
500 | atomic_read(v: &estate->nr_probing) == 0 || |
501 | (is_intr && signal_pending(current))) |
502 | break; |
503 | timo = schedule_timeout(timeout: timo); |
504 | } |
505 | |
506 | finish_wait(wq_head: &server->probe_wq, wq_entry: &wait); |
507 | |
508 | dont_wait: |
509 | if (estate->responsive_set & ~exclude) |
510 | return 1; |
511 | if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) |
512 | return 0; |
513 | if (is_intr && signal_pending(current)) |
514 | return -ERESTARTSYS; |
515 | if (timo == 0) |
516 | return -ETIME; |
517 | return -EDESTADDRREQ; |
518 | } |
519 | |
520 | /* |
521 | * Clean up the probing when the namespace is killed off. |
522 | */ |
523 | void afs_fs_probe_cleanup(struct afs_net *net) |
524 | { |
525 | if (del_timer_sync(timer: &net->fs_probe_timer)) |
526 | afs_dec_servers_outstanding(net); |
527 | } |
528 | |