// SPDX-License-Identifier: GPL-2.0-or-later
/* Handle fileserver selection and rotation.
 *
 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
| 7 | |
| 8 | #include <linux/kernel.h> |
| 9 | #include <linux/slab.h> |
| 10 | #include <linux/fs.h> |
| 11 | #include <linux/sched.h> |
| 12 | #include <linux/delay.h> |
| 13 | #include <linux/sched/signal.h> |
| 14 | #include "internal.h" |
| 15 | #include "afs_fs.h" |
| 16 | #include "protocol_uae.h" |
| 17 | |
| 18 | void afs_clear_server_states(struct afs_operation *op) |
| 19 | { |
| 20 | unsigned int i; |
| 21 | |
| 22 | if (op->server_states) { |
| 23 | for (i = 0; i < op->server_list->nr_servers; i++) |
| 24 | afs_put_endpoint_state(estate: op->server_states[i].endpoint_state, |
| 25 | where: afs_estate_trace_put_server_state); |
| 26 | kfree(objp: op->server_states); |
| 27 | } |
| 28 | } |
| 29 | |
| 30 | /* |
| 31 | * Begin iteration through a server list, starting with the vnode's last used |
| 32 | * server if possible, or the last recorded good server if not. |
| 33 | */ |
| 34 | static bool afs_start_fs_iteration(struct afs_operation *op, |
| 35 | struct afs_vnode *vnode) |
| 36 | { |
| 37 | struct afs_server *server; |
| 38 | void *cb_server; |
| 39 | int i; |
| 40 | |
| 41 | trace_afs_rotate(op, reason: afs_rotate_trace_start, extra: 0); |
| 42 | |
| 43 | read_lock(&op->volume->servers_lock); |
| 44 | op->server_list = afs_get_serverlist( |
| 45 | rcu_dereference_protected(op->volume->servers, |
| 46 | lockdep_is_held(&op->volume->servers_lock))); |
| 47 | read_unlock(&op->volume->servers_lock); |
| 48 | |
| 49 | op->server_states = kcalloc(op->server_list->nr_servers, sizeof(op->server_states[0]), |
| 50 | GFP_KERNEL); |
| 51 | if (!op->server_states) { |
| 52 | afs_op_nomem(op); |
| 53 | trace_afs_rotate(op, reason: afs_rotate_trace_nomem, extra: 0); |
| 54 | return false; |
| 55 | } |
| 56 | |
| 57 | rcu_read_lock(); |
| 58 | for (i = 0; i < op->server_list->nr_servers; i++) { |
| 59 | struct afs_endpoint_state *estate; |
| 60 | struct afs_server_state *s = &op->server_states[i]; |
| 61 | |
| 62 | server = op->server_list->servers[i].server; |
| 63 | estate = rcu_dereference(server->endpoint_state); |
| 64 | s->endpoint_state = afs_get_endpoint_state(estate, |
| 65 | where: afs_estate_trace_get_server_state); |
| 66 | s->probe_seq = estate->probe_seq; |
| 67 | s->untried_addrs = (1UL << estate->addresses->nr_addrs) - 1; |
| 68 | init_waitqueue_entry(wq_entry: &s->probe_waiter, current); |
| 69 | afs_get_address_preferences(net: op->net, alist: estate->addresses); |
| 70 | } |
| 71 | rcu_read_unlock(); |
| 72 | |
| 73 | |
| 74 | op->untried_servers = (1UL << op->server_list->nr_servers) - 1; |
| 75 | op->server_index = -1; |
| 76 | |
| 77 | cb_server = vnode->cb_server; |
| 78 | if (cb_server) { |
| 79 | /* See if the vnode's preferred record is still available */ |
| 80 | for (i = 0; i < op->server_list->nr_servers; i++) { |
| 81 | server = op->server_list->servers[i].server; |
| 82 | if (server == cb_server) { |
| 83 | op->server_index = i; |
| 84 | goto found_interest; |
| 85 | } |
| 86 | } |
| 87 | |
| 88 | /* If we have a lock outstanding on a server that's no longer |
| 89 | * serving this vnode, then we can't switch to another server |
| 90 | * and have to return an error. |
| 91 | */ |
| 92 | if (op->flags & AFS_OPERATION_CUR_ONLY) { |
| 93 | afs_op_set_error(op, error: -ESTALE); |
| 94 | trace_afs_rotate(op, reason: afs_rotate_trace_stale_lock, extra: 0); |
| 95 | return false; |
| 96 | } |
| 97 | |
| 98 | /* Note that the callback promise is effectively broken */ |
| 99 | write_seqlock(sl: &vnode->cb_lock); |
| 100 | ASSERTCMP(cb_server, ==, vnode->cb_server); |
| 101 | vnode->cb_server = NULL; |
| 102 | if (afs_clear_cb_promise(vnode, trace: afs_cb_promise_clear_rotate_server)) |
| 103 | vnode->cb_break++; |
| 104 | write_sequnlock(sl: &vnode->cb_lock); |
| 105 | } |
| 106 | |
| 107 | found_interest: |
| 108 | return true; |
| 109 | } |
| 110 | |
| 111 | /* |
| 112 | * Post volume busy note. |
| 113 | */ |
| 114 | static void afs_busy(struct afs_operation *op, u32 abort_code) |
| 115 | { |
| 116 | const char *m; |
| 117 | |
| 118 | switch (abort_code) { |
| 119 | case VOFFLINE: m = "offline" ; break; |
| 120 | case VRESTARTING: m = "restarting" ; break; |
| 121 | case VSALVAGING: m = "being salvaged" ; break; |
| 122 | default: m = "busy" ; break; |
| 123 | } |
| 124 | |
| 125 | pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n" , |
| 126 | op->volume->vid, op->volume->name, &op->server->uuid, m); |
| 127 | } |
| 128 | |
| 129 | /* |
| 130 | * Sleep and retry the operation to the same fileserver. |
| 131 | */ |
| 132 | static bool afs_sleep_and_retry(struct afs_operation *op) |
| 133 | { |
| 134 | trace_afs_rotate(op, reason: afs_rotate_trace_busy_sleep, extra: 0); |
| 135 | if (!(op->flags & AFS_OPERATION_UNINTR)) { |
| 136 | msleep_interruptible(msecs: 1000); |
| 137 | if (signal_pending(current)) { |
| 138 | afs_op_set_error(op, error: -ERESTARTSYS); |
| 139 | return false; |
| 140 | } |
| 141 | } else { |
| 142 | msleep(msecs: 1000); |
| 143 | } |
| 144 | |
| 145 | return true; |
| 146 | } |
| 147 | |
| 148 | /* |
| 149 | * Select the fileserver to use. May be called multiple times to rotate |
| 150 | * through the fileservers. |
| 151 | */ |
| 152 | bool afs_select_fileserver(struct afs_operation *op) |
| 153 | { |
| 154 | struct afs_addr_list *alist; |
| 155 | struct afs_server *server; |
| 156 | struct afs_vnode *vnode = op->file[0].vnode; |
| 157 | unsigned long set, failed; |
| 158 | s32 abort_code = op->call_abort_code; |
| 159 | int best_prio = 0; |
| 160 | int error = op->call_error, addr_index, i, j; |
| 161 | |
| 162 | op->nr_iterations++; |
| 163 | |
| 164 | _enter("OP=%x+%x,%llx,%u{%lx},%u{%lx},%d,%d" , |
| 165 | op->debug_id, op->nr_iterations, op->volume->vid, |
| 166 | op->server_index, op->untried_servers, |
| 167 | op->addr_index, op->addr_tried, |
| 168 | error, abort_code); |
| 169 | |
| 170 | if (op->flags & AFS_OPERATION_STOP) { |
| 171 | trace_afs_rotate(op, reason: afs_rotate_trace_stopped, extra: 0); |
| 172 | _leave(" = f [stopped]" ); |
| 173 | return false; |
| 174 | } |
| 175 | |
| 176 | if (op->nr_iterations == 0) |
| 177 | goto start; |
| 178 | |
| 179 | WRITE_ONCE(op->estate->addresses->addrs[op->addr_index].last_error, error); |
| 180 | trace_afs_rotate(op, reason: afs_rotate_trace_iter, extra: op->call_error); |
| 181 | |
| 182 | /* Evaluate the result of the previous operation, if there was one. */ |
| 183 | switch (op->call_error) { |
| 184 | case 0: |
| 185 | clear_bit(AFS_SE_VOLUME_OFFLINE, |
| 186 | addr: &op->server_list->servers[op->server_index].flags); |
| 187 | clear_bit(AFS_SE_VOLUME_BUSY, |
| 188 | addr: &op->server_list->servers[op->server_index].flags); |
| 189 | op->cumul_error.responded = true; |
| 190 | |
| 191 | /* We succeeded, but we may need to redo the op from another |
| 192 | * server if we're looking at a set of RO volumes where some of |
| 193 | * the servers have not yet been brought up to date lest we |
| 194 | * regress the data. We only switch to the new version once |
| 195 | * >=50% of the servers are updated. |
| 196 | */ |
| 197 | error = afs_update_volume_state(op); |
| 198 | if (error != 0) { |
| 199 | if (error == 1) { |
| 200 | afs_sleep_and_retry(op); |
| 201 | goto restart_from_beginning; |
| 202 | } |
| 203 | afs_op_set_error(op, error); |
| 204 | goto failed; |
| 205 | } |
| 206 | fallthrough; |
| 207 | default: |
| 208 | /* Success or local failure. Stop. */ |
| 209 | afs_op_set_error(op, error); |
| 210 | op->flags |= AFS_OPERATION_STOP; |
| 211 | trace_afs_rotate(op, reason: afs_rotate_trace_stop, extra: error); |
| 212 | _leave(" = f [okay/local %d]" , error); |
| 213 | return false; |
| 214 | |
| 215 | case -ECONNABORTED: |
| 216 | /* The far side rejected the operation on some grounds. This |
| 217 | * might involve the server being busy or the volume having been moved. |
| 218 | * |
| 219 | * Note that various V* errors should not be sent to a cache manager |
| 220 | * by a fileserver as they should be translated to more modern UAE* |
| 221 | * errors instead. IBM AFS and OpenAFS fileservers, however, do leak |
| 222 | * these abort codes. |
| 223 | */ |
| 224 | trace_afs_rotate(op, reason: afs_rotate_trace_aborted, extra: abort_code); |
| 225 | op->cumul_error.responded = true; |
| 226 | switch (abort_code) { |
| 227 | case VNOVOL: |
| 228 | /* This fileserver doesn't know about the volume. |
| 229 | * - May indicate that the VL is wrong - retry once and compare |
| 230 | * the results. |
| 231 | * - May indicate that the fileserver couldn't attach to the vol. |
| 232 | * - The volume might have been temporarily removed so that it can |
| 233 | * be replaced by a volume restore. "vos" might have ended one |
| 234 | * transaction and has yet to create the next. |
| 235 | * - The volume might not be blessed or might not be in-service |
| 236 | * (administrative action). |
| 237 | */ |
| 238 | if (op->flags & AFS_OPERATION_VNOVOL) { |
| 239 | afs_op_accumulate_error(op, error: -EREMOTEIO, abort_code); |
| 240 | goto next_server; |
| 241 | } |
| 242 | |
| 243 | write_lock(&op->volume->servers_lock); |
| 244 | op->server_list->vnovol_mask |= 1 << op->server_index; |
| 245 | write_unlock(&op->volume->servers_lock); |
| 246 | |
| 247 | set_bit(AFS_VOLUME_NEEDS_UPDATE, addr: &op->volume->flags); |
| 248 | error = afs_check_volume_status(op->volume, op); |
| 249 | if (error < 0) { |
| 250 | afs_op_set_error(op, error); |
| 251 | goto failed; |
| 252 | } |
| 253 | |
| 254 | if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) { |
| 255 | afs_op_set_error(op, error: -ENOMEDIUM); |
| 256 | goto failed; |
| 257 | } |
| 258 | |
| 259 | /* If the server list didn't change, then assume that |
| 260 | * it's the fileserver having trouble. |
| 261 | */ |
| 262 | if (rcu_access_pointer(op->volume->servers) == op->server_list) { |
| 263 | afs_op_accumulate_error(op, error: -EREMOTEIO, abort_code); |
| 264 | goto next_server; |
| 265 | } |
| 266 | |
| 267 | /* Try again */ |
| 268 | op->flags |= AFS_OPERATION_VNOVOL; |
| 269 | _leave(" = t [vnovol]" ); |
| 270 | return true; |
| 271 | |
| 272 | case VVOLEXISTS: |
| 273 | case VONLINE: |
| 274 | /* These should not be returned from the fileserver. */ |
| 275 | pr_warn("Fileserver returned unexpected abort %d\n" , |
| 276 | abort_code); |
| 277 | afs_op_accumulate_error(op, error: -EREMOTEIO, abort_code); |
| 278 | goto next_server; |
| 279 | |
| 280 | case VNOSERVICE: |
| 281 | /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver |
| 282 | * if the volume was neither in-service nor administratively |
| 283 | * blessed. All usage was replaced by VNOVOL because AFS 3.1 and |
| 284 | * earlier cache managers did not handle VNOSERVICE and assumed |
| 285 | * it was the client OSes errno 105. |
| 286 | * |
| 287 | * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the |
| 288 | * fileserver idle dead time error which was sent in place of |
| 289 | * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the |
| 290 | * fileserver took too long to send a reply to the client. |
| 291 | * RX_CALL_TIMEOUT would have caused the cache manager to mark the |
| 292 | * server down whereas VNOSERVICE since AFS 3.2 would cause cache |
| 293 | * manager to temporarily (up to 15 minutes) mark the volume |
| 294 | * instance as unusable. |
| 295 | * |
| 296 | * The idle dead logic resulted in cache inconsistency since a |
| 297 | * state changing call that the cache manager assumed was dead |
| 298 | * could still be processed to completion by the fileserver. This |
| 299 | * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer |
| 300 | * returned. However, many 1.4.8 through 1.6.24 fileservers are |
| 301 | * still in existence. |
| 302 | * |
| 303 | * AuriStorFS fileservers have never returned VNOSERVICE. |
| 304 | * |
| 305 | * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT. |
| 306 | */ |
| 307 | case RX_CALL_TIMEOUT: |
| 308 | afs_op_accumulate_error(op, error: -ETIMEDOUT, abort_code); |
| 309 | goto next_server; |
| 310 | |
| 311 | case VSALVAGING: /* This error should not be leaked to cache managers |
| 312 | * but is from OpenAFS demand attach fileservers. |
| 313 | * It should be treated as an alias for VOFFLINE. |
| 314 | */ |
| 315 | case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */ |
| 316 | case VOFFLINE: |
| 317 | /* The volume is in use by the volserver or another volume utility |
| 318 | * for an operation that might alter the contents. The volume is |
| 319 | * expected to come back but it might take a long time (could be |
| 320 | * days). |
| 321 | */ |
| 322 | if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE, |
| 323 | addr: &op->server_list->servers[op->server_index].flags)) { |
| 324 | afs_busy(op, abort_code); |
| 325 | clear_bit(AFS_SE_VOLUME_BUSY, |
| 326 | addr: &op->server_list->servers[op->server_index].flags); |
| 327 | } |
| 328 | if (op->flags & AFS_OPERATION_NO_VSLEEP) { |
| 329 | afs_op_set_error(op, error: -EADV); |
| 330 | goto failed; |
| 331 | } |
| 332 | goto busy; |
| 333 | |
| 334 | case VRESTARTING: /* The fileserver is either shutting down or starting up. */ |
| 335 | case VBUSY: |
| 336 | /* The volume is in use by the volserver or another volume |
| 337 | * utility for an operation that is not expected to alter the |
| 338 | * contents of the volume. VBUSY does not need to be returned |
| 339 | * for a ROVOL or BACKVOL bound to an ITBusy volserver |
| 340 | * transaction. The fileserver is permitted to continue serving |
| 341 | * content from ROVOLs and BACKVOLs during an ITBusy transaction |
| 342 | * because the content will not change. However, many fileserver |
| 343 | * releases do return VBUSY for ROVOL and BACKVOL instances under |
| 344 | * many circumstances. |
| 345 | * |
| 346 | * Retry after going round all the servers unless we have a file |
| 347 | * lock we need to maintain. |
| 348 | */ |
| 349 | if (op->flags & AFS_OPERATION_NO_VSLEEP) { |
| 350 | afs_op_set_error(op, error: -EBUSY); |
| 351 | goto failed; |
| 352 | } |
| 353 | if (!test_and_set_bit(AFS_SE_VOLUME_BUSY, |
| 354 | addr: &op->server_list->servers[op->server_index].flags)) { |
| 355 | afs_busy(op, abort_code); |
| 356 | clear_bit(AFS_SE_VOLUME_OFFLINE, |
| 357 | addr: &op->server_list->servers[op->server_index].flags); |
| 358 | } |
| 359 | busy: |
| 360 | if (op->flags & AFS_OPERATION_CUR_ONLY) { |
| 361 | if (!afs_sleep_and_retry(op)) |
| 362 | goto failed; |
| 363 | |
| 364 | /* Retry with same server & address */ |
| 365 | _leave(" = t [vbusy]" ); |
| 366 | return true; |
| 367 | } |
| 368 | |
| 369 | op->flags |= AFS_OPERATION_VBUSY; |
| 370 | goto next_server; |
| 371 | |
| 372 | case VMOVED: |
| 373 | /* The volume migrated to another server. We consider |
| 374 | * consider all locks and callbacks broken and request |
| 375 | * an update from the VLDB. |
| 376 | * |
| 377 | * We also limit the number of VMOVED hops we will |
| 378 | * honour, just in case someone sets up a loop. |
| 379 | */ |
| 380 | if (op->flags & AFS_OPERATION_VMOVED) { |
| 381 | afs_op_set_error(op, error: -EREMOTEIO); |
| 382 | goto failed; |
| 383 | } |
| 384 | op->flags |= AFS_OPERATION_VMOVED; |
| 385 | |
| 386 | set_bit(AFS_VOLUME_WAIT, addr: &op->volume->flags); |
| 387 | set_bit(AFS_VOLUME_NEEDS_UPDATE, addr: &op->volume->flags); |
| 388 | error = afs_check_volume_status(op->volume, op); |
| 389 | if (error < 0) { |
| 390 | afs_op_set_error(op, error); |
| 391 | goto failed; |
| 392 | } |
| 393 | |
| 394 | /* If the server list didn't change, then the VLDB is |
| 395 | * out of sync with the fileservers. This is hopefully |
| 396 | * a temporary condition, however, so we don't want to |
| 397 | * permanently block access to the file. |
| 398 | * |
| 399 | * TODO: Try other fileservers if we can. |
| 400 | * |
| 401 | * TODO: Retry a few times with sleeps. |
| 402 | */ |
| 403 | if (rcu_access_pointer(op->volume->servers) == op->server_list) { |
| 404 | afs_op_accumulate_error(op, error: -ENOMEDIUM, abort_code); |
| 405 | goto failed; |
| 406 | } |
| 407 | |
| 408 | goto restart_from_beginning; |
| 409 | |
| 410 | case UAEIO: |
| 411 | case VIO: |
| 412 | afs_op_accumulate_error(op, error: -EREMOTEIO, abort_code); |
| 413 | if (op->volume->type != AFSVL_RWVOL) |
| 414 | goto next_server; |
| 415 | goto failed; |
| 416 | |
| 417 | case VDISKFULL: |
| 418 | case UAENOSPC: |
| 419 | /* The partition is full. Only applies to RWVOLs. |
| 420 | * Translate locally and return ENOSPC. |
| 421 | * No replicas to failover to. |
| 422 | */ |
| 423 | afs_op_set_error(op, error: -ENOSPC); |
| 424 | goto failed_but_online; |
| 425 | |
| 426 | case VOVERQUOTA: |
| 427 | case UAEDQUOT: |
| 428 | /* Volume is full. Only applies to RWVOLs. |
| 429 | * Translate locally and return EDQUOT. |
| 430 | * No replicas to failover to. |
| 431 | */ |
| 432 | afs_op_set_error(op, error: -EDQUOT); |
| 433 | goto failed_but_online; |
| 434 | |
| 435 | case RX_INVALID_OPERATION: |
| 436 | case RXGEN_OPCODE: |
| 437 | /* Handle downgrading to an older operation. */ |
| 438 | afs_op_set_error(op, error: -ENOTSUPP); |
| 439 | if (op->flags & AFS_OPERATION_DOWNGRADE) { |
| 440 | op->flags &= ~AFS_OPERATION_DOWNGRADE; |
| 441 | goto go_again; |
| 442 | } |
| 443 | goto failed_but_online; |
| 444 | |
| 445 | default: |
| 446 | afs_op_accumulate_error(op, error, abort_code); |
| 447 | failed_but_online: |
| 448 | clear_bit(AFS_SE_VOLUME_OFFLINE, |
| 449 | addr: &op->server_list->servers[op->server_index].flags); |
| 450 | clear_bit(AFS_SE_VOLUME_BUSY, |
| 451 | addr: &op->server_list->servers[op->server_index].flags); |
| 452 | goto failed; |
| 453 | } |
| 454 | |
| 455 | case -ETIMEDOUT: |
| 456 | case -ETIME: |
| 457 | if (afs_op_error(op) != -EDESTADDRREQ) |
| 458 | goto iterate_address; |
| 459 | fallthrough; |
| 460 | case -ERFKILL: |
| 461 | case -EADDRNOTAVAIL: |
| 462 | case -ENETUNREACH: |
| 463 | case -EHOSTUNREACH: |
| 464 | case -EHOSTDOWN: |
| 465 | case -ECONNREFUSED: |
| 466 | _debug("no conn" ); |
| 467 | afs_op_accumulate_error(op, error, abort_code: 0); |
| 468 | goto iterate_address; |
| 469 | |
| 470 | case -ENETRESET: |
| 471 | pr_warn("kAFS: Peer reset %s (op=%x)\n" , |
| 472 | op->type ? op->type->name : "???" , op->debug_id); |
| 473 | fallthrough; |
| 474 | case -ECONNRESET: |
| 475 | _debug("call reset" ); |
| 476 | afs_op_set_error(op, error); |
| 477 | goto failed; |
| 478 | } |
| 479 | |
| 480 | restart_from_beginning: |
| 481 | trace_afs_rotate(op, reason: afs_rotate_trace_restart, extra: 0); |
| 482 | _debug("restart" ); |
| 483 | op->estate = NULL; |
| 484 | op->server = NULL; |
| 485 | afs_clear_server_states(op); |
| 486 | op->server_states = NULL; |
| 487 | afs_put_serverlist(op->net, op->server_list); |
| 488 | op->server_list = NULL; |
| 489 | start: |
| 490 | _debug("start" ); |
| 491 | ASSERTCMP(op->estate, ==, NULL); |
| 492 | /* See if we need to do an update of the volume record. Note that the |
| 493 | * volume may have moved or even have been deleted. |
| 494 | */ |
| 495 | error = afs_check_volume_status(op->volume, op); |
| 496 | trace_afs_rotate(op, reason: afs_rotate_trace_check_vol_status, extra: error); |
| 497 | if (error < 0) { |
| 498 | afs_op_set_error(op, error); |
| 499 | goto failed; |
| 500 | } |
| 501 | |
| 502 | if (!afs_start_fs_iteration(op, vnode)) |
| 503 | goto failed; |
| 504 | |
| 505 | _debug("__ VOL %llx __" , op->volume->vid); |
| 506 | |
| 507 | pick_server: |
| 508 | _debug("pick [%lx]" , op->untried_servers); |
| 509 | ASSERTCMP(op->estate, ==, NULL); |
| 510 | |
| 511 | error = afs_wait_for_fs_probes(op, states: op->server_states, |
| 512 | intr: !(op->flags & AFS_OPERATION_UNINTR)); |
| 513 | switch (error) { |
| 514 | case 0: /* No untried responsive servers and no outstanding probes */ |
| 515 | trace_afs_rotate(op, reason: afs_rotate_trace_probe_none, extra: 0); |
| 516 | goto no_more_servers; |
| 517 | case 1: /* Got a response */ |
| 518 | trace_afs_rotate(op, reason: afs_rotate_trace_probe_response, extra: 0); |
| 519 | break; |
| 520 | case 2: /* Probe data superseded */ |
| 521 | trace_afs_rotate(op, reason: afs_rotate_trace_probe_superseded, extra: 0); |
| 522 | goto restart_from_beginning; |
| 523 | default: |
| 524 | trace_afs_rotate(op, reason: afs_rotate_trace_probe_error, extra: error); |
| 525 | afs_op_set_error(op, error); |
| 526 | goto failed; |
| 527 | } |
| 528 | |
| 529 | /* Pick the untried server with the highest priority untried endpoint. |
| 530 | * If we have outstanding callbacks, we stick with the server we're |
| 531 | * already using if we can. |
| 532 | */ |
| 533 | if (op->server) { |
| 534 | _debug("server %u" , op->server_index); |
| 535 | if (test_bit(op->server_index, &op->untried_servers)) |
| 536 | goto selected_server; |
| 537 | op->server = NULL; |
| 538 | _debug("no server" ); |
| 539 | } |
| 540 | |
| 541 | rcu_read_lock(); |
| 542 | op->server_index = -1; |
| 543 | best_prio = -1; |
| 544 | for (i = 0; i < op->server_list->nr_servers; i++) { |
| 545 | struct afs_endpoint_state *es; |
| 546 | struct afs_server_entry *se = &op->server_list->servers[i]; |
| 547 | struct afs_addr_list *sal; |
| 548 | struct afs_server *s = se->server; |
| 549 | |
| 550 | if (!test_bit(i, &op->untried_servers) || |
| 551 | test_bit(AFS_SE_EXCLUDED, &se->flags) || |
| 552 | !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags)) |
| 553 | continue; |
| 554 | es = op->server_states[i].endpoint_state; |
| 555 | sal = es->addresses; |
| 556 | |
| 557 | afs_get_address_preferences_rcu(net: op->net, alist: sal); |
| 558 | for (j = 0; j < sal->nr_addrs; j++) { |
| 559 | if (es->failed_set & (1 << j)) |
| 560 | continue; |
| 561 | if (!sal->addrs[j].peer) |
| 562 | continue; |
| 563 | if (sal->addrs[j].prio > best_prio) { |
| 564 | op->server_index = i; |
| 565 | best_prio = sal->addrs[j].prio; |
| 566 | } |
| 567 | } |
| 568 | } |
| 569 | rcu_read_unlock(); |
| 570 | |
| 571 | if (op->server_index == -1) |
| 572 | goto no_more_servers; |
| 573 | |
| 574 | selected_server: |
| 575 | trace_afs_rotate(op, reason: afs_rotate_trace_selected_server, extra: best_prio); |
| 576 | _debug("use %d prio %u" , op->server_index, best_prio); |
| 577 | __clear_bit(op->server_index, &op->untried_servers); |
| 578 | |
| 579 | /* We're starting on a different fileserver from the list. We need to |
| 580 | * check it, create a callback intercept, find its address list and |
| 581 | * probe its capabilities before we use it. |
| 582 | */ |
| 583 | ASSERTCMP(op->estate, ==, NULL); |
| 584 | server = op->server_list->servers[op->server_index].server; |
| 585 | |
| 586 | if (!afs_check_server_record(op, server, key: op->key)) |
| 587 | goto failed; |
| 588 | |
| 589 | _debug("USING SERVER: %pU" , &server->uuid); |
| 590 | |
| 591 | op->flags |= AFS_OPERATION_RETRY_SERVER; |
| 592 | op->server = server; |
| 593 | if (vnode->cb_server != server) { |
| 594 | vnode->cb_server = server; |
| 595 | vnode->cb_v_check = atomic_read(v: &vnode->volume->cb_v_break); |
| 596 | afs_clear_cb_promise(vnode, trace: afs_cb_promise_clear_server_change); |
| 597 | } |
| 598 | |
| 599 | retry_server: |
| 600 | op->addr_tried = 0; |
| 601 | op->addr_index = -1; |
| 602 | |
| 603 | iterate_address: |
| 604 | /* Iterate over the current server's address list to try and find an |
| 605 | * address on which it will respond to us. |
| 606 | */ |
| 607 | op->estate = op->server_states[op->server_index].endpoint_state; |
| 608 | set = READ_ONCE(op->estate->responsive_set); |
| 609 | failed = READ_ONCE(op->estate->failed_set); |
| 610 | _debug("iterate ES=%x rs=%lx fs=%lx" , op->estate->probe_seq, set, failed); |
| 611 | set &= ~(failed | op->addr_tried); |
| 612 | trace_afs_rotate(op, reason: afs_rotate_trace_iterate_addr, extra: set); |
| 613 | if (!set) |
| 614 | goto wait_for_more_probe_results; |
| 615 | |
| 616 | alist = op->estate->addresses; |
| 617 | best_prio = -1; |
| 618 | addr_index = 0; |
| 619 | for (i = 0; i < alist->nr_addrs; i++) { |
| 620 | if (!(set & (1 << i))) |
| 621 | continue; |
| 622 | if (alist->addrs[i].prio > best_prio) { |
| 623 | addr_index = i; |
| 624 | best_prio = alist->addrs[i].prio; |
| 625 | } |
| 626 | } |
| 627 | |
| 628 | alist->preferred = addr_index; |
| 629 | |
| 630 | op->addr_index = addr_index; |
| 631 | set_bit(nr: addr_index, addr: &op->addr_tried); |
| 632 | |
| 633 | _debug("address [%u] %u/%u %pISp" , |
| 634 | op->server_index, addr_index, alist->nr_addrs, |
| 635 | rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer)); |
| 636 | go_again: |
| 637 | op->volsync.creation = TIME64_MIN; |
| 638 | op->volsync.update = TIME64_MIN; |
| 639 | op->call_responded = false; |
| 640 | _leave(" = t" ); |
| 641 | return true; |
| 642 | |
| 643 | wait_for_more_probe_results: |
| 644 | error = afs_wait_for_one_fs_probe(server: op->server, estate: op->estate, exclude: op->addr_tried, |
| 645 | is_intr: !(op->flags & AFS_OPERATION_UNINTR)); |
| 646 | if (error == 1) |
| 647 | goto iterate_address; |
| 648 | if (!error) |
| 649 | goto restart_from_beginning; |
| 650 | |
| 651 | /* We've now had a failure to respond on all of a server's addresses - |
| 652 | * immediately probe them again and consider retrying the server. |
| 653 | */ |
| 654 | trace_afs_rotate(op, reason: afs_rotate_trace_probe_fileserver, extra: 0); |
| 655 | afs_probe_fileserver(op->net, op->server); |
| 656 | if (op->flags & AFS_OPERATION_RETRY_SERVER) { |
| 657 | error = afs_wait_for_one_fs_probe(server: op->server, estate: op->estate, exclude: op->addr_tried, |
| 658 | is_intr: !(op->flags & AFS_OPERATION_UNINTR)); |
| 659 | switch (error) { |
| 660 | case 1: |
| 661 | op->flags &= ~AFS_OPERATION_RETRY_SERVER; |
| 662 | trace_afs_rotate(op, reason: afs_rotate_trace_retry_server, extra: 1); |
| 663 | goto retry_server; |
| 664 | case 0: |
| 665 | trace_afs_rotate(op, reason: afs_rotate_trace_retry_server, extra: 0); |
| 666 | goto restart_from_beginning; |
| 667 | case -ERESTARTSYS: |
| 668 | afs_op_set_error(op, error); |
| 669 | goto failed; |
| 670 | case -ETIME: |
| 671 | case -EDESTADDRREQ: |
| 672 | goto next_server; |
| 673 | } |
| 674 | } |
| 675 | |
| 676 | next_server: |
| 677 | trace_afs_rotate(op, reason: afs_rotate_trace_next_server, extra: 0); |
| 678 | _debug("next" ); |
| 679 | op->estate = NULL; |
| 680 | goto pick_server; |
| 681 | |
| 682 | no_more_servers: |
| 683 | /* That's all the servers poked to no good effect. Try again if some |
| 684 | * of them were busy. |
| 685 | */ |
| 686 | trace_afs_rotate(op, reason: afs_rotate_trace_no_more_servers, extra: 0); |
| 687 | if (op->flags & AFS_OPERATION_VBUSY) { |
| 688 | afs_sleep_and_retry(op); |
| 689 | op->flags &= ~AFS_OPERATION_VBUSY; |
| 690 | goto restart_from_beginning; |
| 691 | } |
| 692 | |
| 693 | rcu_read_lock(); |
| 694 | for (i = 0; i < op->server_list->nr_servers; i++) { |
| 695 | struct afs_endpoint_state *estate; |
| 696 | |
| 697 | estate = op->server_states[i].endpoint_state; |
| 698 | error = READ_ONCE(estate->error); |
| 699 | if (error < 0) |
| 700 | afs_op_accumulate_error(op, error, abort_code: estate->abort_code); |
| 701 | } |
| 702 | rcu_read_unlock(); |
| 703 | |
| 704 | failed: |
| 705 | trace_afs_rotate(op, reason: afs_rotate_trace_failed, extra: 0); |
| 706 | op->flags |= AFS_OPERATION_STOP; |
| 707 | op->estate = NULL; |
| 708 | _leave(" = f [failed %d]" , afs_op_error(op)); |
| 709 | return false; |
| 710 | } |
| 711 | |
| 712 | /* |
| 713 | * Dump cursor state in the case of the error being EDESTADDRREQ. |
| 714 | */ |
| 715 | void afs_dump_edestaddrreq(const struct afs_operation *op) |
| 716 | { |
| 717 | static int count; |
| 718 | int i; |
| 719 | |
| 720 | if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3) |
| 721 | return; |
| 722 | count++; |
| 723 | |
| 724 | rcu_read_lock(); |
| 725 | |
| 726 | pr_notice("EDESTADDR occurred\n" ); |
| 727 | pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n" , |
| 728 | op->file[0].cb_break_before, |
| 729 | op->file[1].cb_break_before, op->flags, op->cumul_error.error); |
| 730 | pr_notice("OP: ut=%lx ix=%d ni=%u\n" , |
| 731 | op->untried_servers, op->server_index, op->nr_iterations); |
| 732 | pr_notice("OP: call er=%d ac=%d r=%u\n" , |
| 733 | op->call_error, op->call_abort_code, op->call_responded); |
| 734 | |
| 735 | if (op->server_list) { |
| 736 | const struct afs_server_list *sl = op->server_list; |
| 737 | |
| 738 | pr_notice("FC: SL nr=%u vnov=%hx\n" , |
| 739 | sl->nr_servers, sl->vnovol_mask); |
| 740 | for (i = 0; i < sl->nr_servers; i++) { |
| 741 | const struct afs_server *s = sl->servers[i].server; |
| 742 | const struct afs_endpoint_state *e = |
| 743 | rcu_dereference(s->endpoint_state); |
| 744 | const struct afs_addr_list *a = e->addresses; |
| 745 | |
| 746 | pr_notice("FC: server fl=%lx av=%u %pU\n" , |
| 747 | s->flags, s->addr_version, &s->uuid); |
| 748 | pr_notice("FC: - pq=%x R=%lx F=%lx\n" , |
| 749 | e->probe_seq, e->responsive_set, e->failed_set); |
| 750 | if (a) { |
| 751 | pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n" , |
| 752 | a->version, |
| 753 | a->nr_ipv4, a->nr_addrs, a->max_addrs, |
| 754 | a->preferred); |
| 755 | if (a == e->addresses) |
| 756 | pr_notice("FC: - current\n" ); |
| 757 | } |
| 758 | } |
| 759 | } |
| 760 | |
| 761 | pr_notice("AC: t=%lx ax=%d\n" , op->addr_tried, op->addr_index); |
| 762 | rcu_read_unlock(); |
| 763 | } |
| 764 | |