| 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
| 2 | /* RACK-TLP [RFC8958] Implementation |
| 3 | * |
| 4 | * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. |
| 5 | * Written by David Howells (dhowells@redhat.com) |
| 6 | */ |
| 7 | |
| 8 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 9 | |
| 10 | #include "ar-internal.h" |
| 11 | |
| 12 | static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1, |
| 13 | ktime_t t2, rxrpc_seq_t seq2) |
| 14 | { |
| 15 | if (ktime_after(cmp1: t1, cmp2: t2)) |
| 16 | return true; |
| 17 | return t1 == t2 && after(seq1, seq2); |
| 18 | } |
| 19 | |
| 20 | /* |
| 21 | * Mark a packet lost. |
| 22 | */ |
| 23 | static void rxrpc_rack_mark_lost(struct rxrpc_call *call, |
| 24 | struct rxrpc_txqueue *tq, unsigned int ix) |
| 25 | { |
| 26 | if (__test_and_set_bit(ix, &tq->segment_lost)) { |
| 27 | if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) |
| 28 | call->tx_nr_resent--; |
| 29 | } else { |
| 30 | call->tx_nr_lost++; |
| 31 | } |
| 32 | tq->segment_xmit_ts[ix] = UINT_MAX; |
| 33 | } |
| 34 | |
| 35 | /* |
| 36 | * Get the transmission time of a packet in the Tx queue. |
| 37 | */ |
| 38 | static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix) |
| 39 | { |
| 40 | if (tq->segment_xmit_ts[ix] == UINT_MAX) |
| 41 | return KTIME_MAX; |
| 42 | return ktime_add_us(kt: tq->xmit_ts_base, usec: tq->segment_xmit_ts[ix]); |
| 43 | } |
| 44 | |
| 45 | /* |
| 46 | * Get a bitmask of nack bits for a queue segment and mask off any that aren't |
| 47 | * yet reported. |
| 48 | */ |
| 49 | static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq) |
| 50 | { |
| 51 | unsigned long nacks = ~tq->segment_acked; |
| 52 | |
| 53 | if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE) |
| 54 | nacks &= (1UL << tq->nr_reported_acks) - 1; |
| 55 | return nacks; |
| 56 | } |
| 57 | |
| 58 | /* |
| 59 | * Update the RACK state for the most recently sent packet that has been |
| 60 | * delivered [RFC8958 6.2 Step 2]. |
| 61 | */ |
| 62 | static void rxrpc_rack_update(struct rxrpc_call *call, |
| 63 | struct rxrpc_ack_summary *summary, |
| 64 | struct rxrpc_txqueue *tq, |
| 65 | unsigned int ix) |
| 66 | { |
| 67 | rxrpc_seq_t seq = tq->qbase + ix; |
| 68 | ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); |
| 69 | ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts); |
| 70 | |
| 71 | if (__test_and_clear_bit(ix, &tq->segment_lost)) |
| 72 | call->tx_nr_lost--; |
| 73 | |
| 74 | if (test_bit(ix, &tq->segment_retransmitted)) { |
| 75 | /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */ |
| 76 | if (before(seq1: call->acks_highest_serial, seq2: tq->segment_serial[ix])) |
| 77 | return; |
| 78 | if (rtt < minmax_get(m: &call->min_rtt)) |
| 79 | return; |
| 80 | } |
| 81 | |
| 82 | /* The RACK algorithm requires the segment ACKs to be traversed in |
| 83 | * order of segment transmission - but the only thing this seems to |
| 84 | * matter for is that RACK.rtt is set to the rtt of the most recently |
| 85 | * transmitted segment. We should be able to achieve the same by only |
| 86 | * setting RACK.rtt if the xmit time is greater. |
| 87 | */ |
| 88 | if (ktime_after(cmp1: xmit_ts, cmp2: call->rack_rtt_ts)) { |
| 89 | call->rack_rtt = rtt; |
| 90 | call->rack_rtt_ts = xmit_ts; |
| 91 | } |
| 92 | |
| 93 | if (rxrpc_rack_sent_after(t1: xmit_ts, seq1: seq, t2: call->rack_xmit_ts, seq2: call->rack_end_seq)) { |
| 94 | call->rack_rtt = rtt; |
| 95 | call->rack_xmit_ts = xmit_ts; |
| 96 | call->rack_end_seq = seq; |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | /* |
| 101 | * Detect data segment reordering [RFC8958 6.2 Step 3]. |
| 102 | */ |
| 103 | static void rxrpc_rack_detect_reordering(struct rxrpc_call *call, |
| 104 | struct rxrpc_ack_summary *summary, |
| 105 | struct rxrpc_txqueue *tq, |
| 106 | unsigned int ix) |
| 107 | { |
| 108 | rxrpc_seq_t seq = tq->qbase + ix; |
| 109 | |
| 110 | /* Track the highest sequence number so far ACK'd. This is not |
| 111 | * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer |
| 112 | * could put a NACK in the last SACK slot. |
| 113 | */ |
| 114 | if (after(seq1: seq, seq2: call->rack_fack)) |
| 115 | call->rack_fack = seq; |
| 116 | else if (before(seq1: seq, seq2: call->rack_fack) && |
| 117 | test_bit(ix, &tq->segment_retransmitted)) |
| 118 | call->rack_reordering_seen = true; |
| 119 | } |
| 120 | |
/*
 * Process one newly ACK'd segment: update the RACK state [RFC8958 6.2 Step 2]
 * and then check for segment reordering [Step 3].
 */
void rxrpc_input_rack_one(struct rxrpc_call *call,
			  struct rxrpc_ack_summary *summary,
			  struct rxrpc_txqueue *tq,
			  unsigned int ix)
{
	rxrpc_rack_update(call, summary, tq, ix);
	rxrpc_rack_detect_reordering(call, summary, tq, ix);
}
| 129 | |
void rxrpc_input_rack(struct rxrpc_call *call,
		      struct rxrpc_ack_summary *summary,
		      struct rxrpc_txqueue *tq,
		      unsigned long new_acks)
{
	unsigned long bits = new_acks;

	/* Feed each newly-ACK'd segment into the RACK machinery, lowest
	 * segment index first.  bits &= bits - 1 clears the lowest set bit.
	 */
	for (; bits; bits &= bits - 1)
		rxrpc_input_rack_one(call, summary, tq, __ffs(bits));

	trace_rxrpc_rack_update(call, summary);
}
| 144 | |
| 145 | /* |
| 146 | * Update the reordering window [RFC8958 6.2 Step 4]. Returns the updated |
| 147 | * duration of the reordering window. |
| 148 | * |
| 149 | * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can |
| 150 | * be given a 'DUPLICATE' reason with the serial number referring to the |
| 151 | * duplicated DATA packet. Rx does not inform as to whether this was a |
| 152 | * reception of the same packet twice or of a retransmission of a packet we |
| 153 | * already received (though this could be determined by the transmitter based |
| 154 | * on the serial number). |
| 155 | */ |
| 156 | static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call, |
| 157 | struct rxrpc_ack_summary *summary) |
| 158 | { |
| 159 | rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ |
| 160 | rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */ |
| 161 | bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE; |
| 162 | int dup_thresh = 3; |
| 163 | |
| 164 | /* DSACK-based reordering window adaptation */ |
| 165 | if (!call->rack_dsack_round_none && |
| 166 | after_eq(seq1: snd_una, seq2: call->rack_dsack_round)) |
| 167 | call->rack_dsack_round_none = true; |
| 168 | |
| 169 | /* Grow the reordering window per round that sees DSACK. Reset the |
| 170 | * window after 16 DSACK-free recoveries. |
| 171 | */ |
| 172 | if (call->rack_dsack_round_none && have_dsack_option) { |
| 173 | call->rack_dsack_round_none = false; |
| 174 | call->rack_dsack_round = snd_nxt; |
| 175 | call->rack_reo_wnd_mult++; |
| 176 | call->rack_reo_wnd_persist = 16; |
| 177 | } else if (summary->exiting_fast_or_rto_recovery) { |
| 178 | call->rack_reo_wnd_persist--; |
| 179 | if (call->rack_reo_wnd_persist <= 0) |
| 180 | call->rack_reo_wnd_mult = 1; |
| 181 | } |
| 182 | |
| 183 | if (!call->rack_reordering_seen) { |
| 184 | if (summary->in_fast_or_rto_recovery) |
| 185 | return 0; |
| 186 | if (call->acks_nr_sacks >= dup_thresh) |
| 187 | return 0; |
| 188 | } |
| 189 | |
| 190 | return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4, |
| 191 | call->srtt_us >> 3)); |
| 192 | } |
| 193 | |
| 194 | /* |
| 195 | * Detect losses [RFC8958 6.2 Step 5]. |
| 196 | */ |
| 197 | static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call, |
| 198 | struct rxrpc_ack_summary *summary) |
| 199 | { |
| 200 | struct rxrpc_txqueue *tq; |
| 201 | ktime_t timeout = 0, lost_after, now = ktime_get_real(); |
| 202 | |
| 203 | call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary); |
| 204 | lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); |
| 205 | trace_rxrpc_rack_scan_loss(call); |
| 206 | |
| 207 | for (tq = call->tx_queue; tq; tq = tq->next) { |
| 208 | unsigned long nacks = rxrpc_tq_nacks(tq); |
| 209 | |
| 210 | if (after(seq1: tq->qbase, seq2: call->tx_transmitted)) |
| 211 | break; |
| 212 | trace_rxrpc_rack_scan_loss_tq(call, tq, nacks); |
| 213 | |
| 214 | /* Skip ones marked lost but not yet retransmitted */ |
| 215 | nacks &= ~tq->segment_lost | tq->segment_retransmitted; |
| 216 | |
| 217 | while (nacks) { |
| 218 | unsigned int ix = __ffs(nacks); |
| 219 | rxrpc_seq_t seq = tq->qbase + ix; |
| 220 | ktime_t remaining; |
| 221 | ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); |
| 222 | |
| 223 | __clear_bit(ix, &nacks); |
| 224 | |
| 225 | if (rxrpc_rack_sent_after(t1: call->rack_xmit_ts, seq1: call->rack_end_seq, |
| 226 | t2: xmit_ts, seq2: seq)) { |
| 227 | remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now); |
| 228 | if (remaining <= 0) { |
| 229 | rxrpc_rack_mark_lost(call, tq, ix); |
| 230 | trace_rxrpc_rack_detect_loss(call, summary, seq); |
| 231 | } else { |
| 232 | timeout = max(remaining, timeout); |
| 233 | } |
| 234 | } |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | return timeout; |
| 239 | } |
| 240 | |
| 241 | /* |
| 242 | * Detect losses and set a timer to retry the detection [RFC8958 6.2 Step 5]. |
| 243 | */ |
| 244 | void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, |
| 245 | struct rxrpc_ack_summary *summary) |
| 246 | { |
| 247 | ktime_t timeout = rxrpc_rack_detect_loss(call, summary); |
| 248 | |
| 249 | if (timeout) { |
| 250 | call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER; |
| 251 | call->rack_timo_at = ktime_add(ktime_get_real(), timeout); |
| 252 | trace_rxrpc_rack_timer(call, delay: timeout, exp: false); |
| 253 | trace_rxrpc_timer_set(call, delay: timeout, why: rxrpc_timer_trace_rack_reo); |
| 254 | } |
| 255 | } |
| 256 | |
| 257 | /* |
| 258 | * Handle RACK-TLP RTO expiration [RFC8958 6.3]. |
| 259 | */ |
| 260 | static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call) |
| 261 | { |
| 262 | struct rxrpc_txqueue *tq; |
| 263 | rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ |
| 264 | ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); |
| 265 | ktime_t deadline = ktime_sub(ktime_get_real(), lost_after); |
| 266 | |
| 267 | for (tq = call->tx_queue; tq; tq = tq->next) { |
| 268 | unsigned long unacked = ~tq->segment_acked; |
| 269 | |
| 270 | trace_rxrpc_rack_mark_loss_tq(call, tq); |
| 271 | while (unacked) { |
| 272 | unsigned int ix = __ffs(unacked); |
| 273 | rxrpc_seq_t seq = tq->qbase + ix; |
| 274 | ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); |
| 275 | |
| 276 | if (after(seq1: seq, seq2: call->tx_transmitted)) |
| 277 | return; |
| 278 | __clear_bit(ix, &unacked); |
| 279 | |
| 280 | if (seq == snd_una || |
| 281 | ktime_before(cmp1: xmit_ts, cmp2: deadline)) |
| 282 | rxrpc_rack_mark_lost(call, tq, ix); |
| 283 | } |
| 284 | } |
| 285 | } |
| 286 | |
| 287 | /* |
| 288 | * Calculate the TLP loss probe timeout (PTO) [RFC8958 7.2]. |
| 289 | */ |
| 290 | ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now) |
| 291 | { |
| 292 | unsigned int flight_size = rxrpc_tx_in_flight(call); |
| 293 | ktime_t rto_at = ktime_add(call->tx_last_sent, |
| 294 | rxrpc_get_rto_backoff(call, false)); |
| 295 | ktime_t pto; |
| 296 | |
| 297 | if (call->rtt_count > 0) { |
| 298 | /* Use 2*SRTT as the timeout. */ |
| 299 | pto = ns_to_ktime(ns: call->srtt_us * NSEC_PER_USEC / 4); |
| 300 | if (flight_size) |
| 301 | pto = ktime_add(pto, call->tlp_max_ack_delay); |
| 302 | } else { |
| 303 | pto = NSEC_PER_SEC; |
| 304 | } |
| 305 | |
| 306 | if (ktime_after(ktime_add(now, pto), cmp2: rto_at)) |
| 307 | pto = ktime_sub(rto_at, now); |
| 308 | return pto; |
| 309 | } |
| 310 | |
| 311 | /* |
| 312 | * Send a TLP loss probe on PTO expiration [RFC8958 7.3]. |
| 313 | */ |
| 314 | void rxrpc_tlp_send_probe(struct rxrpc_call *call) |
| 315 | { |
| 316 | unsigned int in_flight = rxrpc_tx_in_flight(call); |
| 317 | |
| 318 | if (after_eq(seq1: call->acks_hard_ack, seq2: call->tx_transmitted)) |
| 319 | return; /* Everything we transmitted has been acked. */ |
| 320 | |
| 321 | /* There must be no other loss probe still in flight and we need to |
| 322 | * have taken a new RTT sample since last probe or the start of |
| 323 | * connection. |
| 324 | */ |
| 325 | if (!call->tlp_serial && |
| 326 | call->tlp_rtt_taken != call->rtt_taken) { |
| 327 | call->tlp_is_retrans = false; |
| 328 | if (after(seq1: call->send_top, seq2: call->tx_transmitted) && |
| 329 | rxrpc_tx_window_space(call) > 0) { |
| 330 | /* Transmit the lowest-sequence unsent DATA */ |
| 331 | call->tx_last_serial = 0; |
| 332 | rxrpc_transmit_some_data(call, limit: 1, trace: rxrpc_txdata_tlp_new_data); |
| 333 | call->tlp_serial = call->tx_last_serial; |
| 334 | call->tlp_seq = call->tx_transmitted; |
| 335 | trace_rxrpc_tlp_probe(call, trace: rxrpc_tlp_probe_trace_transmit_new); |
| 336 | in_flight = rxrpc_tx_in_flight(call); |
| 337 | } else { |
| 338 | /* Retransmit the highest-sequence DATA sent */ |
| 339 | call->tx_last_serial = 0; |
| 340 | rxrpc_resend_tlp(call); |
| 341 | call->tlp_is_retrans = true; |
| 342 | trace_rxrpc_tlp_probe(call, trace: rxrpc_tlp_probe_trace_retransmit); |
| 343 | } |
| 344 | } else { |
| 345 | trace_rxrpc_tlp_probe(call, trace: rxrpc_tlp_probe_trace_busy); |
| 346 | } |
| 347 | |
| 348 | if (in_flight != 0) { |
| 349 | ktime_t rto = rxrpc_get_rto_backoff(call, retrans: false); |
| 350 | |
| 351 | call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO; |
| 352 | call->rack_timo_at = ktime_add(ktime_get_real(), rto); |
| 353 | trace_rxrpc_rack_timer(call, delay: rto, exp: false); |
| 354 | trace_rxrpc_timer_set(call, delay: rto, why: rxrpc_timer_trace_rack_rto); |
| 355 | } |
| 356 | } |
| 357 | |
| 358 | /* |
| 359 | * Detect losses using the ACK of a TLP loss probe [RFC8958 7.4]. |
| 360 | */ |
| 361 | void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary) |
| 362 | { |
| 363 | if (!call->tlp_serial || after(seq1: call->tlp_seq, seq2: call->acks_hard_ack)) |
| 364 | return; |
| 365 | |
| 366 | if (!call->tlp_is_retrans) { |
| 367 | /* TLP of new data delivered */ |
| 368 | trace_rxrpc_tlp_ack(call, summary, trace: rxrpc_tlp_ack_trace_new_data); |
| 369 | call->tlp_serial = 0; |
| 370 | } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE && |
| 371 | summary->acked_serial == call->tlp_serial) { |
| 372 | /* General Case: Detected packet losses using RACK [7.4.1] */ |
| 373 | trace_rxrpc_tlp_ack(call, summary, trace: rxrpc_tlp_ack_trace_dup_acked); |
| 374 | call->tlp_serial = 0; |
| 375 | } else if (after(seq1: call->acks_hard_ack, seq2: call->tlp_seq)) { |
| 376 | /* Repaired the single loss */ |
| 377 | trace_rxrpc_tlp_ack(call, summary, trace: rxrpc_tlp_ack_trace_hard_beyond); |
| 378 | call->tlp_serial = 0; |
| 379 | // TODO: Invoke congestion control to react to the loss |
| 380 | // event the probe has repaired |
| 381 | } else if (summary->tlp_probe_acked) { |
| 382 | trace_rxrpc_tlp_ack(call, summary, trace: rxrpc_tlp_ack_trace_acked); |
| 383 | /* Special Case: Detected a single loss repaired by the loss |
| 384 | * probe [7.4.2] |
| 385 | */ |
| 386 | call->tlp_serial = 0; |
| 387 | } else { |
| 388 | trace_rxrpc_tlp_ack(call, summary, trace: rxrpc_tlp_ack_trace_incomplete); |
| 389 | } |
| 390 | } |
| 391 | |
| 392 | /* |
| 393 | * Handle RACK timer expiration; returns true to request a resend. |
| 394 | */ |
| 395 | void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by) |
| 396 | { |
| 397 | struct rxrpc_ack_summary summary = {}; |
| 398 | enum rxrpc_rack_timer_mode mode = call->rack_timer_mode; |
| 399 | |
| 400 | trace_rxrpc_rack_timer(call, delay: overran_by, exp: true); |
| 401 | call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; |
| 402 | |
| 403 | switch (mode) { |
| 404 | case RXRPC_CALL_RACKTIMER_RACK_REORDER: |
| 405 | rxrpc_rack_detect_loss_and_arm_timer(call, summary: &summary); |
| 406 | break; |
| 407 | case RXRPC_CALL_RACKTIMER_TLP_PTO: |
| 408 | rxrpc_tlp_send_probe(call); |
| 409 | break; |
| 410 | case RXRPC_CALL_RACKTIMER_RTO: |
| 411 | // Might need to poke the congestion algo in some way |
| 412 | rxrpc_rack_mark_losses_on_rto(call); |
| 413 | break; |
| 414 | //case RXRPC_CALL_RACKTIMER_ZEROWIN: |
| 415 | default: |
| 416 | pr_warn("Unexpected rack timer %u" , call->rack_timer_mode); |
| 417 | } |
| 418 | } |
| 419 | |