// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

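/*
 * MCS handoff primitives: a contended waiter spins with acquire ordering on
 * its own @locked word until its predecessor hands the lock over with a
 * store-release; the _label variant jumps to @label if the wait has to be
 * aborted.
 */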
#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors. Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
        union {
                atomic_t val;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
                struct {
                        u8 locked;
                        u8 pending;
                };
                struct {
                        u16 locked_pending;
                        u16 tail;
                };
#else
                struct {
                        u16 tail;
                        u16 locked_pending;
                };
                struct {
                        u8 reserved[2];
                        u8 pending;
                        u8 locked;
                };
#endif
        };
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

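/*
 * Per-waiter MCS queue node: @next links to the next waiter in the queue,
 * @locked is set by the predecessor when the lock is handed over, and @count
 * (meaningful only on a CPU's first node) tracks how many of that CPU's
 * qnodes are in use across nesting contexts.
 */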
struct arena_mcs_spinlock {
        struct arena_mcs_spinlock __arena *next;
        int locked;
        int count;
};

struct arena_qnode {
        struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES 4
#define _Q_PENDING_LOOPS 1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS 1024

#define _Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				      << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)

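/*
 * Statically allocated MCS queue nodes in the arena: _Q_MAX_NODES nodes per
 * CPU, one per nesting level (task, softirq, hardirq, NMI).
 */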
struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

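/**
 * encode_tail - Pack a CPU number and node index into a tail code word
 * @cpu: CPU number that owns the queue node
 * @idx: Per-CPU node index (nesting level), assumed to be < _Q_MAX_NODES
 *
 * The CPU is stored as cpu + 1 so that a tail of 0 means "no tail". For
 * example, cpu=2, idx=1 yields (3 << _Q_TAIL_CPU_OFFSET) | (1 << _Q_TAIL_IDX_OFFSET).
 */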
static inline u32 encode_tail(int cpu, int idx)
{
        u32 tail;

        tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
        tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

        return tail;
}

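/**
 * decode_tail - Map a tail code word back to its arena MCS node
 * @tail: Tail code word previously produced by encode_tail()
 *
 * Inverse of encode_tail(): recovers the CPU number (stored as cpu + 1) and
 * the node index, and returns the matching entry in the qnodes array.
 */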
static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
        u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
        u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

        return &qnodes[cpu][idx].mcs;
}

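/**
 * grab_mcs_node - Return the idx-th MCS node of the current CPU
 * @base: Pointer to this CPU's first (idx 0) MCS node
 * @idx: Nesting index within the per-CPU qnodes array
 */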
static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
        return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
        u32 old, new;

        old = atomic_read(&lock->val);
        do {
                new = (old & _Q_LOCKED_PENDING_MASK) | tail;
                /*
                 * We can use relaxed semantics since the caller ensures that
                 * the MCS node is properly initialized before updating the
                 * tail.
                 */
                /* These loops are not expected to stall, but we still need to
                 * prove to the verifier they will terminate eventually.
                 */
                cond_break_label(out);
        } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

        return old;
out:
        bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
        return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
        WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
        WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
        WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

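/**
 * arena_fetch_set_pending_acquire - Set the pending bit and return the old value
 * @lock: Pointer to queued spinlock structure
 * Return: The previous lock value
 *
 * *,*,* -> *,1,* (acquire ordering on the successful cmpxchg)
 */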
static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
        u32 old, new;

        old = atomic_read(&lock->val);
        do {
                new = old | _Q_PENDING_VAL;
                /*
                 * These loops are not expected to stall, but we still need to
                 * prove to the verifier they will terminate eventually.
                 */
                cond_break_label(out);
        } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

        return old;
out:
        bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
        return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
        int val = atomic_read(&lock->val);

        if (unlikely(val))
                return 0;

        return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

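/**
 * arena_spin_lock_slowpath - acquire the queued spinlock on the slow path
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 * Return: 0 on success, -EBUSY if the per-CPU queue nodes are exhausted, or
 *         -ETIMEDOUT if a wait loop had to be aborted
 */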
__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
        struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
        int ret = -ETIMEDOUT;
        u32 old, tail;
        int idx;

        /*
         * Wait for in-progress pending->locked hand-overs with a bounded
         * number of spins so that we guarantee forward progress.
         *
         * 0,1,0 -> 0,0,1
         */
        if (val == _Q_PENDING_VAL) {
                int cnt = _Q_PENDING_LOOPS;
                val = atomic_cond_read_relaxed_label(&lock->val,
                                                     (VAL != _Q_PENDING_VAL) || !cnt--,
                                                     release_err);
        }

        /*
         * If we observe any contention; queue.
         */
        if (val & ~_Q_LOCKED_MASK)
                goto queue;

        /*
         * trylock || pending
         *
         * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
         */
        val = arena_fetch_set_pending_acquire(lock);

        /*
         * If we observe contention, there is a concurrent locker.
         *
         * Undo and queue; our setting of PENDING might have made the
         * n,0,0 -> 0,0,0 transition fail and it will now be waiting
         * on @next to become !NULL.
         */
        if (unlikely(val & ~_Q_LOCKED_MASK)) {

                /* Undo PENDING if we set it. */
                if (!(val & _Q_PENDING_MASK))
                        clear_pending(lock);

                goto queue;
        }

        /*
         * We're pending, wait for the owner to go away.
         *
         * 0,1,1 -> *,1,0
         *
         * this wait loop must be a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because not all
         * clear_pending_set_locked() implementations imply full
         * barriers.
         */
        if (val & _Q_LOCKED_MASK)
                smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

        /*
         * take ownership and clear the pending bit.
         *
         * 0,1,0 -> 0,0,1
         */
        clear_pending_set_locked(lock);
        return 0;

        /*
         * End of pending bit optimistic spinning and beginning of MCS
         * queuing.
         */
queue:
        node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
        idx = node0->count++;
        tail = encode_tail(bpf_get_smp_processor_id(), idx);

        /*
         * 4 nodes are allocated based on the assumption that there will not be
         * nested NMIs taking spinlocks. That may not be true in some
         * architectures even though the chance of needing more than 4 nodes
         * will still be extremely unlikely. When that happens, we simply return
         * an error. Original qspinlock has a trylock fallback in this case.
         */
        if (unlikely(idx >= _Q_MAX_NODES)) {
                ret = -EBUSY;
                goto release_node_err;
        }

        node = grab_mcs_node(node0, idx);

        /*
         * Ensure that we increment the head node->count before initialising
         * the actual node. If the compiler is kind enough to reorder these
         * stores, then an IRQ could overwrite our assignments.
         */
        barrier();

        node->locked = 0;
        node->next = NULL;

        /*
         * We touched a (possibly) cold cacheline in the per-cpu queue node;
         * attempt the trylock once more in the hope someone let go while we
         * weren't watching.
         */
        if (arena_spin_trylock(lock))
                goto release;

        /*
         * Ensure that the initialisation of @node is complete before we
         * publish the updated tail via xchg_tail() and potentially link
         * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
         */
        smp_wmb();

        /*
         * Publish the updated tail.
         * We have already touched the queueing cacheline; don't bother with
         * pending stuff.
         *
         * p,*,* -> n,*,*
         */
        old = xchg_tail(lock, tail);
        next = NULL;

        /*
         * if there was a previous node; link it and wait until reaching the
         * head of the waitqueue.
         */
        if (old & _Q_TAIL_MASK) {
                prev = decode_tail(old);

                /* Link @node into the waitqueue. */
                WRITE_ONCE(prev->next, node);

                arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

                /*
                 * While waiting for the MCS lock, the next pointer may have
                 * been set by another lock waiter. We cannot prefetch here
                 * due to lack of equivalent instruction in BPF ISA.
                 */
                next = READ_ONCE(node->next);
        }

        /*
         * we're at the head of the waitqueue, wait for the owner & pending to
         * go away.
         *
         * *,x,y -> *,0,0
         *
         * this wait loop must use a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because the set_locked() function below
         * does not imply a full barrier.
         */
        val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
                                             release_node_err);

        /*
         * claim the lock:
         *
         * n,0,0 -> 0,0,1 : lock, uncontended
         * *,*,0 -> *,*,1 : lock, contended
         *
         * If the queue head is the only one in the queue (lock value == tail)
         * and nobody is pending, clear the tail code and grab the lock.
         * Otherwise, we only need to grab the lock.
         */

        /*
         * In the PV case we might already have _Q_LOCKED_VAL set, because
         * of lock stealing; therefore we must also allow:
         *
         * n,0,1 -> 0,0,1
         *
         * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
         * above wait condition, therefore any concurrent setting of
         * PENDING will make the uncontended transition fail.
         */
        if ((val & _Q_TAIL_MASK) == tail) {
                if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
                        goto release; /* No contention */
        }

        /*
         * Either somebody is queued behind us or _Q_PENDING_VAL got set
         * which will then detect the remaining tail and queue behind us
         * ensuring we'll see a @next.
         */
        set_locked(lock);

        /*
         * contended path; wait for next if not observed yet, release.
         */
        if (!next)
                next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

        arch_mcs_spin_unlock_contended(&next->locked);

release:;
        /*
         * release the node
         *
         * Doing a normal dec vs this_cpu_dec is fine. An upper context always
         * decrements the count it incremented before returning, so we're fine.
         * Contexts that interrupt us either observe our decrement or they don't.
         * Just make sure the compiler doesn't reorder this statement, as
         * this_cpu_dec would have implied that ordering.
         */
        barrier();
        node0->count--;
        return 0;
release_node_err:
        barrier();
        node0->count--;
        goto release_err;
release_err:
        return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, the returned value will be negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * rather than checked for a negative value, in order to satisfy
 * the BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 * or
 *	if (arena_spin_lock(...) == 0) // success
 * or
 *	if (arena_spin_lock(...)) // failure
 * or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
        int val = 0;

        if (CONFIG_NR_CPUS > 1024)
                return -EOPNOTSUPP;

        bpf_preempt_disable();
        if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
                return 0;

        val = arena_spin_lock_slowpath(lock, val);
        /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
        if (val)
                bpf_preempt_enable();
        return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
        /*
         * unlock() needs release semantics:
         */
        smp_store_release(&lock->locked, 0);
        bpf_preempt_enable();
}

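/*
 * IRQ-safe variants: arena_spin_lock_irqsave() disables local IRQs before
 * taking the lock and evaluates to the lock's return code; on failure the
 * saved IRQ state is restored before the error is returned.
 * arena_spin_unlock_irqrestore() releases the lock and then restores the
 * saved IRQ state.
 */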
#define arena_spin_lock_irqsave(lock, flags) \
        ({ \
                int __ret; \
                bpf_local_irq_save(&(flags)); \
                __ret = arena_spin_lock((lock)); \
                if (__ret) \
                        bpf_local_irq_restore(&(flags)); \
                (__ret); \
        })

#define arena_spin_unlock_irqrestore(lock, flags) \
        ({ \
                arena_spin_unlock((lock)); \
                bpf_local_irq_restore(&(flags)); \
        })
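
/*
 * Example usage (an illustrative sketch, not compiled as part of this
 * header): a BPF program serializing updates to a shared arena counter.
 * The arena map name, the placement of the lock and counter, and the
 * program section are assumptions made for the example; adapt them to the
 * actual program. Note that this header's contents are only available when
 * ENABLE_ATOMICS_TESTS and __BPF_FEATURE_ADDR_SPACE_CAST are defined.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_ARENA);
 *		__uint(map_flags, BPF_F_MMAPABLE);
 *		__uint(max_entries, 1);
 *	} arena SEC(".maps");
 *
 *	arena_spinlock_t __arena lock;
 *	int __arena counter;
 *
 *	SEC("tc")
 *	int update_counter(void *ctx)
 *	{
 *		unsigned long flags;
 *
 *		if (arena_spin_lock_irqsave(&lock, flags))
 *			return 0;	// lock not acquired (e.g. -EBUSY)
 *		counter++;
 *		arena_spin_unlock_irqrestore(&lock, flags);
 *		return 0;
 *	}
 */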

#endif /* ENABLE_ATOMICS_TESTS && __BPF_FEATURE_ADDR_SPACE_CAST */

#endif /* BPF_ARENA_SPIN_LOCK_H */