/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A demo sched_ext flattened cgroup hierarchy scheduler. It implements
 * hierarchical weight-based cgroup CPU control by flattening the cgroup
 * hierarchy into a single layer by compounding the active weight share at each
 * level. Consider the following hierarchy with weights in parentheses:
 *
 * R + A (100) + B (100)
 *   |          \ C (100)
 *   \ D (200)
 *
 * Ignoring the root and threaded cgroups, only B, C and D can contain tasks.
 * Let's say all three have runnable tasks. The total share that each of these
 * three cgroups is entitled to can be calculated by compounding its share at
 * each level.
 *
 * For example, B is competing against C and in that competition its share is
 * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's
 * share in that competition is 100/(200+100) == 1/3. B's eventual share in the
 * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's
 * eventual share is the same at 1/6. D is only competing at the top level and
 * its share is 200/(100+200) == 2/3.
 *
 * So, instead of hierarchically scheduling level-by-level, we can consider it
 * as B, C and D competing with each other with respective shares of 1/6, 1/6
 * and 2/3 and keep updating the eventual shares as the cgroups' runnable states
 * change.
 *
 * This flattening of the hierarchy can bring a substantial performance gain
 * when the cgroup hierarchy is nested multiple levels. In a simple benchmark
 * using wrk[8] on apache serving a CGI script calculating sha1sum of a small
 * file, it outperforms CFS by ~3% with CPU controller disabled and by ~10% with
 * two apache instances competing with 2:1 weight ratio nested four levels deep.
 *
 * However, the gain comes at the cost of not being able to properly handle
 * thundering herds of cgroups. For example, if many cgroups which are nested
 * behind a low priority parent cgroup wake up around the same time, they may be
 * able to consume more CPU cycles than they are entitled to. In many use cases,
 * this isn't a real concern especially given the performance gain. Also, there
 * are ways to mitigate the problem further by e.g. introducing an extra
 * scheduling layer on cgroup delegation boundaries.
 *
 * The scheduler first picks the cgroup to run and then schedules the tasks
 * within by using nested weighted vtime scheduling by default. The
 * cgroup-internal scheduling can be switched to FIFO with the -f option.
 */
#include <scx/common.bpf.h>
#include "scx_flatcg.h"

/*
 * Maximum amount of retries to find a valid cgroup.
 */
enum {
	FALLBACK_DSQ = 0,
	CGROUP_MAX_RETRIES = 1024,
};

char _license[] SEC("license") = "GPL";

const volatile u32 nr_cpus = 32;	/* !0 for veristat, set during init */
const volatile u64 cgrp_slice_ns;
const volatile bool fifo_sched;

u64 cvtime_now;
UEI_DEFINE(uei);

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, u64);
	__uint(max_entries, FCG_NR_STATS);
} stats SEC(".maps");

static void stat_inc(enum fcg_stat_idx idx)
{
	u32 idx_v = idx;

	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);
	if (cnt_p)
		(*cnt_p)++;
}

struct fcg_cpu_ctx {
	u64 cur_cgid;
	u64 cur_at;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct fcg_cpu_ctx);
	__uint(max_entries, 1);
} cpu_ctx SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct fcg_cgrp_ctx);
} cgrp_ctx SEC(".maps");

struct cgv_node {
	struct bpf_rb_node	rb_node;
	__u64			cvtime;
	__u64			cgid;
};

private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock;
private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node);

struct cgv_node_stash {
	struct cgv_node __kptr *node;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384);
	__type(key, __u64);
	__type(value, struct cgv_node_stash);
} cgv_node_stash SEC(".maps");

struct fcg_task_ctx {
	u64 bypassed_at;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct fcg_task_ctx);
} task_ctx SEC(".maps");

/* gets inc'd on weight tree changes to expire the cached hweights */
u64 hweight_gen = 1;

static u64 div_round_up(u64 dividend, u64 divisor)
{
	return (dividend + divisor - 1) / divisor;
}
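
/*
 * Illustration only, not used by the scheduler: the flattened share described
 * in the header comment is just the product of the weight/child_weight_sum
 * ratios along the path from the root, expressed in FCG_HWEIGHT_ONE fixed
 * point. The sketch below assumes the two-level path of B from the example
 * (B weight 100 under A, A weight 100 competing with D's 200) and mirrors the
 * rounding used in cgrp_refresh_hweight() below.
 */
static inline u64 fcg_example_flattened_hweight(void)
{
	/* A competes with D at the root: 100 / (100 + 200) */
	u64 hw_a = div_round_up(FCG_HWEIGHT_ONE * 100, 100 + 200);
	/* B competes with C under A: compound A's hweight with 100 / (100 + 100) */
	u64 hw_b = div_round_up(hw_a * 100, 100 + 100);

	return hw_b;	/* roughly FCG_HWEIGHT_ONE / 6 */
}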

static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	struct cgv_node *cgc_a, *cgc_b;

	cgc_a = container_of(a, struct cgv_node, rb_node);
	cgc_b = container_of(b, struct cgv_node, rb_node);

	return cgc_a->cvtime < cgc_b->cvtime;
}

static struct fcg_cpu_ctx *find_cpu_ctx(void)
{
	struct fcg_cpu_ctx *cpuc;
	u32 idx = 0;

	cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx);
	if (!cpuc) {
		scx_bpf_error("cpu_ctx lookup failed");
		return NULL;
	}
	return cpuc;
}

static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp)
{
	struct fcg_cgrp_ctx *cgc;

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
	if (!cgc) {
		scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id);
		return NULL;
	}
	return cgc;
}

static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level)
{
	struct fcg_cgrp_ctx *cgc;

	cgrp = bpf_cgroup_ancestor(cgrp, level);
	if (!cgrp) {
		scx_bpf_error("ancestor cgroup lookup failed");
		return NULL;
	}

	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		scx_bpf_error("ancestor cgrp_ctx lookup failed");
	bpf_cgroup_release(cgrp);
	return cgc;
}

static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
{
	int level;

	if (!cgc->nr_active) {
		stat_inc(FCG_STAT_HWT_SKIP);
		return;
	}

	if (cgc->hweight_gen == hweight_gen) {
		stat_inc(FCG_STAT_HWT_CACHE);
		return;
	}

	stat_inc(FCG_STAT_HWT_UPDATES);
	bpf_for(level, 0, cgrp->level + 1) {
		struct fcg_cgrp_ctx *cgc;
		bool is_active;

		cgc = find_ancestor_cgrp_ctx(cgrp, level);
		if (!cgc)
			break;

		if (!level) {
			cgc->hweight = FCG_HWEIGHT_ONE;
			cgc->hweight_gen = hweight_gen;
		} else {
			struct fcg_cgrp_ctx *pcgc;

			pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
			if (!pcgc)
				break;

			/*
			 * We can be opportunistic here and not grab the
			 * cgv_tree_lock and deal with the occasional races.
			 * However, hweight updates are already cached and
			 * relatively low-frequency. Let's just do the
			 * straightforward thing.
			 */
			bpf_spin_lock(&cgv_tree_lock);
			is_active = cgc->nr_active;
			if (is_active) {
				cgc->hweight_gen = pcgc->hweight_gen;
				cgc->hweight =
					div_round_up(pcgc->hweight * cgc->weight,
						     pcgc->child_weight_sum);
			}
			bpf_spin_unlock(&cgv_tree_lock);

			if (!is_active) {
				stat_inc(FCG_STAT_HWT_RACE);
				break;
			}
		}
	}
}

static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc)
{
	u64 delta, cvtime, max_budget;

	/*
	 * A node which is on the rbtree can't be pointed to from elsewhere yet
	 * and thus can't be updated and repositioned. Instead, we collect the
	 * vtime deltas separately and apply it asynchronously here.
	 */
	delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta);
	cvtime = cgv_node->cvtime + delta;

	/*
	 * Allow a cgroup to carry the maximum budget proportional to its
	 * hweight such that a full-hweight cgroup can immediately take up half
	 * of the CPUs at the most while staying at the front of the rbtree.
	 */
	max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) /
		(2 * FCG_HWEIGHT_ONE);
	if (time_before(cvtime, cvtime_now - max_budget))
		cvtime = cvtime_now - max_budget;

	cgv_node->cvtime = cvtime;
}
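
/*
 * Worked example for the budget cap above, with illustrative numbers not taken
 * from the source: with 8 CPUs, a 50us cgrp_slice_ns and a cgroup at half
 * hweight (FCG_HWEIGHT_ONE / 2), max_budget = 50us * 8 * (1/2) / 2 = 100us,
 * i.e. the cgroup's cvtime may run ahead of (lag behind) cvtime_now by at most
 * the equivalent of two full cgroup slices.
 */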

static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
{
	struct cgv_node_stash *stash;
	struct cgv_node *cgv_node;
	u64 cgid = cgrp->kn->id;

	/* paired with cmpxchg in try_pick_next_cgroup() */
	if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) {
		stat_inc(FCG_STAT_ENQ_SKIP);
		return;
	}

	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
	if (!stash) {
		scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid);
		return;
	}

	/* NULL if the node is already on the rbtree */
	cgv_node = bpf_kptr_xchg(&stash->node, NULL);
	if (!cgv_node) {
		stat_inc(FCG_STAT_ENQ_RACE);
		return;
	}

	bpf_spin_lock(&cgv_tree_lock);
	cgrp_cap_budget(cgv_node, cgc);
	bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
	bpf_spin_unlock(&cgv_tree_lock);
}

static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
{
	/*
	 * Tell fcg_stopping() that this bypassed the regular scheduling path
	 * and should be force charged to the cgroup. 0 is used to indicate that
	 * the task isn't bypassing, so if the current runtime is 0, go back by
	 * one nanosecond.
	 */
	taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
}

s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	struct fcg_task_ctx *taskc;
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc) {
		scx_bpf_error("task_ctx lookup failed");
		return cpu;
	}

	/*
	 * If select_cpu_dfl() is recommending local enqueue, the target CPU is
	 * idle. Follow it and charge the cgroup later in fcg_stopping() after
	 * the fact.
	 */
	if (is_idle) {
		set_bypassed_at(p, taskc);
		stat_inc(FCG_STAT_LOCAL);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	}

	return cpu;
}

void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct fcg_task_ctx *taskc;
	struct cgroup *cgrp;
	struct fcg_cgrp_ctx *cgc;

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/*
	 * Use the direct dispatching and force charging to deal with tasks with
	 * custom affinities so that we don't have to worry about per-cgroup
	 * dq's containing tasks that can't be executed from some CPUs.
	 */
	if (p->nr_cpus_allowed != nr_cpus) {
		set_bypassed_at(p, taskc);

		/*
		 * The global dq is deprioritized as we don't want to let tasks
		 * boost themselves by constraining their cpumask. The
		 * deprioritization is rather severe, so let's not apply that to
		 * per-cpu kernel threads. This is ham-fisted. We probably wanna
		 * implement per-cgroup fallback dq's instead so that we have
		 * more control over when tasks with custom cpumask get issued.
		 */
		if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
			stat_inc(FCG_STAT_LOCAL);
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
					   enq_flags);
		} else {
			stat_inc(FCG_STAT_GLOBAL);
			scx_bpf_dsq_insert(p, FALLBACK_DSQ, SCX_SLICE_DFL,
					   enq_flags);
		}
		return;
	}

	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		goto out_release;

	if (fifo_sched) {
		scx_bpf_dsq_insert(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags);
	} else {
		u64 tvtime = p->scx.dsq_vtime;

		/*
		 * Limit the amount of budget that an idling task can accumulate
		 * to one slice.
		 */
		if (time_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL))
			tvtime = cgc->tvtime_now - SCX_SLICE_DFL;

		scx_bpf_dsq_insert_vtime(p, cgrp->kn->id, SCX_SLICE_DFL,
					 tvtime, enq_flags);
	}

	cgrp_enqueued(cgrp, cgc);
out_release:
	bpf_cgroup_release(cgrp);
}

/*
 * Walk the cgroup tree to update the active weight sums as tasks wake up and
 * sleep. The weight sums are used as the base when calculating the proportion a
 * given cgroup or task is entitled to at each level.
 */
static void update_active_weight_sums(struct cgroup *cgrp, bool runnable)
{
	struct fcg_cgrp_ctx *cgc;
	bool updated = false;
	int idx;

	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		return;

	/*
	 * In most cases, a hot cgroup would have multiple threads going to
	 * sleep and waking up while the whole cgroup stays active. In leaf
	 * cgroups, ->nr_runnable which is updated with __sync operations gates
	 * ->nr_active updates, so that we don't have to grab the cgv_tree_lock
	 * repeatedly for a busy cgroup which is staying active.
	 */
	if (runnable) {
		if (__sync_fetch_and_add(&cgc->nr_runnable, 1))
			return;
		stat_inc(FCG_STAT_ACT);
	} else {
		if (__sync_sub_and_fetch(&cgc->nr_runnable, 1))
			return;
		stat_inc(FCG_STAT_DEACT);
	}

	/*
	 * If @cgrp is becoming runnable, its hweight should be refreshed after
	 * it's added to the weight tree so that enqueue has the up-to-date
	 * value. If @cgrp is becoming quiescent, the hweight should be
	 * refreshed before it's removed from the weight tree so that the usage
	 * charging which happens afterwards has access to the latest value.
	 */
	if (!runnable)
		cgrp_refresh_hweight(cgrp, cgc);

	/* propagate upwards */
	bpf_for(idx, 0, cgrp->level) {
		int level = cgrp->level - idx;
		struct fcg_cgrp_ctx *cgc, *pcgc = NULL;
		bool propagate = false;

		cgc = find_ancestor_cgrp_ctx(cgrp, level);
		if (!cgc)
			break;
		if (level) {
			pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
			if (!pcgc)
				break;
		}

		/*
		 * We need the propagation protected by a lock to synchronize
		 * against weight changes. There's no reason to drop the lock at
		 * each level but bpf_spin_lock() doesn't want any function
		 * calls while locked.
		 */
		bpf_spin_lock(&cgv_tree_lock);

		if (runnable) {
			if (!cgc->nr_active++) {
				updated = true;
				if (pcgc) {
					propagate = true;
					pcgc->child_weight_sum += cgc->weight;
				}
			}
		} else {
			if (!--cgc->nr_active) {
				updated = true;
				if (pcgc) {
					propagate = true;
					pcgc->child_weight_sum -= cgc->weight;
				}
			}
		}

		bpf_spin_unlock(&cgv_tree_lock);

		if (!propagate)
			break;
	}

	if (updated)
		__sync_fetch_and_add(&hweight_gen, 1);

	if (runnable)
		cgrp_refresh_hweight(cgrp, cgc);
}
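
/*
 * Illustration of the propagation above using the hierarchy from the header
 * comment: when the first task in B becomes runnable, B's nr_runnable goes
 * 0 -> 1, so B's nr_active is bumped and B's weight (100) is added to A's
 * child_weight_sum. If that also took A's nr_active from 0 to 1, A's weight
 * (100) is in turn added to the root's child_weight_sum alongside D's 200.
 * Any such change bumps hweight_gen, which lazily invalidates every cached
 * hweight the next time cgrp_refresh_hweight() runs.
 */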

void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
{
	struct cgroup *cgrp;

	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
	update_active_weight_sums(cgrp, true);
	bpf_cgroup_release(cgrp);
}

void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
{
	struct cgroup *cgrp;
	struct fcg_cgrp_ctx *cgc;

	if (fifo_sched)
		return;

	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (cgc) {
		/*
		 * @cgc->tvtime_now always progresses forward as tasks start
		 * executing. The test and update can be performed concurrently
		 * from multiple CPUs and are thus racy. Any error should be
		 * contained and temporary. Let's just live with it.
		 */
		if (time_before(cgc->tvtime_now, p->scx.dsq_vtime))
			cgc->tvtime_now = p->scx.dsq_vtime;
	}
	bpf_cgroup_release(cgrp);
}

void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
{
	struct fcg_task_ctx *taskc;
	struct cgroup *cgrp;
	struct fcg_cgrp_ctx *cgc;

	/*
	 * Scale the execution time by the inverse of the weight and charge.
	 *
	 * Note that the default yield implementation yields by setting
	 * @p->scx.slice to zero and the following would treat the yielding task
	 * as if it has consumed all its slice. If this penalizes yielding tasks
	 * too much, determine the execution time by taking explicit timestamps
	 * instead of depending on @p->scx.slice.
	 */
	if (!fifo_sched)
		p->scx.dsq_vtime +=
			(SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	if (!taskc->bypassed_at)
		return;

	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (cgc) {
		__sync_fetch_and_add(&cgc->cvtime_delta,
				     p->se.sum_exec_runtime - taskc->bypassed_at);
		taskc->bypassed_at = 0;
	}
	bpf_cgroup_release(cgrp);
}
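
/*
 * Worked example for the vtime scaling in fcg_stopping(), with illustrative
 * numbers: a task with weight 200 that consumed its full SCX_SLICE_DFL has its
 * dsq_vtime advanced by SCX_SLICE_DFL * 100 / 200, i.e. half of the wall time,
 * while a weight-50 task consuming the same amount is charged double. Higher
 * weight therefore means slower vtime progression and earlier placement under
 * the vtime ordering used in fcg_enqueue().
 */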

void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
{
	struct cgroup *cgrp;

	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
	update_active_weight_sums(cgrp, false);
	bpf_cgroup_release(cgrp);
}

void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	struct fcg_cgrp_ctx *cgc, *pcgc = NULL;

	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		return;

	if (cgrp->level) {
		pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1);
		if (!pcgc)
			return;
	}

	bpf_spin_lock(&cgv_tree_lock);
	if (pcgc && cgc->nr_active)
		pcgc->child_weight_sum += (s64)weight - cgc->weight;
	cgc->weight = weight;
	bpf_spin_unlock(&cgv_tree_lock);
}

static bool try_pick_next_cgroup(u64 *cgidp)
{
	struct bpf_rb_node *rb_node;
	struct cgv_node_stash *stash;
	struct cgv_node *cgv_node;
	struct fcg_cgrp_ctx *cgc;
	struct cgroup *cgrp;
	u64 cgid;

	/* pop the front cgroup and wind cvtime_now accordingly */
	bpf_spin_lock(&cgv_tree_lock);

	rb_node = bpf_rbtree_first(&cgv_tree);
	if (!rb_node) {
		bpf_spin_unlock(&cgv_tree_lock);
		stat_inc(FCG_STAT_PNC_NO_CGRP);
		*cgidp = 0;
		return true;
	}

	rb_node = bpf_rbtree_remove(&cgv_tree, rb_node);
	bpf_spin_unlock(&cgv_tree_lock);

	if (!rb_node) {
		/*
		 * This should never happen. bpf_rbtree_first() was called
		 * above while the tree lock was held, so the node should
		 * always be present.
		 */
		scx_bpf_error("node could not be removed");
		return true;
	}

	cgv_node = container_of(rb_node, struct cgv_node, rb_node);
	cgid = cgv_node->cgid;

	if (time_before(cvtime_now, cgv_node->cvtime))
		cvtime_now = cgv_node->cvtime;

	/*
	 * If lookup fails, the cgroup's gone. Free and move on. See
	 * fcg_cgroup_exit().
	 */
	cgrp = bpf_cgroup_from_id(cgid);
	if (!cgrp) {
		stat_inc(FCG_STAT_PNC_GONE);
		goto out_free;
	}

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
	if (!cgc) {
		bpf_cgroup_release(cgrp);
		stat_inc(FCG_STAT_PNC_GONE);
		goto out_free;
	}

	if (!scx_bpf_dsq_move_to_local(cgid)) {
		bpf_cgroup_release(cgrp);
		stat_inc(FCG_STAT_PNC_EMPTY);
		goto out_stash;
	}

	/*
	 * Successfully consumed from the cgroup. This will be our current
	 * cgroup for the new slice. Refresh its hweight.
	 */
	cgrp_refresh_hweight(cgrp, cgc);

	bpf_cgroup_release(cgrp);

	/*
	 * As the cgroup may have more tasks, add it back to the rbtree. Note
	 * that here we charge the full slice upfront and then exact later
	 * according to the actual consumption. This prevents lowpri thundering
	 * herd from saturating the machine.
	 */
	bpf_spin_lock(&cgv_tree_lock);
	cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1);
	cgrp_cap_budget(cgv_node, cgc);
	bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
	bpf_spin_unlock(&cgv_tree_lock);

	*cgidp = cgid;
	stat_inc(FCG_STAT_PNC_NEXT);
	return true;

out_stash:
	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
	if (!stash) {
		stat_inc(FCG_STAT_PNC_GONE);
		goto out_free;
	}

	/*
	 * Paired with cmpxchg in cgrp_enqueued(). If they see the following
	 * transition, they'll enqueue the cgroup. If they are earlier, we'll
	 * see their task in the dq below and requeue the cgroup.
	 */
	__sync_val_compare_and_swap(&cgc->queued, 1, 0);

	if (scx_bpf_dsq_nr_queued(cgid)) {
		bpf_spin_lock(&cgv_tree_lock);
		bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
		bpf_spin_unlock(&cgv_tree_lock);
		stat_inc(FCG_STAT_PNC_RACE);
	} else {
		cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
		if (cgv_node) {
			scx_bpf_error("unexpected !NULL cgv_node stash");
			goto out_free;
		}
	}

	return false;

out_free:
	bpf_obj_drop(cgv_node);
	return false;
}
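
/*
 * Worked example for the upfront charge above, with illustrative numbers: a
 * cgroup at half hweight picked for a 50us cgrp_slice_ns has its cvtime
 * advanced by 50us * FCG_HWEIGHT_ONE / (FCG_HWEIGHT_ONE / 2) = 100us right
 * away. If it then runs for only 20us before fcg_dispatch() moves on, the
 * unused 30us is scaled the same way and refunded through cvtime_delta, so the
 * net charge tracks actual consumption while the upfront charge keeps a
 * thundering herd of low-hweight cgroups from being picked back-to-back.
 */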

void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
{
	struct fcg_cpu_ctx *cpuc;
	struct fcg_cgrp_ctx *cgc;
	struct cgroup *cgrp;
	u64 now = scx_bpf_now();
	bool picked_next = false;

	cpuc = find_cpu_ctx();
	if (!cpuc)
		return;

	if (!cpuc->cur_cgid)
		goto pick_next_cgroup;

	if (time_before(now, cpuc->cur_at + cgrp_slice_ns)) {
		if (scx_bpf_dsq_move_to_local(cpuc->cur_cgid)) {
			stat_inc(FCG_STAT_CNS_KEEP);
			return;
		}
		stat_inc(FCG_STAT_CNS_EMPTY);
	} else {
		stat_inc(FCG_STAT_CNS_EXPIRE);
	}

	/*
	 * The current cgroup is expiring. It was already charged a full slice.
	 * Calculate the actual usage and accumulate the delta.
	 */
	cgrp = bpf_cgroup_from_id(cpuc->cur_cgid);
	if (!cgrp) {
		stat_inc(FCG_STAT_CNS_GONE);
		goto pick_next_cgroup;
	}

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
	if (cgc) {
		/*
		 * We want to update the vtime delta and then look for the next
		 * cgroup to execute but the latter needs to be done in a loop
		 * and we can't keep the lock held. Oh well...
		 */
		bpf_spin_lock(&cgv_tree_lock);
		__sync_fetch_and_add(&cgc->cvtime_delta,
				     (cpuc->cur_at + cgrp_slice_ns - now) *
				     FCG_HWEIGHT_ONE / (cgc->hweight ?: 1));
		bpf_spin_unlock(&cgv_tree_lock);
	} else {
		stat_inc(FCG_STAT_CNS_GONE);
	}

	bpf_cgroup_release(cgrp);

pick_next_cgroup:
	cpuc->cur_at = now;

	if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ)) {
		cpuc->cur_cgid = 0;
		return;
	}

	bpf_repeat(CGROUP_MAX_RETRIES) {
		if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
			picked_next = true;
			break;
		}
	}

	/*
	 * This only happens if try_pick_next_cgroup() races against the enqueue
	 * path for more than CGROUP_MAX_RETRIES times, which is extremely
	 * unlikely and likely indicates an underlying bug. There shouldn't be
	 * any stall risk as the race is against enqueue.
	 */
	if (!picked_next)
		stat_inc(FCG_STAT_PNC_FAIL);
}

s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	struct fcg_task_ctx *taskc;
	struct fcg_cgrp_ctx *cgc;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	taskc = bpf_task_storage_get(&task_ctx, p, 0,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!taskc)
		return -ENOMEM;

	taskc->bypassed_at = 0;

	if (!(cgc = find_cgrp_ctx(args->cgroup)))
		return -ENOENT;

	p->scx.dsq_vtime = cgc->tvtime_now;

	return 0;
}

int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp,
			     struct scx_cgroup_init_args *args)
{
	struct fcg_cgrp_ctx *cgc;
	struct cgv_node *cgv_node;
	struct cgv_node_stash empty_stash = {}, *stash;
	u64 cgid = cgrp->kn->id;
	int ret;

	/*
	 * Technically incorrect as cgroup ID is full 64bit while dsq ID is
	 * 63bit. Should not be a problem in practice and easy to spot in the
	 * unlikely case that it breaks.
	 */
	ret = scx_bpf_create_dsq(cgid, -1);
	if (ret)
		return ret;

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0,
				   BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!cgc) {
		ret = -ENOMEM;
		goto err_destroy_dsq;
	}

	cgc->weight = args->weight;
	cgc->hweight = FCG_HWEIGHT_ONE;

	ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash,
				  BPF_NOEXIST);
	if (ret) {
		if (ret != -ENOMEM)
			scx_bpf_error("unexpected stash creation error (%d)",
				      ret);
		goto err_destroy_dsq;
	}

	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
	if (!stash) {
		scx_bpf_error("unexpected cgv_node stash lookup failure");
		ret = -ENOENT;
		goto err_destroy_dsq;
	}

	cgv_node = bpf_obj_new(struct cgv_node);
	if (!cgv_node) {
		ret = -ENOMEM;
		goto err_del_cgv_node;
	}

	cgv_node->cgid = cgid;
	cgv_node->cvtime = cvtime_now;

	cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
	if (cgv_node) {
		scx_bpf_error("unexpected !NULL cgv_node stash");
		ret = -EBUSY;
		goto err_drop;
	}

	return 0;

err_drop:
	bpf_obj_drop(cgv_node);
err_del_cgv_node:
	bpf_map_delete_elem(&cgv_node_stash, &cgid);
err_destroy_dsq:
	scx_bpf_destroy_dsq(cgid);
	return ret;
}

void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp)
{
	u64 cgid = cgrp->kn->id;

	/*
	 * For now, there's no way to find and remove the cgv_node if it's on
	 * the cgv_tree. Let's drain them in the dispatch path as they get
	 * popped off the front of the tree.
	 */
	bpf_map_delete_elem(&cgv_node_stash, &cgid);
	scx_bpf_destroy_dsq(cgid);
}

void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
		    struct cgroup *from, struct cgroup *to)
{
	struct fcg_cgrp_ctx *from_cgc, *to_cgc;
	s64 delta;

	/* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */
	if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to)))
		return;

	delta = time_delta(p->scx.dsq_vtime, from_cgc->tvtime_now);
	p->scx.dsq_vtime = to_cgc->tvtime_now + delta;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init)
{
	return scx_bpf_create_dsq(FALLBACK_DSQ, -1);
}

void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(flatcg_ops,
	       .select_cpu = (void *)fcg_select_cpu,
	       .enqueue = (void *)fcg_enqueue,
	       .dispatch = (void *)fcg_dispatch,
	       .runnable = (void *)fcg_runnable,
	       .running = (void *)fcg_running,
	       .stopping = (void *)fcg_stopping,
	       .quiescent = (void *)fcg_quiescent,
	       .init_task = (void *)fcg_init_task,
	       .cgroup_set_weight = (void *)fcg_cgroup_set_weight,
	       .cgroup_init = (void *)fcg_cgroup_init,
	       .cgroup_exit = (void *)fcg_cgroup_exit,
	       .cgroup_move = (void *)fcg_cgroup_move,
	       .init = (void *)fcg_init,
	       .exit = (void *)fcg_exit,
	       .flags = SCX_OPS_ENQ_EXITING,
	       .name = "flatcg");