| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | |
| 3 | /* Copyright (c) 2019 Facebook */ |
| 4 | |
| 5 | #include <assert.h> |
| 6 | #include <limits.h> |
| 7 | #include <unistd.h> |
| 8 | #include <sys/file.h> |
| 9 | #include <sys/resource.h> |
| 10 | #include <sys/time.h> |
| 11 | #include <linux/err.h> |
| 12 | #include <linux/list.h> |
| 13 | #include <linux/zalloc.h> |
| 14 | #include <api/fs/fs.h> |
| 15 | #include <bpf/bpf.h> |
| 16 | #include <bpf/btf.h> |
| 17 | #include <perf/bpf_perf.h> |
| 18 | |
| 19 | #include "bpf_counter.h" |
| 20 | #include "bpf-utils.h" |
| 21 | #include "counts.h" |
| 22 | #include "debug.h" |
| 23 | #include "evsel.h" |
| 24 | #include "evlist.h" |
| 25 | #include "target.h" |
| 26 | #include "cgroup.h" |
| 27 | #include "cpumap.h" |
| 28 | #include "thread_map.h" |
| 29 | |
| 30 | #include "bpf_skel/bpf_prog_profiler.skel.h" |
| 31 | #include "bpf_skel/bperf_u.h" |
| 32 | #include "bpf_skel/bperf_leader.skel.h" |
| 33 | #include "bpf_skel/bperf_follower.skel.h" |
| 34 | |
| 35 | struct bpf_counter { |
| 36 | void *skel; |
| 37 | struct list_head list; |
| 38 | }; |
| 39 | |
/* max_entries used when bperf_lock_attr_map() creates the attr hash map */
#define ATTR_MAP_SIZE 16
| 41 | |
| 42 | static void *u64_to_ptr(__u64 ptr) |
| 43 | { |
| 44 | return (void *)(unsigned long)ptr; |
| 45 | } |
| 46 | |
| 47 | |
/*
 * Request an unlimited RLIMIT_MEMLOCK (both soft and hard limit).
 * The setrlimit() result is intentionally not inspected.
 */
void set_max_rlimit(void)
{
	struct rlimit unlimited = {
		.rlim_cur = RLIM_INFINITY,
		.rlim_max = RLIM_INFINITY,
	};

	setrlimit(RLIMIT_MEMLOCK, &unlimited);
}
| 54 | |
| 55 | static __u32 bpf_link_get_id(int fd) |
| 56 | { |
| 57 | struct bpf_link_info link_info = { .id = 0, }; |
| 58 | __u32 link_info_len = sizeof(link_info); |
| 59 | |
| 60 | bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); |
| 61 | return link_info.id; |
| 62 | } |
| 63 | |
| 64 | static __u32 bpf_link_get_prog_id(int fd) |
| 65 | { |
| 66 | struct bpf_link_info link_info = { .id = 0, }; |
| 67 | __u32 link_info_len = sizeof(link_info); |
| 68 | |
| 69 | bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); |
| 70 | return link_info.prog_id; |
| 71 | } |
| 72 | |
| 73 | static __u32 bpf_map_get_id(int fd) |
| 74 | { |
| 75 | struct bpf_map_info map_info = { .id = 0, }; |
| 76 | __u32 map_info_len = sizeof(map_info); |
| 77 | |
| 78 | bpf_obj_get_info_by_fd(fd, &map_info, &map_info_len); |
| 79 | return map_info.id; |
| 80 | } |
| 81 | |
| 82 | /* trigger the leader program on a cpu */ |
| 83 | int bperf_trigger_reading(int prog_fd, int cpu) |
| 84 | { |
| 85 | DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, |
| 86 | .ctx_in = NULL, |
| 87 | .ctx_size_in = 0, |
| 88 | .flags = BPF_F_TEST_RUN_ON_CPU, |
| 89 | .cpu = cpu, |
| 90 | .retval = 0, |
| 91 | ); |
| 92 | |
| 93 | return bpf_prog_test_run_opts(prog_fd, &opts); |
| 94 | } |
| 95 | |
| 96 | static struct bpf_counter *bpf_counter_alloc(void) |
| 97 | { |
| 98 | struct bpf_counter *counter; |
| 99 | |
| 100 | counter = zalloc(sizeof(*counter)); |
| 101 | if (counter) |
| 102 | INIT_LIST_HEAD(list: &counter->list); |
| 103 | return counter; |
| 104 | } |
| 105 | |
| 106 | static int bpf_program_profiler__destroy(struct evsel *evsel) |
| 107 | { |
| 108 | struct bpf_counter *counter, *tmp; |
| 109 | |
| 110 | list_for_each_entry_safe(counter, tmp, |
| 111 | &evsel->bpf_counter_list, list) { |
| 112 | list_del_init(entry: &counter->list); |
| 113 | bpf_prog_profiler_bpf__destroy(counter->skel); |
| 114 | free(counter); |
| 115 | } |
| 116 | assert(list_empty(head: &evsel->bpf_counter_list)); |
| 117 | |
| 118 | return 0; |
| 119 | } |
| 120 | |
| 121 | static char *bpf_target_prog_name(int tgt_fd) |
| 122 | { |
| 123 | struct bpf_func_info *func_info; |
| 124 | struct perf_bpil *info_linear; |
| 125 | const struct btf_type *t; |
| 126 | struct btf *btf = NULL; |
| 127 | char *name = NULL; |
| 128 | |
| 129 | info_linear = get_bpf_prog_info_linear(tgt_fd, 1UL << PERF_BPIL_FUNC_INFO); |
| 130 | if (IS_ERR_OR_NULL(ptr: info_linear)) { |
| 131 | pr_debug("failed to get info_linear for prog FD %d\n" , tgt_fd); |
| 132 | return NULL; |
| 133 | } |
| 134 | |
| 135 | if (info_linear->info.btf_id == 0) { |
| 136 | pr_debug("prog FD %d doesn't have valid btf\n" , tgt_fd); |
| 137 | goto out; |
| 138 | } |
| 139 | |
| 140 | btf = btf__load_from_kernel_by_id(info_linear->info.btf_id); |
| 141 | if (libbpf_get_error(btf)) { |
| 142 | pr_debug("failed to load btf for prog FD %d\n" , tgt_fd); |
| 143 | goto out; |
| 144 | } |
| 145 | |
| 146 | func_info = u64_to_ptr(ptr: info_linear->info.func_info); |
| 147 | t = btf__type_by_id(btf, func_info[0].type_id); |
| 148 | if (!t) { |
| 149 | pr_debug("btf %d doesn't have type %d\n" , |
| 150 | info_linear->info.btf_id, func_info[0].type_id); |
| 151 | goto out; |
| 152 | } |
| 153 | name = strdup(btf__name_by_offset(btf, t->name_off)); |
| 154 | out: |
| 155 | btf__free(btf); |
| 156 | free(info_linear); |
| 157 | return name; |
| 158 | } |
| 159 | |
| 160 | static int bpf_program_profiler_load_one(struct evsel *evsel, u32 prog_id) |
| 161 | { |
| 162 | struct bpf_prog_profiler_bpf *skel; |
| 163 | struct bpf_counter *counter; |
| 164 | struct bpf_program *prog; |
| 165 | char *prog_name = NULL; |
| 166 | int prog_fd; |
| 167 | int err; |
| 168 | |
| 169 | prog_fd = bpf_prog_get_fd_by_id(prog_id); |
| 170 | if (prog_fd < 0) { |
| 171 | pr_err("Failed to open fd for bpf prog %u\n" , prog_id); |
| 172 | return -1; |
| 173 | } |
| 174 | counter = bpf_counter_alloc(); |
| 175 | if (!counter) { |
| 176 | close(prog_fd); |
| 177 | return -1; |
| 178 | } |
| 179 | |
| 180 | skel = bpf_prog_profiler_bpf__open(); |
| 181 | if (!skel) { |
| 182 | pr_err("Failed to open bpf skeleton\n" ); |
| 183 | goto err_out; |
| 184 | } |
| 185 | |
| 186 | skel->rodata->num_cpu = evsel__nr_cpus(evsel); |
| 187 | |
| 188 | bpf_map__set_max_entries(skel->maps.events, evsel__nr_cpus(evsel)); |
| 189 | bpf_map__set_max_entries(skel->maps.fentry_readings, 1); |
| 190 | bpf_map__set_max_entries(skel->maps.accum_readings, 1); |
| 191 | |
| 192 | prog_name = bpf_target_prog_name(tgt_fd: prog_fd); |
| 193 | if (!prog_name) { |
| 194 | pr_err("Failed to get program name for bpf prog %u. Does it have BTF?\n" , prog_id); |
| 195 | goto err_out; |
| 196 | } |
| 197 | |
| 198 | bpf_object__for_each_program(prog, skel->obj) { |
| 199 | err = bpf_program__set_attach_target(prog, prog_fd, prog_name); |
| 200 | if (err) { |
| 201 | pr_err("bpf_program__set_attach_target failed.\n" |
| 202 | "Does bpf prog %u have BTF?\n" , prog_id); |
| 203 | goto err_out; |
| 204 | } |
| 205 | } |
| 206 | set_max_rlimit(); |
| 207 | err = bpf_prog_profiler_bpf__load(skel); |
| 208 | if (err) { |
| 209 | pr_err("bpf_prog_profiler_bpf__load failed\n" ); |
| 210 | goto err_out; |
| 211 | } |
| 212 | |
| 213 | assert(skel != NULL); |
| 214 | counter->skel = skel; |
| 215 | list_add(new: &counter->list, head: &evsel->bpf_counter_list); |
| 216 | free(prog_name); |
| 217 | close(prog_fd); |
| 218 | return 0; |
| 219 | err_out: |
| 220 | bpf_prog_profiler_bpf__destroy(skel); |
| 221 | free(prog_name); |
| 222 | free(counter); |
| 223 | close(prog_fd); |
| 224 | return -1; |
| 225 | } |
| 226 | |
| 227 | static int bpf_program_profiler__load(struct evsel *evsel, struct target *target) |
| 228 | { |
| 229 | char *bpf_str, *bpf_str_, *tok, *saveptr = NULL, *p; |
| 230 | u32 prog_id; |
| 231 | int ret; |
| 232 | |
| 233 | bpf_str_ = bpf_str = strdup(target->bpf_str); |
| 234 | if (!bpf_str) |
| 235 | return -1; |
| 236 | |
| 237 | while ((tok = strtok_r(bpf_str, "," , &saveptr)) != NULL) { |
| 238 | prog_id = strtoul(tok, &p, 10); |
| 239 | if (prog_id == 0 || prog_id == UINT_MAX || |
| 240 | (*p != '\0' && *p != ',')) { |
| 241 | pr_err("Failed to parse bpf prog ids %s\n" , |
| 242 | target->bpf_str); |
| 243 | free(bpf_str_); |
| 244 | return -1; |
| 245 | } |
| 246 | |
| 247 | ret = bpf_program_profiler_load_one(evsel, prog_id); |
| 248 | if (ret) { |
| 249 | bpf_program_profiler__destroy(evsel); |
| 250 | free(bpf_str_); |
| 251 | return -1; |
| 252 | } |
| 253 | bpf_str = NULL; |
| 254 | } |
| 255 | free(bpf_str_); |
| 256 | return 0; |
| 257 | } |
| 258 | |
| 259 | static int bpf_program_profiler__enable(struct evsel *evsel) |
| 260 | { |
| 261 | struct bpf_counter *counter; |
| 262 | int ret; |
| 263 | |
| 264 | list_for_each_entry(counter, &evsel->bpf_counter_list, list) { |
| 265 | assert(counter->skel != NULL); |
| 266 | ret = bpf_prog_profiler_bpf__attach(counter->skel); |
| 267 | if (ret) { |
| 268 | bpf_program_profiler__destroy(evsel); |
| 269 | return ret; |
| 270 | } |
| 271 | } |
| 272 | return 0; |
| 273 | } |
| 274 | |
| 275 | static int bpf_program_profiler__disable(struct evsel *evsel) |
| 276 | { |
| 277 | struct bpf_counter *counter; |
| 278 | |
| 279 | list_for_each_entry(counter, &evsel->bpf_counter_list, list) { |
| 280 | assert(counter->skel != NULL); |
| 281 | bpf_prog_profiler_bpf__detach(counter->skel); |
| 282 | } |
| 283 | return 0; |
| 284 | } |
| 285 | |
| 286 | static int bpf_program_profiler__read(struct evsel *evsel) |
| 287 | { |
| 288 | // BPF_MAP_TYPE_PERCPU_ARRAY uses /sys/devices/system/cpu/possible |
| 289 | // Sometimes possible > online, like on a Ryzen 3900X that has 24 |
| 290 | // threads but its possible showed 0-31 -acme |
| 291 | int num_cpu_bpf = libbpf_num_possible_cpus(); |
| 292 | struct bpf_perf_event_value values[num_cpu_bpf]; |
| 293 | struct bpf_counter *counter; |
| 294 | struct perf_counts_values *counts; |
| 295 | int reading_map_fd; |
| 296 | __u32 key = 0; |
| 297 | int err, idx, bpf_cpu; |
| 298 | |
| 299 | if (list_empty(head: &evsel->bpf_counter_list)) |
| 300 | return -EAGAIN; |
| 301 | |
| 302 | perf_cpu_map__for_each_idx(idx, evsel__cpus(evsel)) { |
| 303 | counts = perf_counts(counts: evsel->counts, cpu_map_idx: idx, thread: 0); |
| 304 | counts->val = 0; |
| 305 | counts->ena = 0; |
| 306 | counts->run = 0; |
| 307 | } |
| 308 | list_for_each_entry(counter, &evsel->bpf_counter_list, list) { |
| 309 | struct bpf_prog_profiler_bpf *skel = counter->skel; |
| 310 | |
| 311 | assert(skel != NULL); |
| 312 | reading_map_fd = bpf_map__fd(skel->maps.accum_readings); |
| 313 | |
| 314 | err = bpf_map_lookup_elem(reading_map_fd, &key, values); |
| 315 | if (err) { |
| 316 | pr_err("failed to read value\n" ); |
| 317 | return err; |
| 318 | } |
| 319 | |
| 320 | for (bpf_cpu = 0; bpf_cpu < num_cpu_bpf; bpf_cpu++) { |
| 321 | idx = perf_cpu_map__idx(evsel__cpus(evsel), |
| 322 | (struct perf_cpu){.cpu = bpf_cpu}); |
| 323 | if (idx == -1) |
| 324 | continue; |
| 325 | counts = perf_counts(counts: evsel->counts, cpu_map_idx: idx, thread: 0); |
| 326 | counts->val += values[bpf_cpu].counter; |
| 327 | counts->ena += values[bpf_cpu].enabled; |
| 328 | counts->run += values[bpf_cpu].running; |
| 329 | } |
| 330 | } |
| 331 | return 0; |
| 332 | } |
| 333 | |
| 334 | static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu_map_idx, |
| 335 | int fd) |
| 336 | { |
| 337 | struct bpf_prog_profiler_bpf *skel; |
| 338 | struct bpf_counter *counter; |
| 339 | int cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx).cpu; |
| 340 | int ret; |
| 341 | |
| 342 | list_for_each_entry(counter, &evsel->bpf_counter_list, list) { |
| 343 | skel = counter->skel; |
| 344 | assert(skel != NULL); |
| 345 | |
| 346 | ret = bpf_map_update_elem(bpf_map__fd(skel->maps.events), |
| 347 | &cpu, &fd, BPF_ANY); |
| 348 | if (ret) |
| 349 | return ret; |
| 350 | } |
| 351 | return 0; |
| 352 | } |
| 353 | |
| 354 | struct bpf_counter_ops bpf_program_profiler_ops = { |
| 355 | .load = bpf_program_profiler__load, |
| 356 | .enable = bpf_program_profiler__enable, |
| 357 | .disable = bpf_program_profiler__disable, |
| 358 | .read = bpf_program_profiler__read, |
| 359 | .destroy = bpf_program_profiler__destroy, |
| 360 | .install_pe = bpf_program_profiler__install_pe, |
| 361 | }; |
| 362 | |
| 363 | static bool bperf_attr_map_compatible(int attr_map_fd) |
| 364 | { |
| 365 | struct bpf_map_info map_info = {0}; |
| 366 | __u32 map_info_len = sizeof(map_info); |
| 367 | int err; |
| 368 | |
| 369 | err = bpf_obj_get_info_by_fd(attr_map_fd, &map_info, &map_info_len); |
| 370 | |
| 371 | if (err) |
| 372 | return false; |
| 373 | return (map_info.key_size == sizeof(struct perf_event_attr)) && |
| 374 | (map_info.value_size == sizeof(struct perf_event_attr_map_entry)); |
| 375 | } |
| 376 | |
| 377 | static int bperf_lock_attr_map(struct target *target) |
| 378 | { |
| 379 | char path[PATH_MAX]; |
| 380 | int map_fd, err; |
| 381 | |
| 382 | if (target->attr_map) { |
| 383 | scnprintf(buf: path, PATH_MAX, fmt: "%s" , target->attr_map); |
| 384 | } else { |
| 385 | scnprintf(path, PATH_MAX, "%s/fs/bpf/%s" , sysfs__mountpoint(), |
| 386 | BPF_PERF_DEFAULT_ATTR_MAP_PATH); |
| 387 | } |
| 388 | |
| 389 | if (access(path, F_OK)) { |
| 390 | map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, |
| 391 | sizeof(struct perf_event_attr), |
| 392 | sizeof(struct perf_event_attr_map_entry), |
| 393 | ATTR_MAP_SIZE, NULL); |
| 394 | if (map_fd < 0) |
| 395 | return -1; |
| 396 | |
| 397 | err = bpf_obj_pin(map_fd, path); |
| 398 | if (err) { |
| 399 | /* someone pinned the map in parallel? */ |
| 400 | close(map_fd); |
| 401 | map_fd = bpf_obj_get(path); |
| 402 | if (map_fd < 0) |
| 403 | return -1; |
| 404 | } |
| 405 | } else { |
| 406 | map_fd = bpf_obj_get(path); |
| 407 | if (map_fd < 0) |
| 408 | return -1; |
| 409 | } |
| 410 | |
| 411 | if (!bperf_attr_map_compatible(attr_map_fd: map_fd)) { |
| 412 | close(map_fd); |
| 413 | return -1; |
| 414 | |
| 415 | } |
| 416 | err = flock(map_fd, LOCK_EX); |
| 417 | if (err) { |
| 418 | close(map_fd); |
| 419 | return -1; |
| 420 | } |
| 421 | return map_fd; |
| 422 | } |
| 423 | |
| 424 | static int bperf_check_target(struct evsel *evsel, |
| 425 | struct target *target, |
| 426 | enum bperf_filter_type *filter_type, |
| 427 | __u32 *filter_entry_cnt) |
| 428 | { |
| 429 | if (evsel->core.leader->nr_members > 1) { |
| 430 | pr_err("bpf managed perf events do not yet support groups.\n" ); |
| 431 | return -1; |
| 432 | } |
| 433 | |
| 434 | /* determine filter type based on target */ |
| 435 | if (target->system_wide) { |
| 436 | *filter_type = BPERF_FILTER_GLOBAL; |
| 437 | *filter_entry_cnt = 1; |
| 438 | } else if (target->cpu_list) { |
| 439 | *filter_type = BPERF_FILTER_CPU; |
| 440 | *filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel)); |
| 441 | } else if (target->tid) { |
| 442 | *filter_type = BPERF_FILTER_PID; |
| 443 | *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads); |
| 444 | } else if (target->pid || evsel->evlist->workload.pid != -1) { |
| 445 | *filter_type = BPERF_FILTER_TGID; |
| 446 | *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads); |
| 447 | } else { |
| 448 | pr_err("bpf managed perf events do not yet support these targets.\n" ); |
| 449 | return -1; |
| 450 | } |
| 451 | |
| 452 | return 0; |
| 453 | } |
| 454 | |
| 455 | static __u32 filter_entry_cnt; |
| 456 | |
| 457 | static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd, |
| 458 | struct perf_event_attr_map_entry *entry) |
| 459 | { |
| 460 | struct bperf_leader_bpf *skel = bperf_leader_bpf__open(); |
| 461 | int link_fd, diff_map_fd, err; |
| 462 | struct bpf_link *link = NULL; |
| 463 | struct perf_thread_map *threads; |
| 464 | |
| 465 | if (!skel) { |
| 466 | pr_err("Failed to open leader skeleton\n" ); |
| 467 | return -1; |
| 468 | } |
| 469 | |
| 470 | bpf_map__set_max_entries(skel->maps.events, libbpf_num_possible_cpus()); |
| 471 | err = bperf_leader_bpf__load(skel); |
| 472 | if (err) { |
| 473 | pr_err("Failed to load leader skeleton\n" ); |
| 474 | goto out; |
| 475 | } |
| 476 | |
| 477 | link = bpf_program__attach(skel->progs.on_switch); |
| 478 | if (IS_ERR(ptr: link)) { |
| 479 | pr_err("Failed to attach leader program\n" ); |
| 480 | err = PTR_ERR(ptr: link); |
| 481 | goto out; |
| 482 | } |
| 483 | |
| 484 | link_fd = bpf_link__fd(link); |
| 485 | diff_map_fd = bpf_map__fd(skel->maps.diff_readings); |
| 486 | entry->link_id = bpf_link_get_id(fd: link_fd); |
| 487 | entry->diff_map_id = bpf_map_get_id(fd: diff_map_fd); |
| 488 | err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY); |
| 489 | assert(err == 0); |
| 490 | |
| 491 | evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id); |
| 492 | assert(evsel->bperf_leader_link_fd >= 0); |
| 493 | |
| 494 | /* |
| 495 | * save leader_skel for install_pe, which is called within |
| 496 | * following evsel__open_per_cpu call |
| 497 | */ |
| 498 | evsel->leader_skel = skel; |
| 499 | assert(!perf_cpu_map__has_any_cpu_or_is_empty(evsel->core.cpus)); |
| 500 | /* Always open system wide. */ |
| 501 | threads = thread_map__new_by_tid(tid: -1); |
| 502 | evsel__open(evsel, cpus: evsel->core.cpus, threads); |
| 503 | perf_thread_map__put(threads); |
| 504 | |
| 505 | out: |
| 506 | bperf_leader_bpf__destroy(skel); |
| 507 | bpf_link__destroy(link); |
| 508 | return err; |
| 509 | } |
| 510 | |
| 511 | static int bperf_attach_follower_program(struct bperf_follower_bpf *skel, |
| 512 | enum bperf_filter_type filter_type, |
| 513 | bool inherit) |
| 514 | { |
| 515 | struct bpf_link *link; |
| 516 | int err = 0; |
| 517 | |
| 518 | if ((filter_type == BPERF_FILTER_PID || |
| 519 | filter_type == BPERF_FILTER_TGID) && inherit) |
| 520 | /* attach all follower bpf progs to enable event inheritance */ |
| 521 | err = bperf_follower_bpf__attach(skel); |
| 522 | else { |
| 523 | link = bpf_program__attach(skel->progs.fexit_XXX); |
| 524 | if (IS_ERR(ptr: link)) |
| 525 | err = PTR_ERR(ptr: link); |
| 526 | } |
| 527 | |
| 528 | return err; |
| 529 | } |
| 530 | |
| 531 | static int bperf__load(struct evsel *evsel, struct target *target) |
| 532 | { |
| 533 | struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff}; |
| 534 | int attr_map_fd, diff_map_fd = -1, err; |
| 535 | enum bperf_filter_type filter_type; |
| 536 | __u32 i; |
| 537 | |
| 538 | if (bperf_check_target(evsel, target, filter_type: &filter_type, filter_entry_cnt: &filter_entry_cnt)) |
| 539 | return -1; |
| 540 | |
| 541 | evsel->bperf_leader_prog_fd = -1; |
| 542 | evsel->bperf_leader_link_fd = -1; |
| 543 | |
| 544 | /* |
| 545 | * Step 1: hold a fd on the leader program and the bpf_link, if |
| 546 | * the program is not already gone, reload the program. |
| 547 | * Use flock() to ensure exclusive access to the perf_event_attr |
| 548 | * map. |
| 549 | */ |
| 550 | attr_map_fd = bperf_lock_attr_map(target); |
| 551 | if (attr_map_fd < 0) { |
| 552 | pr_err("Failed to lock perf_event_attr map\n" ); |
| 553 | return -1; |
| 554 | } |
| 555 | |
| 556 | err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry); |
| 557 | if (err) { |
| 558 | err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY); |
| 559 | if (err) |
| 560 | goto out; |
| 561 | } |
| 562 | |
| 563 | evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id); |
| 564 | if (evsel->bperf_leader_link_fd < 0 && |
| 565 | bperf_reload_leader_program(evsel, attr_map_fd, entry: &entry)) { |
| 566 | err = -1; |
| 567 | goto out; |
| 568 | } |
| 569 | /* |
| 570 | * The bpf_link holds reference to the leader program, and the |
| 571 | * leader program holds reference to the maps. Therefore, if |
| 572 | * link_id is valid, diff_map_id should also be valid. |
| 573 | */ |
| 574 | evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id( |
| 575 | bpf_link_get_prog_id(fd: evsel->bperf_leader_link_fd)); |
| 576 | assert(evsel->bperf_leader_prog_fd >= 0); |
| 577 | |
| 578 | diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id); |
| 579 | assert(diff_map_fd >= 0); |
| 580 | |
| 581 | /* |
| 582 | * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check |
| 583 | * whether the kernel support it |
| 584 | */ |
| 585 | err = bperf_trigger_reading(prog_fd: evsel->bperf_leader_prog_fd, cpu: 0); |
| 586 | if (err) { |
| 587 | pr_err("The kernel does not support test_run for raw_tp BPF programs.\n" |
| 588 | "Therefore, --use-bpf might show inaccurate readings\n" ); |
| 589 | goto out; |
| 590 | } |
| 591 | |
| 592 | /* Step 2: load the follower skeleton */ |
| 593 | evsel->follower_skel = bperf_follower_bpf__open(); |
| 594 | if (!evsel->follower_skel) { |
| 595 | err = -1; |
| 596 | pr_err("Failed to open follower skeleton\n" ); |
| 597 | goto out; |
| 598 | } |
| 599 | |
| 600 | /* attach fexit program to the leader program */ |
| 601 | bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX, |
| 602 | evsel->bperf_leader_prog_fd, "on_switch" ); |
| 603 | |
| 604 | /* connect to leader diff_reading map */ |
| 605 | bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd); |
| 606 | |
| 607 | /* set up reading map */ |
| 608 | bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings, |
| 609 | filter_entry_cnt); |
| 610 | err = bperf_follower_bpf__load(evsel->follower_skel); |
| 611 | if (err) { |
| 612 | pr_err("Failed to load follower skeleton\n" ); |
| 613 | bperf_follower_bpf__destroy(evsel->follower_skel); |
| 614 | evsel->follower_skel = NULL; |
| 615 | goto out; |
| 616 | } |
| 617 | |
| 618 | for (i = 0; i < filter_entry_cnt; i++) { |
| 619 | int filter_map_fd; |
| 620 | __u32 key; |
| 621 | struct bperf_filter_value fval = { i, 0 }; |
| 622 | |
| 623 | if (filter_type == BPERF_FILTER_PID || |
| 624 | filter_type == BPERF_FILTER_TGID) |
| 625 | key = perf_thread_map__pid(evsel->core.threads, i); |
| 626 | else if (filter_type == BPERF_FILTER_CPU) |
| 627 | key = perf_cpu_map__cpu(evsel->core.cpus, i).cpu; |
| 628 | else |
| 629 | break; |
| 630 | |
| 631 | filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter); |
| 632 | bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY); |
| 633 | } |
| 634 | |
| 635 | evsel->follower_skel->bss->type = filter_type; |
| 636 | evsel->follower_skel->bss->inherit = target->inherit; |
| 637 | |
| 638 | err = bperf_attach_follower_program(skel: evsel->follower_skel, filter_type, |
| 639 | inherit: target->inherit); |
| 640 | |
| 641 | out: |
| 642 | if (err && evsel->bperf_leader_link_fd >= 0) |
| 643 | close(evsel->bperf_leader_link_fd); |
| 644 | if (err && evsel->bperf_leader_prog_fd >= 0) |
| 645 | close(evsel->bperf_leader_prog_fd); |
| 646 | if (diff_map_fd >= 0) |
| 647 | close(diff_map_fd); |
| 648 | |
| 649 | flock(attr_map_fd, LOCK_UN); |
| 650 | close(attr_map_fd); |
| 651 | |
| 652 | return err; |
| 653 | } |
| 654 | |
| 655 | static int bperf__install_pe(struct evsel *evsel, int cpu_map_idx, int fd) |
| 656 | { |
| 657 | struct bperf_leader_bpf *skel = evsel->leader_skel; |
| 658 | int cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx).cpu; |
| 659 | |
| 660 | return bpf_map_update_elem(bpf_map__fd(skel->maps.events), |
| 661 | &cpu, &fd, BPF_ANY); |
| 662 | } |
| 663 | |
| 664 | /* |
| 665 | * trigger the leader prog on each cpu, so the accum_reading map could get |
| 666 | * the latest readings. |
| 667 | */ |
| 668 | static int bperf_sync_counters(struct evsel *evsel) |
| 669 | { |
| 670 | struct perf_cpu cpu; |
| 671 | int idx; |
| 672 | |
| 673 | perf_cpu_map__for_each_cpu(cpu, idx, evsel->core.cpus) |
| 674 | bperf_trigger_reading(prog_fd: evsel->bperf_leader_prog_fd, cpu: cpu.cpu); |
| 675 | |
| 676 | return 0; |
| 677 | } |
| 678 | |
| 679 | static int bperf__enable(struct evsel *evsel) |
| 680 | { |
| 681 | evsel->follower_skel->bss->enabled = 1; |
| 682 | return 0; |
| 683 | } |
| 684 | |
| 685 | static int bperf__disable(struct evsel *evsel) |
| 686 | { |
| 687 | evsel->follower_skel->bss->enabled = 0; |
| 688 | return 0; |
| 689 | } |
| 690 | |
| 691 | static int bperf__read(struct evsel *evsel) |
| 692 | { |
| 693 | struct bperf_follower_bpf *skel = evsel->follower_skel; |
| 694 | __u32 num_cpu_bpf = cpu__max_cpu().cpu; |
| 695 | struct bpf_perf_event_value values[num_cpu_bpf]; |
| 696 | struct perf_counts_values *counts; |
| 697 | int reading_map_fd, err = 0; |
| 698 | __u32 i; |
| 699 | int j; |
| 700 | |
| 701 | bperf_sync_counters(evsel); |
| 702 | reading_map_fd = bpf_map__fd(skel->maps.accum_readings); |
| 703 | |
| 704 | for (i = 0; i < filter_entry_cnt; i++) { |
| 705 | struct perf_cpu entry; |
| 706 | __u32 cpu; |
| 707 | |
| 708 | err = bpf_map_lookup_elem(reading_map_fd, &i, values); |
| 709 | if (err) |
| 710 | goto out; |
| 711 | switch (evsel->follower_skel->bss->type) { |
| 712 | case BPERF_FILTER_GLOBAL: |
| 713 | assert(i == 0); |
| 714 | |
| 715 | perf_cpu_map__for_each_cpu(entry, j, evsel__cpus(evsel)) { |
| 716 | counts = perf_counts(counts: evsel->counts, cpu_map_idx: j, thread: 0); |
| 717 | counts->val = values[entry.cpu].counter; |
| 718 | counts->ena = values[entry.cpu].enabled; |
| 719 | counts->run = values[entry.cpu].running; |
| 720 | } |
| 721 | break; |
| 722 | case BPERF_FILTER_CPU: |
| 723 | cpu = perf_cpu_map__cpu(evsel__cpus(evsel), i).cpu; |
| 724 | assert(cpu >= 0); |
| 725 | counts = perf_counts(counts: evsel->counts, cpu_map_idx: i, thread: 0); |
| 726 | counts->val = values[cpu].counter; |
| 727 | counts->ena = values[cpu].enabled; |
| 728 | counts->run = values[cpu].running; |
| 729 | break; |
| 730 | case BPERF_FILTER_PID: |
| 731 | case BPERF_FILTER_TGID: |
| 732 | counts = perf_counts(counts: evsel->counts, cpu_map_idx: 0, thread: i); |
| 733 | counts->val = 0; |
| 734 | counts->ena = 0; |
| 735 | counts->run = 0; |
| 736 | |
| 737 | for (cpu = 0; cpu < num_cpu_bpf; cpu++) { |
| 738 | counts->val += values[cpu].counter; |
| 739 | counts->ena += values[cpu].enabled; |
| 740 | counts->run += values[cpu].running; |
| 741 | } |
| 742 | break; |
| 743 | default: |
| 744 | break; |
| 745 | } |
| 746 | } |
| 747 | out: |
| 748 | return err; |
| 749 | } |
| 750 | |
| 751 | static int bperf__destroy(struct evsel *evsel) |
| 752 | { |
| 753 | bperf_follower_bpf__destroy(evsel->follower_skel); |
| 754 | close(evsel->bperf_leader_prog_fd); |
| 755 | close(evsel->bperf_leader_link_fd); |
| 756 | return 0; |
| 757 | } |
| 758 | |
/*
 * bperf: share hardware PMCs with BPF
 *
 * perf uses performance monitoring counters (PMC) to monitor system
 * performance. The PMCs are limited hardware resources. For example,
 * Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu.
 *
 * Modern data center systems use these PMCs in many different ways:
 * system level monitoring, (maybe nested) container level monitoring, per
 * process monitoring, profiling (in sample mode), etc. In some cases,
 * there are more active perf_events than available hardware PMCs. To allow
 * all perf_events to have a chance to run, it is necessary to do expensive
 * time multiplexing of events.
 *
 * On the other hand, many monitoring tools count the common metrics
 * (cycles, instructions). It is a waste to have multiple tools create
 * multiple perf_events of "cycles" and occupy multiple PMCs.
 *
 * bperf tries to reduce such waste by allowing multiple perf_events of
 * "cycles" or "instructions" (at different scopes) to share PMUs. Instead
 * of having each perf-stat session read its own perf_events, bperf uses
 * BPF programs to read the perf_events and aggregate readings to BPF maps.
 * Then, the perf-stat session(s) reads the values from these BPF maps.
 *
 *                                ||
 *       shared progs and maps <- || -> per session progs and maps
 *                                ||
 *   ---------------              ||
 *   | perf_events |              ||
 *   ---------------       fexit  ||      -----------------
 *          |             --------||----> | follower prog |
 *       --------------- /        || ---  -----------------
 * cs -> | leader prog |/         ||/        |         |
 *   --> ---------------         /||  --------------  ------------------
 *  /       |         |         / ||  | filter map |  | accum_readings |
 * /  ------------  ------------  ||  --------------  ------------------
 * |  | prev map |  | diff map |  ||                        |
 * |  ------------  ------------  ||                        |
 *  \                             ||                        |
 * = \ ==================================================== | ============
 *    \                                                    /   user space
 *     \                                                  /
 *      \                                                /
 *    BPF_PROG_TEST_RUN                    BPF_MAP_LOOKUP_ELEM
 *        \                                            /
 *         \                                          /
 *          \------  perf-stat ----------------------/
 *
 * The figure above shows the architecture of bperf. Note that the figure
 * is divided into 3 regions: shared progs and maps (top left), per session
 * progs and maps (top right), and user space (bottom).
 *
 * The leader prog is triggered on each context switch (cs). The leader
 * prog reads perf_events and stores the difference (current_reading -
 * previous_reading) to the diff map. For the same metric, e.g. "cycles",
 * multiple perf-stat sessions share the same leader prog.
 *
 * Each perf-stat session creates a follower prog as fexit program to the
 * leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38)
 * follower progs to the same leader prog. The follower prog checks current
 * task and processor ID to decide whether to add the value from the diff
 * map to its accumulated reading map (accum_readings).
 *
 * Finally, perf-stat user space reads the value from accum_reading map.
 *
 * Besides context switch, it is also necessary to trigger the leader prog
 * before perf-stat reads the value. Otherwise, the accum_reading map may
 * not have the latest reading from the perf_events. This is achieved by
 * triggering the event via sys_bpf(BPF_PROG_TEST_RUN) to each CPU.
 *
 * Comment before the definition of struct perf_event_attr_map_entry
 * describes how different sessions of perf-stat share information about
 * the leader prog.
 */
| 833 | |
| 834 | struct bpf_counter_ops bperf_ops = { |
| 835 | .load = bperf__load, |
| 836 | .enable = bperf__enable, |
| 837 | .disable = bperf__disable, |
| 838 | .read = bperf__read, |
| 839 | .install_pe = bperf__install_pe, |
| 840 | .destroy = bperf__destroy, |
| 841 | }; |
| 842 | |
/* Defined outside this file; picked by bpf_counter__load() for expanded cgroup events. */
extern struct bpf_counter_ops bperf_cgrp_ops;
| 844 | |
| 845 | static bool bpf_counter_skip(struct evsel *evsel) |
| 846 | { |
| 847 | return evsel->bpf_counter_ops == NULL; |
| 848 | } |
| 849 | |
| 850 | int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd) |
| 851 | { |
| 852 | if (bpf_counter_skip(evsel)) |
| 853 | return 0; |
| 854 | return evsel->bpf_counter_ops->install_pe(evsel, cpu_map_idx, fd); |
| 855 | } |
| 856 | |
| 857 | int bpf_counter__load(struct evsel *evsel, struct target *target) |
| 858 | { |
| 859 | if (target->bpf_str) |
| 860 | evsel->bpf_counter_ops = &bpf_program_profiler_ops; |
| 861 | else if (cgrp_event_expanded && target->use_bpf) |
| 862 | evsel->bpf_counter_ops = &bperf_cgrp_ops; |
| 863 | else if (target->use_bpf || evsel->bpf_counter || |
| 864 | evsel__match_bpf_counter_events(name: evsel->name)) |
| 865 | evsel->bpf_counter_ops = &bperf_ops; |
| 866 | |
| 867 | if (evsel->bpf_counter_ops) |
| 868 | return evsel->bpf_counter_ops->load(evsel, target); |
| 869 | return 0; |
| 870 | } |
| 871 | |
| 872 | int bpf_counter__enable(struct evsel *evsel) |
| 873 | { |
| 874 | if (bpf_counter_skip(evsel)) |
| 875 | return 0; |
| 876 | return evsel->bpf_counter_ops->enable(evsel); |
| 877 | } |
| 878 | |
| 879 | int bpf_counter__disable(struct evsel *evsel) |
| 880 | { |
| 881 | if (bpf_counter_skip(evsel)) |
| 882 | return 0; |
| 883 | return evsel->bpf_counter_ops->disable(evsel); |
| 884 | } |
| 885 | |
| 886 | int bpf_counter__read(struct evsel *evsel) |
| 887 | { |
| 888 | if (bpf_counter_skip(evsel)) |
| 889 | return -EAGAIN; |
| 890 | return evsel->bpf_counter_ops->read(evsel); |
| 891 | } |
| 892 | |
| 893 | void bpf_counter__destroy(struct evsel *evsel) |
| 894 | { |
| 895 | if (bpf_counter_skip(evsel)) |
| 896 | return; |
| 897 | evsel->bpf_counter_ops->destroy(evsel); |
| 898 | evsel->bpf_counter_ops = NULL; |
| 899 | evsel->bpf_skel = NULL; |
| 900 | } |
| 901 | |