// SPDX-License-Identifier: GPL-2.0
/*
 * access_tracking_perf_test
 *
 * Copyright (C) 2021, Google, Inc.
 *
 * This test measures the performance effects of KVM's access tracking.
 * Access tracking is driven by the MMU notifiers test_young, clear_young, and
 * clear_flush_young. These notifiers do not have a direct userspace API,
 * however the clear_young notifier can be triggered by marking pages as idle
 * in /sys/kernel/mm/page_idle/bitmap. This test leverages that mechanism to
 * enable access tracking on guest memory.
 *
 * To measure performance this test runs a VM with a configurable number of
 * vCPUs that each touch every page in disjoint regions of memory. Performance
 * is measured in the time it takes all vCPUs to finish touching their
 * predefined region.
 *
 * Note that a deterministic correctness test of access tracking is not
 * possible by using page_idle as it exists today. This is for a few reasons:
 *
 * 1. page_idle only issues clear_young notifiers, which lack a TLB flush. This
 *    means subsequent guest accesses are not guaranteed to see page table
 *    updates made by KVM until some time in the future.
 *
 * 2. page_idle only operates on LRU pages. Newly allocated pages are not
 *    immediately added to LRU lists. Instead they are held in a "pagevec",
 *    which is drained to LRU lists some time in the future. There is no
 *    userspace API to force this drain to occur.
 *
 * These limitations are worked around in this test by using a large enough
 * region of memory for each vCPU such that the number of translations cached
 * in the TLB and the number of pages held in pagevecs are a small fraction of
 * the overall workload. And if either of those conditions is not met (for
 * example when nesting, where the effective TLB size is unbounded) this test
 * will print a warning rather than silently passing.
 */
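/*
 * Example invocation (see main() below for the full option list). Reading
 * PFNs out of /proc/self/pagemap requires CAP_SYS_ADMIN, so the test is
 * typically run as root:
 *
 *	./access_tracking_perf_test -v 4 -b 1G
 */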
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>

#include "kvm_util.h"
#include "test_util.h"
#include "memstress.h"
#include "guest_modes.h"
#include "processor.h"

/* Global variable used to synchronize all of the vCPU threads. */
static int iteration;

/* Defines what vCPU threads should do during a given iteration. */
static enum {
	/* Run the vCPU to access all its memory. */
	ITERATION_ACCESS_MEMORY,
	/* Mark the vCPU's memory idle in page_idle. */
	ITERATION_MARK_IDLE,
} iteration_work;
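
/*
 * run_iteration() publishes work by setting iteration_work and then
 * incrementing iteration; vCPU threads spin until they observe the new
 * iteration value and report completion through
 * vcpu_last_completed_iteration[] below.
 */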

/* The iteration that was last completed by each vCPU. */
static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];

/* Whether to overlap the regions of memory vCPUs access. */
static bool overlap_memory_access;

struct test_params {
	/* The backing source for the region of memory. */
	enum vm_mem_backing_src_type backing_src;

	/* The amount of memory to allocate for each vCPU. */
	uint64_t vcpu_memory_bytes;

	/* The number of vCPUs to create in the VM. */
	int nr_vcpus;
};

static uint64_t pread_uint64(int fd, const char *filename, uint64_t index)
{
	uint64_t value;
	off_t offset = index * sizeof(value);

	TEST_ASSERT(pread(fd, &value, sizeof(value), offset) == sizeof(value),
		    "pread from %s offset 0x%" PRIx64 " failed!",
		    filename, offset);

	return value;
}

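/*
 * Format of a /proc/<pid>/pagemap entry, per
 * Documentation/admin-guide/mm/pagemap.rst: bit 63 is set if the page is
 * present and bits 0-54 hold the PFN. The PFN field reads as zero unless
 * the caller has CAP_SYS_ADMIN.
 */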
#define PAGEMAP_PRESENT (1ULL << 63)
#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1)

static uint64_t lookup_pfn(int pagemap_fd, struct kvm_vm *vm, uint64_t gva)
{
	uint64_t hva = (uint64_t) addr_gva2hva(vm, gva);
	uint64_t entry;
	uint64_t pfn;

	entry = pread_uint64(pagemap_fd, "pagemap", hva / getpagesize());
	if (!(entry & PAGEMAP_PRESENT))
		return 0;

	pfn = entry & PAGEMAP_PFN_MASK;
	__TEST_REQUIRE(pfn, "Looking up PFNs requires CAP_SYS_ADMIN");

	return pfn;
}

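/*
 * The page_idle bitmap exposes one bit per PFN, packed into 64-bit words
 * (Documentation/admin-guide/mm/idle_page_tracking.rst). Writing a set bit
 * marks the page idle, which is what triggers the clear_young notifier;
 * reading the bit back reports whether the page was accessed since it was
 * marked idle.
 */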
static bool is_page_idle(int page_idle_fd, uint64_t pfn)
{
	uint64_t bits = pread_uint64(page_idle_fd, "page_idle", pfn / 64);

	return !!((bits >> (pfn % 64)) & 1);
}

static void mark_page_idle(int page_idle_fd, uint64_t pfn)
{
	uint64_t bits = 1ULL << (pfn % 64);

	TEST_ASSERT(pwrite(page_idle_fd, &bits, 8, 8 * (pfn / 64)) == 8,
		    "Set page_idle bits for PFN 0x%" PRIx64, pfn);
}

static void mark_vcpu_memory_idle(struct kvm_vm *vm,
				  struct memstress_vcpu_args *vcpu_args)
{
	int vcpu_idx = vcpu_args->vcpu_idx;
	uint64_t base_gva = vcpu_args->gva;
	uint64_t pages = vcpu_args->pages;
	uint64_t page;
	uint64_t still_idle = 0;
	uint64_t no_pfn = 0;
	int page_idle_fd;
	int pagemap_fd;

	/* If vCPUs are using an overlapping region, let vCPU 0 mark it idle. */
	if (overlap_memory_access && vcpu_idx)
		return;

	page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
	TEST_ASSERT(page_idle_fd >= 0, "Failed to open page_idle.");

	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	TEST_ASSERT(pagemap_fd >= 0, "Failed to open pagemap.");

	for (page = 0; page < pages; page++) {
		uint64_t gva = base_gva + page * memstress_args.guest_page_size;
		uint64_t pfn = lookup_pfn(pagemap_fd, vm, gva);

		if (!pfn) {
			no_pfn++;
			continue;
		}

		if (is_page_idle(page_idle_fd, pfn)) {
			still_idle++;
			continue;
		}

		mark_page_idle(page_idle_fd, pfn);
	}

	/*
	 * Assumption: Less than 1% of pages are going to be swapped out from
	 * under us during this test.
	 */
	TEST_ASSERT(no_pfn < pages / 100,
		    "vCPU %d: No PFN for %" PRIu64 " out of %" PRIu64 " pages.",
		    vcpu_idx, no_pfn, pages);

	/*
	 * Check that at least 90% of memory has been marked idle (the rest
	 * might not be marked idle because the pages have not yet made it to
	 * an LRU list or the translations are still cached in the TLB). 90%
	 * is arbitrary; high enough that we ensure most memory accesses went
	 * through access tracking but low enough so as not to make the test
	 * too brittle over time and across architectures.
	 *
	 * When running the guest as a nested VM, "warn" instead of asserting
	 * as the TLB size is effectively unlimited and KVM doesn't explicitly
	 * flush the TLB when aging SPTEs. As a result, more pages are cached
	 * and the guest won't see the "idle" bit cleared.
	 */
	if (still_idle >= pages / 10) {
#ifdef __x86_64__
		TEST_ASSERT(this_cpu_has(X86_FEATURE_HYPERVISOR),
			    "vCPU%d: Too many pages still idle (%lu out of %lu)",
			    vcpu_idx, still_idle, pages);
#endif
		printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), "
		       "this will affect performance results.\n",
		       vcpu_idx, still_idle, pages);
	}

	close(page_idle_fd);
	close(pagemap_fd);
}

static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_ucall)
{
	struct ucall uc;
	uint64_t actual_ucall = get_ucall(vcpu, &uc);

	TEST_ASSERT(expected_ucall == actual_ucall,
		    "Guest exited unexpectedly (expected ucall %" PRIu64
		    ", got %" PRIu64 ")",
		    expected_ucall, actual_ucall);
}

static bool spin_wait_for_next_iteration(int *current_iteration)
{
	int last_iteration = *current_iteration;

	do {
		if (READ_ONCE(memstress_args.stop_vcpus))
			return false;

		*current_iteration = READ_ONCE(iteration);
	} while (last_iteration == *current_iteration);

	return true;
}

static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args)
{
	struct kvm_vcpu *vcpu = vcpu_args->vcpu;
	struct kvm_vm *vm = memstress_args.vm;
	int vcpu_idx = vcpu_args->vcpu_idx;
	int current_iteration = 0;

	while (spin_wait_for_next_iteration(&current_iteration)) {
		switch (READ_ONCE(iteration_work)) {
		case ITERATION_ACCESS_MEMORY:
			vcpu_run(vcpu);
			assert_ucall(vcpu, UCALL_SYNC);
			break;
		case ITERATION_MARK_IDLE:
			mark_vcpu_memory_idle(vm, vcpu_args);
			break;
		}

		vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
	}
}

static void spin_wait_for_vcpu(int vcpu_idx, int target_iteration)
{
	while (READ_ONCE(vcpu_last_completed_iteration[vcpu_idx]) !=
	       target_iteration) {
		continue;
	}
}

/* The type of memory accesses to perform in the VM. */
enum access_type {
	ACCESS_READ,
	ACCESS_WRITE,
};

static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *description)
{
	struct timespec ts_start;
	struct timespec ts_elapsed;
	int next_iteration, i;

	/* Kick off the vCPUs by incrementing iteration. */
	next_iteration = ++iteration;

	clock_gettime(CLOCK_MONOTONIC, &ts_start);

	/* Wait for all vCPUs to finish the iteration. */
	for (i = 0; i < nr_vcpus; i++)
		spin_wait_for_vcpu(i, next_iteration);

	ts_elapsed = timespec_elapsed(ts_start);
	pr_info("%-30s: %ld.%09lds\n",
		description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec);
}

static void access_memory(struct kvm_vm *vm, int nr_vcpus,
			  enum access_type access, const char *description)
{
	memstress_set_write_percent(vm, (access == ACCESS_READ) ? 0 : 100);
	iteration_work = ITERATION_ACCESS_MEMORY;
	run_iteration(vm, nr_vcpus, description);
}

static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus)
{
	/*
	 * Even though this parallelizes the work across vCPUs, this is still
	 * a very slow operation because page_idle forces the test to mark one
	 * pfn at a time and the clear_young notifier serializes on the KVM
	 * MMU lock.
	 */
	pr_debug("Marking VM memory idle (slow)...\n");
	iteration_work = ITERATION_MARK_IDLE;
	run_iteration(vm, nr_vcpus, "Mark memory idle");
}

static void run_test(enum vm_guest_mode mode, void *arg)
{
	struct test_params *params = arg;
	struct kvm_vm *vm;
	int nr_vcpus = params->nr_vcpus;

	vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
				 params->backing_src, !overlap_memory_access);

	memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main);

	pr_info("\n");
	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory");

	/* As a control, read and write to the populated memory first. */
	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to populated memory");
	access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from populated memory");

	/* Repeat on memory that has been marked as idle. */
	mark_memory_idle(vm, nr_vcpus);
	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to idle memory");
	mark_memory_idle(vm, nr_vcpus);
	access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory");

	memstress_join_vcpu_threads(nr_vcpus);
	memstress_destroy_vm(vm);
}

static void help(char *name)
{
	puts("");
	printf("usage: %s [-h] [-m mode] [-b vcpu_bytes] [-v vcpus] [-o] [-s mem_type]\n",
	       name);
	puts("");
	printf(" -h: Display this help message.\n");
	guest_modes_help();
	printf(" -b: specify the size of the memory region which should be\n"
	       "     dirtied by each vCPU, e.g. 10M or 3G.\n"
	       "     (default: 1G)\n");
	printf(" -v: specify the number of vCPUs to run.\n");
	printf(" -o: Overlap guest memory accesses instead of partitioning\n"
	       "     them into a separate region of memory for each vCPU.\n");
	backing_src_help("-s");
	puts("");
	exit(0);
}

int main(int argc, char *argv[])
{
	struct test_params params = {
		.backing_src = DEFAULT_VM_MEM_SRC,
		.vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE,
		.nr_vcpus = 1,
	};
	int page_idle_fd;
	int opt;

	guest_modes_append_default();

	while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) {
		switch (opt) {
		case 'm':
			guest_modes_cmdline(optarg);
			break;
		case 'b':
			params.vcpu_memory_bytes = parse_size(optarg);
			break;
		case 'v':
			params.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
			break;
		case 'o':
			overlap_memory_access = true;
			break;
		case 's':
			params.backing_src = parse_backing_src_type(optarg);
			break;
		case 'h':
		default:
			help(argv[0]);
			break;
		}
	}

	page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
	__TEST_REQUIRE(page_idle_fd >= 0,
		       "CONFIG_IDLE_PAGE_TRACKING is not enabled");
	close(page_idle_fd);

	for_each_guest_mode(run_test, &params);

	return 0;
}