// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;		/* protects last_update/acnt/mcnt for readers */
	unsigned long	last_update;	/* jiffies at the last tick update */
	u64		acnt;		/* APERF delta over the last tick period */
	u64		mcnt;		/* MPERF delta over the last tick period */
	u64		aperf;		/* raw APERF count at the last tick */
	u64		mperf;		/* raw MPERF count at the last tick */
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *	BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio causes the ratio
 * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
 */
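/*
 * Worked example with illustrative numbers (not taken from any particular
 * part): with freq_base = 2000 MHz and a 4C turbo of 3000 MHz,
 * arch_turbo_freq_ratio = 3000 * SCHED_CAPACITY_SCALE / 2000 = 1536.
 * A tick period that ran at an effective 2500 MHz (delta_APERF /
 * delta_MPERF = 1.25) then yields freq_scale = 1.25 * 1024 * 1024 / 1536
 * ~= 853, i.e. roughly 2500 / 3000 of SCHED_CAPACITY_SCALE.
 */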

DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo    */

	return true;
}

#define X86_MATCH(model)					\
	X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,		\
		INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(XEON_PHI_KNL),
	X86_MATCH(XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(ATOM_GOLDMONT),
	X86_MATCH(ATOM_GOLDMONT_D),
	X86_MATCH(ATOM_GOLDMONT_PLUS),
	{}
};

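/*
 * Sketch of the decoding done below, based only on the code that follows:
 * the base ratio comes from MSR_PLATFORM_INFO[15:8], the 1C turbo ratio from
 * MSR_TURBO_RATIO_LIMIT[15:8], and each subsequent byte of that MSR carries
 * a 3-bit ratio delta in its top bits. The loop walks those groups and stops
 * once num_delta_fratio non-zero deltas have been applied.
 */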
static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}

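/*
 * On SKX-class parts the turbo ratios are grouped: MSR_TURBO_RATIO_LIMIT1
 * holds a core count per byte and MSR_TURBO_RATIO_LIMIT the matching ratio,
 * so the helper below picks the ratio of the first group covering at least
 * 'size' cores (a reading of the code itself, not of the SDM).
 */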
static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo    */

	/* The CPU may have less than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;	/* 1C turbo    */

	return true;
}

static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio())
		freq_invariance_enable();
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;

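/*
 * Per-tick fixed point computation, matching the code below:
 *
 *	freq_scale = (acnt << 2 * SCHED_CAPACITY_SHIFT) /
 *		     (mcnt * arch_max_freq_ratio)
 *
 * arch_max_freq_ratio already carries one SCHED_CAPACITY_SHIFT worth of
 * scaling (freq_max / freq_base * SCHED_CAPACITY_SCALE), so shifting acnt
 * by twice SCHED_CAPACITY_SHIFT leaves the result in units of
 * SCHED_CAPACITY_SCALE, i.e. freq_curr / freq_max clipped to at most 1.
 */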
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)

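/*
 * Report the effective frequency in kHz over the last tick period:
 * cpu_khz scaled by the APERF/MPERF delta ratio. If the sample is stale
 * or the MPERF delta is zero, fall back to cpufreq_quick_get(), and to
 * cpu_khz when that reports zero.
 */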
unsigned int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}

static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}