// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * fam15h_power.c - AMD Family 15h processor power monitoring
 *
 * Copyright (c) 2011-2016 Advanced Micro Devices, Inc.
 * Author: Andreas Herrmann <herrmann.der.user@googlemail.com>
 */
8 | |
#include <linux/bitops.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/hwmon.h>
#include <linux/hwmon-sysfs.h>
#include <linux/init.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/sched.h>
#include <linux/time.h>
#include <linux/topology.h>
#include <asm/msr.h>
#include <asm/processor.h>
23 | |
24 | MODULE_DESCRIPTION("AMD Family 15h CPU processor power monitor" ); |
25 | MODULE_AUTHOR("Andreas Herrmann <herrmann.der.user@googlemail.com>" ); |
26 | MODULE_LICENSE("GPL" ); |
27 | |
/*
 * PCI configuration space register offsets, grouped by the northbridge
 * function they are read from.
 */

/* D18F3 */
#define REG_NORTHBRIDGE_CAP		0xe8

/* D18F4 */
#define REG_PROCESSOR_TDP		0x1b8

/* D18F5 */
#define REG_TDP_RUNNING_AVERAGE		0xe0
#define REG_TDP_LIMIT3			0xe8

/* sizing of the sysfs attribute array and of the group list */
#define FAM15H_MIN_NUM_ATTRS		2
#define FAM15H_NUM_GROUPS		2
/* capacity of the per-compute-unit arrays in struct fam15h_power_data */
#define MAX_CUS				8

/* largest accepted power1_average_interval, in milliseconds (1 second) */
#define MAX_INTERVAL			1000

#define PCI_DEVICE_ID_AMD_15H_M70H_NB_F4 0x15b4
46 | |
47 | struct fam15h_power_data { |
48 | struct pci_dev *pdev; |
49 | unsigned int tdp_to_watts; |
50 | unsigned int base_tdp; |
51 | unsigned int processor_pwr_watts; |
52 | unsigned int cpu_pwr_sample_ratio; |
53 | const struct attribute_group *groups[FAM15H_NUM_GROUPS]; |
54 | struct attribute_group group; |
55 | /* maximum accumulated power of a compute unit */ |
56 | u64 max_cu_acc_power; |
57 | /* accumulated power of the compute units */ |
58 | u64 cu_acc_power[MAX_CUS]; |
59 | /* performance timestamp counter */ |
60 | u64 cpu_sw_pwr_ptsc[MAX_CUS]; |
61 | /* online/offline status of current compute unit */ |
62 | int cu_on[MAX_CUS]; |
63 | unsigned long power_period; |
64 | }; |
65 | |
66 | static bool is_carrizo_or_later(void) |
67 | { |
68 | return boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model >= 0x60; |
69 | } |
70 | |
71 | static ssize_t power1_input_show(struct device *dev, |
72 | struct device_attribute *attr, char *buf) |
73 | { |
74 | u32 val, tdp_limit, running_avg_range; |
75 | s32 running_avg_capture; |
76 | u64 curr_pwr_watts; |
77 | struct fam15h_power_data *data = dev_get_drvdata(dev); |
78 | struct pci_dev *f4 = data->pdev; |
79 | |
80 | pci_bus_read_config_dword(bus: f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), |
81 | REG_TDP_RUNNING_AVERAGE, val: &val); |
82 | |
83 | /* |
84 | * On Carrizo and later platforms, TdpRunAvgAccCap bit field |
85 | * is extended to 4:31 from 4:25. |
86 | */ |
87 | if (is_carrizo_or_later()) { |
88 | running_avg_capture = val >> 4; |
89 | running_avg_capture = sign_extend32(value: running_avg_capture, index: 27); |
90 | } else { |
91 | running_avg_capture = (val >> 4) & 0x3fffff; |
92 | running_avg_capture = sign_extend32(value: running_avg_capture, index: 21); |
93 | } |
94 | |
95 | running_avg_range = (val & 0xf) + 1; |
96 | |
97 | pci_bus_read_config_dword(bus: f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), |
98 | REG_TDP_LIMIT3, val: &val); |
99 | |
100 | /* |
101 | * On Carrizo and later platforms, ApmTdpLimit bit field |
102 | * is extended to 16:31 from 16:28. |
103 | */ |
104 | if (is_carrizo_or_later()) |
105 | tdp_limit = val >> 16; |
106 | else |
107 | tdp_limit = (val >> 16) & 0x1fff; |
108 | |
109 | curr_pwr_watts = ((u64)(tdp_limit + |
110 | data->base_tdp)) << running_avg_range; |
111 | curr_pwr_watts -= running_avg_capture; |
112 | curr_pwr_watts *= data->tdp_to_watts; |
113 | |
114 | /* |
115 | * Convert to microWatt |
116 | * |
117 | * power is in Watt provided as fixed point integer with |
118 | * scaling factor 1/(2^16). For conversion we use |
119 | * (10^6)/(2^16) = 15625/(2^10) |
120 | */ |
121 | curr_pwr_watts = (curr_pwr_watts * 15625) >> (10 + running_avg_range); |
122 | return sprintf(buf, fmt: "%u\n" , (unsigned int) curr_pwr_watts); |
123 | } |
124 | static DEVICE_ATTR_RO(power1_input); |
125 | |
126 | static ssize_t power1_crit_show(struct device *dev, |
127 | struct device_attribute *attr, char *buf) |
128 | { |
129 | struct fam15h_power_data *data = dev_get_drvdata(dev); |
130 | |
131 | return sprintf(buf, fmt: "%u\n" , data->processor_pwr_watts); |
132 | } |
133 | static DEVICE_ATTR_RO(power1_crit); |
134 | |
135 | static void do_read_registers_on_cu(void *_data) |
136 | { |
137 | struct fam15h_power_data *data = _data; |
138 | int cu; |
139 | |
140 | /* |
141 | * With the new x86 topology modelling, cpu core id actually |
142 | * is compute unit id. |
143 | */ |
144 | cu = topology_core_id(smp_processor_id()); |
145 | |
146 | rdmsrl_safe(MSR_F15H_CU_PWR_ACCUMULATOR, p: &data->cu_acc_power[cu]); |
147 | rdmsrl_safe(MSR_F15H_PTSC, p: &data->cpu_sw_pwr_ptsc[cu]); |
148 | |
149 | data->cu_on[cu] = 1; |
150 | } |
151 | |
152 | /* |
153 | * This function is only able to be called when CPUID |
154 | * Fn8000_0007:EDX[12] is set. |
155 | */ |
156 | static int read_registers(struct fam15h_power_data *data) |
157 | { |
158 | int core, this_core; |
159 | cpumask_var_t mask; |
160 | int ret, cpu; |
161 | |
162 | ret = zalloc_cpumask_var(mask: &mask, GFP_KERNEL); |
163 | if (!ret) |
164 | return -ENOMEM; |
165 | |
166 | memset(data->cu_on, 0, sizeof(int) * MAX_CUS); |
167 | |
168 | cpus_read_lock(); |
169 | |
170 | /* |
171 | * Choose the first online core of each compute unit, and then |
172 | * read their MSR value of power and ptsc in a single IPI, |
173 | * because the MSR value of CPU core represent the compute |
174 | * unit's. |
175 | */ |
176 | core = -1; |
177 | |
178 | for_each_online_cpu(cpu) { |
179 | this_core = topology_core_id(cpu); |
180 | |
181 | if (this_core == core) |
182 | continue; |
183 | |
184 | core = this_core; |
185 | |
186 | /* get any CPU on this compute unit */ |
187 | cpumask_set_cpu(cpumask_any(topology_sibling_cpumask(cpu)), dstp: mask); |
188 | } |
189 | |
190 | on_each_cpu_mask(mask, func: do_read_registers_on_cu, info: data, wait: true); |
191 | |
192 | cpus_read_unlock(); |
193 | free_cpumask_var(mask); |
194 | |
195 | return 0; |
196 | } |
197 | |
198 | static ssize_t power1_average_show(struct device *dev, |
199 | struct device_attribute *attr, char *buf) |
200 | { |
201 | struct fam15h_power_data *data = dev_get_drvdata(dev); |
202 | u64 prev_cu_acc_power[MAX_CUS], prev_ptsc[MAX_CUS], |
203 | jdelta[MAX_CUS]; |
204 | u64 tdelta, avg_acc; |
205 | int cu, cu_num, ret; |
206 | signed long leftover; |
207 | |
208 | /* |
209 | * With the new x86 topology modelling, x86_max_cores is the |
210 | * compute unit number. |
211 | */ |
212 | cu_num = topology_num_cores_per_package(); |
213 | |
214 | ret = read_registers(data); |
215 | if (ret) |
216 | return 0; |
217 | |
218 | for (cu = 0; cu < cu_num; cu++) { |
219 | prev_cu_acc_power[cu] = data->cu_acc_power[cu]; |
220 | prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; |
221 | } |
222 | |
223 | leftover = schedule_timeout_interruptible(timeout: msecs_to_jiffies(m: data->power_period)); |
224 | if (leftover) |
225 | return 0; |
226 | |
227 | ret = read_registers(data); |
228 | if (ret) |
229 | return 0; |
230 | |
231 | for (cu = 0, avg_acc = 0; cu < cu_num; cu++) { |
232 | /* check if current compute unit is online */ |
233 | if (data->cu_on[cu] == 0) |
234 | continue; |
235 | |
236 | if (data->cu_acc_power[cu] < prev_cu_acc_power[cu]) { |
237 | jdelta[cu] = data->max_cu_acc_power + data->cu_acc_power[cu]; |
238 | jdelta[cu] -= prev_cu_acc_power[cu]; |
239 | } else { |
240 | jdelta[cu] = data->cu_acc_power[cu] - prev_cu_acc_power[cu]; |
241 | } |
242 | tdelta = data->cpu_sw_pwr_ptsc[cu] - prev_ptsc[cu]; |
243 | jdelta[cu] *= data->cpu_pwr_sample_ratio * 1000; |
244 | do_div(jdelta[cu], tdelta); |
245 | |
246 | /* the unit is microWatt */ |
247 | avg_acc += jdelta[cu]; |
248 | } |
249 | |
250 | return sprintf(buf, fmt: "%llu\n" , (unsigned long long)avg_acc); |
251 | } |
252 | static DEVICE_ATTR_RO(power1_average); |
253 | |
254 | static ssize_t power1_average_interval_show(struct device *dev, |
255 | struct device_attribute *attr, |
256 | char *buf) |
257 | { |
258 | struct fam15h_power_data *data = dev_get_drvdata(dev); |
259 | |
260 | return sprintf(buf, fmt: "%lu\n" , data->power_period); |
261 | } |
262 | |
263 | static ssize_t power1_average_interval_store(struct device *dev, |
264 | struct device_attribute *attr, |
265 | const char *buf, size_t count) |
266 | { |
267 | struct fam15h_power_data *data = dev_get_drvdata(dev); |
268 | unsigned long temp; |
269 | int ret; |
270 | |
271 | ret = kstrtoul(s: buf, base: 10, res: &temp); |
272 | if (ret) |
273 | return ret; |
274 | |
275 | if (temp > MAX_INTERVAL) |
276 | return -EINVAL; |
277 | |
278 | /* the interval value should be greater than 0 */ |
279 | if (temp <= 0) |
280 | return -EINVAL; |
281 | |
282 | data->power_period = temp; |
283 | |
284 | return count; |
285 | } |
286 | static DEVICE_ATTR_RW(power1_average_interval); |
287 | |
288 | static int fam15h_power_init_attrs(struct pci_dev *pdev, |
289 | struct fam15h_power_data *data) |
290 | { |
291 | int n = FAM15H_MIN_NUM_ATTRS; |
292 | struct attribute **fam15h_power_attrs; |
293 | struct cpuinfo_x86 *c = &boot_cpu_data; |
294 | |
295 | if (c->x86 == 0x15 && |
296 | (c->x86_model <= 0xf || |
297 | (c->x86_model >= 0x60 && c->x86_model <= 0x7f))) |
298 | n += 1; |
299 | |
300 | /* check if processor supports accumulated power */ |
301 | if (boot_cpu_has(X86_FEATURE_ACC_POWER)) |
302 | n += 2; |
303 | |
304 | fam15h_power_attrs = devm_kcalloc(dev: &pdev->dev, n, |
305 | size: sizeof(*fam15h_power_attrs), |
306 | GFP_KERNEL); |
307 | |
308 | if (!fam15h_power_attrs) |
309 | return -ENOMEM; |
310 | |
311 | n = 0; |
312 | fam15h_power_attrs[n++] = &dev_attr_power1_crit.attr; |
313 | if (c->x86 == 0x15 && |
314 | (c->x86_model <= 0xf || |
315 | (c->x86_model >= 0x60 && c->x86_model <= 0x7f))) |
316 | fam15h_power_attrs[n++] = &dev_attr_power1_input.attr; |
317 | |
318 | if (boot_cpu_has(X86_FEATURE_ACC_POWER)) { |
319 | fam15h_power_attrs[n++] = &dev_attr_power1_average.attr; |
320 | fam15h_power_attrs[n++] = &dev_attr_power1_average_interval.attr; |
321 | } |
322 | |
323 | data->group.attrs = fam15h_power_attrs; |
324 | |
325 | return 0; |
326 | } |
327 | |
328 | static bool should_load_on_this_node(struct pci_dev *f4) |
329 | { |
330 | u32 val; |
331 | |
332 | pci_bus_read_config_dword(bus: f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 3), |
333 | REG_NORTHBRIDGE_CAP, val: &val); |
334 | if ((val & BIT(29)) && ((val >> 30) & 3)) |
335 | return false; |
336 | |
337 | return true; |
338 | } |
339 | |
340 | /* |
341 | * Newer BKDG versions have an updated recommendation on how to properly |
342 | * initialize the running average range (was: 0xE, now: 0x9). This avoids |
343 | * counter saturations resulting in bogus power readings. |
344 | * We correct this value ourselves to cope with older BIOSes. |
345 | */ |
346 | static const struct pci_device_id affected_device[] = { |
347 | { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, |
348 | { 0 } |
349 | }; |
350 | |
351 | static void tweak_runavg_range(struct pci_dev *pdev) |
352 | { |
353 | u32 val; |
354 | |
355 | /* |
356 | * let this quirk apply only to the current version of the |
357 | * northbridge, since future versions may change the behavior |
358 | */ |
359 | if (!pci_match_id(ids: affected_device, dev: pdev)) |
360 | return; |
361 | |
362 | pci_bus_read_config_dword(bus: pdev->bus, |
363 | PCI_DEVFN(PCI_SLOT(pdev->devfn), 5), |
364 | REG_TDP_RUNNING_AVERAGE, val: &val); |
365 | if ((val & 0xf) != 0xe) |
366 | return; |
367 | |
368 | val &= ~0xf; |
369 | val |= 0x9; |
370 | pci_bus_write_config_dword(bus: pdev->bus, |
371 | PCI_DEVFN(PCI_SLOT(pdev->devfn), 5), |
372 | REG_TDP_RUNNING_AVERAGE, val); |
373 | } |
374 | |
#ifndef CONFIG_PM
/* no resume hook without power management support */
#define fam15h_power_resume NULL
#else
/* PCI resume hook: apply the running average range quirk again. */
static int fam15h_power_resume(struct pci_dev *pdev)
{
	tweak_runavg_range(pdev);

	return 0;
}
#endif
384 | |
385 | static int fam15h_power_init_data(struct pci_dev *f4, |
386 | struct fam15h_power_data *data) |
387 | { |
388 | u32 val; |
389 | u64 tmp; |
390 | int ret; |
391 | |
392 | pci_read_config_dword(dev: f4, REG_PROCESSOR_TDP, val: &val); |
393 | data->base_tdp = val >> 16; |
394 | tmp = val & 0xffff; |
395 | |
396 | pci_bus_read_config_dword(bus: f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5), |
397 | REG_TDP_LIMIT3, val: &val); |
398 | |
399 | data->tdp_to_watts = ((val & 0x3ff) << 6) | ((val >> 10) & 0x3f); |
400 | tmp *= data->tdp_to_watts; |
401 | |
402 | /* result not allowed to be >= 256W */ |
403 | if ((tmp >> 16) >= 256) |
404 | dev_warn(&f4->dev, |
405 | "Bogus value for ProcessorPwrWatts (processor_pwr_watts>=%u)\n" , |
406 | (unsigned int) (tmp >> 16)); |
407 | |
408 | /* convert to microWatt */ |
409 | data->processor_pwr_watts = (tmp * 15625) >> 10; |
410 | |
411 | ret = fam15h_power_init_attrs(pdev: f4, data); |
412 | if (ret) |
413 | return ret; |
414 | |
415 | |
416 | /* CPUID Fn8000_0007:EDX[12] indicates to support accumulated power */ |
417 | if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) |
418 | return 0; |
419 | |
420 | /* |
421 | * determine the ratio of the compute unit power accumulator |
422 | * sample period to the PTSC counter period by executing CPUID |
423 | * Fn8000_0007:ECX |
424 | */ |
425 | data->cpu_pwr_sample_ratio = cpuid_ecx(op: 0x80000007); |
426 | |
427 | if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, p: &tmp)) { |
428 | pr_err("Failed to read max compute unit power accumulator MSR\n" ); |
429 | return -ENODEV; |
430 | } |
431 | |
432 | data->max_cu_acc_power = tmp; |
433 | |
434 | /* |
435 | * Milliseconds are a reasonable interval for the measurement. |
436 | * But it shouldn't set too long here, because several seconds |
437 | * would cause the read function to hang. So set default |
438 | * interval as 10 ms. |
439 | */ |
440 | data->power_period = 10; |
441 | |
442 | return read_registers(data); |
443 | } |
444 | |
445 | static int fam15h_power_probe(struct pci_dev *pdev, |
446 | const struct pci_device_id *id) |
447 | { |
448 | struct fam15h_power_data *data; |
449 | struct device *dev = &pdev->dev; |
450 | struct device *hwmon_dev; |
451 | int ret; |
452 | |
453 | /* |
454 | * though we ignore every other northbridge, we still have to |
455 | * do the tweaking on _each_ node in MCM processors as the counters |
456 | * are working hand-in-hand |
457 | */ |
458 | tweak_runavg_range(pdev); |
459 | |
460 | if (!should_load_on_this_node(f4: pdev)) |
461 | return -ENODEV; |
462 | |
463 | data = devm_kzalloc(dev, size: sizeof(struct fam15h_power_data), GFP_KERNEL); |
464 | if (!data) |
465 | return -ENOMEM; |
466 | |
467 | ret = fam15h_power_init_data(f4: pdev, data); |
468 | if (ret) |
469 | return ret; |
470 | |
471 | data->pdev = pdev; |
472 | |
473 | data->groups[0] = &data->group; |
474 | |
475 | hwmon_dev = devm_hwmon_device_register_with_groups(dev, name: "fam15h_power" , |
476 | drvdata: data, |
477 | groups: &data->groups[0]); |
478 | return PTR_ERR_OR_ZERO(ptr: hwmon_dev); |
479 | } |
480 | |
481 | static const struct pci_device_id fam15h_power_id_table[] = { |
482 | { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, |
483 | { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, |
484 | { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, |
485 | { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M70H_NB_F4) }, |
486 | { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, |
487 | { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, |
488 | {} |
489 | }; |
490 | MODULE_DEVICE_TABLE(pci, fam15h_power_id_table); |
491 | |
492 | static struct pci_driver fam15h_power_driver = { |
493 | .name = "fam15h_power" , |
494 | .id_table = fam15h_power_id_table, |
495 | .probe = fam15h_power_probe, |
496 | .resume = fam15h_power_resume, |
497 | }; |
498 | |
499 | module_pci_driver(fam15h_power_driver); |
500 | |