1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* Copyright(c) 2022 Intel Corporation. */ |
3 | |
4 | #include <linux/cpu.h> |
5 | #include <linux/delay.h> |
6 | #include <linux/fs.h> |
7 | #include <linux/nmi.h> |
8 | #include <linux/slab.h> |
9 | #include <linux/stop_machine.h> |
10 | |
11 | #include "ifs.h" |
12 | |
/*
 * Note: all code and data in this file are protected by ifs_sem.
 * On HT systems all threads on a core execute the test together,
 * but only the first thread on the core updates the test results.
 */
19 | |
20 | #define CREATE_TRACE_POINTS |
21 | #include <trace/events/intel_ifs.h> |
22 | |
23 | /* Max retries on the same chunk */ |
24 | #define MAX_IFS_RETRIES 5 |
25 | |
26 | struct run_params { |
27 | struct ifs_data *ifsd; |
28 | union ifs_scan *activate; |
29 | union ifs_status status; |
30 | }; |
31 | |
32 | /* |
33 | * Number of TSC cycles that a logical CPU will wait for the other |
34 | * logical CPU on the core in the WRMSR(ACTIVATE_SCAN). |
35 | */ |
36 | #define IFS_THREAD_WAIT 100000 |
37 | |
38 | enum ifs_status_err_code { |
39 | IFS_NO_ERROR = 0, |
40 | IFS_OTHER_THREAD_COULD_NOT_JOIN = 1, |
41 | IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2, |
42 | IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3, |
43 | IFS_INVALID_CHUNK_RANGE = 4, |
44 | IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5, |
45 | IFS_CORE_NOT_CAPABLE_CURRENTLY = 6, |
46 | IFS_UNASSIGNED_ERROR_CODE = 7, |
47 | IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8, |
48 | IFS_INTERRUPTED_DURING_EXECUTION = 9, |
49 | IFS_UNASSIGNED_ERROR_CODE_0xA = 0xA, |
50 | IFS_CORRUPTED_CHUNK = 0xB, |
51 | }; |
52 | |
53 | static const char * const scan_test_status[] = { |
54 | [IFS_NO_ERROR] = "SCAN no error" , |
55 | [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join." , |
56 | [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination." , |
57 | [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] = |
58 | "Core Abort SCAN Response due to power management condition." , |
59 | [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range" , |
60 | [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1." , |
61 | [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently" , |
62 | [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7" , |
63 | [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] = |
64 | "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently" , |
65 | [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start" , |
66 | [IFS_UNASSIGNED_ERROR_CODE_0xA] = "Unassigned error code 0xA" , |
67 | [IFS_CORRUPTED_CHUNK] = "Scan operation aborted due to corrupted image. Try reloading" , |
68 | }; |
69 | |
70 | static void message_not_tested(struct device *dev, int cpu, union ifs_status status) |
71 | { |
72 | if (status.error_code < ARRAY_SIZE(scan_test_status)) { |
73 | dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n" , |
74 | cpumask_pr_args(cpu_smt_mask(cpu)), |
75 | scan_test_status[status.error_code]); |
76 | } else if (status.error_code == IFS_SW_TIMEOUT) { |
77 | dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n" , |
78 | cpumask_pr_args(cpu_smt_mask(cpu))); |
79 | } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) { |
80 | dev_info(dev, "CPU(s) %*pbl: %s\n" , |
81 | cpumask_pr_args(cpu_smt_mask(cpu)), |
82 | "Not all scan chunks were executed. Maximum forward progress retries exceeded" ); |
83 | } else { |
84 | dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n" , |
85 | cpumask_pr_args(cpu_smt_mask(cpu)), status.data); |
86 | } |
87 | } |
88 | |
89 | static void message_fail(struct device *dev, int cpu, union ifs_status status) |
90 | { |
91 | struct ifs_data *ifsd = ifs_get_data(dev); |
92 | |
93 | /* |
94 | * control_error is set when the microcode runs into a problem |
95 | * loading the image from the reserved BIOS memory, or it has |
96 | * been corrupted. Reloading the image may fix this issue. |
97 | */ |
98 | if (status.control_error) { |
99 | dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n" , |
100 | cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version); |
101 | } |
102 | |
103 | /* |
104 | * signature_error is set when the output from the scan chains does not |
105 | * match the expected signature. This might be a transient problem (e.g. |
106 | * due to a bit flip from an alpha particle or neutron). If the problem |
107 | * repeats on a subsequent test, then it indicates an actual problem in |
108 | * the core being tested. |
109 | */ |
110 | if (status.signature_error) { |
111 | dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n" , |
112 | cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version); |
113 | } |
114 | } |
115 | |
116 | static bool can_restart(union ifs_status status) |
117 | { |
118 | enum ifs_status_err_code err_code = status.error_code; |
119 | |
120 | /* Signature for chunk is bad, or scan test failed */ |
121 | if (status.signature_error || status.control_error) |
122 | return false; |
123 | |
124 | switch (err_code) { |
125 | case IFS_NO_ERROR: |
126 | case IFS_OTHER_THREAD_COULD_NOT_JOIN: |
127 | case IFS_INTERRUPTED_BEFORE_RENDEZVOUS: |
128 | case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN: |
129 | case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT: |
130 | case IFS_INTERRUPTED_DURING_EXECUTION: |
131 | return true; |
132 | case IFS_INVALID_CHUNK_RANGE: |
133 | case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS: |
134 | case IFS_CORE_NOT_CAPABLE_CURRENTLY: |
135 | case IFS_UNASSIGNED_ERROR_CODE: |
136 | case IFS_UNASSIGNED_ERROR_CODE_0xA: |
137 | case IFS_CORRUPTED_CHUNK: |
138 | break; |
139 | } |
140 | return false; |
141 | } |
142 | |
143 | #define SPINUNIT 100 /* 100 nsec */ |
144 | static atomic_t array_cpus_in; |
145 | static atomic_t scan_cpus_in; |
146 | |
147 | /* |
148 | * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus() |
149 | */ |
150 | static void wait_for_sibling_cpu(atomic_t *t, long long timeout) |
151 | { |
152 | int cpu = smp_processor_id(); |
153 | const struct cpumask *smt_mask = cpu_smt_mask(cpu); |
154 | int all_cpus = cpumask_weight(srcp: smt_mask); |
155 | |
156 | atomic_inc(v: t); |
157 | while (atomic_read(v: t) < all_cpus) { |
158 | if (timeout < SPINUNIT) |
159 | return; |
160 | ndelay(SPINUNIT); |
161 | timeout -= SPINUNIT; |
162 | touch_nmi_watchdog(); |
163 | } |
164 | } |
165 | |
166 | /* |
167 | * Execute the scan. Called "simultaneously" on all threads of a core |
168 | * at high priority using the stop_cpus mechanism. |
169 | */ |
170 | static int doscan(void *data) |
171 | { |
172 | int cpu = smp_processor_id(), start, stop; |
173 | struct run_params *params = data; |
174 | union ifs_status status; |
175 | struct ifs_data *ifsd; |
176 | int first; |
177 | |
178 | ifsd = params->ifsd; |
179 | |
180 | if (ifsd->generation) { |
181 | start = params->activate->gen2.start; |
182 | stop = params->activate->gen2.stop; |
183 | } else { |
184 | start = params->activate->gen0.start; |
185 | stop = params->activate->gen0.stop; |
186 | } |
187 | |
188 | /* Only the first logical CPU on a core reports result */ |
189 | first = cpumask_first(srcp: cpu_smt_mask(cpu)); |
190 | |
191 | wait_for_sibling_cpu(t: &scan_cpus_in, NSEC_PER_SEC); |
192 | |
193 | /* |
194 | * This WRMSR will wait for other HT threads to also write |
195 | * to this MSR (at most for activate.delay cycles). Then it |
196 | * starts scan of each requested chunk. The core scan happens |
197 | * during the "execution" of the WRMSR. This instruction can |
198 | * take up to 200 milliseconds (in the case where all chunks |
199 | * are processed in a single pass) before it retires. |
200 | */ |
201 | wrmsrl(MSR_ACTIVATE_SCAN, val: params->activate->data); |
202 | rdmsrl(MSR_SCAN_STATUS, status.data); |
203 | |
204 | trace_ifs_status(batch: ifsd->cur_batch, start, stop, status: status.data); |
205 | |
206 | /* Pass back the result of the scan */ |
207 | if (cpu == first) |
208 | params->status = status; |
209 | |
210 | return 0; |
211 | } |
212 | |
213 | /* |
214 | * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN |
215 | * on all threads of the core to be tested. Loop if necessary to complete |
216 | * run of all chunks. Include some defensive tests to make sure forward |
217 | * progress is made, and that the whole test completes in a reasonable time. |
218 | */ |
219 | static void ifs_test_core(int cpu, struct device *dev) |
220 | { |
221 | union ifs_scan activate; |
222 | union ifs_status status; |
223 | unsigned long timeout; |
224 | struct ifs_data *ifsd; |
225 | int to_start, to_stop; |
226 | int status_chunk; |
227 | struct run_params params; |
228 | int retries; |
229 | |
230 | ifsd = ifs_get_data(dev); |
231 | |
232 | activate.gen0.rsvd = 0; |
233 | activate.delay = IFS_THREAD_WAIT; |
234 | activate.sigmce = 0; |
235 | to_start = 0; |
236 | to_stop = ifsd->valid_chunks - 1; |
237 | |
238 | params.ifsd = ifs_get_data(dev); |
239 | |
240 | if (ifsd->generation) { |
241 | activate.gen2.start = to_start; |
242 | activate.gen2.stop = to_stop; |
243 | } else { |
244 | activate.gen0.start = to_start; |
245 | activate.gen0.stop = to_stop; |
246 | } |
247 | |
248 | timeout = jiffies + HZ / 2; |
249 | retries = MAX_IFS_RETRIES; |
250 | |
251 | while (to_start <= to_stop) { |
252 | if (time_after(jiffies, timeout)) { |
253 | status.error_code = IFS_SW_TIMEOUT; |
254 | break; |
255 | } |
256 | |
257 | params.activate = &activate; |
258 | atomic_set(v: &scan_cpus_in, i: 0); |
259 | stop_core_cpuslocked(cpu, fn: doscan, data: ¶ms); |
260 | |
261 | status = params.status; |
262 | |
263 | /* Some cases can be retried, give up for others */ |
264 | if (!can_restart(status)) |
265 | break; |
266 | |
267 | status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num; |
268 | if (status_chunk == to_start) { |
269 | /* Check for forward progress */ |
270 | if (--retries == 0) { |
271 | if (status.error_code == IFS_NO_ERROR) |
272 | status.error_code = IFS_SW_PARTIAL_COMPLETION; |
273 | break; |
274 | } |
275 | } else { |
276 | retries = MAX_IFS_RETRIES; |
277 | if (ifsd->generation) |
278 | activate.gen2.start = status_chunk; |
279 | else |
280 | activate.gen0.start = status_chunk; |
281 | to_start = status_chunk; |
282 | } |
283 | } |
284 | |
285 | /* Update status for this core */ |
286 | ifsd->scan_details = status.data; |
287 | |
288 | if (status.control_error || status.signature_error) { |
289 | ifsd->status = SCAN_TEST_FAIL; |
290 | message_fail(dev, cpu, status); |
291 | } else if (status.error_code) { |
292 | ifsd->status = SCAN_NOT_TESTED; |
293 | message_not_tested(dev, cpu, status); |
294 | } else { |
295 | ifsd->status = SCAN_TEST_PASS; |
296 | } |
297 | } |
298 | |
299 | static int do_array_test(void *data) |
300 | { |
301 | union ifs_array *command = data; |
302 | int cpu = smp_processor_id(); |
303 | int first; |
304 | |
305 | wait_for_sibling_cpu(t: &array_cpus_in, NSEC_PER_SEC); |
306 | |
307 | /* |
308 | * Only one logical CPU on a core needs to trigger the Array test via MSR write. |
309 | */ |
310 | first = cpumask_first(srcp: cpu_smt_mask(cpu)); |
311 | |
312 | if (cpu == first) { |
313 | wrmsrl(MSR_ARRAY_BIST, val: command->data); |
314 | /* Pass back the result of the test */ |
315 | rdmsrl(MSR_ARRAY_BIST, command->data); |
316 | } |
317 | |
318 | return 0; |
319 | } |
320 | |
321 | static void ifs_array_test_core(int cpu, struct device *dev) |
322 | { |
323 | union ifs_array command = {}; |
324 | bool timed_out = false; |
325 | struct ifs_data *ifsd; |
326 | unsigned long timeout; |
327 | |
328 | ifsd = ifs_get_data(dev); |
329 | |
330 | command.array_bitmask = ~0U; |
331 | timeout = jiffies + HZ / 2; |
332 | |
333 | do { |
334 | if (time_after(jiffies, timeout)) { |
335 | timed_out = true; |
336 | break; |
337 | } |
338 | atomic_set(v: &array_cpus_in, i: 0); |
339 | stop_core_cpuslocked(cpu, fn: do_array_test, data: &command); |
340 | |
341 | if (command.ctrl_result) |
342 | break; |
343 | } while (command.array_bitmask); |
344 | |
345 | ifsd->scan_details = command.data; |
346 | |
347 | if (command.ctrl_result) |
348 | ifsd->status = SCAN_TEST_FAIL; |
349 | else if (timed_out || command.array_bitmask) |
350 | ifsd->status = SCAN_NOT_TESTED; |
351 | else |
352 | ifsd->status = SCAN_TEST_PASS; |
353 | } |
354 | |
355 | #define ARRAY_GEN1_TEST_ALL_ARRAYS 0x0ULL |
356 | #define ARRAY_GEN1_STATUS_FAIL 0x1ULL |
357 | |
358 | static int do_array_test_gen1(void *status) |
359 | { |
360 | int cpu = smp_processor_id(); |
361 | int first; |
362 | |
363 | first = cpumask_first(srcp: cpu_smt_mask(cpu)); |
364 | |
365 | if (cpu == first) { |
366 | wrmsrl(MSR_ARRAY_TRIGGER, ARRAY_GEN1_TEST_ALL_ARRAYS); |
367 | rdmsrl(MSR_ARRAY_STATUS, *((u64 *)status)); |
368 | } |
369 | |
370 | return 0; |
371 | } |
372 | |
373 | static void ifs_array_test_gen1(int cpu, struct device *dev) |
374 | { |
375 | struct ifs_data *ifsd = ifs_get_data(dev); |
376 | u64 status = 0; |
377 | |
378 | stop_core_cpuslocked(cpu, fn: do_array_test_gen1, data: &status); |
379 | ifsd->scan_details = status; |
380 | |
381 | if (status & ARRAY_GEN1_STATUS_FAIL) |
382 | ifsd->status = SCAN_TEST_FAIL; |
383 | else |
384 | ifsd->status = SCAN_TEST_PASS; |
385 | } |
386 | |
387 | /* |
388 | * Initiate per core test. It wakes up work queue threads on the target cpu and |
389 | * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and |
390 | * wait for all sibling threads to finish the scan test. |
391 | */ |
392 | int do_core_test(int cpu, struct device *dev) |
393 | { |
394 | const struct ifs_test_caps *test = ifs_get_test_caps(dev); |
395 | struct ifs_data *ifsd = ifs_get_data(dev); |
396 | int ret = 0; |
397 | |
398 | /* Prevent CPUs from being taken offline during the scan test */ |
399 | cpus_read_lock(); |
400 | |
401 | if (!cpu_online(cpu)) { |
402 | dev_info(dev, "cannot test on the offline cpu %d\n" , cpu); |
403 | ret = -EINVAL; |
404 | goto out; |
405 | } |
406 | |
407 | switch (test->test_num) { |
408 | case IFS_TYPE_SAF: |
409 | if (!ifsd->loaded) |
410 | ret = -EPERM; |
411 | else |
412 | ifs_test_core(cpu, dev); |
413 | break; |
414 | case IFS_TYPE_ARRAY_BIST: |
415 | if (ifsd->array_gen == ARRAY_GEN0) |
416 | ifs_array_test_core(cpu, dev); |
417 | else |
418 | ifs_array_test_gen1(cpu, dev); |
419 | break; |
420 | default: |
421 | ret = -EINVAL; |
422 | } |
423 | out: |
424 | cpus_read_unlock(); |
425 | return ret; |
426 | } |
427 | |