// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2022 Intel Corporation. */
3
4#include <linux/cpu.h>
5#include <linux/delay.h>
6#include <linux/fs.h>
7#include <linux/nmi.h>
8#include <linux/slab.h>
9#include <linux/stop_machine.h>
10
11#include "ifs.h"
12
/*
 * Note: all code and data in this file are protected by
 * ifs_sem. On HT systems all threads on a core will
 * execute together, but only the first thread on the
 * core will update results of the test.
 */
19
20#define CREATE_TRACE_POINTS
21#include <trace/events/intel_ifs.h>
22
/* Max retries on the same chunk before the scan loop gives up */
#define MAX_IFS_RETRIES  5
25
26struct run_params {
27 struct ifs_data *ifsd;
28 union ifs_scan *activate;
29 union ifs_status status;
30};
31
/*
 * Number of TSC cycles that a logical CPU will wait for the other
 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN).
 */
#define IFS_THREAD_WAIT		100000
37
/*
 * Error codes found in the error_code field of the scan status.
 * The values are contiguous so they can index scan_test_status[] directly.
 */
enum ifs_status_err_code {
	IFS_NO_ERROR				= 0,
	IFS_OTHER_THREAD_COULD_NOT_JOIN		= 1,
	IFS_INTERRUPTED_BEFORE_RENDEZVOUS	= 2,
	IFS_POWER_MGMT_INADEQUATE_FOR_SCAN	= 3,
	IFS_INVALID_CHUNK_RANGE			= 4,
	IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS	= 5,
	IFS_CORE_NOT_CAPABLE_CURRENTLY		= 6,
	IFS_UNASSIGNED_ERROR_CODE		= 7,
	IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT	= 8,
	IFS_INTERRUPTED_DURING_EXECUTION	= 9,
	IFS_UNASSIGNED_ERROR_CODE_0xA		= 0xA,
	IFS_CORRUPTED_CHUNK			= 0xB,
};
52
53static const char * const scan_test_status[] = {
54 [IFS_NO_ERROR] = "SCAN no error",
55 [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
56 [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
57 [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
58 "Core Abort SCAN Response due to power management condition.",
59 [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
60 [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
61 [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
62 [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
63 [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
64 "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
65 [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
66 [IFS_UNASSIGNED_ERROR_CODE_0xA] = "Unassigned error code 0xA",
67 [IFS_CORRUPTED_CHUNK] = "Scan operation aborted due to corrupted image. Try reloading",
68};
69
70static void message_not_tested(struct device *dev, int cpu, union ifs_status status)
71{
72 if (status.error_code < ARRAY_SIZE(scan_test_status)) {
73 dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n",
74 cpumask_pr_args(cpu_smt_mask(cpu)),
75 scan_test_status[status.error_code]);
76 } else if (status.error_code == IFS_SW_TIMEOUT) {
77 dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n",
78 cpumask_pr_args(cpu_smt_mask(cpu)));
79 } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) {
80 dev_info(dev, "CPU(s) %*pbl: %s\n",
81 cpumask_pr_args(cpu_smt_mask(cpu)),
82 "Not all scan chunks were executed. Maximum forward progress retries exceeded");
83 } else {
84 dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n",
85 cpumask_pr_args(cpu_smt_mask(cpu)), status.data);
86 }
87}
88
89static void message_fail(struct device *dev, int cpu, union ifs_status status)
90{
91 struct ifs_data *ifsd = ifs_get_data(dev);
92
93 /*
94 * control_error is set when the microcode runs into a problem
95 * loading the image from the reserved BIOS memory, or it has
96 * been corrupted. Reloading the image may fix this issue.
97 */
98 if (status.control_error) {
99 dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n",
100 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
101 }
102
103 /*
104 * signature_error is set when the output from the scan chains does not
105 * match the expected signature. This might be a transient problem (e.g.
106 * due to a bit flip from an alpha particle or neutron). If the problem
107 * repeats on a subsequent test, then it indicates an actual problem in
108 * the core being tested.
109 */
110 if (status.signature_error) {
111 dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n",
112 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
113 }
114}
115
116static bool can_restart(union ifs_status status)
117{
118 enum ifs_status_err_code err_code = status.error_code;
119
120 /* Signature for chunk is bad, or scan test failed */
121 if (status.signature_error || status.control_error)
122 return false;
123
124 switch (err_code) {
125 case IFS_NO_ERROR:
126 case IFS_OTHER_THREAD_COULD_NOT_JOIN:
127 case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
128 case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
129 case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
130 case IFS_INTERRUPTED_DURING_EXECUTION:
131 return true;
132 case IFS_INVALID_CHUNK_RANGE:
133 case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS:
134 case IFS_CORE_NOT_CAPABLE_CURRENTLY:
135 case IFS_UNASSIGNED_ERROR_CODE:
136 case IFS_UNASSIGNED_ERROR_CODE_0xA:
137 case IFS_CORRUPTED_CHUNK:
138 break;
139 }
140 return false;
141}
142
143#define SPINUNIT 100 /* 100 nsec */
144static atomic_t array_cpus_in;
145static atomic_t scan_cpus_in;
146
147/*
148 * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus()
149 */
150static void wait_for_sibling_cpu(atomic_t *t, long long timeout)
151{
152 int cpu = smp_processor_id();
153 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
154 int all_cpus = cpumask_weight(srcp: smt_mask);
155
156 atomic_inc(v: t);
157 while (atomic_read(v: t) < all_cpus) {
158 if (timeout < SPINUNIT)
159 return;
160 ndelay(SPINUNIT);
161 timeout -= SPINUNIT;
162 touch_nmi_watchdog();
163 }
164}
165
166/*
167 * Execute the scan. Called "simultaneously" on all threads of a core
168 * at high priority using the stop_cpus mechanism.
169 */
170static int doscan(void *data)
171{
172 int cpu = smp_processor_id(), start, stop;
173 struct run_params *params = data;
174 union ifs_status status;
175 struct ifs_data *ifsd;
176 int first;
177
178 ifsd = params->ifsd;
179
180 if (ifsd->generation) {
181 start = params->activate->gen2.start;
182 stop = params->activate->gen2.stop;
183 } else {
184 start = params->activate->gen0.start;
185 stop = params->activate->gen0.stop;
186 }
187
188 /* Only the first logical CPU on a core reports result */
189 first = cpumask_first(srcp: cpu_smt_mask(cpu));
190
191 wait_for_sibling_cpu(t: &scan_cpus_in, NSEC_PER_SEC);
192
193 /*
194 * This WRMSR will wait for other HT threads to also write
195 * to this MSR (at most for activate.delay cycles). Then it
196 * starts scan of each requested chunk. The core scan happens
197 * during the "execution" of the WRMSR. This instruction can
198 * take up to 200 milliseconds (in the case where all chunks
199 * are processed in a single pass) before it retires.
200 */
201 wrmsrl(MSR_ACTIVATE_SCAN, val: params->activate->data);
202 rdmsrl(MSR_SCAN_STATUS, status.data);
203
204 trace_ifs_status(batch: ifsd->cur_batch, start, stop, status: status.data);
205
206 /* Pass back the result of the scan */
207 if (cpu == first)
208 params->status = status;
209
210 return 0;
211}
212
213/*
214 * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN
215 * on all threads of the core to be tested. Loop if necessary to complete
216 * run of all chunks. Include some defensive tests to make sure forward
217 * progress is made, and that the whole test completes in a reasonable time.
218 */
219static void ifs_test_core(int cpu, struct device *dev)
220{
221 union ifs_scan activate;
222 union ifs_status status;
223 unsigned long timeout;
224 struct ifs_data *ifsd;
225 int to_start, to_stop;
226 int status_chunk;
227 struct run_params params;
228 int retries;
229
230 ifsd = ifs_get_data(dev);
231
232 activate.gen0.rsvd = 0;
233 activate.delay = IFS_THREAD_WAIT;
234 activate.sigmce = 0;
235 to_start = 0;
236 to_stop = ifsd->valid_chunks - 1;
237
238 params.ifsd = ifs_get_data(dev);
239
240 if (ifsd->generation) {
241 activate.gen2.start = to_start;
242 activate.gen2.stop = to_stop;
243 } else {
244 activate.gen0.start = to_start;
245 activate.gen0.stop = to_stop;
246 }
247
248 timeout = jiffies + HZ / 2;
249 retries = MAX_IFS_RETRIES;
250
251 while (to_start <= to_stop) {
252 if (time_after(jiffies, timeout)) {
253 status.error_code = IFS_SW_TIMEOUT;
254 break;
255 }
256
257 params.activate = &activate;
258 atomic_set(v: &scan_cpus_in, i: 0);
259 stop_core_cpuslocked(cpu, fn: doscan, data: &params);
260
261 status = params.status;
262
263 /* Some cases can be retried, give up for others */
264 if (!can_restart(status))
265 break;
266
267 status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num;
268 if (status_chunk == to_start) {
269 /* Check for forward progress */
270 if (--retries == 0) {
271 if (status.error_code == IFS_NO_ERROR)
272 status.error_code = IFS_SW_PARTIAL_COMPLETION;
273 break;
274 }
275 } else {
276 retries = MAX_IFS_RETRIES;
277 if (ifsd->generation)
278 activate.gen2.start = status_chunk;
279 else
280 activate.gen0.start = status_chunk;
281 to_start = status_chunk;
282 }
283 }
284
285 /* Update status for this core */
286 ifsd->scan_details = status.data;
287
288 if (status.control_error || status.signature_error) {
289 ifsd->status = SCAN_TEST_FAIL;
290 message_fail(dev, cpu, status);
291 } else if (status.error_code) {
292 ifsd->status = SCAN_NOT_TESTED;
293 message_not_tested(dev, cpu, status);
294 } else {
295 ifsd->status = SCAN_TEST_PASS;
296 }
297}
298
299static int do_array_test(void *data)
300{
301 union ifs_array *command = data;
302 int cpu = smp_processor_id();
303 int first;
304
305 wait_for_sibling_cpu(t: &array_cpus_in, NSEC_PER_SEC);
306
307 /*
308 * Only one logical CPU on a core needs to trigger the Array test via MSR write.
309 */
310 first = cpumask_first(srcp: cpu_smt_mask(cpu));
311
312 if (cpu == first) {
313 wrmsrl(MSR_ARRAY_BIST, val: command->data);
314 /* Pass back the result of the test */
315 rdmsrl(MSR_ARRAY_BIST, command->data);
316 }
317
318 return 0;
319}
320
321static void ifs_array_test_core(int cpu, struct device *dev)
322{
323 union ifs_array command = {};
324 bool timed_out = false;
325 struct ifs_data *ifsd;
326 unsigned long timeout;
327
328 ifsd = ifs_get_data(dev);
329
330 command.array_bitmask = ~0U;
331 timeout = jiffies + HZ / 2;
332
333 do {
334 if (time_after(jiffies, timeout)) {
335 timed_out = true;
336 break;
337 }
338 atomic_set(v: &array_cpus_in, i: 0);
339 stop_core_cpuslocked(cpu, fn: do_array_test, data: &command);
340
341 if (command.ctrl_result)
342 break;
343 } while (command.array_bitmask);
344
345 ifsd->scan_details = command.data;
346
347 if (command.ctrl_result)
348 ifsd->status = SCAN_TEST_FAIL;
349 else if (timed_out || command.array_bitmask)
350 ifsd->status = SCAN_NOT_TESTED;
351 else
352 ifsd->status = SCAN_TEST_PASS;
353}
354
/* Value written to MSR_ARRAY_TRIGGER by do_array_test_gen1() */
#define ARRAY_GEN1_TEST_ALL_ARRAYS	0x0ULL
/* Bit tested in the value read from MSR_ARRAY_STATUS to detect a failure */
#define ARRAY_GEN1_STATUS_FAIL		0x1ULL
357
358static int do_array_test_gen1(void *status)
359{
360 int cpu = smp_processor_id();
361 int first;
362
363 first = cpumask_first(srcp: cpu_smt_mask(cpu));
364
365 if (cpu == first) {
366 wrmsrl(MSR_ARRAY_TRIGGER, ARRAY_GEN1_TEST_ALL_ARRAYS);
367 rdmsrl(MSR_ARRAY_STATUS, *((u64 *)status));
368 }
369
370 return 0;
371}
372
373static void ifs_array_test_gen1(int cpu, struct device *dev)
374{
375 struct ifs_data *ifsd = ifs_get_data(dev);
376 u64 status = 0;
377
378 stop_core_cpuslocked(cpu, fn: do_array_test_gen1, data: &status);
379 ifsd->scan_details = status;
380
381 if (status & ARRAY_GEN1_STATUS_FAIL)
382 ifsd->status = SCAN_TEST_FAIL;
383 else
384 ifsd->status = SCAN_TEST_PASS;
385}
386
387/*
388 * Initiate per core test. It wakes up work queue threads on the target cpu and
389 * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and
390 * wait for all sibling threads to finish the scan test.
391 */
392int do_core_test(int cpu, struct device *dev)
393{
394 const struct ifs_test_caps *test = ifs_get_test_caps(dev);
395 struct ifs_data *ifsd = ifs_get_data(dev);
396 int ret = 0;
397
398 /* Prevent CPUs from being taken offline during the scan test */
399 cpus_read_lock();
400
401 if (!cpu_online(cpu)) {
402 dev_info(dev, "cannot test on the offline cpu %d\n", cpu);
403 ret = -EINVAL;
404 goto out;
405 }
406
407 switch (test->test_num) {
408 case IFS_TYPE_SAF:
409 if (!ifsd->loaded)
410 ret = -EPERM;
411 else
412 ifs_test_core(cpu, dev);
413 break;
414 case IFS_TYPE_ARRAY_BIST:
415 if (ifsd->array_gen == ARRAY_GEN0)
416 ifs_array_test_core(cpu, dev);
417 else
418 ifs_array_test_gen1(cpu, dev);
419 break;
420 default:
421 ret = -EINVAL;
422 }
423out:
424 cpus_read_unlock();
425 return ret;
426}
427

/* Source: linux/drivers/platform/x86/intel/ifs/runtest.c */