// SPDX-License-Identifier: MIT
/*
 * Copyright © 2018 Intel Corporation
 */

#include <linux/crc32.h>

#include "gem/i915_gem_stolen.h"

#include "i915_memcpy.h"
#include "i915_selftest.h"
#include "intel_gpu_commands.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"

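/*
 * Check that a GPU reset does not scribble over stolen memory that we have
 * not claimed: CRC every page of the data stolen region before and after
 * the reset (with spinners keeping the engines busy), and flag any page
 * outside an allocated stolen node whose contents changed.
 */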
static int
__igt_reset_stolen(struct intel_gt *gt,
		   intel_engine_mask_t mask,
		   const char *msg)
{
	struct i915_ggtt *ggtt = gt->ggtt;
	const struct resource *dsm = &gt->i915->dsm.stolen;
	resource_size_t num_pages, page;
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	enum intel_engine_id id;
	struct igt_spinner spin;
	long max, count;
	void *tmp;
	u32 *crc;
	int err;

	if (!drm_mm_node_allocated(&ggtt->error_capture))
		return 0;

	num_pages = resource_size(dsm) >> PAGE_SHIFT;
	if (!num_pages)
		return 0;

	crc = kmalloc_array(num_pages, sizeof(u32), GFP_KERNEL);
	if (!crc)
		return -ENOMEM;

	tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!tmp) {
		err = -ENOMEM;
		goto err_crc;
	}

	igt_global_reset_lock(gt);
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = igt_spinner_init(&spin, gt);
	if (err)
		goto err_lock;

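	/* Keep each engine under test busy with a spinner across the reset. */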
	for_each_engine(engine, gt, id) {
		struct intel_context *ce;
		struct i915_request *rq;

		if (!(mask & engine->mask))
			continue;

		if (!intel_engine_can_store_dword(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto err_spin;
		}
		rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
		intel_context_put(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err_spin;
		}
		i915_request_add(rq);
	}

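	/*
	 * Record a CRC for every page of stolen memory, poisoning any page
	 * not claimed by a drm_mm node so that later modification is easy
	 * to detect.
	 */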
	for (page = 0; page < num_pages; page++) {
		dma_addr_t dma = (dma_addr_t)dsm->start + (page << PAGE_SHIFT);
		void __iomem *s;
		void *in;

		ggtt->vm.insert_page(&ggtt->vm, dma,
				     ggtt->error_capture.start,
				     i915_gem_get_pat_index(gt->i915,
							    I915_CACHE_NONE),
				     0);
		mb();

		s = io_mapping_map_wc(&ggtt->iomap,
				      ggtt->error_capture.start,
				      PAGE_SIZE);

		if (!__drm_mm_interval_first(&gt->i915->mm.stolen,
					     page << PAGE_SHIFT,
					     ((page + 1) << PAGE_SHIFT) - 1))
			memset_io(s, STACK_MAGIC, PAGE_SIZE);

		in = (void __force *)s;
		if (i915_memcpy_from_wc(tmp, in, PAGE_SIZE))
			in = tmp;
		crc[page] = crc32_le(0, in, PAGE_SIZE);

		io_mapping_unmap(s);
	}
	mb();
	ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE);

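	/* Perform the reset under test: full GT reset or individual engines. */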
	if (mask == ALL_ENGINES) {
		intel_gt_reset(gt, mask, NULL);
	} else {
		for_each_engine(engine, gt, id) {
			if (mask & engine->mask)
				intel_engine_reset(engine, NULL);
		}
	}

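	/*
	 * Re-read every page and compare against the recorded CRC; only
	 * pages outside any allocated stolen node count as clobbered.
	 */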
	max = -1;
	count = 0;
	for (page = 0; page < num_pages; page++) {
		dma_addr_t dma = (dma_addr_t)dsm->start + (page << PAGE_SHIFT);
		void __iomem *s;
		void *in;
		u32 x;

		ggtt->vm.insert_page(&ggtt->vm, dma,
				     ggtt->error_capture.start,
				     i915_gem_get_pat_index(gt->i915,
							    I915_CACHE_NONE),
				     0);
		mb();

		s = io_mapping_map_wc(&ggtt->iomap,
				      ggtt->error_capture.start,
				      PAGE_SIZE);

		in = (void __force *)s;
		if (i915_memcpy_from_wc(tmp, in, PAGE_SIZE))
			in = tmp;
		x = crc32_le(0, in, PAGE_SIZE);

		if (x != crc[page] &&
		    !__drm_mm_interval_first(&gt->i915->mm.stolen,
					     page << PAGE_SHIFT,
					     ((page + 1) << PAGE_SHIFT) - 1)) {
			pr_debug("unused stolen page %pa modified by GPU reset\n",
				 &page);
			if (count++ == 0)
				igt_hexdump(in, PAGE_SIZE);
			max = page;
		}

		io_mapping_unmap(s);
	}
	mb();
	ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE);

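	/*
	 * Report how much of stolen was written over; it only counts as a
	 * failure if pages at or above I915_GEM_STOLEN_BIAS were touched.
	 */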
	if (count > 0) {
		pr_info("%s reset clobbered %ld pages of stolen, last clobber at page %ld\n",
			msg, count, max);
	}
	if (max >= I915_GEM_STOLEN_BIAS >> PAGE_SHIFT) {
		pr_err("%s reset clobbered unreserved area [above %x] of stolen; may cause severe faults\n",
		       msg, I915_GEM_STOLEN_BIAS);
		err = -EINVAL;
	}

err_spin:
	igt_spinner_fini(&spin);

err_lock:
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
	igt_global_reset_unlock(gt);

	kfree(tmp);
err_crc:
	kfree(crc);
	return err;
}

static int igt_reset_device_stolen(void *arg)
{
	return __igt_reset_stolen(arg, ALL_ENGINES, "device");
}

static int igt_reset_engines_stolen(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		err = __igt_reset_stolen(gt, engine->mask, engine->name);
		if (err)
			return err;
	}

	return 0;
}

static int igt_global_reset(void *arg)
{
	struct intel_gt *gt = arg;
	unsigned int reset_count;
	intel_wakeref_t wakeref;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	igt_global_reset_lock(gt);
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	reset_count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, ALL_ENGINES, NULL);

	if (i915_reset_count(&gt->i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	return err;
}

static int igt_wedged_reset(void *arg)
{
	struct intel_gt *gt = arg;
	intel_wakeref_t wakeref;

	/* Check that we can recover a wedged device with a GPU reset */

	igt_global_reset_lock(gt);
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	intel_gt_set_wedged(gt);

	GEM_BUG_ON(!intel_gt_is_wedged(gt));
	intel_gt_reset(gt, ALL_ENGINES, NULL);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
	igt_global_reset_unlock(gt);

	return intel_gt_is_wedged(gt) ? -EIO : 0;
}

static int igt_atomic_reset(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	intel_wakeref_t wakeref;
	int err = 0;

	/* Check that the resets are usable from atomic context */

	wakeref = intel_gt_pm_get(gt);
	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

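	/* Attempt a full GT reset from inside each atomic phase in igt_atomic_phases. */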
	for (p = igt_atomic_phases; p->name; p++) {
		intel_engine_mask_t awake;

		GEM_TRACE("__intel_gt_reset under %s\n", p->name);

		awake = reset_prepare(gt);
		p->critical_section_begin();

		err = __intel_gt_reset(gt, ALL_ENGINES);

		p->critical_section_end();
		reset_finish(gt, awake);

		if (err) {
			pr_err("__intel_gt_reset failed under %s\n", p->name);
			break;
		}
	}

	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);

unlock:
	igt_global_reset_unlock(gt);
	intel_gt_pm_put(gt, wakeref);

	return err;
}

static int igt_atomic_engine_reset(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	int err = 0;

	/* Check that the resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	wakeref = intel_gt_pm_get(gt);
	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto out_unlock;

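	/*
	 * For each engine, suspend the submission tasklet and hold an engine
	 * wakeref while we exercise the engine-reset path from atomic context.
	 */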
	for_each_engine(engine, gt, id) {
		struct tasklet_struct *t = &engine->sched_engine->tasklet;

		if (t->func)
			tasklet_disable(t);
		intel_engine_pm_get(engine);

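		/*
		 * Reset the engine under each atomic phase; bottom halves are
		 * disabled around the reset except for the "softirq" phase,
		 * where they are already off.
		 */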
		for (p = igt_atomic_phases; p->name; p++) {
			GEM_TRACE("intel_engine_reset(%s) under %s\n",
				  engine->name, p->name);
			if (strcmp(p->name, "softirq"))
				local_bh_disable();

			p->critical_section_begin();
			err = __intel_engine_reset_bh(engine, NULL);
			p->critical_section_end();

			if (strcmp(p->name, "softirq"))
				local_bh_enable();

			if (err) {
				pr_err("intel_engine_reset(%s) failed under %s\n",
				       engine->name, p->name);
				break;
			}
		}

		intel_engine_pm_put(engine);
		if (t->func) {
			tasklet_enable(t);
			tasklet_hi_schedule(t);
		}
		if (err)
			break;
	}

	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);

out_unlock:
	igt_global_reset_unlock(gt);
	intel_gt_pm_put(gt, wakeref);

	return err;
}

int intel_reset_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_reset_device_stolen),
		SUBTEST(igt_reset_engines_stolen),
		SUBTEST(igt_wedged_reset),
		SUBTEST(igt_atomic_reset),
		SUBTEST(igt_atomic_engine_reset),
	};
	struct intel_gt *gt = to_gt(i915);

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	return intel_gt_live_subtests(tests, gt);
}