1 | // SPDX-License-Identifier: MIT |
2 | /* |
3 | * Copyright © 2019 Intel Corporation |
4 | */ |
5 | |
6 | #include "i915_drv.h" |
7 | #include "i915_request.h" |
8 | |
9 | #include "intel_context.h" |
10 | #include "intel_engine_heartbeat.h" |
11 | #include "intel_engine_pm.h" |
12 | #include "intel_engine.h" |
13 | #include "intel_gt.h" |
14 | #include "intel_reset.h" |
15 | |
16 | /* |
17 | * While the engine is active, we send a periodic pulse along the engine |
18 | * to check on its health and to flush any idle-barriers. If that request |
19 | * is stuck, and we fail to preempt it, we declare the engine hung and |
20 | * issue a reset -- in the hope that restores progress. |
21 | */ |
22 | |
23 | static bool next_heartbeat(struct intel_engine_cs *engine) |
24 | { |
25 | struct i915_request *rq; |
26 | long delay; |
27 | |
28 | delay = READ_ONCE(engine->props.heartbeat_interval_ms); |
29 | |
30 | rq = engine->heartbeat.systole; |
31 | |
32 | /* |
33 | * FIXME: The final period extension is disabled if the period has been |
34 | * modified from the default. This is to prevent issues with certain |
35 | * selftests which override the value and expect specific behaviour. |
36 | * Once the selftests have been updated to either cope with variable |
37 | * heartbeat periods (or to override the pre-emption timeout as well, |
38 | * or just to add a selftest specific override of the extension), the |
39 | * generic override can be removed. |
40 | */ |
41 | if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER && |
42 | delay == engine->defaults.heartbeat_interval_ms) { |
43 | long longer; |
44 | |
45 | /* |
46 | * The final try is at the highest priority possible. Up until now |
47 | * a pre-emption might not even have been attempted. So make sure |
48 | * this last attempt allows enough time for a pre-emption to occur. |
49 | */ |
50 | longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2; |
51 | longer = intel_clamp_heartbeat_interval_ms(engine, value: longer); |
52 | if (longer > delay) |
53 | delay = longer; |
54 | } |
55 | |
56 | if (!delay) |
57 | return false; |
58 | |
59 | delay = msecs_to_jiffies_timeout(m: delay); |
60 | if (delay >= HZ) |
61 | delay = round_jiffies_up_relative(j: delay); |
62 | mod_delayed_work(wq: system_highpri_wq, dwork: &engine->heartbeat.work, delay: delay + 1); |
63 | |
64 | return true; |
65 | } |
66 | |
/*
 * Create a heartbeat request on the given context.
 *
 * The context must be entered around request creation so that the engine
 * is tracked as active for the duration; the request itself keeps the
 * context alive once created. Returns an ERR_PTR on allocation failure.
 */
static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_request *rq;

	intel_context_enter(ce);
	rq = __i915_request_create(ce, gfp);
	intel_context_exit(ce);

	return rq;
}
78 | |
/*
 * Attach idle barriers to the pulse and record it as the current systole.
 *
 * Bumping wakeref_serial past the engine serial marks the engine as having
 * been "checked" so the next heartbeat tick can tell whether new work
 * arrived in between. The systole reference is only kept when heartbeats
 * are enabled on this engine.
 */
static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
	i915_request_add_active_barriers(rq);
	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
		engine->heartbeat.systole = i915_request_get(rq);
}
86 | |
87 | static void heartbeat_commit(struct i915_request *rq, |
88 | const struct i915_sched_attr *attr) |
89 | { |
90 | idle_pulse(engine: rq->engine, rq); |
91 | |
92 | __i915_request_commit(request: rq); |
93 | __i915_request_queue(rq, attr); |
94 | } |
95 | |
96 | static void show_heartbeat(const struct i915_request *rq, |
97 | struct intel_engine_cs *engine) |
98 | { |
99 | struct drm_printer p = |
100 | drm_dbg_printer(drm: &engine->i915->drm, category: DRM_UT_DRIVER, prefix: "heartbeat" ); |
101 | |
102 | if (!rq) { |
103 | intel_engine_dump(engine, m: &p, |
104 | header: "%s heartbeat not ticking\n" , |
105 | engine->name); |
106 | } else { |
107 | intel_engine_dump(engine, m: &p, |
108 | header: "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n" , |
109 | engine->name, |
110 | rq->fence.context, |
111 | rq->fence.seqno, |
112 | rq->sched.attr.priority); |
113 | } |
114 | } |
115 | |
116 | static void |
117 | reset_engine(struct intel_engine_cs *engine, struct i915_request *rq) |
118 | { |
119 | if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) |
120 | show_heartbeat(rq, engine); |
121 | |
122 | if (intel_engine_uses_guc(engine)) |
123 | /* |
124 | * GuC itself is toast or GuC's hang detection |
125 | * is disabled. Either way, need to find the |
126 | * hang culprit manually. |
127 | */ |
128 | intel_guc_find_hung_context(engine); |
129 | |
130 | intel_gt_handle_error(gt: engine->gt, engine_mask: engine->mask, |
131 | I915_ERROR_CAPTURE, |
132 | fmt: "stopped heartbeat on %s" , |
133 | engine->name); |
134 | } |
135 | |
136 | static void heartbeat(struct work_struct *wrk) |
137 | { |
138 | struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN }; |
139 | struct intel_engine_cs *engine = |
140 | container_of(wrk, typeof(*engine), heartbeat.work.work); |
141 | struct intel_context *ce = engine->kernel_context; |
142 | struct i915_request *rq; |
143 | unsigned long serial; |
144 | |
145 | /* Just in case everything has gone horribly wrong, give it a kick */ |
146 | intel_engine_flush_submission(engine); |
147 | |
148 | rq = engine->heartbeat.systole; |
149 | if (rq && i915_request_completed(rq)) { |
150 | i915_request_put(rq); |
151 | engine->heartbeat.systole = NULL; |
152 | } |
153 | |
154 | if (!intel_engine_pm_get_if_awake(engine)) |
155 | return; |
156 | |
157 | if (intel_gt_is_wedged(gt: engine->gt)) |
158 | goto out; |
159 | |
160 | if (i915_sched_engine_disabled(sched_engine: engine->sched_engine)) { |
161 | reset_engine(engine, rq: engine->heartbeat.systole); |
162 | goto out; |
163 | } |
164 | |
165 | if (engine->heartbeat.systole) { |
166 | long delay = READ_ONCE(engine->props.heartbeat_interval_ms); |
167 | |
168 | /* Safeguard against too-fast worker invocations */ |
169 | if (!time_after(jiffies, |
170 | rq->emitted_jiffies + msecs_to_jiffies(delay))) |
171 | goto out; |
172 | |
173 | if (!i915_sw_fence_signaled(fence: &rq->submit)) { |
174 | /* |
175 | * Not yet submitted, system is stalled. |
176 | * |
177 | * This more often happens for ring submission, |
178 | * where all contexts are funnelled into a common |
179 | * ringbuffer. If one context is blocked on an |
180 | * external fence, not only is it not submitted, |
181 | * but all other contexts, including the kernel |
182 | * context are stuck waiting for the signal. |
183 | */ |
184 | } else if (engine->sched_engine->schedule && |
185 | rq->sched.attr.priority < I915_PRIORITY_BARRIER) { |
186 | /* |
187 | * Gradually raise the priority of the heartbeat to |
188 | * give high priority work [which presumably desires |
189 | * low latency and no jitter] the chance to naturally |
190 | * complete before being preempted. |
191 | */ |
192 | attr.priority = I915_PRIORITY_NORMAL; |
193 | if (rq->sched.attr.priority >= attr.priority) |
194 | attr.priority = I915_PRIORITY_HEARTBEAT; |
195 | if (rq->sched.attr.priority >= attr.priority) |
196 | attr.priority = I915_PRIORITY_BARRIER; |
197 | |
198 | local_bh_disable(); |
199 | engine->sched_engine->schedule(rq, &attr); |
200 | local_bh_enable(); |
201 | } else { |
202 | reset_engine(engine, rq); |
203 | } |
204 | |
205 | rq->emitted_jiffies = jiffies; |
206 | goto out; |
207 | } |
208 | |
209 | serial = READ_ONCE(engine->serial); |
210 | if (engine->wakeref_serial == serial) |
211 | goto out; |
212 | |
213 | if (!mutex_trylock(lock: &ce->timeline->mutex)) { |
214 | /* Unable to lock the kernel timeline, is the engine stuck? */ |
215 | if (xchg(&engine->heartbeat.blocked, serial) == serial) |
216 | intel_gt_handle_error(gt: engine->gt, engine_mask: engine->mask, |
217 | I915_ERROR_CAPTURE, |
218 | fmt: "no heartbeat on %s" , |
219 | engine->name); |
220 | goto out; |
221 | } |
222 | |
223 | rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN); |
224 | if (IS_ERR(ptr: rq)) |
225 | goto unlock; |
226 | |
227 | heartbeat_commit(rq, attr: &attr); |
228 | |
229 | unlock: |
230 | mutex_unlock(lock: &ce->timeline->mutex); |
231 | out: |
232 | if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine)) |
233 | i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); |
234 | intel_engine_pm_put(engine); |
235 | } |
236 | |
237 | void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine) |
238 | { |
239 | if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL) |
240 | return; |
241 | |
242 | next_heartbeat(engine); |
243 | } |
244 | |
245 | void intel_engine_park_heartbeat(struct intel_engine_cs *engine) |
246 | { |
247 | if (cancel_delayed_work(dwork: &engine->heartbeat.work)) |
248 | i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); |
249 | } |
250 | |
251 | void intel_gt_unpark_heartbeats(struct intel_gt *gt) |
252 | { |
253 | struct intel_engine_cs *engine; |
254 | enum intel_engine_id id; |
255 | |
256 | for_each_engine(engine, gt, id) |
257 | if (intel_engine_pm_is_awake(engine)) |
258 | intel_engine_unpark_heartbeat(engine); |
259 | } |
260 | |
261 | void intel_gt_park_heartbeats(struct intel_gt *gt) |
262 | { |
263 | struct intel_engine_cs *engine; |
264 | enum intel_engine_id id; |
265 | |
266 | for_each_engine(engine, gt, id) |
267 | intel_engine_park_heartbeat(engine); |
268 | } |
269 | |
/* One-time setup of the per-engine heartbeat delayed work. */
void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}
274 | |
275 | static int __intel_engine_pulse(struct intel_engine_cs *engine) |
276 | { |
277 | struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; |
278 | struct intel_context *ce = engine->kernel_context; |
279 | struct i915_request *rq; |
280 | |
281 | lockdep_assert_held(&ce->timeline->mutex); |
282 | GEM_BUG_ON(!intel_engine_has_preemption(engine)); |
283 | GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); |
284 | |
285 | rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN); |
286 | if (IS_ERR(ptr: rq)) |
287 | return PTR_ERR(ptr: rq); |
288 | |
289 | __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); |
290 | |
291 | heartbeat_commit(rq, attr: &attr); |
292 | GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); |
293 | |
294 | /* Ensure the forced pulse gets a full period to execute */ |
295 | next_heartbeat(engine); |
296 | |
297 | return 0; |
298 | } |
299 | |
300 | static unsigned long set_heartbeat(struct intel_engine_cs *engine, |
301 | unsigned long delay) |
302 | { |
303 | unsigned long old; |
304 | |
305 | old = xchg(&engine->props.heartbeat_interval_ms, delay); |
306 | if (delay) |
307 | intel_engine_unpark_heartbeat(engine); |
308 | else |
309 | intel_engine_park_heartbeat(engine); |
310 | |
311 | return old; |
312 | } |
313 | |
314 | int intel_engine_set_heartbeat(struct intel_engine_cs *engine, |
315 | unsigned long delay) |
316 | { |
317 | struct intel_context *ce = engine->kernel_context; |
318 | int err = 0; |
319 | |
320 | if (!delay && !intel_engine_has_preempt_reset(engine)) |
321 | return -ENODEV; |
322 | |
323 | /* FIXME: Remove together with equally marked hack in next_heartbeat. */ |
324 | if (delay != engine->defaults.heartbeat_interval_ms && |
325 | delay < 2 * engine->props.preempt_timeout_ms) { |
326 | if (intel_engine_uses_guc(engine)) |
327 | drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n" , |
328 | engine->name); |
329 | else |
330 | drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n" , |
331 | engine->name); |
332 | } |
333 | |
334 | intel_engine_pm_get(engine); |
335 | |
336 | err = mutex_lock_interruptible(&ce->timeline->mutex); |
337 | if (err) |
338 | goto out_rpm; |
339 | |
340 | if (delay != engine->props.heartbeat_interval_ms) { |
341 | unsigned long saved = set_heartbeat(engine, delay); |
342 | |
343 | /* recheck current execution */ |
344 | if (intel_engine_has_preemption(engine)) { |
345 | err = __intel_engine_pulse(engine); |
346 | if (err) |
347 | set_heartbeat(engine, delay: saved); |
348 | } |
349 | } |
350 | |
351 | mutex_unlock(lock: &ce->timeline->mutex); |
352 | |
353 | out_rpm: |
354 | intel_engine_pm_put(engine); |
355 | return err; |
356 | } |
357 | |
358 | int intel_engine_pulse(struct intel_engine_cs *engine) |
359 | { |
360 | struct intel_context *ce = engine->kernel_context; |
361 | int err; |
362 | |
363 | if (!intel_engine_has_preemption(engine)) |
364 | return -ENODEV; |
365 | |
366 | if (!intel_engine_pm_get_if_awake(engine)) |
367 | return 0; |
368 | |
369 | err = -EINTR; |
370 | if (!mutex_lock_interruptible(&ce->timeline->mutex)) { |
371 | err = __intel_engine_pulse(engine); |
372 | mutex_unlock(lock: &ce->timeline->mutex); |
373 | } |
374 | |
375 | intel_engine_flush_submission(engine); |
376 | intel_engine_pm_put(engine); |
377 | return err; |
378 | } |
379 | |
380 | int intel_engine_flush_barriers(struct intel_engine_cs *engine) |
381 | { |
382 | struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN }; |
383 | struct intel_context *ce = engine->kernel_context; |
384 | struct i915_request *rq; |
385 | int err; |
386 | |
387 | if (llist_empty(head: &engine->barrier_tasks)) |
388 | return 0; |
389 | |
390 | if (!intel_engine_pm_get_if_awake(engine)) |
391 | return 0; |
392 | |
393 | if (mutex_lock_interruptible(&ce->timeline->mutex)) { |
394 | err = -EINTR; |
395 | goto out_rpm; |
396 | } |
397 | |
398 | rq = heartbeat_create(ce, GFP_KERNEL); |
399 | if (IS_ERR(ptr: rq)) { |
400 | err = PTR_ERR(ptr: rq); |
401 | goto out_unlock; |
402 | } |
403 | |
404 | heartbeat_commit(rq, attr: &attr); |
405 | |
406 | err = 0; |
407 | out_unlock: |
408 | mutex_unlock(lock: &ce->timeline->mutex); |
409 | out_rpm: |
410 | intel_engine_pm_put(engine); |
411 | return err; |
412 | } |
413 | |
414 | #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) |
415 | #include "selftest_engine_heartbeat.c" |
416 | #endif |
417 | |