/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "amdgpu.h"
#include "umc_v6_7.h"
#define MAX_UMC_POISON_POLLING_TIME_SYNC	20  //ms

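/*
 * Dispatch UMC error address translation to the IP-version-specific
 * helper; only UMC v6.7 provides this conversion here, other versions
 * warn and fail.
 */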
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
					struct ras_err_data *err_data, uint64_t err_addr,
					uint32_t ch_inst, uint32_t umc_inst)
{
	switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
	case IP_VERSION(6, 7, 0):
		umc_v6_7_convert_error_address(adev,
				err_data, err_addr, ch_inst, umc_inst);
		break;
	default:
		dev_warn(adev->dev,
			 "UMC address to Physical address translation is not supported\n");
		return AMDGPU_RAS_FAIL;
	}

	return AMDGPU_RAS_SUCCESS;
}

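/*
 * Page retirement entry point for errors reported through the MCA
 * notifier: translate the UMC channel address to a physical address,
 * then record and persist the bad page when bad-page retirement is
 * enabled (amdgpu_bad_page_threshold != 0).
 */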
int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
{
	struct ras_err_data err_data;
	int ret;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	err_data.err_addr =
		kcalloc(adev->umc.max_ras_err_cnt_per_query,
			sizeof(struct eeprom_table_record), GFP_KERNEL);
	if (!err_data.err_addr) {
		dev_warn(adev->dev,
			 "Failed to alloc memory for umc error record in MCA notifier!\n");
		ret = AMDGPU_RAS_FAIL;
		goto out_fini_err_data;
	}

	/*
	 * Translate UMC channel address to Physical address
	 */
	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
					       ch_inst, umc_inst);
	if (ret)
		goto out_free_err_addr;

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
					 err_data.err_addr_cnt);
		amdgpu_ras_save_bad_pages(adev, NULL);
	}

out_free_err_addr:
	kfree(err_data.err_addr);

out_fini_err_data:
	amdgpu_ras_error_data_fini(&err_data);

	return ret;
}

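/*
 * Query UMC error counts and error addresses, either through the firmware
 * ECC info table or directly from the UMC registers depending on the
 * supported query mode, and hand any reported addresses to the bad-page
 * store. Runs under con->page_retirement_lock.
 */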
static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
			void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	unsigned int error_query_mode;
	int ret = 0;
	unsigned long err_count;

	amdgpu_ras_get_error_query_mode(adev, &error_query_mode);

	mutex_lock(&con->page_retirement_lock);
	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
	if (ret == -EOPNOTSUPP &&
	    error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);

		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
		    adev->umc.max_ras_err_cnt_per_query) {
			err_data->err_addr =
				kcalloc(adev->umc.max_ras_err_cnt_per_query,
					sizeof(struct eeprom_table_record), GFP_KERNEL);

			/* still call query_ras_error_address to clear error status
			 * even NOMEM error is encountered
			 */
			if (!err_data->err_addr)
				dev_warn(adev->dev,
					 "Failed to alloc memory for umc error address record!\n");

			/* umc query_ras_error_address is also responsible for clearing
			 * error status
			 */
			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
		}
	} else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
	    (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_count)
			adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);

		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_address &&
		    adev->umc.max_ras_err_cnt_per_query) {
			err_data->err_addr =
				kcalloc(adev->umc.max_ras_err_cnt_per_query,
					sizeof(struct eeprom_table_record), GFP_KERNEL);

			/* still call query_ras_error_address to clear error status
			 * even NOMEM error is encountered
			 */
			if (!err_data->err_addr)
				dev_warn(adev->dev,
					 "Failed to alloc memory for umc error address record!\n");

			/* umc query_ras_error_address is also responsible for clearing
			 * error status
			 */
			adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
		}
	}

	/* only uncorrectable error needs gpu reset */
	if (err_data->ue_count || err_data->de_count) {
		err_count = err_data->ue_count + err_data->de_count;
		if ((amdgpu_bad_page_threshold != 0) &&
		    err_data->err_addr_cnt) {
			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
						 err_data->err_addr_cnt);
			amdgpu_ras_save_bad_pages(adev, &err_count);

			amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

			if (con->update_channel_flag == true) {
				amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
				con->update_channel_flag = false;
			}
		}
	}

	kfree(err_data->err_addr);

	mutex_unlock(&con->page_retirement_lock);
}

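/*
 * Common page retirement path: flag an SRAM ECC event for KFD, retire any
 * bad pages found, and trigger a GPU reset when uncorrectable errors were
 * seen and reset is requested (mode-2 when called without an IV entry,
 * i.e. from the poison consumption path).
 */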
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry,
		bool reset)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
	amdgpu_umc_handle_bad_pages(adev, ras_error_status);

	if (err_data->ue_count && reset) {
		/* use mode-2 reset for poison consumption */
		if (!entry)
			con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		amdgpu_ras_reset_gpu(adev);
	}

	return AMDGPU_RAS_SUCCESS;
}

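/*
 * Synchronously poll for deferred (poisoned) errors for up to timeout_ms
 * milliseconds, retiring bad pages as they are found, and optionally
 * follow up with a mode-2 GPU reset for poison consumption.
 */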
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
			bool reset, uint32_t timeout_ms)
{
	struct ras_err_data err_data;
	struct ras_common_if head = {
		.block = AMDGPU_RAS_BLOCK__UMC,
	};
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
	uint32_t timeout = timeout_ms;

	memset(&err_data, 0, sizeof(err_data));
	amdgpu_ras_error_data_init(&err_data);

	do {

		amdgpu_umc_handle_bad_pages(adev, &err_data);

		if (timeout && !err_data.de_count) {
			msleep(1);
			timeout--;
		}

	} while (timeout && !err_data.de_count);

	if (!timeout)
		dev_warn(adev->dev, "Can't find bad pages\n");

	if (err_data.de_count)
		dev_info(adev->dev, "%ld new deferred hardware errors detected\n", err_data.de_count);

	if (obj) {
		obj->err_data.ue_count += err_data.ue_count;
		obj->err_data.ce_count += err_data.ce_count;
		obj->err_data.de_count += err_data.de_count;
	}

	amdgpu_ras_error_data_fini(&err_data);

	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

	if (reset) {
		struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

		/* use mode-2 reset for poison consumption */
		con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		amdgpu_ras_reset_gpu(adev);
	}

	return 0;
}

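/*
 * Top-level UMC poison handler. On APUs and CPU-coherent XGMI parts only
 * a GPU reset is issued and page retirement is left to the MCA notifier;
 * on bare-metal dGPUs retirement runs synchronously (pre UMC v12.0) or
 * via polling / the page retirement worker; under SR-IOV the request is
 * forwarded to the host through the virt ras_poison_handler callback.
 */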
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
			enum amdgpu_ras_block block, bool reset)
{
	int ret = AMDGPU_RAS_SUCCESS;

	if (adev->gmc.xgmi.connected_to_cpu ||
	    adev->gmc.is_app_apu) {
		if (reset) {
			/* MCA poison handler is only responsible for GPU reset,
			 * let MCA notifier do page retirement.
			 */
			kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
			amdgpu_ras_reset_gpu(adev);
		}
		return ret;
	}

	if (!amdgpu_sriov_vf(adev)) {
		if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
			struct ras_err_data err_data;
			struct ras_common_if head = {
				.block = AMDGPU_RAS_BLOCK__UMC,
			};
			struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);

			ret = amdgpu_ras_error_data_init(&err_data);
			if (ret)
				return ret;

			ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);

			if (ret == AMDGPU_RAS_SUCCESS && obj) {
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
				obj->err_data.de_count += err_data.de_count;
			}

			amdgpu_ras_error_data_fini(&err_data);
		} else {
			if (reset) {
				amdgpu_umc_bad_page_polling_timeout(adev,
					reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
			} else {
				struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

				atomic_inc(&con->page_retirement_req_cnt);

				wake_up(&con->page_retirement_wq);
			}
		}
	} else {
		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
			adev->virt.ops->ras_poison_handler(adev, block);
		else
			dev_warn(adev->dev,
				 "No ras_poison_handler interface in SRIOV!\n");
	}

	return ret;
}

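/* RAS callback for UMC errors: retire bad pages and request a GPU reset. */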
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
}

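/*
 * Register the UMC RAS block with the RAS core and set up its common
 * fields, falling back to the generic late_init and data callbacks when
 * the IP-specific code has not provided its own.
 */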
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
{
	int err;
	struct amdgpu_umc_ras *ras;

	if (!adev->umc.ras)
		return 0;

	ras = adev->umc.ras;

	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
	if (err) {
		dev_err(adev->dev, "Failed to register umc ras block!\n");
		return err;
	}

	strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc");
	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
	adev->umc.ras_if = &ras->ras_block.ras_comm;

	if (!ras->ras_block.ras_late_init)
		ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;

	if (!ras->ras_block.ras_cb)
		ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;

	return 0;
}

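/*
 * Generic UMC RAS late init: run the common block late init, enable the
 * ECC interrupt when RAS is supported on this block, and let the
 * IP-specific code initialize its error counters.
 */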
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int r;

	r = amdgpu_ras_block_late_init(adev, ras_block);
	if (r)
		return r;

	if (amdgpu_ras_is_supported(adev, ras_block->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	}

	/* ras init of specific umc version */
	if (adev->umc.ras &&
	    adev->umc.ras->err_cnt_init)
		adev->umc.ras->err_cnt_init(adev);

	return 0;

late_fini:
	amdgpu_ras_block_late_fini(adev, ras_block);
	return r;
}

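/* ECC interrupt handler: forward the IV entry to the RAS interrupt dispatcher. */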
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
		struct amdgpu_irq_src *source,
		struct amdgpu_iv_entry *entry)
{
	struct ras_common_if *ras_if = adev->umc.ras_if;
	struct ras_dispatch_if ih_data = {
		.entry = entry,
	};

	if (!ras_if)
		return 0;

	ih_data.head = *ras_if;

	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
	return 0;
}

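/*
 * Append one EEPROM error record for a retired page, saving the page
 * frame number, timestamp, memory channel and UMC instance.
 */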
void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
		uint64_t err_addr,
		uint64_t retired_page,
		uint32_t channel_index,
		uint32_t umc_inst)
{
	struct eeprom_table_record *err_rec =
		&err_data->err_addr[err_data->err_addr_cnt];

	err_rec->address = err_addr;
	/* page frame address is saved */
	err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
	err_rec->ts = (uint64_t)ktime_get_real_seconds();
	err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
	err_rec->cu = 0;
	err_rec->mem_channel = channel_index;
	err_rec->mcumc_id = umc_inst;

	err_data->err_addr_cnt++;
}

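/*
 * Call func for every UMC channel, iterating per node on ASICs that
 * expose multiple UMC node instances; stop and return on the first error.
 */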
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
			umc_func func, void *data)
{
	uint32_t node_inst = 0;
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	int ret = 0;

	if (adev->umc.node_inst_num) {
		LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
			ret = func(adev, node_inst, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev, "Node %d umc %d ch %d func returns %d\n",
					node_inst, umc_inst, ch_inst, ret);
				return ret;
			}
		}
	} else {
		LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
			ret = func(adev, 0, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev, "Umc %d ch %d func returns %d\n",
					umc_inst, ch_inst, ret);
				return ret;
			}
		}
	}

	return 0;
}