1 | /* |
2 | * Copyright 2021 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | * |
22 | */ |
23 | #include "umc_v6_7.h" |
24 | #include "amdgpu_ras.h" |
25 | #include "amdgpu_umc.h" |
26 | #include "amdgpu.h" |
27 | |
28 | #include "umc/umc_6_7_0_offset.h" |
29 | #include "umc/umc_6_7_0_sh_mask.h" |
30 | |
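/*
 * Per-instance channel index tables: each entry maps a (umc instance,
 * channel instance) pair to the physical channel index that is encoded
 * into the SoC physical address. The UMC setup code points
 * adev->umc.channel_idx_tbl at whichever table matches the ASIC
 * configuration.
 */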
31 | const uint32_t |
32 | umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = { |
33 | {28, 20, 24, 16, 12, 4, 8, 0}, |
34 | {6, 30, 2, 26, 22, 14, 18, 10}, |
35 | {19, 11, 15, 7, 3, 27, 31, 23}, |
36 | {9, 1, 5, 29, 25, 17, 21, 13} |
37 | }; |
38 | const uint32_t |
39 | umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = { |
40 | {19, 11, 15, 7, 3, 27, 31, 23}, |
41 | {9, 1, 5, 29, 25, 17, 21, 13}, |
42 | {28, 20, 24, 16, 12, 4, 8, 0}, |
43 | {6, 30, 2, 26, 22, 14, 18, 10}, |
44 | }; |
45 | |
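/*
 * Return the register offset for the given UMC instance and channel.
 * The register banks are laid out four channels per UMC instance block,
 * so the flat channel index is re-split before applying the per-channel
 * and per-instance strides.
 */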
46 | static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev, |
47 | uint32_t umc_inst, |
48 | uint32_t ch_inst) |
49 | { |
50 | uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst; |
51 | |
	/* adjust umc and channel index offset,
	 * the register address is not linear on each umc instance */
54 | umc_inst = index / 4; |
55 | ch_inst = index % 4; |
56 | |
57 | return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst; |
58 | } |
59 | |
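/* Dump the MCA STATUS, IPID, SYND and MISC0 registers of the reported error. */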
60 | static void umc_v6_7_query_error_status_helper(struct amdgpu_device *adev, |
61 | uint64_t mc_umc_status, uint32_t umc_reg_offset) |
62 | { |
63 | uint32_t mc_umc_addr; |
64 | uint64_t reg_value; |
65 | |
66 | if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1) |
67 | dev_info(adev->dev, "Deferred error\n" ); |
68 | |
69 | if (mc_umc_status) |
70 | dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n" , mc_umc_status, umc_reg_offset); |
71 | |
72 | /* print IPID registers value */ |
73 | mc_umc_addr = |
74 | SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0); |
75 | reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4); |
76 | if (reg_value) |
77 | dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n" , reg_value, umc_reg_offset); |
78 | |
79 | /* print SYND registers value */ |
80 | mc_umc_addr = |
81 | SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0); |
82 | reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4); |
83 | if (reg_value) |
84 | dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n" , reg_value, umc_reg_offset); |
85 | |
86 | /* print MISC0 registers value */ |
87 | mc_umc_addr = |
88 | SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0); |
89 | reg_value = RREG64_PCIE((mc_umc_addr + umc_reg_offset) * 4); |
90 | if (reg_value) |
91 | dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n" , reg_value, umc_reg_offset); |
92 | } |
93 | |
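/*
 * The ecc_info_* helpers below evaluate errors from the pre-populated ECC
 * status table in the RAS context (ras->umc_ecc) rather than reading the
 * UMC registers directly.
 */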
94 | static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev, |
95 | uint32_t umc_inst, uint32_t ch_inst, |
96 | unsigned long *error_count) |
97 | { |
98 | uint64_t mc_umc_status; |
99 | uint32_t eccinfo_table_idx; |
100 | uint32_t umc_reg_offset; |
101 | struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); |
102 | |
103 | umc_reg_offset = get_umc_v6_7_reg_offset(adev, |
104 | umc_inst, ch_inst); |
105 | |
106 | eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst; |
	/* check for SRAM correctable error,
	 * MCUMC_STATUS is a 64-bit register */
109 | mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; |
110 | if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && |
111 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) { |
112 | *error_count += 1; |
113 | |
114 | umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset); |
115 | |
116 | if (ras->umc_ecc.record_ce_addr_supported) { |
117 | uint64_t err_addr, soc_pa; |
118 | uint32_t channel_index = |
119 | adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; |
120 | |
121 | err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr; |
122 | err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); |
123 | /* translate umc channel address to soc pa, 3 parts are included */ |
124 | soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | |
125 | ADDR_OF_256B_BLOCK(channel_index) | |
126 | OFFSET_IN_256B_BLOCK(err_addr); |
127 | |
128 | /* The umc channel bits are not original values, they are hashed */ |
129 | SET_CHANNEL_HASH(channel_index, soc_pa); |
130 | |
131 | dev_info(adev->dev, "Error Address(PA): 0x%llx\n" , soc_pa); |
132 | } |
133 | } |
134 | } |
135 | |
136 | static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev, |
137 | uint32_t umc_inst, uint32_t ch_inst, |
138 | unsigned long *error_count) |
139 | { |
140 | uint64_t mc_umc_status; |
141 | uint32_t eccinfo_table_idx; |
142 | uint32_t umc_reg_offset; |
143 | struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); |
144 | |
145 | umc_reg_offset = get_umc_v6_7_reg_offset(adev, |
146 | umc_inst, ch_inst); |
147 | |
148 | eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst; |
149 | /* check the MCUMC_STATUS */ |
150 | mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; |
151 | if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && |
152 | (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || |
153 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || |
154 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 || |
155 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 || |
156 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) { |
157 | *error_count += 1; |
158 | |
159 | umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset); |
160 | } |
161 | } |
162 | |
163 | static int umc_v6_7_ecc_info_querry_ecc_error_count(struct amdgpu_device *adev, |
164 | uint32_t node_inst, uint32_t umc_inst, |
165 | uint32_t ch_inst, void *data) |
166 | { |
167 | struct ras_err_data *err_data = (struct ras_err_data *)data; |
168 | |
	umc_v6_7_ecc_info_query_correctable_error_count(adev,
					umc_inst, ch_inst,
					&(err_data->ce_count));
172 | |
	umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev,
					umc_inst, ch_inst,
					&(err_data->ue_count));
176 | |
177 | return 0; |
178 | } |
179 | |
180 | static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev, |
181 | void *ras_error_status) |
182 | { |
	amdgpu_umc_loop_channels(adev,
		umc_v6_7_ecc_info_querry_ecc_error_count, ras_error_status);
185 | } |
186 | |
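/*
 * Translate a UMC channel address into SoC physical addresses and queue the
 * corresponding pages for retirement. Column bits C2..C4 and row bit R14
 * are not part of the reported address, so every combination of those bits
 * is recorded as a potentially bad page.
 */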
187 | void umc_v6_7_convert_error_address(struct amdgpu_device *adev, |
188 | struct ras_err_data *err_data, uint64_t err_addr, |
189 | uint32_t ch_inst, uint32_t umc_inst) |
190 | { |
191 | uint32_t channel_index; |
192 | uint64_t soc_pa, retired_page, column; |
193 | |
194 | channel_index = |
195 | adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; |
196 | /* translate umc channel address to soc pa, 3 parts are included */ |
197 | soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | |
198 | ADDR_OF_256B_BLOCK(channel_index) | |
199 | OFFSET_IN_256B_BLOCK(err_addr); |
200 | |
201 | /* The umc channel bits are not original values, they are hashed */ |
202 | SET_CHANNEL_HASH(channel_index, soc_pa); |
203 | |
204 | /* clear [C4 C3 C2] in soc physical address */ |
205 | soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT); |
206 | |
207 | /* loop for all possibilities of [C4 C3 C2] */ |
208 | for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) { |
209 | retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT); |
210 | dev_info(adev->dev, "Error Address(PA): 0x%llx\n" , retired_page); |
211 | amdgpu_umc_fill_error_record(err_data, err_addr, |
212 | retired_page, channel_index, umc_inst); |
213 | |
214 | /* shift R14 bit */ |
215 | retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT); |
216 | dev_info(adev->dev, "Error Address(PA): 0x%llx\n" , retired_page); |
217 | amdgpu_umc_fill_error_record(err_data, err_addr, |
218 | retired_page, channel_index, umc_inst); |
219 | } |
220 | } |
221 | |
222 | static int umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev, |
223 | uint32_t node_inst, uint32_t umc_inst, |
224 | uint32_t ch_inst, void *data) |
225 | { |
226 | uint64_t mc_umc_status, err_addr; |
227 | uint32_t eccinfo_table_idx; |
228 | struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); |
229 | struct ras_err_data *err_data = (struct ras_err_data *)data; |
230 | |
231 | eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst; |
232 | mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; |
233 | |
234 | if (mc_umc_status == 0) |
235 | return 0; |
236 | |
237 | if (!err_data->err_addr) |
238 | return 0; |
239 | |
240 | /* calculate error address if ue error is detected */ |
241 | if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && |
242 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) { |
243 | |
244 | err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr; |
245 | err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); |
246 | |
247 | umc_v6_7_convert_error_address(adev, err_data, err_addr, |
248 | ch_inst, umc_inst); |
249 | } |
250 | |
251 | return 0; |
252 | } |
253 | |
254 | static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev, |
255 | void *ras_error_status) |
256 | { |
	amdgpu_umc_loop_channels(adev,
		umc_v6_7_ecc_info_query_error_address, ras_error_status);
259 | } |
260 | |
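/*
 * Count correctable errors from the per-chip-select ECC error counters
 * (lower and higher chip select) and from an MCA status record flagging an
 * SRAM correctable error.
 */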
261 | static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev, |
262 | uint32_t umc_reg_offset, |
263 | unsigned long *error_count, |
264 | uint32_t ch_inst, |
265 | uint32_t umc_inst) |
266 | { |
267 | uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; |
268 | uint32_t ecc_err_cnt, ecc_err_cnt_addr; |
269 | uint64_t mc_umc_status; |
270 | uint32_t mc_umc_status_addr; |
271 | |
	/* UMC 6_7 registers */
273 | ecc_err_cnt_sel_addr = |
274 | SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel); |
275 | ecc_err_cnt_addr = |
276 | SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt); |
277 | mc_umc_status_addr = |
278 | SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); |
279 | |
280 | /* select the lower chip and check the error count */ |
281 | ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4); |
282 | ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, |
283 | EccErrCntCsSel, 0); |
284 | WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel); |
285 | |
286 | ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4); |
287 | *error_count += |
288 | (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) - |
289 | UMC_V6_7_CE_CNT_INIT); |
290 | |
	/* select the higher chip and check the error count */
292 | ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, |
293 | EccErrCntCsSel, 1); |
294 | WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel); |
295 | |
296 | ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4); |
297 | *error_count += |
298 | (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) - |
299 | UMC_V6_7_CE_CNT_INIT); |
300 | |
	/* check for SRAM correctable error,
	 * MCUMC_STATUS is a 64-bit register */
303 | mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4); |
304 | if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && |
305 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) { |
306 | *error_count += 1; |
307 | |
308 | umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset); |
309 | |
310 | { |
311 | uint64_t err_addr, soc_pa; |
312 | uint32_t mc_umc_addrt0; |
313 | uint32_t channel_index; |
314 | |
315 | mc_umc_addrt0 = |
316 | SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); |
317 | |
318 | channel_index = |
319 | adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; |
320 | |
321 | err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); |
322 | err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); |
323 | |
324 | /* translate umc channel address to soc pa, 3 parts are included */ |
325 | soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | |
326 | ADDR_OF_256B_BLOCK(channel_index) | |
327 | OFFSET_IN_256B_BLOCK(err_addr); |
328 | |
329 | /* The umc channel bits are not original values, they are hashed */ |
330 | SET_CHANNEL_HASH(channel_index, soc_pa); |
331 | |
332 | dev_info(adev->dev, "Error Address(PA): 0x%llx\n" , soc_pa); |
333 | } |
334 | } |
335 | } |
336 | |
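/*
 * Count an uncorrectable error when a valid MCA status record flags any of
 * the Deferred, UECC, PCC, UC or TCC conditions.
 */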
337 | static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev, |
338 | uint32_t umc_reg_offset, |
339 | unsigned long *error_count) |
340 | { |
341 | uint64_t mc_umc_status; |
342 | uint32_t mc_umc_status_addr; |
343 | |
344 | mc_umc_status_addr = |
345 | SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); |
346 | |
347 | /* check the MCUMC_STATUS */ |
348 | mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4); |
349 | if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && |
350 | (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || |
351 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || |
352 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 || |
353 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 || |
354 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) { |
355 | *error_count += 1; |
356 | |
357 | umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset); |
358 | } |
359 | } |
360 | |
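/*
 * Clear the ECC error counters of both chip selects on one channel by
 * writing them back to their initial value.
 */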
361 | static int umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev, |
362 | uint32_t node_inst, uint32_t umc_inst, |
363 | uint32_t ch_inst, void *data) |
364 | { |
365 | uint32_t ecc_err_cnt_addr; |
366 | uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; |
367 | uint32_t umc_reg_offset = |
368 | get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst); |
369 | |
370 | ecc_err_cnt_sel_addr = |
371 | SOC15_REG_OFFSET(UMC, 0, |
372 | regUMCCH0_0_EccErrCntSel); |
373 | ecc_err_cnt_addr = |
374 | SOC15_REG_OFFSET(UMC, 0, |
375 | regUMCCH0_0_EccErrCnt); |
376 | |
377 | /* select the lower chip */ |
378 | ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + |
379 | umc_reg_offset) * 4); |
380 | ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, |
381 | UMCCH0_0_EccErrCntSel, |
382 | EccErrCntCsSel, 0); |
383 | WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, |
384 | ecc_err_cnt_sel); |
385 | |
386 | /* clear lower chip error count */ |
387 | WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, |
388 | UMC_V6_7_CE_CNT_INIT); |
389 | |
390 | /* select the higher chip */ |
391 | ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + |
392 | umc_reg_offset) * 4); |
393 | ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, |
394 | UMCCH0_0_EccErrCntSel, |
395 | EccErrCntCsSel, 1); |
396 | WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, |
397 | ecc_err_cnt_sel); |
398 | |
399 | /* clear higher chip error count */ |
400 | WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, |
401 | UMC_V6_7_CE_CNT_INIT); |
402 | |
403 | return 0; |
404 | } |
405 | |
406 | static void umc_v6_7_reset_error_count(struct amdgpu_device *adev) |
407 | { |
	amdgpu_umc_loop_channels(adev,
		umc_v6_7_reset_error_count_per_channel, NULL);
410 | } |
411 | |
412 | static int umc_v6_7_query_ecc_error_count(struct amdgpu_device *adev, |
413 | uint32_t node_inst, uint32_t umc_inst, |
414 | uint32_t ch_inst, void *data) |
415 | { |
416 | struct ras_err_data *err_data = (struct ras_err_data *)data; |
417 | uint32_t umc_reg_offset = |
418 | get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst); |
419 | |
	umc_v6_7_query_correctable_error_count(adev,
					umc_reg_offset,
					&(err_data->ce_count),
					ch_inst, umc_inst);
424 | |
	umc_v6_7_querry_uncorrectable_error_count(adev,
					umc_reg_offset,
					&(err_data->ue_count));
428 | |
429 | return 0; |
430 | } |
431 | |
432 | static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev, |
433 | void *ras_error_status) |
434 | { |
	amdgpu_umc_loop_channels(adev,
		umc_v6_7_query_ecc_error_count, ras_error_status);
437 | |
438 | umc_v6_7_reset_error_count(adev); |
439 | } |
440 | |
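/*
 * If an uncorrectable error is latched on this channel, read the error
 * address from MCA ADDR, convert it into retired pages, and finally clear
 * the MCA status register.
 */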
441 | static int umc_v6_7_query_error_address(struct amdgpu_device *adev, |
442 | uint32_t node_inst, uint32_t umc_inst, |
443 | uint32_t ch_inst, void *data) |
444 | { |
445 | uint32_t mc_umc_status_addr; |
446 | uint64_t mc_umc_status = 0, mc_umc_addrt0, err_addr; |
447 | struct ras_err_data *err_data = (struct ras_err_data *)data; |
448 | uint32_t umc_reg_offset = |
449 | get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst); |
450 | |
451 | mc_umc_status_addr = |
452 | SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); |
453 | mc_umc_addrt0 = |
454 | SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); |
455 | |
456 | mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4); |
457 | |
458 | if (mc_umc_status == 0) |
459 | return 0; |
460 | |
461 | if (!err_data->err_addr) { |
462 | /* clear umc status */ |
463 | WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); |
464 | return 0; |
465 | } |
466 | |
467 | /* calculate error address if ue error is detected */ |
468 | if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && |
469 | REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) { |
470 | err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); |
471 | err_addr = |
472 | REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); |
473 | |
474 | umc_v6_7_convert_error_address(adev, err_data, err_addr, |
475 | ch_inst, umc_inst); |
476 | } |
477 | |
478 | /* clear umc status */ |
479 | WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); |
480 | |
481 | return 0; |
482 | } |
483 | |
484 | static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev, |
485 | void *ras_error_status) |
486 | { |
	amdgpu_umc_loop_channels(adev,
		umc_v6_7_query_error_address, ras_error_status);
489 | } |
490 | |
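/* Read the UCFatalEn bit from EccCtrl for one channel. */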
491 | static uint32_t umc_v6_7_query_ras_poison_mode_per_channel( |
492 | struct amdgpu_device *adev, |
493 | uint32_t umc_reg_offset) |
494 | { |
495 | uint32_t ecc_ctrl_addr, ecc_ctrl; |
496 | |
497 | ecc_ctrl_addr = |
498 | SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl); |
499 | ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr + |
500 | umc_reg_offset) * 4); |
501 | |
502 | return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_EccCtrl, UCFatalEn); |
503 | } |
504 | |
505 | static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev) |
506 | { |
507 | uint32_t umc_reg_offset = 0; |
508 | |
	/* If fatal error reporting (UCFatalEn) is enabled on UMC instance 0
	 * channel 0, the device is treated as being in fatal error mode
	 * rather than poison mode.
	 */
	umc_reg_offset = get_umc_v6_7_reg_offset(adev, 0, 0);
513 | return !umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset); |
514 | } |
515 | |
516 | const struct amdgpu_ras_block_hw_ops umc_v6_7_ras_hw_ops = { |
517 | .query_ras_error_count = umc_v6_7_query_ras_error_count, |
518 | .query_ras_error_address = umc_v6_7_query_ras_error_address, |
519 | }; |
520 | |
521 | struct amdgpu_umc_ras umc_v6_7_ras = { |
522 | .ras_block = { |
523 | .hw_ops = &umc_v6_7_ras_hw_ops, |
524 | }, |
525 | .query_ras_poison_mode = umc_v6_7_query_ras_poison_mode, |
526 | .ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count, |
527 | .ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address, |
528 | }; |
529 | |