1 | /* |
2 | * Copyright 2020 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | * |
22 | */ |
23 | #include "amdgpu.h" |
24 | #include "sdma/sdma_4_4_0_offset.h" |
25 | #include "sdma/sdma_4_4_0_sh_mask.h" |
26 | #include "soc15.h" |
27 | #include "amdgpu_ras.h" |
28 | |
/*
 * Distance of each additional SDMA instance's register block from the
 * SDMA0 block. Added to the SDMA0 base plus an SDMA0-relative register
 * offset to address the same register in another instance.
 */
#define SDMA1_REG_OFFSET	0x600
#define SDMA2_REG_OFFSET	0x1cda0
#define SDMA3_REG_OFFSET	0x1d1a0
#define SDMA4_REG_OFFSET	0x1d5a0
33 | |
34 | /* helper function that allow only use sdma0 register offset |
35 | * to calculate register offset for all the sdma instances */ |
36 | static uint32_t sdma_v4_4_get_reg_offset(struct amdgpu_device *adev, |
37 | uint32_t instance, |
38 | uint32_t offset) |
39 | { |
40 | uint32_t sdma_base = adev->reg_offset[SDMA0_HWIP][0][0]; |
41 | |
42 | switch (instance) { |
43 | case 0: |
44 | return (sdma_base + offset); |
45 | case 1: |
46 | return (sdma_base + SDMA1_REG_OFFSET + offset); |
47 | case 2: |
48 | return (sdma_base + SDMA2_REG_OFFSET + offset); |
49 | case 3: |
50 | return (sdma_base + SDMA3_REG_OFFSET + offset); |
51 | case 4: |
52 | return (sdma_base + SDMA4_REG_OFFSET + offset); |
53 | default: |
54 | break; |
55 | } |
56 | return 0; |
57 | } |
58 | |
59 | static const struct soc15_ras_field_entry sdma_v4_4_ras_fields[] = { |
60 | { "SDMA_MBANK_DATA_BUF0_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
61 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF0_SED), |
62 | 0, 0, |
63 | }, |
64 | { "SDMA_MBANK_DATA_BUF1_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
65 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF1_SED), |
66 | 0, 0, |
67 | }, |
68 | { "SDMA_MBANK_DATA_BUF2_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
69 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF2_SED), |
70 | 0, 0, |
71 | }, |
72 | { "SDMA_MBANK_DATA_BUF3_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
73 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF3_SED), |
74 | 0, 0, |
75 | }, |
76 | { "SDMA_MBANK_DATA_BUF4_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
77 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF4_SED), |
78 | 0, 0, |
79 | }, |
80 | { "SDMA_MBANK_DATA_BUF5_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
81 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF5_SED), |
82 | 0, 0, |
83 | }, |
84 | { "SDMA_MBANK_DATA_BUF6_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
85 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF6_SED), |
86 | 0, 0, |
87 | }, |
88 | { "SDMA_MBANK_DATA_BUF7_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
89 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF7_SED), |
90 | 0, 0, |
91 | }, |
92 | { "SDMA_MBANK_DATA_BUF8_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
93 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF8_SED), |
94 | 0, 0, |
95 | }, |
96 | { "SDMA_MBANK_DATA_BUF9_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
97 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF9_SED), |
98 | 0, 0, |
99 | }, |
100 | { "SDMA_MBANK_DATA_BUF10_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
101 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF10_SED), |
102 | 0, 0, |
103 | }, |
104 | { "SDMA_MBANK_DATA_BUF11_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
105 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF11_SED), |
106 | 0, 0, |
107 | }, |
108 | { "SDMA_MBANK_DATA_BUF12_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
109 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF12_SED), |
110 | 0, 0, |
111 | }, |
112 | { "SDMA_MBANK_DATA_BUF13_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
113 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF13_SED), |
114 | 0, 0, |
115 | }, |
116 | { "SDMA_MBANK_DATA_BUF14_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
117 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF14_SED), |
118 | 0, 0, |
119 | }, |
120 | { "SDMA_MBANK_DATA_BUF15_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER), |
121 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER, SDMA_MBANK_DATA_BUF15_SED), |
122 | 0, 0, |
123 | }, |
124 | { "SDMA_UCODE_BUF_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2), |
125 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UCODE_BUF_SED), |
126 | 0, 0, |
127 | }, |
128 | { "SDMA_RB_CMD_BUF_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2), |
129 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_RB_CMD_BUF_SED), |
130 | 0, 0, |
131 | }, |
132 | { "SDMA_IB_CMD_BUF_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2), |
133 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_IB_CMD_BUF_SED), |
134 | 0, 0, |
135 | }, |
136 | { "SDMA_UTCL1_RD_FIFO_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2), |
137 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UTCL1_RD_FIFO_SED), |
138 | 0, 0, |
139 | }, |
140 | { "SDMA_UTCL1_RDBST_FIFO_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2), |
141 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UTCL1_RDBST_FIFO_SED), |
142 | 0, 0, |
143 | }, |
144 | { "SDMA_UTCL1_WR_FIFO_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2), |
145 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_UTCL1_WR_FIFO_SED), |
146 | 0, 0, |
147 | }, |
148 | { "SDMA_DATA_LUT_FIFO_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2), |
149 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_DATA_LUT_FIFO_SED), |
150 | 0, 0, |
151 | }, |
152 | { "SDMA_SPLIT_DATA_BUF_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2), |
153 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_SPLIT_DATA_BUF_SED), |
154 | 0, 0, |
155 | }, |
156 | { "SDMA_MC_WR_ADDR_FIFO_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2), |
157 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_MC_WR_ADDR_FIFO_SED), |
158 | 0, 0, |
159 | }, |
160 | { "SDMA_MC_RDRET_BUF_SED" , SOC15_REG_ENTRY(SDMA0, 0, regSDMA0_EDC_COUNTER2), |
161 | SOC15_REG_FIELD(SDMA0_EDC_COUNTER2, SDMA_MC_WR_ADDR_FIFO_SED), |
162 | 0, 0, |
163 | }, |
164 | }; |
165 | |
166 | static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev, |
167 | uint32_t reg_offset, |
168 | uint32_t value, |
169 | uint32_t instance, |
170 | uint32_t *sec_count) |
171 | { |
172 | uint32_t i; |
173 | uint32_t sec_cnt; |
174 | |
175 | /* double bits error (multiple bits) error detection is not supported */ |
176 | for (i = 0; i < ARRAY_SIZE(sdma_v4_4_ras_fields); i++) { |
177 | if (sdma_v4_4_ras_fields[i].reg_offset != reg_offset) |
178 | continue; |
179 | |
180 | /* the SDMA_EDC_COUNTER register in each sdma instance |
181 | * shares the same sed shift_mask |
182 | * */ |
183 | sec_cnt = (value & |
184 | sdma_v4_4_ras_fields[i].sec_count_mask) >> |
185 | sdma_v4_4_ras_fields[i].sec_count_shift; |
186 | if (sec_cnt) { |
187 | dev_info(adev->dev, "Detected %s in SDMA%d, SED %d\n" , |
188 | sdma_v4_4_ras_fields[i].name, |
189 | instance, sec_cnt); |
190 | *sec_count += sec_cnt; |
191 | } |
192 | } |
193 | } |
194 | |
195 | static int sdma_v4_4_query_ras_error_count_by_instance(struct amdgpu_device *adev, |
196 | uint32_t instance, |
197 | void *ras_error_status) |
198 | { |
199 | struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; |
200 | uint32_t sec_count = 0; |
201 | uint32_t reg_value = 0; |
202 | uint32_t reg_offset = 0; |
203 | |
204 | reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER); |
205 | reg_value = RREG32(reg_offset); |
206 | /* double bit error is not supported */ |
207 | if (reg_value) |
208 | sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER, value: reg_value, |
209 | instance, sec_count: &sec_count); |
210 | |
211 | reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER2); |
212 | reg_value = RREG32(reg_offset); |
213 | /* double bit error is not supported */ |
214 | if (reg_value) |
215 | sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER2, value: reg_value, |
216 | instance, sec_count: &sec_count); |
217 | |
218 | /* |
219 | * err_data->ue_count should be initialized to 0 |
220 | * before calling into this function |
221 | * |
222 | * SDMA RAS supports single bit uncorrectable error detection. |
223 | * So, increment uncorrectable error count. |
224 | */ |
225 | err_data->ue_count += sec_count; |
226 | |
227 | /* |
228 | * SDMA RAS does not support correctable errors. |
229 | * Set ce count to 0. |
230 | */ |
231 | err_data->ce_count = 0; |
232 | |
233 | return 0; |
234 | }; |
235 | |
236 | static void sdma_v4_4_reset_ras_error_count(struct amdgpu_device *adev) |
237 | { |
238 | int i; |
239 | uint32_t reg_offset; |
240 | |
241 | /* write 0 to EDC_COUNTER reg to clear sdma edc counters */ |
242 | if (amdgpu_ras_is_supported(adev, block: AMDGPU_RAS_BLOCK__SDMA)) { |
243 | for (i = 0; i < adev->sdma.num_instances; i++) { |
244 | reg_offset = sdma_v4_4_get_reg_offset(adev, instance: i, regSDMA0_EDC_COUNTER); |
245 | WREG32(reg_offset, 0); |
246 | reg_offset = sdma_v4_4_get_reg_offset(adev, instance: i, regSDMA0_EDC_COUNTER2); |
247 | WREG32(reg_offset, 0); |
248 | } |
249 | } |
250 | } |
251 | |
252 | static void sdma_v4_4_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) |
253 | { |
254 | int i = 0; |
255 | |
256 | for (i = 0; i < adev->sdma.num_instances; i++) { |
257 | if (sdma_v4_4_query_ras_error_count_by_instance(adev, instance: i, ras_error_status)) { |
258 | dev_err(adev->dev, "Query ras error count failed in SDMA%d\n" , i); |
259 | return; |
260 | } |
261 | } |
262 | |
263 | } |
264 | |
265 | const struct amdgpu_ras_block_hw_ops sdma_v4_4_ras_hw_ops = { |
266 | .query_ras_error_count = sdma_v4_4_query_ras_error_count, |
267 | .reset_ras_error_count = sdma_v4_4_reset_ras_error_count, |
268 | }; |
269 | |
270 | struct amdgpu_sdma_ras sdma_v4_4_ras = { |
271 | .ras_block = { |
272 | .hw_ops = &sdma_v4_4_ras_hw_ops, |
273 | }, |
274 | }; |
275 | |