/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "soc15.h"
#include "df/df_3_6_offset.h"
#include "xgmi/xgmi_4_0_0_smn.h"
#include "xgmi/xgmi_4_0_0_sh_mask.h"
#include "xgmi/xgmi_6_1_0_sh_mask.h"
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"

#include "amdgpu_reset.h"

#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK 0x11a00218
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218

static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4

static LIST_HEAD(xgmi_hive_list);

static const int xgmi_pcs_err_status_reg_vg20[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

static const int wafl_pcs_err_status_reg_vg20[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi_pcs_err_status_reg_arct[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

/* same as vg20 */
static const int wafl_pcs_err_status_reg_arct[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
	smnPCS_XGMI3X16_PCS_ERROR_STATUS,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
};

static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x200000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x300000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x400000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x500000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x600000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x700000
};

static const int wafl_pcs_err_status_reg_aldebaran[] = {
	smnPCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
};

static const int wafl_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
	smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK,
	smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
};

static const int xgmi3x16_pcs_err_status_reg_v6_4[] = {
	smnPCS_XGMI3X16_PCS_ERROR_STATUS,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000
};

static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = {
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
};

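/*
 * SMN base addresses of the MCA banks used for XGMI v6.4.0 RAS; the ACA
 * STATUS register of each bank is read and cleared per XGMI instance by
 * the v6.4.0 query/reset helpers below.
 */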
static const u64 xgmi_v6_4_0_mca_base_array[] = {
	0x11a09200,
	0x11b09200,
};

static const char *xgmi_v6_4_0_ras_error_code_ext[32] = {
	[0x00] = "XGMI PCS DataLossErr",
	[0x01] = "XGMI PCS TrainingErr",
	[0x02] = "XGMI PCS FlowCtrlAckErr",
	[0x03] = "XGMI PCS RxFifoUnderflowErr",
	[0x04] = "XGMI PCS RxFifoOverflowErr",
	[0x05] = "XGMI PCS CRCErr",
	[0x06] = "XGMI PCS BERExceededErr",
	[0x07] = "XGMI PCS TxMetaDataErr",
	[0x08] = "XGMI PCS ReplayBufParityErr",
	[0x09] = "XGMI PCS DataParityErr",
	[0x0a] = "XGMI PCS ReplayFifoOverflowErr",
	[0x0b] = "XGMI PCS ReplayFifoUnderflowErr",
	[0x0c] = "XGMI PCS ElasticFifoOverflowErr",
	[0x0d] = "XGMI PCS DeskewErr",
	[0x0e] = "XGMI PCS FlowCtrlCRCErr",
	[0x0f] = "XGMI PCS DataStartupLimitErr",
	[0x10] = "XGMI PCS FCInitTimeoutErr",
	[0x11] = "XGMI PCS RecoveryTimeoutErr",
	[0x12] = "XGMI PCS ReadySerialTimeoutErr",
	[0x13] = "XGMI PCS ReadySerialAttemptErr",
	[0x14] = "XGMI PCS RecoveryAttemptErr",
	[0x15] = "XGMI PCS RecoveryRelockAttemptErr",
	[0x16] = "XGMI PCS ReplayAttemptErr",
	[0x17] = "XGMI PCS SyncHdrErr",
	[0x18] = "XGMI PCS TxReplayTimeoutErr",
	[0x19] = "XGMI PCS RxReplayTimeoutErr",
	[0x1a] = "XGMI PCS LinkSubTxTimeoutErr",
	[0x1b] = "XGMI PCS LinkSubRxTimeoutErr",
	[0x1c] = "XGMI PCS RxCMDPktErr",
};

static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
	{"XGMI PCS DataLossErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI PCS TrainingErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI PCS CRCErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI PCS BERExceededErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI PCS TxMetaDataErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"XGMI PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI PCS DataParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI PCS DeskewErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
	{"WAFL PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
	{"WAFL PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
	{"WAFL PCS CRCErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
	{"WAFL PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
	{"WAFL PCS TxMetaDataErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"WAFL PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"WAFL PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
	{"WAFL PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"WAFL PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"WAFL PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"WAFL PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
	{"WAFL PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"WAFL PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"WAFL PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"WAFL PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"WAFL PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"WAFL PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"WAFL PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = {
	{"XGMI3X16 PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI3X16 PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI3X16 PCS FlowCtrlAckErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlAckErr)},
	{"XGMI3X16 PCS RxFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoUnderflowErr)},
	{"XGMI3X16 PCS RxFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoOverflowErr)},
	{"XGMI3X16 PCS CRCErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI3X16 PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI3X16 PCS TxVcidDataErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxVcidDataErr)},
	{"XGMI3X16 PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI3X16 PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI3X16 PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI3X16 PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI3X16 PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI3X16 PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI3X16 PCS FlowCtrlCRCErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlCRCErr)},
	{"XGMI3X16 PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI3X16 PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI3X16 PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI3X16 PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI3X16 PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI3X16 PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI3X16 PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
	{"XGMI3X16 PCS ReplayAttemptErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayAttemptErr)},
	{"XGMI3X16 PCS SyncHdrErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, SyncHdrErr)},
	{"XGMI3X16 PCS TxReplayTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxReplayTimeoutErr)},
	{"XGMI3X16 PCS RxReplayTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxReplayTimeoutErr)},
	{"XGMI3X16 PCS LinkSubTxTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubTxTimeoutErr)},
	{"XGMI3X16 PCS LinkSubRxTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubRxTimeoutErr)},
	{"XGMI3X16 PCS RxCMDPktErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)},
};

/**
 * DOC: AMDGPU XGMI Support
 *
 * XGMI is a high speed interconnect that joins multiple GPU cards
 * into a homogeneous memory space that is organized by a collective
 * hive ID and individual node IDs, both of which are 64-bit numbers.
 *
 * The file xgmi_device_id contains the unique per GPU device ID and
 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
 *
 * Inside the device directory a sub-directory 'xgmi_hive_info' is
 * created which contains the hive ID and the list of nodes.
 *
 * The hive ID is stored in:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
 *
 * The node information is stored in numbered directories:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
 *
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
 *
 * The XGMI memory space is built by contiguously appending each node's
 * VRAM, padded to a power-of-two segment size.
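 *
 * For example (hypothetical sizes), with a 16 GB node segment size,
 * node 0 occupies [0, 16 GB) and node 1 occupies [16 GB, 32 GB) of the
 * hive's address space; see amdgpu_xgmi_get_relative_phy_addr() below.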
 *
 */

static struct attribute amdgpu_xgmi_hive_id = {
	.name = "xgmi_hive_id",
	.mode = S_IRUGO
};

static struct attribute *amdgpu_xgmi_hive_attrs[] = {
	&amdgpu_xgmi_hive_id,
	NULL
};
ATTRIBUTE_GROUPS(amdgpu_xgmi_hive);

static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
				      struct attribute *attr, char *buf)
{
	struct amdgpu_hive_info *hive = container_of(
		kobj, struct amdgpu_hive_info, kobj);

	if (attr == &amdgpu_xgmi_hive_id)
		return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);

	return 0;
}

static void amdgpu_xgmi_hive_release(struct kobject *kobj)
{
	struct amdgpu_hive_info *hive = container_of(
		kobj, struct amdgpu_hive_info, kobj);

	amdgpu_reset_put_reset_domain(hive->reset_domain);
	hive->reset_domain = NULL;

	mutex_destroy(&hive->hive_lock);
	kfree(hive);
}

static const struct sysfs_ops amdgpu_xgmi_hive_ops = {
	.show = amdgpu_xgmi_show_attrs,
};

static const struct kobj_type amdgpu_xgmi_hive_type = {
	.release = amdgpu_xgmi_hive_release,
	.sysfs_ops = &amdgpu_xgmi_hive_ops,
	.default_groups = amdgpu_xgmi_hive_groups,
};

static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);
}

static ssize_t amdgpu_xgmi_show_physical_id(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%u\n", adev->gmc.xgmi.physical_node_id);
}

static ssize_t amdgpu_xgmi_show_num_hops(struct device *dev,
					 struct device_attribute *attr,
					 char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i, size = 0;

	for (i = 0; i < top->num_nodes; i++)
		size += sysfs_emit_at(buf, size, "%02x ", top->nodes[i].num_hops);
	size += sysfs_emit_at(buf, size, "\n");

	return size;
}

static ssize_t amdgpu_xgmi_show_num_links(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i, size = 0;

	for (i = 0; i < top->num_nodes; i++)
		size += sysfs_emit_at(buf, size, "%02x ", top->nodes[i].num_links);
	size += sysfs_emit_at(buf, size, "\n");

	return size;
}

static ssize_t amdgpu_xgmi_show_connected_port_num(struct device *dev,
						   struct device_attribute *attr,
						   char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i, j, size = 0;
	int current_node = -1;
	/*
	 * Get the node id in the sysfs for the current socket and show
	 * it in the port num info output in the sysfs for easy reading.
	 * It is NOT the one retrieved from the xgmi ta.
	 */
	for (i = 0; i < top->num_nodes; i++) {
		if (top->nodes[i].node_id == adev->gmc.xgmi.node_id) {
			current_node = i;
			break;
		}
	}

	if (current_node == -1)
		return -EINVAL;

	for (i = 0; i < top->num_nodes; i++) {
		for (j = 0; j < top->nodes[i].num_links; j++)
			/* node id in sysfs starts from 1 rather than 0, so +1 here */
			size += sysfs_emit_at(buf, size, "%02x:%02x -> %02x:%02x\n",
					      current_node + 1,
					      top->nodes[i].port_num[j].src_xgmi_port_num,
					      i + 1,
					      top->nodes[i].port_num[j].dst_xgmi_port_num);
	}

	return size;
}

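/*
 * Build a DF FICAA (fabric indirect config access address) value from a
 * register offset; used below to read the xGMI PIE error counters via
 * adev->df.funcs->get_fica()/set_fica().
 */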
#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	if ((!adev->df.funcs) ||
	    (!adev->df.funcs->get_fica) ||
	    (!adev->df.funcs->set_fica))
		return -EINVAL;

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return sysfs_emit(buf, "%u\n", error_count);
}


static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_physical_id, 0444, amdgpu_xgmi_show_physical_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
static DEVICE_ATTR(xgmi_num_hops, S_IRUGO, amdgpu_xgmi_show_num_hops, NULL);
static DEVICE_ATTR(xgmi_num_links, S_IRUGO, amdgpu_xgmi_show_num_links, NULL);
static DEVICE_ATTR(xgmi_port_num, S_IRUGO, amdgpu_xgmi_show_connected_port_num, NULL);

static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
	int ret = 0;
	char node[10] = { 0 };

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	ret = device_create_file(adev->dev, &dev_attr_xgmi_physical_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_physical_id\n");
		return ret;
	}

	/* Create xgmi error file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	if (ret)
		pr_err("failed to create xgmi_error\n");

	/* Create xgmi num hops file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_num_hops);
	if (ret)
		pr_err("failed to create xgmi_num_hops\n");

	/* Create xgmi num links file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_num_links);
	if (ret)
		pr_err("failed to create xgmi_num_links\n");

	/* Create xgmi port num file if supported */
	if (adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) {
		ret = device_create_file(adev->dev, &dev_attr_xgmi_port_num);
		if (ret)
			dev_err(adev->dev, "failed to create xgmi_port_num\n");
	}

	/* Create sysfs link to hive info folder on the first device */
	if (hive->kobj.parent != (&adev->dev->kobj)) {
		ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_file;
		}
	}

	sprintf(node, "node%d", atomic_read(&hive->number_devices));
	/* Create sysfs link from the hive folder to this device */
	ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	goto success;


remove_link:
	sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique);

remove_file:
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_error);
	device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
	device_remove_file(adev->dev, &dev_attr_xgmi_num_links);
	if (adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG)
		device_remove_file(adev->dev, &dev_attr_xgmi_port_num);

success:
	return ret;
}

static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
					   struct amdgpu_hive_info *hive)
{
	char node[10];
	memset(node, 0, sizeof(node));

	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_error);
	device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
	device_remove_file(adev->dev, &dev_attr_xgmi_num_links);
	if (adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG)
		device_remove_file(adev->dev, &dev_attr_xgmi_port_num);

	if (hive->kobj.parent != (&adev->dev->kobj))
		sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

	sprintf(node, "node%d", atomic_read(&hive->number_devices));
	sysfs_remove_link(&hive->kobj, node);
}

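/*
 * Look up the hive that @adev belongs to (creating and registering it on
 * first use) and take a reference on it. Each successful call must be
 * balanced by a later amdgpu_put_xgmi_hive().
 */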
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = NULL;
	int ret;

	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	if (adev->hive) {
		kobject_get(&adev->hive->kobj);
		return adev->hive;
	}

	mutex_lock(&xgmi_mutex);

	list_for_each_entry(hive, &xgmi_hive_list, node) {
		if (hive->hive_id == adev->gmc.xgmi.hive_id)
			goto pro_end;
	}

	hive = kzalloc(sizeof(*hive), GFP_KERNEL);
	if (!hive) {
		dev_err(adev->dev, "XGMI: allocation failed\n");
		ret = -ENOMEM;
		hive = NULL;
		goto pro_end;
	}

	/* initialize the new hive if it doesn't exist yet */
	ret = kobject_init_and_add(&hive->kobj,
				   &amdgpu_xgmi_hive_type,
				   &adev->dev->kobj,
				   "%s", "xgmi_hive_info");
	if (ret) {
		dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
		kobject_put(&hive->kobj);
		hive = NULL;
		goto pro_end;
	}

	/*
	 * Only init hive->reset_domain for non-SRIOV configurations. For SRIOV,
	 * the host driver decides how to reset the GPU, either through FLR or
	 * chain reset. The guest side will get individual notifications from
	 * the host for the FLR if necessary.
	 */
	if (!amdgpu_sriov_vf(adev)) {
		/*
		 * Avoid recreating the reset domain when the hive is reconstructed
		 * while the devices in the XGMI hive are reset during probe for
		 * passthrough GPUs.
		 * See https://www.spinics.net/lists/amd-gfx/msg58836.html
		 */
		if (adev->reset_domain->type != XGMI_HIVE) {
			hive->reset_domain =
				amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
			if (!hive->reset_domain) {
				dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
				ret = -ENOMEM;
				kobject_put(&hive->kobj);
				hive = NULL;
				goto pro_end;
			}
		} else {
			amdgpu_reset_get_reset_domain(adev->reset_domain);
			hive->reset_domain = adev->reset_domain;
		}
	}

	hive->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&hive->device_list);
	INIT_LIST_HEAD(&hive->node);
	mutex_init(&hive->hive_lock);
	atomic_set(&hive->number_devices, 0);
	task_barrier_init(&hive->tb);
	hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
	hive->hi_req_gpu = NULL;

	/*
	 * The hive pstate on boot is high in vega20, so we have to go to low
	 * pstate after boot.
	 */
	hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
	list_add_tail(&hive->node, &xgmi_hive_list);

pro_end:
	if (hive)
		kobject_get(&hive->kobj);
	mutex_unlock(&xgmi_mutex);
	return hive;
}

void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
{
	if (hive)
		kobject_put(&hive->kobj);
}

int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive;
	struct amdgpu_device *request_adev;
	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
	bool init_low;

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive)
		return 0;

	request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
	init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
	amdgpu_put_xgmi_hive(hive);
	/* fw bug so temporarily disable pstate switching */
	return 0;

	if (!hive || adev->asic_type != CHIP_VEGA20)
		return 0;

	mutex_lock(&hive->hive_lock);

	if (is_hi_req)
		hive->hi_req_count++;
	else
		hive->hi_req_count--;

	/*
	 * Vega20 only needs a single peer to request pstate high for the hive
	 * to go high, but all peers must request pstate low for the hive to go
	 * low.
	 */
	if (hive->pstate == pstate ||
	    (!is_hi_req && hive->hi_req_count && !init_low))
		goto out;

	dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
	if (ret) {
		dev_err(request_adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			request_adev->gmc.xgmi.node_id,
			request_adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	if (init_low)
		hive->pstate = hive->hi_req_count ?
					hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
	else {
		hive->pstate = pstate;
		hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
							adev : NULL;
	}
out:
	mutex_unlock(&hive->hive_lock);
	return ret;
}

int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret;

	if (amdgpu_sriov_vf(adev))
		return 0;

	/* Each psp needs to set the latest topology */
	ret = psp_xgmi_set_topology_info(&adev->psp,
					 atomic_read(&hive->number_devices),
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev,
			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}


/*
 * NOTE: psp_xgmi_node_info.num_hops layout is as follows:
 * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved)
 * num_hops[5:3] = reserved
 * num_hops[2:0] = number of hops
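 *
 * e.g. num_hops = 0x41 encodes an xGMI3 link (0x1 << 6) one hop away.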
 */
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
			       struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	uint8_t num_hops_mask = 0x7;
	int i;

	for (i = 0; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_hops & num_hops_mask;
	return -EINVAL;
}

int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
			      struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i;

	for (i = 0; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_links;
	return -EINVAL;
}

/*
 * Devices that support extended data require the entire hive to initialize with
 * the shared memory buffer flag set.
 *
 * Hive locks and conditions apply - see amdgpu_xgmi_add_device()
 */
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
							  bool set_extended_data)
{
	struct amdgpu_device *tmp_adev;
	int ret;

	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
		if (ret) {
			dev_err(tmp_adev->dev,
				"XGMI: Failed to initialize xgmi session for data partition %i\n",
				set_extended_data);
			return ret;
		}
	}

	return 0;
}

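/*
 * Mirror the adev<->peer link info (hops, link count, sharing state) from
 * this adev's topology table into the peer's entry for this adev, so the
 * peer does not need another PSP topology query.
 */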
static void amdgpu_xgmi_fill_topology_info(struct amdgpu_device *adev,
					   struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top_info = &adev->psp.xgmi_context.top_info;
	struct psp_xgmi_topology_info *peer_info = &peer_adev->psp.xgmi_context.top_info;

	for (int i = 0; i < peer_info->num_nodes; i++) {
		if (peer_info->nodes[i].node_id == adev->gmc.xgmi.node_id) {
			for (int j = 0; j < top_info->num_nodes; j++) {
				if (top_info->nodes[j].node_id == peer_adev->gmc.xgmi.node_id) {
					peer_info->nodes[i].num_hops = top_info->nodes[j].num_hops;
					peer_info->nodes[i].is_sharing_enabled =
						top_info->nodes[j].is_sharing_enabled;
					peer_info->nodes[i].num_links =
						top_info->nodes[j].num_links;
					return;
				}
			}
		}
	}
}

int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi *entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (!adev->gmc.xgmi.pending_reset &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_initialize(&adev->psp, false, true);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to initialize xgmi session\n");
			return ret;
		}

		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, cannot match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}
	mutex_lock(&hive->hive_lock);

	top_info = &adev->psp.xgmi_context.top_info;

	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	atomic_set(&hive->number_devices, count);

	task_barrier_add_task(&hive->tb);

	if (!adev->gmc.xgmi.pending_reset &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update the node list for the other devices in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit_unlock;
		}

		if (amdgpu_sriov_vf(adev) &&
		    adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) {
			/* only get the topology for the VF being initialized
			 * if it supports full duplex */
			ret = psp_xgmi_get_topology_info(&adev->psp, count,
							 &adev->psp.xgmi_context.top_info, false);
			if (ret) {
				dev_err(adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					adev->gmc.xgmi.node_id,
					adev->gmc.xgmi.hive_id, ret);
				/* To do: continue with some node failed or disable the whole hive */
				goto exit_unlock;
			}

			/* fill the topology info for peers instead of getting it from PSP */
			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
				amdgpu_xgmi_fill_topology_info(adev, tmp_adev);
			}
		} else {
			/* get the latest topology info for each device from psp */
			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
				ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
								 &tmp_adev->psp.xgmi_context.top_info, false);
				if (ret) {
					dev_err(tmp_adev->dev,
						"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
						tmp_adev->gmc.xgmi.node_id,
						tmp_adev->gmc.xgmi.hive_id, ret);
					/* To do: continue with some node failed or disable the whole hive */
					goto exit_unlock;
				}
			}
		}

		/* get the topology again for hives that support extended data */
		if (adev->psp.xgmi_context.supports_extended_data) {

			/* initialize the hive to get extended data. */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
			if (ret)
				goto exit_unlock;

			/* get the extended data. */
			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
				ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
								 &tmp_adev->psp.xgmi_context.top_info, true);
				if (ret) {
					dev_err(tmp_adev->dev,
						"XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
						tmp_adev->gmc.xgmi.node_id,
						tmp_adev->gmc.xgmi.hive_id, ret);
					goto exit_unlock;
				}
			}

			/* initialize the hive to get non-extended data for the next round. */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
			if (ret)
				goto exit_unlock;

		}
	}

	if (!ret && !adev->gmc.xgmi.pending_reset)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret) {
		adev->hive = hive;
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	} else {
		amdgpu_put_xgmi_hive(hive);
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);
	}

	return ret;
}

int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = adev->hive;

	if (!adev->gmc.xgmi.supported)
		return -EINVAL;

	if (!hive)
		return -EINVAL;

	mutex_lock(&hive->hive_lock);
	task_barrier_rem_task(&hive->tb);
	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
	if (hive->hi_req_gpu == adev)
		hive->hi_req_gpu = NULL;
	list_del(&adev->gmc.xgmi.head);
	mutex_unlock(&hive->hive_lock);

	amdgpu_put_xgmi_hive(hive);
	adev->hive = NULL;

	if (atomic_dec_return(&hive->number_devices) == 0) {
		/* Remove the hive from the global hive list */
		mutex_lock(&xgmi_mutex);
		list_del(&hive->node);
		mutex_unlock(&xgmi_mutex);

		amdgpu_put_xgmi_hive(hive);
	}

	return 0;
}

static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
						struct aca_bank_report *report, void *data)
{
	struct amdgpu_device *adev = handle->adev;
	const char *error_str;
	u64 status;
	int ret, ext_error_code;

	ret = aca_bank_info_decode(bank, &report->info);
	if (ret)
		return ret;

	status = bank->regs[ACA_REG_IDX_STATUS];
	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);

	error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
		xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
	if (error_str)
		dev_info(adev->dev, "%s detected\n", error_str);

	if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
	    (type == ACA_ERROR_TYPE_CE && ext_error_code == 6))
		report->count[type] = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);

	return 0;
}

static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
	.aca_bank_generate_report = xgmi_v6_4_0_aca_bank_generate_report,
};

static const struct aca_info xgmi_v6_4_0_aca_info = {
	.hwip = ACA_HWIP_TYPE_PCS_XGMI,
	.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
	.bank_ops = &xgmi_v6_4_0_aca_bank_ops,
};

static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int r;

	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);

	r = amdgpu_ras_block_late_init(adev, ras_block);
	if (r)
		return r;

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
		r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL,
					&xgmi_v6_4_0_aca_info, NULL);
		if (r)
			goto late_fini;
		break;
	default:
		break;
	}

	return 0;

late_fini:
	amdgpu_ras_block_late_fini(adev, ras_block);

	return r;
}

uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr)
{
	struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;

	return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
}

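/* Clear a PCS error status register by writing all ones, then zero. */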
static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
	WREG32_PCIE(pcs_status_reg, 0);
}

static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
{
	uint32_t i;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_arct[i]);
		break;
	case CHIP_VEGA20:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_vg20[i]);
		break;
	case CHIP_ALDEBARAN:
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
			pcs_clear_status(adev,
					 xgmi3x16_pcs_err_status_reg_aldebaran[i]);
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_aldebaran); i++)
			pcs_clear_status(adev,
					 wafl_pcs_err_status_reg_aldebaran[i]);
		break;
	default:
		break;
	}

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++)
			pcs_clear_status(adev,
					 xgmi3x16_pcs_err_status_reg_v6_4[i]);
		break;
	default:
		break;
	}
}

static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst, u64 mca_base)
{
	WREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS, 0ULL);
}

static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
		__xgmi_v6_4_0_reset_error_count(adev, xgmi_inst, xgmi_v6_4_0_mca_base_array[i]);
}

static void xgmi_v6_4_0_reset_ras_error_count(struct amdgpu_device *adev)
{
	int i;

	for_each_inst(i, adev->aid_mask)
		xgmi_v6_4_0_reset_error_count(adev, i);
}

static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
		xgmi_v6_4_0_reset_ras_error_count(adev);
		break;
	default:
		amdgpu_xgmi_legacy_reset_ras_error_count(adev);
		break;
	}
}

static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
					      uint32_t value,
					      uint32_t mask_value,
					      uint32_t *ue_count,
					      uint32_t *ce_count,
					      bool is_xgmi_pcs,
					      bool check_mask)
{
	int i;
	int ue_cnt = 0;
	const struct amdgpu_pcs_ras_field *pcs_ras_fields = NULL;
	uint32_t field_array_size = 0;

	if (is_xgmi_pcs) {
		if (amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
		    IP_VERSION(6, 1, 0) ||
		    amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
		    IP_VERSION(6, 4, 0)) {
			pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0];
			field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields);
		} else {
			pcs_ras_fields = &xgmi_pcs_ras_fields[0];
			field_array_size = ARRAY_SIZE(xgmi_pcs_ras_fields);
		}
	} else {
		pcs_ras_fields = &wafl_pcs_ras_fields[0];
		field_array_size = ARRAY_SIZE(wafl_pcs_ras_fields);
	}

	if (check_mask)
		value = value & ~mask_value;

	/* query xgmi/wafl pcs error status,
	 * only ue is supported */
	for (i = 0; value && i < field_array_size; i++) {
		ue_cnt = (value &
			  pcs_ras_fields[i].pcs_err_mask) >>
			 pcs_ras_fields[i].pcs_err_shift;
		if (ue_cnt) {
			dev_info(adev->dev, "%s detected\n",
				 pcs_ras_fields[i].err_name);
			*ue_count += ue_cnt;
		}

		/* reset the bit value once the bit has been checked */
		value &= ~(pcs_ras_fields[i].pcs_err_mask);
	}

	return 0;
}

static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
						     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	int i, supported = 1;
	uint32_t data, mask_data = 0;
	uint32_t ue_cnt = 0, ce_cnt = 0;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
		return;

	err_data->ue_count = 0;
	err_data->ce_count = 0;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, true, false);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, false, false);
		}
		break;
	case CHIP_VEGA20:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, true, false);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, false, false);
		}
		break;
	case CHIP_ALDEBARAN:
		/* check xgmi3x16 pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
			data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
			mask_data =
				RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, true, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_aldebaran); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_aldebaran[i]);
			mask_data =
				RREG32_PCIE(wafl_pcs_err_noncorrectable_mask_reg_aldebaran[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, false, true);
		}
		break;
	default:
		supported = 0;
		break;
	}

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
		/* check xgmi3x16 pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) {
			data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]);
			mask_data =
				RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, true, true);
		}
		break;
	default:
		if (!supported)
			dev_warn(adev->dev, "XGMI RAS error query not supported");
		break;
	}

	amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);

	err_data->ue_count += ue_cnt;
	err_data->ce_count += ce_cnt;
}

static enum aca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status)
{
	const char *error_str;
	int ext_error_code;

	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);

	error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
		xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
	if (error_str)
		dev_info(adev->dev, "%s detected\n", error_str);

	switch (ext_error_code) {
	case 0:
		return ACA_ERROR_TYPE_UE;
	case 6:
		return ACA_ERROR_TYPE_CE;
	default:
		return -EINVAL;
	}
}

static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info,
					    u64 mca_base, struct ras_err_data *err_data)
{
	int xgmi_inst = mcm_info->die_id;
	u64 status = 0;

	status = RREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS);
	if (!ACA_REG__STATUS__VAL(status))
		return;

	switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
	case ACA_ERROR_TYPE_UE:
		amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, NULL, 1ULL);
		break;
	case ACA_ERROR_TYPE_CE:
		amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, NULL, 1ULL);
		break;
	default:
		break;
	}

	WREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS, 0ULL);
}

static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data)
{
	struct amdgpu_smuio_mcm_config_info mcm_info = {
		.socket_id = adev->smuio.funcs->get_socket_id(adev),
		.die_id = xgmi_inst,
	};
	int i;

	for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
		__xgmi_v6_4_0_query_error_count(adev, &mcm_info, xgmi_v6_4_0_mca_base_array[i], err_data);
}

static void xgmi_v6_4_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	int i;

	for_each_inst(i, adev->aid_mask)
		xgmi_v6_4_0_query_error_count(adev, i, err_data);
}

static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
					      void *ras_error_status)
{
	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
		xgmi_v6_4_0_query_ras_error_count(adev, ras_error_status);
		break;
	default:
		amdgpu_xgmi_legacy_query_ras_error_count(adev, ras_error_status);
		break;
	}
}

/* Trigger XGMI/WAFL error */
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
					void *inject_if, uint32_t instance_mask)
{
	int ret1, ret2;
	struct ta_ras_trigger_error_input *block_info =
		(struct ta_ras_trigger_error_input *)inject_if;

	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
		dev_warn(adev->dev, "Failed to disallow df cstate");

	ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW);
	if (ret1 && ret1 != -EOPNOTSUPP)
		dev_warn(adev->dev, "Failed to disallow XGMI power down");

	ret2 = psp_ras_trigger_error(&adev->psp, block_info, instance_mask);

	if (amdgpu_ras_intr_triggered())
		return ret2;

	ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT);
	if (ret1 && ret1 != -EOPNOTSUPP)
		dev_warn(adev->dev, "Failed to allow XGMI power down");

	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
		dev_warn(adev->dev, "Failed to allow df cstate");

	return ret2;
}

struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = {
	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
	.ras_error_inject = amdgpu_ras_error_inject_xgmi,
};

struct amdgpu_xgmi_ras xgmi_ras = {
	.ras_block = {
		.hw_ops = &xgmi_ras_hw_ops,
		.ras_late_init = amdgpu_xgmi_ras_late_init,
	},
};

int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
{
	int err;
	struct amdgpu_xgmi_ras *ras;

	if (!adev->gmc.xgmi.ras)
		return 0;

	ras = adev->gmc.xgmi.ras;
	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
	if (err) {
		dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras block!\n");
		return err;
	}

	strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl");
	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
	adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm;

	return 0;
}