1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | * |
22 | * |
23 | */ |
24 | #ifndef _AMDGPU_RAS_H |
25 | #define _AMDGPU_RAS_H |
26 | |
27 | #include <linux/debugfs.h> |
28 | #include <linux/list.h> |
29 | #include "ta_ras_if.h" |
30 | #include "amdgpu_ras_eeprom.h" |
31 | #include "amdgpu_smuio.h" |
32 | |
33 | struct amdgpu_iv_entry; |
34 | |
35 | #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) |
36 | /* position of instance value in sub_block_index of |
37 | * ta_ras_trigger_error_input, the sub block uses lower 12 bits |
38 | */ |
39 | #define AMDGPU_RAS_INST_MASK 0xfffff000 |
40 | #define AMDGPU_RAS_INST_SHIFT 0xc |
41 | |
/*
 * IP blocks that can report RAS (Reliability, Availability, Serviceability)
 * errors.  The ordinal value of each entry is used as a bit position in
 * AMDGPU_RAS_BLOCK_MASK, so new blocks must be appended just before
 * AMDGPU_RAS_BLOCK__LAST and existing values must never be reordered.
 */
enum amdgpu_ras_block {
	AMDGPU_RAS_BLOCK__UMC = 0,
	AMDGPU_RAS_BLOCK__SDMA,
	AMDGPU_RAS_BLOCK__GFX,
	AMDGPU_RAS_BLOCK__MMHUB,
	AMDGPU_RAS_BLOCK__ATHUB,
	AMDGPU_RAS_BLOCK__PCIE_BIF,
	AMDGPU_RAS_BLOCK__HDP,
	AMDGPU_RAS_BLOCK__XGMI_WAFL,
	AMDGPU_RAS_BLOCK__DF,
	AMDGPU_RAS_BLOCK__SMN,
	AMDGPU_RAS_BLOCK__SEM,
	AMDGPU_RAS_BLOCK__MP0,
	AMDGPU_RAS_BLOCK__MP1,
	AMDGPU_RAS_BLOCK__FUSE,
	AMDGPU_RAS_BLOCK__MCA,
	AMDGPU_RAS_BLOCK__VCN,
	AMDGPU_RAS_BLOCK__JPEG,

	/* number of valid blocks; not a block itself */
	AMDGPU_RAS_BLOCK__LAST
};
63 | |
/*
 * Sub-blocks reported through the MCA (machine check architecture) path.
 * NOTE(review): presumably these refine AMDGPU_RAS_BLOCK__MCA — confirm
 * against the MCA RAS handling code.
 */
enum amdgpu_ras_mca_block {
	AMDGPU_RAS_MCA_BLOCK__MP0 = 0,
	AMDGPU_RAS_MCA_BLOCK__MP1,
	AMDGPU_RAS_MCA_BLOCK__MPIO,
	AMDGPU_RAS_MCA_BLOCK__IOHC,

	/* number of valid MCA blocks; not a block itself */
	AMDGPU_RAS_MCA_BLOCK__LAST
};
72 | |
73 | #define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST |
74 | #define AMDGPU_RAS_MCA_BLOCK_COUNT AMDGPU_RAS_MCA_BLOCK__LAST |
75 | #define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1) |
76 | |
/*
 * GFX sub-blocks addressable through ras_common_if::sub_block_index.
 * Each hardware unit (CPC, CPF, ..., EA, UTC) occupies a contiguous
 * index range delimited by *_INDEX_START/*_INDEX_END aliases; some units
 * are further split into numbered sub-ranges (*_INDEX0_*, *_INDEX1_*, ...).
 * The numeric values form the sub-block index ABI — do not reorder.
 */
enum amdgpu_ras_gfx_subblock {
	/* CPC */
	AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START = 0,
	AMDGPU_RAS_BLOCK__GFX_CPC_SCRATCH =
		AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPC_UCODE,
	AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME2,
	AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME2,
	AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2,
	AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2,
	/* CPF */
	AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME2 =
		AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME1,
	AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
	/* CPG */
	AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPG_DMA_ROQ =
		AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPG_DMA_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
	/* GDS */
	AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_GDS_MEM = AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_GDS_INPUT_QUEUE,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_CMD_RAM_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_DATA_RAM_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM,
	/* SPI */
	AMDGPU_RAS_BLOCK__GFX_SPI_SR_MEM,
	/* SQ */
	AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQ_SGPR = AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D,
	AMDGPU_RAS_BLOCK__GFX_SQ_LDS_I,
	AMDGPU_RAS_BLOCK__GFX_SQ_VGPR,
	AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_END = AMDGPU_RAS_BLOCK__GFX_SQ_VGPR,
	/* SQC (3 ranges) */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START,
	/* SQC range 0 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_UTCL1_LFIFO =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
	/* SQC range 1 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_TAG_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_HIT_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM,
	/* SQC range 2 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_TAG_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_HIT_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END,
	/* TA */
	AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO =
		AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_AFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FL_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FX_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO,
	/* TCA */
	AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCA_HOLE_FIFO =
		AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO,
	/* TCC (5 sub-ranges) */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START,
	/* TCC range 0 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_0,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_HIGH_RATE_TAG,
	AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG,
	/* TCC range 1 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_DEC =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER,
	/* TCC range 2 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_DATA =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_CONTROL,
	AMDGPU_RAS_BLOCK__GFX_TCC_UC_ATOMIC_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_RETURN,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_CACHE_READ,
	AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO_NEXT_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO,
	/* TCC range 3 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
	/* TCC range 4 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRRET_TAG_WRITE_RETURN =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END,
	/* TCI */
	AMDGPU_RAS_BLOCK__GFX_TCI_WRITE_RAM,
	/* TCP */
	AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM =
		AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCP_LFIFO_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCP_CMD_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCP_VM_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCP_DB_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO0,
	AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1,
	AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1,
	/* TD */
	AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO =
		AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_HI,
	AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TD_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO,
	/* EA (3 sub-ranges) */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START,
	/* EA range 0 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_RRET_TAGMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_WRET_TAGMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM,
	/* EA range 1 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_PAGEMEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IORD_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IOWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IOWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM,
	/* EA range 2 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D0MEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D1MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D2MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END,
	/* UTC VM L2 bank */
	AMDGPU_RAS_BLOCK__UTC_VML2_BANK_CACHE,
	/* UTC VM walker */
	AMDGPU_RAS_BLOCK__UTC_VML2_WALKER,
	/* UTC ATC L2 2MB cache */
	AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_2M_BANK,
	/* UTC ATC L2 4KB cache */
	AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_4K_BANK,
	AMDGPU_RAS_BLOCK__GFX_MAX
};
306 | |
/*
 * RAS error categories.  Values are distinct single bits (1, 2, 4, 8) so
 * they can be OR-ed together into an error-type mask.
 */
enum amdgpu_ras_error_type {
	AMDGPU_RAS_ERROR__NONE = 0,
	AMDGPU_RAS_ERROR__PARITY = 1,
	AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE = 2,
	AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE = 4,
	AMDGPU_RAS_ERROR__POISON = 8,
};
314 | |
/* Internal result codes for RAS query/handling paths. */
enum amdgpu_ras_ret {
	AMDGPU_RAS_SUCCESS = 0,
	AMDGPU_RAS_FAIL,
	AMDGPU_RAS_UE,	/* uncorrectable error observed */
	AMDGPU_RAS_CE,	/* correctable error observed */
	AMDGPU_RAS_PT,	/* NOTE(review): presumably "poison" — confirm with callers */
};
322 | |
/* ras error status register fields */
324 | #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT 0x0 |
325 | #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK 0x00000001L |
326 | #define ERR_STATUS_LO__MEMORY_ID__SHIFT 0x18 |
327 | #define ERR_STATUS_LO__MEMORY_ID_MASK 0xFF000000L |
328 | #define ERR_STATUS_HI__ERR_INFO_VALID_FLAG__SHIFT 0x2 |
329 | #define ERR_STATUS_HI__ERR_INFO_VALID_FLAG_MASK 0x00000004L |
330 | #define ERR_STATUS__ERR_CNT__SHIFT 0x17 |
331 | #define ERR_STATUS__ERR_CNT_MASK 0x03800000L |
332 | |
333 | #define AMDGPU_RAS_REG_ENTRY(ip, inst, reg_lo, reg_hi) \ |
334 | ip##_HWIP, inst, reg_lo##_BASE_IDX, reg_lo, reg_hi##_BASE_IDX, reg_hi |
335 | |
336 | #define AMDGPU_RAS_REG_ENTRY_OFFSET(hwip, ip_inst, segment, reg) \ |
337 | (adev->reg_offset[hwip][ip_inst][segment] + (reg)) |
338 | |
339 | #define AMDGPU_RAS_ERR_INFO_VALID (1 << 0) |
340 | #define AMDGPU_RAS_ERR_STATUS_VALID (1 << 1) |
341 | #define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2) |
342 | |
343 | #define AMDGPU_RAS_GPU_RESET_MODE2_RESET (0x1 << 0) |
344 | #define AMDGPU_RAS_GPU_RESET_MODE1_RESET (0x1 << 1) |
345 | |
/*
 * Describes one pair of error-status registers (low/high word) for a RAS
 * block instance; typically initialized via AMDGPU_RAS_REG_ENTRY().
 */
struct amdgpu_ras_err_status_reg_entry {
	uint32_t hwip;		/* hardware IP id (…_HWIP) */
	uint32_t ip_inst;	/* IP instance index */
	uint32_t seg_lo;	/* register segment (BASE_IDX) of the low register */
	uint32_t reg_lo;	/* low status register offset */
	uint32_t seg_hi;	/* register segment (BASE_IDX) of the high register */
	uint32_t reg_hi;	/* high status register offset */
	uint32_t reg_inst;	/* register instance index */
	uint32_t flags;		/* AMDGPU_RAS_ERR_*_VALID bits */
	const char *block_name;	/* human-readable block name for logging */
};
357 | |
/* Maps a hardware memory id (ERR_STATUS_LO MEMORY_ID field) to a name. */
struct amdgpu_ras_memory_id_entry {
	uint32_t memory_id;
	const char *name;
};
362 | |
/*
 * Common descriptor identifying a RAS feature: which IP block, what error
 * type, and (optionally) which sub-block.  Embedded at the head of all the
 * ras_*_if interface structs below.
 */
struct ras_common_if {
	enum amdgpu_ras_block block;
	enum amdgpu_ras_error_type type;
	/* sub-block in the block's own sub-block enum; the upper bits may
	 * carry an instance value (see AMDGPU_RAS_INST_MASK/SHIFT). */
	uint32_t sub_block_index;
	char name[32];
};
369 | |
370 | #define MAX_UMC_CHANNEL_NUM 32 |
371 | |
/* Per-UMC-channel ECC info as reported by the SMU ecc table. */
struct ecc_info_per_ch {
	uint16_t ce_count_lo_chip;	/* correctable error count, low chip */
	uint16_t ce_count_hi_chip;	/* correctable error count, high chip */
	uint64_t mca_umc_status;	/* raw MCA UMC status register value */
	uint64_t mca_umc_addr;		/* raw MCA UMC error address */
	uint64_t mca_ceumc_addr;	/* correctable-error address, if recorded */
};
379 | |
/* UMC ECC information queried from the SMU, one entry per channel. */
struct umc_ecc_info {
	struct ecc_info_per_ch ecc[MAX_UMC_CHANNEL_NUM];

	/* Determine smu ecctable whether support
	 * record correctable error address
	 */
	int record_ce_addr_supported;
};
388 | |
/*
 * Per-device RAS context: enabled features, sysfs/debugfs handles, the
 * list of per-block ras_manager objects, bad-page (EEPROM) bookkeeping,
 * and GPU-recovery state.  One instance per amdgpu_device (see
 * amdgpu_ras_get_context()/amdgpu_ras_set_context()).
 */
struct amdgpu_ras {
	/* ras infrastructure */
	/* for ras itself. */
	uint32_t features;	/* bitmask of enabled RAS blocks */
	uint32_t schema;
	struct list_head head;	/* list of ras_manager nodes */
	/* sysfs */
	struct device_attribute features_attr;
	struct device_attribute version_attr;
	struct device_attribute schema_attr;
	struct bin_attribute badpages_attr;
	struct dentry *de_ras_eeprom_table;	/* debugfs eeprom table node */
	/* block array */
	struct ras_manager *objs;

	/* gpu recovery */
	struct work_struct recovery_work;
	atomic_t in_recovery;	/* nonzero while a recovery is in flight */
	struct amdgpu_device *adev;
	/* error handler data */
	struct ras_err_handler_data *eh_data;
	struct mutex recovery_lock;	/* protects eh_data */

	uint32_t flags;		/* AMDGPU_RAS_FLAG_* */
	bool reboot;		/* reboot instead of reset on fatal error */
	struct amdgpu_ras_eeprom_control eeprom_control;

	bool error_query_ready;

	/* bad page count threshold */
	uint32_t bad_page_cnt_threshold;

	/* disable ras error count harvest in recovery */
	bool disable_ras_err_cnt_harvest;

	/* is poison mode supported */
	bool poison_supported;

	/* RAS count errors delayed work */
	struct delayed_work ras_counte_delay_work;
	atomic_t ras_ue_count;	/* cached uncorrectable error total */
	atomic_t ras_ce_count;	/* cached correctable error total */

	/* record umc error info queried from smu */
	struct umc_ecc_info umc_ecc;

	/* Indicates smu whether need update bad channel info */
	bool update_channel_flag;
	/* Record status of smu mca debug mode */
	bool is_mca_debug_mode;

	/* Record special requirements of gpu reset caller */
	uint32_t gpu_reset_flags;	/* AMDGPU_RAS_GPU_RESET_MODE*_RESET */
};
443 | |
/* Names of the per-block sysfs and debugfs nodes. */
struct ras_fs_data {
	char sysfs_name[48];
	char debugfs_name[32];
};
448 | |
/* Error counts attributed to one MCM (die/socket) configuration. */
struct ras_err_info {
	struct amdgpu_smuio_mcm_config_info mcm_info;
	u64 ce_count;	/* correctable errors */
	u64 ue_count;	/* uncorrectable errors */
};
454 | |
/* List node wrapping one ras_err_info on ras_err_data::err_node_list. */
struct ras_err_node {
	struct list_head node;
	struct ras_err_info err_info;
};
459 | |
/*
 * Aggregated error-query result: total CE/UE counts, an optional array of
 * faulting addresses, and a per-MCM breakdown list (see for_each_ras_error).
 */
struct ras_err_data {
	unsigned long ue_count;		/* total uncorrectable errors */
	unsigned long ce_count;		/* total correctable errors */
	unsigned long err_addr_cnt;	/* number of valid entries in err_addr */
	struct eeprom_table_record *err_addr;	/* faulting addresses, may be NULL */
	u32 err_list_count;		/* number of nodes on err_node_list */
	struct list_head err_node_list;	/* list of struct ras_err_node */
};
468 | |
469 | #define for_each_ras_error(err_node, err_data) \ |
470 | list_for_each_entry(err_node, &(err_data)->err_node_list, node) |
471 | |
/* Bad-page bookkeeping used by the recovery path (protected by
 * amdgpu_ras::recovery_lock). */
struct ras_err_handler_data {
	/* point to bad page records array */
	struct eeprom_table_record *bps;
	/* the count of entries */
	int count;
	/* the space can place new entries */
	int space_left;
};
480 | |
/* Per-IP RAS interrupt callback, invoked from the interrupt bottom half
 * with the raw IV ring entry; err_data points at a struct ras_err_data. */
typedef int (*ras_ih_cb)(struct amdgpu_device *adev,
		void *err_data,
		struct amdgpu_iv_entry *entry);
484 | |
/*
 * Per-block interrupt plumbing: a ring buffer of IV entries filled at
 * interrupt time and drained by ih_work, which invokes the IP's callback.
 */
struct ras_ih_data {
	/* interrupt bottom half */
	struct work_struct ih_work;
	int inuse;
	/* IP callback */
	ras_ih_cb cb;
	/* full of entries */
	unsigned char *ring;
	unsigned int ring_size;		/* total ring size in bytes */
	unsigned int element_size;	/* size of one IV entry */
	unsigned int aligned_element_size;	/* element size after alignment */
	unsigned int rptr;	/* read offset into ring */
	unsigned int wptr;	/* write offset into ring */
};
499 | |
/*
 * Per-(block, error-type) RAS object: links the feature descriptor with
 * its sysfs node, interrupt data and accumulated error statistics.
 * Lives on amdgpu_ras::head.
 */
struct ras_manager {
	struct ras_common_if head;
	/* reference count */
	int use;
	/* ras block link */
	struct list_head node;
	/* the device */
	struct amdgpu_device *adev;
	/* sysfs */
	struct device_attribute sysfs_attr;
	int attr_inuse;

	/* fs node name */
	struct ras_fs_data fs_data;

	/* IH data */
	struct ras_ih_data ih_data;

	struct ras_err_data err_data;
};
520 | |
/* One retired ("bad") page as exported through the badpages sysfs file. */
struct ras_badpage {
	unsigned int bp;	/* page frame number */
	unsigned int size;	/* page size in bytes */
	unsigned int flags;	/* reservation status */
};
526 | |
527 | /* interfaces for IP */ |
/* IP-facing descriptor for creating a block's sysfs/debugfs nodes. */
struct ras_fs_if {
	struct ras_common_if head;
	const char* sysfs_name;
	char debugfs_name[32];
};
533 | |
/* Error-count query: head selects the block, counts are filled in. */
struct ras_query_if {
	struct ras_common_if head;
	unsigned long ue_count;	/* out: uncorrectable errors */
	unsigned long ce_count;	/* out: correctable errors */
};
539 | |
/* Parameters for error injection into the block selected by head. */
struct ras_inject_if {
	struct ras_common_if head;
	uint64_t address;	/* target address to inject at */
	uint64_t value;		/* injection payload/value */
	uint32_t instance_mask;	/* which block instances to inject into */
};
546 | |
/* Request to cure/clear an error at a given address in a block. */
struct ras_cure_if {
	struct ras_common_if head;
	uint64_t address;
};
551 | |
/* Registration of an IP interrupt callback for the block in head. */
struct ras_ih_if {
	struct ras_common_if head;
	ras_ih_cb cb;
};
556 | |
/* An IV entry to be dispatched to the handler of the block in head. */
struct ras_dispatch_if {
	struct ras_common_if head;
	struct amdgpu_iv_entry *entry;
};
561 | |
/*
 * debugfs control command: op selects the operation; the union is read as
 * a plain head for enable/disable-style ops or as a full inject request.
 */
struct ras_debug_if {
	union {
		struct ras_common_if head;
		struct ras_inject_if inject;
	};
	int op;
};
569 | |
/*
 * Per-IP RAS block registration (see amdgpu_ras_register_ras_block()):
 * identity plus lifecycle hooks and the hardware-access ops table.
 */
struct amdgpu_ras_block_object {
	struct ras_common_if ras_comm;	/* block identity */

	/* match this object against a (block, sub_block_index) pair */
	int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
				enum amdgpu_ras_block block, uint32_t sub_block_index);
	int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
	void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
	ras_ih_cb ras_cb;	/* interrupt callback, may be NULL */
	const struct amdgpu_ras_block_hw_ops *hw_ops;
};
580 | |
/* Hardware-level RAS operations an IP block may implement; any hook may
 * be NULL when unsupported. */
struct amdgpu_ras_block_hw_ops {
	/* inject an error described by inject_if (a struct ras_inject_if) */
	int (*ras_error_inject)(struct amdgpu_device *adev,
			void *inject_if, uint32_t instance_mask);
	/* ras_error_status points at a struct ras_err_data */
	void (*query_ras_error_count)(struct amdgpu_device *adev, void *ras_error_status);
	void (*query_ras_error_status)(struct amdgpu_device *adev);
	void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);
	void (*reset_ras_error_count)(struct amdgpu_device *adev);
	void (*reset_ras_error_status)(struct amdgpu_device *adev);
	bool (*query_poison_status)(struct amdgpu_device *adev);
	bool (*handle_poison_consumption)(struct amdgpu_device *adev);
};
592 | |
593 | /* work flow |
594 | * vbios |
595 | * 1: ras feature enable (enabled by default) |
596 | * psp |
597 | * 2: ras framework init (in ip_init) |
598 | * IP |
599 | * 3: IH add |
600 | * 4: debugfs/sysfs create |
601 | * 5: query/inject |
602 | * 6: debugfs/sysfs remove |
603 | * 7: IH remove |
604 | * 8: feature disable |
605 | */ |
606 | |
607 | |
608 | int amdgpu_ras_recovery_init(struct amdgpu_device *adev); |
609 | |
610 | void amdgpu_ras_resume(struct amdgpu_device *adev); |
611 | void amdgpu_ras_suspend(struct amdgpu_device *adev); |
612 | |
613 | int amdgpu_ras_query_error_count(struct amdgpu_device *adev, |
614 | unsigned long *ce_count, |
615 | unsigned long *ue_count, |
616 | struct ras_query_if *query_info); |
617 | |
618 | /* error handling functions */ |
619 | int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, |
620 | struct eeprom_table_record *bps, int pages); |
621 | |
622 | int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, |
623 | unsigned long *new_cnt); |
624 | |
625 | static inline enum ta_ras_block |
626 | amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) { |
627 | switch (block) { |
628 | case AMDGPU_RAS_BLOCK__UMC: |
629 | return TA_RAS_BLOCK__UMC; |
630 | case AMDGPU_RAS_BLOCK__SDMA: |
631 | return TA_RAS_BLOCK__SDMA; |
632 | case AMDGPU_RAS_BLOCK__GFX: |
633 | return TA_RAS_BLOCK__GFX; |
634 | case AMDGPU_RAS_BLOCK__MMHUB: |
635 | return TA_RAS_BLOCK__MMHUB; |
636 | case AMDGPU_RAS_BLOCK__ATHUB: |
637 | return TA_RAS_BLOCK__ATHUB; |
638 | case AMDGPU_RAS_BLOCK__PCIE_BIF: |
639 | return TA_RAS_BLOCK__PCIE_BIF; |
640 | case AMDGPU_RAS_BLOCK__HDP: |
641 | return TA_RAS_BLOCK__HDP; |
642 | case AMDGPU_RAS_BLOCK__XGMI_WAFL: |
643 | return TA_RAS_BLOCK__XGMI_WAFL; |
644 | case AMDGPU_RAS_BLOCK__DF: |
645 | return TA_RAS_BLOCK__DF; |
646 | case AMDGPU_RAS_BLOCK__SMN: |
647 | return TA_RAS_BLOCK__SMN; |
648 | case AMDGPU_RAS_BLOCK__SEM: |
649 | return TA_RAS_BLOCK__SEM; |
650 | case AMDGPU_RAS_BLOCK__MP0: |
651 | return TA_RAS_BLOCK__MP0; |
652 | case AMDGPU_RAS_BLOCK__MP1: |
653 | return TA_RAS_BLOCK__MP1; |
654 | case AMDGPU_RAS_BLOCK__FUSE: |
655 | return TA_RAS_BLOCK__FUSE; |
656 | case AMDGPU_RAS_BLOCK__MCA: |
657 | return TA_RAS_BLOCK__MCA; |
658 | case AMDGPU_RAS_BLOCK__VCN: |
659 | return TA_RAS_BLOCK__VCN; |
660 | case AMDGPU_RAS_BLOCK__JPEG: |
661 | return TA_RAS_BLOCK__JPEG; |
662 | default: |
663 | WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n" , block); |
664 | return TA_RAS_BLOCK__UMC; |
665 | } |
666 | } |
667 | |
668 | static inline enum ta_ras_error_type |
669 | amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) { |
670 | switch (error) { |
671 | case AMDGPU_RAS_ERROR__NONE: |
672 | return TA_RAS_ERROR__NONE; |
673 | case AMDGPU_RAS_ERROR__PARITY: |
674 | return TA_RAS_ERROR__PARITY; |
675 | case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: |
676 | return TA_RAS_ERROR__SINGLE_CORRECTABLE; |
677 | case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: |
678 | return TA_RAS_ERROR__MULTI_UNCORRECTABLE; |
679 | case AMDGPU_RAS_ERROR__POISON: |
680 | return TA_RAS_ERROR__POISON; |
681 | default: |
682 | WARN_ONCE(1, "RAS ERROR: unexpected error type %d\n" , error); |
683 | return TA_RAS_ERROR__NONE; |
684 | } |
685 | } |
686 | |
687 | /* called in ip_init and ip_fini */ |
688 | int amdgpu_ras_init(struct amdgpu_device *adev); |
689 | int amdgpu_ras_late_init(struct amdgpu_device *adev); |
690 | int amdgpu_ras_fini(struct amdgpu_device *adev); |
691 | int amdgpu_ras_pre_fini(struct amdgpu_device *adev); |
692 | |
693 | int amdgpu_ras_block_late_init(struct amdgpu_device *adev, |
694 | struct ras_common_if *ras_block); |
695 | |
696 | void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, |
697 | struct ras_common_if *ras_block); |
698 | |
699 | int amdgpu_ras_feature_enable(struct amdgpu_device *adev, |
700 | struct ras_common_if *head, bool enable); |
701 | |
702 | int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, |
703 | struct ras_common_if *head, bool enable); |
704 | |
705 | int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, |
706 | struct ras_common_if *head); |
707 | |
708 | int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, |
709 | struct ras_common_if *head); |
710 | |
711 | void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev); |
712 | |
713 | int amdgpu_ras_query_error_status(struct amdgpu_device *adev, |
714 | struct ras_query_if *info); |
715 | |
716 | int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, |
717 | enum amdgpu_ras_block block); |
718 | int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, |
719 | enum amdgpu_ras_block block); |
720 | |
721 | int amdgpu_ras_error_inject(struct amdgpu_device *adev, |
722 | struct ras_inject_if *info); |
723 | |
724 | int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, |
725 | struct ras_common_if *head); |
726 | |
727 | int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, |
728 | struct ras_common_if *head); |
729 | |
730 | int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, |
731 | struct ras_dispatch_if *info); |
732 | |
733 | struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, |
734 | struct ras_common_if *head); |
735 | |
736 | extern atomic_t amdgpu_ras_in_intr; |
737 | |
738 | static inline bool amdgpu_ras_intr_triggered(void) |
739 | { |
740 | return !!atomic_read(v: &amdgpu_ras_in_intr); |
741 | } |
742 | |
743 | static inline void amdgpu_ras_intr_cleared(void) |
744 | { |
745 | atomic_set(v: &amdgpu_ras_in_intr, i: 0); |
746 | } |
747 | |
748 | void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); |
749 | |
750 | void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready); |
751 | |
752 | bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev); |
753 | |
754 | void amdgpu_release_ras_context(struct amdgpu_device *adev); |
755 | |
756 | int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev); |
757 | |
758 | const char *get_ras_block_str(struct ras_common_if *ras_block); |
759 | |
760 | bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev); |
761 | |
762 | int amdgpu_ras_is_supported(struct amdgpu_device *adev, unsigned int block); |
763 | |
764 | int amdgpu_ras_reset_gpu(struct amdgpu_device *adev); |
765 | |
766 | struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); |
767 | |
768 | int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); |
769 | |
770 | void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); |
771 | bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); |
772 | |
773 | int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, |
774 | struct amdgpu_ras_block_object *ras_block_obj); |
775 | void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); |
776 | void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name); |
777 | bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, |
778 | const struct amdgpu_ras_err_status_reg_entry *reg_entry, |
779 | uint32_t instance, |
780 | uint32_t *memory_id); |
781 | bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, |
782 | const struct amdgpu_ras_err_status_reg_entry *reg_entry, |
783 | uint32_t instance, |
784 | unsigned long *err_cnt); |
785 | void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, |
786 | const struct amdgpu_ras_err_status_reg_entry *reg_list, |
787 | uint32_t reg_list_size, |
788 | const struct amdgpu_ras_memory_id_entry *mem_list, |
789 | uint32_t mem_list_size, |
790 | uint32_t instance, |
791 | uint32_t err_type, |
792 | unsigned long *err_count); |
793 | void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, |
794 | const struct amdgpu_ras_err_status_reg_entry *reg_list, |
795 | uint32_t reg_list_size, |
796 | uint32_t instance); |
797 | |
798 | int amdgpu_ras_error_data_init(struct ras_err_data *err_data); |
799 | void amdgpu_ras_error_data_fini(struct ras_err_data *err_data); |
800 | int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, |
801 | struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count); |
802 | int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, |
803 | struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count); |
804 | |
805 | #endif |
806 | |