1 | /* |
2 | * Copyright (C) 2021 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included |
12 | * in all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
15 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN |
18 | * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
20 | */ |
21 | #ifndef __AMDGPU_MCA_H__ |
22 | #define __AMDGPU_MCA_H__ |
23 | |
24 | #include "amdgpu_ras.h" |
25 | |
/* Number of 64-bit register slots kept per MCA bank (sizes mca_bank_entry.regs[]). */
#define MCA_MAX_REGS_COUNT	(16)
27 | |
/*
 * MCA_REG_FIELD() - extract the bit range [h:l] from a 64-bit register value.
 * @x: register value
 * @h: highest bit of the field (inclusive)
 * @l: lowest bit of the field (inclusive)
 *
 * The value is masked with GENMASK_ULL(h, l) and shifted down so that the
 * field starts at bit 0.  @l is expanded twice (once for the mask, once for
 * the shift), so pass an expression without side effects.  The shift count is
 * parenthesized so that a compound expression passed as @l is not split up by
 * operator precedence.
 */
#define MCA_REG_FIELD(x, h, l)			(((x) & GENMASK_ULL(h, l)) >> (l))

/* Accessors for the fields of a STATUS register value, named per field. */
#define MCA_REG__STATUS__VAL(x)			MCA_REG_FIELD(x, 63, 63)
#define MCA_REG__STATUS__OVERFLOW(x)		MCA_REG_FIELD(x, 62, 62)
#define MCA_REG__STATUS__UC(x)			MCA_REG_FIELD(x, 61, 61)
#define MCA_REG__STATUS__EN(x)			MCA_REG_FIELD(x, 60, 60)
#define MCA_REG__STATUS__MISCV(x)		MCA_REG_FIELD(x, 59, 59)
#define MCA_REG__STATUS__ADDRV(x)		MCA_REG_FIELD(x, 58, 58)
#define MCA_REG__STATUS__PCC(x)			MCA_REG_FIELD(x, 57, 57)
#define MCA_REG__STATUS__ERRCOREIDVAL(x)	MCA_REG_FIELD(x, 56, 56)
#define MCA_REG__STATUS__TCC(x)			MCA_REG_FIELD(x, 55, 55)
#define MCA_REG__STATUS__SYNDV(x)		MCA_REG_FIELD(x, 53, 53)
#define MCA_REG__STATUS__CECC(x)		MCA_REG_FIELD(x, 46, 46)
#define MCA_REG__STATUS__UECC(x)		MCA_REG_FIELD(x, 45, 45)
#define MCA_REG__STATUS__DEFERRED(x)		MCA_REG_FIELD(x, 44, 44)
#define MCA_REG__STATUS__POISON(x)		MCA_REG_FIELD(x, 43, 43)
#define MCA_REG__STATUS__SCRUB(x)		MCA_REG_FIELD(x, 40, 40)
#define MCA_REG__STATUS__ERRCOREID(x)		MCA_REG_FIELD(x, 37, 32)
#define MCA_REG__STATUS__ADDRLSB(x)		MCA_REG_FIELD(x, 29, 24)
#define MCA_REG__STATUS__ERRORCODEEXT(x)	MCA_REG_FIELD(x, 21, 16)
#define MCA_REG__STATUS__ERRORCODE(x)		MCA_REG_FIELD(x, 15, 0)

/* Bits [43:32] of a MISC0 register value. */
#define MCA_REG__MISC0__ERRCNT(x)		MCA_REG_FIELD(x, 43, 32)

/* Bits [17:0] of a SYND register value. */
#define MCA_REG__SYND__ERRORINFORMATION(x)	MCA_REG_FIELD(x, 17, 0)
52 | |
/*
 * IP identifiers recorded for an MCA bank (stored in mca_bank_entry.ip).
 * Values are spelled out explicitly; AMDGPU_MCA_IP_COUNT follows the last
 * real entry and therefore equals the number of non-negative identifiers.
 */
enum amdgpu_mca_ip {
	AMDGPU_MCA_IP_UNKNOW	= -1,	/* sentinel; spelling kept as-is for existing users */
	AMDGPU_MCA_IP_PSP	= 0,
	AMDGPU_MCA_IP_SDMA	= 1,
	AMDGPU_MCA_IP_GC	= 2,
	AMDGPU_MCA_IP_SMU	= 3,
	AMDGPU_MCA_IP_MP5	= 4,
	AMDGPU_MCA_IP_UMC	= 5,
	AMDGPU_MCA_IP_PCS_XGMI	= 6,
	AMDGPU_MCA_IP_COUNT,		/* keep last */
};
64 | |
/*
 * Error class selector passed to the MCA query helpers and stored in
 * mca_bank_entry.type.
 * NOTE(review): UE/CE/DE presumably stand for uncorrectable, correctable and
 * deferred errors - confirm against the implementation.
 */
enum amdgpu_mca_error_type {
	AMDGPU_MCA_ERROR_TYPE_UE = 0,
	AMDGPU_MCA_ERROR_TYPE_CE = 1,
	AMDGPU_MCA_ERROR_TYPE_DE = 2,
};
70 | |
71 | struct amdgpu_mca_ras_block { |
72 | struct amdgpu_ras_block_object ras_block; |
73 | }; |
74 | |
/* RAS bookkeeping for one MCA client: two pointers, both to objects defined elsewhere. */
struct amdgpu_mca_ras {
	struct ras_common_if		*ras_if;	/* common RAS interface descriptor */
	struct amdgpu_mca_ras_block	*ras;		/* MCA RAS block for this client */
};
79 | |
80 | struct amdgpu_mca { |
81 | struct amdgpu_mca_ras mp0; |
82 | struct amdgpu_mca_ras mp1; |
83 | struct amdgpu_mca_ras mpio; |
84 | const struct amdgpu_mca_smu_funcs *mca_funcs; |
85 | }; |
86 | |
/*
 * Named slots in a bank's register array.  Slots 0 and 4 have no name here,
 * and MCA_REG_IDX_COUNT is the total number of slots (16).
 */
enum mca_reg_idx {
	MCA_REG_IDX_STATUS	= 1,
	MCA_REG_IDX_ADDR	= 2,
	MCA_REG_IDX_MISC0	= 3,
	MCA_REG_IDX_IPID	= 5,
	MCA_REG_IDX_SYND	= 6,
	MCA_REG_IDX_COUNT	= 16,	/* keep last */
};
95 | |
/*
 * Identification of the source of an MCA bank.
 * NOTE(review): field meanings are inferred from their names only; how they
 * are decoded is not visible in this header.
 */
struct mca_bank_info {
	int	socket_id;
	int	aid;
	int	hwid;
	int	mcatype;
};
102 | |
103 | struct mca_bank_entry { |
104 | int idx; |
105 | enum amdgpu_mca_error_type type; |
106 | enum amdgpu_mca_ip ip; |
107 | struct mca_bank_info info; |
108 | uint64_t regs[MCA_MAX_REGS_COUNT]; |
109 | }; |
110 | |
111 | struct mca_bank_node { |
112 | struct mca_bank_entry entry; |
113 | struct list_head node; |
114 | }; |
115 | |
116 | struct mca_bank_set { |
117 | int nr_entries; |
118 | struct list_head list; |
119 | }; |
120 | |
/*
 * Table of MCA limits and callbacks.  struct amdgpu_mca holds a const pointer
 * to one of these in its mca_funcs member.
 * NOTE(review): every callback returns int; the return convention (presumably
 * 0 or a negative errno) is not visible in this header.
 */
struct amdgpu_mca_smu_funcs {
	int max_ue_count;
	int max_ce_count;

	int (*mca_set_debug_mode)(struct amdgpu_device *adev,
				  bool enable);

	int (*mca_get_ras_mca_set)(struct amdgpu_device *adev,
				   enum amdgpu_ras_block blk,
				   enum amdgpu_mca_error_type type,
				   struct mca_bank_set *mca_set);

	int (*mca_parse_mca_error_count)(struct amdgpu_device *adev,
					 enum amdgpu_ras_block blk,
					 enum amdgpu_mca_error_type type,
					 struct mca_bank_entry *entry,
					 uint32_t *count);

	int (*mca_get_valid_mca_count)(struct amdgpu_device *adev,
				       enum amdgpu_mca_error_type type,
				       uint32_t *count);

	int (*mca_get_mca_entry)(struct amdgpu_device *adev,
				 enum amdgpu_mca_error_type type,
				 int idx,
				 struct mca_bank_entry *entry);
};
134 | |
/*
 * Helpers that take the address of an MC status register (@mc_status_addr).
 * NOTE(review): only the prototypes are visible here; judging by the names,
 * the query helpers report counts through their last argument and the reset
 * helper clears the counter - confirm in the implementation.
 */
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
					      uint64_t mc_status_addr,
					      unsigned long *error_count);

void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
						uint64_t mc_status_addr,
						unsigned long *error_count);

void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
				  uint64_t mc_status_addr);

/* @ras_error_status is an untyped pointer; its concrete type is set by the implementation. */
void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
				      uint64_t mc_status_addr,
				      void *ras_error_status);
/*
 * Software-init entry points, one each for mp0, mp1 and mpio (the same three
 * names as the amdgpu_mca_ras members of struct amdgpu_mca).
 * NOTE(review): return convention presumably 0 / negative errno - confirm.
 */
int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
152 | |
/*
 * amdgpu_mca_smu_*() interface.  Several entry points mirror a callback of
 * struct amdgpu_mca_smu_funcs with the same parameter list.
 * NOTE(review): whether each wrapper simply forwards to the matching callback,
 * and what the int return values mean, must be confirmed in the .c file.
 */
void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev,
			       const struct amdgpu_mca_smu_funcs *mca_funcs);

int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev,
				  bool enable);

int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev,
				       enum amdgpu_mca_error_type type,
				       uint32_t *count);

int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev,
					   enum amdgpu_ras_block blk,
					   enum amdgpu_mca_error_type type,
					   uint32_t *total);

int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev,
				   enum amdgpu_ras_block blk,
				   enum amdgpu_mca_error_type type,
				   uint32_t *count);

int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev,
					 enum amdgpu_ras_block blk,
					 enum amdgpu_mca_error_type type,
					 struct mca_bank_entry *entry,
					 uint32_t *count);

int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev,
			       enum amdgpu_ras_block blk,
			       enum amdgpu_mca_error_type type,
			       struct mca_bank_set *mca_set);

int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev,
				 enum amdgpu_mca_error_type type,
				 int idx,
				 struct mca_bank_entry *entry);
166 | |
/* debugfs setup hook; @root is the dentry handed in by the caller. */
void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev,
				 struct dentry *root);
168 | |
/*
 * struct mca_bank_set helpers: init, add an entry, release.
 * NOTE(review): ownership rules (whether add_entry copies @entry, what release
 * frees) are not visible in this header - confirm in the implementation.
 */
void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set);
int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set,
				  struct mca_bank_entry *entry);
void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set);

/* Takes a RAS block, an error type and a ras_err_data object to work on. */
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev,
				 enum amdgpu_ras_block blk,
				 enum amdgpu_mca_error_type type,
				 struct ras_err_data *err_data);
173 | |
174 | #endif |
175 | |