1 | /* |
2 | * Copyright 2019 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | * |
22 | */ |
23 | |
24 | #ifndef _AMDGPU_RAS_EEPROM_H |
25 | #define _AMDGPU_RAS_EEPROM_H |
26 | |
27 | #include <linux/i2c.h> |
28 | |
/*
 * RAS EEPROM table format version codes.
 * NOTE(review): the bit layout of these version words is not visible in
 * this header -- confirm against the code that reads/writes the table.
 */
#define RAS_TABLE_VER_V1	0x00010000
#define RAS_TABLE_VER_V2_1	0x00021000

/*
 * Only a pointer to the device is used in this header, so a forward
 * declaration is sufficient.
 */
struct amdgpu_device;
33 | |
/*
 * GPU health states.
 * Note that the value 1 is not assigned by this enum.
 */
enum amdgpu_ras_gpu_health_status {
	/* The GPU is usable. */
	GPU_HEALTH_USABLE			= 0,
	/* The GPU is retired; per the name, an ECC threshold was reached. */
	GPU_RETIRED__ECC_REACH_THRESHOLD	= 2,
};
38 | |
/*
 * Error classification stored in the err_type member of
 * struct eeprom_table_record.
 */
enum amdgpu_ras_eeprom_err_type {
	AMDGPU_RAS_EEPROM_ERR_NA = 0,
	AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,
	AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE,
	/* Number of error types listed above; must stay last. */
	AMDGPU_RAS_EEPROM_ERR_COUNT,
};
45 | |
/*
 * Header of the RAS EEPROM table.
 *
 * Five 32-bit words, packed so the in-memory layout carries no padding.
 * Embedded by value as tbl_hdr in struct amdgpu_ras_eeprom_control, so
 * this must be a complete, named type.
 */
struct amdgpu_ras_eeprom_table_header {
	/* Table signature. NOTE(review): confirm the expected value in the .c file. */
	uint32_t header;
	/* Table format version; presumably one of RAS_TABLE_VER_* -- confirm. */
	uint32_t version;
	/* Presumably the byte offset of the first record -- confirm. */
	uint32_t first_rec_offset;
	/* Table size. NOTE(review): units/extent are not visible here -- confirm. */
	uint32_t tbl_size;
	/* Table checksum. NOTE(review): algorithm is not visible here -- confirm. */
	uint32_t checksum;
} __packed;
53 | |
54 | struct amdgpu_ras_eeprom_table_ras_info { |
55 | u8 rma_status; |
56 | u8 health_percent; |
57 | u16 ecc_page_threshold; |
58 | u32 padding[64 - 1]; |
59 | } __packed; |
60 | |
61 | struct amdgpu_ras_eeprom_control { |
62 | struct amdgpu_ras_eeprom_table_header tbl_hdr; |
63 | |
64 | struct amdgpu_ras_eeprom_table_ras_info tbl_rai; |
65 | |
66 | /* Base I2C EEPPROM 19-bit memory address, |
67 | * where the table is located. For more information, |
68 | * see top of amdgpu_eeprom.c. |
69 | */ |
70 | u32 i2c_address; |
71 | |
72 | /* The byte offset off of @i2c_address |
73 | * where the table header is found, |
74 | * and where the records start--always |
75 | * right after the header. |
76 | */ |
77 | u32 ; |
78 | u32 ras_info_offset; |
79 | u32 ras_record_offset; |
80 | |
81 | /* Number of records in the table. |
82 | */ |
83 | u32 ras_num_recs; |
84 | |
85 | /* First record index to read, 0-based. |
86 | * Range is [0, num_recs-1]. This is |
87 | * an absolute index, starting right after |
88 | * the table header. |
89 | */ |
90 | u32 ras_fri; |
91 | |
92 | /* Maximum possible number of records |
93 | * we could store, i.e. the maximum capacity |
94 | * of the table. |
95 | */ |
96 | u32 ras_max_record_count; |
97 | |
98 | /* Protect table access via this mutex. |
99 | */ |
100 | struct mutex ras_tbl_mutex; |
101 | |
102 | /* Record channel info which occurred bad pages |
103 | */ |
104 | u32 bad_channel_bitmap; |
105 | }; |
106 | |
107 | /* |
108 | * Represents single table record. Packed to be easily serialized into byte |
109 | * stream. |
110 | */ |
111 | struct eeprom_table_record { |
112 | |
113 | union { |
114 | uint64_t address; |
115 | uint64_t offset; |
116 | }; |
117 | |
118 | uint64_t retired_page; |
119 | uint64_t ts; |
120 | |
121 | enum amdgpu_ras_eeprom_err_type err_type; |
122 | |
123 | union { |
124 | unsigned char bank; |
125 | unsigned char cu; |
126 | }; |
127 | |
128 | unsigned char mem_channel; |
129 | unsigned char mcumc_id; |
130 | } __packed; |
131 | |
132 | int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, |
133 | bool *exceed_err_limit); |
134 | |
135 | int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control); |
136 | |
137 | bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev); |
138 | |
139 | int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, |
140 | struct eeprom_table_record *records, const u32 num); |
141 | |
142 | int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, |
143 | struct eeprom_table_record *records, const u32 num); |
144 | |
145 | uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control); |
146 | |
147 | void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control); |
148 | |
/*
 * File operation tables for the RAS EEPROM debugfs entries ("size" and
 * "table", per their names). Declared here only; defined elsewhere.
 */
extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
151 | |
152 | #endif // _AMDGPU_RAS_EEPROM_H |
153 | |