1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * This file implements the error recovery as a core part of PCIe error |
4 | * reporting. When a PCIe error is delivered, an error message will be |
5 | * collected and printed to console, then, an error recovery procedure |
6 | * will be executed by following the PCI error recovery rules. |
7 | * |
8 | * Copyright (C) 2006 Intel Corp. |
9 | * Tom Long Nguyen (tom.l.nguyen@intel.com) |
10 | * Zhang Yanmin (yanmin.zhang@intel.com) |
11 | */ |
12 | |
13 | #define dev_fmt(fmt) "AER: " fmt |
14 | |
15 | #include <linux/pci.h> |
16 | #include <linux/pm_runtime.h> |
17 | #include <linux/module.h> |
18 | #include <linux/kernel.h> |
19 | #include <linux/errno.h> |
20 | #include <linux/aer.h> |
21 | #include "portdrv.h" |
22 | #include "../pci.h" |
23 | |
24 | static pci_ers_result_t merge_result(enum pci_ers_result orig, |
25 | enum pci_ers_result new) |
26 | { |
27 | if (new == PCI_ERS_RESULT_NO_AER_DRIVER) |
28 | return PCI_ERS_RESULT_NO_AER_DRIVER; |
29 | |
30 | if (new == PCI_ERS_RESULT_NONE) |
31 | return orig; |
32 | |
33 | switch (orig) { |
34 | case PCI_ERS_RESULT_CAN_RECOVER: |
35 | case PCI_ERS_RESULT_RECOVERED: |
36 | orig = new; |
37 | break; |
38 | case PCI_ERS_RESULT_DISCONNECT: |
39 | if (new == PCI_ERS_RESULT_NEED_RESET) |
40 | orig = PCI_ERS_RESULT_NEED_RESET; |
41 | break; |
42 | default: |
43 | break; |
44 | } |
45 | |
46 | return orig; |
47 | } |
48 | |
49 | static int report_error_detected(struct pci_dev *dev, |
50 | pci_channel_state_t state, |
51 | enum pci_ers_result *result) |
52 | { |
53 | struct pci_driver *pdrv; |
54 | pci_ers_result_t vote; |
55 | const struct pci_error_handlers *err_handler; |
56 | |
57 | device_lock(dev: &dev->dev); |
58 | pdrv = dev->driver; |
59 | if (pci_dev_is_disconnected(dev)) { |
60 | vote = PCI_ERS_RESULT_DISCONNECT; |
61 | } else if (!pci_dev_set_io_state(dev, new: state)) { |
62 | pci_info(dev, "can't recover (state transition %u -> %u invalid)\n" , |
63 | dev->error_state, state); |
64 | vote = PCI_ERS_RESULT_NONE; |
65 | } else if (!pdrv || !pdrv->err_handler || |
66 | !pdrv->err_handler->error_detected) { |
67 | /* |
68 | * If any device in the subtree does not have an error_detected |
69 | * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent |
70 | * error callbacks of "any" device in the subtree, and will |
71 | * exit in the disconnected error state. |
72 | */ |
73 | if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { |
74 | vote = PCI_ERS_RESULT_NO_AER_DRIVER; |
75 | pci_info(dev, "can't recover (no error_detected callback)\n" ); |
76 | } else { |
77 | vote = PCI_ERS_RESULT_NONE; |
78 | } |
79 | } else { |
80 | err_handler = pdrv->err_handler; |
81 | vote = err_handler->error_detected(dev, state); |
82 | } |
83 | pci_uevent_ers(pdev: dev, err_type: vote); |
84 | *result = merge_result(orig: *result, new: vote); |
85 | device_unlock(dev: &dev->dev); |
86 | return 0; |
87 | } |
88 | |
89 | static int pci_pm_runtime_get_sync(struct pci_dev *pdev, void *data) |
90 | { |
91 | pm_runtime_get_sync(dev: &pdev->dev); |
92 | return 0; |
93 | } |
94 | |
95 | static int pci_pm_runtime_put(struct pci_dev *pdev, void *data) |
96 | { |
97 | pm_runtime_put(dev: &pdev->dev); |
98 | return 0; |
99 | } |
100 | |
101 | static int report_frozen_detected(struct pci_dev *dev, void *data) |
102 | { |
103 | return report_error_detected(dev, state: pci_channel_io_frozen, result: data); |
104 | } |
105 | |
106 | static int report_normal_detected(struct pci_dev *dev, void *data) |
107 | { |
108 | return report_error_detected(dev, state: pci_channel_io_normal, result: data); |
109 | } |
110 | |
111 | static int report_mmio_enabled(struct pci_dev *dev, void *data) |
112 | { |
113 | struct pci_driver *pdrv; |
114 | pci_ers_result_t vote, *result = data; |
115 | const struct pci_error_handlers *err_handler; |
116 | |
117 | device_lock(dev: &dev->dev); |
118 | pdrv = dev->driver; |
119 | if (!pdrv || |
120 | !pdrv->err_handler || |
121 | !pdrv->err_handler->mmio_enabled) |
122 | goto out; |
123 | |
124 | err_handler = pdrv->err_handler; |
125 | vote = err_handler->mmio_enabled(dev); |
126 | *result = merge_result(orig: *result, new: vote); |
127 | out: |
128 | device_unlock(dev: &dev->dev); |
129 | return 0; |
130 | } |
131 | |
132 | static int report_slot_reset(struct pci_dev *dev, void *data) |
133 | { |
134 | struct pci_driver *pdrv; |
135 | pci_ers_result_t vote, *result = data; |
136 | const struct pci_error_handlers *err_handler; |
137 | |
138 | device_lock(dev: &dev->dev); |
139 | pdrv = dev->driver; |
140 | if (!pdrv || |
141 | !pdrv->err_handler || |
142 | !pdrv->err_handler->slot_reset) |
143 | goto out; |
144 | |
145 | err_handler = pdrv->err_handler; |
146 | vote = err_handler->slot_reset(dev); |
147 | *result = merge_result(orig: *result, new: vote); |
148 | out: |
149 | device_unlock(dev: &dev->dev); |
150 | return 0; |
151 | } |
152 | |
153 | static int report_resume(struct pci_dev *dev, void *data) |
154 | { |
155 | struct pci_driver *pdrv; |
156 | const struct pci_error_handlers *err_handler; |
157 | |
158 | device_lock(dev: &dev->dev); |
159 | pdrv = dev->driver; |
160 | if (!pci_dev_set_io_state(dev, new: pci_channel_io_normal) || |
161 | !pdrv || |
162 | !pdrv->err_handler || |
163 | !pdrv->err_handler->resume) |
164 | goto out; |
165 | |
166 | err_handler = pdrv->err_handler; |
167 | err_handler->resume(dev); |
168 | out: |
169 | pci_uevent_ers(pdev: dev, err_type: PCI_ERS_RESULT_RECOVERED); |
170 | device_unlock(dev: &dev->dev); |
171 | return 0; |
172 | } |
173 | |
174 | /** |
175 | * pci_walk_bridge - walk bridges potentially AER affected |
176 | * @bridge: bridge which may be a Port, an RCEC, or an RCiEP |
177 | * @cb: callback to be called for each device found |
178 | * @userdata: arbitrary pointer to be passed to callback |
179 | * |
180 | * If the device provided is a bridge, walk the subordinate bus, including |
181 | * any bridged devices on buses under this bus. Call the provided callback |
182 | * on each device found. |
183 | * |
184 | * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP, |
185 | * call the callback on the device itself. |
186 | */ |
187 | static void pci_walk_bridge(struct pci_dev *bridge, |
188 | int (*cb)(struct pci_dev *, void *), |
189 | void *userdata) |
190 | { |
191 | if (bridge->subordinate) |
192 | pci_walk_bus(top: bridge->subordinate, cb, userdata); |
193 | else |
194 | cb(bridge, userdata); |
195 | } |
196 | |
197 | pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, |
198 | pci_channel_state_t state, |
199 | pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)) |
200 | { |
201 | int type = pci_pcie_type(dev); |
202 | struct pci_dev *bridge; |
203 | pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; |
204 | struct pci_host_bridge *host = pci_find_host_bridge(bus: dev->bus); |
205 | |
206 | /* |
207 | * If the error was detected by a Root Port, Downstream Port, RCEC, |
208 | * or RCiEP, recovery runs on the device itself. For Ports, that |
209 | * also includes any subordinate devices. |
210 | * |
211 | * If it was detected by another device (Endpoint, etc), recovery |
212 | * runs on the device and anything else under the same Port, i.e., |
213 | * everything under "bridge". |
214 | */ |
215 | if (type == PCI_EXP_TYPE_ROOT_PORT || |
216 | type == PCI_EXP_TYPE_DOWNSTREAM || |
217 | type == PCI_EXP_TYPE_RC_EC || |
218 | type == PCI_EXP_TYPE_RC_END) |
219 | bridge = dev; |
220 | else |
221 | bridge = pci_upstream_bridge(dev); |
222 | |
223 | pci_walk_bridge(bridge, cb: pci_pm_runtime_get_sync, NULL); |
224 | |
225 | pci_dbg(bridge, "broadcast error_detected message\n" ); |
226 | if (state == pci_channel_io_frozen) { |
227 | pci_walk_bridge(bridge, cb: report_frozen_detected, userdata: &status); |
228 | if (reset_subordinates(bridge) != PCI_ERS_RESULT_RECOVERED) { |
229 | pci_warn(bridge, "subordinate device reset failed\n" ); |
230 | goto failed; |
231 | } |
232 | } else { |
233 | pci_walk_bridge(bridge, cb: report_normal_detected, userdata: &status); |
234 | } |
235 | |
236 | if (status == PCI_ERS_RESULT_CAN_RECOVER) { |
237 | status = PCI_ERS_RESULT_RECOVERED; |
238 | pci_dbg(bridge, "broadcast mmio_enabled message\n" ); |
239 | pci_walk_bridge(bridge, cb: report_mmio_enabled, userdata: &status); |
240 | } |
241 | |
242 | if (status == PCI_ERS_RESULT_NEED_RESET) { |
243 | /* |
244 | * TODO: Should call platform-specific |
245 | * functions to reset slot before calling |
246 | * drivers' slot_reset callbacks? |
247 | */ |
248 | status = PCI_ERS_RESULT_RECOVERED; |
249 | pci_dbg(bridge, "broadcast slot_reset message\n" ); |
250 | pci_walk_bridge(bridge, cb: report_slot_reset, userdata: &status); |
251 | } |
252 | |
253 | if (status != PCI_ERS_RESULT_RECOVERED) |
254 | goto failed; |
255 | |
256 | pci_dbg(bridge, "broadcast resume message\n" ); |
257 | pci_walk_bridge(bridge, cb: report_resume, userdata: &status); |
258 | |
259 | /* |
260 | * If we have native control of AER, clear error status in the device |
261 | * that detected the error. If the platform retained control of AER, |
262 | * it is responsible for clearing this status. In that case, the |
263 | * signaling device may not even be visible to the OS. |
264 | */ |
265 | if (host->native_aer || pcie_ports_native) { |
266 | pcie_clear_device_status(dev); |
267 | pci_aer_clear_nonfatal_status(dev); |
268 | } |
269 | |
270 | pci_walk_bridge(bridge, cb: pci_pm_runtime_put, NULL); |
271 | |
272 | pci_info(bridge, "device recovery successful\n" ); |
273 | return status; |
274 | |
275 | failed: |
276 | pci_walk_bridge(bridge, cb: pci_pm_runtime_put, NULL); |
277 | |
278 | pci_uevent_ers(pdev: bridge, err_type: PCI_ERS_RESULT_DISCONNECT); |
279 | |
280 | /* TODO: Should kernel panic here? */ |
281 | pci_info(bridge, "device recovery failed\n" ); |
282 | |
283 | return status; |
284 | } |
285 | |