1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * This file implements the error recovery as a core part of PCIe error |
4 | * reporting. When a PCIe error is delivered, an error message will be |
5 | * collected and printed to console, then, an error recovery procedure |
6 | * will be executed by following the PCI error recovery rules. |
7 | * |
8 | * Copyright (C) 2006 Intel Corp. |
9 | * Tom Long Nguyen (tom.l.nguyen@intel.com) |
10 | * Zhang Yanmin (yanmin.zhang@intel.com) |
11 | */ |
12 | |
13 | #define dev_fmt(fmt) "AER: " fmt |
14 | |
15 | #include <linux/pci.h> |
16 | #include <linux/pm_runtime.h> |
17 | #include <linux/module.h> |
18 | #include <linux/kernel.h> |
19 | #include <linux/errno.h> |
20 | #include <linux/aer.h> |
21 | #include "portdrv.h" |
22 | #include "../pci.h" |
23 | |
24 | static pci_ers_result_t merge_result(enum pci_ers_result orig, |
25 | enum pci_ers_result new) |
26 | { |
27 | if (new == PCI_ERS_RESULT_NO_AER_DRIVER) |
28 | return PCI_ERS_RESULT_NO_AER_DRIVER; |
29 | |
30 | if (new == PCI_ERS_RESULT_NONE) |
31 | return orig; |
32 | |
33 | switch (orig) { |
34 | case PCI_ERS_RESULT_CAN_RECOVER: |
35 | case PCI_ERS_RESULT_RECOVERED: |
36 | orig = new; |
37 | break; |
38 | case PCI_ERS_RESULT_DISCONNECT: |
39 | if (new == PCI_ERS_RESULT_NEED_RESET) |
40 | orig = PCI_ERS_RESULT_NEED_RESET; |
41 | break; |
42 | default: |
43 | break; |
44 | } |
45 | |
46 | return orig; |
47 | } |
48 | |
49 | static int report_error_detected(struct pci_dev *dev, |
50 | pci_channel_state_t state, |
51 | enum pci_ers_result *result) |
52 | { |
53 | struct pci_driver *pdrv; |
54 | pci_ers_result_t vote; |
55 | const struct pci_error_handlers *err_handler; |
56 | |
57 | device_lock(dev: &dev->dev); |
58 | pdrv = dev->driver; |
59 | if (pci_dev_is_disconnected(dev)) { |
60 | vote = PCI_ERS_RESULT_DISCONNECT; |
61 | } else if (!pci_dev_set_io_state(dev, new: state)) { |
62 | pci_info(dev, "can't recover (state transition %u -> %u invalid)\n" , |
63 | dev->error_state, state); |
64 | vote = PCI_ERS_RESULT_NONE; |
65 | } else if (!pdrv || !pdrv->err_handler || |
66 | !pdrv->err_handler->error_detected) { |
67 | /* |
68 | * If any device in the subtree does not have an error_detected |
69 | * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent |
70 | * error callbacks of "any" device in the subtree, and will |
71 | * exit in the disconnected error state. |
72 | */ |
73 | if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { |
74 | vote = PCI_ERS_RESULT_NO_AER_DRIVER; |
75 | pci_info(dev, "can't recover (no error_detected callback)\n" ); |
76 | } else { |
77 | vote = PCI_ERS_RESULT_NONE; |
78 | } |
79 | } else { |
80 | err_handler = pdrv->err_handler; |
81 | vote = err_handler->error_detected(dev, state); |
82 | } |
83 | pci_uevent_ers(pdev: dev, err_type: vote); |
84 | *result = merge_result(orig: *result, new: vote); |
85 | device_unlock(dev: &dev->dev); |
86 | return 0; |
87 | } |
88 | |
89 | static int pci_pm_runtime_get_sync(struct pci_dev *pdev, void *data) |
90 | { |
91 | pm_runtime_get_sync(dev: &pdev->dev); |
92 | return 0; |
93 | } |
94 | |
95 | static int pci_pm_runtime_put(struct pci_dev *pdev, void *data) |
96 | { |
97 | pm_runtime_put(dev: &pdev->dev); |
98 | return 0; |
99 | } |
100 | |
101 | static int report_frozen_detected(struct pci_dev *dev, void *data) |
102 | { |
103 | return report_error_detected(dev, state: pci_channel_io_frozen, result: data); |
104 | } |
105 | |
106 | static int report_normal_detected(struct pci_dev *dev, void *data) |
107 | { |
108 | return report_error_detected(dev, state: pci_channel_io_normal, result: data); |
109 | } |
110 | |
111 | static int report_mmio_enabled(struct pci_dev *dev, void *data) |
112 | { |
113 | struct pci_driver *pdrv; |
114 | pci_ers_result_t vote, *result = data; |
115 | const struct pci_error_handlers *err_handler; |
116 | |
117 | device_lock(dev: &dev->dev); |
118 | pdrv = dev->driver; |
119 | if (!pdrv || !pdrv->err_handler || !pdrv->err_handler->mmio_enabled) |
120 | goto out; |
121 | |
122 | err_handler = pdrv->err_handler; |
123 | vote = err_handler->mmio_enabled(dev); |
124 | *result = merge_result(orig: *result, new: vote); |
125 | out: |
126 | device_unlock(dev: &dev->dev); |
127 | return 0; |
128 | } |
129 | |
130 | static int report_slot_reset(struct pci_dev *dev, void *data) |
131 | { |
132 | struct pci_driver *pdrv; |
133 | pci_ers_result_t vote, *result = data; |
134 | const struct pci_error_handlers *err_handler; |
135 | |
136 | device_lock(dev: &dev->dev); |
137 | pdrv = dev->driver; |
138 | if (!pdrv || !pdrv->err_handler || !pdrv->err_handler->slot_reset) |
139 | goto out; |
140 | |
141 | err_handler = pdrv->err_handler; |
142 | vote = err_handler->slot_reset(dev); |
143 | *result = merge_result(orig: *result, new: vote); |
144 | out: |
145 | device_unlock(dev: &dev->dev); |
146 | return 0; |
147 | } |
148 | |
149 | static int report_resume(struct pci_dev *dev, void *data) |
150 | { |
151 | struct pci_driver *pdrv; |
152 | const struct pci_error_handlers *err_handler; |
153 | |
154 | device_lock(dev: &dev->dev); |
155 | pdrv = dev->driver; |
156 | if (!pci_dev_set_io_state(dev, new: pci_channel_io_normal) || |
157 | !pdrv || !pdrv->err_handler || !pdrv->err_handler->resume) |
158 | goto out; |
159 | |
160 | err_handler = pdrv->err_handler; |
161 | err_handler->resume(dev); |
162 | out: |
163 | pci_uevent_ers(pdev: dev, err_type: PCI_ERS_RESULT_RECOVERED); |
164 | device_unlock(dev: &dev->dev); |
165 | return 0; |
166 | } |
167 | |
168 | /** |
169 | * pci_walk_bridge - walk bridges potentially AER affected |
170 | * @bridge: bridge which may be a Port, an RCEC, or an RCiEP |
171 | * @cb: callback to be called for each device found |
172 | * @userdata: arbitrary pointer to be passed to callback |
173 | * |
174 | * If the device provided is a bridge, walk the subordinate bus, including |
175 | * any bridged devices on buses under this bus. Call the provided callback |
176 | * on each device found. |
177 | * |
178 | * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP, |
179 | * call the callback on the device itself. |
180 | */ |
181 | static void pci_walk_bridge(struct pci_dev *bridge, |
182 | int (*cb)(struct pci_dev *, void *), |
183 | void *userdata) |
184 | { |
185 | if (bridge->subordinate) |
186 | pci_walk_bus(top: bridge->subordinate, cb, userdata); |
187 | else |
188 | cb(bridge, userdata); |
189 | } |
190 | |
191 | pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, |
192 | pci_channel_state_t state, |
193 | pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)) |
194 | { |
195 | int type = pci_pcie_type(dev); |
196 | struct pci_dev *bridge; |
197 | pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; |
198 | struct pci_host_bridge *host = pci_find_host_bridge(bus: dev->bus); |
199 | |
200 | /* |
201 | * If the error was detected by a Root Port, Downstream Port, RCEC, |
202 | * or RCiEP, recovery runs on the device itself. For Ports, that |
203 | * also includes any subordinate devices. |
204 | * |
205 | * If it was detected by another device (Endpoint, etc), recovery |
206 | * runs on the device and anything else under the same Port, i.e., |
207 | * everything under "bridge". |
208 | */ |
209 | if (type == PCI_EXP_TYPE_ROOT_PORT || |
210 | type == PCI_EXP_TYPE_DOWNSTREAM || |
211 | type == PCI_EXP_TYPE_RC_EC || |
212 | type == PCI_EXP_TYPE_RC_END) |
213 | bridge = dev; |
214 | else |
215 | bridge = pci_upstream_bridge(dev); |
216 | |
217 | pci_walk_bridge(bridge, cb: pci_pm_runtime_get_sync, NULL); |
218 | |
219 | pci_dbg(bridge, "broadcast error_detected message\n" ); |
220 | if (state == pci_channel_io_frozen) { |
221 | pci_walk_bridge(bridge, cb: report_frozen_detected, userdata: &status); |
222 | if (reset_subordinates(bridge) != PCI_ERS_RESULT_RECOVERED) { |
223 | pci_warn(bridge, "subordinate device reset failed\n" ); |
224 | goto failed; |
225 | } |
226 | } else { |
227 | pci_walk_bridge(bridge, cb: report_normal_detected, userdata: &status); |
228 | } |
229 | |
230 | if (status == PCI_ERS_RESULT_CAN_RECOVER) { |
231 | status = PCI_ERS_RESULT_RECOVERED; |
232 | pci_dbg(bridge, "broadcast mmio_enabled message\n" ); |
233 | pci_walk_bridge(bridge, cb: report_mmio_enabled, userdata: &status); |
234 | } |
235 | |
236 | if (status == PCI_ERS_RESULT_NEED_RESET) { |
237 | /* |
238 | * TODO: Should call platform-specific |
239 | * functions to reset slot before calling |
240 | * drivers' slot_reset callbacks? |
241 | */ |
242 | status = PCI_ERS_RESULT_RECOVERED; |
243 | pci_dbg(bridge, "broadcast slot_reset message\n" ); |
244 | pci_walk_bridge(bridge, cb: report_slot_reset, userdata: &status); |
245 | } |
246 | |
247 | if (status != PCI_ERS_RESULT_RECOVERED) |
248 | goto failed; |
249 | |
250 | pci_dbg(bridge, "broadcast resume message\n" ); |
251 | pci_walk_bridge(bridge, cb: report_resume, userdata: &status); |
252 | |
253 | /* |
254 | * If we have native control of AER, clear error status in the device |
255 | * that detected the error. If the platform retained control of AER, |
256 | * it is responsible for clearing this status. In that case, the |
257 | * signaling device may not even be visible to the OS. |
258 | */ |
259 | if (host->native_aer || pcie_ports_native) { |
260 | pcie_clear_device_status(dev); |
261 | pci_aer_clear_nonfatal_status(dev); |
262 | } |
263 | |
264 | pci_walk_bridge(bridge, cb: pci_pm_runtime_put, NULL); |
265 | |
266 | pci_info(bridge, "device recovery successful\n" ); |
267 | return status; |
268 | |
269 | failed: |
270 | pci_walk_bridge(bridge, cb: pci_pm_runtime_put, NULL); |
271 | |
272 | pci_uevent_ers(pdev: bridge, err_type: PCI_ERS_RESULT_DISCONNECT); |
273 | |
274 | pci_info(bridge, "device recovery failed\n" ); |
275 | |
276 | return status; |
277 | } |
278 | |