1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * PCI Error Recovery Driver for RPA-compliant PPC64 platform. |
4 | * Copyright IBM Corp. 2004 2005 |
5 | * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 |
6 | * |
7 | * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> |
8 | */ |
9 | #include <linux/delay.h> |
10 | #include <linux/interrupt.h> |
11 | #include <linux/irq.h> |
12 | #include <linux/module.h> |
13 | #include <linux/pci.h> |
14 | #include <linux/pci_hotplug.h> |
15 | #include <asm/eeh.h> |
16 | #include <asm/eeh_event.h> |
17 | #include <asm/ppc-pci.h> |
18 | #include <asm/pci-bridge.h> |
19 | #include <asm/rtas.h> |
20 | |
21 | struct eeh_rmv_data { |
22 | struct list_head removed_vf_list; |
23 | int removed_dev_count; |
24 | }; |
25 | |
26 | static int eeh_result_priority(enum pci_ers_result result) |
27 | { |
28 | switch (result) { |
29 | case PCI_ERS_RESULT_NONE: |
30 | return 1; |
31 | case PCI_ERS_RESULT_NO_AER_DRIVER: |
32 | return 2; |
33 | case PCI_ERS_RESULT_RECOVERED: |
34 | return 3; |
35 | case PCI_ERS_RESULT_CAN_RECOVER: |
36 | return 4; |
37 | case PCI_ERS_RESULT_DISCONNECT: |
38 | return 5; |
39 | case PCI_ERS_RESULT_NEED_RESET: |
40 | return 6; |
41 | default: |
42 | WARN_ONCE(1, "Unknown pci_ers_result value: %d\n" , result); |
43 | return 0; |
44 | } |
45 | }; |
46 | |
47 | static const char *pci_ers_result_name(enum pci_ers_result result) |
48 | { |
49 | switch (result) { |
50 | case PCI_ERS_RESULT_NONE: |
51 | return "none" ; |
52 | case PCI_ERS_RESULT_CAN_RECOVER: |
53 | return "can recover" ; |
54 | case PCI_ERS_RESULT_NEED_RESET: |
55 | return "need reset" ; |
56 | case PCI_ERS_RESULT_DISCONNECT: |
57 | return "disconnect" ; |
58 | case PCI_ERS_RESULT_RECOVERED: |
59 | return "recovered" ; |
60 | case PCI_ERS_RESULT_NO_AER_DRIVER: |
61 | return "no AER driver" ; |
62 | default: |
63 | WARN_ONCE(1, "Unknown result type: %d\n" , result); |
64 | return "unknown" ; |
65 | } |
66 | }; |
67 | |
68 | static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, |
69 | enum pci_ers_result new) |
70 | { |
71 | if (eeh_result_priority(result: new) > eeh_result_priority(result: old)) |
72 | return new; |
73 | return old; |
74 | } |
75 | |
76 | static bool eeh_dev_removed(struct eeh_dev *edev) |
77 | { |
78 | return !edev || (edev->mode & EEH_DEV_REMOVED); |
79 | } |
80 | |
81 | static bool eeh_edev_actionable(struct eeh_dev *edev) |
82 | { |
83 | if (!edev->pdev) |
84 | return false; |
85 | if (edev->pdev->error_state == pci_channel_io_perm_failure) |
86 | return false; |
87 | if (eeh_dev_removed(edev)) |
88 | return false; |
89 | if (eeh_pe_passed(edev->pe)) |
90 | return false; |
91 | |
92 | return true; |
93 | } |
94 | |
95 | /** |
96 | * eeh_pcid_get - Get the PCI device driver |
97 | * @pdev: PCI device |
98 | * |
99 | * The function is used to retrieve the PCI device driver for |
100 | * the indicated PCI device. Besides, we will increase the reference |
101 | * of the PCI device driver to prevent that being unloaded on |
102 | * the fly. Otherwise, kernel crash would be seen. |
103 | */ |
104 | static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) |
105 | { |
106 | if (!pdev || !pdev->dev.driver) |
107 | return NULL; |
108 | |
109 | if (!try_module_get(module: pdev->dev.driver->owner)) |
110 | return NULL; |
111 | |
112 | return to_pci_driver(drv: pdev->dev.driver); |
113 | } |
114 | |
115 | /** |
116 | * eeh_pcid_put - Dereference on the PCI device driver |
117 | * @pdev: PCI device |
118 | * |
119 | * The function is called to do dereference on the PCI device |
120 | * driver of the indicated PCI device. |
121 | */ |
122 | static inline void eeh_pcid_put(struct pci_dev *pdev) |
123 | { |
124 | if (!pdev || !pdev->dev.driver) |
125 | return; |
126 | |
127 | module_put(module: pdev->dev.driver->owner); |
128 | } |
129 | |
130 | /** |
131 | * eeh_disable_irq - Disable interrupt for the recovering device |
132 | * @dev: PCI device |
133 | * |
134 | * This routine must be called when reporting temporary or permanent |
135 | * error to the particular PCI device to disable interrupt of that |
136 | * device. If the device has enabled MSI or MSI-X interrupt, we needn't |
137 | * do real work because EEH should freeze DMA transfers for those PCI |
138 | * devices encountering EEH errors, which includes MSI or MSI-X. |
139 | */ |
140 | static void eeh_disable_irq(struct eeh_dev *edev) |
141 | { |
142 | /* Don't disable MSI and MSI-X interrupts. They are |
143 | * effectively disabled by the DMA Stopped state |
144 | * when an EEH error occurs. |
145 | */ |
146 | if (edev->pdev->msi_enabled || edev->pdev->msix_enabled) |
147 | return; |
148 | |
149 | if (!irq_has_action(irq: edev->pdev->irq)) |
150 | return; |
151 | |
152 | edev->mode |= EEH_DEV_IRQ_DISABLED; |
153 | disable_irq_nosync(irq: edev->pdev->irq); |
154 | } |
155 | |
156 | /** |
157 | * eeh_enable_irq - Enable interrupt for the recovering device |
158 | * @dev: PCI device |
159 | * |
160 | * This routine must be called to enable interrupt while failed |
161 | * device could be resumed. |
162 | */ |
163 | static void eeh_enable_irq(struct eeh_dev *edev) |
164 | { |
165 | if ((edev->mode) & EEH_DEV_IRQ_DISABLED) { |
166 | edev->mode &= ~EEH_DEV_IRQ_DISABLED; |
167 | /* |
168 | * FIXME !!!!! |
169 | * |
170 | * This is just ass backwards. This maze has |
171 | * unbalanced irq_enable/disable calls. So instead of |
172 | * finding the root cause it works around the warning |
173 | * in the irq_enable code by conditionally calling |
174 | * into it. |
175 | * |
176 | * That's just wrong.The warning in the core code is |
177 | * there to tell people to fix their asymmetries in |
178 | * their own code, not by abusing the core information |
179 | * to avoid it. |
180 | * |
181 | * I so wish that the assymetry would be the other way |
182 | * round and a few more irq_disable calls render that |
183 | * shit unusable forever. |
184 | * |
185 | * tglx |
186 | */ |
187 | if (irqd_irq_disabled(d: irq_get_irq_data(irq: edev->pdev->irq))) |
188 | enable_irq(irq: edev->pdev->irq); |
189 | } |
190 | } |
191 | |
192 | static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata) |
193 | { |
194 | struct pci_dev *pdev; |
195 | |
196 | if (!edev) |
197 | return; |
198 | |
199 | /* |
200 | * We cannot access the config space on some adapters. |
201 | * Otherwise, it will cause fenced PHB. We don't save |
202 | * the content in their config space and will restore |
203 | * from the initial config space saved when the EEH |
204 | * device is created. |
205 | */ |
206 | if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) |
207 | return; |
208 | |
209 | pdev = eeh_dev_to_pci_dev(edev); |
210 | if (!pdev) |
211 | return; |
212 | |
213 | pci_save_state(dev: pdev); |
214 | } |
215 | |
216 | static void eeh_set_channel_state(struct eeh_pe *root, pci_channel_state_t s) |
217 | { |
218 | struct eeh_pe *pe; |
219 | struct eeh_dev *edev, *tmp; |
220 | |
221 | eeh_for_each_pe(root, pe) |
222 | eeh_pe_for_each_dev(pe, edev, tmp) |
223 | if (eeh_edev_actionable(edev)) |
224 | edev->pdev->error_state = s; |
225 | } |
226 | |
227 | static void eeh_set_irq_state(struct eeh_pe *root, bool enable) |
228 | { |
229 | struct eeh_pe *pe; |
230 | struct eeh_dev *edev, *tmp; |
231 | |
232 | eeh_for_each_pe(root, pe) { |
233 | eeh_pe_for_each_dev(pe, edev, tmp) { |
234 | if (!eeh_edev_actionable(edev)) |
235 | continue; |
236 | |
237 | if (!eeh_pcid_get(edev->pdev)) |
238 | continue; |
239 | |
240 | if (enable) |
241 | eeh_enable_irq(edev); |
242 | else |
243 | eeh_disable_irq(edev); |
244 | |
245 | eeh_pcid_put(pdev: edev->pdev); |
246 | } |
247 | } |
248 | } |
249 | |
/*
 * Signature of the per-device report callbacks run by
 * eeh_pe_report_edev(): given the EEH device, its pci_dev and the bound
 * driver, notify the driver and hand back its verdict.
 */
typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *edev,
					     struct pci_dev *pdev,
					     struct pci_driver *driver);
253 | static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, |
254 | enum pci_ers_result *result) |
255 | { |
256 | struct pci_dev *pdev; |
257 | struct pci_driver *driver; |
258 | enum pci_ers_result new_result; |
259 | |
260 | pci_lock_rescan_remove(); |
261 | pdev = edev->pdev; |
262 | if (pdev) |
263 | get_device(dev: &pdev->dev); |
264 | pci_unlock_rescan_remove(); |
265 | if (!pdev) { |
266 | eeh_edev_info(edev, "no device" ); |
267 | return; |
268 | } |
269 | device_lock(dev: &pdev->dev); |
270 | if (eeh_edev_actionable(edev)) { |
271 | driver = eeh_pcid_get(pdev); |
272 | |
273 | if (!driver) |
274 | eeh_edev_info(edev, "no driver" ); |
275 | else if (!driver->err_handler) |
276 | eeh_edev_info(edev, "driver not EEH aware" ); |
277 | else if (edev->mode & EEH_DEV_NO_HANDLER) |
278 | eeh_edev_info(edev, "driver bound too late" ); |
279 | else { |
280 | new_result = fn(edev, pdev, driver); |
281 | eeh_edev_info(edev, "%s driver reports: '%s'" , |
282 | driver->name, |
283 | pci_ers_result_name(result: new_result)); |
284 | if (result) |
285 | *result = pci_ers_merge_result(old: *result, |
286 | new: new_result); |
287 | } |
288 | if (driver) |
289 | eeh_pcid_put(pdev); |
290 | } else { |
291 | eeh_edev_info(edev, "not actionable (%d,%d,%d)" , !!pdev, |
292 | !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); |
293 | } |
294 | device_unlock(dev: &pdev->dev); |
295 | if (edev->pdev != pdev) |
296 | eeh_edev_warn(edev, "Device changed during processing!\n" ); |
297 | put_device(dev: &pdev->dev); |
298 | } |
299 | |
300 | static void eeh_pe_report(const char *name, struct eeh_pe *root, |
301 | eeh_report_fn fn, enum pci_ers_result *result) |
302 | { |
303 | struct eeh_pe *pe; |
304 | struct eeh_dev *edev, *tmp; |
305 | |
306 | pr_info("EEH: Beginning: '%s'\n" , name); |
307 | eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp) |
308 | eeh_pe_report_edev(edev, fn, result); |
309 | if (result) |
310 | pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n" , |
311 | name, pci_ers_result_name(*result)); |
312 | else |
313 | pr_info("EEH: Finished:'%s'" , name); |
314 | } |
315 | |
316 | /** |
317 | * eeh_report_error - Report pci error to each device driver |
318 | * @edev: eeh device |
319 | * @driver: device's PCI driver |
320 | * |
321 | * Report an EEH error to each device driver. |
322 | */ |
323 | static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, |
324 | struct pci_dev *pdev, |
325 | struct pci_driver *driver) |
326 | { |
327 | enum pci_ers_result rc; |
328 | |
329 | if (!driver->err_handler->error_detected) |
330 | return PCI_ERS_RESULT_NONE; |
331 | |
332 | eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)" , |
333 | driver->name); |
334 | rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen); |
335 | |
336 | edev->in_error = true; |
337 | pci_uevent_ers(pdev, err_type: PCI_ERS_RESULT_NONE); |
338 | return rc; |
339 | } |
340 | |
341 | /** |
342 | * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled |
343 | * @edev: eeh device |
344 | * @driver: device's PCI driver |
345 | * |
346 | * Tells each device driver that IO ports, MMIO and config space I/O |
347 | * are now enabled. |
348 | */ |
349 | static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, |
350 | struct pci_dev *pdev, |
351 | struct pci_driver *driver) |
352 | { |
353 | if (!driver->err_handler->mmio_enabled) |
354 | return PCI_ERS_RESULT_NONE; |
355 | eeh_edev_info(edev, "Invoking %s->mmio_enabled()" , driver->name); |
356 | return driver->err_handler->mmio_enabled(pdev); |
357 | } |
358 | |
359 | /** |
360 | * eeh_report_reset - Tell device that slot has been reset |
361 | * @edev: eeh device |
362 | * @driver: device's PCI driver |
363 | * |
364 | * This routine must be called while EEH tries to reset particular |
365 | * PCI device so that the associated PCI device driver could take |
366 | * some actions, usually to save data the driver needs so that the |
367 | * driver can work again while the device is recovered. |
368 | */ |
369 | static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, |
370 | struct pci_dev *pdev, |
371 | struct pci_driver *driver) |
372 | { |
373 | if (!driver->err_handler->slot_reset || !edev->in_error) |
374 | return PCI_ERS_RESULT_NONE; |
375 | eeh_edev_info(edev, "Invoking %s->slot_reset()" , driver->name); |
376 | return driver->err_handler->slot_reset(pdev); |
377 | } |
378 | |
379 | static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) |
380 | { |
381 | struct pci_dev *pdev; |
382 | |
383 | if (!edev) |
384 | return; |
385 | |
386 | /* |
387 | * The content in the config space isn't saved because |
388 | * the blocked config space on some adapters. We have |
389 | * to restore the initial saved config space when the |
390 | * EEH device is created. |
391 | */ |
392 | if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { |
393 | if (list_is_last(list: &edev->entry, head: &edev->pe->edevs)) |
394 | eeh_pe_restore_bars(edev->pe); |
395 | |
396 | return; |
397 | } |
398 | |
399 | pdev = eeh_dev_to_pci_dev(edev); |
400 | if (!pdev) |
401 | return; |
402 | |
403 | pci_restore_state(dev: pdev); |
404 | } |
405 | |
406 | /** |
407 | * eeh_report_resume - Tell device to resume normal operations |
408 | * @edev: eeh device |
409 | * @driver: device's PCI driver |
410 | * |
411 | * This routine must be called to notify the device driver that it |
412 | * could resume so that the device driver can do some initialization |
413 | * to make the recovered device work again. |
414 | */ |
415 | static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, |
416 | struct pci_dev *pdev, |
417 | struct pci_driver *driver) |
418 | { |
419 | if (!driver->err_handler->resume || !edev->in_error) |
420 | return PCI_ERS_RESULT_NONE; |
421 | |
422 | eeh_edev_info(edev, "Invoking %s->resume()" , driver->name); |
423 | driver->err_handler->resume(pdev); |
424 | |
425 | pci_uevent_ers(pdev: edev->pdev, err_type: PCI_ERS_RESULT_RECOVERED); |
426 | #ifdef CONFIG_PCI_IOV |
427 | if (eeh_ops->notify_resume) |
428 | eeh_ops->notify_resume(edev); |
429 | #endif |
430 | return PCI_ERS_RESULT_NONE; |
431 | } |
432 | |
433 | /** |
434 | * eeh_report_failure - Tell device driver that device is dead. |
435 | * @edev: eeh device |
436 | * @driver: device's PCI driver |
437 | * |
438 | * This informs the device driver that the device is permanently |
439 | * dead, and that no further recovery attempts will be made on it. |
440 | */ |
441 | static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, |
442 | struct pci_dev *pdev, |
443 | struct pci_driver *driver) |
444 | { |
445 | enum pci_ers_result rc; |
446 | |
447 | if (!driver->err_handler->error_detected) |
448 | return PCI_ERS_RESULT_NONE; |
449 | |
450 | eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)" , |
451 | driver->name); |
452 | rc = driver->err_handler->error_detected(pdev, |
453 | pci_channel_io_perm_failure); |
454 | |
455 | pci_uevent_ers(pdev, err_type: PCI_ERS_RESULT_DISCONNECT); |
456 | return rc; |
457 | } |
458 | |
459 | static void *eeh_add_virt_device(struct eeh_dev *edev) |
460 | { |
461 | struct pci_driver *driver; |
462 | struct pci_dev *dev = eeh_dev_to_pci_dev(edev); |
463 | |
464 | if (!(edev->physfn)) { |
465 | eeh_edev_warn(edev, "Not for VF\n" ); |
466 | return NULL; |
467 | } |
468 | |
469 | driver = eeh_pcid_get(pdev: dev); |
470 | if (driver) { |
471 | if (driver->err_handler) { |
472 | eeh_pcid_put(pdev: dev); |
473 | return NULL; |
474 | } |
475 | eeh_pcid_put(pdev: dev); |
476 | } |
477 | |
478 | #ifdef CONFIG_PCI_IOV |
479 | pci_iov_add_virtfn(dev: edev->physfn, id: edev->vf_index); |
480 | #endif |
481 | return NULL; |
482 | } |
483 | |
484 | static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) |
485 | { |
486 | struct pci_driver *driver; |
487 | struct pci_dev *dev = eeh_dev_to_pci_dev(edev); |
488 | struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; |
489 | |
490 | /* |
491 | * Actually, we should remove the PCI bridges as well. |
492 | * However, that's lots of complexity to do that, |
493 | * particularly some of devices under the bridge might |
494 | * support EEH. So we just care about PCI devices for |
495 | * simplicity here. |
496 | */ |
497 | if (!eeh_edev_actionable(edev) || |
498 | (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) |
499 | return; |
500 | |
501 | if (rmv_data) { |
502 | driver = eeh_pcid_get(pdev: dev); |
503 | if (driver) { |
504 | if (driver->err_handler && |
505 | driver->err_handler->error_detected && |
506 | driver->err_handler->slot_reset) { |
507 | eeh_pcid_put(pdev: dev); |
508 | return; |
509 | } |
510 | eeh_pcid_put(pdev: dev); |
511 | } |
512 | } |
513 | |
514 | /* Remove it from PCI subsystem */ |
515 | pr_info("EEH: Removing %s without EEH sensitive driver\n" , |
516 | pci_name(dev)); |
517 | edev->mode |= EEH_DEV_DISCONNECTED; |
518 | if (rmv_data) |
519 | rmv_data->removed_dev_count++; |
520 | |
521 | if (edev->physfn) { |
522 | #ifdef CONFIG_PCI_IOV |
523 | pci_iov_remove_virtfn(dev: edev->physfn, id: edev->vf_index); |
524 | edev->pdev = NULL; |
525 | #endif |
526 | if (rmv_data) |
527 | list_add(new: &edev->rmv_entry, head: &rmv_data->removed_vf_list); |
528 | } else { |
529 | pci_lock_rescan_remove(); |
530 | pci_stop_and_remove_bus_device(dev); |
531 | pci_unlock_rescan_remove(); |
532 | } |
533 | } |
534 | |
535 | static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) |
536 | { |
537 | struct eeh_dev *edev, *tmp; |
538 | |
539 | eeh_pe_for_each_dev(pe, edev, tmp) { |
540 | if (!(edev->mode & EEH_DEV_DISCONNECTED)) |
541 | continue; |
542 | |
543 | edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); |
544 | eeh_pe_tree_remove(edev); |
545 | } |
546 | |
547 | return NULL; |
548 | } |
549 | |
550 | /* |
551 | * Explicitly clear PE's frozen state for PowerNV where |
552 | * we have frozen PE until BAR restore is completed. It's |
553 | * harmless to clear it for pSeries. To be consistent with |
554 | * PE reset (for 3 times), we try to clear the frozen state |
555 | * for 3 times as well. |
556 | */ |
557 | static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed) |
558 | { |
559 | struct eeh_pe *pe; |
560 | int i; |
561 | |
562 | eeh_for_each_pe(root, pe) { |
563 | if (include_passed || !eeh_pe_passed(pe)) { |
564 | for (i = 0; i < 3; i++) |
565 | if (!eeh_unfreeze_pe(pe)) |
566 | break; |
567 | if (i >= 3) |
568 | return -EIO; |
569 | } |
570 | } |
571 | eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed); |
572 | return 0; |
573 | } |
574 | |
575 | int eeh_pe_reset_and_recover(struct eeh_pe *pe) |
576 | { |
577 | int ret; |
578 | |
579 | /* Bail if the PE is being recovered */ |
580 | if (pe->state & EEH_PE_RECOVERING) |
581 | return 0; |
582 | |
583 | /* Put the PE into recovery mode */ |
584 | eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
585 | |
586 | /* Save states */ |
587 | eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); |
588 | |
589 | /* Issue reset */ |
590 | ret = eeh_pe_reset_full(pe, true); |
591 | if (ret) { |
592 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
593 | return ret; |
594 | } |
595 | |
596 | /* Unfreeze the PE */ |
597 | ret = eeh_clear_pe_frozen_state(root: pe, include_passed: true); |
598 | if (ret) { |
599 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
600 | return ret; |
601 | } |
602 | |
603 | /* Restore device state */ |
604 | eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); |
605 | |
606 | /* Clear recovery mode */ |
607 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
608 | |
609 | return 0; |
610 | } |
611 | |
612 | /** |
613 | * eeh_reset_device - Perform actual reset of a pci slot |
614 | * @driver_eeh_aware: Does the device's driver provide EEH support? |
615 | * @pe: EEH PE |
616 | * @bus: PCI bus corresponding to the isolcated slot |
617 | * @rmv_data: Optional, list to record removed devices |
618 | * |
619 | * This routine must be called to do reset on the indicated PE. |
620 | * During the reset, udev might be invoked because those affected |
621 | * PCI devices will be removed and then added. |
622 | */ |
623 | static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, |
624 | struct eeh_rmv_data *rmv_data, |
625 | bool driver_eeh_aware) |
626 | { |
627 | time64_t tstamp; |
628 | int cnt, rc; |
629 | struct eeh_dev *edev; |
630 | struct eeh_pe *tmp_pe; |
631 | bool any_passed = false; |
632 | |
633 | eeh_for_each_pe(pe, tmp_pe) |
634 | any_passed |= eeh_pe_passed(tmp_pe); |
635 | |
636 | /* pcibios will clear the counter; save the value */ |
637 | cnt = pe->freeze_count; |
638 | tstamp = pe->tstamp; |
639 | |
640 | /* |
641 | * We don't remove the corresponding PE instances because |
642 | * we need the information afterwords. The attached EEH |
643 | * devices are expected to be attached soon when calling |
644 | * into pci_hp_add_devices(). |
645 | */ |
646 | eeh_pe_state_mark(pe, EEH_PE_KEEP); |
647 | if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) { |
648 | eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); |
649 | } else { |
650 | pci_lock_rescan_remove(); |
651 | pci_hp_remove_devices(bus); |
652 | pci_unlock_rescan_remove(); |
653 | } |
654 | |
655 | /* |
656 | * Reset the pci controller. (Asserts RST#; resets config space). |
657 | * Reconfigure bridges and devices. Don't try to bring the system |
658 | * up if the reset failed for some reason. |
659 | * |
660 | * During the reset, it's very dangerous to have uncontrolled PCI |
661 | * config accesses. So we prefer to block them. However, controlled |
662 | * PCI config accesses initiated from EEH itself are allowed. |
663 | */ |
664 | rc = eeh_pe_reset_full(pe, false); |
665 | if (rc) |
666 | return rc; |
667 | |
668 | pci_lock_rescan_remove(); |
669 | |
670 | /* Restore PE */ |
671 | eeh_ops->configure_bridge(pe); |
672 | eeh_pe_restore_bars(pe); |
673 | |
674 | /* Clear frozen state */ |
675 | rc = eeh_clear_pe_frozen_state(root: pe, include_passed: false); |
676 | if (rc) { |
677 | pci_unlock_rescan_remove(); |
678 | return rc; |
679 | } |
680 | |
681 | /* Give the system 5 seconds to finish running the user-space |
682 | * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, |
683 | * this is a hack, but if we don't do this, and try to bring |
684 | * the device up before the scripts have taken it down, |
685 | * potentially weird things happen. |
686 | */ |
687 | if (!driver_eeh_aware || rmv_data->removed_dev_count) { |
688 | pr_info("EEH: Sleep 5s ahead of %s hotplug\n" , |
689 | (driver_eeh_aware ? "partial" : "complete" )); |
690 | ssleep(seconds: 5); |
691 | |
692 | /* |
693 | * The EEH device is still connected with its parent |
694 | * PE. We should disconnect it so the binding can be |
695 | * rebuilt when adding PCI devices. |
696 | */ |
697 | edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); |
698 | eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); |
699 | if (pe->type & EEH_PE_VF) { |
700 | eeh_add_virt_device(edev); |
701 | } else { |
702 | if (!driver_eeh_aware) |
703 | eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); |
704 | pci_hp_add_devices(bus); |
705 | } |
706 | } |
707 | eeh_pe_state_clear(pe, EEH_PE_KEEP, true); |
708 | |
709 | pe->tstamp = tstamp; |
710 | pe->freeze_count = cnt; |
711 | |
712 | pci_unlock_rescan_remove(); |
713 | return 0; |
714 | } |
715 | |
/*
 * Upper bound, in seconds, on how long to wait for a PCI device to come
 * back online.
 */
#define MAX_WAIT_FOR_RECOVERY 300
720 | |
721 | |
722 | /* Walks the PE tree after processing an event to remove any stale PEs. |
723 | * |
724 | * NB: This needs to be recursive to ensure the leaf PEs get removed |
725 | * before their parents do. Although this is possible to do recursively |
726 | * we don't since this is easier to read and we need to garantee |
727 | * the leaf nodes will be handled first. |
728 | */ |
729 | static void eeh_pe_cleanup(struct eeh_pe *pe) |
730 | { |
731 | struct eeh_pe *child_pe, *tmp; |
732 | |
733 | list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child) |
734 | eeh_pe_cleanup(pe: child_pe); |
735 | |
736 | if (pe->state & EEH_PE_KEEP) |
737 | return; |
738 | |
739 | if (!(pe->state & EEH_PE_INVALID)) |
740 | return; |
741 | |
742 | if (list_empty(head: &pe->edevs) && list_empty(head: &pe->child_list)) { |
743 | list_del(entry: &pe->child); |
744 | kfree(objp: pe); |
745 | } |
746 | } |
747 | |
748 | /** |
749 | * eeh_check_slot_presence - Check if a device is still present in a slot |
750 | * @pdev: pci_dev to check |
751 | * |
752 | * This function may return a false positive if we can't determine the slot's |
753 | * presence state. This might happen for PCIe slots if the PE containing |
754 | * the upstream bridge is also frozen, or the bridge is part of the same PE |
755 | * as the device. |
756 | * |
757 | * This shouldn't happen often, but you might see it if you hotplug a PCIe |
758 | * switch. |
759 | */ |
760 | static bool eeh_slot_presence_check(struct pci_dev *pdev) |
761 | { |
762 | const struct hotplug_slot_ops *ops; |
763 | struct pci_slot *slot; |
764 | u8 state; |
765 | int rc; |
766 | |
767 | if (!pdev) |
768 | return false; |
769 | |
770 | if (pdev->error_state == pci_channel_io_perm_failure) |
771 | return false; |
772 | |
773 | slot = pdev->slot; |
774 | if (!slot || !slot->hotplug) |
775 | return true; |
776 | |
777 | ops = slot->hotplug->ops; |
778 | if (!ops || !ops->get_adapter_status) |
779 | return true; |
780 | |
781 | /* set the attention indicator while we've got the slot ops */ |
782 | if (ops->set_attention_status) |
783 | ops->set_attention_status(slot->hotplug, 1); |
784 | |
785 | rc = ops->get_adapter_status(slot->hotplug, &state); |
786 | if (rc) |
787 | return true; |
788 | |
789 | return !!state; |
790 | } |
791 | |
792 | static void eeh_clear_slot_attention(struct pci_dev *pdev) |
793 | { |
794 | const struct hotplug_slot_ops *ops; |
795 | struct pci_slot *slot; |
796 | |
797 | if (!pdev) |
798 | return; |
799 | |
800 | if (pdev->error_state == pci_channel_io_perm_failure) |
801 | return; |
802 | |
803 | slot = pdev->slot; |
804 | if (!slot || !slot->hotplug) |
805 | return; |
806 | |
807 | ops = slot->hotplug->ops; |
808 | if (!ops || !ops->set_attention_status) |
809 | return; |
810 | |
811 | ops->set_attention_status(slot->hotplug, 0); |
812 | } |
813 | |
814 | /** |
815 | * eeh_handle_normal_event - Handle EEH events on a specific PE |
816 | * @pe: EEH PE - which should not be used after we return, as it may |
817 | * have been invalidated. |
818 | * |
819 | * Attempts to recover the given PE. If recovery fails or the PE has failed |
820 | * too many times, remove the PE. |
821 | * |
822 | * While PHB detects address or data parity errors on particular PCI |
823 | * slot, the associated PE will be frozen. Besides, DMA's occurring |
824 | * to wild addresses (which usually happen due to bugs in device |
825 | * drivers or in PCI adapter firmware) can cause EEH error. #SERR, |
826 | * #PERR or other misc PCI-related errors also can trigger EEH errors. |
827 | * |
828 | * Recovery process consists of unplugging the device driver (which |
829 | * generated hotplug events to userspace), then issuing a PCI #RST to |
830 | * the device, then reconfiguring the PCI config space for all bridges |
831 | * & devices under this slot, and then finally restarting the device |
832 | * drivers (which cause a second set of hotplug events to go out to |
833 | * userspace). |
834 | */ |
835 | void eeh_handle_normal_event(struct eeh_pe *pe) |
836 | { |
837 | struct pci_bus *bus; |
838 | struct eeh_dev *edev, *tmp; |
839 | struct eeh_pe *tmp_pe; |
840 | int rc = 0; |
841 | enum pci_ers_result result = PCI_ERS_RESULT_NONE; |
842 | struct eeh_rmv_data rmv_data = |
843 | {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; |
844 | int devices = 0; |
845 | |
846 | bus = eeh_pe_bus_get(pe); |
847 | if (!bus) { |
848 | pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n" , |
849 | __func__, pe->phb->global_number, pe->addr); |
850 | return; |
851 | } |
852 | |
853 | /* |
854 | * When devices are hot-removed we might get an EEH due to |
855 | * a driver attempting to touch the MMIO space of a removed |
856 | * device. In this case we don't have a device to recover |
857 | * so suppress the event if we can't find any present devices. |
858 | * |
859 | * The hotplug driver should take care of tearing down the |
860 | * device itself. |
861 | */ |
862 | eeh_for_each_pe(pe, tmp_pe) |
863 | eeh_pe_for_each_dev(tmp_pe, edev, tmp) |
864 | if (eeh_slot_presence_check(pdev: edev->pdev)) |
865 | devices++; |
866 | |
867 | if (!devices) { |
868 | pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n" , |
869 | pe->phb->global_number, pe->addr); |
870 | goto out; /* nothing to recover */ |
871 | } |
872 | |
873 | /* Log the event */ |
874 | if (pe->type & EEH_PE_PHB) { |
875 | pr_err("EEH: Recovering PHB#%x, location: %s\n" , |
876 | pe->phb->global_number, eeh_pe_loc_get(pe)); |
877 | } else { |
878 | struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb); |
879 | |
880 | pr_err("EEH: Recovering PHB#%x-PE#%x\n" , |
881 | pe->phb->global_number, pe->addr); |
882 | pr_err("EEH: PE location: %s, PHB location: %s\n" , |
883 | eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); |
884 | } |
885 | |
886 | #ifdef CONFIG_STACKTRACE |
887 | /* |
888 | * Print the saved stack trace now that we've verified there's |
889 | * something to recover. |
890 | */ |
891 | if (pe->trace_entries) { |
892 | void **ptrs = (void **) pe->stack_trace; |
893 | int i; |
894 | |
895 | pr_err("EEH: Frozen PHB#%x-PE#%x detected\n" , |
896 | pe->phb->global_number, pe->addr); |
897 | |
898 | /* FIXME: Use the same format as dump_stack() */ |
899 | pr_err("EEH: Call Trace:\n" ); |
900 | for (i = 0; i < pe->trace_entries; i++) |
901 | pr_err("EEH: [%pK] %pS\n" , ptrs[i], ptrs[i]); |
902 | |
903 | pe->trace_entries = 0; |
904 | } |
905 | #endif /* CONFIG_STACKTRACE */ |
906 | |
907 | eeh_for_each_pe(pe, tmp_pe) |
908 | eeh_pe_for_each_dev(tmp_pe, edev, tmp) |
909 | edev->mode &= ~EEH_DEV_NO_HANDLER; |
910 | |
911 | eeh_pe_update_time_stamp(pe); |
912 | pe->freeze_count++; |
913 | if (pe->freeze_count > eeh_max_freezes) { |
914 | pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n" , |
915 | pe->phb->global_number, pe->addr, |
916 | pe->freeze_count); |
917 | |
918 | goto recover_failed; |
919 | } |
920 | |
921 | /* Walk the various device drivers attached to this slot through |
922 | * a reset sequence, giving each an opportunity to do what it needs |
923 | * to accomplish the reset. Each child gets a report of the |
924 | * status ... if any child can't handle the reset, then the entire |
925 | * slot is dlpar removed and added. |
926 | * |
927 | * When the PHB is fenced, we have to issue a reset to recover from |
928 | * the error. Override the result if necessary to have partially |
929 | * hotplug for this case. |
930 | */ |
931 | pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n" , |
932 | pe->freeze_count, eeh_max_freezes); |
933 | pr_info("EEH: Notify device drivers to shutdown\n" ); |
934 | eeh_set_channel_state(root: pe, s: pci_channel_io_frozen); |
935 | eeh_set_irq_state(root: pe, enable: false); |
936 | eeh_pe_report(name: "error_detected(IO frozen)" , root: pe, |
937 | fn: eeh_report_error, result: &result); |
938 | if (result == PCI_ERS_RESULT_DISCONNECT) |
939 | goto recover_failed; |
940 | |
941 | /* |
942 | * Error logged on a PHB are always fences which need a full |
943 | * PHB reset to clear so force that to happen. |
944 | */ |
945 | if ((pe->type & EEH_PE_PHB) && result != PCI_ERS_RESULT_NONE) |
946 | result = PCI_ERS_RESULT_NEED_RESET; |
947 | |
948 | /* Get the current PCI slot state. This can take a long time, |
949 | * sometimes over 300 seconds for certain systems. |
950 | */ |
951 | rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY * 1000); |
952 | if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { |
953 | pr_warn("EEH: Permanent failure\n" ); |
954 | goto recover_failed; |
955 | } |
956 | |
957 | /* Since rtas may enable MMIO when posting the error log, |
958 | * don't post the error log until after all dev drivers |
959 | * have been informed. |
960 | */ |
961 | pr_info("EEH: Collect temporary log\n" ); |
962 | eeh_slot_error_detail(pe, EEH_LOG_TEMP); |
963 | |
964 | /* If all device drivers were EEH-unaware, then shut |
965 | * down all of the device drivers, and hope they |
966 | * go down willingly, without panicking the system. |
967 | */ |
968 | if (result == PCI_ERS_RESULT_NONE) { |
969 | pr_info("EEH: Reset with hotplug activity\n" ); |
970 | rc = eeh_reset_device(pe, bus, NULL, driver_eeh_aware: false); |
971 | if (rc) { |
972 | pr_warn("%s: Unable to reset, err=%d\n" , __func__, rc); |
973 | goto recover_failed; |
974 | } |
975 | } |
976 | |
977 | /* If all devices reported they can proceed, then re-enable MMIO */ |
978 | if (result == PCI_ERS_RESULT_CAN_RECOVER) { |
979 | pr_info("EEH: Enable I/O for affected devices\n" ); |
980 | rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); |
981 | if (rc < 0) |
982 | goto recover_failed; |
983 | |
984 | if (rc) { |
985 | result = PCI_ERS_RESULT_NEED_RESET; |
986 | } else { |
987 | pr_info("EEH: Notify device drivers to resume I/O\n" ); |
988 | eeh_pe_report(name: "mmio_enabled" , root: pe, |
989 | fn: eeh_report_mmio_enabled, result: &result); |
990 | } |
991 | } |
992 | if (result == PCI_ERS_RESULT_CAN_RECOVER) { |
993 | pr_info("EEH: Enabled DMA for affected devices\n" ); |
994 | rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); |
995 | if (rc < 0) |
996 | goto recover_failed; |
997 | |
998 | if (rc) { |
999 | result = PCI_ERS_RESULT_NEED_RESET; |
1000 | } else { |
1001 | /* |
1002 | * We didn't do PE reset for the case. The PE |
1003 | * is still in frozen state. Clear it before |
1004 | * resuming the PE. |
1005 | */ |
1006 | eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); |
1007 | result = PCI_ERS_RESULT_RECOVERED; |
1008 | } |
1009 | } |
1010 | |
1011 | /* If any device called out for a reset, then reset the slot */ |
1012 | if (result == PCI_ERS_RESULT_NEED_RESET) { |
1013 | pr_info("EEH: Reset without hotplug activity\n" ); |
1014 | rc = eeh_reset_device(pe, bus, rmv_data: &rmv_data, driver_eeh_aware: true); |
1015 | if (rc) { |
1016 | pr_warn("%s: Cannot reset, err=%d\n" , __func__, rc); |
1017 | goto recover_failed; |
1018 | } |
1019 | |
1020 | result = PCI_ERS_RESULT_NONE; |
1021 | eeh_set_channel_state(root: pe, s: pci_channel_io_normal); |
1022 | eeh_set_irq_state(root: pe, enable: true); |
1023 | eeh_pe_report(name: "slot_reset" , root: pe, fn: eeh_report_reset, |
1024 | result: &result); |
1025 | } |
1026 | |
1027 | if ((result == PCI_ERS_RESULT_RECOVERED) || |
1028 | (result == PCI_ERS_RESULT_NONE)) { |
1029 | /* |
1030 | * For those hot removed VFs, we should add back them after PF |
1031 | * get recovered properly. |
1032 | */ |
1033 | list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, |
1034 | rmv_entry) { |
1035 | eeh_add_virt_device(edev); |
1036 | list_del(entry: &edev->rmv_entry); |
1037 | } |
1038 | |
1039 | /* Tell all device drivers that they can resume operations */ |
1040 | pr_info("EEH: Notify device driver to resume\n" ); |
1041 | eeh_set_channel_state(root: pe, s: pci_channel_io_normal); |
1042 | eeh_set_irq_state(root: pe, enable: true); |
1043 | eeh_pe_report(name: "resume" , root: pe, fn: eeh_report_resume, NULL); |
1044 | eeh_for_each_pe(pe, tmp_pe) { |
1045 | eeh_pe_for_each_dev(tmp_pe, edev, tmp) { |
1046 | edev->mode &= ~EEH_DEV_NO_HANDLER; |
1047 | edev->in_error = false; |
1048 | } |
1049 | } |
1050 | |
1051 | pr_info("EEH: Recovery successful.\n" ); |
1052 | goto out; |
1053 | } |
1054 | |
1055 | recover_failed: |
1056 | /* |
1057 | * About 90% of all real-life EEH failures in the field |
1058 | * are due to poorly seated PCI cards. Only 10% or so are |
1059 | * due to actual, failed cards. |
1060 | */ |
1061 | pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" |
1062 | "Please try reseating or replacing it\n" , |
1063 | pe->phb->global_number, pe->addr); |
1064 | |
1065 | eeh_slot_error_detail(pe, EEH_LOG_PERM); |
1066 | |
1067 | /* Notify all devices that they're about to go down. */ |
1068 | eeh_set_irq_state(root: pe, enable: false); |
1069 | eeh_pe_report(name: "error_detected(permanent failure)" , root: pe, |
1070 | fn: eeh_report_failure, NULL); |
1071 | eeh_set_channel_state(root: pe, s: pci_channel_io_perm_failure); |
1072 | |
1073 | /* Mark the PE to be removed permanently */ |
1074 | eeh_pe_state_mark(pe, EEH_PE_REMOVED); |
1075 | |
1076 | /* |
1077 | * Shut down the device drivers for good. We mark |
1078 | * all removed devices correctly to avoid accessing |
1079 | * their PCI config space any more. |
1080 | */ |
1081 | if (pe->type & EEH_PE_VF) { |
1082 | eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); |
1083 | eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); |
1084 | } else { |
1085 | eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); |
1086 | eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); |
1087 | |
1088 | pci_lock_rescan_remove(); |
1089 | pci_hp_remove_devices(bus); |
1090 | pci_unlock_rescan_remove(); |
1091 | /* The passed PE should no longer be used */ |
1092 | return; |
1093 | } |
1094 | |
1095 | out: |
1096 | /* |
1097 | * Clean up any PEs without devices. While marked as EEH_PE_RECOVERING |
1098 | * we don't want to modify the PE tree structure so we do it here. |
1099 | */ |
1100 | eeh_pe_cleanup(pe); |
1101 | |
1102 | /* clear the slot attention LED for all recovered devices */ |
1103 | eeh_for_each_pe(pe, tmp_pe) |
1104 | eeh_pe_for_each_dev(tmp_pe, edev, tmp) |
1105 | eeh_clear_slot_attention(pdev: edev->pdev); |
1106 | |
1107 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
1108 | } |
1109 | |
1110 | /** |
1111 | * eeh_handle_special_event - Handle EEH events without a specific failing PE |
1112 | * |
1113 | * Called when an EEH event is detected but can't be narrowed down to a |
1114 | * specific PE. Iterates through possible failures and handles them as |
1115 | * necessary. |
1116 | */ |
1117 | void eeh_handle_special_event(void) |
1118 | { |
1119 | struct eeh_pe *pe, *phb_pe, *tmp_pe; |
1120 | struct eeh_dev *edev, *tmp_edev; |
1121 | struct pci_bus *bus; |
1122 | struct pci_controller *hose; |
1123 | unsigned long flags; |
1124 | int rc; |
1125 | |
1126 | |
1127 | do { |
1128 | rc = eeh_ops->next_error(&pe); |
1129 | |
1130 | switch (rc) { |
1131 | case EEH_NEXT_ERR_DEAD_IOC: |
1132 | /* Mark all PHBs in dead state */ |
1133 | eeh_serialize_lock(&flags); |
1134 | |
1135 | /* Purge all events */ |
1136 | eeh_remove_event(NULL, true); |
1137 | |
1138 | list_for_each_entry(hose, &hose_list, list_node) { |
1139 | phb_pe = eeh_phb_pe_get(hose); |
1140 | if (!phb_pe) continue; |
1141 | |
1142 | eeh_pe_mark_isolated(phb_pe); |
1143 | } |
1144 | |
1145 | eeh_serialize_unlock(flags); |
1146 | |
1147 | break; |
1148 | case EEH_NEXT_ERR_FROZEN_PE: |
1149 | case EEH_NEXT_ERR_FENCED_PHB: |
1150 | case EEH_NEXT_ERR_DEAD_PHB: |
1151 | /* Mark the PE in fenced state */ |
1152 | eeh_serialize_lock(&flags); |
1153 | |
1154 | /* Purge all events of the PHB */ |
1155 | eeh_remove_event(pe, true); |
1156 | |
1157 | if (rc != EEH_NEXT_ERR_DEAD_PHB) |
1158 | eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
1159 | eeh_pe_mark_isolated(pe); |
1160 | |
1161 | eeh_serialize_unlock(flags); |
1162 | |
1163 | break; |
1164 | case EEH_NEXT_ERR_NONE: |
1165 | return; |
1166 | default: |
1167 | pr_warn("%s: Invalid value %d from next_error()\n" , |
1168 | __func__, rc); |
1169 | return; |
1170 | } |
1171 | |
1172 | /* |
1173 | * For fenced PHB and frozen PE, it's handled as normal |
1174 | * event. We have to remove the affected PHBs for dead |
1175 | * PHB and IOC |
1176 | */ |
1177 | if (rc == EEH_NEXT_ERR_FROZEN_PE || |
1178 | rc == EEH_NEXT_ERR_FENCED_PHB) { |
1179 | eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
1180 | eeh_handle_normal_event(pe); |
1181 | } else { |
1182 | eeh_for_each_pe(pe, tmp_pe) |
1183 | eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) |
1184 | edev->mode &= ~EEH_DEV_NO_HANDLER; |
1185 | |
1186 | /* Notify all devices to be down */ |
1187 | eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); |
1188 | eeh_pe_report( |
1189 | name: "error_detected(permanent failure)" , root: pe, |
1190 | fn: eeh_report_failure, NULL); |
1191 | eeh_set_channel_state(root: pe, s: pci_channel_io_perm_failure); |
1192 | |
1193 | pci_lock_rescan_remove(); |
1194 | list_for_each_entry(hose, &hose_list, list_node) { |
1195 | phb_pe = eeh_phb_pe_get(hose); |
1196 | if (!phb_pe || |
1197 | !(phb_pe->state & EEH_PE_ISOLATED) || |
1198 | (phb_pe->state & EEH_PE_RECOVERING)) |
1199 | continue; |
1200 | |
1201 | bus = eeh_pe_bus_get(phb_pe); |
1202 | if (!bus) { |
1203 | pr_err("%s: Cannot find PCI bus for " |
1204 | "PHB#%x-PE#%x\n" , |
1205 | __func__, |
1206 | pe->phb->global_number, |
1207 | pe->addr); |
1208 | break; |
1209 | } |
1210 | pci_hp_remove_devices(bus); |
1211 | } |
1212 | pci_unlock_rescan_remove(); |
1213 | } |
1214 | |
1215 | /* |
1216 | * If we have detected dead IOC, we needn't proceed |
1217 | * any more since all PHBs would have been removed |
1218 | */ |
1219 | if (rc == EEH_NEXT_ERR_DEAD_IOC) |
1220 | break; |
1221 | } while (rc != EEH_NEXT_ERR_NONE); |
1222 | } |
1223 | |