1 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB |
2 | /* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ |
3 | |
4 | #include <devlink.h> |
5 | |
6 | #include "fw_reset.h" |
7 | #include "diag/fw_tracer.h" |
8 | #include "lib/tout.h" |
9 | |
/* Bit positions used in mlx5_fw_reset::reset_flags. */
enum {
	MLX5_FW_RESET_FLAGS_RESET_REQUESTED,	/* FW requested a sync reset; poll timer armed */
	MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,	/* devlink param asked us to NACK remote resets */
	MLX5_FW_RESET_FLAGS_PENDING_COMP,	/* this host initiated the reset; waiter pends on 'done' */
	MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS,	/* teardown started; ignore new reset events */
	MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED	/* link toggle failed; full unload/load needed */
};
17 | |
/* Per-device firmware reset state, hung off dev->priv.fw_reset. */
struct mlx5_fw_reset {
	struct mlx5_core_dev *dev;
	struct mlx5_nb nb;		/* notifier block for GENERAL_EVENT EQEs */
	struct workqueue_struct *wq;	/* single-threaded wq serializing the works below */
	struct work_struct fw_live_patch_work;
	struct work_struct reset_request_work;
	struct work_struct reset_unload_work;
	struct work_struct reset_reload_work;
	struct work_struct reset_now_work;
	struct work_struct reset_abort_work;
	unsigned long reset_flags;	/* MLX5_FW_RESET_FLAGS_* bits */
	struct timer_list timer;	/* polls fatal sensors while a reset is requested */
	struct completion done;		/* signaled when the initiator-side flow finishes */
	int ret;			/* result reported to mlx5_fw_reset_wait_reset_done() */
};
33 | |
/* Values of the sync reset state field read from the init segment. */
enum {
	MLX5_FW_RST_STATE_IDLE = 0,		/* no reset action pending */
	MLX5_FW_RST_STATE_TOGGLE_REQ = 4,	/* FW asks the host to toggle the PCI link */
};
38 | |
/* Bit layout of the iseg 'initializing' dword used by the sync reset flow. */
enum {
	MLX5_RST_STATE_BIT_NUM = 12,	/* 4-bit reset state field starts here */
	MLX5_RST_ACK_BIT_NUM = 22,	/* host reset-ack bit */
};
43 | |
44 | static u8 mlx5_get_fw_rst_state(struct mlx5_core_dev *dev) |
45 | { |
46 | return (ioread32be(&dev->iseg->initializing) >> MLX5_RST_STATE_BIT_NUM) & 0xF; |
47 | } |
48 | |
49 | static void mlx5_set_fw_rst_ack(struct mlx5_core_dev *dev) |
50 | { |
51 | iowrite32be(BIT(MLX5_RST_ACK_BIT_NUM), &dev->iseg->initializing); |
52 | } |
53 | |
54 | static int mlx5_fw_reset_enable_remote_dev_reset_set(struct devlink *devlink, u32 id, |
55 | struct devlink_param_gset_ctx *ctx) |
56 | { |
57 | struct mlx5_core_dev *dev = devlink_priv(devlink); |
58 | struct mlx5_fw_reset *fw_reset; |
59 | |
60 | fw_reset = dev->priv.fw_reset; |
61 | |
62 | if (ctx->val.vbool) |
63 | clear_bit(nr: MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, addr: &fw_reset->reset_flags); |
64 | else |
65 | set_bit(nr: MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, addr: &fw_reset->reset_flags); |
66 | return 0; |
67 | } |
68 | |
69 | static int mlx5_fw_reset_enable_remote_dev_reset_get(struct devlink *devlink, u32 id, |
70 | struct devlink_param_gset_ctx *ctx) |
71 | { |
72 | struct mlx5_core_dev *dev = devlink_priv(devlink); |
73 | struct mlx5_fw_reset *fw_reset; |
74 | |
75 | fw_reset = dev->priv.fw_reset; |
76 | |
77 | ctx->val.vbool = !test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, |
78 | &fw_reset->reset_flags); |
79 | return 0; |
80 | } |
81 | |
82 | static int mlx5_reg_mfrl_set(struct mlx5_core_dev *dev, u8 reset_level, |
83 | u8 reset_type_sel, u8 sync_resp, bool sync_start) |
84 | { |
85 | u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {}; |
86 | u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {}; |
87 | |
88 | MLX5_SET(mfrl_reg, in, reset_level, reset_level); |
89 | MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel); |
90 | MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_resp, sync_resp); |
91 | MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, sync_start); |
92 | |
93 | return mlx5_core_access_reg(dev, data_in: in, size_in: sizeof(in), data_out: out, size_out: sizeof(out), reg_num: MLX5_REG_MFRL, arg: 0, write: 1); |
94 | } |
95 | |
96 | static int mlx5_reg_mfrl_query(struct mlx5_core_dev *dev, u8 *reset_level, |
97 | u8 *reset_type, u8 *reset_state) |
98 | { |
99 | u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {}; |
100 | u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {}; |
101 | int err; |
102 | |
103 | err = mlx5_core_access_reg(dev, data_in: in, size_in: sizeof(in), data_out: out, size_out: sizeof(out), reg_num: MLX5_REG_MFRL, arg: 0, write: 0); |
104 | if (err) |
105 | return err; |
106 | |
107 | if (reset_level) |
108 | *reset_level = MLX5_GET(mfrl_reg, out, reset_level); |
109 | if (reset_type) |
110 | *reset_type = MLX5_GET(mfrl_reg, out, reset_type); |
111 | if (reset_state) |
112 | *reset_state = MLX5_GET(mfrl_reg, out, reset_state); |
113 | |
114 | return 0; |
115 | } |
116 | |
117 | int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_type) |
118 | { |
119 | return mlx5_reg_mfrl_query(dev, reset_level, reset_type, NULL); |
120 | } |
121 | |
/* Translate the MFRL reset_state into an errno plus extack message.
 * Returns 0 when the state is idle, a specific error for the known failure
 * states, and -EIO ("Sync reset failed") when the query fails or the state
 * is unrecognized (switch falls through to 'out').
 */
static int mlx5_fw_reset_get_reset_state_err(struct mlx5_core_dev *dev,
					     struct netlink_ext_ack *extack)
{
	u8 reset_state;

	if (mlx5_reg_mfrl_query(dev, NULL, NULL, reset_state: &reset_state))
		goto out;

	if (!reset_state)
		return 0;

	switch (reset_state) {
	case MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION:
	case MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS:
		NL_SET_ERR_MSG_MOD(extack, "Sync reset still in progress");
		return -EBUSY;
	case MLX5_MFRL_REG_RESET_STATE_NEG_TIMEOUT:
		NL_SET_ERR_MSG_MOD(extack, "Sync reset negotiation timeout");
		return -ETIMEDOUT;
	case MLX5_MFRL_REG_RESET_STATE_NACK:
		NL_SET_ERR_MSG_MOD(extack, "One of the hosts disabled reset");
		return -EPERM;
	case MLX5_MFRL_REG_RESET_STATE_UNLOAD_TIMEOUT:
		NL_SET_ERR_MSG_MOD(extack, "Sync reset unload timeout");
		return -ETIMEDOUT;
	}

out:
	NL_SET_ERR_MSG_MOD(extack, "Sync reset failed");
	return -EIO;
}
153 | |
/* Initiate a PCI-synced, level-3 firmware reset (initiator side).
 *
 * PENDING_COMP is set up front so the reset workers treat this host as the
 * initiator (see mlx5_fw_reset_complete_reload()); it is cleared again on
 * failure.  The register access is non-verbose because -EREMOTEIO is
 * handled specially below: when the reset_state MCAM feature exists, the
 * firmware's recorded state is translated into a precise extack error.
 */
int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
				 struct netlink_ext_ack *extack)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
	u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
	u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
	int err, rst_res;

	set_bit(nr: MLX5_FW_RESET_FLAGS_PENDING_COMP, addr: &fw_reset->reset_flags);

	MLX5_SET(mfrl_reg, in, reset_level, MLX5_MFRL_REG_RESET_LEVEL3);
	MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel);
	MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, 1);
	err = mlx5_access_reg(dev, data_in: in, size_in: sizeof(in), data_out: out, size_out: sizeof(out),
			      reg_id: MLX5_REG_MFRL, arg: 0, write: 1, verbose: false);
	if (!err)
		return 0;

	clear_bit(nr: MLX5_FW_RESET_FLAGS_PENDING_COMP, addr: &fw_reset->reset_flags);
	if (err == -EREMOTEIO && MLX5_CAP_MCAM_FEATURE(dev, reset_state)) {
		rst_res = mlx5_fw_reset_get_reset_state_err(dev, extack);
		return rst_res ? rst_res : err;
	}

	NL_SET_ERR_MSG_MOD(extack, "Sync reset command failed");
	return mlx5_cmd_check(dev, err, in, out);
}
181 | |
182 | int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev, |
183 | struct netlink_ext_ack *extack) |
184 | { |
185 | u8 rst_state; |
186 | int err; |
187 | |
188 | err = mlx5_fw_reset_get_reset_state_err(dev, extack); |
189 | if (err) |
190 | return err; |
191 | |
192 | rst_state = mlx5_get_fw_rst_state(dev); |
193 | if (!rst_state) |
194 | return 0; |
195 | |
196 | mlx5_core_err(dev, "Sync reset did not complete, state=%d\n" , rst_state); |
197 | NL_SET_ERR_MSG_MOD(extack, "Sync reset did not complete successfully" ); |
198 | return rst_state; |
199 | } |
200 | |
201 | int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev) |
202 | { |
203 | return mlx5_reg_mfrl_set(dev, reset_level: MLX5_MFRL_REG_RESET_LEVEL0, reset_type_sel: 0, sync_resp: 0, sync_start: false); |
204 | } |
205 | |
/* Finish the reload half of a sync reset.
 *
 * If this host initiated the reset (PENDING_COMP), the devlink reload path
 * is waiting on 'done' and performs the reload itself.  Otherwise reload
 * locally: unload (unless the caller already did - 'unloaded'), wait for
 * PCI to answer reads, load in recovery mode, and report the remote reload
 * actions to devlink.
 */
static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unloaded)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;

	/* if this is the driver that initiated the fw reset, devlink completed the reload */
	if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
		complete(&fw_reset->done);
	} else {
		if (!unloaded)
			mlx5_unload_one(dev, suspend: false);
		if (mlx5_health_wait_pci_up(dev))
			mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
		else
			mlx5_load_one(dev, recovery: true);
		devlink_remote_reload_actions_performed(devlink: priv_to_devlink(priv: dev), limit: 0,
							BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
							BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
	}
}
225 | |
226 | static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev) |
227 | { |
228 | struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; |
229 | |
230 | del_timer_sync(timer: &fw_reset->timer); |
231 | } |
232 | |
233 | static int mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health) |
234 | { |
235 | struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; |
236 | |
237 | if (!test_and_clear_bit(nr: MLX5_FW_RESET_FLAGS_RESET_REQUESTED, addr: &fw_reset->reset_flags)) { |
238 | mlx5_core_warn(dev, "Reset request was already cleared\n" ); |
239 | return -EALREADY; |
240 | } |
241 | |
242 | mlx5_stop_sync_reset_poll(dev); |
243 | if (poll_health) |
244 | mlx5_start_health_poll(dev); |
245 | return 0; |
246 | } |
247 | |
248 | static void mlx5_sync_reset_reload_work(struct work_struct *work) |
249 | { |
250 | struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, |
251 | reset_reload_work); |
252 | struct mlx5_core_dev *dev = fw_reset->dev; |
253 | |
254 | mlx5_sync_reset_clear_reset_requested(dev, poll_health: false); |
255 | mlx5_enter_error_state(dev, force: true); |
256 | mlx5_fw_reset_complete_reload(dev, unloaded: false); |
257 | } |
258 | |
/* Poll period while a sync reset request is outstanding. */
#define MLX5_RESET_POLL_INTERVAL (HZ / 10)
/* Timer callback: while a reset is requested, watch the fatal health
 * sensors.  When one trips (the device reset underneath us), queue the
 * reload work — unless teardown already started — otherwise re-arm.
 */
static void poll_sync_reset(struct timer_list *t)
{
	struct mlx5_fw_reset *fw_reset = from_timer(fw_reset, t, timer);
	struct mlx5_core_dev *dev = fw_reset->dev;
	u32 fatal_error;

	if (!test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags))
		return;

	fatal_error = mlx5_health_check_fatal_sensors(dev);

	if (fatal_error) {
		mlx5_core_warn(dev, "Got Device Reset\n");
		if (!test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags))
			queue_work(wq: fw_reset->wq, work: &fw_reset->reset_reload_work);
		else
			mlx5_core_err(dev, "Device is being removed, Drop new reset work\n");
		return;
	}

	mod_timer(timer: &fw_reset->timer, expires: round_jiffies(j: jiffies + MLX5_RESET_POLL_INTERVAL));
}
282 | |
283 | static void mlx5_start_sync_reset_poll(struct mlx5_core_dev *dev) |
284 | { |
285 | struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; |
286 | |
287 | timer_setup(&fw_reset->timer, poll_sync_reset, 0); |
288 | fw_reset->timer.expires = round_jiffies(j: jiffies + MLX5_RESET_POLL_INTERVAL); |
289 | add_timer(timer: &fw_reset->timer); |
290 | } |
291 | |
292 | static int mlx5_fw_reset_set_reset_sync_ack(struct mlx5_core_dev *dev) |
293 | { |
294 | return mlx5_reg_mfrl_set(dev, reset_level: MLX5_MFRL_REG_RESET_LEVEL3, reset_type_sel: 0, sync_resp: 1, sync_start: false); |
295 | } |
296 | |
297 | static int mlx5_fw_reset_set_reset_sync_nack(struct mlx5_core_dev *dev) |
298 | { |
299 | return mlx5_reg_mfrl_set(dev, reset_level: MLX5_MFRL_REG_RESET_LEVEL3, reset_type_sel: 0, sync_resp: 2, sync_start: false); |
300 | } |
301 | |
302 | static int mlx5_sync_reset_set_reset_requested(struct mlx5_core_dev *dev) |
303 | { |
304 | struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; |
305 | |
306 | if (test_and_set_bit(nr: MLX5_FW_RESET_FLAGS_RESET_REQUESTED, addr: &fw_reset->reset_flags)) { |
307 | mlx5_core_warn(dev, "Reset request was already set\n" ); |
308 | return -EALREADY; |
309 | } |
310 | mlx5_stop_health_poll(dev, disable_health: true); |
311 | mlx5_start_sync_reset_poll(dev); |
312 | return 0; |
313 | } |
314 | |
315 | static void mlx5_fw_live_patch_event(struct work_struct *work) |
316 | { |
317 | struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, |
318 | fw_live_patch_work); |
319 | struct mlx5_core_dev *dev = fw_reset->dev; |
320 | |
321 | mlx5_core_info(dev, "Live patch updated firmware version: %d.%d.%d\n" , fw_rev_maj(dev), |
322 | fw_rev_min(dev), fw_rev_sub(dev)); |
323 | |
324 | if (mlx5_fw_tracer_reload(tracer: dev->tracer)) |
325 | mlx5_core_err(dev, "Failed to reload FW tracer\n" ); |
326 | } |
327 | |
#if IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)
/* Refuse fw reset when the PCIe bridge above us has hotplug interrupts
 * armed (both HPIE and DLLSCE set in Slot Control).
 * NOTE(review): presumably because the reset's link toggle would be
 * handled as a hot-unplug event — confirm against pciehp behavior.
 *
 * Returns 0 when safe, -EOPNOTSUPP when there is no bridge or hotplug
 * interrupts are enabled, or a capability-read error.
 */
static int mlx5_check_hotplug_interrupt(struct mlx5_core_dev *dev)
{
	struct pci_dev *bridge = dev->pdev->bus->self;
	u16 reg16;
	int err;

	if (!bridge)
		return -EOPNOTSUPP;

	err = pcie_capability_read_word(dev: bridge, PCI_EXP_SLTCTL, val: &reg16);
	if (err)
		return err;

	if ((reg16 & PCI_EXP_SLTCTL_HPIE) && (reg16 & PCI_EXP_SLTCTL_DLLSCE)) {
		mlx5_core_warn(dev, "FW reset is not supported as HotPlug is enabled\n");
		return -EOPNOTSUPP;
	}

	return 0;
}
#endif
350 | |
/* PCI device IDs of BlueField management interface functions; these may
 * legitimately share the bridge bus with the PF and are therefore skipped
 * by mlx5_check_dev_ids().
 */
static const struct pci_device_id mgt_ifc_device_ids[] = {
	{ PCI_VDEVICE(MELLANOX, 0xc2d2) }, /* BlueField1 MGT interface device ID */
	{ PCI_VDEVICE(MELLANOX, 0xc2d3) }, /* BlueField2 MGT interface device ID */
	{ PCI_VDEVICE(MELLANOX, 0xc2d4) }, /* BlueField3-Lx MGT interface device ID */
	{ PCI_VDEVICE(MELLANOX, 0xc2d5) }, /* BlueField3 MGT interface device ID */
	{ PCI_VDEVICE(MELLANOX, 0xc2d6) }, /* BlueField4 MGT interface device ID */
};
358 | |
359 | static bool mlx5_is_mgt_ifc_pci_device(struct mlx5_core_dev *dev, u16 dev_id) |
360 | { |
361 | int i; |
362 | |
363 | for (i = 0; i < ARRAY_SIZE(mgt_ifc_device_ids); ++i) |
364 | if (mgt_ifc_device_ids[i].device == dev_id) |
365 | return true; |
366 | |
367 | return false; |
368 | } |
369 | |
370 | static int mlx5_check_dev_ids(struct mlx5_core_dev *dev, u16 dev_id) |
371 | { |
372 | struct pci_bus *bridge_bus = dev->pdev->bus; |
373 | struct pci_dev *sdev; |
374 | u16 sdev_id; |
375 | int err; |
376 | |
377 | /* Check that all functions under the pci bridge are PFs of |
378 | * this device otherwise fail this function. |
379 | */ |
380 | list_for_each_entry(sdev, &bridge_bus->devices, bus_list) { |
381 | err = pci_read_config_word(dev: sdev, PCI_DEVICE_ID, val: &sdev_id); |
382 | if (err) |
383 | return pcibios_err_to_errno(err); |
384 | |
385 | if (sdev_id == dev_id) |
386 | continue; |
387 | |
388 | if (mlx5_is_mgt_ifc_pci_device(dev, dev_id: sdev_id)) |
389 | continue; |
390 | |
391 | mlx5_core_warn(dev, "unrecognized dev_id (0x%x)\n" , sdev_id); |
392 | return -EPERM; |
393 | } |
394 | return 0; |
395 | } |
396 | |
/* Check every precondition for servicing a reset-now request: firmware
 * fast-teardown support, no PCIe hotplug interrupts on the bridge (when
 * pciehp is built), and no foreign devices on the bridge bus.
 */
static bool mlx5_is_reset_now_capable(struct mlx5_core_dev *dev)
{
	u16 dev_id;
	int err;

	if (!MLX5_CAP_GEN(dev, fast_teardown)) {
		mlx5_core_warn(dev, "fast teardown is not supported by firmware\n");
		return false;
	}

#if IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)
	err = mlx5_check_hotplug_interrupt(dev);
	if (err)
		return false;
#endif

	err = pci_read_config_word(dev: dev->pdev, PCI_DEVICE_ID, val: &dev_id);
	if (err)
		return false;
	return (!mlx5_check_dev_ids(dev, dev_id));
}
418 | |
419 | static void mlx5_sync_reset_request_event(struct work_struct *work) |
420 | { |
421 | struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, |
422 | reset_request_work); |
423 | struct mlx5_core_dev *dev = fw_reset->dev; |
424 | int err; |
425 | |
426 | if (test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags) || |
427 | !mlx5_is_reset_now_capable(dev)) { |
428 | err = mlx5_fw_reset_set_reset_sync_nack(dev); |
429 | mlx5_core_warn(dev, "PCI Sync FW Update Reset Nack %s" , |
430 | err ? "Failed" : "Sent" ); |
431 | return; |
432 | } |
433 | if (mlx5_sync_reset_set_reset_requested(dev)) |
434 | return; |
435 | |
436 | err = mlx5_fw_reset_set_reset_sync_ack(dev); |
437 | if (err) |
438 | mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack Failed. Error code: %d\n" , err); |
439 | else |
440 | mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack. Device reset is expected.\n" ); |
441 | } |
442 | |
443 | static int mlx5_pci_link_toggle(struct mlx5_core_dev *dev) |
444 | { |
445 | struct pci_bus *bridge_bus = dev->pdev->bus; |
446 | struct pci_dev *bridge = bridge_bus->self; |
447 | unsigned long timeout; |
448 | struct pci_dev *sdev; |
449 | u16 reg16, dev_id; |
450 | int cap, err; |
451 | |
452 | err = pci_read_config_word(dev: dev->pdev, PCI_DEVICE_ID, val: &dev_id); |
453 | if (err) |
454 | return pcibios_err_to_errno(err); |
455 | err = mlx5_check_dev_ids(dev, dev_id); |
456 | if (err) |
457 | return err; |
458 | cap = pci_find_capability(dev: bridge, PCI_CAP_ID_EXP); |
459 | if (!cap) |
460 | return -EOPNOTSUPP; |
461 | |
462 | list_for_each_entry(sdev, &bridge_bus->devices, bus_list) { |
463 | pci_save_state(dev: sdev); |
464 | pci_cfg_access_lock(dev: sdev); |
465 | } |
466 | /* PCI link toggle */ |
467 | err = pcie_capability_set_word(dev: bridge, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_LD); |
468 | if (err) |
469 | return pcibios_err_to_errno(err); |
470 | msleep(msecs: 500); |
471 | err = pcie_capability_clear_word(dev: bridge, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_LD); |
472 | if (err) |
473 | return pcibios_err_to_errno(err); |
474 | |
475 | /* Check link */ |
476 | if (!bridge->link_active_reporting) { |
477 | mlx5_core_warn(dev, "No PCI link reporting capability\n" ); |
478 | msleep(msecs: 1000); |
479 | goto restore; |
480 | } |
481 | |
482 | timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, PCI_TOGGLE)); |
483 | do { |
484 | err = pci_read_config_word(dev: bridge, where: cap + PCI_EXP_LNKSTA, val: ®16); |
485 | if (err) |
486 | return pcibios_err_to_errno(err); |
487 | if (reg16 & PCI_EXP_LNKSTA_DLLLA) |
488 | break; |
489 | msleep(msecs: 20); |
490 | } while (!time_after(jiffies, timeout)); |
491 | |
492 | if (reg16 & PCI_EXP_LNKSTA_DLLLA) { |
493 | mlx5_core_info(dev, "PCI Link up\n" ); |
494 | } else { |
495 | mlx5_core_err(dev, "PCI link not ready (0x%04x) after %llu ms\n" , |
496 | reg16, mlx5_tout_ms(dev, PCI_TOGGLE)); |
497 | err = -ETIMEDOUT; |
498 | goto restore; |
499 | } |
500 | |
501 | do { |
502 | err = pci_read_config_word(dev: dev->pdev, PCI_DEVICE_ID, val: ®16); |
503 | if (err) |
504 | return pcibios_err_to_errno(err); |
505 | if (reg16 == dev_id) |
506 | break; |
507 | msleep(msecs: 20); |
508 | } while (!time_after(jiffies, timeout)); |
509 | |
510 | if (reg16 == dev_id) { |
511 | mlx5_core_info(dev, "Firmware responds to PCI config cycles again\n" ); |
512 | } else { |
513 | mlx5_core_err(dev, "Firmware is not responsive (0x%04x) after %llu ms\n" , |
514 | reg16, mlx5_tout_ms(dev, PCI_TOGGLE)); |
515 | err = -ETIMEDOUT; |
516 | } |
517 | |
518 | restore: |
519 | list_for_each_entry(sdev, &bridge_bus->devices, bus_list) { |
520 | pci_cfg_access_unlock(dev: sdev); |
521 | pci_restore_state(dev: sdev); |
522 | } |
523 | |
524 | return err; |
525 | } |
526 | |
/* Worker: firmware signaled "reset now".  Fast-teardown the HCA, toggle
 * the PCI link to apply the reset, then enter error state and run the
 * reload flow.  fw_reset->ret carries the result to a pending waiter in
 * mlx5_fw_reset_wait_reset_done().
 */
static void mlx5_sync_reset_now_event(struct work_struct *work)
{
	struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
						      reset_now_work);
	struct mlx5_core_dev *dev = fw_reset->dev;
	int err;

	if (mlx5_sync_reset_clear_reset_requested(dev, poll_health: false))
		return;

	mlx5_core_warn(dev, "Sync Reset now. Device is going to reset.\n");

	err = mlx5_cmd_fast_teardown_hca(dev);
	if (err) {
		mlx5_core_warn(dev, "Fast teardown failed, no reset done, err %d\n", err);
		goto done;
	}

	err = mlx5_pci_link_toggle(dev);
	if (err) {
		mlx5_core_warn(dev, "mlx5_pci_link_toggle failed, no reset done, err %d\n", err);
		/* Device was not reset; the waiter must do a full unload/load instead. */
		set_bit(nr: MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, addr: &fw_reset->reset_flags);
	}

	mlx5_enter_error_state(dev, force: true);
done:
	fw_reset->ret = err;
	mlx5_fw_reset_complete_reload(dev, unloaded: false);
}
556 | |
/* Worker: firmware signaled "sync reset unload".
 *
 * Tears down the HCA, unloads the driver (via the *_devl_locked variant
 * when this host initiated the reset, i.e. PENDING_COMP is set), acks
 * through the init segment, then polls the iseg reset state until firmware
 * publishes the next action or the RESET_UNLOAD timeout expires.  If the
 * action is TOGGLE_REQ, this host performs the PCI link toggle itself.
 * Always ends in mlx5_fw_reset_complete_reload() with the device already
 * unloaded.
 */
static void mlx5_sync_reset_unload_event(struct work_struct *work)
{
	struct mlx5_fw_reset *fw_reset;
	struct mlx5_core_dev *dev;
	unsigned long timeout;
	bool reset_action;
	u8 rst_state;
	int err;

	fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work);
	dev = fw_reset->dev;

	if (mlx5_sync_reset_clear_reset_requested(dev, poll_health: false))
		return;

	mlx5_core_warn(dev, "Sync Reset Unload. Function is forced down.\n");

	err = mlx5_cmd_fast_teardown_hca(dev);
	if (err)
		mlx5_core_warn(dev, "Fast teardown failed, unloading, err %d\n", err);
	else
		mlx5_enter_error_state(dev, force: true);

	if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags))
		mlx5_unload_one_devl_locked(dev, suspend: false);
	else
		mlx5_unload_one(dev, suspend: false);

	mlx5_set_fw_rst_ack(dev);
	mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n");

	/* Poll for firmware's chosen reset action (TOGGLE_REQ or IDLE). */
	reset_action = false;
	timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD));
	do {
		rst_state = mlx5_get_fw_rst_state(dev);
		if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ ||
		    rst_state == MLX5_FW_RST_STATE_IDLE) {
			reset_action = true;
			break;
		}
		msleep(msecs: 20);
	} while (!time_after(jiffies, timeout));

	if (!reset_action) {
		mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n",
			      rst_state);
		fw_reset->ret = -ETIMEDOUT;
		goto done;
	}

	mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n", rst_state);
	if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) {
		err = mlx5_pci_link_toggle(dev);
		if (err) {
			mlx5_core_warn(dev, "mlx5_pci_link_toggle failed, err %d\n", err);
			fw_reset->ret = err;
		}
	}

done:
	mlx5_fw_reset_complete_reload(dev, unloaded: true);
}
619 | |
620 | static void mlx5_sync_reset_abort_event(struct work_struct *work) |
621 | { |
622 | struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, |
623 | reset_abort_work); |
624 | struct mlx5_core_dev *dev = fw_reset->dev; |
625 | |
626 | if (mlx5_sync_reset_clear_reset_requested(dev, poll_health: true)) |
627 | return; |
628 | mlx5_core_warn(dev, "PCI Sync FW Update Reset Aborted.\n" ); |
629 | } |
630 | |
631 | static void mlx5_sync_reset_events_handle(struct mlx5_fw_reset *fw_reset, struct mlx5_eqe *eqe) |
632 | { |
633 | struct mlx5_eqe_sync_fw_update *sync_fw_update_eqe; |
634 | u8 sync_event_rst_type; |
635 | |
636 | sync_fw_update_eqe = &eqe->data.sync_fw_update; |
637 | sync_event_rst_type = sync_fw_update_eqe->sync_rst_state & SYNC_RST_STATE_MASK; |
638 | switch (sync_event_rst_type) { |
639 | case MLX5_SYNC_RST_STATE_RESET_REQUEST: |
640 | queue_work(wq: fw_reset->wq, work: &fw_reset->reset_request_work); |
641 | break; |
642 | case MLX5_SYNC_RST_STATE_RESET_UNLOAD: |
643 | queue_work(wq: fw_reset->wq, work: &fw_reset->reset_unload_work); |
644 | break; |
645 | case MLX5_SYNC_RST_STATE_RESET_NOW: |
646 | queue_work(wq: fw_reset->wq, work: &fw_reset->reset_now_work); |
647 | break; |
648 | case MLX5_SYNC_RST_STATE_RESET_ABORT: |
649 | queue_work(wq: fw_reset->wq, work: &fw_reset->reset_abort_work); |
650 | break; |
651 | } |
652 | } |
653 | |
654 | static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long action, void *data) |
655 | { |
656 | struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb); |
657 | struct mlx5_eqe *eqe = data; |
658 | |
659 | if (test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags)) |
660 | return NOTIFY_DONE; |
661 | |
662 | switch (eqe->sub_type) { |
663 | case MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT: |
664 | queue_work(wq: fw_reset->wq, work: &fw_reset->fw_live_patch_work); |
665 | break; |
666 | case MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT: |
667 | mlx5_sync_reset_events_handle(fw_reset, eqe); |
668 | break; |
669 | default: |
670 | return NOTIFY_DONE; |
671 | } |
672 | |
673 | return NOTIFY_OK; |
674 | } |
675 | |
/* Wait for the initiator-side sync reset flow to finish.
 *
 * 'done' is completed by mlx5_fw_reset_complete_reload().  The wait budget
 * is PCI_SYNC_UPDATE, extended by RESET_UNLOAD when firmware supports the
 * driver-unload flow.  If the workers flagged RELOAD_REQUIRED (PCI link
 * toggle failed), recover with a full unload/load cycle.  PENDING_COMP is
 * always cleared on exit.  Returns the worker's result or -ETIMEDOUT.
 */
int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev)
{
	unsigned long pci_sync_update_timeout = mlx5_tout_ms(dev, PCI_SYNC_UPDATE);
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
	unsigned long timeout;
	int err;

	if (MLX5_CAP_GEN(dev, pci_sync_for_fw_update_with_driver_unload))
		pci_sync_update_timeout += mlx5_tout_ms(dev, RESET_UNLOAD);
	timeout = msecs_to_jiffies(m: pci_sync_update_timeout);
	if (!wait_for_completion_timeout(x: &fw_reset->done, timeout)) {
		mlx5_core_warn(dev, "FW sync reset timeout after %lu seconds\n",
			       pci_sync_update_timeout / 1000);
		err = -ETIMEDOUT;
		goto out;
	}
	err = fw_reset->ret;
	if (test_and_clear_bit(nr: MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, addr: &fw_reset->reset_flags)) {
		mlx5_unload_one_devl_locked(dev, suspend: false);
		mlx5_load_one_devl_locked(dev, recovery: true);
	}
out:
	clear_bit(nr: MLX5_FW_RESET_FLAGS_PENDING_COMP, addr: &fw_reset->reset_flags);
	return err;
}
701 | |
702 | void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev) |
703 | { |
704 | struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; |
705 | |
706 | if (!fw_reset) |
707 | return; |
708 | |
709 | MLX5_NB_INIT(&fw_reset->nb, fw_reset_event_notifier, GENERAL_EVENT); |
710 | mlx5_eq_notifier_register(dev, nb: &fw_reset->nb); |
711 | } |
712 | |
713 | void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev) |
714 | { |
715 | struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; |
716 | |
717 | if (!fw_reset) |
718 | return; |
719 | |
720 | mlx5_eq_notifier_unregister(dev, nb: &fw_reset->nb); |
721 | } |
722 | |
723 | void mlx5_drain_fw_reset(struct mlx5_core_dev *dev) |
724 | { |
725 | struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; |
726 | |
727 | if (!fw_reset) |
728 | return; |
729 | |
730 | set_bit(nr: MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, addr: &fw_reset->reset_flags); |
731 | cancel_work_sync(work: &fw_reset->fw_live_patch_work); |
732 | cancel_work_sync(work: &fw_reset->reset_request_work); |
733 | cancel_work_sync(work: &fw_reset->reset_unload_work); |
734 | cancel_work_sync(work: &fw_reset->reset_reload_work); |
735 | cancel_work_sync(work: &fw_reset->reset_now_work); |
736 | cancel_work_sync(work: &fw_reset->reset_abort_work); |
737 | } |
738 | |
/* Devlink runtime parameter exposing remote-reset consent; backed by the
 * (inverted) NACK_RESET_REQUEST flag via the get/set callbacks above.
 */
static const struct devlink_param mlx5_fw_reset_devlink_params[] = {
	DEVLINK_PARAM_GENERIC(ENABLE_REMOTE_DEV_RESET, BIT(DEVLINK_PARAM_CMODE_RUNTIME),
			      mlx5_fw_reset_enable_remote_dev_reset_get,
			      mlx5_fw_reset_enable_remote_dev_reset_set, NULL),
};
744 | |
745 | int mlx5_fw_reset_init(struct mlx5_core_dev *dev) |
746 | { |
747 | struct mlx5_fw_reset *fw_reset; |
748 | int err; |
749 | |
750 | if (!MLX5_CAP_MCAM_REG(dev, mfrl)) |
751 | return 0; |
752 | |
753 | fw_reset = kzalloc(size: sizeof(*fw_reset), GFP_KERNEL); |
754 | if (!fw_reset) |
755 | return -ENOMEM; |
756 | fw_reset->wq = create_singlethread_workqueue("mlx5_fw_reset_events" ); |
757 | if (!fw_reset->wq) { |
758 | kfree(objp: fw_reset); |
759 | return -ENOMEM; |
760 | } |
761 | |
762 | fw_reset->dev = dev; |
763 | dev->priv.fw_reset = fw_reset; |
764 | |
765 | err = devl_params_register(devlink: priv_to_devlink(priv: dev), |
766 | params: mlx5_fw_reset_devlink_params, |
767 | ARRAY_SIZE(mlx5_fw_reset_devlink_params)); |
768 | if (err) { |
769 | destroy_workqueue(wq: fw_reset->wq); |
770 | kfree(objp: fw_reset); |
771 | return err; |
772 | } |
773 | |
774 | INIT_WORK(&fw_reset->fw_live_patch_work, mlx5_fw_live_patch_event); |
775 | INIT_WORK(&fw_reset->reset_request_work, mlx5_sync_reset_request_event); |
776 | INIT_WORK(&fw_reset->reset_unload_work, mlx5_sync_reset_unload_event); |
777 | INIT_WORK(&fw_reset->reset_reload_work, mlx5_sync_reset_reload_work); |
778 | INIT_WORK(&fw_reset->reset_now_work, mlx5_sync_reset_now_event); |
779 | INIT_WORK(&fw_reset->reset_abort_work, mlx5_sync_reset_abort_event); |
780 | |
781 | init_completion(x: &fw_reset->done); |
782 | return 0; |
783 | } |
784 | |
785 | void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev) |
786 | { |
787 | struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; |
788 | |
789 | if (!fw_reset) |
790 | return; |
791 | |
792 | devl_params_unregister(devlink: priv_to_devlink(priv: dev), |
793 | params: mlx5_fw_reset_devlink_params, |
794 | ARRAY_SIZE(mlx5_fw_reset_devlink_params)); |
795 | destroy_workqueue(wq: fw_reset->wq); |
796 | kfree(objp: dev->priv.fw_reset); |
797 | } |
798 | |