1 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB |
2 | // Copyright (c) 2018 Mellanox Technologies |
3 | |
4 | #include <linux/mlx5/driver.h> |
5 | |
6 | #include "mlx5_core.h" |
7 | #include "lib/eq.h" |
8 | #include "lib/events.h" |
9 | |
10 | struct mlx5_event_nb { |
11 | struct mlx5_nb nb; |
12 | void *ctx; |
13 | }; |
14 | |
/* General event handlers for the low-level mlx5_core driver.
 *
 * Events that belong to other major features (clock, eswitch, FPGA,
 * FW tracer and many others) are handled elsewhere, by separate notifier
 * callbacks owned by those mlx5 components.
 */
static int any_notifier(struct notifier_block *nb, unsigned long type, void *data);
static int temp_warn(struct notifier_block *nb, unsigned long type, void *data);
static int port_module(struct notifier_block *nb, unsigned long type, void *data);
static int pcie_core(struct notifier_block *nb, unsigned long type, void *data);

/* Handler which forwards the event to events->fw_nh (the driver notifiers). */
static int forward_event(struct notifier_block *nb, unsigned long event, void *data);
28 | |
29 | static struct mlx5_nb events_nbs_ref[] = { |
30 | /* Events to be processed by mlx5_core */ |
31 | {.nb.notifier_call = any_notifier, .event_type = MLX5_EVENT_TYPE_NOTIFY_ANY }, |
32 | {.nb.notifier_call = temp_warn, .event_type = MLX5_EVENT_TYPE_TEMP_WARN_EVENT }, |
33 | {.nb.notifier_call = port_module, .event_type = MLX5_EVENT_TYPE_PORT_MODULE_EVENT }, |
34 | {.nb.notifier_call = pcie_core, .event_type = MLX5_EVENT_TYPE_GENERAL_EVENT }, |
35 | |
36 | /* Events to be forwarded (as is) to mlx5 core interfaces (mlx5e/mlx5_ib) */ |
37 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_PORT_CHANGE }, |
38 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_GENERAL_EVENT }, |
39 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_OBJECT_CHANGE }, |
40 | /* QP/WQ resource events to forward */ |
41 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_DCT_DRAINED }, |
42 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_PATH_MIG }, |
43 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_COMM_EST }, |
44 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SQ_DRAINED }, |
45 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_LAST_WQE }, |
46 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_CATAS_ERROR }, |
47 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_PATH_MIG_FAILED }, |
48 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR }, |
49 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_ACCESS_ERROR }, |
50 | /* SRQ events */ |
51 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_CATAS_ERROR }, |
52 | {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_RQ_LIMIT }, |
53 | }; |
54 | |
55 | struct mlx5_events { |
56 | struct mlx5_core_dev *dev; |
57 | struct workqueue_struct *wq; |
58 | struct mlx5_event_nb notifiers[ARRAY_SIZE(events_nbs_ref)]; |
59 | /* driver notifier chain for fw events */ |
60 | struct atomic_notifier_head fw_nh; |
61 | /* port module events stats */ |
62 | struct mlx5_pme_stats pme_stats; |
63 | /*pcie_core*/ |
64 | struct work_struct pcie_core_work; |
65 | /* driver notifier chain for sw events */ |
66 | struct blocking_notifier_head sw_nh; |
67 | }; |
68 | |
69 | static const char *eqe_type_str(u8 type) |
70 | { |
71 | switch (type) { |
72 | case MLX5_EVENT_TYPE_COMP: |
73 | return "MLX5_EVENT_TYPE_COMP" ; |
74 | case MLX5_EVENT_TYPE_PATH_MIG: |
75 | return "MLX5_EVENT_TYPE_PATH_MIG" ; |
76 | case MLX5_EVENT_TYPE_COMM_EST: |
77 | return "MLX5_EVENT_TYPE_COMM_EST" ; |
78 | case MLX5_EVENT_TYPE_SQ_DRAINED: |
79 | return "MLX5_EVENT_TYPE_SQ_DRAINED" ; |
80 | case MLX5_EVENT_TYPE_SRQ_LAST_WQE: |
81 | return "MLX5_EVENT_TYPE_SRQ_LAST_WQE" ; |
82 | case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: |
83 | return "MLX5_EVENT_TYPE_SRQ_RQ_LIMIT" ; |
84 | case MLX5_EVENT_TYPE_CQ_ERROR: |
85 | return "MLX5_EVENT_TYPE_CQ_ERROR" ; |
86 | case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: |
87 | return "MLX5_EVENT_TYPE_WQ_CATAS_ERROR" ; |
88 | case MLX5_EVENT_TYPE_PATH_MIG_FAILED: |
89 | return "MLX5_EVENT_TYPE_PATH_MIG_FAILED" ; |
90 | case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: |
91 | return "MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR" ; |
92 | case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: |
93 | return "MLX5_EVENT_TYPE_WQ_ACCESS_ERROR" ; |
94 | case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: |
95 | return "MLX5_EVENT_TYPE_SRQ_CATAS_ERROR" ; |
96 | case MLX5_EVENT_TYPE_INTERNAL_ERROR: |
97 | return "MLX5_EVENT_TYPE_INTERNAL_ERROR" ; |
98 | case MLX5_EVENT_TYPE_PORT_CHANGE: |
99 | return "MLX5_EVENT_TYPE_PORT_CHANGE" ; |
100 | case MLX5_EVENT_TYPE_GPIO_EVENT: |
101 | return "MLX5_EVENT_TYPE_GPIO_EVENT" ; |
102 | case MLX5_EVENT_TYPE_PORT_MODULE_EVENT: |
103 | return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT" ; |
104 | case MLX5_EVENT_TYPE_TEMP_WARN_EVENT: |
105 | return "MLX5_EVENT_TYPE_TEMP_WARN_EVENT" ; |
106 | case MLX5_EVENT_TYPE_REMOTE_CONFIG: |
107 | return "MLX5_EVENT_TYPE_REMOTE_CONFIG" ; |
108 | case MLX5_EVENT_TYPE_DB_BF_CONGESTION: |
109 | return "MLX5_EVENT_TYPE_DB_BF_CONGESTION" ; |
110 | case MLX5_EVENT_TYPE_STALL_EVENT: |
111 | return "MLX5_EVENT_TYPE_STALL_EVENT" ; |
112 | case MLX5_EVENT_TYPE_CMD: |
113 | return "MLX5_EVENT_TYPE_CMD" ; |
114 | case MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED: |
115 | return "MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED" ; |
116 | case MLX5_EVENT_TYPE_VHCA_STATE_CHANGE: |
117 | return "MLX5_EVENT_TYPE_VHCA_STATE_CHANGE" ; |
118 | case MLX5_EVENT_TYPE_PAGE_REQUEST: |
119 | return "MLX5_EVENT_TYPE_PAGE_REQUEST" ; |
120 | case MLX5_EVENT_TYPE_PAGE_FAULT: |
121 | return "MLX5_EVENT_TYPE_PAGE_FAULT" ; |
122 | case MLX5_EVENT_TYPE_PPS_EVENT: |
123 | return "MLX5_EVENT_TYPE_PPS_EVENT" ; |
124 | case MLX5_EVENT_TYPE_NIC_VPORT_CHANGE: |
125 | return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE" ; |
126 | case MLX5_EVENT_TYPE_FPGA_ERROR: |
127 | return "MLX5_EVENT_TYPE_FPGA_ERROR" ; |
128 | case MLX5_EVENT_TYPE_FPGA_QP_ERROR: |
129 | return "MLX5_EVENT_TYPE_FPGA_QP_ERROR" ; |
130 | case MLX5_EVENT_TYPE_GENERAL_EVENT: |
131 | return "MLX5_EVENT_TYPE_GENERAL_EVENT" ; |
132 | case MLX5_EVENT_TYPE_MONITOR_COUNTER: |
133 | return "MLX5_EVENT_TYPE_MONITOR_COUNTER" ; |
134 | case MLX5_EVENT_TYPE_DEVICE_TRACER: |
135 | return "MLX5_EVENT_TYPE_DEVICE_TRACER" ; |
136 | case MLX5_EVENT_TYPE_OBJECT_CHANGE: |
137 | return "MLX5_EVENT_TYPE_OBJECT_CHANGE" ; |
138 | default: |
139 | return "Unrecognized event" ; |
140 | } |
141 | } |
142 | |
143 | /* handles all FW events, type == eqe->type */ |
144 | static int any_notifier(struct notifier_block *nb, |
145 | unsigned long type, void *data) |
146 | { |
147 | struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); |
148 | struct mlx5_events *events = event_nb->ctx; |
149 | struct mlx5_eqe *eqe = data; |
150 | |
151 | mlx5_core_dbg(events->dev, "Async eqe type %s, subtype (%d)\n" , |
152 | eqe_type_str(eqe->type), eqe->sub_type); |
153 | return NOTIFY_OK; |
154 | } |
155 | |
156 | /* type == MLX5_EVENT_TYPE_TEMP_WARN_EVENT */ |
157 | static int temp_warn(struct notifier_block *nb, unsigned long type, void *data) |
158 | { |
159 | struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); |
160 | struct mlx5_events *events = event_nb->ctx; |
161 | struct mlx5_eqe *eqe = data; |
162 | u64 value_lsb; |
163 | u64 value_msb; |
164 | |
165 | value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb); |
166 | value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb); |
167 | |
168 | mlx5_core_warn(events->dev, |
169 | "High temperature on sensors with bit set %llx %llx" , |
170 | value_msb, value_lsb); |
171 | |
172 | return NOTIFY_OK; |
173 | } |
174 | |
175 | /* MLX5_EVENT_TYPE_PORT_MODULE_EVENT */ |
176 | static const char *mlx5_pme_status_to_string(enum port_module_event_status_type status) |
177 | { |
178 | switch (status) { |
179 | case MLX5_MODULE_STATUS_PLUGGED: |
180 | return "Cable plugged" ; |
181 | case MLX5_MODULE_STATUS_UNPLUGGED: |
182 | return "Cable unplugged" ; |
183 | case MLX5_MODULE_STATUS_ERROR: |
184 | return "Cable error" ; |
185 | case MLX5_MODULE_STATUS_DISABLED: |
186 | return "Cable disabled" ; |
187 | default: |
188 | return "Unknown status" ; |
189 | } |
190 | } |
191 | |
192 | static const char *mlx5_pme_error_to_string(enum port_module_event_error_type error) |
193 | { |
194 | switch (error) { |
195 | case MLX5_MODULE_EVENT_ERROR_POWER_BUDGET_EXCEEDED: |
196 | return "Power budget exceeded" ; |
197 | case MLX5_MODULE_EVENT_ERROR_LONG_RANGE_FOR_NON_MLNX: |
198 | return "Long Range for non MLNX cable" ; |
199 | case MLX5_MODULE_EVENT_ERROR_BUS_STUCK: |
200 | return "Bus stuck (I2C or data shorted)" ; |
201 | case MLX5_MODULE_EVENT_ERROR_NO_EEPROM_RETRY_TIMEOUT: |
202 | return "No EEPROM/retry timeout" ; |
203 | case MLX5_MODULE_EVENT_ERROR_ENFORCE_PART_NUMBER_LIST: |
204 | return "Enforce part number list" ; |
205 | case MLX5_MODULE_EVENT_ERROR_UNKNOWN_IDENTIFIER: |
206 | return "Unknown identifier" ; |
207 | case MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE: |
208 | return "High Temperature" ; |
209 | case MLX5_MODULE_EVENT_ERROR_BAD_CABLE: |
210 | return "Bad or shorted cable/module" ; |
211 | case MLX5_MODULE_EVENT_ERROR_PCIE_POWER_SLOT_EXCEEDED: |
212 | return "One or more network ports have been powered down due to insufficient/unadvertised power on the PCIe slot" ; |
213 | default: |
214 | return "Unknown error" ; |
215 | } |
216 | } |
217 | |
218 | /* type == MLX5_EVENT_TYPE_PORT_MODULE_EVENT */ |
219 | static int port_module(struct notifier_block *nb, unsigned long type, void *data) |
220 | { |
221 | struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); |
222 | struct mlx5_events *events = event_nb->ctx; |
223 | struct mlx5_eqe *eqe = data; |
224 | |
225 | enum port_module_event_status_type module_status; |
226 | enum port_module_event_error_type error_type; |
227 | struct mlx5_eqe_port_module *module_event_eqe; |
228 | const char *status_str; |
229 | u8 module_num; |
230 | |
231 | module_event_eqe = &eqe->data.port_module; |
232 | module_status = module_event_eqe->module_status & |
233 | PORT_MODULE_EVENT_MODULE_STATUS_MASK; |
234 | error_type = module_event_eqe->error_type & |
235 | PORT_MODULE_EVENT_ERROR_TYPE_MASK; |
236 | |
237 | if (module_status < MLX5_MODULE_STATUS_NUM) |
238 | events->pme_stats.status_counters[module_status]++; |
239 | |
240 | if (module_status == MLX5_MODULE_STATUS_ERROR) |
241 | if (error_type < MLX5_MODULE_EVENT_ERROR_NUM) |
242 | events->pme_stats.error_counters[error_type]++; |
243 | |
244 | if (!printk_ratelimit()) |
245 | return NOTIFY_OK; |
246 | |
247 | module_num = module_event_eqe->module; |
248 | status_str = mlx5_pme_status_to_string(status: module_status); |
249 | if (module_status == MLX5_MODULE_STATUS_ERROR) { |
250 | const char *error_str = mlx5_pme_error_to_string(error: error_type); |
251 | |
252 | mlx5_core_err(events->dev, |
253 | "Port module event[error]: module %u, %s, %s\n" , |
254 | module_num, status_str, error_str); |
255 | } else { |
256 | mlx5_core_info(events->dev, |
257 | "Port module event: module %u, %s\n" , |
258 | module_num, status_str); |
259 | } |
260 | |
261 | return NOTIFY_OK; |
262 | } |
263 | |
/* Values of the MPEIN register's pwr_status field, as decoded by
 * mlx5_pcie_event().
 */
enum {
	MLX5_PCI_POWER_COULD_NOT_BE_READ	= 0x0,
	MLX5_PCI_POWER_SUFFICIENT_REPORTED	= 0x1,
	MLX5_PCI_POWER_INSUFFICIENT_REPORTED	= 0x2,
};
269 | |
270 | static void mlx5_pcie_event(struct work_struct *work) |
271 | { |
272 | u32 out[MLX5_ST_SZ_DW(mpein_reg)] = {0}; |
273 | u32 in[MLX5_ST_SZ_DW(mpein_reg)] = {0}; |
274 | struct mlx5_events *events; |
275 | struct mlx5_core_dev *dev; |
276 | u8 power_status; |
277 | u16 pci_power; |
278 | |
279 | events = container_of(work, struct mlx5_events, pcie_core_work); |
280 | dev = events->dev; |
281 | |
282 | if (!MLX5_CAP_MCAM_FEATURE(dev, pci_status_and_power)) |
283 | return; |
284 | |
285 | mlx5_core_access_reg(dev, data_in: in, size_in: sizeof(in), data_out: out, size_out: sizeof(out), |
286 | reg_num: MLX5_REG_MPEIN, arg: 0, write: 0); |
287 | power_status = MLX5_GET(mpein_reg, out, pwr_status); |
288 | pci_power = MLX5_GET(mpein_reg, out, pci_power); |
289 | |
290 | switch (power_status) { |
291 | case MLX5_PCI_POWER_COULD_NOT_BE_READ: |
292 | mlx5_core_info_rl(dev, |
293 | "PCIe slot power capability was not advertised.\n" ); |
294 | break; |
295 | case MLX5_PCI_POWER_INSUFFICIENT_REPORTED: |
296 | mlx5_core_warn_rl(dev, |
297 | "Detected insufficient power on the PCIe slot (%uW).\n" , |
298 | pci_power); |
299 | break; |
300 | case MLX5_PCI_POWER_SUFFICIENT_REPORTED: |
301 | mlx5_core_info_rl(dev, |
302 | "PCIe slot advertised sufficient power (%uW).\n" , |
303 | pci_power); |
304 | break; |
305 | } |
306 | } |
307 | |
308 | static int pcie_core(struct notifier_block *nb, unsigned long type, void *data) |
309 | { |
310 | struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, |
311 | struct mlx5_event_nb, |
312 | nb); |
313 | struct mlx5_events *events = event_nb->ctx; |
314 | struct mlx5_eqe *eqe = data; |
315 | |
316 | switch (eqe->sub_type) { |
317 | case MLX5_GENERAL_SUBTYPE_PCI_POWER_CHANGE_EVENT: |
318 | queue_work(wq: events->wq, work: &events->pcie_core_work); |
319 | break; |
320 | default: |
321 | return NOTIFY_DONE; |
322 | } |
323 | |
324 | return NOTIFY_OK; |
325 | } |
326 | |
327 | void mlx5_get_pme_stats(struct mlx5_core_dev *dev, struct mlx5_pme_stats *stats) |
328 | { |
329 | *stats = dev->priv.events->pme_stats; |
330 | } |
331 | |
332 | /* forward event as is to registered interfaces (mlx5e/mlx5_ib) */ |
333 | static int forward_event(struct notifier_block *nb, unsigned long event, void *data) |
334 | { |
335 | struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); |
336 | struct mlx5_events *events = event_nb->ctx; |
337 | struct mlx5_eqe *eqe = data; |
338 | |
339 | mlx5_core_dbg(events->dev, "Async eqe type %s, subtype (%d) forward to interfaces\n" , |
340 | eqe_type_str(eqe->type), eqe->sub_type); |
341 | atomic_notifier_call_chain(nh: &events->fw_nh, val: event, v: data); |
342 | return NOTIFY_OK; |
343 | } |
344 | |
345 | int mlx5_events_init(struct mlx5_core_dev *dev) |
346 | { |
347 | struct mlx5_events *events = kzalloc(size: sizeof(*events), GFP_KERNEL); |
348 | |
349 | if (!events) |
350 | return -ENOMEM; |
351 | |
352 | ATOMIC_INIT_NOTIFIER_HEAD(&events->fw_nh); |
353 | events->dev = dev; |
354 | dev->priv.events = events; |
355 | events->wq = create_singlethread_workqueue("mlx5_events" ); |
356 | if (!events->wq) { |
357 | kfree(objp: events); |
358 | return -ENOMEM; |
359 | } |
360 | INIT_WORK(&events->pcie_core_work, mlx5_pcie_event); |
361 | BLOCKING_INIT_NOTIFIER_HEAD(&events->sw_nh); |
362 | |
363 | return 0; |
364 | } |
365 | |
366 | void mlx5_events_cleanup(struct mlx5_core_dev *dev) |
367 | { |
368 | destroy_workqueue(wq: dev->priv.events->wq); |
369 | kvfree(addr: dev->priv.events); |
370 | } |
371 | |
372 | void mlx5_events_start(struct mlx5_core_dev *dev) |
373 | { |
374 | struct mlx5_events *events = dev->priv.events; |
375 | int i; |
376 | |
377 | for (i = 0; i < ARRAY_SIZE(events_nbs_ref); i++) { |
378 | events->notifiers[i].nb = events_nbs_ref[i]; |
379 | events->notifiers[i].ctx = events; |
380 | mlx5_eq_notifier_register(dev, nb: &events->notifiers[i].nb); |
381 | } |
382 | } |
383 | |
384 | void mlx5_events_stop(struct mlx5_core_dev *dev) |
385 | { |
386 | struct mlx5_events *events = dev->priv.events; |
387 | int i; |
388 | |
389 | for (i = ARRAY_SIZE(events_nbs_ref) - 1; i >= 0 ; i--) |
390 | mlx5_eq_notifier_unregister(dev, nb: &events->notifiers[i].nb); |
391 | flush_workqueue(events->wq); |
392 | } |
393 | |
394 | /* This API is used only for processing and forwarding firmware |
395 | * events to mlx5 consumer. |
396 | */ |
397 | int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb) |
398 | { |
399 | struct mlx5_events *events = dev->priv.events; |
400 | |
401 | return atomic_notifier_chain_register(nh: &events->fw_nh, nb); |
402 | } |
403 | EXPORT_SYMBOL(mlx5_notifier_register); |
404 | |
405 | int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb) |
406 | { |
407 | struct mlx5_events *events = dev->priv.events; |
408 | |
409 | return atomic_notifier_chain_unregister(nh: &events->fw_nh, nb); |
410 | } |
411 | EXPORT_SYMBOL(mlx5_notifier_unregister); |
412 | |
413 | int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, void *data) |
414 | { |
415 | return atomic_notifier_call_chain(nh: &events->fw_nh, val: event, v: data); |
416 | } |
417 | |
418 | /* This API is used only for processing and forwarding driver-specific |
419 | * events to mlx5 consumers. |
420 | */ |
421 | int mlx5_blocking_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb) |
422 | { |
423 | struct mlx5_events *events = dev->priv.events; |
424 | |
425 | return blocking_notifier_chain_register(nh: &events->sw_nh, nb); |
426 | } |
427 | EXPORT_SYMBOL(mlx5_blocking_notifier_register); |
428 | |
429 | int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb) |
430 | { |
431 | struct mlx5_events *events = dev->priv.events; |
432 | |
433 | return blocking_notifier_chain_unregister(nh: &events->sw_nh, nb); |
434 | } |
435 | EXPORT_SYMBOL(mlx5_blocking_notifier_unregister); |
436 | |
437 | int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int event, |
438 | void *data) |
439 | { |
440 | struct mlx5_events *events = dev->priv.events; |
441 | |
442 | return blocking_notifier_call_chain(nh: &events->sw_nh, val: event, v: data); |
443 | } |
444 | |