pci.c source code [linux/drivers/cxl/pci.c]

1	// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
3	#include <linux/unaligned.h>
4	#include <linux/io-64-nonatomic-lo-hi.h>
5	#include <linux/moduleparam.h>
6	#include <linux/module.h>
7	#include <linux/delay.h>
8	#include <linux/sizes.h>
9	#include <linux/mutex.h>
10	#include <linux/list.h>
11	#include <linux/pci.h>
12	#include <linux/aer.h>
13	#include <linux/io.h>
14	#include <cxl/mailbox.h>
15	#include "cxlmem.h"
16	#include "cxlpci.h"
17	#include "cxl.h"
18	#include "pmu.h"
19
20	/**
21	* DOC: cxl pci
22	*
23	* This implements the PCI exclusive functionality for a CXL device as it is
24	* defined by the Compute Express Link specification. CXL devices may surface
25	* certain functionality even if it isn't CXL enabled. While this driver is
26	* focused around the PCI specific aspects of a CXL device, it binds to the
27	* specific CXL memory device class code, and therefore the implementation of
28	* cxl_pci is focused around CXL memory devices.
29	*
30	* The driver has several responsibilities, mainly:
31	* - Create the memX device and register on the CXL bus.
32	* - Enumerate device's register interface and map them.
33	* - Registers nvdimm bridge device with cxl_core.
34	* - Registers a CXL mailbox with cxl_core.
35	*/
36
/*
 * Non-zero while the doorbell bit is set in the mailbox control register
 * of @cxlds.
 */
#define cxl_doorbell_busy(cxlds)                                         \
	(readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) &           \
	 CXLDEV_MBOX_CTRL_DOORBELL)
40
/*
 * CXL 2.0 - 8.2.8.4
 *
 * Note: despite the _MS suffix this value is expressed in jiffies (2 * HZ);
 * cxl_pci_mbox_wait_for_doorbell() adds it directly to a jiffies timestamp.
 */
#define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)
43
44	/*
45	* CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to
46	* dictate how long to wait for the mailbox to become ready. The new
47	* field allows the device to tell software the amount of time to wait
48	* before mailbox ready. This field per the spec theoretically allows
49	* for up to 255 seconds. 255 seconds is unreasonably long, its longer
50	* than the maximum SATA port link recovery wait. Default to 60 seconds
51	* until someone builds a CXL device that needs more time in practice.
52	*/
53	static unsigned short mbox_ready_timeout = `60`;
54	module_param(mbox_ready_timeout, ushort, `0644`);
55	MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");
56
57	static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
58	{
59	const unsigned long start = jiffies;
60	unsigned long end = start;
61
62	while (cxl_doorbell_busy(cxlds)) {
63	end = jiffies;
64
65	if (time_after(end, start + CXL_MAILBOX_TIMEOUT_MS)) {
66	/ Check again in case preempted before timeout test /
67	if (!cxl_doorbell_busy(cxlds))
68	break;
69	return -ETIMEDOUT;
70	}
71	cpu_relax();
72	}
73
74	dev_dbg(cxlds->dev, "Doorbell wait took %dms",
75	jiffies_to_msecs(end) - jiffies_to_msecs(start));
76	return `0`;
77	}
78
/*
 * Rate-limited error print that appends the decoded memory device status
 * flags (fatal / firmware-halt) to @msg. @msg must be a string literal since
 * it is concatenated with the format suffix. @status is parenthesized so an
 * expression argument cannot change the meaning of the bit tests.
 */
#define cxl_err(dev, status, msg)                                        \
	dev_err_ratelimited(dev, msg ", device state %s%s\n",            \
			    (status) & CXLMDEV_DEV_FATAL ? " fatal" : "", \
			    (status) & CXLMDEV_FW_HALT ? " firmware-halt" : "")
83
/*
 * Rate-limited error print for a failed mailbox command: reports @msg, the
 * opcode of @cmd and the decoded memory device status flags. @msg must be a
 * string literal since it is concatenated with the format suffix. @status is
 * parenthesized so an expression argument cannot change the meaning of the
 * bit tests.
 */
#define cxl_cmd_err(dev, cmd, status, msg)                                \
	dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n", \
			    (cmd)->opcode,                                \
			    (status) & CXLMDEV_DEV_FATAL ? " fatal" : "", \
			    (status) & CXLMDEV_FW_HALT ? " firmware-halt" : "")
89
/*
 * The dev_id cookie of a threaded irq has to be globally unique. Each irq
 * requested for a given cxlds therefore gets its own cxl_dev_id wrapper
 * that simply points back at that cxlds.
 */
struct cxl_dev_id {
	struct cxl_dev_state *cxlds;
};
97
98	static int cxl_request_irq(struct cxl_dev_state cxlds, int* irq,
99	irq_handler_t thread_fn)
100	{
101	struct device *dev = cxlds->dev;
102	struct cxl_dev_id *dev_id;
103
104	dev_id = devm_kzalloc(dev, size: sizeof(*dev_id), GFP_KERNEL);
105	if (!dev_id)
106	return -ENOMEM;
107	dev_id->cxlds = cxlds;
108
109	return devm_request_threaded_irq(dev, irq, NULL, thread_fn,
110	IRQF_SHARED \| IRQF_ONESHOT, NULL,
111	dev_id);
112	}
113
114	static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
115	{
116	u64 reg;
117
118	reg = readq(addr: cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
119	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == `100`;
120	}
121
122	static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
123	{
124	u64 reg;
125	u16 opcode;
126	struct cxl_dev_id *dev_id = id;
127	struct cxl_dev_state *cxlds = dev_id->cxlds;
128	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
129	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
130
131	if (!cxl_mbox_background_complete(cxlds))
132	return IRQ_NONE;
133
134	reg = readq(addr: cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
135	opcode = FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK, reg);
136	if (opcode == CXL_MBOX_OP_SANITIZE) {
137	mutex_lock(&cxl_mbox->mbox_mutex);
138	if (mds->security.sanitize_node)
139	mod_delayed_work(wq: system_percpu_wq, dwork: &mds->security.poll_dwork, delay: `0`);
140	mutex_unlock(lock: &cxl_mbox->mbox_mutex);
141	} else {
142	/ short-circuit the wait in __cxl_pci_mbox_send_cmd() /
143	rcuwait_wake_up(w: &cxl_mbox->mbox_wait);
144	}
145
146	return IRQ_HANDLED;
147	}
148
149	/*
150	* Sanitization operation polling mode.
151	*/
152	static void cxl_mbox_sanitize_work(struct work_struct *work)
153	{
154	struct cxl_memdev_state *mds =
155	container_of(work, typeof(*mds), security.poll_dwork.work);
156	struct cxl_dev_state *cxlds = &mds->cxlds;
157	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
158
159	mutex_lock(&cxl_mbox->mbox_mutex);
160	if (cxl_mbox_background_complete(cxlds)) {
161	mds->security.poll_tmo_secs = `0`;
162	if (mds->security.sanitize_node)
163	sysfs_notify_dirent(kn: mds->security.sanitize_node);
164	mds->security.sanitize_active = false;
165
166	dev_dbg(cxlds->dev, "Sanitization operation ended\n");
167	} else {
168	int timeout = mds->security.poll_tmo_secs + `10`;
169
170	mds->security.poll_tmo_secs = min(`15` * `60`, timeout);
171	schedule_delayed_work(dwork: &mds->security.poll_dwork, delay: timeout * HZ);
172	}
173	mutex_unlock(lock: &cxl_mbox->mbox_mutex);
174	}
175
176	/**
177	* __cxl_pci_mbox_send_cmd() - Execute a mailbox command
178	* @cxl_mbox: CXL mailbox context
179	* @mbox_cmd: Command to send to the memory device.
180	*
181	* Context: Any context. Expects mbox_mutex to be held.
182	* Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
183	* Caller should check the return code in @mbox_cmd to make sure it
184	* succeeded.
185	*
186	* This is a generic form of the CXL mailbox send command thus only using the
187	* registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory
188	* devices, and perhaps other types of CXL devices may have further information
189	* available upon error conditions. Driver facilities wishing to send mailbox
190	* commands should use the wrapper command.
191	*
192	* The CXL spec allows for up to two mailboxes. The intention is for the primary
193	* mailbox to be OS controlled and the secondary mailbox to be used by system
194	* firmware. This allows the OS and firmware to communicate with the device and
195	* not need to coordinate with each other. The driver only uses the primary
196	* mailbox.
197	*/
198	static int __cxl_pci_mbox_send_cmd(struct cxl_mailbox *cxl_mbox,
199	struct cxl_mbox_cmd *mbox_cmd)
200	{
201	struct cxl_dev_state *cxlds = mbox_to_cxlds(cxl_mbox);
202	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
203	void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
204	struct device *dev = cxlds->dev;
205	u64 cmd_reg, status_reg;
206	size_t out_len;
207	int rc;
208
209	lockdep_assert_held(&cxl_mbox->mbox_mutex);
210
211	/*
212	* Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
213	* 1. Caller reads MB Control Register to verify doorbell is clear
214	* 2. Caller writes Command Register
215	* 3. Caller writes Command Payload Registers if input payload is non-empty
216	* 4. Caller writes MB Control Register to set doorbell
217	* 5. Caller either polls for doorbell to be clear or waits for interrupt if configured
218	* 6. Caller reads MB Status Register to fetch Return code
219	* 7. If command successful, Caller reads Command Register to get Payload Length
220	* 8. If output payload is non-empty, host reads Command Payload Registers
221	*
222	* Hardware is free to do whatever it wants before the doorbell is rung,
223	* and isn't allowed to change anything after it clears the doorbell. As
224	* such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can
225	* also happen in any order (though some orders might not make sense).
226	*/
227
228	/ #1 /
229	if (cxl_doorbell_busy(cxlds)) {
230	u64 md_status =
231	readq(addr: cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
232
233	cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
234	"mailbox queue busy");
235	return -EBUSY;
236	}
237
238	/*
239	* With sanitize polling, hardware might be done and the poller still
240	* not be in sync. Ensure no new command comes in until so. Keep the
241	* hardware semantics and only allow device health status.
242	*/
243	if (mds->security.poll_tmo_secs > `0`) {
244	if (mbox_cmd->opcode != CXL_MBOX_OP_GET_HEALTH_INFO)
245	return -EBUSY;
246	}
247
248	cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
249	mbox_cmd->opcode);
250	if (mbox_cmd->size_in) {
251	if (WARN_ON(!mbox_cmd->payload_in))
252	return -EINVAL;
253
254	cmd_reg \|= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK,
255	mbox_cmd->size_in);
256	memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
257	}
258
259	/ #2, #3 /
260	writeq(val: cmd_reg, addr: cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
261
262	/ #4 /
263	dev_dbg(dev, "Sending command: 0x%04x\n", mbox_cmd->opcode);
264	writel(CXLDEV_MBOX_CTRL_DOORBELL,
265	addr: cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
266
267	/ #5 /
268	rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
269	if (rc == -ETIMEDOUT) {
270	u64 md_status = readq(addr: cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
271
272	cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
273	return rc;
274	}
275
276	/ #6 /
277	status_reg = readq(addr: cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
278	mbox_cmd->return_code =
279	FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
280
281	/*
282	* Handle the background command in a synchronous manner.
283	*
284	* All other mailbox commands will serialize/queue on the mbox_mutex,
285	* which we currently hold. Furthermore this also guarantees that
286	* cxl_mbox_background_complete() checks are safe amongst each other,
287	* in that no new bg operation can occur in between.
288	*
289	* Background operations are timesliced in accordance with the nature
290	* of the command. In the event of timeout, the mailbox state is
291	* indeterminate until the next successful command submission and the
292	* driver can get back in sync with the hardware state.
293	*/
294	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
295	u64 bg_status_reg;
296	int i, timeout;
297
298	/*
299	* Sanitization is a special case which monopolizes the device
300	* and cannot be timesliced. Handle asynchronously instead,
301	* and allow userspace to poll(2) for completion.
302	*/
303	if (mbox_cmd->opcode == CXL_MBOX_OP_SANITIZE) {
304	if (mds->security.sanitize_active)
305	return -EBUSY;
306
307	/ give first timeout a second /
308	timeout = `1`;
309	mds->security.poll_tmo_secs = timeout;
310	mds->security.sanitize_active = true;
311	schedule_delayed_work(dwork: &mds->security.poll_dwork,
312	delay: timeout * HZ);
313	dev_dbg(dev, "Sanitization operation started\n");
314	goto success;
315	}
316
317	dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
318	mbox_cmd->opcode);
319
320	timeout = mbox_cmd->poll_interval_ms;
321	for (i = `0`; i < mbox_cmd->poll_count; i++) {
322	if (rcuwait_wait_event_timeout(&cxl_mbox->mbox_wait,
323	cxl_mbox_background_complete(cxlds),
324	TASK_UNINTERRUPTIBLE,
325	msecs_to_jiffies(timeout)) > `0`)
326	break;
327	}
328
329	if (!cxl_mbox_background_complete(cxlds)) {
330	dev_err(dev, "timeout waiting for background (%d ms)\n",
331	timeout * mbox_cmd->poll_count);
332	return -ETIMEDOUT;
333	}
334
335	bg_status_reg = readq(addr: cxlds->regs.mbox +
336	CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
337	mbox_cmd->return_code =
338	FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
339	bg_status_reg);
340	dev_dbg(dev,
341	"Mailbox background operation (0x%04x) completed\n",
342	mbox_cmd->opcode);
343	}
344
345	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
346	dev_dbg(dev, "Mailbox operation had an error: %s\n",
347	cxl_mbox_cmd_rc2str(mbox_cmd));
348	return `0`; / completed but caller must check return_code /
349	}
350
351	success:
352	/ #7 /
353	cmd_reg = readq(addr: cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
354	out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);
355
356	/ #8 /
357	if (out_len && mbox_cmd->payload_out) {
358	/*
359	* Sanitize the copy. If hardware misbehaves, out_len per the
360	* spec can actually be greater than the max allowed size (21
361	* bits available but spec defined 1M max). The caller also may
362	* have requested less data than the hardware supplied even
363	* within spec.
364	*/
365	size_t n;
366
367	n = min3(mbox_cmd->size_out, cxl_mbox->payload_size, out_len);
368	memcpy_fromio(mbox_cmd->payload_out, payload, n);
369	mbox_cmd->size_out = n;
370	} else {
371	mbox_cmd->size_out = `0`;
372	}
373
374	return `0`;
375	}
376
377	static int cxl_pci_mbox_send(struct cxl_mailbox *cxl_mbox,
378	struct cxl_mbox_cmd *cmd)
379	{
380	int rc;
381
382	mutex_lock(&cxl_mbox->mbox_mutex);
383	rc = __cxl_pci_mbox_send_cmd(cxl_mbox, mbox_cmd: cmd);
384	mutex_unlock(lock: &cxl_mbox->mbox_mutex);
385
386	return rc;
387	}
388
389	static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail)
390	{
391	struct cxl_dev_state *cxlds = &mds->cxlds;
392	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
393	const int cap = readl(addr: cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
394	struct device *dev = cxlds->dev;
395	unsigned long timeout;
396	int irq, msgnum;
397	u64 md_status;
398	u32 ctrl;
399
400	timeout = jiffies + mbox_ready_timeout * HZ;
401	do {
402	md_status = readq(addr: cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
403	if (md_status & CXLMDEV_MBOX_IF_READY)
404	break;
405	if (msleep_interruptible(msecs: `100`))
406	break;
407	} while (!time_after(jiffies, timeout));
408
409	if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
410	cxl_err(dev, md_status, "timeout awaiting mailbox ready");
411	return -ETIMEDOUT;
412	}
413
414	/*
415	* A command may be in flight from a previous driver instance,
416	* think kexec, do one doorbell wait so that
417	* __cxl_pci_mbox_send_cmd() can assume that it is the only
418	* source for future doorbell busy events.
419	*/
420	if (cxl_pci_mbox_wait_for_doorbell(cxlds) != `0`) {
421	cxl_err(dev, md_status, "timeout awaiting mailbox idle");
422	return -ETIMEDOUT;
423	}
424
425	cxl_mbox->mbox_send = cxl_pci_mbox_send;
426	cxl_mbox->payload_size =
427	`1` << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);
428
429	/*
430	* CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
431	*
432	* If the size is too small, mandatory commands will not work and so
433	* there's no point in going forward. If the size is too large, there's
434	* no harm is soft limiting it.
435	*/
436	cxl_mbox->payload_size = min_t(size_t, cxl_mbox->payload_size, SZ_1M);
437	if (cxl_mbox->payload_size < `256`) {
438	dev_err(dev, "Mailbox is too small (%zub)",
439	cxl_mbox->payload_size);
440	return -ENXIO;
441	}
442
443	dev_dbg(dev, "Mailbox payload sized %zu", cxl_mbox->payload_size);
444
445	INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mbox_sanitize_work);
446
447	/ background command interrupts are optional /
448	if (!(cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) \|\| !irq_avail)
449	return `0`;
450
451	msgnum = FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap);
452	irq = pci_irq_vector(to_pci_dev(cxlds->dev), nr: msgnum);
453	if (irq < `0`)
454	return `0`;
455
456	if (cxl_request_irq(cxlds, irq, thread_fn: cxl_pci_mbox_irq))
457	return `0`;
458
459	dev_dbg(cxlds->dev, "Mailbox interrupts enabled\n");
460	/ enable background command mbox irq support /
461	ctrl = readl(addr: cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
462	ctrl \|= CXLDEV_MBOX_CTRL_BG_CMD_IRQ;
463	writel(val: ctrl, addr: cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
464
465	return `0`;
466	}
467
468	/*
469	* Assume that any RCIEP that emits the CXL memory expander class code
470	* is an RCD
471	*/
472	static bool is_cxl_restricted(struct pci_dev *pdev)
473	{
474	return pci_pcie_type(dev: pdev) == PCI_EXP_TYPE_RC_END;
475	}
476
477	static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev,
478	struct cxl_register_map *map,
479	struct cxl_dport *dport)
480	{
481	resource_size_t component_reg_phys;
482
483	map = (struct* cxl_register_map) {
484	.host = &pdev->dev,
485	.resource = CXL_RESOURCE_NONE,
486	};
487
488	struct cxl_port *port __free(put_cxl_port) =
489	cxl_pci_find_port(pdev, dport: &dport);
490	if (!port)
491	return -EPROBE_DEFER;
492
493	component_reg_phys = cxl_rcd_component_reg_phys(dev: &pdev->dev, dport);
494	if (component_reg_phys == CXL_RESOURCE_NONE)
495	return -ENXIO;
496
497	map->resource = component_reg_phys;
498	map->reg_type = CXL_REGLOC_RBI_COMPONENT;
499	map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE;
500
501	return `0`;
502	}
503
504	static int cxl_pci_setup_regs(struct pci_dev pdev, enum* cxl_regloc_type type,
505	struct cxl_register_map *map)
506	{
507	int rc;
508
509	rc = cxl_find_regblock(pdev, type, map);
510
511	/*
512	* If the Register Locator DVSEC does not exist, check if it
513	* is an RCH and try to extract the Component Registers from
514	* an RCRB.
515	*/
516	if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) {
517	struct cxl_dport *dport;
518	struct cxl_port *port __free(put_cxl_port) =
519	cxl_pci_find_port(pdev, dport: &dport);
520	if (!port)
521	return -EPROBE_DEFER;
522
523	rc = cxl_rcrb_get_comp_regs(pdev, map, dport);
524	if (rc)
525	return rc;
526
527	rc = cxl_dport_map_rcd_linkcap(pdev, dport);
528	if (rc)
529	return rc;
530
531	} else if (rc) {
532	return rc;
533	}
534
535	return cxl_setup_regs(map);
536	}
537
538	static int cxl_pci_ras_unmask(struct pci_dev *pdev)
539	{
540	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
541	void __iomem *addr;
542	u32 orig_val, val, mask;
543	u16 cap;
544	int rc;
545
546	if (!cxlds->regs.ras) {
547	dev_dbg(&pdev->dev, "No RAS registers.\n");
548	return `0`;
549	}
550
551	/ BIOS has PCIe AER error control /
552	if (!pcie_aer_is_native(dev: pdev))
553	return `0`;
554
555	rc = pcie_capability_read_word(dev: pdev, PCI_EXP_DEVCTL, val: &cap);
556	if (rc)
557	return rc;
558
559	if (cap & PCI_EXP_DEVCTL_URRE) {
560	addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_MASK_OFFSET;
561	orig_val = readl(addr);
562
563	mask = CXL_RAS_UNCORRECTABLE_MASK_MASK \|
564	CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK;
565	val = orig_val & ~mask;
566	writel(val, addr);
567	dev_dbg(&pdev->dev,
568	"Uncorrectable RAS Errors Mask: %#x -> %#x\n",
569	orig_val, val);
570	}
571
572	if (cap & PCI_EXP_DEVCTL_CERE) {
573	addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_MASK_OFFSET;
574	orig_val = readl(addr);
575	val = orig_val & ~CXL_RAS_CORRECTABLE_MASK_MASK;
576	writel(val, addr);
577	dev_dbg(&pdev->dev, "Correctable RAS Errors Mask: %#x -> %#x\n",
578	orig_val, val);
579	}
580
581	return `0`;
582	}
583
/* devm action callback: release the event buffer with kvfree(). */
static void free_event_buf(void *buf)
{
	kvfree(buf);
}
588
589	/*
590	* There is a single buffer for reading event logs from the mailbox. All logs
591	* share this buffer protected by the mds->event_log_lock.
592	*/
593	static int cxl_mem_alloc_event_buf(struct cxl_memdev_state *mds)
594	{
595	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
596	struct cxl_get_event_payload *buf;
597
598	buf = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL);
599	if (!buf)
600	return -ENOMEM;
601	mds->event.buf = buf;
602
603	return devm_add_action_or_reset(mds->cxlds.dev, free_event_buf, buf);
604	}
605
606	static bool cxl_alloc_irq_vectors(struct pci_dev *pdev)
607	{
608	int nvecs;
609
610	/*
611	* Per CXL 3.0 3.1.1 CXL.io Endpoint a function on a CXL device must
612	* not generate INTx messages if that function participates in
613	* CXL.cache or CXL.mem.
614	*
615	* Additionally pci_alloc_irq_vectors() handles calling
616	* pci_free_irq_vectors() automatically despite not being called
617	* pcim_*. See pci_setup_msi_context().
618	*/
619	nvecs = pci_alloc_irq_vectors(dev: pdev, min_vecs: `1`, CXL_PCI_DEFAULT_MAX_VECTORS,
620	PCI_IRQ_MSIX \| PCI_IRQ_MSI);
621	if (nvecs < `1`) {
622	dev_dbg(&pdev->dev, "Failed to alloc irq vectors: %d\n", nvecs);
623	return false;
624	}
625	return true;
626	}
627
628	static irqreturn_t cxl_event_thread(int irq, void *id)
629	{
630	struct cxl_dev_id *dev_id = id;
631	struct cxl_dev_state *cxlds = dev_id->cxlds;
632	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
633	u32 status;
634
635	do {
636	/*
637	* CXL 3.0 8.2.8.3.1: The lower 32 bits are the status;
638	* ignore the reserved upper 32 bits
639	*/
640	status = readl(addr: cxlds->regs.status + CXLDEV_DEV_EVENT_STATUS_OFFSET);
641	/ Ignore logs unknown to the driver /
642	status &= CXLDEV_EVENT_STATUS_ALL;
643	if (!status)
644	break;
645	cxl_mem_get_event_records(mds, status);
646	cond_resched();
647	} while (status);
648
649	return IRQ_HANDLED;
650	}
651
652	static int cxl_event_req_irq(struct cxl_dev_state *cxlds, u8 setting)
653	{
654	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
655	int irq;
656
657	if (FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting) != CXL_INT_MSI_MSIX)
658	return -ENXIO;
659
660	irq = pci_irq_vector(dev: pdev,
661	FIELD_GET(CXLDEV_EVENT_INT_MSGNUM_MASK, setting));
662	if (irq < `0`)
663	return irq;
664
665	return cxl_request_irq(cxlds, irq, thread_fn: cxl_event_thread);
666	}
667
668	static int cxl_event_get_int_policy(struct cxl_memdev_state *mds,
669	struct cxl_event_interrupt_policy *policy)
670	{
671	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
672	struct cxl_mbox_cmd mbox_cmd = {
673	.opcode = CXL_MBOX_OP_GET_EVT_INT_POLICY,
674	.payload_out = policy,
675	.size_out = sizeof(*policy),
676	};
677	int rc;
678
679	rc = cxl_internal_send_cmd(cxl_mbox, cmd: &mbox_cmd);
680	if (rc < `0`)
681	dev_err(mds->cxlds.dev,
682	"Failed to get event interrupt policy : %d", rc);
683
684	return rc;
685	}
686
687	static int cxl_event_config_msgnums(struct cxl_memdev_state *mds,
688	struct cxl_event_interrupt_policy *policy)
689	{
690	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
691	struct cxl_mbox_cmd mbox_cmd;
692	int rc;
693
694	policy = (struct* cxl_event_interrupt_policy) {
695	.info_settings = CXL_INT_MSI_MSIX,
696	.warn_settings = CXL_INT_MSI_MSIX,
697	.failure_settings = CXL_INT_MSI_MSIX,
698	.fatal_settings = CXL_INT_MSI_MSIX,
699	};
700
701	mbox_cmd = (struct cxl_mbox_cmd) {
702	.opcode = CXL_MBOX_OP_SET_EVT_INT_POLICY,
703	.payload_in = policy,
704	.size_in = sizeof(*policy),
705	};
706
707	rc = cxl_internal_send_cmd(cxl_mbox, cmd: &mbox_cmd);
708	if (rc < `0`) {
709	dev_err(mds->cxlds.dev, "Failed to set event interrupt policy : %d",
710	rc);
711	return rc;
712	}
713
714	/ Retrieve final interrupt settings /
715	return cxl_event_get_int_policy(mds, policy);
716	}
717
718	static int cxl_event_irqsetup(struct cxl_memdev_state *mds)
719	{
720	struct cxl_dev_state *cxlds = &mds->cxlds;
721	struct cxl_event_interrupt_policy policy;
722	int rc;
723
724	rc = cxl_event_config_msgnums(mds, policy: &policy);
725	if (rc)
726	return rc;
727
728	rc = cxl_event_req_irq(cxlds, setting: policy.info_settings);
729	if (rc) {
730	dev_err(cxlds->dev, "Failed to get interrupt for event Info log\n");
731	return rc;
732	}
733
734	rc = cxl_event_req_irq(cxlds, setting: policy.warn_settings);
735	if (rc) {
736	dev_err(cxlds->dev, "Failed to get interrupt for event Warn log\n");
737	return rc;
738	}
739
740	rc = cxl_event_req_irq(cxlds, setting: policy.failure_settings);
741	if (rc) {
742	dev_err(cxlds->dev, "Failed to get interrupt for event Failure log\n");
743	return rc;
744	}
745
746	rc = cxl_event_req_irq(cxlds, setting: policy.fatal_settings);
747	if (rc) {
748	dev_err(cxlds->dev, "Failed to get interrupt for event Fatal log\n");
749	return rc;
750	}
751
752	return `0`;
753	}
754
755	static bool cxl_event_int_is_fw(u8 setting)
756	{
757	u8 mode = FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting);
758
759	return mode == CXL_INT_FW;
760	}
761
762	static int cxl_event_config(struct pci_host_bridge *host_bridge,
763	struct cxl_memdev_state *mds, bool irq_avail)
764	{
765	struct cxl_event_interrupt_policy policy;
766	int rc;
767
768	/*
769	* When BIOS maintains CXL error reporting control, it will process
770	* event records. Only one agent can do so.
771	*/
772	if (!host_bridge->native_cxl_error)
773	return `0`;
774
775	if (!irq_avail) {
776	dev_info(mds->cxlds.dev, "No interrupt support, disable event processing.\n");
777	return `0`;
778	}
779
780	rc = cxl_event_get_int_policy(mds, policy: &policy);
781	if (rc)
782	return rc;
783
784	if (cxl_event_int_is_fw(setting: policy.info_settings) \|\|
785	cxl_event_int_is_fw(setting: policy.warn_settings) \|\|
786	cxl_event_int_is_fw(setting: policy.failure_settings) \|\|
787	cxl_event_int_is_fw(setting: policy.fatal_settings)) {
788	dev_err(mds->cxlds.dev,
789	"FW still in control of Event Logs despite _OSC settings\n");
790	return -EBUSY;
791	}
792
793	rc = cxl_mem_alloc_event_buf(mds);
794	if (rc)
795	return rc;
796
797	rc = cxl_event_irqsetup(mds);
798	if (rc)
799	return rc;
800
801	cxl_mem_get_event_records(mds, CXLDEV_EVENT_STATUS_ALL);
802
803	return `0`;
804	}
805
806	static int cxl_pci_type3_init_mailbox(struct cxl_dev_state *cxlds)
807	{
808	int rc;
809
810	/*
811	* Fail the init if there's no mailbox. For a type3 this is out of spec.
812	*/
813	if (!cxlds->reg_map.device_map.mbox.valid)
814	return -ENODEV;
815
816	rc = cxl_mailbox_init(cxl_mbox: &cxlds->cxl_mbox, host: cxlds->dev);
817	if (rc)
818	return rc;
819
820	return `0`;
821	}
822
823	static ssize_t rcd_pcie_cap_emit(struct device dev, u16 offset, char* *buf, size_t width)
824	{
825	struct cxl_dev_state *cxlds = dev_get_drvdata(dev);
826	struct cxl_memdev *cxlmd = cxlds->cxlmd;
827	struct device *root_dev;
828	struct cxl_dport *dport;
829	struct cxl_port *root __free(put_cxl_port) =
830	cxl_mem_find_port(cxlmd, dport: &dport);
831
832	if (!root)
833	return -ENXIO;
834
835	root_dev = root->uport_dev;
836	if (!root_dev)
837	return -ENXIO;
838
839	if (!dport->regs.rcd_pcie_cap)
840	return -ENXIO;
841
842	guard(device)(T: root_dev);
843	if (!root_dev->driver)
844	return -ENXIO;
845
846	switch (width) {
847	case `2`:
848	return sysfs_emit(buf, fmt: "%#x\n",
849	readw(addr: dport->regs.rcd_pcie_cap + offset));
850	case `4`:
851	return sysfs_emit(buf, fmt: "%#x\n",
852	readl(addr: dport->regs.rcd_pcie_cap + offset));
853	default:
854	return -EINVAL;
855	}
856	}
857
858	static ssize_t rcd_link_cap_show(struct device *dev,
859	struct device_attribute attr, char* *buf)
860	{
861	return rcd_pcie_cap_emit(dev, PCI_EXP_LNKCAP, buf, width: sizeof(u32));
862	}
863	static DEVICE_ATTR_RO(rcd_link_cap);
864
865	static ssize_t rcd_link_ctrl_show(struct device *dev,
866	struct device_attribute attr, char* *buf)
867	{
868	return rcd_pcie_cap_emit(dev, PCI_EXP_LNKCTL, buf, width: sizeof(u16));
869	}
870	static DEVICE_ATTR_RO(rcd_link_ctrl);
871
872	static ssize_t rcd_link_status_show(struct device *dev,
873	struct device_attribute attr, char* *buf)
874	{
875	return rcd_pcie_cap_emit(dev, PCI_EXP_LNKSTA, buf, width: sizeof(u16));
876	}
877	static DEVICE_ATTR_RO(rcd_link_status);
878
879	static struct attribute *cxl_rcd_attrs[] = {
880	&dev_attr_rcd_link_cap.attr,
881	&dev_attr_rcd_link_ctrl.attr,
882	&dev_attr_rcd_link_status.attr,
883	NULL
884	};
885
886	static umode_t cxl_rcd_visible(struct kobject kobj, struct* attribute a, int* n)
887	{
888	struct device *dev = kobj_to_dev(kobj);
889	struct pci_dev *pdev = to_pci_dev(dev);
890
891	if (is_cxl_restricted(pdev))
892	return a->mode;
893
894	return `0`;
895	}
896
897	static struct attribute_group cxl_rcd_group = {
898	.attrs = cxl_rcd_attrs,
899	.is_visible = cxl_rcd_visible,
900	};
901	__ATTRIBUTE_GROUPS(cxl_rcd);
902
903	static int cxl_pci_probe(struct pci_dev pdev, const* struct pci_device_id *id)
904	{
905	struct pci_host_bridge *host_bridge = pci_find_host_bridge(bus: pdev->bus);
906	struct cxl_dpa_info range_info = { `0` };
907	struct cxl_memdev_state *mds;
908	struct cxl_dev_state *cxlds;
909	struct cxl_register_map map;
910	struct cxl_memdev *cxlmd;
911	int rc, pmu_count;
912	unsigned int i;
913	bool irq_avail;
914
915	/*
916	* Double check the anonymous union trickery in struct cxl_regs
917	* FIXME switch to struct_group()
918	*/
919	BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
920	offsetof(struct cxl_regs, device_regs.memdev));
921
922	rc = pcim_enable_device(pdev);
923	if (rc)
924	return rc;
925	pci_set_master(dev: pdev);
926
927	mds = cxl_memdev_state_create(dev: &pdev->dev);
928	if (IS_ERR(ptr: mds))
929	return PTR_ERR(ptr: mds);
930	cxlds = &mds->cxlds;
931	pci_set_drvdata(pdev, data: cxlds);
932
933	cxlds->rcd = is_cxl_restricted(pdev);
934	cxlds->serial = pci_get_dsn(dev: pdev);
935	cxlds->cxl_dvsec = pci_find_dvsec_capability(
936	dev: pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
937	if (!cxlds->cxl_dvsec)
938	dev_warn(&pdev->dev,
939	"Device DVSEC not present, skip CXL.mem init\n");
940
941	rc = cxl_pci_setup_regs(pdev, type: CXL_REGLOC_RBI_MEMDEV, map: &map);
942	if (rc)
943	return rc;
944
945	rc = cxl_map_device_regs(map: &map, regs: &cxlds->regs.device_regs);
946	if (rc)
947	return rc;
948
949	/*
950	* If the component registers can't be found, the cxl_pci driver may
951	* still be useful for management functions so don't return an error.
952	*/
953	rc = cxl_pci_setup_regs(pdev, type: CXL_REGLOC_RBI_COMPONENT,
954	map: &cxlds->reg_map);
955	if (rc)
956	dev_warn(&pdev->dev, "No component registers (%d)\n", rc);
957	else if (!cxlds->reg_map.component_map.ras.valid)
958	dev_dbg(&pdev->dev, "RAS registers not found\n");
959
960	rc = cxl_map_component_regs(map: &cxlds->reg_map, regs: &cxlds->regs.component,
961	BIT(CXL_CM_CAP_CAP_ID_RAS));
962	if (rc)
963	dev_dbg(&pdev->dev, "Failed to map RAS capability.\n");
964
965	rc = cxl_pci_type3_init_mailbox(cxlds);
966	if (rc)
967	return rc;
968
969	rc = cxl_await_media_ready(cxlds);
970	if (rc == `0`)
971	cxlds->media_ready = true;
972	else
973	dev_warn(&pdev->dev, "Media not active (%d)\n", rc);
974
975	irq_avail = cxl_alloc_irq_vectors(pdev);
976
977	rc = cxl_pci_setup_mailbox(mds, irq_avail);
978	if (rc)
979	return rc;
980
981	rc = cxl_enumerate_cmds(mds);
982	if (rc)
983	return rc;
984
985	rc = cxl_set_timestamp(mds);
986	if (rc)
987	return rc;
988
989	rc = cxl_poison_state_init(mds);
990	if (rc)
991	return rc;
992
993	rc = cxl_dev_state_identify(mds);
994	if (rc)
995	return rc;
996
997	rc = cxl_mem_dpa_fetch(mds, info: &range_info);
998	if (rc)
999	return rc;
1000
1001	rc = cxl_dpa_setup(cxlds, info: &range_info);
1002	if (rc)
1003	return rc;
1004
1005	rc = devm_cxl_setup_features(cxlds);
1006	if (rc)
1007	dev_dbg(&pdev->dev, "No CXL Features discovered\n");
1008
1009	cxlmd = devm_cxl_add_memdev(host: &pdev->dev, cxlds);
1010	if (IS_ERR(ptr: cxlmd))
1011	return PTR_ERR(ptr: cxlmd);
1012
1013	rc = devm_cxl_setup_fw_upload(host: &pdev->dev, mds);
1014	if (rc)
1015	return rc;
1016
1017	rc = devm_cxl_sanitize_setup_notifier(host: &pdev->dev, cxlmd);
1018	if (rc)
1019	return rc;
1020
1021	rc = devm_cxl_setup_fwctl(host: &pdev->dev, cxlmd);
1022	if (rc)
1023	dev_dbg(&pdev->dev, "No CXL FWCTL setup\n");
1024
1025	pmu_count = cxl_count_regblock(pdev, type: CXL_REGLOC_RBI_PMU);
1026	if (pmu_count < `0`)
1027	return pmu_count;
1028
1029	for (i = `0`; i < pmu_count; i++) {
1030	struct cxl_pmu_regs pmu_regs;
1031
1032	rc = cxl_find_regblock_instance(pdev, type: CXL_REGLOC_RBI_PMU, map: &map, index: i);
1033	if (rc) {
1034	dev_dbg(&pdev->dev, "Could not find PMU regblock\n");
1035	break;
1036	}
1037
1038	rc = cxl_map_pmu_regs(map: &map, regs: &pmu_regs);
1039	if (rc) {
1040	dev_dbg(&pdev->dev, "Could not map PMU regs\n");
1041	break;
1042	}
1043
1044	rc = devm_cxl_pmu_add(parent: cxlds->dev, regs: &pmu_regs, assoc_id: cxlmd->id, idx: i, type: CXL_PMU_MEMDEV);
1045	if (rc) {
1046	dev_dbg(&pdev->dev, "Could not add PMU instance\n");
1047	break;
1048	}
1049	}
1050
1051	rc = cxl_event_config(host_bridge, mds, irq_avail);
1052	if (rc)
1053	return rc;
1054
1055	if (cxl_pci_ras_unmask(pdev))
1056	dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");
1057
1058	pci_save_state(dev: pdev);
1059
1060	return rc;
1061	}
1062
1063	static const struct pci_device_id cxl_mem_pci_tbl[] = {
1064	/ PCI class code for CXL.mem Type-3 Devices /
1065	{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << `8` \| CXL_MEMORY_PROGIF), ~`0`)},
1066	{ / terminate list / },
1067	};
1068	MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
1069
1070	static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev)
1071	{
1072	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
1073	struct cxl_memdev *cxlmd = cxlds->cxlmd;
1074	struct device *dev = &cxlmd->dev;
1075
1076	dev_info(&pdev->dev, "%s: restart CXL.mem after slot reset\n",
1077	dev_name(dev));
1078	pci_restore_state(dev: pdev);
1079	if (device_attach(dev) <= `0`)
1080	return PCI_ERS_RESULT_DISCONNECT;
1081	return PCI_ERS_RESULT_RECOVERED;
1082	}
1083
1084	static void cxl_error_resume(struct pci_dev *pdev)
1085	{
1086	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
1087	struct cxl_memdev *cxlmd = cxlds->cxlmd;
1088	struct device *dev = &cxlmd->dev;
1089
1090	dev_info(&pdev->dev, "%s: error resume %s\n", dev_name(dev),
1091	dev->driver ? "successful" : "failed");
1092	}
1093
1094	static void cxl_reset_done(struct pci_dev *pdev)
1095	{
1096	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
1097	struct cxl_memdev *cxlmd = cxlds->cxlmd;
1098	struct device *dev = &pdev->dev;
1099
1100	/*
1101	* FLR does not expect to touch the HDM decoders and related
1102	* registers. SBR, however, will wipe all device configurations.
1103	* Issue a warning if there was an active decoder before the reset
1104	* that no longer exists.
1105	*/
1106	guard(device)(T: &cxlmd->dev);
1107	if (cxlmd->endpoint &&
1108	cxl_endpoint_decoder_reset_detected(port: cxlmd->endpoint)) {
1109	dev_crit(dev, "SBR happened without memory regions removal.\n");
1110	dev_crit(dev, "System may be unstable if regions hosted system memory.\n");
1111	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
1112	}
1113	}
1114
1115	static const struct pci_error_handlers cxl_error_handlers = {
1116	.error_detected = cxl_error_detected,
1117	.slot_reset = cxl_slot_reset,
1118	.resume = cxl_error_resume,
1119	.cor_error_detected = cxl_cor_error_detected,
1120	.reset_done = cxl_reset_done,
1121	};
1122
1123	static struct pci_driver cxl_pci_driver = {
1124	.name = KBUILD_MODNAME,
1125	.id_table = cxl_mem_pci_tbl,
1126	.probe = cxl_pci_probe,
1127	.err_handler = &cxl_error_handlers,
1128	.dev_groups = cxl_rcd_groups,
1129	.driver = {
1130	.probe_type = PROBE_PREFER_ASYNCHRONOUS,
1131	},
1132	};
1133
1134	#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
1135	static void cxl_handle_cper_event(enum cxl_event_type ev_type,
1136	struct cxl_cper_event_rec *rec)
1137	{
1138	struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
1139	struct pci_dev *pdev __free(pci_dev_put) = NULL;
1140	enum cxl_event_log_type log_type;
1141	struct cxl_dev_state *cxlds;
1142	unsigned int devfn;
1143	u32 hdr_flags;
1144
1145	pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
1146	device_id->segment_num, device_id->bus_num,
1147	device_id->device_num, device_id->func_num);
1148
1149	devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
1150	pdev = pci_get_domain_bus_and_slot(domain: device_id->segment_num,
1151	bus: device_id->bus_num, devfn);
1152	if (!pdev)
1153	return;
1154
1155	guard(device)(T: &pdev->dev);
1156	if (pdev->driver != &cxl_pci_driver)
1157	return;
1158
1159	cxlds = pci_get_drvdata(pdev);
1160	if (!cxlds)
1161	return;
1162
1163	/ Fabricate a log type /
1164	hdr_flags = get_unaligned_le24(p: rec->event.generic.hdr.flags);
1165	log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
1166
1167	cxl_event_trace_record(cxlmd: cxlds->cxlmd, type: log_type, event_type: ev_type,
1168	uuid: &uuid_null, evt: &rec->event);
1169	}
1170
1171	static void cxl_cper_work_fn(struct work_struct *work)
1172	{
1173	struct cxl_cper_work_data wd;
1174
1175	while (cxl_cper_kfifo_get(wd: &wd))
1176	cxl_handle_cper_event(ev_type: wd.event_type, rec: &wd.rec);
1177	}
1178	static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
1179
1180	static int __init cxl_pci_driver_init(void)
1181	{
1182	int rc;
1183
1184	rc = pci_register_driver(&cxl_pci_driver);
1185	if (rc)
1186	return rc;
1187
1188	rc = cxl_cper_register_work(work: &cxl_cper_work);
1189	if (rc)
1190	pci_unregister_driver(dev: &cxl_pci_driver);
1191
1192	return rc;
1193	}
1194
1195	static void __exit cxl_pci_driver_exit(void)
1196	{
1197	cxl_cper_unregister_work(work: &cxl_cper_work);
1198	cancel_work_sync(work: &cxl_cper_work);
1199	pci_unregister_driver(dev: &cxl_pci_driver);
1200	}
1201
1202	module_init(cxl_pci_driver_init);
1203	module_exit(cxl_pci_driver_exit);
1204	MODULE_DESCRIPTION("CXL: PCI manageability");
1205	MODULE_LICENSE("GPL v2");
1206	MODULE_IMPORT_NS("CXL");
1207

source code of linux/drivers/cxl/pci.c