diff mbox series

[v4,3/3] cxl/pci: Add sysfs attribute for CXL 1.1 device link status

Message ID 20240409073528.13214-4-kobayashi.da-06@fujitsu.com
State New
Headers show
Series cxl: Export cxl1.1 device link status to sysfs | expand

Commit Message

Kobayashi,Daisuke April 9, 2024, 7:35 a.m. UTC
Add sysfs attribute for CXL 1.1 device link status to the cxl pci device.

In CXL1.1, the link status of the device is included in the RCRB mapped to
the memory mapped register area. Critically, that arrangement makes the link
status and control registers invisible to existing PCI user tooling.

Export those registers via sysfs with the expectation that PCI user
tooling will alternatively look for these sysfs files when attempting to
access these CXL 1.1 endpoint registers.

Signed-off-by: "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com>
---
 drivers/cxl/pci.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

Comments

Bjorn Helgaas April 9, 2024, 3:05 p.m. UTC | #1
On Tue, Apr 09, 2024 at 04:35:28PM +0900, Kobayashi,Daisuke wrote:
> Add sysfs attribute for CXL 1.1 device link status to the cxl pci device.
> 
> In CXL1.1, the link status of the device is included in the RCRB mapped to
> the memory mapped register area. Critically, that arrangement makes the link
> status and control registers invisible to existing PCI user tooling.

Idle thought: PCIe does define RCRB, even pre-CXL.  Maybe the PCI core
should be enhanced to comprehend RCRB directly?

> +static ssize_t rcd_link_status_show(struct device *dev,
> +				   struct device_attribute *attr, char *buf)
> +{
> +	struct cxl_port *port;
> +	struct cxl_dport *dport;
> +	struct device *parent = dev->parent;
> +	struct pci_dev *parent_pdev = to_pci_dev(parent);
> +
> +	port = cxl_pci_find_port(parent_pdev, &dport);
> +	if (!port)
> +		return -EINVAL;
> +
> +	return sysfs_emit(buf, "%x\n", dport->rcrb.rcd_lnkstatus);

Is it really what you want to capture PCI_EXP_LNKSTA once at
enumeration-time and expose that static value forever?  I assume
status bits can change over time, so I would naively expect that you
want the *current* value, not just a value from the distant past.

Bjorn
Dan Williams April 9, 2024, 6:04 p.m. UTC | #2
Bjorn Helgaas wrote:
> On Tue, Apr 09, 2024 at 04:35:28PM +0900, Kobayashi,Daisuke wrote:
> > Add sysfs attribute for CXL 1.1 device link status to the cxl pci device.
> > 
> > In CXL1.1, the link status of the device is included in the RCRB mapped to
> > the memory mapped register area. Critically, that arrangement makes the link
> > status and control registers invisible to existing PCI user tooling.
> 
> Idle thought: PCIe does define RCRB, even pre-CXL.  Maybe the PCI core
> should be enhanced to comprehend RCRB directly?

It depends on if this slow drip of features continues, and it seems that
PCIe base RCRB is scoped to a single device/port whereas CXL appears to
extend it to merge the endpoint config space and root-port config space
into a double-sized RCRB area.

I.e. there will continue to be CXL specifics involved.

Also, this is a one-generation-quirk as CXL 2.0+ hosts drop this awkward
RCRB arrangement.

> > +static ssize_t rcd_link_status_show(struct device *dev,
> > +				   struct device_attribute *attr, char *buf)
> > +{
> > +	struct cxl_port *port;
> > +	struct cxl_dport *dport;
> > +	struct device *parent = dev->parent;
> > +	struct pci_dev *parent_pdev = to_pci_dev(parent);
> > +
> > +	port = cxl_pci_find_port(parent_pdev, &dport);
> > +	if (!port)
> > +		return -EINVAL;
> > +
> > +	return sysfs_emit(buf, "%x\n", dport->rcrb.rcd_lnkstatus);
> 
> Is it really what you want to capture PCI_EXP_LNKSTA once at
> enumeration-time and expose that static value forever?  I assume
> status bits can change over time, so I would naively expect that you
> want the *current* value, not just a value from the distant past.

I expect this should copy what is done for aer_cap where that single
RCRB capability block is cached for future access. That said many of the
link status change events would also cause the device to be rescanned
and that value is refreshed once per driver bind event.
Dan Williams April 9, 2024, 9:33 p.m. UTC | #3
Kobayashi,Daisuke wrote:
> Add sysfs attribute for CXL 1.1 device link status to the cxl pci device.
> 
> In CXL1.1, the link status of the device is included in the RCRB mapped to
> the memory mapped register area. Critically, that arrangement makes the link
> status and control registers invisible to existing PCI user tooling.
> 
> Export those registers via sysfs with the expectation that PCI user
> tooling will alternatively look for these sysfs files when attempting to
> access to these CXL 1.1 endpoints registers.
> 
> Signed-off-by: "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com>
> ---
>  drivers/cxl/pci.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 74 insertions(+)
> 
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 2ff361e756d6..0ff15738b1ba 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -786,6 +786,79 @@ static int cxl_event_config(struct pci_host_bridge *host_bridge,
>  	return 0;
>  }
>  
> +static ssize_t rcd_link_cap_show(struct device *dev,
> +				   struct device_attribute *attr, char *buf)
> +{
> +	struct cxl_port *port;
> +	struct cxl_dport *dport;
> +	struct device *parent = dev->parent;
> +	struct pci_dev *parent_pdev = to_pci_dev(parent);
> +
> +	port = cxl_pci_find_port(parent_pdev, &dport);
> +	if (!port)
> +		return -EINVAL;

A few problems with this:

1/ No need to convert to the parent PCI device when there is a lookup
routine to go from cxl_memdev to its upstream port.

        struct cxl_dev_state *cxlds = dev_get_drvdata(dev);
        struct cxl_memdev *cxlmd = cxlds->cxlmd;

2/ The port reference is leaked. Add a put_cxl_port() __free() routine
like this:

	diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
	index 534e25e2f0a4..d81bc4cc0a4c 100644
	--- a/drivers/cxl/cxl.h
	+++ b/drivers/cxl/cxl.h
	@@ -744,6 +744,7 @@ DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_cxl_root(_T))
	 int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd);
	 void cxl_bus_rescan(void);
	 void cxl_bus_drain(void);
	+DEFINE_FREE(put_cxl_port, struct cxl_port *, if (_T) put_cxl_port(_T))
	 struct cxl_port *cxl_pci_find_port(struct pci_dev *pdev,
	                                   struct cxl_dport **dport);
	 struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd,
	
...and then:

	struct cxl_port *port __free(put_cxl_port) = cxl_mem_find_port(cxlmd, &dport);

3/ The port corresponding to a memdev can disappear at any time so you
need to do the same validation the cxl_mem_probe() does to keep the port
active during the register access:

	guard(device)(&port->dev);
	if (!port->dev.driver)
		return -ENXIO;

...then you can read from the cached PCIe capability similar to how the
error handler path reads from aer_cap.
Dan Williams April 9, 2024, 9:47 p.m. UTC | #4
Dan Williams wrote:
> Kobayashi,Daisuke wrote:
> > Add sysfs attribute for CXL 1.1 device link status to the cxl pci device.
> > 
> > In CXL1.1, the link status of the device is included in the RCRB mapped to
> > the memory mapped register area. Critically, that arrangement makes the link
> > status and control registers invisible to existing PCI user tooling.
> > 
> > Export those registers via sysfs with the expectation that PCI user
> > tooling will alternatively look for these sysfs files when attempting to
> > access to these CXL 1.1 endpoints registers.
> > 
> > Signed-off-by: "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com>
> > ---
> >  drivers/cxl/pci.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 74 insertions(+)
> > 
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index 2ff361e756d6..0ff15738b1ba 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
[..]
> 3/ The port corresponding to a memdev can disappear at any time so you
> need to do the same validation the cxl_mem_probe() does to keep the port
> active during the register access:
> 
> 	guard(device)(&port->dev);
> 	if (!port->dev.driver)
> 		return -ENXIO;

Apologies, I made a mistake here. Copy how cxl_mem_probe() accesses the
dport.

	endpoint_parent = port->uport_dev;
> 	guard(device)(endpoint_parent);
	if (!endpoint_parent->driver)
		return -ENXIO;
Lukas Wunner April 9, 2024, 10:18 p.m. UTC | #5
On Tue, Apr 09, 2024 at 10:05:40AM -0500, Bjorn Helgaas wrote:
> On Tue, Apr 09, 2024 at 04:35:28PM +0900, Kobayashi,Daisuke wrote:
> > Add sysfs attribute for CXL 1.1 device link status to the cxl pci device.
> > 
> > In CXL1.1, the link status of the device is included in the RCRB mapped to
> > the memory mapped register area. Critically, that arrangement makes the link
> > status and control registers invisible to existing PCI user tooling.
> 
> Idle thought: PCIe does define RCRB, even pre-CXL.  Maybe the PCI core
> should be enhanced to comprehend RCRB directly?

The way CXL 1.1 (ab)uses the RCRB differs from what the PCIe Base Spec
envisions:

Per PCIe r6.2 sec 7.2.3, the RCRB contains additional Extended Capabilities
of a Root Port -- in addition to those in the Root Port's Config Space.
What we could do in the PCI core to support this is to amend our helpers
which search for Extended Capabilities to also search for them in the RCRB.

In fact, two years ago I cooked up a patch which does exactly that:

https://github.com/l1k/linux/commit/3eb94f042527

And I cooked up another patch to fetch the RCRB's address from the CXL
Early Discovery ACPI table:

https://github.com/l1k/linux/commit/d9d3cf45cf8c

The reason I never submitted the patches?  I realized after the fact that
CXL 1.1 uses the RCRB in a completely different way:

Per CXL r3.0 sec 8.2.1, RCH Downstream and RCD Upstream Ports do not
actually possess a Config Space.  Instead, they possess *only* an RCRB.
And that RCRB contains a Type 1 Configuration Space Header.

But because the PCIe Base Spec prescribes that there has to be an
Extended Capability at offset 0 of the RCRB, the CXL spec puts a
Null Extended Capability at offset 0 so that the Type 1 Config Space
Header is skipped.

However this means that the first dword of the Type 1 Config Space
Header does not contain a Vendor ID and Device ID.

So what we could do is create a fake pci_dev for each RCH Downstream and
RCD Upstream Port plus a specially crafted struct pci_ops whose ->read()
and ->write() callbacks access the RCRB.  But how do we know which Vendor
and Device ID to return from a ->read()?  There is none in the RCRB!

The CXL Consortium seems to have realized the mess they made with
CXL 1.1 and from CXL 2.0 onwards everything is now a proper PCI device.
I talked to Dan about my findings and his decision was basically to
not enable any of that legacy CXL 1.1 RCRB functionality in the kernel.

Thanks,

Lukas
Kobayashi,Daisuke April 10, 2024, 7:22 a.m. UTC | #6
> Dan Williams wrote:
> > Kobayashi,Daisuke wrote:
> > > Add sysfs attribute for CXL 1.1 device link status to the cxl pci device.
> > >
> > > In CXL1.1, the link status of the device is included in the RCRB mapped to
> > > the memory mapped register area. Critically, that arrangement makes the
> link
> > > status and control registers invisible to existing PCI user tooling.
> > >
> > > Export those registers via sysfs with the expectation that PCI user
> > > tooling will alternatively look for these sysfs files when attempting to
> > > access to these CXL 1.1 endpoints registers.
> > >
> > > Signed-off-by: "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com>
> > > ---
> > >  drivers/cxl/pci.c | 74
> +++++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 74 insertions(+)
> > >
> > > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > > index 2ff361e756d6..0ff15738b1ba 100644
> > > --- a/drivers/cxl/pci.c
> > > +++ b/drivers/cxl/pci.c
> [..]
> > 3/ The port corresponding to a memdev can disappear at any time so you
> > need to do the same validation the cxl_mem_probe() does to keep the port
> > active during the register access:
> >
> > 	guard(device)(&port->dev);
> > 	if (!port->dev.driver)
> > 		return -ENXIO;
> 
> Apologies, I made a mistake here. Copy how cxl_mem_probe() accesses the
> dport.
> 
> 	endpoint_parent = port->uport_dev;
> > 	guard(device)(endpoint_parent);
> 	if (!endpoint_parent->driver)
> 		return -ENXIO;

Thank you for your feedback.
I could not find the exact same code as the suggestion from cxl_mem_probe(), 
but would your suggestion be correct with the following modification:

diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 2ff361e756d6..0ff15738b1ba 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -786,6 +786,79 @@ static int cxl_event_config(struct pci_host_bridge *host_bridge,
 	return 0;
 }
 
+static ssize_t rcd_link_cap_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct cxl_port *port;
+	struct cxl_dport *dport;
+	struct cxl_dev_state *cxlds = dev_get_drvdata(dev);
+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
+	struct device *endpoint_parent;
+
+	port = cxl_mem_find_port(cxlmd, &dport);
+	if (!port)
+		return -EINVAL;
+
+	endpoint_parent = port->uport_dev;
+	guard(device)(endpoint_parent);
+	if (!endpoint_parent->driver)
+		return -ENXIO;
+	return sysfs_emit(buf, "%x\n", dport->rcrb.rcd_lnkcap);
+}
+static DEVICE_ATTR_RO(rcd_link_cap);
[..]
--

From reading the guard macro, my understanding is that this is a macro which
calls the constructor here, and calls the destructor when the scope is exited. 
Will this prevent the port from disappearing?
diff mbox series

Patch

diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 2ff361e756d6..0ff15738b1ba 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -786,6 +786,79 @@  static int cxl_event_config(struct pci_host_bridge *host_bridge,
 	return 0;
 }
 
+static ssize_t rcd_link_cap_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct cxl_port *port;
+	struct cxl_dport *dport;
+	struct device *parent = dev->parent;
+	struct pci_dev *parent_pdev = to_pci_dev(parent);
+
+	port = cxl_pci_find_port(parent_pdev, &dport);
+	if (!port)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%x\n", dport->rcrb.rcd_lnkcap);
+}
+static DEVICE_ATTR_RO(rcd_link_cap);
+
+static ssize_t rcd_link_ctrl_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct cxl_port *port;
+	struct cxl_dport *dport;
+	struct device *parent = dev->parent;
+	struct pci_dev *parent_pdev = to_pci_dev(parent);
+
+	port = cxl_pci_find_port(parent_pdev, &dport);
+	if (!port)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%x\n", dport->rcrb.rcd_lnkctrl);
+}
+static DEVICE_ATTR_RO(rcd_link_ctrl);
+
+static ssize_t rcd_link_status_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct cxl_port *port;
+	struct cxl_dport *dport;
+	struct device *parent = dev->parent;
+	struct pci_dev *parent_pdev = to_pci_dev(parent);
+
+	port = cxl_pci_find_port(parent_pdev, &dport);
+	if (!port)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%x\n", dport->rcrb.rcd_lnkstatus);
+}
+static DEVICE_ATTR_RO(rcd_link_status);
+
+static struct attribute *cxl_rcd_attrs[] = {
+		&dev_attr_rcd_link_cap.attr,
+		&dev_attr_rcd_link_ctrl.attr,
+		&dev_attr_rcd_link_status.attr,
+		NULL
+};
+
+static umode_t cxl_rcd_visible(struct kobject *kobj,
+					  struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (is_cxl_restricted(pdev))
+		return a->mode;
+
+	return 0;
+}
+
+static struct attribute_group cxl_rcd_group = {
+		.attrs = cxl_rcd_attrs,
+		.is_visible = cxl_rcd_visible,
+};
+__ATTRIBUTE_GROUPS(cxl_rcd);
+
 static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
@@ -969,6 +1042,7 @@  static struct pci_driver cxl_pci_driver = {
 	.id_table		= cxl_mem_pci_tbl,
 	.probe			= cxl_pci_probe,
 	.err_handler		= &cxl_error_handlers,
+	.dev_groups		= cxl_rcd_groups,
 	.driver	= {
 		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
 	},