diff mbox series

[3/5] cxl/pci: Add DOE Auxiliary Devices

Message ID 20211105235056.3711389-4-ira.weiny@intel.com
State New
Headers show
Series CXL: Read CDAT and DSMAS data from the device. | expand

Commit Message

Ira Weiny Nov. 5, 2021, 11:50 p.m. UTC
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>

CXL devices have DOE mailboxes.  Create auxiliary devices which can be
driven by the generic DOE auxiliary driver.

Co-developed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

---
Changes from V4:
	Make this an Auxiliary Driver rather than library functions
	Split this out into it's own patch
	Base on the new cxl_dev_state structure

Changes from Ben
	s/CXL_DOE_DEV_NAME/DOE_DEV_NAME/
---
 drivers/cxl/Kconfig |   1 +
 drivers/cxl/cxl.h   |  13 +++++
 drivers/cxl/pci.c   | 120 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 134 insertions(+)

Comments

Jonathan Cameron Nov. 8, 2021, 1:09 p.m. UTC | #1
On Fri, 5 Nov 2021 16:50:54 -0700
<ira.weiny@intel.com> wrote:

> From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> CXL devices have DOE mailboxes.  Create auxiliary devices which can be
> driven by the generic DOE auxiliary driver.

I'd like Bjorn's input on the balance here between what is done
in cxl/pci.c and what should be in the PCI core code somewhere.

The tricky bit preventing this being done entirely as part of 
PCI device instantiation is the interrupts.

> 
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

Mostly new code, so not sure I should really be listed on this
one but I don't mind either way.

A few comments inline but overall this ended up nice and clean.

> 
> ---
> Changes from V4:
> 	Make this an Auxiliary Driver rather than library functions
> 	Split this out into it's own patch
> 	Base on the new cxl_dev_state structure
> 
> Changes from Ben
> 	s/CXL_DOE_DEV_NAME/DOE_DEV_NAME/
> ---
>  drivers/cxl/Kconfig |   1 +
>  drivers/cxl/cxl.h   |  13 +++++
>  drivers/cxl/pci.c   | 120 ++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 134 insertions(+)
> 
> diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> index 67c91378f2dd..9d53720bea07 100644
> --- a/drivers/cxl/Kconfig
> +++ b/drivers/cxl/Kconfig
> @@ -16,6 +16,7 @@ if CXL_BUS
>  config CXL_MEM
>  	tristate "CXL.mem: Memory Devices"
>  	default CXL_BUS
> +	select PCI_DOE_DRIVER
>  	help
>  	  The CXL.mem protocol allows a device to act as a provider of
>  	  "System RAM" and/or "Persistent Memory" that is fully coherent
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 5e2e93451928..f1241a7f2b7b 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -75,6 +75,19 @@ static inline int cxl_hdm_decoder_count(u32 cap_hdr)
>  #define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
>  #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
>  
> +/*
> + * Address space properties derived from:
> + * CXL 2.0 8.2.5.12.7 CXL HDM Decoder 0 Control Register
> + */
> +#define CXL_ADDRSPACE_RAM   BIT(0)
> +#define CXL_ADDRSPACE_PMEM  BIT(1)
> +#define CXL_ADDRSPACE_TYPE2 BIT(2)
> +#define CXL_ADDRSPACE_TYPE3 BIT(3)
> +#define CXL_ADDRSPACE_MASK  GENMASK(3, 0)

Stray.

> +
> +#define CXL_DOE_PROTOCOL_COMPLIANCE 0
> +#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2
> +
>  #define CXL_COMPONENT_REGS() \
>  	void __iomem *hdm_decoder
>  
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 8dc91fd3396a..df524b74f1d2 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -6,6 +6,7 @@
>  #include <linux/mutex.h>
>  #include <linux/list.h>
>  #include <linux/pci.h>
> +#include <linux/pci-doe.h>
>  #include <linux/io.h>
>  #include "cxlmem.h"
>  #include "pci.h"
> @@ -471,6 +472,120 @@ static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
>  	return rc;
>  }
>  
> +static void cxl_mem_free_irq_vectors(void *data)
> +{
> +	pci_free_irq_vectors(data);
> +}
> +
> +static void cxl_destroy_doe_device(void *ad)
> +{
> +	struct auxiliary_device *adev = ad;
Local variable doesn't add anything, just pass it directly
into the functions as a void *.

> +
> +	auxiliary_device_delete(adev);
> +	auxiliary_device_uninit(adev);

Both needed?  These are just wrappers around
put_device() and device_del()

Normally after device_add() suceeded we only ever call device_del()
as per the docs for device_add()
https://elixir.bootlin.com/linux/latest/source/drivers/base/core.c#L3277

> +}
> +
> +static DEFINE_IDA(cxl_doe_adev_ida);
> +static void __doe_dev_release(struct auxiliary_device *adev)
> +{
> +	struct pci_doe_dev *doe_dev = container_of(adev, struct pci_doe_dev,
> +						   adev);
> +
> +	ida_free(&cxl_doe_adev_ida, adev->id);
> +	kfree(doe_dev);
> +}
> +
> +static void cxl_doe_dev_release(struct device *dev)
> +{
> +	struct auxiliary_device *adev = container_of(dev,
> +						struct auxiliary_device,
> +						dev);
> +	__doe_dev_release(adev);
> +}
> +
> +static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)

Pass in the struct device, or maybe even the struct pci_dev as
nothing in here is using the cxl_dev_state.

> +{
> +	struct device *dev = cxlds->dev;
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	int irqs, rc;
> +	u16 pos = 0;
> +
> +	/*
> +	 * An implementation of a cxl type3 device may support an unknown
> +	 * number of interrupts. Assume that number is not that large and
> +	 * request them all.
> +	 */
> +	irqs = pci_msix_vec_count(pdev);
> +	rc = pci_alloc_irq_vectors(pdev, irqs, irqs, PCI_IRQ_MSIX);
> +	if (rc != irqs) {
> +		/* No interrupt available - carry on */
> +		dev_dbg(dev, "No interrupts available for DOE\n");
> +	} else {
> +		/*
> +		 * Enabling bus mastering could be done within the DOE
> +		 * initialization, but as it potentially has other impacts
> +		 * keep it within the driver.
> +		 */
> +		pci_set_master(pdev);
> +		rc = devm_add_action_or_reset(dev,
> +					      cxl_mem_free_irq_vectors,
> +					      pdev);
> +		if (rc)
> +			return rc;
> +	}
> +

Above here is driver specific...
Everything from here is is generic so perhaps move it to the PCI core?
Alternatively wait until we have users that aren't CXL.

> +	pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
> +
> +	while (pos > 0) {
> +		struct auxiliary_device *adev;
> +		struct pci_doe_dev *new_dev;
> +		int id;
> +
> +		new_dev = kzalloc(sizeof(*new_dev), GFP_KERNEL);
> +		if (!new_dev)
> +			return -ENOMEM;
> +
> +		new_dev->pdev = pdev;
> +		new_dev->cap_offset = pos;
> +
> +		/* Set up struct auxiliary_device */
> +		adev = &new_dev->adev;
> +		id = ida_alloc(&cxl_doe_adev_ida, GFP_KERNEL);
> +		if (id < 0) {
> +			kfree(new_dev);
> +			return -ENOMEM;
> +		}
> +
> +		adev->id = id;
> +		adev->name = DOE_DEV_NAME;
> +		adev->dev.release = cxl_doe_dev_release;
> +		adev->dev.parent = dev;
> +
> +		if (auxiliary_device_init(adev)) {
> +			__doe_dev_release(adev);
> +			return -EIO;
> +		}
> +
> +		if (auxiliary_device_add(adev)) {
> +			auxiliary_device_uninit(adev);
> +			return -EIO;
> +		}
> +
> +		rc = devm_add_action_or_reset(dev, cxl_destroy_doe_device, adev);
> +		if (rc)
> +			return rc;
> +
> +		if (device_attach(&adev->dev) != 1)
> +			dev_err(&adev->dev,
> +				"Failed to attach a driver to DOE device %d\n",
> +				adev->id);

I wondered about this and how it would happen.
Given soft dependency only between the drivers it's possible but error or info?
I'd go with dev_info().  It is an error I'd bail out and used deferred probing
to try again when it will succeed.

> +
> +		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
> +	}
> +
> +	return 0;
> +}
> +
>  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  {
>  	struct cxl_register_map map;
> @@ -517,6 +632,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (rc)
>  		return rc;
>  
> +	rc = cxl_setup_doe_devices(cxlds);
> +	if (rc)
> +		return rc;
> +
>  	cxlmd = devm_cxl_add_memdev(cxlds);
>  	if (IS_ERR(cxlmd))
>  		return PTR_ERR(cxlmd);
> @@ -546,3 +665,4 @@ static struct pci_driver cxl_pci_driver = {
>  MODULE_LICENSE("GPL v2");
>  module_pci_driver(cxl_pci_driver);
>  MODULE_IMPORT_NS(CXL);
> +MODULE_SOFTDEP("pre: pci_doe");
Ira Weiny Nov. 11, 2021, 1:31 a.m. UTC | #2
On Mon, Nov 08, 2021 at 01:09:18PM +0000, Jonathan Cameron wrote:
> On Fri, 5 Nov 2021 16:50:54 -0700
> <ira.weiny@intel.com> wrote:
> 
> > From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > 
> > CXL devices have DOE mailboxes.  Create auxiliary devices which can be
> > driven by the generic DOE auxiliary driver.
> 
> I'd like Bjorn's input on the balance here between what is done
> in cxl/pci.c and what should be in the PCI core code somewhere.
> 
> The tricky bit preventing this being done entirely as part of 
> PCI device instantiation is the interrupts.
> 
> > 
> > Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> Mostly new code, so not sure I should really be listed on this
> one but I don't mind either way.
> 
> A few comments inline but overall this ended up nice and clean.
> 
> > 
> > ---
> > Changes from V4:
> > 	Make this an Auxiliary Driver rather than library functions
> > 	Split this out into it's own patch
> > 	Base on the new cxl_dev_state structure
> > 
> > Changes from Ben
> > 	s/CXL_DOE_DEV_NAME/DOE_DEV_NAME/
> > ---
> >  drivers/cxl/Kconfig |   1 +
> >  drivers/cxl/cxl.h   |  13 +++++
> >  drivers/cxl/pci.c   | 120 ++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 134 insertions(+)
> > 
> > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > index 67c91378f2dd..9d53720bea07 100644
> > --- a/drivers/cxl/Kconfig
> > +++ b/drivers/cxl/Kconfig
> > @@ -16,6 +16,7 @@ if CXL_BUS
> >  config CXL_MEM
> >  	tristate "CXL.mem: Memory Devices"
> >  	default CXL_BUS
> > +	select PCI_DOE_DRIVER
> >  	help
> >  	  The CXL.mem protocol allows a device to act as a provider of
> >  	  "System RAM" and/or "Persistent Memory" that is fully coherent
> > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > index 5e2e93451928..f1241a7f2b7b 100644
> > --- a/drivers/cxl/cxl.h
> > +++ b/drivers/cxl/cxl.h
> > @@ -75,6 +75,19 @@ static inline int cxl_hdm_decoder_count(u32 cap_hdr)
> >  #define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
> >  #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
> >  
> > +/*
> > + * Address space properties derived from:
> > + * CXL 2.0 8.2.5.12.7 CXL HDM Decoder 0 Control Register
> > + */
> > +#define CXL_ADDRSPACE_RAM   BIT(0)
> > +#define CXL_ADDRSPACE_PMEM  BIT(1)
> > +#define CXL_ADDRSPACE_TYPE2 BIT(2)
> > +#define CXL_ADDRSPACE_TYPE3 BIT(3)
> > +#define CXL_ADDRSPACE_MASK  GENMASK(3, 0)
> 
> Stray.

Not sure what you mean here???

There were a number of defines which were unused but I left them in.

This came right out of your patch 3.

https://lore.kernel.org/linux-cxl/20210524133938.2815206-4-Jonathan.Cameron@huawei.com/

I can remove these defines if you want?

> 
> > +
> > +#define CXL_DOE_PROTOCOL_COMPLIANCE 0
> > +#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2
> > +
> >  #define CXL_COMPONENT_REGS() \
> >  	void __iomem *hdm_decoder
> >  
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index 8dc91fd3396a..df524b74f1d2 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
> > @@ -6,6 +6,7 @@
> >  #include <linux/mutex.h>
> >  #include <linux/list.h>
> >  #include <linux/pci.h>
> > +#include <linux/pci-doe.h>
> >  #include <linux/io.h>
> >  #include "cxlmem.h"
> >  #include "pci.h"
> > @@ -471,6 +472,120 @@ static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
> >  	return rc;
> >  }
> >  
> > +static void cxl_mem_free_irq_vectors(void *data)
> > +{
> > +	pci_free_irq_vectors(data);
> > +}
> > +
> > +static void cxl_destroy_doe_device(void *ad)
> > +{
> > +	struct auxiliary_device *adev = ad;
> Local variable doesn't add anything, just pass it directly
> into the functions as a void *.

Yea...  Thanks...  :-D

> 
> > +
> > +	auxiliary_device_delete(adev);
> > +	auxiliary_device_uninit(adev);
> 
> Both needed?  These are just wrappers around
> put_device() and device_del()

These are both needed per the Auxiliary Device doc.  :-/

> 
> Normally after device_add() suceeded we only ever call device_del()
> as per the docs for device_add()
> https://elixir.bootlin.com/linux/latest/source/drivers/base/core.c#L3277

I think you are miss reading that comment.  Here auxiliary_device_add() has
succeeded.  Therefore both device_del() and put_device() must be called.  In
the case of auxiliary_device_add() failing we only call
auxiliary_device_uninit() [put_device()].

So I think this is correct.

The other places I spot checked called device_del() _and_ put_device().

> 
> > +}
> > +
> > +static DEFINE_IDA(cxl_doe_adev_ida);
> > +static void __doe_dev_release(struct auxiliary_device *adev)
> > +{
> > +	struct pci_doe_dev *doe_dev = container_of(adev, struct pci_doe_dev,
> > +						   adev);
> > +
> > +	ida_free(&cxl_doe_adev_ida, adev->id);
> > +	kfree(doe_dev);
> > +}
> > +
> > +static void cxl_doe_dev_release(struct device *dev)
> > +{
> > +	struct auxiliary_device *adev = container_of(dev,
> > +						struct auxiliary_device,
> > +						dev);
> > +	__doe_dev_release(adev);
> > +}
> > +
> > +static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)
> 
> Pass in the struct device, or maybe even the struct pci_dev as
> nothing in here is using the cxl_dev_state.

Ah yea can I leave this per the next patch?  Or I can change it then change it
to cxlds in the next patch.  But I would rather leave it.

> 
> > +{
> > +	struct device *dev = cxlds->dev;
> > +	struct pci_dev *pdev = to_pci_dev(dev);
> > +	int irqs, rc;
> > +	u16 pos = 0;
> > +
> > +	/*
> > +	 * An implementation of a cxl type3 device may support an unknown
> > +	 * number of interrupts. Assume that number is not that large and
> > +	 * request them all.
> > +	 */
> > +	irqs = pci_msix_vec_count(pdev);
> > +	rc = pci_alloc_irq_vectors(pdev, irqs, irqs, PCI_IRQ_MSIX);
> > +	if (rc != irqs) {
> > +		/* No interrupt available - carry on */
> > +		dev_dbg(dev, "No interrupts available for DOE\n");
> > +	} else {
> > +		/*
> > +		 * Enabling bus mastering could be done within the DOE
> > +		 * initialization, but as it potentially has other impacts
> > +		 * keep it within the driver.
> > +		 */
> > +		pci_set_master(pdev);
> > +		rc = devm_add_action_or_reset(dev,
> > +					      cxl_mem_free_irq_vectors,
> > +					      pdev);
> > +		if (rc)
> > +			return rc;
> > +	}
> > +
> 
> Above here is driver specific...
> Everything from here is is generic so perhaps move it to the PCI core?
> Alternatively wait until we have users that aren't CXL.

I'm still looking for where in the PCI core this would be appropriate to
place...

> 
> > +	pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
> > +
> > +	while (pos > 0) {
> > +		struct auxiliary_device *adev;
> > +		struct pci_doe_dev *new_dev;
> > +		int id;
> > +
> > +		new_dev = kzalloc(sizeof(*new_dev), GFP_KERNEL);
> > +		if (!new_dev)
> > +			return -ENOMEM;
> > +
> > +		new_dev->pdev = pdev;
> > +		new_dev->cap_offset = pos;
> > +
> > +		/* Set up struct auxiliary_device */
> > +		adev = &new_dev->adev;
> > +		id = ida_alloc(&cxl_doe_adev_ida, GFP_KERNEL);
> > +		if (id < 0) {
> > +			kfree(new_dev);
> > +			return -ENOMEM;
> > +		}
> > +
> > +		adev->id = id;
> > +		adev->name = DOE_DEV_NAME;
> > +		adev->dev.release = cxl_doe_dev_release;
> > +		adev->dev.parent = dev;
> > +
> > +		if (auxiliary_device_init(adev)) {
> > +			__doe_dev_release(adev);
> > +			return -EIO;
> > +		}
> > +
> > +		if (auxiliary_device_add(adev)) {
> > +			auxiliary_device_uninit(adev);
> > +			return -EIO;
> > +		}
> > +
> > +		rc = devm_add_action_or_reset(dev, cxl_destroy_doe_device, adev);
> > +		if (rc)
> > +			return rc;
> > +
> > +		if (device_attach(&adev->dev) != 1)
> > +			dev_err(&adev->dev,
> > +				"Failed to attach a driver to DOE device %d\n",
> > +				adev->id);
> 
> I wondered about this and how it would happen.
> Given soft dependency only between the drivers it's possible but error or info?
> I'd go with dev_info().  It is an error I'd bail out and used deferred probing
> to try again when it will succeed.

I made this dev_err() on purpose.  And I don't know about the deferred probing.
Maybe deferred probing on the CDAT read but even that I think is going to be a
pain.

The sequence I can think of is:

cxl_pci loaded
	[finds all devices]
	[soft loads pci_doe]
	[device_attach works]
Admin unloads pci_doe
	[hot-plug new device]
	[device_attach fails]
	[cdat will fail until driver is loaded]

I spoke with Dan about this and while this is unfortunate it is what the user
asked for.  So I prefer dev_err() above to make sure that there is an
indication of why this device is potentially not going to work.

Thanks for the review,
Ira

> 
> > +
> > +		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  {
> >  	struct cxl_register_map map;
> > @@ -517,6 +632,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  	if (rc)
> >  		return rc;
> >  
> > +	rc = cxl_setup_doe_devices(cxlds);
> > +	if (rc)
> > +		return rc;
> > +
> >  	cxlmd = devm_cxl_add_memdev(cxlds);
> >  	if (IS_ERR(cxlmd))
> >  		return PTR_ERR(cxlmd);
> > @@ -546,3 +665,4 @@ static struct pci_driver cxl_pci_driver = {
> >  MODULE_LICENSE("GPL v2");
> >  module_pci_driver(cxl_pci_driver);
> >  MODULE_IMPORT_NS(CXL);
> > +MODULE_SOFTDEP("pre: pci_doe");
>
Jonathan Cameron Nov. 11, 2021, 11:53 a.m. UTC | #3
On Wed, 10 Nov 2021 17:31:23 -0800
Ira Weiny <ira.weiny@intel.com> wrote:

> On Mon, Nov 08, 2021 at 01:09:18PM +0000, Jonathan Cameron wrote:
> > On Fri, 5 Nov 2021 16:50:54 -0700
> > <ira.weiny@intel.com> wrote:
> >   
> > > From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > 
> > > CXL devices have DOE mailboxes.  Create auxiliary devices which can be
> > > driven by the generic DOE auxiliary driver.  
> > 
> > I'd like Bjorn's input on the balance here between what is done
> > in cxl/pci.c and what should be in the PCI core code somewhere.
> > 
> > The tricky bit preventing this being done entirely as part of 
> > PCI device instantiation is the interrupts.
> >   
> > > 
> > > Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> > > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> > > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>  
> > 
> > Mostly new code, so not sure I should really be listed on this
> > one but I don't mind either way.
> > 
> > A few comments inline but overall this ended up nice and clean.
> >   
> > > 
> > > ---
> > > Changes from V4:
> > > 	Make this an Auxiliary Driver rather than library functions
> > > 	Split this out into it's own patch
> > > 	Base on the new cxl_dev_state structure
> > > 
> > > Changes from Ben
> > > 	s/CXL_DOE_DEV_NAME/DOE_DEV_NAME/
> > > ---
> > >  drivers/cxl/Kconfig |   1 +
> > >  drivers/cxl/cxl.h   |  13 +++++
> > >  drivers/cxl/pci.c   | 120 ++++++++++++++++++++++++++++++++++++++++++++
> > >  3 files changed, 134 insertions(+)
> > > 
> > > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > > index 67c91378f2dd..9d53720bea07 100644
> > > --- a/drivers/cxl/Kconfig
> > > +++ b/drivers/cxl/Kconfig
> > > @@ -16,6 +16,7 @@ if CXL_BUS
> > >  config CXL_MEM
> > >  	tristate "CXL.mem: Memory Devices"
> > >  	default CXL_BUS
> > > +	select PCI_DOE_DRIVER
> > >  	help
> > >  	  The CXL.mem protocol allows a device to act as a provider of
> > >  	  "System RAM" and/or "Persistent Memory" that is fully coherent
> > > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > > index 5e2e93451928..f1241a7f2b7b 100644
> > > --- a/drivers/cxl/cxl.h
> > > +++ b/drivers/cxl/cxl.h
> > > @@ -75,6 +75,19 @@ static inline int cxl_hdm_decoder_count(u32 cap_hdr)
> > >  #define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
> > >  #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
> > >  
> > > +/*
> > > + * Address space properties derived from:
> > > + * CXL 2.0 8.2.5.12.7 CXL HDM Decoder 0 Control Register
> > > + */
> > > +#define CXL_ADDRSPACE_RAM   BIT(0)
> > > +#define CXL_ADDRSPACE_PMEM  BIT(1)
> > > +#define CXL_ADDRSPACE_TYPE2 BIT(2)
> > > +#define CXL_ADDRSPACE_TYPE3 BIT(3)
> > > +#define CXL_ADDRSPACE_MASK  GENMASK(3, 0)  
> > 
> > Stray.  
> 
> Not sure what you mean here???
> 
> There were a number of defines which were unused but I left them in.
> 
> This came right out of your patch 3.
> 
> https://lore.kernel.org/linux-cxl/20210524133938.2815206-4-Jonathan.Cameron@huawei.com/
> 
> I can remove these defines if you want?

They don't have anything to do with DOE that I can see. Probably a side effect
of a merge that went wrong and I didn't notice!




> > 
> > Normally after device_add() suceeded we only ever call device_del()
> > as per the docs for device_add()
> > https://elixir.bootlin.com/linux/latest/source/drivers/base/core.c#L3277  
> 
> I think you are miss reading that comment.  Here auxiliary_device_add() has
> succeeded.  Therefore both device_del() and put_device() must be called.  In
> the case of auxiliary_device_add() failing we only call
> auxiliary_device_uninit() [put_device()].
> 
> So I think this is correct.
> 
> The other places I spot checked called device_del() _and_ put_device().

Yeah. I had that wrong. Ref counts will be wrong otherwise.


> 
> >   
> > > +}
> > > +
> > > +static DEFINE_IDA(cxl_doe_adev_ida);
> > > +static void __doe_dev_release(struct auxiliary_device *adev)
> > > +{
> > > +	struct pci_doe_dev *doe_dev = container_of(adev, struct pci_doe_dev,
> > > +						   adev);
> > > +
> > > +	ida_free(&cxl_doe_adev_ida, adev->id);
> > > +	kfree(doe_dev);
> > > +}
> > > +
> > > +static void cxl_doe_dev_release(struct device *dev)
> > > +{
> > > +	struct auxiliary_device *adev = container_of(dev,
> > > +						struct auxiliary_device,
> > > +						dev);
> > > +	__doe_dev_release(adev);
> > > +}
> > > +
> > > +static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)  
> > 
> > Pass in the struct device, or maybe even the struct pci_dev as
> > nothing in here is using the cxl_dev_state.  
> 
> Ah yea can I leave this per the next patch?  Or I can change it then change it
> to cxlds in the next patch.  But I would rather leave it.

I think we will end up reworking this anyway, but maybe there will
still be a cxl_setup_doe_devices wrapper involved.

> 
> >   
> > > +{
> > > +	struct device *dev = cxlds->dev;
> > > +	struct pci_dev *pdev = to_pci_dev(dev);
> > > +	int irqs, rc;
> > > +	u16 pos = 0;
> > > +
> > > +	/*
> > > +	 * An implementation of a cxl type3 device may support an unknown
> > > +	 * number of interrupts. Assume that number is not that large and
> > > +	 * request them all.
> > > +	 */
> > > +	irqs = pci_msix_vec_count(pdev);
> > > +	rc = pci_alloc_irq_vectors(pdev, irqs, irqs, PCI_IRQ_MSIX);
> > > +	if (rc != irqs) {
> > > +		/* No interrupt available - carry on */
> > > +		dev_dbg(dev, "No interrupts available for DOE\n");
> > > +	} else {
> > > +		/*
> > > +		 * Enabling bus mastering could be done within the DOE
> > > +		 * initialization, but as it potentially has other impacts
> > > +		 * keep it within the driver.
> > > +		 */
> > > +		pci_set_master(pdev);
> > > +		rc = devm_add_action_or_reset(dev,
> > > +					      cxl_mem_free_irq_vectors,
> > > +					      pdev);
> > > +		if (rc)
> > > +			return rc;
> > > +	}
> > > +  
> > 
> > Above here is driver specific...
> > Everything from here is is generic so perhaps move it to the PCI core?
> > Alternatively wait until we have users that aren't CXL.  
> 
> I'm still looking for where in the PCI core this would be appropriate to
> place...

Yeah, this needs Bjorn's input. One option would be to move from a soft
dependency to a hard one on the pci-doe module and just put this in there
as an exported utility function.

> 
> >   
> > > +	pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
> > > +
> > > +	while (pos > 0) {
> > > +		struct auxiliary_device *adev;
> > > +		struct pci_doe_dev *new_dev;
> > > +		int id;
> > > +
> > > +		new_dev = kzalloc(sizeof(*new_dev), GFP_KERNEL);
> > > +		if (!new_dev)
> > > +			return -ENOMEM;
> > > +
> > > +		new_dev->pdev = pdev;
> > > +		new_dev->cap_offset = pos;
> > > +
> > > +		/* Set up struct auxiliary_device */
> > > +		adev = &new_dev->adev;
> > > +		id = ida_alloc(&cxl_doe_adev_ida, GFP_KERNEL);
> > > +		if (id < 0) {
> > > +			kfree(new_dev);
> > > +			return -ENOMEM;
> > > +		}
> > > +
> > > +		adev->id = id;
> > > +		adev->name = DOE_DEV_NAME;
> > > +		adev->dev.release = cxl_doe_dev_release;
> > > +		adev->dev.parent = dev;
> > > +
> > > +		if (auxiliary_device_init(adev)) {
> > > +			__doe_dev_release(adev);
> > > +			return -EIO;
> > > +		}
> > > +
> > > +		if (auxiliary_device_add(adev)) {
> > > +			auxiliary_device_uninit(adev);
> > > +			return -EIO;
> > > +		}
> > > +
> > > +		rc = devm_add_action_or_reset(dev, cxl_destroy_doe_device, adev);
> > > +		if (rc)
> > > +			return rc;
> > > +
> > > +		if (device_attach(&adev->dev) != 1)
> > > +			dev_err(&adev->dev,
> > > +				"Failed to attach a driver to DOE device %d\n",
> > > +				adev->id);  
> > 
> > I wondered about this and how it would happen.
> > Given soft dependency only between the drivers it's possible but error or info?
> > I'd go with dev_info().  It is an error I'd bail out and used deferred probing
> > to try again when it will succeed.  
> 
> I made this dev_err() on purpose.  And I don't know about the deferred probing.
> Maybe deferred probing on the CDAT read but even that I think is going to be a
> pain.
> 
> The sequence I can think of is:
> 
> cxl_pci loaded
> 	[finds all devices]
> 	[soft loads pci_doe]
> 	[device_attach works]
> Admin unloads pci_doe
> 	[hot-plug new device]
> 	[device_attach fails]
> 	[cdat will fail until driver is loaded]
> 
> I spoke with Dan about this and while this is unfortunate it is what the user
> asked for.  So I prefer dev_err() above to make sure that there is an
> indication of why this device is potentially not going to work.
I'm fine with it being dev_err(), but make it a hard error if it happens.
I don't like potentially not working, and would rather see definitely not
working in this case - so have the function return an error code.

Jonathan
> 
> Thanks for the review,
> Ira
>
Bjorn Helgaas Nov. 16, 2021, 11:48 p.m. UTC | #4
On Fri, Nov 05, 2021 at 04:50:54PM -0700, ira.weiny@intel.com wrote:
> From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> CXL devices have DOE mailboxes.  Create auxiliary devices which can be
> driven by the generic DOE auxiliary driver.

I admit to not being thrilled about joining the elite group of six
users of auxiliary_device_init(), and I don't know exactly what
benefits the auxiliary devices have.

Based on the ECN, it sounds like any PCI device can have DOE
capabilities, so I suspect the support for it should be in
drivers/pci/, not drivers/cxl/.  I don't really see anything
CXL-specific below.

What do these DOE capabilities look like in lspci?  I don't see any
support in the current version (which looks like it's a year old).

> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> ---
> Changes from V4:
> 	Make this an Auxiliary Driver rather than library functions
> 	Split this out into it's own patch
> 	Base on the new cxl_dev_state structure
> 
> Changes from Ben
> 	s/CXL_DOE_DEV_NAME/DOE_DEV_NAME/
> ---
>  drivers/cxl/Kconfig |   1 +
>  drivers/cxl/cxl.h   |  13 +++++
>  drivers/cxl/pci.c   | 120 ++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 134 insertions(+)
> 
> diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> index 67c91378f2dd..9d53720bea07 100644
> --- a/drivers/cxl/Kconfig
> +++ b/drivers/cxl/Kconfig
> @@ -16,6 +16,7 @@ if CXL_BUS
>  config CXL_MEM
>  	tristate "CXL.mem: Memory Devices"
>  	default CXL_BUS
> +	select PCI_DOE_DRIVER
>  	help
>  	  The CXL.mem protocol allows a device to act as a provider of
>  	  "System RAM" and/or "Persistent Memory" that is fully coherent
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 5e2e93451928..f1241a7f2b7b 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -75,6 +75,19 @@ static inline int cxl_hdm_decoder_count(u32 cap_hdr)
>  #define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
>  #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
>  
> +/*
> + * Address space properties derived from:
> + * CXL 2.0 8.2.5.12.7 CXL HDM Decoder 0 Control Register
> + */
> +#define CXL_ADDRSPACE_RAM   BIT(0)
> +#define CXL_ADDRSPACE_PMEM  BIT(1)
> +#define CXL_ADDRSPACE_TYPE2 BIT(2)
> +#define CXL_ADDRSPACE_TYPE3 BIT(3)
> +#define CXL_ADDRSPACE_MASK  GENMASK(3, 0)
> +
> +#define CXL_DOE_PROTOCOL_COMPLIANCE 0
> +#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2

None of these are used here, so they belong in a different patch.

>  #define CXL_COMPONENT_REGS() \
>  	void __iomem *hdm_decoder
>  
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 8dc91fd3396a..df524b74f1d2 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -6,6 +6,7 @@
>  #include <linux/mutex.h>
>  #include <linux/list.h>
>  #include <linux/pci.h>
> +#include <linux/pci-doe.h>
>  #include <linux/io.h>
>  #include "cxlmem.h"
>  #include "pci.h"
> @@ -471,6 +472,120 @@ static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
>  	return rc;
>  }
>  
> +static void cxl_mem_free_irq_vectors(void *data)
> +{
> +	pci_free_irq_vectors(data);
> +}
> +
> +static void cxl_destroy_doe_device(void *ad)
> +{
> +	struct auxiliary_device *adev = ad;
> +
> +	auxiliary_device_delete(adev);
> +	auxiliary_device_uninit(adev);
> +}
> +
> +static DEFINE_IDA(cxl_doe_adev_ida);
> +static void __doe_dev_release(struct auxiliary_device *adev)

Why the "__" prefix?  I don't see any similar name that requires
disambiguation.

> +{
> +	struct pci_doe_dev *doe_dev = container_of(adev, struct pci_doe_dev,
> +						   adev);
> +
> +	ida_free(&cxl_doe_adev_ida, adev->id);
> +	kfree(doe_dev);
> +}
> +
> +static void cxl_doe_dev_release(struct device *dev)
> +{
> +	struct auxiliary_device *adev = container_of(dev,
> +						struct auxiliary_device,
> +						dev);
> +	__doe_dev_release(adev);
> +}
> +
> +static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)
> +{
> +	struct device *dev = cxlds->dev;
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	int irqs, rc;
> +	u16 pos = 0;
> +
> +	/*
> +	 * An implementation of a cxl type3 device may support an unknown
> +	 * number of interrupts. Assume that number is not that large and
> +	 * request them all.
> +	 */
> +	irqs = pci_msix_vec_count(pdev);
> +	rc = pci_alloc_irq_vectors(pdev, irqs, irqs, PCI_IRQ_MSIX);
> +	if (rc != irqs) {
> +		/* No interrupt available - carry on */
> +		dev_dbg(dev, "No interrupts available for DOE\n");
> +	} else {
> +		/*
> +		 * Enabling bus mastering could be done within the DOE
> +		 * initialization, but as it potentially has other impacts
> +		 * keep it within the driver.
> +		 */
> +		pci_set_master(pdev);

This enables the device to perform DMA, which doesn't seem to have
anything to do with the rest of this code.  Can it go somewhere near
something to do with DMA?

> +		rc = devm_add_action_or_reset(dev,
> +					      cxl_mem_free_irq_vectors,
> +					      pdev);
> +		if (rc)
> +			return rc;
> +	}
> +
> +	pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
> +
> +	while (pos > 0) {
> +		struct auxiliary_device *adev;
> +		struct pci_doe_dev *new_dev;
> +		int id;
> +
> +		new_dev = kzalloc(sizeof(*new_dev), GFP_KERNEL);
> +		if (!new_dev)
> +			return -ENOMEM;
> +
> +		new_dev->pdev = pdev;
> +		new_dev->cap_offset = pos;
> +
> +		/* Set up struct auxiliary_device */
> +		adev = &new_dev->adev;
> +		id = ida_alloc(&cxl_doe_adev_ida, GFP_KERNEL);
> +		if (id < 0) {
> +			kfree(new_dev);
> +			return -ENOMEM;
> +		}
> +
> +		adev->id = id;
> +		adev->name = DOE_DEV_NAME;
> +		adev->dev.release = cxl_doe_dev_release;
> +		adev->dev.parent = dev;
> +
> +		if (auxiliary_device_init(adev)) {
> +			__doe_dev_release(adev);
> +			return -EIO;
> +		}
> +
> +		if (auxiliary_device_add(adev)) {
> +			auxiliary_device_uninit(adev);
> +			return -EIO;
> +		}
> +
> +		rc = devm_add_action_or_reset(dev, cxl_destroy_doe_device, adev);
> +		if (rc)
> +			return rc;
> +
> +		if (device_attach(&adev->dev) != 1)
> +			dev_err(&adev->dev,
> +				"Failed to attach a driver to DOE device %d\n",
> +				adev->id);
> +
> +		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);

So we get an auxiliary device for every instance of a DOE capability?
I think the commit log should mention something about how many are
created (e.g., "one per DOE capability"), how they are named, whether
they appear in sysfs, how drivers bind to them, etc.

I assume there needs to be some coordination between possible multiple
users of a DOE capability?  How does that work?

> +	}
> +
> +	return 0;
> +}
> +
>  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  {
>  	struct cxl_register_map map;
> @@ -517,6 +632,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (rc)
>  		return rc;
>  
> +	rc = cxl_setup_doe_devices(cxlds);
> +	if (rc)
> +		return rc;
> +
>  	cxlmd = devm_cxl_add_memdev(cxlds);
>  	if (IS_ERR(cxlmd))
>  		return PTR_ERR(cxlmd);
> @@ -546,3 +665,4 @@ static struct pci_driver cxl_pci_driver = {
>  MODULE_LICENSE("GPL v2");
>  module_pci_driver(cxl_pci_driver);
>  MODULE_IMPORT_NS(CXL);
> +MODULE_SOFTDEP("pre: pci_doe");
> -- 
> 2.28.0.rc0.12.gb6a658bd00c9
>
Jonathan Cameron Nov. 17, 2021, 12:23 p.m. UTC | #5
On Tue, 16 Nov 2021 17:48:29 -0600
Bjorn Helgaas <helgaas@kernel.org> wrote:

> On Fri, Nov 05, 2021 at 04:50:54PM -0700, ira.weiny@intel.com wrote:
> > From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > 
> > CXL devices have DOE mailboxes.  Create auxiliary devices which can be
> > driven by the generic DOE auxiliary driver.  
> 
> I admit to not being thrilled about joining the elite group of six
> users of auxiliary_device_init(), and I don't know exactly what
> benefits the auxiliary devices have.

One for Dan...

> 
> Based on the ECN, it sounds like any PCI device can have DOE
> capabilities, so I suspect the support for it should be in
> drivers/pci/, not drivers/cxl/.  I don't really see anything
> CXL-specific below.

Agreed though how it all gets tied together isn't totally clear
to me yet. The messy bit is interrupts given I don't think we have
a model for enabling those anywhere other than in individual PCI drivers.

> 
> What do these DOE capabilities look like in lspci?  I don't see any
> support in the current version (which looks like it's a year old).

I don't think anyone has added support yet, but it would be simple to do.
Given possibility of breaking things if we actually exercise the discovery
protocol, we'll be constrained to just reporting there is a DOE instances
which is of limited use.

> 
> > Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > 
> > ---
> > Changes from V4:
> > 	Make this an Auxiliary Driver rather than library functions
> > 	Split this out into it's own patch
> > 	Base on the new cxl_dev_state structure
> > 
> > Changes from Ben
> > 	s/CXL_DOE_DEV_NAME/DOE_DEV_NAME/
> > ---
> >  drivers/cxl/Kconfig |   1 +
> >  drivers/cxl/cxl.h   |  13 +++++
> >  drivers/cxl/pci.c   | 120 ++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 134 insertions(+)
> > 
> > diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
> > index 67c91378f2dd..9d53720bea07 100644
> > --- a/drivers/cxl/Kconfig
> > +++ b/drivers/cxl/Kconfig
> > @@ -16,6 +16,7 @@ if CXL_BUS
> >  config CXL_MEM
> >  	tristate "CXL.mem: Memory Devices"
> >  	default CXL_BUS
> > +	select PCI_DOE_DRIVER
> >  	help
> >  	  The CXL.mem protocol allows a device to act as a provider of
> >  	  "System RAM" and/or "Persistent Memory" that is fully coherent
> > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > index 5e2e93451928..f1241a7f2b7b 100644
> > --- a/drivers/cxl/cxl.h
> > +++ b/drivers/cxl/cxl.h
> > @@ -75,6 +75,19 @@ static inline int cxl_hdm_decoder_count(u32 cap_hdr)
> >  #define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
> >  #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
> >  
> > +/*
> > + * Address space properties derived from:
> > + * CXL 2.0 8.2.5.12.7 CXL HDM Decoder 0 Control Register
> > + */
> > +#define CXL_ADDRSPACE_RAM   BIT(0)
> > +#define CXL_ADDRSPACE_PMEM  BIT(1)
> > +#define CXL_ADDRSPACE_TYPE2 BIT(2)
> > +#define CXL_ADDRSPACE_TYPE3 BIT(3)
> > +#define CXL_ADDRSPACE_MASK  GENMASK(3, 0)
> > +
> > +#define CXL_DOE_PROTOCOL_COMPLIANCE 0
> > +#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2  
> 
> None of these are used here, so they belong in a different patch.s
> 
> >  #define CXL_COMPONENT_REGS() \
> >  	void __iomem *hdm_decoder
> >  
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index 8dc91fd3396a..df524b74f1d2 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
> > @@ -6,6 +6,7 @@
> >  #include <linux/mutex.h>
> >  #include <linux/list.h>
> >  #include <linux/pci.h>
> > +#include <linux/pci-doe.h>
> >  #include <linux/io.h>
> >  #include "cxlmem.h"
> >  #include "pci.h"
> > @@ -471,6 +472,120 @@ static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
> >  	return rc;
> >  }
> >  
> > +static void cxl_mem_free_irq_vectors(void *data)
> > +{
> > +	pci_free_irq_vectors(data);
> > +}
> > +
> > +static void cxl_destroy_doe_device(void *ad)
> > +{
> > +	struct auxiliary_device *adev = ad;
> > +
> > +	auxiliary_device_delete(adev);
> > +	auxiliary_device_uninit(adev);
> > +}
> > +
> > +static DEFINE_IDA(cxl_doe_adev_ida);
> > +static void __doe_dev_release(struct auxiliary_device *adev)  
> 
> Why the "__" prefix?  I don't see any similar name that requires
> disambiguation.
> 
> > +{
> > +	struct pci_doe_dev *doe_dev = container_of(adev, struct pci_doe_dev,
> > +						   adev);
> > +
> > +	ida_free(&cxl_doe_adev_ida, adev->id);
> > +	kfree(doe_dev);
> > +}
> > +
> > +static void cxl_doe_dev_release(struct device *dev)
> > +{
> > +	struct auxiliary_device *adev = container_of(dev,
> > +						struct auxiliary_device,
> > +						dev);
> > +	__doe_dev_release(adev);
> > +}
> > +
> > +static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)
> > +{
> > +	struct device *dev = cxlds->dev;
> > +	struct pci_dev *pdev = to_pci_dev(dev);
> > +	int irqs, rc;
> > +	u16 pos = 0;
> > +
> > +	/*
> > +	 * An implementation of a cxl type3 device may support an unknown
> > +	 * number of interrupts. Assume that number is not that large and
> > +	 * request them all.
> > +	 */
> > +	irqs = pci_msix_vec_count(pdev);
> > +	rc = pci_alloc_irq_vectors(pdev, irqs, irqs, PCI_IRQ_MSIX);
> > +	if (rc != irqs) {
> > +		/* No interrupt available - carry on */
> > +		dev_dbg(dev, "No interrupts available for DOE\n");
> > +	} else {
> > +		/*
> > +		 * Enabling bus mastering could be done within the DOE
> > +		 * initialization, but as it potentially has other impacts
> > +		 * keep it within the driver.
> > +		 */
> > +		pci_set_master(pdev);  
> 
> This enables the device to perform DMA, which doesn't seem to have
> anything to do with the rest of this code.  Can it go somewhere near
> something to do with DMA?

Needed for MSI/MSIx as well.  The driver doesn't do DMA for anything else.
Hence it's here in the interrupt enable path.

> 
> > +		rc = devm_add_action_or_reset(dev,
> > +					      cxl_mem_free_irq_vectors,
> > +					      pdev);
> > +		if (rc)
> > +			return rc;
> > +	}
> > +
> > +	pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
> > +
> > +	while (pos > 0) {
> > +		struct auxiliary_device *adev;
> > +		struct pci_doe_dev *new_dev;
> > +		int id;
> > +
> > +		new_dev = kzalloc(sizeof(*new_dev), GFP_KERNEL);
> > +		if (!new_dev)
> > +			return -ENOMEM;
> > +
> > +		new_dev->pdev = pdev;
> > +		new_dev->cap_offset = pos;
> > +
> > +		/* Set up struct auxiliary_device */
> > +		adev = &new_dev->adev;
> > +		id = ida_alloc(&cxl_doe_adev_ida, GFP_KERNEL);
> > +		if (id < 0) {
> > +			kfree(new_dev);
> > +			return -ENOMEM;
> > +		}
> > +
> > +		adev->id = id;
> > +		adev->name = DOE_DEV_NAME;
> > +		adev->dev.release = cxl_doe_dev_release;
> > +		adev->dev.parent = dev;
> > +
> > +		if (auxiliary_device_init(adev)) {
> > +			__doe_dev_release(adev);
> > +			return -EIO;
> > +		}
> > +
> > +		if (auxiliary_device_add(adev)) {
> > +			auxiliary_device_uninit(adev);
> > +			return -EIO;
> > +		}
> > +
> > +		rc = devm_add_action_or_reset(dev, cxl_destroy_doe_device, adev);
> > +		if (rc)
> > +			return rc;
> > +
> > +		if (device_attach(&adev->dev) != 1)
> > +			dev_err(&adev->dev,
> > +				"Failed to attach a driver to DOE device %d\n",
> > +				adev->id);
> > +
> > +		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);  
> 
> So we get an auxiliary device for every instance of a DOE capability?
> I think the commit log should mention something about how many are
> created (e.g., "one per DOE capability"), how they are named, whether
> they appear in sysfs, how drivers bind to them, etc.
> 
> I assume there needs to be some coordination between possible multiple
> users of a DOE capability?  How does that work?

The DOE handling implementation makes everything synchronous - so if multiple
users each may have to wait on queueing their query / responses exchanges.

The fun of non OS software accessing these is still an open question.

Jonathan

> 
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  {
> >  	struct cxl_register_map map;
> > @@ -517,6 +632,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  	if (rc)
> >  		return rc;
> >  
> > +	rc = cxl_setup_doe_devices(cxlds);
> > +	if (rc)
> > +		return rc;
> > +
> >  	cxlmd = devm_cxl_add_memdev(cxlds);
> >  	if (IS_ERR(cxlmd))
> >  		return PTR_ERR(cxlmd);
> > @@ -546,3 +665,4 @@ static struct pci_driver cxl_pci_driver = {
> >  MODULE_LICENSE("GPL v2");
> >  module_pci_driver(cxl_pci_driver);
> >  MODULE_IMPORT_NS(CXL);
> > +MODULE_SOFTDEP("pre: pci_doe");
> > -- 
> > 2.28.0.rc0.12.gb6a658bd00c9
> >
Bjorn Helgaas Nov. 17, 2021, 10:15 p.m. UTC | #6
[+cc Christoph, Thomas for INTx/MSI/bus mastering question below]

On Wed, Nov 17, 2021 at 12:23:35PM +0000, Jonathan Cameron wrote:
> On Tue, 16 Nov 2021 17:48:29 -0600
> Bjorn Helgaas <helgaas@kernel.org> wrote:
> > On Fri, Nov 05, 2021 at 04:50:54PM -0700, ira.weiny@intel.com wrote:
> > > From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > 
> > > CXL devices have DOE mailboxes.  Create auxiliary devices which can be
> > > driven by the generic DOE auxiliary driver.  

> > Based on the ECN, it sounds like any PCI device can have DOE
> > capabilities, so I suspect the support for it should be in
> > drivers/pci/, not drivers/cxl/.  I don't really see anything
> > CXL-specific below.
> 
> Agreed though how it all gets tied together isn't totally clear
> to me yet. The messy bit is interrupts given I don't think we have
> a model for enabling those anywhere other than in individual PCI drivers.

Ah.  Yeah, that is a little messy.  The only real precedent where the
PCI core and a driver might need to coordinate on interrupts is the
portdrv.  So far we've pretended that bridges do not have
device-specific functionality that might require interrupts.  I don't
think that's actually true, but we haven't integrated drivers for the
tuning, performance monitoring, and similar features that bridges may
have.  Yet.

In any case, I think the argument that DOE capabilities are not
CXL-specific still holds.

> > What do these DOE capabilities look like in lspci?  I don't see any
> > support in the current version (which looks like it's a year old).
> 
> I don't think anyone has added support yet, but it would be simple to do.
> Given possibility of breaking things if we actually exercise the discovery
> protocol, we'll be constrained to just reporting there is a DOE instances
> which is of limited use.

I think it's essential that lspci at least show the existence of DOE
capabilities and the safe-to-read registers (Capabilities, Control,
Status).

There's a very long lead time between adding the support and getting
updated versions of lspci into distros.

> > > +	 * An implementation of a cxl type3 device may support an unknown
> > > +	 * number of interrupts. Assume that number is not that large and
> > > +	 * request them all.
> > > +	 */
> > > +	irqs = pci_msix_vec_count(pdev);
> > > +	rc = pci_alloc_irq_vectors(pdev, irqs, irqs, PCI_IRQ_MSIX);
> > > +	if (rc != irqs) {
> > > +		/* No interrupt available - carry on */
> > > +		dev_dbg(dev, "No interrupts available for DOE\n");
> > > +	} else {
> > > +		/*
> > > +		 * Enabling bus mastering could be done within the DOE
> > > +		 * initialization, but as it potentially has other impacts
> > > +		 * keep it within the driver.
> > > +		 */
> > > +		pci_set_master(pdev);  
> > 
> > This enables the device to perform DMA, which doesn't seem to have
> > anything to do with the rest of this code.  Can it go somewhere
> > near something to do with DMA?
> 
> Needed for MSI/MSIx as well.  The driver doesn't do DMA for anything
> else.  Hence it's here in the interrupt enable path.

Oh, right, of course.  A hint here that MSI/MSI-X depends on bus
mastering would save me the trouble.

I wonder if the infrastructure, e.g., something inside
pci_alloc_irq_vectors_affinity() should do this for us.  The
connection is "obvious" but not mentioned in
Documentation/PCI/msi-howto.rst and I'm not sure how callers that
supply PCI_IRQ_ALL_TYPES would know whether they got a single MSI
vector (which requires bus mastering) or an INTx vector (which does
not).

> > So we get an auxiliary device for every instance of a DOE
> > capability?  I think the commit log should mention something about
> > how many are created (e.g., "one per DOE capability"), how they
> > are named, whether they appear in sysfs, how drivers bind to them,
> > etc.
> > 
> > I assume there needs to be some coordination between possible
> > multiple users of a DOE capability?  How does that work?
> 
> The DOE handling implementation makes everything synchronous - so if
> multiple users each may have to wait on queueing their query /
> responses exchanges.
> 
> The fun of non OS software accessing these is still an open
> question.

Sounds like something that potentially could be wrapped up in a safe
but slow interface that could be usable by others, including lspci?

Bjorn
Jonathan Cameron Nov. 18, 2021, 10:51 a.m. UTC | #7
On Wed, 17 Nov 2021 16:15:36 -0600
Bjorn Helgaas <helgaas@kernel.org> wrote:

> [+cc Christoph, Thomas for INTx/MSI/bus mastering question below]
> 
> On Wed, Nov 17, 2021 at 12:23:35PM +0000, Jonathan Cameron wrote:
> > On Tue, 16 Nov 2021 17:48:29 -0600
> > Bjorn Helgaas <helgaas@kernel.org> wrote:  
> > > On Fri, Nov 05, 2021 at 04:50:54PM -0700, ira.weiny@intel.com wrote:  
> > > > From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > 
> > > > CXL devices have DOE mailboxes.  Create auxiliary devices which can be
> > > > driven by the generic DOE auxiliary driver.    
> 
> > > Based on the ECN, it sounds like any PCI device can have DOE
> > > capabilities, so I suspect the support for it should be in
> > > drivers/pci/, not drivers/cxl/.  I don't really see anything
> > > CXL-specific below.  
> > 
> > Agreed though how it all gets tied together isn't totally clear
> > to me yet. The messy bit is interrupts given I don't think we have
> > a model for enabling those anywhere other than in individual PCI drivers.  
> 
> Ah.  Yeah, that is a little messy.  The only real precedent where the
> PCI core and a driver might need to coordinate on interrupts is the
> portdrv.  So far we've pretended that bridges do not have
> device-specific functionality that might require interrupts.  I don't
> think that's actually true, but we haven't integrated drivers for the
> tuning, performance monitoring, and similar features that bridges may
> have.  Yet.

Upstream ports of CXL switches will have DOE / CDAT - though no one has
one yet and we haven't emulated one in QEMU either yet (will do that shortly).
Also CMA (IDE possibly) will be a requirement for switches soon.  Still all that
stuff is at least in various specs, so can probably fit in the existing port-drv
framework.

I dropped work on our RP / portdrv PMU from a few years back because we broke the
hardware out as a separate RCiEP.  Hacking custom support into portdrv wasn't
pretty IIRC. 

> 
> In any case, I think the argument that DOE capabilities are not
> CXL-specific still holds.

Agreed.

> 
> > > What do these DOE capabilities look like in lspci?  I don't see any
> > > support in the current version (which looks like it's a year old).  
> > 
> > I don't think anyone has added support yet, but it would be simple to do.
> > Given possibility of breaking things if we actually exercise the discovery
> > protocol, we'll be constrained to just reporting there is a DOE instances
> > which is of limited use.  
> 
> I think it's essential that lspci at least show the existence of DOE
> capabilities and the safe-to-read registers (Capabilities, Control,
> Status).
> 
> There's a very long lead time between adding the support and getting
> updated versions of lspci into distros.

I'll add lspci support to my todo list.

> 
> > > > +	 * An implementation of a cxl type3 device may support an unknown
> > > > +	 * number of interrupts. Assume that number is not that large and
> > > > +	 * request them all.
> > > > +	 */
> > > > +	irqs = pci_msix_vec_count(pdev);
> > > > +	rc = pci_alloc_irq_vectors(pdev, irqs, irqs, PCI_IRQ_MSIX);
> > > > +	if (rc != irqs) {
> > > > +		/* No interrupt available - carry on */
> > > > +		dev_dbg(dev, "No interrupts available for DOE\n");
> > > > +	} else {
> > > > +		/*
> > > > +		 * Enabling bus mastering could be done within the DOE
> > > > +		 * initialization, but as it potentially has other impacts
> > > > +		 * keep it within the driver.
> > > > +		 */
> > > > +		pci_set_master(pdev);    
> > > 
> > > This enables the device to perform DMA, which doesn't seem to have
> > > anything to do with the rest of this code.  Can it go somewhere
> > > near something to do with DMA?  
> > 
> > Needed for MSI/MSIx as well.  The driver doesn't do DMA for anything
> > else.  Hence it's here in the interrupt enable path.  
> 
> Oh, right, of course.  A hint here that MSI/MSI-X depends on bus
> mastering would save me the trouble.

Ira, please add a comment when you respin.  Thanks!

> 
> I wonder if the infrastructure, e.g., something inside
> pci_alloc_irq_vectors_affinity() should do this for us.  The
> connection is "obvious" but not mentioned in
> Documentation/PCI/msi-howto.rst and I'm not sure how callers that
> supply PCI_IRQ_ALL_TYPES would know whether they got a single MSI
> vector (which requires bus mastering) or an INTx vector (which does
> not).

I wonder if there are devices that drivers that deliberately delay
the pci_set_master() until more stuff is set up.  I'd hope no device
is broken enough that it would matter, but 'maybe'?

In this particular case we don't run into the what type of interrupt
question as the spec requires MSI / MSIX but agreed that could be
a problem more generally.

I wonder how many types of EP actually have interrupts but no DMA though?
There are the memory buffers used for some types of P2P + the bridges you
mention.

> 
> > > So we get an auxiliary device for every instance of a DOE
> > > capability?  I think the commit log should mention something about
> > > how many are created (e.g., "one per DOE capability"), how they
> > > are named, whether they appear in sysfs, how drivers bind to them,
> > > etc.
> > > 
> > > I assume there needs to be some coordination between possible
> > > multiple users of a DOE capability?  How does that work?  
> > 
> > The DOE handling implementation makes everything synchronous - so if
> > multiple users each may have to wait on queueing their query /
> > responses exchanges.
> > 
> > The fun of non OS software accessing these is still an open
> > question.  
> 
> Sounds like something that potentially could be wrapped up in a safe
> but slow interface that could be usable by others, including lspci?

So one version of this patch set had a generic IOCTL interface, but
discussions around that (we also briefly touched on it at the plumbers
microconf) were heading in the direction of protocol specific interfaces.

If we put that back we'd need a more complex means of controlling access,
possibly only allowing the discovery protocols.
The problem is some protocols have strict ordering requirements so exposing
anything close to raw access could for example let userspace break establishment of
secure channels or collapse an existing secure channel.

A simple answer might be to have the DOE driver expose the discovered protocols
via sysfs.  I don't think they will change over time so simply caching them
at first load should be fine.

Jonathan

> 
> Bjorn
Christoph Hellwig Nov. 19, 2021, 6:48 a.m. UTC | #8
On Wed, Nov 17, 2021 at 04:15:36PM -0600, Bjorn Helgaas wrote:
> > Agreed though how it all gets tied together isn't totally clear
> > to me yet. The messy bit is interrupts given I don't think we have
> > a model for enabling those anywhere other than in individual PCI drivers.
> 
> Ah.  Yeah, that is a little messy.  The only real precedent where the
> PCI core and a driver might need to coordinate on interrupts is the
> portdrv.  So far we've pretended that bridges do not have
> device-specific functionality that might require interrupts.  I don't
> think that's actually true, but we haven't integrated drivers for the
> tuning, performance monitoring, and similar features that bridges may
> have.  Yet.

And portdrv really is conceptually part of the core PCI core, and
should eventually be fully integrated..

> In any case, I think the argument that DOE capabilities are not
> CXL-specific still holds.

Agreed.

> Oh, right, of course.  A hint here that MSI/MSI-X depends on bus
> mastering would save me the trouble.
> 
> I wonder if the infrastructure, e.g., something inside
> pci_alloc_irq_vectors_affinity() should do this for us.  The
> connection is "obvious" but not mentioned in
> Documentation/PCI/msi-howto.rst and I'm not sure how callers that
> supply PCI_IRQ_ALL_TYPES would know whether they got a single MSI
> vector (which requires bus mastering) or an INTx vector (which does
> not).

As a minimum step we should document that this.  That being said
I don't tink we can just make the interrupt API call pci_set_master
as there might be strange ordering requirements in the drivers.

> > > So we get an auxiliary device for every instance of a DOE
> > > capability?  I think the commit log should mention something about
> > > how many are created (e.g., "one per DOE capability"), how they
> > > are named, whether they appear in sysfs, how drivers bind to them,
> > > etc.
> > > 
> > > I assume there needs to be some coordination between possible
> > > multiple users of a DOE capability?  How does that work?
> > 
> > The DOE handling implementation makes everything synchronous - so if
> > multiple users each may have to wait on queueing their query /
> > responses exchanges.
> > 
> > The fun of non OS software accessing these is still an open
> > question.
> 
> Sounds like something that potentially could be wrapped up in a safe
> but slow interface that could be usable by others, including lspci?

I guess we have to.  I think this interface is a nightmare.  Why o why
does the PCI SGI keep doing these stupid things (see also VPDs).
Dan Williams Nov. 29, 2021, 11:37 p.m. UTC | #9
On Thu, Nov 18, 2021 at 10:48 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Wed, Nov 17, 2021 at 04:15:36PM -0600, Bjorn Helgaas wrote:
> > > Agreed though how it all gets tied together isn't totally clear
> > > to me yet. The messy bit is interrupts given I don't think we have
> > > a model for enabling those anywhere other than in individual PCI drivers.
> >
> > Ah.  Yeah, that is a little messy.  The only real precedent where the
> > PCI core and a driver might need to coordinate on interrupts is the
> > portdrv.  So far we've pretended that bridges do not have
> > device-specific functionality that might require interrupts.  I don't
> > think that's actually true, but we haven't integrated drivers for the
> > tuning, performance monitoring, and similar features that bridges may
> > have.  Yet.
>
> And portdrv really is conceptually part of the core PCI core, and
> should eventually be fully integrated..

What does a fully integrated portdrv look like? DOE enabling could
follow a similar model.

>
> > In any case, I think the argument that DOE capabilities are not
> > CXL-specific still holds.
>
> Agreed.

I don't think anyone is arguing that DOE is something CXL specific.
The enabling belongs only in drivers/pci/ as a DOE core, and then that
core is referenced by any other random PCI driver that needs to
interact with a DOE.

The question is what does that DOE core look like? A Linux device
representing the DOE capability and a common driver for the
data-transfer seems a reasonable abstraction to me and that's what
Auxiliary Bus offers.
Dan Williams Nov. 29, 2021, 11:59 p.m. UTC | #10
On Mon, Nov 29, 2021 at 3:37 PM Dan Williams <dan.j.williams@intel.com> wrote:
>
> On Thu, Nov 18, 2021 at 10:48 PM Christoph Hellwig <hch@lst.de> wrote:
> >
> > On Wed, Nov 17, 2021 at 04:15:36PM -0600, Bjorn Helgaas wrote:
> > > > Agreed though how it all gets tied together isn't totally clear
> > > > to me yet. The messy bit is interrupts given I don't think we have
> > > > a model for enabling those anywhere other than in individual PCI drivers.
> > >
> > > Ah.  Yeah, that is a little messy.  The only real precedent where the
> > > PCI core and a driver might need to coordinate on interrupts is the
> > > portdrv.  So far we've pretended that bridges do not have
> > > device-specific functionality that might require interrupts.  I don't
> > > think that's actually true, but we haven't integrated drivers for the
> > > tuning, performance monitoring, and similar features that bridges may
> > > have.  Yet.
> >
> > And portdrv really is conceptually part of the core PCI core, and
> > should eventually be fully integrated..
>
> What does a fully integrated portdrv look like? DOE enabling could
> follow a similar model.
>
> >
> > > In any case, I think the argument that DOE capabilities are not
> > > CXL-specific still holds.
> >
> > Agreed.
>
> I don't think anyone is arguing that DOE is something CXL specific.
> The enabling belongs only in drivers/pci/ as a DOE core, and then that
> core is referenced by any other random PCI driver that needs to
> interact with a DOE.
>
> The question is what does that DOE core look like? A Linux device
> representing the DOE capability and a common driver for the
> data-transfer seems a reasonable abstraction to me and that's what
> Auxiliary Bus offers.

I will also add that this is not just an argument to use a
device+driver organization for its own sake, it also allows for an
idiomatic ABI for determining when the kernel is using a device
capability vs when userspace is using it. For example,
IO_STRICT_DEVMEM currently locks out userspace MMIO when a kernel
driver is attached to a device. I see a need for a similar policy to
lock out userspace from configuration writes to the DOE while the
kernel is using the DOE. If userspace wants / needs access returned to
it then it can force unbind just that one DOE aux-driver rather than
unbind the driver for the entire device if DOE was just a library that
all drivers linked against.

DOE negotiates security features like SPDM and IDE. I think it is
important for the kernel to be able to control access to DOE instances
even though it has not cared about protecting itself from userspace
initiated configuration writes in the past.
Christoph Hellwig Nov. 30, 2021, 6:42 a.m. UTC | #11
On Mon, Nov 29, 2021 at 03:59:25PM -0800, Dan Williams wrote:
> DOE negotiates security features like SPDM and IDE. I think it is
> important for the kernel to be able to control access to DOE instances
> even though it has not cared about protecting itself from userspace
> initiated configuration writes in the past.

I think DOE is pretty much a kernel only feature and we can't allow
userspace access to it at all.
diff mbox series

Patch

diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 67c91378f2dd..9d53720bea07 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -16,6 +16,7 @@  if CXL_BUS
 config CXL_MEM
 	tristate "CXL.mem: Memory Devices"
 	default CXL_BUS
+	select PCI_DOE_DRIVER
 	help
 	  The CXL.mem protocol allows a device to act as a provider of
 	  "System RAM" and/or "Persistent Memory" that is fully coherent
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 5e2e93451928..f1241a7f2b7b 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -75,6 +75,19 @@  static inline int cxl_hdm_decoder_count(u32 cap_hdr)
 #define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
 #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
 
+/*
+ * Address space properties derived from:
+ * CXL 2.0 8.2.5.12.7 CXL HDM Decoder 0 Control Register
+ */
+#define CXL_ADDRSPACE_RAM   BIT(0)
+#define CXL_ADDRSPACE_PMEM  BIT(1)
+#define CXL_ADDRSPACE_TYPE2 BIT(2)
+#define CXL_ADDRSPACE_TYPE3 BIT(3)
+#define CXL_ADDRSPACE_MASK  GENMASK(3, 0)
+
+#define CXL_DOE_PROTOCOL_COMPLIANCE 0
+#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2
+
 #define CXL_COMPONENT_REGS() \
 	void __iomem *hdm_decoder
 
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 8dc91fd3396a..df524b74f1d2 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -6,6 +6,7 @@ 
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/pci.h>
+#include <linux/pci-doe.h>
 #include <linux/io.h>
 #include "cxlmem.h"
 #include "pci.h"
@@ -471,6 +472,120 @@  static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
 	return rc;
 }
 
+static void cxl_mem_free_irq_vectors(void *data)
+{
+	pci_free_irq_vectors(data);
+}
+
+static void cxl_destroy_doe_device(void *ad)
+{
+	struct auxiliary_device *adev = ad;
+
+	auxiliary_device_delete(adev);
+	auxiliary_device_uninit(adev);
+}
+
+static DEFINE_IDA(cxl_doe_adev_ida);
+static void __doe_dev_release(struct auxiliary_device *adev)
+{
+	struct pci_doe_dev *doe_dev = container_of(adev, struct pci_doe_dev,
+						   adev);
+
+	ida_free(&cxl_doe_adev_ida, adev->id);
+	kfree(doe_dev);
+}
+
+static void cxl_doe_dev_release(struct device *dev)
+{
+	struct auxiliary_device *adev = container_of(dev,
+						struct auxiliary_device,
+						dev);
+	__doe_dev_release(adev);
+}
+
+static int cxl_setup_doe_devices(struct cxl_dev_state *cxlds)
+{
+	struct device *dev = cxlds->dev;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int irqs, rc;
+	u16 pos = 0;
+
+	/*
+	 * An implementation of a cxl type3 device may support an unknown
+	 * number of interrupts. Assume that number is not that large and
+	 * request them all.
+	 */
+	irqs = pci_msix_vec_count(pdev);
+	rc = pci_alloc_irq_vectors(pdev, irqs, irqs, PCI_IRQ_MSIX);
+	if (rc != irqs) {
+		/* No interrupt available - carry on */
+		dev_dbg(dev, "No interrupts available for DOE\n");
+	} else {
+		/*
+		 * Enabling bus mastering could be done within the DOE
+		 * initialization, but as it potentially has other impacts
+		 * keep it within the driver.
+		 */
+		pci_set_master(pdev);
+		rc = devm_add_action_or_reset(dev,
+					      cxl_mem_free_irq_vectors,
+					      pdev);
+		if (rc)
+			return rc;
+	}
+
+	pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
+
+	while (pos > 0) {
+		struct auxiliary_device *adev;
+		struct pci_doe_dev *new_dev;
+		int id;
+
+		new_dev = kzalloc(sizeof(*new_dev), GFP_KERNEL);
+		if (!new_dev)
+			return -ENOMEM;
+
+		new_dev->pdev = pdev;
+		new_dev->cap_offset = pos;
+
+		/* Set up struct auxiliary_device */
+		adev = &new_dev->adev;
+		id = ida_alloc(&cxl_doe_adev_ida, GFP_KERNEL);
+		if (id < 0) {
+			kfree(new_dev);
+			return -ENOMEM;
+		}
+
+		adev->id = id;
+		adev->name = DOE_DEV_NAME;
+		adev->dev.release = cxl_doe_dev_release;
+		adev->dev.parent = dev;
+
+		if (auxiliary_device_init(adev)) {
+			__doe_dev_release(adev);
+			return -EIO;
+		}
+
+		if (auxiliary_device_add(adev)) {
+			auxiliary_device_uninit(adev);
+			return -EIO;
+		}
+
+		rc = devm_add_action_or_reset(dev, cxl_destroy_doe_device, adev);
+		if (rc)
+			return rc;
+
+		if (device_attach(&adev->dev) != 1)
+			dev_err(&adev->dev,
+				"Failed to attach a driver to DOE device %d\n",
+				adev->id);
+
+		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DOE);
+	}
+
+	return 0;
+}
+
 static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct cxl_register_map map;
@@ -517,6 +632,10 @@  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
+	rc = cxl_setup_doe_devices(cxlds);
+	if (rc)
+		return rc;
+
 	cxlmd = devm_cxl_add_memdev(cxlds);
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);
@@ -546,3 +665,4 @@  static struct pci_driver cxl_pci_driver = {
 MODULE_LICENSE("GPL v2");
 module_pci_driver(cxl_pci_driver);
 MODULE_IMPORT_NS(CXL);
+MODULE_SOFTDEP("pre: pci_doe");